diff --git a/.gitmodules b/.gitmodules index fbf2f59b38da1..3426b1bc8dbde 100644 --- a/.gitmodules +++ b/.gitmodules @@ -18,10 +18,6 @@ path = src/doc/rust-by-example url = https://github.com/rust-lang/rust-by-example.git shallow = true -[submodule "library/stdarch"] - path = library/stdarch - url = https://github.com/rust-lang/stdarch.git - shallow = true [submodule "src/doc/edition-guide"] path = src/doc/edition-guide url = https://github.com/rust-lang/edition-guide.git diff --git a/library/stdarch b/library/stdarch deleted file mode 160000 index 5c1c436524c0b..0000000000000 --- a/library/stdarch +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 5c1c436524c0bbc8db83577f42f8bea9006a7b75 diff --git a/library/stdarch/.cirrus.yml b/library/stdarch/.cirrus.yml new file mode 100644 index 0000000000000..a0ecc03b953fd --- /dev/null +++ b/library/stdarch/.cirrus.yml @@ -0,0 +1,16 @@ +task: + name: x86_64-unknown-freebsd + freebsd_instance: + image_family: freebsd-13-4 + env: + # FIXME(freebsd): FreeBSD has a segfault when `RUST_BACKTRACE` is set + # https://github.com/rust-lang/rust/issues/132185 + RUST_BACKTRACE: "0" + setup_script: + - curl https://sh.rustup.rs -sSf --output rustup.sh + - sh rustup.sh --default-toolchain nightly -y + - . $HOME/.cargo/env + - rustup default nightly + test_script: + - . $HOME/.cargo/env + - cargo build --all diff --git a/library/stdarch/.git-blame-ignore-revs b/library/stdarch/.git-blame-ignore-revs new file mode 100644 index 0000000000000..d6021c4f2adb1 --- /dev/null +++ b/library/stdarch/.git-blame-ignore-revs @@ -0,0 +1,4 @@ +# Use `git config blame.ignorerevsfile .git-blame-ignore-revs` to make `git blame` ignore the following commits. + +# format with style edition 2024 +fc87bd98d689590a0b6f5ee4110c5b9f962faa66 diff --git a/library/stdarch/.github/workflows/main.yml b/library/stdarch/.github/workflows/main.yml new file mode 100644 index 0000000000000..8c6dee16fb618 --- /dev/null +++ b/library/stdarch/.github/workflows/main.yml @@ -0,0 +1,288 @@ +name: CI +on: + pull_request: + merge_group: + +jobs: + style: + name: Check Style + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install Rust + run: rustup update nightly --no-self-update && rustup default nightly + - run: ci/style.sh + + docs: + name: Build Documentation + needs: [style] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install Rust + run: rustup update nightly --no-self-update && rustup default nightly + - run: ci/dox.sh + env: + CI: 1 + + verify: + name: Automatic intrinsic verification + needs: [style] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install Rust + run: rustup update nightly --no-self-update && rustup default nightly + - run: cargo test --manifest-path crates/stdarch-verify/Cargo.toml + + test: + needs: [style] + name: Test + runs-on: ${{ matrix.target.os }} + strategy: + matrix: + profile: + - dev + - release + target: + # Dockers that are run through docker on linux + - tuple: i686-unknown-linux-gnu + os: ubuntu-latest + - tuple: x86_64-unknown-linux-gnu + os: ubuntu-latest + - tuple: arm-unknown-linux-gnueabihf + os: ubuntu-latest + - tuple: armv7-unknown-linux-gnueabihf + os: ubuntu-latest + - tuple: aarch64-unknown-linux-gnu + os: ubuntu-latest + - tuple: aarch64_be-unknown-linux-gnu + os: ubuntu-latest + - tuple: riscv32gc-unknown-linux-gnu + os: ubuntu-latest + - tuple: riscv64gc-unknown-linux-gnu + os: ubuntu-latest + - tuple: powerpc-unknown-linux-gnu + os: ubuntu-latest + - tuple: 
powerpc64-unknown-linux-gnu + os: ubuntu-latest + - tuple: powerpc64le-unknown-linux-gnu + os: ubuntu-latest + # MIPS targets disabled since they are dropped to tier 3. + # See https://github.com/rust-lang/compiler-team/issues/648 + #- tuple: mips-unknown-linux-gnu + # os: ubuntu-latest + #- tuple: mips64-unknown-linux-gnuabi64 + # os: ubuntu-latest + #- tuple: mips64el-unknown-linux-gnuabi64 + # os: ubuntu-latest + #- tuple: mipsel-unknown-linux-musl + # os: ubuntu-latest + - tuple: s390x-unknown-linux-gnu + os: ubuntu-latest + - tuple: i586-unknown-linux-gnu + os: ubuntu-latest + - tuple: nvptx64-nvidia-cuda + os: ubuntu-latest + - tuple: thumbv6m-none-eabi + os: ubuntu-latest + - tuple: thumbv7m-none-eabi + os: ubuntu-latest + - tuple: thumbv7em-none-eabi + os: ubuntu-latest + - tuple: thumbv7em-none-eabihf + os: ubuntu-latest + - tuple: loongarch64-unknown-linux-gnu + os: ubuntu-latest + - tuple: wasm32-wasip1 + os: ubuntu-latest + + # macOS targets + - tuple: x86_64-apple-darwin + os: macos-15-large + - tuple: x86_64-apple-ios-macabi + os: macos-15-large + - tuple: aarch64-apple-darwin + os: macos-15 + - tuple: aarch64-apple-ios-macabi + os: macos-15 + # FIXME: gh-actions build environment doesn't have linker support + # - tuple: i686-apple-darwin + # os: macos-13 + + # Windows targets + - tuple: x86_64-pc-windows-msvc + os: windows-2025 + - tuple: i686-pc-windows-msvc + os: windows-2025 + - tuple: aarch64-pc-windows-msvc + os: windows-11-arm + - tuple: x86_64-pc-windows-gnu + os: windows-2025 + # - tuple: i686-pc-windows-gnu + # os: windows-latest + + # Add additional variables to the matrix variations generated above using `include`: + include: + # `TEST_EVERYTHING` setups - there should be at least 1 for each architecture + - target: + tuple: aarch64-unknown-linux-gnu + os: ubuntu-latest + test_everything: true + - target: + tuple: aarch64_be-unknown-linux-gnu + os: ubuntu-latest + test_everything: true + build_std: true + - target: + tuple: armv7-unknown-linux-gnueabihf + os: ubuntu-latest + test_everything: true + - target: + tuple: loongarch64-unknown-linux-gnu + os: ubuntu-latest + test_everything: true + - target: + tuple: powerpc-unknown-linux-gnu + os: ubuntu-latest + disable_assert_instr: true + test_everything: true + - target: + tuple: powerpc64-unknown-linux-gnu + os: ubuntu-latest + disable_assert_instr: true + test_everything: true + - target: + tuple: powerpc64le-unknown-linux-gnu + os: ubuntu-latest + test_everything: true + - target: + tuple: riscv32gc-unknown-linux-gnu + os: ubuntu-latest + test_everything: true + build_std: true + - target: + tuple: riscv64gc-unknown-linux-gnu + os: ubuntu-latest + test_everything: true + - target: + tuple: s390x-unknown-linux-gnu + os: ubuntu-latest + test_everything: true + - target: + tuple: x86_64-unknown-linux-gnu + os: ubuntu-latest + test_everything: true + # MIPS targets disabled since they are dropped to tier 3. 
+ # See https://github.com/rust-lang/compiler-team/issues/648 + #- target: + # tuple: mips-unknown-linux-gnu + # os: ubuntu-latest + # norun: true + #- target: + # tuple: mips64-unknown-linux-gnuabi64 + # os: ubuntu-latest + # norun: true + #- target: + # tuple: mips64el-unknown-linux-gnuabi64 + # os: ubuntu-latest + # norun: true + #- target: + # tuple: mipsel-unknown-linux-musl + # os: ubuntu-latest + # norun: true + - target: + tuple: aarch64-apple-darwin + os: macos-15 + norun: true # https://github.com/rust-lang/stdarch/issues/1206 + - target: + tuple: aarch64-apple-ios-macabi + os: macos-15 + norun: true # https://github.com/rust-lang/stdarch/issues/1206 + + steps: + - uses: actions/checkout@v4 + - name: Install Rust + run: | + rustup update nightly --no-self-update + rustup default nightly + shell: bash + if: matrix.target.os != 'windows-11-arm' + - name: Install Rust for `windows-11-arm` runners + # The arm runners don't have Rust pre-installed (https://github.com/actions/partner-runner-images/issues/77) + run: | + curl https://sh.rustup.rs | sh -s -- -y --default-toolchain nightly + echo "$HOME/.cargo/bin" >> $GITHUB_PATH + shell: bash + if: matrix.target.os == 'windows-11-arm' + + - run: rustup target add ${{ matrix.target.tuple }} + shell: bash + if: matrix.build_std == '' + - run: | + rustup component add rust-src + echo "CARGO_UNSTABLE_BUILD_STD=std" >> $GITHUB_ENV + shell: bash + if: matrix.build_std != '' + + # Configure some env vars based on matrix configuration + - run: echo "PROFILE=--profile=${{matrix.profile}}" >> $GITHUB_ENV + shell: bash + - run: echo "NORUN=1" >> $GITHUB_ENV + shell: bash + if: matrix.norun != '' || startsWith(matrix.target.tuple, 'thumb') || matrix.target.tuple == 'nvptx64-nvidia-cuda' + - run: echo "STDARCH_TEST_EVERYTHING=1" >> $GITHUB_ENV + shell: bash + if: matrix.test_everything != '' + - run: echo "STDARCH_DISABLE_ASSERT_INSTR=1" >> $GITHUB_ENV + shell: bash + if: matrix.disable_assert_instr != '' + - run: echo "NOSTD=1" >> $GITHUB_ENV + shell: bash + if: startsWith(matrix.target.tuple, 'thumb') || matrix.target.tuple == 'nvptx64-nvidia-cuda' + + # Windows & OSX go straight to `run.sh` ... + - run: ./ci/run.sh + shell: bash + if: matrix.target.os != 'ubuntu-latest' || startsWith(matrix.target.tuple, 'thumb') + env: + TARGET: ${{ matrix.target.tuple }} + + # ... while Linux goes to `run-docker.sh` + - run: ./ci/run-docker.sh ${{ matrix.target.tuple }} + shell: bash + if: matrix.target.os == 'ubuntu-latest' && !startsWith(matrix.target.tuple, 'thumb') + env: + TARGET: ${{ matrix.target.tuple }} + + build-std-detect: + needs: [style] + name: Build std_detect + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install Rust + run: rustup update nightly && rustup default nightly + - run: ./ci/build-std-detect.sh + + conclusion: + needs: + - docs + - verify + - test + - build-std-detect + runs-on: ubuntu-latest + # We need to ensure this job does *not* get skipped if its dependencies fail, + # because a skipped job is considered a success by GitHub. So we have to + # overwrite `if:`. We use `!cancelled()` to ensure the job does still not get run + # when the workflow is canceled manually. + # + # ALL THE PREVIOUS JOBS NEED TO BE ADDED TO THE `needs` SECTION OF THIS JOB! 
+ if: ${{ !cancelled() }} # make sure this is never "skipped" + steps: + - name: Conclusion + run: | + # Print the dependent jobs to see them in the CI log + jq -C <<< '${{ toJson(needs) }}' + # Check if all jobs that we depend on (in the needs array) were successful. + jq --exit-status 'all(.result == "success")' <<< '${{ toJson(needs) }}' diff --git a/library/stdarch/.gitignore b/library/stdarch/.gitignore new file mode 100644 index 0000000000000..3789bafe78630 --- /dev/null +++ b/library/stdarch/.gitignore @@ -0,0 +1,10 @@ +Cargo.lock +.*.swp +target +tags +crates/stdarch-gen-arm/aarch64.rs +crates/stdarch-gen-arm/arm.rs +crates/stdarch-gen-loongarch/lasx.c +crates/stdarch-gen-loongarch/lsx.c +c_programs/* +rust_programs/* diff --git a/library/stdarch/.gitmodules b/library/stdarch/.gitmodules new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/library/stdarch/CONTRIBUTING.md b/library/stdarch/CONTRIBUTING.md new file mode 100644 index 0000000000000..97a710d7b0854 --- /dev/null +++ b/library/stdarch/CONTRIBUTING.md @@ -0,0 +1,93 @@ +# Contributing to stdarch + +The `stdarch` crate is more than willing to accept contributions! First you'll +probably want to check out the repository and make sure that tests pass for you: + +``` +$ git clone https://github.com/rust-lang/stdarch +$ cd stdarch +$ TARGET="<TARGET>" ci/run.sh +``` + +Where `<TARGET>` is the target triple as used by `rustup`, e.g. `x86_64-unknown-linux-gnu` (without any preceding `nightly-` or similar). +Also remember that this repository requires the nightly channel of Rust! +The above tests do in fact require nightly Rust to be the default on your system; to set that, use `rustup default nightly` (and `rustup default stable` to revert). + +If any of the above steps don't work, [please let us know][new]! + +Next up, you can [find an issue][issues] to help out on; we've selected a few +with the [`help wanted`][help] tag which could +particularly use some help. You may be most interested in [#40][vendor], +implementing all vendor intrinsics on x86. That issue's got some good pointers +about where to get started! + +If you've got general questions feel free to [join us on gitter][gitter] and ask +around! Feel free to ping either @BurntSushi or @alexcrichton with questions. + +[gitter]: https://gitter.im/rust-impl-period/WG-libs-simd + +# How to write examples for stdarch intrinsics + +There are a few features that must be enabled for the given intrinsic to work +properly, and the example must only be run by `cargo test --doc` when the feature +is supported by the CPU. As a result, the default `fn main` that is generated by +`rustdoc` will not work (in most cases). Consider using the following as a guide +to ensure your example works as expected.
+ +```rust +/// # // We need cfg_target_feature to ensure the example is only +/// # // run by `cargo test --doc` when the CPU supports the feature +/// # #![feature(cfg_target_feature)] +/// # // We need target_feature for the intrinsic to work +/// # #![feature(target_feature)] +/// # +/// # // rustdoc by default uses `extern crate stdarch`, but we need the +/// # // `#[macro_use]` +/// # #[macro_use] extern crate stdarch; +/// # +/// # // The real main function +/// # fn main() { +/// # // Only run this if `<target feature>` is supported +/// # if cfg_feature_enabled!("<target feature>") { +/// # // Create a `worker` function that will only be run if the target feature +/// # // is supported and ensure that `target_feature` is enabled for your worker +/// # // function +/// # #[target_feature(enable = "<target feature>")] +/// # unsafe fn worker() { +/// +/// // Write your example here. Feature specific intrinsics will work here! Go wild! +/// +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +``` + +If some of the above syntax does not look familiar, the [Documentation as tests] section +of the [Rust Book] describes the `rustdoc` syntax quite well. As always, feel free +to [join us on gitter][gitter] and ask us if you hit any snags, and thank you for helping +to improve the documentation of `stdarch`! + +# Alternative Testing Instructions + +It is generally recommended that you use `ci/run-docker.sh` to run the tests. +However, this might not work for you, e.g. if you are on Windows. + +In that case you can fall back to running `cargo +nightly test` and `cargo +nightly test --release -p core_arch` for testing the code generation. +Note that these require the nightly toolchain to be installed and for `rustc` to know about your target triple and its CPU. +In particular you need to set the `TARGET` environment variable as you would for `ci/run.sh`. +In addition you need to set `RUSTFLAGS` to indicate the target features, e.g. `RUSTFLAGS="-C target-feature=+avx2"`. +You can also add `-C target-cpu=native` if you're "just" developing against your current CPU. + +Be warned that when you use these alternative instructions, [things may go less smoothly than they would with `ci/run-docker.sh`][ci-run-good]: instruction-generation tests may fail because the disassembler names instructions differently, e.g. it may report `vaesenc` instead of `aesenc` despite the two behaving the same. +Also, these instructions run fewer tests than CI normally does, so don't be surprised if, when you eventually open a pull request, some errors show up for tests not covered here.
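For concreteness, here is a minimal sketch of such a manual run, assuming an AVX2-capable x86_64 Linux host; the triple and feature flags are examples only, so substitute the ones that match your machine:

```sh
# Manual test run without ci/run-docker.sh (sketch; adjust the triple and features).
export TARGET=x86_64-unknown-linux-gnu
export RUSTFLAGS="-C target-feature=+avx2"

# Functional tests of the intrinsics.
cargo +nightly test

# Code-generation tests for core_arch (release mode is what enables the
# assert_instr checks, see crates/assert-instr-macro/build.rs).
cargo +nightly test --release -p core_arch
```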
+ + +[new]: https://github.com/rust-lang/stdarch/issues/new +[issues]: https://github.com/rust-lang/stdarch/issues +[help]: https://github.com/rust-lang/stdarch/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22 +[impl]: https://github.com/rust-lang/stdarch/issues?q=is%3Aissue+is%3Aopen+label%3Aimpl-period +[vendor]: https://github.com/rust-lang/stdarch/issues/40 +[Documentation as tests]: https://doc.rust-lang.org/book/first-edition/documentation.html#documentation-as-tests +[Rust Book]: https://doc.rust-lang.org/book/first-edition +[ci-run-good]: https://github.com/rust-lang/stdarch/issues/931#issuecomment-711412126 diff --git a/library/stdarch/Cargo.toml b/library/stdarch/Cargo.toml new file mode 100644 index 0000000000000..0db26f31a2d3b --- /dev/null +++ b/library/stdarch/Cargo.toml @@ -0,0 +1,19 @@ +[workspace] +resolver = "1" +members = [ + "crates/*", + "examples", +] +exclude = [ + "crates/wasm-assert-instr-tests" +] + +[profile.release] +debug = true +opt-level = 3 +incremental = true + +[profile.bench] +debug = 1 +opt-level = 3 +incremental = true diff --git a/library/stdarch/LICENSE-APACHE b/library/stdarch/LICENSE-APACHE new file mode 100644 index 0000000000000..16fe87b06e802 --- /dev/null +++ b/library/stdarch/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/library/stdarch/LICENSE-MIT b/library/stdarch/LICENSE-MIT new file mode 100644 index 0000000000000..52d82415d8b60 --- /dev/null +++ b/library/stdarch/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2017 The Rust Project Developers + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/library/stdarch/README.md b/library/stdarch/README.md new file mode 100644 index 0000000000000..70ec256e681e0 --- /dev/null +++ b/library/stdarch/README.md @@ -0,0 +1,18 @@ +stdarch - Rust's standard library SIMD components +======= + +[![Actions Status](https://github.com/rust-lang/stdarch/workflows/CI/badge.svg)](https://github.com/rust-lang/stdarch/actions) + + +# Crates + +This repository contains two main crates: + +* [`core_arch`](crates/core_arch/README.md) implements `core::arch` - Rust's + core library architecture-specific intrinsics, and + +* [`std_detect`](crates/std_detect/README.md) implements `std::detect` - Rust's + standard library run-time CPU feature detection. + +The `std::simd` component now lives in the +[`packed_simd_2`](https://github.com/rust-lang/packed_simd) crate. diff --git a/library/stdarch/ci/build-std-detect.sh b/library/stdarch/ci/build-std-detect.sh new file mode 100755 index 0000000000000..e79a497cc3591 --- /dev/null +++ b/library/stdarch/ci/build-std-detect.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +# Build std_detect on non-Linux & non-x86 targets. +# +# In std_detect, non-x86 targets have OS-specific implementations, +# but we can test only Linux in CI. This script builds targets supported +# by std_detect but cannot be tested in CI. + +set -ex +cd "$(dirname "$0")"/.. + +targets=( + # Linux + aarch64-unknown-linux-musl + armv5te-unknown-linux-musleabi + aarch64-unknown-linux-ohos + armv7-unknown-linux-ohos + + # Android + aarch64-linux-android + arm-linux-androideabi + + # FreeBSD + aarch64-unknown-freebsd + armv6-unknown-freebsd + powerpc-unknown-freebsd + powerpc64-unknown-freebsd + + # OpenBSD + aarch64-unknown-openbsd + + # Windows + aarch64-pc-windows-msvc +) + +rustup component add rust-src # for -Z build-std + +cd crates/std_detect +for target in "${targets[@]}"; do + if rustup target add "${target}" &>/dev/null; then + cargo build --target "${target}" + else + # tier 3 targets requires -Z build-std. 
+ cargo build -Z build-std="core,alloc" --target "${target}" + fi +done diff --git a/library/stdarch/ci/docker/aarch64-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/aarch64-unknown-linux-gnu/Dockerfile new file mode 100644 index 0000000000000..17025efffea60 --- /dev/null +++ b/library/stdarch/ci/docker/aarch64-unknown-linux-gnu/Dockerfile @@ -0,0 +1,19 @@ +FROM ubuntu:25.04 +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + g++ \ + ca-certificates \ + libc6-dev \ + gcc-aarch64-linux-gnu \ + g++-aarch64-linux-gnu \ + libc6-dev-arm64-cross \ + qemu-user \ + make \ + file \ + clang-19 \ + lld + +ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc \ + CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER="qemu-aarch64 -cpu max -L /usr/aarch64-linux-gnu" \ + OBJDUMP=aarch64-linux-gnu-objdump \ + STDARCH_TEST_SKIP_FEATURE=tme diff --git a/library/stdarch/ci/docker/aarch64_be-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/aarch64_be-unknown-linux-gnu/Dockerfile new file mode 100644 index 0000000000000..74f770556dbe3 --- /dev/null +++ b/library/stdarch/ci/docker/aarch64_be-unknown-linux-gnu/Dockerfile @@ -0,0 +1,30 @@ +FROM ubuntu:25.04 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + g++ \ + ca-certificates \ + libc6-dev \ + libc6-dev-arm64-cross \ + qemu-user \ + make \ + file \ + clang-19 \ + curl \ + xz-utils \ + lld + +ENV TOOLCHAIN="arm-gnu-toolchain-14.2.rel1-x86_64-aarch64_be-none-linux-gnu" + +# Download the aarch64_be gcc toolchain +RUN curl -L "https://developer.arm.com/-/media/Files/downloads/gnu/14.2.rel1/binrel/${TOOLCHAIN}.tar.xz" -o "${TOOLCHAIN}.tar.xz" +RUN tar -xvf "${TOOLCHAIN}.tar.xz" +RUN mkdir /toolchains && mv "./${TOOLCHAIN}" /toolchains + +ENV AARCH64_BE_TOOLCHAIN="/toolchains/${TOOLCHAIN}" +ENV AARCH64_BE_LIBC="${AARCH64_BE_TOOLCHAIN}/aarch64_be-none-linux-gnu/libc" + +ENV CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_LINKER="${AARCH64_BE_TOOLCHAIN}/bin/aarch64_be-none-linux-gnu-gcc" +ENV CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_RUNNER="qemu-aarch64_be -cpu max -L ${AARCH64_BE_LIBC}" +ENV OBJDUMP="${AARCH64_BE_TOOLCHAIN}/bin/aarch64_be-none-linux-gnu-objdump" +ENV STDARCH_TEST_SKIP_FEATURE=tme diff --git a/library/stdarch/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile b/library/stdarch/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile new file mode 100644 index 0000000000000..14eaf9f9eef06 --- /dev/null +++ b/library/stdarch/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile @@ -0,0 +1,13 @@ +FROM ubuntu:25.04 +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + ca-certificates \ + libc6-dev \ + gcc-arm-linux-gnueabihf \ + libc6-dev-armhf-cross \ + qemu-user \ + make \ + file +ENV CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \ + CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_RUNNER="qemu-arm -cpu max -L /usr/arm-linux-gnueabihf" \ + OBJDUMP=arm-linux-gnueabihf-objdump diff --git a/library/stdarch/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile b/library/stdarch/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile new file mode 100644 index 0000000000000..2086e117d92bc --- /dev/null +++ b/library/stdarch/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile @@ -0,0 +1,17 @@ +FROM ubuntu:24.04 +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + g++ \ + ca-certificates \ + libc6-dev \ + gcc-arm-linux-gnueabihf \ + g++-arm-linux-gnueabihf \ + libc6-dev-armhf-cross \ + qemu-user \ + make \ + file \ + clang-19 \ + lld +ENV 
CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \ + CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER="qemu-arm -cpu max -L /usr/arm-linux-gnueabihf" \ + OBJDUMP=arm-linux-gnueabihf-objdump diff --git a/library/stdarch/ci/docker/i586-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/i586-unknown-linux-gnu/Dockerfile new file mode 100644 index 0000000000000..5a4a22369a805 --- /dev/null +++ b/library/stdarch/ci/docker/i586-unknown-linux-gnu/Dockerfile @@ -0,0 +1,7 @@ +FROM ubuntu:25.04 +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc-multilib \ + libc6-dev \ + file \ + make \ + ca-certificates diff --git a/library/stdarch/ci/docker/i686-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/i686-unknown-linux-gnu/Dockerfile new file mode 100644 index 0000000000000..5a4a22369a805 --- /dev/null +++ b/library/stdarch/ci/docker/i686-unknown-linux-gnu/Dockerfile @@ -0,0 +1,7 @@ +FROM ubuntu:25.04 +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc-multilib \ + libc6-dev \ + file \ + make \ + ca-certificates diff --git a/library/stdarch/ci/docker/loongarch64-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/loongarch64-unknown-linux-gnu/Dockerfile new file mode 100644 index 0000000000000..99ccf286f36de --- /dev/null +++ b/library/stdarch/ci/docker/loongarch64-unknown-linux-gnu/Dockerfile @@ -0,0 +1,12 @@ +FROM ubuntu:25.04 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + gcc libc6-dev qemu-user-static ca-certificates \ + gcc-14-loongarch64-linux-gnu libc6-dev-loong64-cross + + +ENV CARGO_TARGET_LOONGARCH64_UNKNOWN_LINUX_GNU_LINKER=loongarch64-linux-gnu-gcc-14 \ + CARGO_TARGET_LOONGARCH64_UNKNOWN_LINUX_GNU_RUNNER="qemu-loongarch64-static -cpu max -L /usr/loongarch64-linux-gnu" \ + OBJDUMP=loongarch64-linux-gnu-objdump \ + STDARCH_TEST_SKIP_FEATURE=frecipe diff --git a/library/stdarch/ci/docker/mips-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/mips-unknown-linux-gnu/Dockerfile new file mode 100644 index 0000000000000..f43a3c966331e --- /dev/null +++ b/library/stdarch/ci/docker/mips-unknown-linux-gnu/Dockerfile @@ -0,0 +1,13 @@ +FROM ubuntu:25.04 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc libc6-dev qemu-user ca-certificates \ + gcc-mips-linux-gnu libc6-dev-mips-cross \ + qemu-system-mips \ + qemu-user \ + make \ + file + +ENV CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_LINKER=mips-linux-gnu-gcc \ + CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_RUNNER="qemu-mips -L /usr/mips-linux-gnu" \ + OBJDUMP=mips-linux-gnu-objdump \ No newline at end of file diff --git a/library/stdarch/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile b/library/stdarch/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile new file mode 100644 index 0000000000000..235ac0997b1e3 --- /dev/null +++ b/library/stdarch/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile @@ -0,0 +1,10 @@ +FROM ubuntu:25.04 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc libc6-dev qemu-user ca-certificates \ + gcc-mips64-linux-gnuabi64 libc6-dev-mips64-cross \ + qemu-system-mips64 qemu-user + +ENV CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_LINKER=mips64-linux-gnuabi64-gcc \ + CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_RUNNER="qemu-mips64 -L /usr/mips64-linux-gnuabi64" \ + OBJDUMP=mips64-linux-gnuabi64-objdump \ No newline at end of file diff --git a/library/stdarch/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile b/library/stdarch/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile 
new file mode 100644 index 0000000000000..6041d8911749e --- /dev/null +++ b/library/stdarch/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile @@ -0,0 +1,10 @@ +FROM ubuntu:25.04 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc libc6-dev qemu-user ca-certificates \ + gcc-mips64el-linux-gnuabi64 libc6-dev-mips64el-cross \ + qemu-system-mips64el + +ENV CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_LINKER=mips64el-linux-gnuabi64-gcc \ + CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_RUNNER="qemu-mips64el -L /usr/mips64el-linux-gnuabi64" \ + OBJDUMP=mips64el-linux-gnuabi64-objdump \ No newline at end of file diff --git a/library/stdarch/ci/docker/mipsel-unknown-linux-musl/Dockerfile b/library/stdarch/ci/docker/mipsel-unknown-linux-musl/Dockerfile new file mode 100644 index 0000000000000..cd38348eeb5c1 --- /dev/null +++ b/library/stdarch/ci/docker/mipsel-unknown-linux-musl/Dockerfile @@ -0,0 +1,25 @@ +FROM ubuntu:25.04 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + gcc \ + libc6-dev \ + make \ + qemu-user \ + qemu-system-mips \ + bzip2 \ + curl \ + file + +RUN mkdir /toolchain + +# Note that this originally came from: +# https://downloads.openwrt.org/snapshots/trunk/malta/generic/OpenWrt-Toolchain-malta-le_gcc-5.3.0_musl-1.1.15.Linux-x86_64.tar.bz2 +RUN curl -L https://ci-mirrors.rust-lang.org/libc/OpenWrt-Toolchain-malta-le_gcc-5.3.0_musl-1.1.15.Linux-x86_64.tar.bz2 | \ + tar xjf - -C /toolchain --strip-components=2 + +ENV PATH=$PATH:/rust/bin:/toolchain/bin \ + CC_mipsel_unknown_linux_musl=mipsel-openwrt-linux-gcc \ + CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_MUSL_LINKER=mipsel-openwrt-linux-gcc \ + CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_MUSL_RUNNER="qemu-mipsel -L /toolchain" diff --git a/library/stdarch/ci/docker/nvptx64-nvidia-cuda/Dockerfile b/library/stdarch/ci/docker/nvptx64-nvidia-cuda/Dockerfile new file mode 100644 index 0000000000000..5b4869863c705 --- /dev/null +++ b/library/stdarch/ci/docker/nvptx64-nvidia-cuda/Dockerfile @@ -0,0 +1,5 @@ +FROM ubuntu:25.04 +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + libc6-dev \ + ca-certificates diff --git a/library/stdarch/ci/docker/powerpc-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/powerpc-unknown-linux-gnu/Dockerfile new file mode 100644 index 0000000000000..baad95d57843e --- /dev/null +++ b/library/stdarch/ci/docker/powerpc-unknown-linux-gnu/Dockerfile @@ -0,0 +1,12 @@ +FROM ubuntu:25.04 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc libc6-dev qemu-user ca-certificates \ + gcc-powerpc-linux-gnu libc6-dev-powerpc-cross \ + qemu-system-ppc make file + +ENV CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_LINKER=powerpc-linux-gnu-gcc \ + CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_RUNNER="qemu-ppc -cpu mpc8610 -L /usr/powerpc-linux-gnu" \ + CC=powerpc-linux-gnu-gcc \ + OBJDUMP=powerpc-linux-gnu-objdump \ + STDARCH_TEST_SKIP_FEATURE=vsx diff --git a/library/stdarch/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile new file mode 100644 index 0000000000000..dcbcb43513ee6 --- /dev/null +++ b/library/stdarch/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile @@ -0,0 +1,14 @@ +FROM ubuntu:25.04 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc libc6-dev qemu-user ca-certificates \ + gcc-powerpc64-linux-gnu libc6-dev-ppc64-cross \ + file make + +ENV CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_LINKER=powerpc64-linux-gnu-gcc \ + 
CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_RUNNER="qemu-ppc64 -cpu power11 -L /usr/powerpc64-linux-gnu" \ + CC=powerpc64-linux-gnu-gcc \ + OBJDUMP=powerpc64-linux-gnu-objdump \ + STDARCH_TEST_SKIP_FEATURE=vsx \ +# These 2 tests have erratic behaviour with qemu, see https://gitlab.com/qemu-project/qemu/-/issues/1623#note_2449012173 + STDARCH_TEST_SKIP_FUNCTION=vec_lde_u16,vec_lde_u32 diff --git a/library/stdarch/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile new file mode 100644 index 0000000000000..8dfac0ec1e412 --- /dev/null +++ b/library/stdarch/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile @@ -0,0 +1,12 @@ +FROM ubuntu:25.04 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc libc6-dev qemu-user ca-certificates \ + gcc-powerpc64le-linux-gnu libc6-dev-ppc64el-cross \ + file make + +# Work around qemu triggering a sigill on vec_subs if the cpu target is not defined. +ENV CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_LINKER=powerpc64le-linux-gnu-gcc \ + CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_RUNNER="qemu-ppc64le -cpu power11 -L /usr/powerpc64le-linux-gnu" \ + CC=powerpc64le-linux-gnu-gcc \ + OBJDUMP=powerpc64le-linux-gnu-objdump diff --git a/library/stdarch/ci/docker/riscv32gc-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/riscv32gc-unknown-linux-gnu/Dockerfile new file mode 100644 index 0000000000000..81f7b6239af53 --- /dev/null +++ b/library/stdarch/ci/docker/riscv32gc-unknown-linux-gnu/Dockerfile @@ -0,0 +1,15 @@ +FROM ubuntu:25.04 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc libc6-dev qemu-user ca-certificates \ + wget xz-utils make file llvm + +ENV VERSION=2025.01.20 + +RUN wget "https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/${VERSION}/riscv32-glibc-ubuntu-24.04-gcc-nightly-${VERSION}-nightly.tar.xz" \ + -O riscv-toolchain.tar.xz +RUN tar -xJf riscv-toolchain.tar.xz + +ENV CARGO_TARGET_RISCV32GC_UNKNOWN_LINUX_GNU_LINKER=/riscv/bin/riscv32-unknown-linux-gnu-gcc \ + CARGO_TARGET_RISCV32GC_UNKNOWN_LINUX_GNU_RUNNER="qemu-riscv32 -cpu max -L /riscv/sysroot" \ + OBJDUMP=llvm-objdump diff --git a/library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile new file mode 100644 index 0000000000000..7ee69e46e2e5b --- /dev/null +++ b/library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile @@ -0,0 +1,10 @@ +FROM ubuntu:25.04 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc libc6-dev qemu-user ca-certificates \ + gcc-riscv64-linux-gnu libc6-dev-riscv64-cross \ + llvm + +ENV CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_LINKER=riscv64-linux-gnu-gcc \ + CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_RUNNER="qemu-riscv64 -cpu max -L /usr/riscv64-linux-gnu" \ + OBJDUMP=llvm-objdump diff --git a/library/stdarch/ci/docker/s390x-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/s390x-unknown-linux-gnu/Dockerfile new file mode 100644 index 0000000000000..af02ebcbd169c --- /dev/null +++ b/library/stdarch/ci/docker/s390x-unknown-linux-gnu/Dockerfile @@ -0,0 +1,14 @@ +FROM ubuntu:25.04 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl ca-certificates \ + gcc libc6-dev \ + gcc-s390x-linux-gnu libc6-dev-s390x-cross \ + qemu-user \ + make \ + clang \ + file + +ENV CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_LINKER=s390x-linux-gnu-gcc \ + CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_RUNNER="qemu-s390x -cpu max -L /usr/s390x-linux-gnu" 
\ + OBJDUMP=s390x-linux-gnu-objdump diff --git a/library/stdarch/ci/docker/wasm32-wasip1/Dockerfile b/library/stdarch/ci/docker/wasm32-wasip1/Dockerfile new file mode 100644 index 0000000000000..eeafde79733eb --- /dev/null +++ b/library/stdarch/ci/docker/wasm32-wasip1/Dockerfile @@ -0,0 +1,13 @@ +FROM ubuntu:25.04 + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update -y && apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + xz-utils \ + clang + +RUN curl -L https://github.com/bytecodealliance/wasmtime/releases/download/v18.0.2/wasmtime-v18.0.2-x86_64-linux.tar.xz | tar xJf - +ENV PATH=$PATH:/wasmtime-v18.0.2-x86_64-linux + +ENV CARGO_TARGET_WASM32_WASIP1_RUNNER="wasmtime --dir /checkout/target/wasm32-wasip1/release/deps::." diff --git a/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile new file mode 100644 index 0000000000000..acde432794e5f --- /dev/null +++ b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile @@ -0,0 +1,18 @@ +FROM ubuntu:25.04 +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + libc6-dev \ + file \ + make \ + ca-certificates \ + wget \ + xz-utils + +RUN wget http://ci-mirrors.rust-lang.org/stdarch/sde-external-9.53.0-2025-03-16-lin.tar.xz -O sde.tar.xz +RUN mkdir intel-sde +RUN tar -xJf sde.tar.xz --strip-components=1 -C intel-sde +ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/intel-sde/sde64 \ + -cpuid-in /checkout/ci/docker/x86_64-unknown-linux-gnu/cpuid.def \ + -rtm-mode full -tsx --" +# These tests fail with SDE as it doesn't support saving register data +ENV STDARCH_TEST_SKIP_FUNCTION="xsave,xsaveopt,xsave64,xsaveopt64" diff --git a/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/cpuid.def b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/cpuid.def new file mode 100644 index 0000000000000..4cce9d7a3c002 --- /dev/null +++ b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/cpuid.def @@ -0,0 +1,71 @@ +# Copyright (C) 2024-2024 Intel Corporation. +# +# This software and the related documents are Intel copyrighted materials, and your +# use of them is governed by the express license under which they were provided to +# you ("License"). Unless the License provides otherwise, you may not use, modify, +# copy, publish, distribute, disclose or transmit this software or the related +# documents without Intel's prior written permission. +# +# This software and the related documents are provided as is, with no express or +# implied warranties, other than those that are expressly stated in the License. +# +# The CPUID information in this file is for software enabling purposes only and +# it is not a full and accurate representation of the CPU under development which +# it represents. +# The CPUID information in this file is not a guarantee of the availability of +# features or characteristics in the final released CPU. 
+# +# CPUID_VERSION = 1.0 +# Input => Output +# EAX ECX => EAX EBX ECX EDX +00000000 ******** => 00000024 68747541 444d4163 69746e65 +00000001 ******** => 000d06f0 00100800 7ffaf3ff bfebfbff +00000002 ******** => 76035a01 00f0b6ff 00000000 00c10000 +00000003 ******** => 00000000 00000000 00000000 00000000 +00000004 00000000 => 7c004121 02c0003f 0000003f 00000000 #Deterministic Cache +00000004 00000001 => 7c004122 01c0003f 0000003f 00000000 +00000004 00000002 => 7c004143 03c0003f 000007ff 00000000 +00000004 00000003 => 7c0fc163 04c0003f 0005ffff 00000004 +00000004 00000004 => 00000000 00000000 00000000 00000000 +00000005 ******** => 00000040 00000040 00000003 00042120 #MONITOR/MWAIT +00000006 ******** => 00000077 00000002 00000001 00000000 #Thermal and Power +00000007 00000000 => 00000001 f3bfbfbf bbc05ffe 03d55130 #Extended Features +00000007 00000001 => 88ee00bf 00000002 00000000 1d29cd3e +00000008 ******** => 00000000 00000000 00000000 00000000 +00000009 ******** => 00000000 00000000 00000000 00000000 #Direct Cache +0000000a ******** => 07300403 00000000 00000000 00000603 +0000000b 00000000 => 00000001 00000002 00000100 0000001e #Extended Topology +0000000b 00000001 => 00000004 00000002 00000201 0000001e +0000000c ******** => 00000000 00000000 00000000 00000000 +0000000d 00000000 => 000e02e7 00002b00 00002b00 00000000 #xcr0 +0000000d 00000001 => 0000001f 00000240 00000100 00000000 +0000000d 00000002 => 00000100 00000240 00000000 00000000 +0000000d 00000005 => 00000040 00000440 00000000 00000000 #zmasks +0000000d 00000006 => 00000200 00000480 00000000 00000000 #zmmh +0000000d 00000007 => 00000400 00000680 00000000 00000000 #zmm +0000000d 00000011 => 00000040 00000ac0 00000002 00000000 #tileconfig +0000000d 00000012 => 00002000 00000b00 00000006 00000000 #tiles +0000000d 00000013 => 00000080 000003c0 00000000 00000000 #APX +00000014 00000000 => 00000000 00000010 00000000 00000000 #ptwrite +00000019 ******** => 00000000 00000005 00000000 00000000 #Key Locker +0000001d 00000000 => 00000001 00000000 00000000 00000000 #AMX Tile +0000001d 00000001 => 04002000 00080040 00000010 00000000 #AMX Palette1 +0000001e 00000000 => 00000001 00004010 00000000 00000000 #AMX Tmul +0000001e 00000001 => 000001ff 00000000 00000000 00000000 +0000001f 00000000 => 00000001 00000002 00000100 0000001e +0000001f 00000001 => 00000007 00000070 00000201 0000001e +0000001f 00000002 => 00000000 00000000 00000002 0000001e +00000024 00000000 => 00000000 00070002 00000000 00000000 #AVX10 +80000000 ******** => 80000008 00000000 00000000 00000000 +80000001 ******** => 00000000 00000000 00200961 2c100000 +80000002 ******** => 00000000 00000000 00000000 00000000 +80000003 ******** => 00000000 00000000 00000000 00000000 +80000004 ******** => 00000000 00000000 00000000 00000000 +80000005 ******** => 00000000 00000000 00000000 00000000 +80000006 ******** => 00000000 00000000 01006040 00000000 +80000007 ******** => 00000000 00000000 00000000 00000100 +80000008 ******** => 00003028 00000200 00000200 00000000 + +# This file was copied from intel-sde/misc/cpuid/dmr/cpuid.def, and modified to +# use "AuthenticAMD" as the vendor and the support for `XOP`, `SSE4a`, `TBM`, +# `AVX512_VP2INTERSECT` and the VEX variants of AVX512 was added in the CPUID. 
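For context, a minimal sketch of how this CPUID definition is consumed, assuming the Intel SDE kit is unpacked at `/intel-sde` as in the x86_64 Dockerfile above and the command is run from the repository root: cargo's per-target runner wraps each test binary in the SDE emulator, which then reports the CPUID leaves defined here instead of the host CPU's.

```sh
# Sketch: run the x86_64 tests under Intel SDE with the CPUID model defined above.
# Assumes SDE is unpacked at /intel-sde; cargo prefixes every test binary with this runner.
export CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/intel-sde/sde64 -cpuid-in ci/docker/x86_64-unknown-linux-gnu/cpuid.def -rtm-mode full -tsx --"
cargo test --target x86_64-unknown-linux-gnu --release
```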
diff --git a/library/stdarch/ci/dox.sh b/library/stdarch/ci/dox.sh new file mode 100755 index 0000000000000..910265fad84de --- /dev/null +++ b/library/stdarch/ci/dox.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +# Builds documentation for all target triples that we have a registered URL for +# in liblibc. This scrapes the list of triples to document from `src/lib.rs` +# which has a bunch of `html_root_url` directives we pick up. + +set -ex + +export RUSTDOCFLAGS="-D warnings" + +dox() { + if [ "$CI" != "" ]; then + rustup target add "${1}" || true + fi + + cargo clean --target "${1}" + + cargo build --verbose --target "${1}" --manifest-path crates/core_arch/Cargo.toml + cargo build --verbose --target "${1}" --manifest-path crates/std_detect/Cargo.toml + + cargo doc --verbose --target "${1}" --manifest-path crates/core_arch/Cargo.toml + cargo doc --verbose --target "${1}" --manifest-path crates/std_detect/Cargo.toml +} + +if [ -z "$1" ]; then + dox i686-unknown-linux-gnu + dox x86_64-unknown-linux-gnu + dox armv7-unknown-linux-gnueabihf + dox aarch64-unknown-linux-gnu + dox powerpc-unknown-linux-gnu + dox powerpc64le-unknown-linux-gnu + dox loongarch64-unknown-linux-gnu + # MIPS targets disabled since they are dropped to tier 3. + # See https://github.com/rust-lang/compiler-team/issues/648 + #dox mips-unknown-linux-gnu + #dox mips64-unknown-linux-gnuabi64 + dox wasm32-unknown-unknown + dox nvptx64-nvidia-cuda +else + dox "${1}" +fi diff --git a/library/stdarch/ci/run-docker.sh b/library/stdarch/ci/run-docker.sh new file mode 100755 index 0000000000000..657353004dcb5 --- /dev/null +++ b/library/stdarch/ci/run-docker.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env sh + +# Small script to run tests for a target (or all targets) inside all the +# respective docker images. + +set -ex + +if [ $# -lt 1 ]; then + >&2 echo "Usage: $0 " + exit 1 +fi + +run() { + # Set the linker that is used for the host (e.g. when compiling a build.rs) + # This overrides any configuration in e.g. `.cargo/config.toml`, which will + # probably not work within the docker container. + HOST_LINKER="CARGO_TARGET_$(rustc --print host-tuple | tr '[:lower:]-' '[:upper:]_')_LINKER" + + # Prevent `Read-only file system (os error 30)`. 
+ cargo generate-lockfile + + echo "Building docker container for TARGET=${1}" + docker build -t stdarch -f "ci/docker/${1}/Dockerfile" ci/ + mkdir -p target c_programs rust_programs + echo "Running docker" + # shellcheck disable=SC2016 + docker run \ + --rm \ + --user "$(id -u)":"$(id -g)" \ + --env CARGO_HOME=/cargo \ + --env CARGO_TARGET_DIR=/checkout/target \ + --env TARGET="${1}" \ + --env "${HOST_LINKER}"="cc" \ + --env STDARCH_TEST_EVERYTHING \ + --env STDARCH_DISABLE_ASSERT_INSTR \ + --env NOSTD \ + --env NORUN \ + --env RUSTFLAGS \ + --env CARGO_UNSTABLE_BUILD_STD \ + --env RUST_STD_DETECT_UNSTABLE \ + --volume "${HOME}/.cargo":/cargo \ + --volume "$(rustc --print sysroot)":/rust:ro \ + --volume "$(pwd)":/checkout:ro \ + --volume "$(pwd)"/target:/checkout/target \ + --volume "$(pwd)"/c_programs:/checkout/c_programs \ + --volume "$(pwd)"/rust_programs:/checkout/rust_programs \ + --init \ + --workdir /checkout \ + --privileged \ + stdarch \ + sh -c "HOME=/tmp PATH=\$PATH:/rust/bin exec ci/run.sh ${1}" +} + +if [ -z "$1" ]; then + for d in ci/docker/*; do + run "${d}" + done +else + run "${1}" +fi diff --git a/library/stdarch/ci/run.sh b/library/stdarch/ci/run.sh new file mode 100755 index 0000000000000..8eadb9285c992 --- /dev/null +++ b/library/stdarch/ci/run.sh @@ -0,0 +1,203 @@ +#!/usr/bin/env sh + +set -ex + +: "${TARGET?The TARGET environment variable must be set.}" + +# Tests are all super fast anyway, and they fault often enough on travis that +# having only one thread increases debuggability to be worth it. +#export RUST_BACKTRACE=full +#export RUST_TEST_NOCAPTURE=1 +#export RUST_TEST_THREADS=1 + +export RUSTFLAGS="${RUSTFLAGS} -D warnings -Z merge-functions=disabled -Z verify-llvm-ir" +export HOST_RUSTFLAGS="${RUSTFLAGS}" +export PROFILE="${PROFILE:="--profile=release"}" + +case ${TARGET} in + # On Windows the linker performs identical COMDAT folding (ICF) by default + # in release mode which removes identical COMDAT sections. This interferes + # with our instruction assertions just like LLVM's MergeFunctions pass so + # we disable it. + *-pc-windows-msvc) + export RUSTFLAGS="${RUSTFLAGS} -Clink-args=/OPT:NOICF" + ;; + # On 32-bit use a static relocation model which avoids some extra + # instructions when dealing with static data, notably allowing some + # instruction assertion checks to pass below the 20 instruction limit. If + # this is the default, dynamic, then too many instructions are generated + # when we assert the instruction for a function and it causes tests to fail. + i686-* | i586-*) + export RUSTFLAGS="${RUSTFLAGS} -C relocation-model=static" + ;; + # Some x86_64 targets enable by default more features beyond SSE2, + # which cause some instruction assertion checks to fail. + x86_64-*) + export RUSTFLAGS="${RUSTFLAGS} -C target-feature=-sse3" + ;; + #Unoptimized build uses fast-isel which breaks with msa + mips-* | mipsel-*) + export RUSTFLAGS="${RUSTFLAGS} -C llvm-args=-fast-isel=false" + ;; + armv7-*eabihf | thumbv7-*eabihf) + export RUSTFLAGS="${RUSTFLAGS} -Ctarget-feature=+neon" + ;; + # Some of our test dependencies use the deprecated `gcc` crates which + # doesn't detect RISC-V compilers automatically, so do it manually here. 
+ riscv*) + export RUSTFLAGS="${RUSTFLAGS} -Ctarget-feature=+zk,+zks,+zbb,+zbc" + ;; +esac + +echo "RUSTFLAGS=${RUSTFLAGS}" +echo "OBJDUMP=${OBJDUMP}" +echo "STDARCH_DISABLE_ASSERT_INSTR=${STDARCH_DISABLE_ASSERT_INSTR}" +echo "STDARCH_TEST_EVERYTHING=${STDARCH_TEST_EVERYTHING}" +echo "STDARCH_TEST_SKIP_FEATURE=${STDARCH_TEST_SKIP_FEATURE}" +echo "STDARCH_TEST_SKIP_FUNCTION=${STDARCH_TEST_SKIP_FUNCTION}" +echo "PROFILE=${PROFILE}" + +cargo_test() { + cmd="cargo" + subcmd="test" + if [ "$NORUN" = "1" ]; then + export subcmd="build" + fi + cmd="$cmd ${subcmd} --target=$TARGET $1" + cmd="$cmd -- $2" + + case ${TARGET} in + # wasm targets can't catch panics so if a test failures make sure the test + # harness isn't trying to capture output, otherwise we won't get any useful + # output. + wasm32*) + cmd="$cmd --nocapture" + ;; + esac + $cmd +} + +CORE_ARCH="--manifest-path=crates/core_arch/Cargo.toml" +STD_DETECT="--manifest-path=crates/std_detect/Cargo.toml" +STDARCH_EXAMPLES="--manifest-path=examples/Cargo.toml" +INTRINSIC_TEST="--manifest-path=crates/intrinsic-test/Cargo.toml" + +cargo_test "${CORE_ARCH} ${PROFILE}" + +if [ "$NOSTD" != "1" ]; then + cargo_test "${STD_DETECT} ${PROFILE}" + + cargo_test "${STD_DETECT} --no-default-features" + cargo_test "${STD_DETECT} --no-default-features --features=std_detect_file_io" + cargo_test "${STD_DETECT} --no-default-features --features=std_detect_dlsym_getauxval" + cargo_test "${STD_DETECT} --no-default-features --features=std_detect_dlsym_getauxval,std_detect_file_io" + + cargo_test "${STDARCH_EXAMPLES} ${PROFILE}" +fi + + +# Test targets compiled with extra features. +case ${TARGET} in + x86_64-unknown-linux-gnu) + export STDARCH_DISABLE_ASSERT_INSTR=1 + + export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx" + cargo_test "${PROFILE}" + + export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx512f" + cargo_test "${PROFILE}" + ;; + x86_64* | i686*) + export STDARCH_DISABLE_ASSERT_INSTR=1 + + export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx" + cargo_test "${PROFILE}" + ;; + # FIXME: don't build anymore + #mips-*gnu* | mipsel-*gnu*) + # export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+msa,+fp64,+mips32r5" + # cargo_test "${PROFILE}" + # ;; + mips64*) + export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+msa" + cargo_test "${PROFILE}" + ;; + s390x*) + export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+vector-enhancements-1" + cargo_test "${PROFILE}" + ;; + powerpc64*) + export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+altivec" + cargo_test "${PROFILE}" + + export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+vsx" + cargo_test "${PROFILE}" + ;; + powerpc*) + # qemu has a bug in PPC32 which leads to a crash when compiled with `vsx` + export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+altivec" + cargo_test "${PROFILE}" + ;; + + # Setup aarch64 & armv7 specific variables, the runner, along with some + # tests to skip + aarch64-unknown-linux-gnu*) + TEST_CPPFLAGS="-fuse-ld=lld -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/9/aarch64-linux-gnu/" + TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_aarch64.txt + TEST_CXX_COMPILER="clang++-19" + TEST_RUNNER="${CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER}" + ;; + + aarch64_be-unknown-linux-gnu*) + TEST_CPPFLAGS="-fuse-ld=lld" + TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_aarch64.txt + TEST_CXX_COMPILER="clang++-19" + TEST_RUNNER="${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_RUNNER}" + ;; + + armv7-unknown-linux-gnueabihf*) + TEST_CPPFLAGS="-fuse-ld=lld 
-I/usr/arm-linux-gnueabihf/include/ -I/usr/arm-linux-gnueabihf/include/c++/9/arm-linux-gnueabihf/" + TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_arm.txt + TEST_CXX_COMPILER="clang++-19" + TEST_RUNNER="${CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER}" + ;; + *) + ;; + +esac + +# Arm specific +case "${TARGET}" in + aarch64-unknown-linux-gnu*|armv7-unknown-linux-gnueabihf*) + CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=warn \ + cargo run "${INTRINSIC_TEST}" "${PROFILE}" \ + --bin intrinsic-test -- intrinsics_data/arm_intrinsics.json \ + --runner "${TEST_RUNNER}" \ + --cppcompiler "${TEST_CXX_COMPILER}" \ + --skip "${TEST_SKIP_INTRINSICS}" \ + --target "${TARGET}" + ;; + + aarch64_be-unknown-linux-gnu*) + CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=warn \ + cargo run "${INTRINSIC_TEST}" "${PROFILE}" \ + --bin intrinsic-test -- intrinsics_data/arm_intrinsics.json \ + --runner "${TEST_RUNNER}" \ + --cppcompiler "${TEST_CXX_COMPILER}" \ + --skip "${TEST_SKIP_INTRINSICS}" \ + --target "${TARGET}" \ + --linker "${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_LINKER}" \ + --cxx-toolchain-dir "${AARCH64_BE_TOOLCHAIN}" + ;; + *) + ;; +esac + +if [ "$NORUN" != "1" ] && [ "$NOSTD" != 1 ]; then + # Test examples + ( + cd examples + cargo test --target "$TARGET" "${PROFILE}" + echo test | cargo run --target "$TARGET" "${PROFILE}" hex + ) +fi diff --git a/library/stdarch/ci/style.sh b/library/stdarch/ci/style.sh new file mode 100755 index 0000000000000..8f81883f3f61a --- /dev/null +++ b/library/stdarch/ci/style.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env sh + +set -ex + +if rustup component add rustfmt-preview ; then + command -v rustfmt + rustfmt -V + cargo fmt --all -- --check +fi + +# if rustup component add clippy-preview ; then +# cargo clippy -V +# cargo clippy --all -- -D clippy::pedantic +# fi + +if shellcheck --version ; then + shellcheck -e SC2103 ci/*.sh +else + echo "shellcheck not found" + exit 1 +fi + diff --git a/library/stdarch/crates/assert-instr-macro/Cargo.toml b/library/stdarch/crates/assert-instr-macro/Cargo.toml new file mode 100644 index 0000000000000..77ee571ccbb82 --- /dev/null +++ b/library/stdarch/crates/assert-instr-macro/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "assert-instr-macro" +version = "0.1.0" +authors = ["Alex Crichton "] +edition = "2024" + +[lib] +proc-macro = true +test = false + +[dependencies] +proc-macro2 = "1.0" +quote = "1.0" +syn = { version = "2.0", features = ["full"] } + +[lints.rust] +unexpected_cfgs = {level = "warn", check-cfg = ['cfg(optimized)'] } diff --git a/library/stdarch/crates/assert-instr-macro/build.rs b/library/stdarch/crates/assert-instr-macro/build.rs new file mode 100644 index 0000000000000..360bc274213c6 --- /dev/null +++ b/library/stdarch/crates/assert-instr-macro/build.rs @@ -0,0 +1,12 @@ +use std::env; + +fn main() { + let opt_level = env::var("OPT_LEVEL") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(0); + let profile = env::var("PROFILE").unwrap_or_default(); + if profile == "release" || opt_level >= 2 { + println!("cargo:rustc-cfg=optimized"); + } +} diff --git a/library/stdarch/crates/assert-instr-macro/src/lib.rs b/library/stdarch/crates/assert-instr-macro/src/lib.rs new file mode 100644 index 0000000000000..96b86d93bbce5 --- /dev/null +++ b/library/stdarch/crates/assert-instr-macro/src/lib.rs @@ -0,0 +1,222 @@ +//! Implementation of the `#[assert_instr]` macro +//! +//! This macro is used when testing the `stdarch` crate and is used to generate +//! 
test cases to assert that functions do indeed contain the instructions that +//! we're expecting them to contain. +//! +//! The procedural macro here is relatively simple, it simply appends a +//! `#[test]` function to the original token stream which asserts that the +//! function itself contains the relevant instruction. +#![deny(rust_2018_idioms)] + +#[macro_use] +extern crate quote; + +use proc_macro2::TokenStream; +use quote::ToTokens; + +#[proc_macro_attribute] +pub fn assert_instr( + attr: proc_macro::TokenStream, + item: proc_macro::TokenStream, +) -> proc_macro::TokenStream { + let invoc = match syn::parse::(attr) { + Ok(s) => s, + Err(e) => return e.to_compile_error().into(), + }; + let item = match syn::parse::(item) { + Ok(s) => s, + Err(e) => return e.to_compile_error().into(), + }; + let func = match item { + syn::Item::Fn(ref f) => f, + _ => panic!("must be attached to a function"), + }; + + let instr = &invoc.instr; + let name = &func.sig.ident; + let maybe_allow_deprecated = if func + .attrs + .iter() + .any(|attr| attr.path().is_ident("deprecated")) + { + quote! { #[allow(deprecated)] } + } else { + quote! {} + }; + + // Disable assert_instr for x86 targets compiled with avx enabled, which + // causes LLVM to generate different intrinsics that the ones we are + // testing for. + let disable_assert_instr = std::env::var("STDARCH_DISABLE_ASSERT_INSTR").is_ok(); + + // If instruction tests are disabled avoid emitting this shim at all, just + // return the original item without our attribute. + if !cfg!(optimized) || disable_assert_instr { + return (quote! { #item }).into(); + } + + let instr_str = instr + .replace(['.', '/', ':'], "_") + .replace(char::is_whitespace, ""); + let assert_name = syn::Ident::new(&format!("assert_{name}_{instr_str}"), name.span()); + // These name has to be unique enough for us to find it in the disassembly later on: + let shim_name = syn::Ident::new( + &format!("stdarch_test_shim_{name}_{instr_str}"), + name.span(), + ); + let mut inputs = Vec::new(); + let mut input_vals = Vec::new(); + let mut const_vals = Vec::new(); + let ret = &func.sig.output; + for arg in func.sig.inputs.iter() { + let capture = match *arg { + syn::FnArg::Typed(ref c) => c, + ref v => panic!( + "arguments must not have patterns: `{:?}`", + v.clone().into_token_stream() + ), + }; + let ident = match *capture.pat { + syn::Pat::Ident(ref i) => &i.ident, + _ => panic!("must have bare arguments"), + }; + if let Some((_, tokens)) = invoc.args.iter().find(|a| *ident == a.0) { + input_vals.push(quote! { #tokens }); + } else { + inputs.push(capture); + input_vals.push(quote! { #ident }); + } + } + for arg in func.sig.generics.params.iter() { + let c = match *arg { + syn::GenericParam::Const(ref c) => c, + ref v => panic!( + "only const generics are allowed: `{:?}`", + v.clone().into_token_stream() + ), + }; + if let Some((_, tokens)) = invoc.args.iter().find(|a| c.ident == a.0) { + const_vals.push(quote! { #tokens }); + } else { + panic!("const generics must have a value for tests"); + } + } + + let attrs = func + .attrs + .iter() + .filter(|attr| { + attr.path() + .segments + .first() + .expect("attr.path.segments.first() failed") + .ident + .to_string() + .starts_with("target") + }) + .collect::>(); + let attrs = Append(&attrs); + + // Use an ABI on Windows that passes SIMD values in registers, like what + // happens on Unix (I think?) by default. 
+ let abi = if cfg!(windows) { + let target = std::env::var("TARGET").unwrap(); + if target.contains("x86_64") { + syn::LitStr::new("sysv64", proc_macro2::Span::call_site()) + } else { + syn::LitStr::new("vectorcall", proc_macro2::Span::call_site()) + } + } else { + syn::LitStr::new("C", proc_macro2::Span::call_site()) + }; + let to_test = quote! { + #attrs + #maybe_allow_deprecated + #[unsafe(no_mangle)] + #[inline(never)] + pub unsafe extern #abi fn #shim_name(#(#inputs),*) #ret { + #name::<#(#const_vals),*>(#(#input_vals),*) + } + }; + + let tokens: TokenStream = quote! { + #[test] + #[allow(non_snake_case)] + fn #assert_name() { + #to_test + + ::stdarch_test::assert(#shim_name as usize, stringify!(#shim_name), #instr); + } + }; + + let tokens: TokenStream = quote! { + #item + #tokens + }; + tokens.into() +} + +struct Invoc { + instr: String, + args: Vec<(syn::Ident, syn::Expr)>, +} + +impl syn::parse::Parse for Invoc { + fn parse(input: syn::parse::ParseStream<'_>) -> syn::Result { + use syn::{Token, ext::IdentExt}; + + let mut instr = String::new(); + while !input.is_empty() { + if input.parse::().is_ok() { + break; + } + if let Ok(ident) = syn::Ident::parse_any(input) { + instr.push_str(&ident.to_string()); + continue; + } + if input.parse::().is_ok() { + instr.push('.'); + continue; + } + if let Ok(s) = input.parse::() { + instr.push_str(&s.value()); + continue; + } + println!("{:?}", input.cursor().token_stream()); + return Err(input.error("expected an instruction")); + } + if instr.is_empty() { + return Err(input.error("expected an instruction before comma")); + } + let mut args = Vec::new(); + while !input.is_empty() { + let name = input.parse::()?; + input.parse::()?; + let expr = input.parse::()?; + args.push((name, expr)); + + if input.parse::().is_err() { + if !input.is_empty() { + return Err(input.error("extra tokens at end")); + } + break; + } + } + Ok(Self { instr, args }) + } +} + +struct Append(T); + +impl quote::ToTokens for Append +where + T: Clone + IntoIterator, + T::Item: quote::ToTokens, +{ + fn to_tokens(&self, tokens: &mut proc_macro2::TokenStream) { + for item in self.0.clone() { + item.to_tokens(tokens); + } + } +} diff --git a/library/stdarch/crates/core_arch/Cargo.toml b/library/stdarch/crates/core_arch/Cargo.toml new file mode 100644 index 0000000000000..f4bd5fc552afe --- /dev/null +++ b/library/stdarch/crates/core_arch/Cargo.toml @@ -0,0 +1,33 @@ +[package] +name = "core_arch" +version = "0.1.5" +authors = [ + "Alex Crichton ", + "Andrew Gallant ", + "Gonzalo Brito Gadeschi ", +] +description = "`core::arch` - Rust's core library architecture-specific intrinsics." 
+homepage = "https://github.com/rust-lang/stdarch" +repository = "https://github.com/rust-lang/stdarch" +readme = "README.md" +keywords = ["core", "simd", "arch", "intrinsics"] +categories = ["hardware-support", "no-std"] +license = "MIT OR Apache-2.0" +edition = "2024" + +[badges] +is-it-maintained-issue-resolution = { repository = "rust-lang/stdarch" } +is-it-maintained-open-issues = { repository = "rust-lang/stdarch" } +maintenance = { status = "experimental" } + +[dev-dependencies] +stdarch-test = { version = "0.*", path = "../stdarch-test" } +std_detect = { version = "0.*", path = "../std_detect" } + +[target.'cfg(all(target_arch = "x86_64", target_os = "linux"))'.dev-dependencies] +syscalls = { version = "0.6.18", default-features = false } + +[lints.clippy] +too_long_first_doc_paragraph = "allow" +missing_transmute_annotations = "allow" +useless_transmute = "allow" diff --git a/library/stdarch/crates/core_arch/LICENSE-APACHE b/library/stdarch/crates/core_arch/LICENSE-APACHE new file mode 100644 index 0000000000000..16fe87b06e802 --- /dev/null +++ b/library/stdarch/crates/core_arch/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/library/stdarch/crates/core_arch/LICENSE-MIT b/library/stdarch/crates/core_arch/LICENSE-MIT new file mode 100644 index 0000000000000..52d82415d8b60 --- /dev/null +++ b/library/stdarch/crates/core_arch/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2017 The Rust Project Developers + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/library/stdarch/crates/core_arch/MISSING.md b/library/stdarch/crates/core_arch/MISSING.md new file mode 100644 index 0000000000000..c948f3f8c90e1 --- /dev/null +++ b/library/stdarch/crates/core_arch/MISSING.md @@ -0,0 +1,116 @@ +## The following neon instructions are currently not implemented in stdarch + +### Not implemented on arm: + +`vcadd_rot270_f32` + +`vcadd_rot90_f32` + +`vcaddq_rot270_f32` + +`vcaddq_rot90_f32` + +`vdot_s32` + +`vdot_u32` + +`vdotq_s32` + +`vdotq_u32` + +`vdot_lane_s32` + +`vdot_lane_u32` + +`vdotq_lane_s32` + +`vdotq_lane_u32` + +`vcmla_f32` + +`vcmla_lane_f32` + +`vcmla_laneq_f32` + +`vcmla_rot180_f32` + +`vcmla_rot180_lane_f32` + +`vcmla_rot180_laneq_f32` + +`vcmla_rot270_f32` + +`vcmla_rot270_lane_f32` + +`vcmla_rot270_laneq_f32` + +`vcmla_rot90_f32` + +`vcmla_rot90_lane_f32` + +`vcmla_rot90_laneq_f32` + +`vcmlaq_f32` + +`vcmlaq_lane_f32` + +`vcmlaq_laneq_f32` + +`vcmlaq_rot180_f32` + +`vcmlaq_rot180_lane_f32` + +`vcmlaq_rot180_laneq_f32` + +`vcmlaq_rot270_f32` + +`vcmlaq_rot270_lane_f32` + +`vcmlaq_rot270_laneq_f32` + +`vcmlaq_rot90_f32` + +`vcmlaq_rot90_lane_f32` + +`vcmlaq_rot90_laneq_f32` + +### Not implemented in LLVM: + +`vrnd32x_f64` + +`vrnd32xq_f64` + +`vrnd32z_f64` + +`vrnd32zq_f64` + +`vrnd64x_f64` + +`vrnd64xq_f64` + +`vrnd64z_f64` + +`vrnd64zq_f64` + +### LLVM Select errors may occur: + +`vsudot_lane_s32` + +`vsudot_laneq_s32` + +`vsudotq_lane_s32` + +`vsudotq_laneq_s32` + +`vusdot_lane_s32` + +`vusdot_laneq_s32` + +`vusdot_s32` + +`vusdotq_lane_s32` + +`vusdotq_laneq_s32` + +`vusdotq_s32v` + diff --git a/library/stdarch/crates/core_arch/README.md b/library/stdarch/crates/core_arch/README.md new file mode 100644 index 0000000000000..fc18a5759dbe4 --- /dev/null +++ b/library/stdarch/crates/core_arch/README.md @@ -0,0 +1,58 @@ +`core::arch` - Rust's core library architecture-specific intrinsics +======= + +The `core::arch` module implements architecture-dependent intrinsics (e.g. SIMD). + +# Usage + +`core::arch` is available as part of `libcore` and it is re-exported by +`libstd`. Prefer using it via `core::arch` or `std::arch` than via this crate. + +Using `core::arch` via this crate requires nightly Rust, and it can (and does) +break often. The only cases in which you should consider using it via this crate +are: + +* if you need to re-compile `core::arch` yourself, e.g., with particular + target-features enabled that are not enabled for `libcore`/`libstd`. Note: if + you need to re-compile it for a non-standard target, please prefer using + `xargo` and re-compiling `libcore`/`libstd` as appropriate instead of using + this crate. + +* using some features that might not be available even behind unstable Rust + features. We try to keep these to a minimum. If you need to use some of these + features, please open an issue so that we can expose them in nightly Rust and + you can use them from there. 
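+For the common case, the sketch below (illustrative only, not part of the
+upstream text) shows the usual `std::arch` route on stable Rust: detect the
+CPU feature at runtime and keep the intrinsic calls inside a
+`#[target_feature]` function. The function names `add_lanes` and
+`add_lanes_avx2` are invented for the example; the intrinsics are the
+standard AVX2 ones from `std::arch::x86_64`.
+
+```rust
+fn add_lanes(a: [i32; 8], b: [i32; 8]) -> [i32; 8] {
+    #[cfg(target_arch = "x86_64")]
+    {
+        if is_x86_feature_detected!("avx2") {
+            // SAFETY: the `avx2` feature was just detected at runtime.
+            return unsafe { add_lanes_avx2(a, b) };
+        }
+    }
+    // Portable fallback for other targets or older CPUs.
+    let mut out = [0i32; 8];
+    for i in 0..8 {
+        out[i] = a[i] + b[i];
+    }
+    out
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+unsafe fn add_lanes_avx2(a: [i32; 8], b: [i32; 8]) -> [i32; 8] {
+    use std::arch::x86_64::*;
+    unsafe {
+        // Unaligned loads/stores let us work directly on the arrays.
+        let va = _mm256_loadu_si256(a.as_ptr() as *const __m256i);
+        let vb = _mm256_loadu_si256(b.as_ptr() as *const __m256i);
+        let mut out = [0i32; 8];
+        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, _mm256_add_epi32(va, vb));
+        out
+    }
+}
+```
+
+The same shape carries over to the other architecture modules with their
+respective runtime-detection macros (for example `is_aarch64_feature_detected!`).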
+ +# Documentation + +* [Documentation - i686][i686] +* [Documentation - x86\_64][x86_64] +* [Documentation - arm][arm] +* [Documentation - aarch64][aarch64] +* [Documentation - powerpc][powerpc] +* [Documentation - powerpc64][powerpc64] +* [How to get started][contrib] +* [How to help implement intrinsics][help-implement] + +[contrib]: https://github.com/rust-lang/stdarch/blob/master/CONTRIBUTING.md +[help-implement]: https://github.com/rust-lang/stdarch/issues/40 +[i686]: https://rust-lang.github.io/stdarch/i686/core_arch/ +[x86_64]: https://rust-lang.github.io/stdarch/x86_64/core_arch/ +[arm]: https://rust-lang.github.io/stdarch/arm/core_arch/ +[aarch64]: https://rust-lang.github.io/stdarch/aarch64/core_arch/ +[powerpc]: https://rust-lang.github.io/stdarch/powerpc/core_arch/ +[powerpc64]: https://rust-lang.github.io/stdarch/powerpc64/core_arch/ + +# License + +`core_arch` is primarily distributed under the terms of both the MIT license and +the Apache License (Version 2.0), with portions covered by various BSD-like +licenses. + +See LICENSE-APACHE, and LICENSE-MIT for details. + +# Contribution + +Unless you explicitly state otherwise, any contribution intentionally submitted +for inclusion in `core_arch` by you, as defined in the Apache-2.0 license, +shall be dual licensed as above, without any additional terms or conditions. diff --git a/library/stdarch/crates/core_arch/missing-x86.md b/library/stdarch/crates/core_arch/missing-x86.md new file mode 100644 index 0000000000000..640ec7d0fe7d1 --- /dev/null +++ b/library/stdarch/crates/core_arch/missing-x86.md @@ -0,0 +1,258 @@ + +
["AMX-BF16"]

+ + * [ ] [`__tile_dpbf16ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__tile_dpbf16ps) +

+ + +
["AMX-COMPLEX"]

+ + * [ ] [`__tile_cmmimfp16ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__tile_cmmimfp16ps) + * [ ] [`__tile_cmmrlfp16ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__tile_cmmrlfp16ps) +

+ + +
["AMX-FP16"]

+ + * [ ] [`__tile_dpfp16ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__tile_dpfp16ps) +

+ + +
["AMX-INT8"]

+ + * [ ] [`__tile_dpbssd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__tile_dpbssd) + * [ ] [`__tile_dpbsud`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__tile_dpbsud) + * [ ] [`__tile_dpbusd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__tile_dpbusd) + * [ ] [`__tile_dpbuud`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__tile_dpbuud) +

+ + +
["AMX-TILE"]

+ + * [ ] [`__tile_loadd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__tile_loadd) + * [ ] [`__tile_stored`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__tile_stored) + * [ ] [`__tile_stream_loadd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__tile_stream_loadd) + * [ ] [`__tile_zero`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__tile_zero) +

+ + +
["AVX512_FP16"]

+ + * [ ] [`_mm256_set1_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_pch) + * [ ] [`_mm512_set1_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_pch) + * [ ] [`_mm_set1_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pch) +

+ + +
["AVX512_VP2INTERSECT", "AVX512F"]

+ + * [ ] [`_mm512_2intersect_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_2intersect_epi32) + * [ ] [`_mm512_2intersect_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_2intersect_epi64) +

+ + +
["AVX512_VP2INTERSECT", "AVX512VL"]

+ + * [ ] [`_mm256_2intersect_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_2intersect_epi32) + * [ ] [`_mm256_2intersect_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_2intersect_epi64) + * [ ] [`_mm_2intersect_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_2intersect_epi32) + * [ ] [`_mm_2intersect_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_2intersect_epi64) +

+ + +
["CET_SS"]

+ + * [ ] [`_clrssbsy`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_clrssbsy) + * [ ] [`_get_ssp`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_get_ssp) + * [ ] [`_get_ssp`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_get_ssp) + * [ ] [`_inc_ssp`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_inc_ssp) + * [ ] [`_incsspd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_incsspd) + * [ ] [`_incsspq`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_incsspq) + * [ ] [`_rdsspd_i32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdsspd_i32) + * [ ] [`_rdsspq_i64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdsspq_i64) + * [ ] [`_rstorssp`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rstorssp) + * [ ] [`_saveprevssp`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_saveprevssp) + * [ ] [`_setssbsy`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_setssbsy) + * [ ] [`_wrssd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_wrssd) + * [ ] [`_wrssq`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_wrssq) + * [ ] [`_wrussd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_wrussd) + * [ ] [`_wrussq`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_wrussq) +

+ + +
["CLDEMOTE"]

+ + * [ ] [`_mm_cldemote`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cldemote) +

+ + +
["CLFLUSHOPT"]

+ + * [ ] [`_mm_clflushopt`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clflushopt) +

+ + +
["CLWB"]

+ + * [ ] [`_mm_clwb`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clwb) +

+ + +
["CMPCCXADD"]

+ + * [ ] [`_cmpccxadd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cmpccxadd_epi32) + * [ ] [`_cmpccxadd_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cmpccxadd_epi64) +

+ + +
["ENQCMD"]

+ + * [ ] [`_enqcmd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_enqcmd) + * [ ] [`_enqcmds`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_enqcmds) +

+ + +
["FSGSBASE"]

+ + * [ ] [`_readfsbase_u32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_readfsbase_u32) + * [ ] [`_readfsbase_u64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_readfsbase_u64) + * [ ] [`_readgsbase_u32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_readgsbase_u32) + * [ ] [`_readgsbase_u64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_readgsbase_u64) + * [ ] [`_writefsbase_u32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_writefsbase_u32) + * [ ] [`_writefsbase_u64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_writefsbase_u64) + * [ ] [`_writegsbase_u32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_writegsbase_u32) + * [ ] [`_writegsbase_u64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_writegsbase_u64) +

+ + +
["HRESET"]

+ + * [ ] [`_hreset`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_hreset) +

+ + +
["INVPCID"]

+ + * [ ] [`_invpcid`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_invpcid) +

+ + +
["MONITOR"]

+ + * [ ] [`_mm_monitor`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_monitor) + * [ ] [`_mm_mwait`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mwait) +

+ + +
["MOVBE"]

+ + * [ ] [`_loadbe_i16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_loadbe_i16) + * [ ] [`_loadbe_i32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_loadbe_i32) + * [ ] [`_loadbe_i64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_loadbe_i64) + * [ ] [`_storebe_i16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_storebe_i16) + * [ ] [`_storebe_i32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_storebe_i32) + * [ ] [`_storebe_i64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_storebe_i64) +

+ + +
["MOVDIR64B"]

+ + * [ ] [`_movdir64b`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_movdir64b) +

+ + +
["MOVDIRI"]

+ + * [ ] [`_directstoreu_u32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_directstoreu_u32) + * [ ] [`_directstoreu_u64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_directstoreu_u64) +

+ + +
["PCONFIG"]

+ + * [ ] [`_pconfig_u32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pconfig_u32) +

+ + +
["POPCNT"]

+ + * [ ] [`_mm_popcnt_u32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32) + * [ ] [`_mm_popcnt_u64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64) +

+ + +
["PREFETCHI"]

+ + * [ ] [`_m_prefetchit0`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_prefetchit0) + * [ ] [`_m_prefetchit1`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_prefetchit1) +

+ + +
["RAO_INT"]

+ + * [ ] [`_aadd_i32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_aadd_i32) + * [ ] [`_aadd_i64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_aadd_i64) + * [ ] [`_aand_i32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_aand_i32) + * [ ] [`_aand_i64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_aand_i64) + * [ ] [`_aor_i32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_aor_i32) + * [ ] [`_aor_i64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_aor_i64) + * [ ] [`_axor_i32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_axor_i32) + * [ ] [`_axor_i64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_axor_i64) +

+ + +
["RDPID"]

+ + * [ ] [`_rdpid_u32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdpid_u32) +

+ + +
["SERIALIZE"]

+ + * [ ] [`_serialize`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_serialize) +

+ + +
["SSE"]

+ + * [ ] [`_mm_free`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free) + * [ ] [`_mm_malloc`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_malloc) +

+ + +
["TSXLDTRK"]

+ + * [ ] [`_xresldtrk`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xresldtrk) + * [ ] [`_xsusldtrk`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsusldtrk) +

+ + +
["UINTR"]

+ + * [ ] [`_clui`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_clui) + * [ ] [`_senduipi`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_senduipi) + * [ ] [`_stui`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_stui) + * [ ] [`_testui`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_testui) +

+ + +
["USER_MSR"]

+ + * [ ] [`_urdmsr`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_urdmsr) + * [ ] [`_uwrmsr`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_uwrmsr) +

+ + +
["WAITPKG"]

+ + * [ ] [`_tpause`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_tpause) + * [ ] [`_umonitor`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_umonitor) + * [ ] [`_umwait`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_umwait) +

+ + +
["WBNOINVD"]

+ + * [ ] [`_wbnoinvd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_wbnoinvd) +

+ diff --git a/library/stdarch/crates/core_arch/rustfmt.toml b/library/stdarch/crates/core_arch/rustfmt.toml new file mode 100644 index 0000000000000..4ae742ba8d501 --- /dev/null +++ b/library/stdarch/crates/core_arch/rustfmt.toml @@ -0,0 +1,3 @@ +ignore = [ + "src/simd.rs", +] diff --git a/library/stdarch/crates/core_arch/src/aarch64/mod.rs b/library/stdarch/crates/core_arch/src/aarch64/mod.rs new file mode 100644 index 0000000000000..f4b9b1c30251e --- /dev/null +++ b/library/stdarch/crates/core_arch/src/aarch64/mod.rs @@ -0,0 +1,39 @@ +//! AArch64 intrinsics. +//! +//! The reference for NEON is [Arm's NEON Intrinsics Reference][arm_ref]. The +//! [Arm's NEON Intrinsics Online Database][arm_dat] is also useful. +//! +//! [arm_ref]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf +//! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics + +#![cfg_attr( + all(target_arch = "aarch64", target_abi = "softfloat"), + // Just allow the warning: anyone soundly using the intrinsics has to enable + // the target feature, and that will generate a warning for them. + allow(aarch64_softfloat_neon) +)] + +mod mte; +#[unstable(feature = "stdarch_aarch64_mte", issue = "129010")] +pub use self::mte::*; + +mod neon; +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub use self::neon::*; + +mod tme; +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub use self::tme::*; + +mod prefetch; +#[unstable(feature = "stdarch_aarch64_prefetch", issue = "117217")] +pub use self::prefetch::*; + +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub use super::arm_shared::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[cfg(test)] +pub(crate) mod test_support; diff --git a/library/stdarch/crates/core_arch/src/aarch64/mte.rs b/library/stdarch/crates/core_arch/src/aarch64/mte.rs new file mode 100644 index 0000000000000..c400f774bcce0 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/aarch64/mte.rs @@ -0,0 +1,171 @@ +//! AArch64 Memory tagging intrinsics +//! +//! [ACLE documentation](https://arm-software.github.io/acle/main/acle.html#markdown-toc-mte-intrinsics) + +unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.irg" + )] + fn irg_(ptr: *const (), exclude: i64) -> *const (); + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.gmi" + )] + fn gmi_(ptr: *const (), exclude: i64) -> i64; + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.ldg" + )] + fn ldg_(ptr: *const (), tag_ptr: *const ()) -> *const (); + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.stg" + )] + fn stg_(tagged_ptr: *const (), addr_to_tag: *const ()); + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.addg" + )] + fn addg_(ptr: *const (), value: i64) -> *const (); + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.subp" + )] + fn subp_(ptr_a: *const (), ptr_b: *const ()) -> i64; +} + +/// Return a pointer containing a randomly generated logical address tag. +/// +/// `src`: A pointer containing an address. +/// `mask`: A mask where each of the lower 16 bits specifies logical +/// tags which must be excluded from consideration. Zero excludes no +/// tags. 
+/// +/// The returned pointer contains a copy of the `src` address, but with a +/// randomly generated logical tag, excluding any specified by `mask`. +/// +/// SAFETY: The pointer provided by this intrinsic will be invalid until the memory +/// has been appropriately tagged with `__arm_mte_set_tag`. If using that intrinsic +/// on the provided pointer is itself invalid, then it will be permanently invalid +/// and Undefined Behavior to dereference it. +#[inline] +#[target_feature(enable = "mte")] +#[unstable(feature = "stdarch_aarch64_mte", issue = "129010")] +pub unsafe fn __arm_mte_create_random_tag(src: *const T, mask: u64) -> *const T { + irg_(src as *const (), mask as i64) as *const T +} + +/// Return a pointer with the logical address tag offset by a value. +/// +/// `src`: A pointer containing an address and a logical tag. +/// `OFFSET`: A compile-time constant value in the range [0, 15]. +/// +/// Adds offset to the logical address tag in `src`, wrapping if the result is +/// outside of the valid 16 tags. +/// +/// SAFETY: See `__arm_mte_create_random_tag`. +#[inline] +#[target_feature(enable = "mte")] +#[unstable(feature = "stdarch_aarch64_mte", issue = "129010")] +pub unsafe fn __arm_mte_increment_tag(src: *const T) -> *const T { + addg_(src as *const (), OFFSET) as *const T +} + +/// Add a logical tag to the set of excluded logical tags. +/// +/// `src`: A pointer containing an address and a logical tag. +/// `excluded`: A mask where the lower 16 bits each specify currently-excluded +/// logical tags. +/// +/// Adds the logical tag stored in `src` to the set in `excluded`, and returns +/// the result. +#[inline] +#[target_feature(enable = "mte")] +#[unstable(feature = "stdarch_aarch64_mte", issue = "129010")] +pub unsafe fn __arm_mte_exclude_tag(src: *const T, excluded: u64) -> u64 { + gmi_(src as *const (), excluded as i64) as u64 +} + +/// Store an allocation tag for the 16-byte granule of memory. +/// +/// `tag_address`: A pointer containing an address and a logical tag, which +/// must be 16-byte aligned. +/// +/// SAFETY: `tag_address` must be 16-byte aligned. The tag will apply to the +/// entire 16-byte memory granule. +#[inline] +#[target_feature(enable = "mte")] +#[unstable(feature = "stdarch_aarch64_mte", issue = "129010")] +pub unsafe fn __arm_mte_set_tag(tag_address: *const T) { + stg_(tag_address as *const (), tag_address as *const ()); +} + +/// Load an allocation tag from memory, returning a new pointer with the +/// corresponding logical tag. +/// +/// `address`: A pointer containing an address from which allocation tag memory +/// is read. This does not need to be 16-byte aligned. +#[inline] +#[target_feature(enable = "mte")] +#[unstable(feature = "stdarch_aarch64_mte", issue = "129010")] +pub unsafe fn __arm_mte_get_tag(address: *const T) -> *const T { + ldg_(address as *const (), address as *const ()) as *const T +} + +/// Calculate the difference between the address parts of two pointers, ignoring +/// the tags, and sign-extending the result. 
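+///
+/// # Example
+///
+/// An illustrative sketch (not taken from the ACLE documentation): tag a
+/// 16-byte granule and observe that tagging leaves the address part of the
+/// pointer unchanged. `ptr` is assumed to point to 16-byte-aligned memory in
+/// an MTE-enabled allocation.
+///
+/// ```ignore
+/// let tagged = __arm_mte_create_random_tag(ptr, 0);
+/// __arm_mte_set_tag(tagged);
+/// // The logical tags may differ, but the address parts are identical,
+/// // so the difference computed by `__arm_mte_ptrdiff` is zero.
+/// assert_eq!(__arm_mte_ptrdiff(tagged, ptr), 0);
+/// ```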
+#[inline] +#[target_feature(enable = "mte")] +#[unstable(feature = "stdarch_aarch64_mte", issue = "129010")] +pub unsafe fn __arm_mte_ptrdiff(a: *const T, b: *const U) -> i64 { + subp_(a as *const (), b as *const ()) +} + +#[cfg(test)] +mod test { + use super::*; + use stdarch_test::assert_instr; + + #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(irg))] // FIXME: MSVC `dumpbin` doesn't support MTE + #[allow(dead_code)] + #[target_feature(enable = "mte")] + unsafe fn test_arm_mte_create_random_tag(src: *const (), mask: u64) -> *const () { + __arm_mte_create_random_tag(src, mask) + } + + #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(addg))] + #[allow(dead_code)] + #[target_feature(enable = "mte")] + unsafe fn test_arm_mte_increment_tag(src: *const ()) -> *const () { + __arm_mte_increment_tag::<1, _>(src) + } + + #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(gmi))] + #[allow(dead_code)] + #[target_feature(enable = "mte")] + unsafe fn test_arm_mte_exclude_tag(src: *const (), excluded: u64) -> u64 { + __arm_mte_exclude_tag(src, excluded) + } + + #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(stg))] + #[allow(dead_code)] + #[target_feature(enable = "mte")] + unsafe fn test_arm_mte_set_tag(src: *const ()) { + __arm_mte_set_tag(src) + } + + #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(ldg))] + #[allow(dead_code)] + #[target_feature(enable = "mte")] + unsafe fn test_arm_mte_get_tag(src: *const ()) -> *const () { + __arm_mte_get_tag(src) + } + + #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(subp))] + #[allow(dead_code)] + #[target_feature(enable = "mte")] + unsafe fn test_arm_mte_ptrdiff(a: *const (), b: *const ()) -> i64 { + __arm_mte_ptrdiff(a, b) + } +} diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs new file mode 100644 index 0000000000000..96ed82021b4b2 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs @@ -0,0 +1,29470 @@ +// This code is automatically generated. DO NOT MODIFY. 
+// +// Instead, modify `crates/stdarch-gen-arm/spec/` and run the following command to re-generate this file: +// +// ``` +// cargo run --bin=stdarch-gen-arm -- crates/stdarch-gen-arm/spec +// ``` +#![allow(improper_ctypes)] + +#[cfg(test)] +use stdarch_test::assert_instr; + +use super::*; + +#[doc = "CRC32-C single round checksum for quad words (64 bits)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32cd)"] +#[inline] +#[target_feature(enable = "crc")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(crc32cx))] +#[stable(feature = "stdarch_aarch64_crc32", since = "1.80.0")] +pub fn __crc32cd(crc: u32, data: u64) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crc32cx" + )] + fn ___crc32cd(crc: u32, data: u64) -> u32; + } + unsafe { ___crc32cd(crc, data) } +} +#[doc = "CRC32 single round checksum for quad words (64 bits)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32d)"] +#[inline] +#[target_feature(enable = "crc")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(crc32x))] +#[stable(feature = "stdarch_aarch64_crc32", since = "1.80.0")] +pub fn __crc32d(crc: u32, data: u64) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crc32x" + )] + fn ___crc32d(crc: u32, data: u64) -> u32; + } + unsafe { ___crc32d(crc, data) } +} +#[doc = "Signed Absolute difference and Accumulate Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabal_high_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(sabal))] +pub fn vabal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t { + unsafe { + let d: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let e: int8x8_t = simd_shuffle!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); + let f: int8x8_t = vabd_s8(d, e); + let f: uint8x8_t = simd_cast(f); + simd_add(a, simd_cast(f)) + } +} +#[doc = "Signed Absolute difference and Accumulate Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabal_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(sabal))] +pub fn vabal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t { + unsafe { + let d: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + let e: int16x4_t = simd_shuffle!(c, c, [4, 5, 6, 7]); + let f: int16x4_t = vabd_s16(d, e); + let f: uint16x4_t = simd_cast(f); + simd_add(a, simd_cast(f)) + } +} +#[doc = "Signed Absolute difference and Accumulate Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabal_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(sabal))] +pub fn vabal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t { + unsafe { + let d: int32x2_t = simd_shuffle!(b, b, [2, 3]); + let e: int32x2_t = simd_shuffle!(c, c, [2, 3]); + let f: int32x2_t = vabd_s32(d, e); + let f: uint32x2_t = simd_cast(f); + 
simd_add(a, simd_cast(f)) + } +} +#[doc = "Unsigned Absolute difference and Accumulate Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabal_high_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uabal))] +pub fn vabal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t { + unsafe { + let d: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let e: uint8x8_t = simd_shuffle!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); + let f: uint8x8_t = vabd_u8(d, e); + simd_add(a, simd_cast(f)) + } +} +#[doc = "Unsigned Absolute difference and Accumulate Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabal_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uabal))] +pub fn vabal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t { + unsafe { + let d: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + let e: uint16x4_t = simd_shuffle!(c, c, [4, 5, 6, 7]); + let f: uint16x4_t = vabd_u16(d, e); + simd_add(a, simd_cast(f)) + } +} +#[doc = "Unsigned Absolute difference and Accumulate Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabal_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uabal))] +pub fn vabal_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t { + unsafe { + let d: uint32x2_t = simd_shuffle!(b, b, [2, 3]); + let e: uint32x2_t = simd_shuffle!(c, c, [2, 3]); + let f: uint32x2_t = vabd_u32(d, e); + simd_add(a, simd_cast(f)) + } +} +#[doc = "Absolute difference between the arguments of Floating"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fabd))] +pub fn vabd_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fabd.v1f64" + )] + fn _vabd_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t; + } + unsafe { _vabd_f64(a, b) } +} +#[doc = "Absolute difference between the arguments of Floating"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fabd))] +pub fn vabdq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fabd.v2f64" + )] + fn _vabdq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vabdq_f64(a, b) } +} +#[doc = "Floating-point absolute difference"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fabd))] +pub 
fn vabdd_f64(a: f64, b: f64) -> f64 { + unsafe { simd_extract!(vabd_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } +} +#[doc = "Floating-point absolute difference"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabds_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fabd))] +pub fn vabds_f32(a: f32, b: f32) -> f32 { + unsafe { simd_extract!(vabd_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } +} +#[doc = "Floating-point absolute difference"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fabd))] +pub fn vabdh_f16(a: f16, b: f16) -> f16 { + unsafe { simd_extract!(vabd_f16(vdup_n_f16(a), vdup_n_f16(b)), 0) } +} +#[doc = "Signed Absolute difference Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sabdl))] +pub fn vabdl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { + unsafe { + let c: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); + let d: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + let e: uint16x4_t = simd_cast(vabd_s16(c, d)); + simd_cast(e) + } +} +#[doc = "Signed Absolute difference Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sabdl))] +pub fn vabdl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { + unsafe { + let c: int32x2_t = simd_shuffle!(a, a, [2, 3]); + let d: int32x2_t = simd_shuffle!(b, b, [2, 3]); + let e: uint32x2_t = simd_cast(vabd_s32(c, d)); + simd_cast(e) + } +} +#[doc = "Signed Absolute difference Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sabdl))] +pub fn vabdl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { + unsafe { + let c: int8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let d: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let e: uint8x8_t = simd_cast(vabd_s8(c, d)); + simd_cast(e) + } +} +#[doc = "Unsigned Absolute difference Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uabdl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vabdl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { + unsafe { + let c: uint8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let d: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + simd_cast(vabd_u8(c, d)) + } +} +#[doc = "Unsigned Absolute difference Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uabdl))] +#[stable(feature = "neon_intrinsics", since = 
"1.59.0")] +pub fn vabdl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { + unsafe { + let c: uint16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); + let d: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + simd_cast(vabd_u16(c, d)) + } +} +#[doc = "Unsigned Absolute difference Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uabdl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vabdl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { + unsafe { + let c: uint32x2_t = simd_shuffle!(a, a, [2, 3]); + let d: uint32x2_t = simd_shuffle!(b, b, [2, 3]); + simd_cast(vabd_u32(c, d)) + } +} +#[doc = "Floating-point absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabs_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fabs))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vabs_f64(a: float64x1_t) -> float64x1_t { + unsafe { simd_fabs(a) } +} +#[doc = "Floating-point absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabsq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fabs))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vabsq_f64(a: float64x2_t) -> float64x2_t { + unsafe { simd_fabs(a) } +} +#[doc = "Absolute Value (wrapping)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabs_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(abs))] +pub fn vabs_s64(a: int64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.abs.v1i64" + )] + fn _vabs_s64(a: int64x1_t) -> int64x1_t; + } + unsafe { _vabs_s64(a) } +} +#[doc = "Absolute Value (wrapping)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabsd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(abs))] +pub fn vabsd_s64(a: i64) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.abs.i64" + )] + fn _vabsd_s64(a: i64) -> i64; + } + unsafe { _vabsd_s64(a) } +} +#[doc = "Absolute Value (wrapping)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabsq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(abs))] +pub fn vabsq_s64(a: int64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.abs.v2i64" + )] + fn _vabsq_s64(a: int64x2_t) -> int64x2_t; + } + unsafe { _vabsq_s64(a) } +} +#[doc = "Add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vaddd_s64(a: i64, b: i64) -> i64 { + a.wrapping_add(b) +} +#[doc = 
"Add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddd_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vaddd_u64(a: u64, b: u64) -> u64 { + a.wrapping_add(b) +} +#[doc = "Signed Add Long across Vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddlv_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(saddlv))] +pub fn vaddlv_s16(a: int16x4_t) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddlv.i32.v4i16" + )] + fn _vaddlv_s16(a: int16x4_t) -> i32; + } + unsafe { _vaddlv_s16(a) } +} +#[doc = "Signed Add Long across Vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddlvq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(saddlv))] +pub fn vaddlvq_s16(a: int16x8_t) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddlv.i32.v8i16" + )] + fn _vaddlvq_s16(a: int16x8_t) -> i32; + } + unsafe { _vaddlvq_s16(a) } +} +#[doc = "Signed Add Long across Vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddlvq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(saddlv))] +pub fn vaddlvq_s32(a: int32x4_t) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddlv.i64.v4i32" + )] + fn _vaddlvq_s32(a: int32x4_t) -> i64; + } + unsafe { _vaddlvq_s32(a) } +} +#[doc = "Signed Add Long across Vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddlv_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(saddlp))] +pub fn vaddlv_s32(a: int32x2_t) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddlv.i64.v2i32" + )] + fn _vaddlv_s32(a: int32x2_t) -> i64; + } + unsafe { _vaddlv_s32(a) } +} +#[doc = "Signed Add Long across Vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddlv_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(saddlv))] +pub fn vaddlv_s8(a: int8x8_t) -> i16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddlv.i32.v8i8" + )] + fn _vaddlv_s8(a: int8x8_t) -> i32; + } + unsafe { _vaddlv_s8(a) as i16 } +} +#[doc = "Signed Add Long across Vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddlvq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(saddlv))] +pub fn vaddlvq_s8(a: int8x16_t) -> 
i16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddlv.i32.v16i8" + )] + fn _vaddlvq_s8(a: int8x16_t) -> i32; + } + unsafe { _vaddlvq_s8(a) as i16 } +} +#[doc = "Unsigned Add Long across Vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddlv_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uaddlv))] +pub fn vaddlv_u16(a: uint16x4_t) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddlv.i32.v4i16" + )] + fn _vaddlv_u16(a: uint16x4_t) -> u32; + } + unsafe { _vaddlv_u16(a) } +} +#[doc = "Unsigned Add Long across Vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddlvq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uaddlv))] +pub fn vaddlvq_u16(a: uint16x8_t) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddlv.i32.v8i16" + )] + fn _vaddlvq_u16(a: uint16x8_t) -> u32; + } + unsafe { _vaddlvq_u16(a) } +} +#[doc = "Unsigned Add Long across Vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddlvq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uaddlv))] +pub fn vaddlvq_u32(a: uint32x4_t) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddlv.i64.v4i32" + )] + fn _vaddlvq_u32(a: uint32x4_t) -> u64; + } + unsafe { _vaddlvq_u32(a) } +} +#[doc = "Unsigned Add Long across Vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddlv_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uaddlp))] +pub fn vaddlv_u32(a: uint32x2_t) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddlv.i64.v2i32" + )] + fn _vaddlv_u32(a: uint32x2_t) -> u64; + } + unsafe { _vaddlv_u32(a) } +} +#[doc = "Unsigned Add Long across Vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddlv_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uaddlv))] +pub fn vaddlv_u8(a: uint8x8_t) -> u16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddlv.i32.v8i8" + )] + fn _vaddlv_u8(a: uint8x8_t) -> i32; + } + unsafe { _vaddlv_u8(a) as u16 } +} +#[doc = "Unsigned Add Long across Vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddlvq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uaddlv))] +pub fn vaddlvq_u8(a: uint8x16_t) -> u16 { + unsafe extern "unadjusted" { + #[cfg_attr( + 
any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddlv.i32.v16i8" + )] + fn _vaddlvq_u8(a: uint8x16_t) -> i32; + } + unsafe { _vaddlvq_u8(a) as u16 } +} +#[doc = "Floating-point add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddv_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(faddp))] +pub fn vaddv_f32(a: float32x2_t) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.faddv.f32.v2f32" + )] + fn _vaddv_f32(a: float32x2_t) -> f32; + } + unsafe { _vaddv_f32(a) } +} +#[doc = "Floating-point add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddvq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(faddp))] +pub fn vaddvq_f32(a: float32x4_t) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.faddv.f32.v4f32" + )] + fn _vaddvq_f32(a: float32x4_t) -> f32; + } + unsafe { _vaddvq_f32(a) } +} +#[doc = "Floating-point add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddvq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(faddp))] +pub fn vaddvq_f64(a: float64x2_t) -> f64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.faddv.f64.v2f64" + )] + fn _vaddvq_f64(a: float64x2_t) -> f64; + } + unsafe { _vaddvq_f64(a) } +} +#[doc = "Add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddv_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vaddv_s32(a: int32x2_t) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddv.i32.v2i32" + )] + fn _vaddv_s32(a: int32x2_t) -> i32; + } + unsafe { _vaddv_s32(a) } +} +#[doc = "Add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddv_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addv))] +pub fn vaddv_s8(a: int8x8_t) -> i8 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddv.i8.v8i8" + )] + fn _vaddv_s8(a: int8x8_t) -> i8; + } + unsafe { _vaddv_s8(a) } +} +#[doc = "Add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddvq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addv))] +pub fn vaddvq_s8(a: int8x16_t) -> i8 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddv.i8.v16i8" + )] + fn _vaddvq_s8(a: int8x16_t) -> 
i8; + } + unsafe { _vaddvq_s8(a) } +} +#[doc = "Add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddv_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addv))] +pub fn vaddv_s16(a: int16x4_t) -> i16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddv.i16.v4i16" + )] + fn _vaddv_s16(a: int16x4_t) -> i16; + } + unsafe { _vaddv_s16(a) } +} +#[doc = "Add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddvq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addv))] +pub fn vaddvq_s16(a: int16x8_t) -> i16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddv.i16.v8i16" + )] + fn _vaddvq_s16(a: int16x8_t) -> i16; + } + unsafe { _vaddvq_s16(a) } +} +#[doc = "Add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddvq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addv))] +pub fn vaddvq_s32(a: int32x4_t) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddv.i32.v4i32" + )] + fn _vaddvq_s32(a: int32x4_t) -> i32; + } + unsafe { _vaddvq_s32(a) } +} +#[doc = "Add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddv_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vaddv_u32(a: uint32x2_t) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddv.i32.v2i32" + )] + fn _vaddv_u32(a: uint32x2_t) -> u32; + } + unsafe { _vaddv_u32(a) } +} +#[doc = "Add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddv_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addv))] +pub fn vaddv_u8(a: uint8x8_t) -> u8 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddv.i8.v8i8" + )] + fn _vaddv_u8(a: uint8x8_t) -> u8; + } + unsafe { _vaddv_u8(a) } +} +#[doc = "Add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddvq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addv))] +pub fn vaddvq_u8(a: uint8x16_t) -> u8 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddv.i8.v16i8" + )] + fn _vaddvq_u8(a: uint8x16_t) -> u8; + } + unsafe { _vaddvq_u8(a) } +} +#[doc = "Add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddv_u16)"] +#[inline] 
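+// Like the other `vaddv*`/`vaddlv*` reductions above, this wrapper simply forwards to the
+// corresponding `llvm.aarch64.neon.uaddv.*` builtin, declared in an `unsafe extern "unadjusted"`
+// block with an explicit `link_name`; `#[target_feature(enable = "neon")]` lets the call lower
+// to the single reduction instruction checked by `assert_instr(addv)`.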
+#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addv))] +pub fn vaddv_u16(a: uint16x4_t) -> u16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddv.i16.v4i16" + )] + fn _vaddv_u16(a: uint16x4_t) -> u16; + } + unsafe { _vaddv_u16(a) } +} +#[doc = "Add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddvq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addv))] +pub fn vaddvq_u16(a: uint16x8_t) -> u16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddv.i16.v8i16" + )] + fn _vaddvq_u16(a: uint16x8_t) -> u16; + } + unsafe { _vaddvq_u16(a) } +} +#[doc = "Add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddvq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addv))] +pub fn vaddvq_u32(a: uint32x4_t) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddv.i32.v4i32" + )] + fn _vaddvq_u32(a: uint32x4_t) -> u32; + } + unsafe { _vaddvq_u32(a) } +} +#[doc = "Add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddvq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vaddvq_s64(a: int64x2_t) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddv.i64.v2i64" + )] + fn _vaddvq_s64(a: int64x2_t) -> i64; + } + unsafe { _vaddvq_s64(a) } +} +#[doc = "Add across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddvq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vaddvq_u64(a: uint64x2_t) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddv.i64.v2i64" + )] + fn _vaddvq_u64(a: uint64x2_t) -> u64; + } + unsafe { _vaddvq_u64(a) } +} +#[doc = "Multi-vector floating-point absolute maximum"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vamax_f32)"] +#[inline] +#[target_feature(enable = "neon,faminmax")] +#[cfg_attr(test, assert_instr(nop))] +#[unstable(feature = "faminmax", issue = "137933")] +pub fn vamax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.famax.v2f32" + )] + fn _vamax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vamax_f32(a, b) } +} +#[doc = "Multi-vector floating-point absolute maximum"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vamaxq_f32)"] +#[inline] +#[target_feature(enable = "neon,faminmax")] +#[cfg_attr(test, 
assert_instr(nop))] +#[unstable(feature = "faminmax", issue = "137933")] +pub fn vamaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.famax.v4f32" + )] + fn _vamaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vamaxq_f32(a, b) } +} +#[doc = "Multi-vector floating-point absolute maximum"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vamaxq_f64)"] +#[inline] +#[target_feature(enable = "neon,faminmax")] +#[cfg_attr(test, assert_instr(nop))] +#[unstable(feature = "faminmax", issue = "137933")] +pub fn vamaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.famax.v2f64" + )] + fn _vamaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vamaxq_f64(a, b) } +} +#[doc = "Multi-vector floating-point absolute minimum"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vamin_f32)"] +#[inline] +#[target_feature(enable = "neon,faminmax")] +#[cfg_attr(test, assert_instr(nop))] +#[unstable(feature = "faminmax", issue = "137933")] +pub fn vamin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.famin.v2f32" + )] + fn _vamin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vamin_f32(a, b) } +} +#[doc = "Multi-vector floating-point absolute minimum"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaminq_f32)"] +#[inline] +#[target_feature(enable = "neon,faminmax")] +#[cfg_attr(test, assert_instr(nop))] +#[unstable(feature = "faminmax", issue = "137933")] +pub fn vaminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.famin.v4f32" + )] + fn _vaminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vaminq_f32(a, b) } +} +#[doc = "Multi-vector floating-point absolute minimum"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaminq_f64)"] +#[inline] +#[target_feature(enable = "neon,faminmax")] +#[cfg_attr(test, assert_instr(nop))] +#[unstable(feature = "faminmax", issue = "137933")] +pub fn vaminq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.famin.v2f64" + )] + fn _vaminq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vaminq_f64(a, b) } +} +#[doc = "Bit clear and exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbcaxq_s8)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +#[cfg_attr(test, assert_instr(bcax))] +pub fn vbcaxq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.bcaxs.v16i8" + )] + fn _vbcaxq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> 
int8x16_t; + } + unsafe { _vbcaxq_s8(a, b, c) } +} +#[doc = "Bit clear and exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbcaxq_s16)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +#[cfg_attr(test, assert_instr(bcax))] +pub fn vbcaxq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.bcaxs.v8i16" + )] + fn _vbcaxq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t; + } + unsafe { _vbcaxq_s16(a, b, c) } +} +#[doc = "Bit clear and exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbcaxq_s32)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +#[cfg_attr(test, assert_instr(bcax))] +pub fn vbcaxq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.bcaxs.v4i32" + )] + fn _vbcaxq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t; + } + unsafe { _vbcaxq_s32(a, b, c) } +} +#[doc = "Bit clear and exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbcaxq_s64)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +#[cfg_attr(test, assert_instr(bcax))] +pub fn vbcaxq_s64(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.bcaxs.v2i64" + )] + fn _vbcaxq_s64(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t; + } + unsafe { _vbcaxq_s64(a, b, c) } +} +#[doc = "Bit clear and exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbcaxq_u8)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +#[cfg_attr(test, assert_instr(bcax))] +pub fn vbcaxq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.bcaxu.v16i8" + )] + fn _vbcaxq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t; + } + unsafe { _vbcaxq_u8(a, b, c) } +} +#[doc = "Bit clear and exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbcaxq_u16)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +#[cfg_attr(test, assert_instr(bcax))] +pub fn vbcaxq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.bcaxu.v8i16" + )] + fn _vbcaxq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t; + } + unsafe { _vbcaxq_u16(a, b, c) } +} +#[doc = "Bit clear and exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbcaxq_u32)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", 
since = "1.79.0")] +#[cfg_attr(test, assert_instr(bcax))] +pub fn vbcaxq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.bcaxu.v4i32" + )] + fn _vbcaxq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t; + } + unsafe { _vbcaxq_u32(a, b, c) } +} +#[doc = "Bit clear and exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbcaxq_u64)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +#[cfg_attr(test, assert_instr(bcax))] +pub fn vbcaxq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.bcaxu.v2i64" + )] + fn _vbcaxq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t; + } + unsafe { _vbcaxq_u64(a, b, c) } +} +#[doc = "Floating-point complex add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcadd_rot270_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fcma"))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fcadd))] +pub fn vcadd_rot270_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcadd.rot270.v4f16" + )] + fn _vcadd_rot270_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vcadd_rot270_f16(a, b) } +} +#[doc = "Floating-point complex add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaddq_rot270_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fcma"))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fcadd))] +pub fn vcaddq_rot270_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcadd.rot270.v8f16" + )] + fn _vcaddq_rot270_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vcaddq_rot270_f16(a, b) } +} +#[doc = "Floating-point complex add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcadd_rot270_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcadd))] +pub fn vcadd_rot270_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcadd.rot270.v2f32" + )] + fn _vcadd_rot270_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vcadd_rot270_f32(a, b) } +} +#[doc = "Floating-point complex add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaddq_rot270_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcadd))] +pub fn vcaddq_rot270_f32(a: 
float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcadd.rot270.v4f32" + )] + fn _vcaddq_rot270_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vcaddq_rot270_f32(a, b) } +} +#[doc = "Floating-point complex add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaddq_rot270_f64)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcadd))] +pub fn vcaddq_rot270_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcadd.rot270.v2f64" + )] + fn _vcaddq_rot270_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vcaddq_rot270_f64(a, b) } +} +#[doc = "Floating-point complex add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcadd_rot90_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fcma"))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fcadd))] +pub fn vcadd_rot90_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcadd.rot90.v4f16" + )] + fn _vcadd_rot90_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vcadd_rot90_f16(a, b) } +} +#[doc = "Floating-point complex add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaddq_rot90_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fcma"))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fcadd))] +pub fn vcaddq_rot90_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcadd.rot90.v8f16" + )] + fn _vcaddq_rot90_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vcaddq_rot90_f16(a, b) } +} +#[doc = "Floating-point complex add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcadd_rot90_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcadd))] +pub fn vcadd_rot90_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcadd.rot90.v2f32" + )] + fn _vcadd_rot90_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vcadd_rot90_f32(a, b) } +} +#[doc = "Floating-point complex add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaddq_rot90_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcadd))] +pub fn vcaddq_rot90_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + 
any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcadd.rot90.v4f32" + )] + fn _vcaddq_rot90_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vcaddq_rot90_f32(a, b) } +} +#[doc = "Floating-point complex add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaddq_rot90_f64)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcadd))] +pub fn vcaddq_rot90_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcadd.rot90.v2f64" + )] + fn _vcaddq_rot90_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vcaddq_rot90_f64(a, b) } +} +#[doc = "Floating-point absolute compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcage_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcage_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facge.v1i64.v1f64" + )] + fn _vcage_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t; + } + unsafe { _vcage_f64(a, b) } +} +#[doc = "Floating-point absolute compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcageq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcageq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facge.v2i64.v2f64" + )] + fn _vcageq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t; + } + unsafe { _vcageq_f64(a, b) } +} +#[doc = "Floating-point absolute compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaged_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcaged_f64(a: f64, b: f64) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facge.i64.f64" + )] + fn _vcaged_f64(a: f64, b: f64) -> u64; + } + unsafe { _vcaged_f64(a, b) } +} +#[doc = "Floating-point absolute compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcages_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcages_f32(a: f32, b: f32) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facge.i32.f32" + )] + fn _vcages_f32(a: f32, b: f32) -> u32; + } + unsafe { _vcages_f32(a, b) } +} +#[doc = "Floating-point absolute compare greater than or equal"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcageh_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(facge))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcageh_f16(a: f16, b: f16) -> u16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facge.i32.f16" + )] + fn _vcageh_f16(a: f16, b: f16) -> i32; + } + unsafe { _vcageh_f16(a, b) as u16 } +} +#[doc = "Floating-point absolute compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcagt_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcagt_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facgt.v1i64.v1f64" + )] + fn _vcagt_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t; + } + unsafe { _vcagt_f64(a, b) } +} +#[doc = "Floating-point absolute compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcagtq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcagtq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facgt.v2i64.v2f64" + )] + fn _vcagtq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t; + } + unsafe { _vcagtq_f64(a, b) } +} +#[doc = "Floating-point absolute compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcagtd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcagtd_f64(a: f64, b: f64) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facgt.i64.f64" + )] + fn _vcagtd_f64(a: f64, b: f64) -> u64; + } + unsafe { _vcagtd_f64(a, b) } +} +#[doc = "Floating-point absolute compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcagts_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcagts_f32(a: f32, b: f32) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facgt.i32.f32" + )] + fn _vcagts_f32(a: f32, b: f32) -> u32; + } + unsafe { _vcagts_f32(a, b) } +} +#[doc = "Floating-point absolute compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcagth_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(facgt))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcagth_f16(a: f16, b: f16) -> u16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facgt.i32.f16" + )] + fn _vcagth_f16(a: f16, 
b: f16) -> i32; + } + unsafe { _vcagth_f16(a, b) as u16 } +} +#[doc = "Floating-point absolute compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcale_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcale_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t { + vcage_f64(b, a) +} +#[doc = "Floating-point absolute compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaleq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcaleq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t { + vcageq_f64(b, a) +} +#[doc = "Floating-point absolute compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaled_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcaled_f64(a: f64, b: f64) -> u64 { + vcaged_f64(b, a) +} +#[doc = "Floating-point absolute compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcales_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcales_f32(a: f32, b: f32) -> u32 { + vcages_f32(b, a) +} +#[doc = "Floating-point absolute compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaleh_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(facge))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcaleh_f16(a: f16, b: f16) -> u16 { + vcageh_f16(b, a) +} +#[doc = "Floating-point absolute compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcalt_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcalt_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t { + vcagt_f64(b, a) +} +#[doc = "Floating-point absolute compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaltq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcaltq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t { + vcagtq_f64(b, a) +} +#[doc = "Floating-point absolute compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaltd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcaltd_f64(a: f64, b: f64) -> u64 { + vcagtd_f64(b, a) +} +#[doc = "Floating-point absolute compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcalts_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(facgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn 
vcalts_f32(a: f32, b: f32) -> u32 { + vcagts_f32(b, a) +} +#[doc = "Floating-point absolute compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcalth_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(facgt))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcalth_f16(a: f16, b: f16) -> u16 { + vcagth_f16(b, a) +} +#[doc = "Floating-point compare equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceq_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Floating-point compare equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceq_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceq_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceq_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceq_p64(a: poly64x1_t, b: poly64x1_t) -> uint64x1_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqq_p64(a: poly64x2_t, b: poly64x2_t) -> uint64x2_t 
{ + unsafe { simd_eq(a, b) } +} +#[doc = "Floating-point compare equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqd_f64(a: f64, b: f64) -> u64 { + unsafe { simd_extract!(vceq_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } +} +#[doc = "Floating-point compare equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqs_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqs_f32(a: f32, b: f32) -> u32 { + unsafe { simd_extract!(vceq_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } +} +#[doc = "Compare bitwise equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqd_s64(a: i64, b: i64) -> u64 { + unsafe { transmute(vceq_s64(transmute(a), transmute(b))) } +} +#[doc = "Compare bitwise equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqd_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqd_u64(a: u64, b: u64) -> u64 { + unsafe { transmute(vceq_u64(transmute(a), transmute(b))) } +} +#[doc = "Floating-point compare equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqh_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcmp))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vceqh_f16(a: f16, b: f16) -> u16 { + unsafe { simd_extract!(vceq_f16(vdup_n_f16(a), vdup_n_f16(b)), 0) } +} +#[doc = "Floating-point compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqz_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcmeq))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vceqz_f16(a: float16x4_t) -> uint16x4_t { + let b: f16x4 = f16x4::new(0.0, 0.0, 0.0, 0.0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Floating-point compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzq_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcmeq))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vceqzq_f16(a: float16x8_t) -> uint16x8_t { + let b: f16x8 = f16x8::new(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Floating-point compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqz_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqz_f32(a: float32x2_t) -> uint32x2_t { + let b: f32x2 = f32x2::new(0.0, 0.0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Floating-point compare bitwise equal to zero"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzq_f32(a: float32x4_t) -> uint32x4_t { + let b: f32x4 = f32x4::new(0.0, 0.0, 0.0, 0.0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Floating-point compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqz_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqz_f64(a: float64x1_t) -> uint64x1_t { + let b: f64 = 0.0; + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Floating-point compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzq_f64(a: float64x2_t) -> uint64x2_t { + let b: f64x2 = f64x2::new(0.0, 0.0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Signed compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqz_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqz_s8(a: int8x8_t) -> uint8x8_t { + let b: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Signed compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzq_s8(a: int8x16_t) -> uint8x16_t { + let b: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Signed compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqz_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqz_s16(a: int16x4_t) -> uint16x4_t { + let b: i16x4 = i16x4::new(0, 0, 0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Signed compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzq_s16(a: int16x8_t) -> uint16x8_t { + let b: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Signed compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqz_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqz_s32(a: int32x2_t) -> uint32x2_t { + let b: i32x2 = i32x2::new(0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Signed compare bitwise equal to zero"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzq_s32(a: int32x4_t) -> uint32x4_t { + let b: i32x4 = i32x4::new(0, 0, 0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Signed compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqz_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqz_s64(a: int64x1_t) -> uint64x1_t { + let b: i64x1 = i64x1::new(0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Signed compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzq_s64(a: int64x2_t) -> uint64x2_t { + let b: i64x2 = i64x2::new(0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Signed compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqz_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqz_p8(a: poly8x8_t) -> uint8x8_t { + let b: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Signed compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzq_p8(a: poly8x16_t) -> uint8x16_t { + let b: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Signed compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqz_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqz_p64(a: poly64x1_t) -> uint64x1_t { + let b: i64x1 = i64x1::new(0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Signed compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzq_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzq_p64(a: poly64x2_t) -> uint64x2_t { + let b: i64x2 = i64x2::new(0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Unsigned compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqz_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqz_u8(a: uint8x8_t) -> uint8x8_t { + let b: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Unsigned compare bitwise equal to zero"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzq_u8(a: uint8x16_t) -> uint8x16_t { + let b: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Unsigned compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqz_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqz_u16(a: uint16x4_t) -> uint16x4_t { + let b: u16x4 = u16x4::new(0, 0, 0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Unsigned compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzq_u16(a: uint16x8_t) -> uint16x8_t { + let b: u16x8 = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Unsigned compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqz_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqz_u32(a: uint32x2_t) -> uint32x2_t { + let b: u32x2 = u32x2::new(0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Unsigned compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzq_u32(a: uint32x4_t) -> uint32x4_t { + let b: u32x4 = u32x4::new(0, 0, 0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Unsigned compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqz_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqz_u64(a: uint64x1_t) -> uint64x1_t { + let b: u64x1 = u64x1::new(0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Unsigned compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmeq))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzq_u64(a: uint64x2_t) -> uint64x2_t { + let b: u64x2 = u64x2::new(0, 0); + unsafe { simd_eq(a, transmute(b)) } +} +#[doc = "Compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzd_s64(a: i64) -> u64 { + unsafe { transmute(vceqz_s64(transmute(a))) } +} +#[doc = "Compare bitwise equal to zero"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzd_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzd_u64(a: u64) -> u64 { + unsafe { transmute(vceqz_u64(transmute(a))) } +} +#[doc = "Floating-point compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzh_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcmp))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vceqzh_f16(a: f16) -> u16 { + unsafe { simd_extract!(vceqz_f16(vdup_n_f16(a)), 0) } +} +#[doc = "Floating-point compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzs_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzs_f32(a: f32) -> u32 { + unsafe { simd_extract!(vceqz_f32(vdup_n_f32(a)), 0) } +} +#[doc = "Floating-point compare bitwise equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vceqzd_f64(a: f64) -> u64 { + unsafe { simd_extract!(vceqz_f64(vdup_n_f64(a)), 0) } +} +#[doc = "Floating-point compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcge_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcge_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Floating-point compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgeq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgeq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Compare signed greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcge_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcge_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Compare signed greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgeq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgeq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Compare unsigned greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcge_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmhs))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcge_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + 
unsafe { simd_ge(a, b) } +} +#[doc = "Compare unsigned greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgeq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmhs))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgeq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Floating-point compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcged_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcged_f64(a: f64, b: f64) -> u64 { + unsafe { simd_extract!(vcge_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } +} +#[doc = "Floating-point compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcges_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcges_f32(a: f32, b: f32) -> u32 { + unsafe { simd_extract!(vcge_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } +} +#[doc = "Compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcged_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcged_s64(a: i64, b: i64) -> u64 { + unsafe { transmute(vcge_s64(transmute(a), transmute(b))) } +} +#[doc = "Compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcged_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcged_u64(a: u64, b: u64) -> u64 { + unsafe { transmute(vcge_u64(transmute(a), transmute(b))) } +} +#[doc = "Floating-point compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgeh_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcmp))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcgeh_f16(a: f16, b: f16) -> u16 { + unsafe { simd_extract!(vcge_f16(vdup_n_f16(a), vdup_n_f16(b)), 0) } +} +#[doc = "Floating-point compare greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgez_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgez_f32(a: float32x2_t) -> uint32x2_t { + let b: f32x2 = f32x2::new(0.0, 0.0); + unsafe { simd_ge(a, transmute(b)) } +} +#[doc = "Floating-point compare greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgezq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgezq_f32(a: float32x4_t) -> uint32x4_t { + let b: f32x4 = f32x4::new(0.0, 0.0, 0.0, 0.0); + unsafe { simd_ge(a, transmute(b)) } +} +#[doc = "Floating-point compare greater than or equal to zero"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgez_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgez_f64(a: float64x1_t) -> uint64x1_t { + let b: f64 = 0.0; + unsafe { simd_ge(a, transmute(b)) } +} +#[doc = "Floating-point compare greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgezq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgezq_f64(a: float64x2_t) -> uint64x2_t { + let b: f64x2 = f64x2::new(0.0, 0.0); + unsafe { simd_ge(a, transmute(b)) } +} +#[doc = "Compare signed greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgez_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgez_s8(a: int8x8_t) -> uint8x8_t { + let b: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_ge(a, transmute(b)) } +} +#[doc = "Compare signed greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgezq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgezq_s8(a: int8x16_t) -> uint8x16_t { + let b: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_ge(a, transmute(b)) } +} +#[doc = "Compare signed greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgez_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgez_s16(a: int16x4_t) -> uint16x4_t { + let b: i16x4 = i16x4::new(0, 0, 0, 0); + unsafe { simd_ge(a, transmute(b)) } +} +#[doc = "Compare signed greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgezq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgezq_s16(a: int16x8_t) -> uint16x8_t { + let b: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_ge(a, transmute(b)) } +} +#[doc = "Compare signed greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgez_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgez_s32(a: int32x2_t) -> uint32x2_t { + let b: i32x2 = i32x2::new(0, 0); + unsafe { simd_ge(a, transmute(b)) } +} +#[doc = "Compare signed greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgezq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgezq_s32(a: int32x4_t) -> uint32x4_t { + let b: i32x4 = i32x4::new(0, 0, 0, 0); + unsafe { simd_ge(a, transmute(b)) } +} +#[doc = "Compare signed 
greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgez_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgez_s64(a: int64x1_t) -> uint64x1_t { + let b: i64x1 = i64x1::new(0); + unsafe { simd_ge(a, transmute(b)) } +} +#[doc = "Compare signed greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgezq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgezq_s64(a: int64x2_t) -> uint64x2_t { + let b: i64x2 = i64x2::new(0, 0); + unsafe { simd_ge(a, transmute(b)) } +} +#[doc = "Floating-point compare greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgezd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgezd_f64(a: f64) -> u64 { + unsafe { simd_extract!(vcgez_f64(vdup_n_f64(a)), 0) } +} +#[doc = "Floating-point compare greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgezs_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgezs_f32(a: f32) -> u32 { + unsafe { simd_extract!(vcgez_f32(vdup_n_f32(a)), 0) } +} +#[doc = "Compare signed greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgezd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgezd_s64(a: i64) -> u64 { + unsafe { transmute(vcgez_s64(transmute(a))) } +} +#[doc = "Floating-point compare greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgezh_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcmp))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcgezh_f16(a: f16) -> u16 { + unsafe { simd_extract!(vcgez_f16(vdup_n_f16(a)), 0) } +} +#[doc = "Floating-point compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgt_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Floating-point compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare signed greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmgt))] 
+#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgt_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare signed greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare unsigned greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmhi))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgt_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare unsigned greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmhi))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Floating-point compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtd_f64(a: f64, b: f64) -> u64 { + unsafe { simd_extract!(vcgt_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } +} +#[doc = "Floating-point compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgts_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgts_f32(a: f32, b: f32) -> u32 { + unsafe { simd_extract!(vcgt_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } +} +#[doc = "Compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtd_s64(a: i64, b: i64) -> u64 { + unsafe { transmute(vcgt_s64(transmute(a), transmute(b))) } +} +#[doc = "Compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtd_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtd_u64(a: u64, b: u64) -> u64 { + unsafe { transmute(vcgt_u64(transmute(a), transmute(b))) } +} +#[doc = "Floating-point compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgth_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcmp))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcgth_f16(a: f16, b: f16) -> u16 { + unsafe { simd_extract!(vcgt_f16(vdup_n_f16(a), vdup_n_f16(b)), 0) } +} +#[doc = "Floating-point compare greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtz_f32)"] 
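Each of the comparison intrinsics above returns a per-lane bit mask: a lane is all ones where the predicate holds and all zeros where it does not. A minimal usage sketch, not taken from the patch itself; it assumes an AArch64 target with `std` available for runtime feature detection:

    #[cfg(target_arch = "aarch64")]
    fn greater_than_mask() {
        use std::arch::aarch64::*;

        if std::arch::is_aarch64_feature_detected!("neon") {
            // Safety: `neon` support was just verified at runtime.
            unsafe {
                let a = vdupq_n_f64(3.0);
                let b = vdupq_n_f64(1.5);
                let mask: uint64x2_t = vcgtq_f64(a, b);
                // A true lane is all ones (u64::MAX), a false lane is all zeros.
                assert_eq!(vgetq_lane_u64::<0>(mask), u64::MAX);
            }
        }
    }
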
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtz_f32(a: float32x2_t) -> uint32x2_t { + let b: f32x2 = f32x2::new(0.0, 0.0); + unsafe { simd_gt(a, transmute(b)) } +} +#[doc = "Floating-point compare greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtzq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtzq_f32(a: float32x4_t) -> uint32x4_t { + let b: f32x4 = f32x4::new(0.0, 0.0, 0.0, 0.0); + unsafe { simd_gt(a, transmute(b)) } +} +#[doc = "Floating-point compare greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtz_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtz_f64(a: float64x1_t) -> uint64x1_t { + let b: f64 = 0.0; + unsafe { simd_gt(a, transmute(b)) } +} +#[doc = "Floating-point compare greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtzq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtzq_f64(a: float64x2_t) -> uint64x2_t { + let b: f64x2 = f64x2::new(0.0, 0.0); + unsafe { simd_gt(a, transmute(b)) } +} +#[doc = "Compare signed greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtz_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtz_s8(a: int8x8_t) -> uint8x8_t { + let b: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_gt(a, transmute(b)) } +} +#[doc = "Compare signed greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtzq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtzq_s8(a: int8x16_t) -> uint8x16_t { + let b: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_gt(a, transmute(b)) } +} +#[doc = "Compare signed greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtz_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtz_s16(a: int16x4_t) -> uint16x4_t { + let b: i16x4 = i16x4::new(0, 0, 0, 0); + unsafe { simd_gt(a, transmute(b)) } +} +#[doc = "Compare signed greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtzq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtzq_s16(a: int16x8_t) -> uint16x8_t { + let b: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_gt(a, transmute(b)) } +} +#[doc = "Compare signed greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtz_s32)"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtz_s32(a: int32x2_t) -> uint32x2_t { + let b: i32x2 = i32x2::new(0, 0); + unsafe { simd_gt(a, transmute(b)) } +} +#[doc = "Compare signed greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtzq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtzq_s32(a: int32x4_t) -> uint32x4_t { + let b: i32x4 = i32x4::new(0, 0, 0, 0); + unsafe { simd_gt(a, transmute(b)) } +} +#[doc = "Compare signed greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtz_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtz_s64(a: int64x1_t) -> uint64x1_t { + let b: i64x1 = i64x1::new(0); + unsafe { simd_gt(a, transmute(b)) } +} +#[doc = "Compare signed greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtzq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtzq_s64(a: int64x2_t) -> uint64x2_t { + let b: i64x2 = i64x2::new(0, 0); + unsafe { simd_gt(a, transmute(b)) } +} +#[doc = "Floating-point compare greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtzd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtzd_f64(a: f64) -> u64 { + unsafe { simd_extract!(vcgtz_f64(vdup_n_f64(a)), 0) } +} +#[doc = "Floating-point compare greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtzs_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtzs_f32(a: f32) -> u32 { + unsafe { simd_extract!(vcgtz_f32(vdup_n_f32(a)), 0) } +} +#[doc = "Compare signed greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtzd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcgtzd_s64(a: i64) -> u64 { + unsafe { transmute(vcgtz_s64(transmute(a))) } +} +#[doc = "Floating-point compare greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtzh_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcmp))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcgtzh_f16(a: f16) -> u16 { + unsafe { simd_extract!(vcgtz_f16(vdup_n_f16(a)), 0) } +} +#[doc = "Floating-point compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcle_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcle_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t { + unsafe { simd_le(a, 
b) } +} +#[doc = "Floating-point compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcleq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcleq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare signed less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcle_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcle_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare signed less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcleq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmge))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcleq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare unsigned less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcle_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmhs))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcle_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare unsigned less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcleq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmhs))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcleq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_le(a, b) } +} +#[doc = "Floating-point compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcled_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcled_f64(a: f64, b: f64) -> u64 { + unsafe { simd_extract!(vcle_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } +} +#[doc = "Floating-point compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcles_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcles_f32(a: f32, b: f32) -> u32 { + unsafe { simd_extract!(vcle_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } +} +#[doc = "Compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcled_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcled_u64(a: u64, b: u64) -> u64 { + unsafe { transmute(vcle_u64(transmute(a), transmute(b))) } +} +#[doc = "Compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcled_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] 
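Because the result is a plain bit mask, these comparisons compose directly with the bitwise-select intrinsics. A hedged sketch of an element-wise minimum (ignoring NaN handling) built from `vcleq_f64` plus `vbslq_f64`, under the same AArch64/`std` assumptions as above:

    #[cfg(target_arch = "aarch64")]
    fn elementwise_min() {
        use std::arch::aarch64::*;

        if std::arch::is_aarch64_feature_detected!("neon") {
            // Safety: `neon` support was just verified at runtime.
            unsafe {
                let a = vdupq_n_f64(2.0);
                let b = vdupq_n_f64(5.0);
                // Lanes where a <= b become all ones...
                let mask: uint64x2_t = vcleq_f64(a, b);
                // ...and bitwise select then takes `a` in those lanes, `b` elsewhere.
                let min = vbslq_f64(mask, a, b);
                assert_eq!(vgetq_lane_f64::<0>(min), 2.0);
            }
        }
    }
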
+pub fn vcled_s64(a: i64, b: i64) -> u64 { + unsafe { transmute(vcle_s64(transmute(a), transmute(b))) } +} +#[doc = "Floating-point compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcleh_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcmp))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcleh_f16(a: f16, b: f16) -> u16 { + unsafe { simd_extract!(vcle_f16(vdup_n_f16(a), vdup_n_f16(b)), 0) } +} +#[doc = "Floating-point compare less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclez_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmle))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclez_f32(a: float32x2_t) -> uint32x2_t { + let b: f32x2 = f32x2::new(0.0, 0.0); + unsafe { simd_le(a, transmute(b)) } +} +#[doc = "Floating-point compare less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclezq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmle))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclezq_f32(a: float32x4_t) -> uint32x4_t { + let b: f32x4 = f32x4::new(0.0, 0.0, 0.0, 0.0); + unsafe { simd_le(a, transmute(b)) } +} +#[doc = "Floating-point compare less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclez_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmle))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclez_f64(a: float64x1_t) -> uint64x1_t { + let b: f64 = 0.0; + unsafe { simd_le(a, transmute(b)) } +} +#[doc = "Floating-point compare less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclezq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmle))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclezq_f64(a: float64x2_t) -> uint64x2_t { + let b: f64x2 = f64x2::new(0.0, 0.0); + unsafe { simd_le(a, transmute(b)) } +} +#[doc = "Compare signed less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclez_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmle))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclez_s8(a: int8x8_t) -> uint8x8_t { + let b: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_le(a, transmute(b)) } +} +#[doc = "Compare signed less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclezq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmle))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclezq_s8(a: int8x16_t) -> uint8x16_t { + let b: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_le(a, transmute(b)) } +} +#[doc = "Compare signed less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclez_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmle))] +#[stable(feature = "neon_intrinsics", 
since = "1.59.0")] +pub fn vclez_s16(a: int16x4_t) -> uint16x4_t { + let b: i16x4 = i16x4::new(0, 0, 0, 0); + unsafe { simd_le(a, transmute(b)) } +} +#[doc = "Compare signed less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclezq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmle))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclezq_s16(a: int16x8_t) -> uint16x8_t { + let b: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_le(a, transmute(b)) } +} +#[doc = "Compare signed less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclez_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmle))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclez_s32(a: int32x2_t) -> uint32x2_t { + let b: i32x2 = i32x2::new(0, 0); + unsafe { simd_le(a, transmute(b)) } +} +#[doc = "Compare signed less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclezq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmle))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclezq_s32(a: int32x4_t) -> uint32x4_t { + let b: i32x4 = i32x4::new(0, 0, 0, 0); + unsafe { simd_le(a, transmute(b)) } +} +#[doc = "Compare signed less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclez_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmle))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclez_s64(a: int64x1_t) -> uint64x1_t { + let b: i64x1 = i64x1::new(0); + unsafe { simd_le(a, transmute(b)) } +} +#[doc = "Compare signed less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclezq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmle))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclezq_s64(a: int64x2_t) -> uint64x2_t { + let b: i64x2 = i64x2::new(0, 0); + unsafe { simd_le(a, transmute(b)) } +} +#[doc = "Floating-point compare less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclezd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclezd_f64(a: f64) -> u64 { + unsafe { simd_extract!(vclez_f64(vdup_n_f64(a)), 0) } +} +#[doc = "Floating-point compare less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclezs_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclezs_f32(a: f32) -> u32 { + unsafe { simd_extract!(vclez_f32(vdup_n_f32(a)), 0) } +} +#[doc = "Compare less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclezd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclezd_s64(a: i64) -> u64 { + unsafe { 
transmute(vclez_s64(transmute(a))) } +} +#[doc = "Floating-point compare less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclezh_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcmp))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vclezh_f16(a: f16) -> u16 { + unsafe { simd_extract!(vclez_f16(vdup_n_f16(a)), 0) } +} +#[doc = "Floating-point compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclt_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclt_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Floating-point compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare signed less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclt_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclt_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare signed less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmgt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare unsigned less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclt_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmhi))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclt_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare unsigned less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmhi))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltd_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltd_u64(a: u64, b: u64) -> u64 { + unsafe { transmute(vclt_u64(transmute(a), transmute(b))) } +} +#[doc = "Compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltd_s64(a: i64, b: i64) -> u64 { + unsafe { 
transmute(vclt_s64(transmute(a), transmute(b))) } +} +#[doc = "Floating-point compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclth_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcmp))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vclth_f16(a: f16, b: f16) -> u16 { + unsafe { simd_extract!(vclt_f16(vdup_n_f16(a), vdup_n_f16(b)), 0) } +} +#[doc = "Floating-point compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclts_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vclts_f32(a: f32, b: f32) -> u32 { + unsafe { simd_extract!(vclt_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } +} +#[doc = "Floating-point compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltd_f64(a: f64, b: f64) -> u64 { + unsafe { simd_extract!(vclt_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } +} +#[doc = "Floating-point compare less than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltz_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmlt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltz_f32(a: float32x2_t) -> uint32x2_t { + let b: f32x2 = f32x2::new(0.0, 0.0); + unsafe { simd_lt(a, transmute(b)) } +} +#[doc = "Floating-point compare less than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltzq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmlt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltzq_f32(a: float32x4_t) -> uint32x4_t { + let b: f32x4 = f32x4::new(0.0, 0.0, 0.0, 0.0); + unsafe { simd_lt(a, transmute(b)) } +} +#[doc = "Floating-point compare less than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltz_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmlt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltz_f64(a: float64x1_t) -> uint64x1_t { + let b: f64 = 0.0; + unsafe { simd_lt(a, transmute(b)) } +} +#[doc = "Floating-point compare less than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltzq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmlt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltzq_f64(a: float64x2_t) -> uint64x2_t { + let b: f64x2 = f64x2::new(0.0, 0.0); + unsafe { simd_lt(a, transmute(b)) } +} +#[doc = "Compare signed less than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltz_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmlt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltz_s8(a: int8x8_t) -> uint8x8_t { + let b: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_lt(a, transmute(b)) } +} +#[doc = "Compare signed less than zero"] +#[doc = 
"[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltzq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmlt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltzq_s8(a: int8x16_t) -> uint8x16_t { + let b: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_lt(a, transmute(b)) } +} +#[doc = "Compare signed less than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltz_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmlt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltz_s16(a: int16x4_t) -> uint16x4_t { + let b: i16x4 = i16x4::new(0, 0, 0, 0); + unsafe { simd_lt(a, transmute(b)) } +} +#[doc = "Compare signed less than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltzq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmlt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltzq_s16(a: int16x8_t) -> uint16x8_t { + let b: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0); + unsafe { simd_lt(a, transmute(b)) } +} +#[doc = "Compare signed less than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltz_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmlt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltz_s32(a: int32x2_t) -> uint32x2_t { + let b: i32x2 = i32x2::new(0, 0); + unsafe { simd_lt(a, transmute(b)) } +} +#[doc = "Compare signed less than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltzq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmlt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltzq_s32(a: int32x4_t) -> uint32x4_t { + let b: i32x4 = i32x4::new(0, 0, 0, 0); + unsafe { simd_lt(a, transmute(b)) } +} +#[doc = "Compare signed less than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltz_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmlt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltz_s64(a: int64x1_t) -> uint64x1_t { + let b: i64x1 = i64x1::new(0); + unsafe { simd_lt(a, transmute(b)) } +} +#[doc = "Compare signed less than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltzq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmlt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltzq_s64(a: int64x2_t) -> uint64x2_t { + let b: i64x2 = i64x2::new(0, 0); + unsafe { simd_lt(a, transmute(b)) } +} +#[doc = "Floating-point compare less than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltzd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltzd_f64(a: f64) -> u64 { + unsafe { simd_extract!(vcltz_f64(vdup_n_f64(a)), 0) } +} +#[doc = "Floating-point compare less than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltzs_f32)"] 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltzs_f32(a: f32) -> u32 { + unsafe { simd_extract!(vcltz_f32(vdup_n_f32(a)), 0) } +} +#[doc = "Compare less than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltzd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(asr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcltzd_s64(a: i64) -> u64 { + unsafe { transmute(vcltz_s64(transmute(a))) } +} +#[doc = "Floating-point compare less than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltzh_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcmp))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcltzh_f16(a: f16) -> u16 { + unsafe { simd_extract!(vcltz_f16(vdup_n_f16(a)), 0) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmla_f16(a: float16x4_t, b: float16x4_t, c: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot0.v4f16" + )] + fn _vcmla_f16(a: float16x4_t, b: float16x4_t, c: float16x4_t) -> float16x4_t; + } + unsafe { _vcmla_f16(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmlaq_f16(a: float16x8_t, b: float16x8_t, c: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot0.v8f16" + )] + fn _vcmlaq_f16(a: float16x8_t, b: float16x8_t, c: float16x8_t) -> float16x8_t; + } + unsafe { _vcmlaq_f16(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmla_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot0.v2f32" + )] + fn _vcmla_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t; + } + unsafe { _vcmla_f32(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmlaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> 
float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot0.v4f32" + )] + fn _vcmlaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t; + } + unsafe { _vcmlaq_f32(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_f64)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmlaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot0.v2f64" + )] + fn _vcmlaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t; + } + unsafe { _vcmlaq_f64(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_lane_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmla_lane_f16( + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, +) -> float16x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: float16x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmla_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_lane_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmlaq_lane_f16( + a: float16x8_t, + b: float16x8_t, + c: float16x4_t, +) -> float16x8_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: float16x8_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_lane_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +pub fn vcmla_lane_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, +) -> float32x2_t { + static_assert!(LANE == 0); + unsafe { + let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); + vcmla_f32(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_lane_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +pub fn vcmlaq_lane_f32( + a: 
float32x4_t, + b: float32x4_t, + c: float32x2_t, +) -> float32x4_t { + static_assert!(LANE == 0); + unsafe { + let c: float32x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_f32(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_laneq_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmla_laneq_f16( + a: float16x4_t, + b: float16x4_t, + c: float16x8_t, +) -> float16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: float16x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmla_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_laneq_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmlaq_laneq_f16( + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, +) -> float16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: float16x8_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +pub fn vcmla_laneq_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x4_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); + vcmla_f32(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +pub fn vcmlaq_laneq_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: float32x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_f32(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn 
vcmla_rot180_f16(a: float16x4_t, b: float16x4_t, c: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot180.v4f16" + )] + fn _vcmla_rot180_f16(a: float16x4_t, b: float16x4_t, c: float16x4_t) -> float16x4_t; + } + unsafe { _vcmla_rot180_f16(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmlaq_rot180_f16(a: float16x8_t, b: float16x8_t, c: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot180.v8f16" + )] + fn _vcmlaq_rot180_f16(a: float16x8_t, b: float16x8_t, c: float16x8_t) -> float16x8_t; + } + unsafe { _vcmlaq_rot180_f16(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmla_rot180_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot180.v2f32" + )] + fn _vcmla_rot180_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t; + } + unsafe { _vcmla_rot180_f32(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmlaq_rot180_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot180.v4f32" + )] + fn _vcmlaq_rot180_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t; + } + unsafe { _vcmlaq_rot180_f32(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_f64)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmlaq_rot180_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot180.v2f64" + )] + fn _vcmlaq_rot180_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t; + } + unsafe { _vcmlaq_rot180_f64(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_lane_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE 
= 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmla_rot180_lane_f16( + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, +) -> float16x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: float16x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmla_rot180_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_lane_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmlaq_rot180_lane_f16( + a: float16x8_t, + b: float16x8_t, + c: float16x4_t, +) -> float16x8_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: float16x8_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_rot180_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_lane_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +pub fn vcmla_rot180_lane_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, +) -> float32x2_t { + static_assert!(LANE == 0); + unsafe { + let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); + vcmla_rot180_f32(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_lane_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +pub fn vcmlaq_rot180_lane_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x2_t, +) -> float32x4_t { + static_assert!(LANE == 0); + unsafe { + let c: float32x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_rot180_f32(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_laneq_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmla_rot180_laneq_f16( + a: float16x4_t, + b: float16x4_t, + c: float16x8_t, +) -> float16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: float16x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmla_rot180_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_laneq_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmlaq_rot180_laneq_f16( + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, +) -> float16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: float16x8_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_rot180_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +pub fn vcmla_rot180_laneq_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x4_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); + vcmla_rot180_f32(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +pub fn vcmlaq_rot180_laneq_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: float32x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_rot180_f32(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmla_rot270_f16(a: float16x4_t, b: float16x4_t, c: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot270.v4f16" + )] + fn _vcmla_rot270_f16(a: float16x4_t, b: float16x4_t, c: float16x4_t) -> float16x4_t; + } + unsafe { _vcmla_rot270_f16(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmlaq_rot270_f16(a: float16x8_t, b: float16x8_t, c: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot270.v8f16" + )] + fn _vcmlaq_rot270_f16(a: float16x8_t, b: 
float16x8_t, c: float16x8_t) -> float16x8_t; + } + unsafe { _vcmlaq_rot270_f16(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmla_rot270_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot270.v2f32" + )] + fn _vcmla_rot270_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t; + } + unsafe { _vcmla_rot270_f32(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmlaq_rot270_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot270.v4f32" + )] + fn _vcmlaq_rot270_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t; + } + unsafe { _vcmlaq_rot270_f32(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_f64)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmlaq_rot270_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot270.v2f64" + )] + fn _vcmlaq_rot270_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t; + } + unsafe { _vcmlaq_rot270_f64(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_lane_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmla_rot270_lane_f16( + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, +) -> float16x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: float16x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmla_rot270_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_lane_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmlaq_rot270_lane_f16( + a: float16x8_t, + b: float16x8_t, + c: float16x4_t, +) -> float16x8_t { + static_assert_uimm_bits!(LANE, 1); 
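    // A hedged, illustrative sketch (hypothetical helper, not part of the
    // generated bindings; assumes a nightly toolchain with the unstable
    // `stdarch_neon_fcma` feature and this module's imports in scope): the
    // rot0/rot90/rot180/rot270 intrinsics each accumulate half of a rotated
    // complex product, and chaining rot0 with rot90 on the same operands gives
    // a full complex multiply-accumulate, acc += a * b, over interleaved
    // [re, im, re, im] data.
    #[target_feature(enable = "neon,fcma")]
    fn complex_mla_f32_sketch(acc: float32x4_t, a: float32x4_t, b: float32x4_t) -> float32x4_t {
        // rot0 contributes the a.re terms (a.re*b.re to the real lanes,
        // a.re*b.im to the imaginary lanes) ...
        let partial = vcmlaq_f32(acc, a, b);
        // ... and rot90 contributes the a.im terms (-a.im*b.im and a.im*b.re),
        // completing the complex product; the rot180/rot270 forms pair up in
        // the same way for the negated and conjugated variants.
        vcmlaq_rot90_f32(partial, a, b)
    }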
+ unsafe { + let c: float16x8_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_rot270_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_lane_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +pub fn vcmla_rot270_lane_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, +) -> float32x2_t { + static_assert!(LANE == 0); + unsafe { + let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); + vcmla_rot270_f32(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_lane_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +pub fn vcmlaq_rot270_lane_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x2_t, +) -> float32x4_t { + static_assert!(LANE == 0); + unsafe { + let c: float32x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_rot270_f32(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_laneq_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmla_rot270_laneq_f16( + a: float16x4_t, + b: float16x4_t, + c: float16x8_t, +) -> float16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: float16x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmla_rot270_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_laneq_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmlaq_rot270_laneq_f16( + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, +) -> float16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: float16x8_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_rot270_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature 
= "stdarch_neon_fcma", issue = "117222")] +pub fn vcmla_rot270_laneq_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x4_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); + vcmla_rot270_f32(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +pub fn vcmlaq_rot270_laneq_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: float32x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_rot270_f32(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmla_rot90_f16(a: float16x4_t, b: float16x4_t, c: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot90.v4f16" + )] + fn _vcmla_rot90_f16(a: float16x4_t, b: float16x4_t, c: float16x4_t) -> float16x4_t; + } + unsafe { _vcmla_rot90_f16(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmlaq_rot90_f16(a: float16x8_t, b: float16x8_t, c: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot90.v8f16" + )] + fn _vcmlaq_rot90_f16(a: float16x8_t, b: float16x8_t, c: float16x8_t) -> float16x8_t; + } + unsafe { _vcmlaq_rot90_f16(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmla_rot90_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot90.v2f32" + )] + fn _vcmla_rot90_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t; + } + unsafe { _vcmla_rot90_f32(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, 
assert_instr(fcmla))] +pub fn vcmlaq_rot90_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot90.v4f32" + )] + fn _vcmlaq_rot90_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t; + } + unsafe { _vcmlaq_rot90_f32(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_f64)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +#[cfg_attr(test, assert_instr(fcmla))] +pub fn vcmlaq_rot90_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcmla.rot90.v2f64" + )] + fn _vcmlaq_rot90_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t; + } + unsafe { _vcmlaq_rot90_f64(a, b, c) } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_lane_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmla_rot90_lane_f16( + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, +) -> float16x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: float16x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmla_rot90_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_lane_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmlaq_rot90_lane_f16( + a: float16x8_t, + b: float16x8_t, + c: float16x4_t, +) -> float16x8_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: float16x8_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_rot90_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_lane_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +pub fn vcmla_rot90_lane_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, +) -> float32x2_t { + static_assert!(LANE == 0); + unsafe { + let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); + vcmla_rot90_f32(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_lane_f32)"] +#[inline] 
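// Note on the _lane_/_laneq_ wrappers in this family: each one duplicates a
// single complex pair of `c`, selected by the const generic LANE, across the
// whole vector with simd_shuffle! and then defers to the plain vector form.
// LANE therefore indexes complex (re, im) pairs rather than scalar lanes,
// which is why float32x2_t sources require LANE == 0 while float32x4_t
// sources allow LANE in 0..=1. A hedged usage sketch (hypothetical variables,
// nightly + unstable `stdarch_neon_fcma` feature assumed):
//     let acc = vcmlaq_rot90_laneq_f32::<1>(acc, a, c); // broadcast pair 1 of `c`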
+#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +pub fn vcmlaq_rot90_lane_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x2_t, +) -> float32x4_t { + static_assert!(LANE == 0); + unsafe { + let c: float32x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_rot90_f32(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_laneq_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmla_rot90_laneq_f16( + a: float16x4_t, + b: float16x4_t, + c: float16x8_t, +) -> float16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: float16x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmla_rot90_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_laneq_f16)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcmlaq_rot90_laneq_f16( + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, +) -> float16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: float16x8_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_rot90_f16(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +pub fn vcmla_rot90_laneq_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x4_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); + vcmla_rot90_f32(a, b, c) + } +} +#[doc = "Floating-point complex multiply accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon,fcma")] +#[cfg_attr(test, assert_instr(fcmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +pub fn vcmlaq_rot90_laneq_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: float32x4_t = simd_shuffle!( + c, + c, + [ + 2 * LANE as u32, + 2 * LANE as u32 + 1, + 2 * LANE as u32, + 2 * LANE as u32 + 1 + ] + ); + vcmlaq_rot90_f32(a, b, c) + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_lane_f32( + a: float32x2_t, + b: float32x2_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE1, 1); + static_assert_uimm_bits!(LANE2, 1); + unsafe { + match LANE1 & 0b1 { + 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), + 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_lane_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + static_assert_uimm_bits!(LANE1, 3); + static_assert_uimm_bits!(LANE2, 3); + unsafe { + match LANE1 & 0b111 { + 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), + 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), + 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), + 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), + 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(LANE1, 2); + static_assert_uimm_bits!(LANE2, 2); + unsafe { + match LANE1 & 0b11 { + 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), + 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert_uimm_bits!(LANE1, 1); + static_assert_uimm_bits!(LANE2, 1); + unsafe { + match LANE1 & 0b1 { + 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), + 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_u8)"] +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_lane_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + static_assert_uimm_bits!(LANE1, 3); + static_assert_uimm_bits!(LANE2, 3); + unsafe { + match LANE1 & 0b111 { + 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), + 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), + 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), + 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), + 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_lane_u16( + a: uint16x4_t, + b: uint16x4_t, +) -> uint16x4_t { + static_assert_uimm_bits!(LANE1, 2); + static_assert_uimm_bits!(LANE2, 2); + unsafe { + match LANE1 & 0b11 { + 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), + 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_lane_u32( + a: uint32x2_t, + b: uint32x2_t, +) -> uint32x2_t { + static_assert_uimm_bits!(LANE1, 1); + static_assert_uimm_bits!(LANE2, 1); + unsafe { + match LANE1 & 0b1 { + 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), + 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_lane_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + static_assert_uimm_bits!(LANE1, 3); + static_assert_uimm_bits!(LANE2, 3); + unsafe { + match LANE1 & 0b111 { + 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), + 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), + 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), + 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), + 7 => simd_shuffle!(a, b, [0, 
1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_lane_p16( + a: poly16x4_t, + b: poly16x4_t, +) -> poly16x4_t { + static_assert_uimm_bits!(LANE1, 2); + static_assert_uimm_bits!(LANE2, 2); + unsafe { + match LANE1 & 0b11 { + 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), + 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_laneq_f32( + a: float32x2_t, + b: float32x4_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE1, 1); + static_assert_uimm_bits!(LANE2, 2); + let a: float32x4_t = unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }; + unsafe { + match LANE1 & 0b1 { + 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1]), + 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_laneq_s8(a: int8x8_t, b: int8x16_t) -> int8x8_t { + static_assert_uimm_bits!(LANE1, 3); + static_assert_uimm_bits!(LANE2, 4); + let a: int8x16_t = + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }; + unsafe { + match LANE1 & 0b111 { + 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]), + 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]), + 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]), + 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]), + 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_laneq_s16( + a: int16x4_t, + b: int16x8_t, +) -> int16x4_t { + static_assert_uimm_bits!(LANE1, 2); + static_assert_uimm_bits!(LANE2, 3); + let a: int16x8_t = unsafe { 
simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }; + unsafe { + match LANE1 & 0b11 { + 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3]), + 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_laneq_s32( + a: int32x2_t, + b: int32x4_t, +) -> int32x2_t { + static_assert_uimm_bits!(LANE1, 1); + static_assert_uimm_bits!(LANE2, 2); + let a: int32x4_t = unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }; + unsafe { + match LANE1 & 0b1 { + 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1]), + 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_laneq_u8( + a: uint8x8_t, + b: uint8x16_t, +) -> uint8x8_t { + static_assert_uimm_bits!(LANE1, 3); + static_assert_uimm_bits!(LANE2, 4); + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }; + unsafe { + match LANE1 & 0b111 { + 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]), + 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]), + 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]), + 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]), + 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_laneq_u16( + a: uint16x4_t, + b: uint16x8_t, +) -> uint16x4_t { + static_assert_uimm_bits!(LANE1, 2); + static_assert_uimm_bits!(LANE2, 3); + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }; + unsafe { + match LANE1 & 0b11 { + 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3]), + 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_u32)"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_laneq_u32( + a: uint32x2_t, + b: uint32x4_t, +) -> uint32x2_t { + static_assert_uimm_bits!(LANE1, 1); + static_assert_uimm_bits!(LANE2, 2); + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }; + unsafe { + match LANE1 & 0b1 { + 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1]), + 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_laneq_p8( + a: poly8x8_t, + b: poly8x16_t, +) -> poly8x8_t { + static_assert_uimm_bits!(LANE1, 3); + static_assert_uimm_bits!(LANE2, 4); + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }; + unsafe { + match LANE1 & 0b111 { + 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]), + 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]), + 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]), + 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]), + 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_laneq_p16( + a: poly16x4_t, + b: poly16x8_t, +) -> poly16x4_t { + static_assert_uimm_bits!(LANE1, 2); + static_assert_uimm_bits!(LANE2, 3); + let a: poly16x8_t = unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }; + unsafe { + match LANE1 & 0b11 { + 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3]), + 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 1, LANE2 = 0))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_lane_f32( + a: float32x4_t, + b: float32x2_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE1, 2); + static_assert_uimm_bits!(LANE2, 1); + let b: float32x4_t = unsafe { simd_shuffle!(b, b, [0, 1, 2, 3]) }; + unsafe { + match LANE1 & 0b11 { + 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 
3]), + 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 1, LANE2 = 0))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_lane_f64( + a: float64x2_t, + b: float64x1_t, +) -> float64x2_t { + static_assert_uimm_bits!(LANE1, 1); + static_assert!(LANE2 == 0); + let b: float64x2_t = unsafe { simd_shuffle!(b, b, [0, 1]) }; + unsafe { + match LANE1 & 0b1 { + 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), + 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 1, LANE2 = 0))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_lane_s64( + a: int64x2_t, + b: int64x1_t, +) -> int64x2_t { + static_assert_uimm_bits!(LANE1, 1); + static_assert!(LANE2 == 0); + let b: int64x2_t = unsafe { simd_shuffle!(b, b, [0, 1]) }; + unsafe { + match LANE1 & 0b1 { + 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), + 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 1, LANE2 = 0))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_lane_u64( + a: uint64x2_t, + b: uint64x1_t, +) -> uint64x2_t { + static_assert_uimm_bits!(LANE1, 1); + static_assert!(LANE2 == 0); + let b: uint64x2_t = unsafe { simd_shuffle!(b, b, [0, 1]) }; + unsafe { + match LANE1 & 0b1 { + 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), + 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 1, LANE2 = 0))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_lane_p64( + a: poly64x2_t, + b: poly64x1_t, +) -> poly64x2_t { + static_assert_uimm_bits!(LANE1, 1); + static_assert!(LANE2 == 0); + let b: poly64x2_t = unsafe { simd_shuffle!(b, b, [0, 1]) }; + unsafe { + match LANE1 & 0b1 { + 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), + 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] 
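// The vcopy* intrinsics in this block all follow one pattern: element LANE2 of
// `b` replaces element LANE1 of `a`, with both indices fixed at compile time.
// The match on LANE1 exists so that every arm hands simd_shuffle! a constant
// index array, and when the two vectors differ in width the narrower operand
// is first shuffled with itself up to the wider width so a single shuffle can
// index lanes from both. A hedged usage sketch (hypothetical values; these
// intrinsics are stable since Rust 1.59):
//     let v = vcopyq_lane_s8::<0, 7>(a, b); // v[0] = b[7], remaining lanes from `a`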
+#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_lane_s8(a: int8x16_t, b: int8x8_t) -> int8x16_t { + static_assert_uimm_bits!(LANE1, 4); + static_assert_uimm_bits!(LANE2, 3); + let b: int8x16_t = + unsafe { simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }; + unsafe { + match LANE1 & 0b1111 { + 0 => simd_shuffle!( + a, + b, + [ + 16 + LANE2 as u32, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 1 => simd_shuffle!( + a, + b, + [ + 0, + 16 + LANE2 as u32, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 2 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 16 + LANE2 as u32, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 3 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 16 + LANE2 as u32, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 4 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 16 + LANE2 as u32, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 5 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 16 + LANE2 as u32, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 6 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 16 + LANE2 as u32, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 7 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 16 + LANE2 as u32, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 8 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 16 + LANE2 as u32, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 9 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 16 + LANE2 as u32, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 10 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 16 + LANE2 as u32, + 11, + 12, + 13, + 14, + 15 + ] + ), + 11 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 16 + LANE2 as u32, + 12, + 13, + 14, + 15 + ] + ), + 12 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 16 + LANE2 as u32, + 13, + 14, + 15 + ] + ), + 13 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 16 + LANE2 as u32, + 14, + 15 + ] + ), + 14 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 16 + LANE2 as u32, + 15 + ] + ), + 15 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 16 + LANE2 as u32 + ] + ), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_lane_s16( + a: int16x8_t, + b: int16x4_t, +) -> int16x8_t { + static_assert_uimm_bits!(LANE1, 3); + static_assert_uimm_bits!(LANE2, 2); + let b: int16x8_t = unsafe { simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]) }; + unsafe { + match LANE1 & 0b111 { + 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), + 2 
=> simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), + 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), + 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), + 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), + 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_lane_s32( + a: int32x4_t, + b: int32x2_t, +) -> int32x4_t { + static_assert_uimm_bits!(LANE1, 2); + static_assert_uimm_bits!(LANE2, 1); + let b: int32x4_t = unsafe { simd_shuffle!(b, b, [0, 1, 2, 3]) }; + unsafe { + match LANE1 & 0b11 { + 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), + 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_lane_u8( + a: uint8x16_t, + b: uint8x8_t, +) -> uint8x16_t { + static_assert_uimm_bits!(LANE1, 4); + static_assert_uimm_bits!(LANE2, 3); + let b: uint8x16_t = + unsafe { simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }; + unsafe { + match LANE1 & 0b1111 { + 0 => simd_shuffle!( + a, + b, + [ + 16 + LANE2 as u32, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 1 => simd_shuffle!( + a, + b, + [ + 0, + 16 + LANE2 as u32, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 2 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 16 + LANE2 as u32, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 3 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 16 + LANE2 as u32, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 4 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 16 + LANE2 as u32, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 5 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 16 + LANE2 as u32, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 6 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 16 + LANE2 as u32, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 7 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 16 + LANE2 as u32, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 8 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 16 + LANE2 as u32, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 9 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 16 + LANE2 as u32, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 10 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, 
+ 3, + 4, + 5, + 6, + 7, + 8, + 9, + 16 + LANE2 as u32, + 11, + 12, + 13, + 14, + 15 + ] + ), + 11 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 16 + LANE2 as u32, + 12, + 13, + 14, + 15 + ] + ), + 12 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 16 + LANE2 as u32, + 13, + 14, + 15 + ] + ), + 13 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 16 + LANE2 as u32, + 14, + 15 + ] + ), + 14 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 16 + LANE2 as u32, + 15 + ] + ), + 15 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 16 + LANE2 as u32 + ] + ), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_lane_u16( + a: uint16x8_t, + b: uint16x4_t, +) -> uint16x8_t { + static_assert_uimm_bits!(LANE1, 3); + static_assert_uimm_bits!(LANE2, 2); + let b: uint16x8_t = unsafe { simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]) }; + unsafe { + match LANE1 & 0b111 { + 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), + 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), + 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), + 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), + 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_lane_u32( + a: uint32x4_t, + b: uint32x2_t, +) -> uint32x4_t { + static_assert_uimm_bits!(LANE1, 2); + static_assert_uimm_bits!(LANE2, 1); + let b: uint32x4_t = unsafe { simd_shuffle!(b, b, [0, 1, 2, 3]) }; + unsafe { + match LANE1 & 0b11 { + 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), + 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_lane_p8( + a: poly8x16_t, + b: poly8x8_t, +) -> poly8x16_t { + static_assert_uimm_bits!(LANE1, 4); + 
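    // A hedged, illustrative sketch (hypothetical helper, not part of the
    // generated bindings): inserting lane 3 of a 64-bit vector into lane 0 of
    // a 128-bit vector with vcopyq_lane_u16, defined above in this family and
    // stable since Rust 1.59.
    #[target_feature(enable = "neon")]
    fn insert_lane_sketch(a: uint16x8_t, b: uint16x4_t) -> uint16x8_t {
        // LANE1 is the destination lane in `a`, LANE2 the source lane in `b`;
        // every other lane of `a` passes through unchanged.
        vcopyq_lane_u16::<0, 3>(a, b)
    }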
static_assert_uimm_bits!(LANE2, 3); + let b: poly8x16_t = + unsafe { simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }; + unsafe { + match LANE1 & 0b1111 { + 0 => simd_shuffle!( + a, + b, + [ + 16 + LANE2 as u32, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 1 => simd_shuffle!( + a, + b, + [ + 0, + 16 + LANE2 as u32, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 2 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 16 + LANE2 as u32, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 3 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 16 + LANE2 as u32, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 4 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 16 + LANE2 as u32, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 5 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 16 + LANE2 as u32, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 6 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 16 + LANE2 as u32, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 7 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 16 + LANE2 as u32, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 8 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 16 + LANE2 as u32, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 9 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 16 + LANE2 as u32, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 10 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 16 + LANE2 as u32, + 11, + 12, + 13, + 14, + 15 + ] + ), + 11 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 16 + LANE2 as u32, + 12, + 13, + 14, + 15 + ] + ), + 12 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 16 + LANE2 as u32, + 13, + 14, + 15 + ] + ), + 13 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 16 + LANE2 as u32, + 14, + 15 + ] + ), + 14 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 16 + LANE2 as u32, + 15 + ] + ), + 15 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 16 + LANE2 as u32 + ] + ), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_lane_p16( + a: poly16x8_t, + b: poly16x4_t, +) -> poly16x8_t { + static_assert_uimm_bits!(LANE1, 3); + static_assert_uimm_bits!(LANE2, 2); + let b: poly16x8_t = unsafe { simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]) }; + unsafe { + match LANE1 & 0b111 { + 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), + 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 
6, 7]), + 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), + 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), + 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_laneq_f32( + a: float32x4_t, + b: float32x4_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE1, 2); + static_assert_uimm_bits!(LANE2, 2); + unsafe { + match LANE1 & 0b11 { + 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), + 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_laneq_f64( + a: float64x2_t, + b: float64x2_t, +) -> float64x2_t { + static_assert_uimm_bits!(LANE1, 1); + static_assert_uimm_bits!(LANE2, 1); + unsafe { + match LANE1 & 0b1 { + 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), + 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_laneq_s8( + a: int8x16_t, + b: int8x16_t, +) -> int8x16_t { + static_assert_uimm_bits!(LANE1, 4); + static_assert_uimm_bits!(LANE2, 4); + unsafe { + match LANE1 & 0b1111 { + 0 => simd_shuffle!( + a, + b, + [ + 16 + LANE2 as u32, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 1 => simd_shuffle!( + a, + b, + [ + 0, + 16 + LANE2 as u32, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 2 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 16 + LANE2 as u32, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 3 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 16 + LANE2 as u32, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 4 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 16 + LANE2 as u32, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 5 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 16 + LANE2 as u32, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 6 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 16 + LANE2 as u32, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 7 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 16 + LANE2 as u32, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 
15 + ] + ), + 8 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 16 + LANE2 as u32, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 9 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 16 + LANE2 as u32, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 10 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 16 + LANE2 as u32, + 11, + 12, + 13, + 14, + 15 + ] + ), + 11 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 16 + LANE2 as u32, + 12, + 13, + 14, + 15 + ] + ), + 12 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 16 + LANE2 as u32, + 13, + 14, + 15 + ] + ), + 13 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 16 + LANE2 as u32, + 14, + 15 + ] + ), + 14 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 16 + LANE2 as u32, + 15 + ] + ), + 15 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 16 + LANE2 as u32 + ] + ), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_laneq_s16( + a: int16x8_t, + b: int16x8_t, +) -> int16x8_t { + static_assert_uimm_bits!(LANE1, 3); + static_assert_uimm_bits!(LANE2, 3); + unsafe { + match LANE1 & 0b111 { + 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), + 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), + 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), + 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), + 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_laneq_s32( + a: int32x4_t, + b: int32x4_t, +) -> int32x4_t { + static_assert_uimm_bits!(LANE1, 2); + static_assert_uimm_bits!(LANE2, 2); + unsafe { + match LANE1 & 0b11 { + 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), + 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 
3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_laneq_s64( + a: int64x2_t, + b: int64x2_t, +) -> int64x2_t { + static_assert_uimm_bits!(LANE1, 1); + static_assert_uimm_bits!(LANE2, 1); + unsafe { + match LANE1 & 0b1 { + 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), + 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_laneq_u8( + a: uint8x16_t, + b: uint8x16_t, +) -> uint8x16_t { + static_assert_uimm_bits!(LANE1, 4); + static_assert_uimm_bits!(LANE2, 4); + unsafe { + match LANE1 & 0b1111 { + 0 => simd_shuffle!( + a, + b, + [ + 16 + LANE2 as u32, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 1 => simd_shuffle!( + a, + b, + [ + 0, + 16 + LANE2 as u32, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 2 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 16 + LANE2 as u32, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 3 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 16 + LANE2 as u32, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 4 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 16 + LANE2 as u32, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 5 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 16 + LANE2 as u32, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 6 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 16 + LANE2 as u32, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 7 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 16 + LANE2 as u32, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 8 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 16 + LANE2 as u32, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 9 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 16 + LANE2 as u32, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 10 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 16 + LANE2 as u32, + 11, + 12, + 13, + 14, + 15 + ] + ), + 11 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 16 + LANE2 as u32, + 12, + 13, + 14, + 15 + ] + ), + 12 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 16 + LANE2 as u32, + 13, + 14, + 15 + ] + ), + 13 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 16 + LANE2 as u32, + 14, + 15 + ] + ), + 14 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 16 + LANE2 as u32, + 15 + ] + ), + 15 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 16 + LANE2 as u32 + ] + ), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_u16)"] +#[inline] 
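+// Illustrative use of the `vcopyq_laneq_*` intrinsics (hypothetical values):
+// with both lane indices supplied as const generics, the call below copies
+// lane 3 of `src` into lane 0 of `dst` for `uint16x8_t` vectors.
+//
+//     let r = vcopyq_laneq_u16::<0, 3>(dst, src);
+//
+// Out-of-range lane indices are rejected at compile time by the
+// `static_assert_uimm_bits!` checks in each intrinsic.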
+#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_laneq_u16( + a: uint16x8_t, + b: uint16x8_t, +) -> uint16x8_t { + static_assert_uimm_bits!(LANE1, 3); + static_assert_uimm_bits!(LANE2, 3); + unsafe { + match LANE1 & 0b111 { + 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), + 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), + 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), + 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), + 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_laneq_u32( + a: uint32x4_t, + b: uint32x4_t, +) -> uint32x4_t { + static_assert_uimm_bits!(LANE1, 2); + static_assert_uimm_bits!(LANE2, 2); + unsafe { + match LANE1 & 0b11 { + 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), + 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_laneq_u64( + a: uint64x2_t, + b: uint64x2_t, +) -> uint64x2_t { + static_assert_uimm_bits!(LANE1, 1); + static_assert_uimm_bits!(LANE2, 1); + unsafe { + match LANE1 & 0b1 { + 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), + 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_laneq_p8( + a: poly8x16_t, + b: poly8x16_t, +) -> poly8x16_t { + static_assert_uimm_bits!(LANE1, 4); + static_assert_uimm_bits!(LANE2, 4); + unsafe { + match LANE1 & 0b1111 { + 0 => simd_shuffle!( + a, + b, + [ + 16 + LANE2 as u32, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 1 => simd_shuffle!( + a, + b, + [ + 0, + 16 + LANE2 as u32, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 2 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 16 + LANE2 as u32, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 
3 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 16 + LANE2 as u32, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 4 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 16 + LANE2 as u32, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 5 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 16 + LANE2 as u32, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 6 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 16 + LANE2 as u32, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 7 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 16 + LANE2 as u32, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 8 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 16 + LANE2 as u32, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 9 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 16 + LANE2 as u32, + 10, + 11, + 12, + 13, + 14, + 15 + ] + ), + 10 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 16 + LANE2 as u32, + 11, + 12, + 13, + 14, + 15 + ] + ), + 11 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 16 + LANE2 as u32, + 12, + 13, + 14, + 15 + ] + ), + 12 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 16 + LANE2 as u32, + 13, + 14, + 15 + ] + ), + 13 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 16 + LANE2 as u32, + 14, + 15 + ] + ), + 14 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 16 + LANE2 as u32, + 15 + ] + ), + 15 => simd_shuffle!( + a, + b, + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 16 + LANE2 as u32 + ] + ), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_laneq_p16( + a: poly16x8_t, + b: poly16x8_t, +) -> poly16x8_t { + static_assert_uimm_bits!(LANE1, 3); + static_assert_uimm_bits!(LANE2, 3); + unsafe { + match LANE1 & 0b111 { + 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), + 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), + 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), + 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), + 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), + 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopyq_laneq_p64( + a: poly64x2_t, + b: poly64x2_t, 
+) -> poly64x2_t { + static_assert_uimm_bits!(LANE1, 1); + static_assert_uimm_bits!(LANE2, 1); + unsafe { + match LANE1 & 0b1 { + 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), + 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcreate_f64(a: u64) -> float64x1_t { + unsafe { transmute(a) } +} +#[doc = "Floating-point convert"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_f32_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvt_f32_f64(a: float64x2_t) -> float32x2_t { + unsafe { simd_cast(a) } +} +#[doc = "Floating-point convert to higher precision long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_f64_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvt_f64_f32(a: float32x2_t) -> float64x2_t { + unsafe { simd_cast(a) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_f64_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(scvtf))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvt_f64_s64(a: int64x1_t) -> float64x1_t { + unsafe { simd_cast(a) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_f64_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(scvtf))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtq_f64_s64(a: int64x2_t) -> float64x2_t { + unsafe { simd_cast(a) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_f64_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ucvtf))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvt_f64_u64(a: uint64x1_t) -> float64x1_t { + unsafe { simd_cast(a) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_f64_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ucvtf))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtq_f64_u64(a: uint64x2_t) -> float64x2_t { + unsafe { simd_cast(a) } +} +#[doc = "Floating-point convert to lower precision"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_high_f16_f32)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtn2))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvt_high_f16_f32(a: float16x4_t, b: float32x4_t) -> float16x8_t { + vcombine_f16(a, vcvt_f16_f32(b)) +} +#[doc = "Floating-point convert to higher precision"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_high_f32_f16)"]
+#[inline]
+#[cfg_attr(test, assert_instr(fcvtl2))]
+#[target_feature(enable = "neon,fp16")]
+#[unstable(feature = "stdarch_neon_f16", issue = "136306")]
+pub fn vcvt_high_f32_f16(a: float16x8_t) -> float32x4_t {
+    vcvt_f32_f16(vget_high_f16(a))
+}
+#[doc = "Floating-point convert to lower precision narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_high_f32_f64)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vcvt_high_f32_f64(a: float32x2_t, b: float64x2_t) -> float32x4_t {
+    unsafe { simd_shuffle!(a, simd_cast(b), [0, 1, 2, 3]) }
+}
+#[doc = "Floating-point convert to higher precision long"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_high_f64_f32)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vcvt_high_f64_f32(a: float32x4_t) -> float64x2_t {
+    unsafe {
+        let b: float32x2_t = simd_shuffle!(a, a, [2, 3]);
+        simd_cast(b)
+    }
+}
+#[doc = "Fixed-point convert to floating-point"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_f64_s64)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(scvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vcvt_n_f64_s64<const N: i32>(a: int64x1_t) -> float64x1_t {
+    static_assert!(N >= 1 && N <= 64);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64"
+        )]
+        fn _vcvt_n_f64_s64(a: int64x1_t, n: i32) -> float64x1_t;
+    }
+    unsafe { _vcvt_n_f64_s64(a, N) }
+}
+#[doc = "Fixed-point convert to floating-point"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_f64_s64)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(scvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vcvtq_n_f64_s64<const N: i32>(a: int64x2_t) -> float64x2_t {
+    static_assert!(N >= 1 && N <= 64);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64"
+        )]
+        fn _vcvtq_n_f64_s64(a: int64x2_t, n: i32) -> float64x2_t;
+    }
+    unsafe { _vcvtq_n_f64_s64(a, N) }
+}
+#[doc = "Fixed-point convert to floating-point"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_f64_u64)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ucvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vcvt_n_f64_u64<const N: i32>(a: uint64x1_t) -> float64x1_t {
+    static_assert!(N >= 1 && N <= 64);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64"
+        )]
+        fn _vcvt_n_f64_u64(a: uint64x1_t, n: i32) -> float64x1_t;
+    }
+    unsafe { _vcvt_n_f64_u64(a, N) }
+}
+#[doc = "Fixed-point convert to floating-point"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_f64_u64)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ucvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vcvtq_n_f64_u64<const N: i32>(a: uint64x2_t) -> float64x2_t {
+    static_assert!(N >= 1 && N <= 64);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64"
+        )]
+        fn _vcvtq_n_f64_u64(a: uint64x2_t, n: i32) -> float64x2_t;
+    }
+    unsafe { _vcvtq_n_f64_u64(a, N) }
+}
+#[doc = "Floating-point convert to fixed-point, rounding toward zero"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_s64_f64)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzs, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vcvt_n_s64_f64<const N: i32>(a: float64x1_t) -> int64x1_t {
+    static_assert!(N >= 1 && N <= 64);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64"
+        )]
+        fn _vcvt_n_s64_f64(a: float64x1_t, n: i32) -> int64x1_t;
+    }
+    unsafe { _vcvt_n_s64_f64(a, N) }
+}
+#[doc = "Floating-point convert to fixed-point, rounding toward zero"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_s64_f64)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzs, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vcvtq_n_s64_f64<const N: i32>(a: float64x2_t) -> int64x2_t {
+    static_assert!(N >= 1 && N <= 64);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64"
+        )]
+        fn _vcvtq_n_s64_f64(a: float64x2_t, n: i32) -> int64x2_t;
+    }
+    unsafe { _vcvtq_n_s64_f64(a, N) }
+}
+#[doc = "Floating-point convert to fixed-point, rounding toward zero"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_u64_f64)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vcvt_n_u64_f64<const N: i32>(a: float64x1_t) -> uint64x1_t {
+    static_assert!(N >= 1 && N <= 64);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64"
+        )]
+        fn _vcvt_n_u64_f64(a: float64x1_t, n: i32) -> uint64x1_t;
+    }
+    unsafe { _vcvt_n_u64_f64(a, N) }
+}
+#[doc = "Floating-point convert to fixed-point, rounding toward zero"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_u64_f64)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vcvtq_n_u64_f64<const N: i32>(a: float64x2_t) -> uint64x2_t {
+    static_assert!(N >= 1 && N <= 64);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64"
+        )]
+        fn _vcvtq_n_u64_f64(a: float64x2_t, n: i32) -> uint64x2_t;
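+        // `N` is the number of fractional bits: the input is scaled by 2^N
+        // before the truncating convert, so with a hypothetical lane value of
+        // 1.75, `vcvtq_n_u64_f64::<2>` would produce 7. The static_assert
+        // above keeps `N` in the architecturally valid range 1..=64.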
+ } + unsafe { _vcvtq_n_u64_f64(a, N) } +} +#[doc = "Floating-point convert to signed fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtzs))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvt_s64_f64(a: float64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.fptosi.sat.v1i64.v1f64" + )] + fn _vcvt_s64_f64(a: float64x1_t) -> int64x1_t; + } + unsafe { _vcvt_s64_f64(a) } +} +#[doc = "Floating-point convert to signed fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtzs))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtq_s64_f64(a: float64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.fptosi.sat.v2i64.v2f64" + )] + fn _vcvtq_s64_f64(a: float64x2_t) -> int64x2_t; + } + unsafe { _vcvtq_s64_f64(a) } +} +#[doc = "Floating-point convert to unsigned fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtzu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvt_u64_f64(a: float64x1_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.fptoui.sat.v1i64.v1f64" + )] + fn _vcvt_u64_f64(a: float64x1_t) -> uint64x1_t; + } + unsafe { _vcvt_u64_f64(a) } +} +#[doc = "Floating-point convert to unsigned fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtzu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtq_u64_f64(a: float64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.fptoui.sat.v2i64.v2f64" + )] + fn _vcvtq_u64_f64(a: float64x2_t) -> uint64x2_t; + } + unsafe { _vcvtq_u64_f64(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvta_s16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtas))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvta_s16_f16(a: float16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtas.v4i16.v4f16" + )] + fn _vcvta_s16_f16(a: float16x4_t) -> int16x4_t; + } + unsafe { _vcvta_s16_f16(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtaq_s16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtas))] +#[target_feature(enable = "neon,fp16")] 
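+// Rounding behaviour of the `vcvta*` family (FCVTAS/FCVTAU): to nearest with
+// ties rounded away from zero. With hypothetical scalar inputs, for example:
+//
+//     assert_eq!(vcvtas_s32_f32(2.5), 3);
+//     assert_eq!(vcvtas_s32_f32(-2.5), -3);
+//     assert_eq!(vcvtas_u32_f32(-2.5), 0); // unsigned result saturates at 0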
+#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtaq_s16_f16(a: float16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtas.v8i16.v8f16" + )] + fn _vcvtaq_s16_f16(a: float16x8_t) -> int16x8_t; + } + unsafe { _vcvtaq_s16_f16(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvta_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtas))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvta_s32_f32(a: float32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtas.v2i32.v2f32" + )] + fn _vcvta_s32_f32(a: float32x2_t) -> int32x2_t; + } + unsafe { _vcvta_s32_f32(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtaq_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtas))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtaq_s32_f32(a: float32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtas.v4i32.v4f32" + )] + fn _vcvtaq_s32_f32(a: float32x4_t) -> int32x4_t; + } + unsafe { _vcvtaq_s32_f32(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvta_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtas))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvta_s64_f64(a: float64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtas.v1i64.v1f64" + )] + fn _vcvta_s64_f64(a: float64x1_t) -> int64x1_t; + } + unsafe { _vcvta_s64_f64(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtaq_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtas))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtaq_s64_f64(a: float64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtas.v2i64.v2f64" + )] + fn _vcvtaq_s64_f64(a: float64x2_t) -> int64x2_t; + } + unsafe { _vcvtaq_s64_f64(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvta_u16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtau))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvta_u16_f16(a: float16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = 
"llvm.aarch64.neon.fcvtau.v4i16.v4f16" + )] + fn _vcvta_u16_f16(a: float16x4_t) -> uint16x4_t; + } + unsafe { _vcvta_u16_f16(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtaq_u16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtau))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtaq_u16_f16(a: float16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtau.v8i16.v8f16" + )] + fn _vcvtaq_u16_f16(a: float16x8_t) -> uint16x8_t; + } + unsafe { _vcvtaq_u16_f16(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvta_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtau))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvta_u32_f32(a: float32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtau.v2i32.v2f32" + )] + fn _vcvta_u32_f32(a: float32x2_t) -> uint32x2_t; + } + unsafe { _vcvta_u32_f32(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtaq_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtau))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtaq_u32_f32(a: float32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtau.v4i32.v4f32" + )] + fn _vcvtaq_u32_f32(a: float32x4_t) -> uint32x4_t; + } + unsafe { _vcvtaq_u32_f32(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvta_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtau))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvta_u64_f64(a: float64x1_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtau.v1i64.v1f64" + )] + fn _vcvta_u64_f64(a: float64x1_t) -> uint64x1_t; + } + unsafe { _vcvta_u64_f64(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtaq_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtau))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtaq_u64_f64(a: float64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtau.v2i64.v2f64" + )] + fn _vcvtaq_u64_f64(a: float64x2_t) -> uint64x2_t; + } + unsafe { _vcvtaq_u64_f64(a) } +} +#[doc = "Floating-point convert to integer, rounding to nearest with ties to away"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtah_s16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtas))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtah_s16_f16(a: f16) -> i16 { + vcvtah_s32_f16(a) as i16 +} +#[doc = "Floating-point convert to integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtah_s32_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtas))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtah_s32_f16(a: f16) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtas.i32.f16" + )] + fn _vcvtah_s32_f16(a: f16) -> i32; + } + unsafe { _vcvtah_s32_f16(a) } +} +#[doc = "Floating-point convert to integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtah_s64_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtas))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtah_s64_f16(a: f16) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtas.i64.f16" + )] + fn _vcvtah_s64_f16(a: f16) -> i64; + } + unsafe { _vcvtah_s64_f16(a) } +} +#[doc = "Floating-point convert to integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtah_u16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtau))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtah_u16_f16(a: f16) -> u16 { + vcvtah_u32_f16(a) as u16 +} +#[doc = "Floating-point convert to integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtah_u32_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtau))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtah_u32_f16(a: f16) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtau.i32.f16" + )] + fn _vcvtah_u32_f16(a: f16) -> u32; + } + unsafe { _vcvtah_u32_f16(a) } +} +#[doc = "Floating-point convert to integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtah_u64_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtau))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtah_u64_f16(a: f16) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtau.i64.f16" + )] + fn _vcvtah_u64_f16(a: f16) -> u64; + } + unsafe { _vcvtah_u64_f16(a) } +} +#[doc = "Floating-point convert to integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtas_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, 
assert_instr(fcvtas))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtas_s32_f32(a: f32) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtas.i32.f32" + )] + fn _vcvtas_s32_f32(a: f32) -> i32; + } + unsafe { _vcvtas_s32_f32(a) } +} +#[doc = "Floating-point convert to integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtad_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtas))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtad_s64_f64(a: f64) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtas.i64.f64" + )] + fn _vcvtad_s64_f64(a: f64) -> i64; + } + unsafe { _vcvtad_s64_f64(a) } +} +#[doc = "Floating-point convert to integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtas_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtau))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtas_u32_f32(a: f32) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtau.i32.f32" + )] + fn _vcvtas_u32_f32(a: f32) -> u32; + } + unsafe { _vcvtas_u32_f32(a) } +} +#[doc = "Floating-point convert to integer, rounding to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtad_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtau))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtad_u64_f64(a: f64) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtau.i64.f64" + )] + fn _vcvtad_u64_f64(a: f64) -> u64; + } + unsafe { _vcvtad_u64_f64(a) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtd_f64_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(scvtf))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtd_f64_s64(a: i64) -> f64 { + a as f64 +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvts_f32_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(scvtf))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvts_f32_s32(a: i32) -> f32 { + a as f32 +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_f16_s16)"] +#[inline] +#[cfg_attr(test, assert_instr(scvtf))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_f16_s16(a: i16) -> f16 { + a as f16 +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_f16_s32)"] +#[inline] +#[cfg_attr(test, assert_instr(scvtf))] 
+#[target_feature(enable = "neon,fp16")]
+#[unstable(feature = "stdarch_neon_f16", issue = "136306")]
+pub fn vcvth_f16_s32(a: i32) -> f16 {
+    a as f16
+}
+#[doc = "Fixed-point convert to floating-point"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_f16_s64)"]
+#[inline]
+#[cfg_attr(test, assert_instr(scvtf))]
+#[target_feature(enable = "neon,fp16")]
+#[unstable(feature = "stdarch_neon_f16", issue = "136306")]
+pub fn vcvth_f16_s64(a: i64) -> f16 {
+    a as f16
+}
+#[doc = "Unsigned fixed-point convert to floating-point"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_f16_u16)"]
+#[inline]
+#[cfg_attr(test, assert_instr(ucvtf))]
+#[target_feature(enable = "neon,fp16")]
+#[unstable(feature = "stdarch_neon_f16", issue = "136306")]
+pub fn vcvth_f16_u16(a: u16) -> f16 {
+    a as f16
+}
+#[doc = "Unsigned fixed-point convert to floating-point"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_f16_u32)"]
+#[inline]
+#[cfg_attr(test, assert_instr(ucvtf))]
+#[target_feature(enable = "neon,fp16")]
+#[unstable(feature = "stdarch_neon_f16", issue = "136306")]
+pub fn vcvth_f16_u32(a: u32) -> f16 {
+    a as f16
+}
+#[doc = "Unsigned fixed-point convert to floating-point"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_f16_u64)"]
+#[inline]
+#[cfg_attr(test, assert_instr(ucvtf))]
+#[target_feature(enable = "neon,fp16")]
+#[unstable(feature = "stdarch_neon_f16", issue = "136306")]
+pub fn vcvth_f16_u64(a: u64) -> f16 {
+    a as f16
+}
+#[doc = "Fixed-point convert to floating-point"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_n_f16_s16)"]
+#[inline]
+#[cfg_attr(test, assert_instr(scvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[target_feature(enable = "neon,fp16")]
+#[unstable(feature = "stdarch_neon_f16", issue = "136306")]
+pub fn vcvth_n_f16_s16<const N: i32>(a: i16) -> f16 {
+    static_assert!(N >= 1 && N <= 16);
+    vcvth_n_f16_s32::<N>(a as i32) as f16
+}
+#[doc = "Fixed-point convert to floating-point"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_n_f16_s32)"]
+#[inline]
+#[cfg_attr(test, assert_instr(scvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[target_feature(enable = "neon,fp16")]
+#[unstable(feature = "stdarch_neon_f16", issue = "136306")]
+pub fn vcvth_n_f16_s32<const N: i32>(a: i32) -> f16 {
+    static_assert!(N >= 1 && N <= 16);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.vcvtfxs2fp.f16.i32"
+        )]
+        fn _vcvth_n_f16_s32(a: i32, n: i32) -> f16;
+    }
+    unsafe { _vcvth_n_f16_s32(a, N) }
+}
+#[doc = "Fixed-point convert to floating-point"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_n_f16_s64)"]
+#[inline]
+#[cfg_attr(test, assert_instr(scvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[target_feature(enable = "neon,fp16")]
+#[unstable(feature = "stdarch_neon_f16", issue = "136306")]
+pub fn vcvth_n_f16_s64<const N: i32>(a: i64) -> f16 {
+    static_assert!(N >= 1 && N <= 16);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.vcvtfxs2fp.f16.i64"
+        )]
+        fn _vcvth_n_f16_s64(a: i64, n: i32) -> f16;
+    }
+    unsafe { _vcvth_n_f16_s64(a, N) }
+}
+#[doc = "Fixed-point convert to floating-point"]
"Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_n_f16_u16)"] +#[inline] +#[cfg_attr(test, assert_instr(ucvtf, N = 2))] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_n_f16_u16(a: u16) -> f16 { + static_assert!(N >= 1 && N <= 16); + vcvth_n_f16_u32::(a as u32) as f16 +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_n_f16_u32)"] +#[inline] +#[cfg_attr(test, assert_instr(ucvtf, N = 2))] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_n_f16_u32(a: u32) -> f16 { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfxu2fp.f16.i32" + )] + fn _vcvth_n_f16_u32(a: u32, n: i32) -> f16; + } + unsafe { _vcvth_n_f16_u32(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_n_f16_u64)"] +#[inline] +#[cfg_attr(test, assert_instr(ucvtf, N = 2))] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_n_f16_u64(a: u64) -> f16 { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfxu2fp.f16.i64" + )] + fn _vcvth_n_f16_u64(a: u64, n: i32) -> f16; + } + unsafe { _vcvth_n_f16_u64(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_n_s16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtzs, N = 2))] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_n_s16_f16(a: f16) -> i16 { + static_assert!(N >= 1 && N <= 16); + vcvth_n_s32_f16::(a) as i16 +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_n_s32_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtzs, N = 2))] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_n_s32_f16(a: f16) -> i32 { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxs.i32.f16" + )] + fn _vcvth_n_s32_f16(a: f16, n: i32) -> i32; + } + unsafe { _vcvth_n_s32_f16(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_n_s64_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtzs, N = 2))] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_n_s64_f16(a: f16) -> i64 { + static_assert!(N >= 1 && N <= 16); + unsafe extern 
"unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxs.i64.f16" + )] + fn _vcvth_n_s64_f16(a: f16, n: i32) -> i64; + } + unsafe { _vcvth_n_s64_f16(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_n_u16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtzu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_n_u16_f16(a: f16) -> u16 { + static_assert!(N >= 1 && N <= 16); + vcvth_n_u32_f16::(a) as u16 +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_n_u32_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtzu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_n_u32_f16(a: f16) -> u32 { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxu.i32.f16" + )] + fn _vcvth_n_u32_f16(a: f16, n: i32) -> u32; + } + unsafe { _vcvth_n_u32_f16(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_n_u64_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtzu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_n_u64_f16(a: f16) -> u64 { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxu.i64.f16" + )] + fn _vcvth_n_u64_f16(a: f16, n: i32) -> u64; + } + unsafe { _vcvth_n_u64_f16(a, N) } +} +#[doc = "Floating-point convert to signed fixed-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_s16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtzs))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_s16_f16(a: f16) -> i16 { + a as i16 +} +#[doc = "Floating-point convert to signed fixed-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_s32_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtzs))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_s32_f16(a: f16) -> i32 { + a as i32 +} +#[doc = "Floating-point convert to signed fixed-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_s64_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtzs))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_s64_f16(a: f16) -> i64 { + a as i64 +} +#[doc = "Floating-point convert to unsigned fixed-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_u16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtzu))] +#[target_feature(enable 
= "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_u16_f16(a: f16) -> u16 { + a as u16 +} +#[doc = "Floating-point convert to unsigned fixed-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_u32_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtzu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_u32_f16(a: f16) -> u32 { + a as u32 +} +#[doc = "Floating-point convert to unsigned fixed-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvth_u64_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtzu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvth_u64_f16(a: f16) -> u64 { + a as u64 +} +#[doc = "Floating-point convert to signed integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtm_s16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtms))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtm_s16_f16(a: float16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtms.v4i16.v4f16" + )] + fn _vcvtm_s16_f16(a: float16x4_t) -> int16x4_t; + } + unsafe { _vcvtm_s16_f16(a) } +} +#[doc = "Floating-point convert to signed integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtmq_s16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtms))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtmq_s16_f16(a: float16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtms.v8i16.v8f16" + )] + fn _vcvtmq_s16_f16(a: float16x8_t) -> int16x8_t; + } + unsafe { _vcvtmq_s16_f16(a) } +} +#[doc = "Floating-point convert to signed integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtm_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtms))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtm_s32_f32(a: float32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtms.v2i32.v2f32" + )] + fn _vcvtm_s32_f32(a: float32x2_t) -> int32x2_t; + } + unsafe { _vcvtm_s32_f32(a) } +} +#[doc = "Floating-point convert to signed integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtmq_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtms))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtmq_s32_f32(a: float32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtms.v4i32.v4f32" + )] + fn _vcvtmq_s32_f32(a: float32x4_t) -> int32x4_t; + } + unsafe { _vcvtmq_s32_f32(a) } +} +#[doc = "Floating-point convert to 
signed integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtm_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtms))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtm_s64_f64(a: float64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtms.v1i64.v1f64" + )] + fn _vcvtm_s64_f64(a: float64x1_t) -> int64x1_t; + } + unsafe { _vcvtm_s64_f64(a) } +} +#[doc = "Floating-point convert to signed integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtmq_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtms))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtmq_s64_f64(a: float64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtms.v2i64.v2f64" + )] + fn _vcvtmq_s64_f64(a: float64x2_t) -> int64x2_t; + } + unsafe { _vcvtmq_s64_f64(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtm_u16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtmu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtm_u16_f16(a: float16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtmu.v4i16.v4f16" + )] + fn _vcvtm_u16_f16(a: float16x4_t) -> uint16x4_t; + } + unsafe { _vcvtm_u16_f16(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtmq_u16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtmu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtmq_u16_f16(a: float16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtmu.v8i16.v8f16" + )] + fn _vcvtmq_u16_f16(a: float16x8_t) -> uint16x8_t; + } + unsafe { _vcvtmq_u16_f16(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtm_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtmu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtm_u32_f32(a: float32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtmu.v2i32.v2f32" + )] + fn _vcvtm_u32_f32(a: float32x2_t) -> uint32x2_t; + } + unsafe { _vcvtm_u32_f32(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtmq_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtmu))] 
+#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtmq_u32_f32(a: float32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtmu.v4i32.v4f32" + )] + fn _vcvtmq_u32_f32(a: float32x4_t) -> uint32x4_t; + } + unsafe { _vcvtmq_u32_f32(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtm_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtmu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtm_u64_f64(a: float64x1_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtmu.v1i64.v1f64" + )] + fn _vcvtm_u64_f64(a: float64x1_t) -> uint64x1_t; + } + unsafe { _vcvtm_u64_f64(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtmq_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtmu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtmq_u64_f64(a: float64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtmu.v2i64.v2f64" + )] + fn _vcvtmq_u64_f64(a: float64x2_t) -> uint64x2_t; + } + unsafe { _vcvtmq_u64_f64(a) } +} +#[doc = "Floating-point convert to integer, rounding towards minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtmh_s16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtms))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtmh_s16_f16(a: f16) -> i16 { + vcvtmh_s32_f16(a) as i16 +} +#[doc = "Floating-point convert to integer, rounding towards minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtmh_s32_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtms))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtmh_s32_f16(a: f16) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtms.i32.f16" + )] + fn _vcvtmh_s32_f16(a: f16) -> i32; + } + unsafe { _vcvtmh_s32_f16(a) } +} +#[doc = "Floating-point convert to integer, rounding towards minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtmh_s64_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtms))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtmh_s64_f16(a: f16) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtms.i64.f16" + )] + fn _vcvtmh_s64_f16(a: f16) -> i64; + } + unsafe { _vcvtmh_s64_f16(a) } +} +#[doc = "Floating-point convert to integer, rounding towards minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtmh_u16_f16)"] +#[inline] 
+#[cfg_attr(test, assert_instr(fcvtmu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtmh_u16_f16(a: f16) -> u16 { + vcvtmh_u32_f16(a) as u16 +} +#[doc = "Floating-point convert to unsigned integer, rounding towards minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtmh_u32_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtmu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtmh_u32_f16(a: f16) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtmu.i32.f16" + )] + fn _vcvtmh_u32_f16(a: f16) -> u32; + } + unsafe { _vcvtmh_u32_f16(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding towards minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtmh_u64_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtmu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtmh_u64_f16(a: f16) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtmu.i64.f16" + )] + fn _vcvtmh_u64_f16(a: f16) -> u64; + } + unsafe { _vcvtmh_u64_f16(a) } +} +#[doc = "Floating-point convert to signed integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtms_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtms))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtms_s32_f32(a: f32) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtms.i32.f32" + )] + fn _vcvtms_s32_f32(a: f32) -> i32; + } + unsafe { _vcvtms_s32_f32(a) } +} +#[doc = "Floating-point convert to signed integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtmd_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtms))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtmd_s64_f64(a: f64) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtms.i64.f64" + )] + fn _vcvtmd_s64_f64(a: f64) -> i64; + } + unsafe { _vcvtmd_s64_f64(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtms_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtmu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtms_u32_f32(a: f32) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtmu.i32.f32" + )] + fn _vcvtms_u32_f32(a: f32) -> u32; + } + unsafe { _vcvtms_u32_f32(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtmd_u64_f64)"] 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtmu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtmd_u64_f64(a: f64) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtmu.i64.f64" + )] + fn _vcvtmd_u64_f64(a: f64) -> u64; + } + unsafe { _vcvtmd_u64_f64(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtn_s16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtns))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtn_s16_f16(a: float16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtns.v4i16.v4f16" + )] + fn _vcvtn_s16_f16(a: float16x4_t) -> int16x4_t; + } + unsafe { _vcvtn_s16_f16(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtnq_s16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtns))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtnq_s16_f16(a: float16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtns.v8i16.v8f16" + )] + fn _vcvtnq_s16_f16(a: float16x8_t) -> int16x8_t; + } + unsafe { _vcvtnq_s16_f16(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtn_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtns))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtn_s32_f32(a: float32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtns.v2i32.v2f32" + )] + fn _vcvtn_s32_f32(a: float32x2_t) -> int32x2_t; + } + unsafe { _vcvtn_s32_f32(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtnq_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtns))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtnq_s32_f32(a: float32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtns.v4i32.v4f32" + )] + fn _vcvtnq_s32_f32(a: float32x4_t) -> int32x4_t; + } + unsafe { _vcvtnq_s32_f32(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtn_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtns))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtn_s64_f64(a: float64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", 
target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtns.v1i64.v1f64" + )] + fn _vcvtn_s64_f64(a: float64x1_t) -> int64x1_t; + } + unsafe { _vcvtn_s64_f64(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtnq_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtns))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtnq_s64_f64(a: float64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtns.v2i64.v2f64" + )] + fn _vcvtnq_s64_f64(a: float64x2_t) -> int64x2_t; + } + unsafe { _vcvtnq_s64_f64(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtn_u16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtnu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtn_u16_f16(a: float16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtnu.v4i16.v4f16" + )] + fn _vcvtn_u16_f16(a: float16x4_t) -> uint16x4_t; + } + unsafe { _vcvtn_u16_f16(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtnq_u16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtnu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtnq_u16_f16(a: float16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtnu.v8i16.v8f16" + )] + fn _vcvtnq_u16_f16(a: float16x8_t) -> uint16x8_t; + } + unsafe { _vcvtnq_u16_f16(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtn_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtnu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtn_u32_f32(a: float32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtnu.v2i32.v2f32" + )] + fn _vcvtn_u32_f32(a: float32x2_t) -> uint32x2_t; + } + unsafe { _vcvtn_u32_f32(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtnq_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtnu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtnq_u32_f32(a: float32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtnu.v4i32.v4f32" + )] + fn _vcvtnq_u32_f32(a: float32x4_t) -> uint32x4_t; + } + unsafe { _vcvtnq_u32_f32(a) } +} +#[doc = "Floating-point convert to unsigned integer, 
rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtn_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtnu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtn_u64_f64(a: float64x1_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtnu.v1i64.v1f64" + )] + fn _vcvtn_u64_f64(a: float64x1_t) -> uint64x1_t; + } + unsafe { _vcvtn_u64_f64(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtnq_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtnu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtnq_u64_f64(a: float64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtnu.v2i64.v2f64" + )] + fn _vcvtnq_u64_f64(a: float64x2_t) -> uint64x2_t; + } + unsafe { _vcvtnq_u64_f64(a) } +} +#[doc = "Floating-point convert to integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtnh_s16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtns))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtnh_s16_f16(a: f16) -> i16 { + vcvtnh_s32_f16(a) as i16 +} +#[doc = "Floating-point convert to integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtnh_s32_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtns))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtnh_s32_f16(a: f16) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtns.i32.f16" + )] + fn _vcvtnh_s32_f16(a: f16) -> i32; + } + unsafe { _vcvtnh_s32_f16(a) } +} +#[doc = "Floating-point convert to integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtnh_s64_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtns))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtnh_s64_f16(a: f16) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtns.i64.f16" + )] + fn _vcvtnh_s64_f16(a: f16) -> i64; + } + unsafe { _vcvtnh_s64_f16(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtnh_u16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtnu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtnh_u16_f16(a: f16) -> u16 { + vcvtnh_u32_f16(a) as u16 +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to even"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtnh_u32_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtnu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtnh_u32_f16(a: f16) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtnu.i32.f16" + )] + fn _vcvtnh_u32_f16(a: f16) -> u32; + } + unsafe { _vcvtnh_u32_f16(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtnh_u64_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtnu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtnh_u64_f16(a: f16) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtnu.i64.f16" + )] + fn _vcvtnh_u64_f16(a: f16) -> u64; + } + unsafe { _vcvtnh_u64_f16(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtns_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtns))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtns_s32_f32(a: f32) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtns.i32.f32" + )] + fn _vcvtns_s32_f32(a: f32) -> i32; + } + unsafe { _vcvtns_s32_f32(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtnd_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtns))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtnd_s64_f64(a: f64) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtns.i64.f64" + )] + fn _vcvtnd_s64_f64(a: f64) -> i64; + } + unsafe { _vcvtnd_s64_f64(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtns_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtnu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtns_u32_f32(a: f32) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtnu.i32.f32" + )] + fn _vcvtns_u32_f32(a: f32) -> u32; + } + unsafe { _vcvtns_u32_f32(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtnd_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtnu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtnd_u64_f64(a: f64) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + 
link_name = "llvm.aarch64.neon.fcvtnu.i64.f64" + )] + fn _vcvtnd_u64_f64(a: f64) -> u64; + } + unsafe { _vcvtnd_u64_f64(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtp_s16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtps))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtp_s16_f16(a: float16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtps.v4i16.v4f16" + )] + fn _vcvtp_s16_f16(a: float16x4_t) -> int16x4_t; + } + unsafe { _vcvtp_s16_f16(a) } +} +#[doc = "Floating-point convert to signed integer, rounding to plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtpq_s16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtps))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtpq_s16_f16(a: float16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtps.v8i16.v8f16" + )] + fn _vcvtpq_s16_f16(a: float16x8_t) -> int16x8_t; + } + unsafe { _vcvtpq_s16_f16(a) } +} +#[doc = "Floating-point convert to signed integer, rounding toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtp_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtps))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtp_s32_f32(a: float32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtps.v2i32.v2f32" + )] + fn _vcvtp_s32_f32(a: float32x2_t) -> int32x2_t; + } + unsafe { _vcvtp_s32_f32(a) } +} +#[doc = "Floating-point convert to signed integer, rounding toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtpq_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtps))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtpq_s32_f32(a: float32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtps.v4i32.v4f32" + )] + fn _vcvtpq_s32_f32(a: float32x4_t) -> int32x4_t; + } + unsafe { _vcvtpq_s32_f32(a) } +} +#[doc = "Floating-point convert to signed integer, rounding toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtp_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtps))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtp_s64_f64(a: float64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtps.v1i64.v1f64" + )] + fn _vcvtp_s64_f64(a: float64x1_t) -> int64x1_t; + } + unsafe { _vcvtp_s64_f64(a) } +} +#[doc = "Floating-point convert to signed integer, rounding toward plus infinity"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtpq_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtps))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtpq_s64_f64(a: float64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtps.v2i64.v2f64" + )] + fn _vcvtpq_s64_f64(a: float64x2_t) -> int64x2_t; + } + unsafe { _vcvtpq_s64_f64(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtp_u16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtpu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtp_u16_f16(a: float16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtpu.v4i16.v4f16" + )] + fn _vcvtp_u16_f16(a: float16x4_t) -> uint16x4_t; + } + unsafe { _vcvtp_u16_f16(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtpq_u16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtpu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtpq_u16_f16(a: float16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtpu.v8i16.v8f16" + )] + fn _vcvtpq_u16_f16(a: float16x8_t) -> uint16x8_t; + } + unsafe { _vcvtpq_u16_f16(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtp_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtpu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtp_u32_f32(a: float32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtpu.v2i32.v2f32" + )] + fn _vcvtp_u32_f32(a: float32x2_t) -> uint32x2_t; + } + unsafe { _vcvtp_u32_f32(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtpq_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtpu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtpq_u32_f32(a: float32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtpu.v4i32.v4f32" + )] + fn _vcvtpq_u32_f32(a: float32x4_t) -> uint32x4_t; + } + unsafe { _vcvtpq_u32_f32(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtp_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtpu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtp_u64_f64(a: 
float64x1_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtpu.v1i64.v1f64" + )] + fn _vcvtp_u64_f64(a: float64x1_t) -> uint64x1_t; + } + unsafe { _vcvtp_u64_f64(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtpq_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtpu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtpq_u64_f64(a: float64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtpu.v2i64.v2f64" + )] + fn _vcvtpq_u64_f64(a: float64x2_t) -> uint64x2_t; + } + unsafe { _vcvtpq_u64_f64(a) } +} +#[doc = "Floating-point convert to integer, rounding to plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtph_s16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtps))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtph_s16_f16(a: f16) -> i16 { + vcvtph_s32_f16(a) as i16 +} +#[doc = "Floating-point convert to integer, rounding to plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtph_s32_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtps))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtph_s32_f16(a: f16) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtps.i32.f16" + )] + fn _vcvtph_s32_f16(a: f16) -> i32; + } + unsafe { _vcvtph_s32_f16(a) } +} +#[doc = "Floating-point convert to integer, rounding to plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtph_s64_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtps))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtph_s64_f16(a: f16) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtps.i64.f16" + )] + fn _vcvtph_s64_f16(a: f16) -> i64; + } + unsafe { _vcvtph_s64_f16(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtph_u16_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtpu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtph_u16_f16(a: f16) -> u16 { + vcvtph_u32_f16(a) as u16 +} +#[doc = "Floating-point convert to unsigned integer, rounding to plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtph_u32_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtpu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtph_u32_f16(a: f16) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtpu.i32.f16" + 
)] + fn _vcvtph_u32_f16(a: f16) -> u32; + } + unsafe { _vcvtph_u32_f16(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding to plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtph_u64_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fcvtpu))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtph_u64_f16(a: f16) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtpu.i64.f16" + )] + fn _vcvtph_u64_f16(a: f16) -> u64; + } + unsafe { _vcvtph_u64_f16(a) } +} +#[doc = "Floating-point convert to signed integer, rounding toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtps_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtps))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtps_s32_f32(a: f32) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtps.i32.f32" + )] + fn _vcvtps_s32_f32(a: f32) -> i32; + } + unsafe { _vcvtps_s32_f32(a) } +} +#[doc = "Floating-point convert to signed integer, rounding toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtpd_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtps))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtpd_s64_f64(a: f64) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtps.i64.f64" + )] + fn _vcvtpd_s64_f64(a: f64) -> i64; + } + unsafe { _vcvtpd_s64_f64(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtps_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtpu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtps_u32_f32(a: f32) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtpu.i32.f32" + )] + fn _vcvtps_u32_f32(a: f32) -> u32; + } + unsafe { _vcvtps_u32_f32(a) } +} +#[doc = "Floating-point convert to unsigned integer, rounding toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtpd_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtpu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtpd_u64_f64(a: f64) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtpu.i64.f64" + )] + fn _vcvtpd_u64_f64(a: f64) -> u64; + } + unsafe { _vcvtpd_u64_f64(a) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvts_f32_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ucvtf))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvts_f32_u32(a: u32) -> f32 { + a as f32 +} 
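Aside (illustration only, not part of the patch): the `_n_` fixed-point converts in this file take the fraction width as a const generic parameter, restored above as `<const N: i32>`; the value argument is the only runtime parameter. A minimal usage sketch follows, assuming an aarch64 target where the `neon` feature is enabled by default; the Q16.16 format, the helper name `q16_roundtrip`, and the sample value are illustrative assumptions, not part of the stdarch API.

    use std::arch::aarch64::{vcvts_n_f32_s32, vcvts_n_s32_f32};

    fn q16_roundtrip() {
        // 3.25 encoded as signed Q16.16 fixed point.
        let q: i32 = (3.25f32 * 65_536.0) as i32;
        // SCVTF #16: interpret `q` with 16 fractional bits, giving 3.25_f32.
        let f: f32 = vcvts_n_f32_s32::<16>(q);
        // FCVTZS #16: convert back to Q16.16, rounding toward zero.
        assert_eq!(vcvts_n_s32_f32::<16>(f), q);
    }
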
+#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtd_f64_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ucvtf))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtd_f64_u64(a: u64) -> f64 { + a as f64 +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvts_n_f32_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(scvtf, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvts_n_f32_s32(a: i32) -> f32 { + static_assert!(N >= 1 && N <= 64); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfxs2fp.f32.i32" + )] + fn _vcvts_n_f32_s32(a: i32, n: i32) -> f32; + } + unsafe { _vcvts_n_f32_s32(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtd_n_f64_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(scvtf, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtd_n_f64_s64(a: i64) -> f64 { + static_assert!(N >= 1 && N <= 64); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfxs2fp.f64.i64" + )] + fn _vcvtd_n_f64_s64(a: i64, n: i32) -> f64; + } + unsafe { _vcvtd_n_f64_s64(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvts_n_f32_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ucvtf, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvts_n_f32_u32(a: u32) -> f32 { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfxu2fp.f32.i32" + )] + fn _vcvts_n_f32_u32(a: u32, n: i32) -> f32; + } + unsafe { _vcvts_n_f32_u32(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtd_n_f64_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ucvtf, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtd_n_f64_u64(a: u64) -> f64 { + static_assert!(N >= 1 && N <= 64); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfxu2fp.f64.i64" + )] + fn _vcvtd_n_f64_u64(a: u64, n: i32) -> f64; + } + unsafe { _vcvtd_n_f64_u64(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvts_n_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtzs, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvts_n_s32_f32(a: f32) -> i32 { + static_assert!(N >= 1 && N <= 32); 
+ unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxs.i32.f32" + )] + fn _vcvts_n_s32_f32(a: f32, n: i32) -> i32; + } + unsafe { _vcvts_n_s32_f32(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtd_n_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtzs, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtd_n_s64_f64<const N: i32>(a: f64) -> i64 { + static_assert!(N >= 1 && N <= 64); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxs.i64.f64" + )] + fn _vcvtd_n_s64_f64(a: f64, n: i32) -> i64; + } + unsafe { _vcvtd_n_s64_f64(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvts_n_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtzu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvts_n_u32_f32<const N: i32>(a: f32) -> u32 { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxu.i32.f32" + )] + fn _vcvts_n_u32_f32(a: f32, n: i32) -> u32; + } + unsafe { _vcvts_n_u32_f32(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtd_n_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtzu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtd_n_u64_f64<const N: i32>(a: f64) -> u64 { + static_assert!(N >= 1 && N <= 64); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxu.i64.f64" + )] + fn _vcvtd_n_u64_f64(a: f64, n: i32) -> u64; + } + unsafe { _vcvtd_n_u64_f64(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvts_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtzs))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvts_s32_f32(a: f32) -> i32 { + a as i32 +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtd_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtzs))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtd_s64_f64(a: f64) -> i64 { + a as i64 +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvts_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtzu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvts_u32_f32(a: f32) -> u32 { + a as u32 +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtd_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtzu))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtd_u64_f64(a: f64) -> u64 { + a as u64 +} +#[doc = "Floating-point convert to lower precision narrow, rounding to odd"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtx_f32_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtxn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtx_f32_f64(a: float64x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fcvtxn.v2f32.v2f64" + )] + fn _vcvtx_f32_f64(a: float64x2_t) -> float32x2_t; + } + unsafe { _vcvtx_f32_f64(a) } +} +#[doc = "Floating-point convert to lower precision narrow, rounding to odd"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtx_high_f32_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtxn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtx_high_f32_f64(a: float32x2_t, b: float64x2_t) -> float32x4_t { + unsafe { simd_shuffle!(a, vcvtx_f32_f64(b), [0, 1, 2, 3]) } +} +#[doc = "Floating-point convert to lower precision narrow, rounding to odd"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtxd_f32_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcvtxn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtxd_f32_f64(a: f64) -> f32 { + unsafe { simd_extract!(vcvtx_f32_f64(vdupq_n_f64(a)), 0) } +} +#[doc = "Divide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdiv_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fdiv))] +pub fn vdiv_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe { simd_div(a, b) } +} +#[doc = "Divide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdivq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fdiv))] +pub fn vdivq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe { simd_div(a, b) } +} +#[doc = "Divide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdiv_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fdiv))] +pub fn vdiv_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe { simd_div(a, b) } +} +#[doc = "Divide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdivq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fdiv))] +pub fn vdivq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe { simd_div(a, b) } +} +#[doc = "Divide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdiv_f64)"] +#[inline] +#[target_feature(enable = 
"neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fdiv))] +pub fn vdiv_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { + unsafe { simd_div(a, b) } +} +#[doc = "Divide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdivq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fdiv))] +pub fn vdivq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe { simd_div(a, b) } +} +#[doc = "Divide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdivh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vdivh_f16(a: f16, b: f16) -> f16 { + a / b +} +#[doc = "Dot product arithmetic (indexed)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(test, assert_instr(sdot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_dotprod", issue = "117224")] +pub fn vdot_laneq_s32(a: int32x2_t, b: int8x8_t, c: int8x16_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: int32x4_t = transmute(c); + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vdot_s32(a, b, transmute(c)) + } +} +#[doc = "Dot product arithmetic (indexed)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(test, assert_instr(sdot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_dotprod", issue = "117224")] +pub fn vdotq_laneq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: int32x4_t = transmute(c); + let c: int32x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vdotq_s32(a, b, transmute(c)) + } +} +#[doc = "Dot product arithmetic (indexed)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(test, assert_instr(udot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_dotprod", issue = "117224")] +pub fn vdot_laneq_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x16_t) -> uint32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: uint32x4_t = transmute(c); + let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vdot_u32(a, b, transmute(c)) + } +} +#[doc = "Dot product arithmetic (indexed)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(test, assert_instr(udot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_dotprod", issue = "117224")] +pub fn vdotq_laneq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: uint32x4_t = transmute(c); + let c: uint32x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vdotq_u32(a, b, transmute(c)) + } +} +#[doc = "Set 
+#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdup_lane_f64<const N: i32>(a: float64x1_t) -> float64x1_t { + static_assert!(N == 0); + a +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdup_lane_p64<const N: i32>(a: poly64x1_t) -> poly64x1_t { + static_assert!(N == 0); + a +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdup_laneq_f64<const N: i32>(a: float64x2_t) -> float64x1_t { + static_assert_uimm_bits!(N, 1); + unsafe { transmute::<f64, float64x1_t>(simd_extract!(a, N as u32)) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdup_laneq_p64<const N: i32>(a: poly64x2_t) -> poly64x1_t { + static_assert_uimm_bits!(N, 1); + unsafe { transmute::<u64, poly64x1_t>(simd_extract!(a, N as u32)) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupb_lane_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 4))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupb_lane_s8<const N: i32>(a: int8x8_t) -> i8 { + static_assert_uimm_bits!(N, 3); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vduph_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 4))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vduph_laneq_s16<const N: i32>(a: int16x8_t) -> i16 { + static_assert_uimm_bits!(N, 3); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupb_lane_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 4))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupb_lane_u8<const N: i32>(a: uint8x8_t) -> u8 { + static_assert_uimm_bits!(N, 3); + unsafe { simd_extract!(a, N as u32) } +}
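Aside (illustration only, not part of the patch): the scalar `vdup*_lane`/`vdup*_laneq` forms in this stretch read one lane at a compile-time index, passed through the `<const N: i32>` parameter restored in this patch text and range-checked by `static_assert_uimm_bits!`. A minimal sketch, assuming an aarch64 target with `neon` enabled by default; the helper name `third_byte`, the input array, and the lane index 2 are illustrative assumptions.

    use std::arch::aarch64::{uint8x8_t, vdupb_lane_u8, vld1_u8};

    fn third_byte(bytes: &[u8; 8]) -> u8 {
        // Load eight bytes into a NEON register (unsafe: raw-pointer load).
        let v: uint8x8_t = unsafe { vld1_u8(bytes.as_ptr()) };
        // The lane index is a const generic, checked at compile time (0..=7).
        vdupb_lane_u8::<2>(v)
    }
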
+#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vduph_laneq_u16(a: uint16x8_t) -> u16 { + static_assert_uimm_bits!(N, 3); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupb_lane_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 4))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupb_lane_p8(a: poly8x8_t) -> p8 { + static_assert_uimm_bits!(N, 3); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vduph_laneq_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 4))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vduph_laneq_p16(a: poly16x8_t) -> p16 { + static_assert_uimm_bits!(N, 3); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Extract an element from a vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupb_laneq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupb_laneq_s8(a: int8x16_t) -> i8 { + static_assert_uimm_bits!(N, 4); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Extract an element from a vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupb_laneq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupb_laneq_u8(a: uint8x16_t) -> u8 { + static_assert_uimm_bits!(N, 4); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Extract an element from a vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupb_laneq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupb_laneq_p8(a: poly8x16_t) -> p8 { + static_assert_uimm_bits!(N, 4); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupd_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupd_lane_f64(a: float64x1_t) -> f64 { + static_assert!(N == 0); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupd_lane_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupd_lane_s64(a: int64x1_t) -> i64 { + static_assert!(N == 0); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes 
to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupd_lane_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupd_lane_u64(a: uint64x1_t) -> u64 { + static_assert!(N == 0); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vduph_lane_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(nop, N = 2))] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vduph_lane_f16(a: float16x4_t) -> f16 { + static_assert_uimm_bits!(N, 2); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Extract an element from a vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vduph_laneq_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(nop, N = 4))] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vduph_laneq_f16(a: float16x8_t) -> f16 { + static_assert_uimm_bits!(N, 4); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(dup, N = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupq_lane_f64(a: float64x1_t) -> float64x2_t { + static_assert!(N == 0); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(dup, N = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupq_lane_p64(a: poly64x1_t) -> poly64x2_t { + static_assert!(N == 0); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(dup, N = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupq_laneq_f64(a: float64x2_t) -> float64x2_t { + static_assert_uimm_bits!(N, 1); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(dup, N = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupq_laneq_p64(a: poly64x2_t) -> poly64x2_t { + static_assert_uimm_bits!(N, 1); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdups_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdups_lane_f32(a: float32x2_t) -> f32 { + static_assert_uimm_bits!(N, 1); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupd_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupd_laneq_f64(a: float64x2_t) -> f64 { + static_assert_uimm_bits!(N, 1); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdups_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdups_lane_s32(a: int32x2_t) -> i32 { + static_assert_uimm_bits!(N, 1); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupd_laneq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupd_laneq_s64(a: int64x2_t) -> i64 { + static_assert_uimm_bits!(N, 1); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdups_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdups_lane_u32(a: uint32x2_t) -> u32 { + static_assert_uimm_bits!(N, 1); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupd_laneq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupd_laneq_u64(a: uint64x2_t) -> u64 { + static_assert_uimm_bits!(N, 1); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdups_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdups_laneq_f32(a: float32x4_t) -> f32 { + static_assert_uimm_bits!(N, 2); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vduph_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 2))] 
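/* Illustrative note, not generated code: `#[rustc_legacy_const_generics(1)]`
   keeps the C-style call shape working by rewriting `vdups_lane_f32(v, 1)`
   into `vdups_lane_f32::<1>(v)`, and `static_assert_uimm_bits!(N, 1)` bounds
   the index to the lane count at compile time. A minimal sketch:

       #[cfg(target_arch = "aarch64")]
       fn upper_lane(v: std::arch::aarch64::float32x2_t) -> f32 {
           unsafe { std::arch::aarch64::vdups_lane_f32::<1>(v) } // `::<2>` would not compile
       }
*/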
+#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vduph_lane_s16(a: int16x4_t) -> i16 { + static_assert_uimm_bits!(N, 2); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdups_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdups_laneq_s32(a: int32x4_t) -> i32 { + static_assert_uimm_bits!(N, 2); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vduph_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vduph_lane_u16(a: uint16x4_t) -> u16 { + static_assert_uimm_bits!(N, 2); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdups_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdups_laneq_u32(a: uint32x4_t) -> u32 { + static_assert_uimm_bits!(N, 2); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vduph_lane_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vduph_lane_p16(a: poly16x4_t) -> p16 { + static_assert_uimm_bits!(N, 2); + unsafe { simd_extract!(a, N as u32) } +} +#[doc = "Three-way exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor3q_s8)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +#[cfg_attr(test, assert_instr(eor3))] +pub fn veor3q_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.eor3s.v16i8" + )] + fn _veor3q_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t; + } + unsafe { _veor3q_s8(a, b, c) } +} +#[doc = "Three-way exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor3q_s16)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +#[cfg_attr(test, assert_instr(eor3))] +pub fn veor3q_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.eor3s.v8i16" + )] + fn _veor3q_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t; + } + unsafe { _veor3q_s16(a, b, c) } +} +#[doc = "Three-way exclusive OR"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor3q_s32)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +#[cfg_attr(test, assert_instr(eor3))] +pub fn veor3q_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.eor3s.v4i32" + )] + fn _veor3q_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t; + } + unsafe { _veor3q_s32(a, b, c) } +} +#[doc = "Three-way exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor3q_s64)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +#[cfg_attr(test, assert_instr(eor3))] +pub fn veor3q_s64(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.eor3s.v2i64" + )] + fn _veor3q_s64(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t; + } + unsafe { _veor3q_s64(a, b, c) } +} +#[doc = "Three-way exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor3q_u8)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +#[cfg_attr(test, assert_instr(eor3))] +pub fn veor3q_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.eor3u.v16i8" + )] + fn _veor3q_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t; + } + unsafe { _veor3q_u8(a, b, c) } +} +#[doc = "Three-way exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor3q_u16)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +#[cfg_attr(test, assert_instr(eor3))] +pub fn veor3q_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.eor3u.v8i16" + )] + fn _veor3q_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t; + } + unsafe { _veor3q_u16(a, b, c) } +} +#[doc = "Three-way exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor3q_u32)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +#[cfg_attr(test, assert_instr(eor3))] +pub fn veor3q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.eor3u.v4i32" + )] + fn _veor3q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t; + } + unsafe { _veor3q_u32(a, b, c) } +} +#[doc = "Three-way exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor3q_u64)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +#[cfg_attr(test, assert_instr(eor3))] +pub fn veor3q_u64(a: uint64x2_t, b: uint64x2_t, c: 
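/* Illustrative sketch, not part of the patch: the `veor3q_*` family maps to
   the SHA3 `EOR3` instruction and computes `a ^ b ^ c` in one operation; it is
   gated on the `sha3` target feature, so a caller would normally detect that
   first:

       #[cfg(target_arch = "aarch64")]
       fn xor3(a: std::arch::aarch64::uint8x16_t,
               b: std::arch::aarch64::uint8x16_t,
               c: std::arch::aarch64::uint8x16_t) -> std::arch::aarch64::uint8x16_t {
           assert!(std::arch::is_aarch64_feature_detected!("sha3"));
           unsafe { std::arch::aarch64::veor3q_u8(a, b, c) }
       }
*/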
uint64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.eor3u.v2i64" + )] + fn _veor3q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t; + } + unsafe { _veor3q_u64(a, b, c) } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ext, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vextq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + static_assert_uimm_bits!(N, 1); + unsafe { + match N & 0b1 { + 0 => simd_shuffle!(a, b, [0, 1]), + 1 => simd_shuffle!(a, b, [1, 2]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ext, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vextq_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + static_assert_uimm_bits!(N, 1); + unsafe { + match N & 0b1 { + 0 => simd_shuffle!(a, b, [0, 1]), + 1 => simd_shuffle!(a, b, [1, 2]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Floating-point fused Multiply-Add to accumulator(vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmadd))] +pub fn vfma_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t { + unsafe { simd_fma(b, c, a) } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_lane_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfma_lane_f16( + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, +) -> float16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfma_f16(a, b, vdup_n_f16(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_laneq_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfma_laneq_f16( + a: float16x4_t, + b: float16x4_t, + c: float16x8_t, +) -> float16x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { vfma_f16(a, b, vdup_n_f16(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_lane_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmaq_lane_f16( + a: float16x8_t, + b: float16x8_t, + c: float16x4_t, +) -> 
float16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfmaq_f16(a, b, vdupq_n_f16(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_laneq_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmaq_laneq_f16( + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, +) -> float16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { vfmaq_f16(a, b, vdupq_n_f16(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfma_lane_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vfma_f32(a, b, vdup_n_f32(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfma_laneq_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x4_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfma_f32(a, b, vdup_n_f32(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmaq_lane_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x2_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vfmaq_f32(a, b, vdupq_n_f32(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmaq_laneq_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfmaq_f32(a, b, vdupq_n_f32(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmaq_laneq_f64( + a: float64x2_t, + b: float64x2_t, + c: float64x2_t, +) -> float64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { 
vfmaq_f64(a, b, vdupq_n_f64(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmadd, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfma_lane_f64( + a: float64x1_t, + b: float64x1_t, + c: float64x1_t, +) -> float64x1_t { + static_assert!(LANE == 0); + unsafe { vfma_f64(a, b, vdup_n_f64(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmadd, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfma_laneq_f64( + a: float64x1_t, + b: float64x1_t, + c: float64x2_t, +) -> float64x1_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vfma_f64(a, b, vdup_n_f64(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Subtract from accumulator."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_n_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmla))] +pub fn vfma_n_f16(a: float16x4_t, b: float16x4_t, c: f16) -> float16x4_t { + vfma_f16(a, b, vdup_n_f16(c)) +} +#[doc = "Floating-point fused Multiply-Subtract from accumulator."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_n_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmla))] +pub fn vfmaq_n_f16(a: float16x8_t, b: float16x8_t, c: f16) -> float16x8_t { + vfmaq_f16(a, b, vdupq_n_f16(c)) +} +#[doc = "Floating-point fused Multiply-Add to accumulator(vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_n_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmadd))] +pub fn vfma_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t { + vfma_f64(a, b, vdup_n_f64(c)) +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmad_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmadd, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmad_lane_f64(a: f64, b: f64, c: float64x1_t) -> f64 { + static_assert!(LANE == 0); + unsafe { + let c: f64 = simd_extract!(c, LANE as u32); + fmaf64(b, c, a) + } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmah_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmadd))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmah_f16(a: f16, b: f16, c: f16) -> f16 { + unsafe { fmaf16(b, c, a) } +} +#[doc = "Floating-point fused multiply-add to 
accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmah_lane_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmadd, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmah_lane_f16(a: f16, b: f16, v: float16x4_t) -> f16 { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: f16 = simd_extract!(v, LANE as u32); + vfmah_f16(a, b, c) + } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmah_laneq_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmadd, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmah_laneq_f16(a: f16, b: f16, v: float16x8_t) -> f16 { + static_assert_uimm_bits!(LANE, 3); + unsafe { + let c: f16 = simd_extract!(v, LANE as u32); + vfmah_f16(a, b, c) + } +} +#[doc = "Floating-point fused Multiply-Add to accumulator(vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmla))] +pub fn vfmaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t { + unsafe { simd_fma(b, c, a) } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmla, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmaq_lane_f64( + a: float64x2_t, + b: float64x2_t, + c: float64x1_t, +) -> float64x2_t { + static_assert!(LANE == 0); + unsafe { vfmaq_f64(a, b, vdupq_n_f64(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Add to accumulator(vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_n_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmla))] +pub fn vfmaq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t { + vfmaq_f64(a, b, vdupq_n_f64(c)) +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmas_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmadd, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmas_lane_f32(a: f32, b: f32, c: float32x2_t) -> f32 { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: f32 = simd_extract!(c, LANE as u32); + fmaf32(b, c, a) + } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmas_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmadd, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmas_laneq_f32(a: f32, b: f32, c: float32x4_t) -> f32 { + 
static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: f32 = simd_extract!(c, LANE as u32); + fmaf32(b, c, a) + } +} +#[doc = "Floating-point fused multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmad_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmadd, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmad_laneq_f64(a: f64, b: f64, c: float64x2_t) -> f64 { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: f64 = simd_extract!(c, LANE as u32); + fmaf64(b, c, a) + } +} +#[doc = "Floating-point fused Multiply-Add Long to accumulator (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlal_high_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmlal2))] +pub fn vfmlal_high_f16(r: float32x2_t, a: float16x4_t, b: float16x4_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmlal2.v2f32.v4f16" + )] + fn _vfmlal_high_f16(r: float32x2_t, a: float16x4_t, b: float16x4_t) -> float32x2_t; + } + unsafe { _vfmlal_high_f16(r, a, b) } +} +#[doc = "Floating-point fused Multiply-Add Long to accumulator (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlalq_high_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmlal2))] +pub fn vfmlalq_high_f16(r: float32x4_t, a: float16x8_t, b: float16x8_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmlal2.v4f32.v8f16" + )] + fn _vfmlalq_high_f16(r: float32x4_t, a: float16x8_t, b: float16x8_t) -> float32x4_t; + } + unsafe { _vfmlalq_high_f16(r, a, b) } +} +#[doc = "Floating-point fused Multiply-Add Long to accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlal_lane_high_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlal2, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlal_lane_high_f16( + r: float32x2_t, + a: float16x4_t, + b: float16x4_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfmlal_high_f16(r, a, vdup_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Add Long to accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlal_laneq_high_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlal2, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlal_laneq_high_f16( + r: float32x2_t, + a: float16x4_t, + b: float16x8_t, +) -> float32x2_t { + 
static_assert_uimm_bits!(LANE, 3); + unsafe { vfmlal_high_f16(r, a, vdup_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Add Long to accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlalq_lane_high_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlal2, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlalq_lane_high_f16( + r: float32x4_t, + a: float16x8_t, + b: float16x4_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfmlalq_high_f16(r, a, vdupq_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Add Long to accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlalq_laneq_high_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlal2, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlalq_laneq_high_f16( + r: float32x4_t, + a: float16x8_t, + b: float16x8_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { vfmlalq_high_f16(r, a, vdupq_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Add Long to accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlal_lane_low_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlal, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlal_lane_low_f16( + r: float32x2_t, + a: float16x4_t, + b: float16x4_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfmlal_low_f16(r, a, vdup_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Add Long to accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlal_laneq_low_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlal, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlal_laneq_low_f16( + r: float32x2_t, + a: float16x4_t, + b: float16x8_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { vfmlal_low_f16(r, a, vdup_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Add Long to accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlalq_lane_low_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlal, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlalq_lane_low_f16( + r: float32x4_t, + a: float16x8_t, + b: float16x4_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfmlalq_low_f16(r, a, 
vdupq_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Add Long to accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlalq_laneq_low_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlal, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlalq_laneq_low_f16( + r: float32x4_t, + a: float16x8_t, + b: float16x8_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { vfmlalq_low_f16(r, a, vdupq_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Add Long to accumulator (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlal_low_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmlal))] +pub fn vfmlal_low_f16(r: float32x2_t, a: float16x4_t, b: float16x4_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmlal.v2f32.v4f16" + )] + fn _vfmlal_low_f16(r: float32x2_t, a: float16x4_t, b: float16x4_t) -> float32x2_t; + } + unsafe { _vfmlal_low_f16(r, a, b) } +} +#[doc = "Floating-point fused Multiply-Add Long to accumulator (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlalq_low_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmlal))] +pub fn vfmlalq_low_f16(r: float32x4_t, a: float16x8_t, b: float16x8_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmlal.v4f32.v8f16" + )] + fn _vfmlalq_low_f16(r: float32x4_t, a: float16x8_t, b: float16x8_t) -> float32x4_t; + } + unsafe { _vfmlalq_low_f16(r, a, b) } +} +#[doc = "Floating-point fused Multiply-Subtract Long from accumulator (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlsl_high_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmlsl2))] +pub fn vfmlsl_high_f16(r: float32x2_t, a: float16x4_t, b: float16x4_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmlsl2.v2f32.v4f16" + )] + fn _vfmlsl_high_f16(r: float32x2_t, a: float16x4_t, b: float16x4_t) -> float32x2_t; + } + unsafe { _vfmlsl_high_f16(r, a, b) } +} +#[doc = "Floating-point fused Multiply-Subtract Long from accumulator (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlslq_high_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, 
assert_instr(fmlsl2))] +pub fn vfmlslq_high_f16(r: float32x4_t, a: float16x8_t, b: float16x8_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmlsl2.v4f32.v8f16" + )] + fn _vfmlslq_high_f16(r: float32x4_t, a: float16x8_t, b: float16x8_t) -> float32x4_t; + } + unsafe { _vfmlslq_high_f16(r, a, b) } +} +#[doc = "Floating-point fused Multiply-Subtract Long from accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlsl_lane_high_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlsl2, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlsl_lane_high_f16( + r: float32x2_t, + a: float16x4_t, + b: float16x4_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfmlsl_high_f16(r, a, vdup_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Subtract Long from accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlsl_laneq_high_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlsl2, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlsl_laneq_high_f16( + r: float32x2_t, + a: float16x4_t, + b: float16x8_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { vfmlsl_high_f16(r, a, vdup_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Subtract Long from accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlslq_lane_high_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlsl2, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlslq_lane_high_f16( + r: float32x4_t, + a: float16x8_t, + b: float16x4_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfmlslq_high_f16(r, a, vdupq_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Subtract Long from accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlslq_laneq_high_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlsl2, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlslq_laneq_high_f16( + r: float32x4_t, + a: float16x8_t, + b: float16x8_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { vfmlslq_high_f16(r, a, vdupq_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Subtract Long from accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlsl_lane_low_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlsl, LANE = 0))] +#[target_feature(enable = "neon,fp16")] 
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlsl_lane_low_f16( + r: float32x2_t, + a: float16x4_t, + b: float16x4_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfmlsl_low_f16(r, a, vdup_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Subtract Long from accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlsl_laneq_low_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlsl, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlsl_laneq_low_f16( + r: float32x2_t, + a: float16x4_t, + b: float16x8_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { vfmlsl_low_f16(r, a, vdup_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Subtract Long from accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlslq_lane_low_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlsl, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlslq_lane_low_f16( + r: float32x4_t, + a: float16x8_t, + b: float16x4_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfmlslq_low_f16(r, a, vdupq_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Subtract Long from accumulator (by element)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlslq_laneq_low_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmlsl, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmlslq_laneq_low_f16( + r: float32x4_t, + a: float16x8_t, + b: float16x8_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { vfmlslq_low_f16(r, a, vdupq_n_f16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Subtract Long from accumulator (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlsl_low_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmlsl))] +pub fn vfmlsl_low_f16(r: float32x2_t, a: float16x4_t, b: float16x4_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmlsl.v2f32.v4f16" + )] + fn _vfmlsl_low_f16(r: float32x2_t, a: float16x4_t, b: float16x4_t) -> float32x2_t; + } + unsafe { _vfmlsl_low_f16(r, a, b) } +} +#[doc = "Floating-point fused Multiply-Subtract Long from accumulator (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlslq_low_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] 
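/* Illustrative note, not part of the patch: `vfmlal*` / `vfmlsl*` are the
   widening half-precision multiply-accumulates (FMLAL/FMLSL): f16 products are
   widened to f32 and added to, or subtracted from, the f32 accumulator. The
   `_low_` forms use the lower half of the f16 inputs, the `_high_` forms the
   upper half, and the `_lane`/`_laneq` variants broadcast one lane of `b`.
   They sit behind the unstable `stdarch_neon_f16` feature plus the `fp16` and
   `fhm` target features, so a rough nightly-only usage would be:

       // #![feature(stdarch_neon_f16)]
       // acc = vfmlal_low_f16(acc, a, b);             // acc[i] += a[i] * b[i]
       // acc = vfmlsl_lane_high_f16::<0>(acc, a, b);  // acc[i] -= a[half + i] * b[0]
*/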
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmlsl))] +pub fn vfmlslq_low_f16(r: float32x4_t, a: float16x8_t, b: float16x8_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmlsl.v4f32.v8f16" + )] + fn _vfmlslq_low_f16(r: float32x4_t, a: float16x8_t, b: float16x8_t) -> float32x4_t; + } + unsafe { _vfmlslq_low_f16(r, a, b) } +} +#[doc = "Floating-point fused multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmsub))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfms_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t { + unsafe { + let b: float64x1_t = simd_neg(b); + vfma_f64(a, b, c) + } +} +#[doc = "Floating-point fused multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_lane_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmls, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfms_lane_f16( + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, +) -> float16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfms_f16(a, b, vdup_n_f16(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_laneq_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmls, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfms_laneq_f16( + a: float16x4_t, + b: float16x4_t, + c: float16x8_t, +) -> float16x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { vfms_f16(a, b, vdup_n_f16(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_lane_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmls, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmsq_lane_f16( + a: float16x8_t, + b: float16x8_t, + c: float16x4_t, +) -> float16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfmsq_f16(a, b, vdupq_n_f16(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_laneq_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmls, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmsq_laneq_f16( + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, +) -> float16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { vfmsq_f16(a, b, vdupq_n_f16(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-subtract to accumulator"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmls, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfms_lane_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vfms_f32(a, b, vdup_n_f32(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-subtract to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmls, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfms_laneq_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x4_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfms_f32(a, b, vdup_n_f32(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-subtract to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmls, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmsq_lane_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x2_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vfmsq_f32(a, b, vdupq_n_f32(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-subtract to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmls, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmsq_laneq_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vfmsq_f32(a, b, vdupq_n_f32(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-subtract to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmls, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmsq_laneq_f64( + a: float64x2_t, + b: float64x2_t, + c: float64x2_t, +) -> float64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vfmsq_f64(a, b, vdupq_n_f64(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-subtract to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmsub, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfms_lane_f64( + a: float64x1_t, + b: float64x1_t, + c: float64x1_t, +) -> float64x1_t { + static_assert!(LANE == 0); + unsafe { vfms_f64(a, b, vdup_n_f64(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused multiply-subtract to accumulator"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmsub, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfms_laneq_f64( + a: float64x1_t, + b: float64x1_t, + c: float64x2_t, +) -> float64x1_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vfms_f64(a, b, vdup_n_f64(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-Subtract from accumulator."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_n_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmls))] +pub fn vfms_n_f16(a: float16x4_t, b: float16x4_t, c: f16) -> float16x4_t { + vfms_f16(a, b, vdup_n_f16(c)) +} +#[doc = "Floating-point fused Multiply-Subtract from accumulator."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_n_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmls))] +pub fn vfmsq_n_f16(a: float16x8_t, b: float16x8_t, c: f16) -> float16x8_t { + vfmsq_f16(a, b, vdupq_n_f16(c)) +} +#[doc = "Floating-point fused Multiply-subtract to accumulator(vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_n_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmsub))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfms_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t { + vfms_f64(a, b, vdup_n_f64(c)) +} +#[doc = "Floating-point fused multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsh_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmsub))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmsh_f16(a: f16, b: f16, c: f16) -> f16 { + vfmah_f16(a, -b, c) +} +#[doc = "Floating-point fused multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsh_lane_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmsub, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmsh_lane_f16(a: f16, b: f16, v: float16x4_t) -> f16 { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: f16 = simd_extract!(v, LANE as u32); + vfmsh_f16(a, b, c) + } +} +#[doc = "Floating-point fused multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsh_laneq_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmsub, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmsh_laneq_f16(a: f16, b: f16, v: float16x8_t) -> f16 { + static_assert_uimm_bits!(LANE, 3); + unsafe { + let c: f16 = simd_extract!(v, LANE as u32); + vfmsh_f16(a, b, c) + } +} +#[doc = "Floating-point fused multiply-subtract from accumulator"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmls))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmsq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t { + unsafe { + let b: float64x2_t = simd_neg(b); + vfmaq_f64(a, b, c) + } +} +#[doc = "Floating-point fused multiply-subtract to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmls, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmsq_lane_f64<const LANE: i32>( + a: float64x2_t, + b: float64x2_t, + c: float64x1_t, +) -> float64x2_t { + static_assert!(LANE == 0); + unsafe { vfmsq_f64(a, b, vdupq_n_f64(simd_extract!(c, LANE as u32))) } +} +#[doc = "Floating-point fused Multiply-subtract to accumulator(vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_n_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmls))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmsq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t { + vfmsq_f64(a, b, vdupq_n_f64(c)) +} +#[doc = "Floating-point fused multiply-subtract to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmss_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmsub, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmss_lane_f32<const LANE: i32>(a: f32, b: f32, c: float32x2_t) -> f32 { + vfmas_lane_f32::<LANE>(a, -b, c) +} +#[doc = "Floating-point fused multiply-subtract to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmss_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmsub, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmss_laneq_f32<const LANE: i32>(a: f32, b: f32, c: float32x4_t) -> f32 { + vfmas_laneq_f32::<LANE>(a, -b, c) +} +#[doc = "Floating-point fused multiply-subtract to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsd_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmsub, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmsd_lane_f64<const LANE: i32>(a: f64, b: f64, c: float64x1_t) -> f64 { + vfmad_lane_f64::<LANE>(a, -b, c) +} +#[doc = "Floating-point fused multiply-subtract to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsd_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmsub, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vfmsd_laneq_f64<const LANE: i32>(a: f64, b: f64, c: float64x2_t) -> f64 { + vfmad_laneq_f64::<LANE>(a, -b, c) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon
instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(test, assert_instr(ldr))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld1_f16(ptr: *const f16) -> float16x4_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(test, assert_instr(ldr))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld1q_f16(ptr: *const f16) -> float16x8_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1_f32(ptr: *const f32) -> float32x2_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1q_f32(ptr: *const f32) -> float32x4_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1_f64(ptr: *const f64) -> float64x1_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1q_f64(ptr: *const f64) -> float64x2_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1_s8(ptr: *const i8) -> int8x8_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s8)"] +#[doc = "## Safety"] +#[doc 
= " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1q_s8(ptr: *const i8) -> int8x16_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1_s16(ptr: *const i16) -> int16x4_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1q_s16(ptr: *const i16) -> int16x8_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1_s32(ptr: *const i32) -> int32x2_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1q_s32(ptr: *const i32) -> int32x4_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1_s64(ptr: *const i64) -> int64x1_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1q_s64(ptr: *const i64) -> int64x2_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon 
instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1_u8(ptr: *const u8) -> uint8x8_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1q_u8(ptr: *const u8) -> uint8x16_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1_u16(ptr: *const u16) -> uint16x4_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1q_u16(ptr: *const u16) -> uint16x8_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1_u32(ptr: *const u32) -> uint32x2_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1q_u32(ptr: *const u32) -> uint32x4_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1_u64(ptr: *const u64) -> uint64x1_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic 
unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1q_u64(ptr: *const u64) -> uint64x2_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1_p8(ptr: *const p8) -> poly8x8_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1q_p8(ptr: *const p8) -> poly8x16_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1_p16(ptr: *const p16) -> poly16x4_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1q_p16(ptr: *const p16) -> poly16x8_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1_p64(ptr: *const p64) -> poly64x1_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1q_p64(ptr: *const p64) -> poly64x2_t { + crate::ptr::read_unaligned(ptr.cast()) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic 
unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld1))] +pub unsafe fn vld1_f64_x2(a: *const f64) -> float64x1x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x2.v1f64.p0" + )] + fn _vld1_f64_x2(a: *const f64) -> float64x1x2_t; + } + _vld1_f64_x2(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld1))] +pub unsafe fn vld1_f64_x3(a: *const f64) -> float64x1x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x3.v1f64.p0" + )] + fn _vld1_f64_x3(a: *const f64) -> float64x1x3_t; + } + _vld1_f64_x3(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld1))] +pub unsafe fn vld1_f64_x4(a: *const f64) -> float64x1x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x4.v1f64.p0" + )] + fn _vld1_f64_x4(a: *const f64) -> float64x1x4_t; + } + _vld1_f64_x4(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld1))] +pub unsafe fn vld1q_f64_x2(a: *const f64) -> float64x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x2.v2f64.p0" + )] + fn _vld1q_f64_x2(a: *const f64) -> float64x2x2_t; + } + _vld1q_f64_x2(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld1))] +pub unsafe fn vld1q_f64_x3(a: *const f64) -> float64x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x3.v2f64.p0" + )] + fn _vld1q_f64_x3(a: *const f64) -> float64x2x3_t; + } + _vld1q_f64_x3(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] 
+#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld1))] +pub unsafe fn vld1q_f64_x4(a: *const f64) -> float64x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x4.v2f64.p0" + )] + fn _vld1q_f64_x4(a: *const f64) -> float64x2x4_t; + } + _vld1q_f64_x4(a) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2_dup_f64(a: *const f64) -> float64x1x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2r.v1f64.p0" + )] + fn _vld2_dup_f64(ptr: *const f64) -> float64x1x2_t; + } + _vld2_dup_f64(a as _) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2q_dup_f64(a: *const f64) -> float64x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2r.v2f64.p0" + )] + fn _vld2q_dup_f64(ptr: *const f64) -> float64x2x2_t; + } + _vld2q_dup_f64(a as _) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2q_dup_s64(a: *const i64) -> int64x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2r.v2i64.p0" + )] + fn _vld2q_dup_s64(ptr: *const i64) -> int64x2x2_t; + } + _vld2q_dup_s64(a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vld2_f64(a: *const f64) -> float64x1x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2.v1f64.p0" + )] + fn _vld2_f64(ptr: *const float64x1_t) -> float64x1x2_t; + } + _vld2_f64(a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, 
assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2_lane_f64(a: *const f64, b: float64x1x2_t) -> float64x1x2_t { + static_assert!(LANE == 0); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2lane.v1f64.p0" + )] + fn _vld2_lane_f64(a: float64x1_t, b: float64x1_t, n: i64, ptr: *const i8) -> float64x1x2_t; + } + _vld2_lane_f64(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2_lane_s64(a: *const i64, b: int64x1x2_t) -> int64x1x2_t { + static_assert!(LANE == 0); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2lane.v1i64.p0" + )] + fn _vld2_lane_s64(a: int64x1_t, b: int64x1_t, n: i64, ptr: *const i8) -> int64x1x2_t; + } + _vld2_lane_s64(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2_lane_p64(a: *const p64, b: poly64x1x2_t) -> poly64x1x2_t { + static_assert!(LANE == 0); + transmute(vld2_lane_s64::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2_lane_u64(a: *const u64, b: uint64x1x2_t) -> uint64x1x2_t { + static_assert!(LANE == 0); + transmute(vld2_lane_s64::(transmute(a), transmute(b))) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2q_dup_p64(a: *const p64) -> poly64x2x2_t { + transmute(vld2q_dup_s64(transmute(a))) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, 
assert_instr(ld2r))] +pub unsafe fn vld2q_dup_p64(a: *const p64) -> poly64x2x2_t { + let mut ret_val: poly64x2x2_t = transmute(vld2q_dup_s64(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2q_dup_u64(a: *const u64) -> uint64x2x2_t { + transmute(vld2q_dup_s64(transmute(a))) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2q_dup_u64(a: *const u64) -> uint64x2x2_t { + let mut ret_val: uint64x2x2_t = transmute(vld2q_dup_s64(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2q_f64(a: *const f64) -> float64x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2.v2f64.p0" + )] + fn _vld2q_f64(ptr: *const float64x2_t) -> float64x2x2_t; + } + _vld2q_f64(a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2q_s64(a: *const i64) -> int64x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2.v2i64.p0" + )] + fn _vld2q_s64(ptr: *const int64x2_t) -> int64x2x2_t; + } + _vld2q_s64(a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2q_lane_f64(a: *const f64, b: float64x2x2_t) -> float64x2x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch 
= "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2lane.v2f64.p0" + )] + fn _vld2q_lane_f64(a: float64x2_t, b: float64x2_t, n: i64, ptr: *const i8) + -> float64x2x2_t; + } + _vld2q_lane_f64(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2q_lane_s8(a: *const i8, b: int8x16x2_t) -> int8x16x2_t { + static_assert_uimm_bits!(LANE, 4); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2lane.v16i8.p0" + )] + fn _vld2q_lane_s8(a: int8x16_t, b: int8x16_t, n: i64, ptr: *const i8) -> int8x16x2_t; + } + _vld2q_lane_s8(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2q_lane_s64(a: *const i64, b: int64x2x2_t) -> int64x2x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2lane.v2i64.p0" + )] + fn _vld2q_lane_s64(a: int64x2_t, b: int64x2_t, n: i64, ptr: *const i8) -> int64x2x2_t; + } + _vld2q_lane_s64(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2q_lane_p64(a: *const p64, b: poly64x2x2_t) -> poly64x2x2_t { + static_assert_uimm_bits!(LANE, 1); + transmute(vld2q_lane_s64::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2q_lane_u8(a: *const u8, b: uint8x16x2_t) -> uint8x16x2_t { + static_assert_uimm_bits!(LANE, 4); + transmute(vld2q_lane_s8::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = 
"neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2q_lane_u64(a: *const u64, b: uint64x2x2_t) -> uint64x2x2_t { + static_assert_uimm_bits!(LANE, 1); + transmute(vld2q_lane_s64::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2q_lane_p8(a: *const p8, b: poly8x16x2_t) -> poly8x16x2_t { + static_assert_uimm_bits!(LANE, 4); + transmute(vld2q_lane_s8::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2q_p64(a: *const p64) -> poly64x2x2_t { + transmute(vld2q_s64(transmute(a))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2q_p64(a: *const p64) -> poly64x2x2_t { + let mut ret_val: poly64x2x2_t = transmute(vld2q_s64(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2q_u64(a: *const u64) -> uint64x2x2_t { + transmute(vld2q_s64(transmute(a))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2q_u64(a: *const u64) -> uint64x2x2_t { + let mut ret_val: uint64x2x2_t = transmute(vld2q_s64(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] 
+#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3_dup_f64(a: *const f64) -> float64x1x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3r.v1f64.p0" + )] + fn _vld3_dup_f64(ptr: *const f64) -> float64x1x3_t; + } + _vld3_dup_f64(a as _) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3q_dup_f64(a: *const f64) -> float64x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3r.v2f64.p0" + )] + fn _vld3q_dup_f64(ptr: *const f64) -> float64x2x3_t; + } + _vld3q_dup_f64(a as _) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3q_dup_s64(a: *const i64) -> int64x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3r.v2i64.p0" + )] + fn _vld3q_dup_s64(ptr: *const i64) -> int64x2x3_t; + } + _vld3q_dup_s64(a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vld3_f64(a: *const f64) -> float64x1x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3.v1f64.p0" + )] + fn _vld3_f64(ptr: *const float64x1_t) -> float64x1x3_t; + } + _vld3_f64(a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3_lane_f64(a: *const f64, b: float64x1x3_t) -> float64x1x3_t { + static_assert!(LANE == 0); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3lane.v1f64.p0" + )] + fn _vld3_lane_f64( + a: float64x1_t, + b: float64x1_t, + c: float64x1_t, + n: i64, + ptr: *const i8, + ) -> float64x1x3_t; + } + _vld3_lane_f64(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3_lane_p64(a: *const p64, b: poly64x1x3_t) -> poly64x1x3_t { + static_assert!(LANE == 0); + transmute(vld3_lane_s64::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3_lane_s64(a: *const i64, b: int64x1x3_t) -> int64x1x3_t { + static_assert!(LANE == 0); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3lane.v1i64.p0" + )] + fn _vld3_lane_s64( + a: int64x1_t, + b: int64x1_t, + c: int64x1_t, + n: i64, + ptr: *const i8, + ) -> int64x1x3_t; + } + _vld3_lane_s64(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3_lane_u64(a: *const u64, b: uint64x1x3_t) -> uint64x1x3_t { + static_assert!(LANE == 0); + transmute(vld3_lane_s64::(transmute(a), transmute(b))) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3q_dup_p64(a: *const p64) -> poly64x2x3_t { + transmute(vld3q_dup_s64(transmute(a))) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3q_dup_p64(a: *const p64) -> poly64x2x3_t { + let mut ret_val: poly64x2x3_t = transmute(vld3q_dup_s64(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3q_dup_u64(a: *const u64) -> uint64x2x3_t { + transmute(vld3q_dup_s64(transmute(a))) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3q_dup_u64(a: *const u64) -> uint64x2x3_t { + let mut ret_val: uint64x2x3_t = transmute(vld3q_dup_s64(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3q_f64(a: *const f64) -> float64x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3.v2f64.p0" + )] + fn _vld3q_f64(ptr: *const float64x2_t) -> float64x2x3_t; + } + _vld3q_f64(a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3q_s64(a: *const i64) -> int64x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3.v2i64.p0" + )] + fn _vld3q_s64(ptr: *const int64x2_t) -> int64x2x3_t; + } + _vld3q_s64(a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3q_lane_f64(a: *const f64, b: float64x2x3_t) -> float64x2x3_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3lane.v2f64.p0" + )] + fn _vld3q_lane_f64( + a: float64x2_t, + b: float64x2_t, + c: float64x2_t, + n: i64, + ptr: *const i8, + ) -> float64x2x3_t; + } + _vld3q_lane_f64(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Load multiple 3-element structures to three 
registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3q_lane_p64(a: *const p64, b: poly64x2x3_t) -> poly64x2x3_t { + static_assert_uimm_bits!(LANE, 1); + transmute(vld3q_lane_s64::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3q_lane_s8(a: *const i8, b: int8x16x3_t) -> int8x16x3_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3lane.v16i8.p0" + )] + fn _vld3q_lane_s8( + a: int8x16_t, + b: int8x16_t, + c: int8x16_t, + n: i64, + ptr: *const i8, + ) -> int8x16x3_t; + } + _vld3q_lane_s8(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3q_lane_s64(a: *const i64, b: int64x2x3_t) -> int64x2x3_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3lane.v2i64.p0" + )] + fn _vld3q_lane_s64( + a: int64x2_t, + b: int64x2_t, + c: int64x2_t, + n: i64, + ptr: *const i8, + ) -> int64x2x3_t; + } + _vld3q_lane_s64(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3q_lane_u8(a: *const u8, b: uint8x16x3_t) -> uint8x16x3_t { + static_assert_uimm_bits!(LANE, 4); + transmute(vld3q_lane_s8::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3q_lane_u64(a: *const u64, b: uint64x2x3_t) -> uint64x2x3_t { + static_assert_uimm_bits!(LANE, 1); + transmute(vld3q_lane_s64::(transmute(a), transmute(b))) +} +#[doc = "Load 
multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3q_lane_p8(a: *const p8, b: poly8x16x3_t) -> poly8x16x3_t { + static_assert_uimm_bits!(LANE, 4); + transmute(vld3q_lane_s8::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3q_p64(a: *const p64) -> poly64x2x3_t { + transmute(vld3q_s64(transmute(a))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3q_p64(a: *const p64) -> poly64x2x3_t { + let mut ret_val: poly64x2x3_t = transmute(vld3q_s64(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3q_u64(a: *const u64) -> uint64x2x3_t { + transmute(vld3q_s64(transmute(a))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3q_u64(a: *const u64) -> uint64x2x3_t { + let mut ret_val: uint64x2x3_t = transmute(vld3q_s64(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = 
"1.59.0")] +pub unsafe fn vld4_dup_f64(a: *const f64) -> float64x1x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4r.v1f64.p0" + )] + fn _vld4_dup_f64(ptr: *const f64) -> float64x1x4_t; + } + _vld4_dup_f64(a as _) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_dup_f64(a: *const f64) -> float64x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4r.v2f64.p0" + )] + fn _vld4q_dup_f64(ptr: *const f64) -> float64x2x4_t; + } + _vld4q_dup_f64(a as _) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_dup_s64(a: *const i64) -> int64x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4r.v2i64.p0" + )] + fn _vld4q_dup_s64(ptr: *const i64) -> int64x2x4_t; + } + _vld4q_dup_s64(a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vld4_f64(a: *const f64) -> float64x1x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4.v1f64.p0" + )] + fn _vld4_f64(ptr: *const float64x1_t) -> float64x1x4_t; + } + _vld4_f64(a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4_lane_f64(a: *const f64, b: float64x1x4_t) -> float64x1x4_t { + static_assert!(LANE == 0); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4lane.v1f64.p0" + )] + fn _vld4_lane_f64( + a: float64x1_t, + b: float64x1_t, + c: float64x1_t, + d: float64x1_t, + n: i64, + ptr: *const i8, + ) -> float64x1x4_t; + } + _vld4_lane_f64(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic 
unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4_lane_s64(a: *const i64, b: int64x1x4_t) -> int64x1x4_t { + static_assert!(LANE == 0); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4lane.v1i64.p0" + )] + fn _vld4_lane_s64( + a: int64x1_t, + b: int64x1_t, + c: int64x1_t, + d: int64x1_t, + n: i64, + ptr: *const i8, + ) -> int64x1x4_t; + } + _vld4_lane_s64(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4_lane_p64(a: *const p64, b: poly64x1x4_t) -> poly64x1x4_t { + static_assert!(LANE == 0); + transmute(vld4_lane_s64::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4_lane_u64(a: *const u64, b: uint64x1x4_t) -> uint64x1x4_t { + static_assert!(LANE == 0); + transmute(vld4_lane_s64::(transmute(a), transmute(b))) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_dup_p64(a: *const p64) -> poly64x2x4_t { + transmute(vld4q_dup_s64(transmute(a))) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_dup_p64(a: *const p64) -> poly64x2x4_t { + let mut ret_val: poly64x2x4_t = transmute(vld4q_dup_s64(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [1, 0]) }; + ret_val +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] 
+#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_dup_u64(a: *const u64) -> uint64x2x4_t { + transmute(vld4q_dup_s64(transmute(a))) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_dup_u64(a: *const u64) -> uint64x2x4_t { + let mut ret_val: uint64x2x4_t = transmute(vld4q_dup_s64(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [1, 0]) }; + ret_val +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4q_f64(a: *const f64) -> float64x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4.v2f64.p0" + )] + fn _vld4q_f64(ptr: *const float64x2_t) -> float64x2x4_t; + } + _vld4q_f64(a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4q_s64(a: *const i64) -> int64x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4.v2i64.p0" + )] + fn _vld4q_s64(ptr: *const int64x2_t) -> int64x2x4_t; + } + _vld4q_s64(a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_lane_f64(a: *const f64, b: float64x2x4_t) -> float64x2x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4lane.v2f64.p0" + )] + fn _vld4q_lane_f64( + a: float64x2_t, + b: float64x2_t, + c: float64x2_t, + d: float64x2_t, + n: i64, + ptr: *const i8, + ) -> float64x2x4_t; + } + _vld4q_lane_f64(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_lane_s8(a: *const i8, b: int8x16x4_t) -> int8x16x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4lane.v16i8.p0" + )] + fn _vld4q_lane_s8( + a: int8x16_t, + b: int8x16_t, + c: int8x16_t, + d: int8x16_t, + n: i64, + ptr: *const i8, + ) -> int8x16x4_t; + } + _vld4q_lane_s8(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_lane_s64(a: *const i64, b: int64x2x4_t) -> int64x2x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4lane.v2i64.p0" + )] + fn _vld4q_lane_s64( + a: int64x2_t, + b: int64x2_t, + c: int64x2_t, + d: int64x2_t, + n: i64, + ptr: *const i8, + ) -> int64x2x4_t; + } + _vld4q_lane_s64(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_lane_p64(a: *const p64, b: poly64x2x4_t) -> poly64x2x4_t { + static_assert_uimm_bits!(LANE, 1); + transmute(vld4q_lane_s64::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_lane_u8(a: *const u8, b: uint8x16x4_t) -> uint8x16x4_t { + static_assert_uimm_bits!(LANE, 4); + transmute(vld4q_lane_s8::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_lane_u64(a: *const u64, b: uint64x2x4_t) -> uint64x2x4_t { + static_assert_uimm_bits!(LANE, 1); + transmute(vld4q_lane_s64::(transmute(a), transmute(b))) +} +#[doc 
= "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_lane_p8(a: *const p8, b: poly8x16x4_t) -> poly8x16x4_t { + static_assert_uimm_bits!(LANE, 4); + transmute(vld4q_lane_s8::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4q_p64(a: *const p64) -> poly64x2x4_t { + transmute(vld4q_s64(transmute(a))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4q_p64(a: *const p64) -> poly64x2x4_t { + let mut ret_val: poly64x2x4_t = transmute(vld4q_s64(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [1, 0]) }; + ret_val +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4q_u64(a: *const u64) -> uint64x2x4_t { + transmute(vld4q_s64(transmute(a))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4q_u64(a: *const u64) -> uint64x2x4_t { + let mut ret_val: uint64x2x4_t = transmute(vld4q_s64(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [1, 0]) }; + ret_val +} +#[doc = "Lookup table read with 2-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti2_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] 
+#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 1))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti2_lane_s8(a: int8x8_t, b: uint8x8_t) -> int8x16_t { + static_assert!(LANE >= 0 && LANE <= 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vluti2.lane.v16i8.v8i8" + )] + fn _vluti2_lane_s8(a: int8x8_t, b: uint8x8_t, n: i32) -> int8x16_t; + } + _vluti2_lane_s8(a, b, LANE) +} +#[doc = "Lookup table read with 2-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti2q_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 1))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti2q_lane_s8(a: int8x16_t, b: uint8x8_t) -> int8x16_t { + static_assert!(LANE >= 0 && LANE <= 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vluti2.lane.v16i8.v16i8" + )] + fn _vluti2q_lane_s8(a: int8x16_t, b: uint8x8_t, n: i32) -> int8x16_t; + } + _vluti2q_lane_s8(a, b, LANE) +} +#[doc = "Lookup table read with 2-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti2_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 1))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti2_lane_s16(a: int16x4_t, b: uint8x8_t) -> int16x8_t { + static_assert!(LANE >= 0 && LANE <= 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vluti2.lane.v8i16.v4i16" + )] + fn _vluti2_lane_s16(a: int16x4_t, b: uint8x8_t, n: i32) -> int16x8_t; + } + _vluti2_lane_s16(a, b, LANE) +} +#[doc = "Lookup table read with 2-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti2q_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 1))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti2q_lane_s16(a: int16x8_t, b: uint8x8_t) -> int16x8_t { + static_assert!(LANE >= 0 && LANE <= 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vluti2.lane.v8i16.v8i16" + )] + fn _vluti2q_lane_s16(a: int16x8_t, b: uint8x8_t, n: i32) -> int16x8_t; + } + _vluti2q_lane_s16(a, b, LANE) +} +#[doc = "Lookup table read with 2-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti2_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 1))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti2_lane_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x16_t { + 
static_assert!(LANE >= 0 && LANE <= 1); + transmute(vluti2_lane_s8::(transmute(a), b)) +} +#[doc = "Lookup table read with 2-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti2q_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 1))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti2q_lane_u8(a: uint8x16_t, b: uint8x8_t) -> uint8x16_t { + static_assert!(LANE >= 0 && LANE <= 1); + transmute(vluti2q_lane_s8::(transmute(a), b)) +} +#[doc = "Lookup table read with 2-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti2_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 1))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti2_lane_u16(a: uint16x4_t, b: uint8x8_t) -> uint16x8_t { + static_assert!(LANE >= 0 && LANE <= 3); + transmute(vluti2_lane_s16::(transmute(a), b)) +} +#[doc = "Lookup table read with 2-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti2q_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 1))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti2q_lane_u16(a: uint16x8_t, b: uint8x8_t) -> uint16x8_t { + static_assert!(LANE >= 0 && LANE <= 3); + transmute(vluti2q_lane_s16::(transmute(a), b)) +} +#[doc = "Lookup table read with 2-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti2_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 1))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti2_lane_p8(a: poly8x8_t, b: uint8x8_t) -> poly8x16_t { + static_assert!(LANE >= 0 && LANE <= 1); + transmute(vluti2_lane_s8::(transmute(a), b)) +} +#[doc = "Lookup table read with 2-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti2q_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 1))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti2q_lane_p8(a: poly8x16_t, b: uint8x8_t) -> poly8x16_t { + static_assert!(LANE >= 0 && LANE <= 1); + transmute(vluti2q_lane_s8::(transmute(a), b)) +} +#[doc = "Lookup table read with 2-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti2_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 1))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti2_lane_p16(a: poly16x4_t, b: 
uint8x8_t) -> poly16x8_t { + static_assert!(LANE >= 0 && LANE <= 3); + transmute(vluti2_lane_s16::(transmute(a), b)) +} +#[doc = "Lookup table read with 2-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti2q_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 1))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti2q_lane_p16(a: poly16x8_t, b: uint8x8_t) -> poly16x8_t { + static_assert!(LANE >= 0 && LANE <= 3); + transmute(vluti2q_lane_s16::(transmute(a), b)) +} +#[doc = "Lookup table read with 4-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti4q_lane_f16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut,fp16")] +#[cfg_attr(test, assert_instr(nop, LANE = 0))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti4q_lane_f16_x2(a: float16x8x2_t, b: uint8x8_t) -> float16x8_t { + static_assert!(LANE >= 0 && LANE <= 1); + transmute(vluti4q_lane_s16_x2::(transmute(a), b)) +} +#[doc = "Lookup table read with 4-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti4q_lane_u16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 0))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti4q_lane_u16_x2(a: uint16x8x2_t, b: uint8x8_t) -> uint16x8_t { + static_assert!(LANE >= 0 && LANE <= 1); + transmute(vluti4q_lane_s16_x2::(transmute(a), b)) +} +#[doc = "Lookup table read with 4-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti4q_lane_p16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 0))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti4q_lane_p16_x2(a: poly16x8x2_t, b: uint8x8_t) -> poly16x8_t { + static_assert!(LANE >= 0 && LANE <= 1); + transmute(vluti4q_lane_s16_x2::(transmute(a), b)) +} +#[doc = "Lookup table read with 4-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti4q_lane_s16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 0))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti4q_lane_s16_x2(a: int16x8x2_t, b: uint8x8_t) -> int16x8_t { + static_assert!(LANE >= 0 && LANE <= 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vluti4q.lane.x2.v8i16" + )] + fn _vluti4q_lane_s16_x2(a: int16x8_t, a: int16x8_t, b: uint8x8_t, n: i32) -> int16x8_t; + } + _vluti4q_lane_s16_x2(a.0, a.1, b, LANE) +} +#[doc = "Lookup table read with 4-bit indices"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti4q_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 0))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti4q_lane_s8(a: int8x16_t, b: uint8x8_t) -> int8x16_t { + static_assert!(LANE == 0); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vluti4q.lane.v8i8" + )] + fn _vluti4q_lane_s8(a: int8x16_t, b: uint8x8_t, n: i32) -> int8x16_t; + } + _vluti4q_lane_s8(a, b, LANE) +} +#[doc = "Lookup table read with 4-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti4q_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 0))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti4q_lane_u8(a: uint8x16_t, b: uint8x8_t) -> uint8x16_t { + static_assert!(LANE == 0); + transmute(vluti4q_lane_s8::(transmute(a), b)) +} +#[doc = "Lookup table read with 4-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti4q_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 0))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti4q_lane_p8(a: poly8x16_t, b: uint8x8_t) -> poly8x16_t { + static_assert!(LANE == 0); + transmute(vluti4q_lane_s8::(transmute(a), b)) +} +#[doc = "Lookup table read with 4-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti4q_laneq_f16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut,fp16")] +#[cfg_attr(test, assert_instr(nop, LANE = 3))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti4q_laneq_f16_x2( + a: float16x8x2_t, + b: uint8x16_t, +) -> float16x8_t { + static_assert!(LANE >= 0 && LANE <= 3); + transmute(vluti4q_laneq_s16_x2::(transmute(a), b)) +} +#[doc = "Lookup table read with 4-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti4q_laneq_u16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 3))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti4q_laneq_u16_x2(a: uint16x8x2_t, b: uint8x16_t) -> uint16x8_t { + static_assert!(LANE >= 0 && LANE <= 3); + transmute(vluti4q_laneq_s16_x2::(transmute(a), b)) +} +#[doc = "Lookup table read with 4-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti4q_laneq_p16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 3))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] 
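// Editorial note, not part of the patch: the 16-bit vluti4q_*_x2 forms take a
// two-register table (e.g. int16x8x2_t) that the _s16_x2 shims in this hunk
// split into a.0/a.1 before calling the llvm.aarch64.neon.vluti4q.lane{,q}.x2.v8i16
// builtins. With 4-bit indices, an 8 x 16-bit lookup consumes 8 x 4 = 32 index
// bits, which matches the asserted ranges: LANE <= 1 for the 64-bit index
// vector (lane) and LANE <= 3 for the 128-bit one (laneq). A hedged sketch,
// assuming aarch64 with the unstable stdarch_neon_feat_lut feature enabled and
// with `table: uint16x8x2_t` and `indices: uint8x16_t` prepared elsewhere:
//
//     use core::arch::aarch64::*;
//     let r: uint16x8_t = unsafe { vluti4q_laneq_u16_x2::<0>(table, indices) };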
+#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti4q_laneq_p16_x2(a: poly16x8x2_t, b: uint8x16_t) -> poly16x8_t { + static_assert!(LANE >= 0 && LANE <= 3); + transmute(vluti4q_laneq_s16_x2::(transmute(a), b)) +} +#[doc = "Lookup table read with 4-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti4q_laneq_s16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 3))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti4q_laneq_s16_x2(a: int16x8x2_t, b: uint8x16_t) -> int16x8_t { + static_assert!(LANE >= 0 && LANE <= 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vluti4q.laneq.x2.v8i16" + )] + fn _vluti4q_laneq_s16_x2(a: int16x8_t, b: int16x8_t, c: uint8x16_t, n: i32) -> int16x8_t; + } + _vluti4q_laneq_s16_x2(a.0, a.1, b, LANE) +} +#[doc = "Lookup table read with 4-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti4q_laneq_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 0))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti4q_laneq_s8(a: int8x16_t, b: uint8x16_t) -> int8x16_t { + static_assert!(LANE >= 0 && LANE <= 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vluti4q.laneq.v16i8" + )] + fn _vluti4q_laneq_s8(a: int8x16_t, b: uint8x16_t, n: i32) -> int8x16_t; + } + _vluti4q_laneq_s8(a, b, LANE) +} +#[doc = "Lookup table read with 4-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti4q_laneq_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 0))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti4q_laneq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + static_assert!(LANE >= 0 && LANE <= 1); + transmute(vluti4q_laneq_s8::(transmute(a), b)) +} +#[doc = "Lookup table read with 4-bit indices"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vluti4q_laneq_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,lut")] +#[cfg_attr(test, assert_instr(nop, LANE = 0))] +#[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vluti4q_laneq_p8(a: poly8x16_t, b: uint8x16_t) -> poly8x16_t { + static_assert!(LANE >= 0 && LANE <= 1); + transmute(vluti4q_laneq_s8::(transmute(a), b)) +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmax_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmax))] +pub fn vmax_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = 
"arm64ec"), + link_name = "llvm.aarch64.neon.fmax.v1f64" + )] + fn _vmax_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t; + } + unsafe { _vmax_f64(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmax))] +pub fn vmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmax.v2f64" + )] + fn _vmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vmaxq_f64(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmax))] +pub fn vmaxh_f16(a: f16, b: f16) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmax.f16" + )] + fn _vmaxh_f16(a: f16, b: f16) -> f16; + } + unsafe { _vmaxh_f16(a, b) } +} +#[doc = "Floating-point Maximum Number (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnm_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmaxnm))] +pub fn vmaxnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnm.v1f64" + )] + fn _vmaxnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t; + } + unsafe { _vmaxnm_f64(a, b) } +} +#[doc = "Floating-point Maximum Number (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmaxnm))] +pub fn vmaxnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnm.v2f64" + )] + fn _vmaxnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vmaxnmq_f64(a, b) } +} +#[doc = "Floating-point Maximum Number"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmaxnm))] +pub fn vmaxnmh_f16(a: f16, b: f16) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnm.f16" + )] + fn _vmaxnmh_f16(a: f16, b: f16) -> f16; + } + unsafe { _vmaxnmh_f16(a, b) } +} +#[doc = "Floating-point maximum number across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmv_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmaxnmv))] +pub fn vmaxnmv_f16(a: 
float16x4_t) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmv.f16.v4f16" + )] + fn _vmaxnmv_f16(a: float16x4_t) -> f16; + } + unsafe { _vmaxnmv_f16(a) } +} +#[doc = "Floating-point maximum number across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmvq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmaxnmv))] +pub fn vmaxnmvq_f16(a: float16x8_t) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmv.f16.v8f16" + )] + fn _vmaxnmvq_f16(a: float16x8_t) -> f16; + } + unsafe { _vmaxnmvq_f16(a) } +} +#[doc = "Floating-point maximum number across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmv_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmaxnmp))] +pub fn vmaxnmv_f32(a: float32x2_t) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmv.f32.v2f32" + )] + fn _vmaxnmv_f32(a: float32x2_t) -> f32; + } + unsafe { _vmaxnmv_f32(a) } +} +#[doc = "Floating-point maximum number across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmvq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmaxnmp))] +pub fn vmaxnmvq_f64(a: float64x2_t) -> f64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmv.f64.v2f64" + )] + fn _vmaxnmvq_f64(a: float64x2_t) -> f64; + } + unsafe { _vmaxnmvq_f64(a) } +} +#[doc = "Floating-point maximum number across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmvq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmaxnmv))] +pub fn vmaxnmvq_f32(a: float32x4_t) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmv.f32.v4f32" + )] + fn _vmaxnmvq_f32(a: float32x4_t) -> f32; + } + unsafe { _vmaxnmvq_f32(a) } +} +#[doc = "Floating-point maximum number across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmaxv))] +pub fn vmaxv_f16(a: float16x4_t) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxv.f16.v4f16" + )] + fn _vmaxv_f16(a: float16x4_t) -> f16; + } + unsafe { _vmaxv_f16(a) } +} +#[doc = "Floating-point maximum number across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = 
"136306")] +#[cfg_attr(test, assert_instr(fmaxv))] +pub fn vmaxvq_f16(a: float16x8_t) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxv.f16.v8f16" + )] + fn _vmaxvq_f16(a: float16x8_t) -> f16; + } + unsafe { _vmaxvq_f16(a) } +} +#[doc = "Horizontal vector max."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmaxp))] +pub fn vmaxv_f32(a: float32x2_t) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxv.f32.v2f32" + )] + fn _vmaxv_f32(a: float32x2_t) -> f32; + } + unsafe { _vmaxv_f32(a) } +} +#[doc = "Horizontal vector max."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmaxv))] +pub fn vmaxvq_f32(a: float32x4_t) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxv.f32.v4f32" + )] + fn _vmaxvq_f32(a: float32x4_t) -> f32; + } + unsafe { _vmaxvq_f32(a) } +} +#[doc = "Horizontal vector max."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmaxp))] +pub fn vmaxvq_f64(a: float64x2_t) -> f64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxv.f64.v2f64" + )] + fn _vmaxvq_f64(a: float64x2_t) -> f64; + } + unsafe { _vmaxvq_f64(a) } +} +#[doc = "Horizontal vector max."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(smaxv))] +pub fn vmaxv_s8(a: int8x8_t) -> i8 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smaxv.i8.v8i8" + )] + fn _vmaxv_s8(a: int8x8_t) -> i8; + } + unsafe { _vmaxv_s8(a) } +} +#[doc = "Horizontal vector max."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(smaxv))] +pub fn vmaxvq_s8(a: int8x16_t) -> i8 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smaxv.i8.v16i8" + )] + fn _vmaxvq_s8(a: int8x16_t) -> i8; + } + unsafe { _vmaxvq_s8(a) } +} +#[doc = "Horizontal vector max."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(smaxv))] +pub fn vmaxv_s16(a: int16x4_t) -> i16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = 
"aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smaxv.i16.v4i16" + )] + fn _vmaxv_s16(a: int16x4_t) -> i16; + } + unsafe { _vmaxv_s16(a) } +} +#[doc = "Horizontal vector max."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(smaxv))] +pub fn vmaxvq_s16(a: int16x8_t) -> i16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smaxv.i16.v8i16" + )] + fn _vmaxvq_s16(a: int16x8_t) -> i16; + } + unsafe { _vmaxvq_s16(a) } +} +#[doc = "Horizontal vector max."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(smaxp))] +pub fn vmaxv_s32(a: int32x2_t) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smaxv.i32.v2i32" + )] + fn _vmaxv_s32(a: int32x2_t) -> i32; + } + unsafe { _vmaxv_s32(a) } +} +#[doc = "Horizontal vector max."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(smaxv))] +pub fn vmaxvq_s32(a: int32x4_t) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smaxv.i32.v4i32" + )] + fn _vmaxvq_s32(a: int32x4_t) -> i32; + } + unsafe { _vmaxvq_s32(a) } +} +#[doc = "Horizontal vector max."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(umaxv))] +pub fn vmaxv_u8(a: uint8x8_t) -> u8 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umaxv.i8.v8i8" + )] + fn _vmaxv_u8(a: uint8x8_t) -> u8; + } + unsafe { _vmaxv_u8(a) } +} +#[doc = "Horizontal vector max."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(umaxv))] +pub fn vmaxvq_u8(a: uint8x16_t) -> u8 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umaxv.i8.v16i8" + )] + fn _vmaxvq_u8(a: uint8x16_t) -> u8; + } + unsafe { _vmaxvq_u8(a) } +} +#[doc = "Horizontal vector max."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(umaxv))] +pub fn vmaxv_u16(a: uint16x4_t) -> u16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umaxv.i16.v4i16" + )] + fn _vmaxv_u16(a: uint16x4_t) -> u16; + } + unsafe { _vmaxv_u16(a) } +} +#[doc = 
"Horizontal vector max."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(umaxv))] +pub fn vmaxvq_u16(a: uint16x8_t) -> u16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umaxv.i16.v8i16" + )] + fn _vmaxvq_u16(a: uint16x8_t) -> u16; + } + unsafe { _vmaxvq_u16(a) } +} +#[doc = "Horizontal vector max."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(umaxp))] +pub fn vmaxv_u32(a: uint32x2_t) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umaxv.i32.v2i32" + )] + fn _vmaxv_u32(a: uint32x2_t) -> u32; + } + unsafe { _vmaxv_u32(a) } +} +#[doc = "Horizontal vector max."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(umaxv))] +pub fn vmaxvq_u32(a: uint32x4_t) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umaxv.i32.v4i32" + )] + fn _vmaxvq_u32(a: uint32x4_t) -> u32; + } + unsafe { _vmaxvq_u32(a) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmin_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmin))] +pub fn vmin_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmin.v1f64" + )] + fn _vmin_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t; + } + unsafe { _vmin_f64(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmin))] +pub fn vminq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmin.v2f64" + )] + fn _vminq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vminq_f64(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmin))] +pub fn vminh_f16(a: f16, b: f16) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmin.f16" + )] + fn _vminh_f16(a: f16, b: f16) -> f16; + } + unsafe { _vminh_f16(a, b) } +} +#[doc = "Floating-point Minimum Number (vector)"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnm_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fminnm))] +pub fn vminnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnm.v1f64" + )] + fn _vminnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t; + } + unsafe { _vminnm_f64(a, b) } +} +#[doc = "Floating-point Minimum Number (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fminnm))] +pub fn vminnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnm.v2f64" + )] + fn _vminnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vminnmq_f64(a, b) } +} +#[doc = "Floating-point Minimum Number"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fminnm))] +pub fn vminnmh_f16(a: f16, b: f16) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnm.f16" + )] + fn _vminnmh_f16(a: f16, b: f16) -> f16; + } + unsafe { _vminnmh_f16(a, b) } +} +#[doc = "Floating-point minimum number across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmv_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fminnmv))] +pub fn vminnmv_f16(a: float16x4_t) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmv.f16.v4f16" + )] + fn _vminnmv_f16(a: float16x4_t) -> f16; + } + unsafe { _vminnmv_f16(a) } +} +#[doc = "Floating-point minimum number across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmvq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fminnmv))] +pub fn vminnmvq_f16(a: float16x8_t) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmv.f16.v8f16" + )] + fn _vminnmvq_f16(a: float16x8_t) -> f16; + } + unsafe { _vminnmvq_f16(a) } +} +#[doc = "Floating-point minimum number across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmv_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fminnmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vminnmv_f32(a: float32x2_t) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmv.f32.v2f32" + )] + fn 
_vminnmv_f32(a: float32x2_t) -> f32; + } + unsafe { _vminnmv_f32(a) } +} +#[doc = "Floating-point minimum number across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmvq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fminnmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vminnmvq_f64(a: float64x2_t) -> f64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmv.f64.v2f64" + )] + fn _vminnmvq_f64(a: float64x2_t) -> f64; + } + unsafe { _vminnmvq_f64(a) } +} +#[doc = "Floating-point minimum number across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmvq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fminnmv))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vminnmvq_f32(a: float32x4_t) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmv.f32.v4f32" + )] + fn _vminnmvq_f32(a: float32x4_t) -> f32; + } + unsafe { _vminnmvq_f32(a) } +} +#[doc = "Floating-point minimum number across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fminv))] +pub fn vminv_f16(a: float16x4_t) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminv.f16.v4f16" + )] + fn _vminv_f16(a: float16x4_t) -> f16; + } + unsafe { _vminv_f16(a) } +} +#[doc = "Floating-point minimum number across vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fminv))] +pub fn vminvq_f16(a: float16x8_t) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminv.f16.v8f16" + )] + fn _vminvq_f16(a: float16x8_t) -> f16; + } + unsafe { _vminvq_f16(a) } +} +#[doc = "Horizontal vector min."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fminp))] +pub fn vminv_f32(a: float32x2_t) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminv.f32.v2f32" + )] + fn _vminv_f32(a: float32x2_t) -> f32; + } + unsafe { _vminv_f32(a) } +} +#[doc = "Horizontal vector min."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fminv))] +pub fn vminvq_f32(a: float32x4_t) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminv.f32.v4f32" + )] + fn 
_vminvq_f32(a: float32x4_t) -> f32; + } + unsafe { _vminvq_f32(a) } +} +#[doc = "Horizontal vector min."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fminp))] +pub fn vminvq_f64(a: float64x2_t) -> f64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminv.f64.v2f64" + )] + fn _vminvq_f64(a: float64x2_t) -> f64; + } + unsafe { _vminvq_f64(a) } +} +#[doc = "Horizontal vector min."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sminv))] +pub fn vminv_s8(a: int8x8_t) -> i8 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sminv.i8.v8i8" + )] + fn _vminv_s8(a: int8x8_t) -> i8; + } + unsafe { _vminv_s8(a) } +} +#[doc = "Horizontal vector min."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sminv))] +pub fn vminvq_s8(a: int8x16_t) -> i8 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sminv.i8.v16i8" + )] + fn _vminvq_s8(a: int8x16_t) -> i8; + } + unsafe { _vminvq_s8(a) } +} +#[doc = "Horizontal vector min."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sminv))] +pub fn vminv_s16(a: int16x4_t) -> i16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sminv.i16.v4i16" + )] + fn _vminv_s16(a: int16x4_t) -> i16; + } + unsafe { _vminv_s16(a) } +} +#[doc = "Horizontal vector min."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sminv))] +pub fn vminvq_s16(a: int16x8_t) -> i16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sminv.i16.v8i16" + )] + fn _vminvq_s16(a: int16x8_t) -> i16; + } + unsafe { _vminvq_s16(a) } +} +#[doc = "Horizontal vector min."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sminp))] +pub fn vminv_s32(a: int32x2_t) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sminv.i32.v2i32" + )] + fn _vminv_s32(a: int32x2_t) -> i32; + } + unsafe { _vminv_s32(a) } +} +#[doc = "Horizontal vector min."] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sminv))] +pub fn vminvq_s32(a: int32x4_t) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sminv.i32.v4i32" + )] + fn _vminvq_s32(a: int32x4_t) -> i32; + } + unsafe { _vminvq_s32(a) } +} +#[doc = "Horizontal vector min."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uminv))] +pub fn vminv_u8(a: uint8x8_t) -> u8 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uminv.i8.v8i8" + )] + fn _vminv_u8(a: uint8x8_t) -> u8; + } + unsafe { _vminv_u8(a) } +} +#[doc = "Horizontal vector min."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uminv))] +pub fn vminvq_u8(a: uint8x16_t) -> u8 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uminv.i8.v16i8" + )] + fn _vminvq_u8(a: uint8x16_t) -> u8; + } + unsafe { _vminvq_u8(a) } +} +#[doc = "Horizontal vector min."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uminv))] +pub fn vminv_u16(a: uint16x4_t) -> u16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uminv.i16.v4i16" + )] + fn _vminv_u16(a: uint16x4_t) -> u16; + } + unsafe { _vminv_u16(a) } +} +#[doc = "Horizontal vector min."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uminv))] +pub fn vminvq_u16(a: uint16x8_t) -> u16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uminv.i16.v8i16" + )] + fn _vminvq_u16(a: uint16x8_t) -> u16; + } + unsafe { _vminvq_u16(a) } +} +#[doc = "Horizontal vector min."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uminp))] +pub fn vminv_u32(a: uint32x2_t) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uminv.i32.v2i32" + )] + fn _vminv_u32(a: uint32x2_t) -> u32; + } + unsafe { _vminv_u32(a) } +} +#[doc = "Horizontal vector min."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = 
"neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uminv))] +pub fn vminvq_u32(a: uint32x4_t) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uminv.i32.v4i32" + )] + fn _vminvq_u32(a: uint32x4_t) -> u32; + } + unsafe { _vminvq_u32(a) } +} +#[doc = "Floating-point multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmul))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmla_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Floating-point multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmul))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlal2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_lane_s16(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlal_high_s16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlal2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_laneq_s16( + a: int32x4_t, + b: int16x8_t, + c: int16x8_t, +) -> int32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmlal_high_s16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlal2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_lane_s32(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + vmlal_high_s32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlal2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn 
vmlal_high_laneq_s32( + a: int64x2_t, + b: int32x4_t, + c: int32x4_t, +) -> int64x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlal_high_s32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlal2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_lane_u16( + a: uint32x4_t, + b: uint16x8_t, + c: uint16x4_t, +) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlal_high_u16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_laneq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlal2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_laneq_u16( + a: uint32x4_t, + b: uint16x8_t, + c: uint16x8_t, +) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmlal_high_u16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlal2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_lane_u32( + a: uint64x2_t, + b: uint32x4_t, + c: uint32x2_t, +) -> uint64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + vmlal_high_u32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlal2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_laneq_u32( + a: uint64x2_t, + b: uint32x4_t, + c: uint32x4_t, +) -> uint64x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlal_high_u32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlal2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t { + vmlal_high_s16(a, b, vdupq_n_s16(c)) +} +#[doc = "Multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlal2))] +#[stable(feature = "neon_intrinsics", since = 
"1.59.0")] +pub fn vmlal_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t { + vmlal_high_s32(a, b, vdupq_n_s32(c)) +} +#[doc = "Multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlal2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_n_u16(a: uint32x4_t, b: uint16x8_t, c: u16) -> uint32x4_t { + vmlal_high_u16(a, b, vdupq_n_u16(c)) +} +#[doc = "Multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlal2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_n_u32(a: uint64x2_t, b: uint32x4_t, c: u32) -> uint64x2_t { + vmlal_high_u32(a, b, vdupq_n_u32(c)) +} +#[doc = "Signed multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlal2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t { + unsafe { + let b: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let c: int8x8_t = simd_shuffle!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); + vmlal_s8(a, b, c) + } +} +#[doc = "Signed multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlal2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t { + unsafe { + let b: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + let c: int16x4_t = simd_shuffle!(c, c, [4, 5, 6, 7]); + vmlal_s16(a, b, c) + } +} +#[doc = "Signed multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlal2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t { + unsafe { + let b: int32x2_t = simd_shuffle!(b, b, [2, 3]); + let c: int32x2_t = simd_shuffle!(c, c, [2, 3]); + vmlal_s32(a, b, c) + } +} +#[doc = "Unsigned multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlal2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t { + unsafe { + let b: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let c: uint8x8_t = simd_shuffle!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); + vmlal_u8(a, b, c) + } +} +#[doc = "Unsigned multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlal2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t { + unsafe { + let 
b: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + let c: uint16x4_t = simd_shuffle!(c, c, [4, 5, 6, 7]); + vmlal_u16(a, b, c) + } +} +#[doc = "Unsigned multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlal2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlal_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t { + unsafe { + let b: uint32x2_t = simd_shuffle!(b, b, [2, 3]); + let c: uint32x2_t = simd_shuffle!(c, c, [2, 3]); + vmlal_u32(a, b, c) + } +} +#[doc = "Floating-point multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmul))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmls_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t { + unsafe { simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Floating-point multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmul))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t { + unsafe { simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_lane_s16(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlsl_high_s16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_laneq_s16( + a: int32x4_t, + b: int16x8_t, + c: int16x8_t, +) -> int32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmlsl_high_s16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_lane_s32(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + vmlsl_high_s32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply-subtract 
long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_laneq_s32( + a: int64x2_t, + b: int32x4_t, + c: int32x4_t, +) -> int64x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlsl_high_s32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_lane_u16( + a: uint32x4_t, + b: uint16x8_t, + c: uint16x4_t, +) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlsl_high_u16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_laneq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_laneq_u16( + a: uint32x4_t, + b: uint16x8_t, + c: uint16x8_t, +) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmlsl_high_u16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_lane_u32( + a: uint64x2_t, + b: uint32x4_t, + c: uint32x2_t, +) -> uint64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + vmlsl_high_u32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_laneq_u32( + a: uint64x2_t, + b: uint32x4_t, + c: uint32x4_t, +) -> uint64x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlsl_high_u32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlsl2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_n_s16(a: int32x4_t, b: int16x8_t, c: 
i16) -> int32x4_t { + vmlsl_high_s16(a, b, vdupq_n_s16(c)) +} +#[doc = "Multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlsl2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t { + vmlsl_high_s32(a, b, vdupq_n_s32(c)) +} +#[doc = "Multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlsl2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_n_u16(a: uint32x4_t, b: uint16x8_t, c: u16) -> uint32x4_t { + vmlsl_high_u16(a, b, vdupq_n_u16(c)) +} +#[doc = "Multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlsl2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_n_u32(a: uint64x2_t, b: uint32x4_t, c: u32) -> uint64x2_t { + vmlsl_high_u32(a, b, vdupq_n_u32(c)) +} +#[doc = "Signed multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlsl2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t { + unsafe { + let b: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let c: int8x8_t = simd_shuffle!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); + vmlsl_s8(a, b, c) + } +} +#[doc = "Signed multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlsl2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t { + unsafe { + let b: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + let c: int16x4_t = simd_shuffle!(c, c, [4, 5, 6, 7]); + vmlsl_s16(a, b, c) + } +} +#[doc = "Signed multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlsl2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t { + unsafe { + let b: int32x2_t = simd_shuffle!(b, b, [2, 3]); + let c: int32x2_t = simd_shuffle!(c, c, [2, 3]); + vmlsl_s32(a, b, c) + } +} +#[doc = "Unsigned multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlsl2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t { + unsafe { + let b: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let c: uint8x8_t = simd_shuffle!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); + vmlsl_u8(a, b, c) + } +} +#[doc = "Unsigned 
multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlsl2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t { + unsafe { + let b: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + let c: uint16x4_t = simd_shuffle!(c, c, [4, 5, 6, 7]); + vmlsl_u16(a, b, c) + } +} +#[doc = "Unsigned multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlsl2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmlsl_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t { + unsafe { + let b: uint32x2_t = simd_shuffle!(b, b, [2, 3]); + let c: uint32x2_t = simd_shuffle!(c, c, [2, 3]); + vmlsl_u32(a, b, c) + } +} +#[doc = "Vector move"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_high_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sxtl2))] +pub fn vmovl_high_s8(a: int8x16_t) -> int16x8_t { + unsafe { + let a: int8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + vmovl_s8(a) + } +} +#[doc = "Vector move"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sxtl2))] +pub fn vmovl_high_s16(a: int16x8_t) -> int32x4_t { + unsafe { + let a: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); + vmovl_s16(a) + } +} +#[doc = "Vector move"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sxtl2))] +pub fn vmovl_high_s32(a: int32x4_t) -> int64x2_t { + unsafe { + let a: int32x2_t = simd_shuffle!(a, a, [2, 3]); + vmovl_s32(a) + } +} +#[doc = "Vector move"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_high_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uxtl2))] +pub fn vmovl_high_u8(a: uint8x16_t) -> uint16x8_t { + unsafe { + let a: uint8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + vmovl_u8(a) + } +} +#[doc = "Vector move"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uxtl2))] +pub fn vmovl_high_u16(a: uint16x8_t) -> uint32x4_t { + unsafe { + let a: uint16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); + vmovl_u16(a) + } +} +#[doc = "Vector move"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uxtl2))] +pub fn vmovl_high_u32(a: uint32x4_t) -> 
uint64x2_t { + unsafe { + let a: uint32x2_t = simd_shuffle!(a, a, [2, 3]); + vmovl_u32(a) + } +} +#[doc = "Extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(xtn2))] +pub fn vmovn_high_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { + unsafe { + let c: int8x8_t = simd_cast(b); + simd_shuffle!(a, c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) + } +} +#[doc = "Extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(xtn2))] +pub fn vmovn_high_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t { + unsafe { + let c: int16x4_t = simd_cast(b); + simd_shuffle!(a, c, [0, 1, 2, 3, 4, 5, 6, 7]) + } +} +#[doc = "Extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_high_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(xtn2))] +pub fn vmovn_high_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t { + unsafe { + let c: int32x2_t = simd_cast(b); + simd_shuffle!(a, c, [0, 1, 2, 3]) + } +} +#[doc = "Extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(xtn2))] +pub fn vmovn_high_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t { + unsafe { + let c: uint8x8_t = simd_cast(b); + simd_shuffle!(a, c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) + } +} +#[doc = "Extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(xtn2))] +pub fn vmovn_high_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t { + unsafe { + let c: uint16x4_t = simd_cast(b); + simd_shuffle!(a, c, [0, 1, 2, 3, 4, 5, 6, 7]) + } +} +#[doc = "Extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_high_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(xtn2))] +pub fn vmovn_high_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t { + unsafe { + let c: uint32x2_t = simd_cast(b); + simd_shuffle!(a, c, [0, 1, 2, 3]) + } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmul))] +pub fn vmul_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmul))] +pub fn vmulq_f64(a: 
float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmul, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmul_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t) -> float64x1_t { + static_assert!(LANE == 0); + unsafe { simd_mul(a, transmute::<f64, _>(simd_extract!(b, LANE as u32))) } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_laneq_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmul, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmul_laneq_f16<const LANE: i32>(a: float16x4_t, b: float16x8_t) -> float16x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + simd_mul( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmul, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmulq_laneq_f16<const LANE: i32>(a: float16x8_t, b: float16x8_t) -> float16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + simd_mul( + a, + simd_shuffle!( + b, + b, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmul, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmul_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x2_t) -> float64x1_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { simd_mul(a, transmute::<f64, _>(simd_extract!(b, LANE as u32))) } +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_n_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmul))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmul_n_f64(a: float64x1_t, b: f64) -> float64x1_t { + unsafe { simd_mul(a, vdup_n_f64(b)) } +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_n_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmul))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmulq_n_f64(a: float64x2_t, b: f64) -> float64x2_t { + unsafe { simd_mul(a, vdupq_n_f64(b)) } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmuld_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmul, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmuld_lane_f64<const LANE: i32>(a: f64, b: float64x1_t) -> f64 { +
static_assert!(LANE == 0); + unsafe { + let b: f64 = simd_extract!(b, LANE as u32); + a * b + } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vmulh_f16(a: f16, b: f16) -> f16 { + a * b +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulh_lane_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmul, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmulh_lane_f16<const LANE: i32>(a: f16, b: float16x4_t) -> f16 { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let b: f16 = simd_extract!(b, LANE as u32); + a * b + } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulh_laneq_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmul, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmulh_laneq_f16<const LANE: i32>(a: f16, b: float16x8_t) -> f16 { + static_assert_uimm_bits!(LANE, 3); + unsafe { + let b: f16 = simd_extract!(b, LANE as u32); + a * b + } +} +#[doc = "Multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smull2, LANE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmull_high_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmull_high_s16( + a, + simd_shuffle!( + b, + b, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smull2, LANE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmull_high_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmull_high_s16( + a, + simd_shuffle!( + b, + b, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smull2, LANE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmull_high_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + vmull_high_s32( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2, LANE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmull_high_laneq_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmull_high_s32( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umull2, LANE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmull_high_lane_u16(a: uint16x8_t, b: uint16x4_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmull_high_u16( + a, + simd_shuffle!( + b, + b, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_laneq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umull2, LANE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmull_high_laneq_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmull_high_u16( + a, + simd_shuffle!( + b, + b, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umull2, LANE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmull_high_lane_u32(a: uint32x4_t, b: uint32x2_t) -> uint64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + vmull_high_u32( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umull2, LANE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmull_high_laneq_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmull_high_u32( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smull2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmull_high_n_s16(a: int16x8_t, b: i16) -> int32x4_t { + vmull_high_s16(a, vdupq_n_s16(b)) +} +#[doc = "Multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smull2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmull_high_n_s32(a: 
int32x4_t, b: i32) -> int64x2_t { + vmull_high_s32(a, vdupq_n_s32(b)) +} +#[doc = "Multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umull2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmull_high_n_u16(a: uint16x8_t, b: u16) -> uint32x4_t { + vmull_high_u16(a, vdupq_n_u16(b)) +} +#[doc = "Multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umull2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmull_high_n_u32(a: uint32x4_t, b: u32) -> uint64x2_t { + vmull_high_u32(a, vdupq_n_u32(b)) +} +#[doc = "Polynomial multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_p64)"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(pmull))] +pub fn vmull_high_p64(a: poly64x2_t, b: poly64x2_t) -> p128 { + unsafe { vmull_p64(simd_extract!(a, 1), simd_extract!(b, 1)) } +} +#[doc = "Polynomial multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(pmull))] +pub fn vmull_high_p8(a: poly8x16_t, b: poly8x16_t) -> poly16x8_t { + unsafe { + let a: poly8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: poly8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + vmull_p8(a, b) + } +} +#[doc = "Signed multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(smull2))] +pub fn vmull_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { + unsafe { + let a: int8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + vmull_s8(a, b) + } +} +#[doc = "Signed multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(smull2))] +pub fn vmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { + unsafe { + let a: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); + let b: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + vmull_s16(a, b) + } +} +#[doc = "Signed multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(smull2))] +pub fn vmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { + unsafe { + let a: int32x2_t = simd_shuffle!(a, a, [2, 3]); + let b: int32x2_t = simd_shuffle!(b, b, [2, 3]); + vmull_s32(a, b) + } +} +#[doc = "Unsigned multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_u8)"] 
+#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(umull2))] +pub fn vmull_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { + unsafe { + let a: uint8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + vmull_u8(a, b) + } +} +#[doc = "Unsigned multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(umull2))] +pub fn vmull_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { + unsafe { + let a: uint16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); + let b: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + vmull_u16(a, b) + } +} +#[doc = "Unsigned multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(umull2))] +pub fn vmull_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { + unsafe { + let a: uint32x2_t = simd_shuffle!(a, a, [2, 3]); + let b: uint32x2_t = simd_shuffle!(b, b, [2, 3]); + vmull_u32(a, b) + } +} +#[doc = "Polynomial multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_p64)"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(pmull))] +pub fn vmull_p64(a: p64, b: p64) -> p128 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.pmull64" + )] + fn _vmull_p64(a: p64, b: p64) -> int8x16_t; + } + unsafe { transmute(_vmull_p64(a, b)) } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmul, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmulq_lane_f64(a: float64x2_t, b: float64x1_t) -> float64x2_t { + static_assert!(LANE == 0); + unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmul, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmulq_laneq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmuls_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmul, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmuls_lane_f32(a: f32, b: float32x2_t) -> f32 { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let b: f32 = simd_extract!(b, LANE as u32); + 
a * b + } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmuls_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmul, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmuls_laneq_f32(a: f32, b: float32x4_t) -> f32 { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let b: f32 = simd_extract!(b, LANE as u32); + a * b + } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmuld_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmul, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmuld_laneq_f64(a: f64, b: float64x2_t) -> f64 { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let b: f64 = simd_extract!(b, LANE as u32); + a * b + } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmulx))] +pub fn vmulx_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmulx.v4f16" + )] + fn _vmulx_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vmulx_f16(a, b) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmulx))] +pub fn vmulxq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmulx.v8f16" + )] + fn _vmulxq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vmulxq_f16(a, b) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmulx))] +pub fn vmulx_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmulx.v2f32" + )] + fn _vmulx_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vmulx_f32(a, b) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmulx))] +pub fn vmulxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmulx.v4f32" + )] + fn _vmulxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vmulxq_f32(a, b) } +} 
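Not part of the patch, review context only: a minimal usage sketch for the vmulxq_f32 wrapper defined above, assuming an AArch64 toolchain recent enough to expose these NEON intrinsics as safe #[target_feature] functions (as this diff does) and to accept #[target_feature] on safe functions; the function name fmulx_vs_fmul is invented for illustration. It shows the one case where FMULX differs from an ordinary FMUL: a signed zero multiplied by an infinity yields 2.0 (sign-adjusted) instead of NaN.

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
fn fmulx_vs_fmul() -> (f32, f32) {
    use std::arch::aarch64::{vdupq_n_f32, vgetq_lane_f32, vmulq_f32, vmulxq_f32};
    let zero = vdupq_n_f32(0.0);
    let inf = vdupq_n_f32(f32::INFINITY);
    // FMULX defines 0.0 * infinity as 2.0 (with the sign of the exact product);
    // plain FMUL produces NaN for the same inputs.
    let extended = vmulxq_f32(zero, inf); // every lane is 2.0
    let plain = vmulq_f32(zero, inf); // every lane is NaN
    (vgetq_lane_f32::<0>(extended), vgetq_lane_f32::<0>(plain))
}

On hardware the first value is 2.0 and the second is NaN, which is why the vmulx family is bound to the LLVM fmulx intrinsics above rather than being lowered to simd_mul.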
+#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmulx))] +pub fn vmulx_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmulx.v1f64" + )] + fn _vmulx_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t; + } + unsafe { _vmulx_f64(a, b) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmulx))] +pub fn vmulxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmulx.v2f64" + )] + fn _vmulxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vmulxq_f64(a, b) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_lane_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmulx_lane_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmulx_f16( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_laneq_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmulx_laneq_f16(a: float16x4_t, b: float16x8_t) -> float16x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmulx_f16( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_lane_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmulxq_lane_f16(a: float16x8_t, b: float16x4_t) -> float16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmulxq_f16( + a, + simd_shuffle!( + b, + b, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_laneq_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmulxq_laneq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + 
static_assert_uimm_bits!(LANE, 3); + unsafe { + vmulxq_f16( + a, + simd_shuffle!( + b, + b, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmulx_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t) -> float32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmulx_f32(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmulx_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x4_t) -> float32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vmulx_f32(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmulxq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x2_t) -> float32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + vmulxq_f32( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmulxq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t) -> float32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmulxq_f32( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmulxq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t) -> float64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmulxq_f64(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmulx_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t) -> float64x1_t { + static_assert!(LANE == 0); + unsafe { vmulx_f64(a, transmute::<f64, _>(simd_extract!(b, LANE as u32))) } +}
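Not part of the patch, review context only: a sketch of how callers are expected to use the LANE const generic on the *_lane_* and *_laneq_* wrappers above. The module and function names below are invented for illustration, and a toolchain that accepts #[target_feature] on safe functions is assumed. The lane index is validated at compile time by the static_assert_uimm_bits! calls in the bodies, and #[rustc_legacy_const_generics(2)] additionally accepts the pre-const-generics call form with the index passed as a trailing value argument.

#[cfg(target_arch = "aarch64")]
mod lane_select_sketch {
    use std::arch::aarch64::{float32x4_t, vmulxq_laneq_f32};

    // Multiply every element of v by lane 3 of scale, using the extended
    // (FMULX) multiply semantics. An out-of-range index such as 4 would be
    // rejected at compile time.
    #[target_feature(enable = "neon")]
    pub fn scale_by_last_lane(v: float32x4_t, scale: float32x4_t) -> float32x4_t {
        vmulxq_laneq_f32::<3>(v, scale)
    }
}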
+#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmulx_laneq_f64(a: float64x1_t, b: float64x2_t) -> float64x1_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmulx_f64(a, transmute::(simd_extract!(b, LANE as u32))) } +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_n_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmulx))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmulx_n_f16(a: float16x4_t, b: f16) -> float16x4_t { + vmulx_f16(a, vdup_n_f16(b)) +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_n_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmulx))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmulxq_n_f16(a: float16x8_t, b: f16) -> float16x8_t { + vmulxq_f16(a, vdupq_n_f16(b)) +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmulx))] +pub fn vmulxd_f64(a: f64, b: f64) -> f64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmulx.f64" + )] + fn _vmulxd_f64(a: f64, b: f64) -> f64; + } + unsafe { _vmulxd_f64(a, b) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxs_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmulx))] +pub fn vmulxs_f32(a: f32, b: f32) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmulx.f32" + )] + fn _vmulxs_f32(a: f32, b: f32) -> f32; + } + unsafe { _vmulxs_f32(a, b) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxd_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmulxd_lane_f64(a: f64, b: float64x1_t) -> f64 { + static_assert!(LANE == 0); + unsafe { vmulxd_f64(a, simd_extract!(b, LANE as u32)) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxd_laneq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmulxd_laneq_f64(a: f64, b: float64x2_t) -> f64 { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmulxd_f64(a, simd_extract!(b, LANE as u32)) } +} +#[doc = "Floating-point multiply extended"] 
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxs_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmulxs_lane_f32(a: f32, b: float32x2_t) -> f32 { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmulxs_f32(a, simd_extract!(b, LANE as u32)) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxs_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmulxs_laneq_f32(a: f32, b: float32x4_t) -> f32 { + static_assert_uimm_bits!(LANE, 2); + unsafe { vmulxs_f32(a, simd_extract!(b, LANE as u32)) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmulx))] +pub fn vmulxh_f16(a: f16, b: f16) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmulx.f16" + )] + fn _vmulxh_f16(a: f16, b: f16) -> f16; + } + unsafe { _vmulxh_f16(a, b) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxh_lane_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmulxh_lane_f16(a: f16, b: float16x4_t) -> f16 { + static_assert_uimm_bits!(LANE, 2); + unsafe { vmulxh_f16(a, simd_extract!(b, LANE as u32)) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxh_laneq_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmulxh_laneq_f16(a: f16, b: float16x8_t) -> f16 { + static_assert_uimm_bits!(LANE, 3); + unsafe { vmulxh_f16(a, simd_extract!(b, LANE as u32)) } +} +#[doc = "Floating-point multiply extended"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmulx, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmulxq_lane_f64(a: float64x2_t, b: float64x1_t) -> float64x2_t { + static_assert!(LANE == 0); + unsafe { vmulxq_f64(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vneg_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fneg))] +pub fn vneg_f64(a: float64x1_t) -> float64x1_t { + unsafe { simd_neg(a) } +} +#[doc = "Negate"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vnegq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fneg))] +pub fn vnegq_f64(a: float64x2_t) -> float64x2_t { + unsafe { simd_neg(a) } +} +#[doc = "Negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vneg_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(neg))] +pub fn vneg_s64(a: int64x1_t) -> int64x1_t { + unsafe { simd_neg(a) } +} +#[doc = "Negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vnegq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(neg))] +pub fn vnegq_s64(a: int64x2_t) -> int64x2_t { + unsafe { simd_neg(a) } +} +#[doc = "Negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vnegd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(neg))] +pub fn vnegd_s64(a: i64) -> i64 { + a.wrapping_neg() +} +#[doc = "Negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vnegh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fneg))] +pub fn vnegh_f16(a: f16) -> f16 { + -a +} +#[doc = "Floating-point add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vpaddd_f64(a: float64x2_t) -> f64 { + unsafe { + let a1: f64 = simd_extract!(a, 0); + let a2: f64 = simd_extract!(a, 1); + a1 + a2 + } +} +#[doc = "Floating-point add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadds_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vpadds_f32(a: float32x2_t) -> f32 { + unsafe { + let a1: f32 = simd_extract!(a, 0); + let a2: f32 = simd_extract!(a, 1); + a1 + a2 + } +} +#[doc = "Add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddd_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vpaddd_s64(a: int64x2_t) -> i64 { + unsafe { transmute(vaddvq_u64(transmute(a))) } +} +#[doc = "Add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddd_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vpaddd_s64(a: int64x2_t) -> i64 { + let a: int64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(vaddvq_u64(transmute(a))) } +} +#[doc = "Add pairwise"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddd_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vpaddd_u64(a: uint64x2_t) -> u64 { + vaddvq_u64(a) +} +#[doc = "Floating-point add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(faddp))] +pub fn vpaddq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.faddp.v8f16" + )] + fn _vpaddq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vpaddq_f16(a, b) } +} +#[doc = "Floating-point add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(faddp))] +pub fn vpaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.faddp.v4f32" + )] + fn _vpaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vpaddq_f32(a, b) } +} +#[doc = "Floating-point add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(faddp))] +pub fn vpaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.faddp.v2f64" + )] + fn _vpaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vpaddq_f64(a, b) } +} +#[doc = "Add Pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vpaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.addp.v16i8" + )] + fn _vpaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } + unsafe { _vpaddq_s8(a, b) } +} +#[doc = "Add Pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vpaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.addp.v8i16" + )] + fn _vpaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vpaddq_s16(a, b) } +} +#[doc = "Add Pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s32)"] +#[inline] +#[target_feature(enable = "neon")] 
+#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vpaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.addp.v4i32" + )] + fn _vpaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vpaddq_s32(a, b) } +} +#[doc = "Add Pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vpaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.addp.v2i64" + )] + fn _vpaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; + } + unsafe { _vpaddq_s64(a, b) } +} +#[doc = "Add Pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { transmute(vpaddq_s8(transmute(a), transmute(b))) } +} +#[doc = "Add Pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x16_t = + unsafe { simd_shuffle!(b, b, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(vpaddq_s8(transmute(a), transmute(b))); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Add Pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vpaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { transmute(vpaddq_s16(transmute(a), transmute(b))) } +} +#[doc = "Add Pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vpaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint16x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x8_t = transmute(vpaddq_s16(transmute(a), transmute(b))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Add Pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u32)"] +#[inline] 
+#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vpaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { transmute(vpaddq_s32(transmute(a), transmute(b))) } +} +#[doc = "Add Pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vpaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + let b: uint32x4_t = unsafe { simd_shuffle!(b, b, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x4_t = transmute(vpaddq_s32(transmute(a), transmute(b))); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Add Pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vpaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { transmute(vpaddq_s64(transmute(a), transmute(b))) } +} +#[doc = "Add Pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(addp))] +pub fn vpaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + let b: uint64x2_t = unsafe { simd_shuffle!(b, b, [1, 0]) }; + unsafe { + let ret_val: uint64x2_t = transmute(vpaddq_s64(transmute(a), transmute(b))); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Floating-point add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmax_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmaxp))] +pub fn vpmax_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxp.v4f16" + )] + fn _vpmax_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vpmax_f16(a, b) } +} +#[doc = "Floating-point add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmaxp))] +pub fn vpmaxq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxp.v8f16" + )] + fn _vpmaxq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vpmaxq_f16(a, b) } +} +#[doc = "Floating-point add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxnm_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", 
issue = "136306")] +#[cfg_attr(test, assert_instr(fmaxnmp))] +pub fn vpmaxnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmp.v4f16" + )] + fn _vpmaxnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vpmaxnm_f16(a, b) } +} +#[doc = "Floating-point add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxnmq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fmaxnmp))] +pub fn vpmaxnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmp.v8f16" + )] + fn _vpmaxnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vpmaxnmq_f16(a, b) } +} +#[doc = "Floating-point Maximum Number Pairwise (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxnm_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmaxnmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vpmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmp.v2f32" + )] + fn _vpmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vpmaxnm_f32(a, b) } +} +#[doc = "Floating-point Maximum Number Pairwise (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxnmq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmaxnmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vpmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmp.v4f32" + )] + fn _vpmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vpmaxnmq_f32(a, b) } +} +#[doc = "Floating-point Maximum Number Pairwise (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxnmq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmaxnmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vpmaxnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmp.v2f64" + )] + fn _vpmaxnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vpmaxnmq_f64(a, b) } +} +#[doc = "Floating-point maximum number pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxnmqd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmaxnmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vpmaxnmqd_f64(a: float64x2_t) -> f64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmv.f64.v2f64" + )] + fn _vpmaxnmqd_f64(a: float64x2_t) 
-> f64; + } + unsafe { _vpmaxnmqd_f64(a) } +} +#[doc = "Floating-point maximum number pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxnms_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmaxnmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vpmaxnms_f32(a: float32x2_t) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmv.f32.v2f32" + )] + fn _vpmaxnms_f32(a: float32x2_t) -> f32; + } + unsafe { _vpmaxnms_f32(a) } +} +#[doc = "Folding maximum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmaxp))] +pub fn vpmaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxp.v4f32" + )] + fn _vpmaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vpmaxq_f32(a, b) } +} +#[doc = "Folding maximum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmaxp))] +pub fn vpmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxp.v2f64" + )] + fn _vpmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vpmaxq_f64(a, b) } +} +#[doc = "Folding maximum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(smaxp))] +pub fn vpmaxq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smaxp.v16i8" + )] + fn _vpmaxq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } + unsafe { _vpmaxq_s8(a, b) } +} +#[doc = "Folding maximum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(smaxp))] +pub fn vpmaxq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smaxp.v8i16" + )] + fn _vpmaxq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vpmaxq_s16(a, b) } +} +#[doc = "Folding maximum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(smaxp))] +pub fn vpmaxq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + 
any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smaxp.v4i32" + )] + fn _vpmaxq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vpmaxq_s32(a, b) } +} +#[doc = "Folding maximum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(umaxp))] +pub fn vpmaxq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umaxp.v16i8" + )] + fn _vpmaxq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + } + unsafe { _vpmaxq_u8(a, b) } +} +#[doc = "Folding maximum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(umaxp))] +pub fn vpmaxq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umaxp.v8i16" + )] + fn _vpmaxq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; + } + unsafe { _vpmaxq_u16(a, b) } +} +#[doc = "Folding maximum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(umaxp))] +pub fn vpmaxq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umaxp.v4i32" + )] + fn _vpmaxq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + unsafe { _vpmaxq_u32(a, b) } +} +#[doc = "Floating-point maximum pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxqd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmaxp))] +pub fn vpmaxqd_f64(a: float64x2_t) -> f64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxv.f64.v2f64" + )] + fn _vpmaxqd_f64(a: float64x2_t) -> f64; + } + unsafe { _vpmaxqd_f64(a) } +} +#[doc = "Floating-point maximum pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmaxs_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fmaxp))] +pub fn vpmaxs_f32(a: float32x2_t) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxv.f32.v2f32" + )] + fn _vpmaxs_f32(a: float32x2_t) -> f32; + } + unsafe { _vpmaxs_f32(a) } +} +#[doc = "Floating-point add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmin_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fminp))] +pub fn 
vpmin_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminp.v4f16" + )] + fn _vpmin_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vpmin_f16(a, b) } +} +#[doc = "Floating-point add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fminp))] +pub fn vpminq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminp.v8f16" + )] + fn _vpminq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vpminq_f16(a, b) } +} +#[doc = "Floating-point add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminnm_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fminnmp))] +pub fn vpminnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmp.v4f16" + )] + fn _vpminnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vpminnm_f16(a, b) } +} +#[doc = "Floating-point add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminnmq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fminnmp))] +pub fn vpminnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmp.v8f16" + )] + fn _vpminnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vpminnmq_f16(a, b) } +} +#[doc = "Floating-point Minimum Number Pairwise (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminnm_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fminnmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vpminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmp.v2f32" + )] + fn _vpminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vpminnm_f32(a, b) } +} +#[doc = "Floating-point Minimum Number Pairwise (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminnmq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fminnmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vpminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmp.v4f32" + )] + fn _vpminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vpminnmq_f32(a, b) } +} +#[doc = 
"Floating-point Minimum Number Pairwise (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminnmq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fminnmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vpminnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmp.v2f64" + )] + fn _vpminnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vpminnmq_f64(a, b) } +} +#[doc = "Floating-point minimum number pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminnmqd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fminnmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vpminnmqd_f64(a: float64x2_t) -> f64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmv.f64.v2f64" + )] + fn _vpminnmqd_f64(a: float64x2_t) -> f64; + } + unsafe { _vpminnmqd_f64(a) } +} +#[doc = "Floating-point minimum number pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminnms_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fminnmp))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vpminnms_f32(a: float32x2_t) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmv.f32.v2f32" + )] + fn _vpminnms_f32(a: float32x2_t) -> f32; + } + unsafe { _vpminnms_f32(a) } +} +#[doc = "Folding minimum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fminp))] +pub fn vpminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminp.v4f32" + )] + fn _vpminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vpminq_f32(a, b) } +} +#[doc = "Folding minimum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fminp))] +pub fn vpminq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminp.v2f64" + )] + fn _vpminq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vpminq_f64(a, b) } +} +#[doc = "Folding minimum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sminp))] +pub fn vpminq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", 
target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sminp.v16i8" + )] + fn _vpminq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } + unsafe { _vpminq_s8(a, b) } +} +#[doc = "Folding minimum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sminp))] +pub fn vpminq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sminp.v8i16" + )] + fn _vpminq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vpminq_s16(a, b) } +} +#[doc = "Folding minimum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sminp))] +pub fn vpminq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sminp.v4i32" + )] + fn _vpminq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vpminq_s32(a, b) } +} +#[doc = "Folding minimum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uminp))] +pub fn vpminq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uminp.v16i8" + )] + fn _vpminq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + } + unsafe { _vpminq_u8(a, b) } +} +#[doc = "Folding minimum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uminp))] +pub fn vpminq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uminp.v8i16" + )] + fn _vpminq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; + } + unsafe { _vpminq_u16(a, b) } +} +#[doc = "Folding minimum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uminp))] +pub fn vpminq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uminp.v4i32" + )] + fn _vpminq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + unsafe { _vpminq_u32(a, b) } +} +#[doc = "Floating-point minimum pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpminqd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, 
assert_instr(fminp))] +pub fn vpminqd_f64(a: float64x2_t) -> f64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminv.f64.v2f64" + )] + fn _vpminqd_f64(a: float64x2_t) -> f64; + } + unsafe { _vpminqd_f64(a) } +} +#[doc = "Floating-point minimum pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmins_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fminp))] +pub fn vpmins_f32(a: float32x2_t) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminv.f32.v2f32" + )] + fn _vpmins_f32(a: float32x2_t) -> f32; + } + unsafe { _vpmins_f32(a) } +} +#[doc = "Signed saturating Absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqabs_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(sqabs))] +pub fn vqabs_s64(a: int64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqabs.v1i64" + )] + fn _vqabs_s64(a: int64x1_t) -> int64x1_t; + } + unsafe { _vqabs_s64(a) } +} +#[doc = "Signed saturating Absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqabsq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(sqabs))] +pub fn vqabsq_s64(a: int64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqabs.v2i64" + )] + fn _vqabsq_s64(a: int64x2_t) -> int64x2_t; + } + unsafe { _vqabsq_s64(a) } +} +#[doc = "Signed saturating absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqabsb_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(sqabs))] +pub fn vqabsb_s8(a: i8) -> i8 { + unsafe { simd_extract!(vqabs_s8(vdup_n_s8(a)), 0) } +} +#[doc = "Signed saturating absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqabsh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(sqabs))] +pub fn vqabsh_s16(a: i16) -> i16 { + unsafe { simd_extract!(vqabs_s16(vdup_n_s16(a)), 0) } +} +#[doc = "Signed saturating absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqabss_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(sqabs))] +pub fn vqabss_s32(a: i32) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqabs.i32" + )] + fn _vqabss_s32(a: i32) -> i32; + } + unsafe { _vqabss_s32(a) } +} 
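A minimal sketch of how the scalar saturating-absolute-value intrinsics above differ from plain wrapping arithmetic (not part of the patch; assumes an aarch64 target with core::arch::aarch64 available):

#[cfg(target_arch = "aarch64")]
fn vqabs_demo() {
    use core::arch::aarch64::*;
    // SQABS saturates: |i8::MIN| does not fit in i8, so the result clamps to i8::MAX.
    // The unsafe block is only needed on toolchains where the NEON intrinsics
    // are still declared as unsafe fns.
    let saturated = unsafe { vqabsb_s8(i8::MIN) };
    assert_eq!(saturated, i8::MAX);
    // Ordinary two's-complement absolute value wraps back to i8::MIN instead.
    assert_eq!(i8::MIN.wrapping_abs(), i8::MIN);
}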
+#[doc = "Signed saturating absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqabsd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(sqabs))] +pub fn vqabsd_s64(a: i64) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqabs.i64" + )] + fn _vqabsd_s64(a: i64) -> i64; + } + unsafe { _vqabsd_s64(a) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddb_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sqadd))] +pub fn vqaddb_s8(a: i8, b: i8) -> i8 { + let a: int8x8_t = vdup_n_s8(a); + let b: int8x8_t = vdup_n_s8(b); + unsafe { simd_extract!(vqadd_s8(a, b), 0) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sqadd))] +pub fn vqaddh_s16(a: i16, b: i16) -> i16 { + let a: int16x4_t = vdup_n_s16(a); + let b: int16x4_t = vdup_n_s16(b); + unsafe { simd_extract!(vqadd_s16(a, b), 0) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddb_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uqadd))] +pub fn vqaddb_u8(a: u8, b: u8) -> u8 { + let a: uint8x8_t = vdup_n_u8(a); + let b: uint8x8_t = vdup_n_u8(b); + unsafe { simd_extract!(vqadd_u8(a, b), 0) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddh_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uqadd))] +pub fn vqaddh_u16(a: u16, b: u16) -> u16 { + let a: uint16x4_t = vdup_n_u16(a); + let b: uint16x4_t = vdup_n_u16(b); + unsafe { simd_extract!(vqadd_u16(a, b), 0) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqadds_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sqadd))] +pub fn vqadds_s32(a: i32, b: i32) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqadd.i32" + )] + fn _vqadds_s32(a: i32, b: i32) -> i32; + } + unsafe { _vqadds_s32(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sqadd))] +pub fn vqaddd_s64(a: i64, b: i64) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqadd.i64" + )] + fn _vqaddd_s64(a: i64, b: i64) -> i64; + } + unsafe { _vqaddd_s64(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqadds_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uqadd))] +pub fn vqadds_u32(a: u32, b: u32) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqadd.i32" + )] + fn _vqadds_u32(a: u32, b: u32) -> u32; + } + unsafe { _vqadds_u32(a, b) } +}
+#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddd_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uqadd))] +pub fn vqaddd_u64(a: u64, b: u64) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqadd.i64" + )] + fn _vqaddd_u64(a: u64, b: u64) -> u64; + } + unsafe { _vqaddd_u64(a, b) } +}
+#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_high_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal2, N = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlal_high_lane_s16<const N: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t { + static_assert_uimm_bits!(N, 2); + vqaddq_s32(a, vqdmull_high_lane_s16::<N>(b, c)) +}
+#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_high_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal2, N = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlal_high_laneq_s16<const N: i32>(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t { + static_assert_uimm_bits!(N, 3); + vqaddq_s32(a, vqdmull_high_laneq_s16::<N>(b, c)) +}
+#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_high_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal2, N = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlal_high_lane_s32<const N: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t { + static_assert_uimm_bits!(N, 1); + vqaddq_s64(a, vqdmull_high_lane_s32::<N>(b, c)) +}
+#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_high_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal2, N = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlal_high_laneq_s32<const N: i32>(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t { + static_assert_uimm_bits!(N, 2); + vqaddq_s64(a, vqdmull_high_laneq_s32::<N>(b, c)) +}
+#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_high_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal2))] +#[stable(feature =
"neon_intrinsics", since = "1.59.0")] +pub fn vqdmlal_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t { + vqaddq_s32(a, vqdmull_high_n_s16(b, c)) +} +#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t { + vqaddq_s32(a, vqdmull_high_s16(b, c)) +} +#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_high_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlal_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t { + vqaddq_s64(a, vqdmull_high_n_s32(b, c)) +} +#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t { + vqaddq_s64(a, vqdmull_high_s32(b, c)) +} +#[doc = "Vector widening saturating doubling multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal, N = 2))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlal_laneq_s16(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t { + static_assert_uimm_bits!(N, 3); + vqaddq_s32(a, vqdmull_laneq_s16::(b, c)) +} +#[doc = "Vector widening saturating doubling multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal, N = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlal_laneq_s32(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t { + static_assert_uimm_bits!(N, 2); + vqaddq_s64(a, vqdmull_laneq_s32::(b, c)) +} +#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlalh_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlalh_lane_s16(a: i32, b: i16, c: int16x4_t) -> i32 { + static_assert_uimm_bits!(LANE, 2); + unsafe { vqdmlalh_s16(a, b, simd_extract!(c, LANE as u32)) } +} +#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlalh_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn 
vqdmlalh_laneq_s16<const LANE: i32>(a: i32, b: i16, c: int16x8_t) -> i32 { + static_assert_uimm_bits!(LANE, 3); + unsafe { vqdmlalh_s16(a, b, simd_extract!(c, LANE as u32)) } +}
+#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlals_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlals_lane_s32<const LANE: i32>(a: i64, b: i32, c: int32x2_t) -> i64 { + static_assert_uimm_bits!(LANE, 1); + unsafe { vqdmlals_s32(a, b, simd_extract!(c, LANE as u32)) } +}
+#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlals_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlals_laneq_s32<const LANE: i32>(a: i64, b: i32, c: int32x4_t) -> i64 { + static_assert_uimm_bits!(LANE, 2); + unsafe { vqdmlals_s32(a, b, simd_extract!(c, LANE as u32)) } +}
+#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlalh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlalh_s16(a: i32, b: i16, c: i16) -> i32 { + let x: int32x4_t = vqdmull_s16(vdup_n_s16(b), vdup_n_s16(c)); + unsafe { vqadds_s32(a, simd_extract!(x, 0)) } +}
+#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlals_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlal))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlals_s32(a: i64, b: i32, c: i32) -> i64 { + let x: i64 = vqaddd_s64(a, vqdmulls_s32(b, c)); + x as i64 +}
+#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_high_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl2, N = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlsl_high_lane_s16<const N: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t { + static_assert_uimm_bits!(N, 2); + vqsubq_s32(a, vqdmull_high_lane_s16::<N>(b, c)) +}
+#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_high_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl2, N = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlsl_high_laneq_s16<const N: i32>(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t { + static_assert_uimm_bits!(N, 3); + vqsubq_s32(a, vqdmull_high_laneq_s16::<N>(b, c)) +}
+#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_high_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl2, N = 1))]
+#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlsl_high_lane_s32(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t { + static_assert_uimm_bits!(N, 1); + vqsubq_s64(a, vqdmull_high_lane_s32::(b, c)) +} +#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_high_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl2, N = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlsl_high_laneq_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t { + static_assert_uimm_bits!(N, 2); + vqsubq_s64(a, vqdmull_high_laneq_s32::(b, c)) +} +#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_high_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlsl_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t { + vqsubq_s32(a, vqdmull_high_n_s16(b, c)) +} +#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlsl_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t { + vqsubq_s32(a, vqdmull_high_s16(b, c)) +} +#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_high_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlsl_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t { + vqsubq_s64(a, vqdmull_high_n_s32(b, c)) +} +#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlsl_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t { + vqsubq_s64(a, vqdmull_high_s32(b, c)) +} +#[doc = "Vector widening saturating doubling multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl, N = 2))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlsl_laneq_s16(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t { + static_assert_uimm_bits!(N, 3); + vqsubq_s32(a, vqdmull_laneq_s16::(b, c)) +} +#[doc = "Vector widening saturating doubling multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl, N = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", 
since = "1.59.0")] +pub fn vqdmlsl_laneq_s32(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t { + static_assert_uimm_bits!(N, 2); + vqsubq_s64(a, vqdmull_laneq_s32::(b, c)) +} +#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlslh_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlslh_lane_s16(a: i32, b: i16, c: int16x4_t) -> i32 { + static_assert_uimm_bits!(LANE, 2); + unsafe { vqdmlslh_s16(a, b, simd_extract!(c, LANE as u32)) } +} +#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlslh_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlslh_laneq_s16(a: i32, b: i16, c: int16x8_t) -> i32 { + static_assert_uimm_bits!(LANE, 3); + unsafe { vqdmlslh_s16(a, b, simd_extract!(c, LANE as u32)) } +} +#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsls_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlsls_lane_s32(a: i64, b: i32, c: int32x2_t) -> i64 { + static_assert_uimm_bits!(LANE, 1); + unsafe { vqdmlsls_s32(a, b, simd_extract!(c, LANE as u32)) } +} +#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsls_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl, LANE = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlsls_laneq_s32(a: i64, b: i32, c: int32x4_t) -> i64 { + static_assert_uimm_bits!(LANE, 2); + unsafe { vqdmlsls_s32(a, b, simd_extract!(c, LANE as u32)) } +} +#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlslh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlslh_s16(a: i32, b: i16, c: i16) -> i32 { + let x: int32x4_t = vqdmull_s16(vdup_n_s16(b), vdup_n_s16(c)); + unsafe { vqsubs_s32(a, simd_extract!(x, 0)) } +} +#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsls_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmlsl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmlsls_s32(a: i64, b: i32, c: i32) -> i64 { + let x: i64 = vqsubd_s64(a, vqdmulls_s32(b, c)); + x as i64 +} +#[doc = "Vector saturating doubling multiply high by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulh_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, 
assert_instr(sqdmulh, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmulh_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vqdmulh_s16(a, vdup_n_s16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Vector saturating doubling multiply high by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhq_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmulh, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmulhq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vqdmulhq_s16(a, vdupq_n_s16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Vector saturating doubling multiply high by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulh_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmulh, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmulh_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vqdmulh_s32(a, vdup_n_s32(simd_extract!(b, LANE as u32))) } +} +#[doc = "Vector saturating doubling multiply high by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhq_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmulh, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmulhq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vqdmulhq_s32(a, vdupq_n_s32(simd_extract!(b, LANE as u32))) } +} +#[doc = "Signed saturating doubling multiply returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhh_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmulh, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmulhh_lane_s16(a: i16, b: int16x4_t) -> i16 { + static_assert_uimm_bits!(N, 2); + unsafe { + let b: i16 = simd_extract!(b, N as u32); + vqdmulhh_s16(a, b) + } +} +#[doc = "Signed saturating doubling multiply returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhh_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmulh, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmulhh_laneq_s16(a: i16, b: int16x8_t) -> i16 { + static_assert_uimm_bits!(N, 3); + unsafe { + let b: i16 = simd_extract!(b, N as u32); + vqdmulhh_s16(a, b) + } +} +#[doc = "Signed saturating doubling multiply returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmulh))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmulhh_s16(a: i16, b: i16) -> i16 { + let a: int16x4_t = vdup_n_s16(a); + let b: int16x4_t = vdup_n_s16(b); + unsafe { 
simd_extract!(vqdmulh_s16(a, b), 0) }
+}
+#[doc = "Signed saturating doubling multiply returning high half"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhs_s32)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmulh))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vqdmulhs_s32(a: i32, b: i32) -> i32 {
+    let a: int32x2_t = vdup_n_s32(a);
+    let b: int32x2_t = vdup_n_s32(b);
+    unsafe { simd_extract!(vqdmulh_s32(a, b), 0) }
+}
+#[doc = "Signed saturating doubling multiply returning high half"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhs_lane_s32)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmulh, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vqdmulhs_lane_s32<const N: i32>(a: i32, b: int32x2_t) -> i32 {
+    static_assert_uimm_bits!(N, 1);
+    unsafe {
+        let b: i32 = simd_extract!(b, N as u32);
+        vqdmulhs_s32(a, b)
+    }
+}
+#[doc = "Signed saturating doubling multiply returning high half"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhs_laneq_s32)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmulh, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vqdmulhs_laneq_s32<const N: i32>(a: i32, b: int32x4_t) -> i32 {
+    static_assert_uimm_bits!(N, 2);
+    unsafe {
+        let b: i32 = simd_extract!(b, N as u32);
+        vqdmulhs_s32(a, b)
+    }
+}
+#[doc = "Signed saturating doubling multiply long"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_lane_s16)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vqdmull_high_lane_s16<const N: i32>(a: int16x8_t, b: int16x4_t) -> int32x4_t {
+    static_assert_uimm_bits!(N, 2);
+    unsafe {
+        let a: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]);
+        let b: int16x4_t = simd_shuffle!(b, b, [N as u32, N as u32, N as u32, N as u32]);
+        vqdmull_s16(a, b)
+    }
+}
+#[doc = "Signed saturating doubling multiply long"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_laneq_s32)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vqdmull_high_laneq_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int64x2_t {
+    static_assert_uimm_bits!(N, 2);
+    unsafe {
+        let a: int32x2_t = simd_shuffle!(a, a, [2, 3]);
+        let b: int32x2_t = simd_shuffle!(b, b, [N as u32, N as u32]);
+        vqdmull_s32(a, b)
+    }
+}
+#[doc = "Signed saturating doubling multiply long"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_lane_s32)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull2, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vqdmull_high_lane_s32<const N: i32>(a: int32x4_t, b: int32x2_t) -> int64x2_t {
+    static_assert_uimm_bits!(N, 1);
+    unsafe {
+        let a: int32x2_t = simd_shuffle!(a, a, [2, 3]);
+        let b: int32x2_t = simd_shuffle!(b, b, [N as u32, N as u32]);
+        vqdmull_s32(a, b)
+    }
+}
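Aside (not part of the patch): the scalar forms above, `vqdmulhh_s16`, `vqdmulhs_s32`, and the `vqdmull`/`vqdmlal` family, all reduce to the same plain integer arithmetic: widen the product, double it, saturate, and for the `*h` high-half variants keep bits [31:16]. A minimal reference model in ordinary Rust, with helper names invented for this sketch, makes the one saturation corner case (`i16::MIN * i16::MIN`) easy to see and can be handy when cross-checking the intrinsics in tests:

```rust
/// Reference model (illustrative only): saturating doubling multiply
/// returning the high half, the scalar arithmetic behind sqdmulh.
fn sqdmulh_i16(a: i16, b: i16) -> i16 {
    // Widen, double, saturate: only i16::MIN * i16::MIN overflows i32 here.
    let doubled = (a as i32).saturating_mul(b as i32).saturating_mul(2);
    // Keep bits [31:16] of the (saturated) doubled product.
    (doubled >> 16) as i16
}

/// Saturating doubling multiply long (scalar arithmetic behind sqdmull).
fn sqdmull_i16(a: i16, b: i16) -> i32 {
    (a as i32).saturating_mul(b as i32).saturating_mul(2)
}

/// Saturating doubling multiply-accumulate long (scalar arithmetic behind
/// sqdmlal); the accumulation saturates as well, matching the vqaddq_* step.
fn sqdmlal_i16(acc: i32, a: i16, b: i16) -> i32 {
    acc.saturating_add(sqdmull_i16(a, b))
}

fn main() {
    // The only doubling overflow: 2 * (-32768) * (-32768) saturates to
    // i32::MAX, so the high half saturates to i16::MAX.
    assert_eq!(sqdmulh_i16(i16::MIN, i16::MIN), i16::MAX);
    assert_eq!(sqdmull_i16(1000, 2000), 4_000_000);
    // The accumulate step saturates independently of the multiply.
    assert_eq!(sqdmlal_i16(i32::MAX, 1, 1), i32::MAX);
    println!("reference model checks passed");
}
```

As I read the instruction descriptions, the rounding variants further down (`sqrdmulh`, `sqrdmlah`, `sqrdmlsh`) differ only in adding a rounding constant before the high half is taken; the saturation rules are the same as in this sketch.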
+#[doc = "Signed saturating doubling multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull2, N = 4))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmull_high_laneq_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { + static_assert_uimm_bits!(N, 3); + unsafe { + let a: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); + let b: int16x4_t = simd_shuffle!(b, b, [N as u32, N as u32, N as u32, N as u32]); + vqdmull_s16(a, b) + } +} +#[doc = "Signed saturating doubling multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmull_high_n_s16(a: int16x8_t, b: i16) -> int32x4_t { + unsafe { + let a: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); + let b: int16x4_t = vdup_n_s16(b); + vqdmull_s16(a, b) + } +} +#[doc = "Signed saturating doubling multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmull_high_n_s32(a: int32x4_t, b: i32) -> int64x2_t { + unsafe { + let a: int32x2_t = simd_shuffle!(a, a, [2, 3]); + let b: int32x2_t = vdup_n_s32(b); + vqdmull_s32(a, b) + } +} +#[doc = "Signed saturating doubling multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { + unsafe { + let a: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); + let b: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + vqdmull_s16(a, b) + } +} +#[doc = "Signed saturating doubling multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { + unsafe { + let a: int32x2_t = simd_shuffle!(a, a, [2, 3]); + let b: int32x2_t = simd_shuffle!(b, b, [2, 3]); + vqdmull_s32(a, b) + } +} +#[doc = "Vector saturating doubling long multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull, N = 4))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmull_laneq_s16(a: int16x4_t, b: int16x8_t) -> int32x4_t { + static_assert_uimm_bits!(N, 3); + unsafe { + let b: int16x4_t = simd_shuffle!(b, b, [N as u32, N as u32, N as u32, N as u32]); + vqdmull_s16(a, b) + } +} +#[doc = "Vector saturating doubling long multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_laneq_s32)"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmull_laneq_s32(a: int32x2_t, b: int32x4_t) -> int64x2_t { + static_assert_uimm_bits!(N, 2); + unsafe { + let b: int32x2_t = simd_shuffle!(b, b, [N as u32, N as u32]); + vqdmull_s32(a, b) + } +} +#[doc = "Signed saturating doubling multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmullh_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmullh_lane_s16(a: i16, b: int16x4_t) -> i32 { + static_assert_uimm_bits!(N, 2); + unsafe { + let b: i16 = simd_extract!(b, N as u32); + vqdmullh_s16(a, b) + } +} +#[doc = "Signed saturating doubling multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulls_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmulls_laneq_s32(a: i32, b: int32x4_t) -> i64 { + static_assert_uimm_bits!(N, 2); + unsafe { + let b: i32 = simd_extract!(b, N as u32); + vqdmulls_s32(a, b) + } +} +#[doc = "Signed saturating doubling multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmullh_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull, N = 4))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmullh_laneq_s16(a: i16, b: int16x8_t) -> i32 { + static_assert_uimm_bits!(N, 3); + unsafe { + let b: i16 = simd_extract!(b, N as u32); + vqdmullh_s16(a, b) + } +} +#[doc = "Signed saturating doubling multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmullh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmullh_s16(a: i16, b: i16) -> i32 { + let a: int16x4_t = vdup_n_s16(a); + let b: int16x4_t = vdup_n_s16(b); + unsafe { simd_extract!(vqdmull_s16(a, b), 0) } +} +#[doc = "Signed saturating doubling multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulls_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmulls_lane_s32(a: i32, b: int32x2_t) -> i64 { + static_assert_uimm_bits!(N, 1); + unsafe { + let b: i32 = simd_extract!(b, N as u32); + vqdmulls_s32(a, b) + } +} +#[doc = "Signed saturating doubling multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulls_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqdmulls_s32(a: i32, b: i32) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqdmulls.scalar" + )] + fn 
_vqdmulls_s32(a: i32, b: i32) -> i64; + } + unsafe { _vqdmulls_s32(a, b) } +} +#[doc = "Signed saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqxtn2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovn_high_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { + unsafe { + simd_shuffle!( + a, + vqmovn_s16(b), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } +} +#[doc = "Signed saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqxtn2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovn_high_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t { + unsafe { simd_shuffle!(a, vqmovn_s32(b), [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Signed saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_high_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqxtn2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovn_high_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t { + unsafe { simd_shuffle!(a, vqmovn_s64(b), [0, 1, 2, 3]) } +} +#[doc = "Signed saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqxtn2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovn_high_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t { + unsafe { + simd_shuffle!( + a, + vqmovn_u16(b), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } +} +#[doc = "Signed saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqxtn2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovn_high_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t { + unsafe { simd_shuffle!(a, vqmovn_u32(b), [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Signed saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_high_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqxtn2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovn_high_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t { + unsafe { simd_shuffle!(a, vqmovn_u64(b), [0, 1, 2, 3]) } +} +#[doc = "Saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovnd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqxtn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovnd_s64(a: i64) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.scalar.sqxtn.i32.i64" + )] + fn _vqmovnd_s64(a: i64) -> i32; + } + unsafe { _vqmovnd_s64(a) } +} +#[doc = "Saturating extract narrow"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovnd_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqxtn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovnd_u64(a: u64) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.scalar.uqxtn.i32.i64" + )] + fn _vqmovnd_u64(a: u64) -> u32; + } + unsafe { _vqmovnd_u64(a) } +} +#[doc = "Saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovnh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqxtn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovnh_s16(a: i16) -> i8 { + unsafe { simd_extract!(vqmovn_s16(vdupq_n_s16(a)), 0) } +} +#[doc = "Saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovns_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqxtn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovns_s32(a: i32) -> i16 { + unsafe { simd_extract!(vqmovn_s32(vdupq_n_s32(a)), 0) } +} +#[doc = "Saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovnh_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqxtn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovnh_u16(a: u16) -> u8 { + unsafe { simd_extract!(vqmovn_u16(vdupq_n_u16(a)), 0) } +} +#[doc = "Saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovns_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqxtn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovns_u32(a: u32) -> u16 { + unsafe { simd_extract!(vqmovn_u32(vdupq_n_u32(a)), 0) } +} +#[doc = "Signed saturating extract unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovun_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqxtun2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovun_high_s16(a: uint8x8_t, b: int16x8_t) -> uint8x16_t { + unsafe { + simd_shuffle!( + a, + vqmovun_s16(b), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } +} +#[doc = "Signed saturating extract unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovun_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqxtun2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovun_high_s32(a: uint16x4_t, b: int32x4_t) -> uint16x8_t { + unsafe { simd_shuffle!(a, vqmovun_s32(b), [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Signed saturating extract unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovun_high_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqxtun2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovun_high_s64(a: uint32x2_t, b: int64x2_t) -> uint32x4_t { + unsafe { simd_shuffle!(a, vqmovun_s64(b), [0, 1, 2, 3]) } +} +#[doc = "Signed saturating 
extract unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovunh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqxtun))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovunh_s16(a: i16) -> u8 { + unsafe { simd_extract!(vqmovun_s16(vdupq_n_s16(a)), 0) } +} +#[doc = "Signed saturating extract unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovuns_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqxtun))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovuns_s32(a: i32) -> u16 { + unsafe { simd_extract!(vqmovun_s32(vdupq_n_s32(a)), 0) } +} +#[doc = "Signed saturating extract unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovund_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqxtun))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqmovund_s64(a: i64) -> u32 { + unsafe { simd_extract!(vqmovun_s64(vdupq_n_s64(a)), 0) } +} +#[doc = "Signed saturating negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqneg_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sqneg))] +pub fn vqneg_s64(a: int64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqneg.v1i64" + )] + fn _vqneg_s64(a: int64x1_t) -> int64x1_t; + } + unsafe { _vqneg_s64(a) } +} +#[doc = "Signed saturating negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqnegq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sqneg))] +pub fn vqnegq_s64(a: int64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqneg.v2i64" + )] + fn _vqnegq_s64(a: int64x2_t) -> int64x2_t; + } + unsafe { _vqnegq_s64(a) } +} +#[doc = "Signed saturating negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqnegb_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sqneg))] +pub fn vqnegb_s8(a: i8) -> i8 { + unsafe { simd_extract!(vqneg_s8(vdup_n_s8(a)), 0) } +} +#[doc = "Signed saturating negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqnegh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sqneg))] +pub fn vqnegh_s16(a: i16) -> i16 { + unsafe { simd_extract!(vqneg_s16(vdup_n_s16(a)), 0) } +} +#[doc = "Signed saturating negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqnegs_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sqneg))] +pub fn vqnegs_s32(a: i32) -> i32 { + unsafe { simd_extract!(vqneg_s32(vdup_n_s32(a)), 0) } +} +#[doc = "Signed 
saturating negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqnegd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sqneg))] +pub fn vqnegd_s64(a: i64) -> i64 { + unsafe { simd_extract!(vqneg_s64(vdup_n_s64(a)), 0) } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlah_lane_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlah_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: int16x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vqrdmlah_s16(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlah_lane_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlah_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vqrdmlah_s32(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlah_laneq_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlah_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + let c: int16x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vqrdmlah_s16(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlah_laneq_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlah_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vqrdmlah_s32(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahq_lane_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlahq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: int16x8_t = simd_shuffle!( + c, + c, + [ + LANE as 
u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ); + vqrdmlahq_s16(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahq_lane_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlahq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: int32x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vqrdmlahq_s32(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahq_laneq_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlahq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + let c: int16x8_t = simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ); + vqrdmlahq_s16(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahq_laneq_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlahq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: int32x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vqrdmlahq_s32(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlah_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah))] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlah_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrdmlah.v4i16" + )] + fn _vqrdmlah_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t; + } + unsafe { _vqrdmlah_s16(a, b, c) } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahq_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah))] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlahq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrdmlah.v8i16" + )] + fn _vqrdmlahq_s16(a: int16x8_t, b: 
int16x8_t, c: int16x8_t) -> int16x8_t; + } + unsafe { _vqrdmlahq_s16(a, b, c) } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlah_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah))] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlah_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrdmlah.v2i32" + )] + fn _vqrdmlah_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t; + } + unsafe { _vqrdmlah_s32(a, b, c) } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahq_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah))] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlahq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrdmlah.v4i32" + )] + fn _vqrdmlahq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t; + } + unsafe { _vqrdmlahq_s32(a, b, c) } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahh_lane_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlahh_lane_s16(a: i16, b: i16, c: int16x4_t) -> i16 { + static_assert_uimm_bits!(LANE, 2); + unsafe { vqrdmlahh_s16(a, b, simd_extract!(c, LANE as u32)) } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahh_laneq_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlahh_laneq_s16(a: i16, b: i16, c: int16x8_t) -> i16 { + static_assert_uimm_bits!(LANE, 3); + unsafe { vqrdmlahh_s16(a, b, simd_extract!(c, LANE as u32)) } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahs_lane_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlahs_lane_s32(a: i32, b: i32, c: int32x2_t) -> i32 { + static_assert_uimm_bits!(LANE, 1); + unsafe { vqrdmlahs_s32(a, b, simd_extract!(c, LANE as u32)) } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahs_laneq_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] 
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlahs_laneq_s32(a: i32, b: i32, c: int32x4_t) -> i32 { + static_assert_uimm_bits!(LANE, 2); + unsafe { vqrdmlahs_s32(a, b, simd_extract!(c, LANE as u32)) } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahh_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah))] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlahh_s16(a: i16, b: i16, c: i16) -> i16 { + let a: int16x4_t = vdup_n_s16(a); + let b: int16x4_t = vdup_n_s16(b); + let c: int16x4_t = vdup_n_s16(c); + unsafe { simd_extract!(vqrdmlah_s16(a, b, c), 0) } +} +#[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahs_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah))] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlahs_s32(a: i32, b: i32, c: i32) -> i32 { + let a: int32x2_t = vdup_n_s32(a); + let b: int32x2_t = vdup_n_s32(b); + let c: int32x2_t = vdup_n_s32(c); + unsafe { simd_extract!(vqrdmlah_s32(a, b, c), 0) } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlsh_lane_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlsh_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: int16x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vqrdmlsh_s16(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlsh_lane_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlsh_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vqrdmlsh_s32(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlsh_laneq_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlsh_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + let c: int16x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vqrdmlsh_s16(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlsh_laneq_s32)"] +#[inline] 
+#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlsh_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vqrdmlsh_s32(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshq_lane_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlshq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: int16x8_t = simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ); + vqrdmlshq_s16(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshq_lane_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlshq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: int32x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vqrdmlshq_s32(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshq_laneq_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlshq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + let c: int16x8_t = simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ); + vqrdmlshq_s16(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshq_laneq_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlshq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: int32x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vqrdmlshq_s32(a, b, c) + } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlsh_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh))] +#[stable(feature = "rdm_intrinsics", 
since = "1.62.0")] +pub fn vqrdmlsh_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrdmlsh.v4i16" + )] + fn _vqrdmlsh_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t; + } + unsafe { _vqrdmlsh_s16(a, b, c) } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshq_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh))] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlshq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrdmlsh.v8i16" + )] + fn _vqrdmlshq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t; + } + unsafe { _vqrdmlshq_s16(a, b, c) } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlsh_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh))] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlsh_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrdmlsh.v2i32" + )] + fn _vqrdmlsh_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t; + } + unsafe { _vqrdmlsh_s32(a, b, c) } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshq_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh))] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlshq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrdmlsh.v4i32" + )] + fn _vqrdmlshq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t; + } + unsafe { _vqrdmlshq_s32(a, b, c) } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshh_lane_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlshh_lane_s16(a: i16, b: i16, c: int16x4_t) -> i16 { + static_assert_uimm_bits!(LANE, 2); + unsafe { vqrdmlshh_s16(a, b, simd_extract!(c, LANE as u32)) } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshh_laneq_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlshh_laneq_s16(a: i16, b: i16, c: int16x8_t) -> i16 { + static_assert_uimm_bits!(LANE, 3); + 
unsafe { vqrdmlshh_s16(a, b, simd_extract!(c, LANE as u32)) } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshs_lane_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlshs_lane_s32(a: i32, b: i32, c: int32x2_t) -> i32 { + static_assert_uimm_bits!(LANE, 1); + unsafe { vqrdmlshs_s32(a, b, simd_extract!(c, LANE as u32)) } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshs_laneq_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlshs_laneq_s32(a: i32, b: i32, c: int32x4_t) -> i32 { + static_assert_uimm_bits!(LANE, 2); + unsafe { vqrdmlshs_s32(a, b, simd_extract!(c, LANE as u32)) } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshh_s16)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh))] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlshh_s16(a: i16, b: i16, c: i16) -> i16 { + let a: int16x4_t = vdup_n_s16(a); + let b: int16x4_t = vdup_n_s16(b); + let c: int16x4_t = vdup_n_s16(c); + unsafe { simd_extract!(vqrdmlsh_s16(a, b, c), 0) } +} +#[doc = "Signed saturating rounding doubling multiply subtract returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshs_s32)"] +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlsh))] +#[stable(feature = "rdm_intrinsics", since = "1.62.0")] +pub fn vqrdmlshs_s32(a: i32, b: i32, c: i32) -> i32 { + let a: int32x2_t = vdup_n_s32(a); + let b: int32x2_t = vdup_n_s32(b); + let c: int32x2_t = vdup_n_s32(c); + unsafe { simd_extract!(vqrdmlsh_s32(a, b, c), 0) } +} +#[doc = "Signed saturating rounding doubling multiply returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhh_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrdmulhh_lane_s16(a: i16, b: int16x4_t) -> i16 { + static_assert_uimm_bits!(LANE, 2); + unsafe { vqrdmulhh_s16(a, simd_extract!(b, LANE as u32)) } +} +#[doc = "Signed saturating rounding doubling multiply returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhh_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrdmulhh_laneq_s16(a: i16, b: int16x8_t) -> i16 { + static_assert_uimm_bits!(LANE, 3); + unsafe { vqrdmulhh_s16(a, simd_extract!(b, LANE as u32)) } +} +#[doc = "Signed saturating rounding doubling multiply returning high half"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhs_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrdmulhs_lane_s32(a: i32, b: int32x2_t) -> i32 { + static_assert_uimm_bits!(LANE, 1); + unsafe { vqrdmulhs_s32(a, simd_extract!(b, LANE as u32)) } +} +#[doc = "Signed saturating rounding doubling multiply returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhs_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrdmulhs_laneq_s32(a: i32, b: int32x4_t) -> i32 { + static_assert_uimm_bits!(LANE, 2); + unsafe { vqrdmulhs_s32(a, simd_extract!(b, LANE as u32)) } +} +#[doc = "Signed saturating rounding doubling multiply returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrdmulh))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrdmulhh_s16(a: i16, b: i16) -> i16 { + unsafe { simd_extract!(vqrdmulh_s16(vdup_n_s16(a), vdup_n_s16(b)), 0) } +} +#[doc = "Signed saturating rounding doubling multiply returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhs_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrdmulh))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrdmulhs_s32(a: i32, b: i32) -> i32 { + unsafe { simd_extract!(vqrdmulh_s32(vdup_n_s32(a), vdup_n_s32(b)), 0) } +} +#[doc = "Signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlb_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshlb_s8(a: i8, b: i8) -> i8 { + let a: int8x8_t = vdup_n_s8(a); + let b: int8x8_t = vdup_n_s8(b); + unsafe { simd_extract!(vqrshl_s8(a, b), 0) } +} +#[doc = "Signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshlh_s16(a: i16, b: i16) -> i16 { + let a: int16x4_t = vdup_n_s16(a); + let b: int16x4_t = vdup_n_s16(b); + unsafe { simd_extract!(vqrshl_s16(a, b), 0) } +} +#[doc = "Unsigned signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlb_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqrshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshlb_u8(a: u8, b: i8) -> u8 { + let a: uint8x8_t = vdup_n_u8(a); + let b: int8x8_t = vdup_n_s8(b); + unsafe { simd_extract!(vqrshl_u8(a, b), 0) } +} +#[doc = "Unsigned signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlh_u16)"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqrshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshlh_u16(a: u16, b: i16) -> u16 { + let a: uint16x4_t = vdup_n_u16(a); + let b: int16x4_t = vdup_n_s16(b); + unsafe { simd_extract!(vqrshl_u16(a, b), 0) } +} +#[doc = "Signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshld_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshld_s64(a: i64, b: i64) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshl.i64" + )] + fn _vqrshld_s64(a: i64, b: i64) -> i64; + } + unsafe { _vqrshld_s64(a, b) } +} +#[doc = "Signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshls_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshls_s32(a: i32, b: i32) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshl.i32" + )] + fn _vqrshls_s32(a: i32, b: i32) -> i32; + } + unsafe { _vqrshls_s32(a, b) } +} +#[doc = "Unsigned signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshls_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqrshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshls_u32(a: u32, b: i32) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqrshl.i32" + )] + fn _vqrshls_u32(a: u32, b: i32) -> u32; + } + unsafe { _vqrshls_u32(a, b) } +} +#[doc = "Unsigned signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshld_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqrshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshld_u64(a: u64, b: i64) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqrshl.i64" + )] + fn _vqrshld_u64(a: u64, b: i64) -> u64; + } + unsafe { _vqrshld_u64(a, b) } +} +#[doc = "Signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_high_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrn_high_n_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { + simd_shuffle!( + a, + vqrshrn_n_s16::(b), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } +} +#[doc = "Signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_high_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshrn2, N = 2))] 
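Editor's note, not part of the diff: these are the scalar saturating *rounding* shifts by a signed shift register (SQRSHL/UQRSHL); the 8/16-bit forms go through the 64-bit vector intrinsics, while the 32/64-bit forms call the LLVM `llvm.aarch64.neon.*qrshl` intrinsics directly. A hedged usage sketch under the same assumptions as above:

```rust
// Sketch only: the shift operand is signed, so a negative value shifts right.
#[cfg(target_arch = "aarch64")]
fn main() {
    use core::arch::aarch64::{vqrshlb_s8, vqrshlb_u8};
    assert_eq!(vqrshlb_s8(64, 1), i8::MAX);  // 64 << 1 = 128 saturates to 127
    assert_eq!(vqrshlb_u8(200, 1), u8::MAX); // 200 << 1 = 400 saturates to 255
    assert_eq!(vqrshlb_s8(5, -1), 3);        // rounding right shift: 2.5 rounds to 3
}
#[cfg(not(target_arch = "aarch64"))]
fn main() {}
```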
+#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrn_high_n_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_shuffle!(a, vqrshrn_n_s32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_high_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_shuffle!(a, vqrshrn_n_s64::(b), [0, 1, 2, 3]) } +} +#[doc = "Unsigned saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_high_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqrshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrn_high_n_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { + simd_shuffle!( + a, + vqrshrn_n_u16::(b), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } +} +#[doc = "Unsigned saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_high_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqrshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrn_high_n_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_shuffle!(a, vqrshrn_n_u32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Unsigned saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_high_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqrshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_shuffle!(a, vqrshrn_n_u64::(b), [0, 1, 2, 3]) } +} +#[doc = "Unsigned saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrnd_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrnd_n_u64(a: u64) -> u32 { + static_assert!(N >= 1 && N <= 32); + let a: uint64x2_t = vdupq_n_u64(a); + unsafe { simd_extract!(vqrshrn_n_u64::(a), 0) } +} +#[doc = "Unsigned saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrnh_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrnh_n_u16(a: u16) -> u8 { + static_assert!(N >= 1 && N <= 8); + let a: uint16x8_t = vdupq_n_u16(a); + unsafe 
{ simd_extract!(vqrshrn_n_u16::(a), 0) } +} +#[doc = "Unsigned saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrns_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrns_n_u32(a: u32) -> u16 { + static_assert!(N >= 1 && N <= 16); + let a: uint32x4_t = vdupq_n_u32(a); + unsafe { simd_extract!(vqrshrn_n_u32::(a), 0) } +} +#[doc = "Signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrnh_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrnh_n_s16(a: i16) -> i8 { + static_assert!(N >= 1 && N <= 8); + let a: int16x8_t = vdupq_n_s16(a); + unsafe { simd_extract!(vqrshrn_n_s16::(a), 0) } +} +#[doc = "Signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrns_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrns_n_s32(a: i32) -> i16 { + static_assert!(N >= 1 && N <= 16); + let a: int32x4_t = vdupq_n_s32(a); + unsafe { simd_extract!(vqrshrn_n_s32::(a), 0) } +} +#[doc = "Signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrnd_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrnd_n_s64(a: i64) -> i32 { + static_assert!(N >= 1 && N <= 32); + let a: int64x2_t = vdupq_n_s64(a); + unsafe { simd_extract!(vqrshrn_n_s64::(a), 0) } +} +#[doc = "Signed saturating rounded shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrun_high_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshrun2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrun_high_n_s16(a: uint8x8_t, b: int16x8_t) -> uint8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { + simd_shuffle!( + a, + vqrshrun_n_s16::(b), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } +} +#[doc = "Signed saturating rounded shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrun_high_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshrun2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrun_high_n_s32(a: uint16x4_t, b: int32x4_t) -> uint16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_shuffle!(a, vqrshrun_n_s32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Signed saturating rounded shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrun_high_n_s64)"] +#[inline] 
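Editor's note, not part of the diff: the scalar `vqrshrn*_n_*` wrappers take the shift amount as a const generic (the `<const N: i32>` / turbofish `::<N>` spelling does not survive the rendering of this hunk, but it is what `#[rustc_legacy_const_generics]` wraps). A usage sketch assuming that form:

```rust
// Sketch only: saturating rounding shift-right-and-narrow.
#[cfg(target_arch = "aarch64")]
fn main() {
    use core::arch::aarch64::{vqrshrnh_n_s16, vqrshrns_n_u32};
    assert_eq!(vqrshrnh_n_s16::<2>(10), 3);              // 10 / 4 = 2.5, rounded to 3
    assert_eq!(vqrshrnh_n_s16::<4>(i16::MAX), i8::MAX);  // 2048 does not fit in i8
    assert_eq!(vqrshrns_n_u32::<4>(u32::MAX), u16::MAX); // narrows with saturation
}
#[cfg(not(target_arch = "aarch64"))]
fn main() {}
```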
+#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshrun2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrun_high_n_s64(a: uint32x2_t, b: int64x2_t) -> uint32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_shuffle!(a, vqrshrun_n_s64::(b), [0, 1, 2, 3]) } +} +#[doc = "Signed saturating rounded shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrund_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrund_n_s64(a: i64) -> u32 { + static_assert!(N >= 1 && N <= 32); + let a: int64x2_t = vdupq_n_s64(a); + unsafe { simd_extract!(vqrshrun_n_s64::(a), 0) } +} +#[doc = "Signed saturating rounded shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrunh_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrunh_n_s16(a: i16) -> u8 { + static_assert!(N >= 1 && N <= 8); + let a: int16x8_t = vdupq_n_s16(a); + unsafe { simd_extract!(vqrshrun_n_s16::(a), 0) } +} +#[doc = "Signed saturating rounded shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshruns_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshruns_n_s32(a: i32) -> u16 { + static_assert!(N >= 1 && N <= 16); + let a: int32x4_t = vdupq_n_s32(a); + unsafe { simd_extract!(vqrshrun_n_s32::(a), 0) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlb_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshlb_n_s8(a: i8) -> i8 { + static_assert_uimm_bits!(N, 3); + unsafe { simd_extract!(vqshl_n_s8::(vdup_n_s8(a)), 0) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshld_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshld_n_s64(a: i64) -> i64 { + static_assert_uimm_bits!(N, 6); + unsafe { simd_extract!(vqshl_n_s64::(vdup_n_s64(a)), 0) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlh_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshlh_n_s16(a: i16) -> i16 { + static_assert_uimm_bits!(N, 4); + unsafe { simd_extract!(vqshl_n_s16::(vdup_n_s16(a)), 0) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshls_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshls_n_s32(a: i32) -> i32 { + static_assert_uimm_bits!(N, 5); + unsafe { simd_extract!(vqshl_n_s32::(vdup_n_s32(a)), 0) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlb_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshlb_n_u8(a: u8) -> u8 { + static_assert_uimm_bits!(N, 3); + unsafe { simd_extract!(vqshl_n_u8::(vdup_n_u8(a)), 0) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshld_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshld_n_u64(a: u64) -> u64 { + static_assert_uimm_bits!(N, 6); + unsafe { simd_extract!(vqshl_n_u64::(vdup_n_u64(a)), 0) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlh_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshlh_n_u16(a: u16) -> u16 { + static_assert_uimm_bits!(N, 4); + unsafe { simd_extract!(vqshl_n_u16::(vdup_n_u16(a)), 0) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshls_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshls_n_u32(a: u32) -> u32 { + static_assert_uimm_bits!(N, 5); + unsafe { simd_extract!(vqshl_n_u32::(vdup_n_u32(a)), 0) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlb_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshlb_s8(a: i8, b: i8) -> i8 { + let c: int8x8_t = vqshl_s8(vdup_n_s8(a), vdup_n_s8(b)); + unsafe { simd_extract!(c, 0) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshlh_s16(a: i16, b: i16) -> i16 { + let c: int16x4_t = vqshl_s16(vdup_n_s16(a), vdup_n_s16(b)); + unsafe { simd_extract!(c, 0) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshls_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshls_s32(a: i32, b: i32) -> 
i32 { + let c: int32x2_t = vqshl_s32(vdup_n_s32(a), vdup_n_s32(b)); + unsafe { simd_extract!(c, 0) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlb_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshlb_u8(a: u8, b: i8) -> u8 { + let c: uint8x8_t = vqshl_u8(vdup_n_u8(a), vdup_n_s8(b)); + unsafe { simd_extract!(c, 0) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlh_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshlh_u16(a: u16, b: i16) -> u16 { + let c: uint16x4_t = vqshl_u16(vdup_n_u16(a), vdup_n_s16(b)); + unsafe { simd_extract!(c, 0) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshls_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshls_u32(a: u32, b: i32) -> u32 { + let c: uint32x2_t = vqshl_u32(vdup_n_u32(a), vdup_n_s32(b)); + unsafe { simd_extract!(c, 0) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshld_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshld_s64(a: i64, b: i64) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshl.i64" + )] + fn _vqshld_s64(a: i64, b: i64) -> i64; + } + unsafe { _vqshld_s64(a, b) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshld_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshld_u64(a: u64, b: i64) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqshl.i64" + )] + fn _vqshld_u64(a: u64, b: i64) -> u64; + } + unsafe { _vqshld_u64(a, b) } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlub_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshlub_n_s8(a: i8) -> u8 { + static_assert_uimm_bits!(N, 3); + unsafe { simd_extract!(vqshlu_n_s8::(vdup_n_s8(a)), 0) } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlud_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshlud_n_s64(a: i64) -> u64 { + static_assert_uimm_bits!(N, 6); + unsafe { simd_extract!(vqshlu_n_s64::(vdup_n_s64(a)), 0) } 
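Editor's note, not part of the diff: a usage sketch for the scalar saturating shifts by an immediate added above, including the signed-to-unsigned SQSHLU forms, again assuming the const-generic shift amount:

```rust
// Sketch only: same input, different saturation bounds per element type.
#[cfg(target_arch = "aarch64")]
fn main() {
    use core::arch::aarch64::{vqshlb_n_s8, vqshlb_n_u8, vqshlub_n_s8};
    assert_eq!(vqshlb_n_s8::<2>(50), i8::MAX); // 200 does not fit in i8
    assert_eq!(vqshlb_n_u8::<2>(50), 200);     // but it does fit in u8
    assert_eq!(vqshlub_n_s8::<1>(100), 200);   // signed input, unsigned result
    assert_eq!(vqshlub_n_s8::<1>(-3), 0);      // negative inputs clamp to 0
}
#[cfg(not(target_arch = "aarch64"))]
fn main() {}
```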
+} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshluh_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshluh_n_s16(a: i16) -> u16 { + static_assert_uimm_bits!(N, 4); + unsafe { simd_extract!(vqshlu_n_s16::(vdup_n_s16(a)), 0) } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlus_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshlus_n_s32(a: i32) -> u32 { + static_assert_uimm_bits!(N, 5); + unsafe { simd_extract!(vqshlu_n_s32::(vdup_n_s32(a)), 0) } +} +#[doc = "Signed saturating shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_high_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrn_high_n_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { + simd_shuffle!( + a, + vqshrn_n_s16::(b), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } +} +#[doc = "Signed saturating shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_high_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrn_high_n_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_shuffle!(a, vqshrn_n_s32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Signed saturating shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_high_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_shuffle!(a, vqshrn_n_s64::(b), [0, 1, 2, 3]) } +} +#[doc = "Unsigned saturating shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_high_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrn_high_n_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { + simd_shuffle!( + a, + vqshrn_n_u16::(b), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } +} +#[doc = "Unsigned saturating shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_high_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] 
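Editor's note, not part of the diff: the `*_high_*` narrowing forms shift-and-narrow `b` into the upper lanes of the result while keeping `a` as the lower half. A hedged sketch of `vqshrn_high_n_s16`:

```rust
// Sketch only: narrow into the high half of a 16-lane vector.
#[cfg(target_arch = "aarch64")]
fn main() {
    use core::arch::aarch64::{vdup_n_s8, vdupq_n_s16, vgetq_lane_s8, vqshrn_high_n_s16};
    let low = vdup_n_s8(7);
    let wide = vdupq_n_s16(1024);
    let r = vqshrn_high_n_s16::<2>(low, wide);   // 1024 >> 2 = 256, saturates to 127
    assert_eq!(vgetq_lane_s8::<0>(r), 7);        // lanes 0..7 come from `a`
    assert_eq!(vgetq_lane_s8::<15>(r), i8::MAX); // lanes 8..15 are the narrowed `b`
}
#[cfg(not(target_arch = "aarch64"))]
fn main() {}
```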
+#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrn_high_n_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_shuffle!(a, vqshrn_n_u32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Unsigned saturating shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_high_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_shuffle!(a, vqshrn_n_u64::(b), [0, 1, 2, 3]) } +} +#[doc = "Signed saturating shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrnd_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrnd_n_s64(a: i64) -> i32 { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshrn.i32" + )] + fn _vqshrnd_n_s64(a: i64, n: i32) -> i32; + } + unsafe { _vqshrnd_n_s64(a, N) } +} +#[doc = "Unsigned saturating shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrnd_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrnd_n_u64(a: u64) -> u32 { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqshrn.i32" + )] + fn _vqshrnd_n_u64(a: u64, n: i32) -> u32; + } + unsafe { _vqshrnd_n_u64(a, N) } +} +#[doc = "Signed saturating shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrnh_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrnh_n_s16(a: i16) -> i8 { + static_assert!(N >= 1 && N <= 8); + unsafe { simd_extract!(vqshrn_n_s16::(vdupq_n_s16(a)), 0) } +} +#[doc = "Signed saturating shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrns_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrns_n_s32(a: i32) -> i16 { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_extract!(vqshrn_n_s32::(vdupq_n_s32(a)), 0) } +} +#[doc = "Unsigned saturating shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrnh_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrnh_n_u16(a: u16) -> u8 { + static_assert!(N >= 1 && N <= 8); + unsafe { 
simd_extract!(vqshrn_n_u16::(vdupq_n_u16(a)), 0) } +} +#[doc = "Unsigned saturating shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrns_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrns_n_u32(a: u32) -> u16 { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_extract!(vqshrn_n_u32::(vdupq_n_u32(a)), 0) } +} +#[doc = "Signed saturating shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrun_high_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshrun2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrun_high_n_s16(a: uint8x8_t, b: int16x8_t) -> uint8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { + simd_shuffle!( + a, + vqshrun_n_s16::(b), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } +} +#[doc = "Signed saturating shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrun_high_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshrun2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrun_high_n_s32(a: uint16x4_t, b: int32x4_t) -> uint16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_shuffle!(a, vqshrun_n_s32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Signed saturating shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrun_high_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshrun2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrun_high_n_s64(a: uint32x2_t, b: int64x2_t) -> uint32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_shuffle!(a, vqshrun_n_s64::(b), [0, 1, 2, 3]) } +} +#[doc = "Signed saturating shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrund_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrund_n_s64(a: i64) -> u32 { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_extract!(vqshrun_n_s64::(vdupq_n_s64(a)), 0) } +} +#[doc = "Signed saturating shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrunh_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshrunh_n_s16(a: i16) -> u8 { + static_assert!(N >= 1 && N <= 8); + unsafe { simd_extract!(vqshrun_n_s16::(vdupq_n_s16(a)), 0) } +} +#[doc = "Signed saturating shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshruns_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqshrun, N = 2))] 
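Editor's note, not part of the diff: SQSHRUN narrows a signed input to an unsigned result, clamping negative values to zero on the way. A usage sketch for the scalar forms added above:

```rust
// Sketch only: signed in, unsigned out.
#[cfg(target_arch = "aarch64")]
fn main() {
    use core::arch::aarch64::{vqshrund_n_s64, vqshrunh_n_s16};
    assert_eq!(vqshrunh_n_s16::<1>(300), 150);
    assert_eq!(vqshrunh_n_s16::<1>(-42), 0);             // negatives clamp to 0
    assert_eq!(vqshrund_n_s64::<8>(i64::MAX), u32::MAX); // too large, saturates
}
#[cfg(not(target_arch = "aarch64"))]
fn main() {}
```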
+#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshruns_n_s32(a: i32) -> u16 { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_extract!(vqshrun_n_s32::(vdupq_n_s32(a)), 0) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubb_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sqsub))] +pub fn vqsubb_s8(a: i8, b: i8) -> i8 { + let a: int8x8_t = vdup_n_s8(a); + let b: int8x8_t = vdup_n_s8(b); + unsafe { simd_extract!(vqsub_s8(a, b), 0) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sqsub))] +pub fn vqsubh_s16(a: i16, b: i16) -> i16 { + let a: int16x4_t = vdup_n_s16(a); + let b: int16x4_t = vdup_n_s16(b); + unsafe { simd_extract!(vqsub_s16(a, b), 0) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubb_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uqsub))] +pub fn vqsubb_u8(a: u8, b: u8) -> u8 { + let a: uint8x8_t = vdup_n_u8(a); + let b: uint8x8_t = vdup_n_u8(b); + unsafe { simd_extract!(vqsub_u8(a, b), 0) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubh_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uqsub))] +pub fn vqsubh_u16(a: u16, b: u16) -> u16 { + let a: uint16x4_t = vdup_n_u16(a); + let b: uint16x4_t = vdup_n_u16(b); + unsafe { simd_extract!(vqsub_u16(a, b), 0) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubs_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sqsub))] +pub fn vqsubs_s32(a: i32, b: i32) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqsub.i32" + )] + fn _vqsubs_s32(a: i32, b: i32) -> i32; + } + unsafe { _vqsubs_s32(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(sqsub))] +pub fn vqsubd_s64(a: i64, b: i64) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqsub.i64" + )] + fn _vqsubd_s64(a: i64, b: i64) -> i64; + } + unsafe { _vqsubd_s64(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubs_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uqsub))] +pub fn vqsubs_u32(a: u32, b: u32) -> u32 { + unsafe extern "unadjusted" { + 
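Editor's note, not part of the diff: a usage sketch for the scalar saturating subtraction wrappers defined in this hunk, under the same NEON-baseline assumption as the earlier sketches:

```rust
// Sketch only: subtraction that clamps instead of wrapping.
#[cfg(target_arch = "aarch64")]
fn main() {
    use core::arch::aarch64::{vqsubb_s8, vqsubb_u8, vqsubs_s32};
    assert_eq!(vqsubb_s8(i8::MIN, 1), i8::MIN);     // stays pinned at the minimum
    assert_eq!(vqsubb_u8(3, 10), 0);                // unsigned underflow clamps to 0
    assert_eq!(vqsubs_s32(i32::MAX, -1), i32::MAX); // positive overflow clamps to MAX
}
#[cfg(not(target_arch = "aarch64"))]
fn main() {}
```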
#[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqsub.i32" + )] + fn _vqsubs_u32(a: u32, b: u32) -> u32; + } + unsafe { _vqsubs_u32(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubd_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(uqsub))] +pub fn vqsubd_u64(a: u64, b: u64) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqsub.i64" + )] + fn _vqsubd_u64(a: u64, b: u64) -> u64; + } + unsafe { _vqsubd_u64(a, b) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl1)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbl1(a: int8x16_t, b: uint8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbl1.v8i8" + )] + fn _vqtbl1(a: int8x16_t, b: uint8x8_t) -> int8x8_t; + } + unsafe { _vqtbl1(a, b) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl1q)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbl1q(a: int8x16_t, b: uint8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbl1.v16i8" + )] + fn _vqtbl1q(a: int8x16_t, b: uint8x16_t) -> int8x16_t; + } + unsafe { _vqtbl1q(a, b) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl1_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl1_s8(a: int8x16_t, b: uint8x8_t) -> int8x8_t { + vqtbl1(a, b) +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl1q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl1q_s8(a: int8x16_t, b: uint8x16_t) -> int8x16_t { + vqtbl1q(a, b) +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl1_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl1_u8(a: uint8x16_t, b: uint8x8_t) -> uint8x8_t { + unsafe { + let x = transmute(vqtbl1(transmute(a), b)); + x + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl1q_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { + let x = transmute(vqtbl1q(transmute(a), b)); + x + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl1_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl1_p8(a: poly8x16_t, b: uint8x8_t) -> poly8x8_t { + unsafe { + let x = transmute(vqtbl1(transmute(a), b)); + x + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl1q_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl1q_p8(a: poly8x16_t, b: uint8x16_t) -> poly8x16_t { + unsafe { + let x = transmute(vqtbl1q(transmute(a), b)); + x + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl2)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbl2(a: int8x16_t, b: int8x16_t, c: uint8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbl2.v8i8" + )] + fn _vqtbl2(a: int8x16_t, b: int8x16_t, c: uint8x8_t) -> int8x8_t; + } + unsafe { _vqtbl2(a, b, c) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl2q)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbl2q(a: int8x16_t, b: int8x16_t, c: uint8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbl2.v16i8" + )] + fn _vqtbl2q(a: int8x16_t, b: int8x16_t, c: uint8x16_t) -> int8x16_t; + } + unsafe { _vqtbl2q(a, b, c) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl2_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl2_s8(a: int8x16x2_t, b: uint8x8_t) -> int8x8_t { + vqtbl2(a.0, a.1, b) +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl2q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl2q_s8(a: int8x16x2_t, b: uint8x16_t) -> int8x16_t { + vqtbl2q(a.0, a.1, b) +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl2_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl2_u8(a: uint8x16x2_t, b: uint8x8_t) -> uint8x8_t { + unsafe { transmute(vqtbl2(transmute(a.0), transmute(a.1), b)) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl2_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl2_u8(a: uint8x16x2_t, b: uint8x8_t) 
-> uint8x8_t { + let mut a: uint8x16x2_t = a; + a.0 = unsafe { + simd_shuffle!( + a.0, + a.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.1 = unsafe { + simd_shuffle!( + a.1, + a.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vqtbl2(transmute(a.0), transmute(a.1), b)); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl2q_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl2q_u8(a: uint8x16x2_t, b: uint8x16_t) -> uint8x16_t { + unsafe { transmute(vqtbl2q(transmute(a.0), transmute(a.1), b)) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl2q_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl2q_u8(a: uint8x16x2_t, b: uint8x16_t) -> uint8x16_t { + let mut a: uint8x16x2_t = a; + a.0 = unsafe { + simd_shuffle!( + a.0, + a.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.1 = unsafe { + simd_shuffle!( + a.1, + a.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let b: uint8x16_t = + unsafe { simd_shuffle!(b, b, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(vqtbl2q(transmute(a.0), transmute(a.1), b)); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl2_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl2_p8(a: poly8x16x2_t, b: uint8x8_t) -> poly8x8_t { + unsafe { transmute(vqtbl2(transmute(a.0), transmute(a.1), b)) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl2_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl2_p8(a: poly8x16x2_t, b: uint8x8_t) -> poly8x8_t { + let mut a: poly8x16x2_t = a; + a.0 = unsafe { + simd_shuffle!( + a.0, + a.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.1 = unsafe { + simd_shuffle!( + a.1, + a.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vqtbl2(transmute(a.0), transmute(a.1), b)); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl2q_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = 
"1.59.0")] +pub fn vqtbl2q_p8(a: poly8x16x2_t, b: uint8x16_t) -> poly8x16_t { + unsafe { transmute(vqtbl2q(transmute(a.0), transmute(a.1), b)) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl2q_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl2q_p8(a: poly8x16x2_t, b: uint8x16_t) -> poly8x16_t { + let mut a: poly8x16x2_t = a; + a.0 = unsafe { + simd_shuffle!( + a.0, + a.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.1 = unsafe { + simd_shuffle!( + a.1, + a.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let b: uint8x16_t = + unsafe { simd_shuffle!(b, b, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(vqtbl2q(transmute(a.0), transmute(a.1), b)); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl3)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbl3(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: uint8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbl3.v8i8" + )] + fn _vqtbl3(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: uint8x8_t) -> int8x8_t; + } + unsafe { _vqtbl3(a, b, c, d) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl3q)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbl3q(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: uint8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbl3.v16i8" + )] + fn _vqtbl3q(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: uint8x16_t) -> int8x16_t; + } + unsafe { _vqtbl3q(a, b, c, d) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl3_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl3_s8(a: int8x16x3_t, b: uint8x8_t) -> int8x8_t { + vqtbl3(a.0, a.1, a.2, b) +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl3q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl3q_s8(a: int8x16x3_t, b: uint8x16_t) -> int8x16_t { + vqtbl3q(a.0, a.1, a.2, b) +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl3_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl3_u8(a: uint8x16x3_t, b: uint8x8_t) -> uint8x8_t { + unsafe { transmute(vqtbl3(transmute(a.0), 
transmute(a.1), transmute(a.2), b)) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl3_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl3_u8(a: uint8x16x3_t, b: uint8x8_t) -> uint8x8_t { + let mut a: uint8x16x3_t = a; + a.0 = unsafe { + simd_shuffle!( + a.0, + a.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.1 = unsafe { + simd_shuffle!( + a.1, + a.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.2 = unsafe { + simd_shuffle!( + a.2, + a.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = + transmute(vqtbl3(transmute(a.0), transmute(a.1), transmute(a.2), b)); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl3q_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl3q_u8(a: uint8x16x3_t, b: uint8x16_t) -> uint8x16_t { + unsafe { transmute(vqtbl3q(transmute(a.0), transmute(a.1), transmute(a.2), b)) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl3q_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl3q_u8(a: uint8x16x3_t, b: uint8x16_t) -> uint8x16_t { + let mut a: uint8x16x3_t = a; + a.0 = unsafe { + simd_shuffle!( + a.0, + a.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.1 = unsafe { + simd_shuffle!( + a.1, + a.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.2 = unsafe { + simd_shuffle!( + a.2, + a.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let b: uint8x16_t = + unsafe { simd_shuffle!(b, b, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = + transmute(vqtbl3q(transmute(a.0), transmute(a.1), transmute(a.2), b)); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl3_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl3_p8(a: poly8x16x3_t, b: uint8x8_t) -> poly8x8_t { + unsafe { transmute(vqtbl3(transmute(a.0), transmute(a.1), transmute(a.2), b)) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl3_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl3_p8(a: poly8x16x3_t, b: uint8x8_t) -> poly8x8_t { + let mut a: poly8x16x3_t = a; + a.0 = unsafe { + simd_shuffle!( + a.0, + a.0, + [15, 14, 13, 12, 11, 10, 9, 
8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.1 = unsafe { + simd_shuffle!( + a.1, + a.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.2 = unsafe { + simd_shuffle!( + a.2, + a.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = + transmute(vqtbl3(transmute(a.0), transmute(a.1), transmute(a.2), b)); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl3q_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl3q_p8(a: poly8x16x3_t, b: uint8x16_t) -> poly8x16_t { + unsafe { transmute(vqtbl3q(transmute(a.0), transmute(a.1), transmute(a.2), b)) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl3q_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl3q_p8(a: poly8x16x3_t, b: uint8x16_t) -> poly8x16_t { + let mut a: poly8x16x3_t = a; + a.0 = unsafe { + simd_shuffle!( + a.0, + a.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.1 = unsafe { + simd_shuffle!( + a.1, + a.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.2 = unsafe { + simd_shuffle!( + a.2, + a.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let b: uint8x16_t = + unsafe { simd_shuffle!(b, b, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = + transmute(vqtbl3q(transmute(a.0), transmute(a.1), transmute(a.2), b)); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl4)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbl4(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, e: uint8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbl4.v8i8" + )] + fn _vqtbl4( + a: int8x16_t, + b: int8x16_t, + c: int8x16_t, + d: int8x16_t, + e: uint8x8_t, + ) -> int8x8_t; + } + unsafe { _vqtbl4(a, b, c, d, e) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl4q)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbl4q(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, e: uint8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbl4.v16i8" + )] + fn _vqtbl4q( + a: int8x16_t, + b: int8x16_t, + c: int8x16_t, + d: int8x16_t, + e: uint8x16_t, + ) -> int8x16_t; + } + unsafe { _vqtbl4q(a, b, c, d, e) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl4_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl4_s8(a: int8x16x4_t, b: uint8x8_t) -> int8x8_t { + vqtbl4(a.0, a.1, a.2, a.3, b) +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl4q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl4q_s8(a: int8x16x4_t, b: uint8x16_t) -> int8x16_t { + vqtbl4q(a.0, a.1, a.2, a.3, b) +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl4_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl4_u8(a: uint8x16x4_t, b: uint8x8_t) -> uint8x8_t { + unsafe { + transmute(vqtbl4( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + b, + )) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl4_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl4_u8(a: uint8x16x4_t, b: uint8x8_t) -> uint8x8_t { + let mut a: uint8x16x4_t = a; + a.0 = unsafe { + simd_shuffle!( + a.0, + a.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.1 = unsafe { + simd_shuffle!( + a.1, + a.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.2 = unsafe { + simd_shuffle!( + a.2, + a.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.3 = unsafe { + simd_shuffle!( + a.3, + a.3, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vqtbl4( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + b, + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl4q_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl4q_u8(a: uint8x16x4_t, b: uint8x16_t) -> uint8x16_t { + unsafe { + transmute(vqtbl4q( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + b, + )) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl4q_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl4q_u8(a: uint8x16x4_t, b: uint8x16_t) -> uint8x16_t { + let mut a: uint8x16x4_t = a; + a.0 = unsafe { + simd_shuffle!( + a.0, + a.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.1 = unsafe { + simd_shuffle!( + a.1, + a.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.2 = unsafe { + simd_shuffle!( + a.2, 
+ a.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.3 = unsafe { + simd_shuffle!( + a.3, + a.3, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let b: uint8x16_t = + unsafe { simd_shuffle!(b, b, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(vqtbl4q( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + b, + )); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl4_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl4_p8(a: poly8x16x4_t, b: uint8x8_t) -> poly8x8_t { + unsafe { + transmute(vqtbl4( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + b, + )) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl4_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl4_p8(a: poly8x16x4_t, b: uint8x8_t) -> poly8x8_t { + let mut a: poly8x16x4_t = a; + a.0 = unsafe { + simd_shuffle!( + a.0, + a.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.1 = unsafe { + simd_shuffle!( + a.1, + a.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.2 = unsafe { + simd_shuffle!( + a.2, + a.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.3 = unsafe { + simd_shuffle!( + a.3, + a.3, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vqtbl4( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + b, + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl4q_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl4q_p8(a: poly8x16x4_t, b: uint8x16_t) -> poly8x16_t { + unsafe { + transmute(vqtbl4q( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + b, + )) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbl4q_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbl4q_p8(a: poly8x16x4_t, b: uint8x16_t) -> poly8x16_t { + let mut a: poly8x16x4_t = a; + a.0 = unsafe { + simd_shuffle!( + a.0, + a.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.1 = unsafe { + simd_shuffle!( + a.1, + a.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.2 = unsafe { + simd_shuffle!( + a.2, + a.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + a.3 = unsafe { + simd_shuffle!( + a.3, + a.3, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 
1, 0] + ) + }; + let b: uint8x16_t = + unsafe { simd_shuffle!(b, b, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(vqtbl4q( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + b, + )); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx1)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbx1(a: int8x8_t, b: int8x16_t, c: uint8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbx1.v8i8" + )] + fn _vqtbx1(a: int8x8_t, b: int8x16_t, c: uint8x8_t) -> int8x8_t; + } + unsafe { _vqtbx1(a, b, c) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx1q)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbx1q(a: int8x16_t, b: int8x16_t, c: uint8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbx1.v16i8" + )] + fn _vqtbx1q(a: int8x16_t, b: int8x16_t, c: uint8x16_t) -> int8x16_t; + } + unsafe { _vqtbx1q(a, b, c) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx1_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx1_s8(a: int8x8_t, b: int8x16_t, c: uint8x8_t) -> int8x8_t { + vqtbx1(a, b, c) +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx1q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx1q_s8(a: int8x16_t, b: int8x16_t, c: uint8x16_t) -> int8x16_t { + vqtbx1q(a, b, c) +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx1_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx1_u8(a: uint8x8_t, b: uint8x16_t, c: uint8x8_t) -> uint8x8_t { + unsafe { + let x = transmute(vqtbx1(transmute(a), transmute(b), c)); + x + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx1q_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx1q_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { + unsafe { + let x = transmute(vqtbx1q(transmute(a), transmute(b), c)); + x + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx1_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = 
"neon_intrinsics", since = "1.59.0")] +pub fn vqtbx1_p8(a: poly8x8_t, b: poly8x16_t, c: uint8x8_t) -> poly8x8_t { + unsafe { + let x = transmute(vqtbx1(transmute(a), transmute(b), c)); + x + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx1q_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx1q_p8(a: poly8x16_t, b: poly8x16_t, c: uint8x16_t) -> poly8x16_t { + unsafe { + let x = transmute(vqtbx1q(transmute(a), transmute(b), c)); + x + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx2)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbx2(a: int8x8_t, b: int8x16_t, c: int8x16_t, d: uint8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbx2.v8i8" + )] + fn _vqtbx2(a: int8x8_t, b: int8x16_t, c: int8x16_t, d: uint8x8_t) -> int8x8_t; + } + unsafe { _vqtbx2(a, b, c, d) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx2q)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbx2q(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: uint8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbx2.v16i8" + )] + fn _vqtbx2q(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: uint8x16_t) -> int8x16_t; + } + unsafe { _vqtbx2q(a, b, c, d) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx2_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx2_s8(a: int8x8_t, b: int8x16x2_t, c: uint8x8_t) -> int8x8_t { + vqtbx2(a, b.0, b.1, c) +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx2q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx2q_s8(a: int8x16_t, b: int8x16x2_t, c: uint8x16_t) -> int8x16_t { + vqtbx2q(a, b.0, b.1, c) +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx2_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx2_u8(a: uint8x8_t, b: uint8x16x2_t, c: uint8x8_t) -> uint8x8_t { + unsafe { transmute(vqtbx2(transmute(a), transmute(b.0), transmute(b.1), c)) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx2_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn 
vqtbx2_u8(a: uint8x8_t, b: uint8x16x2_t, c: uint8x8_t) -> uint8x8_t { + let mut b: uint8x16x2_t = b; + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { + simd_shuffle!( + b.0, + b.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.1 = unsafe { + simd_shuffle!( + b.1, + b.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vqtbx2(transmute(a), transmute(b.0), transmute(b.1), c)); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx2q_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx2q_u8(a: uint8x16_t, b: uint8x16x2_t, c: uint8x16_t) -> uint8x16_t { + unsafe { transmute(vqtbx2q(transmute(a), transmute(b.0), transmute(b.1), c)) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx2q_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx2q_u8(a: uint8x16_t, b: uint8x16x2_t, c: uint8x16_t) -> uint8x16_t { + let mut b: uint8x16x2_t = b; + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { + simd_shuffle!( + b.0, + b.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.1 = unsafe { + simd_shuffle!( + b.1, + b.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let c: uint8x16_t = + unsafe { simd_shuffle!(c, c, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = + transmute(vqtbx2q(transmute(a), transmute(b.0), transmute(b.1), c)); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx2_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx2_p8(a: poly8x8_t, b: poly8x16x2_t, c: uint8x8_t) -> poly8x8_t { + unsafe { transmute(vqtbx2(transmute(a), transmute(b.0), transmute(b.1), c)) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx2_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx2_p8(a: poly8x8_t, b: poly8x16x2_t, c: uint8x8_t) -> poly8x8_t { + let mut b: poly8x16x2_t = b; + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { + simd_shuffle!( + b.0, + b.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.1 = unsafe { + simd_shuffle!( + b.1, + b.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 
0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vqtbx2(transmute(a), transmute(b.0), transmute(b.1), c)); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx2q_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx2q_p8(a: poly8x16_t, b: poly8x16x2_t, c: uint8x16_t) -> poly8x16_t { + unsafe { transmute(vqtbx2q(transmute(a), transmute(b.0), transmute(b.1), c)) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx2q_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx2q_p8(a: poly8x16_t, b: poly8x16x2_t, c: uint8x16_t) -> poly8x16_t { + let mut b: poly8x16x2_t = b; + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { + simd_shuffle!( + b.0, + b.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.1 = unsafe { + simd_shuffle!( + b.1, + b.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let c: uint8x16_t = + unsafe { simd_shuffle!(c, c, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = + transmute(vqtbx2q(transmute(a), transmute(b.0), transmute(b.1), c)); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx3)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbx3(a: int8x8_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, e: uint8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbx3.v8i8" + )] + fn _vqtbx3(a: int8x8_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, e: uint8x8_t) + -> int8x8_t; + } + unsafe { _vqtbx3(a, b, c, d, e) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx3q)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbx3q(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, e: uint8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbx3.v16i8" + )] + fn _vqtbx3q( + a: int8x16_t, + b: int8x16_t, + c: int8x16_t, + d: int8x16_t, + e: uint8x16_t, + ) -> int8x16_t; + } + unsafe { _vqtbx3q(a, b, c, d, e) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx3_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx3_s8(a: int8x8_t, b: int8x16x3_t, c: uint8x8_t) -> int8x8_t { + vqtbx3(a, b.0, b.1, b.2, c) +} +#[doc = 
"Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx3q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx3q_s8(a: int8x16_t, b: int8x16x3_t, c: uint8x16_t) -> int8x16_t { + vqtbx3q(a, b.0, b.1, b.2, c) +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx3_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx3_u8(a: uint8x8_t, b: uint8x16x3_t, c: uint8x8_t) -> uint8x8_t { + unsafe { + transmute(vqtbx3( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + c, + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx3_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx3_u8(a: uint8x8_t, b: uint8x16x3_t, c: uint8x8_t) -> uint8x8_t { + let mut b: uint8x16x3_t = b; + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { + simd_shuffle!( + b.0, + b.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.1 = unsafe { + simd_shuffle!( + b.1, + b.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.2 = unsafe { + simd_shuffle!( + b.2, + b.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vqtbx3( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + c, + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx3q_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx3q_u8(a: uint8x16_t, b: uint8x16x3_t, c: uint8x16_t) -> uint8x16_t { + unsafe { + transmute(vqtbx3q( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + c, + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx3q_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx3q_u8(a: uint8x16_t, b: uint8x16x3_t, c: uint8x16_t) -> uint8x16_t { + let mut b: uint8x16x3_t = b; + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { + simd_shuffle!( + b.0, + b.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.1 = unsafe { + simd_shuffle!( + b.1, + b.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.2 = unsafe { + simd_shuffle!( + b.2, + b.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let c: uint8x16_t = + unsafe { simd_shuffle!(c, c, [15, 14, 13, 12, 11, 10, 9, 8, 
7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(vqtbx3q( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + c, + )); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx3_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx3_p8(a: poly8x8_t, b: poly8x16x3_t, c: uint8x8_t) -> poly8x8_t { + unsafe { + transmute(vqtbx3( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + c, + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx3_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx3_p8(a: poly8x8_t, b: poly8x16x3_t, c: uint8x8_t) -> poly8x8_t { + let mut b: poly8x16x3_t = b; + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { + simd_shuffle!( + b.0, + b.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.1 = unsafe { + simd_shuffle!( + b.1, + b.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.2 = unsafe { + simd_shuffle!( + b.2, + b.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vqtbx3( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + c, + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx3q_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx3q_p8(a: poly8x16_t, b: poly8x16x3_t, c: uint8x16_t) -> poly8x16_t { + unsafe { + transmute(vqtbx3q( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + c, + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx3q_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx3q_p8(a: poly8x16_t, b: poly8x16x3_t, c: uint8x16_t) -> poly8x16_t { + let mut b: poly8x16x3_t = b; + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { + simd_shuffle!( + b.0, + b.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.1 = unsafe { + simd_shuffle!( + b.1, + b.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.2 = unsafe { + simd_shuffle!( + b.2, + b.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let c: uint8x16_t = + unsafe { simd_shuffle!(c, c, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(vqtbx3q( + transmute(a), + transmute(b.0), + 
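+ // The unsigned and polynomial wrappers reuse the signed base functions: TBL/TBX
+ // operate on raw bytes, so `transmute` between `int8x16_t`, `uint8x16_t`, and
+ // `poly8x16_t` only relabels the element type without changing any bits.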
transmute(b.1), + transmute(b.2), + c, + )); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx4)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbx4( + a: int8x8_t, + b: int8x16_t, + c: int8x16_t, + d: int8x16_t, + e: int8x16_t, + f: uint8x8_t, +) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbx4.v8i8" + )] + fn _vqtbx4( + a: int8x8_t, + b: int8x16_t, + c: int8x16_t, + d: int8x16_t, + e: int8x16_t, + f: uint8x8_t, + ) -> int8x8_t; + } + unsafe { _vqtbx4(a, b, c, d, e, f) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx4q)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +fn vqtbx4q( + a: int8x16_t, + b: int8x16_t, + c: int8x16_t, + d: int8x16_t, + e: int8x16_t, + f: uint8x16_t, +) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.tbx4.v16i8" + )] + fn _vqtbx4q( + a: int8x16_t, + b: int8x16_t, + c: int8x16_t, + d: int8x16_t, + e: int8x16_t, + f: uint8x16_t, + ) -> int8x16_t; + } + unsafe { _vqtbx4q(a, b, c, d, e, f) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx4_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx4_s8(a: int8x8_t, b: int8x16x4_t, c: uint8x8_t) -> int8x8_t { + vqtbx4(a, b.0, b.1, b.2, b.3, c) +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx4q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx4q_s8(a: int8x16_t, b: int8x16x4_t, c: uint8x16_t) -> int8x16_t { + vqtbx4q(a, b.0, b.1, b.2, b.3, c) +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx4_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx4_u8(a: uint8x8_t, b: uint8x16x4_t, c: uint8x8_t) -> uint8x8_t { + unsafe { + transmute(vqtbx4( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + c, + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx4_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx4_u8(a: uint8x8_t, b: uint8x16x4_t, c: uint8x8_t) -> uint8x8_t { + let mut b: uint8x16x4_t = b; + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { + simd_shuffle!( + b.0, + b.0, + [15, 14, 
13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.1 = unsafe { + simd_shuffle!( + b.1, + b.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.2 = unsafe { + simd_shuffle!( + b.2, + b.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.3 = unsafe { + simd_shuffle!( + b.3, + b.3, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vqtbx4( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + c, + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx4q_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx4q_u8(a: uint8x16_t, b: uint8x16x4_t, c: uint8x16_t) -> uint8x16_t { + unsafe { + transmute(vqtbx4q( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + c, + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx4q_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx4q_u8(a: uint8x16_t, b: uint8x16x4_t, c: uint8x16_t) -> uint8x16_t { + let mut b: uint8x16x4_t = b; + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { + simd_shuffle!( + b.0, + b.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.1 = unsafe { + simd_shuffle!( + b.1, + b.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.2 = unsafe { + simd_shuffle!( + b.2, + b.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.3 = unsafe { + simd_shuffle!( + b.3, + b.3, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let c: uint8x16_t = + unsafe { simd_shuffle!(c, c, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(vqtbx4q( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + c, + )); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx4_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx4_p8(a: poly8x8_t, b: poly8x16x4_t, c: uint8x8_t) -> poly8x8_t { + unsafe { + transmute(vqtbx4( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + c, + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx4_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx4_p8(a: poly8x8_t, b: poly8x16x4_t, c: uint8x8_t) -> 
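+ // Illustrative use (acc, t0..t3, and idx are assumed caller-provided values):
+ // `vqtbx4q_u8(acc, uint8x16x4_t(t0, t1, t2, t3), idx)` replaces each lane of
+ // `acc` with table byte `idx[i]` when `idx[i] < 64` and keeps the lane otherwise.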
poly8x8_t { + let mut b: poly8x16x4_t = b; + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { + simd_shuffle!( + b.0, + b.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.1 = unsafe { + simd_shuffle!( + b.1, + b.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.2 = unsafe { + simd_shuffle!( + b.2, + b.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.3 = unsafe { + simd_shuffle!( + b.3, + b.3, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vqtbx4( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + c, + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx4q_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx4q_p8(a: poly8x16_t, b: poly8x16x4_t, c: uint8x16_t) -> poly8x16_t { + unsafe { + transmute(vqtbx4q( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + c, + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqtbx4q_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqtbx4q_p8(a: poly8x16_t, b: poly8x16x4_t, c: uint8x16_t) -> poly8x16_t { + let mut b: poly8x16x4_t = b; + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { + simd_shuffle!( + b.0, + b.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.1 = unsafe { + simd_shuffle!( + b.1, + b.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.2 = unsafe { + simd_shuffle!( + b.2, + b.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + b.3 = unsafe { + simd_shuffle!( + b.3, + b.3, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + let c: uint8x16_t = + unsafe { simd_shuffle!(c, c, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(vqtbx4q( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + c, + )); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Rotate and exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrax1q_u64)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(rax1))] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +pub fn vrax1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.rax1" + )] + fn _vrax1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; + } + unsafe { _vrax1q_u64(a, b) } +} +#[doc = "Reverse bit order"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrbit_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(rbit))] +pub fn vrbit_s8(a: int8x8_t) -> int8x8_t { + unsafe { simd_bitreverse(a) } +} +#[doc = "Reverse bit order"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrbitq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(rbit))] +pub fn vrbitq_s8(a: int8x16_t) -> int8x16_t { + unsafe { simd_bitreverse(a) } +} +#[doc = "Reverse bit order"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrbit_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(rbit))] +pub fn vrbit_u8(a: uint8x8_t) -> uint8x8_t { + unsafe { transmute(vrbit_s8(transmute(a))) } +} +#[doc = "Reverse bit order"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrbit_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(rbit))] +pub fn vrbit_u8(a: uint8x8_t) -> uint8x8_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vrbit_s8(transmute(a))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Reverse bit order"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrbitq_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(rbit))] +pub fn vrbitq_u8(a: uint8x16_t) -> uint8x16_t { + unsafe { transmute(vrbitq_s8(transmute(a))) } +} +#[doc = "Reverse bit order"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrbitq_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(rbit))] +pub fn vrbitq_u8(a: uint8x16_t) -> uint8x16_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(vrbitq_s8(transmute(a))); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Reverse bit order"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrbit_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(rbit))] +pub fn vrbit_p8(a: poly8x8_t) -> poly8x8_t { + unsafe { transmute(vrbit_s8(transmute(a))) } +} +#[doc = "Reverse bit order"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrbit_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(rbit))] +pub fn vrbit_p8(a: poly8x8_t) -> poly8x8_t { + let a: 
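+ // `vrbit*` reverses the bit order within each 8-bit lane (RBIT): e.g. a lane
+ // holding 0b0000_0101 (0x05) becomes 0b1010_0000 (0xA0). The u8/p8 variants
+ // forward to the s8 implementation because the operation is type-agnostic.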
poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vrbit_s8(transmute(a))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Reverse bit order"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrbitq_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(rbit))] +pub fn vrbitq_p8(a: poly8x16_t) -> poly8x16_t { + unsafe { transmute(vrbitq_s8(transmute(a))) } +} +#[doc = "Reverse bit order"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrbitq_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(rbit))] +pub fn vrbitq_p8(a: poly8x16_t) -> poly8x16_t { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(vrbitq_s8(transmute(a))); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Reciprocal estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpe_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frecpe))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrecpe_f64(a: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecpe.v1f64" + )] + fn _vrecpe_f64(a: float64x1_t) -> float64x1_t; + } + unsafe { _vrecpe_f64(a) } +} +#[doc = "Reciprocal estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpeq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frecpe))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrecpeq_f64(a: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecpe.v2f64" + )] + fn _vrecpeq_f64(a: float64x2_t) -> float64x2_t; + } + unsafe { _vrecpeq_f64(a) } +} +#[doc = "Reciprocal estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecped_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frecpe))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrecped_f64(a: f64) -> f64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecpe.f64" + )] + fn _vrecped_f64(a: f64) -> f64; + } + unsafe { _vrecped_f64(a) } +} +#[doc = "Reciprocal estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpes_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frecpe))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrecpes_f32(a: f32) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecpe.f32" + )] + fn _vrecpes_f32(a: f32) -> f32; + } + unsafe { 
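+ // FRECPE produces a low-precision approximation of 1/a. A typical refinement
+ // (sketch only; `a` is an assumed f32 input) uses Newton-Raphson steps built
+ // from `vrecpss_f32`, each of which roughly doubles the number of correct bits:
+ //     let mut x = vrecpes_f32(a);
+ //     x = vrecpss_f32(a, x) * x;
+ //     x = vrecpss_f32(a, x) * x;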
_vrecpes_f32(a) } +} +#[doc = "Reciprocal estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpeh_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(frecpe))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrecpeh_f16(a: f16) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecpe.f16" + )] + fn _vrecpeh_f16(a: f16) -> f16; + } + unsafe { _vrecpeh_f16(a) } +} +#[doc = "Floating-point reciprocal step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecps_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frecps))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrecps_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecps.v1f64" + )] + fn _vrecps_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t; + } + unsafe { _vrecps_f64(a, b) } +} +#[doc = "Floating-point reciprocal step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpsq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frecps))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrecpsq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecps.v2f64" + )] + fn _vrecpsq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vrecpsq_f64(a, b) } +} +#[doc = "Floating-point reciprocal step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpsd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frecps))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrecpsd_f64(a: f64, b: f64) -> f64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecps.f64" + )] + fn _vrecpsd_f64(a: f64, b: f64) -> f64; + } + unsafe { _vrecpsd_f64(a, b) } +} +#[doc = "Floating-point reciprocal step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpss_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frecps))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrecpss_f32(a: f32, b: f32) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecps.f32" + )] + fn _vrecpss_f32(a: f32, b: f32) -> f32; + } + unsafe { _vrecpss_f32(a, b) } +} +#[doc = "Floating-point reciprocal step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpsh_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(frecps))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrecpsh_f16(a: f16, b: f16) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecps.f16" + )] + fn _vrecpsh_f16(a: f16, b: f16) -> 
f16; + } + unsafe { _vrecpsh_f16(a, b) } +} +#[doc = "Floating-point reciprocal exponent"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpxd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frecpx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrecpxd_f64(a: f64) -> f64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecpx.f64" + )] + fn _vrecpxd_f64(a: f64) -> f64; + } + unsafe { _vrecpxd_f64(a) } +} +#[doc = "Floating-point reciprocal exponent"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpxs_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frecpx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrecpxs_f32(a: f32) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecpx.f32" + )] + fn _vrecpxs_f32(a: f32) -> f32; + } + unsafe { _vrecpxs_f32(a) } +} +#[doc = "Floating-point reciprocal exponent"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpxh_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(frecpx))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrecpxh_f16(a: f16) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecpx.f16" + )] + fn _vrecpxh_f16(a: f16) -> f16; + } + unsafe { _vrecpxh_f16(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_f16(a: float16x4_t) -> float64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_f16(a: float16x4_t) -> float64x1_t { + let a: float16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_f16(a: float16x8_t) -> float64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_f16(a: 
float16x8_t) -> float64x2_t { + let a: float16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f16_f64(a: float64x1_t) -> float16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f16_f64(a: float64x1_t) -> float16x4_t { + unsafe { + let ret_val: float16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f16_f64(a: float64x2_t) -> float16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f16_f64(a: float64x2_t) -> float16x8_t { + let a: float64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_p128)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_p128(a: p128) -> float64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_p128)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_p128(a: p128) -> float64x2_t { + unsafe { + let ret_val: float64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_f32(a: 
float32x2_t) -> float64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_f32(a: float32x2_t) -> float64x1_t { + let a: float32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_p64_f32(a: float32x2_t) -> poly64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_p64_f32(a: float32x2_t) -> poly64x1_t { + let a: float32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_f32(a: float32x4_t) -> float64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_f32(a: float32x4_t) -> float64x2_t { + let a: float32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: float64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_p64_f32(a: float32x4_t) -> poly64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_p64_f32(a: float32x4_t) -> poly64x2_t { + let a: float32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector 
reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f32_f64(a: float64x1_t) -> float32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f32_f64(a: float64x1_t) -> float32x2_t { + unsafe { + let ret_val: float32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_s8_f64(a: float64x1_t) -> int8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_s8_f64(a: float64x1_t) -> int8x8_t { + unsafe { + let ret_val: int8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_s16_f64(a: float64x1_t) -> int16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_s16_f64(a: float64x1_t) -> int16x4_t { + unsafe { + let ret_val: int16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_s32_f64(a: float64x1_t) -> int32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", 
since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_s32_f64(a: float64x1_t) -> int32x2_t { + unsafe { + let ret_val: int32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_s64_f64(a: float64x1_t) -> int64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_u8_f64(a: float64x1_t) -> uint8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_u8_f64(a: float64x1_t) -> uint8x8_t { + unsafe { + let ret_val: uint8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_u16_f64(a: float64x1_t) -> uint16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_u16_f64(a: float64x1_t) -> uint16x4_t { + unsafe { + let ret_val: uint16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_u32_f64(a: float64x1_t) -> uint32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_u32_f64(a: float64x1_t) -> uint32x2_t { + unsafe { + let ret_val: uint32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_u64_f64(a: float64x1_t) -> uint64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_p8_f64(a: float64x1_t) -> poly8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_p8_f64(a: float64x1_t) -> poly8x8_t { + unsafe { + let ret_val: poly8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_p16_f64(a: float64x1_t) -> poly16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_p16_f64(a: float64x1_t) -> poly16x4_t { + unsafe { + let ret_val: poly16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_p64_f64(a: float64x1_t) -> poly64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_p128_f64(a: float64x2_t) -> p128 { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_p128_f64(a: float64x2_t) -> p128 { + let a: float64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + 
unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f32_f64(a: float64x2_t) -> float32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f32_f64(a: float64x2_t) -> float32x4_t { + let a: float64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_s8_f64(a: float64x2_t) -> int8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_s8_f64(a: float64x2_t) -> int8x16_t { + let a: float64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_s16_f64(a: float64x2_t) -> int16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_s16_f64(a: float64x2_t) -> int16x8_t { + let a: float64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_s32_f64(a: float64x2_t) -> int32x4_t { + unsafe { 
transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_s32_f64(a: float64x2_t) -> int32x4_t { + let a: float64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_s64_f64(a: float64x2_t) -> int64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_s64_f64(a: float64x2_t) -> int64x2_t { + let a: float64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_u8_f64(a: float64x2_t) -> uint8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_u8_f64(a: float64x2_t) -> uint8x16_t { + let a: float64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_u16_f64(a: float64x2_t) -> uint16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_u16_f64(a: float64x2_t) -> uint16x8_t { + let a: float64x2_t = unsafe { 
simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_u32_f64(a: float64x2_t) -> uint32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_u32_f64(a: float64x2_t) -> uint32x4_t { + let a: float64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_u64_f64(a: float64x2_t) -> uint64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_u64_f64(a: float64x2_t) -> uint64x2_t { + let a: float64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_p8_f64(a: float64x2_t) -> poly8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_p8_f64(a: float64x2_t) -> poly8x16_t { + let a: float64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] 
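+// Descriptive note on the pattern above and below: the `cfg(target_endian = "big")`
+// variants reverse the vector lanes with `simd_shuffle!` before and/or after the
+// `transmute`, so the intrinsic yields the same per-lane values on big-endian
+// targets as the plain `transmute` does on little-endian ones.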
+#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_p16_f64(a: float64x2_t) -> poly16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_p16_f64(a: float64x2_t) -> poly16x8_t { + let a: float64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_f64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_p64_f64(a: float64x2_t) -> poly64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_f64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_p64_f64(a: float64x2_t) -> poly64x2_t { + let a: float64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_s8(a: int8x8_t) -> float64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_s8(a: int8x8_t) -> float64x1_t { + let a: int8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_s8(a: int8x16_t) -> float64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_s8(a: int8x16_t) -> float64x2_t { + let a: int8x16_t = + unsafe { 
simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_s16(a: int16x4_t) -> float64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_s16(a: int16x4_t) -> float64x1_t { + let a: int16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_s16(a: int16x8_t) -> float64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_s16(a: int16x8_t) -> float64x2_t { + let a: int16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_s32(a: int32x2_t) -> float64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_s32(a: int32x2_t) -> float64x1_t { + let a: int32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_s32(a: int32x4_t) -> float64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast 
operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_s32(a: int32x4_t) -> float64x2_t { + let a: int32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: float64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_s64(a: int64x1_t) -> float64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_p64_s64(a: int64x1_t) -> poly64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_s64(a: int64x2_t) -> float64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_s64(a: int64x2_t) -> float64x2_t { + let a: int64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_p64_s64(a: int64x2_t) -> poly64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_p64_s64(a: int64x2_t) -> poly64x2_t { + let a: int64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_u8)"] +#[inline] 
+#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_u8(a: uint8x8_t) -> float64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_u8(a: uint8x8_t) -> float64x1_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_u8(a: uint8x16_t) -> float64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_u8(a: uint8x16_t) -> float64x2_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_u16(a: uint16x4_t) -> float64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_u16(a: uint16x4_t) -> float64x1_t { + let a: uint16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_u16(a: uint16x8_t) -> float64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn 
vreinterpretq_f64_u16(a: uint16x8_t) -> float64x2_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_u32(a: uint32x2_t) -> float64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_u32(a: uint32x2_t) -> float64x1_t { + let a: uint32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_u32(a: uint32x4_t) -> float64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_u32(a: uint32x4_t) -> float64x2_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: float64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_u64(a: uint64x1_t) -> float64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_p64_u64(a: uint64x1_t) -> poly64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_u64(a: uint64x2_t) -> float64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_u64(a: uint64x2_t) -> float64x2_t { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_p64_u64(a: uint64x2_t) -> poly64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_p64_u64(a: uint64x2_t) -> poly64x2_t { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_p8(a: poly8x8_t) -> float64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_p8(a: poly8x8_t) -> float64x1_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_p8(a: poly8x16_t) -> float64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_p8(a: poly8x16_t) -> float64x2_t { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast 
operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_p16(a: poly16x4_t) -> float64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_p16(a: poly16x4_t) -> float64x1_t { + let a: poly16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_p16(a: poly16x8_t) -> float64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_p16(a: poly16x8_t) -> float64x2_t { + let a: poly16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f32_p64(a: poly64x1_t) -> float32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f32_p64(a: poly64x1_t) -> float32x2_t { + unsafe { + let ret_val: float32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f64_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_f64_p64(a: poly64x1_t) -> float64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = 
"1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_s64_p64(a: poly64x1_t) -> int64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpret_u64_p64(a: poly64x1_t) -> uint64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f32_p64(a: poly64x2_t) -> float32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f32_p64(a: poly64x2_t) -> float32x4_t { + let a: poly64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_p64(a: poly64x2_t) -> float64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f64_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_f64_p64(a: poly64x2_t) -> float64x2_t { + let a: poly64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_s64_p64(a: poly64x2_t) -> int64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_s64_p64(a: poly64x2_t) -> int64x2_t { + let a: poly64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int64x2_t = transmute(a); + simd_shuffle!(ret_val, 
ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_u64_p64(a: poly64x2_t) -> uint64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vreinterpretq_u64_p64(a: poly64x2_t) -> uint64x2_t { + let a: poly64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Floating-point round to 32-bit integer, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd32x_f32)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint32x))] +pub fn vrnd32x_f32(a: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frint32x.v2f32" + )] + fn _vrnd32x_f32(a: float32x2_t) -> float32x2_t; + } + unsafe { _vrnd32x_f32(a) } +} +#[doc = "Floating-point round to 32-bit integer, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd32xq_f32)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint32x))] +pub fn vrnd32xq_f32(a: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frint32x.v4f32" + )] + fn _vrnd32xq_f32(a: float32x4_t) -> float32x4_t; + } + unsafe { _vrnd32xq_f32(a) } +} +#[doc = "Floating-point round to 32-bit integer, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd32xq_f64)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint32x))] +pub fn vrnd32xq_f64(a: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frint32x.v2f64" + )] + fn _vrnd32xq_f64(a: float64x2_t) -> float64x2_t; + } + unsafe { _vrnd32xq_f64(a) } +} +#[doc = "Floating-point round to 32-bit integer, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd32x_f64)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint32x))] +pub fn vrnd32x_f64(a: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + 
#[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.frint32x.f64" + )] + fn _vrnd32x_f64(a: f64) -> f64; + } + unsafe { transmute(_vrnd32x_f64(simd_extract!(a, 0))) } +} +#[doc = "Floating-point round to 32-bit integer toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd32z_f32)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint32z))] +pub fn vrnd32z_f32(a: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frint32z.v2f32" + )] + fn _vrnd32z_f32(a: float32x2_t) -> float32x2_t; + } + unsafe { _vrnd32z_f32(a) } +} +#[doc = "Floating-point round to 32-bit integer toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd32zq_f32)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint32z))] +pub fn vrnd32zq_f32(a: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frint32z.v4f32" + )] + fn _vrnd32zq_f32(a: float32x4_t) -> float32x4_t; + } + unsafe { _vrnd32zq_f32(a) } +} +#[doc = "Floating-point round to 32-bit integer toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd32zq_f64)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint32z))] +pub fn vrnd32zq_f64(a: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frint32z.v2f64" + )] + fn _vrnd32zq_f64(a: float64x2_t) -> float64x2_t; + } + unsafe { _vrnd32zq_f64(a) } +} +#[doc = "Floating-point round to 32-bit integer toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd32z_f64)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint32z))] +pub fn vrnd32z_f64(a: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.frint32z.f64" + )] + fn _vrnd32z_f64(a: f64) -> f64; + } + unsafe { transmute(_vrnd32z_f64(simd_extract!(a, 0))) } +} +#[doc = "Floating-point round to 64-bit integer, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64x_f32)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint64x))] +pub fn vrnd64x_f32(a: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frint64x.v2f32" + )] + fn _vrnd64x_f32(a: float32x2_t) -> float32x2_t; + } + unsafe { 
_vrnd64x_f32(a) } +} +#[doc = "Floating-point round to 64-bit integer, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64xq_f32)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint64x))] +pub fn vrnd64xq_f32(a: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frint64x.v4f32" + )] + fn _vrnd64xq_f32(a: float32x4_t) -> float32x4_t; + } + unsafe { _vrnd64xq_f32(a) } +} +#[doc = "Floating-point round to 64-bit integer, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64xq_f64)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint64x))] +pub fn vrnd64xq_f64(a: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frint64x.v2f64" + )] + fn _vrnd64xq_f64(a: float64x2_t) -> float64x2_t; + } + unsafe { _vrnd64xq_f64(a) } +} +#[doc = "Floating-point round to 64-bit integer, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64x_f64)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint64x))] +pub fn vrnd64x_f64(a: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.frint64x.f64" + )] + fn _vrnd64x_f64(a: f64) -> f64; + } + unsafe { transmute(_vrnd64x_f64(simd_extract!(a, 0))) } +} +#[doc = "Floating-point round to 64-bit integer toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64z_f32)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint64z))] +pub fn vrnd64z_f32(a: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frint64z.v2f32" + )] + fn _vrnd64z_f32(a: float32x2_t) -> float32x2_t; + } + unsafe { _vrnd64z_f32(a) } +} +#[doc = "Floating-point round to 64-bit integer toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64zq_f32)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint64z))] +pub fn vrnd64zq_f32(a: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frint64z.v4f32" + )] + fn _vrnd64zq_f32(a: float32x4_t) -> float32x4_t; + } + unsafe { _vrnd64zq_f32(a) } +} +#[doc = "Floating-point round to 64-bit integer toward zero"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64zq_f64)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint64z))] +pub fn vrnd64zq_f64(a: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frint64z.v2f64" + )] + fn _vrnd64zq_f64(a: float64x2_t) -> float64x2_t; + } + unsafe { _vrnd64zq_f64(a) } +} +#[doc = "Floating-point round to 64-bit integer toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64z_f64)"] +#[inline] +#[target_feature(enable = "neon,frintts")] +#[unstable(feature = "stdarch_neon_ftts", issue = "117227")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(frint64z))] +pub fn vrnd64z_f64(a: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.frint64z.f64" + )] + fn _vrnd64z_f64(a: f64) -> f64; + } + unsafe { transmute(_vrnd64z_f64(simd_extract!(a, 0))) } +} +#[doc = "Floating-point round to integral, toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frintz))] +pub fn vrnd_f16(a: float16x4_t) -> float16x4_t { + unsafe { simd_trunc(a) } +} +#[doc = "Floating-point round to integral, toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frintz))] +pub fn vrndq_f16(a: float16x8_t) -> float16x8_t { + unsafe { simd_trunc(a) } +} +#[doc = "Floating-point round to integral, toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintz))] +pub fn vrnd_f32(a: float32x2_t) -> float32x2_t { + unsafe { simd_trunc(a) } +} +#[doc = "Floating-point round to integral, toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintz))] +pub fn vrndq_f32(a: float32x4_t) -> float32x4_t { + unsafe { simd_trunc(a) } +} +#[doc = "Floating-point round to integral, toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintz))] +pub fn vrnd_f64(a: float64x1_t) -> float64x1_t { + unsafe { simd_trunc(a) } +} +#[doc = "Floating-point round to integral, toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, 
assert_instr(frintz))] +pub fn vrndq_f64(a: float64x2_t) -> float64x2_t { + unsafe { simd_trunc(a) } +} +#[doc = "Floating-point round to integral, to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnda_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frinta))] +pub fn vrnda_f16(a: float16x4_t) -> float16x4_t { + unsafe { simd_round(a) } +} +#[doc = "Floating-point round to integral, to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndaq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frinta))] +pub fn vrndaq_f16(a: float16x8_t) -> float16x8_t { + unsafe { simd_round(a) } +} +#[doc = "Floating-point round to integral, to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnda_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frinta))] +pub fn vrnda_f32(a: float32x2_t) -> float32x2_t { + unsafe { simd_round(a) } +} +#[doc = "Floating-point round to integral, to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndaq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frinta))] +pub fn vrndaq_f32(a: float32x4_t) -> float32x4_t { + unsafe { simd_round(a) } +} +#[doc = "Floating-point round to integral, to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnda_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frinta))] +pub fn vrnda_f64(a: float64x1_t) -> float64x1_t { + unsafe { simd_round(a) } +} +#[doc = "Floating-point round to integral, to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndaq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frinta))] +pub fn vrndaq_f64(a: float64x2_t) -> float64x2_t { + unsafe { simd_round(a) } +} +#[doc = "Floating-point round to integral, to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndah_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frinta))] +pub fn vrndah_f16(a: f16) -> f16 { + unsafe { roundf16(a) } +} +#[doc = "Floating-point round to integral, to nearest with ties to away"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frintz))] +pub fn vrndh_f16(a: f16) -> f16 { + unsafe { truncf16(a) } +} +#[doc = "Floating-point round to integral, using current rounding mode"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndi_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frinti))] +pub fn vrndi_f16(a: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.nearbyint.v4f16" + )] + fn _vrndi_f16(a: float16x4_t) -> float16x4_t; + } + unsafe { _vrndi_f16(a) } +} +#[doc = "Floating-point round to integral, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndiq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frinti))] +pub fn vrndiq_f16(a: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.nearbyint.v8f16" + )] + fn _vrndiq_f16(a: float16x8_t) -> float16x8_t; + } + unsafe { _vrndiq_f16(a) } +} +#[doc = "Floating-point round to integral, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndi_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frinti))] +pub fn vrndi_f32(a: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.nearbyint.v2f32" + )] + fn _vrndi_f32(a: float32x2_t) -> float32x2_t; + } + unsafe { _vrndi_f32(a) } +} +#[doc = "Floating-point round to integral, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndiq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frinti))] +pub fn vrndiq_f32(a: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.nearbyint.v4f32" + )] + fn _vrndiq_f32(a: float32x4_t) -> float32x4_t; + } + unsafe { _vrndiq_f32(a) } +} +#[doc = "Floating-point round to integral, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndi_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frinti))] +pub fn vrndi_f64(a: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.nearbyint.v1f64" + )] + fn _vrndi_f64(a: float64x1_t) -> float64x1_t; + } + unsafe { _vrndi_f64(a) } +} +#[doc = "Floating-point round to integral, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndiq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frinti))] +pub fn vrndiq_f64(a: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.nearbyint.v2f64" + )] + fn _vrndiq_f64(a: float64x2_t) -> 
float64x2_t; + } + unsafe { _vrndiq_f64(a) } +} +#[doc = "Floating-point round to integral, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndih_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frinti))] +pub fn vrndih_f16(a: f16) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.nearbyint.f16" + )] + fn _vrndih_f16(a: f16) -> f16; + } + unsafe { _vrndih_f16(a) } +} +#[doc = "Floating-point round to integral, toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndm_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frintm))] +pub fn vrndm_f16(a: float16x4_t) -> float16x4_t { + unsafe { simd_floor(a) } +} +#[doc = "Floating-point round to integral, toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndmq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frintm))] +pub fn vrndmq_f16(a: float16x8_t) -> float16x8_t { + unsafe { simd_floor(a) } +} +#[doc = "Floating-point round to integral, toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndm_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintm))] +pub fn vrndm_f32(a: float32x2_t) -> float32x2_t { + unsafe { simd_floor(a) } +} +#[doc = "Floating-point round to integral, toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndmq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintm))] +pub fn vrndmq_f32(a: float32x4_t) -> float32x4_t { + unsafe { simd_floor(a) } +} +#[doc = "Floating-point round to integral, toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndm_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintm))] +pub fn vrndm_f64(a: float64x1_t) -> float64x1_t { + unsafe { simd_floor(a) } +} +#[doc = "Floating-point round to integral, toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndmq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintm))] +pub fn vrndmq_f64(a: float64x2_t) -> float64x2_t { + unsafe { simd_floor(a) } +} +#[doc = "Floating-point round to integral, toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndmh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frintm))] +pub fn vrndmh_f16(a: f16) -> f16 { + unsafe { floorf16(a) } +} +#[doc = "Floating-point round to integral, to 
nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndn_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintn))] +pub fn vrndn_f64(a: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.roundeven.v1f64" + )] + fn _vrndn_f64(a: float64x1_t) -> float64x1_t; + } + unsafe { _vrndn_f64(a) } +} +#[doc = "Floating-point round to integral, to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndnq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintn))] +pub fn vrndnq_f64(a: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.roundeven.v2f64" + )] + fn _vrndnq_f64(a: float64x2_t) -> float64x2_t; + } + unsafe { _vrndnq_f64(a) } +} +#[doc = "Floating-point round to integral, toward minus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndnh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frintn))] +pub fn vrndnh_f16(a: f16) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.roundeven.f16" + )] + fn _vrndnh_f16(a: f16) -> f16; + } + unsafe { _vrndnh_f16(a) } +} +#[doc = "Floating-point round to integral, to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndns_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintn))] +pub fn vrndns_f32(a: f32) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.roundeven.f32" + )] + fn _vrndns_f32(a: f32) -> f32; + } + unsafe { _vrndns_f32(a) } +} +#[doc = "Floating-point round to integral, toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndp_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frintp))] +pub fn vrndp_f16(a: float16x4_t) -> float16x4_t { + unsafe { simd_ceil(a) } +} +#[doc = "Floating-point round to integral, toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndpq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frintp))] +pub fn vrndpq_f16(a: float16x8_t) -> float16x8_t { + unsafe { simd_ceil(a) } +} +#[doc = "Floating-point round to integral, toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndp_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintp))] +pub fn vrndp_f32(a: float32x2_t) -> float32x2_t { + 
unsafe { simd_ceil(a) } +} +#[doc = "Floating-point round to integral, toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndpq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintp))] +pub fn vrndpq_f32(a: float32x4_t) -> float32x4_t { + unsafe { simd_ceil(a) } +} +#[doc = "Floating-point round to integral, toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndp_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintp))] +pub fn vrndp_f64(a: float64x1_t) -> float64x1_t { + unsafe { simd_ceil(a) } +} +#[doc = "Floating-point round to integral, toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndpq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintp))] +pub fn vrndpq_f64(a: float64x2_t) -> float64x2_t { + unsafe { simd_ceil(a) } +} +#[doc = "Floating-point round to integral, toward plus infinity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndph_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frintp))] +pub fn vrndph_f16(a: f16) -> f16 { + unsafe { ceilf16(a) } +} +#[doc = "Floating-point round to integral exact, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndx_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frintx))] +pub fn vrndx_f16(a: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.rint.v4f16" + )] + fn _vrndx_f16(a: float16x4_t) -> float16x4_t; + } + unsafe { _vrndx_f16(a) } +} +#[doc = "Floating-point round to integral exact, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndxq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frintx))] +pub fn vrndxq_f16(a: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.rint.v8f16" + )] + fn _vrndxq_f16(a: float16x8_t) -> float16x8_t; + } + unsafe { _vrndxq_f16(a) } +} +#[doc = "Floating-point round to integral exact, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndx_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintx))] +pub fn vrndx_f32(a: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.rint.v2f32" + )] + fn _vrndx_f32(a: float32x2_t) -> float32x2_t; + } + unsafe { _vrndx_f32(a) } +} +#[doc = "Floating-point round to integral exact, 
using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndxq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintx))] +pub fn vrndxq_f32(a: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.rint.v4f32" + )] + fn _vrndxq_f32(a: float32x4_t) -> float32x4_t; + } + unsafe { _vrndxq_f32(a) } +} +#[doc = "Floating-point round to integral exact, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndx_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintx))] +pub fn vrndx_f64(a: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.rint.v1f64" + )] + fn _vrndx_f64(a: float64x1_t) -> float64x1_t; + } + unsafe { _vrndx_f64(a) } +} +#[doc = "Floating-point round to integral exact, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndxq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(frintx))] +pub fn vrndxq_f64(a: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.rint.v2f64" + )] + fn _vrndxq_f64(a: float64x2_t) -> float64x2_t; + } + unsafe { _vrndxq_f64(a) } +} +#[doc = "Floating-point round to integral, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndxh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(frintx))] +pub fn vrndxh_f16(a: f16) -> f16 { + round_ties_even_f16(a) +} +#[doc = "Signed rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshld_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(srshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrshld_s64(a: i64, b: i64) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.srshl.i64" + )] + fn _vrshld_s64(a: i64, b: i64) -> i64; + } + unsafe { _vrshld_s64(a, b) } +} +#[doc = "Unsigned rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshld_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(urshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrshld_u64(a: u64, b: i64) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urshl.i64" + )] + fn _vrshld_u64(a: u64, b: i64) -> u64; + } + unsafe { _vrshld_u64(a, b) } +} +#[doc = "Signed rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrd_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(test, assert_instr(srshr, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrshrd_n_s64<const N: i32>(a: i64) -> i64 { + static_assert!(N >= 1 && N <= 64); + vrshld_s64(a, -N as i64) +} +#[doc = "Unsigned rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrd_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(urshr, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrshrd_n_u64<const N: i32>(a: u64) -> u64 { + static_assert!(N >= 1 && N <= 64); + vrshld_u64(a, -N as i64) +} +#[doc = "Rounding shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_high_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(rshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrshrn_high_n_s16<const N: i32>(a: int8x8_t, b: int16x8_t) -> int8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { + simd_shuffle!( + a, + vrshrn_n_s16::<N>(b), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } +} +#[doc = "Rounding shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_high_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(rshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrshrn_high_n_s32<const N: i32>(a: int16x4_t, b: int32x4_t) -> int16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_shuffle!(a, vrshrn_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Rounding shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_high_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(rshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrshrn_high_n_s64<const N: i32>(a: int32x2_t, b: int64x2_t) -> int32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_shuffle!(a, vrshrn_n_s64::<N>(b), [0, 1, 2, 3]) } +} +#[doc = "Rounding shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_high_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(rshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrshrn_high_n_u16<const N: i32>(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { + simd_shuffle!( + a, + vrshrn_n_u16::<N>(b), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } +} +#[doc = "Rounding shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_high_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(rshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrshrn_high_n_u32<const N: i32>(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_shuffle!(a, vrshrn_n_u32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Rounding shift right narrow"] +#[doc = "[Arm's
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_high_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(rshrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrshrn_high_n_u64<const N: i32>(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_shuffle!(a, vrshrn_n_u64::<N>(b), [0, 1, 2, 3]) } +} +#[doc = "Reciprocal square-root estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrte_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frsqrte))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsqrte_f64(a: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrte.v1f64" + )] + fn _vrsqrte_f64(a: float64x1_t) -> float64x1_t; + } + unsafe { _vrsqrte_f64(a) } +} +#[doc = "Reciprocal square-root estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrteq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frsqrte))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsqrteq_f64(a: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrte.v2f64" + )] + fn _vrsqrteq_f64(a: float64x2_t) -> float64x2_t; + } + unsafe { _vrsqrteq_f64(a) } +} +#[doc = "Reciprocal square-root estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrted_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frsqrte))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsqrted_f64(a: f64) -> f64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrte.f64" + )] + fn _vrsqrted_f64(a: f64) -> f64; + } + unsafe { _vrsqrted_f64(a) } +} +#[doc = "Reciprocal square-root estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrtes_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frsqrte))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsqrtes_f32(a: f32) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrte.f32" + )] + fn _vrsqrtes_f32(a: f32) -> f32; + } + unsafe { _vrsqrtes_f32(a) } +} +#[doc = "Reciprocal square-root estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrteh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(test, assert_instr(frsqrte))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrsqrteh_f16(a: f16) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrte.f16" + )] + fn _vrsqrteh_f16(a: f16) -> f16; + } + unsafe { _vrsqrteh_f16(a) } +} +#[doc = "Floating-point reciprocal square root step"] +#[doc = "[Arm's
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrts_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frsqrts))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsqrts_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrts.v1f64" + )] + fn _vrsqrts_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t; + } + unsafe { _vrsqrts_f64(a, b) } +} +#[doc = "Floating-point reciprocal square root step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrtsq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frsqrts))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsqrtsq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrts.v2f64" + )] + fn _vrsqrtsq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vrsqrtsq_f64(a, b) } +} +#[doc = "Floating-point reciprocal square root step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrtsd_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frsqrts))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsqrtsd_f64(a: f64, b: f64) -> f64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrts.f64" + )] + fn _vrsqrtsd_f64(a: f64, b: f64) -> f64; + } + unsafe { _vrsqrtsd_f64(a, b) } +} +#[doc = "Floating-point reciprocal square root step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrtss_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frsqrts))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsqrtss_f32(a: f32, b: f32) -> f32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrts.f32" + )] + fn _vrsqrtss_f32(a: f32, b: f32) -> f32; + } + unsafe { _vrsqrtss_f32(a, b) } +} +#[doc = "Floating-point reciprocal square root step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrtsh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(test, assert_instr(frsqrts))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrsqrtsh_f16(a: f16, b: f16) -> f16 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrts.f16" + )] + fn _vrsqrtsh_f16(a: f16, b: f16) -> f16; + } + unsafe { _vrsqrtsh_f16(a, b) } +} +#[doc = "Signed rounding shift right and accumulate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsrad_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(srshr, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsrad_n_s64(a: i64, b: i64) -> i64 { + static_assert!(N >= 1 && N <= 64); + let b: i64 = vrshrd_n_s64::(b); + a.wrapping_add(b) +} +#[doc = "Unsigned 
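// Editor's illustrative sketch (not part of the generated diff): FRSQRTE only
// gives a rough estimate of 1/sqrt(x), and FRSQRTS computes (3 - a*b)/2, so
// `y * vrsqrtsd_f64(x * y, y)` is one Newton-Raphson refinement step; repeat
// the step for more precision. Assumes an aarch64 target; `unsafe` is only
// needed on toolchains where these intrinsics are still `unsafe fn`.
#[cfg(target_arch = "aarch64")]
fn rsqrt_sketch(x: f64) -> f64 {
    use core::arch::aarch64::*;
    unsafe {
        let mut y = vrsqrted_f64(x); // initial estimate of 1/sqrt(x)
        y *= vrsqrtsd_f64(x * y, y); // first refinement step
        y *= vrsqrtsd_f64(x * y, y); // second refinement step
        y
    }
}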
rounding shift right and accumulate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsrad_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(urshr, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsrad_n_u64(a: u64, b: u64) -> u64 { + static_assert!(N >= 1 && N <= 64); + let b: u64 = vrshrd_n_u64::(b); + a.wrapping_add(b) +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_endian = "little")] +#[cfg_attr(test, assert_instr(rsubhn2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t { + let x: int8x8_t = vrsubhn_s16(b, c); + unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_endian = "little")] +#[cfg_attr(test, assert_instr(rsubhn2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t { + let x: int16x4_t = vrsubhn_s32(b, c); + unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_endian = "little")] +#[cfg_attr(test, assert_instr(rsubhn2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t { + let x: int32x2_t = vrsubhn_s64(b, c); + unsafe { simd_shuffle!(a, x, [0, 1, 2, 3]) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_endian = "little")] +#[cfg_attr(test, assert_instr(rsubhn2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t { + let x: uint8x8_t = vrsubhn_u16(b, c); + unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_endian = "little")] +#[cfg_attr(test, assert_instr(rsubhn2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t { + let x: uint16x4_t = vrsubhn_u32(b, c); + unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_endian = "little")] +#[cfg_attr(test, assert_instr(rsubhn2))] +#[stable(feature = "neon_intrinsics", 
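// Editor's illustrative sketch (not part of the generated diff): the scalar
// rounding shifts above add 2^(N-1) before shifting, so the result is rounded
// rather than truncated, and the accumulate form then adds it onto the first
// operand. Assumes an aarch64 target.
#[cfg(target_arch = "aarch64")]
fn rounding_shift_sketch() {
    use core::arch::aarch64::*;
    unsafe {
        let r = vrshrd_n_s64::<1>(5); // (5 + 1) >> 1 == 3, instead of 5 >> 1 == 2
        let acc = vrsrad_n_s64::<1>(10, 5); // 10 + 3 == 13
        let _ = (r, acc);
    }
}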
since = "1.59.0")] +pub fn vrsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t { + let x: uint32x2_t = vrsubhn_u64(b, c); + unsafe { simd_shuffle!(a, x, [0, 1, 2, 3]) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_endian = "big")] +#[cfg_attr(test, assert_instr(rsubhn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t { + let x: int8x8_t = vrsubhn_s16(b, c); + unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_endian = "big")] +#[cfg_attr(test, assert_instr(rsubhn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t { + let x: int16x4_t = vrsubhn_s32(b, c); + unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_endian = "big")] +#[cfg_attr(test, assert_instr(rsubhn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t { + let x: int32x2_t = vrsubhn_s64(b, c); + unsafe { simd_shuffle!(a, x, [0, 1, 2, 3]) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_endian = "big")] +#[cfg_attr(test, assert_instr(rsubhn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t { + let x: uint8x8_t = vrsubhn_u16(b, c); + unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_endian = "big")] +#[cfg_attr(test, assert_instr(rsubhn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t { + let x: uint16x4_t = vrsubhn_u32(b, c); + unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_endian = "big")] +#[cfg_attr(test, assert_instr(rsubhn))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t { + let x: uint32x2_t = vrsubhn_u64(b, c); + unsafe { simd_shuffle!(a, x, [0, 1, 2, 3]) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vset_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vset_lane_f64(a: f64, b: float64x1_t) -> float64x1_t { + static_assert!(LANE == 0); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsetq_lane_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsetq_lane_f64(a: f64, b: float64x2_t) -> float64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "SHA512 hash update part 2"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsha512h2q_u64)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(sha512h2))] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +pub fn vsha512h2q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sha512h2" + )] + fn _vsha512h2q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t; + } + unsafe { _vsha512h2q_u64(a, b, c) } +} +#[doc = "SHA512 hash update part 1"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsha512hq_u64)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(sha512h))] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +pub fn vsha512hq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sha512h" + )] + fn _vsha512hq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t; + } + unsafe { _vsha512hq_u64(a, b, c) } +} +#[doc = "SHA512 schedule update 0"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsha512su0q_u64)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(sha512su0))] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +pub fn vsha512su0q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sha512su0" + )] + fn _vsha512su0q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; + } + unsafe { _vsha512su0q_u64(a, b) } +} +#[doc = "SHA512 schedule update 1"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsha512su1q_u64)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(sha512su1))] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +pub fn vsha512su1q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sha512su1" + )] + fn _vsha512su1q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> 
uint64x2_t; + } + unsafe { _vsha512su1q_u64(a, b, c) } +} +#[doc = "Signed Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshld_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sshl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshld_s64(a: i64, b: i64) -> i64 { + unsafe { transmute(vshl_s64(transmute(a), transmute(b))) } +} +#[doc = "Unsigned Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshld_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ushl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshld_u64(a: u64, b: i64) -> u64 { + unsafe { transmute(vshl_u64(transmute(a), transmute(b))) } +} +#[doc = "Signed shift left long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_high_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sshll2, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshll_high_n_s8(a: int8x16_t) -> int16x8_t { + static_assert!(N >= 0 && N <= 8); + unsafe { + let b: int8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + vshll_n_s8::(b) + } +} +#[doc = "Signed shift left long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_high_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sshll2, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshll_high_n_s16(a: int16x8_t) -> int32x4_t { + static_assert!(N >= 0 && N <= 16); + unsafe { + let b: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); + vshll_n_s16::(b) + } +} +#[doc = "Signed shift left long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_high_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sshll2, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshll_high_n_s32(a: int32x4_t) -> int64x2_t { + static_assert!(N >= 0 && N <= 32); + unsafe { + let b: int32x2_t = simd_shuffle!(a, a, [2, 3]); + vshll_n_s32::(b) + } +} +#[doc = "Signed shift left long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_high_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ushll2, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshll_high_n_u8(a: uint8x16_t) -> uint16x8_t { + static_assert!(N >= 0 && N <= 8); + unsafe { + let b: uint8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + vshll_n_u8::(b) + } +} +#[doc = "Signed shift left long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_high_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ushll2, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshll_high_n_u16(a: uint16x8_t) -> uint32x4_t { + static_assert!(N >= 0 && N <= 16); + unsafe { + let b: uint16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); + vshll_n_u16::(b) + } +} +#[doc = "Signed shift left long"] +#[doc = 
"[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_high_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ushll2, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshll_high_n_u32(a: uint32x4_t) -> uint64x2_t { + static_assert!(N >= 0 && N <= 32); + unsafe { + let b: uint32x2_t = simd_shuffle!(a, a, [2, 3]); + vshll_n_u32::(b) + } +} +#[doc = "Shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_high_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(shrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshrn_high_n_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { + simd_shuffle!( + a, + vshrn_n_s16::(b), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } +} +#[doc = "Shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_high_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(shrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshrn_high_n_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_shuffle!(a, vshrn_n_s32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_high_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(shrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_shuffle!(a, vshrn_n_s64::(b), [0, 1, 2, 3]) } +} +#[doc = "Shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_high_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(shrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshrn_high_n_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { + simd_shuffle!( + a, + vshrn_n_u16::(b), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } +} +#[doc = "Shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_high_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(shrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshrn_high_n_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_shuffle!(a, vshrn_n_u32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_high_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(shrn2, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> 
uint32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_shuffle!(a, vshrn_n_u64::(b), [0, 1, 2, 3]) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsli_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + static_assert_uimm_bits!(N, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsli.v8i8" + )] + fn _vsli_n_s8(a: int8x8_t, b: int8x8_t, n: i32) -> int8x8_t; + } + unsafe { _vsli_n_s8(a, b, N) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsliq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + static_assert_uimm_bits!(N, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsli.v16i8" + )] + fn _vsliq_n_s8(a: int8x16_t, b: int8x16_t, n: i32) -> int8x16_t; + } + unsafe { _vsliq_n_s8(a, b, N) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsli_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(N, 4); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsli.v4i16" + )] + fn _vsli_n_s16(a: int16x4_t, b: int16x4_t, n: i32) -> int16x4_t; + } + unsafe { _vsli_n_s16(a, b, N) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsliq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + static_assert_uimm_bits!(N, 4); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsli.v8i16" + )] + fn _vsliq_n_s16(a: int16x8_t, b: int16x8_t, n: i32) -> int16x8_t; + } + unsafe { _vsliq_n_s16(a, b, N) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsli_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert!(N >= 0 && N <= 31); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsli.v2i32" + )] + fn _vsli_n_s32(a: int32x2_t, b: int32x2_t, 
n: i32) -> int32x2_t; + } + unsafe { _vsli_n_s32(a, b, N) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsliq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t { + static_assert!(N >= 0 && N <= 31); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsli.v4i32" + )] + fn _vsliq_n_s32(a: int32x4_t, b: int32x4_t, n: i32) -> int32x4_t; + } + unsafe { _vsliq_n_s32(a, b, N) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsli_n_s64<const N: i32>(a: int64x1_t, b: int64x1_t) -> int64x1_t { + static_assert!(N >= 0 && N <= 63); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsli.v1i64" + )] + fn _vsli_n_s64(a: int64x1_t, b: int64x1_t, n: i32) -> int64x1_t; + } + unsafe { _vsli_n_s64(a, b, N) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsliq_n_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t { + static_assert!(N >= 0 && N <= 63); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsli.v2i64" + )] + fn _vsliq_n_s64(a: int64x2_t, b: int64x2_t, n: i32) -> int64x2_t; + } + unsafe { _vsliq_n_s64(a, b, N) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsli_n_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { transmute(vsli_n_s8::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsliq_n_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + static_assert_uimm_bits!(N, 3); + unsafe { transmute(vsliq_n_s8::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn
vsli_n_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + static_assert_uimm_bits!(N, 4); + unsafe { transmute(vsli_n_s16::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsliq_n_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + static_assert_uimm_bits!(N, 4); + unsafe { transmute(vsliq_n_s16::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsli_n_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + static_assert!(N >= 0 && N <= 31); + unsafe { transmute(vsli_n_s32::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsliq_n_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + static_assert!(N >= 0 && N <= 31); + unsafe { transmute(vsliq_n_s32::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsli_n_u64<const N: i32>(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + static_assert!(N >= 0 && N <= 63); + unsafe { transmute(vsli_n_s64::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsliq_n_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + static_assert!(N >= 0 && N <= 63); + unsafe { transmute(vsliq_n_s64::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsli_n_p8<const N: i32>(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { transmute(vsli_n_s8::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature =
"neon_intrinsics", since = "1.59.0")] +pub fn vsliq_n_p8<const N: i32>(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + static_assert_uimm_bits!(N, 3); + unsafe { transmute(vsliq_n_s8::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsli_n_p16<const N: i32>(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + static_assert_uimm_bits!(N, 4); + unsafe { transmute(vsli_n_s16::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsliq_n_p16<const N: i32>(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + static_assert_uimm_bits!(N, 4); + unsafe { transmute(vsliq_n_s16::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_p64)"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsli_n_p64<const N: i32>(a: poly64x1_t, b: poly64x1_t) -> poly64x1_t { + static_assert!(N >= 0 && N <= 63); + unsafe { transmute(vsli_n_s64::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_p64)"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(sli, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsliq_n_p64<const N: i32>(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + static_assert!(N >= 0 && N <= 63); + unsafe { transmute(vsliq_n_s64::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift left and insert"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vslid_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(sli, N = 2))] +pub fn vslid_n_s64<const N: i32>(a: i64, b: i64) -> i64 { + static_assert!(N >= 0 && N <= 63); + unsafe { transmute(vsli_n_s64::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift left and insert"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vslid_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(sli, N = 2))] +pub fn vslid_n_u64<const N: i32>(a: u64, b: u64) -> u64 { + static_assert!(N >= 0 && N <= 63); + unsafe { transmute(vsli_n_u64::<N>(transmute(a), transmute(b))) } +} +#[doc = "SM3PARTW1"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsm3partw1q_u32)"] +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3partw1))]
+#[unstable(feature = "stdarch_neon_sm4", issue = "117226")] +pub fn vsm3partw1q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sm3partw1" + )] + fn _vsm3partw1q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t; + } + unsafe { _vsm3partw1q_u32(a, b, c) } +} +#[doc = "SM3PARTW2"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsm3partw2q_u32)"] +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3partw2))] +#[unstable(feature = "stdarch_neon_sm4", issue = "117226")] +pub fn vsm3partw2q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sm3partw2" + )] + fn _vsm3partw2q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t; + } + unsafe { _vsm3partw2q_u32(a, b, c) } +} +#[doc = "SM3SS1"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsm3ss1q_u32)"] +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3ss1))] +#[unstable(feature = "stdarch_neon_sm4", issue = "117226")] +pub fn vsm3ss1q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sm3ss1" + )] + fn _vsm3ss1q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t; + } + unsafe { _vsm3ss1q_u32(a, b, c) } +} +#[doc = "SM3TT1A"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsm3tt1aq_u32)"] +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3tt1a, IMM2 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_sm4", issue = "117226")] +pub fn vsm3tt1aq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + static_assert_uimm_bits!(IMM2, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sm3tt1a" + )] + fn _vsm3tt1aq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, n: i64) -> uint32x4_t; + } + unsafe { _vsm3tt1aq_u32(a, b, c, IMM2 as i64) } +} +#[doc = "SM3TT1B"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsm3tt1bq_u32)"] +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3tt1b, IMM2 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_sm4", issue = "117226")] +pub fn vsm3tt1bq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + static_assert_uimm_bits!(IMM2, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sm3tt1b" + )] + fn _vsm3tt1bq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, n: i64) -> uint32x4_t; + } + unsafe { _vsm3tt1bq_u32(a, b, c, IMM2 as i64) } +} +#[doc = "SM3TT2A"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsm3tt2aq_u32)"] +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3tt2a, IMM2 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_sm4", 
issue = "117226")] +pub fn vsm3tt2aq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + static_assert_uimm_bits!(IMM2, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sm3tt2a" + )] + fn _vsm3tt2aq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, n: i64) -> uint32x4_t; + } + unsafe { _vsm3tt2aq_u32(a, b, c, IMM2 as i64) } +} +#[doc = "SM3TT2B"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsm3tt2bq_u32)"] +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3tt2b, IMM2 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_sm4", issue = "117226")] +pub fn vsm3tt2bq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + static_assert_uimm_bits!(IMM2, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sm3tt2b" + )] + fn _vsm3tt2bq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, n: i64) -> uint32x4_t; + } + unsafe { _vsm3tt2bq_u32(a, b, c, IMM2 as i64) } +} +#[doc = "SM4 key"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsm4ekeyq_u32)"] +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm4ekey))] +#[unstable(feature = "stdarch_neon_sm4", issue = "117226")] +pub fn vsm4ekeyq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sm4ekey" + )] + fn _vsm4ekeyq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + unsafe { _vsm4ekeyq_u32(a, b) } +} +#[doc = "SM4 encode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsm4eq_u32)"] +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm4e))] +#[unstable(feature = "stdarch_neon_sm4", issue = "117226")] +pub fn vsm4eq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sm4e" + )] + fn _vsm4eq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + unsafe { _vsm4eq_u32(a, b) } +} +#[doc = "Unsigned saturating Accumulate of Signed value."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqadd_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(usqadd))] +pub fn vsqadd_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.usqadd.v8i8" + )] + fn _vsqadd_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; + } + unsafe { _vsqadd_u8(a, b) } +} +#[doc = "Unsigned saturating Accumulate of Signed value."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqaddq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(usqadd))] +pub fn vsqaddq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = 
"llvm.aarch64.neon.usqadd.v16i8" + )] + fn _vsqaddq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; + } + unsafe { _vsqaddq_u8(a, b) } +} +#[doc = "Unsigned saturating Accumulate of Signed value."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqadd_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(usqadd))] +pub fn vsqadd_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.usqadd.v4i16" + )] + fn _vsqadd_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; + } + unsafe { _vsqadd_u16(a, b) } +} +#[doc = "Unsigned saturating Accumulate of Signed value."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqaddq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(usqadd))] +pub fn vsqaddq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.usqadd.v8i16" + )] + fn _vsqaddq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; + } + unsafe { _vsqaddq_u16(a, b) } +} +#[doc = "Unsigned saturating Accumulate of Signed value."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqadd_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(usqadd))] +pub fn vsqadd_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.usqadd.v2i32" + )] + fn _vsqadd_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; + } + unsafe { _vsqadd_u32(a, b) } +} +#[doc = "Unsigned saturating Accumulate of Signed value."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqaddq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(usqadd))] +pub fn vsqaddq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.usqadd.v4i32" + )] + fn _vsqaddq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + } + unsafe { _vsqaddq_u32(a, b) } +} +#[doc = "Unsigned saturating Accumulate of Signed value."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqadd_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(usqadd))] +pub fn vsqadd_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.usqadd.v1i64" + )] + fn _vsqadd_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; + } + unsafe { _vsqadd_u64(a, b) } +} +#[doc = "Unsigned saturating Accumulate of Signed value."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqaddq_u64)"] +#[inline] +#[target_feature(enable = "neon")] 
+#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(usqadd))] +pub fn vsqaddq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.usqadd.v2i64" + )] + fn _vsqaddq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; + } + unsafe { _vsqaddq_u64(a, b) } +} +#[doc = "Unsigned saturating accumulate of signed value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqaddb_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(usqadd))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsqaddb_u8(a: u8, b: i8) -> u8 { + unsafe { simd_extract!(vsqadd_u8(vdup_n_u8(a), vdup_n_s8(b)), 0) } +} +#[doc = "Unsigned saturating accumulate of signed value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqaddh_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(usqadd))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsqaddh_u16(a: u16, b: i16) -> u16 { + unsafe { simd_extract!(vsqadd_u16(vdup_n_u16(a), vdup_n_s16(b)), 0) } +} +#[doc = "Unsigned saturating accumulate of signed value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqaddd_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(usqadd))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsqaddd_u64(a: u64, b: i64) -> u64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.usqadd.i64" + )] + fn _vsqaddd_u64(a: u64, b: i64) -> u64; + } + unsafe { _vsqaddd_u64(a, b) } +} +#[doc = "Unsigned saturating accumulate of signed value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqadds_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(usqadd))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsqadds_u32(a: u32, b: i32) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.usqadd.i32" + )] + fn _vsqadds_u32(a: u32, b: i32) -> u32; + } + unsafe { _vsqadds_u32(a, b) } +} +#[doc = "Calculates the square root of each lane."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqrt_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fsqrt))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vsqrt_f16(a: float16x4_t) -> float16x4_t { + unsafe { simd_fsqrt(a) } +} +#[doc = "Calculates the square root of each lane."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqrtq_f16)"] +#[inline] +#[cfg_attr(test, assert_instr(fsqrt))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vsqrtq_f16(a: float16x8_t) -> float16x8_t { + unsafe { simd_fsqrt(a) } +} +#[doc = "Calculates the square root of each lane."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqrt_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fsqrt))] 
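Usage sketch (illustrative, not part of the vendored source): the usqadd family accumulates a signed vector into an unsigned one and saturates in both directions instead of wrapping. The names below are invented for the example and assume an AArch64 target with NEON.

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
fn saturating_accumulate_demo() -> (u8, u8) {
    use core::arch::aarch64::*;
    // Illustrative only: 250 + 10 would wrap for u8; USQADD clamps it to 255.
    let clamped_high = vsqadd_u8(vdup_n_u8(250), vdup_n_s8(10));
    // 50 + (-100) would underflow; USQADD clamps it to 0.
    let clamped_low = vsqadd_u8(vdup_n_u8(50), vdup_n_s8(-100));
    (vget_lane_u8::<0>(clamped_high), vget_lane_u8::<0>(clamped_low))
}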
+#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsqrt_f32(a: float32x2_t) -> float32x2_t { + unsafe { simd_fsqrt(a) } +} +#[doc = "Calculates the square root of each lane."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqrtq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fsqrt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsqrtq_f32(a: float32x4_t) -> float32x4_t { + unsafe { simd_fsqrt(a) } +} +#[doc = "Calculates the square root of each lane."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqrt_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fsqrt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsqrt_f64(a: float64x1_t) -> float64x1_t { + unsafe { simd_fsqrt(a) } +} +#[doc = "Calculates the square root of each lane."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqrtq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fsqrt))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsqrtq_f64(a: float64x2_t) -> float64x2_t { + unsafe { simd_fsqrt(a) } +} +#[doc = "Floating-point round to integral, using current rounding mode"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqrth_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(fsqrt))] +pub fn vsqrth_f16(a: f16) -> f16 { + unsafe { sqrtf16(a) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsri_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsri.v8i8" + )] + fn _vsri_n_s8(a: int8x8_t, b: int8x8_t, n: i32) -> int8x8_t; + } + unsafe { _vsri_n_s8(a, b, N) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsriq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsri.v16i8" + )] + fn _vsriq_n_s8(a: int8x16_t, b: int8x16_t, n: i32) -> int8x16_t; + } + unsafe { _vsriq_n_s8(a, b, N) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsri_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert!(N >= 1 && 
N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsri.v4i16" + )] + fn _vsri_n_s16(a: int16x4_t, b: int16x4_t, n: i32) -> int16x4_t; + } + unsafe { _vsri_n_s16(a, b, N) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsriq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsri.v8i16" + )] + fn _vsriq_n_s16(a: int16x8_t, b: int16x8_t, n: i32) -> int16x8_t; + } + unsafe { _vsriq_n_s16(a, b, N) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsri_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsri.v2i32" + )] + fn _vsri_n_s32(a: int32x2_t, b: int32x2_t, n: i32) -> int32x2_t; + } + unsafe { _vsri_n_s32(a, b, N) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsriq_n_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsri.v4i32" + )] + fn _vsriq_n_s32(a: int32x4_t, b: int32x4_t, n: i32) -> int32x4_t; + } + unsafe { _vsriq_n_s32(a, b, N) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsri_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + static_assert!(N >= 1 && N <= 64); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsri.v1i64" + )] + fn _vsri_n_s64(a: int64x1_t, b: int64x1_t, n: i32) -> int64x1_t; + } + unsafe { _vsri_n_s64(a, b, N) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsriq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + static_assert!(N >= 1 && N <= 64); + unsafe 
extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vsri.v2i64" + )] + fn _vsriq_n_s64(a: int64x2_t, b: int64x2_t, n: i32) -> int64x2_t; + } + unsafe { _vsriq_n_s64(a, b, N) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsri_n_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe { transmute(vsri_n_s8::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsriq_n_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { transmute(vsriq_n_s8::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsri_n_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe { transmute(vsri_n_s16::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsriq_n_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { transmute(vsriq_n_s16::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsri_n_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe { transmute(vsri_n_s32::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsriq_n_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe { transmute(vsriq_n_s32::<N>(transmute(a), transmute(b))) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_u64)"] +#[inline] +#[target_feature(enable = "neon")]
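Usage sketch (illustrative, not part of the vendored source): vsri is the mirror image of vsli; it shifts the source right by N and preserves the top N bits of the destination. The helper below is invented for illustration and assumes an AArch64 target with NEON.

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
fn sri_demo() -> u8 {
    use core::arch::aarch64::*;
    // Illustrative only: merge a right-shifted value under a kept high nibble.
    let dst = vdup_n_u8(0xf0); // top nibble is preserved
    let src = vdup_n_u8(0xab); // shifted right by 4 before insertion
    // (0xab >> 4) = 0x0a under the preserved 0xf0 high bits gives 0xfa per lane.
    vget_lane_u8::<0>(vsri_n_u8::<4>(dst, src))
}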
+#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsri_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + static_assert!(N >= 1 && N <= 64); + unsafe { transmute(vsri_n_s64::(transmute(a), transmute(b))) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsriq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + static_assert!(N >= 1 && N <= 64); + unsafe { transmute(vsriq_n_s64::(transmute(a), transmute(b))) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsri_n_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe { transmute(vsri_n_s8::(transmute(a), transmute(b))) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsriq_n_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { transmute(vsriq_n_s8::(transmute(a), transmute(b))) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsri_n_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe { transmute(vsri_n_s16::(transmute(a), transmute(b))) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsriq_n_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { transmute(vsriq_n_s16::(transmute(a), transmute(b))) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_p64)"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsri_n_p64(a: poly64x1_t, b: poly64x1_t) -> poly64x1_t { + static_assert!(N >= 1 && N <= 64); + unsafe { transmute(vsri_n_s64::(transmute(a), transmute(b))) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_p64)"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(sri, N = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsriq_n_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + static_assert!(N >= 1 && N <= 64); + unsafe { transmute(vsriq_n_s64::(transmute(a), transmute(b))) } +} +#[doc = "Shift right and insert"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsrid_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(sri, N = 2))] +pub fn vsrid_n_s64(a: i64, b: i64) -> i64 { + static_assert!(N >= 1 && N <= 64); + unsafe { transmute(vsri_n_s64::(transmute(a), transmute(b))) } +} +#[doc = "Shift right and insert"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsrid_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(sri, N = 2))] +pub fn vsrid_n_u64(a: u64, b: u64) -> u64 { + static_assert!(N >= 1 && N <= 64); + unsafe { transmute(vsri_n_u64::(transmute(a), transmute(b))) } +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst1_f16(ptr: *mut f16, a: float16x4_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst1q_f16(ptr: *mut f16, a: float16x8_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_f32(ptr: *mut f32, a: float32x2_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] 
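Usage sketch (illustrative, not part of the vendored source): the vst1 stores are unsafe because they write a whole vector through a raw, possibly unaligned pointer, and the caller promises the destination is valid for that many elements. The names below are invented for the example and assume an AArch64 target with NEON.

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
fn store_demo() -> [f32; 2] {
    use core::arch::aarch64::*;
    // Illustrative only: write both lanes of a float32x2_t into an array.
    let mut out = [0.0f32; 2];
    let v = vdup_n_f32(1.5);
    // SAFETY: `out` is valid for two consecutive f32 writes.
    unsafe { vst1_f32(out.as_mut_ptr(), v) };
    out
}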
+#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_f32(ptr: *mut f32, a: float32x4_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_f64(ptr: *mut f64, a: float64x1_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_f64(ptr: *mut f64, a: float64x2_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_s8(ptr: *mut i8, a: int8x8_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_s8(ptr: *mut i8, a: int8x16_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_s16(ptr: *mut i16, a: int16x4_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_s16(ptr: *mut i16, a: int16x8_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four 
registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_s32(ptr: *mut i32, a: int32x2_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_s32(ptr: *mut i32, a: int32x4_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_s64(ptr: *mut i64, a: int64x1_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_s64(ptr: *mut i64, a: int64x2_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_u8(ptr: *mut u8, a: uint8x8_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_u8(ptr: *mut u8, a: uint8x16_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, 
assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_u16(ptr: *mut u16, a: uint16x4_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_u16(ptr: *mut u16, a: uint16x8_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_u32(ptr: *mut u32, a: uint32x2_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_u32(ptr: *mut u32, a: uint32x4_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_u64(ptr: *mut u64, a: uint64x1_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_u64(ptr: *mut u64, a: uint64x2_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_p8(ptr: *mut p8, a: poly8x8_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple 
single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_p8(ptr: *mut p8, a: poly8x16_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_p16(ptr: *mut p16, a: poly16x4_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_p16(ptr: *mut p16, a: poly16x8_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_p64(ptr: *mut p64, a: poly64x1_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(str))] +#[allow(clippy::cast_ptr_alignment)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_p64(ptr: *mut p64, a: poly64x2_t) { + crate::ptr::write_unaligned(ptr.cast(), a) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st1))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_f64_x2(a: *mut f64, b: float64x1x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x2.v1f64.p0" + )] + fn _vst1_f64_x2(a: float64x1_t, b: float64x1_t, ptr: *mut f64); + } + _vst1_f64_x2(b.0, b.1, a) +} +#[doc = "Store multiple single-element structures to one, two, three, or four 
registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st1))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_f64_x2(a: *mut f64, b: float64x2x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x2.v2f64.p0" + )] + fn _vst1q_f64_x2(a: float64x2_t, b: float64x2_t, ptr: *mut f64); + } + _vst1q_f64_x2(b.0, b.1, a) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st1))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_f64_x3(a: *mut f64, b: float64x1x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x3.v1f64.p0" + )] + fn _vst1_f64_x3(a: float64x1_t, b: float64x1_t, c: float64x1_t, ptr: *mut f64); + } + _vst1_f64_x3(b.0, b.1, b.2, a) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st1))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_f64_x3(a: *mut f64, b: float64x2x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x3.v2f64.p0" + )] + fn _vst1q_f64_x3(a: float64x2_t, b: float64x2_t, c: float64x2_t, ptr: *mut f64); + } + _vst1q_f64_x3(b.0, b.1, b.2, a) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st1))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_f64_x4(a: *mut f64, b: float64x1x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x4.v1f64.p0" + )] + fn _vst1_f64_x4( + a: float64x1_t, + b: float64x1_t, + c: float64x1_t, + d: float64x1_t, + ptr: *mut f64, + ); + } + _vst1_f64_x4(b.0, b.1, b.2, b.3, a) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st1))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_f64_x4(a: *mut f64, b: float64x2x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x4.v2f64.p0" 
+ )] + fn _vst1q_f64_x4( + a: float64x2_t, + b: float64x2_t, + c: float64x2_t, + d: float64x2_t, + ptr: *mut f64, + ); + } + _vst1q_f64_x4(b.0, b.1, b.2, b.3, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_lane_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_lane_f64(a: *mut f64, b: float64x1_t) { + static_assert!(LANE == 0); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_lane_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_lane_f64(a: *mut f64, b: float64x2_t) { + static_assert_uimm_bits!(LANE, 1); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst2_f64(a: *mut f64, b: float64x1x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2.v1f64.p0" + )] + fn _vst2_f64(a: float64x1_t, b: float64x1_t, ptr: *mut i8); + } + _vst2_f64(b.0, b.1, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2_lane_f64(a: *mut f64, b: float64x1x2_t) { + static_assert!(LANE == 0); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2lane.v1f64.p0" + )] + fn _vst2_lane_f64(a: float64x1_t, b: float64x1_t, n: i64, ptr: *mut i8); + } + _vst2_lane_f64(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2_lane_s64(a: *mut i64, b: int64x1x2_t) { + static_assert!(LANE == 0); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2lane.v1i64.p0" + )] + fn 
_vst2_lane_s64(a: int64x1_t, b: int64x1_t, n: i64, ptr: *mut i8); + } + _vst2_lane_s64(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2_lane_p64(a: *mut p64, b: poly64x1x2_t) { + static_assert!(LANE == 0); + vst2_lane_s64::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2_lane_u64(a: *mut u64, b: uint64x1x2_t) { + static_assert!(LANE == 0); + vst2_lane_s64::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2q_f64(a: *mut f64, b: float64x2x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2.v2f64.p0" + )] + fn _vst2q_f64(a: float64x2_t, b: float64x2_t, ptr: *mut i8); + } + _vst2q_f64(b.0, b.1, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2q_s64(a: *mut i64, b: int64x2x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2.v2i64.p0" + )] + fn _vst2q_s64(a: int64x2_t, b: int64x2_t, ptr: *mut i8); + } + _vst2q_s64(b.0, b.1, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2q_lane_f64(a: *mut f64, b: float64x2x2_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2lane.v2f64.p0" + )] + fn _vst2q_lane_f64(a: float64x2_t, b: float64x2_t, n: i64, ptr: *mut i8); + } + _vst2q_lane_f64(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = 
"[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2q_lane_s8(a: *mut i8, b: int8x16x2_t) { + static_assert_uimm_bits!(LANE, 4); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2lane.v16i8.p0" + )] + fn _vst2q_lane_s8(a: int8x16_t, b: int8x16_t, n: i64, ptr: *mut i8); + } + _vst2q_lane_s8(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2q_lane_s64(a: *mut i64, b: int64x2x2_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2lane.v2i64.p0" + )] + fn _vst2q_lane_s64(a: int64x2_t, b: int64x2_t, n: i64, ptr: *mut i8); + } + _vst2q_lane_s64(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2q_lane_p64(a: *mut p64, b: poly64x2x2_t) { + static_assert_uimm_bits!(LANE, 1); + vst2q_lane_s64::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2q_lane_u8(a: *mut u8, b: uint8x16x2_t) { + static_assert_uimm_bits!(LANE, 4); + vst2q_lane_s8::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2q_lane_u64(a: *mut u64, b: uint64x2x2_t) { + static_assert_uimm_bits!(LANE, 1); + vst2q_lane_s64::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2q_lane_p8(a: *mut p8, b: poly8x16x2_t) { + static_assert_uimm_bits!(LANE, 4); + vst2q_lane_s8::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(st2))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2q_p64(a: *mut p64, b: poly64x2x2_t) { + vst2q_s64(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2q_u64(a: *mut u64, b: uint64x2x2_t) { + vst2q_s64(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vst3_f64(a: *mut f64, b: float64x1x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3.v1f64.p0" + )] + fn _vst3_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t, ptr: *mut i8); + } + _vst3_f64(b.0, b.1, b.2, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst3_lane_f64(a: *mut f64, b: float64x1x3_t) { + static_assert!(LANE == 0); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3lane.v1f64.p0" + )] + fn _vst3_lane_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t, n: i64, ptr: *mut i8); + } + _vst3_lane_f64(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst3_lane_s64(a: *mut i64, b: int64x1x3_t) { + static_assert!(LANE == 0); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3lane.v1i64.p0" + )] + fn _vst3_lane_s64(a: 
int64x1_t, b: int64x1_t, c: int64x1_t, n: i64, ptr: *mut i8); + } + _vst3_lane_s64(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst3_lane_p64(a: *mut p64, b: poly64x1x3_t) { + static_assert!(LANE == 0); + vst3_lane_s64::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst3_lane_u64(a: *mut u64, b: uint64x1x3_t) { + static_assert!(LANE == 0); + vst3_lane_s64::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3q_f64(a: *mut f64, b: float64x2x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3.v2f64.p0" + )] + fn _vst3q_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t, ptr: *mut i8); + } + _vst3q_f64(b.0, b.1, b.2, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3q_s64(a: *mut i64, b: int64x2x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3.v2i64.p0" + )] + fn _vst3q_s64(a: int64x2_t, b: int64x2_t, c: int64x2_t, ptr: *mut i8); + } + _vst3q_s64(b.0, b.1, b.2, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst3q_lane_f64(a: *mut f64, b: float64x2x3_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3lane.v2f64.p0" + )] + fn _vst3q_lane_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t, n: i64, ptr: *mut i8); + } + _vst3q_lane_f64(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc 
= "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst3q_lane_s8(a: *mut i8, b: int8x16x3_t) { + static_assert_uimm_bits!(LANE, 4); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3lane.v16i8.p0" + )] + fn _vst3q_lane_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t, n: i64, ptr: *mut i8); + } + _vst3q_lane_s8(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst3q_lane_s64(a: *mut i64, b: int64x2x3_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3lane.v2i64.p0" + )] + fn _vst3q_lane_s64(a: int64x2_t, b: int64x2_t, c: int64x2_t, n: i64, ptr: *mut i8); + } + _vst3q_lane_s64(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst3q_lane_p64(a: *mut p64, b: poly64x2x3_t) { + static_assert_uimm_bits!(LANE, 1); + vst3q_lane_s64::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst3q_lane_u8(a: *mut u8, b: uint8x16x3_t) { + static_assert_uimm_bits!(LANE, 4); + vst3q_lane_s8::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst3q_lane_u64(a: *mut u64, b: uint64x2x3_t) { + static_assert_uimm_bits!(LANE, 1); + vst3q_lane_s64::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst3q_lane_p8(a: *mut p8, b: poly8x16x3_t) { + static_assert_uimm_bits!(LANE, 4); + vst3q_lane_s8::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3q_p64(a: *mut p64, b: poly64x2x3_t) { + vst3q_s64(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3q_u64(a: *mut u64, b: uint64x2x3_t) { + vst3q_s64(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vst4_f64(a: *mut f64, b: float64x1x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4.v1f64.p0" + )] + fn _vst4_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t, d: float64x1_t, ptr: *mut i8); + } + _vst4_f64(b.0, b.1, b.2, b.3, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst4_lane_f64(a: *mut f64, b: float64x1x4_t) { + static_assert!(LANE == 0); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4lane.v1f64.p0" + )] + fn _vst4_lane_f64( + a: float64x1_t, + b: float64x1_t, + c: float64x1_t, + d: float64x1_t, + n: i64, + ptr: *mut i8, + ); + } + _vst4_lane_f64(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst4_lane_s64(a: *mut i64, b: 
int64x1x4_t) { + static_assert!(LANE == 0); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4lane.v1i64.p0" + )] + fn _vst4_lane_s64( + a: int64x1_t, + b: int64x1_t, + c: int64x1_t, + d: int64x1_t, + n: i64, + ptr: *mut i8, + ); + } + _vst4_lane_s64(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst4_lane_p64(a: *mut p64, b: poly64x1x4_t) { + static_assert!(LANE == 0); + vst4_lane_s64::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst4_lane_u64(a: *mut u64, b: uint64x1x4_t) { + static_assert!(LANE == 0); + vst4_lane_s64::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4q_f64(a: *mut f64, b: float64x2x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4.v2f64.p0" + )] + fn _vst4q_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t, d: float64x2_t, ptr: *mut i8); + } + _vst4q_f64(b.0, b.1, b.2, b.3, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4q_s64(a: *mut i64, b: int64x2x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4.v2i64.p0" + )] + fn _vst4q_s64(a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t, ptr: *mut i8); + } + _vst4q_s64(b.0, b.1, b.2, b.3, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_f64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst4q_lane_f64(a: *mut f64, b: float64x2x4_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern 
"unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4lane.v2f64.p0" + )] + fn _vst4q_lane_f64( + a: float64x2_t, + b: float64x2_t, + c: float64x2_t, + d: float64x2_t, + n: i64, + ptr: *mut i8, + ); + } + _vst4q_lane_f64(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst4q_lane_s8(a: *mut i8, b: int8x16x4_t) { + static_assert_uimm_bits!(LANE, 4); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4lane.v16i8.p0" + )] + fn _vst4q_lane_s8( + a: int8x16_t, + b: int8x16_t, + c: int8x16_t, + d: int8x16_t, + n: i64, + ptr: *mut i8, + ); + } + _vst4q_lane_s8(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst4q_lane_s64(a: *mut i64, b: int64x2x4_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4lane.v2i64.p0" + )] + fn _vst4q_lane_s64( + a: int64x2_t, + b: int64x2_t, + c: int64x2_t, + d: int64x2_t, + n: i64, + ptr: *mut i8, + ); + } + _vst4q_lane_s64(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst4q_lane_p64(a: *mut p64, b: poly64x2x4_t) { + static_assert_uimm_bits!(LANE, 1); + vst4q_lane_s64::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst4q_lane_u8(a: *mut u8, b: uint8x16x4_t) { + static_assert_uimm_bits!(LANE, 4); + vst4q_lane_s8::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = 
"neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst4q_lane_u64(a: *mut u64, b: uint64x2x4_t) { + static_assert_uimm_bits!(LANE, 1); + vst4q_lane_s64::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst4q_lane_p8(a: *mut p8, b: poly8x16x4_t) { + static_assert_uimm_bits!(LANE, 4); + vst4q_lane_s8::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4q_p64(a: *mut p64, b: poly64x2x4_t) { + vst4q_s64(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4q_u64(a: *mut u64, b: uint64x2x4_t) { + vst4q_s64(transmute(a), transmute(b)) +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsub_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fsub))] +pub fn vsub_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubq_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(fsub))] +pub fn vsubq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vsubd_s64(a: i64, b: i64) -> i64 { + a.wrapping_sub(b) +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubd_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vsubd_u64(a: u64, b: u64) -> u64 { + a.wrapping_sub(b) +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubh_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(nop))] +pub fn 
vsubh_f16(a: f16, b: f16) -> f16 { + a - b +} +#[doc = "Signed Subtract Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubl_high_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ssubl))] +pub fn vsubl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { + unsafe { + let c: int8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let d: int16x8_t = simd_cast(c); + let e: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let f: int16x8_t = simd_cast(e); + simd_sub(d, f) + } +} +#[doc = "Signed Subtract Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubl_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ssubl))] +pub fn vsubl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { + unsafe { + let c: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); + let d: int32x4_t = simd_cast(c); + let e: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + let f: int32x4_t = simd_cast(e); + simd_sub(d, f) + } +} +#[doc = "Signed Subtract Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubl_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ssubl))] +pub fn vsubl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { + unsafe { + let c: int32x2_t = simd_shuffle!(a, a, [2, 3]); + let d: int64x2_t = simd_cast(c); + let e: int32x2_t = simd_shuffle!(b, b, [2, 3]); + let f: int64x2_t = simd_cast(e); + simd_sub(d, f) + } +} +#[doc = "Unsigned Subtract Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubl_high_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(usubl))] +pub fn vsubl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { + unsafe { + let c: uint8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let d: uint16x8_t = simd_cast(c); + let e: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let f: uint16x8_t = simd_cast(e); + simd_sub(d, f) + } +} +#[doc = "Unsigned Subtract Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubl_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(usubl))] +pub fn vsubl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { + unsafe { + let c: uint16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); + let d: uint32x4_t = simd_cast(c); + let e: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + let f: uint32x4_t = simd_cast(e); + simd_sub(d, f) + } +} +#[doc = "Unsigned Subtract Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubl_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(usubl))] +pub fn vsubl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { + unsafe { + let c: uint32x2_t = simd_shuffle!(a, a, [2, 3]); + let d: uint64x2_t = simd_cast(c); + let e: uint32x2_t = simd_shuffle!(b, b, [2, 3]); + let f: uint64x2_t = 
simd_cast(e); + simd_sub(d, f) + } +} +#[doc = "Signed Subtract Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ssubw))] +pub fn vsubw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t { + unsafe { + let c: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + simd_sub(a, simd_cast(c)) + } +} +#[doc = "Signed Subtract Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ssubw))] +pub fn vsubw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { + unsafe { + let c: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + simd_sub(a, simd_cast(c)) + } +} +#[doc = "Signed Subtract Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ssubw))] +pub fn vsubw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { + unsafe { + let c: int32x2_t = simd_shuffle!(b, b, [2, 3]); + simd_sub(a, simd_cast(c)) + } +} +#[doc = "Unsigned Subtract Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(usubw))] +pub fn vsubw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t { + unsafe { + let c: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + simd_sub(a, simd_cast(c)) + } +} +#[doc = "Unsigned Subtract Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(usubw))] +pub fn vsubw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { + unsafe { + let c: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + simd_sub(a, simd_cast(c)) + } +} +#[doc = "Unsigned Subtract Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(usubw))] +pub fn vsubw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { + unsafe { + let c: uint32x2_t = simd_shuffle!(b, b, [2, 3]); + simd_sub(a, simd_cast(c)) + } +} +#[doc = "Dot product index form with signed and unsigned integers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudot_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(test, assert_instr(sudot, LANE = 3))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_i8mm", issue = "117223")] +pub fn vsudot_laneq_s32(a: int32x2_t, b: int8x8_t, c: uint8x16_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: uint32x4_t = transmute(c); + let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vusdot_s32(a, transmute(c), b) + } +} +#[doc = "Dot product index 
form with signed and unsigned integers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudotq_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(test, assert_instr(sudot, LANE = 3))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_i8mm", issue = "117223")] +pub fn vsudotq_laneq_s32(a: int32x4_t, b: int8x16_t, c: uint8x16_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: uint32x4_t = transmute(c); + let c: uint32x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vusdotq_s32(a, transmute(c), b) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl1_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + vqtbl1_s8(vcombine_s8(a, unsafe { crate::mem::zeroed() }), unsafe { + { + transmute(b) + } + }) +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl1_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + vqtbl1_u8(vcombine_u8(a, unsafe { crate::mem::zeroed() }), b) +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl1_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl1_p8(a: poly8x8_t, b: uint8x8_t) -> poly8x8_t { + vqtbl1_p8(vcombine_p8(a, unsafe { crate::mem::zeroed() }), b) +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl2_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl2_s8(a: int8x8x2_t, b: int8x8_t) -> int8x8_t { + unsafe { vqtbl1(transmute(vcombine_s8(a.0, a.1)), transmute(b)) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl2_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl2_u8(a: uint8x8x2_t, b: uint8x8_t) -> uint8x8_t { + unsafe { transmute(vqtbl1(transmute(vcombine_u8(a.0, a.1)), b)) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl2_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl2_u8(a: uint8x8x2_t, b: uint8x8_t) -> uint8x8_t { + let mut a: uint8x8x2_t = a; + a.0 = unsafe { simd_shuffle!(a.0, a.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.1 = unsafe { simd_shuffle!(a.1, a.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vqtbl1(transmute(vcombine_u8(a.0, a.1)), b)); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 
1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl2_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl2_p8(a: poly8x8x2_t, b: uint8x8_t) -> poly8x8_t { + unsafe { transmute(vqtbl1(transmute(vcombine_p8(a.0, a.1)), b)) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl2_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl2_p8(a: poly8x8x2_t, b: uint8x8_t) -> poly8x8_t { + let mut a: poly8x8x2_t = a; + a.0 = unsafe { simd_shuffle!(a.0, a.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.1 = unsafe { simd_shuffle!(a.1, a.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vqtbl1(transmute(vcombine_p8(a.0, a.1)), b)); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl3_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl3_s8(a: int8x8x3_t, b: int8x8_t) -> int8x8_t { + let x = int8x16x2_t( + vcombine_s8(a.0, a.1), + vcombine_s8(a.2, unsafe { crate::mem::zeroed() }), + ); + unsafe { transmute(vqtbl2(transmute(x.0), transmute(x.1), transmute(b))) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl3_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl3_u8(a: uint8x8x3_t, b: uint8x8_t) -> uint8x8_t { + let x = uint8x16x2_t( + vcombine_u8(a.0, a.1), + vcombine_u8(a.2, unsafe { crate::mem::zeroed() }), + ); + unsafe { transmute(vqtbl2(transmute(x.0), transmute(x.1), b)) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl3_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl3_u8(a: uint8x8x3_t, b: uint8x8_t) -> uint8x8_t { + let mut a: uint8x8x3_t = a; + a.0 = unsafe { simd_shuffle!(a.0, a.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.1 = unsafe { simd_shuffle!(a.1, a.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.2 = unsafe { simd_shuffle!(a.2, a.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let x = uint8x16x2_t( + vcombine_u8(a.0, a.1), + vcombine_u8(a.2, unsafe { crate::mem::zeroed() }), + ); + unsafe { + let ret_val: uint8x8_t = transmute(vqtbl2(transmute(x.0), transmute(x.1), b)); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl3_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] 
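+// Illustrative sketch (not part of the generated bindings): `vtbl2_u8` treats its
+// register pair as a single 16-byte table, so indices 0..=15 select a byte and
+// anything larger produces 0. The `demo_vtbl2` name and the byte values are
+// assumptions; an AArch64 target with Neon is assumed.
+//
+// #[cfg(target_arch = "aarch64")]
+// #[target_feature(enable = "neon")]
+// fn demo_vtbl2() {
+//     use core::arch::aarch64::*;
+//     unsafe {
+//         let table = uint8x8x2_t(
+//             vld1_u8([0u8, 1, 2, 3, 4, 5, 6, 7].as_ptr()),
+//             vld1_u8([8u8, 9, 10, 11, 12, 13, 14, 15].as_ptr()),
+//         );
+//         let idx = vld1_u8([15u8, 0, 8, 7, 1, 1, 200, 3].as_ptr());
+//         // Each table byte equals its index, so the result is
+//         // [15, 0, 8, 7, 1, 1, 0, 3]; index 200 is out of range and maps to 0.
+//         let _looked_up: uint8x8_t = vtbl2_u8(table, idx);
+//     }
+// }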
+#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl3_p8(a: poly8x8x3_t, b: uint8x8_t) -> poly8x8_t { + let x = poly8x16x2_t( + vcombine_p8(a.0, a.1), + vcombine_p8(a.2, unsafe { crate::mem::zeroed() }), + ); + unsafe { transmute(vqtbl2(transmute(x.0), transmute(x.1), b)) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl3_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl3_p8(a: poly8x8x3_t, b: uint8x8_t) -> poly8x8_t { + let mut a: poly8x8x3_t = a; + a.0 = unsafe { simd_shuffle!(a.0, a.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.1 = unsafe { simd_shuffle!(a.1, a.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.2 = unsafe { simd_shuffle!(a.2, a.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let x = poly8x16x2_t( + vcombine_p8(a.0, a.1), + vcombine_p8(a.2, unsafe { crate::mem::zeroed() }), + ); + unsafe { + let ret_val: poly8x8_t = transmute(vqtbl2(transmute(x.0), transmute(x.1), b)); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl4_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl4_s8(a: int8x8x4_t, b: int8x8_t) -> int8x8_t { + let x = int8x16x2_t(vcombine_s8(a.0, a.1), vcombine_s8(a.2, a.3)); + unsafe { transmute(vqtbl2(transmute(x.0), transmute(x.1), transmute(b))) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl4_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl4_u8(a: uint8x8x4_t, b: uint8x8_t) -> uint8x8_t { + let x = uint8x16x2_t(vcombine_u8(a.0, a.1), vcombine_u8(a.2, a.3)); + unsafe { transmute(vqtbl2(transmute(x.0), transmute(x.1), b)) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl4_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl4_u8(a: uint8x8x4_t, b: uint8x8_t) -> uint8x8_t { + let mut a: uint8x8x4_t = a; + a.0 = unsafe { simd_shuffle!(a.0, a.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.1 = unsafe { simd_shuffle!(a.1, a.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.2 = unsafe { simd_shuffle!(a.2, a.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.3 = unsafe { simd_shuffle!(a.3, a.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let x = uint8x16x2_t(vcombine_u8(a.0, a.1), vcombine_u8(a.2, a.3)); + unsafe { + let ret_val: uint8x8_t = transmute(vqtbl2(transmute(x.0), transmute(x.1), b)); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl4_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since 
= "1.59.0")] +pub fn vtbl4_p8(a: poly8x8x4_t, b: uint8x8_t) -> poly8x8_t { + let x = poly8x16x2_t(vcombine_p8(a.0, a.1), vcombine_p8(a.2, a.3)); + unsafe { transmute(vqtbl2(transmute(x.0), transmute(x.1), b)) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl4_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbl4_p8(a: poly8x8x4_t, b: uint8x8_t) -> poly8x8_t { + let mut a: poly8x8x4_t = a; + a.0 = unsafe { simd_shuffle!(a.0, a.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.1 = unsafe { simd_shuffle!(a.1, a.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.2 = unsafe { simd_shuffle!(a.2, a.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.3 = unsafe { simd_shuffle!(a.3, a.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let x = poly8x16x2_t(vcombine_p8(a.0, a.1), vcombine_p8(a.2, a.3)); + unsafe { + let ret_val: poly8x8_t = transmute(vqtbl2(transmute(x.0), transmute(x.1), b)); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx1_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx1_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + unsafe { + simd_select( + simd_lt::(c, transmute(i8x8::splat(8))), + transmute(vqtbx1( + transmute(a), + transmute(vcombine_s8(b, crate::mem::zeroed())), + transmute(c), + )), + a, + ) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx1_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx1_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + unsafe { + simd_select( + simd_lt::(c, transmute(u8x8::splat(8))), + transmute(vqtbx1( + transmute(a), + transmute(vcombine_u8(b, crate::mem::zeroed())), + c, + )), + a, + ) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx1_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx1_p8(a: poly8x8_t, b: poly8x8_t, c: uint8x8_t) -> poly8x8_t { + unsafe { + simd_select( + simd_lt::(c, transmute(u8x8::splat(8))), + transmute(vqtbx1( + transmute(a), + transmute(vcombine_p8(b, crate::mem::zeroed())), + c, + )), + a, + ) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx2_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx2_s8(a: int8x8_t, b: int8x8x2_t, c: int8x8_t) -> int8x8_t { + unsafe { vqtbx1(transmute(a), transmute(vcombine_s8(b.0, b.1)), transmute(c)) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx2_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, 
assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx2_u8(a: uint8x8_t, b: uint8x8x2_t, c: uint8x8_t) -> uint8x8_t { + unsafe { transmute(vqtbx1(transmute(a), transmute(vcombine_u8(b.0, b.1)), c)) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx2_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx2_u8(a: uint8x8_t, b: uint8x8x2_t, c: uint8x8_t) -> uint8x8_t { + let mut b: uint8x8x2_t = b; + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { simd_shuffle!(b.0, b.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.1 = unsafe { simd_shuffle!(b.1, b.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = + transmute(vqtbx1(transmute(a), transmute(vcombine_u8(b.0, b.1)), c)); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx2_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx2_p8(a: poly8x8_t, b: poly8x8x2_t, c: uint8x8_t) -> poly8x8_t { + unsafe { transmute(vqtbx1(transmute(a), transmute(vcombine_p8(b.0, b.1)), c)) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx2_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx2_p8(a: poly8x8_t, b: poly8x8x2_t, c: uint8x8_t) -> poly8x8_t { + let mut b: poly8x8x2_t = b; + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { simd_shuffle!(b.0, b.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.1 = unsafe { simd_shuffle!(b.1, b.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = + transmute(vqtbx1(transmute(a), transmute(vcombine_p8(b.0, b.1)), c)); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx3_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx3_s8(a: int8x8_t, b: int8x8x3_t, c: int8x8_t) -> int8x8_t { + let x = int8x16x2_t( + vcombine_s8(b.0, b.1), + vcombine_s8(b.2, unsafe { crate::mem::zeroed() }), + ); + unsafe { + transmute(simd_select( + simd_lt::(transmute(c), transmute(i8x8::splat(24))), + transmute(vqtbx2( + transmute(a), + transmute(x.0), + transmute(x.1), + transmute(c), + )), + a, + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx3_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx3_u8(a: uint8x8_t, 
b: uint8x8x3_t, c: uint8x8_t) -> uint8x8_t { + let x = uint8x16x2_t( + vcombine_u8(b.0, b.1), + vcombine_u8(b.2, unsafe { crate::mem::zeroed() }), + ); + unsafe { + transmute(simd_select( + simd_lt::(transmute(c), transmute(u8x8::splat(24))), + transmute(vqtbx2(transmute(a), transmute(x.0), transmute(x.1), c)), + a, + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx3_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx3_u8(a: uint8x8_t, b: uint8x8x3_t, c: uint8x8_t) -> uint8x8_t { + let mut b: uint8x8x3_t = b; + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { simd_shuffle!(b.0, b.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.1 = unsafe { simd_shuffle!(b.1, b.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.2 = unsafe { simd_shuffle!(b.2, b.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let x = uint8x16x2_t( + vcombine_u8(b.0, b.1), + vcombine_u8(b.2, unsafe { crate::mem::zeroed() }), + ); + unsafe { + let ret_val: uint8x8_t = transmute(simd_select( + simd_lt::(transmute(c), transmute(u8x8::splat(24))), + transmute(vqtbx2(transmute(a), transmute(x.0), transmute(x.1), c)), + a, + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx3_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx3_p8(a: poly8x8_t, b: poly8x8x3_t, c: uint8x8_t) -> poly8x8_t { + let x = poly8x16x2_t( + vcombine_p8(b.0, b.1), + vcombine_p8(b.2, unsafe { crate::mem::zeroed() }), + ); + unsafe { + transmute(simd_select( + simd_lt::(transmute(c), transmute(u8x8::splat(24))), + transmute(vqtbx2(transmute(a), transmute(x.0), transmute(x.1), c)), + a, + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx3_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx3_p8(a: poly8x8_t, b: poly8x8x3_t, c: uint8x8_t) -> poly8x8_t { + let mut b: poly8x8x3_t = b; + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { simd_shuffle!(b.0, b.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.1 = unsafe { simd_shuffle!(b.1, b.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.2 = unsafe { simd_shuffle!(b.2, b.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let x = poly8x16x2_t( + vcombine_p8(b.0, b.1), + vcombine_p8(b.2, unsafe { crate::mem::zeroed() }), + ); + unsafe { + let ret_val: poly8x8_t = transmute(simd_select( + simd_lt::(transmute(c), transmute(u8x8::splat(24))), + transmute(vqtbx2(transmute(a), transmute(x.0), transmute(x.1), c)), + a, + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx4_s8)"] +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx4_s8(a: int8x8_t, b: int8x8x4_t, c: int8x8_t) -> int8x8_t { + unsafe { + vqtbx2( + transmute(a), + transmute(vcombine_s8(b.0, b.1)), + transmute(vcombine_s8(b.2, b.3)), + transmute(c), + ) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx4_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx4_u8(a: uint8x8_t, b: uint8x8x4_t, c: uint8x8_t) -> uint8x8_t { + unsafe { + transmute(vqtbx2( + transmute(a), + transmute(vcombine_u8(b.0, b.1)), + transmute(vcombine_u8(b.2, b.3)), + c, + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx4_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx4_u8(a: uint8x8_t, b: uint8x8x4_t, c: uint8x8_t) -> uint8x8_t { + let mut b: uint8x8x4_t = b; + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { simd_shuffle!(b.0, b.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.1 = unsafe { simd_shuffle!(b.1, b.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.2 = unsafe { simd_shuffle!(b.2, b.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.3 = unsafe { simd_shuffle!(b.3, b.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vqtbx2( + transmute(a), + transmute(vcombine_u8(b.0, b.1)), + transmute(vcombine_u8(b.2, b.3)), + c, + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx4_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx4_p8(a: poly8x8_t, b: poly8x8x4_t, c: uint8x8_t) -> poly8x8_t { + unsafe { + transmute(vqtbx2( + transmute(a), + transmute(vcombine_p8(b.0, b.1)), + transmute(vcombine_p8(b.2, b.3)), + c, + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx4_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tbx))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtbx4_p8(a: poly8x8_t, b: poly8x8x4_t, c: uint8x8_t) -> poly8x8_t { + let mut b: poly8x8x4_t = b; + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { simd_shuffle!(b.0, b.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.1 = unsafe { simd_shuffle!(b.1, b.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.2 = unsafe { simd_shuffle!(b.2, b.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.3 = unsafe { simd_shuffle!(b.3, b.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vqtbx2( + transmute(a), + transmute(vcombine_p8(b.0, b.1)), + transmute(vcombine_p8(b.2, b.3)), + c, + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} 
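+// Illustrative sketch (not part of the generated bindings), contrasting plain and
+// extended table look-up with the intrinsics defined above: `vtbl1_u8` returns 0
+// for an out-of-range index, while `vtbx1_u8` keeps the corresponding byte of its
+// first argument. The `demo_tbl_vs_tbx` name and the byte values are assumptions;
+// an AArch64 target with Neon is assumed.
+//
+// #[cfg(target_arch = "aarch64")]
+// #[target_feature(enable = "neon")]
+// fn demo_tbl_vs_tbx() {
+//     use core::arch::aarch64::*;
+//     unsafe {
+//         let fallback = vdup_n_u8(0xAA);
+//         let table = vld1_u8([10u8, 11, 12, 13, 14, 15, 16, 17].as_ptr());
+//         let idx = vld1_u8([0u8, 7, 100, 3, 100, 1, 6, 2].as_ptr());
+//         // TBL: the out-of-range indices (100) become 0.
+//         let tbl: uint8x8_t = vtbl1_u8(table, idx);
+//         // TBX: the out-of-range indices leave the 0xAA bytes of `fallback` unchanged.
+//         let tbx: uint8x8_t = vtbx1_u8(fallback, table, idx);
+//         let _ = (tbl, tbx);
+//     }
+// }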
+#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1q_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1q_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe { simd_shuffle!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vtrn1_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1q_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vtrn1q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vtrn1_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1q_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vtrn1q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vtrn1_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1q_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vtrn1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1q_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vtrn1q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1q_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe { simd_shuffle!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30] + ) + } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1q_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe { simd_shuffle!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1q_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_shuffle!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1q_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30] + ) + } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1q_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_shuffle!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1q_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + unsafe { simd_shuffle!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1q_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30] + ) + } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1q_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn1))] +pub fn vtrn1q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + unsafe { simd_shuffle!(a, b, [0, 8, 2, 
10, 4, 12, 6, 14]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2q_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2q_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe { simd_shuffle!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vtrn2_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2q_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vtrn2q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vtrn2_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2q_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vtrn2q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vtrn2_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2q_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vtrn2q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2q_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vtrn2q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2q_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe { simd_shuffle!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31] + ) + } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2q_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe { simd_shuffle!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2q_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_shuffle!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2q_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31] + ) + } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2q_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_shuffle!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2q_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + unsafe { simd_shuffle!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2q_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31] + ) + } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) } +} +#[doc = "Transpose vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn2q_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(trn2))] +pub fn vtrn2q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + unsafe { simd_shuffle!(a, b, [1, 9, 3, 
11, 5, 13, 7, 15]) } +} +#[doc = "Signed compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtst_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmtst))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtst_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t { + unsafe { + let c: int64x1_t = simd_and(a, b); + let d: i64x1 = i64x1::new(0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Signed compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtstq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmtst))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtstq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t { + unsafe { + let c: int64x2_t = simd_and(a, b); + let d: i64x2 = i64x2::new(0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Signed compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtst_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmtst))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtst_p64(a: poly64x1_t, b: poly64x1_t) -> uint64x1_t { + unsafe { + let c: poly64x1_t = simd_and(a, b); + let d: i64x1 = i64x1::new(0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Signed compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtstq_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmtst))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtstq_p64(a: poly64x2_t, b: poly64x2_t) -> uint64x2_t { + unsafe { + let c: poly64x2_t = simd_and(a, b); + let d: i64x2 = i64x2::new(0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Unsigned compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtst_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmtst))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtst_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + unsafe { + let c: uint64x1_t = simd_and(a, b); + let d: u64x1 = u64x1::new(0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Unsigned compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtstq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(cmtst))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtstq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { + let c: uint64x2_t = simd_and(a, b); + let d: u64x2 = u64x2::new(0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Compare bitwise test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtstd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tst))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtstd_s64(a: i64, b: i64) -> u64 { + unsafe { transmute(vtst_s64(transmute(a), transmute(b))) } +} +#[doc = "Compare bitwise test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtstd_u64)"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(tst))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vtstd_u64(a: u64, b: u64) -> u64 { + unsafe { transmute(vtst_u64(transmute(a), transmute(b))) } +} +#[doc = "Signed saturating Accumulate of Unsigned value."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuqadd_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(suqadd))] +pub fn vuqadd_s8(a: int8x8_t, b: uint8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.suqadd.v8i8" + )] + fn _vuqadd_s8(a: int8x8_t, b: uint8x8_t) -> int8x8_t; + } + unsafe { _vuqadd_s8(a, b) } +} +#[doc = "Signed saturating Accumulate of Unsigned value."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuqaddq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(suqadd))] +pub fn vuqaddq_s8(a: int8x16_t, b: uint8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.suqadd.v16i8" + )] + fn _vuqaddq_s8(a: int8x16_t, b: uint8x16_t) -> int8x16_t; + } + unsafe { _vuqaddq_s8(a, b) } +} +#[doc = "Signed saturating Accumulate of Unsigned value."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuqadd_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(suqadd))] +pub fn vuqadd_s16(a: int16x4_t, b: uint16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.suqadd.v4i16" + )] + fn _vuqadd_s16(a: int16x4_t, b: uint16x4_t) -> int16x4_t; + } + unsafe { _vuqadd_s16(a, b) } +} +#[doc = "Signed saturating Accumulate of Unsigned value."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuqaddq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(suqadd))] +pub fn vuqaddq_s16(a: int16x8_t, b: uint16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.suqadd.v8i16" + )] + fn _vuqaddq_s16(a: int16x8_t, b: uint16x8_t) -> int16x8_t; + } + unsafe { _vuqaddq_s16(a, b) } +} +#[doc = "Signed saturating Accumulate of Unsigned value."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuqadd_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(suqadd))] +pub fn vuqadd_s32(a: int32x2_t, b: uint32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.suqadd.v2i32" + )] + fn _vuqadd_s32(a: int32x2_t, b: uint32x2_t) -> int32x2_t; + } + unsafe { _vuqadd_s32(a, b) } +} +#[doc = "Signed saturating Accumulate of Unsigned value."] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuqaddq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(suqadd))] +pub fn vuqaddq_s32(a: int32x4_t, b: uint32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.suqadd.v4i32" + )] + fn _vuqaddq_s32(a: int32x4_t, b: uint32x4_t) -> int32x4_t; + } + unsafe { _vuqaddq_s32(a, b) } +} +#[doc = "Signed saturating Accumulate of Unsigned value."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuqadd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(suqadd))] +pub fn vuqadd_s64(a: int64x1_t, b: uint64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.suqadd.v1i64" + )] + fn _vuqadd_s64(a: int64x1_t, b: uint64x1_t) -> int64x1_t; + } + unsafe { _vuqadd_s64(a, b) } +} +#[doc = "Signed saturating Accumulate of Unsigned value."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuqaddq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(suqadd))] +pub fn vuqaddq_s64(a: int64x2_t, b: uint64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.suqadd.v2i64" + )] + fn _vuqaddq_s64(a: int64x2_t, b: uint64x2_t) -> int64x2_t; + } + unsafe { _vuqaddq_s64(a, b) } +} +#[doc = "Signed saturating accumulate of unsigned value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuqaddb_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(suqadd))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vuqaddb_s8(a: i8, b: u8) -> i8 { + unsafe { simd_extract!(vuqadd_s8(vdup_n_s8(a), vdup_n_u8(b)), 0) } +} +#[doc = "Signed saturating accumulate of unsigned value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuqaddh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(suqadd))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vuqaddh_s16(a: i16, b: u16) -> i16 { + unsafe { simd_extract!(vuqadd_s16(vdup_n_s16(a), vdup_n_u16(b)), 0) } +} +#[doc = "Signed saturating accumulate of unsigned value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuqaddd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(suqadd))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vuqaddd_s64(a: i64, b: u64) -> i64 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.suqadd.i64" + )] + fn _vuqaddd_s64(a: i64, b: u64) -> i64; + } + unsafe { _vuqaddd_s64(a, b) } +} +#[doc = "Signed saturating accumulate of unsigned value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuqadds_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, 
assert_instr(suqadd))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vuqadds_s32(a: i32, b: u32) -> i32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.suqadd.i32" + )] + fn _vuqadds_s32(a: i32, b: u32) -> i32; + } + unsafe { _vuqadds_s32(a, b) } +} +#[doc = "Dot product index form with unsigned and signed integers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(test, assert_instr(usdot, LANE = 3))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_i8mm", issue = "117223")] +pub fn vusdot_laneq_s32<const LANE: i32>(a: int32x2_t, b: uint8x8_t, c: int8x16_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: int32x4_t = transmute(c); + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vusdot_s32(a, b, transmute(c)) + } +} +#[doc = "Dot product index form with unsigned and signed integers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(test, assert_instr(usdot, LANE = 3))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_neon_i8mm", issue = "117223")] +pub fn vusdotq_laneq_s32<const LANE: i32>(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let c: int32x4_t = transmute(c); + let c: int32x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vusdotq_s32(a, b, transmute(c)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe { simd_shuffle!(a, b, [0, 2, 4, 6]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1q_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1q_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe { simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vuzp1_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1q_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vuzp1q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vuzp1_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1q_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vuzp1q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vuzp1_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1q_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vuzp1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1q_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vuzp1q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1q_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe { simd_shuffle!(a, b, [0, 2, 4, 6]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe { simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] + ) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe { simd_shuffle!(a, b, [0, 2, 4, 6]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1q_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe { simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1q_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe { simd_shuffle!(a, b, [0, 2, 4, 6]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1q_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] + ) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_shuffle!(a, b, [0, 2, 4, 6]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1q_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1q_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_shuffle!(a, b, [0, 2, 4, 6]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + unsafe { simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1q_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] + ) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + unsafe { simd_shuffle!(a, b, [0, 2, 4, 6]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp1q_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp1))] +pub fn vuzp1q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + unsafe { simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe { simd_shuffle!(a, b, [1, 3, 5, 7]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2q_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2q_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe { simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vuzp2_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2q_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vuzp2q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vuzp2_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2q_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vuzp2q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vuzp2_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2q_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vuzp2q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2q_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vuzp2q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2q_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe { simd_shuffle!(a, b, [1, 3, 5, 7]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe { simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31] + ) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe { simd_shuffle!(a, b, [1, 3, 5, 7]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2q_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe { simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2q_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe { simd_shuffle!(a, b, [1, 3, 5, 7]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2q_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31] + ) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_shuffle!(a, b, [1, 3, 5, 7]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2q_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2q_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_shuffle!(a, b, [1, 3, 5, 7]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + unsafe { simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2q_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31] + ) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + unsafe { simd_shuffle!(a, b, [1, 3, 5, 7]) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp2q_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(uzp2))] +pub fn vuzp2q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + unsafe { simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]) } +} +#[doc = "Exclusive OR and rotate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vxarq_u64)"] +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(xar, IMM6 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_neon_sha3", since = "1.79.0")] +pub fn vxarq_u64<const IMM6: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + static_assert_uimm_bits!(IMM6, 6); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.xar" + )] + fn _vxarq_u64(a: uint64x2_t, b: uint64x2_t, n: i64) -> uint64x2_t; + } + unsafe { _vxarq_u64(a, b, IMM6 as i64) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe { simd_shuffle!(a, b, [0, 4, 1, 5]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1q_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1q_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1_f32)"] +#[inline] +#[target_feature(enable = "neon")] 
+#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1q_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe { simd_shuffle!(a, b, [0, 4, 1, 5]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1q_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23] + ) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe { simd_shuffle!(a, b, [0, 4, 1, 5]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1q_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1q_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, 
not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe { simd_shuffle!(a, b, [0, 4, 1, 5]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1q_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1q_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23] + ) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_shuffle!(a, b, [0, 4, 1, 5]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1q_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1q_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_shuffle!(a, b, [0, 4, 1, 5]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1q_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1q_u64(a: 
uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1q_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23] + ) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + unsafe { simd_shuffle!(a, b, [0, 4, 1, 5]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1q_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip1q_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip1))] +pub fn vzip1q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + unsafe { simd_shuffle!(a, b, [0, 2]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe { simd_shuffle!(a, b, [2, 6, 3, 7]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2q_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2q_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe { simd_shuffle!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2_f32(a: float32x2_t, b: float32x2_t) -> 
float32x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2q_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe { simd_shuffle!(a, b, [2, 6, 3, 7]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2q_f64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe { simd_shuffle!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31] + ) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe { simd_shuffle!(a, b, [2, 6, 3, 7]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2q_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe { simd_shuffle!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2q_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe { simd_shuffle!(a, b, [2, 6, 3, 7]) } +} +#[doc = "Zip 
vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2q_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_shuffle!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2q_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31] + ) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_shuffle!(a, b, [2, 6, 3, 7]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2q_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_shuffle!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2q_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_shuffle!(a, b, [2, 6, 3, 7]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2q_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + unsafe { simd_shuffle!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2q_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + unsafe { + simd_shuffle!( + a, + b, + [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31] + ) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + unsafe { simd_shuffle!(a, b, [2, 6, 3, 7]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2q_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + unsafe { simd_shuffle!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]) } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip2q_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(zip2))] +pub fn vzip2q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + unsafe { simd_shuffle!(a, b, [1, 3]) } +} diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs new file mode 100644 index 0000000000000..b172b57f32543 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs @@ -0,0 +1,1006 @@ +//! ARMv8 ASIMD intrinsics + +#![allow(non_camel_case_types)] + +#[rustfmt::skip] +mod generated; +#[rustfmt::skip] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub use self::generated::*; + +// FIXME: replace neon with asimd + +use crate::{ + core_arch::{arm_shared::*, simd::*}, + hint::unreachable_unchecked, + intrinsics::{simd::*, *}, + mem::transmute, +}; +#[cfg(test)] +use stdarch_test::assert_instr; + +types! { + #![stable(feature = "neon_intrinsics", since = "1.59.0")] + + /// ARM-specific 64-bit wide vector of one packed `f64`. + pub struct float64x1_t(1 x f64); // FIXME: check this! + /// ARM-specific 128-bit wide vector of two packed `f64`. + pub struct float64x2_t(2 x f64); +} + +/// ARM-specific type containing two `float64x1_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub struct float64x1x2_t(pub float64x1_t, pub float64x1_t); +/// ARM-specific type containing three `float64x1_t` vectors. 
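// Illustrative sketch, not part of the patch: the vzip1*/vzip2*/vuzp2* intrinsics
// generated above are thin wrappers over `simd_shuffle!` with fixed index lists.
// Assuming an aarch64/NEON target and the public `core::arch::aarch64` API, the
// lane patterns for 8-lane byte vectors work out as checked below.
#[cfg(all(test, target_arch = "aarch64"))]
mod zip_uzp_lane_sketch {
    use core::arch::aarch64::*;

    #[test]
    fn lane_patterns() {
        unsafe {
            let a = vld1_u8([0u8, 1, 2, 3, 4, 5, 6, 7].as_ptr());
            let b = vld1_u8([10u8, 11, 12, 13, 14, 15, 16, 17].as_ptr());
            let mut out = [0u8; 8];

            // zip1 interleaves the low halves: a0 b0 a1 b1 a2 b2 a3 b3
            vst1_u8(out.as_mut_ptr(), vzip1_u8(a, b));
            assert_eq!(out, [0, 10, 1, 11, 2, 12, 3, 13]);

            // zip2 interleaves the high halves: a4 b4 a5 b5 a6 b6 a7 b7
            vst1_u8(out.as_mut_ptr(), vzip2_u8(a, b));
            assert_eq!(out, [4, 14, 5, 15, 6, 16, 7, 17]);

            // uzp2 keeps the odd-numbered lanes of the concatenation a:b
            vst1_u8(out.as_mut_ptr(), vuzp2_u8(a, b));
            assert_eq!(out, [1, 3, 5, 7, 11, 13, 15, 17]);
        }
    }
}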
+#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub struct float64x1x3_t(pub float64x1_t, pub float64x1_t, pub float64x1_t); +/// ARM-specific type containing four `float64x1_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub struct float64x1x4_t( + pub float64x1_t, + pub float64x1_t, + pub float64x1_t, + pub float64x1_t, +); + +/// ARM-specific type containing two `float64x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub struct float64x2x2_t(pub float64x2_t, pub float64x2_t); +/// ARM-specific type containing three `float64x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub struct float64x2x3_t(pub float64x2_t, pub float64x2_t, pub float64x2_t); +/// ARM-specific type containing four `float64x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub struct float64x2x4_t( + pub float64x2_t, + pub float64x2_t, + pub float64x2_t, + pub float64x2_t, +); + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N1 = 0, N2 = 0))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_lane_s64(_a: int64x1_t, b: int64x1_t) -> int64x1_t { + static_assert!(N1 == 0); + static_assert!(N2 == 0); + b +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N1 = 0, N2 = 0))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_lane_u64(_a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + static_assert!(N1 == 0); + static_assert!(N2 == 0); + b +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N1 = 0, N2 = 0))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_lane_p64(_a: poly64x1_t, b: poly64x1_t) -> poly64x1_t { + static_assert!(N1 == 0); + static_assert!(N2 == 0); + b +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N1 = 0, N2 = 0))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_lane_f64( + _a: float64x1_t, + b: float64x1_t, +) -> float64x1_t { + static_assert!(N1 == 0); + static_assert!(N2 == 0); + b +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_laneq_s64( + _a: int64x1_t, + b: int64x2_t, +) -> int64x1_t { + static_assert!(LANE1 == 0); + static_assert_uimm_bits!(LANE2, 1); + unsafe { transmute::(simd_extract!(b, LANE2 as u32)) } +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_laneq_u64( + _a: uint64x1_t, + b: uint64x2_t, +) -> uint64x1_t { + static_assert!(LANE1 == 0); + 
static_assert_uimm_bits!(LANE2, 1); + unsafe { transmute::(simd_extract!(b, LANE2 as u32)) } +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_laneq_p64( + _a: poly64x1_t, + b: poly64x2_t, +) -> poly64x1_t { + static_assert!(LANE1 == 0); + static_assert_uimm_bits!(LANE2, 1); + unsafe { transmute::(simd_extract!(b, LANE2 as u32)) } +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, LANE1 = 0, LANE2 = 1))] +#[rustc_legacy_const_generics(1, 3)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcopy_laneq_f64( + _a: float64x1_t, + b: float64x2_t, +) -> float64x1_t { + static_assert!(LANE1 == 0); + static_assert_uimm_bits!(LANE2, 1); + unsafe { transmute::(simd_extract!(b, LANE2 as u32)) } +} + +/// Load multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1_dup_f64(ptr: *const f64) -> float64x1_t { + vld1_f64(ptr) +} + +/// Load multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld1r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1q_dup_f64(ptr: *const f64) -> float64x2_t { + let x = vld1q_lane_f64::<0>(ptr, transmute(f64x2::splat(0.))); + simd_shuffle!(x, x, [0, 0]) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(ldr, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1_lane_f64(ptr: *const f64, src: float64x1_t) -> float64x1_t { + static_assert!(LANE == 0); + simd_insert!(src, LANE as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(ld1, LANE = 1))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld1q_lane_f64(ptr: *const f64, src: float64x2_t) -> float64x2_t { + static_assert_uimm_bits!(LANE, 1); + simd_insert!(src, LANE as u32, *ptr) +} + +/// Bitwise Select instructions. This instruction sets each bit in the destination SIMD&FP register +/// to the corresponding bit from the first source SIMD&FP register when the original +/// destination bit was 1, otherwise from the second source SIMD&FP register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(bsl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vbsl_f64(a: uint64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t { + let not = int64x1_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +/// Bitwise Select. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(bsl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vbsl_p64(a: poly64x1_t, b: poly64x1_t, c: poly64x1_t) -> poly64x1_t { + let not = int64x1_t::splat(-1); + unsafe { simd_or(simd_and(a, b), simd_and(simd_xor(a, transmute(not)), c)) } +} +/// Bitwise Select. (128-bit) +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(bsl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vbslq_f64(a: uint64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t { + let not = int64x2_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +/// Bitwise Select. (128-bit) +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(bsl))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vbslq_p64(a: poly64x2_t, b: poly64x2_t, c: poly64x2_t) -> poly64x2_t { + let not = int64x2_t::splat(-1); + unsafe { simd_or(simd_and(a, b), simd_and(simd_xor(a, transmute(not)), c)) } +} + +/// Vector add. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fadd))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vadd_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { + unsafe { simd_add(a, b) } +} + +/// Vector add. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fadd))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { + unsafe { simd_add(a, b) } +} + +/// Vector add. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(add))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vadd_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + unsafe { simd_add(a, b) } +} + +/// Vector add. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(add))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vadd_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + unsafe { simd_add(a, b) } +} + +/// Vector add. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(add))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vaddd_s64(a: i64, b: i64) -> i64 { + a.wrapping_add(b) +} + +/// Vector add. 
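// Illustrative sketch, not part of the patch: BSL computes, bit for bit,
// (mask & b) | (!mask & c), exactly as described for the vbsl helpers above.
// With a single u64 lane this is easy to verify directly; assumes an
// aarch64/NEON target and the public `core::arch::aarch64` API (on older
// toolchains these intrinsics are `unsafe fn`, hence the unsafe block).
#[cfg(all(test, target_arch = "aarch64"))]
mod bsl_sketch {
    use core::arch::aarch64::*;

    #[test]
    fn bsl_is_a_bitwise_mux() {
        unsafe {
            let mask: u64 = 0xFFFF_0000_FFFF_0000;
            let b: u64 = 0x1111_1111_1111_1111;
            let c: u64 = 0x2222_2222_2222_2222;

            let r = vget_lane_u64::<0>(vbsl_u64(
                vdup_n_u64(mask),
                vdup_n_u64(b),
                vdup_n_u64(c),
            ));

            // Where the mask bit is 1 we take the bit from `b`, otherwise from `c`.
            assert_eq!(r, (mask & b) | (!mask & c));
        }
    }
}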
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(add))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vaddd_u64(a: u64, b: u64) -> u64 { + a.wrapping_add(b) +} + +/// Extract vector from pair of vectors +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vext_p64(a: poly64x1_t, _b: poly64x1_t) -> poly64x1_t { + static_assert!(N == 0); + a +} + +/// Extract vector from pair of vectors +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vext_f64(a: float64x1_t, _b: float64x1_t) -> float64x1_t { + static_assert!(N == 0); + a +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmov))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdup_n_p64(value: p64) -> poly64x1_t { + unsafe { transmute(u64x1::new(value)) } +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdup_n_f64(value: f64) -> float64x1_t { + float64x1_t::splat(value) +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(dup))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupq_n_p64(value: p64) -> poly64x2_t { + unsafe { transmute(u64x2::new(value, value)) } +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(dup))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vdupq_n_f64(value: f64) -> float64x2_t { + float64x2_t::splat(value) +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmov))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmov_n_p64(value: p64) -> poly64x1_t { + vdup_n_p64(value) +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmov_n_f64(value: f64) -> float64x1_t { + vdup_n_f64(value) +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(dup))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmovq_n_p64(value: p64) -> poly64x2_t { + vdupq_n_p64(value) +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(dup))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vmovq_n_f64(value: f64) -> float64x2_t { + vdupq_n_f64(value) +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vget_high_f64(a: float64x2_t) -> float64x1_t { + unsafe { float64x1_t([simd_extract!(a, 1)]) } +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ext))] +#[stable(feature = "neon_intrinsics", since = 
"1.59.0")] +pub fn vget_high_p64(a: poly64x2_t) -> poly64x1_t { + unsafe { transmute(u64x1::new(simd_extract!(a, 1))) } +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vget_low_f64(a: float64x2_t) -> float64x1_t { + unsafe { float64x1_t([simd_extract!(a, 0)]) } +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vget_low_p64(a: poly64x2_t) -> poly64x1_t { + unsafe { transmute(u64x1::new(simd_extract!(a, 0))) } +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, IMM5 = 0) +)] +pub fn vget_lane_f64(v: float64x1_t) -> f64 { + static_assert!(IMM5 == 0); + unsafe { simd_extract!(v, IMM5 as u32) } +} + +/// Duplicate vector element to vector or scalar +#[inline] +#[target_feature(enable = "neon")] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, IMM5 = 0) +)] +pub fn vgetq_lane_f64(v: float64x2_t) -> f64 { + static_assert_uimm_bits!(IMM5, 1); + unsafe { simd_extract!(v, IMM5 as u32) } +} + +/// Vector combine +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(mov))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcombine_f64(low: float64x1_t, high: float64x1_t) -> float64x2_t { + unsafe { simd_shuffle!(low, high, [0, 1]) } +} + +/// Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshld_n_s64(a: i64) -> i64 { + static_assert_uimm_bits!(N, 6); + a << N +} + +/// Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshld_n_u64(a: u64) -> u64 { + static_assert_uimm_bits!(N, 6); + a << N +} + +/// Signed shift right +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshrd_n_s64(a: i64) -> i64 { + static_assert!(N >= 1 && N <= 64); + let n: i32 = if N == 64 { 63 } else { N }; + a >> n +} + +/// Unsigned shift right +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vshrd_n_u64(a: u64) -> u64 { + static_assert!(N >= 1 && N <= 64); + let n: i32 = if N == 64 { + return 0; + } else { + N + }; + a >> n +} + +/// Signed shift right and accumulate +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsrad_n_s64(a: i64, b: i64) -> i64 { + static_assert!(N >= 1 && N <= 64); + a.wrapping_add(vshrd_n_s64::(b)) +} + +/// Unsigned shift right and accumulate 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop, N = 2))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vsrad_n_u64(a: u64, b: u64) -> u64 { + static_assert!(N >= 1 && N <= 64); + a.wrapping_add(vshrd_n_u64::(b)) +} + +#[cfg(test)] +mod tests { + use crate::core_arch::aarch64::test_support::*; + use crate::core_arch::arm_shared::test_support::*; + use crate::core_arch::{aarch64::neon::*, aarch64::*, simd::*}; + use std::mem::transmute; + use stdarch_test::simd_test; + + #[simd_test(enable = "neon")] + unsafe fn test_vadd_f64() { + let a = 1.; + let b = 8.; + let e = 9.; + let r: f64 = transmute(vadd_f64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddq_f64() { + let a = f64x2::new(1., 2.); + let b = f64x2::new(8., 7.); + let e = f64x2::new(9., 9.); + let r: f64x2 = transmute(vaddq_f64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vadd_s64() { + let a = 1_i64; + let b = 8_i64; + let e = 9_i64; + let r: i64 = transmute(vadd_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vadd_u64() { + let a = 1_u64; + let b = 8_u64; + let e = 9_u64; + let r: u64 = transmute(vadd_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddd_s64() { + let a = 1_i64; + let b = 8_i64; + let e = 9_i64; + let r: i64 = vaddd_s64(a, b); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddd_u64() { + let a = 1_u64; + let b = 8_u64; + let e = 9_u64; + let r: u64 = vaddd_u64(a, b); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vext_p64() { + let a: i64x1 = i64x1::new(0); + let b: i64x1 = i64x1::new(1); + let e: i64x1 = i64x1::new(0); + let r: i64x1 = transmute(vext_p64::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vext_f64() { + let a: f64x1 = f64x1::new(0.); + let b: f64x1 = f64x1::new(1.); + let e: f64x1 = f64x1::new(0.); + let r: f64x1 = transmute(vext_f64::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vshld_n_s64() { + let a: i64 = 1; + let e: i64 = 4; + let r: i64 = vshld_n_s64::<2>(a); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vshld_n_u64() { + let a: u64 = 1; + let e: u64 = 4; + let r: u64 = vshld_n_u64::<2>(a); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vshrd_n_s64() { + let a: i64 = 4; + let e: i64 = 1; + let r: i64 = vshrd_n_s64::<2>(a); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vshrd_n_u64() { + let a: u64 = 4; + let e: u64 = 1; + let r: u64 = vshrd_n_u64::<2>(a); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vsrad_n_s64() { + let a: i64 = 1; + let b: i64 = 4; + let e: i64 = 2; + let r: i64 = vsrad_n_s64::<2>(a, b); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vsrad_n_u64() { + let a: u64 = 1; + let b: u64 = 4; + let e: u64 = 2; + let r: u64 = vsrad_n_u64::<2>(a, b); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdup_n_f64() { + let a: f64 = 3.3; + let e = f64x1::new(3.3); + let r: f64x1 = transmute(vdup_n_f64(a)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdup_n_p64() { + let a: u64 = 3; + let e 
= u64x1::new(3); + let r: u64x1 = transmute(vdup_n_p64(a)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdupq_n_f64() { + let a: f64 = 3.3; + let e = f64x2::new(3.3, 3.3); + let r: f64x2 = transmute(vdupq_n_f64(a)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdupq_n_p64() { + let a: u64 = 3; + let e = u64x2::new(3, 3); + let r: u64x2 = transmute(vdupq_n_p64(a)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmov_n_p64() { + let a: u64 = 3; + let e = u64x1::new(3); + let r: u64x1 = transmute(vmov_n_p64(a)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmov_n_f64() { + let a: f64 = 3.3; + let e = f64x1::new(3.3); + let r: f64x1 = transmute(vmov_n_f64(a)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovq_n_p64() { + let a: u64 = 3; + let e = u64x2::new(3, 3); + let r: u64x2 = transmute(vmovq_n_p64(a)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovq_n_f64() { + let a: f64 = 3.3; + let e = f64x2::new(3.3, 3.3); + let r: f64x2 = transmute(vmovq_n_f64(a)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_high_f64() { + let a = f64x2::new(1.0, 2.0); + let e = f64x1::new(2.0); + let r: f64x1 = transmute(vget_high_f64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_high_p64() { + let a = u64x2::new(1, 2); + let e = u64x1::new(2); + let r: u64x1 = transmute(vget_high_p64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_low_f64() { + let a = f64x2::new(1.0, 2.0); + let e = f64x1::new(1.0); + let r: f64x1 = transmute(vget_low_f64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_low_p64() { + let a = u64x2::new(1, 2); + let e = u64x1::new(1); + let r: u64x1 = transmute(vget_low_p64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_lane_f64() { + let v = f64x1::new(1.0); + let r = vget_lane_f64::<0>(transmute(v)); + assert_eq!(r, 1.0); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vgetq_lane_f64() { + let v = f64x2::new(0.0, 1.0); + let r = vgetq_lane_f64::<1>(transmute(v)); + assert_eq!(r, 1.0); + let r = vgetq_lane_f64::<0>(transmute(v)); + assert_eq!(r, 0.0); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcopy_lane_s64() { + let a: i64x1 = i64x1::new(1); + let b: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); + let e: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); + let r: i64x1 = transmute(vcopy_lane_s64::<0, 0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcopy_lane_u64() { + let a: u64x1 = u64x1::new(1); + let b: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); + let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); + let r: u64x1 = transmute(vcopy_lane_u64::<0, 0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcopy_lane_p64() { + let a: i64x1 = i64x1::new(1); + let b: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); + let e: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); + let r: i64x1 = transmute(vcopy_lane_p64::<0, 0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcopy_lane_f64() { + let a: f64 = 1.; + let b: f64 = 0.; + let e: f64 = 0.; + let r: f64 = transmute(vcopy_lane_f64::<0, 0>(transmute(a), 
transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcopy_laneq_s64() { + let a: i64x1 = i64x1::new(1); + let b: i64x2 = i64x2::new(0, 0x7F_FF_FF_FF_FF_FF_FF_FF); + let e: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); + let r: i64x1 = transmute(vcopy_laneq_s64::<0, 1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcopy_laneq_u64() { + let a: u64x1 = u64x1::new(1); + let b: u64x2 = u64x2::new(0, 0xFF_FF_FF_FF_FF_FF_FF_FF); + let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); + let r: u64x1 = transmute(vcopy_laneq_u64::<0, 1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcopy_laneq_p64() { + let a: i64x1 = i64x1::new(1); + let b: i64x2 = i64x2::new(0, 0x7F_FF_FF_FF_FF_FF_FF_FF); + let e: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); + let r: i64x1 = transmute(vcopy_laneq_p64::<0, 1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcopy_laneq_f64() { + let a: f64 = 1.; + let b: f64x2 = f64x2::new(0., 0.5); + let e: f64 = 0.5; + let r: f64 = transmute(vcopy_laneq_f64::<0, 1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbsl_f64() { + let a = u64x1::new(0x8000000000000000); + let b = f64x1::new(-1.23f64); + let c = f64x1::new(2.34f64); + let e = f64x1::new(-2.34f64); + let r: f64x1 = transmute(vbsl_f64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbsl_p64() { + let a = u64x1::new(1); + let b = u64x1::new(u64::MAX); + let c = u64x1::new(u64::MIN); + let e = u64x1::new(1); + let r: u64x1 = transmute(vbsl_p64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbslq_f64() { + let a = u64x2::new(1, 0x8000000000000000); + let b = f64x2::new(f64::MAX, -1.23f64); + let c = f64x2::new(f64::MIN, 2.34f64); + let e = f64x2::new(f64::MIN, -2.34f64); + let r: f64x2 = transmute(vbslq_f64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbslq_p64() { + let a = u64x2::new(u64::MAX, 1); + let b = u64x2::new(u64::MAX, u64::MAX); + let c = u64x2::new(u64::MIN, u64::MIN); + let e = u64x2::new(u64::MAX, 1); + let r: u64x2 = transmute(vbslq_p64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_f64() { + let a: [f64; 2] = [0., 1.]; + let e = f64x1::new(1.); + let r: f64x1 = transmute(vld1_f64(a[1..].as_ptr())); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_f64() { + let a: [f64; 3] = [0., 1., 2.]; + let e = f64x2::new(1., 2.); + let r: f64x2 = transmute(vld1q_f64(a[1..].as_ptr())); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_f64() { + let a: [f64; 2] = [1., 42.]; + let e = f64x1::new(42.); + let r: f64x1 = transmute(vld1_dup_f64(a[1..].as_ptr())); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_f64() { + let elem: f64 = 42.; + let e = f64x2::new(42., 42.); + let r: f64x2 = transmute(vld1q_dup_f64(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_f64() { + let a = f64x1::new(0.); + let elem: f64 = 42.; + let e = f64x1::new(42.); + let r: f64x1 = transmute(vld1_lane_f64::<0>(&elem, transmute(a))); + assert_eq!(r, e) + } + 
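    // Illustrative sketch, not part of the patch: vcombine_f64 and the
    // vget_low_f64/vget_high_f64 accessors defined above round-trip, which is a
    // handy way to sanity-check the lane ordering. Uses the same test harness as
    // the surrounding tests.
    #[simd_test(enable = "neon")]
    unsafe fn sketch_vcombine_f64_round_trip() {
        let low = f64x1::new(1.0);
        let high = f64x1::new(2.0);
        let v: float64x2_t = vcombine_f64(transmute(low), transmute(high));
        let lo: f64x1 = transmute(vget_low_f64(v));
        let hi: f64x1 = transmute(vget_high_f64(v));
        assert_eq!(lo, low);
        assert_eq!(hi, high);
    }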
+ #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_f64() { + let a = f64x2::new(0., 1.); + let elem: f64 = 42.; + let e = f64x2::new(0., 42.); + let r: f64x2 = transmute(vld1q_lane_f64::<1>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1_f64() { + let mut vals = [0_f64; 2]; + let a = f64x1::new(1.); + + vst1_f64(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0.); + assert_eq!(vals[1], 1.); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_f64() { + let mut vals = [0_f64; 3]; + let a = f64x2::new(1., 2.); + + vst1q_f64(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0.); + assert_eq!(vals[1], 1.); + assert_eq!(vals[2], 2.); + } +} + +#[cfg(test)] +#[path = "../../arm_shared/neon/table_lookup_tests.rs"] +mod table_lookup_tests; + +#[cfg(test)] +#[path = "../../arm_shared/neon/shift_and_insert_tests.rs"] +mod shift_and_insert_tests; + +#[cfg(test)] +#[path = "../../arm_shared/neon/load_tests.rs"] +mod load_tests; + +#[cfg(test)] +#[path = "../../arm_shared/neon/store_tests.rs"] +mod store_tests; diff --git a/library/stdarch/crates/core_arch/src/aarch64/prefetch.rs b/library/stdarch/crates/core_arch/src/aarch64/prefetch.rs new file mode 100644 index 0000000000000..4dcbc9549f115 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/aarch64/prefetch.rs @@ -0,0 +1,80 @@ +#[cfg(test)] +use stdarch_test::assert_instr; + +unsafe extern "unadjusted" { + #[link_name = "llvm.prefetch"] + fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32); +} + +/// See [`prefetch`](fn._prefetch.html). +#[unstable(feature = "stdarch_aarch64_prefetch", issue = "117217")] +pub const _PREFETCH_READ: i32 = 0; + +/// See [`prefetch`](fn._prefetch.html). +#[unstable(feature = "stdarch_aarch64_prefetch", issue = "117217")] +pub const _PREFETCH_WRITE: i32 = 1; + +/// See [`prefetch`](fn._prefetch.html). +#[unstable(feature = "stdarch_aarch64_prefetch", issue = "117217")] +pub const _PREFETCH_LOCALITY0: i32 = 0; + +/// See [`prefetch`](fn._prefetch.html). +#[unstable(feature = "stdarch_aarch64_prefetch", issue = "117217")] +pub const _PREFETCH_LOCALITY1: i32 = 1; + +/// See [`prefetch`](fn._prefetch.html). +#[unstable(feature = "stdarch_aarch64_prefetch", issue = "117217")] +pub const _PREFETCH_LOCALITY2: i32 = 2; + +/// See [`prefetch`](fn._prefetch.html). +#[unstable(feature = "stdarch_aarch64_prefetch", issue = "117217")] +pub const _PREFETCH_LOCALITY3: i32 = 3; + +/// Fetch the cache line that contains address `p` using the given `RW` and `LOCALITY`. +/// +/// The `RW` must be one of: +/// +/// * [`_PREFETCH_READ`](constant._PREFETCH_READ.html): the prefetch is preparing +/// for a read. +/// +/// * [`_PREFETCH_WRITE`](constant._PREFETCH_WRITE.html): the prefetch is preparing +/// for a write. +/// +/// The `LOCALITY` must be one of: +/// +/// * [`_PREFETCH_LOCALITY0`](constant._PREFETCH_LOCALITY0.html): Streaming or +/// non-temporal prefetch, for data that is used only once. +/// +/// * [`_PREFETCH_LOCALITY1`](constant._PREFETCH_LOCALITY1.html): Fetch into level 3 cache. +/// +/// * [`_PREFETCH_LOCALITY2`](constant._PREFETCH_LOCALITY2.html): Fetch into level 2 cache. +/// +/// * [`_PREFETCH_LOCALITY3`](constant._PREFETCH_LOCALITY3.html): Fetch into level 1 cache. +/// +/// The prefetch memory instructions signal to the memory system that memory accesses +/// from a specified address are likely to occur in the near future. 
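// Illustrative sketch, not part of the patch: a typical `_prefetch` call site passes
// the RW and LOCALITY selectors via turbofish. The slice type and loop below are
// invented for the example; only `_prefetch` and the `_PREFETCH_*` constants come
// from this module, behind the unstable `stdarch_aarch64_prefetch` feature, so a
// nightly toolchain with that feature enabled is assumed.
#[cfg(target_arch = "aarch64")]
unsafe fn prefetch_then_sum(data: &[i8]) -> i64 {
    use core::arch::aarch64::{_PREFETCH_LOCALITY3, _PREFETCH_READ, _prefetch};

    let mut sum = 0i64;
    for chunk in data.chunks(64) {
        // Hint that this cache line is about to be read and should be kept hot (L1).
        _prefetch::<_PREFETCH_READ, _PREFETCH_LOCALITY3>(chunk.as_ptr());
        sum += chunk.iter().map(|&b| b as i64).sum::<i64>();
    }
    sum
}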
The memory system +/// can respond by taking actions that are expected to speed up the memory access when +/// they do occur, such as preloading the specified address into one or more caches. +/// Because these signals are only hints, it is valid for a particular CPU to treat +/// any or all prefetch instructions as a NOP. +/// +/// +/// [Arm's documentation](https://developer.arm.com/documentation/den0024/a/the-a64-instruction-set/memory-access-instructions/prefetching-memory?lang=en) +#[inline(always)] +#[cfg_attr(test, assert_instr("prfm pldl1strm", RW = _PREFETCH_READ, LOCALITY = _PREFETCH_LOCALITY0))] +#[cfg_attr(test, assert_instr("prfm pldl3keep", RW = _PREFETCH_READ, LOCALITY = _PREFETCH_LOCALITY1))] +#[cfg_attr(test, assert_instr("prfm pldl2keep", RW = _PREFETCH_READ, LOCALITY = _PREFETCH_LOCALITY2))] +#[cfg_attr(test, assert_instr("prfm pldl1keep", RW = _PREFETCH_READ, LOCALITY = _PREFETCH_LOCALITY3))] +#[cfg_attr(test, assert_instr("prfm pstl1strm", RW = _PREFETCH_WRITE, LOCALITY = _PREFETCH_LOCALITY0))] +#[cfg_attr(test, assert_instr("prfm pstl3keep", RW = _PREFETCH_WRITE, LOCALITY = _PREFETCH_LOCALITY1))] +#[cfg_attr(test, assert_instr("prfm pstl2keep", RW = _PREFETCH_WRITE, LOCALITY = _PREFETCH_LOCALITY2))] +#[cfg_attr(test, assert_instr("prfm pstl1keep", RW = _PREFETCH_WRITE, LOCALITY = _PREFETCH_LOCALITY3))] +#[rustc_legacy_const_generics(1, 2)] +#[unstable(feature = "stdarch_aarch64_prefetch", issue = "117217")] +// FIXME: Replace this with the standard ACLE __pld/__pldx/__pli/__plix intrinsics +pub unsafe fn _prefetch(p: *const i8) { + // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache). + static_assert_uimm_bits!(RW, 1); + static_assert_uimm_bits!(LOCALITY, 2); + prefetch(p, RW, LOCALITY, 1); +} diff --git a/library/stdarch/crates/core_arch/src/aarch64/test_support.rs b/library/stdarch/crates/core_arch/src/aarch64/test_support.rs new file mode 100644 index 0000000000000..e21cbfd1ed055 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/aarch64/test_support.rs @@ -0,0 +1,184 @@ +use crate::core_arch::{aarch64::neon::*, arm_shared::*, simd::*}; +use std::{mem::transmute, vec::Vec}; + +macro_rules! V_u64 { + () => { + vec![ + 0x0000000000000000u64, + 0x0101010101010101u64, + 0x0202020202020202u64, + 0x0F0F0F0F0F0F0F0Fu64, + 0x8080808080808080u64, + 0xF0F0F0F0F0F0F0F0u64, + 0xFFFFFFFFFFFFFFFFu64, + ] + }; +} + +macro_rules! V_f64 { + () => { + vec![ + 0.0f64, + 1.0f64, + -1.0f64, + 1.2f64, + 2.4f64, + f64::MAX, + f64::MIN, + f64::INFINITY, + f64::NEG_INFINITY, + f64::NAN, + ] + }; +} + +macro_rules! to64 { + ($t : ident) => { + |v: $t| -> u64 { transmute(v) } + }; +} + +macro_rules! 
to128 { + ($t : ident) => { + |v: $t| -> u128 { transmute(v) } + }; +} + +pub(crate) fn test( + vals: Vec, + fill1: fn(T) -> V, + fill2: fn(U) -> W, + cast: fn(W) -> X, + test_fun: fn(V, V) -> W, + verify_fun: fn(T, T) -> U, +) where + T: Copy + core::fmt::Debug, + U: Copy + core::fmt::Debug + std::cmp::PartialEq, + V: Copy + core::fmt::Debug, + W: Copy + core::fmt::Debug, + X: Copy + core::fmt::Debug + std::cmp::PartialEq, +{ + let pairs = vals.iter().zip(vals.iter()); + + for (i, j) in pairs { + let a: V = fill1(*i); + let b: V = fill1(*j); + + let actual_pre: W = test_fun(a, b); + let expected_pre: W = fill2(verify_fun(*i, *j)); + + let actual: X = cast(actual_pre); + let expected: X = cast(expected_pre); + + assert_eq!( + actual, expected, + "[{:?}:{:?}] :\nf({:?}, {:?}) = {:?}\ng({:?}, {:?}) = {:?}\n", + *i, *j, &a, &b, actual_pre, &a, &b, expected_pre + ); + } +} + +macro_rules! gen_test_fn { + ($n: ident, $t: ident, $u: ident, $v: ident, $w: ident, $x: ident, $vals: expr, $fill1: expr, $fill2: expr, $cast: expr) => { + pub(crate) fn $n(test_fun: fn($v, $v) -> $w, verify_fun: fn($t, $t) -> $u) { + unsafe { + test::<$t, $u, $v, $w, $x>($vals, $fill1, $fill2, $cast, test_fun, verify_fun) + }; + } + }; +} + +macro_rules! gen_fill_fn { + ($id: ident, $el_width: expr, $num_els: expr, $in_t : ident, $out_t: ident, $cmp_t: ident) => { + pub(crate) fn $id(val: $in_t) -> $out_t { + let initial: [$in_t; $num_els] = [val; $num_els]; + let result: $cmp_t = unsafe { transmute(initial) }; + let result_out: $out_t = unsafe { transmute(result) }; + + // println!("FILL: {:016x} as {} x {}: {:016x}", val.reverse_bits(), $el_width, $num_els, (result as u64).reverse_bits()); + + result_out + } + }; +} + +gen_fill_fn!(fill_u64, 64, 1, u64, uint64x1_t, u64); +gen_fill_fn!(fillq_u64, 64, 2, u64, uint64x2_t, u128); +gen_fill_fn!(fill_f64, 64, 1, f64, float64x1_t, u64); +gen_fill_fn!(fillq_f64, 64, 2, f64, float64x2_t, u128); +gen_fill_fn!(fill_p64, 64, 1, u64, poly64x1_t, u64); +gen_fill_fn!(fillq_p64, 64, 2, u64, poly64x2_t, u128); + +gen_test_fn!( + test_ari_f64, + f64, + f64, + float64x1_t, + float64x1_t, + u64, + V_f64!(), + fill_f64, + fill_f64, + to64!(float64x1_t) +); +gen_test_fn!( + test_cmp_f64, + f64, + u64, + float64x1_t, + uint64x1_t, + u64, + V_f64!(), + fill_f64, + fill_u64, + to64!(uint64x1_t) +); +gen_test_fn!( + testq_ari_f64, + f64, + f64, + float64x2_t, + float64x2_t, + u128, + V_f64!(), + fillq_f64, + fillq_f64, + to128!(float64x2_t) +); +gen_test_fn!( + testq_cmp_f64, + f64, + u64, + float64x2_t, + uint64x2_t, + u128, + V_f64!(), + fillq_f64, + fillq_u64, + to128!(uint64x2_t) +); + +gen_test_fn!( + test_cmp_p64, + u64, + u64, + poly64x1_t, + uint64x1_t, + u64, + V_u64!(), + fill_p64, + fill_u64, + to64!(uint64x1_t) +); +gen_test_fn!( + testq_cmp_p64, + u64, + u64, + poly64x2_t, + uint64x2_t, + u128, + V_u64!(), + fillq_p64, + fillq_u64, + to128!(uint64x2_t) +); diff --git a/library/stdarch/crates/core_arch/src/aarch64/tme.rs b/library/stdarch/crates/core_arch/src/aarch64/tme.rs new file mode 100644 index 0000000000000..207633c1f8d34 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/aarch64/tme.rs @@ -0,0 +1,201 @@ +//! ARM's Transactional Memory Extensions (TME). +//! +//! This CPU feature is available on Aarch64 - A architecture profile. +//! This feature is in the non-neon feature set. TME specific vendor documentation can +//! be found [TME Intrinsics Introduction][tme_intrinsics_intro]. +//! +//! The reference is [ACLE Q4 2019][acle_q4_2019_ref]. +//! +//! 
ACLE has a section for TME extensions and state masks for aborts and failure codes. +//! [ARM A64 Architecture Register Datasheet][a_profile_future] also describes possible failure code scenarios. +//! +//! [acle_q4_2019_ref]: https://static.docs.arm.com/101028/0010/ACLE_2019Q4_release-0010.pdf +//! [tme_intrinsics_intro]: https://developer.arm.com/docs/101028/0010/transactional-memory-extension-tme-intrinsics +//! [llvm_aarch64_int]: https://github.com/llvm/llvm-project/commit/a36d31478c182903523e04eb271bbf102bfab2cc#diff-ff24e1c35f4d54f1110ce5d90c709319R626-R646 +//! [a_profile_future]: https://static.docs.arm.com/ddi0601/a/SysReg_xml_futureA-2019-04.pdf?_ga=2.116560387.441514988.1590524918-1110153136.1588469296 + +#[cfg(test)] +use stdarch_test::assert_instr; + +unsafe extern "unadjusted" { + #[link_name = "llvm.aarch64.tstart"] + fn aarch64_tstart() -> u64; + #[link_name = "llvm.aarch64.tcommit"] + fn aarch64_tcommit(); + #[link_name = "llvm.aarch64.tcancel"] + fn aarch64_tcancel(imm0: u64); + #[link_name = "llvm.aarch64.ttest"] + fn aarch64_ttest() -> u64; +} + +/// Transaction successfully started. +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub const _TMSTART_SUCCESS: u64 = 0x00_u64; + +/// Extraction mask for failure reason +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub const _TMFAILURE_REASON: u64 = 0x00007FFF_u64; + +/// Transaction retry is possible. +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub const _TMFAILURE_RTRY: u64 = 1 << 15; + +/// Transaction executed a TCANCEL instruction +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub const _TMFAILURE_CNCL: u64 = 1 << 16; + +/// Transaction aborted because a conflict occurred +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub const _TMFAILURE_MEM: u64 = 1 << 17; + +/// Fallback error type for any other reason +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub const _TMFAILURE_IMP: u64 = 1 << 18; + +/// Transaction aborted because a non-permissible operation was attempted +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub const _TMFAILURE_ERR: u64 = 1 << 19; + +/// Transaction aborted due to read or write set limit was exceeded +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub const _TMFAILURE_SIZE: u64 = 1 << 20; + +/// Transaction aborted due to transactional nesting level was exceeded +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub const _TMFAILURE_NEST: u64 = 1 << 21; + +/// Transaction aborted due to a debug trap. +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub const _TMFAILURE_DBG: u64 = 1 << 22; + +/// Transaction failed from interrupt +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub const _TMFAILURE_INT: u64 = 1 << 23; + +/// Indicates a TRIVIAL version of TM is available +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub const _TMFAILURE_TRIVIAL: u64 = 1 << 24; + +// NOTE: Tests for these instructions are disabled on MSVC as dumpbin doesn't +// understand these instructions. + +/// Starts a new transaction. When the transaction starts successfully the return value is 0. +/// If the transaction fails, all state modifications are discarded and a cause of the failure +/// is encoded in the return value. +/// +/// [ARM TME Intrinsics](https://developer.arm.com/docs/101028/0010/transactional-memory-extension-tme-intrinsics). 
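// Illustrative sketch, not part of the patch: the usual shape of a TME critical
// section is a start/commit pair inside a bounded retry loop. The retry policy and
// fallback path here are invented for the example; only the `__t*` intrinsics and
// `_TM*` masks come from this module (unstable `stdarch_aarch64_tme` feature, and
// the CPU must actually implement TME).
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "tme")]
unsafe fn try_transactional_increment(counter: &mut u64) -> bool {
    use core::arch::aarch64::{__tcommit, __tstart, _TMFAILURE_RTRY, _TMSTART_SUCCESS};

    for _ in 0..8 {
        let status = __tstart();
        if status == _TMSTART_SUCCESS {
            // Transactional region: either all of this commits or none of it does.
            *counter += 1;
            __tcommit();
            return true;
        }
        // Give up unless the hardware says a retry might succeed.
        if (status & _TMFAILURE_RTRY) == 0 {
            break;
        }
    }
    false // the caller would fall back to a lock or an atomic here
}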
+#[inline] +#[target_feature(enable = "tme")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(tstart))] +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub unsafe fn __tstart() -> u64 { + aarch64_tstart() +} + +/// Commits the current transaction. For a nested transaction, the only effect is that the +/// transactional nesting depth is decreased. For an outer transaction, the state modifications +/// performed transactionally are committed to the architectural state. +/// +/// [ARM TME Intrinsics](https://developer.arm.com/docs/101028/0010/transactional-memory-extension-tme-intrinsics). +#[inline] +#[target_feature(enable = "tme")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(tcommit))] +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub unsafe fn __tcommit() { + aarch64_tcommit() +} + +/// Cancels the current transaction and discards all state modifications that were performed transactionally. +/// +/// [ARM TME Intrinsics](https://developer.arm.com/docs/101028/0010/transactional-memory-extension-tme-intrinsics). +#[inline] +#[target_feature(enable = "tme")] +#[cfg_attr( + all(test, not(target_env = "msvc")), + assert_instr(tcancel, IMM16 = 0x0) +)] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub unsafe fn __tcancel() { + static_assert!(IMM16 <= 65535); + aarch64_tcancel(IMM16); +} + +/// Tests if executing inside a transaction. If no transaction is currently executing, +/// the return value is 0. Otherwise, this intrinsic returns the depth of the transaction. +/// +/// [ARM TME Intrinsics](https://developer.arm.com/docs/101028/0010/transactional-memory-extension-tme-intrinsics). +#[inline] +#[target_feature(enable = "tme")] +#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(ttest))] +#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] +pub unsafe fn __ttest() -> u64 { + aarch64_ttest() +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::aarch64::*; + + const CANCEL_CODE: u64 = (0 | (0x123 & _TMFAILURE_REASON) as u64) as u64; + + #[simd_test(enable = "tme")] + unsafe fn test_tstart() { + let mut x = 0; + for i in 0..10 { + let code = tme::__tstart(); + if code == _TMSTART_SUCCESS { + x += 1; + assert_eq!(x, i + 1); + break; + } + assert_eq!(x, 0); + } + } + + #[simd_test(enable = "tme")] + unsafe fn test_tcommit() { + let mut x = 0; + for i in 0..10 { + let code = tme::__tstart(); + if code == _TMSTART_SUCCESS { + x += 1; + assert_eq!(x, i + 1); + tme::__tcommit(); + } + assert_eq!(x, i + 1); + } + } + + #[simd_test(enable = "tme")] + unsafe fn test_tcancel() { + let mut x = 0; + + for i in 0..10 { + let code = tme::__tstart(); + if code == _TMSTART_SUCCESS { + x += 1; + assert_eq!(x, i + 1); + tme::__tcancel::(); + break; + } + } + + assert_eq!(x, 0); + } + + #[simd_test(enable = "tme")] + unsafe fn test_ttest() { + for _ in 0..10 { + let code = tme::__tstart(); + if code == _TMSTART_SUCCESS { + if tme::__ttest() == 2 { + tme::__tcancel::(); + break; + } + } + } + } +} diff --git a/library/stdarch/crates/core_arch/src/arm/dsp.rs b/library/stdarch/crates/core_arch/src/arm/dsp.rs new file mode 100644 index 0000000000000..22517e5929ad9 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm/dsp.rs @@ -0,0 +1,390 @@ +//! # References: +//! +//! - Section 8.3 "16-bit multiplications" +//! +//! Intrinsics that could live here: +//! +//! - \[x\] __smulbb +//! - \[x\] __smulbt +//! - \[x\] __smultb +//! 
- \[x\] __smultt +//! - \[x\] __smulwb +//! - \[x\] __smulwt +//! - \[x\] __qadd +//! - \[x\] __qsub +//! - \[x\] __qdbl +//! - \[x\] __smlabb +//! - \[x\] __smlabt +//! - \[x\] __smlatb +//! - \[x\] __smlatt +//! - \[x\] __smlawb +//! - \[x\] __smlawt + +#[cfg(test)] +use stdarch_test::assert_instr; + +unsafe extern "unadjusted" { + #[link_name = "llvm.arm.smulbb"] + fn arm_smulbb(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smulbt"] + fn arm_smulbt(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smultb"] + fn arm_smultb(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smultt"] + fn arm_smultt(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smulwb"] + fn arm_smulwb(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smulwt"] + fn arm_smulwt(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.qadd"] + fn arm_qadd(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.qsub"] + fn arm_qsub(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smlabb"] + fn arm_smlabb(a: i32, b: i32, c: i32) -> i32; + + #[link_name = "llvm.arm.smlabt"] + fn arm_smlabt(a: i32, b: i32, c: i32) -> i32; + + #[link_name = "llvm.arm.smlatb"] + fn arm_smlatb(a: i32, b: i32, c: i32) -> i32; + + #[link_name = "llvm.arm.smlatt"] + fn arm_smlatt(a: i32, b: i32, c: i32) -> i32; + + #[link_name = "llvm.arm.smlawb"] + fn arm_smlawb(a: i32, b: i32, c: i32) -> i32; + + #[link_name = "llvm.arm.smlawt"] + fn arm_smlawt(a: i32, b: i32, c: i32) -> i32; +} + +/// Insert a SMULBB instruction +/// +/// Returns the equivalent of a\[0\] * b\[0\] +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +#[inline] +#[cfg_attr(test, assert_instr(smulbb))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smulbb(a: i32, b: i32) -> i32 { + arm_smulbb(a, b) +} + +/// Insert a SMULTB instruction +/// +/// Returns the equivalent of a\[0\] * b\[1\] +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +#[inline] +#[cfg_attr(test, assert_instr(smultb))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smultb(a: i32, b: i32) -> i32 { + arm_smultb(a, b) +} + +/// Insert a SMULTB instruction +/// +/// Returns the equivalent of a\[1\] * b\[0\] +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +#[inline] +#[cfg_attr(test, assert_instr(smulbt))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smulbt(a: i32, b: i32) -> i32 { + arm_smulbt(a, b) +} + +/// Insert a SMULTT instruction +/// +/// Returns the equivalent of a\[1\] * b\[1\] +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +#[inline] +#[cfg_attr(test, assert_instr(smultt))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smultt(a: i32, b: i32) -> i32 { + arm_smultt(a, b) +} + +/// Insert a SMULWB instruction +/// +/// Multiplies the 32-bit signed first operand with the low halfword +/// (as a 16-bit signed integer) of the second operand. +/// Return the top 32 bits of the 48-bit product +#[inline] +#[cfg_attr(test, assert_instr(smulwb))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smulwb(a: i32, b: i32) -> i32 { + arm_smulwb(a, b) +} + +/// Insert a SMULWT instruction +/// +/// Multiplies the 32-bit signed first operand with the high halfword +/// (as a 16-bit signed integer) of the second operand. 
+/// Return the top 32 bits of the 48-bit product +#[inline] +#[cfg_attr(test, assert_instr(smulwt))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smulwt(a: i32, b: i32) -> i32 { + arm_smulwt(a, b) +} + +/// Signed saturating addition +/// +/// Returns the 32-bit saturating signed equivalent of a + b. +/// Sets the Q flag if saturation occurs. +#[inline] +#[cfg_attr(test, assert_instr(qadd))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __qadd(a: i32, b: i32) -> i32 { + arm_qadd(a, b) +} + +/// Signed saturating subtraction +/// +/// Returns the 32-bit saturating signed equivalent of a - b. +/// Sets the Q flag if saturation occurs. +#[inline] +#[cfg_attr(test, assert_instr(qsub))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __qsub(a: i32, b: i32) -> i32 { + arm_qsub(a, b) +} + +/// Insert a QADD instruction +/// +/// Returns the 32-bit saturating signed equivalent of a + a +/// Sets the Q flag if saturation occurs. +#[inline] +#[cfg_attr(test, assert_instr(qadd))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __qdbl(a: i32) -> i32 { + arm_qadd(a, a) +} + +/// Insert a SMLABB instruction +/// +/// Returns the equivalent of a\[0\] * b\[0\] + c +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +/// Sets the Q flag if overflow occurs on the addition. +#[inline] +#[cfg_attr(test, assert_instr(smlabb))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smlabb(a: i32, b: i32, c: i32) -> i32 { + arm_smlabb(a, b, c) +} + +/// Insert a SMLABT instruction +/// +/// Returns the equivalent of a\[0\] * b\[1\] + c +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +/// Sets the Q flag if overflow occurs on the addition. +#[inline] +#[cfg_attr(test, assert_instr(smlabt))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smlabt(a: i32, b: i32, c: i32) -> i32 { + arm_smlabt(a, b, c) +} + +/// Insert a SMLATB instruction +/// +/// Returns the equivalent of a\[1\] * b\[0\] + c +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +/// Sets the Q flag if overflow occurs on the addition. +#[inline] +#[cfg_attr(test, assert_instr(smlatb))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smlatb(a: i32, b: i32, c: i32) -> i32 { + arm_smlatb(a, b, c) +} + +/// Insert a SMLATT instruction +/// +/// Returns the equivalent of a\[1\] * b\[1\] + c +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +/// Sets the Q flag if overflow occurs on the addition. +#[inline] +#[cfg_attr(test, assert_instr(smlatt))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smlatt(a: i32, b: i32, c: i32) -> i32 { + arm_smlatt(a, b, c) +} + +/// Insert a SMLAWB instruction +/// +/// Returns the equivalent of (a * b\[0\] + (c << 16)) >> 16 +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +/// Sets the Q flag if overflow occurs on the addition. +#[inline] +#[cfg_attr(test, assert_instr(smlawb))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smlawb(a: i32, b: i32, c: i32) -> i32 { + arm_smlawb(a, b, c) +} + +/// Insert a SMLAWT instruction +/// +/// Returns the equivalent of (a * b\[1\] + (c << 16)) >> 16 +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +/// Sets the Q flag if overflow occurs on the addition. 
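To make the `(a * b[half] + (c << 16)) >> 16` formulas above concrete, here is a portable, host-side reference model of the SMLAWB/SMLAWT arithmetic. It is an illustrative sketch rather than the intrinsic itself, the helper name `smlaw_ref` is hypothetical, and it does not model the Q (saturation) flag.

```rust
// 32 x 16 -> 48-bit signed product; keep the top 32 bits, then accumulate.
// The real instruction wraps on accumulate overflow and sets the Q flag.
fn smlaw_ref(a: i32, b_half: i16, c: i32) -> i32 {
    let product = (a as i64) * (b_half as i64); // fits in 48 bits
    ((product >> 16) as i32).wrapping_add(c)
}

fn main() {
    let a: i32 = 10;
    let b: i32 = (40 << 16) | 30; // b[0] = 30 (low half), b[1] = 40 (high half)
    let c: i32 = 50;

    // __smlawb uses the low half of b, __smlawt the high half; both match the
    // documented formula because adding (c << 16) before the shift is the same
    // as adding c after it.
    let expected_b = (((a as i64) * 30 + ((c as i64) << 16)) >> 16) as i32;
    let expected_t = (((a as i64) * 40 + ((c as i64) << 16)) >> 16) as i32;
    assert_eq!(smlaw_ref(a, b as i16, c), expected_b);
    assert_eq!(smlaw_ref(a, (b >> 16) as i16, c), expected_t);
}
```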
+#[inline] +#[cfg_attr(test, assert_instr(smlawt))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smlawt(a: i32, b: i32, c: i32) -> i32 { + arm_smlawt(a, b, c) +} + +#[cfg(test)] +mod tests { + use crate::core_arch::{ + arm::*, + simd::{i8x4, i16x2, u8x4}, + }; + use std::mem::transmute; + use stdarch_test::simd_test; + + #[test] + fn smulbb() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + assert_eq!(super::__smulbb(transmute(a), transmute(b)), 10 * 30); + } + } + + #[test] + fn smulbt() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + assert_eq!(super::__smulbt(transmute(a), transmute(b)), 10 * 40); + } + } + + #[test] + fn smultb() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + assert_eq!(super::__smultb(transmute(a), transmute(b)), 20 * 30); + } + } + + #[test] + fn smultt() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + assert_eq!(super::__smultt(transmute(a), transmute(b)), 20 * 40); + } + } + + #[test] + fn smulwb() { + unsafe { + let a = i16x2::new(10, 20); + let b = 30; + assert_eq!(super::__smulwb(transmute(a), b), 20 * b); + } + } + + #[test] + fn smulwt() { + unsafe { + let a = i16x2::new(10, 20); + let b = 30; + assert_eq!(super::__smulwt(transmute(a), b), (10 * b) >> 16); + } + } + + #[test] + fn qadd() { + unsafe { + assert_eq!(super::__qadd(-10, 60), 50); + assert_eq!(super::__qadd(i32::MAX, 10), i32::MAX); + assert_eq!(super::__qadd(i32::MIN, -10), i32::MIN); + } + } + + #[test] + fn qsub() { + unsafe { + assert_eq!(super::__qsub(10, 60), -50); + assert_eq!(super::__qsub(i32::MAX, -10), i32::MAX); + assert_eq!(super::__qsub(i32::MIN, 10), i32::MIN); + } + } + + fn qdbl() { + unsafe { + assert_eq!(super::__qdbl(10), 20); + assert_eq!(super::__qdbl(i32::MAX), i32::MAX); + } + } + + fn smlabb() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + let c = 50; + let r = (10 * 30) + c; + assert_eq!(super::__smlabb(transmute(a), transmute(b), c), r); + } + } + + fn smlabt() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + let c = 50; + let r = (10 * 40) + c; + assert_eq!(super::__smlabt(transmute(a), transmute(b), c), r); + } + } + + fn smlatb() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + let c = 50; + let r = (20 * 30) + c; + assert_eq!(super::__smlabt(transmute(a), transmute(b), c), r); + } + } + + fn smlatt() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + let c = 50; + let r = (20 * 40) + c; + assert_eq!(super::__smlatt(transmute(a), transmute(b), c), r); + } + } + + fn smlawb() { + unsafe { + let a: i32 = 10; + let b = i16x2::new(30, 40); + let c: i32 = 50; + let r: i32 = ((a * 30) + (c << 16)) >> 16; + assert_eq!(super::__smlawb(a, transmute(b), c), r); + } + } + + fn smlawt() { + unsafe { + let a: i32 = 10; + let b = i16x2::new(30, 40); + let c: i32 = 50; + let r: i32 = ((a * 40) + (c << 16)) >> 16; + assert_eq!(super::__smlawt(a, transmute(b), c), r); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/arm/mod.rs b/library/stdarch/crates/core_arch/src/arm/mod.rs new file mode 100644 index 0000000000000..11d6e2df3ac04 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm/mod.rs @@ -0,0 +1,66 @@ +//! ARM intrinsics. +//! +//! The reference for NEON is [ARM's NEON Intrinsics Reference][arm_ref]. The +//! [ARM's NEON Intrinsics Online Database][arm_dat] is also useful. +//! +//! 
[arm_ref]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf +//! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics + +// Supported arches: 6, 7-M. See Section 10.1 of ACLE (e.g. SSAT) +#[cfg(any(target_feature = "v6", doc))] +mod sat; + +#[cfg(any(target_feature = "v6", doc))] +#[unstable(feature = "stdarch_arm_sat", issue = "none")] +pub use self::sat::*; + +// Supported arches: 5TE, 7E-M. See Section 10.1 of ACLE (e.g. QADD) +// We also include the A profile even though DSP is deprecated on that profile as of ACLE 2.0 (see +// section 5.4.7) +// Here we workaround the difference between LLVM's +dsp and ACLE's __ARM_FEATURE_DSP by gating on +// '+v5te' rather than on '+dsp' +#[cfg(any( + // >= v5TE but excludes v7-M + all(target_feature = "v5te", not(target_feature = "mclass")), + // v7E-M + all(target_feature = "mclass", target_feature = "dsp"), + doc, +))] +mod dsp; + +#[cfg(any( + // >= v5TE but excludes v7-M + all(target_feature = "v5te", not(target_feature = "mclass")), + // v7E-M + all(target_feature = "mclass", target_feature = "dsp"), + doc, +))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub use self::dsp::*; + +// Deprecated in ACLE 2.0 for the A profile but fully supported on the M and R profiles, says +// Section 5.4.9 of ACLE. We'll expose these for the A profile even if deprecated +#[cfg(any( + // v7-A, v7-R + all(target_feature = "v6", not(target_feature = "mclass")), + // v7E-M + all(target_feature = "mclass", target_feature = "dsp"), + doc, +))] +mod simd32; + +#[cfg(any( + // v7-A, v7-R + all(target_feature = "v6", not(target_feature = "mclass")), + // v7E-M + all(target_feature = "mclass", target_feature = "dsp"), + doc, +))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub use self::simd32::*; + +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub use crate::core_arch::arm_shared::*; + +#[cfg(test)] +use stdarch_test::assert_instr; diff --git a/library/stdarch/crates/core_arch/src/arm/neon.rs b/library/stdarch/crates/core_arch/src/arm/neon.rs new file mode 100644 index 0000000000000..90c358b5db7b3 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm/neon.rs @@ -0,0 +1,136 @@ +use crate::core_arch::arm_shared::neon::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.arm.neon.vbsl.v8i8"] + fn vbsl_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t; + #[link_name = "llvm.arm.neon.vbsl.v16i8"] + fn vbslq_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t; +} + +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7,aes")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_p64(a: poly64x1_t, b: poly64x1_t) -> poly64x1_t { + static_assert!(0 <= N && N <= 63); + transmute(vshiftins_v1i64( + transmute(a), + transmute(b), + int64x1_t::splat(N as i64), + )) +} + +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic 
unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7,aes")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + static_assert!(0 <= N && N <= 63); + transmute(vshiftins_v2i64( + transmute(a), + transmute(b), + int64x2_t::splat(N as i64), + )) +} + +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7,aes")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + static_assert!(0 <= N && N <= 63); + let a: poly64x2_t = simd_shuffle!(a, a, [0, 1]); + let b: poly64x2_t = simd_shuffle!(b, b, [0, 1]); + let ret_val: poly64x2_t = transmute(vshiftins_v2i64( + transmute(a), + transmute(b), + int64x2_t::splat(N as i64), + )); + simd_shuffle!(ret_val, ret_val, [0, 1]) +} + +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7,aes")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_p64(a: poly64x1_t, b: poly64x1_t) -> poly64x1_t { + static_assert!(1 <= N && N <= 64); + transmute(vshiftins_v1i64( + transmute(a), + transmute(b), + int64x1_t::splat(-N as i64), + )) +} + +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7,aes")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + static_assert!(1 <= N && N <= 64); + transmute(vshiftins_v2i64( + transmute(a), + transmute(b), + int64x2_t::splat(-N as i64), + )) +} + +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7,aes")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + static_assert!(1 <= N && N <= 64); + let a: poly64x2_t = simd_shuffle!(a, a, [0, 1]); + let b: 
poly64x2_t = simd_shuffle!(b, b, [0, 1]); + let ret_val: poly64x2_t = transmute(vshiftins_v2i64( + transmute(a), + transmute(b), + int64x2_t::splat(-N as i64), + )); + simd_shuffle!(ret_val, ret_val, [0, 1]) +} diff --git a/library/stdarch/crates/core_arch/src/arm/sat.rs b/library/stdarch/crates/core_arch/src/arm/sat.rs new file mode 100644 index 0000000000000..bd38f59e642df --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm/sat.rs @@ -0,0 +1,62 @@ +//! # References: +//! +//! - Section 8.4 "Saturating intrinsics" + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Saturates a 32-bit signed integer to a signed integer with a given +/// bit width. +#[unstable(feature = "stdarch_arm_sat", issue = "none")] +#[inline] +#[cfg_attr(test, assert_instr("ssat", WIDTH = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn __ssat(x: i32) -> i32 { + static_assert!(matches!(WIDTH, 1..=32)); + arm_ssat(x, WIDTH as i32) +} + +/// Saturates a 32-bit signed integer to an unsigned integer with a given +/// bit width. +#[unstable(feature = "stdarch_arm_sat", issue = "none")] +#[inline] +#[cfg_attr(test, assert_instr("usat", WIDTH = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn __usat(x: i32) -> u32 { + static_assert!(matches!(WIDTH, 1..=32)); + arm_usat(x, WIDTH as i32) +} + +unsafe extern "unadjusted" { + #[link_name = "llvm.arm.ssat"] + fn arm_ssat(x: i32, y: i32) -> i32; + + #[link_name = "llvm.arm.usat"] + fn arm_usat(x: i32, y: i32) -> u32; +} + +#[cfg(test)] +mod tests { + use super::*; + use stdarch_test::simd_test; + + #[test] + fn test_ssat() { + unsafe { + assert_eq!(__ssat::<8>(1), 1); + assert_eq!(__ssat::<8>(1000), 127); + assert_eq!(__ssat::<8>(-1), -1); + assert_eq!(__ssat::<8>(-1000), -128); + } + } + + #[test] + fn test_usat() { + unsafe { + assert_eq!(__usat::<8>(1), 1); + assert_eq!(__usat::<8>(1000), 255); + assert_eq!(__usat::<8>(-1), 0); + assert_eq!(__usat::<8>(-1000), 0); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/arm/simd32.rs b/library/stdarch/crates/core_arch/src/arm/simd32.rs new file mode 100644 index 0000000000000..2a9908ab2b96f --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm/simd32.rs @@ -0,0 +1,765 @@ +//! # References +//! +//! - Section 8.5 "32-bit SIMD intrinsics" of ACLE +//! +//! Intrinsics that could live here +//! +//! - \[x\] __sel +//! - \[ \] __ssat16 +//! - \[ \] __usat16 +//! - \[ \] __sxtab16 +//! - \[ \] __sxtb16 +//! - \[ \] __uxtab16 +//! - \[ \] __uxtb16 +//! - \[x\] __qadd8 +//! - \[x\] __qsub8 +//! - \[x\] __sadd8 +//! - \[x\] __shadd8 +//! - \[x\] __shsub8 +//! - \[x\] __ssub8 +//! - \[ \] __uadd8 +//! - \[ \] __uhadd8 +//! - \[ \] __uhsub8 +//! - \[ \] __uqadd8 +//! - \[ \] __uqsub8 +//! - \[x\] __usub8 +//! - \[x\] __usad8 +//! - \[x\] __usada8 +//! - \[x\] __qadd16 +//! - \[x\] __qasx +//! - \[x\] __qsax +//! - \[x\] __qsub16 +//! - \[x\] __sadd16 +//! - \[x\] __sasx +//! - \[x\] __shadd16 +//! - \[ \] __shasx +//! - \[ \] __shsax +//! - \[x\] __shsub16 +//! - \[ \] __ssax +//! - \[ \] __ssub16 +//! - \[ \] __uadd16 +//! - \[ \] __uasx +//! - \[ \] __uhadd16 +//! - \[ \] __uhasx +//! - \[ \] __uhsax +//! - \[ \] __uhsub16 +//! - \[ \] __uqadd16 +//! - \[ \] __uqasx +//! - \[x\] __uqsax +//! - \[ \] __uqsub16 +//! - \[ \] __usax +//! - \[ \] __usub16 +//! - \[x\] __smlad +//! - \[ \] __smladx +//! - \[ \] __smlald +//! - \[ \] __smlaldx +//! - \[x\] __smlsd +//! - \[ \] __smlsdx +//! - \[ \] __smlsld +//! - \[ \] __smlsldx +//! - \[x\] __smuad +//! - \[x\] __smuadx +//! - \[x\] __smusd +//! 
- \[x\] __smusdx + +#[cfg(test)] +use stdarch_test::assert_instr; + +use crate::mem::transmute; + +/// ARM-specific vector of four packed `i8` packed into a 32-bit integer. +#[allow(non_camel_case_types)] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub type int8x4_t = i32; + +/// ARM-specific vector of four packed `u8` packed into a 32-bit integer. +#[allow(non_camel_case_types)] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub type uint8x4_t = u32; + +/// ARM-specific vector of two packed `i16` packed into a 32-bit integer. +#[allow(non_camel_case_types)] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub type int16x2_t = i32; + +/// ARM-specific vector of two packed `u16` packed into a 32-bit integer. +#[allow(non_camel_case_types)] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub type uint16x2_t = u32; + +macro_rules! dsp_call { + ($name:expr, $a:expr, $b:expr) => { + transmute($name(transmute($a), transmute($b))) + }; +} + +unsafe extern "unadjusted" { + #[link_name = "llvm.arm.qadd8"] + fn arm_qadd8(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.qsub8"] + fn arm_qsub8(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.qsub16"] + fn arm_qsub16(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.qadd16"] + fn arm_qadd16(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.qasx"] + fn arm_qasx(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.qsax"] + fn arm_qsax(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.sadd16"] + fn arm_sadd16(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.sadd8"] + fn arm_sadd8(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smlad"] + fn arm_smlad(a: i32, b: i32, c: i32) -> i32; + + #[link_name = "llvm.arm.smlsd"] + fn arm_smlsd(a: i32, b: i32, c: i32) -> i32; + + #[link_name = "llvm.arm.sasx"] + fn arm_sasx(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.sel"] + fn arm_sel(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.shadd8"] + fn arm_shadd8(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.shadd16"] + fn arm_shadd16(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.shsub8"] + fn arm_shsub8(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.ssub8"] + fn arm_ssub8(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.usub8"] + fn arm_usub8(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.shsub16"] + fn arm_shsub16(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smuad"] + fn arm_smuad(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smuadx"] + fn arm_smuadx(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smusd"] + fn arm_smusd(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smusdx"] + fn arm_smusdx(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.usad8"] + fn arm_usad8(a: i32, b: i32) -> u32; +} + +/// Saturating four 8-bit integer additions +/// +/// Returns the 8-bit signed equivalent of +/// +/// res\[0\] = a\[0\] + b\[0\] +/// res\[1\] = a\[1\] + b\[1\] +/// res\[2\] = a\[2\] + b\[2\] +/// res\[3\] = a\[3\] + b\[3\] +#[inline] +#[cfg_attr(test, assert_instr(qadd8))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t { + dsp_call!(arm_qadd8, a, b) +} + +/// Saturating two 8-bit integer subtraction +/// +/// Returns the 8-bit signed equivalent of +/// +/// res\[0\] = a\[0\] - b\[0\] +/// res\[1\] = a\[1\] - b\[1\] +/// res\[2\] = a\[2\] - b\[2\] +/// res\[3\] = a\[3\] - b\[3\] +#[inline] +#[cfg_attr(test, assert_instr(qsub8))] +#[unstable(feature = "stdarch_arm_dsp", issue = 
"117237")] +pub unsafe fn __qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t { + dsp_call!(arm_qsub8, a, b) +} + +/// Saturating two 16-bit integer subtraction +/// +/// Returns the 16-bit signed equivalent of +/// +/// res\[0\] = a\[0\] - b\[0\] +/// res\[1\] = a\[1\] - b\[1\] +#[inline] +#[cfg_attr(test, assert_instr(qsub16))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t { + dsp_call!(arm_qsub16, a, b) +} + +/// Saturating two 16-bit integer additions +/// +/// Returns the 16-bit signed equivalent of +/// +/// res\[0\] = a\[0\] + b\[0\] +/// res\[1\] = a\[1\] + b\[1\] +#[inline] +#[cfg_attr(test, assert_instr(qadd16))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t { + dsp_call!(arm_qadd16, a, b) +} + +/// Returns the 16-bit signed saturated equivalent of +/// +/// res\[0\] = a\[0\] - b\[1\] +/// res\[1\] = a\[1\] + b\[0\] +#[inline] +#[cfg_attr(test, assert_instr(qasx))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t { + dsp_call!(arm_qasx, a, b) +} + +/// Returns the 16-bit signed saturated equivalent of +/// +/// res\[0\] = a\[0\] + b\[1\] +/// res\[1\] = a\[1\] - b\[0\] +#[inline] +#[cfg_attr(test, assert_instr(qsax))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t { + dsp_call!(arm_qsax, a, b) +} + +/// Returns the 16-bit signed saturated equivalent of +/// +/// res\[0\] = a\[0\] + b\[1\] +/// res\[1\] = a\[1\] + b\[0\] +/// +/// and the GE bits of the APSR are set. +#[inline] +#[cfg_attr(test, assert_instr(sadd16))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t { + dsp_call!(arm_sadd16, a, b) +} + +/// Returns the 8-bit signed saturated equivalent of +/// +/// res\[0\] = a\[0\] + b\[1\] +/// res\[1\] = a\[1\] + b\[0\] +/// res\[2\] = a\[2\] + b\[2\] +/// res\[3\] = a\[3\] + b\[3\] +/// +/// and the GE bits of the APSR are set. +#[inline] +#[cfg_attr(test, assert_instr(sadd8))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t { + dsp_call!(arm_sadd8, a, b) +} + +/// Dual 16-bit Signed Multiply with Addition of products +/// and 32-bit accumulation. +/// +/// Returns the 16-bit signed equivalent of +/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\] + c +#[inline] +#[cfg_attr(test, assert_instr(smlad))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smlad(a: int16x2_t, b: int16x2_t, c: i32) -> i32 { + arm_smlad(transmute(a), transmute(b), c) +} + +/// Dual 16-bit Signed Multiply with Subtraction of products +/// and 32-bit accumulation and overflow detection. +/// +/// Returns the 16-bit signed equivalent of +/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\] + c +#[inline] +#[cfg_attr(test, assert_instr(smlsd))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smlsd(a: int16x2_t, b: int16x2_t, c: i32) -> i32 { + arm_smlsd(transmute(a), transmute(b), c) +} + +/// Returns the 16-bit signed equivalent of +/// +/// res\[0\] = a\[0\] - b\[1\] +/// res\[1\] = a\[1\] + b\[0\] +/// +/// and the GE bits of the APSR are set. 
+#[inline] +#[cfg_attr(test, assert_instr(sasx))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t { + dsp_call!(arm_sasx, a, b) +} + +/// Select bytes from each operand according to APSR GE flags +/// +/// Returns the equivalent of +/// +/// res\[0\] = GE\[0\] ? a\[0\] : b\[0\] +/// res\[1\] = GE\[1\] ? a\[1\] : b\[1\] +/// res\[2\] = GE\[2\] ? a\[2\] : b\[2\] +/// res\[3\] = GE\[3\] ? a\[3\] : b\[3\] +/// +/// where GE are bits of APSR +#[inline] +#[cfg_attr(test, assert_instr(sel))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __sel(a: int8x4_t, b: int8x4_t) -> int8x4_t { + dsp_call!(arm_sel, a, b) +} + +/// Signed halving parallel byte-wise addition. +/// +/// Returns the 8-bit signed equivalent of +/// +/// res\[0\] = (a\[0\] + b\[0\]) / 2 +/// res\[1\] = (a\[1\] + b\[1\]) / 2 +/// res\[2\] = (a\[2\] + b\[2\]) / 2 +/// res\[3\] = (a\[3\] + b\[3\]) / 2 +#[inline] +#[cfg_attr(test, assert_instr(shadd8))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __shadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t { + dsp_call!(arm_shadd8, a, b) +} + +/// Signed halving parallel halfword-wise addition. +/// +/// Returns the 16-bit signed equivalent of +/// +/// res\[0\] = (a\[0\] + b\[0\]) / 2 +/// res\[1\] = (a\[1\] + b\[1\]) / 2 +#[inline] +#[cfg_attr(test, assert_instr(shadd16))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __shadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t { + dsp_call!(arm_shadd16, a, b) +} + +/// Signed halving parallel byte-wise subtraction. +/// +/// Returns the 8-bit signed equivalent of +/// +/// res\[0\] = (a\[0\] - b\[0\]) / 2 +/// res\[1\] = (a\[1\] - b\[1\]) / 2 +/// res\[2\] = (a\[2\] - b\[2\]) / 2 +/// res\[3\] = (a\[3\] - b\[3\]) / 2 +#[inline] +#[cfg_attr(test, assert_instr(shsub8))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __shsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t { + dsp_call!(arm_shsub8, a, b) +} + +/// Inserts a `USUB8` instruction. +/// +/// Returns the 8-bit unsigned equivalent of +/// +/// res\[0\] = a\[0\] - a\[0\] +/// res\[1\] = a\[1\] - a\[1\] +/// res\[2\] = a\[2\] - a\[2\] +/// res\[3\] = a\[3\] - a\[3\] +/// +/// where \[0\] is the lower 8 bits and \[3\] is the upper 8 bits. +/// The GE bits of the APSR are set. +#[inline] +#[cfg_attr(test, assert_instr(usub8))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __usub8(a: uint8x4_t, b: uint8x4_t) -> uint8x4_t { + dsp_call!(arm_usub8, a, b) +} + +/// Inserts a `SSUB8` instruction. +/// +/// Returns the 8-bit signed equivalent of +/// +/// res\[0\] = a\[0\] - a\[0\] +/// res\[1\] = a\[1\] - a\[1\] +/// res\[2\] = a\[2\] - a\[2\] +/// res\[3\] = a\[3\] - a\[3\] +/// +/// where \[0\] is the lower 8 bits and \[3\] is the upper 8 bits. +/// The GE bits of the APSR are set. +#[inline] +#[cfg_attr(test, assert_instr(ssub8))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __ssub8(a: int8x4_t, b: int8x4_t) -> int8x4_t { + dsp_call!(arm_ssub8, a, b) +} + +/// Signed halving parallel halfword-wise subtraction. 
+/// +/// Returns the 16-bit signed equivalent of +/// +/// res\[0\] = (a\[0\] - b\[0\]) / 2 +/// res\[1\] = (a\[1\] - b\[1\]) / 2 +#[inline] +#[cfg_attr(test, assert_instr(shsub16))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __shsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t { + dsp_call!(arm_shsub16, a, b) +} + +/// Signed Dual Multiply Add. +/// +/// Returns the equivalent of +/// +/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\] +/// +/// and sets the Q flag if overflow occurs on the addition. +#[inline] +#[cfg_attr(test, assert_instr(smuad))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smuad(a: int16x2_t, b: int16x2_t) -> i32 { + arm_smuad(transmute(a), transmute(b)) +} + +/// Signed Dual Multiply Add Reversed. +/// +/// Returns the equivalent of +/// +/// res = a\[0\] * b\[1\] + a\[1\] * b\[0\] +/// +/// and sets the Q flag if overflow occurs on the addition. +#[inline] +#[cfg_attr(test, assert_instr(smuadx))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smuadx(a: int16x2_t, b: int16x2_t) -> i32 { + arm_smuadx(transmute(a), transmute(b)) +} + +/// Signed Dual Multiply Subtract. +/// +/// Returns the equivalent of +/// +/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\] +/// +/// and sets the Q flag if overflow occurs on the addition. +#[inline] +#[cfg_attr(test, assert_instr(smusd))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smusd(a: int16x2_t, b: int16x2_t) -> i32 { + arm_smusd(transmute(a), transmute(b)) +} + +/// Signed Dual Multiply Subtract Reversed. +/// +/// Returns the equivalent of +/// +/// res = a\[0\] * b\[1\] - a\[1\] * b\[0\] +/// +/// and sets the Q flag if overflow occurs on the addition. +#[inline] +#[cfg_attr(test, assert_instr(smusdx))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __smusdx(a: int16x2_t, b: int16x2_t) -> i32 { + arm_smusdx(transmute(a), transmute(b)) +} + +/// Sum of 8-bit absolute differences. +/// +/// Returns the 8-bit unsigned equivalent of +/// +/// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\ +/// (a\[2\] - b\[2\]) + (a\[3\] - b\[3\]) +#[inline] +#[cfg_attr(test, assert_instr(usad8))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __usad8(a: int8x4_t, b: int8x4_t) -> u32 { + arm_usad8(transmute(a), transmute(b)) +} + +/// Sum of 8-bit absolute differences and constant. 
+/// +/// Returns the 8-bit unsigned equivalent of +/// +/// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\ +/// (a\[2\] - b\[2\]) + (a\[3\] - b\[3\]) + c +#[inline] +#[cfg_attr(test, assert_instr(usad8))] +#[unstable(feature = "stdarch_arm_dsp", issue = "117237")] +pub unsafe fn __usada8(a: int8x4_t, b: int8x4_t, c: u32) -> u32 { + __usad8(a, b) + c +} + +#[cfg(test)] +mod tests { + use crate::core_arch::simd::{i8x4, i16x2, u8x4}; + use std::mem::transmute; + use stdarch_test::simd_test; + + #[test] + fn qadd8() { + unsafe { + let a = i8x4::new(1, 2, 3, i8::MAX); + let b = i8x4::new(2, -1, 0, 1); + let c = i8x4::new(3, 1, 3, i8::MAX); + let r: i8x4 = dsp_call!(super::__qadd8, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn qsub8() { + unsafe { + let a = i8x4::new(1, 2, 3, i8::MIN); + let b = i8x4::new(2, -1, 0, 1); + let c = i8x4::new(-1, 3, 3, i8::MIN); + let r: i8x4 = dsp_call!(super::__qsub8, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn qadd16() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(2, -1); + let c = i16x2::new(3, 1); + let r: i16x2 = dsp_call!(super::__qadd16, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn qsub16() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(20, -10); + let c = i16x2::new(-10, 30); + let r: i16x2 = dsp_call!(super::__qsub16, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn qasx() { + unsafe { + let a = i16x2::new(1, i16::MAX); + let b = i16x2::new(2, 2); + let c = i16x2::new(-1, i16::MAX); + let r: i16x2 = dsp_call!(super::__qasx, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn qsax() { + unsafe { + let a = i16x2::new(1, i16::MAX); + let b = i16x2::new(2, 2); + let c = i16x2::new(3, i16::MAX - 2); + let r: i16x2 = dsp_call!(super::__qsax, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn sadd16() { + unsafe { + let a = i16x2::new(1, i16::MAX); + let b = i16x2::new(2, 2); + let c = i16x2::new(3, -i16::MAX); + let r: i16x2 = dsp_call!(super::__sadd16, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn sadd8() { + unsafe { + let a = i8x4::new(1, 2, 3, i8::MAX); + let b = i8x4::new(4, 3, 2, 2); + let c = i8x4::new(5, 5, 5, -i8::MAX); + let r: i8x4 = dsp_call!(super::__sadd8, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn sasx() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(2, 1); + let c = i16x2::new(0, 4); + let r: i16x2 = dsp_call!(super::__sasx, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn smlad() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(3, 4); + let r = super::__smlad(transmute(a), transmute(b), 10); + assert_eq!(r, (1 * 3) + (2 * 4) + 10); + } + } + + #[test] + fn smlsd() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(3, 4); + let r = super::__smlsd(transmute(a), transmute(b), 10); + assert_eq!(r, ((1 * 3) - (2 * 4)) + 10); + } + } + + #[test] + fn sel() { + unsafe { + let a = i8x4::new(1, 2, 3, i8::MAX); + let b = i8x4::new(4, 3, 2, 2); + // call sadd8() to set GE bits + super::__sadd8(transmute(a), transmute(b)); + let c = i8x4::new(1, 2, 3, i8::MAX); + let r: i8x4 = dsp_call!(super::__sel, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn shadd8() { + unsafe { + let a = i8x4::new(1, 2, 3, 4); + let b = i8x4::new(5, 4, 3, 2); + let c = i8x4::new(3, 3, 3, 3); + let r: i8x4 = dsp_call!(super::__shadd8, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn shadd16() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(5, 4); + let c = i16x2::new(3, 3); + let r: i16x2 = dsp_call!(super::__shadd16, a, b); + assert_eq!(r, 
c); + } + } + + #[test] + fn shsub8() { + unsafe { + let a = i8x4::new(1, 2, 3, 4); + let b = i8x4::new(5, 4, 3, 2); + let c = i8x4::new(-2, -1, 0, 1); + let r: i8x4 = dsp_call!(super::__shsub8, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn ssub8() { + unsafe { + let a = i8x4::new(1, 2, 3, 4); + let b = i8x4::new(5, 4, 3, 2); + let c = i8x4::new(-4, -2, 0, 2); + let r: i8x4 = dsp_call!(super::__ssub8, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn usub8() { + unsafe { + let a = u8x4::new(1, 2, 3, 4); + let b = u8x4::new(5, 4, 3, 2); + let c = u8x4::new(252, 254, 0, 2); + let r: u8x4 = dsp_call!(super::__usub8, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn shsub16() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(5, 4); + let c = i16x2::new(-2, -1); + let r: i16x2 = dsp_call!(super::__shsub16, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn smuad() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(5, 4); + let r = super::__smuad(transmute(a), transmute(b)); + assert_eq!(r, 13); + } + } + + #[test] + fn smuadx() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(5, 4); + let r = super::__smuadx(transmute(a), transmute(b)); + assert_eq!(r, 14); + } + } + + #[test] + fn smusd() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(5, 4); + let r = super::__smusd(transmute(a), transmute(b)); + assert_eq!(r, -3); + } + } + + #[test] + fn smusdx() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(5, 4); + let r = super::__smusdx(transmute(a), transmute(b)); + assert_eq!(r, -6); + } + } + + #[test] + fn usad8() { + unsafe { + let a = i8x4::new(1, 2, 3, 4); + let b = i8x4::new(4, 3, 2, 1); + let r = super::__usad8(transmute(a), transmute(b)); + assert_eq!(r, 8); + } + } + + #[test] + fn usad8a() { + unsafe { + let a = i8x4::new(1, 2, 3, 4); + let b = i8x4::new(4, 3, 2, 1); + let c = 10; + let r = super::__usada8(transmute(a), transmute(b), c); + assert_eq!(r, 8 + c); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/arm_shared/barrier/common.rs b/library/stdarch/crates/core_arch/src/arm_shared/barrier/common.rs new file mode 100644 index 0000000000000..476a07ffaef96 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm_shared/barrier/common.rs @@ -0,0 +1,16 @@ +//! 
Access types available on all architectures + +/// Full system is the required shareability domain, reads and writes are the +/// required access types +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub struct SY; + +dmb_dsb!(SY); + +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +impl super::super::sealed::Isb for SY { + #[inline(always)] + unsafe fn __isb(&self) { + super::isb(super::arg::SY) + } +} diff --git a/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs b/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs new file mode 100644 index 0000000000000..ae9ce3c005cd3 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs @@ -0,0 +1,45 @@ +// Reference: ARM11 MPCore Processor Technical Reference Manual (ARM DDI 0360E) Section 3.5 "Summary +// of CP15 instructions" + +use crate::arch::asm; + +/// Full system is the required shareability domain, reads and writes are the +/// required access types +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub struct SY; + +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +impl super::super::sealed::Dmb for SY { + #[inline(always)] + unsafe fn __dmb(&self) { + asm!( + "mcr p15, 0, {}, c7, c10, 5", + in(reg) 0_u32, + options(preserves_flags, nostack) + ) + } +} + +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +impl super::super::sealed::Dsb for SY { + #[inline(always)] + unsafe fn __dsb(&self) { + asm!( + "mcr p15, 0, {}, c7, c10, 4", + in(reg) 0_u32, + options(preserves_flags, nostack) + ) + } +} + +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +impl super::super::sealed::Isb for SY { + #[inline(always)] + unsafe fn __isb(&self) { + asm!( + "mcr p15, 0, {}, c7, c5, 4", + in(reg) 0_u32, + options(preserves_flags, nostack) + ) + } +} diff --git a/library/stdarch/crates/core_arch/src/arm_shared/barrier/mod.rs b/library/stdarch/crates/core_arch/src/arm_shared/barrier/mod.rs new file mode 100644 index 0000000000000..e198b63521feb --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm_shared/barrier/mod.rs @@ -0,0 +1,185 @@ +// Reference: Section 7.4 "Hints" of ACLE + +// CP15 instruction +#[cfg(not(any( + // v8 + target_arch = "aarch64", + target_arch = "arm64ec", + // v7 + target_feature = "v7", + // v6-M + target_feature = "mclass" +)))] +mod cp15; + +#[cfg(not(any( + target_arch = "aarch64", + target_arch = "arm64ec", + target_feature = "v7", + target_feature = "mclass" +)))] +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub use self::cp15::*; + +// Dedicated instructions +#[cfg(any( + target_arch = "aarch64", + target_arch = "arm64ec", + target_feature = "v7", + target_feature = "mclass" +))] +macro_rules! 
dmb_dsb { + ($A:ident) => { + #[unstable(feature = "stdarch_arm_barrier", issue = "117219")] + impl super::super::sealed::Dmb for $A { + #[inline(always)] + unsafe fn __dmb(&self) { + super::dmb(super::arg::$A) + } + } + + #[unstable(feature = "stdarch_arm_barrier", issue = "117219")] + impl super::super::sealed::Dsb for $A { + #[inline(always)] + unsafe fn __dsb(&self) { + super::dsb(super::arg::$A) + } + } + }; +} + +#[cfg(any( + target_arch = "aarch64", + target_arch = "arm64ec", + target_feature = "v7", + target_feature = "mclass" +))] +mod common; + +#[cfg(any( + target_arch = "aarch64", + target_arch = "arm64ec", + target_feature = "v7", + target_feature = "mclass" +))] +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub use self::common::*; + +#[cfg(any( + target_arch = "aarch64", + target_arch = "arm64ec", + target_feature = "v7", +))] +mod not_mclass; + +#[cfg(any( + target_arch = "aarch64", + target_arch = "arm64ec", + target_feature = "v7", +))] +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub use self::not_mclass::*; + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +mod v8; + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub use self::v8::*; + +/// Generates a DMB (data memory barrier) instruction or equivalent CP15 instruction. +/// +/// DMB ensures the observed ordering of memory accesses. Memory accesses of the specified type +/// issued before the DMB are guaranteed to be observed (in the specified scope) before memory +/// accesses issued after the DMB. +/// +/// For example, DMB should be used between storing data, and updating a flag variable that makes +/// that data available to another core. +/// +/// The __dmb() intrinsic also acts as a compiler memory barrier of the appropriate type. +#[inline(always)] +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub unsafe fn __dmb(arg: A) +where + A: super::sealed::Dmb, +{ + arg.__dmb() +} + +/// Generates a DSB (data synchronization barrier) instruction or equivalent CP15 instruction. +/// +/// DSB ensures the completion of memory accesses. A DSB behaves as the equivalent DMB and has +/// additional properties. After a DSB instruction completes, all memory accesses of the specified +/// type issued before the DSB are guaranteed to have completed. +/// +/// The __dsb() intrinsic also acts as a compiler memory barrier of the appropriate type. +#[inline(always)] +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub unsafe fn __dsb(arg: A) +where + A: super::sealed::Dsb, +{ + arg.__dsb() +} + +/// Generates an ISB (instruction synchronization barrier) instruction or equivalent CP15 +/// instruction. +/// +/// This instruction flushes the processor pipeline fetch buffers, so that following instructions +/// are fetched from cache or memory. +/// +/// An ISB is needed after some system maintenance operations. An ISB is also needed before +/// transferring control to code that has been loaded or modified in memory, for example by an +/// overlay mechanism or just-in-time code generator. (Note that if instruction and data caches are +/// separate, privileged cache maintenance operations would be needed in order to unify the caches.) +/// +/// The only supported argument for the __isb() intrinsic is 15, corresponding to the SY (full +/// system) scope of the ISB instruction. 
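As a hedged usage sketch of the barrier intrinsics documented above (not part of the upstream file): the classic "publish data, then flag" pattern uses `__dmb`, and patching executable memory is followed by `__dsb` and `__isb`. It assumes a nightly toolchain with the unstable `stdarch_arm_barrier` feature on an AArch64 target; real self-modifying code also needs the cache maintenance mentioned in the `__isb` documentation.

```rust
#![feature(stdarch_arm_barrier)]

#[cfg(target_arch = "aarch64")]
unsafe fn publish(data: *mut u32, ready: *mut u32) {
    use std::arch::aarch64::{SY, __dmb};

    data.write_volatile(42);
    // Ensure the data store is observable by other cores before the flag store.
    __dmb(SY);
    ready.write_volatile(1);
}

#[cfg(target_arch = "aarch64")]
unsafe fn finish_code_patch() {
    use std::arch::aarch64::{SY, __dsb, __isb};

    // After writing new instructions to memory: complete the writes, then
    // flush the fetch pipeline so the new instructions are picked up.
    __dsb(SY);
    __isb(SY);
}
```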
+#[inline(always)] +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub unsafe fn __isb(arg: A) +where + A: super::sealed::Isb, +{ + arg.__isb() +} + +unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.dmb" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.dmb")] + fn dmb(_: i32); + + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.dsb" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.dsb")] + fn dsb(_: i32); + + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.isb" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.isb")] + fn isb(_: i32); +} + +// we put these in a module to prevent weirdness with glob re-exports +mod arg { + // See Section 7.3 Memory barriers of ACLE + pub const SY: i32 = 15; + pub const ST: i32 = 14; + pub const LD: i32 = 13; + pub const ISH: i32 = 11; + pub const ISHST: i32 = 10; + pub const ISHLD: i32 = 9; + pub const NSH: i32 = 7; + pub const NSHST: i32 = 6; + pub const NSHLD: i32 = 5; + pub const OSH: i32 = 3; + pub const OSHST: i32 = 2; + pub const OSHLD: i32 = 1; +} diff --git a/library/stdarch/crates/core_arch/src/arm_shared/barrier/not_mclass.rs b/library/stdarch/crates/core_arch/src/arm_shared/barrier/not_mclass.rs new file mode 100644 index 0000000000000..3b941b2715efa --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm_shared/barrier/not_mclass.rs @@ -0,0 +1,50 @@ +//! Access types available on v7 and v8 but not on v7(E)-M or v8-M + +/// Full system is the required shareability domain, writes are the required +/// access type +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub struct ST; + +dmb_dsb!(ST); + +/// Inner Shareable is the required shareability domain, reads and writes are +/// the required access types +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub struct ISH; + +dmb_dsb!(ISH); + +/// Inner Shareable is the required shareability domain, writes are the required +/// access type +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub struct ISHST; + +dmb_dsb!(ISHST); + +/// Non-shareable is the required shareability domain, reads and writes are the +/// required access types +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub struct NSH; + +dmb_dsb!(NSH); + +/// Non-shareable is the required shareability domain, writes are the required +/// access type +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub struct NSHST; + +dmb_dsb!(NSHST); + +/// Outer Shareable is the required shareability domain, reads and writes are +/// the required access types +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub struct OSH; + +dmb_dsb!(OSH); + +/// Outer Shareable is the required shareability domain, writes are the required +/// access type +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub struct OSHST; + +dmb_dsb!(OSHST); diff --git a/library/stdarch/crates/core_arch/src/arm_shared/barrier/v8.rs b/library/stdarch/crates/core_arch/src/arm_shared/barrier/v8.rs new file mode 100644 index 0000000000000..5bf757f9f779d --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm_shared/barrier/v8.rs @@ -0,0 +1,27 @@ +/// Full system is the required shareability domain, reads are the required +/// access type +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub struct LD; + +dmb_dsb!(LD); + +/// Inner 
Shareable is the required shareability domain, reads are the required +/// access type +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub struct ISHLD; + +dmb_dsb!(ISHLD); + +/// Non-shareable is the required shareability domain, reads are the required +/// access type +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub struct NSHLD; + +dmb_dsb!(NSHLD); + +/// Outer Shareable is the required shareability domain, reads are the required +/// access type +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub struct OSHLD; + +dmb_dsb!(OSHLD); diff --git a/library/stdarch/crates/core_arch/src/arm_shared/hints.rs b/library/stdarch/crates/core_arch/src/arm_shared/hints.rs new file mode 100644 index 0000000000000..54fd78270abda --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm_shared/hints.rs @@ -0,0 +1,125 @@ +// # References +// +// - Section 7.4 "Hints" of ACLE +// - Section 7.7 "NOP" of ACLE + +/// Generates a WFI (wait for interrupt) hint instruction, or nothing. +/// +/// The WFI instruction allows (but does not require) the processor to enter a +/// low-power state until one of a number of asynchronous events occurs. +// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M +// LLVM says "instruction requires: armv6k" +#[cfg(any( + target_feature = "v6", + target_arch = "aarch64", + target_arch = "arm64ec", + doc +))] +#[inline(always)] +#[unstable(feature = "stdarch_arm_hints", issue = "117218")] +pub unsafe fn __wfi() { + hint(HINT_WFI); +} + +/// Generates a WFE (wait for event) hint instruction, or nothing. +/// +/// The WFE instruction allows (but does not require) the processor to enter a +/// low-power state until some event occurs such as a SEV being issued by +/// another processor. +// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M +// LLVM says "instruction requires: armv6k" +#[cfg(any( + target_feature = "v6", + target_arch = "aarch64", + target_arch = "arm64ec", + doc +))] +#[inline(always)] +#[unstable(feature = "stdarch_arm_hints", issue = "117218")] +pub unsafe fn __wfe() { + hint(HINT_WFE); +} + +/// Generates a SEV (send a global event) hint instruction. +/// +/// This causes an event to be signaled to all processors in a multiprocessor +/// system. It is a NOP on a uniprocessor system. +// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M, 7-M +// LLVM says "instruction requires: armv6k" +#[cfg(any( + target_feature = "v6", + target_arch = "aarch64", + target_arch = "arm64ec", + doc +))] +#[inline(always)] +#[unstable(feature = "stdarch_arm_hints", issue = "117218")] +pub unsafe fn __sev() { + hint(HINT_SEV); +} + +/// Generates a send a local event hint instruction. +/// +/// This causes an event to be signaled to only the processor executing this +/// instruction. In a multiprocessor system, it is not required to affect the +/// other processors. +// LLVM says "instruction requires: armv8" +#[cfg(any( + target_feature = "v8", // 32-bit ARMv8 + target_arch = "aarch64", // AArch64 + target_arch = "arm64ec", // Arm64EC + doc, +))] +#[inline(always)] +#[unstable(feature = "stdarch_arm_hints", issue = "117218")] +pub unsafe fn __sevl() { + hint(HINT_SEVL); +} + +/// Generates a YIELD hint instruction. +/// +/// This enables multithreading software to indicate to the hardware that it is +/// performing a task, for example a spin-lock, that could be swapped out to +/// improve overall system performance. 
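A short sketch of the spin-wait use case mentioned above, assuming a nightly toolchain with the unstable `stdarch_arm_hints` feature on an AArch64 target; on stable Rust, `core::hint::spin_loop()` emits the equivalent hint where available.

```rust
#![feature(stdarch_arm_hints)]

use std::sync::atomic::{AtomicBool, Ordering};

#[cfg(target_arch = "aarch64")]
fn spin_until_set(flag: &AtomicBool) {
    use std::arch::aarch64::__yield;

    while !flag.load(Ordering::Acquire) {
        // Hint to the hardware that this thread is busy-waiting, so a
        // sibling thread or the other SMT context can make progress.
        unsafe { __yield() };
    }
}
```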
+// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M +// LLVM says "instruction requires: armv6k" +#[cfg(any( + target_feature = "v6", + target_arch = "aarch64", + target_arch = "arm64ec", + doc +))] +#[inline(always)] +#[unstable(feature = "stdarch_arm_hints", issue = "117218")] +pub unsafe fn __yield() { + hint(HINT_YIELD); +} + +/// Generates an unspecified no-op instruction. +/// +/// Note that not all architectures provide a distinguished NOP instruction. On +/// those that do, it is unspecified whether this intrinsic generates it or +/// another instruction. It is not guaranteed that inserting this instruction +/// will increase execution time. +#[inline(always)] +#[unstable(feature = "stdarch_arm_hints", issue = "117218")] +pub unsafe fn __nop() { + crate::arch::asm!("nop", options(nomem, nostack, preserves_flags)); +} + +unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.hint" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.hint")] + fn hint(_: i32); +} + +// from LLVM 7.0.1's lib/Target/ARM/{ARMInstrThumb,ARMInstrInfo,ARMInstrThumb2}.td +const HINT_NOP: i32 = 0; +const HINT_YIELD: i32 = 1; +const HINT_WFE: i32 = 2; +const HINT_WFI: i32 = 3; +const HINT_SEV: i32 = 4; +const HINT_SEVL: i32 = 5; diff --git a/library/stdarch/crates/core_arch/src/arm_shared/mod.rs b/library/stdarch/crates/core_arch/src/arm_shared/mod.rs new file mode 100644 index 0000000000000..527b53de99d95 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm_shared/mod.rs @@ -0,0 +1,117 @@ +//! ARM C Language Extensions (ACLE) +//! +//! # Developer notes +//! +//! Below is a list of built-in targets that are representative of the different ARM +//! architectures; the list includes the `target_feature`s they possess. +//! +//! - `armv4t-unknown-linux-gnueabi` - **ARMv4** - `+v4t` +//! - `armv5te-unknown-linux-gnueabi` - **ARMv5TE** - `+v4t +v5te` +//! - `arm-unknown-linux-gnueabi` - **ARMv6** - `+v4t +v5te +v6` +//! - `thumbv6m-none-eabi` - **ARMv6-M** - `+v4t +v5te +v6 +thumb-mode +mclass` +//! - `armv7-unknown-linux-gnueabihf` - **ARMv7-A** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +dsp +thumb2 +aclass` +//! - `armv7r-none-eabi` - **ARMv7-R** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +dsp +thumb2 +rclass` +//! - `thumbv7m-none-eabi` - **ARMv7-M** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +thumb2 +thumb-mode +mclass` +//! - `thumbv7em-none-eabi` - **ARMv7E-M** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +dsp +thumb2 +thumb-mode +mclass` +//! - `thumbv8m.main-none-eabi` - **ARMv8-M** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +thumb2 +thumb-mode +mclass` +//! - `armv8r-none-eabi` - **ARMv8-R** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +v8 +thumb2 +rclass` +//! - `aarch64-unknown-linux-gnu` - **ARMv8-A (AArch64)** - `+fp +neon` +//! +//! Section 10.1 of ACLE says: +//! +//! - "In the sequence of Arm architectures { v5, v5TE, v6, v6T2, v7 } each architecture includes +//! its predecessor instruction set." +//! +//! - "In the sequence of Thumb-only architectures { v6-M, v7-M, v7E-M } each architecture includes +//! its predecessor instruction set." +//! +//! From that info and from looking at how LLVM features work (using custom targets) we can identify +//! features that are subsets of others: +//! +//! Legend: `a < b` reads as "`a` is a subset of `b`"; this means that if `b` is enabled then `a` is +//! enabled as well. +//! +//! - `v4t < v5te < v6 < v6k < v6t2 < v7 < v8` +//! - `v6 < v8m < v6t2` +//! - `v7 < v8m.main` +//! +//! 
*NOTE*: Section 5.4.7 of ACLE says: +//! +//! - "__ARM_FEATURE_DSP is defined to 1 if the DSP (v5E) instructions are supported and the +//! intrinsics defined in Saturating intrinsics are available." +//! +//! This does *not* match how LLVM uses the '+dsp' feature; this feature is not set for v5te +//! targets so we have to work around this difference. +//! +//! # References +//! +//! - [ACLE Q2 2018](https://developer.arm.com/docs/101028/latest) + +#![cfg_attr( + all(target_arch = "aarch64", target_abi = "softfloat"), + // Just allow the warning: anyone soundly using the intrinsics has to enable + // the target feature, and that will generate a warning for them. + allow(aarch64_softfloat_neon) +)] +// Only for 'neon' submodule +#![allow(non_camel_case_types)] + +// 8, 7 and 6-M are supported via dedicated instructions like DMB. All other arches are supported +// via CP15 instructions. See Section 10.1 of ACLE +mod barrier; +#[unstable(feature = "stdarch_arm_barrier", issue = "117219")] +pub use self::barrier::*; + +mod hints; +#[unstable(feature = "stdarch_arm_hints", issue = "117218")] +pub use self::hints::*; + +#[cfg(any( + target_arch = "aarch64", + target_arch = "arm64ec", + target_feature = "v7", + doc +))] +pub(crate) mod neon; + +#[cfg(any( + target_arch = "aarch64", + target_arch = "arm64ec", + target_feature = "v7", + doc +))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub use self::neon::*; + +#[cfg(test)] +#[cfg(any( + target_arch = "aarch64", + target_arch = "arm64ec", + target_feature = "v7", + doc +))] +pub(crate) mod test_support; + +mod sealed { + #[unstable(feature = "stdarch_arm_barrier", issue = "117219")] + pub trait Dmb { + unsafe fn __dmb(&self); + } + + #[unstable(feature = "stdarch_arm_barrier", issue = "117219")] + pub trait Dsb { + unsafe fn __dsb(&self); + } + + #[unstable(feature = "stdarch_arm_barrier", issue = "117219")] + pub trait Isb { + unsafe fn __isb(&self); + } +} diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs new file mode 100644 index 0000000000000..286f1868852aa --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs @@ -0,0 +1,75440 @@ +// This code is automatically generated. DO NOT MODIFY. 
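The `sealed` module above is the standard sealed-trait pattern: only the barrier marker types defined in this crate can implement `Dmb`/`Dsb`/`Isb`, so the barrier generics cannot be instantiated with user types. A standalone sketch of the pattern (not the real stdarch types; the encoding value is made up for illustration):

```rust
// The trait lives in a private module, so code outside this crate can never
// implement it; only the marker types defined here satisfy the bound on `dmb`.
mod sealed {
    pub trait Dmb {
        fn encoding(&self) -> u32;
    }
}

/// A marker type standing in for one barrier domain (illustrative only).
pub struct Ishld;

impl sealed::Dmb for Ishld {
    fn encoding(&self) -> u32 {
        0b1001 // made-up value, for demonstration
    }
}

/// Accepts exactly the predefined marker types and nothing else.
pub fn dmb<A: sealed::Dmb>(arg: A) -> u32 {
    arg.encoding()
}

fn main() {
    assert_eq!(dmb(Ishld), 0b1001);
}
```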
+// +// Instead, modify `crates/stdarch-gen-arm/spec/` and run the following command to re-generate this file: +// +// ``` +// cargo run --bin=stdarch-gen-arm -- crates/stdarch-gen-arm/spec +// ``` +#![allow(improper_ctypes)] + +#[cfg(test)] +use stdarch_test::assert_instr; + +use super::*; + +#[doc = "CRC32 single round checksum for bytes (8 bits)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32b)"] +#[inline] +#[target_feature(enable = "crc")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(crc32b))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_aarch32_crc32", issue = "125085") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "stdarch_aarch64_crc32", since = "1.80.0") +)] +pub fn __crc32b(crc: u32, data: u8) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crc32b" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32b")] + fn ___crc32b(crc: u32, data: u32) -> u32; + } + unsafe { ___crc32b(crc, data as u32) } +} +#[doc = "CRC32-C single round checksum for bytes (8 bits)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32cb)"] +#[inline] +#[target_feature(enable = "crc")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(crc32cb))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_aarch32_crc32", issue = "125085") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "stdarch_aarch64_crc32", since = "1.80.0") +)] +pub fn __crc32cb(crc: u32, data: u8) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crc32cb" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32cb")] + fn ___crc32cb(crc: u32, data: u32) -> u32; + } + unsafe { ___crc32cb(crc, data as u32) } +} +#[doc = "CRC32-C single round checksum for quad words (64 bits)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32cd)"] +#[inline] +#[target_feature(enable = "crc")] +#[cfg(target_arch = "arm")] +#[cfg_attr(test, assert_instr(crc32cw))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_aarch32_crc32", issue = "125085") +)] +pub fn __crc32cd(crc: u32, data: u64) -> u32 { + let b: u32 = (data & 0xFFFFFFFF) as u32; + let c: u32 = (data >> 32) as u32; + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32cw")] + fn ___crc32cw(crc: u32, data: u32) -> u32; + } + unsafe { ___crc32cw(___crc32cw(crc, b), c) } +} +#[doc = "CRC32-C single round checksum for bytes (16 bits)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32ch)"] +#[inline] +#[target_feature(enable = "crc")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(crc32ch))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_aarch32_crc32", issue = "125085") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "stdarch_aarch64_crc32", since = "1.80.0") +)] +pub fn __crc32ch(crc: u32, data: u16) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crc32ch" + )] + #[cfg_attr(target_arch = 
"arm", link_name = "llvm.arm.crc32ch")] + fn ___crc32ch(crc: u32, data: u32) -> u32; + } + unsafe { ___crc32ch(crc, data as u32) } +} +#[doc = "CRC32-C single round checksum for bytes (32 bits)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32cw)"] +#[inline] +#[target_feature(enable = "crc")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(crc32cw))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_aarch32_crc32", issue = "125085") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "stdarch_aarch64_crc32", since = "1.80.0") +)] +pub fn __crc32cw(crc: u32, data: u32) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crc32cw" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32cw")] + fn ___crc32cw(crc: u32, data: u32) -> u32; + } + unsafe { ___crc32cw(crc, data) } +} +#[doc = "CRC32 single round checksum for quad words (64 bits)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32d)"] +#[inline] +#[target_feature(enable = "crc")] +#[cfg(target_arch = "arm")] +#[cfg_attr(test, assert_instr(crc32w))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_aarch32_crc32", issue = "125085") +)] +pub fn __crc32d(crc: u32, data: u64) -> u32 { + let b: u32 = (data & 0xFFFFFFFF) as u32; + let c: u32 = (data >> 32) as u32; + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32w")] + fn ___crc32w(crc: u32, data: u32) -> u32; + } + unsafe { ___crc32w(___crc32w(crc, b), c) } +} +#[doc = "CRC32 single round checksum for bytes (16 bits)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32h)"] +#[inline] +#[target_feature(enable = "crc")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(crc32h))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_aarch32_crc32", issue = "125085") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "stdarch_aarch64_crc32", since = "1.80.0") +)] +pub fn __crc32h(crc: u32, data: u16) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crc32h" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32h")] + fn ___crc32h(crc: u32, data: u32) -> u32; + } + unsafe { ___crc32h(crc, data as u32) } +} +#[doc = "CRC32 single round checksum for bytes (32 bits)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32w)"] +#[inline] +#[target_feature(enable = "crc")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(crc32w))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_aarch32_crc32", issue = "125085") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "stdarch_aarch64_crc32", since = "1.80.0") +)] +pub fn __crc32w(crc: u32, data: u32) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crc32w" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32w")] + fn ___crc32w(crc: u32, data: u32) -> u32; + } + unsafe { ___crc32w(crc, data) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] 
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/priv_vpadal_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.s8"))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +fn priv_vpadal_s8(a: int16x4_t, b: int8x8_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadals.v4i16.v8i8")] + fn _priv_vpadal_s8(a: int16x4_t, b: int8x8_t) -> int16x4_t; + } + unsafe { _priv_vpadal_s8(a, b) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/priv_vpadalq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.s8"))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +fn priv_vpadalq_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadals.v8i16.v16i8")] + fn _priv_vpadalq_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t; + } + unsafe { _priv_vpadalq_s8(a, b) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/priv_vpadal_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.s16"))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +fn priv_vpadal_s16(a: int32x2_t, b: int16x4_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadals.v2i32.v4i16")] + fn _priv_vpadal_s16(a: int32x2_t, b: int16x4_t) -> int32x2_t; + } + unsafe { _priv_vpadal_s16(a, b) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/priv_vpadalq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.s16"))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +fn priv_vpadalq_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadals.v4i32.v8i16")] + fn _priv_vpadalq_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t; + } + unsafe { _priv_vpadalq_s16(a, b) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/priv_vpadal_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.s32"))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = 
"111800") +)] +fn priv_vpadal_s32(a: int64x1_t, b: int32x2_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadals.v1i64.v2i32")] + fn _priv_vpadal_s32(a: int64x1_t, b: int32x2_t) -> int64x1_t; + } + unsafe { _priv_vpadal_s32(a, b) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/priv_vpadalq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.s32"))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +fn priv_vpadalq_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadals.v2i64.v4i32")] + fn _priv_vpadalq_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t; + } + unsafe { _priv_vpadalq_s32(a, b) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/priv_vpadal_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.u8"))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +fn priv_vpadal_u8(a: uint16x4_t, b: uint8x8_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadalu.v4i16.v8i8")] + fn _priv_vpadal_u8(a: uint16x4_t, b: uint8x8_t) -> uint16x4_t; + } + unsafe { _priv_vpadal_u8(a, b) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/priv_vpadalq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.u8"))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +fn priv_vpadalq_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadalu.v8i16.v16i8")] + fn _priv_vpadalq_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t; + } + unsafe { _priv_vpadalq_u8(a, b) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/priv_vpadal_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.u16"))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +fn priv_vpadal_u16(a: uint32x2_t, b: uint16x4_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadalu.v2i32.v4i16")] + fn _priv_vpadal_u16(a: uint32x2_t, b: uint16x4_t) -> uint32x2_t; + } + unsafe { _priv_vpadal_u16(a, b) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/priv_vpadalq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.u16"))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +fn priv_vpadalq_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadalu.v4i32.v8i16")] + fn _priv_vpadalq_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t; + } + unsafe { _priv_vpadalq_u16(a, b) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/priv_vpadal_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.u32"))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +fn priv_vpadal_u32(a: uint64x1_t, b: uint32x2_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadalu.v1i64.v2i32")] + fn _priv_vpadal_u32(a: uint64x1_t, b: uint32x2_t) -> uint64x1_t; + } + unsafe { _priv_vpadal_u32(a, b) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/priv_vpadalq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.u32"))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +fn priv_vpadalq_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadalu.v2i64.v4i32")] + fn _priv_vpadalq_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t; + } + unsafe { _priv_vpadalq_u32(a, b) } +} +#[doc = "Absolute difference and accumulate (64-bit)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaba_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saba) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaba_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + unsafe { simd_add(a, vabd_s16(b, c)) } +} +#[doc = "Absolute difference and accumulate (64-bit)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaba_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saba) +)] +#[cfg_attr( 
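The `priv_vpadal_*` helpers above are internal plumbing for the pairwise add-and-accumulate-long operation. A scalar model of what that operation computes (a sketch of the semantics only, not the intrinsic itself):

```rust
// Each wide lane of `a` accumulates the sum of the two narrow lanes of `b`
// that it covers; the accumulation itself wraps like any vector add.
fn vpadal_s8_model(a: [i16; 4], b: [i8; 8]) -> [i16; 4] {
    core::array::from_fn(|i| {
        a[i].wrapping_add(i16::from(b[2 * i]))
            .wrapping_add(i16::from(b[2 * i + 1]))
    })
}

fn main() {
    assert_eq!(
        vpadal_s8_model([0; 4], [1, 2, 3, 4, 5, 6, 7, 8]),
        [3, 7, 11, 15]
    );
}
```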
+ not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaba_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + unsafe { simd_add(a, vabd_s32(b, c)) } +} +#[doc = "Absolute difference and accumulate (64-bit)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaba_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saba) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaba_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + unsafe { simd_add(a, vabd_s8(b, c)) } +} +#[doc = "Absolute difference and accumulate (64-bit)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaba_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaba) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaba_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { + unsafe { simd_add(a, vabd_u16(b, c)) } +} +#[doc = "Absolute difference and accumulate (64-bit)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaba_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaba) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaba_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { + unsafe { simd_add(a, vabd_u32(b, c)) } +} +#[doc = "Absolute difference and accumulate (64-bit)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaba_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaba) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaba_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + unsafe { simd_add(a, vabd_u8(b, c)) } +} +#[doc = "Signed Absolute difference and Accumulate Long"] +#[doc = "[Arm's 
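For the `vaba_*` family, a short usage sketch (not part of this diff): on AArch64 these re-exports are stable, and with a toolchain recent enough that the NEON intrinsics are safe functions, as they are declared in this diff, no `unsafe` is needed.

```rust
#[cfg(target_arch = "aarch64")]
fn vaba_example() -> i16 {
    use core::arch::aarch64::{vaba_s16, vdup_n_s16, vget_lane_s16};
    // Each lane computes a + |b - c|, here 10 + |3 - 7| = 14.
    let r = vaba_s16(vdup_n_s16(10), vdup_n_s16(3), vdup_n_s16(7));
    vget_lane_s16::<0>(r)
}
```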
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabal_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sabal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabal_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t { + let d: int8x8_t = vabd_s8(b, c); + unsafe { + let e: uint8x8_t = simd_cast(d); + simd_add(a, simd_cast(e)) + } +} +#[doc = "Signed Absolute difference and Accumulate Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabal_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sabal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + let d: int16x4_t = vabd_s16(b, c); + unsafe { + let e: uint16x4_t = simd_cast(d); + simd_add(a, simd_cast(e)) + } +} +#[doc = "Signed Absolute difference and Accumulate Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabal_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sabal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + let d: int32x2_t = vabd_s32(b, c); + unsafe { + let e: uint32x2_t = simd_cast(d); + simd_add(a, simd_cast(e)) + } +} +#[doc = "Unsigned Absolute difference and Accumulate Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabal_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uabal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t { + let d: uint8x8_t = vabd_u8(b, c); + unsafe { simd_add(a, simd_cast(d)) } +} +#[doc = "Unsigned Absolute difference and Accumulate Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabal_u16)"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uabal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { + let d: uint16x4_t = vabd_u16(b, c); + unsafe { simd_add(a, simd_cast(d)) } +} +#[doc = "Unsigned Absolute difference and Accumulate Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabal_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uabal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { + let d: uint32x2_t = vabd_u32(b, c); + unsafe { simd_add(a, simd_cast(d)) } +} +#[doc = "Absolute difference and accumulate (128-bit)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabaq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saba) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabaq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + unsafe { simd_add(a, vabdq_s16(b, c)) } +} +#[doc = "Absolute difference and accumulate (128-bit)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabaq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saba) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabaq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + unsafe { simd_add(a, vabdq_s32(b, c)) } +} +#[doc = "Absolute difference and accumulate (128-bit)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabaq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saba) +)] +#[cfg_attr( + not(target_arch = 
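A scalar model of the widening absolute-difference-accumulate done by `vabal_u8` above (a sketch of the semantics only):

```rust
fn vabal_u8_model(a: [u16; 8], b: [u8; 8], c: [u8; 8]) -> [u16; 8] {
    // |b - c| is formed in the narrow type, zero-extended, then accumulated.
    core::array::from_fn(|i| a[i].wrapping_add(u16::from(b[i].abs_diff(c[i]))))
}

fn main() {
    assert_eq!(vabal_u8_model([100; 8], [200; 8], [50; 8]), [250; 8]);
}
```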
"arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabaq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { + unsafe { simd_add(a, vabdq_s8(b, c)) } +} +#[doc = "Absolute difference and accumulate (128-bit)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabaq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaba) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabaq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + unsafe { simd_add(a, vabdq_u16(b, c)) } +} +#[doc = "Absolute difference and accumulate (128-bit)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabaq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaba) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabaq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + unsafe { simd_add(a, vabdq_u32(b, c)) } +} +#[doc = "Absolute difference and accumulate (128-bit)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabaq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaba) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabaq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { + unsafe { simd_add(a, vabdq_u8(b, c)) } +} +#[doc = "Absolute difference between the arguments of Floating"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabd_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fabd) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vabd_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v4f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fabd.v4f16" + )] + fn _vabd_f16(a: float16x4_t, b: float16x4_t) -> 
float16x4_t; + } + unsafe { _vabd_f16(a, b) } +} +#[doc = "Absolute difference between the arguments of Floating"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fabd) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vabdq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v8f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fabd.v8f16" + )] + fn _vabdq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vabdq_f16(a, b) } +} +#[doc = "Absolute difference between the arguments of Floating"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabd_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fabd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabd_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fabd.v2f32" + )] + fn _vabd_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vabd_f32(a, b) } +} +#[doc = "Absolute difference between the arguments of Floating"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fabd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabdq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v4f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fabd.v4f32" + )] + fn _vabdq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vabdq_f32(a, b) } +} +#[doc = "Absolute difference between the arguments"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabd_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + 
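A per-lane model of the floating-point absolute difference (`vabd_f32`/`vabdq_f32`), as a sketch that ignores NaN corner cases:

```rust
fn vabd_f32_model(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
    [(a[0] - b[0]).abs(), (a[1] - b[1]).abs()]
}

fn main() {
    // Both expected values are exactly representable, so exact comparison is fine.
    assert_eq!(vabd_f32_model([1.5, -2.0], [4.0, 3.0]), [2.5, 5.0]);
}
```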
assert_instr(sabd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sabd.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v8i8")] + fn _vabd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } + unsafe { _vabd_s8(a, b) } +} +#[doc = "Absolute difference between the arguments"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sabd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabdq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sabd.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v16i8")] + fn _vabdq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } + unsafe { _vabdq_s8(a, b) } +} +#[doc = "Absolute difference between the arguments"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabd_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sabd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sabd.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v4i16")] + fn _vabd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vabd_s16(a, b) } +} +#[doc = "Absolute difference between the arguments"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sabd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabdq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = 
"llvm.aarch64.neon.sabd.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v8i16")] + fn _vabdq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vabdq_s16(a, b) } +} +#[doc = "Absolute difference between the arguments"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabd_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sabd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sabd.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v2i32")] + fn _vabd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vabd_s32(a, b) } +} +#[doc = "Absolute difference between the arguments"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sabd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabdq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sabd.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v4i32")] + fn _vabdq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vabdq_s32(a, b) } +} +#[doc = "Absolute difference between the arguments"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabd_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uabd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uabd.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabdu.v8i8")] + fn _vabd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; + } + unsafe { _vabd_u8(a, b) } +} +#[doc = "Absolute difference between the arguments"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdq_u8)"] +#[inline] +#[target_feature(enable = 
"neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uabd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabdq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uabd.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabdu.v16i8")] + fn _vabdq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + } + unsafe { _vabdq_u8(a, b) } +} +#[doc = "Absolute difference between the arguments"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabd_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uabd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uabd.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabdu.v4i16")] + fn _vabd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; + } + unsafe { _vabd_u16(a, b) } +} +#[doc = "Absolute difference between the arguments"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uabd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabdq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uabd.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabdu.v8i16")] + fn _vabdq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; + } + unsafe { _vabdq_u16(a, b) } +} +#[doc = "Absolute difference between the arguments"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabd_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uabd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + 
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uabd.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabdu.v2i32")] + fn _vabd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; + } + unsafe { _vabd_u32(a, b) } +} +#[doc = "Absolute difference between the arguments"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uabd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabdq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uabd.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabdu.v4i32")] + fn _vabdq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + unsafe { _vabdq_u32(a, b) } +} +#[doc = "Signed Absolute difference Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabdl.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sabdl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabdl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { + unsafe { + let c: uint8x8_t = simd_cast(vabd_s8(a, b)); + simd_cast(c) + } +} +#[doc = "Signed Absolute difference Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabdl.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sabdl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabdl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + unsafe { + let c: uint16x4_t = simd_cast(vabd_s16(a, b)); + simd_cast(c) + } +} +#[doc = "Signed Absolute difference Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabdl.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + 
assert_instr(sabdl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabdl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + unsafe { + let c: uint32x2_t = simd_cast(vabd_s32(a, b)); + simd_cast(c) + } +} +#[doc = "Unsigned Absolute difference Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabdl.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uabdl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabdl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + unsafe { simd_cast(vabd_u8(a, b)) } +} +#[doc = "Unsigned Absolute difference Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabdl.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uabdl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabdl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + unsafe { simd_cast(vabd_u16(a, b)) } +} +#[doc = "Unsigned Absolute difference Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabdl.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uabdl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabdl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + unsafe { simd_cast(vabd_u32(a, b)) } +} +#[doc = "Floating-point absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabs_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fabs) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vabs_f16(a: float16x4_t) -> float16x4_t { + unsafe { simd_fabs(a) } +} +#[doc = "Floating-point absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabsq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))] 
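The widening `vabdl_*` forms zero-extend the narrow absolute difference; a scalar sketch for `vabdl_u8` (semantics only, not the intrinsic):

```rust
fn vabdl_u8_model(a: [u8; 8], b: [u8; 8]) -> [u16; 8] {
    core::array::from_fn(|i| u16::from(a[i].abs_diff(b[i])))
}

fn main() {
    assert_eq!(vabdl_u8_model([0; 8], [255; 8]), [255; 8]);
}
```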
+#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fabs) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vabsq_f16(a: float16x8_t) -> float16x8_t { + unsafe { simd_fabs(a) } +} +#[doc = "Floating-point absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabs_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fabs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabs_f32(a: float32x2_t) -> float32x2_t { + unsafe { simd_fabs(a) } +} +#[doc = "Floating-point absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabsq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fabs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabsq_f32(a: float32x4_t) -> float32x4_t { + unsafe { simd_fabs(a) } +} +#[doc = "Absolute value (wrapping)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabs_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(abs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabs_s8(a: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.abs.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabs.v8i8")] + fn _vabs_s8(a: int8x8_t) -> int8x8_t; + } + unsafe { _vabs_s8(a) } +} +#[doc = "Absolute value (wrapping)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabsq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(abs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabsq_s8(a: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = 
"llvm.aarch64.neon.abs.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabs.v16i8")] + fn _vabsq_s8(a: int8x16_t) -> int8x16_t; + } + unsafe { _vabsq_s8(a) } +} +#[doc = "Absolute value (wrapping)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabs_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(abs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabs_s16(a: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.abs.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabs.v4i16")] + fn _vabs_s16(a: int16x4_t) -> int16x4_t; + } + unsafe { _vabs_s16(a) } +} +#[doc = "Absolute value (wrapping)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabsq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(abs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabsq_s16(a: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.abs.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabs.v8i16")] + fn _vabsq_s16(a: int16x8_t) -> int16x8_t; + } + unsafe { _vabsq_s16(a) } +} +#[doc = "Absolute value (wrapping)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabs_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(abs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabs_s32(a: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.abs.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabs.v2i32")] + fn _vabs_s32(a: int32x2_t) -> int32x2_t; + } + unsafe { _vabs_s32(a) } +} +#[doc = "Absolute value (wrapping)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabsq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", 
target_arch = "arm64ec")), + assert_instr(abs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vabsq_s32(a: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.abs.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabs.v4i32")] + fn _vabsq_s32(a: int32x4_t) -> int32x4_t; + } + unsafe { _vabsq_s32(a) } +} +#[doc = "Floating-point absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabsh_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fabs) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vabsh_f16(a: f16) -> f16 { + unsafe { simd_extract!(vabs_f16(vdup_n_f16(a)), 0) } +} +#[doc = "Floating-point Add (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vadd_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vadd.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fadd) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vadd_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe { simd_add(a, b) } +} +#[doc = "Floating-point Add (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vadd.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fadd) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vaddq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vadd_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vadd_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vadd_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + 
assert_instr(add) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vadd_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(add) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vadd_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(add) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vadd_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(add) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vadd_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(add) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vadd_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, 
any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(add) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(add) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(add) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(add) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(add) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(add) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(add) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(add) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_add(a, b) } +} +#[doc = "Vector add."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(add) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { simd_add(a, b) } +} +#[doc = "Bitwise exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vadd_p8)"] +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vadd_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Bitwise exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Bitwise exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vadd_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vadd_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Bitwise exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddq_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Bitwise exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vadd_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vadd_p64(a: poly64x1_t, b: poly64x1_t) -> poly64x1_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Bitwise exclusive OR"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddq_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddq_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddh_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vadd.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fadd) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vaddh_f16(a: f16, b: f16) -> f16 { + a + b +} +#[doc = "Add returning High Narrow (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddhn_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t { + unsafe { + let x = simd_cast(simd_shr(simd_add(a, b), int16x8_t::splat(8))); + simd_shuffle!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) + } +} +#[doc = "Add returning High Narrow (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddhn_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t { + unsafe { + let x = simd_cast(simd_shr(simd_add(a, b), int32x4_t::splat(16))); + simd_shuffle!(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) + } +} +#[doc = "Add returning High Narrow (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddhn_high_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", 
since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t { + unsafe { + let x = simd_cast(simd_shr(simd_add(a, b), int64x2_t::splat(32))); + simd_shuffle!(r, x, [0, 1, 2, 3]) + } +} +#[doc = "Add returning High Narrow (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddhn_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t { + unsafe { + let x = simd_cast(simd_shr(simd_add(a, b), uint16x8_t::splat(8))); + simd_shuffle!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) + } +} +#[doc = "Add returning High Narrow (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddhn_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t { + unsafe { + let x = simd_cast(simd_shr(simd_add(a, b), uint32x4_t::splat(16))); + simd_shuffle!(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) + } +} +#[doc = "Add returning High Narrow (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddhn_high_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t { + unsafe { + let x = simd_cast(simd_shr(simd_add(a, b), uint64x2_t::splat(32))); + simd_shuffle!(r, x, [0, 1, 2, 3]) + } +} +#[doc = "Add returning High Narrow."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddhn_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") 
+)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t { + unsafe { simd_cast(simd_shr(simd_add(a, b), int16x8_t::splat(8))) } +} +#[doc = "Add returning High Narrow."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddhn_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t { + unsafe { simd_cast(simd_shr(simd_add(a, b), int32x4_t::splat(16))) } +} +#[doc = "Add returning High Narrow."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddhn_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t { + unsafe { simd_cast(simd_shr(simd_add(a, b), int64x2_t::splat(32))) } +} +#[doc = "Add returning High Narrow."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddhn_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t { + unsafe { simd_cast(simd_shr(simd_add(a, b), uint16x8_t::splat(8))) } +} +#[doc = "Add returning High Narrow."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddhn_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t { + unsafe { simd_cast(simd_shr(simd_add(a, b), uint32x4_t::splat(16))) } +} +#[doc = "Add returning High Narrow."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddhn_u64)"] 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t { + unsafe { simd_cast(simd_shr(simd_add(a, b), uint64x2_t::splat(32))) } +} +#[doc = "Signed Add Long (vector, high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddl_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddl2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { + unsafe { + let a: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); + let b: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + let a: int32x4_t = simd_cast(a); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Signed Add Long (vector, high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddl_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddl2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { + unsafe { + let a: int32x2_t = simd_shuffle!(a, a, [2, 3]); + let b: int32x2_t = simd_shuffle!(b, b, [2, 3]); + let a: int64x2_t = simd_cast(a); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Signed Add Long (vector, high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddl_high_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddl2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { + unsafe { + let a: int8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let a: int16x8_t = simd_cast(a); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Unsigned Add Long (vector, high half)."] +#[doc
= "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddl_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddl2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { + unsafe { + let a: uint16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); + let b: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + let a: uint32x4_t = simd_cast(a); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Unsigned Add Long (vector, high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddl_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddl2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { + unsafe { + let a: uint32x2_t = simd_shuffle!(a, a, [2, 3]); + let b: uint32x2_t = simd_shuffle!(b, b, [2, 3]); + let a: uint64x2_t = simd_cast(a); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Unsigned Add Long (vector, high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddl_high_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddl2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { + unsafe { + let a: uint8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let a: uint16x8_t = simd_cast(a); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Long (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddl_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + unsafe { + let a:
int32x4_t = simd_cast(a); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Long (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddl_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + unsafe { + let a: int64x2_t = simd_cast(a); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Long (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddl_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { + unsafe { + let a: int16x8_t = simd_cast(a); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Long (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddl_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + unsafe { + let a: uint32x4_t = simd_cast(a); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Long (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddl_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + unsafe { + let a: uint64x2_t = simd_cast(a); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Long (vector)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddl_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + unsafe { + let a: uint16x8_t = simd_cast(a); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Bitwise exclusive OR"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddq_p128)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddq_p128(a: p128, b: p128) -> p128 { + a ^ b +} +#[doc = "Add Wide (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddw_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddw2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { + unsafe { + let b: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Wide (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddw_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddw2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { + unsafe { + let b: int32x2_t = simd_shuffle!(b, b, [2, 3]); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Wide (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddw_high_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddw2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t { + unsafe { + let b: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Wide (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddw_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddw2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { + unsafe { + let b: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Wide (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddw_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddw2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { + unsafe { + let b: uint32x2_t = simd_shuffle!(b, b, [2, 3]); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Wide (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddw_high_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddw2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t { + unsafe { + let b: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddw_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddw) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t { + unsafe { + let b: int32x4_t = simd_cast(b); + 
simd_add(a, b) + } +} +#[doc = "Add Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddw_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddw) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddw_s32(a: int64x2_t, b: int32x2_t) -> int64x2_t { + unsafe { + let b: int64x2_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddw_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddw) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddw_s8(a: int16x8_t, b: int8x8_t) -> int16x8_t { + unsafe { + let b: int16x8_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddw_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddw) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddw_u16(a: uint32x4_t, b: uint16x4_t) -> uint32x4_t { + unsafe { + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddw_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddw) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t { + unsafe { + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "Add Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaddw_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddw) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = 
"neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vaddw_u8(a: uint16x8_t, b: uint8x8_t) -> uint16x8_t { + unsafe { + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) + } +} +#[doc = "AES single round encryption."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaesdq_u8)"] +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(aesd))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "aarch64_neon_crypto_intrinsics", since = "1.72.0") +)] +pub fn vaesdq_u8(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.aesd" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.aesd")] + fn _vaesdq_u8(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t; + } + unsafe { _vaesdq_u8(data, key) } +} +#[doc = "AES single round encryption."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaeseq_u8)"] +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(aese))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "aarch64_neon_crypto_intrinsics", since = "1.72.0") +)] +pub fn vaeseq_u8(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.aese" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.aese")] + fn _vaeseq_u8(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t; + } + unsafe { _vaeseq_u8(data, key) } +} +#[doc = "AES inverse mix columns."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaesimcq_u8)"] +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(aesimc))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "aarch64_neon_crypto_intrinsics", since = "1.72.0") +)] +pub fn vaesimcq_u8(data: uint8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.aesimc" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.aesimc")] + fn _vaesimcq_u8(data: uint8x16_t) -> uint8x16_t; + } + unsafe { _vaesimcq_u8(data) } +} +#[doc = "AES mix columns."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vaesmcq_u8)"] +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(aesmc))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "aarch64_neon_crypto_intrinsics", since = "1.72.0") +)] +pub fn 
vaesmcq_u8(data: uint8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.aesmc" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.aesmc")] + fn _vaesmcq_u8(data: uint8x16_t) -> uint8x16_t; + } + unsafe { _vaesmcq_u8(data) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vand_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vand_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vandq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vand_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vand_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vandq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vandq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vand_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = 
"arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vand_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vandq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vandq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vand_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vand_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vandq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vandq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vand_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vand_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vandq_u8)"] +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vand_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vand_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vandq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vandq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vand_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vand_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vandq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vandq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vand_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vand_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise and"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vandq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(and) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vandq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_and(a, b) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbic_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbic_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + let c = int16x4_t::splat(-1); + unsafe { simd_and(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbic_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbic_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + let c = int32x2_t::splat(-1); + unsafe { simd_and(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbic_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + 
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbic_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + let c = int64x1_t::splat(-1); + unsafe { simd_and(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbic_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbic_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + let c = int8x8_t::splat(-1); + unsafe { simd_and(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbicq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbicq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + let c = int16x8_t::splat(-1); + unsafe { simd_and(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbicq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbicq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + let c = int32x4_t::splat(-1); + unsafe { simd_and(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbicq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbicq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + let c = int64x2_t::splat(-1); + unsafe { simd_and(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbicq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbicq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + let c = int8x16_t::splat(-1); + unsafe { simd_and(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbic_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbic_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + let c = int16x4_t::splat(-1); + unsafe { simd_and(simd_xor(b, transmute(c)), a) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbic_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbic_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + let c = int32x2_t::splat(-1); + unsafe { simd_and(simd_xor(b, transmute(c)), a) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbic_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbic_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + let c = int64x1_t::splat(-1); + unsafe { simd_and(simd_xor(b, transmute(c)), a) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbic_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue 
= "111800") +)] +pub fn vbic_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + let c = int8x8_t::splat(-1); + unsafe { simd_and(simd_xor(b, transmute(c)), a) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbicq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbicq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + let c = int16x8_t::splat(-1); + unsafe { simd_and(simd_xor(b, transmute(c)), a) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbicq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbicq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + let c = int32x4_t::splat(-1); + unsafe { simd_and(simd_xor(b, transmute(c)), a) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbicq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbicq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + let c = int64x2_t::splat(-1); + unsafe { simd_and(simd_xor(b, transmute(c)), a) } +} +#[doc = "Vector bitwise bit clear."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbicq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bic) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbicq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + let c = int8x16_t::splat(-1); + unsafe { simd_and(simd_xor(b, transmute(c)), a) } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbsl_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(target_arch 
= "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vbsl_f16(a: uint16x4_t, b: float16x4_t, c: float16x4_t) -> float16x4_t { + let not = int16x4_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbslq_f16)"] +#[inline] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vbslq_f16(a: uint16x8_t, b: float16x8_t, c: float16x8_t) -> float16x8_t { + let not = int16x8_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbsl_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbsl_f32(a: uint32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t { + let not = int32x2_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbsl_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbsl_p16(a: uint16x4_t, b: poly16x4_t, c: poly16x4_t) -> poly16x4_t { + let not = int16x4_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbsl_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", 
+ unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbsl_p8(a: uint8x8_t, b: poly8x8_t, c: poly8x8_t) -> poly8x8_t { + let not = int8x8_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbsl_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbsl_s16(a: uint16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + let not = int16x4_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbsl_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbsl_s32(a: uint32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + let not = int32x2_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbsl_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbsl_s64(a: uint64x1_t, b: int64x1_t, c: int64x1_t) -> int64x1_t { + let not = int64x1_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbsl_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub 
fn vbsl_s8(a: uint8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + let not = int8x8_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbslq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbslq_f32(a: uint32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t { + let not = int32x4_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbslq_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbslq_p16(a: uint16x8_t, b: poly16x8_t, c: poly16x8_t) -> poly16x8_t { + let not = int16x8_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbslq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbslq_p8(a: uint8x16_t, b: poly8x16_t, c: poly8x16_t) -> poly8x16_t { + let not = int8x16_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbslq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbslq_s16(a: uint16x8_t, b: int16x8_t, c: int16x8_t) -> 
int16x8_t { + let not = int16x8_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbslq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbslq_s32(a: uint32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + let not = int32x4_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbslq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbslq_s64(a: uint64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t { + let not = int64x2_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbslq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbslq_s8(a: uint8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { + let not = int8x16_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, transmute(b)), + simd_and(simd_xor(a, transmute(not)), transmute(c)), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbsl_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbsl_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { + let not = int16x4_t::splat(-1); + unsafe { + 
transmute(simd_or( + simd_and(a, b), + simd_and(simd_xor(a, transmute(not)), c), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbsl_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbsl_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { + let not = int32x2_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, b), + simd_and(simd_xor(a, transmute(not)), c), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbsl_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbsl_u64(a: uint64x1_t, b: uint64x1_t, c: uint64x1_t) -> uint64x1_t { + let not = int64x1_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, b), + simd_and(simd_xor(a, transmute(not)), c), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbsl_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbsl_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + let not = int8x8_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, b), + simd_and(simd_xor(a, transmute(not)), c), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbslq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbslq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + let not = int16x8_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, b), + simd_and(simd_xor(a, transmute(not)), c), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbslq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbslq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + let not = int32x4_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, b), + simd_and(simd_xor(a, transmute(not)), c), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbslq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbslq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + let not = int64x2_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, b), + simd_and(simd_xor(a, transmute(not)), c), + )) + } +} +#[doc = "Bitwise Select."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbslq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(bsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vbslq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { + let not = int8x16_t::splat(-1); + unsafe { + transmute(simd_or( + simd_and(a, b), + simd_and(simd_xor(a, transmute(not)), c), + )) + } +} +#[doc = "Floating-point absolute compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcage_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacge.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facge) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcage_f16(a: float16x4_t, b: float16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vacge.v4i16.v4f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facge.v4i16.v4f16" + )] + fn _vcage_f16(a: float16x4_t, b: float16x4_t) -> uint16x4_t; + } + unsafe { _vcage_f16(a, b) } +} +#[doc = "Floating-point absolute compare greater than or equal"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcageq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacge.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facge) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcageq_f16(a: float16x8_t, b: float16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vacge.v8i16.v8f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facge.v8i16.v8f16" + )] + fn _vcageq_f16(a: float16x8_t, b: float16x8_t) -> uint16x8_t; + } + unsafe { _vcageq_f16(a, b) } +} +#[doc = "Floating-point absolute compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcage_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacge.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcage_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vacge.v2i32.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facge.v2i32.v2f32" + )] + fn _vcage_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t; + } + unsafe { _vcage_f32(a, b) } +} +#[doc = "Floating-point absolute compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcageq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacge.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcageq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vacge.v4i32.v4f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facge.v4i32.v4f32" + )] + fn _vcageq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t; + } + unsafe { _vcageq_f32(a, b) } +} +#[doc = "Floating-point absolute compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcagt_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacgt.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facgt) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", 
issue = "136306")] +pub fn vcagt_f16(a: float16x4_t, b: float16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vacgt.v4i16.v4f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facgt.v4i16.v4f16" + )] + fn _vcagt_f16(a: float16x4_t, b: float16x4_t) -> uint16x4_t; + } + unsafe { _vcagt_f16(a, b) } +} +#[doc = "Floating-point absolute compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcagtq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacgt.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facgt) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcagtq_f16(a: float16x8_t, b: float16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vacgt.v8i16.v8f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facgt.v8i16.v8f16" + )] + fn _vcagtq_f16(a: float16x8_t, b: float16x8_t) -> uint16x8_t; + } + unsafe { _vcagtq_f16(a, b) } +} +#[doc = "Floating-point absolute compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcagt_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacgt.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcagt_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vacgt.v2i32.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facgt.v2i32.v2f32" + )] + fn _vcagt_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t; + } + unsafe { _vcagt_f32(a, b) } +} +#[doc = "Floating-point absolute compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcagtq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacgt.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcagtq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vacgt.v4i32.v4f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.facgt.v4i32.v4f32" + )] + fn _vcagtq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t; + } + unsafe { _vcagtq_f32(a, b) } +} +#[doc = "Floating-point 
absolute compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcale_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacge.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facge) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcale_f16(a: float16x4_t, b: float16x4_t) -> uint16x4_t { + vcage_f16(b, a) +} +#[doc = "Floating-point absolute compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaleq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacge.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facge) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcaleq_f16(a: float16x8_t, b: float16x8_t) -> uint16x8_t { + vcageq_f16(b, a) +} +#[doc = "Floating-point absolute compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcale_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacge.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcale_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t { + vcage_f32(b, a) +} +#[doc = "Floating-point absolute compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaleq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacge.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcaleq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t { + vcageq_f32(b, a) +} +#[doc = "Floating-point absolute compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcalt_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacgt.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facgt) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcalt_f16(a: float16x4_t, b: float16x4_t) -> uint16x4_t { + vcagt_f16(b, a) +} +#[doc = "Floating-point absolute compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaltq_f16)"] +#[inline] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacgt.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facgt) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcaltq_f16(a: float16x8_t, b: float16x8_t) -> uint16x8_t { + vcagtq_f16(b, a) +} +#[doc = "Floating-point absolute compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcalt_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacgt.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcalt_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t { + vcagt_f32(b, a) +} +#[doc = "Floating-point absolute compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcaltq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacgt.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(facgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcaltq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t { + vcagtq_f32(b, a) +} +#[doc = "Floating-point compare equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmeq) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vceq_f16(a: float16x4_t, b: float16x4_t) -> uint16x4_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Floating-point compare equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmeq) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vceqq_f16(a: float16x8_t, b: float16x8_t) -> uint16x8_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Floating-point compare equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", 
target_arch = "arm64ec")), + assert_instr(fcmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceq_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Floating-point compare equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceqq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceq_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceqq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceq_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_s16)"] +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceqq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceq_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceqq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceq_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceqq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise 
Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceq_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceqq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceq_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceqq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + 
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceq_p8(a: poly8x8_t, b: poly8x8_t) -> uint8x8_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Compare bitwise Equal (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmeq) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vceqq_p8(a: poly8x16_t, b: poly8x16_t) -> uint8x16_t { + unsafe { simd_eq(a, b) } +} +#[doc = "Floating-point compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcge_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmge) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcge_f16(a: float16x4_t, b: float16x4_t) -> uint16x4_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Floating-point compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgeq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmge) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcgeq_f16(a: float16x8_t, b: float16x8_t) -> uint16x8_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Floating-point compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcge_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcge_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Floating-point compare greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgeq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgeq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Compare signed greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcge_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcge_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Compare signed greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgeq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgeq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Compare signed greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcge_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcge_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Compare signed greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgeq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgeq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Compare signed greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcge_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", 
target_arch = "arm64ec")), + assert_instr(cmge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcge_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Compare signed greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgeq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgeq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Compare unsigned greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcge_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcge_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Compare unsigned greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgeq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgeq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Compare unsigned greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcge_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcge_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Compare unsigned greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgeq_u16)"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgeq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Compare unsigned greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcge_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcge_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Compare unsigned greater than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgeq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgeq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_ge(a, b) } +} +#[doc = "Floating-point compare greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgez_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmge) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcgez_f16(a: float16x4_t) -> uint16x4_t { + let b: f16x4 = f16x4::new(0.0, 0.0, 0.0, 0.0); + unsafe { simd_ge(a, transmute(b)) } +} +#[doc = "Floating-point compare greater than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgezq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmge) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcgezq_f16(a: float16x8_t) -> uint16x8_t { + let b: f16x8 = f16x8::new(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + unsafe { simd_ge(a, transmute(b)) } +} +#[doc = "Floating-point compare greater than"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmgt) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcgt_f16(a: float16x4_t, b: float16x4_t) -> uint16x4_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Floating-point compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmgt) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcgtq_f16(a: float16x8_t, b: float16x8_t) -> uint16x8_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Floating-point compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgt_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Floating-point compare greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgtq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare signed greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgt_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare signed greater than"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgtq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare signed greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgt_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare signed greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgtq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare signed greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgt_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare signed greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = 
"111800") +)] +pub fn vcgtq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare unsigned greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhi) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgt_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare unsigned greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhi) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgtq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare unsigned greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhi) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgt_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare unsigned greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhi) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgtq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare unsigned greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhi) +)] +#[cfg_attr( + 
not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgt_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Compare unsigned greater than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhi) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcgtq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_gt(a, b) } +} +#[doc = "Floating-point compare greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtz_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmgt) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcgtz_f16(a: float16x4_t) -> uint16x4_t { + let b: f16x4 = f16x4::new(0.0, 0.0, 0.0, 0.0); + unsafe { simd_gt(a, transmute(b)) } +} +#[doc = "Floating-point compare greater than zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtzq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmgt) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcgtzq_f16(a: float16x8_t) -> uint16x8_t { + let b: f16x8 = f16x8::new(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + unsafe { simd_gt(a, transmute(b)) } +} +#[doc = "Floating-point compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcle_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmge) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcle_f16(a: float16x4_t, b: float16x4_t) -> uint16x4_t { + unsafe { simd_le(a, b) } +} +#[doc = "Floating-point compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcleq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmge) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcleq_f16(a: 
float16x8_t, b: float16x8_t) -> uint16x8_t { + unsafe { simd_le(a, b) } +} +#[doc = "Floating-point compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcle_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcle_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t { + unsafe { simd_le(a, b) } +} +#[doc = "Floating-point compare less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcleq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcleq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare signed less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcle_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcle_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare signed less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcleq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcleq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare signed less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcle_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmge) +)] +#[cfg_attr( + 
not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcle_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare signed less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcleq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcleq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare signed less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcle_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcle_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare signed less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcleq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmge) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcleq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare unsigned less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcle_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcle_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare unsigned less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcleq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = 
"v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcleq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare unsigned less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcle_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcle_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare unsigned less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcleq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcleq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare unsigned less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcle_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcle_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_le(a, b) } +} +#[doc = "Compare unsigned less than or equal"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcleq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcleq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_le(a, b) } +} +#[doc = "Floating-point compare less than or 
equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclez_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcle.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmle) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vclez_f16(a: float16x4_t) -> uint16x4_t { + let b: f16x4 = f16x4::new(0.0, 0.0, 0.0, 0.0); + unsafe { simd_le(a, transmute(b)) } +} +#[doc = "Floating-point compare less than or equal to zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclezq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcle.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmle) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vclezq_f16(a: float16x8_t) -> uint16x8_t { + let b: f16x8 = f16x8::new(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + unsafe { simd_le(a, transmute(b)) } +} +#[doc = "Count leading sign bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcls_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcls.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcls_s8(a: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcls.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.cls.v8i8" + )] + fn _vcls_s8(a: int8x8_t) -> int8x8_t; + } + unsafe { _vcls_s8(a) } +} +#[doc = "Count leading sign bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclsq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcls.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclsq_s8(a: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcls.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.cls.v16i8" + )] + fn _vclsq_s8(a: int8x16_t) -> int8x16_t; + } + unsafe { _vclsq_s8(a) } +} +#[doc = "Count leading sign bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcls_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = 
"v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcls.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcls_s16(a: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcls.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.cls.v4i16" + )] + fn _vcls_s16(a: int16x4_t) -> int16x4_t; + } + unsafe { _vcls_s16(a) } +} +#[doc = "Count leading sign bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclsq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcls.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclsq_s16(a: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcls.v8i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.cls.v8i16" + )] + fn _vclsq_s16(a: int16x8_t) -> int16x8_t; + } + unsafe { _vclsq_s16(a) } +} +#[doc = "Count leading sign bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcls_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcls.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcls_s32(a: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcls.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.cls.v2i32" + )] + fn _vcls_s32(a: int32x2_t) -> int32x2_t; + } + unsafe { _vcls_s32(a) } +} +#[doc = "Count leading sign bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclsq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcls.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclsq_s32(a: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcls.v4i32")] + #[cfg_attr( + 
any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.cls.v4i32" + )] + fn _vclsq_s32(a: int32x4_t) -> int32x4_t; + } + unsafe { _vclsq_s32(a) } +} +#[doc = "Count leading sign bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcls_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcls))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcls_u8(a: uint8x8_t) -> int8x8_t { + unsafe { vcls_s8(transmute(a)) } +} +#[doc = "Count leading sign bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclsq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcls))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclsq_u8(a: uint8x16_t) -> int8x16_t { + unsafe { vclsq_s8(transmute(a)) } +} +#[doc = "Count leading sign bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcls_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcls))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcls_u16(a: uint16x4_t) -> int16x4_t { + unsafe { vcls_s16(transmute(a)) } +} +#[doc = "Count leading sign bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclsq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcls))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclsq_u16(a: uint16x8_t) -> int16x8_t { + unsafe { vclsq_s16(transmute(a)) } +} +#[doc = "Count leading sign bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcls_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcls))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cls) +)] +#[cfg_attr( + not(target_arch = 
"arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcls_u32(a: uint32x2_t) -> int32x2_t { + unsafe { vcls_s32(transmute(a)) } +} +#[doc = "Count leading sign bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclsq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcls))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclsq_u32(a: uint32x4_t) -> int32x4_t { + unsafe { vclsq_s32(transmute(a)) } +} +#[doc = "Floating-point compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclt_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmgt) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vclt_f16(a: float16x4_t, b: float16x4_t) -> uint16x4_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Floating-point compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmgt) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcltq_f16(a: float16x8_t, b: float16x8_t) -> uint16x8_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Floating-point compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclt_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclt_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Floating-point compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + 
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcltq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare signed less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclt_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclt_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare signed less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcltq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare signed less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclt_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclt_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare signed less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcltq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare signed less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclt_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + 
assert_instr(cmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclt_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare signed less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmgt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcltq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare unsigned less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclt_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhi) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclt_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare unsigned less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhi) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcltq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare unsigned less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclt_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhi) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclt_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare unsigned less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = 
"v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhi) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcltq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare unsigned less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclt_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhi) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclt_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Compare unsigned less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmhi) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcltq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_lt(a, b) } +} +#[doc = "Floating-point compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltz_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclt.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmlt) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcltz_f16(a: float16x4_t) -> uint16x4_t { + let b: f16x4 = f16x4::new(0.0, 0.0, 0.0, 0.0); + unsafe { simd_lt(a, transmute(b)) } +} +#[doc = "Floating-point compare less than"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltzq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclt.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcmlt) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcltzq_f16(a: float16x8_t) -> uint16x8_t { + let b: f16x8 = f16x8::new(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + unsafe { simd_lt(a, transmute(b)) } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclz_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclz_s8(a: int8x8_t) -> int8x8_t { + unsafe { simd_ctlz(a) } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclzq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclzq_s8(a: int8x16_t) -> int8x16_t { + unsafe { simd_ctlz(a) } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclz_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclz_s16(a: int16x4_t) -> int16x4_t { + unsafe { simd_ctlz(a) } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclzq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclzq_s16(a: int16x8_t) -> int16x8_t { + unsafe { simd_ctlz(a) } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclz_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclz_s32(a: int32x2_t) -> int32x2_t { + unsafe { simd_ctlz(a) } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclzq_s32)"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclzq_s32(a: int32x4_t) -> int32x4_t { + unsafe { simd_ctlz(a) } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclz_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclz_u16(a: uint16x4_t) -> uint16x4_t { + unsafe { transmute(vclz_s16(transmute(a))) } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclz_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclz_u16(a: uint16x4_t) -> uint16x4_t { + let a: uint16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x4_t = transmute(vclz_s16(transmute(a))); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclzq_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclzq_u16(a: uint16x8_t) -> uint16x8_t { + unsafe { transmute(vclzq_s16(transmute(a))) } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclzq_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = 
"arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclzq_u16(a: uint16x8_t) -> uint16x8_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x8_t = transmute(vclzq_s16(transmute(a))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclz_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclz_u32(a: uint32x2_t) -> uint32x2_t { + unsafe { transmute(vclz_s32(transmute(a))) } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclz_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclz_u32(a: uint32x2_t) -> uint32x2_t { + let a: uint32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint32x2_t = transmute(vclz_s32(transmute(a))); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclzq_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclzq_u32(a: uint32x4_t) -> uint32x4_t { + unsafe { transmute(vclzq_s32(transmute(a))) } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclzq_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclzq_u32(a: uint32x4_t) -> uint32x4_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x4_t = transmute(vclzq_s32(transmute(a))); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclz_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclz_u8(a: uint8x8_t) -> uint8x8_t { + unsafe { transmute(vclz_s8(transmute(a))) } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclz_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclz_u8(a: uint8x8_t) -> uint8x8_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vclz_s8(transmute(a))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclzq_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclzq_u8(a: uint8x16_t) -> uint8x16_t { + unsafe { transmute(vclzq_s8(transmute(a))) } +} +#[doc = "Count leading zero bits"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclzq_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(clz) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vclzq_u8(a: uint8x16_t) -> uint8x16_t { + let a: uint8x16_t = + unsafe { 
simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(vclzq_s8(transmute(a))); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Population count per byte."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcnt_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cnt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcnt_s8(a: int8x8_t) -> int8x8_t { + unsafe { simd_ctpop(a) } +} +#[doc = "Population count per byte."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcntq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cnt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcntq_s8(a: int8x16_t) -> int8x16_t { + unsafe { simd_ctpop(a) } +} +#[doc = "Population count per byte."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcnt_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cnt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcnt_u8(a: uint8x8_t) -> uint8x8_t { + unsafe { transmute(vcnt_s8(transmute(a))) } +} +#[doc = "Population count per byte."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcnt_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cnt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcnt_u8(a: uint8x8_t) -> uint8x8_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vcnt_s8(transmute(a))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Population count per byte."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcntq_u8)"] 
+#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cnt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcntq_u8(a: uint8x16_t) -> uint8x16_t { + unsafe { transmute(vcntq_s8(transmute(a))) } +} +#[doc = "Population count per byte."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcntq_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cnt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcntq_u8(a: uint8x16_t) -> uint8x16_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(vcntq_s8(transmute(a))); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Population count per byte."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcnt_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cnt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcnt_p8(a: poly8x8_t) -> poly8x8_t { + unsafe { transmute(vcnt_s8(transmute(a))) } +} +#[doc = "Population count per byte."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcnt_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cnt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcnt_p8(a: poly8x8_t) -> poly8x8_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vcnt_s8(transmute(a))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Population count per byte."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcntq_p8)"] +#[inline] +#[cfg(target_endian = "little")] 
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cnt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcntq_p8(a: poly8x16_t) -> poly8x16_t { + unsafe { transmute(vcntq_s8(transmute(a))) } +} +#[doc = "Population count per byte."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcntq_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cnt) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcntq_p8(a: poly8x16_t) -> poly8x16_t { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(vcntq_s8(transmute(a))); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Join two smaller vectors into a single larger vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vcombine_f16(a: float16x4_t, b: float16x4_t) -> float16x8_t { + unsafe { simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Join two smaller vectors into a single larger vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcombine_f32(a: float32x2_t, b: float32x2_t) -> float32x4_t { + unsafe { simd_shuffle!(a, b, [0, 1, 2, 3]) } +} +#[doc = "Join two smaller vectors into a single larger vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcombine_s8(a: int8x8_t, b: int8x8_t) -> int8x16_t { + unsafe { simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } +} +#[doc = "Join two 
smaller vectors into a single larger vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcombine_s16(a: int16x4_t, b: int16x4_t) -> int16x8_t { + unsafe { simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Join two smaller vectors into a single larger vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcombine_s32(a: int32x2_t, b: int32x2_t) -> int32x4_t { + unsafe { simd_shuffle!(a, b, [0, 1, 2, 3]) } +} +#[doc = "Join two smaller vectors into a single larger vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcombine_s64(a: int64x1_t, b: int64x1_t) -> int64x2_t { + unsafe { simd_shuffle!(a, b, [0, 1]) } +} +#[doc = "Join two smaller vectors into a single larger vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcombine_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x16_t { + unsafe { simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } +} +#[doc = "Join two smaller vectors into a single larger vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcombine_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x8_t { + unsafe { simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Join two smaller vectors into a single larger vector"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcombine_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x4_t { + unsafe { simd_shuffle!(a, b, [0, 1, 2, 3]) } +} +#[doc = "Join two smaller vectors into a single larger vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcombine_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x2_t { + unsafe { simd_shuffle!(a, b, [0, 1]) } +} +#[doc = "Join two smaller vectors into a single larger vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcombine_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x16_t { + unsafe { simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } +} +#[doc = "Join two smaller vectors into a single larger vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcombine_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x8_t { + unsafe { simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Join two smaller vectors into a single larger vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcombine_p64(a: poly64x1_t, b: poly64x1_t) -> poly64x2_t { + unsafe { simd_shuffle!(a, b, [0, 1]) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcreate_f16(a: u64) -> float16x4_t { + unsafe { transmute(a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcreate_f16(a: u64) -> float16x4_t { + unsafe { + let ret_val: float16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_f32(a: u64) -> float32x2_t { + unsafe { transmute(a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_f32(a: u64) -> float32x2_t { + unsafe { + let ret_val: float32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_s8(a: u64) -> int8x8_t { + unsafe { transmute(a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_s8(a: u64) -> int8x8_t { + unsafe { + let ret_val: int8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_s16(a: u64) -> int16x4_t { + unsafe { transmute(a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_s16(a: u64) -> int16x4_t { + unsafe { + let ret_val: int16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_s32(a: u64) -> int32x2_t { + unsafe { transmute(a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_s32(a: u64) -> int32x2_t { + unsafe { + let ret_val: int32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_s64(a: u64) -> int64x1_t { + unsafe { transmute(a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_u8(a: u64) -> uint8x8_t { + unsafe { transmute(a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_u8(a: u64) -> uint8x8_t { + unsafe { + let ret_val: uint8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, 
any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_u16(a: u64) -> uint16x4_t { + unsafe { transmute(a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_u16(a: u64) -> uint16x4_t { + unsafe { + let ret_val: uint16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_u32(a: u64) -> uint32x2_t { + unsafe { transmute(a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_u32(a: u64) -> uint32x2_t { + unsafe { + let ret_val: uint32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_u64(a: u64) -> uint64x1_t { + 
unsafe { transmute(a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_p8(a: u64) -> poly8x8_t { + unsafe { transmute(a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_p8(a: u64) -> poly8x8_t { + unsafe { + let ret_val: poly8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_p16(a: u64) -> poly16x4_t { + unsafe { transmute(a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_p16(a: u64) -> poly16x4_t { + unsafe { + let ret_val: poly16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_p64)"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcreate_p64(a: u64) -> poly64x1_t { + unsafe { transmute(a) } +} +#[doc = "Floating-point convert to lower precision narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_f16_f32)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +# [cfg_attr (all (test , target_arch = "arm") , assert_instr (vcvt . f16 . f32))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcvtn) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvt_f16_f32(a: float32x4_t) -> float16x4_t { + unsafe { simd_cast(a) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_f16_s16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(scvtf) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvt_f16_s16(a: int16x4_t) -> float16x4_t { + unsafe { simd_cast(a) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_f16_s16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(scvtf) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtq_f16_s16(a: int16x8_t) -> float16x8_t { + unsafe { simd_cast(a) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_f16_u16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ucvtf) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvt_f16_u16(a: uint16x4_t) -> float16x4_t { + unsafe { simd_cast(a) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_f16_u16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ucvtf) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtq_f16_u16(a: uint16x8_t) -> float16x8_t { + unsafe { simd_cast(a) } +} +#[doc = "Floating-point convert to higher 
precision long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_f32_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcvtl) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvt_f32_f16(a: float16x4_t) -> float32x4_t { + unsafe { simd_cast(a) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_f32_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(scvtf) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcvt_f32_s32(a: int32x2_t) -> float32x2_t { + unsafe { simd_cast(a) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_f32_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(scvtf) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcvtq_f32_s32(a: int32x4_t) -> float32x4_t { + unsafe { simd_cast(a) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_f32_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ucvtf) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcvt_f32_u32(a: uint32x2_t) -> float32x2_t { + unsafe { simd_cast(a) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_f32_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ucvtf) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcvtq_f32_u32(a: uint32x4_t) -> float32x4_t { + unsafe { simd_cast(a) } +} +#[doc = 
"Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_f16_s16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcvt", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(scvtf, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvt_n_f16_s16(a: int16x4_t) -> float16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + target_arch = "arm", + link_name = "llvm.arm.neon.vcvtfxs2fp.v4f16.v4i16" + )] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfxs2fp.v4f16.v4i16" + )] + fn _vcvt_n_f16_s16(a: int16x4_t, n: i32) -> float16x4_t; + } + unsafe { _vcvt_n_f16_s16(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_f16_s16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcvt", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(scvtf, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtq_n_f16_s16(a: int16x8_t) -> float16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + target_arch = "arm", + link_name = "llvm.arm.neon.vcvtfxs2fp.v8f16.v8i16" + )] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfxs2fp.v8f16.v8i16" + )] + fn _vcvtq_n_f16_s16(a: int16x8_t, n: i32) -> float16x8_t; + } + unsafe { _vcvtq_n_f16_s16(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_f16_u16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcvt", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ucvtf, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvt_n_f16_u16(a: uint16x4_t) -> float16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + target_arch = "arm", + link_name = "llvm.arm.neon.vcvtfxu2fp.v4f16.v4i16" + )] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfxu2fp.v4f16.v4i16" + )] + fn _vcvt_n_f16_u16(a: uint16x4_t, n: i32) -> float16x4_t; + } + unsafe { _vcvt_n_f16_u16(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_f16_u16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcvt", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ucvtf, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = 
"neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtq_n_f16_u16(a: uint16x8_t) -> float16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + target_arch = "arm", + link_name = "llvm.arm.neon.vcvtfxu2fp.v8f16.v8i16" + )] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfxu2fp.v8f16.v8i16" + )] + fn _vcvtq_n_f16_u16(a: uint16x8_t, n: i32) -> float16x8_t; + } + unsafe { _vcvtq_n_f16_u16(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_f32_s32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vcvt, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vcvt_n_f32_s32(a: int32x2_t) -> float32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + target_arch = "arm", + link_name = "llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32" + )] + fn _vcvt_n_f32_s32(a: int32x2_t, n: i32) -> float32x2_t; + } + unsafe { _vcvt_n_f32_s32(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_f32_s32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vcvt, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vcvtq_n_f32_s32(a: int32x4_t) -> float32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + target_arch = "arm", + link_name = "llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32" + )] + fn _vcvtq_n_f32_s32(a: int32x4_t, n: i32) -> float32x4_t; + } + unsafe { _vcvtq_n_f32_s32(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_f32_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(scvtf, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvt_n_f32_s32(a: int32x2_t) -> float32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32" + )] + fn _vcvt_n_f32_s32(a: int32x2_t, n: i32) -> float32x2_t; + } + unsafe { _vcvt_n_f32_s32(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_f32_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(scvtf, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtq_n_f32_s32(a: int32x4_t) -> float32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32" + )] + fn _vcvtq_n_f32_s32(a: int32x4_t, n: i32) -> float32x4_t; + } + unsafe { _vcvtq_n_f32_s32(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_f32_u32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vcvt, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vcvt_n_f32_u32(a: uint32x2_t) -> float32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + target_arch = "arm", + link_name = "llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32" + )] + fn _vcvt_n_f32_u32(a: uint32x2_t, n: i32) -> float32x2_t; + } + unsafe { _vcvt_n_f32_u32(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_f32_u32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vcvt, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vcvtq_n_f32_u32(a: uint32x4_t) -> float32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + target_arch = "arm", + link_name = "llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32" + )] + fn _vcvtq_n_f32_u32(a: uint32x4_t, n: i32) -> float32x4_t; + } + unsafe { _vcvtq_n_f32_u32(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_f32_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ucvtf, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvt_n_f32_u32(a: uint32x2_t) -> float32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32" + )] + fn _vcvt_n_f32_u32(a: uint32x2_t, n: i32) -> float32x2_t; + } + unsafe { _vcvt_n_f32_u32(a, N) } +} +#[doc = "Fixed-point convert to floating-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_f32_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ucvtf, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtq_n_f32_u32(a: uint32x4_t) -> float32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32" + )] + fn _vcvtq_n_f32_u32(a: uint32x4_t, n: i32) -> float32x4_t; + } + unsafe { _vcvtq_n_f32_u32(a, N) } +} +#[doc = "Floating-point convert to signed fixed-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_s16_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcvt", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcvtzs, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvt_n_s16_f16(a: float16x4_t) -> int16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe 
extern "unadjusted" { + #[cfg_attr( + target_arch = "arm", + link_name = "llvm.arm.neon.vcvtfp2fxs.v4i16.v4f16" + )] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxs.v4i16.v4f16" + )] + fn _vcvt_n_s16_f16(a: float16x4_t, n: i32) -> int16x4_t; + } + unsafe { _vcvt_n_s16_f16(a, N) } +} +#[doc = "Floating-point convert to signed fixed-point"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_s16_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcvt", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcvtzs, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtq_n_s16_f16(a: float16x8_t) -> int16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + target_arch = "arm", + link_name = "llvm.arm.neon.vcvtfp2fxs.v8i16.v8f16" + )] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxs.v8i16.v8f16" + )] + fn _vcvtq_n_s16_f16(a: float16x8_t, n: i32) -> int16x8_t; + } + unsafe { _vcvtq_n_s16_f16(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_s32_f32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vcvt, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vcvt_n_s32_f32(a: float32x2_t) -> int32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + target_arch = "arm", + link_name = "llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32" + )] + fn _vcvt_n_s32_f32(a: float32x2_t, n: i32) -> int32x2_t; + } + unsafe { _vcvt_n_s32_f32(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_s32_f32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vcvt, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vcvtq_n_s32_f32(a: float32x4_t) -> int32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + target_arch = "arm", + link_name = "llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32" + )] + fn _vcvtq_n_s32_f32(a: float32x4_t, n: i32) -> int32x4_t; + } + unsafe { _vcvtq_n_s32_f32(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(fcvtzs, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvt_n_s32_f32(a: float32x2_t) -> int32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32" + )] + fn 
_vcvt_n_s32_f32(a: float32x2_t, n: i32) -> int32x2_t; + } + unsafe { _vcvt_n_s32_f32(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(fcvtzs, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtq_n_s32_f32(a: float32x4_t) -> int32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32" + )] + fn _vcvtq_n_s32_f32(a: float32x4_t, n: i32) -> int32x4_t; + } + unsafe { _vcvtq_n_s32_f32(a, N) } +} +#[doc = "Fixed-point convert to unsigned fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_u16_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcvt", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcvtzu, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvt_n_u16_f16(a: float16x4_t) -> uint16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + target_arch = "arm", + link_name = "llvm.arm.neon.vcvtfp2fxu.v4i16.v4f16" + )] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxu.v4i16.v4f16" + )] + fn _vcvt_n_u16_f16(a: float16x4_t, n: i32) -> uint16x4_t; + } + unsafe { _vcvt_n_u16_f16(a, N) } +} +#[doc = "Fixed-point convert to unsigned fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_u16_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcvt", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcvtzu, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtq_n_u16_f16(a: float16x8_t) -> uint16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + target_arch = "arm", + link_name = "llvm.arm.neon.vcvtfp2fxu.v8i16.v8f16" + )] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxu.v8i16.v8f16" + )] + fn _vcvtq_n_u16_f16(a: float16x8_t, n: i32) -> uint16x8_t; + } + unsafe { _vcvtq_n_u16_f16(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_u32_f32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vcvt, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vcvt_n_u32_f32(a: float32x2_t) -> uint32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + target_arch = 
"arm", + link_name = "llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32" + )] + fn _vcvt_n_u32_f32(a: float32x2_t, n: i32) -> uint32x2_t; + } + unsafe { _vcvt_n_u32_f32(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_u32_f32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vcvt, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vcvtq_n_u32_f32(a: float32x4_t) -> uint32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + target_arch = "arm", + link_name = "llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32" + )] + fn _vcvtq_n_u32_f32(a: float32x4_t, n: i32) -> uint32x4_t; + } + unsafe { _vcvtq_n_u32_f32(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(fcvtzu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvt_n_u32_f32(a: float32x2_t) -> uint32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32" + )] + fn _vcvt_n_u32_f32(a: float32x2_t, n: i32) -> uint32x2_t; + } + unsafe { _vcvt_n_u32_f32(a, N) } +} +#[doc = "Floating-point convert to fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(fcvtzu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vcvtq_n_u32_f32(a: float32x4_t) -> uint32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32" + )] + fn _vcvtq_n_u32_f32(a: float32x4_t, n: i32) -> uint32x4_t; + } + unsafe { _vcvtq_n_u32_f32(a, N) } +} +#[doc = "Floating-point convert to signed fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_s16_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcvtzs) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvt_s16_f16(a: float16x4_t) -> int16x4_t { + unsafe { simd_cast(a) } +} +#[doc = "Floating-point convert to signed fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_s16_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcvtzs) +)] +#[target_feature(enable 
= "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtq_s16_f16(a: float16x8_t) -> int16x8_t { + unsafe { simd_cast(a) } +} +#[doc = "Floating-point convert to signed fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcvtzs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcvt_s32_f32(a: float32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.fptosi.sat.v2i32.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.fptosi.sat.v2i32.v2f32" + )] + fn _vcvt_s32_f32(a: float32x2_t) -> int32x2_t; + } + unsafe { _vcvt_s32_f32(a) } +} +#[doc = "Floating-point convert to signed fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_s32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcvtzs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcvtq_s32_f32(a: float32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.fptosi.sat.v4i32.v4f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.fptosi.sat.v4i32.v4f32" + )] + fn _vcvtq_s32_f32(a: float32x4_t) -> int32x4_t; + } + unsafe { _vcvtq_s32_f32(a) } +} +#[doc = "Floating-point convert to unsigned fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_u16_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcvtzu) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvt_u16_f16(a: float16x4_t) -> uint16x4_t { + unsafe { simd_cast(a) } +} +#[doc = "Floating-point convert to unsigned fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_u16_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcvtzu) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vcvtq_u16_f16(a: float16x8_t) -> uint16x8_t { + unsafe { simd_cast(a) } +} +#[doc = 
"Floating-point convert to unsigned fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcvtzu) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcvt_u32_f32(a: float32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.fptoui.sat.v2i32.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.fptoui.sat.v2i32.v2f32" + )] + fn _vcvt_u32_f32(a: float32x2_t) -> uint32x2_t; + } + unsafe { _vcvt_u32_f32(a) } +} +#[doc = "Floating-point convert to unsigned fixed-point, rounding toward zero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_u32_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fcvtzu) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vcvtq_u32_f32(a: float32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.fptoui.sat.v4i32.v4f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.fptoui.sat.v4i32.v4f32" + )] + fn _vcvtq_u32_f32(a: float32x4_t) -> uint32x4_t; + } + unsafe { _vcvtq_u32_f32(a) } +} +#[doc = "Dot product arithmetic (indexed)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_s32)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sdot, LANE = 0) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_dotprod", issue = "117224") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdot_lane_s32(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: int32x2_t = transmute(c); + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vdot_s32(a, b, transmute(c)) + } +} +#[doc = "Dot product arithmetic (indexed)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_s32)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), 
+ assert_instr(sdot, LANE = 0) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_dotprod", issue = "117224") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x8_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: int32x2_t = transmute(c); + let c: int32x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vdotq_s32(a, b, transmute(c)) + } +} +#[doc = "Dot product arithmetic (indexed)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_u32)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(udot, LANE = 0) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_dotprod", issue = "117224") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: uint32x2_t = transmute(c); + let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vdot_u32(a, b, transmute(c)) + } +} +#[doc = "Dot product arithmetic (indexed)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_u32)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(udot, LANE = 0) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_dotprod", issue = "117224") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x8_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: uint32x2_t = transmute(c); + let c: uint32x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vdotq_u32(a, b, transmute(c)) + } +} +#[doc = "Dot product arithmetic (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_s32)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sdot) +)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_dotprod", issue = "117224") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdot_s32(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sdot.v2i32.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name =
"llvm.aarch64.neon.sdot.v2i32.v8i8" + )] + fn _vdot_s32(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t; + } + unsafe { _vdot_s32(a, b, c) } +} +#[doc = "Dot product arithmetic (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_s32)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sdot) +)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_dotprod", issue = "117224") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdotq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sdot.v4i32.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sdot.v4i32.v16i8" + )] + fn _vdotq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t; + } + unsafe { _vdotq_s32(a, b, c) } +} +#[doc = "Dot product arithmetic (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_u32)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(udot) +)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_dotprod", issue = "117224") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdot_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.udot.v2i32.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.udot.v2i32.v8i8" + )] + fn _vdot_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t; + } + unsafe { _vdot_u32(a, b, c) } +} +#[doc = "Dot product arithmetic (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_u32)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(udot) +)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_dotprod", issue = "117224") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdotq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.udot.v4i32.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.udot.v4i32.v16i8" + )] + fn _vdotq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t; + } + unsafe { _vdotq_u32(a, b, c) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vdup_lane_f16<const N: i32>(a: float16x4_t) -> float16x4_t { + static_assert_uimm_bits!(N, 2); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vdupq_lane_f16<const N: i32>(a: float16x4_t) -> float16x8_t { + static_assert_uimm_bits!(N, 2); + unsafe { + simd_shuffle!( + a, + a, + [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_lane_f32<const N: i32>(a: float32x2_t) -> float32x2_t { + static_assert_uimm_bits!(N, 1); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_lane_s32<const N: i32>(a: int32x2_t) -> int32x2_t { + static_assert_uimm_bits!(N, 1); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 1))] +#[cfg_attr( +
all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_lane_u32(a: uint32x2_t) -> uint32x2_t { + static_assert_uimm_bits!(N, 1); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_lane_f32(a: float32x2_t) -> float32x4_t { + static_assert_uimm_bits!(N, 1); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_lane_s32(a: int32x2_t) -> int32x4_t { + static_assert_uimm_bits!(N, 1); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_lane_u32(a: uint32x2_t) -> uint32x4_t { + static_assert_uimm_bits!(N, 1); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", 
target_arch = "arm64ec")), + assert_instr(dup, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_lane_p16(a: poly16x4_t) -> poly16x4_t { + static_assert_uimm_bits!(N, 2); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_lane_s16(a: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(N, 2); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_lane_u16(a: uint16x4_t) -> uint16x4_t { + static_assert_uimm_bits!(N, 2); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_lane_p16(a: poly16x4_t) -> poly16x8_t { + static_assert_uimm_bits!(N, 2); + unsafe { + simd_shuffle!( + a, + a, + [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 2))] +#[cfg_attr( + all(test, 
any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_lane_s16(a: int16x4_t) -> int16x8_t { + static_assert_uimm_bits!(N, 2); + unsafe { + simd_shuffle!( + a, + a, + [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_lane_u16(a: uint16x4_t) -> uint16x8_t { + static_assert_uimm_bits!(N, 2); + unsafe { + simd_shuffle!( + a, + a, + [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 4) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_lane_p8(a: poly8x8_t) -> poly8x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + simd_shuffle!( + a, + a, + [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 4) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_lane_s8(a: int8x8_t) -> int8x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + simd_shuffle!( + a, + a, + [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_u8)"] +#[inline] +#[target_feature(enable = 
"neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 4) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_lane_u8(a: uint8x8_t) -> uint8x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + simd_shuffle!( + a, + a, + [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 4) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_lane_p8(a: poly8x8_t) -> poly8x16_t { + static_assert_uimm_bits!(N, 3); + unsafe { + simd_shuffle!( + a, + a, + [ + N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, + N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32 + ] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 4) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_lane_s8(a: int8x8_t) -> int8x16_t { + static_assert_uimm_bits!(N, 3); + unsafe { + simd_shuffle!( + a, + a, + [ + N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, + N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32 + ] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 4) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_lane_u8(a: uint8x8_t) -> uint8x16_t { + 
static_assert_uimm_bits!(N, 3); + unsafe { + simd_shuffle!( + a, + a, + [ + N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, + N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32 + ] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, N = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, N = 0) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_lane_s64(a: int64x1_t) -> int64x1_t { + static_assert!(N == 0); + a +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, N = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, N = 0) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_lane_u64(a: uint64x1_t) -> uint64x1_t { + static_assert!(N == 0); + a +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 4) +)] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vdup_laneq_f16(a: float16x8_t) -> float16x4_t { + static_assert_uimm_bits!(N, 3); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 4) +)] +#[rustc_legacy_const_generics(1)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vdupq_laneq_f16(a: float16x8_t) -> float16x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + simd_shuffle!( + a, + a, + [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_f32)"] +#[inline] 
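+// The test-only assert_instr attributes below check that this lowers to VDUP.32 on Arm and to DUP on AArch64.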
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_laneq_f32(a: float32x4_t) -> float32x2_t { + static_assert_uimm_bits!(N, 2); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_laneq_s32(a: int32x4_t) -> int32x2_t { + static_assert_uimm_bits!(N, 2); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_laneq_u32(a: uint32x4_t) -> uint32x2_t { + static_assert_uimm_bits!(N, 2); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_laneq_f32(a: float32x4_t) -> float32x4_t { + static_assert_uimm_bits!(N, 2); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_laneq_s32(a: int32x4_t) -> int32x4_t { + static_assert_uimm_bits!(N, 2); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_laneq_u32(a: uint32x4_t) -> uint32x4_t { + static_assert_uimm_bits!(N, 2); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 4) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_laneq_p16(a: poly16x8_t) -> poly16x4_t { + static_assert_uimm_bits!(N, 3); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 4) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_laneq_s16(a: int16x8_t) -> int16x4_t { + static_assert_uimm_bits!(N, 3); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = 
"v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 4) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_laneq_u16(a: uint16x8_t) -> uint16x4_t { + static_assert_uimm_bits!(N, 3); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 4) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_laneq_p16(a: poly16x8_t) -> poly16x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + simd_shuffle!( + a, + a, + [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 4) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_laneq_s16(a: int16x8_t) -> int16x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + simd_shuffle!( + a, + a, + [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 4) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_laneq_u16(a: uint16x8_t) -> uint16x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + simd_shuffle!( + a, + a, + [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 8))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 8) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_laneq_p8(a: poly8x16_t) -> poly8x8_t { + static_assert_uimm_bits!(N, 4); + unsafe { + simd_shuffle!( + a, + a, + [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 8))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 8) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_laneq_s8(a: int8x16_t) -> int8x8_t { + static_assert_uimm_bits!(N, 4); + unsafe { + simd_shuffle!( + a, + a, + [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 8))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 8) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_laneq_u8(a: uint8x16_t) -> uint8x8_t { + static_assert_uimm_bits!(N, 4); + unsafe { + simd_shuffle!( + a, + a, + [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 8))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 8) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_laneq_p8(a: poly8x16_t) -> poly8x16_t { + static_assert_uimm_bits!(N, 4); + unsafe { + 
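// All 16 shuffle indices are N (0..=15), so every lane of the result is a copy of lane N of a. +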
simd_shuffle!( + a, + a, + [ + N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, + N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32 + ] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 8))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 8) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_laneq_s8(a: int8x16_t) -> int8x16_t { + static_assert_uimm_bits!(N, 4); + unsafe { + simd_shuffle!( + a, + a, + [ + N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, + N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32 + ] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 8))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 8) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_laneq_u8(a: uint8x16_t) -> uint8x16_t { + static_assert_uimm_bits!(N, 4); + unsafe { + simd_shuffle!( + a, + a, + [ + N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, + N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32 + ] + ) + } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_laneq_s64(a: int64x2_t) -> int64x1_t { + static_assert_uimm_bits!(N, 1); + unsafe { transmute::(simd_extract!(a, N as u32)) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + 
assert_instr(nop, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_laneq_u64(a: uint64x2_t) -> uint64x1_t { + static_assert_uimm_bits!(N, 1); + unsafe { transmute::(simd_extract!(a, N as u32)) } +} +#[doc = "Create a new vector with all lanes set to a value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_n_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vdup_n_f16(a: f16) -> float16x4_t { + float16x4_t::splat(a) +} +#[doc = "Create a new vector with all lanes set to a value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vdupq_n_f16(a: f16) -> float16x8_t { + float16x8_t::splat(a) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_n_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_n_f32(value: f32) -> float32x2_t { + float32x2_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_n_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_n_p16(value: p16) -> poly16x4_t { + poly16x4_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_n_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + 
assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_n_p8(value: p8) -> poly8x8_t { + poly8x8_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_n_s16(value: i16) -> int16x4_t { + int16x4_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_n_s32(value: i32) -> int32x2_t { + int32x2_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmov) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_n_s64(value: i64) -> int64x1_t { + int64x1_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_n_s8(value: i8) -> int8x8_t { + int8x8_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = 
"v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_n_u16(value: u16) -> uint16x4_t { + uint16x4_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_n_u32(value: u32) -> uint32x2_t { + uint32x2_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmov) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_n_u64(value: u64) -> uint64x1_t { + uint64x1_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdup_n_u8(value: u8) -> uint8x8_t { + uint8x8_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_n_f32(value: f32) -> float32x4_t { + float32x4_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_n_p16(value: p16) -> poly16x8_t { + poly16x8_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_n_p8(value: p8) -> poly8x16_t { + poly8x16_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_n_s16(value: i16) -> int16x8_t { + int16x8_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_n_s32(value: i32) -> int32x4_t { + int32x4_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", 
issue = "111800") +)] +pub fn vdupq_n_s64(value: i64) -> int64x2_t { + int64x2_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_n_s8(value: i8) -> int8x16_t { + int8x16_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_n_u16(value: u16) -> uint16x8_t { + uint16x8_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_n_u32(value: u32) -> uint32x4_t { + uint32x4_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_n_u64(value: u64) -> uint64x2_t { + uint64x2_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + 
not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_n_u8(value: u8) -> uint8x16_t { + uint8x16_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_n_f32_vfp4)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +fn vdup_n_f32_vfp4(value: f32) -> float32x2_t { + float32x2_t::splat(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_f32_vfp4)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +fn vdupq_n_f32_vfp4(value: f32) -> float32x4_t { + float32x4_t::splat(value) +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 0) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_lane_s64(a: int64x1_t) -> int64x2_t { + static_assert!(N == 0); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 0) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_lane_u64(a: uint64x1_t) -> uint64x2_t { + static_assert!(N == 0); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_laneq_s64(a: int64x2_t) -> int64x2_t { + static_assert_uimm_bits!(N, 1); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32]) } +} +#[doc = "Set all vector lanes to the same value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup, N = 1) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vdupq_laneq_u64(a: uint64x2_t) -> uint64x2_t { + static_assert_uimm_bits!(N, 1); + unsafe { simd_shuffle!(a, a, [N as u32, N as u32]) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veor_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veorq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veorq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + 
assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veor_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veorq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veorq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veor_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veorq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veorq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veor_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veorq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veorq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veor_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veorq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veorq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veor_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veorq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veorq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = 
"[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veor_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veorq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veorq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veor_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise exclusive or (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veorq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(eor) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn veorq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_xor(a, b) } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 3) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vext_f16(a: float16x4_t, b: float16x4_t) -> 
float16x4_t { + static_assert_uimm_bits!(N, 2); + unsafe { + match N & 0b11 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), + 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), + 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vext_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + static_assert_uimm_bits!(N, 1); + unsafe { + match N & 0b1 { + 0 => simd_shuffle!(a, b, [0, 1]), + 1 => simd_shuffle!(a, b, [1, 2]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vext_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert_uimm_bits!(N, 1); + unsafe { + match N & 0b1 { + 0 => simd_shuffle!(a, b, [0, 1]), + 1 => simd_shuffle!(a, b, [1, 2]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vext_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + static_assert_uimm_bits!(N, 1); + unsafe { + match N & 0b1 { + 0 => simd_shuffle!(a, b, [0, 1]), + 1 => simd_shuffle!(a, b, [1, 2]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, N = 0))] +#[cfg_attr( + 
all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, N = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vext_s64(a: int64x1_t, _b: int64x1_t) -> int64x1_t { + static_assert!(N == 0); + a +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, N = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, N = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vext_u64(a: uint64x1_t, _b: uint64x1_t) -> uint64x1_t { + static_assert!(N == 0); + a +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 7))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 7) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vext_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + match N & 0b111 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), + 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), + 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), + 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), + 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), + 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), + 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 7))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 7) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vextq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + match N & 0b111 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 
6, 7, 8]), + 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), + 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), + 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), + 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), + 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), + 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 7))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 7) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vext_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + match N & 0b111 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), + 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), + 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), + 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), + 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), + 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), + 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 7))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 7) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vextq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + match N & 0b111 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), + 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), + 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), + 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), + 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), + 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), + 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 7))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 7) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + 
not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vext_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + match N & 0b111 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), + 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), + 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), + 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), + 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), + 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), + 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 7))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 7) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vextq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + match N & 0b111 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), + 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), + 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), + 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), + 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), + 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), + 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 7))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 7) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vextq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + match N & 0b111 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), + 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), + 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), + 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), + 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), + 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), + 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = 
"arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 3) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vextq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + static_assert_uimm_bits!(N, 2); + unsafe { + match N & 0b11 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), + 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), + 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 3) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vext_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(N, 2); + unsafe { + match N & 0b11 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), + 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), + 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 3) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vextq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + static_assert_uimm_bits!(N, 2); + unsafe { + match N & 0b11 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), + 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), + 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 3) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vext_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + static_assert_uimm_bits!(N, 2); + unsafe { + match N & 0b11 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), + 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), + 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 3) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vextq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + static_assert_uimm_bits!(N, 2); + unsafe { + match N & 0b11 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), + 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), + 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 3) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vext_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + static_assert_uimm_bits!(N, 2); + unsafe { + match N & 0b11 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), + 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), + 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), + 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vextq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + static_assert_uimm_bits!(N, 1); + unsafe { + match N & 0b1 { + 0 => simd_shuffle!(a, b, [0, 1]), + 1 => simd_shuffle!(a, b, [1, 2]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vextq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + static_assert_uimm_bits!(N, 1); + unsafe { + match N & 0b1 { + 0 => simd_shuffle!(a, b, [0, 1]), + 1 => simd_shuffle!(a, b, [1, 2]), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 15))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 15) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vextq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + static_assert_uimm_bits!(N, 4); + unsafe { + match N & 0b1111 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle!( + a, + b, + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + ), + 2 => simd_shuffle!( + a, + b, + [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17] + ), + 3 => simd_shuffle!( + a, + b, + [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] + ), + 4 => simd_shuffle!( + a, + b, + [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] + ), + 5 => simd_shuffle!( + a, + b, + [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] + ), + 6 => simd_shuffle!( + a, + b, + [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] + ), + 7 => simd_shuffle!( + a, + b, + [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22] + ), + 8 => simd_shuffle!( + a, + b, + [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23] + ), + 9 => simd_shuffle!( + a, + b, + [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24] + ), + 10 => simd_shuffle!( + a, + b, + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25] + ), + 11 => simd_shuffle!( + a, + b, + [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26] + ), + 12 => simd_shuffle!( + a, + b, + [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] + ), + 13 => simd_shuffle!( + a, + b, + [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28] + ), + 14 => simd_shuffle!( + a, + b, + [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29] + ), + 15 => simd_shuffle!( + a, + b, + [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30] + ), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_u8)"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 15))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 15) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vextq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + static_assert_uimm_bits!(N, 4); + unsafe { + match N & 0b1111 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle!( + a, + b, + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + ), + 2 => simd_shuffle!( + a, + b, + [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17] + ), + 3 => simd_shuffle!( + a, + b, + [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] + ), + 4 => simd_shuffle!( + a, + b, + [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] + ), + 5 => simd_shuffle!( + a, + b, + [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] + ), + 6 => simd_shuffle!( + a, + b, + [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] + ), + 7 => simd_shuffle!( + a, + b, + [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22] + ), + 8 => simd_shuffle!( + a, + b, + [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23] + ), + 9 => simd_shuffle!( + a, + b, + [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24] + ), + 10 => simd_shuffle!( + a, + b, + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25] + ), + 11 => simd_shuffle!( + a, + b, + [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26] + ), + 12 => simd_shuffle!( + a, + b, + [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] + ), + 13 => simd_shuffle!( + a, + b, + [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28] + ), + 14 => simd_shuffle!( + a, + b, + [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29] + ), + 15 => simd_shuffle!( + a, + b, + [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30] + ), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Extract vector from pair of vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 15))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext, N = 15) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vextq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + static_assert_uimm_bits!(N, 4); + unsafe { + match N & 0b1111 { + 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle!( + a, + b, + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + ), + 2 => simd_shuffle!( + a, + b, + [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17] + ), + 3 => simd_shuffle!( + a, + b, + [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] + ), + 4 => simd_shuffle!( + a, + b, 
+ [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] + ), + 5 => simd_shuffle!( + a, + b, + [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] + ), + 6 => simd_shuffle!( + a, + b, + [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] + ), + 7 => simd_shuffle!( + a, + b, + [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22] + ), + 8 => simd_shuffle!( + a, + b, + [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23] + ), + 9 => simd_shuffle!( + a, + b, + [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24] + ), + 10 => simd_shuffle!( + a, + b, + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25] + ), + 11 => simd_shuffle!( + a, + b, + [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26] + ), + 12 => simd_shuffle!( + a, + b, + [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] + ), + 13 => simd_shuffle!( + a, + b, + [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28] + ), + 14 => simd_shuffle!( + a, + b, + [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29] + ), + 15 => simd_shuffle!( + a, + b, + [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30] + ), + _ => unreachable_unchecked(), + } + } +} +#[doc = "Floating-point fused Multiply-Add to accumulator (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmla) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfma_f16(a: float16x4_t, b: float16x4_t, c: float16x4_t) -> float16x4_t { + unsafe { simd_fma(b, c, a) } +} +#[doc = "Floating-point fused Multiply-Add to accumulator (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmla) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmaq_f16(a: float16x8_t, b: float16x8_t, c: float16x8_t) -> float16x8_t { + unsafe { simd_fma(b, c, a) } +} +#[doc = "Floating-point fused Multiply-Add to accumulator(vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vfma_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t { + unsafe { simd_fma(b, c, a) } +} +#[doc = "Floating-point fused Multiply-Add to accumulator(vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_f32)"] +#[inline] +#[target_feature(enable 
= "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vfmaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t { + unsafe { simd_fma(b, c, a) } +} +#[doc = "Floating-point fused Multiply-Add to accumulator(vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_n_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t { + vfma_f32(a, b, vdup_n_f32_vfp4(c)) +} +#[doc = "Floating-point fused Multiply-Add to accumulator(vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_n_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t { + vfmaq_f32(a, b, vdupq_n_f32_vfp4(c)) +} +#[doc = "Floating-point fused multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmls) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfms_f16(a: float16x4_t, b: float16x4_t, c: float16x4_t) -> float16x4_t { + unsafe { + let b: float16x4_t = simd_neg(b); + vfma_f16(a, b, c) + } +} +#[doc = "Floating-point fused multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmls) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vfmsq_f16(a: float16x8_t, b: 
float16x8_t, c: float16x8_t) -> float16x8_t { + unsafe { + let b: float16x8_t = simd_neg(b); + vfmaq_f16(a, b, c) + } +} +#[doc = "Floating-point fused multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vfms_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t { + unsafe { + let b: float32x2_t = simd_neg(b); + vfma_f32(a, b, c) + } +} +#[doc = "Floating-point fused multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vfmsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t { + unsafe { + let b: float32x4_t = simd_neg(b); + vfmaq_f32(a, b, c) + } +} +#[doc = "Floating-point fused Multiply-subtract to accumulator(vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_n_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vfms_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t { + vfms_f32(a, b, vdup_n_f32_vfp4(c)) +} +#[doc = "Floating-point fused Multiply-subtract to accumulator(vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_n_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vfmsq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t { + vfmsq_f32(a, b, vdupq_n_f32_vfp4(c)) +} +#[doc = "Duplicate vector element to vector"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_high_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vget_high_f16(a: float16x8_t) -> float16x4_t { + unsafe { simd_shuffle!(a, a, [4, 5, 6, 7]) } +} +#[doc = "Duplicate vector element to vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_low_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(nop))] +pub fn vget_low_f16(a: float16x8_t) -> float16x4_t { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_high_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_high_f32(a: float32x4_t) -> float32x2_t { + unsafe { simd_shuffle!(a, a, [2, 3]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_high_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_high_p16(a: poly16x8_t) -> poly16x4_t { + unsafe { simd_shuffle!(a, a, [4, 5, 6, 7]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_high_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_high_p8(a: poly8x16_t) -> poly8x8_t { + unsafe { simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = 
"arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_high_s16(a: int16x8_t) -> int16x4_t { + unsafe { simd_shuffle!(a, a, [4, 5, 6, 7]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_high_s32(a: int32x4_t) -> int32x2_t { + unsafe { simd_shuffle!(a, a, [2, 3]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_high_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_high_s8(a: int8x16_t) -> int8x8_t { + unsafe { simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_high_u16(a: uint16x8_t) -> uint16x4_t { + unsafe { simd_shuffle!(a, a, [4, 5, 6, 7]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_high_u32(a: uint32x4_t) -> uint32x2_t { + unsafe { simd_shuffle!(a, a, [2, 3]) } +} +#[doc = "Duplicate 
vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_high_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_high_u8(a: uint8x16_t) -> uint8x8_t { + unsafe { simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_high_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_high_s64(a: int64x2_t) -> int64x1_t { + unsafe { int64x1_t([simd_extract!(a, 1)]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_high_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ext) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_high_u64(a: uint64x2_t) -> uint64x1_t { + unsafe { uint64x1_t([simd_extract!(a, 1)]) } +} +#[doc = "Duplicate vector element to scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_lane_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vget_lane_f16(a: float16x4_t) -> f16 { + static_assert_uimm_bits!(LANE, 2); + unsafe { simd_extract!(a, LANE as u32) } +} +#[doc = "Duplicate vector element to scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vgetq_lane_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_neon_f16", 
issue = "136306")] +pub fn vgetq_lane_f16(a: float16x8_t) -> f16 { + static_assert_uimm_bits!(LANE, 3); + unsafe { simd_extract!(a, LANE as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 1))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_lane_f32(v: float32x2_t) -> f32 { + static_assert_uimm_bits!(IMM5, 1); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_lane_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 2))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_lane_p16(v: poly16x4_t) -> p16 { + static_assert_uimm_bits!(IMM5, 2); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_lane_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 2))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_lane_p8(v: poly8x8_t) -> p8 { + static_assert_uimm_bits!(IMM5, 3); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 2))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_lane_s16(v: int16x4_t) -> i16 { + static_assert_uimm_bits!(IMM5, 2); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 1))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", 
issue = "111800") +)] +pub fn vget_lane_s32(v: int32x2_t) -> i32 { + static_assert_uimm_bits!(IMM5, 1); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_lane_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 2))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_lane_s8(v: int8x8_t) -> i8 { + static_assert_uimm_bits!(IMM5, 3); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 2))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_lane_u16(v: uint16x4_t) -> u16 { + static_assert_uimm_bits!(IMM5, 2); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 1))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_lane_u32(v: uint32x2_t) -> u32 { + static_assert_uimm_bits!(IMM5, 1); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_lane_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 2))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_lane_u8(v: uint8x8_t) -> u8 { + static_assert_uimm_bits!(IMM5, 3); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vgetq_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 1))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", 
issue = "111800") +)] +pub fn vgetq_lane_f32(v: float32x4_t) -> f32 { + static_assert_uimm_bits!(IMM5, 2); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vgetq_lane_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 2))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vgetq_lane_p16(v: poly16x8_t) -> p16 { + static_assert_uimm_bits!(IMM5, 3); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vgetq_lane_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 1))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vgetq_lane_p64(v: poly64x2_t) -> p64 { + static_assert_uimm_bits!(IMM5, 1); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vgetq_lane_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 2))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vgetq_lane_p8(v: poly8x16_t) -> p8 { + static_assert_uimm_bits!(IMM5, 4); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vgetq_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 2))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vgetq_lane_s16(v: int16x8_t) -> i16 { + static_assert_uimm_bits!(IMM5, 3); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vgetq_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 2))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vgetq_lane_s32(v: int32x4_t) -> i32 { + static_assert_uimm_bits!(IMM5, 2); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vgetq_lane_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 1))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vgetq_lane_s64(v: int64x2_t) -> i64 { + static_assert_uimm_bits!(IMM5, 1); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vgetq_lane_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 2))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vgetq_lane_s8(v: int8x16_t) -> i8 { + static_assert_uimm_bits!(IMM5, 4); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vgetq_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 2))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vgetq_lane_u16(v: uint16x8_t) -> u16 { + static_assert_uimm_bits!(IMM5, 3); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vgetq_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 2))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vgetq_lane_u32(v: uint32x4_t) -> u32 { + static_assert_uimm_bits!(IMM5, 2); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vgetq_lane_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 1))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + 
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vgetq_lane_u64(v: uint64x2_t) -> u64 { + static_assert_uimm_bits!(IMM5, 2); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vgetq_lane_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 2))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vgetq_lane_u8(v: uint8x16_t) -> u8 { + static_assert_uimm_bits!(IMM5, 4); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_lane_p64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 0))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_lane_p64(v: poly64x1_t) -> p64 { + static_assert!(IMM5 == 0); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_lane_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 0))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_lane_s64(v: int64x1_t) -> i64 { + static_assert!(IMM5 == 0); + unsafe { simd_extract!(v, IMM5 as u32) } +} +#[doc = "Move vector element to general-purpose register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_lane_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(1)] +#[cfg_attr(test, assert_instr(nop, IMM5 = 0))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_lane_u64(v: uint64x1_t) -> u64 { + static_assert!(IMM5 == 0); + unsafe { simd_extract!(v, 0) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_low_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub 
fn vget_low_f32(a: float32x4_t) -> float32x2_t { + unsafe { simd_shuffle!(a, a, [0, 1]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_low_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_low_p16(a: poly16x8_t) -> poly16x4_t { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_low_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_low_p8(a: poly8x16_t) -> poly8x8_t { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_low_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_low_s16(a: int16x8_t) -> int16x4_t { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_low_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_low_s32(a: int32x4_t) -> int32x2_t { + unsafe { simd_shuffle!(a, a, [0, 1]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_low_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_low_s8(a: int8x16_t) -> int8x8_t { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_low_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable 
= "v7"))] +#[cfg_attr(test, assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_low_u16(a: uint16x8_t) -> uint16x4_t { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_low_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_low_u32(a: uint32x4_t) -> uint32x2_t { + unsafe { simd_shuffle!(a, a, [0, 1]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_low_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_low_u8(a: uint8x16_t) -> uint8x8_t { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_low_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_low_s64(a: int64x2_t) -> int64x1_t { + unsafe { int64x1_t([simd_extract!(a, 0)]) } +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vget_low_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(nop))] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vget_low_u64(a: uint64x2_t) -> uint64x1_t { + unsafe { uint64x1_t([simd_extract!(a, 0)]) } +} +#[doc = "Halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhadd_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn 
vhadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.shadd.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v8i8")] + fn _vhadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } + unsafe { _vhadd_s8(a, b) } +} +#[doc = "Halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhaddq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.shadd.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v16i8")] + fn _vhaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } + unsafe { _vhaddq_s8(a, b) } +} +#[doc = "Halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhadd_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.shadd.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v4i16")] + fn _vhadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vhadd_s16(a, b) } +} +#[doc = "Halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhaddq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.shadd.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v8i16")] + fn _vhaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vhaddq_s16(a, b) } +} +#[doc = "Halving add"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhadd_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.shadd.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v2i32")] + fn _vhadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vhadd_s32(a, b) } +} +#[doc = "Halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhaddq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.shadd.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v4i32")] + fn _vhaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vhaddq_s32(a, b) } +} +#[doc = "Halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhadd_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uhadd.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v8i8")] + fn _vhadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; + } + unsafe { _vhadd_u8(a, b) } +} +#[doc = "Halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhaddq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") 
+)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uhadd.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v16i8")] + fn _vhaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + } + unsafe { _vhaddq_u8(a, b) } +} +#[doc = "Halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhadd_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uhadd.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v4i16")] + fn _vhadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; + } + unsafe { _vhadd_u16(a, b) } +} +#[doc = "Halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhaddq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uhadd.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v8i16")] + fn _vhaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; + } + unsafe { _vhaddq_u16(a, b) } +} +#[doc = "Halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhadd_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uhadd.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v2i32")] + fn _vhadd_u32(a: uint32x2_t, b: uint32x2_t) -> 
uint32x2_t; + } + unsafe { _vhadd_u32(a, b) } +} +#[doc = "Halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhaddq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uhadd.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v4i32")] + fn _vhaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + unsafe { _vhaddq_u32(a, b) } +} +#[doc = "Signed halving subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhsub_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.shsub.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v4i16")] + fn _vhsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vhsub_s16(a, b) } +} +#[doc = "Signed halving subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhsubq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.shsub.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v8i16")] + fn _vhsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vhsubq_s16(a, b) } +} +#[doc = "Signed halving subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhsub_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", 
target_arch = "arm64ec")), + assert_instr(shsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.shsub.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v2i32")] + fn _vhsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vhsub_s32(a, b) } +} +#[doc = "Signed halving subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhsubq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.shsub.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v4i32")] + fn _vhsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vhsubq_s32(a, b) } +} +#[doc = "Signed halving subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhsub_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.shsub.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v8i8")] + fn _vhsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } + unsafe { _vhsub_s8(a, b) } +} +#[doc = "Signed halving subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhsubq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = 
"llvm.aarch64.neon.shsub.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v16i8")] + fn _vhsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } + unsafe { _vhsubq_s8(a, b) } +} +#[doc = "Signed halving subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhsub_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uhsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uhsub.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v8i8")] + fn _vhsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; + } + unsafe { _vhsub_u8(a, b) } +} +#[doc = "Signed halving subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhsubq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uhsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uhsub.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v16i8")] + fn _vhsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + } + unsafe { _vhsubq_u8(a, b) } +} +#[doc = "Signed halving subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhsub_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uhsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uhsub.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v4i16")] + fn _vhsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; + } + unsafe { _vhsub_u16(a, b) } +} +#[doc = "Signed halving subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhsubq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uhsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uhsub.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v8i16")] + fn _vhsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; + } + unsafe { _vhsubq_u16(a, b) } +} +#[doc = "Signed halving subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhsub_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uhsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uhsub.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v2i32")] + fn _vhsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; + } + unsafe { _vhsub_u32(a, b) } +} +#[doc = "Signed halving subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vhsubq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uhsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vhsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uhsub.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v4i32")] + fn _vhsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + unsafe { _vhsubq_u32(a, b) } +} +#[doc = "Load one single-element structure and replicate to all lanes of one register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld1_dup_f16(ptr: 
*const f16) -> float16x4_t { + let x: float16x4_t = vld1_lane_f16::<0>(ptr, transmute(f16x4::splat(0.0))); + simd_shuffle!(x, x, [0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and replicate to all lanes of one register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld1q_dup_f16(ptr: *const f16) -> float16x8_t { + let x: float16x8_t = vld1q_lane_f16::<0>(ptr, transmute(f16x8::splat(0.0))); + simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_dup_f32(ptr: *const f32) -> float32x2_t { + let x = vld1_lane_f32::<0>(ptr, transmute(f32x2::splat(0.0))); + simd_shuffle!(x, x, [0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_dup_p16(ptr: *const p16) -> poly16x4_t { + let x = vld1_lane_p16::<0>(ptr, transmute(u16x4::splat(0))); + simd_shuffle!(x, x, [0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", 
issue = "111800") +)] +pub unsafe fn vld1_dup_p8(ptr: *const p8) -> poly8x8_t { + let x = vld1_lane_p8::<0>(ptr, transmute(u8x8::splat(0))); + simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_dup_s16(ptr: *const i16) -> int16x4_t { + let x = vld1_lane_s16::<0>(ptr, transmute(i16x4::splat(0))); + simd_shuffle!(x, x, [0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_dup_s32(ptr: *const i32) -> int32x2_t { + let x = vld1_lane_s32::<0>(ptr, transmute(i32x2::splat(0))); + simd_shuffle!(x, x, [0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_dup_s8(ptr: *const i8) -> int8x8_t { + let x = vld1_lane_s8::<0>(ptr, transmute(i8x8::splat(0))); + simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + 
not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_dup_u16(ptr: *const u16) -> uint16x4_t { + let x = vld1_lane_u16::<0>(ptr, transmute(u16x4::splat(0))); + simd_shuffle!(x, x, [0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_dup_u32(ptr: *const u32) -> uint32x2_t { + let x = vld1_lane_u32::<0>(ptr, transmute(u32x2::splat(0))); + simd_shuffle!(x, x, [0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_dup_u8(ptr: *const u8) -> uint8x8_t { + let x = vld1_lane_u8::<0>(ptr, transmute(u8x8::splat(0))); + simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_dup_f32(ptr: *const f32) -> float32x4_t { + let x = vld1q_lane_f32::<0>(ptr, transmute(f32x4::splat(0.0))); + simd_shuffle!(x, x, [0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_dup_p16(ptr: *const p16) -> poly16x8_t { + let x = vld1q_lane_p16::<0>(ptr, transmute(u16x8::splat(0))); + simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_dup_p8(ptr: *const p8) -> poly8x16_t { + let x = vld1q_lane_p8::<0>(ptr, transmute(u8x16::splat(0))); + simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_dup_s16(ptr: *const i16) -> int16x8_t { + let x = vld1q_lane_s16::<0>(ptr, transmute(i16x8::splat(0))); + simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_dup_s32(ptr: *const i32) -> int32x4_t { + let x = vld1q_lane_s32::<0>(ptr, transmute(i32x4::splat(0))); + simd_shuffle!(x, x, [0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_dup_s64(ptr: *const i64) -> int64x2_t { + let x = vld1q_lane_s64::<0>(ptr, transmute(i64x2::splat(0))); + simd_shuffle!(x, x, [0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_dup_s8(ptr: *const i8) -> int8x16_t { + let x = vld1q_lane_s8::<0>(ptr, transmute(i8x16::splat(0))); + simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_dup_u16(ptr: *const u16) -> uint16x8_t { + let x = vld1q_lane_u16::<0>(ptr, transmute(u16x8::splat(0))); + simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_dup_u32(ptr: *const u32) -> uint32x4_t { + 
let x = vld1q_lane_u32::<0>(ptr, transmute(u32x4::splat(0))); + simd_shuffle!(x, x, [0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_dup_u64(ptr: *const u64) -> uint64x2_t { + let x = vld1q_lane_u64::<0>(ptr, transmute(u64x2::splat(0))); + simd_shuffle!(x, x, [0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_dup_u8(ptr: *const u8) -> uint8x16_t { + let x = vld1q_lane_u8::<0>(ptr, transmute(u8x16::splat(0))); + simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ldr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_dup_p64(ptr: *const p64) -> poly64x1_t { + let x: poly64x1_t; + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + { + x = crate::core_arch::aarch64::vld1_p64(ptr); + } + #[cfg(target_arch = "arm")] + { + x = crate::core_arch::arm::vld1_p64(ptr); + }; + x +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", 
target_arch = "arm64ec")), + assert_instr(ldr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_dup_s64(ptr: *const i64) -> int64x1_t { + let x: int64x1_t; + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + { + x = crate::core_arch::aarch64::vld1_s64(ptr); + } + #[cfg(target_arch = "arm")] + { + x = crate::core_arch::arm::vld1_s64(ptr); + }; + x +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ldr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_dup_u64(ptr: *const u64) -> uint64x1_t { + let x: uint64x1_t; + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + { + x = crate::core_arch::aarch64::vld1_u64(ptr); + } + #[cfg(target_arch = "arm")] + { + x = crate::core_arch::arm::vld1_u64(ptr); + }; + x +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +pub unsafe fn vld1_f16(ptr: *const f16) -> float16x4_t { + transmute(vld1_v4f16( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +pub unsafe fn vld1_f16(ptr: *const f16) -> float16x4_t { + let ret_val: float16x4_t = transmute(vld1_v4f16( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] 
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +pub unsafe fn vld1q_f16(ptr: *const f16) -> float16x8_t { + transmute(vld1q_v8f16( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +pub unsafe fn vld1q_f16(ptr: *const f16) -> float16x8_t { + let ret_val: float16x8_t = transmute(vld1q_v8f16( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld1_f16_x2(a: *const f16) -> float16x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x2.v4f16.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v4f16.p0")] + fn _vld1_f16_x2(a: *const f16) -> float16x4x2_t; + } + _vld1_f16_x2(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld1_f16_x3(a: *const f16) -> float16x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x3.v4f16.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v4f16.p0")] + fn _vld1_f16_x3(a: *const f16) -> float16x4x3_t; + } + _vld1_f16_x3(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = 
"stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld1_f16_x4(a: *const f16) -> float16x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x4.v4f16.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v4f16.p0")] + fn _vld1_f16_x4(a: *const f16) -> float16x4x4_t; + } + _vld1_f16_x4(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld1q_f16_x2(a: *const f16) -> float16x8x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x2.v8f16.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v8f16.p0")] + fn _vld1q_f16_x2(a: *const f16) -> float16x8x2_t; + } + _vld1q_f16_x2(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld1q_f16_x3(a: *const f16) -> float16x8x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x3.v8f16.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v8f16.p0")] + fn _vld1q_f16_x3(a: *const f16) -> float16x8x3_t; + } + _vld1q_f16_x3(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld1q_f16_x4(a: *const f16) -> float16x8x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x4.v8f16.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v8f16.p0")] + fn _vld1q_f16_x4(a: *const f16) -> float16x8x4_t; + } + _vld1q_f16_x4(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr))] +pub unsafe fn vld1_f32(ptr: *const f32) -> float32x2_t { + transmute(vld1_v2f32( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr))] +pub unsafe fn vld1_f32(ptr: *const f32) -> float32x2_t { + let ret_val: float32x2_t = transmute(vld1_v2f32( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!(ret_val, ret_val, [1, 0]) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +pub unsafe fn vld1q_f32(ptr: *const f32) -> float32x4_t { + transmute(vld1q_v4f32( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +pub unsafe fn vld1q_f32(ptr: *const f32) -> float32x4_t { + let ret_val: float32x4_t = transmute(vld1q_v4f32( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +pub unsafe fn vld1_u8(ptr: *const u8) -> uint8x8_t { + transmute(vld1_v8i8( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8)"] +#[doc = "## 
Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +pub unsafe fn vld1_u8(ptr: *const u8) -> uint8x8_t { + let ret_val: uint8x8_t = transmute(vld1_v8i8( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +pub unsafe fn vld1q_u8(ptr: *const u8) -> uint8x16_t { + transmute(vld1q_v16i8( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +pub unsafe fn vld1q_u8(ptr: *const u8) -> uint8x16_t { + let ret_val: uint8x16_t = transmute(vld1q_v16i8( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +pub unsafe fn vld1_u16(ptr: *const u16) -> uint16x4_t { + transmute(vld1_v4i16( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +pub unsafe fn vld1_u16(ptr: *const u16) -> uint16x4_t { + let ret_val: uint16x4_t = transmute(vld1_v4i16( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +pub unsafe fn vld1q_u16(ptr: *const u16) -> uint16x8_t { + transmute(vld1q_v8i16( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +pub unsafe fn vld1q_u16(ptr: *const u16) -> uint16x8_t { + let ret_val: uint16x8_t = transmute(vld1q_v8i16( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr))] +pub unsafe fn vld1_u32(ptr: *const u32) -> uint32x2_t { + transmute(vld1_v2i32( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr))] +pub unsafe fn vld1_u32(ptr: *const u32) -> uint32x2_t { + let ret_val: uint32x2_t = transmute(vld1_v2i32( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!(ret_val, ret_val, [1, 0]) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +pub unsafe fn vld1q_u32(ptr: *const u32) -> uint32x4_t { + transmute(vld1q_v4i32( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32)"] 
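// ---------------------------------------------------------------------------
// Editor's note (not part of the diff): each `#[cfg(target_endian = "big")]`
// variant in this file reverses the lanes with `simd_shuffle!` after the raw
// LLVM load, so the intrinsic yields the same element order as its
// little-endian counterpart: lane 0 corresponds to the element at the lowest
// address. A small sketch of that guarantee, assuming an AArch64 target;
// names are illustrative only.
#[cfg(target_arch = "aarch64")]
fn element_order_sketch() {
    use core::arch::aarch64::{vgetq_lane_u32, vld1q_u32};
    let data: [u32; 4] = [10, 20, 30, 40];
    // SAFETY: `data` provides the 16 readable bytes that `vld1q_u32` needs.
    let v = unsafe { vld1q_u32(data.as_ptr()) };
    // Lane i holds data[i] regardless of the target's byte order.
    assert_eq!(unsafe { vgetq_lane_u32::<0>(v) }, 10);
    assert_eq!(unsafe { vgetq_lane_u32::<3>(v) }, 40);
}
// ---------------------------------------------------------------------------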
+#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +pub unsafe fn vld1q_u32(ptr: *const u32) -> uint32x4_t { + let ret_val: uint32x4_t = transmute(vld1q_v4i32( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr))] +pub unsafe fn vld1_u64(ptr: *const u64) -> uint64x1_t { + transmute(vld1_v1i64( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.64"))] +pub unsafe fn vld1q_u64(ptr: *const u64) -> uint64x2_t { + transmute(vld1q_v2i64( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.64"))] +pub unsafe fn vld1q_u64(ptr: *const u64) -> uint64x2_t { + let ret_val: uint64x2_t = transmute(vld1q_v2i64( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!(ret_val, ret_val, [1, 0]) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +pub unsafe fn vld1_p8(ptr: *const p8) -> poly8x8_t { + transmute(vld1_v8i8( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] 
+#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +pub unsafe fn vld1_p8(ptr: *const p8) -> poly8x8_t { + let ret_val: poly8x8_t = transmute(vld1_v8i8( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +pub unsafe fn vld1q_p8(ptr: *const p8) -> poly8x16_t { + transmute(vld1q_v16i8( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +pub unsafe fn vld1q_p8(ptr: *const p8) -> poly8x16_t { + let ret_val: poly8x16_t = transmute(vld1q_v16i8( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +pub unsafe fn vld1_p16(ptr: *const p16) -> poly16x4_t { + transmute(vld1_v4i16( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +pub unsafe fn vld1_p16(ptr: *const p16) -> poly16x4_t { + let ret_val: poly16x4_t = transmute(vld1_v4i16( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = 
"little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +pub unsafe fn vld1q_p16(ptr: *const p16) -> poly16x8_t { + transmute(vld1q_v8i16( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +pub unsafe fn vld1q_p16(ptr: *const p16) -> poly16x8_t { + let ret_val: poly16x8_t = transmute(vld1q_v8i16( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,aes")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.64"))] +pub unsafe fn vld1q_p64(ptr: *const p64) -> poly64x2_t { + transmute(vld1q_v2i64( + ptr as *const i8, + crate::mem::align_of::() as i32, + )) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,aes")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.64"))] +pub unsafe fn vld1q_p64(ptr: *const p64) -> poly64x2_t { + let ret_val: poly64x2_t = transmute(vld1q_v2i64( + ptr as *const i8, + crate::mem::align_of::() as i32, + )); + simd_shuffle!(ret_val, ret_val, [1, 0]) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_f32_x2(a: *const f32) -> float32x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x2.v2f32.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = 
"llvm.arm.neon.vld1x2.v2f32.p0")] + fn _vld1_f32_x2(a: *const f32) -> float32x2x2_t; + } + _vld1_f32_x2(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_f32_x3(a: *const f32) -> float32x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x3.v2f32.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v2f32.p0")] + fn _vld1_f32_x3(a: *const f32) -> float32x2x3_t; + } + _vld1_f32_x3(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_f32_x4(a: *const f32) -> float32x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x4.v2f32.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v2f32.p0")] + fn _vld1_f32_x4(a: *const f32) -> float32x2x4_t; + } + _vld1_f32_x4(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_f32_x2(a: *const f32) -> float32x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x2.v4f32.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v4f32.p0")] + fn _vld1q_f32_x2(a: *const f32) -> float32x4x2_t; + } + _vld1q_f32_x2(a) +} +#[doc = "Load multiple single-element 
structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_f32_x3(a: *const f32) -> float32x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x3.v4f32.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v4f32.p0")] + fn _vld1q_f32_x3(a: *const f32) -> float32x4x3_t; + } + _vld1q_f32_x3(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_f32_x4(a: *const f32) -> float32x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x4.v4f32.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v4f32.p0")] + fn _vld1q_f32_x4(a: *const f32) -> float32x4x4_t; + } + _vld1q_f32_x4(a) +} +#[doc = "Load one single-element structure to one lane of one register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld1_lane_f16(ptr: *const f16, src: float16x4_t) -> float16x4_t { + static_assert_uimm_bits!(LANE, 2); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE 
= 0) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld1q_lane_f16(ptr: *const f16, src: float16x8_t) -> float16x8_t { + static_assert_uimm_bits!(LANE, 3); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_lane_f32(ptr: *const f32, src: float32x2_t) -> float32x2_t { + static_assert_uimm_bits!(LANE, 1); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", LANE = 3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_lane_p16(ptr: *const p16, src: poly16x4_t) -> poly16x4_t { + static_assert_uimm_bits!(LANE, 2); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", LANE = 7))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 7) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_lane_p8(ptr: *const p8, src: poly8x8_t) -> poly8x8_t { + static_assert_uimm_bits!(LANE, 3); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = 
"v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", LANE = 3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_lane_s16(ptr: *const i16, src: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 2); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_lane_s32(ptr: *const i32, src: int32x2_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ldr, LANE = 0) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_lane_s64(ptr: *const i64, src: int64x1_t) -> int64x1_t { + static_assert!(LANE == 0); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", LANE = 7))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 7) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_lane_s8(ptr: *const i8, src: int8x8_t) -> int8x8_t { + static_assert_uimm_bits!(LANE, 3); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = 
"[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", LANE = 3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_lane_u16(ptr: *const u16, src: uint16x4_t) -> uint16x4_t { + static_assert_uimm_bits!(LANE, 2); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_lane_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_lane_u32(ptr: *const u32, src: uint32x2_t) -> uint32x2_t { + static_assert_uimm_bits!(LANE, 1); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ldr, LANE = 0) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_lane_u64(ptr: *const u64, src: uint64x1_t) -> uint64x1_t { + static_assert!(LANE == 0); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", LANE = 7))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 7) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_lane_u8(ptr: *const u8, src: uint8x8_t) -> uint8x8_t { + static_assert_uimm_bits!(LANE, 3); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", LANE = 3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_lane_f32(ptr: *const f32, src: float32x4_t) -> float32x4_t { + static_assert_uimm_bits!(LANE, 2); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", LANE = 7))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 7) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_lane_p16(ptr: *const p16, src: poly16x8_t) -> poly16x8_t { + static_assert_uimm_bits!(LANE, 3); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", LANE = 15))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 15) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_lane_p8(ptr: *const p8, src: poly8x16_t) -> poly8x16_t { + static_assert_uimm_bits!(LANE, 4); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), 
assert_instr("vld1.16", LANE = 7))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 7) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_lane_s16(ptr: *const i16, src: int16x8_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 3); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", LANE = 3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_lane_s32(ptr: *const i32, src: int32x4_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_lane_s64(ptr: *const i64, src: int64x2_t) -> int64x2_t { + static_assert_uimm_bits!(LANE, 1); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", LANE = 15))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 15) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_lane_s8(ptr: *const i8, src: int8x16_t) -> int8x16_t { + static_assert_uimm_bits!(LANE, 4); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", LANE = 7))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 7) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_lane_u16(ptr: *const u16, src: uint16x8_t) -> uint16x8_t { + static_assert_uimm_bits!(LANE, 3); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_lane_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", LANE = 3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_lane_u32(ptr: *const u32, src: uint32x4_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 2); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_lane_u64(ptr: *const u64, src: uint64x2_t) -> uint64x2_t { + static_assert_uimm_bits!(LANE, 1); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", LANE = 15))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 15) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_lane_u8(ptr: *const u8, src: uint8x16_t) -> uint8x16_t { + static_assert_uimm_bits!(LANE, 4); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ldr, LANE = 0) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_lane_p64(ptr: *const p64, src: poly64x1_t) -> poly64x1_t { + static_assert!(LANE == 0); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load one single-element structure to one lane of one register."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1, LANE = 1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_lane_p64(ptr: *const p64, src: poly64x2_t) -> poly64x2_t { + static_assert_uimm_bits!(LANE, 1); + simd_insert!(src, LANE as u32, *ptr) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,aes")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr))] +pub unsafe fn vld1_p64(ptr: *const p64) -> poly64x1_t { + let a: *const i8 = ptr as *const i8; + let b: i32 = crate::mem::align_of::() as i32; + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v1i64")] + fn _vld1_v1i64(a: *const i8, b: i32) -> int64x1_t; + } + transmute(_vld1_v1i64(a, b)) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + 
stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_p64_x2(a: *const p64) -> poly64x1x2_t { + transmute(vld1_s64_x2(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_p64_x3(a: *const p64) -> poly64x1x3_t { + transmute(vld1_s64_x3(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_p64_x4(a: *const p64) -> poly64x1x4_t { + transmute(vld1_s64_x4(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p64_x2(a: *const p64) -> poly64x2x2_t { + transmute(vld1q_s64_x2(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = 
"neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p64_x2(a: *const p64) -> poly64x2x2_t { + let mut ret_val: poly64x2x2_t = transmute(vld1q_s64_x2(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p64_x3(a: *const p64) -> poly64x2x3_t { + transmute(vld1q_s64_x3(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p64_x3(a: *const p64) -> poly64x2x3_t { + let mut ret_val: poly64x2x3_t = transmute(vld1q_s64_x3(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p64_x4(a: *const p64) -> poly64x2x4_t { + transmute(vld1q_s64_x4(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p64_x4(a: *const p64) -> poly64x2x4_t { + let mut ret_val: poly64x2x4_t = transmute(vld1q_s64_x4(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +pub unsafe fn vld1_s8(ptr: *const i8) -> int8x8_t { + vld1_v8i8(ptr as *const i8, crate::mem::align_of::<i8>() as i32) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +pub unsafe fn vld1q_s8(ptr: *const i8) -> int8x16_t { + vld1q_v16i8(ptr as *const i8, crate::mem::align_of::<i8>() as i32) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +pub unsafe fn vld1_s16(ptr: *const i16) -> int16x4_t { + vld1_v4i16(ptr as *const i8, crate::mem::align_of::<i16>() as i32) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +pub unsafe fn vld1q_s16(ptr: *const i16) -> int16x8_t { + vld1q_v8i16(ptr as *const i8, crate::mem::align_of::<i16>() as i32) +} +#[doc = "Load 
multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr))] +pub unsafe fn vld1_s32(ptr: *const i32) -> int32x2_t { + vld1_v2i32(ptr as *const i8, crate::mem::align_of::<i32>() as i32) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +pub unsafe fn vld1q_s32(ptr: *const i32) -> int32x4_t { + vld1q_v4i32(ptr as *const i8, crate::mem::align_of::<i32>() as i32) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr))] +pub unsafe fn vld1_s64(ptr: *const i64) -> int64x1_t { + vld1_v1i64(ptr as *const i8, crate::mem::align_of::<i64>() as i32) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.64"))] +pub unsafe fn vld1q_s64(ptr: *const i64) -> int64x2_t { + vld1q_v2i64(ptr as *const i8, crate::mem::align_of::<i64>() as i32) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_s8_x2(a: *const i8) -> int8x8x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x2.v8i8.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v8i8.p0")] + fn _vld1_s8_x2(a: *const i8) -> int8x8x2_t; + } + _vld1_s8_x2(a) +} +#[doc = "Load multiple 
single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_s8_x3(a: *const i8) -> int8x8x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x3.v8i8.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v8i8.p0")] + fn _vld1_s8_x3(a: *const i8) -> int8x8x3_t; + } + _vld1_s8_x3(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_s8_x4(a: *const i8) -> int8x8x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x4.v8i8.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v8i8.p0")] + fn _vld1_s8_x4(a: *const i8) -> int8x8x4_t; + } + _vld1_s8_x4(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_s8_x2(a: *const i8) -> int8x16x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x2.v16i8.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v16i8.p0")] + fn _vld1q_s8_x2(a: *const i8) -> int8x16x2_t; + } + _vld1q_s8_x2(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s8_x3)"] +#[doc = "## 
Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_s8_x3(a: *const i8) -> int8x16x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x3.v16i8.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v16i8.p0")] + fn _vld1q_s8_x3(a: *const i8) -> int8x16x3_t; + } + _vld1q_s8_x3(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_s8_x4(a: *const i8) -> int8x16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x4.v16i8.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v16i8.p0")] + fn _vld1q_s8_x4(a: *const i8) -> int8x16x4_t; + } + _vld1q_s8_x4(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_s16_x2(a: *const i16) -> int16x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x2.v4i16.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v4i16.p0")] + fn _vld1_s16_x2(a: *const i16) -> int16x4x2_t; + } + _vld1_s16_x2(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_s16_x3(a: *const i16) -> int16x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x3.v4i16.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v4i16.p0")] + fn _vld1_s16_x3(a: *const i16) -> int16x4x3_t; + } + _vld1_s16_x3(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_s16_x4(a: *const i16) -> int16x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x4.v4i16.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v4i16.p0")] + fn _vld1_s16_x4(a: *const i16) -> int16x4x4_t; + } + _vld1_s16_x4(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_s16_x2(a: *const i16) -> int16x8x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x2.v8i16.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v8i16.p0")] + fn _vld1q_s16_x2(a: *const i16) -> int16x8x2_t; + } + _vld1q_s16_x2(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) 
+)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_s16_x3(a: *const i16) -> int16x8x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x3.v8i16.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v8i16.p0")] + fn _vld1q_s16_x3(a: *const i16) -> int16x8x3_t; + } + _vld1q_s16_x3(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_s16_x4(a: *const i16) -> int16x8x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x4.v8i16.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v8i16.p0")] + fn _vld1q_s16_x4(a: *const i16) -> int16x8x4_t; + } + _vld1q_s16_x4(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_s32_x2(a: *const i32) -> int32x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x2.v2i32.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v2i32.p0")] + fn _vld1_s32_x2(a: *const i32) -> int32x2x2_t; + } + _vld1_s32_x2(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_s32_x3(a: *const i32) -> int32x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x3.v2i32.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v2i32.p0")] + fn _vld1_s32_x3(a: *const i32) -> int32x2x3_t; + } + _vld1_s32_x3(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_s32_x4(a: *const i32) -> int32x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x4.v2i32.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v2i32.p0")] + fn _vld1_s32_x4(a: *const i32) -> int32x2x4_t; + } + _vld1_s32_x4(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_s32_x2(a: *const i32) -> int32x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x2.v4i32.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v4i32.p0")] + fn _vld1q_s32_x2(a: *const i32) -> int32x4x2_t; + } + _vld1q_s32_x2(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_s32_x3(a: *const i32) -> int32x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + 
any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x3.v4i32.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v4i32.p0")] + fn _vld1q_s32_x3(a: *const i32) -> int32x4x3_t; + } + _vld1q_s32_x3(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_s32_x4(a: *const i32) -> int32x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x4.v4i32.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v4i32.p0")] + fn _vld1q_s32_x4(a: *const i32) -> int32x4x4_t; + } + _vld1q_s32_x4(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_s64_x2(a: *const i64) -> int64x1x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x2.v1i64.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v1i64.p0")] + fn _vld1_s64_x2(a: *const i64) -> int64x1x2_t; + } + _vld1_s64_x2(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_s64_x3(a: *const i64) -> int64x1x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x3.v1i64.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = 
"llvm.arm.neon.vld1x3.v1i64.p0")] + fn _vld1_s64_x3(a: *const i64) -> int64x1x3_t; + } + _vld1_s64_x3(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_s64_x4(a: *const i64) -> int64x1x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x4.v1i64.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v1i64.p0")] + fn _vld1_s64_x4(a: *const i64) -> int64x1x4_t; + } + _vld1_s64_x4(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_s64_x2(a: *const i64) -> int64x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x2.v2i64.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v2i64.p0")] + fn _vld1q_s64_x2(a: *const i64) -> int64x2x2_t; + } + _vld1q_s64_x2(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_s64_x3(a: *const i64) -> int64x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x3.v2i64.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v2i64.p0")] + fn _vld1q_s64_x3(a: *const i64) -> int64x2x3_t; + } + _vld1q_s64_x3(a) +} +#[doc = "Load multiple single-element structures to one, 
two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_s64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_s64_x4(a: *const i64) -> int64x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld1x4.v2i64.p0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v2i64.p0")] + fn _vld1q_s64_x4(a: *const i64) -> int64x2x4_t; + } + _vld1q_s64_x4(a) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u8_x2(a: *const u8) -> uint8x8x2_t { + transmute(vld1_s8_x2(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u8_x2(a: *const u8) -> uint8x8x2_t { + let mut ret_val: uint8x8x2_t = transmute(vld1_s8_x2(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( 
+ all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u8_x3(a: *const u8) -> uint8x8x3_t { + transmute(vld1_s8_x3(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u8_x3(a: *const u8) -> uint8x8x3_t { + let mut ret_val: uint8x8x3_t = transmute(vld1_s8_x3(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u8_x4(a: *const u8) -> uint8x8x4_t { + transmute(vld1_s8_x4(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u8_x4(a: *const u8) -> uint8x8x4_t { + let mut ret_val: uint8x8x4_t = transmute(vld1_s8_x4(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 
4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u8_x2(a: *const u8) -> uint8x16x2_t { + transmute(vld1q_s8_x2(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u8_x2(a: *const u8) -> uint8x16x2_t { + let mut ret_val: uint8x16x2_t = transmute(vld1q_s8_x2(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u8_x3(a: *const u8) -> uint8x16x3_t { + transmute(vld1q_s8_x3(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u8_x3(a: *const u8) -> uint8x16x3_t { + let mut ret_val: uint8x16x3_t = transmute(vld1q_s8_x3(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.2 = unsafe { + simd_shuffle!( + ret_val.2, + ret_val.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u8_x4(a: *const u8) -> uint8x16x4_t { + transmute(vld1q_s8_x4(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u8_x4(a: *const u8) -> uint8x16x4_t { + let mut ret_val: uint8x16x4_t = transmute(vld1q_s8_x4(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.2 = unsafe { + simd_shuffle!( + ret_val.2, + ret_val.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.3 = unsafe { + simd_shuffle!( + ret_val.3, + ret_val.3, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16_x2)"] +#[doc = 
"## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u16_x2(a: *const u16) -> uint16x4x2_t { + transmute(vld1_s16_x2(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u16_x2(a: *const u16) -> uint16x4x2_t { + let mut ret_val: uint16x4x2_t = transmute(vld1_s16_x2(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u16_x3(a: *const u16) -> uint16x4x3_t { + transmute(vld1_s16_x3(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u16_x3(a: *const u16) -> uint16x4x3_t { + let mut ret_val: uint16x4x3_t = 
transmute(vld1_s16_x3(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u16_x4(a: *const u16) -> uint16x4x4_t { + transmute(vld1_s16_x4(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u16_x4(a: *const u16) -> uint16x4x4_t { + let mut ret_val: uint16x4x4_t = transmute(vld1_s16_x4(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u16_x2(a: *const u16) -> uint16x8x2_t { + transmute(vld1q_s16_x2(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon 
instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u16_x2(a: *const u16) -> uint16x8x2_t { + let mut ret_val: uint16x8x2_t = transmute(vld1q_s16_x2(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u16_x3(a: *const u16) -> uint16x8x3_t { + transmute(vld1q_s16_x3(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u16_x3(a: *const u16) -> uint16x8x3_t { + let mut ret_val: uint16x8x3_t = transmute(vld1q_s16_x3(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] 
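+// The unsigned `_x2`/`_x3`/`_x4` loads in this block are thin wrappers that reuse
+// the signed intrinsics via `transmute`; on big-endian targets each returned lane
+// vector is additionally reversed with `simd_shuffle!` so lane order matches the
+// little-endian layout. A minimal usage sketch (illustrative only, assuming the
+// caller has 32 readable u16 elements for the q-form x4 load):
+//
+// let data: [u16; 32] = core::array::from_fn(|i| i as u16);
+// // SAFETY: `data` supplies the 32 contiguous u16 values read by vld1q_u16_x4.
+// let regs: uint16x8x4_t = unsafe { vld1q_u16_x4(data.as_ptr()) };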
+#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u16_x4(a: *const u16) -> uint16x8x4_t { + transmute(vld1q_s16_x4(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u16_x4(a: *const u16) -> uint16x8x4_t { + let mut ret_val: uint16x8x4_t = transmute(vld1q_s16_x4(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u32_x2(a: *const u32) -> uint32x2x2_t { + transmute(vld1_s32_x2(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u32_x2(a: *const u32) -> uint32x2x2_t { + let mut ret_val: uint32x2x2_t = transmute(vld1_s32_x2(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 
0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u32_x3(a: *const u32) -> uint32x2x3_t { + transmute(vld1_s32_x3(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u32_x3(a: *const u32) -> uint32x2x3_t { + let mut ret_val: uint32x2x3_t = transmute(vld1_s32_x3(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u32_x4(a: *const u32) -> uint32x2x4_t { + transmute(vld1_s32_x4(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + 
not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u32_x4(a: *const u32) -> uint32x2x4_t { + let mut ret_val: uint32x2x4_t = transmute(vld1_s32_x4(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u32_x2(a: *const u32) -> uint32x4x2_t { + transmute(vld1q_s32_x2(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u32_x2(a: *const u32) -> uint32x4x2_t { + let mut ret_val: uint32x4x2_t = transmute(vld1q_s32_x2(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u32_x3(a: *const u32) -> uint32x4x3_t { + transmute(vld1q_s32_x3(transmute(a))) +} +#[doc = "Load multiple single-element structures 
to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u32_x3(a: *const u32) -> uint32x4x3_t { + let mut ret_val: uint32x4x3_t = transmute(vld1q_s32_x3(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u32_x4(a: *const u32) -> uint32x4x4_t { + transmute(vld1q_s32_x4(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u32_x4(a: *const u32) -> uint32x4x4_t { + let mut ret_val: uint32x4x4_t = transmute(vld1q_s32_x4(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable 
= "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u64_x2(a: *const u64) -> uint64x1x2_t { + transmute(vld1_s64_x2(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u64_x3(a: *const u64) -> uint64x1x3_t { + transmute(vld1_s64_x3(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_u64_x4(a: *const u64) -> uint64x1x4_t { + transmute(vld1_s64_x4(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u64_x2(a: *const u64) -> uint64x2x2_t { + transmute(vld1q_s64_x2(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u64_x2(a: *const u64) -> uint64x2x2_t { + let mut ret_val: uint64x2x2_t = transmute(vld1q_s64_x2(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u64_x3(a: *const u64) -> uint64x2x3_t { + transmute(vld1q_s64_x3(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u64_x3(a: *const u64) -> uint64x2x3_t { + let mut ret_val: uint64x2x3_t = transmute(vld1q_s64_x3(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn 
vld1q_u64_x4(a: *const u64) -> uint64x2x4_t { + transmute(vld1q_s64_x4(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_u64_x4(a: *const u64) -> uint64x2x4_t { + let mut ret_val: uint64x2x4_t = transmute(vld1q_s64_x4(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_p8_x2(a: *const p8) -> poly8x8x2_t { + transmute(vld1_s8_x2(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_p8_x2(a: *const p8) -> poly8x8x2_t { + let mut ret_val: poly8x8x2_t = transmute(vld1_s8_x2(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic 
unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_p8_x3(a: *const p8) -> poly8x8x3_t { + transmute(vld1_s8_x3(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_p8_x3(a: *const p8) -> poly8x8x3_t { + let mut ret_val: poly8x8x3_t = transmute(vld1_s8_x3(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_p8_x4(a: *const p8) -> poly8x8x4_t { + transmute(vld1_s8_x4(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_p8_x4(a: *const p8) -> 
poly8x8x4_t { + let mut ret_val: poly8x8x4_t = transmute(vld1_s8_x4(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p8_x2(a: *const p8) -> poly8x16x2_t { + transmute(vld1q_s8_x2(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p8_x2(a: *const p8) -> poly8x16x2_t { + let mut ret_val: poly8x16x2_t = transmute(vld1q_s8_x2(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p8_x3(a: *const p8) -> poly8x16x3_t { + transmute(vld1q_s8_x3(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p8_x3(a: *const p8) -> poly8x16x3_t { + let mut ret_val: poly8x16x3_t = transmute(vld1q_s8_x3(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.2 = unsafe { + simd_shuffle!( + ret_val.2, + ret_val.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p8_x4(a: *const p8) -> poly8x16x4_t { + transmute(vld1q_s8_x4(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p8_x4(a: *const p8) -> poly8x16x4_t { + let mut ret_val: poly8x16x4_t = transmute(vld1q_s8_x4(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.2 = unsafe { + simd_shuffle!( + ret_val.2, + ret_val.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.3 = unsafe { + simd_shuffle!( + ret_val.3, + ret_val.3, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] 
+ ) + }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_p16_x2(a: *const p16) -> poly16x4x2_t { + transmute(vld1_s16_x2(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_p16_x2(a: *const p16) -> poly16x4x2_t { + let mut ret_val: poly16x4x2_t = transmute(vld1_s16_x2(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_p16_x3(a: *const p16) -> poly16x4x3_t { + transmute(vld1_s16_x3(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", 
since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_p16_x3(a: *const p16) -> poly16x4x3_t { + let mut ret_val: poly16x4x3_t = transmute(vld1_s16_x3(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_p16_x4(a: *const p16) -> poly16x4x4_t { + transmute(vld1_s16_x4(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1_p16_x4(a: *const p16) -> poly16x4x4_t { + let mut ret_val: poly16x4x4_t = transmute(vld1_s16_x4(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p16_x2(a: *const p16) -> poly16x8x2_t { + transmute(vld1q_s16_x2(transmute(a))) +} +#[doc = "Load 
multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p16_x2(a: *const p16) -> poly16x8x2_t { + let mut ret_val: poly16x8x2_t = transmute(vld1q_s16_x2(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p16_x3(a: *const p16) -> poly16x8x3_t { + transmute(vld1q_s16_x3(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p16_x3(a: *const p16) -> poly16x8x3_t { + let mut ret_val: poly16x8x3_t = transmute(vld1q_s16_x3(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] 
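+// The poly8/poly16 multi-register loads follow the same pattern as the integer
+// variants: only the lane type differs, so they too delegate to the signed loads
+// through `transmute`. A usage sketch (illustrative only, assuming 32 readable
+// 16-bit values interpreted as p16 polynomials):
+//
+// let bits: [u16; 32] = [0x1021; 32];
+// // SAFETY: `bits` provides the 32 contiguous 16-bit values read by vld1q_p16_x4.
+// let polys: poly16x8x4_t = unsafe { vld1q_p16_x4(bits.as_ptr()) };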
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p16_x4(a: *const p16) -> poly16x8x4_t { + transmute(vld1q_s16_x4(transmute(a))) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_p16_x4(a: *const p16) -> poly16x8x4_t { + let mut ret_val: poly16x8x4_t = transmute(vld1q_s16_x4(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_v1i64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +unsafe fn vld1_v1i64(a: *const i8, b: i32) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v1i64")] + fn _vld1_v1i64(a: *const i8, b: i32) -> int64x1_t; + } + _vld1_v1i64(a, b) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_v2f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +unsafe fn vld1_v2f32(a: *const i8, b: i32) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v2f32")] + fn _vld1_v2f32(a: *const i8, b: i32) -> float32x2_t; + } + _vld1_v2f32(a, b) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_v2i32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800")] +unsafe fn vld1_v2i32(a: *const i8, b: i32) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v2i32")] + fn _vld1_v2i32(a: *const i8, b: i32) -> int32x2_t; + } + _vld1_v2i32(a, b) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_v4i16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +unsafe fn vld1_v4i16(a: *const i8, b: i32) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v4i16")] + fn _vld1_v4i16(a: *const i8, b: i32) -> int16x4_t; + } + _vld1_v4i16(a, b) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_v8i8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +unsafe fn vld1_v8i8(a: *const i8, b: i32) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v8i8")] + fn _vld1_v8i8(a: *const i8, b: i32) -> int8x8_t; + } + _vld1_v8i8(a, b) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_v16i8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +unsafe fn vld1q_v16i8(a: *const i8, b: i32) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v16i8")] + fn _vld1q_v16i8(a: *const i8, b: i32) -> int8x16_t; + } + _vld1q_v16i8(a, b) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_v2i64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +unsafe fn vld1q_v2i64(a: *const i8, b: i32) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v2i64")] + fn _vld1q_v2i64(a: *const i8, b: i32) -> int64x2_t; + } + _vld1q_v2i64(a, b) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_v4f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +unsafe fn vld1q_v4f32(a: *const i8, b: i32) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v4f32")] + fn _vld1q_v4f32(a: *const i8, b: 
i32) -> float32x4_t; + } + _vld1q_v4f32(a, b) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_v4i32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +unsafe fn vld1q_v4i32(a: *const i8, b: i32) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v4i32")] + fn _vld1q_v4i32(a: *const i8, b: i32) -> int32x4_t; + } + _vld1q_v4i32(a, b) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_v8i16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +unsafe fn vld1q_v8i16(a: *const i8, b: i32) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v8i16")] + fn _vld1q_v8i16(a: *const i8, b: i32) -> int16x8_t; + } + _vld1q_v8i16(a, b) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_v4f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +unsafe fn vld1_v4f16(a: *const i8, b: i32) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v4f16")] + fn _vld1_v4f16(a: *const i8, b: i32) -> float16x4_t; + } + _vld1_v4f16(a, b) +} +#[doc = "Load multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_v8f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +unsafe fn vld1q_v8f16(a: *const i8, b: i32) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v8f16")] + fn _vld1q_v8f16(a: *const i8, b: i32) -> float16x8_t; + } + _vld1q_v8f16(a, b) +} +#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vldr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld1r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] 
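+// `vld1q_dup_p64` below loads one p64 element into lane 0 of a zeroed vector via
+// `vld1q_lane_p64::<0>` and then broadcasts it with `simd_shuffle!(x, x, [0, 0])`,
+// leaving the same value in both 64-bit lanes. A usage sketch (illustrative only):
+//
+// let x: u64 = 0xDEAD_BEEF;
+// // SAFETY: `&x` is valid for a single aligned 64-bit read.
+// let v: poly64x2_t = unsafe { vld1q_dup_p64(&x) };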
+#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld1q_dup_p64(ptr: *const p64) -> poly64x2_t { + let x = vld1q_lane_p64::<0>(ptr, transmute(u64x2::splat(0))); + simd_shuffle!(x, x, [0, 0]) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld2_dup_f16(a: *const f16) -> float16x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v4f16.p0")] + fn _vld2_dup_f16(ptr: *const f16, size: i32) -> float16x4x2_t; + } + _vld2_dup_f16(a as _, 2) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld2q_dup_f16(a: *const f16) -> float16x8x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v8f16.p0")] + fn _vld2q_dup_f16(ptr: *const f16, size: i32) -> float16x8x2_t; + } + _vld2q_dup_f16(a as _, 2) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld2_dup_f16(a: *const f16) -> float16x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2r.v4f16.p0" + )] + fn _vld2_dup_f16(ptr: *const f16) -> float16x4x2_t; + } + _vld2_dup_f16(a as _) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld2q_dup_f16(a: *const f16) -> float16x8x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2r.v8f16.p0" + )] + fn _vld2q_dup_f16(ptr: *const f16) -> float16x8x2_t; + } + _vld2q_dup_f16(a as _) +} +#[doc = "Load single 
2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2_dup_f32(a: *const f32) -> float32x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v2f32.p0")] + fn _vld2_dup_f32(ptr: *const i8, size: i32) -> float32x2x2_t; + } + _vld2_dup_f32(a as *const i8, 4) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2q_dup_f32(a: *const f32) -> float32x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v4f32.p0")] + fn _vld2q_dup_f32(ptr: *const i8, size: i32) -> float32x4x2_t; + } + _vld2q_dup_f32(a as *const i8, 4) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2_dup_s8(a: *const i8) -> int8x8x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v8i8.p0")] + fn _vld2_dup_s8(ptr: *const i8, size: i32) -> int8x8x2_t; + } + _vld2_dup_s8(a as *const i8, 1) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2q_dup_s8(a: *const i8) -> int8x16x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v16i8.p0")] + fn _vld2q_dup_s8(ptr: *const i8, size: i32) -> int8x16x2_t; + } + _vld2q_dup_s8(a as *const i8, 1) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2_dup_s16(a: *const i16) -> int16x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v4i16.p0")] + fn _vld2_dup_s16(ptr: 
*const i8, size: i32) -> int16x4x2_t; + } + _vld2_dup_s16(a as *const i8, 2) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2q_dup_s16(a: *const i16) -> int16x8x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v8i16.p0")] + fn _vld2q_dup_s16(ptr: *const i8, size: i32) -> int16x8x2_t; + } + _vld2q_dup_s16(a as *const i8, 2) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2_dup_s32(a: *const i32) -> int32x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v2i32.p0")] + fn _vld2_dup_s32(ptr: *const i8, size: i32) -> int32x2x2_t; + } + _vld2_dup_s32(a as *const i8, 4) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2q_dup_s32(a: *const i32) -> int32x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v4i32.p0")] + fn _vld2q_dup_s32(ptr: *const i8, size: i32) -> int32x4x2_t; + } + _vld2q_dup_s32(a as *const i8, 4) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2_dup_f32(a: *const f32) -> float32x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2r.v2f32.p0" + )] + fn _vld2_dup_f32(ptr: *const f32) -> float32x2x2_t; + } + _vld2_dup_f32(a as _) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2q_dup_f32(a: *const f32) -> float32x4x2_t { + unsafe extern 
"unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2r.v4f32.p0" + )] + fn _vld2q_dup_f32(ptr: *const f32) -> float32x4x2_t; + } + _vld2q_dup_f32(a as _) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2_dup_s8(a: *const i8) -> int8x8x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2r.v8i8.p0" + )] + fn _vld2_dup_s8(ptr: *const i8) -> int8x8x2_t; + } + _vld2_dup_s8(a as _) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2q_dup_s8(a: *const i8) -> int8x16x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2r.v16i8.p0" + )] + fn _vld2q_dup_s8(ptr: *const i8) -> int8x16x2_t; + } + _vld2q_dup_s8(a as _) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2_dup_s16(a: *const i16) -> int16x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2r.v4i16.p0" + )] + fn _vld2_dup_s16(ptr: *const i16) -> int16x4x2_t; + } + _vld2_dup_s16(a as _) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2q_dup_s16(a: *const i16) -> int16x8x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2r.v8i16.p0" + )] + fn _vld2q_dup_s16(ptr: *const i16) -> int16x8x2_t; + } + _vld2q_dup_s16(a as _) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = 
"neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2_dup_s32(a: *const i32) -> int32x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2r.v2i32.p0" + )] + fn _vld2_dup_s32(ptr: *const i32) -> int32x2x2_t; + } + _vld2_dup_s32(a as _) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2q_dup_s32(a: *const i32) -> int32x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2r.v4i32.p0" + )] + fn _vld2q_dup_s32(ptr: *const i32) -> int32x4x2_t; + } + _vld2q_dup_s32(a as _) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_dup_p64(a: *const p64) -> poly64x1x2_t { + transmute(vld2_dup_s64(transmute(a))) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vld2_dup_s64(a: *const i64) -> int64x1x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v1i64.p0")] + fn _vld2_dup_s64(ptr: *const i8, size: i32) -> int64x1x2_t; + } + _vld2_dup_s64(a as *const i8, 8) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2_dup_s64(a: *const i64) -> int64x1x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2r.v1i64.p0" + )] + fn _vld2_dup_s64(ptr: *const i64) -> int64x1x2_t; + } + _vld2_dup_s64(a as _) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = 
"[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_dup_u64(a: *const u64) -> uint64x1x2_t { + transmute(vld2_dup_s64(transmute(a))) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_dup_u8(a: *const u8) -> uint8x8x2_t { + transmute(vld2_dup_s8(transmute(a))) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_dup_u8(a: *const u8) -> uint8x8x2_t { + let mut ret_val: uint8x8x2_t = transmute(vld2_dup_s8(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] 
+pub unsafe fn vld2q_dup_u8(a: *const u8) -> uint8x16x2_t { + transmute(vld2q_dup_s8(transmute(a))) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_dup_u8(a: *const u8) -> uint8x16x2_t { + let mut ret_val: uint8x16x2_t = transmute(vld2q_dup_s8(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_dup_u16(a: *const u16) -> uint16x4x2_t { + transmute(vld2_dup_s16(transmute(a))) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_dup_u16(a: *const u16) -> uint16x4x2_t { + let mut ret_val: uint16x4x2_t = transmute(vld2_dup_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] 
+#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_dup_u16(a: *const u16) -> uint16x8x2_t { + transmute(vld2q_dup_s16(transmute(a))) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_dup_u16(a: *const u16) -> uint16x8x2_t { + let mut ret_val: uint16x8x2_t = transmute(vld2q_dup_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_dup_u32(a: *const u32) -> uint32x2x2_t { + transmute(vld2_dup_s32(transmute(a))) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_dup_u32(a: *const u32) -> uint32x2x2_t { + let mut ret_val: uint32x2x2_t = 
transmute(vld2_dup_s32(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_dup_u32(a: *const u32) -> uint32x4x2_t { + transmute(vld2q_dup_s32(transmute(a))) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_dup_u32(a: *const u32) -> uint32x4x2_t { + let mut ret_val: uint32x4x2_t = transmute(vld2q_dup_s32(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_dup_p8(a: *const p8) -> poly8x8x2_t { + transmute(vld2_dup_s8(transmute(a))) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] 
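// Editorial note: the `target_endian = "big"` variants in this block delegate to the
// same signed-integer load and then reverse each returned vector with `simd_shuffle!`,
// compensating for the element reordering that the intervening `transmute` implies on
// big-endian targets; callers therefore observe the same lane order on both endiannesses.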
+#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_dup_p8(a: *const p8) -> poly8x8x2_t { + let mut ret_val: poly8x8x2_t = transmute(vld2_dup_s8(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_dup_p8(a: *const p8) -> poly8x16x2_t { + transmute(vld2q_dup_s8(transmute(a))) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_dup_p8(a: *const p8) -> poly8x16x2_t { + let mut ret_val: poly8x16x2_t = transmute(vld2q_dup_s8(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_dup_p16(a: 
*const p16) -> poly16x4x2_t { + transmute(vld2_dup_s16(transmute(a))) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_dup_p16(a: *const p16) -> poly16x4x2_t { + let mut ret_val: poly16x4x2_t = transmute(vld2_dup_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_dup_p16(a: *const p16) -> poly16x8x2_t { + transmute(vld2q_dup_s16(transmute(a))) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_dup_p16(a: *const p16) -> poly16x8x2_t { + let mut ret_val: poly16x8x2_t = transmute(vld2q_dup_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg(target_arch = "arm")] 
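// Illustrative sketch (editorial, not part of the generated bindings): unlike the
// `_dup` loads above, the plain `vld2*` loads that follow deinterleave consecutive
// element pairs into two registers. Using the f32 variant, which this file marks
// stable, and assuming AArch64 with the `neon` feature enabled:
//
//     use core::arch::aarch64::{vget_lane_f32, vld2_f32};
//
//     let data = [1.0f32, 2.0, 3.0, 4.0]; // interleaved pairs (1.0, 2.0), (3.0, 4.0)
//     // SAFETY: `data` holds the four f32 values the intrinsic reads.
//     let v = unsafe { vld2_f32(data.as_ptr()) };
//     // `v.0` gathers the first element of each pair, `v.1` the second.
//     assert_eq!(unsafe { vget_lane_f32::<0>(v.0) }, 1.0);
//     assert_eq!(unsafe { vget_lane_f32::<1>(v.1) }, 4.0);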
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld2_f16(a: *const f16) -> float16x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v4f16.p0")] + fn _vld2_f16(ptr: *const f16, size: i32) -> float16x4x2_t; + } + _vld2_f16(a as _, 2) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg(target_arch = "arm")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld2q_f16(a: *const f16) -> float16x8x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v8f16.p0")] + fn _vld2q_f16(ptr: *const f16, size: i32) -> float16x8x2_t; + } + _vld2q_f16(a as _, 2) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld2_f16(a: *const f16) -> float16x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2.v4f16.p0" + )] + fn _vld2_f16(ptr: *const f16) -> float16x4x2_t; + } + _vld2_f16(a as _) +} +#[doc = "Load single 2-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld2q_f16(a: *const f16) -> float16x8x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2.v8f16.p0" + )] + fn _vld2q_f16(ptr: *const f16) -> float16x8x2_t; + } + _vld2q_f16(a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2_f32(a: *const f32) -> float32x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v2f32")] + fn _vld2_f32(ptr: *const i8, size: i32) -> float32x2x2_t; + } + _vld2_f32(a as *const i8, 4) +} +#[doc = "Load multiple 2-element 
structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2q_f32(a: *const f32) -> float32x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v4f32")] + fn _vld2q_f32(ptr: *const i8, size: i32) -> float32x4x2_t; + } + _vld2q_f32(a as *const i8, 4) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2_s8(a: *const i8) -> int8x8x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v8i8")] + fn _vld2_s8(ptr: *const i8, size: i32) -> int8x8x2_t; + } + _vld2_s8(a as *const i8, 1) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2q_s8(a: *const i8) -> int8x16x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v16i8")] + fn _vld2q_s8(ptr: *const i8, size: i32) -> int8x16x2_t; + } + _vld2q_s8(a as *const i8, 1) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2_s16(a: *const i16) -> int16x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v4i16")] + fn _vld2_s16(ptr: *const i8, size: i32) -> int16x4x2_t; + } + _vld2_s16(a as *const i8, 2) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2q_s16(a: *const i16) -> int16x8x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v8i16")] + fn _vld2q_s16(ptr: *const i8, size: i32) -> int16x8x2_t; + } + _vld2q_s16(a as *const i8, 2) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2_s32(a: *const i32) -> int32x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v2i32")] + fn _vld2_s32(ptr: *const i8, size: i32) -> int32x2x2_t; + } + _vld2_s32(a as *const i8, 4) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld2))] +pub unsafe fn vld2q_s32(a: *const i32) -> int32x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v4i32")] + fn _vld2q_s32(ptr: *const i8, size: i32) -> int32x4x2_t; + } + _vld2q_s32(a as *const i8, 4) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2_f32(a: *const f32) -> float32x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2.v2f32.p0" + )] + fn _vld2_f32(ptr: *const float32x2_t) -> float32x2x2_t; + } + _vld2_f32(a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2q_f32(a: *const f32) -> float32x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2.v4f32.p0" + )] + fn _vld2q_f32(ptr: *const float32x4_t) -> float32x4x2_t; + } + _vld2q_f32(a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2_s8(a: *const i8) -> int8x8x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2.v8i8.p0" + )] + fn _vld2_s8(ptr: *const int8x8_t) -> int8x8x2_t; + } + _vld2_s8(a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2q_s8(a: *const i8) -> int8x16x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2.v16i8.p0" + )] + fn _vld2q_s8(ptr: *const int8x16_t) -> int8x16x2_t; + } + _vld2q_s8(a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2_s16(a: *const i16) -> int16x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2.v4i16.p0" + )] + fn _vld2_s16(ptr: *const int16x4_t) -> int16x4x2_t; + } + _vld2_s16(a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2q_s16(a: *const i16) -> int16x8x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2.v8i16.p0" + )] + fn _vld2q_s16(ptr: *const int16x8_t) -> int16x8x2_t; + } + _vld2q_s16(a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2_s32(a: *const i32) -> int32x2x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2.v2i32.p0" + )] + fn _vld2_s32(ptr: *const int32x2_t) -> int32x2x2_t; + } + _vld2_s32(a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2q_s32(a: *const i32) -> int32x4x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2.v4i32.p0" + )] + fn _vld2q_s32(ptr: *const int32x4_t) -> int32x4x2_t; + } + _vld2q_s32(a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld2_lane_f16(a: *const f16, b: float16x4x2_t) -> float16x4x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v4f16.p0")] + fn _vld2_lane_f16( + ptr: *const f16, + a: float16x4_t, + b: float16x4_t, + n: i32, + size: i32, + ) -> float16x4x2_t; + } + _vld2_lane_f16(a as _, b.0, b.1, LANE, 2) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld2q_lane_f16(a: *const f16, b: float16x8x2_t) -> float16x8x2_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v8f16.p0")] + fn _vld2q_lane_f16( + ptr: *const f16, + a: float16x8_t, + b: float16x8_t, + n: i32, + size: i32, + ) -> float16x8x2_t; + } + _vld2q_lane_f16(a as _, b.0, b.1, LANE, 2) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld2_lane_f16(a: *const f16, b: float16x4x2_t) -> float16x4x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2lane.v4f16.p0" + )] + fn _vld2_lane_f16(a: float16x4_t, b: float16x4_t, n: i64, ptr: *const f16) + -> float16x4x2_t; + } + _vld2_lane_f16(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld2q_lane_f16(a: *const f16, b: float16x8x2_t) -> float16x8x2_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = 
"llvm.aarch64.neon.ld2lane.v8f16.p0" + )] + fn _vld2q_lane_f16( + a: float16x8_t, + b: float16x8_t, + n: i64, + ptr: *const f16, + ) -> float16x8x2_t; + } + _vld2q_lane_f16(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2_lane_f32(a: *const f32, b: float32x2x2_t) -> float32x2x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2lane.v2f32.p0" + )] + fn _vld2_lane_f32(a: float32x2_t, b: float32x2_t, n: i64, ptr: *const i8) -> float32x2x2_t; + } + _vld2_lane_f32(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2q_lane_f32(a: *const f32, b: float32x4x2_t) -> float32x4x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2lane.v4f32.p0" + )] + fn _vld2q_lane_f32(a: float32x4_t, b: float32x4_t, n: i64, ptr: *const i8) + -> float32x4x2_t; + } + _vld2q_lane_f32(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2_lane_s8(a: *const i8, b: int8x8x2_t) -> int8x8x2_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2lane.v8i8.p0" + )] + fn _vld2_lane_s8(a: int8x8_t, b: int8x8_t, n: i64, ptr: *const i8) -> int8x8x2_t; + } + _vld2_lane_s8(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2_lane_s16(a: *const i16, b: int16x4x2_t) -> int16x4x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = 
"llvm.aarch64.neon.ld2lane.v4i16.p0" + )] + fn _vld2_lane_s16(a: int16x4_t, b: int16x4_t, n: i64, ptr: *const i8) -> int16x4x2_t; + } + _vld2_lane_s16(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2q_lane_s16(a: *const i16, b: int16x8x2_t) -> int16x8x2_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2lane.v8i16.p0" + )] + fn _vld2q_lane_s16(a: int16x8_t, b: int16x8_t, n: i64, ptr: *const i8) -> int16x8x2_t; + } + _vld2q_lane_s16(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2_lane_s32(a: *const i32, b: int32x2x2_t) -> int32x2x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2lane.v2i32.p0" + )] + fn _vld2_lane_s32(a: int32x2_t, b: int32x2_t, n: i64, ptr: *const i8) -> int32x2x2_t; + } + _vld2_lane_s32(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld2q_lane_s32(a: *const i32, b: int32x4x2_t) -> int32x4x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2lane.v4i32.p0" + )] + fn _vld2q_lane_s32(a: int32x4_t, b: int32x4_t, n: i64, ptr: *const i8) -> int32x4x2_t; + } + _vld2q_lane_s32(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[cfg_attr(test, assert_instr(vld2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld2_lane_f32(a: *const f32, b: float32x2x2_t) -> float32x2x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v2f32.p0")] + fn _vld2_lane_f32( 
+            ptr: *const i8,
+            a: float32x2_t,
+            b: float32x2_t,
+            n: i32,
+            size: i32,
+        ) -> float32x2x2_t;
+    }
+    _vld2_lane_f32(a as _, b.0, b.1, LANE, 4)
+}
+#[doc = "Load multiple 2-element structures to two registers"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_f32)"]
+#[doc = "## Safety"]
+#[doc = " * Neon instrinsic unsafe"]
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg(target_arch = "arm")]
+#[cfg_attr(test, assert_instr(vld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
+pub unsafe fn vld2q_lane_f32<const LANE: i32>(a: *const f32, b: float32x4x2_t) -> float32x4x2_t {
+    static_assert_uimm_bits!(LANE, 2);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v4f32.p0")]
+        fn _vld2q_lane_f32(
+            ptr: *const i8,
+            a: float32x4_t,
+            b: float32x4_t,
+            n: i32,
+            size: i32,
+        ) -> float32x4x2_t;
+    }
+    _vld2q_lane_f32(a as _, b.0, b.1, LANE, 4)
+}
+#[doc = "Load multiple 2-element structures to two registers"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_s16)"]
+#[doc = "## Safety"]
+#[doc = " * Neon instrinsic unsafe"]
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg(target_arch = "arm")]
+#[cfg_attr(test, assert_instr(vld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
+pub unsafe fn vld2q_lane_s16<const LANE: i32>(a: *const i16, b: int16x8x2_t) -> int16x8x2_t {
+    static_assert_uimm_bits!(LANE, 3);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v8i16.p0")]
+        fn _vld2q_lane_s16(
+            ptr: *const i8,
+            a: int16x8_t,
+            b: int16x8_t,
+            n: i32,
+            size: i32,
+        ) -> int16x8x2_t;
+    }
+    _vld2q_lane_s16(a as _, b.0, b.1, LANE, 2)
+}
+#[doc = "Load multiple 2-element structures to two registers"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_s32)"]
+#[doc = "## Safety"]
+#[doc = " * Neon instrinsic unsafe"]
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg(target_arch = "arm")]
+#[cfg_attr(test, assert_instr(vld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
+pub unsafe fn vld2q_lane_s32<const LANE: i32>(a: *const i32, b: int32x4x2_t) -> int32x4x2_t {
+    static_assert_uimm_bits!(LANE, 2);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v4i32.p0")]
+        fn _vld2q_lane_s32(
+            ptr: *const i8,
+            a: int32x4_t,
+            b: int32x4_t,
+            n: i32,
+            size: i32,
+        ) -> int32x4x2_t;
+    }
+    _vld2q_lane_s32(a as _, b.0, b.1, LANE, 4)
+}
+#[doc = "Load multiple 2-element structures to two registers"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_s8)"]
+#[doc = "## Safety"]
+#[doc = " * Neon instrinsic unsafe"]
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg(target_arch = "arm")]
+#[cfg_attr(test, assert_instr(vld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
+pub unsafe fn vld2_lane_s8<const LANE: i32>(a: *const i8, b: int8x8x2_t) -> int8x8x2_t {
+    static_assert_uimm_bits!(LANE, 3);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v8i8.p0")]
+        fn _vld2_lane_s8(ptr: *const i8, a: int8x8_t, b: int8x8_t, n: i32, size: i32)
+            -> int8x8x2_t;
+    }
+    _vld2_lane_s8(a as _, b.0, b.1, LANE, 1)
+}
+#[doc = "Load multiple 2-element structures to two registers"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_s16)"]
+#[doc = "## Safety"]
+#[doc = " * Neon instrinsic unsafe"]
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg(target_arch = "arm")]
+#[cfg_attr(test, assert_instr(vld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
+pub unsafe fn vld2_lane_s16<const LANE: i32>(a: *const i16, b: int16x4x2_t) -> int16x4x2_t {
+    static_assert_uimm_bits!(LANE, 2);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v4i16.p0")]
+        fn _vld2_lane_s16(
+            ptr: *const i8,
+            a: int16x4_t,
+            b: int16x4_t,
+            n: i32,
+            size: i32,
+        ) -> int16x4x2_t;
+    }
+    _vld2_lane_s16(a as _, b.0, b.1, LANE, 2)
+}
+#[doc = "Load multiple 2-element structures to two registers"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_s32)"]
+#[doc = "## Safety"]
+#[doc = " * Neon instrinsic unsafe"]
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg(target_arch = "arm")]
+#[cfg_attr(test, assert_instr(vld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
+pub unsafe fn vld2_lane_s32<const LANE: i32>(a: *const i32, b: int32x2x2_t) -> int32x2x2_t {
+    static_assert_uimm_bits!(LANE, 1);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v2i32.p0")]
+        fn _vld2_lane_s32(
+            ptr: *const i8,
+            a: int32x2_t,
+            b: int32x2_t,
+            n: i32,
+            size: i32,
+        ) -> int32x2x2_t;
+    }
+    _vld2_lane_s32(a as _, b.0, b.1, LANE, 4)
+}
+#[doc = "Load multiple 2-element structures to two registers"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_u8)"]
+#[doc = "## Safety"]
+#[doc = " * Neon instrinsic unsafe"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
+#[cfg_attr(
+    all(test, any(target_arch = "aarch64", target_arch = "arm64ec")),
+    assert_instr(ld2, LANE = 0)
+)]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(
+    not(target_arch = "arm"),
+    stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+#[cfg_attr(
+    target_arch = "arm",
+    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
+)]
+pub unsafe fn vld2_lane_u8<const LANE: i32>(a: *const u8, b: uint8x8x2_t) -> uint8x8x2_t {
+    static_assert_uimm_bits!(LANE, 3);
+    transmute(vld2_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+#[doc = "Load multiple 2-element structures to two registers"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_u16)"]
+#[doc = "## Safety"]
+#[doc = " * Neon instrinsic unsafe"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
+#[cfg_attr(
+    all(test, any(target_arch = "aarch64", target_arch = "arm64ec")),
+    assert_instr(ld2, LANE = 0)
+)]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(
+    not(target_arch = "arm"),
+    stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+#[cfg_attr(
+    target_arch = "arm",
+    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
+)]
+pub unsafe fn vld2_lane_u16<const LANE: i32>(a: *const u16, b: uint16x4x2_t) -> uint16x4x2_t {
+    static_assert_uimm_bits!(LANE, 2);
+    transmute(vld2_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+#[doc = "Load multiple 2-element structures to two registers"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_u16)"]
+#[doc = "## Safety"]
+#[doc = " * Neon instrinsic unsafe"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
+#[cfg_attr(
+    all(test, any(target_arch = "aarch64", target_arch = "arm64ec")),
+    assert_instr(ld2, LANE = 0)
+)]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(
+    not(target_arch = "arm"),
+    stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+#[cfg_attr(
+    target_arch = "arm",
+    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
+)]
+pub unsafe fn vld2q_lane_u16<const LANE: i32>(a: *const u16, b: uint16x8x2_t) -> uint16x8x2_t {
+    static_assert_uimm_bits!(LANE, 3);
+    transmute(vld2q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+#[doc = "Load multiple 2-element structures to two registers"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_u32)"]
+#[doc = "## Safety"]
+#[doc = " * Neon instrinsic unsafe"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
+#[cfg_attr(
+    all(test, any(target_arch = "aarch64", target_arch = "arm64ec")),
+    assert_instr(ld2, LANE = 0)
+)]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(
+    not(target_arch = "arm"),
+    stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+#[cfg_attr(
+    target_arch = "arm",
+    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
+)]
+pub unsafe fn vld2_lane_u32<const LANE: i32>(a: *const u32, b: uint32x2x2_t) -> uint32x2x2_t {
+    static_assert_uimm_bits!(LANE, 1);
+    transmute(vld2_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+#[doc = "Load multiple 2-element structures to two registers"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_u32)"]
+#[doc = "## Safety"]
+#[doc = " * Neon instrinsic unsafe"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
+#[cfg_attr(
+    all(test, any(target_arch = "aarch64", target_arch = "arm64ec")),
+    assert_instr(ld2, LANE = 0)
+)]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(
+    not(target_arch = "arm"),
+    stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+#[cfg_attr(
+    target_arch = "arm",
+    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
+)]
+pub unsafe fn vld2q_lane_u32<const LANE: i32>(a: *const u32, b: uint32x4x2_t) -> uint32x4x2_t {
+    static_assert_uimm_bits!(LANE, 2);
+    transmute(vld2q_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+#[doc = "Load multiple 2-element structures to two registers"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_p8)"]
+#[doc = "## Safety"]
+#[doc = " * Neon instrinsic unsafe"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
+#[cfg_attr(
+    all(test, any(target_arch = "aarch64", target_arch = "arm64ec")),
target_arch = "arm64ec")), + assert_instr(ld2, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_lane_p8(a: *const p8, b: poly8x8x2_t) -> poly8x8x2_t { + static_assert_uimm_bits!(LANE, 3); + transmute(vld2_lane_s8::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_lane_p16(a: *const p16, b: poly16x4x2_t) -> poly16x4x2_t { + static_assert_uimm_bits!(LANE, 2); + transmute(vld2_lane_s16::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_lane_p16(a: *const p16, b: poly16x8x2_t) -> poly16x8x2_t { + static_assert_uimm_bits!(LANE, 3); + transmute(vld2q_lane_s16::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_p64(a: *const p64) -> poly64x1x2_t { + transmute(vld2_s64(transmute(a))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] 
+#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vld2_s64(a: *const i64) -> int64x1x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v1i64")] + fn _vld2_s64(ptr: *const i8, size: i32) -> int64x1x2_t; + } + _vld2_s64(a as *const i8, 8) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vld2_s64(a: *const i64) -> int64x1x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld2.v1i64.p0" + )] + fn _vld2_s64(ptr: *const int64x1_t) -> int64x1x2_t; + } + _vld2_s64(a as _) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_u64(a: *const u64) -> uint64x1x2_t { + transmute(vld2_s64(transmute(a))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_u8(a: *const u8) -> uint8x8x2_t { + transmute(vld2_s8(transmute(a))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_u8(a: *const u8) -> 
uint8x8x2_t { + let mut ret_val: uint8x8x2_t = transmute(vld2_s8(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_u8(a: *const u8) -> uint8x16x2_t { + transmute(vld2q_s8(transmute(a))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_u8(a: *const u8) -> uint8x16x2_t { + let mut ret_val: uint8x16x2_t = transmute(vld2q_s8(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_u16(a: *const u16) -> uint16x4x2_t { + transmute(vld2_s16(transmute(a))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_u16(a: *const u16) -> uint16x4x2_t { + let mut ret_val: uint16x4x2_t = transmute(vld2_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_u16(a: *const u16) -> uint16x8x2_t { + transmute(vld2q_s16(transmute(a))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_u16(a: *const u16) -> uint16x8x2_t { + let mut ret_val: uint16x8x2_t = transmute(vld2q_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_u32(a: *const u32) -> uint32x2x2_t { + transmute(vld2_s32(transmute(a))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_u32(a: *const u32) -> uint32x2x2_t { + let mut ret_val: uint32x2x2_t = transmute(vld2_s32(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_u32(a: *const u32) -> uint32x4x2_t { + transmute(vld2q_s32(transmute(a))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_u32(a: *const u32) -> uint32x4x2_t { + let mut ret_val: uint32x4x2_t = transmute(vld2q_s32(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = 
"arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_p8(a: *const p8) -> poly8x8x2_t { + transmute(vld2_s8(transmute(a))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_p8(a: *const p8) -> poly8x8x2_t { + let mut ret_val: poly8x8x2_t = transmute(vld2_s8(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_p8(a: *const p8) -> poly8x16x2_t { + transmute(vld2q_s8(transmute(a))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_p8(a: *const p8) -> poly8x16x2_t { + let mut ret_val: poly8x16x2_t = transmute(vld2q_s8(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] 
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_p16(a: *const p16) -> poly16x4x2_t { + transmute(vld2_s16(transmute(a))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2_p16(a: *const p16) -> poly16x4x2_t { + let mut ret_val: poly16x4x2_t = transmute(vld2_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_p16(a: *const p16) -> poly16x8x2_t { + transmute(vld2q_s16(transmute(a))) +} +#[doc = "Load multiple 2-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld2q_p16(a: *const p16) -> poly16x8x2_t { + let mut ret_val: poly16x8x2_t = transmute(vld2q_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) 
}; + ret_val +} +#[doc = "Load single 3-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg(target_arch = "arm")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld3_dup_f16(a: *const f16) -> float16x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v4f16.p0")] + fn _vld3_dup_f16(ptr: *const f16, size: i32) -> float16x4x3_t; + } + _vld3_dup_f16(a as _, 2) +} +#[doc = "Load single 3-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg(target_arch = "arm")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld3q_dup_f16(a: *const f16) -> float16x8x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v8f16.p0")] + fn _vld3q_dup_f16(ptr: *const f16, size: i32) -> float16x8x3_t; + } + _vld3q_dup_f16(a as _, 2) +} +#[doc = "Load single 3-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld3_dup_f16(a: *const f16) -> float16x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3r.v4f16.p0" + )] + fn _vld3_dup_f16(ptr: *const f16) -> float16x4x3_t; + } + _vld3_dup_f16(a as _) +} +#[doc = "Load single 3-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld3q_dup_f16(a: *const f16) -> float16x8x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3r.v8f16.p0" + )] + fn _vld3q_dup_f16(ptr: *const f16) -> float16x8x3_t; + } + _vld3q_dup_f16(a as _) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3_dup_f32(a: *const f32) -> float32x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3r.v2f32.p0" + )] + fn _vld3_dup_f32(ptr: *const f32) -> float32x2x3_t; + } + _vld3_dup_f32(a as _) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3q_dup_f32(a: *const f32) -> float32x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3r.v4f32.p0" + )] + fn _vld3q_dup_f32(ptr: *const f32) -> float32x4x3_t; + } + _vld3q_dup_f32(a as _) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3_dup_s8(a: *const i8) -> int8x8x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3r.v8i8.p0" + )] + fn _vld3_dup_s8(ptr: *const i8) -> int8x8x3_t; + } + _vld3_dup_s8(a as _) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3q_dup_s8(a: *const i8) -> int8x16x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3r.v16i8.p0" + )] + fn _vld3q_dup_s8(ptr: *const i8) -> int8x16x3_t; + } + _vld3q_dup_s8(a as _) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3_dup_s16(a: *const i16) -> int16x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3r.v4i16.p0" + )] + fn _vld3_dup_s16(ptr: *const i16) -> int16x4x3_t; + } + _vld3_dup_s16(a as _) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3q_dup_s16(a: *const i16) -> int16x8x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3r.v8i16.p0" + )] + fn _vld3q_dup_s16(ptr: *const i16) -> int16x8x3_t; + } + _vld3q_dup_s16(a as _) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3_dup_s32(a: *const i32) -> int32x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3r.v2i32.p0" + )] + fn _vld3_dup_s32(ptr: *const i32) -> int32x2x3_t; + } + _vld3_dup_s32(a as _) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3q_dup_s32(a: *const i32) -> int32x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3r.v4i32.p0" + )] + fn _vld3q_dup_s32(ptr: *const i32) -> int32x4x3_t; + } + _vld3q_dup_s32(a as _) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3_dup_s64(a: *const i64) -> int64x1x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3r.v1i64.p0" + )] + fn _vld3_dup_s64(ptr: *const i64) -> int64x1x3_t; + } + _vld3_dup_s64(a as _) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3_dup_f32(a: *const f32) -> float32x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v2f32.p0")] + fn _vld3_dup_f32(ptr: *const i8, size: i32) -> 
float32x2x3_t; + } + _vld3_dup_f32(a as *const i8, 4) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3q_dup_f32(a: *const f32) -> float32x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v4f32.p0")] + fn _vld3q_dup_f32(ptr: *const i8, size: i32) -> float32x4x3_t; + } + _vld3q_dup_f32(a as *const i8, 4) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3_dup_s8(a: *const i8) -> int8x8x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v8i8.p0")] + fn _vld3_dup_s8(ptr: *const i8, size: i32) -> int8x8x3_t; + } + _vld3_dup_s8(a as *const i8, 1) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3q_dup_s8(a: *const i8) -> int8x16x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v16i8.p0")] + fn _vld3q_dup_s8(ptr: *const i8, size: i32) -> int8x16x3_t; + } + _vld3q_dup_s8(a as *const i8, 1) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3_dup_s16(a: *const i16) -> int16x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v4i16.p0")] + fn _vld3_dup_s16(ptr: *const i8, size: i32) -> int16x4x3_t; + } + _vld3_dup_s16(a as *const i8, 2) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3q_dup_s16(a: *const i16) -> int16x8x3_t { + unsafe extern "unadjusted" { + 
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v8i16.p0")] + fn _vld3q_dup_s16(ptr: *const i8, size: i32) -> int16x8x3_t; + } + _vld3q_dup_s16(a as *const i8, 2) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3_dup_s32(a: *const i32) -> int32x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v2i32.p0")] + fn _vld3_dup_s32(ptr: *const i8, size: i32) -> int32x2x3_t; + } + _vld3_dup_s32(a as *const i8, 4) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3q_dup_s32(a: *const i32) -> int32x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v4i32.p0")] + fn _vld3q_dup_s32(ptr: *const i8, size: i32) -> int32x4x3_t; + } + _vld3q_dup_s32(a as *const i8, 4) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_dup_p64(a: *const p64) -> poly64x1x3_t { + transmute(vld3_dup_s64(transmute(a))) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vld3_dup_s64(a: *const i64) -> int64x1x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v1i64.p0")] + fn _vld3_dup_s64(ptr: *const i8, size: i32) -> int64x1x3_t; + } + _vld3_dup_s64(a as *const i8, 8) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = 
"neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_dup_u64(a: *const u64) -> uint64x1x3_t { + transmute(vld3_dup_s64(transmute(a))) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_dup_u8(a: *const u8) -> uint8x8x3_t { + transmute(vld3_dup_s8(transmute(a))) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_dup_u8(a: *const u8) -> uint8x8x3_t { + let mut ret_val: uint8x8x3_t = transmute(vld3_dup_s8(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_dup_u8(a: *const u8) -> uint8x16x3_t { + transmute(vld3q_dup_s8(transmute(a))) +} +#[doc = 
"Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_dup_u8(a: *const u8) -> uint8x16x3_t { + let mut ret_val: uint8x16x3_t = transmute(vld3q_dup_s8(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.2 = unsafe { + simd_shuffle!( + ret_val.2, + ret_val.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_dup_u16(a: *const u16) -> uint16x4x3_t { + transmute(vld3_dup_s16(transmute(a))) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_dup_u16(a: *const u16) -> uint16x4x3_t { + let mut ret_val: uint16x4x3_t = transmute(vld3_dup_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_dup_u16(a: *const u16) -> uint16x8x3_t { + transmute(vld3q_dup_s16(transmute(a))) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_dup_u16(a: *const u16) -> uint16x8x3_t { + let mut ret_val: uint16x8x3_t = transmute(vld3q_dup_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_dup_u32(a: *const u32) -> uint32x2x3_t { + transmute(vld3_dup_s32(transmute(a))) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = 
"neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_dup_u32(a: *const u32) -> uint32x2x3_t { + let mut ret_val: uint32x2x3_t = transmute(vld3_dup_s32(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_dup_u32(a: *const u32) -> uint32x4x3_t { + transmute(vld3q_dup_s32(transmute(a))) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_dup_u32(a: *const u32) -> uint32x4x3_t { + let mut ret_val: uint32x4x3_t = transmute(vld3q_dup_s32(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_dup_p8(a: *const p8) -> poly8x8x3_t { + transmute(vld3_dup_s8(transmute(a))) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three 
registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_dup_p8(a: *const p8) -> poly8x8x3_t { + let mut ret_val: poly8x8x3_t = transmute(vld3_dup_s8(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_dup_p8(a: *const p8) -> poly8x16x3_t { + transmute(vld3q_dup_s8(transmute(a))) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_dup_p8(a: *const p8) -> poly8x16x3_t { + let mut ret_val: poly8x16x3_t = transmute(vld3q_dup_s8(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.2 = unsafe { + simd_shuffle!( + ret_val.2, + ret_val.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_p16)"] 
+#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_dup_p16(a: *const p16) -> poly16x4x3_t { + transmute(vld3_dup_s16(transmute(a))) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_dup_p16(a: *const p16) -> poly16x4x3_t { + let mut ret_val: poly16x4x3_t = transmute(vld3_dup_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_dup_p16(a: *const p16) -> poly16x8x3_t { + transmute(vld3q_dup_s16(transmute(a))) +} +#[doc = "Load single 3-element structure and replicate to all lanes of three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] 
+pub unsafe fn vld3q_dup_p16(a: *const p16) -> poly16x8x3_t { + let mut ret_val: poly16x8x3_t = transmute(vld3q_dup_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 3-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg(target_arch = "arm")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld3_f16(a: *const f16) -> float16x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v4f16.p0")] + fn _vld3_f16(ptr: *const f16, size: i32) -> float16x4x3_t; + } + _vld3_f16(a as _, 2) +} +#[doc = "Load single 3-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg(target_arch = "arm")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld3q_f16(a: *const f16) -> float16x8x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v8f16.p0")] + fn _vld3q_f16(ptr: *const f16, size: i32) -> float16x8x3_t; + } + _vld3q_f16(a as _, 2) +} +#[doc = "Load single 3-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld3_f16(a: *const f16) -> float16x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3.v4f16.p0" + )] + fn _vld3_f16(ptr: *const f16) -> float16x4x3_t; + } + _vld3_f16(a as _) +} +#[doc = "Load single 3-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld3q_f16(a: *const f16) -> float16x8x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3.v8f16.p0" + )] + fn _vld3q_f16(ptr: *const 
f16) -> float16x8x3_t; + } + _vld3q_f16(a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3_f32(a: *const f32) -> float32x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3.v2f32.p0" + )] + fn _vld3_f32(ptr: *const float32x2_t) -> float32x2x3_t; + } + _vld3_f32(a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3q_f32(a: *const f32) -> float32x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3.v4f32.p0" + )] + fn _vld3q_f32(ptr: *const float32x4_t) -> float32x4x3_t; + } + _vld3q_f32(a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3_s8(a: *const i8) -> int8x8x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3.v8i8.p0" + )] + fn _vld3_s8(ptr: *const int8x8_t) -> int8x8x3_t; + } + _vld3_s8(a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3q_s8(a: *const i8) -> int8x16x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3.v16i8.p0" + )] + fn _vld3q_s8(ptr: *const int8x16_t) -> int8x16x3_t; + } + _vld3q_s8(a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3_s16(a: *const i16) -> int16x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3.v4i16.p0" + )] + fn _vld3_s16(ptr: *const int16x4_t) -> 
int16x4x3_t; + } + _vld3_s16(a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3q_s16(a: *const i16) -> int16x8x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3.v8i16.p0" + )] + fn _vld3q_s16(ptr: *const int16x8_t) -> int16x8x3_t; + } + _vld3q_s16(a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3_s32(a: *const i32) -> int32x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3.v2i32.p0" + )] + fn _vld3_s32(ptr: *const int32x2_t) -> int32x2x3_t; + } + _vld3_s32(a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3q_s32(a: *const i32) -> int32x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3.v4i32.p0" + )] + fn _vld3q_s32(ptr: *const int32x4_t) -> int32x4x3_t; + } + _vld3q_s32(a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3_f32(a: *const f32) -> float32x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v2f32.p0")] + fn _vld3_f32(ptr: *const i8, size: i32) -> float32x2x3_t; + } + _vld3_f32(a as *const i8, 4) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3q_f32(a: *const f32) -> float32x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v4f32.p0")] + fn _vld3q_f32(ptr: *const i8, size: i32) -> float32x4x3_t; + } + _vld3q_f32(a as *const i8, 4) 
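// A short sketch of the de-interleaving loads defined around here: vld3q_u8 reads
// 48 consecutive bytes (16 three-element structures) and splits them so that
// element 0 of every structure lands in the first register, element 1 in the
// second, and element 2 in the third. Assumes AArch64 and an RGB-interleaved
// buffer; `sum_red_channel` is a hypothetical helper name.
#[cfg(target_arch = "aarch64")]
fn sum_red_channel(rgb: &[u8; 48]) -> u16 {
    use core::arch::aarch64::{vaddlvq_u8, vld3q_u8};
    // SAFETY: `rgb` is exactly 48 bytes, which is what vld3q_u8 reads.
    let planes = unsafe { vld3q_u8(rgb.as_ptr()) };
    // planes.0 holds the 16 red bytes; widen and sum them across the vector.
    unsafe { vaddlvq_u8(planes.0) }
}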
+} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3_s8(a: *const i8) -> int8x8x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v8i8.p0")] + fn _vld3_s8(ptr: *const i8, size: i32) -> int8x8x3_t; + } + _vld3_s8(a as *const i8, 1) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3q_s8(a: *const i8) -> int8x16x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v16i8.p0")] + fn _vld3q_s8(ptr: *const i8, size: i32) -> int8x16x3_t; + } + _vld3q_s8(a as *const i8, 1) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3_s16(a: *const i16) -> int16x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v4i16.p0")] + fn _vld3_s16(ptr: *const i8, size: i32) -> int16x4x3_t; + } + _vld3_s16(a as *const i8, 2) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3q_s16(a: *const i16) -> int16x8x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v8i16.p0")] + fn _vld3q_s16(ptr: *const i8, size: i32) -> int16x8x3_t; + } + _vld3q_s16(a as *const i8, 2) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3_s32(a: *const i32) -> int32x2x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v2i32.p0")] + fn _vld3_s32(ptr: *const i8, size: i32) -> int32x2x3_t; + } + _vld3_s32(a as *const i8, 4) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld3))] +pub unsafe fn vld3q_s32(a: *const i32) -> int32x4x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v4i32.p0")] + fn _vld3q_s32(ptr: *const i8, size: i32) -> int32x4x3_t; + } + _vld3q_s32(a as *const i8, 4) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld3_lane_f16(a: *const f16, b: float16x4x3_t) -> float16x4x3_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v4f16.p0")] + fn _vld3_lane_f16( + ptr: *const f16, + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, + n: i32, + size: i32, + ) -> float16x4x3_t; + } + _vld3_lane_f16(a as _, b.0, b.1, b.2, LANE, 2) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld3q_lane_f16(a: *const f16, b: float16x8x3_t) -> float16x8x3_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v8f16.p0")] + fn _vld3q_lane_f16( + ptr: *const f16, + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, + n: i32, + size: i32, + ) -> float16x8x3_t; + } + _vld3q_lane_f16(a as _, b.0, b.1, b.2, LANE, 2) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld3_lane_f16(a: *const f16, b: float16x4x3_t) -> float16x4x3_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3lane.v4f16.p0" + )] + fn _vld3_lane_f16( + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, + n: i64, + ptr: *const f16, + ) -> float16x4x3_t; + } + _vld3_lane_f16(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Load multiple 
3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld3q_lane_f16(a: *const f16, b: float16x8x3_t) -> float16x8x3_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3lane.v8f16.p0" + )] + fn _vld3q_lane_f16( + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, + n: i64, + ptr: *const f16, + ) -> float16x8x3_t; + } + _vld3q_lane_f16(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3_lane_f32(a: *const f32, b: float32x2x3_t) -> float32x2x3_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3lane.v2f32.p0" + )] + fn _vld3_lane_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, + n: i64, + ptr: *const i8, + ) -> float32x2x3_t; + } + _vld3_lane_f32(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3q_lane_f32(a: *const f32, b: float32x4x3_t) -> float32x4x3_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3lane.v4f32.p0" + )] + fn _vld3q_lane_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, + n: i64, + ptr: *const i8, + ) -> float32x4x3_t; + } + _vld3q_lane_f32(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld3_lane_f32(a: *const f32, b: float32x2x3_t) -> float32x2x3_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v2f32.p0")] + fn 
_vld3_lane_f32( + ptr: *const i8, + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, + n: i32, + size: i32, + ) -> float32x2x3_t; + } + _vld3_lane_f32(a as _, b.0, b.1, b.2, LANE, 4) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3_lane_s8(a: *const i8, b: int8x8x3_t) -> int8x8x3_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3lane.v8i8.p0" + )] + fn _vld3_lane_s8( + a: int8x8_t, + b: int8x8_t, + c: int8x8_t, + n: i64, + ptr: *const i8, + ) -> int8x8x3_t; + } + _vld3_lane_s8(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3_lane_s16(a: *const i16, b: int16x4x3_t) -> int16x4x3_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3lane.v4i16.p0" + )] + fn _vld3_lane_s16( + a: int16x4_t, + b: int16x4_t, + c: int16x4_t, + n: i64, + ptr: *const i8, + ) -> int16x4x3_t; + } + _vld3_lane_s16(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3q_lane_s16(a: *const i16, b: int16x8x3_t) -> int16x8x3_t { + static_assert_uimm_bits!(LANE, 4); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3lane.v8i16.p0" + )] + fn _vld3q_lane_s16( + a: int16x8_t, + b: int16x8_t, + c: int16x8_t, + n: i64, + ptr: *const i8, + ) -> int16x8x3_t; + } + _vld3q_lane_s16(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3_lane_s32(a: *const i32, b: int32x2x3_t) -> int32x2x3_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + 
any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3lane.v2i32.p0" + )] + fn _vld3_lane_s32( + a: int32x2_t, + b: int32x2_t, + c: int32x2_t, + n: i64, + ptr: *const i8, + ) -> int32x2x3_t; + } + _vld3_lane_s32(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld3q_lane_s32(a: *const i32, b: int32x4x3_t) -> int32x4x3_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3lane.v4i32.p0" + )] + fn _vld3q_lane_s32( + a: int32x4_t, + b: int32x4_t, + c: int32x4_t, + n: i64, + ptr: *const i8, + ) -> int32x4x3_t; + } + _vld3q_lane_s32(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld3_lane_s8(a: *const i8, b: int8x8x3_t) -> int8x8x3_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v8i8.p0")] + fn _vld3_lane_s8( + ptr: *const i8, + a: int8x8_t, + b: int8x8_t, + c: int8x8_t, + n: i32, + size: i32, + ) -> int8x8x3_t; + } + _vld3_lane_s8(a as _, b.0, b.1, b.2, LANE, 1) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld3_lane_s16(a: *const i16, b: int16x4x3_t) -> int16x4x3_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v4i16.p0")] + fn _vld3_lane_s16( + ptr: *const i8, + a: int16x4_t, + b: int16x4_t, + c: int16x4_t, + n: i32, + size: i32, + ) -> int16x4x3_t; + } + _vld3_lane_s16(a as _, b.0, b.1, b.2, LANE, 2) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld3q_lane_s16(a: *const i16, b: int16x8x3_t) -> int16x8x3_t { + 
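// A usage sketch for the *_lane_* loads in this block: the lane index is passed
// as the const generic `LANE` (checked at compile time via
// static_assert_uimm_bits!), and only that lane of each of the three registers
// in `b` is replaced from memory; the other lanes are carried through unchanged.
// Assumes AArch64; `reload_lane2` is a hypothetical helper name.
#[cfg(target_arch = "aarch64")]
unsafe fn reload_lane2(
    src: *const u16,
    acc: core::arch::aarch64::uint16x4x3_t,
) -> core::arch::aarch64::uint16x4x3_t {
    use core::arch::aarch64::vld3_lane_u16;
    // SAFETY (caller): `src` must be valid for reading three u16 values.
    // LANE = 2 is in range for the four-lane uint16x4_t registers.
    unsafe { vld3_lane_u16::<2>(src, acc) }
}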
static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v8i16.p0")] + fn _vld3q_lane_s16( + ptr: *const i8, + a: int16x8_t, + b: int16x8_t, + c: int16x8_t, + n: i32, + size: i32, + ) -> int16x8x3_t; + } + _vld3q_lane_s16(a as _, b.0, b.1, b.2, LANE, 2) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld3_lane_s32(a: *const i32, b: int32x2x3_t) -> int32x2x3_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v2i32.p0")] + fn _vld3_lane_s32( + ptr: *const i8, + a: int32x2_t, + b: int32x2_t, + c: int32x2_t, + n: i32, + size: i32, + ) -> int32x2x3_t; + } + _vld3_lane_s32(a as _, b.0, b.1, b.2, LANE, 4) +} +#[doc = "Load multiple 3-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld3q_lane_s32(a: *const i32, b: int32x4x3_t) -> int32x4x3_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v4i32.p0")] + fn _vld3q_lane_s32( + ptr: *const i8, + a: int32x4_t, + b: int32x4_t, + c: int32x4_t, + n: i32, + size: i32, + ) -> int32x4x3_t; + } + _vld3q_lane_s32(a as _, b.0, b.1, b.2, LANE, 4) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_lane_u8(a: *const u8, b: uint8x8x3_t) -> uint8x8x3_t { + static_assert_uimm_bits!(LANE, 3); + transmute(vld3_lane_s8::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = 
"aarch64", target_arch = "arm64ec")), + assert_instr(ld3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_lane_u16(a: *const u16, b: uint16x4x3_t) -> uint16x4x3_t { + static_assert_uimm_bits!(LANE, 2); + transmute(vld3_lane_s16::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_lane_u16(a: *const u16, b: uint16x8x3_t) -> uint16x8x3_t { + static_assert_uimm_bits!(LANE, 3); + transmute(vld3q_lane_s16::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_lane_u32(a: *const u32, b: uint32x2x3_t) -> uint32x2x3_t { + static_assert_uimm_bits!(LANE, 1); + transmute(vld3_lane_s32::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_lane_u32(a: *const u32, b: uint32x4x3_t) -> uint32x4x3_t { + static_assert_uimm_bits!(LANE, 2); + transmute(vld3q_lane_s32::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_lane_p8(a: *const p8, b: poly8x8x3_t) -> poly8x8x3_t { + static_assert_uimm_bits!(LANE, 3); + transmute(vld3_lane_s8::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_lane_p16(a: *const p16, b: poly16x4x3_t) -> poly16x4x3_t { + static_assert_uimm_bits!(LANE, 2); + transmute(vld3_lane_s16::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_lane_p16(a: *const p16, b: poly16x8x3_t) -> poly16x8x3_t { + static_assert_uimm_bits!(LANE, 3); + transmute(vld3q_lane_s16::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub 
unsafe fn vld3_p64(a: *const p64) -> poly64x1x3_t { + transmute(vld3_s64(transmute(a))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vld3_s64(a: *const i64) -> int64x1x3_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld3.v1i64.p0" + )] + fn _vld3_s64(ptr: *const int64x1_t) -> int64x1x3_t; + } + _vld3_s64(a as _) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vld3_s64(a: *const i64) -> int64x1x3_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v1i64.p0")] + fn _vld3_s64(ptr: *const i8, size: i32) -> int64x1x3_t; + } + _vld3_s64(a as *const i8, 8) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_u64(a: *const u64) -> uint64x1x3_t { + transmute(vld3_s64(transmute(a))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_u8(a: *const u8) -> uint8x8x3_t { + transmute(vld3_s8(transmute(a))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
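// A bounds-checked convenience wrapper illustrating the safety contract of the
// vld3_u8 wrappers defined here: the pointer must be valid for reading
// 3 * 8 = 24 bytes. Assumes AArch64; `vld3_u8_checked` is a hypothetical helper,
// not an API exported by this crate.
#[cfg(target_arch = "aarch64")]
fn vld3_u8_checked(bytes: &[u8]) -> Option<core::arch::aarch64::uint8x8x3_t> {
    use core::arch::aarch64::vld3_u8;
    if bytes.len() < 24 {
        return None;
    }
    // SAFETY: the length check above guarantees at least 24 readable bytes.
    Some(unsafe { vld3_u8(bytes.as_ptr()) })
}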
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_u8(a: *const u8) -> uint8x8x3_t { + let mut ret_val: uint8x8x3_t = transmute(vld3_s8(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_u8(a: *const u8) -> uint8x16x3_t { + transmute(vld3q_s8(transmute(a))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_u8(a: *const u8) -> uint8x16x3_t { + let mut ret_val: uint8x16x3_t = transmute(vld3q_s8(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.2 = unsafe { + simd_shuffle!( + ret_val.2, + ret_val.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = 
"neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_u16(a: *const u16) -> uint16x4x3_t { + transmute(vld3_s16(transmute(a))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_u16(a: *const u16) -> uint16x4x3_t { + let mut ret_val: uint16x4x3_t = transmute(vld3_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_u16(a: *const u16) -> uint16x8x3_t { + transmute(vld3q_s16(transmute(a))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_u16(a: *const u16) -> uint16x8x3_t { + let mut ret_val: uint16x8x3_t = transmute(vld3q_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_u32(a: *const u32) -> uint32x2x3_t { + transmute(vld3_s32(transmute(a))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_u32(a: *const u32) -> uint32x2x3_t { + let mut ret_val: uint32x2x3_t = transmute(vld3_s32(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_u32(a: *const u32) -> uint32x4x3_t { + transmute(vld3q_s32(transmute(a))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_u32(a: *const 
u32) -> uint32x4x3_t { + let mut ret_val: uint32x4x3_t = transmute(vld3q_s32(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_p8(a: *const p8) -> poly8x8x3_t { + transmute(vld3_s8(transmute(a))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_p8(a: *const p8) -> poly8x8x3_t { + let mut ret_val: poly8x8x3_t = transmute(vld3_s8(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_p8(a: *const p8) -> poly8x16x3_t { + transmute(vld3q_s8(transmute(a))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = 
"arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_p8(a: *const p8) -> poly8x16x3_t { + let mut ret_val: poly8x16x3_t = transmute(vld3q_s8(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.2 = unsafe { + simd_shuffle!( + ret_val.2, + ret_val.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_p16(a: *const p16) -> poly16x4x3_t { + transmute(vld3_s16(transmute(a))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3_p16(a: *const p16) -> poly16x4x3_t { + let mut ret_val: poly16x4x3_t = transmute(vld3_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + 
stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_p16(a: *const p16) -> poly16x8x3_t { + transmute(vld3q_s16(transmute(a))) +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld3q_p16(a: *const p16) -> poly16x8x3_t { + let mut ret_val: poly16x8x3_t = transmute(vld3q_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 3-element structures to three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld3q_lane_f32(a: *const f32, b: float32x4x3_t) -> float32x4x3_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v4f32.p0")] + fn _vld3q_lane_f32( + ptr: *const i8, + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, + n: i32, + size: i32, + ) -> float32x4x3_t; + } + _vld3q_lane_f32(a as _, b.0, b.1, b.2, LANE, 4) +} +#[doc = "Load single 4-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg(target_arch = "arm")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld4_dup_f16(a: *const f16) -> float16x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v4f16.p0")] + fn _vld4_dup_f16(ptr: *const f16, size: i32) -> float16x4x4_t; + } + _vld4_dup_f16(a as _, 2) +} +#[doc = "Load single 4-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg(target_arch = "arm")] +#[cfg_attr(all(test, target_arch 
= "arm"), assert_instr(vld4))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld4q_dup_f16(a: *const f16) -> float16x8x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v8f16.p0")] + fn _vld4q_dup_f16(ptr: *const f16, size: i32) -> float16x8x4_t; + } + _vld4q_dup_f16(a as _, 2) +} +#[doc = "Load single 4-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld4_dup_f16(a: *const f16) -> float16x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4r.v4f16.p0" + )] + fn _vld4_dup_f16(ptr: *const f16) -> float16x4x4_t; + } + _vld4_dup_f16(a as _) +} +#[doc = "Load single 4-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld4q_dup_f16(a: *const f16) -> float16x8x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4r.v8f16.p0" + )] + fn _vld4q_dup_f16(ptr: *const f16) -> float16x8x4_t; + } + _vld4q_dup_f16(a as _) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vld4))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4_dup_f32(a: *const f32) -> float32x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v2f32.p0")] + fn _vld4_dup_f32(ptr: *const i8, size: i32) -> float32x2x4_t; + } + _vld4_dup_f32(a as *const i8, 4) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vld4))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4q_dup_f32(a: *const f32) -> float32x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v4f32.p0")] + fn _vld4q_dup_f32(ptr: *const i8, size: i32) -> float32x4x4_t; + } + _vld4q_dup_f32(a as *const i8, 4) +} +#[doc = "Load single 
4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vld4))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4_dup_s8(a: *const i8) -> int8x8x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v8i8.p0")] + fn _vld4_dup_s8(ptr: *const i8, size: i32) -> int8x8x4_t; + } + _vld4_dup_s8(a as *const i8, 1) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vld4))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4q_dup_s8(a: *const i8) -> int8x16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v16i8.p0")] + fn _vld4q_dup_s8(ptr: *const i8, size: i32) -> int8x16x4_t; + } + _vld4q_dup_s8(a as *const i8, 1) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vld4))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4_dup_s16(a: *const i16) -> int16x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v4i16.p0")] + fn _vld4_dup_s16(ptr: *const i8, size: i32) -> int16x4x4_t; + } + _vld4_dup_s16(a as *const i8, 2) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vld4))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4q_dup_s16(a: *const i16) -> int16x8x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v8i16.p0")] + fn _vld4q_dup_s16(ptr: *const i8, size: i32) -> int16x8x4_t; + } + _vld4q_dup_s16(a as *const i8, 2) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vld4))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4_dup_s32(a: *const i32) -> int32x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v2i32.p0")] + fn _vld4_dup_s32(ptr: *const 
i8, size: i32) -> int32x2x4_t; + } + _vld4_dup_s32(a as *const i8, 4) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vld4))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4q_dup_s32(a: *const i32) -> int32x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v4i32.p0")] + fn _vld4q_dup_s32(ptr: *const i8, size: i32) -> int32x4x4_t; + } + _vld4q_dup_s32(a as *const i8, 4) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4_dup_f32(a: *const f32) -> float32x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4r.v2f32.p0.p0" + )] + fn _vld4_dup_f32(ptr: *const f32) -> float32x2x4_t; + } + _vld4_dup_f32(a as _) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_dup_f32(a: *const f32) -> float32x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4r.v4f32.p0.p0" + )] + fn _vld4q_dup_f32(ptr: *const f32) -> float32x4x4_t; + } + _vld4q_dup_f32(a as _) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4_dup_s8(a: *const i8) -> int8x8x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4r.v8i8.p0.p0" + )] + fn _vld4_dup_s8(ptr: *const i8) -> int8x8x4_t; + } + _vld4_dup_s8(a as _) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_dup_s8(a: *const i8) -> int8x16x4_t { + unsafe 
extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4r.v16i8.p0.p0" + )] + fn _vld4q_dup_s8(ptr: *const i8) -> int8x16x4_t; + } + _vld4q_dup_s8(a as _) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4_dup_s16(a: *const i16) -> int16x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4r.v4i16.p0.p0" + )] + fn _vld4_dup_s16(ptr: *const i16) -> int16x4x4_t; + } + _vld4_dup_s16(a as _) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_dup_s16(a: *const i16) -> int16x8x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4r.v8i16.p0.p0" + )] + fn _vld4q_dup_s16(ptr: *const i16) -> int16x8x4_t; + } + _vld4q_dup_s16(a as _) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4_dup_s32(a: *const i32) -> int32x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4r.v2i32.p0.p0" + )] + fn _vld4_dup_s32(ptr: *const i32) -> int32x2x4_t; + } + _vld4_dup_s32(a as _) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_dup_s32(a: *const i32) -> int32x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4r.v4i32.p0.p0" + )] + fn _vld4q_dup_s32(ptr: *const i32) -> int32x4x4_t; + } + _vld4q_dup_s32(a as _) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = 
"arm"))] +#[cfg_attr(test, assert_instr(ld4r))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4_dup_s64(a: *const i64) -> int64x1x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4r.v1i64.p0.p0" + )] + fn _vld4_dup_s64(ptr: *const i64) -> int64x1x4_t; + } + _vld4_dup_s64(a as _) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_dup_p64(a: *const p64) -> poly64x1x4_t { + transmute(vld4_dup_s64(transmute(a))) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(nop))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4_dup_s64(a: *const i64) -> int64x1x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v1i64.p0")] + fn _vld4_dup_s64(ptr: *const i8, size: i32) -> int64x1x4_t; + } + _vld4_dup_s64(a as *const i8, 8) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_dup_u64(a: *const u64) -> uint64x1x4_t { + transmute(vld4_dup_s64(transmute(a))) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] 
+#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_dup_u8(a: *const u8) -> uint8x8x4_t { + transmute(vld4_dup_s8(transmute(a))) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_dup_u8(a: *const u8) -> uint8x8x4_t { + let mut ret_val: uint8x8x4_t = transmute(vld4_dup_s8(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_dup_u8(a: *const u8) -> uint8x16x4_t { + transmute(vld4q_dup_s8(transmute(a))) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_dup_u8(a: *const u8) -> uint8x16x4_t { + let mut ret_val: uint8x16x4_t = transmute(vld4q_dup_s8(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 
3, 2, 1, 0] + ) + }; + ret_val.2 = unsafe { + simd_shuffle!( + ret_val.2, + ret_val.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.3 = unsafe { + simd_shuffle!( + ret_val.3, + ret_val.3, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_dup_u16(a: *const u16) -> uint16x4x4_t { + transmute(vld4_dup_s16(transmute(a))) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_dup_u16(a: *const u16) -> uint16x4x4_t { + let mut ret_val: uint16x4x4_t = transmute(vld4_dup_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_dup_u16(a: *const u16) -> uint16x8x4_t { + transmute(vld4q_dup_s16(transmute(a))) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_u16)"] +#[doc = "## 
Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_dup_u16(a: *const u16) -> uint16x8x4_t { + let mut ret_val: uint16x8x4_t = transmute(vld4q_dup_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_dup_u32(a: *const u32) -> uint32x2x4_t { + transmute(vld4_dup_s32(transmute(a))) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_dup_u32(a: *const u32) -> uint32x2x4_t { + let mut ret_val: uint32x2x4_t = transmute(vld4_dup_s32(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [1, 0]) }; + ret_val +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_dup_u32(a: *const u32) -> uint32x4x4_t { + transmute(vld4q_dup_s32(transmute(a))) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_dup_u32(a: *const u32) -> uint32x4x4_t { + let mut ret_val: uint32x4x4_t = transmute(vld4q_dup_s32(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_dup_p8(a: *const p8) -> poly8x8x4_t { + transmute(vld4_dup_s8(transmute(a))) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_dup_p8(a: *const p8) -> poly8x8x4_t { + let mut ret_val: 
poly8x8x4_t = transmute(vld4_dup_s8(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_dup_p8(a: *const p8) -> poly8x16x4_t { + transmute(vld4q_dup_s8(transmute(a))) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_dup_p8(a: *const p8) -> poly8x16x4_t { + let mut ret_val: poly8x16x4_t = transmute(vld4q_dup_s8(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.2 = unsafe { + simd_shuffle!( + ret_val.2, + ret_val.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.3 = unsafe { + simd_shuffle!( + ret_val.3, + ret_val.3, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue 
= "111800") +)] +pub unsafe fn vld4_dup_p16(a: *const p16) -> poly16x4x4_t { + transmute(vld4_dup_s16(transmute(a))) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_dup_p16(a: *const p16) -> poly16x4x4_t { + let mut ret_val: poly16x4x4_t = transmute(vld4_dup_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_dup_p16(a: *const p16) -> poly16x8x4_t { + transmute(vld4q_dup_s16(transmute(a))) +} +#[doc = "Load single 4-element structure and replicate to all lanes of four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4r) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_dup_p16(a: *const p16) -> poly16x8x4_t { + let mut ret_val: poly16x8x4_t = transmute(vld4q_dup_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} 
+#[doc = "Load single 4-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg(target_arch = "arm")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld4_f16(a: *const f16) -> float16x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v4f16.p0")] + fn _vld4_f16(ptr: *const f16, size: i32) -> float16x4x4_t; + } + _vld4_f16(a as _, 2) +} +#[doc = "Load single 4-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg(target_arch = "arm")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld4q_f16(a: *const f16) -> float16x8x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v8f16.p0")] + fn _vld4q_f16(ptr: *const f16, size: i32) -> float16x8x4_t; + } + _vld4q_f16(a as _, 2) +} +#[doc = "Load single 4-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld4_f16(a: *const f16) -> float16x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4.v4f16.p0" + )] + fn _vld4_f16(ptr: *const f16) -> float16x4x4_t; + } + _vld4_f16(a as _) +} +#[doc = "Load single 4-element structure and replicate to all lanes of two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld4q_f16(a: *const f16) -> float16x8x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4.v8f16.p0" + )] + fn _vld4q_f16(ptr: *const f16) -> float16x8x4_t; + } + _vld4q_f16(a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = 
"1.59.0")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4.v2f32.p0" + )] + fn _vld4_f32(ptr: *const float32x2_t) -> float32x2x4_t; + } + _vld4_f32(a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4.v4f32.p0" + )] + fn _vld4q_f32(ptr: *const float32x4_t) -> float32x4x4_t; + } + _vld4q_f32(a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4_s8(a: *const i8) -> int8x8x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4.v8i8.p0" + )] + fn _vld4_s8(ptr: *const int8x8_t) -> int8x8x4_t; + } + _vld4_s8(a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4q_s8(a: *const i8) -> int8x16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4.v16i8.p0" + )] + fn _vld4q_s8(ptr: *const int8x16_t) -> int8x16x4_t; + } + _vld4q_s8(a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4_s16(a: *const i16) -> int16x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4.v4i16.p0" + )] + fn _vld4_s16(ptr: *const int16x4_t) -> int16x4x4_t; + } + _vld4_s16(a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] 
+#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4q_s16(a: *const i16) -> int16x8x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4.v8i16.p0" + )] + fn _vld4q_s16(ptr: *const int16x8_t) -> int16x8x4_t; + } + _vld4q_s16(a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4_s32(a: *const i32) -> int32x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4.v2i32.p0" + )] + fn _vld4_s32(ptr: *const int32x2_t) -> int32x2x4_t; + } + _vld4_s32(a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4q_s32(a: *const i32) -> int32x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4.v4i32.p0" + )] + fn _vld4q_s32(ptr: *const int32x4_t) -> int32x4x4_t; + } + _vld4q_s32(a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld4))] +pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v2f32.p0")] + fn _vld4_f32(ptr: *const i8, size: i32) -> float32x2x4_t; + } + _vld4_f32(a as *const i8, 4) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld4))] +pub unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v4f32.p0")] + fn _vld4q_f32(ptr: *const i8, size: i32) -> float32x4x4_t; + } + _vld4q_f32(a as *const i8, 4) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, 
assert_instr(vld4))] +pub unsafe fn vld4_s8(a: *const i8) -> int8x8x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v8i8.p0")] + fn _vld4_s8(ptr: *const i8, size: i32) -> int8x8x4_t; + } + _vld4_s8(a as *const i8, 1) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld4))] +pub unsafe fn vld4q_s8(a: *const i8) -> int8x16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v16i8.p0")] + fn _vld4q_s8(ptr: *const i8, size: i32) -> int8x16x4_t; + } + _vld4q_s8(a as *const i8, 1) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld4))] +pub unsafe fn vld4_s16(a: *const i16) -> int16x4x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v4i16.p0")] + fn _vld4_s16(ptr: *const i8, size: i32) -> int16x4x4_t; + } + _vld4_s16(a as *const i8, 2) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld4))] +pub unsafe fn vld4q_s16(a: *const i16) -> int16x8x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v8i16.p0")] + fn _vld4q_s16(ptr: *const i8, size: i32) -> int16x8x4_t; + } + _vld4q_s16(a as *const i8, 2) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld4))] +pub unsafe fn vld4_s32(a: *const i32) -> int32x2x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v2i32.p0")] + fn _vld4_s32(ptr: *const i8, size: i32) -> int32x2x4_t; + } + _vld4_s32(a as *const i8, 4) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vld4))] +pub unsafe fn vld4q_s32(a: *const i32) -> int32x4x4_t { + unsafe extern "unadjusted" { + 
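// --- Editorial sketch: illustrative only, not part of the generated stdarch sources ---
// Each plain `vld4*` load forwards to the LLVM builtin named by `link_name`
// in an `unsafe extern "unadjusted"` block (llvm.arm.neon.vld4.* on Arm,
// llvm.aarch64.neon.ld4.* on AArch64; the Arm bindings also pass a size
// argument, the element size in bytes). Semantically the load de-interleaves:
// `vld4q_f32` reads 16 consecutive floats x0..x15 and returns
// ret.0 = [x0, x4, x8, x12], ret.1 = [x1, x5, x9, x13], and so on.
// Hypothetical AArch64 usage (`split_rgba` is made up for this sketch):
//
//     use core::arch::aarch64::*;
//
//     unsafe fn split_rgba(pixels: *const f32) -> float32x4x4_t {
//         // Separates four interleaved RGBA pixels into R, G, B and A vectors.
//         vld4q_f32(pixels)
//     }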
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v4i32.p0")] + fn _vld4q_s32(ptr: *const i8, size: i32) -> int32x4x4_t; + } + _vld4q_s32(a as *const i8, 4) +} +#[doc = "Load multiple 4-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld4_lane_f16(a: *const f16, b: float16x4x4_t) -> float16x4x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v4f16.p0")] + fn _vld4_lane_f16( + ptr: *const f16, + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, + d: float16x4_t, + n: i32, + size: i32, + ) -> float16x4x4_t; + } + _vld4_lane_f16(a as _, b.0, b.1, b.2, b.3, LANE, 2) +} +#[doc = "Load multiple 4-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld4q_lane_f16(a: *const f16, b: float16x8x4_t) -> float16x8x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v8f16.p0")] + fn _vld4q_lane_f16( + ptr: *const f16, + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, + d: float16x8_t, + n: i32, + size: i32, + ) -> float16x8x4_t; + } + _vld4q_lane_f16(a as _, b.0, b.1, b.2, b.3, LANE, 2) +} +#[doc = "Load multiple 4-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld4_lane_f16(a: *const f16, b: float16x4x4_t) -> float16x4x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4lane.v4f16.p0" + )] + fn _vld4_lane_f16( + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, + d: float16x4_t, + n: i64, + ptr: *const f16, + ) -> float16x4x4_t; + } + _vld4_lane_f16(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Load multiple 4-element structures to two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4, 
LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vld4q_lane_f16(a: *const f16, b: float16x8x4_t) -> float16x8x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4lane.v8f16.p0" + )] + fn _vld4q_lane_f16( + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, + d: float16x8_t, + n: i64, + ptr: *const f16, + ) -> float16x8x4_t; + } + _vld4q_lane_f16(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4_lane_f32(a: *const f32, b: float32x2x4_t) -> float32x2x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4lane.v2f32.p0" + )] + fn _vld4_lane_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, + d: float32x2_t, + n: i64, + ptr: *const i8, + ) -> float32x2x4_t; + } + _vld4_lane_f32(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_lane_f32(a: *const f32, b: float32x4x4_t) -> float32x4x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4lane.v4f32.p0" + )] + fn _vld4q_lane_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, + d: float32x4_t, + n: i64, + ptr: *const i8, + ) -> float32x4x4_t; + } + _vld4q_lane_f32(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4_lane_s8(a: *const i8, b: int8x8x4_t) -> int8x8x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4lane.v8i8.p0" + )] + fn _vld4_lane_s8( + a: int8x8_t, + b: int8x8_t, + c: int8x8_t, + d: int8x8_t, + n: i64, + ptr: *const i8, + ) -> int8x8x4_t; + } + _vld4_lane_s8(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4_lane_s16(a: *const i16, b: int16x4x4_t) -> int16x4x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4lane.v4i16.p0" + )] + fn _vld4_lane_s16( + a: int16x4_t, + b: int16x4_t, + c: int16x4_t, + d: int16x4_t, + n: i64, + ptr: *const i8, + ) -> int16x4x4_t; + } + _vld4_lane_s16(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_lane_s16(a: *const i16, b: int16x8x4_t) -> int16x8x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4lane.v8i16.p0" + )] + fn _vld4q_lane_s16( + a: int16x8_t, + b: int16x8_t, + c: int16x8_t, + d: int16x8_t, + n: i64, + ptr: *const i8, + ) -> int16x8x4_t; + } + _vld4q_lane_s16(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4_lane_s32(a: *const i32, b: int32x2x4_t) -> int32x2x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4lane.v2i32.p0" + )] + fn _vld4_lane_s32( + a: int32x2_t, + b: int32x2_t, + c: int32x2_t, + d: int32x2_t, + n: i64, + ptr: *const i8, + ) -> int32x2x4_t; + } + _vld4_lane_s32(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vld4q_lane_s32(a: *const i32, b: int32x4x4_t) -> int32x4x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4lane.v4i32.p0" + )] + fn _vld4q_lane_s32( + a: int32x4_t, + b: int32x4_t, + c: int32x4_t, + d: int32x4_t, + n: 
i64, + ptr: *const i8, + ) -> int32x4x4_t; + } + _vld4q_lane_s32(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[cfg_attr(test, assert_instr(vld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4_lane_f32(a: *const f32, b: float32x2x4_t) -> float32x2x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v2f32.p0")] + fn _vld4_lane_f32( + ptr: *const i8, + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, + d: float32x2_t, + n: i32, + size: i32, + ) -> float32x2x4_t; + } + _vld4_lane_f32(a as _, b.0, b.1, b.2, b.3, LANE, 4) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[cfg_attr(test, assert_instr(vld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4q_lane_f32(a: *const f32, b: float32x4x4_t) -> float32x4x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v4f32.p0")] + fn _vld4q_lane_f32( + ptr: *const i8, + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, + d: float32x4_t, + n: i32, + size: i32, + ) -> float32x4x4_t; + } + _vld4q_lane_f32(a as _, b.0, b.1, b.2, b.3, LANE, 4) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[cfg_attr(test, assert_instr(vld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4_lane_s8(a: *const i8, b: int8x8x4_t) -> int8x8x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v8i8.p0")] + fn _vld4_lane_s8( + ptr: *const i8, + a: int8x8_t, + b: int8x8_t, + c: int8x8_t, + d: int8x8_t, + n: i32, + size: i32, + ) -> int8x8x4_t; + } + _vld4_lane_s8(a as _, b.0, b.1, b.2, b.3, LANE, 1) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[cfg_attr(test, assert_instr(vld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4_lane_s16(a: *const i16, b: int16x4x4_t) -> int16x4x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = 
"llvm.arm.neon.vld4lane.v4i16.p0")] + fn _vld4_lane_s16( + ptr: *const i8, + a: int16x4_t, + b: int16x4_t, + c: int16x4_t, + d: int16x4_t, + n: i32, + size: i32, + ) -> int16x4x4_t; + } + _vld4_lane_s16(a as _, b.0, b.1, b.2, b.3, LANE, 2) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[cfg_attr(test, assert_instr(vld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4q_lane_s16(a: *const i16, b: int16x8x4_t) -> int16x8x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v8i16.p0")] + fn _vld4q_lane_s16( + ptr: *const i8, + a: int16x8_t, + b: int16x8_t, + c: int16x8_t, + d: int16x8_t, + n: i32, + size: i32, + ) -> int16x8x4_t; + } + _vld4q_lane_s16(a as _, b.0, b.1, b.2, b.3, LANE, 2) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[cfg_attr(test, assert_instr(vld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4_lane_s32(a: *const i32, b: int32x2x4_t) -> int32x2x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v2i32.p0")] + fn _vld4_lane_s32( + ptr: *const i8, + a: int32x2_t, + b: int32x2_t, + c: int32x2_t, + d: int32x2_t, + n: i32, + size: i32, + ) -> int32x2x4_t; + } + _vld4_lane_s32(a as _, b.0, b.1, b.2, b.3, LANE, 4) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[cfg_attr(test, assert_instr(vld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vld4q_lane_s32(a: *const i32, b: int32x4x4_t) -> int32x4x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v4i32.p0")] + fn _vld4q_lane_s32( + ptr: *const i8, + a: int32x4_t, + b: int32x4_t, + c: int32x4_t, + d: int32x4_t, + n: i32, + size: i32, + ) -> int32x4x4_t; + } + _vld4q_lane_s32(a as _, b.0, b.1, b.2, b.3, LANE, 4) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4, LANE = 0) +)] 
+#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_lane_u8<const LANE: i32>(a: *const u8, b: uint8x8x4_t) -> uint8x8x4_t { + static_assert_uimm_bits!(LANE, 3); + transmute(vld4_lane_s8::<LANE>(transmute(a), transmute(b))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon intrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_lane_u16<const LANE: i32>(a: *const u16, b: uint16x4x4_t) -> uint16x4x4_t { + static_assert_uimm_bits!(LANE, 2); + transmute(vld4_lane_s16::<LANE>(transmute(a), transmute(b))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon intrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_lane_u16<const LANE: i32>(a: *const u16, b: uint16x8x4_t) -> uint16x8x4_t { + static_assert_uimm_bits!(LANE, 3); + transmute(vld4q_lane_s16::<LANE>(transmute(a), transmute(b))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon intrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_lane_u32<const LANE: i32>(a: *const u32, b: uint32x2x4_t) -> uint32x2x4_t { + static_assert_uimm_bits!(LANE, 1); + transmute(vld4_lane_s32::<LANE>(transmute(a), transmute(b))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon intrinsic 
unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_lane_u32<const LANE: i32>(a: *const u32, b: uint32x4x4_t) -> uint32x4x4_t { + static_assert_uimm_bits!(LANE, 2); + transmute(vld4q_lane_s32::<LANE>(transmute(a), transmute(b))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon intrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_lane_p8<const LANE: i32>(a: *const p8, b: poly8x8x4_t) -> poly8x8x4_t { + static_assert_uimm_bits!(LANE, 3); + transmute(vld4_lane_s8::<LANE>(transmute(a), transmute(b))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon intrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_lane_p16<const LANE: i32>(a: *const p16, b: poly16x4x4_t) -> poly16x4x4_t { + static_assert_uimm_bits!(LANE, 2); + transmute(vld4_lane_s16::<LANE>(transmute(a), transmute(b))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon intrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_lane_p16<const LANE: i32>(a: *const p16, b: poly16x8x4_t) -> poly16x8x4_t { + 
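// Editorial note (illustrative, not generated code): the `vld4*_lane_*`
// intrinsics take the target lane as the const generic `LANE`
// (`#[rustc_legacy_const_generics(2)]` keeps the older call form working,
// where the lane was passed as a third argument) and reload only that lane of
// each of the four input vectors; `static_assert_uimm_bits!` below rejects
// out-of-range lane indices at compile time. Hypothetical AArch64 usage
// (`patch_lane2` is made up for this sketch):
//
//     unsafe fn patch_lane2(src: *const i16, acc: int16x4x4_t) -> int16x4x4_t {
//         // Overwrites lane 2 of acc.0..acc.3 with one structure loaded from src.
//         vld4_lane_s16::<2>(src, acc)
//     }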
static_assert_uimm_bits!(LANE, 3); + transmute(vld4q_lane_s16::(transmute(a), transmute(b))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_p64(a: *const p64) -> poly64x1x4_t { + transmute(vld4_s64(transmute(a))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ld4.v1i64.p0" + )] + fn _vld4_s64(ptr: *const int64x1_t) -> int64x1x4_t; + } + _vld4_s64(a as _) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v1i64.p0")] + fn _vld4_s64(ptr: *const i8, size: i32) -> int64x1x4_t; + } + _vld4_s64(a as *const i8, 8) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_u64(a: *const u64) -> uint64x1x4_t { + transmute(vld4_s64(transmute(a))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_u8(a: *const u8) -> uint8x8x4_t { + transmute(vld4_s8(transmute(a))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_u8(a: *const u8) -> uint8x8x4_t { + let mut ret_val: uint8x8x4_t = transmute(vld4_s8(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_u8(a: *const u8) -> uint8x16x4_t { + transmute(vld4q_s8(transmute(a))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_u8(a: *const u8) -> uint8x16x4_t { + let mut ret_val: uint8x16x4_t = transmute(vld4q_s8(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 
3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.2 = unsafe { + simd_shuffle!( + ret_val.2, + ret_val.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.3 = unsafe { + simd_shuffle!( + ret_val.3, + ret_val.3, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_u16(a: *const u16) -> uint16x4x4_t { + transmute(vld4_s16(transmute(a))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_u16(a: *const u16) -> uint16x4x4_t { + let mut ret_val: uint16x4x4_t = transmute(vld4_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_u16(a: *const u16) -> uint16x8x4_t { + transmute(vld4q_s16(transmute(a))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_u16)"] +#[doc = "## 
Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_u16(a: *const u16) -> uint16x8x4_t { + let mut ret_val: uint16x8x4_t = transmute(vld4q_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_u32(a: *const u32) -> uint32x2x4_t { + transmute(vld4_s32(transmute(a))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_u32(a: *const u32) -> uint32x2x4_t { + let mut ret_val: uint32x2x4_t = transmute(vld4_s32(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [1, 0]) }; + ret_val +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] 
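// Editorial note (illustrative, not generated code): the unsigned and
// polynomial variants such as `vld4q_u32` are thin wrappers that `transmute`
// to and from the signed-integer load, and the `target_endian = "big"` copies
// additionally reverse the lanes of each returned vector so the observable
// lane order matches little-endian targets. Hypothetical usage (`data` stands
// for any slice of at least 16 u32 values):
//
//     // De-interleaves 16 consecutive u32 values: ret.0 = [x0, x4, x8, x12],
//     // ret.1 = [x1, x5, x9, x13], ret.2 = [x2, x6, x10, x14], ret.3 = [x3, x7, x11, x15].
//     let quads: uint32x4x4_t = vld4q_u32(data.as_ptr());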
+#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_u32(a: *const u32) -> uint32x4x4_t { + transmute(vld4q_s32(transmute(a))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_u32(a: *const u32) -> uint32x4x4_t { + let mut ret_val: uint32x4x4_t = transmute(vld4q_s32(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_p8(a: *const p8) -> poly8x8x4_t { + transmute(vld4_s8(transmute(a))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_p8(a: *const p8) -> poly8x8x4_t { + let mut ret_val: poly8x8x4_t = transmute(vld4_s8(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe 
{ simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_p8(a: *const p8) -> poly8x16x4_t { + transmute(vld4q_s8(transmute(a))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_p8(a: *const p8) -> poly8x16x4_t { + let mut ret_val: poly8x16x4_t = transmute(vld4q_s8(transmute(a))); + ret_val.0 = unsafe { + simd_shuffle!( + ret_val.0, + ret_val.0, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.1 = unsafe { + simd_shuffle!( + ret_val.1, + ret_val.1, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.2 = unsafe { + simd_shuffle!( + ret_val.2, + ret_val.2, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val.3 = unsafe { + simd_shuffle!( + ret_val.3, + ret_val.3, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + }; + ret_val +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_p16(a: *const p16) -> poly16x4x4_t { + transmute(vld4_s16(transmute(a))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] 
+#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4_p16(a: *const p16) -> poly16x4x4_t { + let mut ret_val: poly16x4x4_t = transmute(vld4_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon intrinsic unsafe"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_p16(a: *const p16) -> poly16x8x4_t { + transmute(vld4q_s16(transmute(a))) +} +#[doc = "Load multiple 4-element structures to four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon intrinsic unsafe"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ld4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vld4q_p16(a: *const p16) -> poly16x8x4_t { + let mut ret_val: poly16x8x4_t = transmute(vld4q_s16(transmute(a))); + ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + ret_val +} +#[doc = "Load SIMD&FP register (immediate offset)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vldrq_p128)"] +#[doc = "## Safety"] +#[doc = " * Neon intrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch
= "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vldrq_p128(a: *const p128) -> p128 { + *a +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmax_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmax) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmax_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmax.v4f16" + )] + fn _vmax_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vmax_f16(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmax) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmaxq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v8f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmax.v8f16" + )] + fn _vmaxq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vmaxq_f16(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmax_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmax) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmax.v2f32" + )] + fn _vmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vmax_f32(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmax) +)] +#[cfg_attr( + 
not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmax.v4f32" + )] + fn _vmaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vmaxq_f32(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmax_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smax) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmax_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smax.v8i8" + )] + fn _vmax_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } + unsafe { _vmax_s8(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smax) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmaxq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smax.v16i8" + )] + fn _vmaxq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } + unsafe { _vmaxq_s8(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmax_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smax) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmax_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smax.v4i16" + )] + fn _vmax_s16(a: 
int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vmax_s16(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smax) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmaxq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v8i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smax.v8i16" + )] + fn _vmaxq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vmaxq_s16(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmax_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smax) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smax.v2i32" + )] + fn _vmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vmax_s32(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smax) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmaxq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smax.v4i32" + )] + fn _vmaxq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vmaxq_s32(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmax_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umax) +)] 
+#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmax_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umax.v8i8" + )] + fn _vmax_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; + } + unsafe { _vmax_u8(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umax) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmaxq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umax.v16i8" + )] + fn _vmaxq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + } + unsafe { _vmaxq_u8(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmax_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umax) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmax_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umax.v4i16" + )] + fn _vmax_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; + } + unsafe { _vmax_u16(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umax) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmaxq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v8i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umax.v8i16" 
+ )] + fn _vmaxq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; + } + unsafe { _vmaxq_u16(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmax_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umax) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmax_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umax.v2i32" + )] + fn _vmax_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; + } + unsafe { _vmax_u32(a, b) } +} +#[doc = "Maximum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umax) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmaxq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umax.v4i32" + )] + fn _vmaxq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + unsafe { _vmaxq_u32(a, b) } +} +#[doc = "Floating-point Maximum Number (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnm_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmaxnm))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmaxnm) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmaxnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v4f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnm.v4f16" + )] + fn _vmaxnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vmaxnm_f16(a, b) } +} +#[doc = "Floating-point Maximum Number (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmaxnm))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmaxnm) +)] +#[target_feature(enable = "neon,fp16")] 
+#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmaxnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v8f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnm.v8f16" + )] + fn _vmaxnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vmaxnmq_f16(a, b) } +} +#[doc = "Floating-point Maximum Number (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnm_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmaxnm))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmaxnm) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnm.v2f32" + )] + fn _vmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vmaxnm_f32(a, b) } +} +#[doc = "Floating-point Maximum Number (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmaxnm))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmaxnm) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v4f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnm.v4f32" + )] + fn _vmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vmaxnmq_f32(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmin_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmin) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmin_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmin.v4f16" + )] + fn _vmin_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vmin_f16(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmin) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vminq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v8f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmin.v8f16" + )] + fn _vminq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vminq_f16(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmin_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmin) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmin.v2f32" + )] + fn _vmin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vmin_f32(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmin) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmin.v4f32" + )] + fn _vminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vminq_f32(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmin_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smin) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmin_s8(a: 
int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smin.v8i8" + )] + fn _vmin_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } + unsafe { _vmin_s8(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smin) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vminq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smin.v16i8" + )] + fn _vminq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } + unsafe { _vminq_s8(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmin_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smin) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmin_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smin.v4i16" + )] + fn _vmin_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vmin_s16(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smin) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vminq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v8i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smin.v8i16" + )] + fn _vminq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vminq_s16(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmin_s32)"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smin) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmin_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smin.v2i32" + )] + fn _vmin_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vmin_s32(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smin) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vminq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smin.v4i32" + )] + fn _vminq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vminq_s32(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmin_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umin) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmin_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umin.v8i8" + )] + fn _vmin_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; + } + unsafe { _vmin_u8(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umin) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vminq_u8(a: 
uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umin.v16i8" + )] + fn _vminq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + } + unsafe { _vminq_u8(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmin_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umin) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmin_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umin.v4i16" + )] + fn _vmin_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; + } + unsafe { _vmin_u16(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umin) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vminq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v8i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umin.v8i16" + )] + fn _vminq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; + } + unsafe { _vminq_u16(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmin_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umin) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmin_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umin.v2i32" + )] + fn _vmin_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; + } + unsafe { _vmin_u32(a, b) } +} +#[doc = "Minimum (vector)"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umin) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vminq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umin.v4i32" + )] + fn _vminq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + unsafe { _vminq_u32(a, b) } +} +#[doc = "Floating-point Minimum Number (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnm_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vminnm))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fminnm) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vminnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v4f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnm.v4f16" + )] + fn _vminnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vminnm_f16(a, b) } +} +#[doc = "Floating-point Minimum Number (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vminnm))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fminnm) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vminnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v8f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnm.v8f16" + )] + fn _vminnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vminnmq_f16(a, b) } +} +#[doc = "Floating-point Minimum Number (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnm_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vminnm))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fminnm) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vminnm_f32(a: float32x2_t, b: 
float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnm.v2f32" + )] + fn _vminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vminnm_f32(a, b) } +} +#[doc = "Floating-point Minimum Number (vector)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vminnm))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fminnm) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v4f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnm.v4f32" + )] + fn _vminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vminnmq_f32(a, b) } +} +#[doc = "Floating-point multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Floating-point multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( 
+ not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_lane_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmla_f32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_laneq_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x4_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vmla_f32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_lane_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x2_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + vmlaq_f32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_laneq_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlaq_f32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch 
= "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmla_s16( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_lane_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmla_u16( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmla_s16( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_laneq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_laneq_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x8_t) -> uint16x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmla_u16( + a, + b, + simd_shuffle!(c, c, [LANE as 
u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlaq_s16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_lane_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x4_t) -> uint16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlaq_u16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmlaq_s16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_laneq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 
1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_laneq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmlaq_u16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmla_s32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_lane_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmla_u32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vmla_s32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_laneq_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x4_t) -> uint32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vmla_u32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + vmlaq_s32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_lane_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x2_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + vmlaq_u32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_laneq_s32(a: int32x4_t, b: 
int32x4_t, c: int32x4_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlaq_s32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_laneq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlaq_u32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_n_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t { + vmla_f32(a, b, vdup_n_f32(c)) +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_n_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t { + vmlaq_f32(a, b, vdupq_n_f32(c)) +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_n_s16(a: int16x4_t, b: int16x4_t, c: i16) -> int16x4_t { + vmla_s16(a, b, vdup_n_s16(c)) +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = 
"[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_n_s16(a: int16x8_t, b: int16x8_t, c: i16) -> int16x8_t { + vmlaq_s16(a, b, vdupq_n_s16(c)) +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_n_u16(a: uint16x4_t, b: uint16x4_t, c: u16) -> uint16x4_t { + vmla_u16(a, b, vdup_n_u16(c)) +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_n_u16(a: uint16x8_t, b: uint16x8_t, c: u16) -> uint16x8_t { + vmlaq_u16(a, b, vdupq_n_u16(c)) +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_n_s32(a: int32x2_t, b: int32x2_t, c: i32) -> int32x2_t { + vmla_s32(a, b, vdup_n_s32(c)) +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = 
"neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_n_s32(a: int32x4_t, b: int32x4_t, c: i32) -> int32x4_t { + vmlaq_s32(a, b, vdupq_n_s32(c)) +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_n_u32(a: uint32x2_t, b: uint32x2_t, c: u32) -> uint32x2_t { + vmla_u32(a, b, vdup_n_u32(c)) +} +#[doc = "Vector multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_n_u32(a: uint32x4_t, b: uint32x4_t, c: u32) -> uint32x4_t { + vmlaq_u32(a, b, vdupq_n_u32(c)) +} +#[doc = "Multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_s16)"] +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_u8(a: 
uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmla_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Multiply-add to accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))] +#[cfg_attr( + all(test, 
any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mla) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlaq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + unsafe { simd_add(a, simd_mul(b, c)) } +} +#[doc = "Vector widening multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlal, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_lane_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlal_s16( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector widening multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlal, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_laneq_s16(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmlal_s16( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector widening multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlal, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_lane_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmlal_s32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector widening multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = 
"arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlal, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_laneq_s32(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vmlal_s32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector widening multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlal, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_lane_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlal_u16( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector widening multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_laneq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlal, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_laneq_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x8_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmlal_u16( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector widening multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlal, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_lane_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmlal_u32(a, b, simd_shuffle!(c, c, [LANE as 
u32, LANE as u32])) } +} +#[doc = "Vector widening multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlal, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_laneq_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x4_t) -> uint64x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vmlal_u32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector widening multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t { + vmlal_s16(a, b, vdup_n_s16(c)) +} +#[doc = "Vector widening multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t { + vmlal_s32(a, b, vdup_n_s32(c)) +} +#[doc = "Vector widening multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_n_u16(a: uint32x4_t, b: uint16x4_t, c: u16) -> uint32x4_t { + vmlal_u16(a, b, vdup_n_u16(c)) +} +#[doc = "Vector widening multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_n_u32(a: uint64x2_t, b: uint32x2_t, c: u32) -> uint64x2_t { + vmlal_u32(a, b, vdup_n_u32(c)) +} +#[doc = "Signed multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t { + unsafe { simd_add(a, vmull_s8(b, c)) } +} +#[doc = "Signed multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + unsafe { simd_add(a, vmull_s16(b, c)) } +} +#[doc = "Signed multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + unsafe { simd_add(a, vmull_s32(b, c)) } +} +#[doc = "Unsigned multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_u8(a: 
uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t { + unsafe { simd_add(a, vmull_u8(b, c)) } +} +#[doc = "Unsigned multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { + unsafe { simd_add(a, vmull_u16(b, c)) } +} +#[doc = "Unsigned multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { + unsafe { simd_add(a, vmull_u32(b, c)) } +} +#[doc = "Floating-point multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t { + unsafe { simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Floating-point multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t { + unsafe { simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_lane_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmls_f32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_laneq_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x4_t, +) -> float32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vmls_f32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_lane_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x2_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + vmlsq_f32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_laneq_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, +) -> float32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlsq_f32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} 
+#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmls_s16( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_lane_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmls_u16( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmls_s16( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_laneq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_laneq_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x8_t) -> uint16x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmls_u16( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlsq_s16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_lane_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x4_t) -> uint16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlsq_u16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmlsq_s16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_laneq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_laneq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmlsq_u16( + a, + b, + simd_shuffle!( + c, + c, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmls_s32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_lane_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmls_u32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn 
vmls_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vmls_s32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_laneq_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x4_t) -> uint32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vmls_u32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + vmlsq_s32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_lane_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x2_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + vmlsq_u32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] 
+#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlsq_s32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_laneq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlsq_u32( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_n_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t { + vmls_f32(a, b, vdup_n_f32(c)) +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_n_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t { + vmlsq_f32(a, b, vdupq_n_f32(c)) +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = 
"arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_n_s16(a: int16x4_t, b: int16x4_t, c: i16) -> int16x4_t { + vmls_s16(a, b, vdup_n_s16(c)) +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_n_s16(a: int16x8_t, b: int16x8_t, c: i16) -> int16x8_t { + vmlsq_s16(a, b, vdupq_n_s16(c)) +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_n_u16(a: uint16x4_t, b: uint16x4_t, c: u16) -> uint16x4_t { + vmls_u16(a, b, vdup_n_u16(c)) +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_n_u16(a: uint16x8_t, b: uint16x8_t, c: u16) -> uint16x8_t { + vmlsq_u16(a, b, vdupq_n_u16(c)) +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_n_s32(a: int32x2_t, b: int32x2_t, c: i32) -> int32x2_t { + vmls_s32(a, b, vdup_n_s32(c)) +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_n_s32(a: int32x4_t, b: int32x4_t, c: i32) -> int32x4_t { + vmlsq_s32(a, b, vdupq_n_s32(c)) +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_n_u32(a: uint32x2_t, b: uint32x2_t, c: u32) -> uint32x2_t { + vmls_u32(a, b, vdup_n_u32(c)) +} +#[doc = "Vector multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_n_u32(a: uint32x4_t, b: uint32x4_t, c: u32) -> uint32x4_t { + vmlsq_u32(a, b, vdupq_n_u32(c)) +} +#[doc = "Multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + unsafe { simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { + unsafe { 
simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + unsafe { simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + unsafe { simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + unsafe { simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + unsafe { simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = 
"arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + unsafe { simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { + unsafe { simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { + unsafe { simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + unsafe { simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Multiply-subtract from accumulator"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmls_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { + unsafe { simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Multiply-subtract from accumulator"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mls) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + unsafe { simd_sub(a, simd_mul(b, c)) } +} +#[doc = "Vector widening multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlsl, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlsl_s16( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector widening multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlsl, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmlsl_s16( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector widening multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlsl, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmlsl_s32(a, b, 
simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector widening multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlsl, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vmlsl_s32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector widening multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlsl, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmlsl_u16( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector widening multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_laneq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u16", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlsl, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: uint16x8_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmlsl_u16( + a, + b, + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector widening multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlsl, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = 
"1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_lane_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmlsl_u32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector widening multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u32", LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlsl, LANE = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_laneq_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x4_t) -> uint64x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vmlsl_u32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector widening multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t { + vmlsl_s16(a, b, vdup_n_s16(c)) +} +#[doc = "Vector widening multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t { + vmlsl_s32(a, b, vdup_n_s32(c)) +} +#[doc = "Vector widening multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_n_u16(a: uint32x4_t, b: uint16x4_t, c: 
u16) -> uint32x4_t { + vmlsl_u16(a, b, vdup_n_u16(c)) +} +#[doc = "Vector widening multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_n_u32(a: uint64x2_t, b: uint32x2_t, c: u32) -> uint64x2_t { + vmlsl_u32(a, b, vdup_n_u32(c)) +} +#[doc = "Signed multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t { + unsafe { simd_sub(a, vmull_s8(b, c)) } +} +#[doc = "Signed multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + unsafe { simd_sub(a, vmull_s16(b, c)) } +} +#[doc = "Signed multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smlsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + unsafe { simd_sub(a, vmull_s32(b, c)) } +} +#[doc = "Unsigned multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u8"))] +#[cfg_attr( + all(test, any(target_arch = 
"aarch64", target_arch = "arm64ec")), + assert_instr(umlsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t { + unsafe { simd_sub(a, vmull_u8(b, c)) } +} +#[doc = "Unsigned multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { + unsafe { simd_sub(a, vmull_u16(b, c)) } +} +#[doc = "Unsigned multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umlsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmlsl_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { + unsafe { simd_sub(a, vmull_u32(b, c)) } +} +#[doc = "8-bit integer matrix multiply-accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmmlaq_s32)"] +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smmla) +)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_i8mm", issue = "117223") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmmlaq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smmla.v4i32.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.smmla.v4i32.v16i8")] + fn _vmmlaq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t; + } + unsafe { _vmmlaq_s32(a, b, c) } +} +#[doc = "8-bit integer matrix multiply-accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmmlaq_u32)"] +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ummla) +)] +#[cfg_attr( + not(target_arch = 
"arm"), + unstable(feature = "stdarch_neon_i8mm", issue = "117223") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmmlaq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ummla.v4i32.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.ummla.v4i32.v16i8")] + fn _vmmlaq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t; + } + unsafe { _vmmlaq_u32(a, b, c) } +} +#[doc = "Duplicate element to vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmov_n_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmov_n_f16(a: f16) -> float16x4_t { + vdup_n_f16(a) +} +#[doc = "Duplicate element to vector"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovq_n_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmovq_n_f16(a: f16) -> float16x8_t { + vdupq_n_f16(a) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmov_n_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmov_n_f32(value: f32) -> float32x2_t { + vdup_n_f32(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmov_n_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmov_n_p16(value: p16) -> poly16x4_t { + vdup_n_p16(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmov_n_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr("vdup.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmov_n_p8(value: p8) -> poly8x8_t { + vdup_n_p8(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmov_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmov_n_s16(value: i16) -> int16x4_t { + vdup_n_s16(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmov_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmov_n_s32(value: i32) -> int32x2_t { + vdup_n_s32(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmov_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmov) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmov_n_s64(value: i64) -> int64x1_t { + vdup_n_s64(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmov_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmov_n_s8(value: i8) -> int8x8_t { + vdup_n_s8(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmov_n_u16)"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmov_n_u16(value: u16) -> uint16x4_t { + vdup_n_u16(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmov_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmov_n_u32(value: u32) -> uint32x2_t { + vdup_n_u32(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmov_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmov) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmov_n_u64(value: u64) -> uint64x1_t { + vdup_n_u64(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmov_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmov_n_u8(value: u8) -> uint8x8_t { + vdup_n_u8(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovq_n_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovq_n_f32(value: f32) -> float32x4_t { + vdupq_n_f32(value) +} +#[doc = "Duplicate vector element to vector 
or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovq_n_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovq_n_p16(value: p16) -> poly16x8_t { + vdupq_n_p16(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovq_n_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovq_n_p8(value: p8) -> poly8x16_t { + vdupq_n_p8(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovq_n_s16(value: i16) -> int16x8_t { + vdupq_n_s16(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovq_n_s32(value: i32) -> int32x4_t { + vdupq_n_s32(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovq_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovq_n_s64(value: i64) -> int64x2_t { + vdupq_n_s64(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovq_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovq_n_s8(value: i8) -> int8x16_t { + vdupq_n_s8(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovq_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovq_n_u16(value: u16) -> uint16x8_t { + vdupq_n_u16(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovq_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovq_n_u32(value: u32) -> uint32x4_t { + vdupq_n_u32(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovq_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovq_n_u64(value: u64) -> uint64x2_t { + vdupq_n_u64(value) +} +#[doc = "Duplicate vector element to vector or scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovq_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(dup) +)] +#[cfg_attr( + 
not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovq_n_u8(value: u8) -> uint8x16_t { + vdupq_n_u8(value) +} +#[doc = "Vector long move."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sxtl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovl_s16(a: int16x4_t) -> int32x4_t { + unsafe { simd_cast(a) } +} +#[doc = "Vector long move."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sxtl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovl_s32(a: int32x2_t) -> int64x2_t { + unsafe { simd_cast(a) } +} +#[doc = "Vector long move."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sxtl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovl_s8(a: int8x8_t) -> int16x8_t { + unsafe { simd_cast(a) } +} +#[doc = "Vector long move."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uxtl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovl_u16(a: uint16x4_t) -> uint32x4_t { + unsafe { simd_cast(a) } +} +#[doc = "Vector long move."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uxtl) +)] 
+#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovl_u32(a: uint32x2_t) -> uint64x2_t { + unsafe { simd_cast(a) } +} +#[doc = "Vector long move."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uxtl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovl_u8(a: uint8x8_t) -> uint16x8_t { + unsafe { simd_cast(a) } +} +#[doc = "Vector narrow integer."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(xtn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovn_s16(a: int16x8_t) -> int8x8_t { + unsafe { simd_cast(a) } +} +#[doc = "Vector narrow integer."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(xtn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovn_s32(a: int32x4_t) -> int16x4_t { + unsafe { simd_cast(a) } +} +#[doc = "Vector narrow integer."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(xtn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovn_s64(a: int64x2_t) -> int32x2_t { + unsafe { simd_cast(a) } +} +#[doc = "Vector narrow integer."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = 
"arm64ec")), + assert_instr(xtn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovn_u16(a: uint16x8_t) -> uint8x8_t { + unsafe { simd_cast(a) } +} +#[doc = "Vector narrow integer."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(xtn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovn_u32(a: uint32x4_t) -> uint16x4_t { + unsafe { simd_cast(a) } +} +#[doc = "Vector narrow integer."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(xtn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmovn_u64(a: uint64x2_t) -> uint32x2_t { + unsafe { simd_cast(a) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmul_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmulq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue 
= "111800") +)] +pub fn vmul_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_lane_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmul_lane_f16(a: float16x4_t, v: float16x4_t) -> float16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + simd_mul( + a, + simd_shuffle!(v, v, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_lane_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmulq_lane_f16(a: float16x8_t, v: float16x4_t) -> float16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + simd_mul( + a, + simd_shuffle!( + v, + v, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_lane_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_laneq_f32(a: float32x2_t, b: float32x4_t) -> float32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_lane_f32(a: float32x4_t, b: float32x2_t) -> float32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + simd_mul( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Floating-point multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_laneq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + simd_mul( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + simd_mul( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + simd_mul( + a, + simd_shuffle!( + b, + b, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + simd_mul( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + simd_mul( + a, + 
simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t) -> uint16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + simd_mul( + a, + simd_shuffle!( + b, + b, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + simd_mul( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn 
vmul_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + simd_mul( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + simd_mul( + a, + simd_shuffle!( + b, + b, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + simd_mul( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_laneq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] 
+#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_laneq_u16(a: uint16x4_t, b: uint16x8_t) -> uint16x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + simd_mul( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_laneq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + simd_mul( + a, + simd_shuffle!( + b, + b, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ), + ) + } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_laneq_u32(a: uint32x2_t, b: uint32x4_t) -> uint32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_laneq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + simd_mul( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_n_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[target_feature(enable = "neon,fp16")] 
+#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmul_n_f16(a: float16x4_t, b: f16) -> float16x4_t { + unsafe { simd_mul(a, vdup_n_f16(b)) } +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_n_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vmulq_n_f16(a: float16x8_t, b: f16) -> float16x8_t { + unsafe { simd_mul(a, vdupq_n_f16(b)) } +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_n_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_n_f32(a: float32x2_t, b: f32) -> float32x2_t { + unsafe { simd_mul(a, vdup_n_f32(b)) } +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_n_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_n_f32(a: float32x4_t, b: f32) -> float32x4_t { + unsafe { simd_mul(a, vdupq_n_f32(b)) } +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_n_s16(a: int16x4_t, b: i16) -> int16x4_t { + unsafe { simd_mul(a, vdup_n_s16(b)) } +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] 
+#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_n_s16(a: int16x8_t, b: i16) -> int16x8_t { + unsafe { simd_mul(a, vdupq_n_s16(b)) } +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_n_s32(a: int32x2_t, b: i32) -> int32x2_t { + unsafe { simd_mul(a, vdup_n_s32(b)) } +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_n_s32(a: int32x4_t, b: i32) -> int32x4_t { + unsafe { simd_mul(a, vdupq_n_s32(b)) } +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_n_u16(a: uint16x4_t, b: u16) -> uint16x4_t { + unsafe { simd_mul(a, vdup_n_u16(b)) } +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_n_u16(a: uint16x8_t, b: u16) -> uint16x8_t { + unsafe { simd_mul(a, vdupq_n_u16(b)) } +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr( + all(test, 
any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_n_u32(a: uint32x2_t, b: u32) -> uint32x2_t { + unsafe { simd_mul(a, vdup_n_u32(b)) } +} +#[doc = "Vector multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_n_u32(a: uint32x4_t, b: u32) -> uint32x4_t { + unsafe { simd_mul(a, vdupq_n_u32(b)) } +} +#[doc = "Polynomial multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(pmul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulp.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.pmul.v8i8" + )] + fn _vmul_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t; + } + unsafe { _vmul_p8(a, b) } +} +#[doc = "Polynomial multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(pmul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulp.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.pmul.v16i8" + )] + fn _vmulq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t; + } + unsafe { _vmulq_p8(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", 
target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), 
assert_instr("vmul.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmul_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Multiply"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mul) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmulq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { simd_mul(a, b) } +} +#[doc = "Vector long multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smull, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_lane_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmull_s16( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector long multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smull, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_laneq_s16(a: int16x4_t, b: int16x8_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmull_s16( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector long multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smull, 
LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmull_s32(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector long multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smull, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int64x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vmull_s32(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector long multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umull, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + vmull_u16( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector long multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_laneq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umull, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x8_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + vmull_u16( + a, + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), + ) + } +} +#[doc = "Vector long multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 
1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umull, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { vmull_u32(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector long multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_laneq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umull, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x4_t) -> uint64x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vmull_u32(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } +} +#[doc = "Vector long multiply with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smull) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_n_s16(a: int16x4_t, b: i16) -> int32x4_t { + vmull_s16(a, vdup_n_s16(b)) +} +#[doc = "Vector long multiply with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smull) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_n_s32(a: int32x2_t, b: i32) -> int64x2_t { + vmull_s32(a, vdup_n_s32(b)) +} +#[doc = "Vector long multiply with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umull) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_n_u16(a: uint16x4_t, b: u16) -> uint32x4_t { + vmull_u16(a, vdup_n_u16(b)) +} +#[doc = "Vector long multiply with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umull) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_n_u32(a: uint32x2_t, b: u32) -> uint64x2_t { + vmull_u32(a, vdup_n_u32(b)) +} +#[doc = "Polynomial multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.p8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(pmull) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_p8(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.pmull.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullp.v8i16")] + fn _vmull_p8(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t; + } + unsafe { _vmull_p8(a, b) } +} +#[doc = "Signed multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smull) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smull.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v4i32")] + fn _vmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t; + } + unsafe { _vmull_s16(a, b) } +} +#[doc = "Signed multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smull) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature 
= "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smull.v2i64" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v2i64")] + fn _vmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t; + } + unsafe { _vmull_s32(a, b) } +} +#[doc = "Signed multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smull) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smull.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v8i16")] + fn _vmull_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t; + } + unsafe { _vmull_s8(a, b) } +} +#[doc = "Unsigned multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umull) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umull.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v8i16")] + fn _vmull_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t; + } + unsafe { _vmull_u8(a, b) } +} +#[doc = "Unsigned multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umull) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umull.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v4i32")] + fn _vmull_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t; + } + unsafe { _vmull_u16(a, b) } +} +#[doc = 
"Unsigned multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umull) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmull_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umull.v2i64" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v2i64")] + fn _vmull_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t; + } + unsafe { _vmull_u32(a, b) } +} +#[doc = "Vector bitwise not."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmvn_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mvn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmvn_p8(a: poly8x8_t) -> poly8x8_t { + let b = poly8x8_t::splat(255); + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise not."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmvn_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mvn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmvn_s16(a: int16x4_t) -> int16x4_t { + let b = int16x4_t::splat(-1); + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise not."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmvn_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mvn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmvn_s32(a: int32x2_t) -> int32x2_t { + let b = int32x2_t::splat(-1); + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise not."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmvn_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr(vmvn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mvn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmvn_s8(a: int8x8_t) -> int8x8_t { + let b = int8x8_t::splat(-1); + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise not."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmvn_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mvn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmvn_u16(a: uint16x4_t) -> uint16x4_t { + let b = uint16x4_t::splat(65_535); + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise not."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmvn_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mvn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmvn_u32(a: uint32x2_t) -> uint32x2_t { + let b = uint32x2_t::splat(4_294_967_295); + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise not."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmvn_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mvn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmvn_u8(a: uint8x8_t) -> uint8x8_t { + let b = uint8x8_t::splat(255); + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise not."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmvnq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mvn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmvnq_p8(a: poly8x16_t) -> poly8x16_t { + let b = poly8x16_t::splat(255); + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise not."] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmvnq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mvn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmvnq_s16(a: int16x8_t) -> int16x8_t { + let b = int16x8_t::splat(-1); + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise not."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmvnq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mvn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmvnq_s32(a: int32x4_t) -> int32x4_t { + let b = int32x4_t::splat(-1); + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise not."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmvnq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mvn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmvnq_s8(a: int8x16_t) -> int8x16_t { + let b = int8x16_t::splat(-1); + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise not."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmvnq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mvn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmvnq_u16(a: uint16x8_t) -> uint16x8_t { + let b = uint16x8_t::splat(65_535); + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise not."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmvnq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mvn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmvnq_u32(a: uint32x4_t) -> uint32x4_t { + let b = uint32x4_t::splat(4_294_967_295); + unsafe { simd_xor(a, b) } +} +#[doc = "Vector bitwise not."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmvnq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(mvn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vmvnq_u8(a: uint8x16_t) -> uint8x16_t { + let b = uint8x16_t::splat(255); + unsafe { simd_xor(a, b) } +} +#[doc = "Negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vneg_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fneg) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vneg_f16(a: float16x4_t) -> float16x4_t { + unsafe { simd_neg(a) } +} +#[doc = "Negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vnegq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fneg) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vnegq_f16(a: float16x8_t) -> float16x8_t { + unsafe { simd_neg(a) } +} +#[doc = "Negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vneg_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fneg) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vneg_f32(a: float32x2_t) -> float32x2_t { + unsafe { simd_neg(a) } +} +#[doc = "Negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vnegq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fneg) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vnegq_f32(a: float32x4_t) -> float32x4_t { + unsafe { simd_neg(a) } +} +#[doc = "Negate"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vneg_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(neg) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vneg_s8(a: int8x8_t) -> int8x8_t { + unsafe { simd_neg(a) } +} +#[doc = "Negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vnegq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(neg) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vnegq_s8(a: int8x16_t) -> int8x16_t { + unsafe { simd_neg(a) } +} +#[doc = "Negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vneg_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(neg) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vneg_s16(a: int16x4_t) -> int16x4_t { + unsafe { simd_neg(a) } +} +#[doc = "Negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vnegq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(neg) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vnegq_s16(a: int16x8_t) -> int16x8_t { + unsafe { simd_neg(a) } +} +#[doc = "Negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vneg_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(neg) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vneg_s32(a: int32x2_t) -> int32x2_t { + unsafe { simd_neg(a) } +} +#[doc = "Negate"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vnegq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(neg) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vnegq_s32(a: int32x4_t) -> int32x4_t { + unsafe { simd_neg(a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorn_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorn_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + let c = int16x4_t::splat(-1); + unsafe { simd_or(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorn_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorn_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + let c = int32x2_t::splat(-1); + unsafe { simd_or(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorn_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorn_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + let c = int64x1_t::splat(-1); + unsafe { simd_or(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorn_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = 
"1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorn_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + let c = int8x8_t::splat(-1); + unsafe { simd_or(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vornq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vornq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + let c = int16x8_t::splat(-1); + unsafe { simd_or(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vornq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vornq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + let c = int32x4_t::splat(-1); + unsafe { simd_or(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vornq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vornq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + let c = int64x2_t::splat(-1); + unsafe { simd_or(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vornq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vornq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + let c = int8x16_t::splat(-1); + unsafe { simd_or(simd_xor(b, c), a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorn_u16)"] 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorn_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + let c = int16x4_t::splat(-1); + unsafe { simd_or(simd_xor(b, transmute(c)), a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorn_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorn_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + let c = int32x2_t::splat(-1); + unsafe { simd_or(simd_xor(b, transmute(c)), a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorn_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorn_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + let c = int64x1_t::splat(-1); + unsafe { simd_or(simd_xor(b, transmute(c)), a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorn_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorn_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + let c = int8x8_t::splat(-1); + unsafe { simd_or(simd_xor(b, transmute(c)), a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vornq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since 
= "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vornq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + let c = int16x8_t::splat(-1); + unsafe { simd_or(simd_xor(b, transmute(c)), a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vornq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vornq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + let c = int32x4_t::splat(-1); + unsafe { simd_or(simd_xor(b, transmute(c)), a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vornq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vornq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + let c = int64x2_t::splat(-1); + unsafe { simd_or(simd_xor(b, transmute(c)), a) } +} +#[doc = "Vector bitwise inclusive OR NOT"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vornq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vornq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + let c = int8x16_t::splat(-1); + unsafe { simd_or(simd_xor(b, transmute(c)), a) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorr_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorr_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe { simd_or(a, b) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorrq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { simd_or(a, b) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorr_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorr_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe { simd_or(a, b) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorrq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorrq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe { simd_or(a, b) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorr_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorr_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe { simd_or(a, b) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorrq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorrq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe { simd_or(a, b) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorr_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorr_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + unsafe { simd_or(a, b) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorrq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorrq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe { simd_or(a, b) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorr_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorr_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_or(a, b) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorrq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { simd_or(a, b) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorr_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = 
"arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorr_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_or(a, b) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorrq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorrq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_or(a, b) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorr_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorr_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_or(a, b) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorrq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorrq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_or(a, b) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorr_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorr_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + unsafe { simd_or(a, b) } +} +#[doc = "Vector bitwise or (immediate, inclusive)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vorrq_u64)"] +#[inline] +#[target_feature(enable = 
"neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(orr) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vorrq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_or(a, b) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadal_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sadalp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadal_s8(a: int16x4_t, b: int8x8_t) -> int16x4_t { + let x: int16x4_t; + #[cfg(target_arch = "arm")] + { + x = priv_vpadal_s8(a, b); + } + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + unsafe { + x = simd_add(vpaddl_s8(b), a); + }; + x +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadalq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sadalp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadalq_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t { + let x: int16x8_t; + #[cfg(target_arch = "arm")] + { + x = priv_vpadalq_s8(a, b); + } + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + unsafe { + x = simd_add(vpaddlq_s8(b), a); + }; + x +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadal_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sadalp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadal_s16(a: int32x2_t, b: int16x4_t) -> int32x2_t { + let x: int32x2_t; + #[cfg(target_arch = "arm")] + { + x = priv_vpadal_s16(a, b); + } + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + unsafe { + x = simd_add(vpaddl_s16(b), a); + }; + x +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadalq_s16)"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sadalp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadalq_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { + let x: int32x4_t; + #[cfg(target_arch = "arm")] + { + x = priv_vpadalq_s16(a, b); + } + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + unsafe { + x = simd_add(vpaddlq_s16(b), a); + }; + x +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadal_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sadalp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadal_s32(a: int64x1_t, b: int32x2_t) -> int64x1_t { + let x: int64x1_t; + #[cfg(target_arch = "arm")] + { + x = priv_vpadal_s32(a, b); + } + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + unsafe { + x = simd_add(vpaddl_s32(b), a); + }; + x +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadalq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sadalp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadalq_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { + let x: int64x2_t; + #[cfg(target_arch = "arm")] + { + x = priv_vpadalq_s32(a, b); + } + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + unsafe { + x = simd_add(vpaddlq_s32(b), a); + }; + x +} +#[doc = "Unsigned Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadal_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uadalp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadal_u8(a: uint16x4_t, b: uint8x8_t) -> uint16x4_t { + let x: uint16x4_t; + #[cfg(target_arch = "arm")] + { + x = priv_vpadal_u8(a, b); + } + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + unsafe { + x = 
simd_add(vpaddl_u8(b), a); + }; + x +} +#[doc = "Unsigned Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadalq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uadalp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadalq_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t { + let x: uint16x8_t; + #[cfg(target_arch = "arm")] + { + x = priv_vpadalq_u8(a, b); + } + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + unsafe { + x = simd_add(vpaddlq_u8(b), a); + }; + x +} +#[doc = "Unsigned Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadal_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uadalp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadal_u16(a: uint32x2_t, b: uint16x4_t) -> uint32x2_t { + let x: uint32x2_t; + #[cfg(target_arch = "arm")] + { + x = priv_vpadal_u16(a, b); + } + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + unsafe { + x = simd_add(vpaddl_u16(b), a); + }; + x +} +#[doc = "Unsigned Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadalq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uadalp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadalq_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { + let x: uint32x4_t; + #[cfg(target_arch = "arm")] + { + x = priv_vpadalq_u16(a, b); + } + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + unsafe { + x = simd_add(vpaddlq_u16(b), a); + }; + x +} +#[doc = "Unsigned Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadal_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uadalp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") 
+)] +pub fn vpadal_u32(a: uint64x1_t, b: uint32x2_t) -> uint64x1_t { + let x: uint64x1_t; + #[cfg(target_arch = "arm")] + { + x = priv_vpadal_u32(a, b); + } + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + unsafe { + x = simd_add(vpaddl_u32(b), a); + }; + x +} +#[doc = "Unsigned Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadalq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpadal.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uadalp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadalq_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { + let x: uint64x2_t; + #[cfg(target_arch = "arm")] + { + x = priv_vpadalq_u32(a, b); + } + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + unsafe { + x = simd_add(vpaddlq_u32(b), a); + }; + x +} +#[doc = "Floating-point add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadd_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(faddp) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vpadd_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadd.v4f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.faddp.v4f16" + )] + fn _vpadd_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vpadd_f16(a, b) } +} +#[doc = "Floating-point add pairwise"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadd_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(faddp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadd_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadd.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.faddp.v2f32" + )] + fn _vpadd_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vpadd_f32(a, b) } +} +#[doc = "Add pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadd_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", 
target_arch = "arm64ec")), + assert_instr(addp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.addp.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadd.v8i8")] + fn _vpadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } + unsafe { _vpadd_s8(a, b) } +} +#[doc = "Add pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadd_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.addp.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadd.v4i16")] + fn _vpadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vpadd_s16(a, b) } +} +#[doc = "Add pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadd_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.addp.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadd.v2i32")] + fn _vpadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vpadd_s32(a, b) } +} +#[doc = "Add pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadd_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { transmute(vpadd_s8(transmute(a), transmute(b))) } +} +#[doc = "Add pairwise."] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadd_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vpadd_s8(transmute(a), transmute(b))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Add pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadd_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { transmute(vpadd_s16(transmute(a), transmute(b))) } +} +#[doc = "Add pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadd_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + let a: uint16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + let b: uint16x4_t = unsafe { simd_shuffle!(b, b, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x4_t = transmute(vpadd_s16(transmute(a), transmute(b))); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Add pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadd_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { 
transmute(vpadd_s32(transmute(a), transmute(b))) } +} +#[doc = "Add pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadd_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(addp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + let a: uint32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + let b: uint32x2_t = unsafe { simd_shuffle!(b, b, [1, 0]) }; + unsafe { + let ret_val: uint32x2_t = transmute(vpadd_s32(transmute(a), transmute(b))); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddl_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpaddl.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddlp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpaddl_s8(a: int8x8_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddlp.v4i16.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v4i16.v8i8")] + fn _vpaddl_s8(a: int8x8_t) -> int16x4_t; + } + unsafe { _vpaddl_s8(a) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddlq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpaddl.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddlp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpaddlq_s8(a: int8x16_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddlp.v8i16.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v8i16.v16i8")] + fn _vpaddlq_s8(a: int8x16_t) -> int16x8_t; + } + unsafe { _vpaddlq_s8(a) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddl_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpaddl.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), 
+ assert_instr(saddlp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpaddl_s16(a: int16x4_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddlp.v2i32.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v2i32.v4i16")] + fn _vpaddl_s16(a: int16x4_t) -> int32x2_t; + } + unsafe { _vpaddl_s16(a) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddlq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpaddl.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddlp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpaddlq_s16(a: int16x8_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddlp.v4i32.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v4i32.v8i16")] + fn _vpaddlq_s16(a: int16x8_t) -> int32x4_t; + } + unsafe { _vpaddlq_s16(a) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddl_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpaddl.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddlp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpaddl_s32(a: int32x2_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.saddlp.v1i64.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v1i64.v2i32")] + fn _vpaddl_s32(a: int32x2_t) -> int64x1_t; + } + unsafe { _vpaddl_s32(a) } +} +#[doc = "Signed Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddlq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpaddl.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(saddlp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpaddlq_s32(a: int32x4_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = 
"llvm.aarch64.neon.saddlp.v2i64.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v2i64.v4i32")] + fn _vpaddlq_s32(a: int32x4_t) -> int64x2_t; + } + unsafe { _vpaddlq_s32(a) } +} +#[doc = "Unsigned Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddl_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpaddl.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddlp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpaddl_u8(a: uint8x8_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddlp.v4i16.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v4i16.v8i8")] + fn _vpaddl_u8(a: uint8x8_t) -> uint16x4_t; + } + unsafe { _vpaddl_u8(a) } +} +#[doc = "Unsigned Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddlq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpaddl.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddlp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpaddlq_u8(a: uint8x16_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddlp.v8i16.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v8i16.v16i8")] + fn _vpaddlq_u8(a: uint8x16_t) -> uint16x8_t; + } + unsafe { _vpaddlq_u8(a) } +} +#[doc = "Unsigned Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddl_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpaddl.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddlp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpaddl_u16(a: uint16x4_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddlp.v2i32.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v2i32.v4i16")] + fn _vpaddl_u16(a: uint16x4_t) -> uint32x2_t; + } + unsafe { _vpaddl_u16(a) } +} +#[doc = "Unsigned Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddlq_u16)"] +#[inline] +#[target_feature(enable = 
"neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpaddl.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddlp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpaddlq_u16(a: uint16x8_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddlp.v4i32.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v4i32.v8i16")] + fn _vpaddlq_u16(a: uint16x8_t) -> uint32x4_t; + } + unsafe { _vpaddlq_u16(a) } +} +#[doc = "Unsigned Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddl_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpaddl.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddlp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpaddl_u32(a: uint32x2_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddlp.v1i64.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v1i64.v2i32")] + fn _vpaddl_u32(a: uint32x2_t) -> uint64x1_t; + } + unsafe { _vpaddl_u32(a) } +} +#[doc = "Unsigned Add and Accumulate Long Pairwise."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddlq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vpaddl.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uaddlp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpaddlq_u32(a: uint32x4_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uaddlp.v2i64.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v2i64.v4i32")] + fn _vpaddlq_u32(a: uint32x4_t) -> uint64x2_t; + } + unsafe { _vpaddlq_u32(a) } +} +#[doc = "Folding maximum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmax_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fmaxp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxp.v2f32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v2f32")] + fn _vpmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vpmax_f32(a, b) } +} +#[doc = "Folding maximum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmax_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smaxp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpmax_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smaxp.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v8i8")] + fn _vpmax_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } + unsafe { _vpmax_s8(a, b) } +} +#[doc = "Folding maximum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmax_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smaxp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpmax_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smaxp.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v4i16")] + fn _vpmax_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vpmax_s16(a, b) } +} +#[doc = "Folding maximum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmax_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(smaxp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.smaxp.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v2i32")] + fn _vpmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vpmax_s32(a, 
b) } +} +#[doc = "Folding maximum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmax_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umaxp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpmax_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umaxp.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v8i8")] + fn _vpmax_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; + } + unsafe { _vpmax_u8(a, b) } +} +#[doc = "Folding maximum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmax_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umaxp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpmax_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umaxp.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v4i16")] + fn _vpmax_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; + } + unsafe { _vpmax_u16(a, b) } +} +#[doc = "Folding maximum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmax_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(umaxp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpmax_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.umaxp.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v2i32")] + fn _vpmax_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; + } + unsafe { _vpmax_u32(a, b) } +} +#[doc = "Folding minimum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmin_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), 
+ assert_instr(fminp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpmin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminp.v2f32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v2f32")] + fn _vpmin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vpmin_f32(a, b) } +} +#[doc = "Folding minimum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmin_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sminp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpmin_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sminp.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v8i8")] + fn _vpmin_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } + unsafe { _vpmin_s8(a, b) } +} +#[doc = "Folding minimum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmin_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sminp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpmin_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sminp.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v4i16")] + fn _vpmin_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vpmin_s16(a, b) } +} +#[doc = "Folding minimum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmin_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sminp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpmin_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = 
"llvm.aarch64.neon.sminp.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v2i32")] + fn _vpmin_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vpmin_s32(a, b) } +} +#[doc = "Folding minimum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmin_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uminp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpmin_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uminp.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v8i8")] + fn _vpmin_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; + } + unsafe { _vpmin_u8(a, b) } +} +#[doc = "Folding minimum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmin_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uminp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpmin_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uminp.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v4i16")] + fn _vpmin_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; + } + unsafe { _vpmin_u16(a, b) } +} +#[doc = "Folding minimum of adjacent pairs"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpmin_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uminp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vpmin_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uminp.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v2i32")] + fn _vpmin_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; + } + unsafe { _vpmin_u32(a, b) } +} +#[doc = "Signed saturating Absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqabs_s8)"] +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqabs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqabs_s8(a: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqabs.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v8i8")] + fn _vqabs_s8(a: int8x8_t) -> int8x8_t; + } + unsafe { _vqabs_s8(a) } +} +#[doc = "Signed saturating Absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqabsq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqabs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqabsq_s8(a: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqabs.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v16i8")] + fn _vqabsq_s8(a: int8x16_t) -> int8x16_t; + } + unsafe { _vqabsq_s8(a) } +} +#[doc = "Signed saturating Absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqabs_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqabs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqabs_s16(a: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqabs.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v4i16")] + fn _vqabs_s16(a: int16x4_t) -> int16x4_t; + } + unsafe { _vqabs_s16(a) } +} +#[doc = "Signed saturating Absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqabsq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqabs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqabsq_s16(a: int16x8_t) -> int16x8_t { + unsafe extern 
"unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqabs.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v8i16")] + fn _vqabsq_s16(a: int16x8_t) -> int16x8_t; + } + unsafe { _vqabsq_s16(a) } +} +#[doc = "Signed saturating Absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqabs_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqabs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqabs_s32(a: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqabs.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v2i32")] + fn _vqabs_s32(a: int32x2_t) -> int32x2_t; + } + unsafe { _vqabs_s32(a) } +} +#[doc = "Signed saturating Absolute value"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqabsq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqabs) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqabsq_s32(a: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqabs.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v4i32")] + fn _vqabsq_s32(a: int32x4_t) -> int32x4_t; + } + unsafe { _vqabsq_s32(a) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqadd_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqadd.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v8i8")] + fn _vqadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } + unsafe { _vqadd_s8(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqadd.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v16i8")] + fn _vqaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } + unsafe { _vqaddq_s8(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqadd_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqadd.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v4i16")] + fn _vqadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vqadd_s16(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqadd.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v8i16")] + fn _vqaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vqaddq_s16(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqadd_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + 
unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqadd.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v2i32")] + fn _vqadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vqadd_s32(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqadd.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v4i32")] + fn _vqaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vqaddq_s32(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqadd_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqadd_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqadd.v1i64" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v1i64")] + fn _vqadd_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t; + } + unsafe { _vqadd_s64(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqadd.v2i64" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v2i64")] + fn _vqaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; + } + unsafe { _vqaddq_s64(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqadd_u8)"] +#[inline] 
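+// Usage sketch (illustrative only, assuming NEON is available): the `vqadd_*` family clamps on
+// overflow instead of wrapping. With every lane of two `int8x8_t` vectors set to 100,
+// `vqadd_s8(vdup_n_s8(100), vdup_n_s8(100))` yields lanes of 127 (`i8::MAX`), whereas a plain
+// wrapping add would yield -56. The unsigned forms clamp to the unsigned maximum, e.g.
+// `vqadd_u8` saturates at 255.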
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqadd.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v8i8")] + fn _vqadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; + } + unsafe { _vqadd_u8(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqadd.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v16i8")] + fn _vqaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + } + unsafe { _vqaddq_u8(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqadd_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqadd.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v4i16")] + fn _vqadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; + } + unsafe { _vqadd_u16(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = 
"111800") +)] +pub fn vqaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqadd.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v8i16")] + fn _vqaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; + } + unsafe { _vqaddq_u16(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqadd_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqadd.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v2i32")] + fn _vqadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; + } + unsafe { _vqadd_u32(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqadd.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v4i32")] + fn _vqaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + unsafe { _vqaddq_u32(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqadd_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqadd_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqadd.v1i64" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v1i64")] + fn _vqadd_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t; + } + unsafe { _vqadd_u64(a, b) } +} +#[doc = "Saturating add"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqadd.v2i64" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v2i64")] + fn _vqaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; + } + unsafe { _vqaddq_u64(a, b) } +} +#[doc = "Vector widening saturating doubling multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmlal, N = 2) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmlal_lane_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + static_assert_uimm_bits!(N, 2); + vqaddq_s32(a, vqdmull_lane_s16::<N>(b, c)) +} +#[doc = "Vector widening saturating doubling multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmlal, N = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmlal_lane_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + static_assert_uimm_bits!(N, 1); + vqaddq_s64(a, vqdmull_lane_s32::<N>(b, c)) +} +#[doc = "Vector widening saturating doubling multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmlal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmlal_n_s16(a: int32x4_t, b: int16x4_t, 
c: i16) -> int32x4_t { + vqaddq_s32(a, vqdmull_n_s16(b, c)) +} +#[doc = "Vector widening saturating doubling multiply accumulate with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmlal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmlal_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t { + vqaddq_s64(a, vqdmull_n_s32(b, c)) +} +#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmlal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmlal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + vqaddq_s32(a, vqdmull_s16(b, c)) +} +#[doc = "Signed saturating doubling multiply-add long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlal_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmlal) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmlal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + vqaddq_s64(a, vqdmull_s32(b, c)) +} +#[doc = "Vector widening saturating doubling multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmlsl, N = 2) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmlsl_lane_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + static_assert_uimm_bits!(N, 2); + vqsubq_s32(a, vqdmull_lane_s16::<N>(b, c)) +} +#[doc = "Vector widening saturating doubling multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_lane_s32)"] 
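+// Usage sketch (illustrative only; example values are made up): `vqdmlal_*` computes a
+// saturating "doubling multiply long" and accumulates it, i.e. each output lane is
+// `a + sat(2 * b * c)` at the widened element type, while `vqdmlsl_*` subtracts the same
+// product. With `a` zeroed and every lane of `b` and `c` set to 1000_i16,
+// `vqdmlal_s16(a, b, c)` yields int32 lanes of 2_000_000 and `vqdmlsl_s16(a, b, c)`
+// yields -2_000_000.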
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl, N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmlsl, N = 1) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmlsl_lane_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + static_assert_uimm_bits!(N, 1); + vqsubq_s64(a, vqdmull_lane_s32::<N>(b, c)) +} +#[doc = "Vector widening saturating doubling multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmlsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmlsl_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t { + vqsubq_s32(a, vqdmull_n_s16(b, c)) +} +#[doc = "Vector widening saturating doubling multiply subtract with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmlsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmlsl_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t { + vqsubq_s64(a, vqdmull_n_s32(b, c)) +} +#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmlsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmlsl_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + vqsubq_s32(a, vqdmull_s16(b, c)) +} +#[doc = "Signed saturating doubling multiply-subtract long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsl_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))] +#[cfg_attr( + all(test, any(target_arch = 
"arm64ec")), + assert_instr(sqdmlsl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + vqsubq_s64(a, vqdmull_s32(b, c)) +} +#[doc = "Vector saturating doubling multiply high by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulh_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmulh, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmulh_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { vqdmulh_s16(a, vdup_n_s16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Vector saturating doubling multiply high by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhq_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmulh, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmulhq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { vqdmulhq_s16(a, vdupq_n_s16(simd_extract!(b, LANE as u32))) } +} +#[doc = "Vector saturating doubling multiply high by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulh_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmulh, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmulh_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vqdmulh_s32(a, vdup_n_s32(simd_extract!(b, LANE as u32))) } +} +#[doc = "Vector saturating doubling multiply high by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhq_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = 
"aarch64", target_arch = "arm64ec")), + assert_instr(sqdmulh, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmulhq_laneq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { vqdmulhq_s32(a, vdupq_n_s32(simd_extract!(b, LANE as u32))) } +} +#[doc = "Vector saturating doubling multiply high with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulh_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmulh) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmulh_n_s16(a: int16x4_t, b: i16) -> int16x4_t { + let b: int16x4_t = vdup_n_s16(b); + vqdmulh_s16(a, b) +} +#[doc = "Vector saturating doubling multiply high with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmulh) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmulhq_n_s16(a: int16x8_t, b: i16) -> int16x8_t { + let b: int16x8_t = vdupq_n_s16(b); + vqdmulhq_s16(a, b) +} +#[doc = "Vector saturating doubling multiply high with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulh_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmulh) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmulh_n_s32(a: int32x2_t, b: i32) -> int32x2_t { + let b: int32x2_t = vdup_n_s32(b); + vqdmulh_s32(a, b) +} +#[doc = "Vector saturating doubling multiply high with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmulh) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmulhq_n_s32(a: int32x4_t, b: i32) -> int32x4_t { + let b: int32x4_t = vdupq_n_s32(b); + vqdmulhq_s32(a, b) +} +#[doc = "Signed saturating doubling multiply returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmulh) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmulh_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqdmulh.v4i16" + )] + fn _vqdmulh_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vqdmulh_s16(a, b) } +} +#[doc = "Signed saturating doubling multiply returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmulh) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmulhq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v8i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqdmulh.v8i16" + )] + fn _vqdmulhq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vqdmulhq_s16(a, b) } +} +#[doc = "Signed saturating doubling multiply returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulh_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmulh) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmulh_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqdmulh.v2i32" + )] + fn _vqdmulh_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vqdmulh_s32(a, b) } +} +#[doc = "Signed saturating doubling multiply returning high half"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmulh) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqdmulh.v4i32" + )] + fn _vqdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vqdmulhq_s32(a, b) } +} +#[doc = "Vector saturating doubling long multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmull, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmull_lane_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + static_assert_uimm_bits!(N, 2); + unsafe { + let b: int16x4_t = simd_shuffle!(b, b, [N as u32, N as u32, N as u32, N as u32]); + vqdmull_s16(a, b) + } +} +#[doc = "Vector saturating doubling long multiply by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull, N = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmull, N = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmull_lane_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + static_assert_uimm_bits!(N, 1); + unsafe { + let b: int32x2_t = simd_shuffle!(b, b, [N as u32, N as u32]); + vqdmull_s32(a, b) + } +} +#[doc = "Vector saturating doubling long multiply with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmull) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue 
= "111800") +)] +pub fn vqdmull_n_s16(a: int16x4_t, b: i16) -> int32x4_t { + vqdmull_s16(a, vdup_n_s16(b)) +} +#[doc = "Vector saturating doubling long multiply with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmull) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmull_n_s32(a: int32x2_t, b: i32) -> int64x2_t { + vqdmull_s32(a, vdup_n_s32(b)) +} +#[doc = "Signed saturating doubling multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmull) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmull.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqdmull.v4i32" + )] + fn _vqdmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t; + } + unsafe { _vqdmull_s16(a, b) } +} +#[doc = "Signed saturating doubling multiply long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqdmull) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqdmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmull.v2i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqdmull.v2i64" + )] + fn _vqdmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t; + } + unsafe { _vqdmull_s32(a, b) } +} +#[doc = "Signed saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqxtn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") 
+)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqmovn_s16(a: int16x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovns.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqxtn.v8i8" + )] + fn _vqmovn_s16(a: int16x8_t) -> int8x8_t; + } + unsafe { _vqmovn_s16(a) } +} +#[doc = "Signed saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqxtn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqmovn_s32(a: int32x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovns.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqxtn.v4i16" + )] + fn _vqmovn_s32(a: int32x4_t) -> int16x4_t; + } + unsafe { _vqmovn_s32(a) } +} +#[doc = "Signed saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqxtn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqmovn_s64(a: int64x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovns.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqxtn.v2i32" + )] + fn _vqmovn_s64(a: int64x2_t) -> int32x2_t; + } + unsafe { _vqmovn_s64(a) } +} +#[doc = "Unsigned saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqxtn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqmovn_u16(a: uint16x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnu.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqxtn.v8i8" + )] + fn _vqmovn_u16(a: uint16x8_t) -> uint8x8_t; + } + unsafe { _vqmovn_u16(a) } +} +#[doc = "Unsigned saturating extract 
narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqxtn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqmovn_u32(a: uint32x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnu.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqxtn.v4i16" + )] + fn _vqmovn_u32(a: uint32x4_t) -> uint16x4_t; + } + unsafe { _vqmovn_u32(a) } +} +#[doc = "Unsigned saturating extract narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqxtn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqmovn_u64(a: uint64x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnu.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqxtn.v2i32" + )] + fn _vqmovn_u64(a: uint64x2_t) -> uint32x2_t; + } + unsafe { _vqmovn_u64(a) } +} +#[doc = "Signed saturating extract unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovun_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovun))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqxtun) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqmovun_s16(a: int16x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnsu.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqxtun.v8i8" + )] + fn _vqmovun_s16(a: int16x8_t) -> uint8x8_t; + } + unsafe { _vqmovun_s16(a) } +} +#[doc = "Signed saturating extract unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovun_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovun))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqxtun) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = 
"neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqmovun_s32(a: int32x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnsu.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqxtun.v4i16" + )] + fn _vqmovun_s32(a: int32x4_t) -> uint16x4_t; + } + unsafe { _vqmovun_s32(a) } +} +#[doc = "Signed saturating extract unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovun_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovun))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqxtun) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqmovun_s64(a: int64x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnsu.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqxtun.v2i32" + )] + fn _vqmovun_s64(a: int64x2_t) -> uint32x2_t; + } + unsafe { _vqmovun_s64(a) } +} +#[doc = "Signed saturating negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqneg_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqneg) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqneg_s8(a: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqneg.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v8i8")] + fn _vqneg_s8(a: int8x8_t) -> int8x8_t; + } + unsafe { _vqneg_s8(a) } +} +#[doc = "Signed saturating negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqnegq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqneg) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqnegq_s8(a: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqneg.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v16i8")] + fn _vqnegq_s8(a: int8x16_t) -> int8x16_t; + } + unsafe { _vqnegq_s8(a) } +} 
+#[doc = "Signed saturating negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqneg_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqneg) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqneg_s16(a: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqneg.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v4i16")] + fn _vqneg_s16(a: int16x4_t) -> int16x4_t; + } + unsafe { _vqneg_s16(a) } +} +#[doc = "Signed saturating negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqnegq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqneg) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqnegq_s16(a: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqneg.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v8i16")] + fn _vqnegq_s16(a: int16x8_t) -> int16x8_t; + } + unsafe { _vqnegq_s16(a) } +} +#[doc = "Signed saturating negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqneg_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqneg) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqneg_s32(a: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqneg.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v2i32")] + fn _vqneg_s32(a: int32x2_t) -> int32x2_t; + } + unsafe { _vqneg_s32(a) } +} +#[doc = "Signed saturating negate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqnegq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqneg) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = 
"1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqnegq_s32(a: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqneg.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v4i32")] + fn _vqnegq_s32(a: int32x4_t) -> int32x4_t; + } + unsafe { _vqnegq_s32(a) } +} +#[doc = "Vector rounding saturating doubling multiply high by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulh_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulh_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let b: int16x4_t = + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vqrdmulh_s16(a, b) + } +} +#[doc = "Vector rounding saturating doubling multiply high by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulh_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulh_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let b: int32x2_t = simd_shuffle!(b, b, [LANE as u32, LANE as u32]); + vqrdmulh_s32(a, b) + } +} +#[doc = "Vector rounding saturating doubling multiply high by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulh_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulh_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + let b: int16x4_t = + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vqrdmulh_s16(a, b) + } +} +#[doc = "Vector rounding saturating doubling multiply high by scalar"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulh_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulh_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let b: int32x2_t = simd_shuffle!(b, b, [LANE as u32, LANE as u32]); + vqrdmulh_s32(a, b) + } +} +#[doc = "Vector rounding saturating doubling multiply high by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhq_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulhq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let b: int16x8_t = simd_shuffle!( + b, + b, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ); + vqrdmulhq_s16(a, b) + } +} +#[doc = "Vector rounding saturating doubling multiply high by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhq_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulhq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let b: int32x4_t = + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vqrdmulhq_s32(a, b) + } +} +#[doc = "Vector rounding saturating doubling multiply high by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhq_laneq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = 
"neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulhq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { + let b: int16x8_t = simd_shuffle!( + b, + b, + [ + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32, + LANE as u32 + ] + ); + vqrdmulhq_s16(a, b) + } +} +#[doc = "Vector rounding saturating doubling multiply high by scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhq_laneq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh, LANE = 1) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulhq_laneq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { + let b: int32x4_t = + simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vqrdmulhq_s32(a, b) + } +} +#[doc = "Vector saturating rounding doubling multiply high with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulh_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulh_n_s16(a: int16x4_t, b: i16) -> int16x4_t { + vqrdmulh_s16(a, vdup_n_s16(b)) +} +#[doc = "Vector saturating rounding doubling multiply high with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulhq_n_s16(a: int16x8_t, b: i16) -> int16x8_t { + vqrdmulhq_s16(a, vdupq_n_s16(b)) +} +#[doc = "Vector saturating rounding doubling multiply high with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulh_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh) +)] +#[cfg_attr( 
+ not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulh_n_s32(a: int32x2_t, b: i32) -> int32x2_t { + vqrdmulh_s32(a, vdup_n_s32(b)) +} +#[doc = "Vector saturating rounding doubling multiply high with scalar"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulhq_n_s32(a: int32x4_t, b: i32) -> int32x4_t { + vqrdmulhq_s32(a, vdupq_n_s32(b)) +} +#[doc = "Signed saturating rounding doubling multiply returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulh_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulh_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrdmulh.v4i16" + )] + fn _vqrdmulh_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vqrdmulh_s16(a, b) } +} +#[doc = "Signed saturating rounding doubling multiply returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulhq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v8i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrdmulh.v8i16" + )] + fn _vqrdmulhq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vqrdmulhq_s16(a, b) } +} +#[doc = "Signed saturating rounding doubling multiply returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulh_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulh_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrdmulh.v2i32" + )] + fn _vqrdmulh_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vqrdmulh_s32(a, b) } +} +#[doc = "Signed saturating rounding doubling multiply returning high half"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrdmulh) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrdmulh.v4i32" + )] + fn _vqrdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vqrdmulhq_s32(a, b) } +} +#[doc = "Signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshl_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshl.v8i8" + )] + fn _vqrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } + unsafe { _vqrshl_s8(a, b) } +} +#[doc = "Signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + 
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshl.v16i8" + )] + fn _vqrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } + unsafe { _vqrshlq_s8(a, b) } +} +#[doc = "Signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshl_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshl.v4i16" + )] + fn _vqrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vqrshl_s16(a, b) } +} +#[doc = "Signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v8i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshl.v8i16" + )] + fn _vqrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vqrshlq_s16(a, b) } +} +#[doc = "Signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshl_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshl.v2i32" + )] + fn 
_vqrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vqrshl_s32(a, b) } +} +#[doc = "Signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshl.v4i32" + )] + fn _vqrshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vqrshlq_s32(a, b) } +} +#[doc = "Signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshl_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v1i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshl.v1i64" + )] + fn _vqrshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t; + } + unsafe { _vqrshl_s64(a, b) } +} +#[doc = "Signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v2i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshl.v2i64" + )] + fn _vqrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; + } + unsafe { _vqrshlq_s64(a, b) } +} +#[doc = "Unsigned signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshl_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
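// Editor's note, illustrative sketch only (not part of the generated source): the
// vqrshl_* family shifts each lane by a per-lane *signed* count; positive counts
// shift left with saturation, negative counts shift right with rounding to nearest.
// A minimal demo, assuming an AArch64 target with NEON; the function name
// `rounding_shift_demo` is ours, and the `unsafe` block keeps the sketch compiling
// on older toolchains too.
#[cfg(target_arch = "aarch64")]
fn rounding_shift_demo() {
    use core::arch::aarch64::*;
    unsafe {
        // 100 << 2 = 400 does not fit in i8, so the result saturates to i8::MAX.
        assert_eq!(vget_lane_s8::<0>(vqrshl_s8(vdup_n_s8(100), vdup_n_s8(2))), i8::MAX);
        // A count of -2 is a rounding right shift: 7 / 4 = 1.75 rounds to 2.
        assert_eq!(vget_lane_s8::<0>(vqrshl_s8(vdup_n_s8(7), vdup_n_s8(-2))), 2);
    }
}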
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqrshl.v8i8" + )] + fn _vqrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; + } + unsafe { _vqrshl_u8(a, b) } +} +#[doc = "Unsigned signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqrshl.v16i8" + )] + fn _vqrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; + } + unsafe { _vqrshlq_u8(a, b) } +} +#[doc = "Unsigned signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshl_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqrshl.v4i16" + )] + fn _vqrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; + } + unsafe { _vqrshl_u16(a, b) } +} +#[doc = "Unsigned signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v8i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqrshl.v8i16" + )] + fn _vqrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; + } + unsafe { _vqrshlq_u16(a, b) } +} +#[doc = "Unsigned signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshl_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqrshl.v2i32" + )] + fn _vqrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; + } + unsafe { _vqrshl_u32(a, b) } +} +#[doc = "Unsigned signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqrshl.v4i32" + )] + fn _vqrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + } + unsafe { _vqrshlq_u32(a, b) } +} +#[doc = "Unsigned signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshl_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v1i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = 
"llvm.aarch64.neon.uqrshl.v1i64" + )] + fn _vqrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; + } + unsafe { _vqrshl_u64(a, b) } +} +#[doc = "Unsigned signed saturating rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqrshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v2i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqrshl.v2i64" + )] + fn _vqrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; + } + unsafe { _vqrshlq_u64(a, b) } +} +#[doc = "Signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_s16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqrshrn_n_s16(a: int16x8_t) -> int8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v8i8")] + fn _vqrshrn_n_s16(a: int16x8_t, n: int16x8_t) -> int8x8_t; + } + unsafe { + _vqrshrn_n_s16( + a, + const { + int16x8_t([ + -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, + -N as i16, + ]) + }, + ) + } +} +#[doc = "Signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_s32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqrshrn_n_s32(a: int32x4_t) -> int16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v4i16")] + fn _vqrshrn_n_s32(a: int32x4_t, n: int32x4_t) -> int16x4_t; + } + unsafe { + _vqrshrn_n_s32( + a, + const { int32x4_t([-N as i32, -N as i32, -N as i32, -N as i32]) }, + ) + } +} +#[doc = "Signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_s64)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqrshrn_n_s64(a: int64x2_t) -> int32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v2i32")] + fn _vqrshrn_n_s64(a: int64x2_t, n: int64x2_t) -> int32x2_t; + } + unsafe { _vqrshrn_n_s64(a, const { 
int64x2_t([-N as i64, -N as i64]) }) } +} +#[doc = "Signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(sqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrn_n_s16(a: int16x8_t) -> int8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshrn.v8i8" + )] + fn _vqrshrn_n_s16(a: int16x8_t, n: i32) -> int8x8_t; + } + unsafe { _vqrshrn_n_s16(a, N) } +} +#[doc = "Signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(sqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrn_n_s32(a: int32x4_t) -> int16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshrn.v4i16" + )] + fn _vqrshrn_n_s32(a: int32x4_t, n: i32) -> int16x4_t; + } + unsafe { _vqrshrn_n_s32(a, N) } +} +#[doc = "Signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(sqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrn_n_s64(a: int64x2_t) -> int32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshrn.v2i32" + )] + fn _vqrshrn_n_s64(a: int64x2_t, n: i32) -> int32x2_t; + } + unsafe { _vqrshrn_n_s64(a, N) } +} +#[doc = "Unsigned signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_u16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqrshrn_n_u16(a: uint16x8_t) -> uint8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v8i8")] + fn _vqrshrn_n_u16(a: uint16x8_t, n: uint16x8_t) -> uint8x8_t; + } + unsafe { + _vqrshrn_n_u16( + a, + const { + uint16x8_t([ + -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, + -N as u16, + ]) + }, + ) + } +} +#[doc = "Unsigned signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_u32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqrshrn_n_u32(a: 
uint32x4_t) -> uint16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v4i16")] + fn _vqrshrn_n_u32(a: uint32x4_t, n: uint32x4_t) -> uint16x4_t; + } + unsafe { + _vqrshrn_n_u32( + a, + const { uint32x4_t([-N as u32, -N as u32, -N as u32, -N as u32]) }, + ) + } +} +#[doc = "Unsigned signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_u64)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqrshrn_n_u64(a: uint64x2_t) -> uint32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v2i32")] + fn _vqrshrn_n_u64(a: uint64x2_t, n: uint64x2_t) -> uint32x2_t; + } + unsafe { _vqrshrn_n_u64(a, const { uint64x2_t([-N as u64, -N as u64]) }) } +} +#[doc = "Unsigned signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(uqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrn_n_u16(a: uint16x8_t) -> uint8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqrshrn.v8i8" + )] + fn _vqrshrn_n_u16(a: uint16x8_t, n: i32) -> uint8x8_t; + } + unsafe { _vqrshrn_n_u16(a, N) } +} +#[doc = "Unsigned signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(uqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrn_n_u32(a: uint32x4_t) -> uint16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqrshrn.v4i16" + )] + fn _vqrshrn_n_u32(a: uint32x4_t, n: i32) -> uint16x4_t; + } + unsafe { _vqrshrn_n_u32(a, N) } +} +#[doc = "Unsigned signed saturating rounded shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(uqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrn_n_u64(a: uint64x2_t) -> uint32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqrshrn.v2i32" + )] + fn _vqrshrn_n_u64(a: uint64x2_t, n: i32) -> uint32x2_t; + } + unsafe { _vqrshrn_n_u64(a, N) } +} +#[doc = "Signed saturating rounded shift right unsigned narrow"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrun_n_s16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqrshrun_n_s16(a: int16x8_t) -> uint8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v8i8")] + fn _vqrshrun_n_s16(a: int16x8_t, n: int16x8_t) -> uint8x8_t; + } + unsafe { + _vqrshrun_n_s16( + a, + const { + int16x8_t([ + -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, + -N as i16, + ]) + }, + ) + } +} +#[doc = "Signed saturating rounded shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrun_n_s32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqrshrun_n_s32(a: int32x4_t) -> uint16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v4i16")] + fn _vqrshrun_n_s32(a: int32x4_t, n: int32x4_t) -> uint16x4_t; + } + unsafe { + _vqrshrun_n_s32( + a, + const { int32x4_t([-N as i32, -N as i32, -N as i32, -N as i32]) }, + ) + } +} +#[doc = "Signed saturating rounded shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrun_n_s64)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqrshrun_n_s64(a: int64x2_t) -> uint32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v2i32")] + fn _vqrshrun_n_s64(a: int64x2_t, n: int64x2_t) -> uint32x2_t; + } + unsafe { _vqrshrun_n_s64(a, const { int64x2_t([-N as i64, -N as i64]) }) } +} +#[doc = "Signed saturating rounded shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrun_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(sqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrun_n_s16(a: int16x8_t) -> uint8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshrun.v8i8" + )] + fn _vqrshrun_n_s16(a: int16x8_t, n: i32) -> uint8x8_t; + } + unsafe { _vqrshrun_n_s16(a, N) } +} +#[doc = "Signed saturating rounded shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrun_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(sqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrun_n_s32(a: int32x4_t) -> 
uint16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshrun.v4i16" + )] + fn _vqrshrun_n_s32(a: int32x4_t, n: i32) -> uint16x4_t; + } + unsafe { _vqrshrun_n_s32(a, N) } +} +#[doc = "Signed saturating rounded shift right unsigned narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrun_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(sqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqrshrun_n_s64(a: int64x2_t) -> uint32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqrshrun.v2i32" + )] + fn _vqrshrun_n_s64(a: int64x2_t, n: i32) -> uint32x2_t; + } + unsafe { _vqrshrun_n_s64(a, N) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_n_s8(a: int8x8_t) -> int8x8_t { + static_assert_uimm_bits!(N, 3); + vqshl_s8(a, vdup_n_s8(N as _)) +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_n_s8(a: int8x16_t) -> int8x16_t { + static_assert_uimm_bits!(N, 3); + vqshlq_s8(a, vdupq_n_s8(N as _)) +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_n_s16(a: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(N, 4); + vqshl_s16(a, vdup_n_s16(N as _)) +} +#[doc = "Signed saturating shift 
left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_n_s16(a: int16x8_t) -> int16x8_t { + static_assert_uimm_bits!(N, 4); + vqshlq_s16(a, vdupq_n_s16(N as _)) +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_n_s32(a: int32x2_t) -> int32x2_t { + static_assert_uimm_bits!(N, 5); + vqshl_s32(a, vdup_n_s32(N as _)) +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_n_s32(a: int32x4_t) -> int32x4_t { + static_assert_uimm_bits!(N, 5); + vqshlq_s32(a, vdupq_n_s32(N as _)) +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_n_s64(a: int64x1_t) -> int64x1_t { + static_assert_uimm_bits!(N, 6); + vqshl_s64(a, vdup_n_s64(N as _)) +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_n_s64(a: int64x2_t) -> int64x2_t { + static_assert_uimm_bits!(N, 6); + vqshlq_s64(a, vdupq_n_s64(N as _)) +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_n_u8(a: uint8x8_t) -> uint8x8_t { + static_assert_uimm_bits!(N, 3); + vqshl_u8(a, vdup_n_s8(N as _)) +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_n_u8(a: uint8x16_t) -> uint8x16_t { + static_assert_uimm_bits!(N, 3); + vqshlq_u8(a, vdupq_n_s8(N as _)) +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_n_u16(a: uint16x4_t) -> uint16x4_t { + static_assert_uimm_bits!(N, 4); + vqshl_u16(a, vdup_n_s16(N as _)) +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + 
stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_n_u16(a: uint16x8_t) -> uint16x8_t { + static_assert_uimm_bits!(N, 4); + vqshlq_u16(a, vdupq_n_s16(N as _)) +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_n_u32(a: uint32x2_t) -> uint32x2_t { + static_assert_uimm_bits!(N, 5); + vqshl_u32(a, vdup_n_s32(N as _)) +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_n_u32(a: uint32x4_t) -> uint32x4_t { + static_assert_uimm_bits!(N, 5); + vqshlq_u32(a, vdupq_n_s32(N as _)) +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_n_u64(a: uint64x1_t) -> uint64x1_t { + static_assert_uimm_bits!(N, 6); + vqshl_u64(a, vdup_n_s64(N as _)) +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_n_u64(a: uint64x2_t) -> uint64x2_t { + 
static_assert_uimm_bits!(N, 6); + vqshlq_u64(a, vdupq_n_s64(N as _)) +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshl.v8i8" + )] + fn _vqshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } + unsafe { _vqshl_s8(a, b) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshl.v16i8" + )] + fn _vqshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } + unsafe { _vqshlq_s8(a, b) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshl.v4i16" + )] + fn _vqshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vqshl_s16(a, b) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = 
"aarch64", target_arch = "arm64ec")), + assert_instr(sqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v8i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshl.v8i16" + )] + fn _vqshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vqshlq_s16(a, b) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshl.v2i32" + )] + fn _vqshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vqshl_s32(a, b) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshl.v4i32" + )] + fn _vqshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vqshlq_s32(a, b) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = 
"llvm.arm.neon.vqshifts.v1i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshl.v1i64" + )] + fn _vqshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t; + } + unsafe { _vqshl_s64(a, b) } +} +#[doc = "Signed saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v2i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshl.v2i64" + )] + fn _vqshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; + } + unsafe { _vqshlq_s64(a, b) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqshl.v8i8" + )] + fn _vqshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; + } + unsafe { _vqshl_u8(a, b) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqshl.v16i8" + )] + fn _vqshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; + } + unsafe { _vqshlq_u8(a, b) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_u16)"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqshl.v4i16" + )] + fn _vqshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; + } + unsafe { _vqshl_u16(a, b) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v8i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqshl.v8i16" + )] + fn _vqshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; + } + unsafe { _vqshlq_u16(a, b) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqshl.v2i32" + )] + fn _vqshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; + } + unsafe { _vqshl_u32(a, b) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + 
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqshl.v4i32" + )] + fn _vqshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + } + unsafe { _vqshlq_u32(a, b) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshl_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v1i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqshl.v1i64" + )] + fn _vqshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; + } + unsafe { _vqshl_u64(a, b) } +} +#[doc = "Unsigned saturating shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v2i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqshl.v2i64" + )] + fn _vqshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; + } + unsafe { _vqshlq_u64(a, b) } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlu_n_s8)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqshlu_n_s8(a: int8x8_t) -> uint8x8_t { + static_assert_uimm_bits!(N, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v8i8")] + fn _vqshlu_n_s8(a: int8x8_t, n: int8x8_t) -> uint8x8_t; + } + unsafe { + _vqshlu_n_s8( + a, + const { + int8x8_t([ + N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, + ]) + }, + ) + } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshluq_n_s8)"] +#[inline] 
+#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqshluq_n_s8(a: int8x16_t) -> uint8x16_t { + static_assert_uimm_bits!(N, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v16i8")] + fn _vqshluq_n_s8(a: int8x16_t, n: int8x16_t) -> uint8x16_t; + } + unsafe { + _vqshluq_n_s8( + a, + const { + int8x16_t([ + N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, + N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, + ]) + }, + ) + } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlu_n_s16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqshlu_n_s16(a: int16x4_t) -> uint16x4_t { + static_assert_uimm_bits!(N, 4); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v4i16")] + fn _vqshlu_n_s16(a: int16x4_t, n: int16x4_t) -> uint16x4_t; + } + unsafe { + _vqshlu_n_s16( + a, + const { int16x4_t([N as i16, N as i16, N as i16, N as i16]) }, + ) + } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshluq_n_s16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqshluq_n_s16(a: int16x8_t) -> uint16x8_t { + static_assert_uimm_bits!(N, 4); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v8i16")] + fn _vqshluq_n_s16(a: int16x8_t, n: int16x8_t) -> uint16x8_t; + } + unsafe { + _vqshluq_n_s16( + a, + const { + int16x8_t([ + N as i16, N as i16, N as i16, N as i16, N as i16, N as i16, N as i16, N as i16, + ]) + }, + ) + } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlu_n_s32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqshlu_n_s32(a: int32x2_t) -> uint32x2_t { + static_assert_uimm_bits!(N, 5); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v2i32")] + fn _vqshlu_n_s32(a: int32x2_t, n: int32x2_t) -> uint32x2_t; + } + unsafe { _vqshlu_n_s32(a, const { int32x2_t([N as i32, N as i32]) }) } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshluq_n_s32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqshluq_n_s32(a: int32x4_t) -> uint32x4_t { + static_assert_uimm_bits!(N, 5); + unsafe extern 
"unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v4i32")] + fn _vqshluq_n_s32(a: int32x4_t, n: int32x4_t) -> uint32x4_t; + } + unsafe { + _vqshluq_n_s32( + a, + const { int32x4_t([N as i32, N as i32, N as i32, N as i32]) }, + ) + } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlu_n_s64)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqshlu_n_s64(a: int64x1_t) -> uint64x1_t { + static_assert_uimm_bits!(N, 6); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v1i64")] + fn _vqshlu_n_s64(a: int64x1_t, n: int64x1_t) -> uint64x1_t; + } + unsafe { _vqshlu_n_s64(a, const { int64x1_t([N as i64]) }) } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshluq_n_s64)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqshluq_n_s64(a: int64x2_t) -> uint64x2_t { + static_assert_uimm_bits!(N, 6); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v2i64")] + fn _vqshluq_n_s64(a: int64x2_t, n: int64x2_t) -> uint64x2_t; + } + unsafe { _vqshluq_n_s64(a, const { int64x2_t([N as i64, N as i64]) }) } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlu_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(sqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshlu_n_s8(a: int8x8_t) -> uint8x8_t { + static_assert_uimm_bits!(N, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshlu.v8i8" + )] + fn _vqshlu_n_s8(a: int8x8_t, n: int8x8_t) -> uint8x8_t; + } + unsafe { + _vqshlu_n_s8( + a, + const { + int8x8_t([ + N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, + ]) + }, + ) + } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshluq_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(sqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshluq_n_s8(a: int8x16_t) -> uint8x16_t { + static_assert_uimm_bits!(N, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshlu.v16i8" + )] + fn _vqshluq_n_s8(a: int8x16_t, n: int8x16_t) -> uint8x16_t; + } + unsafe { + _vqshluq_n_s8( + a, + const { + int8x16_t([ + N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, + N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, + ]) + }, + ) + } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = 
"[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlu_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(sqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshlu_n_s16(a: int16x4_t) -> uint16x4_t { + static_assert_uimm_bits!(N, 4); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshlu.v4i16" + )] + fn _vqshlu_n_s16(a: int16x4_t, n: int16x4_t) -> uint16x4_t; + } + unsafe { + _vqshlu_n_s16( + a, + const { int16x4_t([N as i16, N as i16, N as i16, N as i16]) }, + ) + } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshluq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(sqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshluq_n_s16(a: int16x8_t) -> uint16x8_t { + static_assert_uimm_bits!(N, 4); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshlu.v8i16" + )] + fn _vqshluq_n_s16(a: int16x8_t, n: int16x8_t) -> uint16x8_t; + } + unsafe { + _vqshluq_n_s16( + a, + const { + int16x8_t([ + N as i16, N as i16, N as i16, N as i16, N as i16, N as i16, N as i16, N as i16, + ]) + }, + ) + } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlu_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(sqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshlu_n_s32(a: int32x2_t) -> uint32x2_t { + static_assert_uimm_bits!(N, 5); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshlu.v2i32" + )] + fn _vqshlu_n_s32(a: int32x2_t, n: int32x2_t) -> uint32x2_t; + } + unsafe { _vqshlu_n_s32(a, const { int32x2_t([N as i32, N as i32]) }) } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshluq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(sqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshluq_n_s32(a: int32x4_t) -> uint32x4_t { + static_assert_uimm_bits!(N, 5); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshlu.v4i32" + )] + fn _vqshluq_n_s32(a: int32x4_t, n: int32x4_t) -> uint32x4_t; + } + unsafe { + _vqshluq_n_s32( + a, + const { int32x4_t([N as i32, N as i32, N as i32, N as i32]) }, + ) + } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlu_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(sqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = 
"1.59.0")] +pub fn vqshlu_n_s64(a: int64x1_t) -> uint64x1_t { + static_assert_uimm_bits!(N, 6); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshlu.v1i64" + )] + fn _vqshlu_n_s64(a: int64x1_t, n: int64x1_t) -> uint64x1_t; + } + unsafe { _vqshlu_n_s64(a, const { int64x1_t([N as i64]) }) } +} +#[doc = "Signed saturating shift left unsigned"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshluq_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(sqshlu, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vqshluq_n_s64(a: int64x2_t) -> uint64x2_t { + static_assert_uimm_bits!(N, 6); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqshlu.v2i64" + )] + fn _vqshluq_n_s64(a: int64x2_t, n: int64x2_t) -> uint64x2_t; + } + unsafe { _vqshluq_n_s64(a, const { int64x2_t([N as i64, N as i64]) }) } +} +#[doc = "Signed saturating shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_s16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqshrn_n_s16(a: int16x8_t) -> int8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v8i8")] + fn _vqshrn_n_s16(a: int16x8_t, n: int16x8_t) -> int8x8_t; + } + unsafe { + _vqshrn_n_s16( + a, + const { + int16x8_t([ + -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, + -N as i16, + ]) + }, + ) + } +} +#[doc = "Signed saturating shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_s32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqshrn_n_s32(a: int32x4_t) -> int16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v4i16")] + fn _vqshrn_n_s32(a: int32x4_t, n: int32x4_t) -> int16x4_t; + } + unsafe { + _vqshrn_n_s32( + a, + const { int32x4_t([-N as i32, -N as i32, -N as i32, -N as i32]) }, + ) + } +} +#[doc = "Signed saturating shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_s64)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vqshrn_n_s64(a: int64x2_t) -> int32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v2i32")] + fn _vqshrn_n_s64(a: int64x2_t, n: int64x2_t) -> int32x2_t; + } + unsafe { _vqshrn_n_s64(a, const { int64x2_t([-N as i64, -N as i64]) }) } +} +#[doc = "Signed saturating shift right narrow"] +#[doc 
= "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_s16)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg(not(target_arch = "arm"))]
+#[cfg_attr(test, assert_instr(sqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vqshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
+    static_assert!(N >= 1 && N <= 8);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.sqshrn.v8i8"
+        )]
+        fn _vqshrn_n_s16(a: int16x8_t, n: i32) -> int8x8_t;
+    }
+    unsafe { _vqshrn_n_s16(a, N) }
+}
+#[doc = "Signed saturating shift right narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_s32)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg(not(target_arch = "arm"))]
+#[cfg_attr(test, assert_instr(sqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vqshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
+    static_assert!(N >= 1 && N <= 16);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.sqshrn.v4i16"
+        )]
+        fn _vqshrn_n_s32(a: int32x4_t, n: i32) -> int16x4_t;
+    }
+    unsafe { _vqshrn_n_s32(a, N) }
+}
+#[doc = "Signed saturating shift right narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_s64)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg(not(target_arch = "arm"))]
+#[cfg_attr(test, assert_instr(sqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vqshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
+    static_assert!(N >= 1 && N <= 32);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.sqshrn.v2i32"
+        )]
+        fn _vqshrn_n_s64(a: int64x2_t, n: i32) -> int32x2_t;
+    }
+    unsafe { _vqshrn_n_s64(a, N) }
+}
+#[doc = "Unsigned saturating shift right narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_u16)"]
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
+pub fn vqshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
+    static_assert!(N >= 1 && N <= 8);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v8i8")]
+        fn _vqshrn_n_u16(a: uint16x8_t, n: uint16x8_t) -> uint8x8_t;
+    }
+    unsafe {
+        _vqshrn_n_u16(
+            a,
+            const {
+                uint16x8_t([
+                    -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16,
+                    -N as u16,
+                ])
+            },
+        )
+    }
+}
+#[doc = "Unsigned saturating shift right narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_u32)"]
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
+pub fn vqshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+    static_assert!(N >= 1 && N <= 16);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v4i16")]
+        fn _vqshrn_n_u32(a: uint32x4_t, n: uint32x4_t) -> uint16x4_t;
+    }
+    unsafe {
+        _vqshrn_n_u32(
+            a,
+            const { uint32x4_t([-N as u32, -N as u32, -N as u32, -N as u32]) },
+        )
+    }
+}
+#[doc = "Unsigned saturating shift right narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_u64)"]
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
+pub fn vqshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
+    static_assert!(N >= 1 && N <= 32);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v2i32")]
+        fn _vqshrn_n_u64(a: uint64x2_t, n: uint64x2_t) -> uint32x2_t;
+    }
+    unsafe { _vqshrn_n_u64(a, const { uint64x2_t([-N as u64, -N as u64]) }) }
+}
+#[doc = "Unsigned saturating shift right narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_u16)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg(not(target_arch = "arm"))]
+#[cfg_attr(test, assert_instr(uqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vqshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
+    static_assert!(N >= 1 && N <= 8);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.uqshrn.v8i8"
+        )]
+        fn _vqshrn_n_u16(a: uint16x8_t, n: i32) -> uint8x8_t;
+    }
+    unsafe { _vqshrn_n_u16(a, N) }
+}
+#[doc = "Unsigned saturating shift right narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_u32)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg(not(target_arch = "arm"))]
+#[cfg_attr(test, assert_instr(uqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vqshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+    static_assert!(N >= 1 && N <= 16);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.uqshrn.v4i16"
+        )]
+        fn _vqshrn_n_u32(a: uint32x4_t, n: i32) -> uint16x4_t;
+    }
+    unsafe { _vqshrn_n_u32(a, N) }
+}
+#[doc = "Unsigned saturating shift right narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_u64)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg(not(target_arch = "arm"))]
+#[cfg_attr(test, assert_instr(uqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vqshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
+    static_assert!(N >= 1 && N <= 32);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.uqshrn.v2i32"
+        )]
+        fn _vqshrn_n_u64(a: uint64x2_t, n: i32) -> uint32x2_t;
+    }
+    unsafe { _vqshrn_n_u64(a, N) }
+}
+#[doc = "Signed saturating shift right unsigned narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrun_n_s16)"]
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
+pub fn vqshrun_n_s16<const N: i32>(a: int16x8_t)
-> uint8x8_t {
+    static_assert!(N >= 1 && N <= 8);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v8i8")]
+        fn _vqshrun_n_s16(a: int16x8_t, n: int16x8_t) -> uint8x8_t;
+    }
+    unsafe {
+        _vqshrun_n_s16(
+            a,
+            const {
+                int16x8_t([
+                    -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16,
+                    -N as i16,
+                ])
+            },
+        )
+    }
+}
+#[doc = "Signed saturating shift right unsigned narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrun_n_s32)"]
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
+pub fn vqshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t {
+    static_assert!(N >= 1 && N <= 16);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v4i16")]
+        fn _vqshrun_n_s32(a: int32x4_t, n: int32x4_t) -> uint16x4_t;
+    }
+    unsafe {
+        _vqshrun_n_s32(
+            a,
+            const { int32x4_t([-N as i32, -N as i32, -N as i32, -N as i32]) },
+        )
+    }
+}
+#[doc = "Signed saturating shift right unsigned narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrun_n_s64)"]
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
+pub fn vqshrun_n_s64<const N: i32>(a: int64x2_t) -> uint32x2_t {
+    static_assert!(N >= 1 && N <= 32);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v2i32")]
+        fn _vqshrun_n_s64(a: int64x2_t, n: int64x2_t) -> uint32x2_t;
+    }
+    unsafe { _vqshrun_n_s64(a, const { int64x2_t([-N as i64, -N as i64]) }) }
+}
+#[doc = "Signed saturating shift right unsigned narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrun_n_s16)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg(not(target_arch = "arm"))]
+#[cfg_attr(test, assert_instr(sqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vqshrun_n_s16<const N: i32>(a: int16x8_t) -> uint8x8_t {
+    static_assert!(N >= 1 && N <= 8);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.sqshrun.v8i8"
+        )]
+        fn _vqshrun_n_s16(a: int16x8_t, n: i32) -> uint8x8_t;
+    }
+    unsafe { _vqshrun_n_s16(a, N) }
+}
+#[doc = "Signed saturating shift right unsigned narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrun_n_s32)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg(not(target_arch = "arm"))]
+#[cfg_attr(test, assert_instr(sqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vqshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t {
+    static_assert!(N >= 1 && N <= 16);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.sqshrun.v4i16"
+        )]
+        fn _vqshrun_n_s32(a: int32x4_t, n: i32) -> uint16x4_t;
+    }
+    unsafe { _vqshrun_n_s32(a, N) }
+}
+#[doc = "Signed saturating shift right unsigned narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrun_n_s64)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg(not(target_arch = "arm"))]
+#[cfg_attr(test, assert_instr(sqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vqshrun_n_s64<const N: i32>(a: int64x2_t) -> uint32x2_t {
+    static_assert!(N >= 1 && N <= 32);
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.sqshrun.v2i32"
+        )]
+        fn _vqshrun_n_s64(a: int64x2_t, n: i32) -> uint32x2_t;
+    }
+    unsafe { _vqshrun_n_s64(a, N) }
+}
+#[doc = "Saturating subtract"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsub_s8)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s8"))]
+#[cfg_attr(
+    all(test, any(target_arch = "aarch64", target_arch = "arm64ec")),
+    assert_instr(sqsub)
+)]
+#[cfg_attr(
+    not(target_arch = "arm"),
+    stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+#[cfg_attr(
+    target_arch = "arm",
+    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
+)]
+pub fn vqsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.sqsub.v8i8"
+        )]
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v8i8")]
+        fn _vqsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+    }
+    unsafe { _vqsub_s8(a, b) }
+}
+#[doc = "Saturating subtract"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubq_s8)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s8"))]
+#[cfg_attr(
+    all(test, any(target_arch = "aarch64", target_arch = "arm64ec")),
+    assert_instr(sqsub)
+)]
+#[cfg_attr(
+    not(target_arch = "arm"),
+    stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+#[cfg_attr(
+    target_arch = "arm",
+    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
+)]
+pub fn vqsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.sqsub.v16i8"
+        )]
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v16i8")]
+        fn _vqsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+    }
+    unsafe { _vqsubq_s8(a, b) }
+}
+#[doc = "Saturating subtract"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsub_s16)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s16"))]
+#[cfg_attr(
+    all(test, any(target_arch = "aarch64", target_arch = "arm64ec")),
+    assert_instr(sqsub)
+)]
+#[cfg_attr(
+    not(target_arch = "arm"),
+    stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+#[cfg_attr(
+    target_arch = "arm",
+    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
+)]
+pub fn vqsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name =
"llvm.aarch64.neon.sqsub.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v4i16")] + fn _vqsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vqsub_s16(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqsub.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v8i16")] + fn _vqsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vqsubq_s16(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsub_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqsub.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v2i32")] + fn _vqsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vqsub_s32(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqsub.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v4i32")] + fn _vqsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vqsubq_s32(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsub_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqsub_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqsub.v1i64" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v1i64")] + fn _vqsub_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t; + } + unsafe { _vqsub_s64(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sqsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqsubq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sqsub.v2i64" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v2i64")] + fn _vqsubq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; + } + unsafe { _vqsubq_s64(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsub_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqsub.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v8i8")] + fn _vqsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; + } + unsafe { _vqsub_u8(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe extern 
"unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqsub.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v16i8")] + fn _vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + } + unsafe { _vqsubq_u8(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsub_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqsub.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v4i16")] + fn _vqsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; + } + unsafe { _vqsub_u16(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqsub.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v8i16")] + fn _vqsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; + } + unsafe { _vqsubq_u16(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsub_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqsub.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v2i32")] + fn _vqsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; + } + unsafe { _vqsub_u32(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubq_u32)"] 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqsub.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v4i32")] + fn _vqsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + unsafe { _vqsubq_u32(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsub_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqsub_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqsub.v1i64" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v1i64")] + fn _vqsub_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t; + } + unsafe { _vqsub_u64(a, b) } +} +#[doc = "Saturating subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uqsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vqsubq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.uqsub.v2i64" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v2i64")] + fn _vqsubq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; + } + unsafe { _vqsubq_u64(a, b) } +} +#[doc = "Rounding Add returning High Narrow (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vraddhn.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(raddhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + 
target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vraddhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t { + let x = vraddhn_s16(b, c); + unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } +} +#[doc = "Rounding Add returning High Narrow (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vraddhn.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(raddhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vraddhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t { + let x = vraddhn_s32(b, c); + unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Rounding Add returning High Narrow (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_high_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vraddhn.i64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(raddhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vraddhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t { + let x = vraddhn_s64(b, c); + unsafe { simd_shuffle!(a, x, [0, 1, 2, 3]) } +} +#[doc = "Rounding Add returning High Narrow (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vraddhn.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(raddhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vraddhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t { + unsafe { + let x: uint8x8_t = transmute(vraddhn_s16(transmute(b), transmute(c))); + simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) + } +} +#[doc = "Rounding Add returning High Narrow (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vraddhn.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(raddhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", 
+ unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vraddhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t { + unsafe { + let x: uint16x4_t = transmute(vraddhn_s32(transmute(b), transmute(c))); + simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7]) + } +} +#[doc = "Rounding Add returning High Narrow (high half)."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_high_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vraddhn.i64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(raddhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vraddhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t { + unsafe { + let x: uint32x2_t = transmute(vraddhn_s64(transmute(b), transmute(c))); + simd_shuffle!(a, x, [0, 1, 2, 3]) + } +} +#[doc = "Rounding Add returning High Narrow."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vraddhn.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(raddhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vraddhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.raddhn.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vraddhn.v8i8")] + fn _vraddhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t; + } + unsafe { _vraddhn_s16(a, b) } +} +#[doc = "Rounding Add returning High Narrow."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vraddhn.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(raddhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vraddhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.raddhn.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vraddhn.v4i16")] + fn _vraddhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t; + } + unsafe { _vraddhn_s32(a, b) } +} +#[doc = "Rounding Add returning High Narrow."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch 
= "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vraddhn.i64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(raddhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vraddhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.raddhn.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vraddhn.v2i32")] + fn _vraddhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t; + } + unsafe { _vraddhn_s64(a, b) } +} +#[doc = "Rounding Add returning High Narrow."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vraddhn.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(raddhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vraddhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t { + unsafe { transmute(vraddhn_s16(transmute(a), transmute(b))) } +} +#[doc = "Rounding Add returning High Narrow."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vraddhn.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(raddhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vraddhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint16x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vraddhn_s16(transmute(a), transmute(b))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Rounding Add returning High Narrow."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vraddhn.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(raddhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vraddhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t { + unsafe { 
transmute(vraddhn_s32(transmute(a), transmute(b))) } +} +#[doc = "Rounding Add returning High Narrow."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vraddhn.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(raddhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vraddhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + let b: uint32x4_t = unsafe { simd_shuffle!(b, b, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x4_t = transmute(vraddhn_s32(transmute(a), transmute(b))); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Rounding Add returning High Narrow."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vraddhn.i64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(raddhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vraddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t { + unsafe { transmute(vraddhn_s64(transmute(a), transmute(b))) } +} +#[doc = "Rounding Add returning High Narrow."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vraddhn.i64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(raddhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vraddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + let b: uint64x2_t = unsafe { simd_shuffle!(b, b, [1, 0]) }; + unsafe { + let ret_val: uint32x2_t = transmute(vraddhn_s64(transmute(a), transmute(b))); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Reciprocal estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpe_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frecpe) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrecpe_f16(a: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + 
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v4f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecpe.v4f16" + )] + fn _vrecpe_f16(a: float16x4_t) -> float16x4_t; + } + unsafe { _vrecpe_f16(a) } +} +#[doc = "Reciprocal estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpeq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frecpe) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrecpeq_f16(a: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v8f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecpe.v8f16" + )] + fn _vrecpeq_f16(a: float16x8_t) -> float16x8_t; + } + unsafe { _vrecpeq_f16(a) } +} +#[doc = "Reciprocal estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpe_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frecpe) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrecpe_f32(a: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecpe.v2f32" + )] + fn _vrecpe_f32(a: float32x2_t) -> float32x2_t; + } + unsafe { _vrecpe_f32(a) } +} +#[doc = "Reciprocal estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpeq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frecpe) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrecpeq_f32(a: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v4f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecpe.v4f32" + )] + fn _vrecpeq_f32(a: float32x4_t) -> float32x4_t; + } + unsafe { _vrecpeq_f32(a) } +} +#[doc = "Unsigned reciprocal estimate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpe_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))] +#[cfg_attr( + all(test, 
any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urecpe) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrecpe_u32(a: uint32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urecpe.v2i32" + )] + fn _vrecpe_u32(a: uint32x2_t) -> uint32x2_t; + } + unsafe { _vrecpe_u32(a) } +} +#[doc = "Unsigned reciprocal estimate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpeq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urecpe) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrecpeq_u32(a: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urecpe.v4i32" + )] + fn _vrecpeq_u32(a: uint32x4_t) -> uint32x4_t; + } + unsafe { _vrecpeq_u32(a) } +} +#[doc = "Floating-point reciprocal step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecps_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecps))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frecps) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrecps_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecps.v4f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecps.v4f16" + )] + fn _vrecps_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vrecps_f16(a, b) } +} +#[doc = "Floating-point reciprocal step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpsq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecps))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frecps) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrecpsq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecps.v8f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecps.v8f16" + )] + fn _vrecpsq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vrecpsq_f16(a, b) } +} +#[doc = "Floating-point reciprocal step"] +#[doc 
= "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecps_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecps))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frecps) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrecps_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecps.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecps.v2f32" + )] + fn _vrecps_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vrecps_f32(a, b) } +} +#[doc = "Floating-point reciprocal step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpsq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecps))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frecps) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrecpsq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecps.v4f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frecps.v4f32" + )] + fn _vrecpsq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vrecpsq_f32(a, b) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f32_f16(a: float16x4_t) -> float32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f32_f16(a: float16x4_t) -> float32x2_t { + let a: float16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_s8_f16(a: float16x4_t) -> int8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_s8_f16(a: float16x4_t) -> int8x8_t { + let a: float16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_s16_f16(a: float16x4_t) -> int16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_s16_f16(a: float16x4_t) -> int16x4_t { + let a: float16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_s32_f16(a: float16x4_t) -> int32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_s32_f16(a: float16x4_t) -> int32x2_t { + let a: float16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_s64_f16(a: float16x4_t) -> int64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_s64_f16(a: float16x4_t) -> int64x1_t { + let a: float16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_u8_f16(a: float16x4_t) -> uint8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_u8_f16(a: float16x4_t) -> uint8x8_t { + let a: float16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = 
"[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_u16_f16(a: float16x4_t) -> uint16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_u16_f16(a: float16x4_t) -> uint16x4_t { + let a: float16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_u32_f16(a: float16x4_t) -> uint32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_u32_f16(a: float16x4_t) -> uint32x2_t { + let a: float16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_u64_f16(a: float16x4_t) -> uint64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_u64_f16(a: float16x4_t) -> uint64x1_t { + let a: float16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_p8_f16(a: float16x4_t) -> poly8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_p8_f16(a: float16x4_t) -> poly8x8_t { + let a: float16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_p16_f16(a: float16x4_t) -> poly16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_p16_f16(a: float16x4_t) -> poly16x4_t { + let a: float16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] 
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f32_f16(a: float16x8_t) -> float32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f32_f16(a: float16x8_t) -> float32x4_t { + let a: float16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_s8_f16(a: float16x8_t) -> int8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_s8_f16(a: float16x8_t) -> int8x16_t { + let a: float16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_s16_f16(a: float16x8_t) -> int16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector 
reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_s16_f16(a: float16x8_t) -> int16x8_t { + let a: float16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_s32_f16(a: float16x8_t) -> int32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_s32_f16(a: float16x8_t) -> int32x4_t { + let a: float16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_s64_f16(a: float16x8_t) -> int64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_s64_f16(a: float16x8_t) -> int64x2_t { + let a: float16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 
2, 1, 0]) }; + unsafe { + let ret_val: int64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_u8_f16(a: float16x8_t) -> uint8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_u8_f16(a: float16x8_t) -> uint8x16_t { + let a: float16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_u16_f16(a: float16x8_t) -> uint16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_u16_f16(a: float16x8_t) -> uint16x8_t { + let a: float16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] 
+#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_u32_f16(a: float16x8_t) -> uint32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_u32_f16(a: float16x8_t) -> uint32x4_t { + let a: float16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_u64_f16(a: float16x8_t) -> uint64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_u64_f16(a: float16x8_t) -> uint64x2_t { + let a: float16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_p8_f16(a: float16x8_t) -> poly8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = 
"stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_p8_f16(a: float16x8_t) -> poly8x16_t { + let a: float16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_p16_f16(a: float16x8_t) -> poly16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_p16_f16(a: float16x8_t) -> poly16x8_t { + let a: float16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_f32(a: float32x2_t) -> float16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_f32(a: float32x2_t) -> float16x4_t { + let a: float32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), 
assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_f32(a: float32x4_t) -> float16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_f32(a: float32x4_t) -> float16x8_t { + let a: float32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: float16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_s8(a: int8x8_t) -> float16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_s8(a: int8x8_t) -> float16x4_t { + let a: int8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_s8(a: int8x16_t) -> float16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + 
all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_s8(a: int8x16_t) -> float16x8_t { + let a: int8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_s16(a: int16x4_t) -> float16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_s16(a: int16x4_t) -> float16x4_t { + let a: int16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: float16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_s16(a: int16x8_t) -> float16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_s16(a: int16x8_t) -> float16x8_t { + let a: int16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_s32)"] +#[inline] 
+#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_s32(a: int32x2_t) -> float16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_s32(a: int32x2_t) -> float16x4_t { + let a: int32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_s32(a: int32x4_t) -> float16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_s32(a: int32x4_t) -> float16x8_t { + let a: int32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: float16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_s64(a: int64x1_t) -> float16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = 
"arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_s64(a: int64x1_t) -> float16x4_t { + unsafe { + let ret_val: float16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_s64(a: int64x2_t) -> float16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_s64(a: int64x2_t) -> float16x8_t { + let a: int64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_u8(a: uint8x8_t) -> float16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_u8(a: uint8x8_t) -> float16x4_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_u8)"] +#[inline] +#[cfg(target_endian = 
"little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_u8(a: uint8x16_t) -> float16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_u8(a: uint8x16_t) -> float16x8_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_u16(a: uint16x4_t) -> float16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_u16(a: uint16x4_t) -> float16x4_t { + let a: uint16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: float16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_u16(a: uint16x8_t) -> float16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_u16)"] +#[inline] +#[cfg(target_endian = 
"big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_u16(a: uint16x8_t) -> float16x8_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_u32(a: uint32x2_t) -> float16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_u32(a: uint32x2_t) -> float16x4_t { + let a: uint32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_u32(a: uint32x4_t) -> float16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_u32(a: uint32x4_t) -> float16x8_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: float16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_u64(a: uint64x1_t) -> float16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_u64(a: uint64x1_t) -> float16x4_t { + unsafe { + let ret_val: float16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_u64(a: uint64x2_t) -> float16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_u64(a: uint64x2_t) -> float16x8_t { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_p8(a: poly8x8_t) -> float16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_p8)"] +#[inline] 
+#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_p8(a: poly8x8_t) -> float16x4_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_p8(a: poly8x16_t) -> float16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_p8(a: poly8x16_t) -> float16x8_t { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_p16(a: poly16x4_t) -> float16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_p16(a: poly16x4_t) -> float16x4_t { + let a: poly16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: float16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret 
cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_p16(a: poly16x8_t) -> float16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_p16(a: poly16x8_t) -> float16x8_t { + let a: poly16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_p128)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_p128(a: p128) -> float16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_p128)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_p128(a: p128) -> float16x8_t { + unsafe { + let ret_val: float16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_p64_f16(a: float16x4_t) -> poly64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_p64_f16(a: float16x4_t) -> poly64x1_t { + let a: float16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_p128_f16(a: float16x8_t) -> p128 { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_p128_f16(a: float16x8_t) -> p128 { + let a: float16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_f16)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_p64_f16(a: float16x8_t) -> poly64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_f16)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_p64_f16(a: float16x8_t) -> poly64x2_t { + let a: float16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_p64(a: poly64x1_t) -> float16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f16_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpret_f16_p64(a: poly64x1_t) -> float16x4_t { + unsafe { + let ret_val: float16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_p64(a: poly64x2_t) -> float16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f16_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vreinterpretq_f16_p64(a: poly64x2_t) -> float16x8_t { + let a: poly64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_p128)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_p128(a: p128) -> float32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast 
operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_p128)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_p128(a: p128) -> float32x4_t { + unsafe { + let ret_val: float32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_f32(a: float32x2_t) -> int8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_f32(a: float32x2_t) -> int8x8_t { + let a: float32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_f32(a: float32x2_t) -> int16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_f32)"] +#[inline] +#[cfg(target_endian = "big")] 
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_f32(a: float32x2_t) -> int16x4_t { + let a: float32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_f32(a: float32x2_t) -> int32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_f32(a: float32x2_t) -> int32x2_t { + let a: float32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_f32(a: float32x2_t) -> int64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), 
assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_f32(a: float32x2_t) -> int64x1_t { + let a: float32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_f32(a: float32x2_t) -> uint8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_f32(a: float32x2_t) -> uint8x8_t { + let a: float32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_f32(a: float32x2_t) -> uint16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", 
since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_f32(a: float32x2_t) -> uint16x4_t { + let a: float32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_f32(a: float32x2_t) -> uint32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_f32(a: float32x2_t) -> uint32x2_t { + let a: float32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_f32(a: float32x2_t) -> uint64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub 
fn vreinterpret_u64_f32(a: float32x2_t) -> uint64x1_t { + let a: float32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_f32(a: float32x2_t) -> poly8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_f32(a: float32x2_t) -> poly8x8_t { + let a: float32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_f32(a: float32x2_t) -> poly16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_f32(a: float32x2_t) -> poly16x4_t { + let a: float32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, 
[3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_f32(a: float32x4_t) -> p128 { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_f32(a: float32x4_t) -> p128 { + let a: float32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_f32(a: float32x4_t) -> int8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_f32(a: float32x4_t) -> int8x16_t { + let a: float32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_f32(a: float32x4_t) -> int16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_f32(a: float32x4_t) -> int16x8_t { + let a: float32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_f32(a: float32x4_t) -> int32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_f32(a: float32x4_t) -> int32x4_t { + let a: float32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_f32(a: float32x4_t) -> int64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_f32(a: float32x4_t) -> int64x2_t { + let a: float32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_f32(a: float32x4_t) -> uint8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_f32(a: float32x4_t) -> uint8x16_t { + let a: float32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_f32(a: float32x4_t) -> uint16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_f32(a: float32x4_t) -> uint16x8_t { + let a: float32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_f32(a: float32x4_t) -> uint32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_f32(a: float32x4_t) -> uint32x4_t { + let a: float32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_f32(a: float32x4_t) -> uint64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_f32(a: float32x4_t) -> uint64x2_t { + let a: float32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_f32(a: float32x4_t) -> poly8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_f32(a: float32x4_t) -> poly8x16_t { + let a: float32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_f32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_f32(a: float32x4_t) -> poly16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_f32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_f32(a: float32x4_t) -> poly16x8_t { + let a: float32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_s8(a: int8x8_t) -> float32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_s8(a: int8x8_t) -> float32x2_t { + let a: int8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_s8)"] 
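+// On little-endian targets a reinterpret is a plain `transmute`: the bits are
+// left untouched and only the vector type changes. For example, on a
+// little-endian target `vreinterpret_s16_s8` maps the `int8x8_t` lanes
+// `[1, 0, 2, 0, 3, 0, 4, 0]` to the `int16x4_t` lanes `[1, 2, 3, 4]`.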
+#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_s8(a: int8x8_t) -> int16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_s8(a: int8x8_t) -> int16x4_t { + let a: int8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_s8(a: int8x8_t) -> int32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_s8(a: int8x8_t) -> int32x2_t { + let a: int8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_s8(a: int8x8_t) -> int64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_s8(a: int8x8_t) -> int64x1_t { + let a: int8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_s8(a: int8x8_t) -> uint8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_s8(a: int8x8_t) -> uint8x8_t { + let a: int8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) 
+)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_s8(a: int8x8_t) -> uint16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_s8(a: int8x8_t) -> uint16x4_t { + let a: int8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_s8(a: int8x8_t) -> uint32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_s8(a: int8x8_t) -> uint32x2_t { + let a: int8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + 
target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_s8(a: int8x8_t) -> uint64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_s8(a: int8x8_t) -> uint64x1_t { + let a: int8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_s8(a: int8x8_t) -> poly8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_s8(a: int8x8_t) -> poly8x8_t { + let a: int8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_s8(a: int8x8_t) -> poly16x4_t { + unsafe { transmute(a) } +} +#[doc = 
"Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_s8(a: int8x8_t) -> poly16x4_t { + let a: int8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_s8(a: int8x16_t) -> float32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_s8(a: int8x16_t) -> float32x4_t { + let a: int8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_s8(a: int8x16_t) -> int16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_s8(a: int8x16_t) -> int16x8_t { + let a: int8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_s8(a: int8x16_t) -> int32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_s8(a: int8x16_t) -> int32x4_t { + let a: int8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_s8(a: int8x16_t) -> int64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_s8(a: int8x16_t) -> int64x2_t { + let a: int8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_s8(a: int8x16_t) -> uint8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_s8(a: int8x16_t) -> uint8x16_t { + let a: int8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_s8(a: int8x16_t) -> uint16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_s8(a: int8x16_t) -> uint16x8_t { + let a: int8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_s8(a: int8x16_t) -> uint32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_s8(a: int8x16_t) -> uint32x4_t { + let a: int8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_s8(a: int8x16_t) -> uint64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_s8(a: int8x16_t) -> uint64x2_t { + let a: int8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_s8(a: int8x16_t) -> poly8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_s8(a: int8x16_t) -> poly8x16_t { + let a: int8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_s8(a: int8x16_t) -> poly16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_s8(a: int8x16_t) -> poly16x8_t { + let a: int8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_s16(a: int16x4_t) -> float32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_s16(a: int16x4_t) -> float32x2_t { + let a: int16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_s16(a: int16x4_t) -> int8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_s16(a: int16x4_t) -> int8x8_t { + let a: int16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_s16(a: int16x4_t) -> int32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_s16(a: int16x4_t) -> int32x2_t { + let a: int16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_s16(a: int16x4_t) -> int64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_s16)"] +#[inline] 
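+// Big-endian variant: the input lanes are first reversed into little-endian order with
+// `simd_shuffle!` before the `transmute`; a single-lane result such as `int64x1_t` needs
+// no output shuffle.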
+#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_s16(a: int16x4_t) -> int64x1_t { + let a: int16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_s16(a: int16x4_t) -> uint8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_s16(a: int16x4_t) -> uint8x8_t { + let a: int16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_s16(a: int16x4_t) -> uint16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] 
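+// Big-endian variant: results with more than one lane are reversed again after the
+// `transmute`, keeping lane numbering consistent with the little-endian intrinsic.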
+#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_s16(a: int16x4_t) -> uint16x4_t { + let a: int16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_s16(a: int16x4_t) -> uint32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_s16(a: int16x4_t) -> uint32x2_t { + let a: int16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_s16(a: int16x4_t) -> uint64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = 
"arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_s16(a: int16x4_t) -> uint64x1_t { + let a: int16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_s16(a: int16x4_t) -> poly8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_s16(a: int16x4_t) -> poly8x8_t { + let a: int16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_s16(a: int16x4_t) -> poly16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn 
vreinterpret_p16_s16(a: int16x4_t) -> poly16x4_t { + let a: int16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_s16(a: int16x8_t) -> float32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_s16(a: int16x8_t) -> float32x4_t { + let a: int16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_s16(a: int16x8_t) -> int8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_s16(a: int16x8_t) -> int8x16_t { + let a: int16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 
3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_s16(a: int16x8_t) -> int32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_s16(a: int16x8_t) -> int32x4_t { + let a: int16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_s16(a: int16x8_t) -> int64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_s16(a: int16x8_t) -> int64x2_t { + let a: int16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int64x2_t = 
transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_s16(a: int16x8_t) -> uint8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_s16(a: int16x8_t) -> uint8x16_t { + let a: int16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_s16(a: int16x8_t) -> uint16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_s16(a: int16x8_t) -> uint16x8_t { + let a: int16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 
4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_s16(a: int16x8_t) -> uint32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_s16(a: int16x8_t) -> uint32x4_t { + let a: int16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_s16(a: int16x8_t) -> uint64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_s16(a: int16x8_t) -> uint64x2_t { + let a: int16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_s16(a: int16x8_t) -> poly8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_s16(a: int16x8_t) -> poly8x16_t { + let a: int16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_s16(a: int16x8_t) -> poly16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_s16(a: int16x8_t) -> poly16x8_t { + let a: int16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_s32(a: int32x2_t) -> float32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_s32(a: int32x2_t) -> float32x2_t { + let a: int32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_s32(a: int32x2_t) -> int8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_s32(a: int32x2_t) -> int8x8_t { + let a: int32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_s32)"] +#[inline] +#[cfg(target_endian = 
"little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_s32(a: int32x2_t) -> int16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_s32(a: int32x2_t) -> int16x4_t { + let a: int32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_s32(a: int32x2_t) -> int64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_s32(a: int32x2_t) -> int64x1_t { + let a: int32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", 
target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_s32(a: int32x2_t) -> uint8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_s32(a: int32x2_t) -> uint8x8_t { + let a: int32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_s32(a: int32x2_t) -> uint16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_s32(a: int32x2_t) -> uint16x4_t { + let a: int32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = 
"1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_s32(a: int32x2_t) -> uint32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_s32(a: int32x2_t) -> uint32x2_t { + let a: int32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_s32(a: int32x2_t) -> uint64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_s32(a: int32x2_t) -> uint64x1_t { + let a: int32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_s32(a: int32x2_t) -> poly8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector 
reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_s32(a: int32x2_t) -> poly8x8_t { + let a: int32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_s32(a: int32x2_t) -> poly16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_s32(a: int32x2_t) -> poly16x4_t { + let a: int32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_s32(a: int32x4_t) -> float32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_s32(a: int32x4_t) -> float32x4_t { + let a: int32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_s32(a: int32x4_t) -> int8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_s32(a: int32x4_t) -> int8x16_t { + let a: int32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_s32(a: int32x4_t) -> int16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_s32(a: int32x4_t) -> int16x8_t { + let a: int32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_s32(a: int32x4_t) -> int64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_s32(a: int32x4_t) -> int64x2_t { + let a: int32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_s32(a: int32x4_t) -> uint8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_s32)"] +#[inline] 
+#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_s32(a: int32x4_t) -> uint8x16_t { + let a: int32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_s32(a: int32x4_t) -> uint16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_s32(a: int32x4_t) -> uint16x8_t { + let a: int32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_s32(a: int32x4_t) -> uint32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = 
"neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_s32(a: int32x4_t) -> uint32x4_t { + let a: int32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_s32(a: int32x4_t) -> uint64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_s32(a: int32x4_t) -> uint64x2_t { + let a: int32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_s32(a: int32x4_t) -> poly8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), 
assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_s32(a: int32x4_t) -> poly8x16_t { + let a: int32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_s32(a: int32x4_t) -> poly16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_s32(a: int32x4_t) -> poly16x8_t { + let a: int32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_s64(a: int64x1_t) -> float32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = 
"aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_s64(a: int64x1_t) -> float32x2_t { + unsafe { + let ret_val: float32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_s64(a: int64x1_t) -> int8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_s64(a: int64x1_t) -> int8x8_t { + unsafe { + let ret_val: int8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_s64(a: int64x1_t) -> int16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") 
+)] +pub fn vreinterpret_s16_s64(a: int64x1_t) -> int16x4_t { + unsafe { + let ret_val: int16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_s64(a: int64x1_t) -> int32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_s64(a: int64x1_t) -> int32x2_t { + unsafe { + let ret_val: int32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_s64(a: int64x1_t) -> uint8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_s64(a: int64x1_t) -> uint8x8_t { + unsafe { + let ret_val: uint8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_s64(a: int64x1_t) -> uint16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_s64(a: int64x1_t) -> uint16x4_t { + unsafe { + let ret_val: uint16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_s64(a: int64x1_t) -> uint32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_s64(a: int64x1_t) -> uint32x2_t { + unsafe { + let ret_val: uint32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), 
assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_s64(a: int64x1_t) -> uint64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_s64(a: int64x1_t) -> poly8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_s64(a: int64x1_t) -> poly8x8_t { + unsafe { + let ret_val: poly8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_s64(a: int64x1_t) -> poly16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub 
fn vreinterpret_p16_s64(a: int64x1_t) -> poly16x4_t { + unsafe { + let ret_val: poly16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_s64(a: int64x2_t) -> float32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_s64(a: int64x2_t) -> float32x4_t { + let a: int64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_s64(a: int64x2_t) -> int8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_s64(a: int64x2_t) -> int8x16_t { + let a: int64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int8x16_t = transmute(a); + simd_shuffle!( + ret_val, + 
ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_s64(a: int64x2_t) -> int16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_s64(a: int64x2_t) -> int16x8_t { + let a: int64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_s64(a: int64x2_t) -> int32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_s64(a: int64x2_t) -> int32x4_t { + let a: int64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc 
= "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_s64(a: int64x2_t) -> uint8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_s64(a: int64x2_t) -> uint8x16_t { + let a: int64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_s64(a: int64x2_t) -> uint16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_s64(a: int64x2_t) -> uint16x8_t { + let a: int64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_s64(a: int64x2_t) -> uint32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_s64(a: int64x2_t) -> uint32x4_t { + let a: int64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_s64(a: int64x2_t) -> uint64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_s64(a: int64x2_t) -> uint64x2_t { + let a: int64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_s64)"] +#[inline] +#[cfg(target_endian = 
"little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_s64(a: int64x2_t) -> poly8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_s64(a: int64x2_t) -> poly8x16_t { + let a: int64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_s64(a: int64x2_t) -> poly16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_s64(a: int64x2_t) -> poly16x8_t { + let a: int64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = 
"arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_u8(a: uint8x8_t) -> float32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_u8(a: uint8x8_t) -> float32x2_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_u8(a: uint8x8_t) -> int8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_u8(a: uint8x8_t) -> int8x8_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + 
all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_u8(a: uint8x8_t) -> int16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_u8(a: uint8x8_t) -> int16x4_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_u8(a: uint8x8_t) -> int32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_u8(a: uint8x8_t) -> int32x2_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = 
"arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_u8(a: uint8x8_t) -> int64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_u8(a: uint8x8_t) -> int64x1_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_u8(a: uint8x8_t) -> uint16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_u8(a: uint8x8_t) -> uint16x4_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn 
vreinterpret_u32_u8(a: uint8x8_t) -> uint32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_u8(a: uint8x8_t) -> uint32x2_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_u8(a: uint8x8_t) -> uint64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_u8(a: uint8x8_t) -> uint64x1_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_u8(a: uint8x8_t) -> poly8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_u8(a: uint8x8_t) -> poly8x8_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_u8(a: uint8x8_t) -> poly16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_u8(a: uint8x8_t) -> poly16x4_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_u8(a: uint8x16_t) -> float32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_u8(a: uint8x16_t) -> float32x4_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_u8(a: uint8x16_t) -> int8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_u8(a: uint8x16_t) -> int8x16_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_u8(a: uint8x16_t) -> int16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = 
"[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_u8(a: uint8x16_t) -> int16x8_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_u8(a: uint8x16_t) -> int32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_u8(a: uint8x16_t) -> int32x4_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_u8(a: uint8x16_t) -> int64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_u8(a: uint8x16_t) -> int64x2_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_u8(a: uint8x16_t) -> uint16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_u8(a: uint8x16_t) -> uint16x8_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_u8(a: uint8x16_t) -> uint32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_u8(a: uint8x16_t) -> uint32x4_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_u8(a: uint8x16_t) -> uint64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_u8(a: uint8x16_t) -> uint64x2_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_u8(a: uint8x16_t) -> poly8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_u8(a: uint8x16_t) -> poly8x16_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_u8(a: uint8x16_t) -> poly16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_u8(a: uint8x16_t) -> poly16x8_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_u16(a: uint16x4_t) -> float32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast 
operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_u16(a: uint16x4_t) -> float32x2_t { + let a: uint16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_u16(a: uint16x4_t) -> int8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_u16(a: uint16x4_t) -> int8x8_t { + let a: uint16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_u16(a: uint16x4_t) -> int16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_u16(a: uint16x4_t) -> int16x4_t { + let a: uint16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_u16(a: uint16x4_t) -> int32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_u16(a: uint16x4_t) -> int32x2_t { + let a: uint16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_u16(a: uint16x4_t) -> int64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_u16)"] +#[inline] +#[cfg(target_endian 
= "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_u16(a: uint16x4_t) -> int64x1_t { + let a: uint16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_u16(a: uint16x4_t) -> uint8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_u16(a: uint16x4_t) -> uint8x8_t { + let a: uint16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_u16(a: uint16x4_t) -> uint32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, 
any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_u16(a: uint16x4_t) -> uint32x2_t { + let a: uint16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_u16(a: uint16x4_t) -> uint64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_u16(a: uint16x4_t) -> uint64x1_t { + let a: uint16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_u16(a: uint16x4_t) -> poly8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = 
"arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_u16(a: uint16x4_t) -> poly8x8_t { + let a: uint16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_u16(a: uint16x4_t) -> poly16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_u16(a: uint16x4_t) -> poly16x4_t { + let a: uint16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_u16(a: uint16x8_t) -> float32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_u16(a: 
uint16x8_t) -> float32x4_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_u16(a: uint16x8_t) -> int8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_u16(a: uint16x8_t) -> int8x16_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_u16(a: uint16x8_t) -> int16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_u16(a: uint16x8_t) -> int16x8_t { + let a: uint16x8_t = 
unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_u16(a: uint16x8_t) -> int32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_u16(a: uint16x8_t) -> int32x4_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_u16(a: uint16x8_t) -> int64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_u16(a: uint16x8_t) -> int64x2_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: 
int64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_u16(a: uint16x8_t) -> uint8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_u16(a: uint16x8_t) -> uint8x16_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_u16(a: uint16x8_t) -> uint32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_u16(a: uint16x8_t) -> uint32x4_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x4_t = transmute(a); + simd_shuffle!(ret_val, 
ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_u16(a: uint16x8_t) -> uint64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_u16(a: uint16x8_t) -> uint64x2_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_u16(a: uint16x8_t) -> poly8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_u16(a: uint16x8_t) -> poly8x16_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector 
reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_u16(a: uint16x8_t) -> poly16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_u16(a: uint16x8_t) -> poly16x8_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_u32(a: uint32x2_t) -> float32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_u32(a: uint32x2_t) -> float32x2_t { + let a: uint32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_u32(a: uint32x2_t) -> int8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_u32(a: uint32x2_t) -> int8x8_t { + let a: uint32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_u32(a: uint32x2_t) -> int16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_u32(a: uint32x2_t) -> int16x4_t { + let a: uint32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_u32)"] +#[inline] +#[cfg(target_endian 
= "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_u32(a: uint32x2_t) -> int32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_u32(a: uint32x2_t) -> int32x2_t { + let a: uint32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_u32(a: uint32x2_t) -> int64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_u32(a: uint32x2_t) -> int64x1_t { + let a: uint32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = 
"aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_u32(a: uint32x2_t) -> uint8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_u32(a: uint32x2_t) -> uint8x8_t { + let a: uint32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_u32(a: uint32x2_t) -> uint16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_u32(a: uint32x2_t) -> uint16x4_t { + let a: uint32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = 
"neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_u32(a: uint32x2_t) -> uint64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_u32(a: uint32x2_t) -> uint64x1_t { + let a: uint32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_u32(a: uint32x2_t) -> poly8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_u32(a: uint32x2_t) -> poly8x8_t { + let a: uint32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_u32(a: uint32x2_t) -> poly16x4_t { + 
unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_u32(a: uint32x2_t) -> poly16x4_t { + let a: uint32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_u32(a: uint32x4_t) -> float32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_u32(a: uint32x4_t) -> float32x4_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_u32(a: uint32x4_t) -> int8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_u32(a: uint32x4_t) -> int8x16_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_u32(a: uint32x4_t) -> int16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_u32(a: uint32x4_t) -> int16x8_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_u32(a: uint32x4_t) -> int32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_u32(a: uint32x4_t) -> int32x4_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_u32(a: uint32x4_t) -> int64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_u32(a: uint32x4_t) -> int64x2_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_u32(a: uint32x4_t) -> uint8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_u32)"] +#[inline] 
+#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_u32(a: uint32x4_t) -> uint8x16_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_u32(a: uint32x4_t) -> uint16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_u32(a: uint32x4_t) -> uint16x8_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_u32(a: uint32x4_t) -> uint64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable 
= "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_u32(a: uint32x4_t) -> uint64x2_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_u32(a: uint32x4_t) -> poly8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_u32(a: uint32x4_t) -> poly8x16_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_u32(a: uint32x4_t) -> poly16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_u32(a: uint32x4_t) -> poly16x8_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_u64(a: uint64x1_t) -> float32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_u64(a: uint64x1_t) -> float32x2_t { + unsafe { + let ret_val: float32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_u64(a: uint64x1_t) -> int8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + 
not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_u64(a: uint64x1_t) -> int8x8_t { + unsafe { + let ret_val: int8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_u64(a: uint64x1_t) -> int16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_u64(a: uint64x1_t) -> int16x4_t { + unsafe { + let ret_val: int16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_u64(a: uint64x1_t) -> int32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_u64(a: uint64x1_t) -> int32x2_t { + 
unsafe { + let ret_val: int32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_u64(a: uint64x1_t) -> int64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_u64(a: uint64x1_t) -> uint8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_u64(a: uint64x1_t) -> uint8x8_t { + unsafe { + let ret_val: uint8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_u64(a: uint64x1_t) -> uint16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_u64(a: uint64x1_t) -> uint16x4_t { + unsafe { + let ret_val: uint16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_u64(a: uint64x1_t) -> uint32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_u64(a: uint64x1_t) -> uint32x2_t { + unsafe { + let ret_val: uint32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_u64(a: uint64x1_t) -> poly8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = 
"arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_u64(a: uint64x1_t) -> poly8x8_t { + unsafe { + let ret_val: poly8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_u64(a: uint64x1_t) -> poly16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_u64(a: uint64x1_t) -> poly16x4_t { + unsafe { + let ret_val: poly16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_u64(a: uint64x2_t) -> float32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_u64(a: uint64x2_t) -> float32x4_t { + let a: 
uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: float32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_u64(a: uint64x2_t) -> int8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_u64(a: uint64x2_t) -> int8x16_t { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_u64(a: uint64x2_t) -> int16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_u64(a: uint64x2_t) -> int16x8_t { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: 
int16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_u64(a: uint64x2_t) -> int32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_u64(a: uint64x2_t) -> int32x4_t { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_u64(a: uint64x2_t) -> int64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_u64(a: uint64x2_t) -> int64x2_t { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] 
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_u64(a: uint64x2_t) -> uint8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_u64(a: uint64x2_t) -> uint8x16_t { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_u64(a: uint64x2_t) -> uint16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_u64(a: uint64x2_t) -> uint16x8_t { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_u64(a: uint64x2_t) -> uint32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_u64(a: uint64x2_t) -> uint32x4_t { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_u64(a: uint64x2_t) -> poly8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_u64(a: uint64x2_t) -> poly8x16_t { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_u64(a: uint64x2_t) -> poly16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_u64(a: uint64x2_t) -> poly16x8_t { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_p8(a: poly8x8_t) -> float32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_p8(a: poly8x8_t) -> float32x2_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_p8)"] 
+#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_p8(a: poly8x8_t) -> int8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_p8(a: poly8x8_t) -> int8x8_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_p8(a: poly8x8_t) -> int16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_p8(a: poly8x8_t) -> int16x4_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_p8(a: poly8x8_t) -> int32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_p8(a: poly8x8_t) -> int32x2_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_p8(a: poly8x8_t) -> int64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_p8(a: poly8x8_t) -> int64x1_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] 
+#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_p8(a: poly8x8_t) -> uint8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_p8(a: poly8x8_t) -> uint8x8_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_p8(a: poly8x8_t) -> uint16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_p8(a: poly8x8_t) -> uint16x4_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] 
+#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_p8(a: poly8x8_t) -> uint32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_p8(a: poly8x8_t) -> uint32x2_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_p8(a: poly8x8_t) -> uint64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_p8(a: poly8x8_t) -> uint64x1_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_p8(a: poly8x8_t) -> poly16x4_t { + unsafe { transmute(a) } +} 
+#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_p8(a: poly8x8_t) -> poly16x4_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_p8(a: poly8x16_t) -> float32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_p8(a: poly8x16_t) -> float32x4_t { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_p8(a: poly8x16_t) -> int8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_p8(a: poly8x16_t) -> int8x16_t { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_p8(a: poly8x16_t) -> int16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_p8(a: poly8x16_t) -> int16x8_t { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_p8(a: poly8x16_t) -> int32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] 
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_p8(a: poly8x16_t) -> int32x4_t { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_p8(a: poly8x16_t) -> int64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_p8(a: poly8x16_t) -> int64x2_t { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_p8(a: poly8x16_t) -> uint8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_p8(a: poly8x16_t) -> uint8x16_t { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_p8(a: poly8x16_t) -> uint16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_p8(a: poly8x16_t) -> uint16x8_t { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_p8(a: poly8x16_t) -> uint32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast 
operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_p8(a: poly8x16_t) -> uint32x4_t { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_p8(a: poly8x16_t) -> uint64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_p8(a: poly8x16_t) -> uint64x2_t { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_p8(a: poly8x16_t) -> poly16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_p8(a: poly8x16_t) -> poly16x8_t { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_p16(a: poly16x4_t) -> float32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_f32_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_f32_p16(a: poly16x4_t) -> float32x2_t { + let a: poly16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_p16(a: poly16x4_t) -> int8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_p16(a: poly16x4_t) -> int8x8_t { + let a: poly16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_p16(a: poly16x4_t) -> int16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_p16(a: poly16x4_t) -> int16x4_t { + let a: poly16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_p16(a: poly16x4_t) -> int32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_p16)"] +#[inline] 
+#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_p16(a: poly16x4_t) -> int32x2_t { + let a: poly16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: int32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_p16(a: poly16x4_t) -> int64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s64_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s64_p16(a: poly16x4_t) -> int64x1_t { + let a: poly16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_p16(a: poly16x4_t) -> uint8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + 
all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_p16(a: poly16x4_t) -> uint8x8_t { + let a: poly16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_p16(a: poly16x4_t) -> uint16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_p16(a: poly16x4_t) -> uint16x4_t { + let a: poly16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_p16(a: poly16x4_t) -> uint32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + 
not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_p16(a: poly16x4_t) -> uint32x2_t { + let a: poly16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_p16(a: poly16x4_t) -> uint64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u64_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u64_p16(a: poly16x4_t) -> uint64x1_t { + let a: poly16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_p16(a: poly16x4_t) -> poly8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn 
vreinterpret_p8_p16(a: poly16x4_t) -> poly8x8_t { + let a: poly16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_p16(a: poly16x8_t) -> float32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_f32_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_f32_p16(a: poly16x8_t) -> float32x4_t { + let a: poly16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: float32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_p16(a: poly16x8_t) -> int8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_p16(a: poly16x8_t) -> int8x16_t { + let a: poly16x8_t = unsafe { simd_shuffle!(a, 
a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_p16(a: poly16x8_t) -> int16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_p16(a: poly16x8_t) -> int16x8_t { + let a: poly16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_p16(a: poly16x8_t) -> int32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_p16(a: poly16x8_t) -> int32x4_t { + let a: poly16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + 
unsafe { + let ret_val: int32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_p16(a: poly16x8_t) -> int64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_p16(a: poly16x8_t) -> int64x2_t { + let a: poly16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_p16(a: poly16x8_t) -> uint8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_p16(a: poly16x8_t) -> uint8x16_t { + let a: poly16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 
11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_p16(a: poly16x8_t) -> uint16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_p16(a: poly16x8_t) -> uint16x8_t { + let a: poly16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_p16(a: poly16x8_t) -> uint32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_p16(a: poly16x8_t) -> uint32x4_t { + let a: poly16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast 
operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_p16(a: poly16x8_t) -> uint64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_p16(a: poly16x8_t) -> uint64x2_t { + let a: poly16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_p16(a: poly16x8_t) -> poly8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_p16(a: poly16x8_t) -> poly8x16_t { + let a: poly16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_p128)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_p128(a: p128) -> int8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_p128)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_p128(a: p128) -> int8x16_t { + unsafe { + let ret_val: int8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_p128)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_p128(a: p128) -> int16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_p128)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_p128(a: p128) -> int16x8_t { + unsafe { + let ret_val: int16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_p128)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_p128(a: p128) -> int32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_p128)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_p128(a: p128) -> int32x4_t { + unsafe { + let ret_val: int32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_p128)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_p128(a: p128) -> int64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s64_p128)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s64_p128(a: p128) -> int64x2_t { + unsafe { + let ret_val: int64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_p128)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = 
"arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_p128(a: p128) -> uint8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_p128)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_p128(a: p128) -> uint8x16_t { + unsafe { + let ret_val: uint8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_p128)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_p128(a: p128) -> uint16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_p128)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_p128(a: p128) -> uint16x8_t { + unsafe { + let ret_val: uint16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_p128)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn 
vreinterpretq_u32_p128(a: p128) -> uint32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_p128)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_p128(a: p128) -> uint32x4_t { + unsafe { + let ret_val: uint32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_p128)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_p128(a: p128) -> uint64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_p128)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u64_p128(a: p128) -> uint64x2_t { + unsafe { + let ret_val: uint64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_p128)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_p128(a: p128) -> poly8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_p128)"] +#[inline] 
+#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_p128(a: p128) -> poly8x16_t { + unsafe { + let ret_val: poly8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_p128)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_p128(a: p128) -> poly16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_p128)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_p128(a: p128) -> poly16x8_t { + unsafe { + let ret_val: poly16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_p128)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_p128(a: p128) -> poly64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_p128)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), 
assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_p128(a: p128) -> poly64x2_t { + unsafe { + let ret_val: poly64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p64_s8(a: int8x8_t) -> poly64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p64_s8(a: int8x8_t) -> poly64x1_t { + let a: int8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_s8(a: int8x16_t) -> p128 { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + 
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_s8(a: int8x16_t) -> p128 { + let a: int8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_s8(a: int8x16_t) -> poly64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_s8(a: int8x16_t) -> poly64x2_t { + let a: int8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p64_s16(a: int16x4_t) -> poly64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p64_s16(a: int16x4_t) -> 
poly64x1_t { + let a: int16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_s16(a: int16x8_t) -> p128 { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_s16(a: int16x8_t) -> p128 { + let a: int16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_s16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_s16(a: int16x8_t) -> poly64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_s16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_s16(a: int16x8_t) -> poly64x2_t { + let a: int16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p64_s32(a: int32x2_t) -> poly64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p64_s32(a: int32x2_t) -> poly64x1_t { + let a: int32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_s32(a: int32x4_t) -> p128 { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_s32(a: int32x4_t) -> p128 { + let a: int32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_s32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_s32(a: int32x4_t) -> poly64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_s32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_s32(a: int32x4_t) -> poly64x2_t { + let a: int32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_s64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_s64(a: int64x2_t) -> p128 { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_s64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_s64(a: int64x2_t) -> p128 { + let a: int64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = 
"neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p64_u8(a: uint8x8_t) -> poly64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p64_u8(a: uint8x8_t) -> poly64x1_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_u8(a: uint8x16_t) -> p128 { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_u8(a: uint8x16_t) -> p128 { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_u8(a: uint8x16_t) -> poly64x2_t { + unsafe { 
transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_u8(a: uint8x16_t) -> poly64x2_t { + let a: uint8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p64_u16(a: uint16x4_t) -> poly64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p64_u16(a: uint16x4_t) -> poly64x1_t { + let a: uint16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_u16(a: uint16x8_t) -> p128 { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_u16(a: uint16x8_t) -> p128 { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_u16(a: uint16x8_t) -> poly64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_u16(a: uint16x8_t) -> poly64x2_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p64_u32(a: uint32x2_t) -> poly64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_u32)"] +#[inline] +#[cfg(target_endian = "big")] 
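+// Big-endian pattern used by the `vreinterpret*` casts in this file: whenever a
+// vector operand has more than one lane, its lanes are reversed with
+// `simd_shuffle!` around the `transmute`, keeping the observable element order
+// consistent with the little-endian definitions.
+// Minimal usage sketch (illustrative only, not part of the generated bindings;
+// assumes the `neon` and `aes` target features and `vdup_n_u32` from this module):
+//     let x: uint32x2_t = vdup_n_u32(1);
+//     let p: poly64x1_t = vreinterpret_p64_u32(x);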
+#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p64_u32(a: uint32x2_t) -> poly64x1_t { + let a: uint32x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_u32(a: uint32x4_t) -> p128 { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_u32(a: uint32x4_t) -> p128 { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_u32(a: uint32x4_t) -> poly64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] 
+#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_u32(a: uint32x4_t) -> poly64x2_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { + let ret_val: poly64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_u64(a: uint64x2_t) -> p128 { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_u64(a: uint64x2_t) -> p128 { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p64_p8(a: poly8x8_t) -> poly64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") 
+)] +pub fn vreinterpret_p64_p8(a: poly8x8_t) -> poly64x1_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_p8(a: poly8x16_t) -> p128 { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_p8(a: poly8x16_t) -> p128 { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_p8(a: poly8x16_t) -> poly64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_p8(a: poly8x16_t) -> poly64x2_t { + let a: poly8x16_t = + unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly64x2_t = 
transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p64_p16(a: poly16x4_t) -> poly64x1_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p64_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p64_p16(a: poly16x4_t) -> poly64x1_t { + let a: poly16x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_p16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_p16(a: poly16x8_t) -> p128 { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_p16(a: poly16x8_t) -> p128 { + let a: poly16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_p16)"] +#[inline] 
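+// These reinterpret casts are plain bit casts: the `assert_instr(nop)` markers
+// in the test builds indicate that no particular instruction is expected, and
+// the split stability attributes keep the AArch64 versions `stable` while the
+// 32-bit Arm versions stay behind `stdarch_arm_neon_intrinsics`.
+// Hypothetical round trip for illustration (not from the generated file;
+// assumes `vdupq_n_p16` from this module):
+//     let h: poly16x8_t = vdupq_n_p16(0);
+//     let w: p128 = vreinterpretq_p128_p16(h);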
+#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_p16(a: poly16x8_t) -> poly64x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p64_p16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p64_p16(a: poly16x8_t) -> poly64x2_t { + let a: poly16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly64x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_p64(a: poly64x1_t) -> int8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s8_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s8_p64(a: poly64x1_t) -> int8x8_t { + unsafe { + let ret_val: int8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_p64(a: poly64x1_t) -> int16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s16_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s16_p64(a: poly64x1_t) -> int16x4_t { + unsafe { + let ret_val: int16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_p64(a: poly64x1_t) -> int32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_s32_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_s32_p64(a: poly64x1_t) -> int32x2_t { + unsafe { + let ret_val: int32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] 
+#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_p64(a: poly64x1_t) -> uint8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u8_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u8_p64(a: poly64x1_t) -> uint8x8_t { + unsafe { + let ret_val: uint8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_p64(a: poly64x1_t) -> uint16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u16_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u16_p64(a: poly64x1_t) -> uint16x4_t { + unsafe { + let ret_val: uint16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_p64(a: poly64x1_t) -> uint32x2_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast 
operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_u32_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_u32_p64(a: poly64x1_t) -> uint32x2_t { + unsafe { + let ret_val: uint32x2_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_p64(a: poly64x1_t) -> poly8x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p8_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p8_p64(a: poly64x1_t) -> poly8x8_t { + unsafe { + let ret_val: poly8x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_p64(a: poly64x1_t) -> poly16x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpret_p16_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = 
"arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpret_p16_p64(a: poly64x1_t) -> poly16x4_t { + unsafe { + let ret_val: poly16x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_p64(a: poly64x2_t) -> p128 { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p128_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p128_p64(a: poly64x2_t) -> p128 { + let a: poly64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_p64(a: poly64x2_t) -> int8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s8_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = 
"neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s8_p64(a: poly64x2_t) -> int8x16_t { + let a: poly64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_p64(a: poly64x2_t) -> int16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s16_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s16_p64(a: poly64x2_t) -> int16x8_t { + let a: poly64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_p64(a: poly64x2_t) -> int32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_s32_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + 
target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_s32_p64(a: poly64x2_t) -> int32x4_t { + let a: poly64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: int32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_p64(a: poly64x2_t) -> uint8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u8_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u8_p64(a: poly64x2_t) -> uint8x16_t { + let a: poly64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_p64(a: poly64x2_t) -> uint16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u16_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u16_p64(a: poly64x2_t) -> uint16x8_t { + let a: poly64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_p64(a: poly64x2_t) -> uint32x4_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u32_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_u32_p64(a: poly64x2_t) -> uint32x4_t { + let a: poly64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: uint32x4_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_p64(a: poly64x2_t) -> poly8x16_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p8_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p8_p64(a: poly64x2_t) -> 
poly8x16_t { + let a: poly64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly8x16_t = transmute(a); + simd_shuffle!( + ret_val, + ret_val, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + ) + } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_p64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_p64(a: poly64x2_t) -> poly16x8_t { + unsafe { transmute(a) } +} +#[doc = "Vector reinterpret cast operation"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_p16_p64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vreinterpretq_p16_p64(a: poly64x2_t) -> poly16x8_t { + let a: poly64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + unsafe { + let ret_val: poly16x8_t = transmute(a); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev16_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev16) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev16_p8(a: poly8x8_t) -> poly8x8_t { + unsafe { simd_shuffle!(a, a, [1, 0, 3, 2, 5, 4, 7, 6]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev16_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev16) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev16_s8(a: int8x8_t) -> int8x8_t { + unsafe { simd_shuffle!(a, a, [1, 0, 3, 2, 5, 4, 7, 6]) } +} +#[doc = "Reversing vector 
elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev16_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev16) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev16_u8(a: uint8x8_t) -> uint8x8_t { + unsafe { simd_shuffle!(a, a, [1, 0, 3, 2, 5, 4, 7, 6]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev16q_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev16) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev16q_p8(a: poly8x16_t) -> poly8x16_t { + unsafe { simd_shuffle!(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev16q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev16) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev16q_s8(a: int8x16_t) -> int8x16_t { + unsafe { simd_shuffle!(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev16q_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev16) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev16q_u8(a: uint8x16_t) -> uint8x16_t { + unsafe { simd_shuffle!(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev32_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))] 
+#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev32) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev32_p16(a: poly16x4_t) -> poly16x4_t { + unsafe { simd_shuffle!(a, a, [1, 0, 3, 2]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev32_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev32) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev32_p8(a: poly8x8_t) -> poly8x8_t { + unsafe { simd_shuffle!(a, a, [3, 2, 1, 0, 7, 6, 5, 4]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev32_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev32) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev32_s16(a: int16x4_t) -> int16x4_t { + unsafe { simd_shuffle!(a, a, [1, 0, 3, 2]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev32_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev32) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev32_s8(a: int8x8_t) -> int8x8_t { + unsafe { simd_shuffle!(a, a, [3, 2, 1, 0, 7, 6, 5, 4]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev32_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev32) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev32_u16(a: uint16x4_t) -> uint16x4_t { + unsafe { simd_shuffle!(a, a, [1, 0, 3, 2]) } +} +#[doc = "Reversing vector elements 
(swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev32_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev32) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev32_u8(a: uint8x8_t) -> uint8x8_t { + unsafe { simd_shuffle!(a, a, [3, 2, 1, 0, 7, 6, 5, 4]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev32q_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev32) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev32q_p16(a: poly16x8_t) -> poly16x8_t { + unsafe { simd_shuffle!(a, a, [1, 0, 3, 2, 5, 4, 7, 6]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev32q_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev32) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev32q_p8(a: poly8x16_t) -> poly8x16_t { + unsafe { simd_shuffle!(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev32q_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev32) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev32q_s16(a: int16x8_t) -> int16x8_t { + unsafe { simd_shuffle!(a, a, [1, 0, 3, 2, 5, 4, 7, 6]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev32q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = 
"arm64ec")), + assert_instr(rev32) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev32q_s8(a: int8x16_t) -> int8x16_t { + unsafe { simd_shuffle!(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev32q_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev32) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev32q_u16(a: uint16x8_t) -> uint16x8_t { + unsafe { simd_shuffle!(a, a, [1, 0, 3, 2, 5, 4, 7, 6]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev32q_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev32) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev32q_u8(a: uint8x16_t) -> uint8x16_t { + unsafe { simd_shuffle!(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64_f32(a: float32x2_t) -> float32x2_t { + unsafe { simd_shuffle!(a, a, [1, 0]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64_p16(a: poly16x4_t) -> poly16x4_t { + unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) } +} +#[doc = "Reversing 
vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64_p8(a: poly8x8_t) -> poly8x8_t { + unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64_s16(a: int16x4_t) -> int16x4_t { + unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64_s32(a: int32x2_t) -> int32x2_t { + unsafe { simd_shuffle!(a, a, [1, 0]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64_s8(a: int8x8_t) -> int8x8_t { + unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + 
not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64_u16(a: uint16x4_t) -> uint16x4_t { + unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64_u32(a: uint32x2_t) -> uint32x2_t { + unsafe { simd_shuffle!(a, a, [1, 0]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64_u8(a: uint8x8_t) -> uint8x8_t { + unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64q_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64q_f32(a: float32x4_t) -> float32x4_t { + unsafe { simd_shuffle!(a, a, [1, 0, 3, 2]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64q_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64q_p16(a: poly16x8_t) -> poly16x8_t { + unsafe { simd_shuffle!(a, a, [3, 2, 1, 0, 7, 6, 5, 4]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64q_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64q_p8(a: poly8x16_t) -> poly8x16_t { + unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64q_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64q_s16(a: int16x8_t) -> int16x8_t { + unsafe { simd_shuffle!(a, a, [3, 2, 1, 0, 7, 6, 5, 4]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64q_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64q_s32(a: int32x4_t) -> int32x4_t { + unsafe { simd_shuffle!(a, a, [1, 0, 3, 2]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64q_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64q_s8(a: int8x16_t) -> int8x16_t { + unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64q_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + 
assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64q_u16(a: uint16x8_t) -> uint16x8_t { + unsafe { simd_shuffle!(a, a, [3, 2, 1, 0, 7, 6, 5, 4]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64q_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64q_u32(a: uint32x4_t) -> uint32x4_t { + unsafe { simd_shuffle!(a, a, [1, 0, 3, 2]) } +} +#[doc = "Reversing vector elements (swap endianness)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64q_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrev64q_u8(a: uint8x16_t) -> uint8x16_t { + unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8]) } +} +#[doc = "Reverse elements in 64-bit doublewords"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrev64))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrev64_f16(a: float16x4_t) -> float16x4_t { + unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) } +} +#[doc = "Reverse elements in 64-bit doublewords"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrev64q_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrev64))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rev64) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrev64q_f16(a: float16x8_t) -> float16x8_t { + unsafe { simd_shuffle!(a, a, [3, 2, 1, 0, 7, 6, 5, 4]) } +} +#[doc = "Rounding halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrhadd_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s8"))] +#[cfg_attr( + all(test, 
any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrhadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.srhadd.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v8i8")] + fn _vrhadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } + unsafe { _vrhadd_s8(a, b) } +} +#[doc = "Rounding halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrhaddq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrhaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.srhadd.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v16i8")] + fn _vrhaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } + unsafe { _vrhaddq_s8(a, b) } +} +#[doc = "Rounding halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrhadd_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrhadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.srhadd.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v4i16")] + fn _vrhadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vrhadd_s16(a, b) } +} +#[doc = "Rounding halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrhaddq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrhaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", 
target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.srhadd.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v8i16")] + fn _vrhaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vrhaddq_s16(a, b) } +} +#[doc = "Rounding halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrhadd_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrhadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.srhadd.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v2i32")] + fn _vrhadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vrhadd_s32(a, b) } +} +#[doc = "Rounding halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrhaddq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrhaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.srhadd.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v4i32")] + fn _vrhaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vrhaddq_s32(a, b) } +} +#[doc = "Rounding halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrhadd_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrhadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urhadd.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v8i8")] + fn _vrhadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; + } + unsafe { _vrhadd_u8(a, b) } +} +#[doc = "Rounding halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrhaddq_u8)"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrhaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urhadd.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v16i8")] + fn _vrhaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + } + unsafe { _vrhaddq_u8(a, b) } +} +#[doc = "Rounding halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrhadd_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrhadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urhadd.v4i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v4i16")] + fn _vrhadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; + } + unsafe { _vrhadd_u16(a, b) } +} +#[doc = "Rounding halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrhaddq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrhaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urhadd.v8i16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v8i16")] + fn _vrhaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; + } + unsafe { _vrhaddq_u16(a, b) } +} +#[doc = "Rounding halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrhadd_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + 
target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrhadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urhadd.v2i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v2i32")] + fn _vrhadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; + } + unsafe { _vrhadd_u32(a, b) } +} +#[doc = "Rounding halving add"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrhaddq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urhadd) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrhaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urhadd.v4i32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v4i32")] + fn _vrhaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + unsafe { _vrhaddq_u32(a, b) } +} +#[doc = "Floating-point round to integral, to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndn_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrintn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frintn) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrndn_f16(a: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.roundeven.v4f16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrintn.v4f16")] + fn _vrndn_f16(a: float16x4_t) -> float16x4_t; + } + unsafe { _vrndn_f16(a) } +} +#[doc = "Floating-point round to integral, to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndnq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrintn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frintn) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrndnq_f16(a: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.roundeven.v8f16" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrintn.v8f16")] + fn _vrndnq_f16(a: float16x8_t) -> float16x8_t; + } + unsafe { _vrndnq_f16(a) } +} +#[doc = "Floating-point round to integral, to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndn_f32)"] +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrintn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frintn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrndn_f32(a: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.roundeven.v2f32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrintn.v2f32")] + fn _vrndn_f32(a: float32x2_t) -> float32x2_t; + } + unsafe { _vrndn_f32(a) } +} +#[doc = "Floating-point round to integral, to nearest with ties to even"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrndnq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrintn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frintn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrndnq_f32(a: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.roundeven.v4f32" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrintn.v4f32")] + fn _vrndnq_f32(a: float32x4_t) -> float32x4_t; + } + unsafe { _vrndnq_f32(a) } +} +#[doc = "Signed rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshl_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.srshl.v8i8" + )] + fn _vrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } + unsafe { _vrshl_s8(a, b) } +} +#[doc = "Signed rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshlq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", 
issue = "111800") +)] +pub fn vrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.srshl.v16i8" + )] + fn _vrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } + unsafe { _vrshlq_s8(a, b) } +} +#[doc = "Signed rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshl_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.srshl.v4i16" + )] + fn _vrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vrshl_s16(a, b) } +} +#[doc = "Signed rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshlq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v8i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.srshl.v8i16" + )] + fn _vrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vrshlq_s16(a, b) } +} +#[doc = "Signed rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshl_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.srshl.v2i32" + )] + fn _vrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vrshl_s32(a, b) } +} +#[doc = "Signed rounding shift 
left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshlq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.srshl.v4i32" + )] + fn _vrshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vrshlq_s32(a, b) } +} +#[doc = "Signed rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshl_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v1i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.srshl.v1i64" + )] + fn _vrshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t; + } + unsafe { _vrshl_s64(a, b) } +} +#[doc = "Signed rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshlq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v2i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.srshl.v2i64" + )] + fn _vrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; + } + unsafe { _vrshlq_s64(a, b) } +} +#[doc = "Unsigned rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshl_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshl) +)] +#[cfg_attr( + not(target_arch = 
"arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urshl.v8i8" + )] + fn _vrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; + } + unsafe { _vrshl_u8(a, b) } +} +#[doc = "Unsigned rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshlq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urshl.v16i8" + )] + fn _vrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; + } + unsafe { _vrshlq_u8(a, b) } +} +#[doc = "Unsigned rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshl_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urshl.v4i16" + )] + fn _vrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; + } + unsafe { _vrshl_u16(a, b) } +} +#[doc = "Unsigned rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshlq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v8i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = 
"llvm.aarch64.neon.urshl.v8i16" + )] + fn _vrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; + } + unsafe { _vrshlq_u16(a, b) } +} +#[doc = "Unsigned rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshl_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urshl.v2i32" + )] + fn _vrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; + } + unsafe { _vrshl_u32(a, b) } +} +#[doc = "Unsigned rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshlq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urshl.v4i32" + )] + fn _vrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + } + unsafe { _vrshlq_u32(a, b) } +} +#[doc = "Unsigned rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshl_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v1i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urshl.v1i64" + )] + fn _vrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; + } + unsafe { _vrshl_u64(a, b) } +} +#[doc = "Unsigned rounding shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshlq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v2i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.urshl.v2i64" + )] + fn _vrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; + } + unsafe { _vrshlq_u64(a, b) } +} +#[doc = "Signed rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshr_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshr_n_s8(a: int8x8_t) -> int8x8_t { + static_assert!(N >= 1 && N <= 8); + vrshl_s8(a, vdup_n_s8(-N as _)) +} +#[doc = "Signed rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrq_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshrq_n_s8(a: int8x16_t) -> int8x16_t { + static_assert!(N >= 1 && N <= 8); + vrshlq_s8(a, vdupq_n_s8(-N as _)) +} +#[doc = "Signed rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshr_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshr_n_s16(a: int16x4_t) -> int16x4_t { + static_assert!(N >= 1 && N <= 16); + vrshl_s16(a, vdup_n_s16(-N as _)) +} +#[doc = "Signed rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), 
assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshrq_n_s16(a: int16x8_t) -> int16x8_t { + static_assert!(N >= 1 && N <= 16); + vrshlq_s16(a, vdupq_n_s16(-N as _)) +} +#[doc = "Signed rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshr_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshr_n_s32(a: int32x2_t) -> int32x2_t { + static_assert!(N >= 1 && N <= 32); + vrshl_s32(a, vdup_n_s32(-N as _)) +} +#[doc = "Signed rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshrq_n_s32(a: int32x4_t) -> int32x4_t { + static_assert!(N >= 1 && N <= 32); + vrshlq_s32(a, vdupq_n_s32(-N as _)) +} +#[doc = "Signed rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshr_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshr_n_s64(a: int64x1_t) -> int64x1_t { + static_assert!(N >= 1 && N <= 64); + vrshl_s64(a, vdup_n_s64(-N as _)) +} +#[doc = "Signed rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrq_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = 
"neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshrq_n_s64(a: int64x2_t) -> int64x2_t { + static_assert!(N >= 1 && N <= 64); + vrshlq_s64(a, vdupq_n_s64(-N as _)) +} +#[doc = "Unsigned rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshr_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshr_n_u8(a: uint8x8_t) -> uint8x8_t { + static_assert!(N >= 1 && N <= 8); + vrshl_u8(a, vdup_n_s8(-N as _)) +} +#[doc = "Unsigned rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrq_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshrq_n_u8(a: uint8x16_t) -> uint8x16_t { + static_assert!(N >= 1 && N <= 8); + vrshlq_u8(a, vdupq_n_s8(-N as _)) +} +#[doc = "Unsigned rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshr_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshr_n_u16(a: uint16x4_t) -> uint16x4_t { + static_assert!(N >= 1 && N <= 16); + vrshl_u16(a, vdup_n_s16(-N as _)) +} +#[doc = "Unsigned rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrq_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshrq_n_u16(a: uint16x8_t) -> uint16x8_t { + static_assert!(N >= 1 && N <= 16); + 
vrshlq_u16(a, vdupq_n_s16(-N as _)) +} +#[doc = "Unsigned rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshr_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshr_n_u32(a: uint32x2_t) -> uint32x2_t { + static_assert!(N >= 1 && N <= 32); + vrshl_u32(a, vdup_n_s32(-N as _)) +} +#[doc = "Unsigned rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrq_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshrq_n_u32(a: uint32x4_t) -> uint32x4_t { + static_assert!(N >= 1 && N <= 32); + vrshlq_u32(a, vdupq_n_s32(-N as _)) +} +#[doc = "Unsigned rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshr_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshr_n_u64(a: uint64x1_t) -> uint64x1_t { + static_assert!(N >= 1 && N <= 64); + vrshl_u64(a, vdup_n_s64(-N as _)) +} +#[doc = "Unsigned rounding shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrq_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(urshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshrq_n_u64(a: uint64x2_t) -> uint64x2_t { + static_assert!(N >= 1 && N <= 64); + vrshlq_u64(a, vdupq_n_s64(-N as _)) +} +#[doc = "Rounding shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_n_s16)"] +#[inline] +#[cfg(target_arch = 
"arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vrshrn_n_s16(a: int16x8_t) -> int8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v8i8")] + fn _vrshrn_n_s16(a: int16x8_t, n: int16x8_t) -> int8x8_t; + } + unsafe { + _vrshrn_n_s16( + a, + const { + int16x8_t([ + -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, + -N as i16, + ]) + }, + ) + } +} +#[doc = "Rounding shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_n_s32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vrshrn_n_s32(a: int32x4_t) -> int16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v4i16")] + fn _vrshrn_n_s32(a: int32x4_t, n: int32x4_t) -> int16x4_t; + } + unsafe { + _vrshrn_n_s32( + a, + const { int32x4_t([-N as i32, -N as i32, -N as i32, -N as i32]) }, + ) + } +} +#[doc = "Rounding shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_n_s64)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub fn vrshrn_n_s64(a: int64x2_t) -> int32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v2i32")] + fn _vrshrn_n_s64(a: int64x2_t, n: int64x2_t) -> int32x2_t; + } + unsafe { _vrshrn_n_s64(a, const { int64x2_t([-N as i64, -N as i64]) }) } +} +#[doc = "Rounding shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(rshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrshrn_n_s16(a: int16x8_t) -> int8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.rshrn.v8i8" + )] + fn _vrshrn_n_s16(a: int16x8_t, n: i32) -> int8x8_t; + } + unsafe { _vrshrn_n_s16(a, N) } +} +#[doc = "Rounding shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(rshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrshrn_n_s32(a: int32x4_t) -> int16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.rshrn.v4i16" + )] + fn _vrshrn_n_s32(a: int32x4_t, n: i32) -> int16x4_t; + } + unsafe { _vrshrn_n_s32(a, N) } +} +#[doc = 
"Rounding shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(rshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub fn vrshrn_n_s64(a: int64x2_t) -> int32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.rshrn.v2i32" + )] + fn _vrshrn_n_s64(a: int64x2_t, n: i32) -> int32x2_t; + } + unsafe { _vrshrn_n_s64(a, N) } +} +#[doc = "Rounding shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rshrn, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshrn_n_u16(a: uint16x8_t) -> uint8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe { transmute(vrshrn_n_s16::(transmute(a))) } +} +#[doc = "Rounding shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rshrn, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshrn_n_u32(a: uint32x4_t) -> uint16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe { transmute(vrshrn_n_s32::(transmute(a))) } +} +#[doc = "Rounding shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rshrn, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrshrn_n_u64(a: uint64x2_t) -> uint32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe { transmute(vrshrn_n_s64::(transmute(a))) } +} +#[doc = "Reciprocal square-root estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrte_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(all(test, target_arch = 
"arm"), assert_instr(vrsqrte))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frsqrte) +)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrsqrte_f16(a: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v4f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrte.v4f16" + )] + fn _vrsqrte_f16(a: float16x4_t) -> float16x4_t; + } + unsafe { _vrsqrte_f16(a) } +} +#[doc = "Reciprocal square-root estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrteq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frsqrte) +)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrsqrteq_f16(a: float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v8f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrte.v8f16" + )] + fn _vrsqrteq_f16(a: float16x8_t) -> float16x8_t; + } + unsafe { _vrsqrteq_f16(a) } +} +#[doc = "Reciprocal square-root estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrte_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frsqrte) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsqrte_f32(a: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrte.v2f32" + )] + fn _vrsqrte_f32(a: float32x2_t) -> float32x2_t; + } + unsafe { _vrsqrte_f32(a) } +} +#[doc = "Reciprocal square-root estimate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrteq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frsqrte) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsqrteq_f32(a: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v4f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrte.v4f32" + )] + fn _vrsqrteq_f32(a: float32x4_t) -> float32x4_t; + } + unsafe { _vrsqrteq_f32(a) } +} +#[doc = "Unsigned 
reciprocal square root estimate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrte_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ursqrte) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsqrte_u32(a: uint32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ursqrte.v2i32" + )] + fn _vrsqrte_u32(a: uint32x2_t) -> uint32x2_t; + } + unsafe { _vrsqrte_u32(a) } +} +#[doc = "Unsigned reciprocal square root estimate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrteq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ursqrte) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsqrteq_u32(a: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ursqrte.v4i32" + )] + fn _vrsqrteq_u32(a: uint32x4_t) -> uint32x4_t; + } + unsafe { _vrsqrteq_u32(a) } +} +#[doc = "Floating-point reciprocal square root step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrts_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrts))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frsqrts) +)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrsqrts_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrts.v4f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrts.v4f16" + )] + fn _vrsqrts_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vrsqrts_f16(a, b) } +} +#[doc = "Floating-point reciprocal square root step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrtsq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,fp16")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrts))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frsqrts) +)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vrsqrtsq_f16(a: float16x8_t, b: 
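// Sketch of the usual estimate-and-refine pattern for these intrinsics, assuming an
// AArch64 target and `core::arch::aarch64` names (illustrative, not from the generated
// source): `vrsqrte_*` returns a low-precision estimate of 1/sqrt(x), and `vrsqrts_*`
// computes (3 - a * b) / 2, the Newton-Raphson step used to refine that estimate:
//
//     use core::arch::aarch64::{float32x2_t, vmul_f32, vrsqrte_f32, vrsqrts_f32};
//
//     #[target_feature(enable = "neon")]
//     fn inv_sqrt(x: float32x2_t) -> float32x2_t {
//         let mut y = vrsqrte_f32(x);                      // initial estimate
//         y = vmul_f32(y, vrsqrts_f32(vmul_f32(x, y), y)); // refinement step 1
//         y = vmul_f32(y, vrsqrts_f32(vmul_f32(x, y), y)); // refinement step 2
//         y
//     }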
float16x8_t) -> float16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrts.v8f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrts.v8f16" + )] + fn _vrsqrtsq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vrsqrtsq_f16(a, b) } +} +#[doc = "Floating-point reciprocal square root step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrts_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrts))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frsqrts) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsqrts_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrts.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrts.v2f32" + )] + fn _vrsqrts_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vrsqrts_f32(a, b) } +} +#[doc = "Floating-point reciprocal square root step"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrtsq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrts))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(frsqrts) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsqrtsq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrts.v4f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.frsqrts.v4f32" + )] + fn _vrsqrtsq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vrsqrtsq_f32(a, b) } +} +#[doc = "Signed rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsra_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srsra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsra_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe { simd_add(a, vrshr_n_s8::(b)) } +} +#[doc = "Signed rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsraq_n_s8)"] 
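// Usage sketch for the rounding shift-right-and-accumulate intrinsics, assuming an
// AArch64 target and `core::arch::aarch64` names (illustrative, not from the generated
// source). `vrsra{,q}_n_*` compute `a + vrshr_n::<N>(b)`, as the `simd_add` bodies here show:
//
//     use core::arch::aarch64::{vdup_n_u8, vget_lane_u8, vrsra_n_u8};
//
//     #[target_feature(enable = "neon")]
//     fn rounding_accumulate_demo() {
//         let acc = vdup_n_u8(100);
//         let x = vdup_n_u8(30);
//         // 100 + ((30 + (1 << 2)) >> 3) == 100 + 4
//         let r = vrsra_n_u8::<3>(acc, x);
//         assert_eq!(vget_lane_u8::<0>(r), 104);
//     }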
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srsra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsraq_n_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { simd_add(a, vrshrq_n_s8::<N>(b)) } +} +#[doc = "Signed rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsra_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srsra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsra_n_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_add(a, vrshr_n_s16::<N>(b)) } +} +#[doc = "Signed rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsraq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srsra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsraq_n_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_add(a, vrshrq_n_s16::<N>(b)) } +} +#[doc = "Signed rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsra_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srsra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsra_n_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_add(a, vrshr_n_s32::<N>(b)) } +} +#[doc = "Signed rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsraq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = 
"v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srsra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsraq_n_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_add(a, vrshrq_n_s32::(b)) } +} +#[doc = "Signed rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsra_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srsra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsra_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + static_assert!(N >= 1 && N <= 64); + unsafe { simd_add(a, vrshr_n_s64::(b)) } +} +#[doc = "Signed rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsraq_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(srsra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsraq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + static_assert!(N >= 1 && N <= 64); + unsafe { simd_add(a, vrshrq_n_s64::(b)) } +} +#[doc = "Unsigned rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsra_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ursra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsra_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe { simd_add(a, vrshr_n_u8::(b)) } +} +#[doc = "Unsigned rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsraq_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, 
any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ursra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsraq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { simd_add(a, vrshrq_n_u8::(b)) } +} +#[doc = "Unsigned rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsra_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ursra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsra_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_add(a, vrshr_n_u16::(b)) } +} +#[doc = "Unsigned rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsraq_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ursra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsraq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_add(a, vrshrq_n_u16::(b)) } +} +#[doc = "Unsigned rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsra_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ursra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsra_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_add(a, vrshr_n_u32::(b)) } +} +#[doc = "Unsigned rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsraq_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ursra, N = 2) 
+)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsraq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_add(a, vrshrq_n_u32::(b)) } +} +#[doc = "Unsigned rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsra_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ursra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsra_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + static_assert!(N >= 1 && N <= 64); + unsafe { simd_add(a, vrshr_n_u64::(b)) } +} +#[doc = "Unsigned rounding shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsraq_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ursra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsraq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + static_assert!(N >= 1 && N <= 64); + unsafe { simd_add(a, vrshrq_n_u64::(b)) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rsubhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsubhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsubhn.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.rsubhn.v8i8" + )] + fn _vrsubhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t; + } + unsafe { _vrsubhn_s16(a, b) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsubhn))] +#[cfg_attr( + 
all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rsubhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsubhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsubhn.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.rsubhn.v4i16" + )] + fn _vrsubhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t; + } + unsafe { _vrsubhn_s32(a, b) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rsubhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsubhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsubhn.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.rsubhn.v2i32" + )] + fn _vrsubhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t; + } + unsafe { _vrsubhn_s64(a, b) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_u16)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rsubhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsubhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t { + unsafe { transmute(vrsubhn_s16(transmute(a), transmute(b))) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_u16)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rsubhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsubhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t { + let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint16x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = 
transmute(vrsubhn_s16(transmute(a), transmute(b))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_u32)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rsubhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsubhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t { + unsafe { transmute(vrsubhn_s32(transmute(a), transmute(b))) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_u32)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rsubhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsubhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t { + let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) }; + let b: uint32x4_t = unsafe { simd_shuffle!(b, b, [3, 2, 1, 0]) }; + unsafe { + let ret_val: uint16x4_t = transmute(vrsubhn_s32(transmute(a), transmute(b))); + simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0]) + } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_u64)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rsubhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsubhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t { + unsafe { transmute(vrsubhn_s64(transmute(a), transmute(b))) } +} +#[doc = "Rounding subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_u64)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(rsubhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vrsubhn_u64(a: 
uint64x2_t, b: uint64x2_t) -> uint32x2_t { + let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) }; + let b: uint64x2_t = unsafe { simd_shuffle!(b, b, [1, 0]) }; + unsafe { + let ret_val: uint32x2_t = transmute(vrsubhn_s64(transmute(a), transmute(b))); + simd_shuffle!(ret_val, ret_val, [1, 0]) + } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vset_lane_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vset_lane_f16(a: f16, b: float16x4_t) -> float16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsetq_lane_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vsetq_lane_f16(a: f16, b: float16x8_t) -> float16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vset_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vset_lane_f32(a: f32, b: float32x2_t) -> float32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsetq_lane_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsetq_lane_f32(a: f32, b: float32x4_t) -> float32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from 
another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vset_lane_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vset_lane_s8(a: i8, b: int8x8_t) -> int8x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsetq_lane_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsetq_lane_s8(a: i8, b: int8x16_t) -> int8x16_t { + static_assert_uimm_bits!(LANE, 4); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vset_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vset_lane_s16(a: i16, b: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsetq_lane_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsetq_lane_s16(a: i16, b: int16x8_t) -> int16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vset_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vset_lane_s32(a: i32, b: int32x2_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsetq_lane_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsetq_lane_s32(a: i32, b: int32x4_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsetq_lane_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsetq_lane_s64(a: i64, b: int64x2_t) -> int64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vset_lane_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vset_lane_u8(a: u8, b: uint8x8_t) -> uint8x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsetq_lane_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsetq_lane_u8(a: u8, b: uint8x16_t) -> uint8x16_t { + static_assert_uimm_bits!(LANE, 4); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vset_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vset_lane_u16(a: u16, b: uint16x4_t) -> uint16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsetq_lane_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsetq_lane_u16(a: u16, b: uint16x8_t) -> uint16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vset_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vset_lane_u32(a: u32, b: uint32x2_t) -> uint32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsetq_lane_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsetq_lane_u32(a: u32, b: uint32x4_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsetq_lane_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsetq_lane_u64(a: u64, b: uint64x2_t) -> uint64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vset_lane_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vset_lane_p8(a: p8, b: poly8x8_t) -> poly8x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsetq_lane_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsetq_lane_p8(a: p8, b: poly8x16_t) -> poly8x16_t { + static_assert_uimm_bits!(LANE, 4); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vset_lane_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vset_lane_p16(a: p16, b: poly16x4_t) -> poly16x4_t { + static_assert_uimm_bits!(LANE, 2); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsetq_lane_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsetq_lane_p16(a: p16, b: poly16x8_t) -> poly16x8_t { + static_assert_uimm_bits!(LANE, 3); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vset_lane_p64)"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vset_lane_p64(a: p64, b: poly64x1_t) -> poly64x1_t { + static_assert!(LANE == 0); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vset_lane_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vset_lane_s64(a: i64, b: int64x1_t) -> int64x1_t { + static_assert!(LANE == 0); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vset_lane_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vset_lane_u64(a: u64, b: uint64x1_t) -> uint64x1_t { + static_assert!(LANE == 0); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "Insert vector element from another vector element"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsetq_lane_p64)"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsetq_lane_p64(a: p64, b: poly64x2_t) -> poly64x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { simd_insert!(b, LANE as u32, a) } +} +#[doc = "SHA1 hash update accelerator, choose."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsha1cq_u32)"] +#[inline] +#[target_feature(enable = "sha2")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(sha1c))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "aarch64_neon_crypto_intrinsics", since = "1.72.0") +)] +pub fn vsha1cq_u32(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sha1c" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha1c")] + fn _vsha1cq_u32(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t; + } + unsafe { _vsha1cq_u32(hash_abcd, hash_e, wk) } +} +#[doc = "SHA1 fixed rotate."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsha1h_u32)"] +#[inline] +#[target_feature(enable = "sha2")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(sha1h))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "aarch64_neon_crypto_intrinsics", since = "1.72.0") +)] +pub fn vsha1h_u32(hash_e: u32) -> u32 { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sha1h" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha1h")] + fn _vsha1h_u32(hash_e: u32) -> u32; + } + unsafe { _vsha1h_u32(hash_e) } +} +#[doc = "SHA1 hash update accelerator, 
majority"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsha1mq_u32)"] +#[inline] +#[target_feature(enable = "sha2")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(sha1m))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "aarch64_neon_crypto_intrinsics", since = "1.72.0") +)] +pub fn vsha1mq_u32(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sha1m" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha1m")] + fn _vsha1mq_u32(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t; + } + unsafe { _vsha1mq_u32(hash_abcd, hash_e, wk) } +} +#[doc = "SHA1 hash update accelerator, parity"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsha1pq_u32)"] +#[inline] +#[target_feature(enable = "sha2")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(sha1p))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "aarch64_neon_crypto_intrinsics", since = "1.72.0") +)] +pub fn vsha1pq_u32(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sha1p" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha1p")] + fn _vsha1pq_u32(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t; + } + unsafe { _vsha1pq_u32(hash_abcd, hash_e, wk) } +} +#[doc = "SHA1 schedule update accelerator, first part."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsha1su0q_u32)"] +#[inline] +#[target_feature(enable = "sha2")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(sha1su0))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "aarch64_neon_crypto_intrinsics", since = "1.72.0") +)] +pub fn vsha1su0q_u32(w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sha1su0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha1su0")] + fn _vsha1su0q_u32(w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t) -> uint32x4_t; + } + unsafe { _vsha1su0q_u32(w0_3, w4_7, w8_11) } +} +#[doc = "SHA1 schedule update accelerator, second part."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsha1su1q_u32)"] +#[inline] +#[target_feature(enable = "sha2")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(sha1su1))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "aarch64_neon_crypto_intrinsics", since = "1.72.0") +)] +pub fn vsha1su1q_u32(tw0_3: uint32x4_t, 
w12_15: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sha1su1" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha1su1")] + fn _vsha1su1q_u32(tw0_3: uint32x4_t, w12_15: uint32x4_t) -> uint32x4_t; + } + unsafe { _vsha1su1q_u32(tw0_3, w12_15) } +} +#[doc = "SHA256 hash update accelerator, second part."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsha256h2q_u32)"] +#[inline] +#[target_feature(enable = "sha2")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(sha256h2))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "aarch64_neon_crypto_intrinsics", since = "1.72.0") +)] +pub fn vsha256h2q_u32(hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sha256h2" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha256h2")] + fn _vsha256h2q_u32( + hash_abcd: uint32x4_t, + hash_efgh: uint32x4_t, + wk: uint32x4_t, + ) -> uint32x4_t; + } + unsafe { _vsha256h2q_u32(hash_abcd, hash_efgh, wk) } +} +#[doc = "SHA256 hash update accelerator, first part."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsha256hq_u32)"] +#[inline] +#[target_feature(enable = "sha2")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(sha256h))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "aarch64_neon_crypto_intrinsics", since = "1.72.0") +)] +pub fn vsha256hq_u32(hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sha256h" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha256h")] + fn _vsha256hq_u32( + hash_abcd: uint32x4_t, + hash_efgh: uint32x4_t, + wk: uint32x4_t, + ) -> uint32x4_t; + } + unsafe { _vsha256hq_u32(hash_abcd, hash_efgh, wk) } +} +#[doc = "SHA256 schedule update accelerator, first part."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsha256su0q_u32)"] +#[inline] +#[target_feature(enable = "sha2")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(sha256su0))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "aarch64_neon_crypto_intrinsics", since = "1.72.0") +)] +pub fn vsha256su0q_u32(w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sha256su0" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha256su0")] + fn _vsha256su0q_u32(w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t; + } + unsafe { _vsha256su0q_u32(w0_3, w4_7) } +} +#[doc = "SHA256 schedule update accelerator, second part."] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsha256su1q_u32)"] +#[inline] +#[target_feature(enable = "sha2")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(test, assert_instr(sha256su1))] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "aarch64_neon_crypto_intrinsics", since = "1.72.0") +)] +pub fn vsha256su1q_u32(tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.crypto.sha256su1" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha256su1")] + fn _vsha256su1q_u32(tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t) + -> uint32x4_t; + } + unsafe { _vsha256su1q_u32(tw0_3, w8_11, w12_15) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshiftins_v16i8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +fn vshiftins_v16i8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftins.v16i8")] + fn _vshiftins_v16i8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t; + } + unsafe { _vshiftins_v16i8(a, b, c) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshiftins_v1i64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +fn vshiftins_v1i64(a: int64x1_t, b: int64x1_t, c: int64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftins.v1i64")] + fn _vshiftins_v1i64(a: int64x1_t, b: int64x1_t, c: int64x1_t) -> int64x1_t; + } + unsafe { _vshiftins_v1i64(a, b, c) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshiftins_v2i32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +fn vshiftins_v2i32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftins.v2i32")] + fn _vshiftins_v2i32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t; + } + unsafe { _vshiftins_v2i32(a, b, c) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshiftins_v2i64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +fn vshiftins_v2i64(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = 
"arm", link_name = "llvm.arm.neon.vshiftins.v2i64")] + fn _vshiftins_v2i64(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t; + } + unsafe { _vshiftins_v2i64(a, b, c) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshiftins_v4i16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +fn vshiftins_v4i16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftins.v4i16")] + fn _vshiftins_v4i16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t; + } + unsafe { _vshiftins_v4i16(a, b, c) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshiftins_v4i32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +fn vshiftins_v4i32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftins.v4i32")] + fn _vshiftins_v4i32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t; + } + unsafe { _vshiftins_v4i32(a, b, c) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshiftins_v8i16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +fn vshiftins_v8i16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftins.v8i16")] + fn _vshiftins_v8i16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t; + } + unsafe { _vshiftins_v8i16(a, b, c) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshiftins_v8i8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +fn vshiftins_v8i8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftins.v8i8")] + fn _vshiftins_v8i8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t; + } + unsafe { _vshiftins_v8i8(a, b, c) } +} +#[doc = "Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_n_s8(a: int8x8_t) -> int8x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { simd_shl(a, vdup_n_s8(N as _)) } +} +#[doc = "Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_n_s8(a: int8x16_t) -> int8x16_t { + static_assert_uimm_bits!(N, 3); + unsafe { simd_shl(a, vdupq_n_s8(N as _)) } +} +#[doc = "Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_n_s16(a: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(N, 4); + unsafe { simd_shl(a, vdup_n_s16(N as _)) } +} +#[doc = "Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_n_s16(a: int16x8_t) -> int16x8_t { + static_assert_uimm_bits!(N, 4); + unsafe { simd_shl(a, vdupq_n_s16(N as _)) } +} +#[doc = "Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_n_s32(a: int32x2_t) -> int32x2_t { + static_assert_uimm_bits!(N, 5); + unsafe { simd_shl(a, vdup_n_s32(N as _)) } +} +#[doc = "Shift left"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_n_s32(a: int32x4_t) -> int32x4_t { + static_assert_uimm_bits!(N, 5); + unsafe { simd_shl(a, vdupq_n_s32(N as _)) } +} +#[doc = "Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_n_s64(a: int64x1_t) -> int64x1_t { + static_assert_uimm_bits!(N, 6); + unsafe { simd_shl(a, vdup_n_s64(N as _)) } +} +#[doc = "Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_n_s64(a: int64x2_t) -> int64x2_t { + static_assert_uimm_bits!(N, 6); + unsafe { simd_shl(a, vdupq_n_s64(N as _)) } +} +#[doc = "Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_n_u8(a: uint8x8_t) -> uint8x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { simd_shl(a, vdup_n_u8(N as _)) } +} +#[doc = "Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, 
any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_n_u8(a: uint8x16_t) -> uint8x16_t { + static_assert_uimm_bits!(N, 3); + unsafe { simd_shl(a, vdupq_n_u8(N as _)) } +} +#[doc = "Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_n_u16(a: uint16x4_t) -> uint16x4_t { + static_assert_uimm_bits!(N, 4); + unsafe { simd_shl(a, vdup_n_u16(N as _)) } +} +#[doc = "Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_n_u16(a: uint16x8_t) -> uint16x8_t { + static_assert_uimm_bits!(N, 4); + unsafe { simd_shl(a, vdupq_n_u16(N as _)) } +} +#[doc = "Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_n_u32(a: uint32x2_t) -> uint32x2_t { + static_assert_uimm_bits!(N, 5); + unsafe { simd_shl(a, vdup_n_u32(N as _)) } +} +#[doc = "Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_n_u32(a: uint32x4_t) -> uint32x4_t { + static_assert_uimm_bits!(N, 5); + unsafe { simd_shl(a, vdupq_n_u32(N as _)) } +} +#[doc = "Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_n_u64(a: uint64x1_t) -> uint64x1_t { + static_assert_uimm_bits!(N, 6); + unsafe { simd_shl(a, vdup_n_u64(N as _)) } +} +#[doc = "Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shl, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_n_u64(a: uint64x2_t) -> uint64x2_t { + static_assert_uimm_bits!(N, 6); + unsafe { simd_shl(a, vdupq_n_u64(N as _)) } +} +#[doc = "Signed Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sshl.v8i8" + )] + fn _vshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } + unsafe { _vshl_s8(a, b) } +} +#[doc = "Signed Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe extern "unadjusted" { + 
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sshl.v16i8" + )] + fn _vshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } + unsafe { _vshlq_s8(a, b) } +} +#[doc = "Signed Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sshl.v4i16" + )] + fn _vshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } + unsafe { _vshl_s16(a, b) } +} +#[doc = "Signed Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v8i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sshl.v8i16" + )] + fn _vshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } + unsafe { _vshlq_s16(a, b) } +} +#[doc = "Signed Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sshl.v2i32" + )] + fn _vshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } + unsafe { _vshl_s32(a, b) } +} +#[doc = "Signed Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_s32)"] +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sshl.v4i32" + )] + fn _vshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } + unsafe { _vshlq_s32(a, b) } +} +#[doc = "Signed Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v1i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sshl.v1i64" + )] + fn _vshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t; + } + unsafe { _vshl_s64(a, b) } +} +#[doc = "Signed Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v2i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.sshl.v2i64" + )] + fn _vshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; + } + unsafe { _vshlq_s64(a, b) } +} +#[doc = "Unsigned Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_u8(a: uint8x8_t, b: int8x8_t) 
-> uint8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v8i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ushl.v8i8" + )] + fn _vshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; + } + unsafe { _vshl_u8(a, b) } +} +#[doc = "Unsigned Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v16i8")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ushl.v16i8" + )] + fn _vshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; + } + unsafe { _vshlq_u8(a, b) } +} +#[doc = "Unsigned Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v4i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ushl.v4i16" + )] + fn _vshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; + } + unsafe { _vshl_u16(a, b) } +} +#[doc = "Unsigned Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v8i16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ushl.v8i16" + )] + fn _vshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; + } + unsafe { _vshlq_u16(a, b) } +} +#[doc = "Unsigned Shift left"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v2i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ushl.v2i32" + )] + fn _vshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; + } + unsafe { _vshl_u32(a, b) } +} +#[doc = "Unsigned Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v4i32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ushl.v4i32" + )] + fn _vshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + } + unsafe { _vshlq_u32(a, b) } +} +#[doc = "Unsigned Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshl_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v1i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ushl.v1i64" + )] + fn _vshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; + } + unsafe { _vshl_u64(a, b) } +} +#[doc = "Unsigned Shift left"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshlq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") 
+)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v2i64")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.ushl.v2i64" + )] + fn _vshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; + } + unsafe { _vshlq_u64(a, b) } +} +#[doc = "Signed shift left long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s16", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshll, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshll_n_s16(a: int16x4_t) -> int32x4_t { + static_assert!(N >= 0 && N <= 16); + unsafe { simd_shl(simd_cast(a), vdupq_n_s32(N as _)) } +} +#[doc = "Signed shift left long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s32", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshll, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshll_n_s32(a: int32x2_t) -> int64x2_t { + static_assert!(N >= 0 && N <= 32); + unsafe { simd_shl(simd_cast(a), vdupq_n_s64(N as _)) } +} +#[doc = "Signed shift left long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s8", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshll, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshll_n_s8(a: int8x8_t) -> int16x8_t { + static_assert!(N >= 0 && N <= 8); + unsafe { simd_shl(simd_cast(a), vdupq_n_s16(N as _)) } +} +#[doc = "Unsigned shift left long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u16", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushll, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + 
not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshll_n_u16(a: uint16x4_t) -> uint32x4_t { + static_assert!(N >= 0 && N <= 16); + unsafe { simd_shl(simd_cast(a), vdupq_n_u32(N as _)) } +} +#[doc = "Signed shift left long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u32", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushll, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshll_n_u32(a: uint32x2_t) -> uint64x2_t { + static_assert!(N >= 0 && N <= 32); + unsafe { simd_shl(simd_cast(a), vdupq_n_u64(N as _)) } +} +#[doc = "Signed shift left long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u8", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushll, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshll_n_u8(a: uint8x8_t) -> uint16x8_t { + static_assert!(N >= 0 && N <= 8); + unsafe { simd_shl(simd_cast(a), vdupq_n_u16(N as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshr_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s8", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshr_n_s8(a: int8x8_t) -> int8x8_t { + static_assert!(N >= 1 && N <= 8); + let n: i32 = if N == 8 { 7 } else { N }; + unsafe { simd_shr(a, vdup_n_s8(n as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrq_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s8", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
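// ---- Editor's note: illustrative sketch, not part of the generated patch ----
// The vshll_n_* intrinsics above widen each lane to twice its width while shifting
// left by a constant in 0..=lane-bits. `shll_demo` is a hypothetical name; AArch64
// with baseline NEON is assumed.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
fn shll_demo() {
    use core::arch::aarch64::*;
    let a: uint8x8_t = vdup_n_u8(3);
    let widened: uint16x8_t = vshll_n_u8::<4>(a); // each u8 lane becomes a u16 lane: 3 << 4 = 48
    assert_eq!(vgetq_lane_u16::<0>(widened), 48);
}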
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshrq_n_s8(a: int8x16_t) -> int8x16_t { + static_assert!(N >= 1 && N <= 8); + let n: i32 = if N == 8 { 7 } else { N }; + unsafe { simd_shr(a, vdupq_n_s8(n as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshr_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s16", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshr_n_s16(a: int16x4_t) -> int16x4_t { + static_assert!(N >= 1 && N <= 16); + let n: i32 = if N == 16 { 15 } else { N }; + unsafe { simd_shr(a, vdup_n_s16(n as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s16", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshrq_n_s16(a: int16x8_t) -> int16x8_t { + static_assert!(N >= 1 && N <= 16); + let n: i32 = if N == 16 { 15 } else { N }; + unsafe { simd_shr(a, vdupq_n_s16(n as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshr_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s32", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshr_n_s32(a: int32x2_t) -> int32x2_t { + static_assert!(N >= 1 && N <= 32); + let n: i32 = if N == 32 { 31 } else { N }; + unsafe { simd_shr(a, vdup_n_s32(n as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s32", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshrq_n_s32(a: int32x4_t) -> 
int32x4_t { + static_assert!(N >= 1 && N <= 32); + let n: i32 = if N == 32 { 31 } else { N }; + unsafe { simd_shr(a, vdupq_n_s32(n as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshr_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s64", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshr_n_s64(a: int64x1_t) -> int64x1_t { + static_assert!(N >= 1 && N <= 64); + let n: i32 = if N == 64 { 63 } else { N }; + unsafe { simd_shr(a, vdup_n_s64(n as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrq_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s64", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sshr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshrq_n_s64(a: int64x2_t) -> int64x2_t { + static_assert!(N >= 1 && N <= 64); + let n: i32 = if N == 64 { 63 } else { N }; + unsafe { simd_shr(a, vdupq_n_s64(n as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshr_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u8", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshr_n_u8(a: uint8x8_t) -> uint8x8_t { + static_assert!(N >= 1 && N <= 8); + let n: i32 = if N == 8 { + return vdup_n_u8(0); + } else { + N + }; + unsafe { simd_shr(a, vdup_n_u8(n as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrq_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u8", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshrq_n_u8(a: uint8x16_t) -> uint8x16_t { + static_assert!(N >= 1 && N <= 8); + let n: i32 = if N 
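// ---- Editor's note: illustrative sketch, not part of the generated patch ----
// The signed vshr_n_* intrinsics above perform an arithmetic shift right by an
// immediate in 1..=lane-bits; as the `if N == bits { bits - 1 }` clamp above shows,
// shifting by the full lane width just replicates the sign bit. `sshr_demo` is a
// hypothetical name; AArch64 with baseline NEON is assumed.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
fn sshr_demo() {
    use core::arch::aarch64::*;
    let a: int16x4_t = vdup_n_s16(-32);
    assert_eq!(vget_lane_s16::<0>(vshr_n_s16::<3>(a)), -4); // -32 >> 3, sign-extended
    assert_eq!(vget_lane_s16::<0>(vshr_n_s16::<16>(a)), -1); // clamped to a shift by 15
}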
== 8 { + return vdupq_n_u8(0); + } else { + N + }; + unsafe { simd_shr(a, vdupq_n_u8(n as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshr_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u16", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshr_n_u16(a: uint16x4_t) -> uint16x4_t { + static_assert!(N >= 1 && N <= 16); + let n: i32 = if N == 16 { + return vdup_n_u16(0); + } else { + N + }; + unsafe { simd_shr(a, vdup_n_u16(n as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrq_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u16", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshrq_n_u16(a: uint16x8_t) -> uint16x8_t { + static_assert!(N >= 1 && N <= 16); + let n: i32 = if N == 16 { + return vdupq_n_u16(0); + } else { + N + }; + unsafe { simd_shr(a, vdupq_n_u16(n as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshr_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u32", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshr_n_u32(a: uint32x2_t) -> uint32x2_t { + static_assert!(N >= 1 && N <= 32); + let n: i32 = if N == 32 { + return vdup_n_u32(0); + } else { + N + }; + unsafe { simd_shr(a, vdup_n_u32(n as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrq_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u32", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshrq_n_u32(a: uint32x4_t) -> uint32x4_t { + static_assert!(N >= 1 && N 
<= 32); + let n: i32 = if N == 32 { + return vdupq_n_u32(0); + } else { + N + }; + unsafe { simd_shr(a, vdupq_n_u32(n as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshr_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u64", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshr_n_u64(a: uint64x1_t) -> uint64x1_t { + static_assert!(N >= 1 && N <= 64); + let n: i32 = if N == 64 { + return vdup_n_u64(0); + } else { + N + }; + unsafe { simd_shr(a, vdup_n_u64(n as _)) } +} +#[doc = "Shift right"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrq_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u64", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ushr, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshrq_n_u64(a: uint64x2_t) -> uint64x2_t { + static_assert!(N >= 1 && N <= 64); + let n: i32 = if N == 64 { + return vdupq_n_u64(0); + } else { + N + }; + unsafe { simd_shr(a, vdupq_n_u64(n as _)) } +} +#[doc = "Shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i16", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shrn, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshrn_n_s16(a: int16x8_t) -> int8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe { simd_cast(simd_shr(a, vdupq_n_s16(N as _))) } +} +#[doc = "Shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i32", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shrn, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshrn_n_s32(a: int32x4_t) -> int16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe { 
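// ---- Editor's note: illustrative sketch, not part of the generated patch ----
// The unsigned vshr_n_* intrinsics above are logical shifts right; the early
// `return vdup*_n_*(0)` branches above make a shift by the full lane width yield
// zero rather than being undefined. `ushr_demo` is a hypothetical name; AArch64
// with baseline NEON is assumed.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
fn ushr_demo() {
    use core::arch::aarch64::*;
    let a: uint8x8_t = vdup_n_u8(0xFF);
    assert_eq!(vget_lane_u8::<0>(vshr_n_u8::<4>(a)), 0x0F); // logical shift, zero-filled
    assert_eq!(vget_lane_u8::<0>(vshr_n_u8::<8>(a)), 0); // full-width shift is defined as 0
}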
simd_cast(simd_shr(a, vdupq_n_s32(N as _))) } +} +#[doc = "Shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i64", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shrn, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshrn_n_s64(a: int64x2_t) -> int32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_cast(simd_shr(a, vdupq_n_s64(N as _))) } +} +#[doc = "Shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i16", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shrn, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshrn_n_u16(a: uint16x8_t) -> uint8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe { simd_cast(simd_shr(a, vdupq_n_u16(N as _))) } +} +#[doc = "Shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i32", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shrn, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshrn_n_u32(a: uint32x4_t) -> uint16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_cast(simd_shr(a, vdupq_n_u32(N as _))) } +} +#[doc = "Shift right narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i64", N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(shrn, N = 2) +)] +#[rustc_legacy_const_generics(1)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vshrn_n_u64(a: uint64x2_t) -> uint32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_cast(simd_shr(a, vdupq_n_u64(N as _))) } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's 
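// ---- Editor's note: illustrative sketch, not part of the generated patch ----
// The vshrn_n_* intrinsics above shift right by an immediate and then narrow each
// lane to half its width (truncating the upper half), a common way to drop
// fixed-point fraction bits. `shrn_demo` is a hypothetical name; AArch64 is assumed.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
fn shrn_demo() {
    use core::arch::aarch64::*;
    let a: uint16x8_t = vdupq_n_u16(0x0480);
    let narrowed: uint8x8_t = vshrn_n_u16::<4>(a); // (0x0480 >> 4) = 0x48, kept as a u8
    assert_eq!(vget_lane_u8::<0>(narrowed), 0x48);
}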
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_s8)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsli_n_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t { + static_assert_uimm_bits!(N, 3); + vshiftins_v8i8(a, b, int8x8_t::splat(N as i8)) +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_s8)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsliq_n_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t { + static_assert_uimm_bits!(N, 3); + vshiftins_v16i8(a, b, int8x16_t::splat(N as i8)) +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_s16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsli_n_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert_uimm_bits!(N, 4); + vshiftins_v4i16(a, b, int16x4_t::splat(N as i16)) +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_s16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsliq_n_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t { + static_assert_uimm_bits!(N, 4); + vshiftins_v8i16(a, b, int16x8_t::splat(N as i16)) +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_s32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsli_n_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert!(N >= 0 && N <= 31); + vshiftins_v2i32(a, b, int32x2_t::splat(N)) +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_s32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsliq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t { + static_assert!(N >= 0 && N <= 31); + vshiftins_v4i32(a, b, int32x4_t::splat(N)) +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_s64)"] +#[inline]
+#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsli_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + static_assert!(N >= 0 && N <= 63); + vshiftins_v1i64(a, b, int64x1_t::splat(N as i64)) +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_s64)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsliq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + static_assert!(N >= 0 && N <= 63); + vshiftins_v2i64(a, b, int64x2_t::splat(N as i64)) +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_u8)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsli_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + transmute(vshiftins_v8i8( + transmute(a), + transmute(b), + int8x8_t::splat(N as i8), + )) + } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_u8)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsliq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + static_assert_uimm_bits!(N, 3); + unsafe { + transmute(vshiftins_v16i8( + transmute(a), + transmute(b), + int8x16_t::splat(N as i8), + )) + } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_u16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsli_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + static_assert_uimm_bits!(N, 4); + unsafe { + transmute(vshiftins_v4i16( + transmute(a), + transmute(b), + int16x4_t::splat(N as i16), + )) + } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_u16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsliq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + static_assert_uimm_bits!(N, 4); + unsafe { + transmute(vshiftins_v8i16( + transmute(a), + transmute(b), + int16x8_t::splat(N as i16), + )) + } +} +#[doc = "Shift Left and Insert 
(immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_u32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsli_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + static_assert!(N >= 0 && N <= 31); + unsafe { + transmute(vshiftins_v2i32( + transmute(a), + transmute(b), + int32x2_t::splat(N as i32), + )) + } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_u32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsliq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + static_assert!(N >= 0 && N <= 31); + unsafe { + transmute(vshiftins_v4i32( + transmute(a), + transmute(b), + int32x4_t::splat(N as i32), + )) + } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_u64)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsli_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + static_assert!(N >= 0 && N <= 63); + unsafe { + transmute(vshiftins_v1i64( + transmute(a), + transmute(b), + int64x1_t::splat(N as i64), + )) + } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_u64)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsliq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + static_assert!(N >= 0 && N <= 63); + unsafe { + transmute(vshiftins_v2i64( + transmute(a), + transmute(b), + int64x2_t::splat(N as i64), + )) + } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_p8)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsli_n_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + static_assert_uimm_bits!(N, 3); + unsafe { + transmute(vshiftins_v8i8( + transmute(a), + transmute(b), + int8x8_t::splat(N as i8), + )) + } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_p8)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.8", N = 1))] 
+#[rustc_legacy_const_generics(2)] +pub fn vsliq_n_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + static_assert_uimm_bits!(N, 3); + unsafe { + transmute(vshiftins_v16i8( + transmute(a), + transmute(b), + int8x16_t::splat(N as i8), + )) + } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_p16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsli_n_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + static_assert_uimm_bits!(N, 4); + unsafe { + transmute(vshiftins_v4i16( + transmute(a), + transmute(b), + int16x4_t::splat(N as i16), + )) + } +} +#[doc = "Shift Left and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsliq_n_p16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsliq_n_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + static_assert_uimm_bits!(N, 4); + unsafe { + transmute(vshiftins_v8i16( + transmute(a), + transmute(b), + int16x8_t::splat(N as i16), + )) + } +} +#[doc = "Signed shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsra_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ssra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsra_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe { simd_add(a, vshr_n_s8::(b)) } +} +#[doc = "Signed shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsraq_n_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ssra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsraq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { simd_add(a, vshrq_n_s8::(b)) } +} +#[doc = "Signed shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsra_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] 
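// ---- Editor's note: illustrative sketch, not part of the generated patch ----
// The vsli_n_* intrinsics above implement SLI: `b` is shifted left by N and
// inserted into `a`, with the low N bits of `a` preserved, which is handy for
// packing bit fields. `sli_demo` is a hypothetical name; the AArch64 spelling of
// the intrinsic is used.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
fn sli_demo() {
    use core::arch::aarch64::*;
    let a: uint8x8_t = vdup_n_u8(0b0000_0011);
    let b: uint8x8_t = vdup_n_u8(0b0000_0101);
    let r = vsli_n_u8::<4>(a, b); // (0b0101 << 4) | low 4 bits of a = 0b0101_0011
    assert_eq!(vget_lane_u8::<0>(r), 0b0101_0011);
}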
+#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ssra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsra_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_add(a, vshr_n_s16::(b)) } +} +#[doc = "Signed shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsraq_n_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ssra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsraq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_add(a, vshrq_n_s16::(b)) } +} +#[doc = "Signed shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsra_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ssra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsra_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_add(a, vshr_n_s32::(b)) } +} +#[doc = "Signed shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsraq_n_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ssra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsraq_n_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_add(a, vshrq_n_s32::(b)) } +} +#[doc = "Signed shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsra_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ssra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( 
+ not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsra_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + static_assert!(N >= 1 && N <= 64); + unsafe { simd_add(a, vshr_n_s64::(b)) } +} +#[doc = "Signed shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsraq_n_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ssra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsraq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + static_assert!(N >= 1 && N <= 64); + unsafe { simd_add(a, vshrq_n_s64::(b)) } +} +#[doc = "Unsigned shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsra_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsra_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + static_assert!(N >= 1 && N <= 8); + unsafe { simd_add(a, vshr_n_u8::(b)) } +} +#[doc = "Unsigned shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsraq_n_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsraq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + static_assert!(N >= 1 && N <= 8); + unsafe { simd_add(a, vshrq_n_u8::(b)) } +} +#[doc = "Unsigned shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsra_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsra_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_add(a, vshr_n_u16::(b)) } +} +#[doc = "Unsigned shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsraq_n_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsraq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + static_assert!(N >= 1 && N <= 16); + unsafe { simd_add(a, vshrq_n_u16::(b)) } +} +#[doc = "Unsigned shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsra_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsra_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_add(a, vshr_n_u32::(b)) } +} +#[doc = "Unsigned shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsraq_n_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsraq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + static_assert!(N >= 1 && N <= 32); + unsafe { simd_add(a, vshrq_n_u32::(b)) } +} +#[doc = "Unsigned shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsra_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsra_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + static_assert!(N >= 
1 && N <= 64); + unsafe { simd_add(a, vshr_n_u64::(b)) } +} +#[doc = "Unsigned shift right and accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsraq_n_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usra, N = 2) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsraq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + static_assert!(N >= 1 && N <= 64); + unsafe { simd_add(a, vshrq_n_u64::(b)) } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_s8)"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsri_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + static_assert!(1 <= N && N <= 8); + vshiftins_v8i8(a, b, int8x8_t::splat(-N as i8)) +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_s8)"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsriq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + static_assert!(1 <= N && N <= 8); + vshiftins_v16i8(a, b, int8x16_t::splat(-N as i8)) +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_s16)"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsri_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert!(1 <= N && N <= 16); + vshiftins_v4i16(a, b, int16x4_t::splat(-N as i16)) +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_s16)"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsriq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + static_assert!(1 <= N && N <= 16); + vshiftins_v8i16(a, b, int16x8_t::splat(-N as i16)) +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_s32)"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, 
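// ---- Editor's note: illustrative sketch, not part of the generated patch ----
// The vsra_n_* intrinsics above fuse a shift right by an immediate with an
// accumulate into the first operand (SSRA/USRA), a common idiom in fixed-point
// filters. `sra_demo` is a hypothetical name; AArch64 with baseline NEON is assumed.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
fn sra_demo() {
    use core::arch::aarch64::*;
    let acc: uint32x2_t = vdup_n_u32(100);
    let b: uint32x2_t = vdup_n_u32(64);
    let r = vsra_n_u32::<4>(acc, b); // 100 + (64 >> 4) = 104 in every lane
    assert_eq!(vget_lane_u32::<0>(r), 104);
}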
target_arch = "arm"), assert_instr("vsri.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsri_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert!(1 <= N && N <= 32); + vshiftins_v2i32(a, b, int32x2_t::splat(-N as i32)) +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_s32)"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsriq_n_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + static_assert!(1 <= N && N <= 32); + vshiftins_v4i32(a, b, int32x4_t::splat(-N as i32)) +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_s64)"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsri_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + static_assert!(1 <= N && N <= 64); + vshiftins_v1i64(a, b, int64x1_t::splat(-N as i64)) +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_s64)"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsriq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + static_assert!(1 <= N && N <= 64); + vshiftins_v2i64(a, b, int64x2_t::splat(-N as i64)) +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_u8)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsri_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + static_assert!(1 <= N && N <= 8); + unsafe { + transmute(vshiftins_v8i8( + transmute(a), + transmute(b), + int8x8_t::splat(-N as i8), + )) + } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_u8)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsriq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + static_assert!(1 <= N && N <= 8); + unsafe { + transmute(vshiftins_v16i8( + transmute(a), + transmute(b), + int8x16_t::splat(-N as i8), + )) + } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_u16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue 
= "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsri_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + static_assert!(1 <= N && N <= 16); + unsafe { + transmute(vshiftins_v4i16( + transmute(a), + transmute(b), + int16x4_t::splat(-N as i16), + )) + } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_u16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsriq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + static_assert!(1 <= N && N <= 16); + unsafe { + transmute(vshiftins_v8i16( + transmute(a), + transmute(b), + int16x8_t::splat(-N as i16), + )) + } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_u32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsri_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + static_assert!(1 <= N && N <= 32); + unsafe { + transmute(vshiftins_v2i32( + transmute(a), + transmute(b), + int32x2_t::splat(-N), + )) + } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_u32)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsriq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + static_assert!(1 <= N && N <= 32); + unsafe { + transmute(vshiftins_v4i32( + transmute(a), + transmute(b), + int32x4_t::splat(-N), + )) + } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_u64)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsri_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + static_assert!(1 <= N && N <= 64); + unsafe { + transmute(vshiftins_v1i64( + transmute(a), + transmute(b), + int64x1_t::splat(-N as i64), + )) + } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_u64)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsriq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + static_assert!(1 <= N && N <= 64); + unsafe { + transmute(vshiftins_v2i64( + transmute(a), + transmute(b), + int64x2_t::splat(-N as i64), + )) + } +} +#[doc = "Shift 
Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_p8)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsri_n_p8<const N: i32>(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + static_assert!(1 <= N && N <= 8); + unsafe { + transmute(vshiftins_v8i8( + transmute(a), + transmute(b), + int8x8_t::splat(-N as i8), + )) + } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_p8)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsriq_n_p8<const N: i32>(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + static_assert!(1 <= N && N <= 8); + unsafe { + transmute(vshiftins_v16i8( + transmute(a), + transmute(b), + int8x16_t::splat(-N as i8), + )) + } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsri_n_p16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsri_n_p16<const N: i32>(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + static_assert!(1 <= N && N <= 16); + unsafe { + transmute(vshiftins_v4i16( + transmute(a), + transmute(b), + int16x4_t::splat(-N as i16), + )) + } +} +#[doc = "Shift Right and Insert (immediate)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsriq_n_p16)"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub fn vsriq_n_p16<const N: i32>(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + static_assert!(1 <= N && N <= 16); + unsafe { + transmute(vshiftins_v8i16( + transmute(a), + transmute(b), + int16x8_t::splat(-N as i16), + )) + } +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.16"))] +pub unsafe fn vst1_f16(ptr: *mut f16, a: float16x4_t) { + vst1_v4f16( + ptr as *const i8, + transmute(a), + crate::mem::align_of::<f16>() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")]
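// ---- Editor's note: illustrative sketch, not part of the generated patch ----
// The vsri_n_* intrinsics above implement SRI: `b` is shifted right by N and
// inserted into `a`, with the high N bits of `a` preserved (the mirror image of
// vsli_n_*). `sri_demo` is a hypothetical name; the AArch64 spelling of the
// intrinsic is used.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
fn sri_demo() {
    use core::arch::aarch64::*;
    let a: uint8x8_t = vdup_n_u8(0b1100_0000);
    let b: uint8x8_t = vdup_n_u8(0b1010_0000);
    let r = vsri_n_u8::<4>(a, b); // high 4 bits of a | (0b1010_0000 >> 4) = 0b1100_1010
    assert_eq!(vget_lane_u8::<0>(r), 0b1100_1010);
}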
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.16"))] +pub unsafe fn vst1q_f16(ptr: *mut f16, a: float16x8_t) { + vst1q_v8f16( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(vst1))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst1_f16_x2(a: *mut f16, b: float16x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0.v4f16")] + fn _vst1_f16_x2(ptr: *mut f16, a: float16x4_t, b: float16x4_t); + } + _vst1_f16_x2(a, b.0, b.1) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(vst1))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst1q_f16_x2(a: *mut f16, b: float16x8x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0.v8f16")] + fn _vst1q_f16_x2(ptr: *mut f16, a: float16x8_t, b: float16x8_t); + } + _vst1q_f16_x2(a, b.0, b.1) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(st1))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst1_f16_x2(a: *mut f16, b: float16x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x2.v4f16.p0" + )] + fn _vst1_f16_x2(a: float16x4_t, b: float16x4_t, ptr: *mut f16); + } + _vst1_f16_x2(b.0, b.1, a) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(st1))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst1q_f16_x2(a: *mut f16, b: float16x8x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x2.v8f16.p0" + )] + fn _vst1q_f16_x2(a: float16x8_t, b: float16x8_t, ptr: *mut f16); + } + _vst1q_f16_x2(b.0, b.1, a) +} +#[doc = "Store multiple single-element structures to 
one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(vst1))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst1_f16_x3(a: *mut f16, b: float16x4x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0.v4f16")] + fn _vst1_f16_x3(ptr: *mut f16, a: float16x4_t, b: float16x4_t, c: float16x4_t); + } + _vst1_f16_x3(a, b.0, b.1, b.2) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(vst1))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst1q_f16_x3(a: *mut f16, b: float16x8x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0.v8f16")] + fn _vst1q_f16_x3(ptr: *mut f16, a: float16x8_t, b: float16x8_t, c: float16x8_t); + } + _vst1q_f16_x3(a, b.0, b.1, b.2) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(st1))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst1_f16_x3(a: *mut f16, b: float16x4x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x3.v4f16.p0" + )] + fn _vst1_f16_x3(a: float16x4_t, b: float16x4_t, c: float16x4_t, ptr: *mut f16); + } + _vst1_f16_x3(b.0, b.1, b.2, a) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(st1))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst1q_f16_x3(a: *mut f16, b: float16x8x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x3.v8f16.p0" + )] + fn _vst1q_f16_x3(a: float16x8_t, b: float16x8_t, c: float16x8_t, ptr: *mut f16); + } + _vst1q_f16_x3(b.0, b.1, b.2, a) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] 
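// Observation on the x2/x3 bindings above, with a hypothetical call (not from the
// sources): the ARM "llvm.arm.neon.vst1x2/vst1x3" intrinsics take the destination
// pointer first, while the AArch64 "llvm.aarch64.neon.st1x2/st1x3" ones take it last,
// so the wrappers pass the tuple fields and the pointer in the matching order.
// Sketch, assuming `buf` is a writable slice of 8 `f16` values (unstable `f16` type)
// and `pair: float16x4x2_t` was loaded earlier, e.g. with vld1_f16_x2:
//
//     vst1_f16_x2(buf.as_mut_ptr(), pair); // stores pair.0 then pair.1 contiguously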
+#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1_f16_x4(a: *mut f16, b: float16x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0.v4f16")] + fn _vst1_f16_x4( + ptr: *mut f16, + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, + d: float16x4_t, + ); + } + _vst1_f16_x4(a, b.0, b.1, b.2, b.3) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1q_f16_x4(a: *mut f16, b: float16x8x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0.v8f16")] + fn _vst1q_f16_x4( + ptr: *mut f16, + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, + d: float16x8_t, + ); + } + _vst1q_f16_x4(a, b.0, b.1, b.2, b.3) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(st1))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst1_f16_x4(a: *mut f16, b: float16x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x4.v4f16.p0" + )] + fn _vst1_f16_x4( + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, + d: float16x4_t, + ptr: *mut f16, + ); + } + _vst1_f16_x4(b.0, b.1, b.2, b.3, a) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(st1))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst1q_f16_x4(a: *mut f16, b: float16x8x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x4.v8f16.p0" + )] + fn _vst1q_f16_x4( + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, + d: float16x8_t, + ptr: *mut f16, + ); + } + _vst1q_f16_x4(b.0, b.1, b.2, b.3, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.32"))] +pub unsafe fn vst1_f32(ptr: *mut f32, a: float32x2_t) { + vst1_v2f32( + ptr 
as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.32"))] +pub unsafe fn vst1q_f32(ptr: *mut f32, a: float32x4_t) { + vst1q_v4f32( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.8"))] +pub unsafe fn vst1_s8(ptr: *mut i8, a: int8x8_t) { + vst1_v8i8(ptr as *const i8, a, crate::mem::align_of::() as i32) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.8"))] +pub unsafe fn vst1q_s8(ptr: *mut i8, a: int8x16_t) { + vst1q_v16i8(ptr as *const i8, a, crate::mem::align_of::() as i32) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.16"))] +pub unsafe fn vst1_s16(ptr: *mut i16, a: int16x4_t) { + vst1_v4i16(ptr as *const i8, a, crate::mem::align_of::() as i32) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.16"))] +pub unsafe fn vst1q_s16(ptr: *mut i16, a: int16x8_t) { + vst1q_v8i16(ptr as *const i8, a, crate::mem::align_of::() as i32) +} +#[doc = "Store multiple single-element structures from one, two, three, or four 
registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.32"))] +pub unsafe fn vst1_s32(ptr: *mut i32, a: int32x2_t) { + vst1_v2i32(ptr as *const i8, a, crate::mem::align_of::() as i32) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.32"))] +pub unsafe fn vst1q_s32(ptr: *mut i32, a: int32x4_t) { + vst1q_v4i32(ptr as *const i8, a, crate::mem::align_of::() as i32) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.64"))] +pub unsafe fn vst1_s64(ptr: *mut i64, a: int64x1_t) { + vst1_v1i64(ptr as *const i8, a, crate::mem::align_of::() as i32) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.64"))] +pub unsafe fn vst1q_s64(ptr: *mut i64, a: int64x2_t) { + vst1q_v2i64(ptr as *const i8, a, crate::mem::align_of::() as i32) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.8"))] +pub unsafe fn vst1_u8(ptr: *mut u8, a: uint8x8_t) { + vst1_v8i8( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u8)"] +#[doc = "## Safety"] +#[doc = " * 
Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.8"))] +pub unsafe fn vst1q_u8(ptr: *mut u8, a: uint8x16_t) { + vst1q_v16i8( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.16"))] +pub unsafe fn vst1_u16(ptr: *mut u16, a: uint16x4_t) { + vst1_v4i16( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.16"))] +pub unsafe fn vst1q_u16(ptr: *mut u16, a: uint16x8_t) { + vst1q_v8i16( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.32"))] +pub unsafe fn vst1_u32(ptr: *mut u32, a: uint32x2_t) { + vst1_v2i32( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.32"))] +pub unsafe fn vst1q_u32(ptr: *mut u32, a: uint32x4_t) { + vst1q_v4i32( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = 
"neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.64"))] +pub unsafe fn vst1_u64(ptr: *mut u64, a: uint64x1_t) { + vst1_v1i64( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.64"))] +pub unsafe fn vst1q_u64(ptr: *mut u64, a: uint64x2_t) { + vst1q_v2i64( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.8"))] +pub unsafe fn vst1_p8(ptr: *mut p8, a: poly8x8_t) { + vst1_v8i8( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.8"))] +pub unsafe fn vst1q_p8(ptr: *mut p8, a: poly8x16_t) { + vst1q_v16i8( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.16"))] +pub unsafe fn vst1_p16(ptr: *mut p16, a: poly16x4_t) { + vst1_v4i16( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.16"))] +pub unsafe fn vst1q_p16(ptr: *mut p16, a: poly16x8_t) { + vst1q_v8i16( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.64"))] +pub unsafe fn vst1_p64(ptr: *mut p64, a: poly64x1_t) { + vst1_v1i64( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.64"))] +pub unsafe fn vst1q_p64(ptr: *mut p64, a: poly64x2_t) { + vst1q_v2i64( + ptr as *const i8, + transmute(a), + crate::mem::align_of::() as i32, + ) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst1))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst1_f32_x2(a: *mut f32, b: float32x2x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.v2f32.p0")] + fn _vst1_f32_x2(ptr: *mut f32, a: float32x2_t, b: float32x2_t); + } + _vst1_f32_x2(a, b.0, b.1) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst1))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst1q_f32_x2(a: *mut f32, b: float32x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.v4f32.p0")] + fn _vst1q_f32_x2(ptr: *mut f32, a: float32x4_t, b: float32x4_t); + } + _vst1q_f32_x2(a, b.0, b.1) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, 
assert_instr(st1))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_f32_x2(a: *mut f32, b: float32x2x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x2.v2f32.p0" + )] + fn _vst1_f32_x2(a: float32x2_t, b: float32x2_t, ptr: *mut f32); + } + _vst1_f32_x2(b.0, b.1, a) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(st1))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_f32_x2(a: *mut f32, b: float32x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x2.v4f32.p0" + )] + fn _vst1q_f32_x2(a: float32x4_t, b: float32x4_t, ptr: *mut f32); + } + _vst1q_f32_x2(b.0, b.1, a) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(st1))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_f32_x3(a: *mut f32, b: float32x2x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x3.v2f32.p0" + )] + fn _vst1_f32_x3(a: float32x2_t, b: float32x2_t, c: float32x2_t, ptr: *mut f32); + } + _vst1_f32_x3(b.0, b.1, b.2, a) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(st1))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_f32_x3(a: *mut f32, b: float32x4x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x3.v4f32.p0" + )] + fn _vst1q_f32_x3(a: float32x4_t, b: float32x4_t, c: float32x4_t, ptr: *mut f32); + } + _vst1q_f32_x3(b.0, b.1, b.2, a) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1_f32_x4(a: *mut f32, b: float32x2x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0.v2f32.p0")] + fn _vst1_f32_x4( + ptr: *mut f32, + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, + d: float32x2_t, + ); + } + _vst1_f32_x4(a, b.0, b.1, b.2, b.3) +} +#[doc = "Store multiple single-element 
structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1q_f32_x4(a: *mut f32, b: float32x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0.v4f32.p0")] + fn _vst1q_f32_x4( + ptr: *mut f32, + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, + d: float32x4_t, + ); + } + _vst1q_f32_x4(a, b.0, b.1, b.2, b.3) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(st1))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1_f32_x4(a: *mut f32, b: float32x2x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x4.v2f32.p0" + )] + fn _vst1_f32_x4( + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, + d: float32x2_t, + ptr: *mut f32, + ); + } + _vst1_f32_x4(b.0, b.1, b.2, b.3, a) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[cfg_attr(test, assert_instr(st1))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst1q_f32_x4(a: *mut f32, b: float32x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x4.v4f32.p0" + )] + fn _vst1q_f32_x4( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, + d: float32x4_t, + ptr: *mut f32, + ); + } + _vst1q_f32_x4(b.0, b.1, b.2, b.3, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst1_lane_f16(a: *mut f16, b: float16x4_t) { + static_assert_uimm_bits!(LANE, 2); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
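// Hypothetical usage of the plain vst1/vst1q stores defined earlier (illustrative
// values, not from the sources). On the ARM path the wrappers forward the element
// alignment (align_of of the element type, cast to i32) to the underlying vst1
// builtin, so an ordinary element-aligned buffer is sufficient:
//
//     let mut out = [0.0f32; 4];
//     vst1q_f32(out.as_mut_ptr(), vdupq_n_f32(1.5)); // out == [1.5, 1.5, 1.5, 1.5]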
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst1q_lane_f16(a: *mut f16, b: float16x8_t) { + static_assert_uimm_bits!(LANE, 3); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_lane_f32(a: *mut f32, b: float32x2_t) { + static_assert_uimm_bits!(LANE, 1); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_lane_f32(a: *mut f32, b: float32x4_t) { + static_assert_uimm_bits!(LANE, 2); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_lane_s8(a: *mut i8, b: int8x8_t) { + static_assert_uimm_bits!(LANE, 3); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_lane_s8)"] +#[doc = "## 
Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_lane_s8(a: *mut i8, b: int8x16_t) { + static_assert_uimm_bits!(LANE, 4); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_lane_s16(a: *mut i16, b: int16x4_t) { + static_assert_uimm_bits!(LANE, 2); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_lane_s16(a: *mut i16, b: int16x8_t) { + static_assert_uimm_bits!(LANE, 3); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_lane_s32(a: *mut i32, b: int32x2_t) { + static_assert_uimm_bits!(LANE, 1); + *a 
= simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_lane_s32(a: *mut i32, b: int32x4_t) { + static_assert_uimm_bits!(LANE, 2); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_lane_s64(a: *mut i64, b: int64x2_t) { + static_assert_uimm_bits!(LANE, 1); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_lane_u8(a: *mut u8, b: uint8x8_t) { + static_assert_uimm_bits!(LANE, 3); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + 
stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_lane_u8(a: *mut u8, b: uint8x16_t) { + static_assert_uimm_bits!(LANE, 4); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_lane_u16(a: *mut u16, b: uint16x4_t) { + static_assert_uimm_bits!(LANE, 2); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_lane_u16(a: *mut u16, b: uint16x8_t) { + static_assert_uimm_bits!(LANE, 3); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_lane_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_lane_u32(a: *mut u32, b: uint32x2_t) { + static_assert_uimm_bits!(LANE, 1); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_lane_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
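// Hypothetical example for the vst1*_lane_* stores above (illustrative values): the
// const LANE selects a single element, and the static assert bounds LANE by the
// vector's lane count:
//
//     let v = vld1q_u16([1u16, 2, 3, 4, 5, 6, 7, 8].as_ptr());
//     let mut out = 0u16;
//     vst1q_lane_u16::<3>(&mut out, v); // out == 4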
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_lane_u32(a: *mut u32, b: uint32x4_t) { + static_assert_uimm_bits!(LANE, 2); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_lane_u64(a: *mut u64, b: uint64x2_t) { + static_assert_uimm_bits!(LANE, 1); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_lane_p8(a: *mut p8, b: poly8x8_t) { + static_assert_uimm_bits!(LANE, 3); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_lane_p8(a: *mut p8, b: poly8x16_t) { + static_assert_uimm_bits!(LANE, 4); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_lane_p16(a: *mut p16, b: poly16x4_t) { + static_assert_uimm_bits!(LANE, 2); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_lane_p16(a: *mut p16, b: poly16x8_t) { + static_assert_uimm_bits!(LANE, 3); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_lane_p64(a: *mut p64, b: poly64x1_t) { + static_assert!(LANE == 0); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_lane_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = 
"111800") +)] +pub unsafe fn vst1_lane_s64(a: *mut i64, b: int64x1_t) { + static_assert!(LANE == 0); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_lane_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_lane_u64(a: *mut u64, b: uint64x1_t) { + static_assert!(LANE == 0); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_p64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_p64_x2(a: *mut p64, b: poly64x1x2_t) { + vst1_s64_x2(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_p64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_p64_x3(a: *mut p64, b: poly64x1x3_t) { + vst1_s64_x3(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_p64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_p64_x4(a: *mut p64, b: poly64x1x4_t) { + vst1_s64_x4(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_p64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_p64_x2(a: *mut p64, b: poly64x2x2_t) { + vst1q_s64_x2(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_p64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_p64_x3(a: *mut p64, b: poly64x2x3_t) { + vst1q_s64_x3(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_p64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_p64_x4(a: *mut p64, b: poly64x2x4_t) { + vst1q_s64_x4(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1_s8_x2(a: *mut i8, b: int8x8x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x2.v8i8.p0" + )] + fn _vst1_s8_x2(a: int8x8_t, b: int8x8_t, ptr: *mut i8); + } + _vst1_s8_x2(b.0, b.1, a) +} +#[doc = "Store multiple single-element 
structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1q_s8_x2(a: *mut i8, b: int8x16x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x2.v16i8.p0" + )] + fn _vst1q_s8_x2(a: int8x16_t, b: int8x16_t, ptr: *mut i8); + } + _vst1q_s8_x2(b.0, b.1, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1_s16_x2(a: *mut i16, b: int16x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x2.v4i16.p0" + )] + fn _vst1_s16_x2(a: int16x4_t, b: int16x4_t, ptr: *mut i16); + } + _vst1_s16_x2(b.0, b.1, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1q_s16_x2(a: *mut i16, b: int16x8x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x2.v8i16.p0" + )] + fn _vst1q_s16_x2(a: int16x8_t, b: int16x8_t, ptr: *mut i16); + } + _vst1q_s16_x2(b.0, b.1, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1_s32_x2(a: *mut i32, b: int32x2x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x2.v2i32.p0" + )] + fn _vst1_s32_x2(a: int32x2_t, b: int32x2_t, ptr: *mut i32); + } + _vst1_s32_x2(b.0, b.1, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1q_s32_x2(a: *mut i32, b: int32x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch 
= "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x2.v4i32.p0" + )] + fn _vst1q_s32_x2(a: int32x4_t, b: int32x4_t, ptr: *mut i32); + } + _vst1q_s32_x2(b.0, b.1, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1_s64_x2(a: *mut i64, b: int64x1x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x2.v1i64.p0" + )] + fn _vst1_s64_x2(a: int64x1_t, b: int64x1_t, ptr: *mut i64); + } + _vst1_s64_x2(b.0, b.1, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1q_s64_x2(a: *mut i64, b: int64x2x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x2.v2i64.p0" + )] + fn _vst1q_s64_x2(a: int64x2_t, b: int64x2_t, ptr: *mut i64); + } + _vst1q_s64_x2(b.0, b.1, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1_s8_x2(a: *mut i8, b: int8x8x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.v8i8.p0")] + fn _vst1_s8_x2(ptr: *mut i8, a: int8x8_t, b: int8x8_t); + } + _vst1_s8_x2(a, b.0, b.1) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1q_s8_x2(a: *mut i8, b: int8x16x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.v16i8.p0")] + fn _vst1q_s8_x2(ptr: *mut i8, a: int8x16_t, b: int8x16_t); + } + _vst1q_s8_x2(a, b.0, b.1) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = 
"111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1_s16_x2(a: *mut i16, b: int16x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.v4i16.p0")] + fn _vst1_s16_x2(ptr: *mut i16, a: int16x4_t, b: int16x4_t); + } + _vst1_s16_x2(a, b.0, b.1) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1q_s16_x2(a: *mut i16, b: int16x8x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.v8i16.p0")] + fn _vst1q_s16_x2(ptr: *mut i16, a: int16x8_t, b: int16x8_t); + } + _vst1q_s16_x2(a, b.0, b.1) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1_s32_x2(a: *mut i32, b: int32x2x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.v2i32.p0")] + fn _vst1_s32_x2(ptr: *mut i32, a: int32x2_t, b: int32x2_t); + } + _vst1_s32_x2(a, b.0, b.1) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1q_s32_x2(a: *mut i32, b: int32x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.v4i32.p0")] + fn _vst1q_s32_x2(ptr: *mut i32, a: int32x4_t, b: int32x4_t); + } + _vst1q_s32_x2(a, b.0, b.1) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1_s64_x2(a: *mut i64, b: int64x1x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.v1i64.p0")] + fn _vst1_s64_x2(ptr: *mut i64, a: int64x1_t, b: int64x1_t); + } + _vst1_s64_x2(a, b.0, b.1) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] 
+#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1q_s64_x2(a: *mut i64, b: int64x2x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.v2i64.p0")] + fn _vst1q_s64_x2(ptr: *mut i64, a: int64x2_t, b: int64x2_t); + } + _vst1q_s64_x2(a, b.0, b.1) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1_s8_x3(a: *mut i8, b: int8x8x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x3.v8i8.p0" + )] + fn _vst1_s8_x3(a: int8x8_t, b: int8x8_t, c: int8x8_t, ptr: *mut i8); + } + _vst1_s8_x3(b.0, b.1, b.2, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1q_s8_x3(a: *mut i8, b: int8x16x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x3.v16i8.p0" + )] + fn _vst1q_s8_x3(a: int8x16_t, b: int8x16_t, c: int8x16_t, ptr: *mut i8); + } + _vst1q_s8_x3(b.0, b.1, b.2, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1_s16_x3(a: *mut i16, b: int16x4x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x3.v4i16.p0" + )] + fn _vst1_s16_x3(a: int16x4_t, b: int16x4_t, c: int16x4_t, ptr: *mut i16); + } + _vst1_s16_x3(b.0, b.1, b.2, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1q_s16_x3(a: *mut i16, b: int16x8x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x3.v8i16.p0" + )] + fn _vst1q_s16_x3(a: int16x8_t, b: int16x8_t, c: int16x8_t, ptr: *mut i16); + } + _vst1q_s16_x3(b.0, b.1, b.2, a) +} +#[doc = "Store multiple single-element structures 
from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1_s32_x3(a: *mut i32, b: int32x2x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x3.v2i32.p0" + )] + fn _vst1_s32_x3(a: int32x2_t, b: int32x2_t, c: int32x2_t, ptr: *mut i32); + } + _vst1_s32_x3(b.0, b.1, b.2, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1q_s32_x3(a: *mut i32, b: int32x4x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x3.v4i32.p0" + )] + fn _vst1q_s32_x3(a: int32x4_t, b: int32x4_t, c: int32x4_t, ptr: *mut i32); + } + _vst1q_s32_x3(b.0, b.1, b.2, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1_s64_x3(a: *mut i64, b: int64x1x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x3.v1i64.p0" + )] + fn _vst1_s64_x3(a: int64x1_t, b: int64x1_t, c: int64x1_t, ptr: *mut i64); + } + _vst1_s64_x3(b.0, b.1, b.2, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1q_s64_x3(a: *mut i64, b: int64x2x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x3.v2i64.p0" + )] + fn _vst1q_s64_x3(a: int64x2_t, b: int64x2_t, c: int64x2_t, ptr: *mut i64); + } + _vst1q_s64_x3(b.0, b.1, b.2, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1_s8_x3(a: *mut i8, b: 
int8x8x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0.v8i8.p0")] + fn _vst1_s8_x3(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t); + } + _vst1_s8_x3(a, b.0, b.1, b.2) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1q_s8_x3(a: *mut i8, b: int8x16x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0.v16i8.p0")] + fn _vst1q_s8_x3(ptr: *mut i8, a: int8x16_t, b: int8x16_t, c: int8x16_t); + } + _vst1q_s8_x3(a, b.0, b.1, b.2) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1_s16_x3(a: *mut i16, b: int16x4x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0.v4i16.p0")] + fn _vst1_s16_x3(ptr: *mut i16, a: int16x4_t, b: int16x4_t, c: int16x4_t); + } + _vst1_s16_x3(a, b.0, b.1, b.2) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1q_s16_x3(a: *mut i16, b: int16x8x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0.v8i16.p0")] + fn _vst1q_s16_x3(ptr: *mut i16, a: int16x8_t, b: int16x8_t, c: int16x8_t); + } + _vst1q_s16_x3(a, b.0, b.1, b.2) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1_s32_x3(a: *mut i32, b: int32x2x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0.v2i32.p0")] + fn _vst1_s32_x3(ptr: *mut i32, a: int32x2_t, b: int32x2_t, c: int32x2_t); + } + _vst1_s32_x3(a, b.0, b.1, b.2) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] 
+#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1q_s32_x3(a: *mut i32, b: int32x4x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0.v4i32.p0")] + fn _vst1q_s32_x3(ptr: *mut i32, a: int32x4_t, b: int32x4_t, c: int32x4_t); + } + _vst1q_s32_x3(a, b.0, b.1, b.2) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1_s64_x3(a: *mut i64, b: int64x1x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0.v1i64.p0")] + fn _vst1_s64_x3(ptr: *mut i64, a: int64x1_t, b: int64x1_t, c: int64x1_t); + } + _vst1_s64_x3(a, b.0, b.1, b.2) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1q_s64_x3(a: *mut i64, b: int64x2x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0.v2i64.p0")] + fn _vst1q_s64_x3(ptr: *mut i64, a: int64x2_t, b: int64x2_t, c: int64x2_t); + } + _vst1q_s64_x3(a, b.0, b.1, b.2) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1_s8_x4(a: *mut i8, b: int8x8x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x4.v8i8.p0" + )] + fn _vst1_s8_x4(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, ptr: *mut i8); + } + _vst1_s8_x4(b.0, b.1, b.2, b.3, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1q_s8_x4(a: *mut i8, b: int8x16x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x4.v16i8.p0" + )] + fn _vst1q_s8_x4(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, ptr: *mut i8); + } + _vst1q_s8_x4(b.0, b.1, b.2, b.3, a) +} +#[doc = "Store multiple single-element structures 
from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1_s16_x4(a: *mut i16, b: int16x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x4.v4i16.p0" + )] + fn _vst1_s16_x4(a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, ptr: *mut i16); + } + _vst1_s16_x4(b.0, b.1, b.2, b.3, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1q_s16_x4(a: *mut i16, b: int16x8x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x4.v8i16.p0" + )] + fn _vst1q_s16_x4(a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, ptr: *mut i16); + } + _vst1q_s16_x4(b.0, b.1, b.2, b.3, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1_s32_x4(a: *mut i32, b: int32x2x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x4.v2i32.p0" + )] + fn _vst1_s32_x4(a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, ptr: *mut i32); + } + _vst1_s32_x4(b.0, b.1, b.2, b.3, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1q_s32_x4(a: *mut i32, b: int32x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x4.v4i32.p0" + )] + fn _vst1q_s32_x4(a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, ptr: *mut i32); + } + _vst1q_s32_x4(b.0, b.1, b.2, b.3, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] 
+#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1_s64_x4(a: *mut i64, b: int64x1x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x4.v1i64.p0" + )] + fn _vst1_s64_x4(a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t, ptr: *mut i64); + } + _vst1_s64_x4(b.0, b.1, b.2, b.3, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1q_s64_x4(a: *mut i64, b: int64x2x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st1x4.v2i64.p0" + )] + fn _vst1q_s64_x4(a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t, ptr: *mut i64); + } + _vst1q_s64_x4(b.0, b.1, b.2, b.3, a) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1_s8_x4(a: *mut i8, b: int8x8x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0.v8i8.p0")] + fn _vst1_s8_x4(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t); + } + _vst1_s8_x4(a, b.0, b.1, b.2, b.3) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1q_s8_x4(a: *mut i8, b: int8x16x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0.v16i8.p0")] + fn _vst1q_s8_x4(ptr: *mut i8, a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t); + } + _vst1q_s8_x4(a, b.0, b.1, b.2, b.3) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1_s16_x4(a: *mut i16, b: int16x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0.v4i16.p0")] + fn _vst1_s16_x4(ptr: *mut i16, a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t); + } + _vst1_s16_x4(a, b.0, b.1, b.2, b.3) +} +#[doc = "Store multiple single-element structures from one, two, three, or four 
registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1q_s16_x4(a: *mut i16, b: int16x8x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0.v8i16.p0")] + fn _vst1q_s16_x4(ptr: *mut i16, a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t); + } + _vst1q_s16_x4(a, b.0, b.1, b.2, b.3) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1_s32_x4(a: *mut i32, b: int32x2x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0.v2i32.p0")] + fn _vst1_s32_x4(ptr: *mut i32, a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t); + } + _vst1_s32_x4(a, b.0, b.1, b.2, b.3) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1q_s32_x4(a: *mut i32, b: int32x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0.v4i32.p0")] + fn _vst1q_s32_x4(ptr: *mut i32, a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t); + } + _vst1q_s32_x4(a, b.0, b.1, b.2, b.3) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1_s64_x4(a: *mut i64, b: int64x1x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0.v1i64.p0")] + fn _vst1_s64_x4(ptr: *mut i64, a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t); + } + _vst1_s64_x4(a, b.0, b.1, b.2, b.3) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst1))] +pub unsafe fn vst1q_s64_x4(a: *mut i64, b: int64x2x4_t) { + unsafe extern "unadjusted" { + 
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0.v2i64.p0")] + fn _vst1q_s64_x4(ptr: *mut i64, a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t); + } + _vst1q_s64_x4(a, b.0, b.1, b.2, b.3) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_u8_x2(a: *mut u8, b: uint8x8x2_t) { + vst1_s8_x2(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_u8_x3(a: *mut u8, b: uint8x8x3_t) { + vst1_s8_x3(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_u8_x4(a: *mut u8, b: uint8x8x4_t) { + vst1_s8_x4(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] 
+pub unsafe fn vst1q_u8_x2(a: *mut u8, b: uint8x16x2_t) { + vst1q_s8_x2(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_u8_x3(a: *mut u8, b: uint8x16x3_t) { + vst1q_s8_x3(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_u8_x4(a: *mut u8, b: uint8x16x4_t) { + vst1q_s8_x4(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_u16_x2(a: *mut u16, b: uint16x4x2_t) { + vst1_s16_x2(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_u16_x3(a: *mut u16, b: uint16x4x3_t) { + vst1_s16_x3(transmute(a), transmute(b)) +} 
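// Editor's sketch (not part of the upstream patch): the `_x2`/`_x3`/`_x4` store intrinsics
// in this file take a tuple of 2, 3, or 4 vectors and write all of their lanes contiguously
// starting at the destination pointer, so the caller must supply a buffer with room for
// every vector in the tuple. A hedged usage example, assuming AArch64 with NEON available:
//
//     use core::arch::aarch64::{uint16x8x2_t, vdupq_n_u16, vst1q_u16_x2};
//
//     // Writes two 8-lane vectors back-to-back: 16 `u16` values in total.
//     unsafe fn store_two(dst: &mut [u16; 16]) {
//         let pair = uint16x8x2_t(vdupq_n_u16(1), vdupq_n_u16(2));
//         vst1q_u16_x2(dst.as_mut_ptr(), pair);
//     }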
+#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_u16_x4(a: *mut u16, b: uint16x4x4_t) { + vst1_s16_x4(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_u16_x2(a: *mut u16, b: uint16x8x2_t) { + vst1q_s16_x2(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_u16_x3(a: *mut u16, b: uint16x8x3_t) { + vst1q_s16_x3(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_u16_x4(a: *mut u16, b: uint16x8x4_t) { + vst1q_s16_x4(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] 
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_u32_x2(a: *mut u32, b: uint32x2x2_t) { + vst1_s32_x2(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_u32_x3(a: *mut u32, b: uint32x2x3_t) { + vst1_s32_x3(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_u32_x4(a: *mut u32, b: uint32x2x4_t) { + vst1_s32_x4(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u32_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_u32_x2(a: *mut u32, b: uint32x4x2_t) { + vst1q_s32_x2(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u32_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_u32_x3(a: *mut u32, b: uint32x4x3_t) { + vst1q_s32_x3(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u32_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_u32_x4(a: *mut u32, b: uint32x4x4_t) { + vst1q_s32_x4(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_u64_x2(a: *mut u64, b: uint64x1x2_t) { + vst1_s64_x2(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_u64_x3(a: *mut u64, b: uint64x1x3_t) { + vst1_s64_x3(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_u64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_u64_x4(a: *mut u64, b: uint64x1x4_t) { + vst1_s64_x4(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u64_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_u64_x2(a: *mut u64, b: uint64x2x2_t) { + vst1q_s64_x2(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u64_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_u64_x3(a: *mut u64, b: uint64x2x3_t) { + vst1q_s64_x3(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_u64_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_u64_x4(a: *mut u64, b: uint64x2x4_t) { + vst1q_s64_x4(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_p8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_p8_x2(a: *mut p8, b: poly8x8x2_t) { + vst1_s8_x2(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_p8_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_p8_x3(a: *mut p8, b: poly8x8x3_t) { + vst1_s8_x3(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_p8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_p8_x4(a: *mut p8, b: poly8x8x4_t) { + vst1_s8_x4(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_p8_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_p8_x2(a: *mut p8, b: poly8x16x2_t) { + vst1q_s8_x2(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_p8_x3)"] +#[doc = "## 
Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_p8_x3(a: *mut p8, b: poly8x16x3_t) { + vst1q_s8_x3(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_p8_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_p8_x4(a: *mut p8, b: poly8x16x4_t) { + vst1q_s8_x4(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_p16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_p16_x2(a: *mut p16, b: poly16x4x2_t) { + vst1_s16_x2(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_p16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_p16_x3(a: *mut p16, b: poly16x4x3_t) { + vst1_s16_x3(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_p16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1_p16_x4(a: *mut p16, b: poly16x4x4_t) { + vst1_s16_x4(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_p16_x2)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_p16_x2(a: *mut p16, b: poly16x8x2_t) { + vst1q_s16_x2(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_p16_x3)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_p16_x3(a: *mut p16, b: poly16x8x3_t) { + vst1q_s16_x3(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures to one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_p16_x4)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st1) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_p16_x4(a: *mut p16, b: poly16x8x4_t) { + vst1q_s16_x4(transmute(a), transmute(b)) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_v1i64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = 
"v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.64"))] +unsafe fn vst1_v1i64(addr: *const i8, val: int64x1_t, align: i32) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1.v1i64.p0")] + fn _vst1_v1i64(addr: *const i8, val: int64x1_t, align: i32); + } + _vst1_v1i64(addr, val, align) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_v2f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.32"))] +unsafe fn vst1_v2f32(addr: *const i8, val: float32x2_t, align: i32) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1.v2f32.p0")] + fn _vst1_v2f32(addr: *const i8, val: float32x2_t, align: i32); + } + _vst1_v2f32(addr, val, align) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_v2i32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.32"))] +unsafe fn vst1_v2i32(addr: *const i8, val: int32x2_t, align: i32) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1.v2i32.p0")] + fn _vst1_v2i32(addr: *const i8, val: int32x2_t, align: i32); + } + _vst1_v2i32(addr, val, align) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_v4i16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.16"))] +unsafe fn vst1_v4i16(addr: *const i8, val: int16x4_t, align: i32) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1.v4i16.p0")] + fn _vst1_v4i16(addr: *const i8, val: int16x4_t, align: i32); + } + _vst1_v4i16(addr, val, align) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_v8i8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.8"))] +unsafe fn vst1_v8i8(addr: *const i8, val: int8x8_t, align: i32) { + unsafe 
extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1.v8i8.p0")] + fn _vst1_v8i8(addr: *const i8, val: int8x8_t, align: i32); + } + _vst1_v8i8(addr, val, align) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_v16i8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.8"))] +unsafe fn vst1q_v16i8(addr: *const i8, val: int8x16_t, align: i32) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1.v16i8.p0")] + fn _vst1q_v16i8(addr: *const i8, val: int8x16_t, align: i32); + } + _vst1q_v16i8(addr, val, align) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_v2i64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.64"))] +unsafe fn vst1q_v2i64(addr: *const i8, val: int64x2_t, align: i32) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1.v2i64.p0")] + fn _vst1q_v2i64(addr: *const i8, val: int64x2_t, align: i32); + } + _vst1q_v2i64(addr, val, align) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_v4f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.32"))] +unsafe fn vst1q_v4f32(addr: *const i8, val: float32x4_t, align: i32) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1.v4f32.p0")] + fn _vst1q_v4f32(addr: *const i8, val: float32x4_t, align: i32); + } + _vst1q_v4f32(addr, val, align) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_v4i32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.32"))] +unsafe fn vst1q_v4i32(addr: *const i8, val: int32x4_t, align: i32) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1.v4i32.p0")] + fn _vst1q_v4i32(addr: *const i8, val: int32x4_t, align: i32); + } + _vst1q_v4i32(addr, val, align) +} +#[doc = 
"Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_v8i16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.16"))] +unsafe fn vst1q_v8i16(addr: *const i8, val: int16x8_t, align: i32) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1.v8i16.p0")] + fn _vst1q_v8i16(addr: *const i8, val: int16x8_t, align: i32); + } + _vst1q_v8i16(addr, val, align) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_v4f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.16"))] +unsafe fn vst1_v4f16(addr: *const i8, val: float16x4_t, align: i32) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1.v4f16.p0")] + fn _vst1_v4f16(addr: *const i8, val: float16x4_t, align: i32); + } + _vst1_v4f16(addr, val, align) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers."] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_v8f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vst1.16"))] +unsafe fn vst1q_v8f16(addr: *const i8, val: float16x8_t, align: i32) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1.v8f16.p0")] + fn _vst1q_v8f16(addr: *const i8, val: float16x8_t, align: i32); + } + _vst1q_v8f16(addr, val, align) +} +#[doc = "Store multiple single-element structures from one, two, three, or four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_lane_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst1q_lane_p64(a: *mut p64, b: poly64x2_t) { + static_assert_uimm_bits!(LANE, 1); + *a = simd_extract!(b, LANE as u32); +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2_f16(a: *mut f16, b: float16x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2.v4f16.p0" + )] + fn _vst2_f16(a: float16x4_t, b: float16x4_t, ptr: *mut i8); + } + _vst2_f16(b.0, b.1, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2q_f16(a: *mut f16, b: float16x8x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2.v8f16.p0" + )] + fn _vst2q_f16(a: float16x8_t, b: float16x8_t, ptr: *mut i8); + } + _vst2q_f16(b.0, b.1, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(vst2))] +pub unsafe fn vst2_f16(a: *mut f16, b: float16x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0.v4f16")] + fn _vst2_f16(ptr: *mut i8, a: float16x4_t, b: float16x4_t, size: i32); + } + _vst2_f16(a as _, b.0, b.1, 2) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(vst2))] +pub unsafe fn vst2q_f16(a: *mut f16, b: float16x8x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0.v8f16")] + fn _vst2q_f16(ptr: *mut i8, a: float16x8_t, b: float16x8_t, size: i32); + } + _vst2q_f16(a as _, b.0, b.1, 2) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2_f32(a: *mut f32, b: float32x2x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2.v2f32.p0" + )] + fn 
_vst2_f32(a: float32x2_t, b: float32x2_t, ptr: *mut i8); + } + _vst2_f32(b.0, b.1, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2q_f32(a: *mut f32, b: float32x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2.v4f32.p0" + )] + fn _vst2q_f32(a: float32x4_t, b: float32x4_t, ptr: *mut i8); + } + _vst2q_f32(b.0, b.1, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2_s8(a: *mut i8, b: int8x8x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2.v8i8.p0" + )] + fn _vst2_s8(a: int8x8_t, b: int8x8_t, ptr: *mut i8); + } + _vst2_s8(b.0, b.1, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2q_s8(a: *mut i8, b: int8x16x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2.v16i8.p0" + )] + fn _vst2q_s8(a: int8x16_t, b: int8x16_t, ptr: *mut i8); + } + _vst2q_s8(b.0, b.1, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2_s16(a: *mut i16, b: int16x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2.v4i16.p0" + )] + fn _vst2_s16(a: int16x4_t, b: int16x4_t, ptr: *mut i8); + } + _vst2_s16(b.0, b.1, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2q_s16(a: *mut i16, b: int16x8x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + 
link_name = "llvm.aarch64.neon.st2.v8i16.p0" + )] + fn _vst2q_s16(a: int16x8_t, b: int16x8_t, ptr: *mut i8); + } + _vst2q_s16(b.0, b.1, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2_s32(a: *mut i32, b: int32x2x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2.v2i32.p0" + )] + fn _vst2_s32(a: int32x2_t, b: int32x2_t, ptr: *mut i8); + } + _vst2_s32(b.0, b.1, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2q_s32(a: *mut i32, b: int32x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2.v4i32.p0" + )] + fn _vst2q_s32(a: int32x4_t, b: int32x4_t, ptr: *mut i8); + } + _vst2q_s32(b.0, b.1, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst2))] +pub unsafe fn vst2_f32(a: *mut f32, b: float32x2x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.v2f32.p0")] + fn _vst2_f32(ptr: *mut i8, a: float32x2_t, b: float32x2_t, size: i32); + } + _vst2_f32(a as _, b.0, b.1, 4) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst2))] +pub unsafe fn vst2q_f32(a: *mut f32, b: float32x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.v4f32.p0")] + fn _vst2q_f32(ptr: *mut i8, a: float32x4_t, b: float32x4_t, size: i32); + } + _vst2q_f32(a as _, b.0, b.1, 4) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst2))] +pub unsafe fn vst2_s8(a: *mut i8, b: int8x8x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = 
"arm", link_name = "llvm.arm.neon.vst2.v8i8.p0")] + fn _vst2_s8(ptr: *mut i8, a: int8x8_t, b: int8x8_t, size: i32); + } + _vst2_s8(a as _, b.0, b.1, 1) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst2))] +pub unsafe fn vst2q_s8(a: *mut i8, b: int8x16x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.v16i8.p0")] + fn _vst2q_s8(ptr: *mut i8, a: int8x16_t, b: int8x16_t, size: i32); + } + _vst2q_s8(a as _, b.0, b.1, 1) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst2))] +pub unsafe fn vst2_s16(a: *mut i16, b: int16x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.v4i16.p0")] + fn _vst2_s16(ptr: *mut i8, a: int16x4_t, b: int16x4_t, size: i32); + } + _vst2_s16(a as _, b.0, b.1, 2) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst2))] +pub unsafe fn vst2q_s16(a: *mut i16, b: int16x8x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.v8i16.p0")] + fn _vst2q_s16(ptr: *mut i8, a: int16x8_t, b: int16x8_t, size: i32); + } + _vst2q_s16(a as _, b.0, b.1, 2) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst2))] +pub unsafe fn vst2_s32(a: *mut i32, b: int32x2x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.v2i32.p0")] + fn _vst2_s32(ptr: *mut i8, a: int32x2_t, b: int32x2_t, size: i32); + } + _vst2_s32(a as _, b.0, b.1, 4) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst2))] +pub unsafe fn vst2q_s32(a: *mut i32, b: int32x4x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = 
"llvm.arm.neon.vst2.v4i32.p0")] + fn _vst2q_s32(ptr: *mut i8, a: int32x4_t, b: int32x4_t, size: i32); + } + _vst2q_s32(a as _, b.0, b.1, 4) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst2_lane_f16(a: *mut f16, b: float16x4x2_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2lane.v4f16.p0" + )] + fn _vst2_lane_f16(a: float16x4_t, b: float16x4_t, n: i64, ptr: *mut i8); + } + _vst2_lane_f16(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst2q_lane_f16(a: *mut f16, b: float16x8x2_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2lane.v8f16.p0" + )] + fn _vst2q_lane_f16(a: float16x8_t, b: float16x8_t, n: i64, ptr: *mut i8); + } + _vst2q_lane_f16(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(vst2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst2_lane_f16(a: *mut f16, b: float16x4x2_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0.v4f16")] + fn _vst2_lane_f16(ptr: *mut i8, a: float16x4_t, b: float16x4_t, n: i32, size: i32); + } + _vst2_lane_f16(a as _, b.0, b.1, LANE, 2) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(vst2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst2q_lane_f16(a: *mut f16, b: float16x8x2_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0.v8f16")] + fn _vst2q_lane_f16(ptr: *mut i8, a: float16x8_t, 
b: float16x8_t, n: i32, size: i32); + } + _vst2q_lane_f16(a as _, b.0, b.1, LANE, 2) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2_lane_f32(a: *mut f32, b: float32x2x2_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2lane.v2f32.p0" + )] + fn _vst2_lane_f32(a: float32x2_t, b: float32x2_t, n: i64, ptr: *mut i8); + } + _vst2_lane_f32(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2q_lane_f32(a: *mut f32, b: float32x4x2_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2lane.v4f32.p0" + )] + fn _vst2q_lane_f32(a: float32x4_t, b: float32x4_t, n: i64, ptr: *mut i8); + } + _vst2q_lane_f32(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2_lane_s8(a: *mut i8, b: int8x8x2_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2lane.v8i8.p0" + )] + fn _vst2_lane_s8(a: int8x8_t, b: int8x8_t, n: i64, ptr: *mut i8); + } + _vst2_lane_s8(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2_lane_s16(a: *mut i16, b: int16x4x2_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2lane.v4i16.p0" + )] + fn _vst2_lane_s16(a: int16x4_t, b: int16x4_t, n: i64, ptr: *mut i8); + } + _vst2_lane_s16(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] 
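// Illustrative usage sketch, not part of the patch: the non-lane vst2_f32
// defined earlier in this file performs an interleaving store. The helper name
// `interleave_f32_pairs` and its arguments are assumptions for the example.
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn interleave_f32_pairs(dst: *mut f32, even: float32x2_t, odd: float32x2_t) {
    // ST2 interleaves the two registers, so memory at `dst` becomes
    // [even[0], odd[0], even[1], odd[1]] (4 f32 values).
    unsafe { vst2_f32(dst, float32x2x2_t(even, odd)) }
}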
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2q_lane_s16(a: *mut i16, b: int16x8x2_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2lane.v8i16.p0" + )] + fn _vst2q_lane_s16(a: int16x8_t, b: int16x8_t, n: i64, ptr: *mut i8); + } + _vst2q_lane_s16(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2_lane_s32(a: *mut i32, b: int32x2x2_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2lane.v2i32.p0" + )] + fn _vst2_lane_s32(a: int32x2_t, b: int32x2_t, n: i64, ptr: *mut i8); + } + _vst2_lane_s32(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st2, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst2q_lane_s32(a: *mut i32, b: int32x4x2_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2lane.v4i32.p0" + )] + fn _vst2q_lane_s32(a: int32x4_t, b: int32x4_t, n: i64, ptr: *mut i8); + } + _vst2q_lane_s32(b.0, b.1, LANE as i64, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst2_lane_f32(a: *mut f32, b: float32x2x2_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.v2f32.p0")] + fn _vst2_lane_f32(ptr: *mut i8, a: float32x2_t, b: float32x2_t, n: i32, size: i32); + } + _vst2_lane_f32(a as _, b.0, b.1, LANE, 4) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] 
+#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst2q_lane_f32(a: *mut f32, b: float32x4x2_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.v4f32.p0")] + fn _vst2q_lane_f32(ptr: *mut i8, a: float32x4_t, b: float32x4_t, n: i32, size: i32); + } + _vst2q_lane_f32(a as _, b.0, b.1, LANE, 4) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst2_lane_s8(a: *mut i8, b: int8x8x2_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.v8i8.p0")] + fn _vst2_lane_s8(ptr: *mut i8, a: int8x8_t, b: int8x8_t, n: i32, size: i32); + } + _vst2_lane_s8(a as _, b.0, b.1, LANE, 1) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst2_lane_s16(a: *mut i16, b: int16x4x2_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.v4i16.p0")] + fn _vst2_lane_s16(ptr: *mut i8, a: int16x4_t, b: int16x4_t, n: i32, size: i32); + } + _vst2_lane_s16(a as _, b.0, b.1, LANE, 2) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst2q_lane_s16(a: *mut i16, b: int16x8x2_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.v8i16.p0")] + fn _vst2q_lane_s16(ptr: *mut i8, a: int16x8_t, b: int16x8_t, n: i32, size: i32); + } + _vst2q_lane_s16(a as _, b.0, b.1, LANE, 2) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn 
vst2_lane_s32(a: *mut i32, b: int32x2x2_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.v2i32.p0")] + fn _vst2_lane_s32(ptr: *mut i8, a: int32x2_t, b: int32x2_t, n: i32, size: i32); + } + _vst2_lane_s32(a as _, b.0, b.1, LANE, 4) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst2q_lane_s32(a: *mut i32, b: int32x4x2_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.v4i32.p0")] + fn _vst2q_lane_s32(ptr: *mut i8, a: int32x4_t, b: int32x4_t, n: i32, size: i32); + } + _vst2q_lane_s32(a as _, b.0, b.1, LANE, 4) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2_lane_u8(a: *mut u8, b: uint8x8x2_t) { + static_assert_uimm_bits!(LANE, 3); + vst2_lane_s8::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2_lane_u16(a: *mut u16, b: uint16x4x2_t) { + static_assert_uimm_bits!(LANE, 2); + vst2_lane_s16::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + 
assert_instr(st2, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2q_lane_u16(a: *mut u16, b: uint16x8x2_t) { + static_assert_uimm_bits!(LANE, 3); + vst2q_lane_s16::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2_lane_u32(a: *mut u32, b: uint32x2x2_t) { + static_assert_uimm_bits!(LANE, 1); + vst2_lane_s32::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2q_lane_u32(a: *mut u32, b: uint32x4x2_t) { + static_assert_uimm_bits!(LANE, 2); + vst2q_lane_s32::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2_lane_p8(a: *mut p8, b: poly8x8x2_t) { + static_assert_uimm_bits!(LANE, 3); + vst2_lane_s8::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] 
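// Illustrative usage sketch, not part of the patch: the lane variants store only
// the selected element of each register. LANE is a const generic checked with
// static_assert_uimm_bits!, so an out-of-range lane is rejected at compile time.
// The helper name `store_lane1_u16` is an assumption for the example.
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn store_lane1_u16(dst: *mut u16, pair: uint16x4x2_t) {
    // Writes exactly two u16 values: pair.0[1] to dst[0] and pair.1[1] to dst[1].
    unsafe { vst2_lane_u16::<1>(dst, pair) }
}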
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2_lane_p16(a: *mut p16, b: poly16x4x2_t) { + static_assert_uimm_bits!(LANE, 2); + vst2_lane_s16::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2q_lane_p16(a: *mut p16, b: poly16x8x2_t) { + static_assert_uimm_bits!(LANE, 3); + vst2q_lane_s16::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2_p64(a: *mut p64, b: poly64x1x2_t) { + vst2_s64(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vst2_s64(a: *mut i64, b: int64x1x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.v1i64.p0")] + fn _vst2_s64(ptr: *mut i8, a: int64x1_t, b: int64x1_t, size: i32); + } + _vst2_s64(a as _, b.0, b.1, 8) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn 
vst2_s64(a: *mut i64, b: int64x1x2_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st2.v1i64.p0" + )] + fn _vst2_s64(a: int64x1_t, b: int64x1_t, ptr: *mut i8); + } + _vst2_s64(b.0, b.1, a as _) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2_u64(a: *mut u64, b: uint64x1x2_t) { + vst2_s64(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2_u8(a: *mut u8, b: uint8x8x2_t) { + vst2_s8(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2q_u8(a: *mut u8, b: uint8x16x2_t) { + vst2q_s8(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2_u16(a: *mut u16, b: 
uint16x4x2_t) { + vst2_s16(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2q_u16(a: *mut u16, b: uint16x8x2_t) { + vst2q_s16(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2_u32(a: *mut u32, b: uint32x2x2_t) { + vst2_s32(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2q_u32(a: *mut u32, b: uint32x4x2_t) { + vst2q_s32(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2_p8(a: *mut p8, b: poly8x8x2_t) { + vst2_s8(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2q_p8(a: *mut p8, b: poly8x16x2_t) { + vst2q_s8(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2_p16(a: *mut p16, b: poly16x4x2_t) { + vst2_s16(transmute(a), transmute(b)) +} +#[doc = "Store multiple 2-element structures from two registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst2q_p16(a: *mut p16, b: poly16x8x2_t) { + vst2q_s16(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(vst3))] +pub unsafe fn vst3_f16(a: *mut f16, b: float16x4x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v4f16")] + fn _vst3_f16(ptr: *mut i8, a: float16x4_t, b: float16x4_t, c: float16x4_t, size: i32); + } + _vst3_f16(a as _, b.0, b.1, b.2, 2) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
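// Illustrative sketch (not part of this patch): minimal usage of the vst2
// family defined above, storing two 8-lane planes element-interleaved as
// [a0, b0, a1, b1, ...]. The helper name and buffers are hypothetical;
// assumes AArch64, where these wrappers are stable.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn interleave_u8(plane_a: &[u8; 8], plane_b: &[u8; 8], out: &mut [u8; 16]) {
    use core::arch::aarch64::{uint8x8x2_t, vld1_u8, vst2_u8};
    // Load both planes, then let vst2_u8 write them interleaved to `out`.
    let pair = uint8x8x2_t(vld1_u8(plane_a.as_ptr()), vld1_u8(plane_b.as_ptr()));
    vst2_u8(out.as_mut_ptr(), pair);
}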
+#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(vst3))] +pub unsafe fn vst3q_f16(a: *mut f16, b: float16x8x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v8f16")] + fn _vst3q_f16(ptr: *mut i8, a: float16x8_t, b: float16x8_t, c: float16x8_t, size: i32); + } + _vst3q_f16(a as _, b.0, b.1, b.2, 2) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3_f16(a: *mut f16, b: float16x4x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3.v4f16.p0" + )] + fn _vst3_f16(a: float16x4_t, b: float16x4_t, c: float16x4_t, ptr: *mut i8); + } + _vst3_f16(b.0, b.1, b.2, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3q_f16(a: *mut f16, b: float16x8x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3.v8f16.p0" + )] + fn _vst3q_f16(a: float16x8_t, b: float16x8_t, c: float16x8_t, ptr: *mut i8); + } + _vst3q_f16(b.0, b.1, b.2, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst3))] +pub unsafe fn vst3_f32(a: *mut f32, b: float32x2x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v2f32")] + fn _vst3_f32(ptr: *mut i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, size: i32); + } + _vst3_f32(a as _, b.0, b.1, b.2, 4) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst3))] +pub unsafe fn vst3q_f32(a: *mut f32, b: float32x4x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v4f32")] + fn _vst3q_f32(ptr: *mut i8, a: float32x4_t, b: float32x4_t, c: float32x4_t, size: i32); + } + _vst3q_f32(a as _, b.0, b.1, b.2, 4) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's 
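// Illustrative sketch (not part of this patch): the vst3 stores in this block
// write three registers element-interleaved, e.g. planar RGB to packed RGB.
// The helper name and buffers are hypothetical; assumes AArch64 (stable path).
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn store_rgb_interleaved(r: &[f32; 2], g: &[f32; 2], b: &[f32; 2], out: &mut [f32; 6]) {
    use core::arch::aarch64::{float32x2x3_t, vld1_f32, vst3_f32};
    let rgb = float32x2x3_t(vld1_f32(r.as_ptr()), vld1_f32(g.as_ptr()), vld1_f32(b.as_ptr()));
    // Result layout: out = [r0, g0, b0, r1, g1, b1]
    vst3_f32(out.as_mut_ptr(), rgb);
}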
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst3))] +pub unsafe fn vst3_s8(a: *mut i8, b: int8x8x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v8i8")] + fn _vst3_s8(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, size: i32); + } + _vst3_s8(a as _, b.0, b.1, b.2, 1) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst3))] +pub unsafe fn vst3q_s8(a: *mut i8, b: int8x16x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v16i8")] + fn _vst3q_s8(ptr: *mut i8, a: int8x16_t, b: int8x16_t, c: int8x16_t, size: i32); + } + _vst3q_s8(a as _, b.0, b.1, b.2, 1) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst3))] +pub unsafe fn vst3_s16(a: *mut i16, b: int16x4x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v4i16")] + fn _vst3_s16(ptr: *mut i8, a: int16x4_t, b: int16x4_t, c: int16x4_t, size: i32); + } + _vst3_s16(a as _, b.0, b.1, b.2, 2) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst3))] +pub unsafe fn vst3q_s16(a: *mut i16, b: int16x8x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v8i16")] + fn _vst3q_s16(ptr: *mut i8, a: int16x8_t, b: int16x8_t, c: int16x8_t, size: i32); + } + _vst3q_s16(a as _, b.0, b.1, b.2, 2) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst3))] +pub unsafe fn vst3_s32(a: *mut i32, b: int32x2x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v2i32")] + fn _vst3_s32(ptr: *mut i8, a: int32x2_t, b: int32x2_t, c: int32x2_t, size: i32); + } + _vst3_s32(a as _, b.0, b.1, b.2, 4) +} +#[doc = "Store multiple 
3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst3))] +pub unsafe fn vst3q_s32(a: *mut i32, b: int32x4x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v4i32")] + fn _vst3q_s32(ptr: *mut i8, a: int32x4_t, b: int32x4_t, c: int32x4_t, size: i32); + } + _vst3q_s32(a as _, b.0, b.1, b.2, 4) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3_f32(a: *mut f32, b: float32x2x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3.v2f32.p0" + )] + fn _vst3_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t, ptr: *mut i8); + } + _vst3_f32(b.0, b.1, b.2, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3q_f32(a: *mut f32, b: float32x4x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3.v4f32.p0" + )] + fn _vst3q_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t, ptr: *mut i8); + } + _vst3q_f32(b.0, b.1, b.2, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3_s8(a: *mut i8, b: int8x8x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3.v8i8.p0" + )] + fn _vst3_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t, ptr: *mut i8); + } + _vst3_s8(b.0, b.1, b.2, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3q_s8(a: *mut i8, b: int8x16x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = 
"llvm.aarch64.neon.st3.v16i8.p0" + )] + fn _vst3q_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t, ptr: *mut i8); + } + _vst3q_s8(b.0, b.1, b.2, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3_s16(a: *mut i16, b: int16x4x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3.v4i16.p0" + )] + fn _vst3_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t, ptr: *mut i8); + } + _vst3_s16(b.0, b.1, b.2, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3q_s16(a: *mut i16, b: int16x8x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3.v8i16.p0" + )] + fn _vst3q_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t, ptr: *mut i8); + } + _vst3q_s16(b.0, b.1, b.2, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3_s32(a: *mut i32, b: int32x2x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3.v2i32.p0" + )] + fn _vst3_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t, ptr: *mut i8); + } + _vst3_s32(b.0, b.1, b.2, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3q_s32(a: *mut i32, b: int32x4x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3.v4i32.p0" + )] + fn _vst3q_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t, ptr: *mut i8); + } + _vst3q_s32(b.0, b.1, b.2, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(vst3, LANE = 0))] 
+#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst3_lane_f16(a: *mut f16, b: float16x4x3_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0.v4f16")] + fn _vst3_lane_f16( + ptr: *mut i8, + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, + n: i32, + size: i32, + ); + } + _vst3_lane_f16(a as _, b.0, b.1, b.2, LANE, 4) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(vst3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst3q_lane_f16(a: *mut f16, b: float16x8x3_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0.v8f16")] + fn _vst3q_lane_f16( + ptr: *mut i8, + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, + n: i32, + size: i32, + ); + } + _vst3q_lane_f16(a as _, b.0, b.1, b.2, LANE, 4) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst3_lane_f16(a: *mut f16, b: float16x4x3_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3lane.v4f16.p0" + )] + fn _vst3_lane_f16(a: float16x4_t, b: float16x4_t, c: float16x4_t, n: i64, ptr: *mut i8); + } + _vst3_lane_f16(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst3q_lane_f16(a: *mut f16, b: float16x8x3_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3lane.v8f16.p0" + )] + fn _vst3q_lane_f16(a: float16x8_t, b: float16x8_t, c: float16x8_t, n: i64, ptr: *mut i8); + } + _vst3q_lane_f16(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] 
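// Complementing the sketch above (illustrative, not part of this patch):
// static_assert_uimm_bits! bounds the const LANE, so an out-of-range lane
// index fails to compile instead of reaching the store. Hypothetical helper.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn lane_bounds_demo(out: *mut f32, v: core::arch::aarch64::float32x2x3_t) {
    use core::arch::aarch64::vst3_lane_f32;
    vst3_lane_f32::<1>(out, v); // ok: float32x2_t only has lanes 0 and 1
    // vst3_lane_f32::<2>(out, v); // rejected at compile time (LANE must fit in 1 bit)
}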
+#[cfg_attr(test, assert_instr(vst3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst3_lane_f32(a: *mut f32, b: float32x2x3_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0.v2f32")] + fn _vst3_lane_f32( + ptr: *mut i8, + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, + n: i32, + size: i32, + ); + } + _vst3_lane_f32(a as _, b.0, b.1, b.2, LANE, 4) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst3q_lane_f32(a: *mut f32, b: float32x4x3_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0.v4f32")] + fn _vst3q_lane_f32( + ptr: *mut i8, + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, + n: i32, + size: i32, + ); + } + _vst3q_lane_f32(a as _, b.0, b.1, b.2, LANE, 4) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst3_lane_s8(a: *mut i8, b: int8x8x3_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0.v8i8")] + fn _vst3_lane_s8(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, n: i32, size: i32); + } + _vst3_lane_s8(a as _, b.0, b.1, b.2, LANE, 1) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst3_lane_s16(a: *mut i16, b: int16x4x3_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0.v4i16")] + fn _vst3_lane_s16( + ptr: *mut i8, + a: int16x4_t, + b: int16x4_t, + c: int16x4_t, + n: i32, + size: i32, + ); + } + _vst3_lane_s16(a as _, b.0, b.1, b.2, LANE, 2) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = 
"stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst3q_lane_s16(a: *mut i16, b: int16x8x3_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0.v8i16")] + fn _vst3q_lane_s16( + ptr: *mut i8, + a: int16x8_t, + b: int16x8_t, + c: int16x8_t, + n: i32, + size: i32, + ); + } + _vst3q_lane_s16(a as _, b.0, b.1, b.2, LANE, 2) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst3_lane_s32(a: *mut i32, b: int32x2x3_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0.v2i32")] + fn _vst3_lane_s32( + ptr: *mut i8, + a: int32x2_t, + b: int32x2_t, + c: int32x2_t, + n: i32, + size: i32, + ); + } + _vst3_lane_s32(a as _, b.0, b.1, b.2, LANE, 4) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst3q_lane_s32(a: *mut i32, b: int32x4x3_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0.v4i32")] + fn _vst3q_lane_s32( + ptr: *mut i8, + a: int32x4_t, + b: int32x4_t, + c: int32x4_t, + n: i32, + size: i32, + ); + } + _vst3q_lane_s32(a as _, b.0, b.1, b.2, LANE, 4) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst3_lane_f32(a: *mut f32, b: float32x2x3_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3lane.v2f32.p0" + )] + fn _vst3_lane_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t, n: i64, ptr: *mut i8); + } + _vst3_lane_f32(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst3q_lane_f32(a: *mut f32, b: 
float32x4x3_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3lane.v4f32.p0" + )] + fn _vst3q_lane_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t, n: i64, ptr: *mut i8); + } + _vst3q_lane_f32(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst3_lane_s8(a: *mut i8, b: int8x8x3_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3lane.v8i8.p0" + )] + fn _vst3_lane_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t, n: i64, ptr: *mut i8); + } + _vst3_lane_s8(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst3_lane_s16(a: *mut i16, b: int16x4x3_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3lane.v4i16.p0" + )] + fn _vst3_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t, n: i64, ptr: *mut i8); + } + _vst3_lane_s16(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst3q_lane_s16(a: *mut i16, b: int16x8x3_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3lane.v8i16.p0" + )] + fn _vst3q_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t, n: i64, ptr: *mut i8); + } + _vst3q_lane_s16(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst3_lane_s32(a: *mut i32, b: int32x2x3_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern 
"unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3lane.v2i32.p0" + )] + fn _vst3_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t, n: i64, ptr: *mut i8); + } + _vst3_lane_s32(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st3, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst3q_lane_s32(a: *mut i32, b: int32x4x3_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3lane.v4i32.p0" + )] + fn _vst3q_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t, n: i64, ptr: *mut i8); + } + _vst3q_lane_s32(b.0, b.1, b.2, LANE as i64, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3_lane_u8(a: *mut u8, b: uint8x8x3_t) { + static_assert_uimm_bits!(LANE, 3); + vst3_lane_s8::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3_lane_u16(a: *mut u16, b: uint16x4x3_t) { + static_assert_uimm_bits!(LANE, 2); + vst3_lane_s16::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + 
assert_instr(st3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3q_lane_u16(a: *mut u16, b: uint16x8x3_t) { + static_assert_uimm_bits!(LANE, 3); + vst3q_lane_s16::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3_lane_u32(a: *mut u32, b: uint32x2x3_t) { + static_assert_uimm_bits!(LANE, 1); + vst3_lane_s32::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3q_lane_u32(a: *mut u32, b: uint32x4x3_t) { + static_assert_uimm_bits!(LANE, 2); + vst3q_lane_s32::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3_lane_p8(a: *mut p8, b: poly8x8x3_t) { + static_assert_uimm_bits!(LANE, 3); + vst3_lane_s8::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] 
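// Illustrative sketch (not part of this patch): the unsigned and poly wrappers
// in this block only transmute to the signed implementations, so they are used
// the same way. Hypothetical helper; assumes AArch64 (stable path).
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn store_lane_u32(out: &mut [u32; 3], v: core::arch::aarch64::uint32x2x3_t) {
    use core::arch::aarch64::vst3_lane_u32;
    // Writes lane 1 of each register; the index must be 0 or 1 here.
    vst3_lane_u32::<1>(out.as_mut_ptr(), v);
}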
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3_lane_p16(a: *mut p16, b: poly16x4x3_t) { + static_assert_uimm_bits!(LANE, 2); + vst3_lane_s16::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3q_lane_p16(a: *mut p16, b: poly16x8x3_t) { + static_assert_uimm_bits!(LANE, 3); + vst3q_lane_s16::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3_p64(a: *mut p64, b: poly64x1x3_t) { + vst3_s64(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vst3_s64(a: *mut i64, b: int64x1x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st3.v1i64.p0" + )] + fn _vst3_s64(a: int64x1_t, b: int64x1_t, c: int64x1_t, ptr: *mut i8); + } + _vst3_s64(b.0, b.1, b.2, a as _) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = 
"111800")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vst3_s64(a: *mut i64, b: int64x1x3_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v1i64")] + fn _vst3_s64(ptr: *mut i8, a: int64x1_t, b: int64x1_t, c: int64x1_t, size: i32); + } + _vst3_s64(a as _, b.0, b.1, b.2, 8) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3_u64(a: *mut u64, b: uint64x1x3_t) { + vst3_s64(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3_u8(a: *mut u8, b: uint8x8x3_t) { + vst3_s8(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3q_u8(a: *mut u8, b: uint8x16x3_t) { + vst3q_s8(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = 
"111800") +)] +pub unsafe fn vst3_u16(a: *mut u16, b: uint16x4x3_t) { + vst3_s16(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3q_u16(a: *mut u16, b: uint16x8x3_t) { + vst3q_s16(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3_u32(a: *mut u32, b: uint32x2x3_t) { + vst3_s32(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3q_u32(a: *mut u32, b: uint32x4x3_t) { + vst3q_s32(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3_p8(a: *mut p8, b: poly8x8x3_t) { + vst3_s8(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3q_p8(a: *mut p8, b: poly8x16x3_t) { + vst3q_s8(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3_p16(a: *mut p16, b: poly16x4x3_t) { + vst3_s16(transmute(a), transmute(b)) +} +#[doc = "Store multiple 3-element structures from three registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st3) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst3q_p16(a: *mut p16, b: poly16x8x3_t) { + vst3q_s16(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(vst4))] +pub unsafe fn vst4_f16(a: *mut f16, b: float16x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0.v4f16")] + fn _vst4_f16( + ptr: *mut i8, + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, + d: float16x4_t, + size: i32, + ); + } + _vst4_f16(a as _, b.0, b.1, b.2, b.3, 2) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(vst4))] +pub unsafe fn vst4q_f16(a: *mut f16, b: float16x8x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0.v8f16")] + fn _vst4q_f16( + ptr: *mut i8, + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, + d: float16x8_t, + size: i32, + ); + } + _vst4q_f16(a as _, b.0, b.1, b.2, b.3, 2) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4_f16(a: *mut f16, b: float16x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4.v4f16.p0" + )] + fn _vst4_f16(a: float16x4_t, b: float16x4_t, c: float16x4_t, d: float16x4_t, ptr: *mut i8); + } + _vst4_f16(b.0, b.1, b.2, b.3, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4q_f16(a: *mut f16, b: float16x8x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4.v8f16.p0" + )] + fn _vst4q_f16(a: float16x8_t, b: float16x8_t, c: float16x8_t, d: float16x8_t, ptr: *mut i8); + } + _vst4q_f16(b.0, b.1, b.2, b.3, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst4))] +pub unsafe fn vst4_f32(a: *mut f32, b: float32x2x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0.v2f32")] + fn _vst4_f32( + ptr: *mut i8, + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, + d: float32x2_t, + size: i32, + ); + } + _vst4_f32(a as _, b.0, b.1, b.2, b.3, 4) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst4))] +pub unsafe fn vst4q_f32(a: *mut f32, b: float32x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0.v4f32")] + fn _vst4q_f32( + ptr: *mut i8, + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, + d: float32x4_t, + size: 
i32, + ); + } + _vst4q_f32(a as _, b.0, b.1, b.2, b.3, 4) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst4))] +pub unsafe fn vst4_s8(a: *mut i8, b: int8x8x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0.v8i8")] + fn _vst4_s8(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, size: i32); + } + _vst4_s8(a as _, b.0, b.1, b.2, b.3, 1) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst4))] +pub unsafe fn vst4q_s8(a: *mut i8, b: int8x16x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0.v16i8")] + fn _vst4q_s8( + ptr: *mut i8, + a: int8x16_t, + b: int8x16_t, + c: int8x16_t, + d: int8x16_t, + size: i32, + ); + } + _vst4q_s8(a as _, b.0, b.1, b.2, b.3, 1) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst4))] +pub unsafe fn vst4_s16(a: *mut i16, b: int16x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0.v4i16")] + fn _vst4_s16( + ptr: *mut i8, + a: int16x4_t, + b: int16x4_t, + c: int16x4_t, + d: int16x4_t, + size: i32, + ); + } + _vst4_s16(a as _, b.0, b.1, b.2, b.3, 2) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst4))] +pub unsafe fn vst4q_s16(a: *mut i16, b: int16x8x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0.v8i16")] + fn _vst4q_s16( + ptr: *mut i8, + a: int16x8_t, + b: int16x8_t, + c: int16x8_t, + d: int16x8_t, + size: i32, + ); + } + _vst4q_s16(a as _, b.0, b.1, b.2, b.3, 2) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst4))] +pub unsafe fn vst4_s32(a: *mut i32, b: int32x2x4_t) { + 
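    // Editor's note (a sketch, not part of the vendored file): the 32-bit Arm
    // binding of `llvm.arm.neon.vst4.*` used in this body takes the destination
    // pointer first plus a trailing element-size argument (4 for `i32`), while
    // the AArch64 `llvm.aarch64.neon.st4.*` bindings further below take the four
    // vectors first and the pointer last, with no size operand. A minimal,
    // hedged caller-side example (hypothetical buffer names; assumes a
    // NEON-capable target and, on 32-bit Arm, the unstable stdarch features):
    //
    //     use core::arch::arm::{vld4_s32, vst4_s32};
    //     let src = [1i32, 2, 3, 4, 5, 6, 7, 8];
    //     let mut dst = [0i32; 8];
    //     unsafe {
    //         let quad = vld4_s32(src.as_ptr()); // int32x2x4_t: four 2-lane vectors
    //         vst4_s32(dst.as_mut_ptr(), quad);  // re-interleaves all 8 values into dst
    //     }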
unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0.v2i32")] + fn _vst4_s32( + ptr: *mut i8, + a: int32x2_t, + b: int32x2_t, + c: int32x2_t, + d: int32x2_t, + size: i32, + ); + } + _vst4_s32(a as _, b.0, b.1, b.2, b.3, 4) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vst4))] +pub unsafe fn vst4q_s32(a: *mut i32, b: int32x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0.v4i32")] + fn _vst4q_s32( + ptr: *mut i8, + a: int32x4_t, + b: int32x4_t, + c: int32x4_t, + d: int32x4_t, + size: i32, + ); + } + _vst4q_s32(a as _, b.0, b.1, b.2, b.3, 4) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4_f32(a: *mut f32, b: float32x2x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4.v2f32.p0" + )] + fn _vst4_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, ptr: *mut i8); + } + _vst4_f32(b.0, b.1, b.2, b.3, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4q_f32(a: *mut f32, b: float32x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4.v4f32.p0" + )] + fn _vst4q_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, ptr: *mut i8); + } + _vst4q_f32(b.0, b.1, b.2, b.3, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4_s8(a: *mut i8, b: int8x8x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4.v8i8.p0" + )] + fn _vst4_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, ptr: *mut i8); + } + _vst4_s8(b.0, b.1, b.2, b.3, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic 
unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4q_s8(a: *mut i8, b: int8x16x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4.v16i8.p0" + )] + fn _vst4q_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, ptr: *mut i8); + } + _vst4q_s8(b.0, b.1, b.2, b.3, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4_s16(a: *mut i16, b: int16x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4.v4i16.p0" + )] + fn _vst4_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, ptr: *mut i8); + } + _vst4_s16(b.0, b.1, b.2, b.3, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4q_s16(a: *mut i16, b: int16x8x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4.v8i16.p0" + )] + fn _vst4q_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, ptr: *mut i8); + } + _vst4q_s16(b.0, b.1, b.2, b.3, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4_s32(a: *mut i32, b: int32x2x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4.v2i32.p0" + )] + fn _vst4_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, ptr: *mut i8); + } + _vst4_s32(b.0, b.1, b.2, b.3, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4q_s32(a: *mut i32, b: int32x4x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4.v4i32.p0" + )] + fn _vst4q_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, ptr: *mut i8); + } + _vst4q_s32(b.0, b.1, b.2, 
b.3, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(vst4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst4_lane_f16(a: *mut f16, b: float16x4x4_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0.v4f16")] + fn _vst4_lane_f16( + ptr: *mut i8, + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, + d: float16x4_t, + n: i32, + size: i32, + ); + } + _vst4_lane_f16(a as _, b.0, b.1, b.2, b.3, LANE, 2) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(vst4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst4q_lane_f16(a: *mut f16, b: float16x8x4_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0.v8f16")] + fn _vst4q_lane_f16( + ptr: *mut i8, + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, + d: float16x8_t, + n: i32, + size: i32, + ); + } + _vst4q_lane_f16(a as _, b.0, b.1, b.2, b.3, LANE, 2) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst4_lane_f16(a: *mut f16, b: float16x4x4_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4lane.v4f16.p0" + )] + fn _vst4_lane_f16( + a: float16x4_t, + b: float16x4_t, + c: float16x4_t, + d: float16x4_t, + n: i64, + ptr: *mut i8, + ); + } + _vst4_lane_f16(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_f16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub unsafe fn vst4q_lane_f16(a: *mut f16, b: float16x8x4_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = 
"llvm.aarch64.neon.st4lane.v8f16.p0" + )] + fn _vst4q_lane_f16( + a: float16x8_t, + b: float16x8_t, + c: float16x8_t, + d: float16x8_t, + n: i64, + ptr: *mut i8, + ); + } + _vst4q_lane_f16(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst4_lane_f32(a: *mut f32, b: float32x2x4_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0.v2f32")] + fn _vst4_lane_f32( + ptr: *mut i8, + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, + d: float32x2_t, + n: i32, + size: i32, + ); + } + _vst4_lane_f32(a as _, b.0, b.1, b.2, b.3, LANE, 4) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst4q_lane_f32(a: *mut f32, b: float32x4x4_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0.v4f32")] + fn _vst4q_lane_f32( + ptr: *mut i8, + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, + d: float32x4_t, + n: i32, + size: i32, + ); + } + _vst4q_lane_f32(a as _, b.0, b.1, b.2, b.3, LANE, 4) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst4_lane_s8(a: *mut i8, b: int8x8x4_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0.v8i8")] + fn _vst4_lane_s8( + ptr: *mut i8, + a: int8x8_t, + b: int8x8_t, + c: int8x8_t, + d: int8x8_t, + n: i32, + size: i32, + ); + } + _vst4_lane_s8(a as _, b.0, b.1, b.2, b.3, LANE, 1) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst4_lane_s16(a: *mut i16, b: int16x4x4_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = 
"llvm.arm.neon.vst4lane.p0.v4i16")] + fn _vst4_lane_s16( + ptr: *mut i8, + a: int16x4_t, + b: int16x4_t, + c: int16x4_t, + d: int16x4_t, + n: i32, + size: i32, + ); + } + _vst4_lane_s16(a as _, b.0, b.1, b.2, b.3, LANE, 2) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst4q_lane_s16(a: *mut i16, b: int16x8x4_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0.v8i16")] + fn _vst4q_lane_s16( + ptr: *mut i8, + a: int16x8_t, + b: int16x8_t, + c: int16x8_t, + d: int16x8_t, + n: i32, + size: i32, + ); + } + _vst4q_lane_s16(a as _, b.0, b.1, b.2, b.3, LANE, 2) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst4_lane_s32(a: *mut i32, b: int32x2x4_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0.v2i32")] + fn _vst4_lane_s32( + ptr: *mut i8, + a: int32x2_t, + b: int32x2_t, + c: int32x2_t, + d: int32x2_t, + n: i32, + size: i32, + ); + } + _vst4_lane_s32(a as _, b.0, b.1, b.2, b.3, LANE, 4) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vst4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +pub unsafe fn vst4q_lane_s32(a: *mut i32, b: int32x4x4_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0.v4i32")] + fn _vst4q_lane_s32( + ptr: *mut i8, + a: int32x4_t, + b: int32x4_t, + c: int32x4_t, + d: int32x4_t, + n: i32, + size: i32, + ); + } + _vst4q_lane_s32(a as _, b.0, b.1, b.2, b.3, LANE, 4) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst4_lane_f32(a: *mut f32, b: float32x2x4_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name 
= "llvm.aarch64.neon.st4lane.v2f32.p0" + )] + fn _vst4_lane_f32( + a: float32x2_t, + b: float32x2_t, + c: float32x2_t, + d: float32x2_t, + n: i64, + ptr: *mut i8, + ); + } + _vst4_lane_f32(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_f32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst4q_lane_f32(a: *mut f32, b: float32x4x4_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4lane.v4f32.p0" + )] + fn _vst4q_lane_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, + d: float32x4_t, + n: i64, + ptr: *mut i8, + ); + } + _vst4q_lane_f32(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_s8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst4_lane_s8(a: *mut i8, b: int8x8x4_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4lane.v8i8.p0" + )] + fn _vst4_lane_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, n: i64, ptr: *mut i8); + } + _vst4_lane_s8(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst4_lane_s16(a: *mut i16, b: int16x4x4_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4lane.v4i16.p0" + )] + fn _vst4_lane_s16( + a: int16x4_t, + b: int16x4_t, + c: int16x4_t, + d: int16x4_t, + n: i64, + ptr: *mut i8, + ); + } + _vst4_lane_s16(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_s16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst4q_lane_s16(a: *mut i16, b: int16x8x4_t) { + static_assert_uimm_bits!(LANE, 3); + unsafe extern "unadjusted" { + #[cfg_attr( + 
any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4lane.v8i16.p0" + )] + fn _vst4q_lane_s16( + a: int16x8_t, + b: int16x8_t, + c: int16x8_t, + d: int16x8_t, + n: i64, + ptr: *mut i8, + ); + } + _vst4q_lane_s16(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst4_lane_s32(a: *mut i32, b: int32x2x4_t) { + static_assert_uimm_bits!(LANE, 1); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4lane.v2i32.p0" + )] + fn _vst4_lane_s32( + a: int32x2_t, + b: int32x2_t, + c: int32x2_t, + d: int32x2_t, + n: i64, + ptr: *mut i8, + ); + } + _vst4_lane_s32(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_s32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(st4, LANE = 0))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +pub unsafe fn vst4q_lane_s32(a: *mut i32, b: int32x4x4_t) { + static_assert_uimm_bits!(LANE, 2); + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4lane.v4i32.p0" + )] + fn _vst4q_lane_s32( + a: int32x4_t, + b: int32x4_t, + c: int32x4_t, + d: int32x4_t, + n: i64, + ptr: *mut i8, + ); + } + _vst4q_lane_s32(b.0, b.1, b.2, b.3, LANE as i64, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4_lane_u8(a: *mut u8, b: uint8x8x4_t) { + static_assert_uimm_bits!(LANE, 3); + vst4_lane_s8::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + 
assert_instr(st4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4_lane_u16(a: *mut u16, b: uint16x4x4_t) { + static_assert_uimm_bits!(LANE, 2); + vst4_lane_s16::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4q_lane_u16(a: *mut u16, b: uint16x8x4_t) { + static_assert_uimm_bits!(LANE, 3); + vst4q_lane_s16::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4_lane_u32(a: *mut u32, b: uint32x2x4_t) { + static_assert_uimm_bits!(LANE, 1); + vst4_lane_s32::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4q_lane_u32(a: *mut u32, b: uint32x4x4_t) { + static_assert_uimm_bits!(LANE, 2); + vst4q_lane_s32::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] 
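// Editor's note (a sketch, not part of the vendored file): the `vst4*_lane_*`
// wrappers above take the lane index as a const generic (`<const LANE: i32>`),
// range-checked with `static_assert_uimm_bits!`, so callers pick the lane with
// a turbofish. A minimal, hedged example (hypothetical values; assumes an
// AArch64 target, where these wrappers are stable):
//
//     use core::arch::aarch64::{uint16x8x4_t, vdupq_n_u16, vst4q_lane_u16};
//     let mut out = [0u16; 4];
//     unsafe {
//         let quad = uint16x8x4_t(vdupq_n_u16(1), vdupq_n_u16(2), vdupq_n_u16(3), vdupq_n_u16(4));
//         // Writes lane 3 of each of the four vectors, so `out` becomes [1, 2, 3, 4].
//         vst4q_lane_u16::<3>(out.as_mut_ptr(), quad);
//     }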
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4_lane_p8(a: *mut p8, b: poly8x8x4_t) { + static_assert_uimm_bits!(LANE, 3); + vst4_lane_s8::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4_lane_p16(a: *mut p16, b: poly16x4x4_t) { + static_assert_uimm_bits!(LANE, 2); + vst4_lane_s16::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4, LANE = 0) +)] +#[rustc_legacy_const_generics(2)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4q_lane_p16(a: *mut p16, b: poly16x8x4_t) { + static_assert_uimm_bits!(LANE, 3); + vst4q_lane_s16::(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_p64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4_p64(a: *mut p64, b: poly64x1x4_t) { + vst4_s64(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s64)"] +#[doc = "## 
Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vst4_s64(a: *mut i64, b: int64x1x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0.v1i64")] + fn _vst4_s64( + ptr: *mut i8, + a: int64x1_t, + b: int64x1_t, + c: int64x1_t, + d: int64x1_t, + size: i32, + ); + } + _vst4_s64(a as _, b.0, b.1, b.2, b.3, 8) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(not(target_arch = "arm"))] +#[stable(feature = "neon_intrinsics", since = "1.59.0")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vst4_s64(a: *mut i64, b: int64x1x4_t) { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.st4.v1i64.p0" + )] + fn _vst4_s64(a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t, ptr: *mut i8); + } + _vst4_s64(b.0, b.1, b.2, b.3, a as _) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_u64)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4_u64(a: *mut u64, b: uint64x1x4_t) { + vst4_s64(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4_u8(a: *mut u8, b: uint8x8x4_t) { + vst4_s8(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_u8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4) +)] +#[cfg_attr( + not(target_arch = "arm"), + 
stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4q_u8(a: *mut u8, b: uint8x16x4_t) { + vst4q_s8(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4_u16(a: *mut u16, b: uint16x4x4_t) { + vst4_s16(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_u16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4q_u16(a: *mut u16, b: uint16x8x4_t) { + vst4q_s16(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4_u32(a: *mut u32, b: uint32x2x4_t) { + vst4_s32(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_u32)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4q_u32(a: *mut u32, b: uint32x4x4_t) { + 
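    // Editor's note (a sketch, not part of the vendored file): the unsigned and
    // polynomial `vst3*`/`vst4*` wrappers in this block have no LLVM bindings of
    // their own; because e.g. `uint32x4x4_t` and `int32x4x4_t` share a layout,
    // they `transmute` the pointer and the vector tuple and defer to the signed
    // version, as the call below does. A hedged usage example (hypothetical
    // buffer names; assumes an AArch64 target):
    //
    //     use core::arch::aarch64::{vld4q_u32, vst4q_u32};
    //     let src: [u32; 16] = core::array::from_fn(|i| i as u32);
    //     let mut dst = [0u32; 16];
    //     unsafe { vst4q_u32(dst.as_mut_ptr(), vld4q_u32(src.as_ptr())) };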
vst4q_s32(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4_p8(a: *mut p8, b: poly8x8x4_t) { + vst4_s8(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_p8)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4q_p8(a: *mut p8, b: poly8x16x4_t) { + vst4q_s8(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4_p16(a: *mut p16, b: poly16x4x4_t) { + vst4_s16(transmute(a), transmute(b)) +} +#[doc = "Store multiple 4-element structures from four registers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_p16)"] +#[doc = "## Safety"] +#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(st4) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vst4q_p16(a: *mut p16, b: poly16x8x4_t) { + vst4q_s16(transmute(a), transmute(b)) +} +#[doc = "Store SIMD&FP register (immediate offset)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vstrq_p128)"] +#[doc = "## Safety"] 
+#[doc = " * Neon instrinsic unsafe"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(nop) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub unsafe fn vstrq_p128(a: *mut p128, b: p128) { + *a = b +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsub_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fsub) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vsub_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.f16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fsub) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vsubq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsub_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsub_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.f32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(fsub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsub_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))] 
+#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsub_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsub_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsub_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsub_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsub_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubq_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsub_u64)"] +#[inline] +#[target_feature(enable = 
"neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsub_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubq_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsub_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsub_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sub) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { simd_sub(a, b) } +} +#[doc = "Subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_high_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(subhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t { + let d: int8x8_t = vsubhn_s16(b, c); + unsafe { simd_shuffle!(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } +} +#[doc = "Subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_high_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(subhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t { + let d: int16x4_t = vsubhn_s32(b, c); + unsafe { simd_shuffle!(a, d, [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_high_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(subhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t { + let d: int32x2_t = vsubhn_s64(b, c); + unsafe { simd_shuffle!(a, d, [0, 1, 2, 3]) } +} +#[doc = "Subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_high_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), 
assert_instr(vsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(subhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t { + let d: uint8x8_t = vsubhn_u16(b, c); + unsafe { simd_shuffle!(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } +} +#[doc = "Subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_high_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(subhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t { + let d: uint16x4_t = vsubhn_u32(b, c); + unsafe { simd_shuffle!(a, d, [0, 1, 2, 3, 4, 5, 6, 7]) } +} +#[doc = "Subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_high_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(subhn2) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t { + let d: uint32x2_t = vsubhn_u64(b, c); + unsafe { simd_shuffle!(a, d, [0, 1, 2, 3]) } +} +#[doc = "Subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(subhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t { + let c: i16x8 = i16x8::new(8, 8, 8, 8, 8, 8, 8, 8); + unsafe { simd_cast(simd_shr(simd_sub(a, b), transmute(c))) } +} +#[doc = "Subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(subhn) +)] +#[cfg_attr( + not(target_arch = 
"arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t { + let c: i32x4 = i32x4::new(16, 16, 16, 16); + unsafe { simd_cast(simd_shr(simd_sub(a, b), transmute(c))) } +} +#[doc = "Subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_s64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(subhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t { + let c: i64x2 = i64x2::new(32, 32); + unsafe { simd_cast(simd_shr(simd_sub(a, b), transmute(c))) } +} +#[doc = "Subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(subhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t { + let c: u16x8 = u16x8::new(8, 8, 8, 8, 8, 8, 8, 8); + unsafe { simd_cast(simd_shr(simd_sub(a, b), transmute(c))) } +} +#[doc = "Subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(subhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t { + let c: u32x4 = u32x4::new(16, 16, 16, 16); + unsafe { simd_cast(simd_shr(simd_sub(a, b), transmute(c))) } +} +#[doc = "Subtract returning high narrow"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_u64)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(subhn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t 
{ + let c: u64x2 = u64x2::new(32, 32); + unsafe { simd_cast(simd_shr(simd_sub(a, b), transmute(c))) } +} +#[doc = "Signed Subtract Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubl_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ssubl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { + unsafe { + let c: int16x8_t = simd_cast(a); + let d: int16x8_t = simd_cast(b); + simd_sub(c, d) + } +} +#[doc = "Signed Subtract Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubl_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ssubl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + unsafe { + let c: int32x4_t = simd_cast(a); + let d: int32x4_t = simd_cast(b); + simd_sub(c, d) + } +} +#[doc = "Signed Subtract Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubl_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ssubl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + unsafe { + let c: int64x2_t = simd_cast(a); + let d: int64x2_t = simd_cast(b); + simd_sub(c, d) + } +} +#[doc = "Unsigned Subtract Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubl_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usubl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + unsafe { + let c: uint16x8_t = simd_cast(a); + let d: uint16x8_t = simd_cast(b); + simd_sub(c, d) + } +} +#[doc = "Unsigned Subtract Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubl_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch 
= "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usubl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + unsafe { + let c: uint32x4_t = simd_cast(a); + let d: uint32x4_t = simd_cast(b); + simd_sub(c, d) + } +} +#[doc = "Unsigned Subtract Long"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubl_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usubl) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + unsafe { + let c: uint64x2_t = simd_cast(a); + let d: uint64x2_t = simd_cast(b); + simd_sub(c, d) + } +} +#[doc = "Signed Subtract Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ssubw) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubw_s8(a: int16x8_t, b: int8x8_t) -> int16x8_t { + unsafe { simd_sub(a, simd_cast(b)) } +} +#[doc = "Signed Subtract Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ssubw) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t { + unsafe { simd_sub(a, simd_cast(b)) } +} +#[doc = "Signed Subtract Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(ssubw) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn 
vsubw_s32(a: int64x2_t, b: int32x2_t) -> int64x2_t { + unsafe { simd_sub(a, simd_cast(b)) } +} +#[doc = "Unsigned Subtract Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usubw) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubw_u8(a: uint16x8_t, b: uint8x8_t) -> uint16x8_t { + unsafe { simd_sub(a, simd_cast(b)) } +} +#[doc = "Unsigned Subtract Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usubw) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubw_u16(a: uint32x4_t, b: uint16x4_t) -> uint32x4_t { + unsafe { simd_sub(a, simd_cast(b)) } +} +#[doc = "Unsigned Subtract Wide"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usubw) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsubw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t { + unsafe { simd_sub(a, simd_cast(b)) } +} +#[doc = "Dot product index form with signed and unsigned integers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudot_lane_s32)"] +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsudot, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sudot, LANE = 0) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_i8mm", issue = "117223") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsudot_lane_s32(a: int32x2_t, b: int8x8_t, c: uint8x8_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: uint32x2_t = transmute(c); + let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vusdot_s32(a, transmute(c), b) + } +} +#[doc = "Dot product index form with signed and unsigned integers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudotq_lane_s32)"] 
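(Editorial aside, not part of the vendored file.) For the three subtract variants defined just above: `vsubl_*` sign- or zero-extends both operands to double width before subtracting, `vsubw_*` extends only the second operand, and `vsubhn_*` goes the other way, keeping the top half of a double-width difference. A small sketch of the widening behaviour, again via the AArch64 re-exports; `demo_widening_sub` is illustrative only:

```rust
#[cfg(target_arch = "aarch64")]
fn demo_widening_sub() {
    use core::arch::aarch64::*;
    unsafe {
        // -100 - 100 = -200 does not fit in i8, but vsubl_s8 widens to i16
        // before subtracting, so no wrap-around occurs.
        let a = vdup_n_s8(-100);
        let b = vdup_n_s8(100);
        let wide: int16x8_t = vsubl_s8(a, b);
        assert_eq!(vgetq_lane_s16::<0>(wide), -200);

        // vsubw_s8 subtracts a widened i8 vector from an existing i16 vector.
        let acc = vdupq_n_s16(1000);
        let narrow = vdup_n_s8(10);
        assert_eq!(vgetq_lane_s16::<0>(vsubw_s8(acc, narrow)), 990);
    }
}
```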
+#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsudot, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(sudot, LANE = 0) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_i8mm", issue = "117223") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vsudotq_lane_s32(a: int32x4_t, b: int8x16_t, c: uint8x8_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: uint32x2_t = transmute(c); + let c: uint32x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vusdotq_s32(a, transmute(c), b) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl1)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +fn vtbl1(a: int8x8_t, b: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vtbl1")] + fn _vtbl1(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } + unsafe { _vtbl1(a, b) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl1_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + vtbl1(a, b) +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl1_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { transmute(vtbl1(transmute(a), transmute(b))) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl1_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vtbl1(transmute(a), transmute(b))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl1_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] 
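(Editorial aside, not part of the vendored file.) One detail of the two dot-product items above that is easy to miss in the flattened diff: the lane index is a const generic (upstream the signatures read `pub fn vsudot_lane_s32<const LANE: i32>(…)`), re-exposed as a trailing positional argument by `#[rustc_legacy_const_generics(3)]` and bounds-checked at compile time by `static_assert_uimm_bits!(LANE, 1)`. A nightly-only call-site sketch, assuming the unstable `stdarch_neon_i8mm` feature and an `i8mm`-capable CPU; the `sudot_acc` wrapper is hypothetical:

```rust
#![feature(stdarch_neon_i8mm)]
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;

// Accumulate dot products of signed `b` against lane 1 of `c` into `a`.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon,i8mm")]
unsafe fn sudot_acc(a: int32x2_t, b: int8x8_t, c: uint8x8_t) -> int32x2_t {
    // LANE is a const generic; values outside 0..=1 fail to compile.
    vsudot_lane_s32::<1>(a, b, c)
}
```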
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl1_p8(a: poly8x8_t, b: uint8x8_t) -> poly8x8_t { + unsafe { transmute(vtbl1(transmute(a), transmute(b))) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl1_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl1_p8(a: poly8x8_t, b: uint8x8_t) -> poly8x8_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vtbl1(transmute(a), transmute(b))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl2)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +fn vtbl2(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vtbl2")] + fn _vtbl2(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t; + } + unsafe { _vtbl2(a, b, c) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl2_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl2_s8(a: int8x8x2_t, b: int8x8_t) -> int8x8_t { + vtbl2(a.0, a.1, b) +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl2_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl2_u8(a: uint8x8x2_t, b: uint8x8_t) -> uint8x8_t { + unsafe { transmute(vtbl2(transmute(a.0), transmute(a.1), transmute(b))) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl2_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl2_u8(a: uint8x8x2_t, b: uint8x8_t) -> uint8x8_t { + let mut a: uint8x8x2_t = a; + a.0 = unsafe { simd_shuffle!(a.0, a.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.1 = unsafe { simd_shuffle!(a.1, a.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = 
transmute(vtbl2(transmute(a.0), transmute(a.1), transmute(b))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl2_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl2_p8(a: poly8x8x2_t, b: uint8x8_t) -> poly8x8_t { + unsafe { transmute(vtbl2(transmute(a.0), transmute(a.1), transmute(b))) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl2_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl2_p8(a: poly8x8x2_t, b: uint8x8_t) -> poly8x8_t { + let mut a: poly8x8x2_t = a; + a.0 = unsafe { simd_shuffle!(a.0, a.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.1 = unsafe { simd_shuffle!(a.1, a.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vtbl2(transmute(a.0), transmute(a.1), transmute(b))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl3)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +fn vtbl3(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vtbl3")] + fn _vtbl3(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) -> int8x8_t; + } + unsafe { _vtbl3(a, b, c, d) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl3_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl3_s8(a: int8x8x3_t, b: int8x8_t) -> int8x8_t { + vtbl3(a.0, a.1, a.2, b) +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl3_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl3_u8(a: uint8x8x3_t, b: uint8x8_t) -> uint8x8_t { + unsafe { + transmute(vtbl3( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(b), + )) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl3_u8)"] +#[inline] +#[cfg(target_endian = "big")] 
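(Editorial aside, not part of the vendored file.) The `vtbl*` family implements byte-wise table lookups: each index byte in the last operand selects a byte from the 8/16/24/32-byte table, and any index past the end of the table yields 0. The big-endian variants above only add `simd_shuffle!` reversals so that the underlying LLVM intrinsic sees lanes in the expected order. A usage sketch via the AArch64 re-exports; `demo_vtbl` is illustrative only:

```rust
#[cfg(target_arch = "aarch64")]
fn demo_vtbl() {
    use core::arch::aarch64::*;
    unsafe {
        let table: [i8; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
        let idx: [i8; 8] = [0, 7, 8, 3, 3, 3, 3, 3];
        let t = vld1_s8(table.as_ptr());
        let i = vld1_s8(idx.as_ptr());
        let r = vtbl1_s8(t, i);
        assert_eq!(vget_lane_s8::<0>(r), 10); // index 0 -> table[0]
        assert_eq!(vget_lane_s8::<1>(r), 17); // index 7 -> table[7]
        assert_eq!(vget_lane_s8::<2>(r), 0);  // index 8 is out of range -> 0
    }
}
```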
+#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl3_u8(a: uint8x8x3_t, b: uint8x8_t) -> uint8x8_t { + let mut a: uint8x8x3_t = a; + a.0 = unsafe { simd_shuffle!(a.0, a.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.1 = unsafe { simd_shuffle!(a.1, a.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.2 = unsafe { simd_shuffle!(a.2, a.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vtbl3( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(b), + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl3_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl3_p8(a: poly8x8x3_t, b: uint8x8_t) -> poly8x8_t { + unsafe { + transmute(vtbl3( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(b), + )) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl3_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl3_p8(a: poly8x8x3_t, b: uint8x8_t) -> poly8x8_t { + let mut a: poly8x8x3_t = a; + a.0 = unsafe { simd_shuffle!(a.0, a.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.1 = unsafe { simd_shuffle!(a.1, a.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.2 = unsafe { simd_shuffle!(a.2, a.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vtbl3( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(b), + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl4)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +fn vtbl4(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, e: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vtbl4")] + fn _vtbl4(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, e: int8x8_t) -> int8x8_t; + } + unsafe { _vtbl4(a, b, c, d, e) } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl4_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl4_s8(a: int8x8x4_t, b: 
int8x8_t) -> int8x8_t { + vtbl4(a.0, a.1, a.2, a.3, b) +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl4_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl4_u8(a: uint8x8x4_t, b: uint8x8_t) -> uint8x8_t { + unsafe { + transmute(vtbl4( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + transmute(b), + )) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl4_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl4_u8(a: uint8x8x4_t, b: uint8x8_t) -> uint8x8_t { + let mut a: uint8x8x4_t = a; + a.0 = unsafe { simd_shuffle!(a.0, a.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.1 = unsafe { simd_shuffle!(a.1, a.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.2 = unsafe { simd_shuffle!(a.2, a.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.3 = unsafe { simd_shuffle!(a.3, a.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vtbl4( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + transmute(b), + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl4_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl4_p8(a: poly8x8x4_t, b: uint8x8_t) -> poly8x8_t { + unsafe { + transmute(vtbl4( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + transmute(b), + )) + } +} +#[doc = "Table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl4_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon")] +#[cfg(target_arch = "arm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbl))] +pub fn vtbl4_p8(a: poly8x8x4_t, b: uint8x8_t) -> poly8x8_t { + let mut a: poly8x8x4_t = a; + a.0 = unsafe { simd_shuffle!(a.0, a.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.1 = unsafe { simd_shuffle!(a.1, a.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.2 = unsafe { simd_shuffle!(a.2, a.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + a.3 = unsafe { simd_shuffle!(a.3, a.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vtbl4( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + transmute(b), + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx1)"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +fn vtbx1(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vtbx1")] + fn _vtbx1(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t; + } + unsafe { _vtbx1(a, b, c) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx1_s8)"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx1_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + vtbx1(a, b, c) +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx1_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx1_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + unsafe { transmute(vtbx1(transmute(a), transmute(b), transmute(c))) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx1_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx1_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: uint8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vtbx1(transmute(a), transmute(b), transmute(c))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx1_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx1_p8(a: poly8x8_t, b: poly8x8_t, c: uint8x8_t) -> poly8x8_t { + unsafe { transmute(vtbx1(transmute(a), transmute(b), transmute(c))) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx1_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx1_p8(a: poly8x8_t, b: poly8x8_t, c: uint8x8_t) -> poly8x8_t { + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let b: poly8x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let 
ret_val: poly8x8_t = transmute(vtbx1(transmute(a), transmute(b), transmute(c))); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx2)"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +fn vtbx2(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vtbx2")] + fn _vtbx2(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) -> int8x8_t; + } + unsafe { _vtbx2(a, b, c, d) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx2_s8)"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx2_s8(a: int8x8_t, b: int8x8x2_t, c: int8x8_t) -> int8x8_t { + vtbx2(a, b.0, b.1, c) +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx2_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx2_u8(a: uint8x8_t, b: uint8x8x2_t, c: uint8x8_t) -> uint8x8_t { + unsafe { + transmute(vtbx2( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(c), + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx2_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx2_u8(a: uint8x8_t, b: uint8x8x2_t, c: uint8x8_t) -> uint8x8_t { + let mut b: uint8x8x2_t = b; + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { simd_shuffle!(b.0, b.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.1 = unsafe { simd_shuffle!(b.1, b.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vtbx2( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(c), + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx2_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx2_p8(a: poly8x8_t, b: poly8x8x2_t, c: uint8x8_t) -> poly8x8_t { + unsafe { + transmute(vtbx2( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(c), + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx2_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] 
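(Editorial aside, not part of the vendored file.) `vtbx*` differs from `vtbl*` only in how out-of-range indices are handled: instead of writing 0, the corresponding byte of the first operand `a` is kept, which makes it convenient for chaining lookups across tables larger than 32 bytes. A sketch of that behaviour, again through the AArch64 re-exports; `demo_vtbx` is illustrative only:

```rust
#[cfg(target_arch = "aarch64")]
fn demo_vtbx() {
    use core::arch::aarch64::*;
    unsafe {
        let fallback: [i8; 8] = [-1, -2, -3, -4, -5, -6, -7, -8];
        let table: [i8; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
        let idx: [i8; 8] = [0, 42, 7, 0, 0, 0, 0, 0];
        let a = vld1_s8(fallback.as_ptr());
        let t = vld1_s8(table.as_ptr());
        let i = vld1_s8(idx.as_ptr());
        let r = vtbx1_s8(a, t, i);
        assert_eq!(vget_lane_s8::<0>(r), 10); // in range: byte from the table
        assert_eq!(vget_lane_s8::<1>(r), -2); // out of range: byte from `a`
        assert_eq!(vget_lane_s8::<2>(r), 17);
    }
}
```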
+#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx2_p8(a: poly8x8_t, b: poly8x8x2_t, c: uint8x8_t) -> poly8x8_t { + let mut b: poly8x8x2_t = b; + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { simd_shuffle!(b.0, b.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.1 = unsafe { simd_shuffle!(b.1, b.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vtbx2( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(c), + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx3)"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +fn vtbx3(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, e: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vtbx3")] + fn _vtbx3(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, e: int8x8_t) -> int8x8_t; + } + unsafe { _vtbx3(a, b, c, d, e) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx3_s8)"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx3_s8(a: int8x8_t, b: int8x8x3_t, c: int8x8_t) -> int8x8_t { + vtbx3(a, b.0, b.1, b.2, c) +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx3_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx3_u8(a: uint8x8_t, b: uint8x8x3_t, c: uint8x8_t) -> uint8x8_t { + unsafe { + transmute(vtbx3( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(c), + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx3_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx3_u8(a: uint8x8_t, b: uint8x8x3_t, c: uint8x8_t) -> uint8x8_t { + let mut b: uint8x8x3_t = b; + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { simd_shuffle!(b.0, b.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.1 = unsafe { simd_shuffle!(b.1, b.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.2 = unsafe { simd_shuffle!(b.2, b.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vtbx3( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(c), + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx3_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx3_p8(a: poly8x8_t, b: poly8x8x3_t, c: uint8x8_t) -> poly8x8_t { + unsafe { + transmute(vtbx3( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(c), + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx3_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx3_p8(a: poly8x8_t, b: poly8x8x3_t, c: uint8x8_t) -> poly8x8_t { + let mut b: poly8x8x3_t = b; + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { simd_shuffle!(b.0, b.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.1 = unsafe { simd_shuffle!(b.1, b.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.2 = unsafe { simd_shuffle!(b.2, b.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vtbx3( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(c), + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx4)"] +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +fn vtbx4(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, e: int8x8_t, f: int8x8_t) -> int8x8_t { + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vtbx4")] + fn _vtbx4( + a: int8x8_t, + b: int8x8_t, + c: int8x8_t, + d: int8x8_t, + e: int8x8_t, + f: int8x8_t, + ) -> int8x8_t; + } + unsafe { _vtbx4(a, b, c, d, e, f) } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx4_s8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx4_s8(a: int8x8_t, b: int8x8x4_t, c: int8x8_t) -> int8x8_t { + unsafe { + vtbx4( + a, + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + c, + ) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx4_s8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx4_s8(a: int8x8_t, b: int8x8x4_t, c: int8x8_t) -> int8x8_t { + let mut b: int8x8x4_t = b; + let a: int8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { simd_shuffle!(b.0, b.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.1 = unsafe { simd_shuffle!(b.1, b.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.2 = unsafe { 
simd_shuffle!(b.2, b.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.3 = unsafe { simd_shuffle!(b.3, b.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let c: int8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: int8x8_t = vtbx4( + a, + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + c, + ); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx4_u8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx4_u8(a: uint8x8_t, b: uint8x8x4_t, c: uint8x8_t) -> uint8x8_t { + unsafe { + transmute(vtbx4( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + transmute(c), + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx4_u8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx4_u8(a: uint8x8_t, b: uint8x8x4_t, c: uint8x8_t) -> uint8x8_t { + let mut b: uint8x8x4_t = b; + let a: uint8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { simd_shuffle!(b.0, b.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.1 = unsafe { simd_shuffle!(b.1, b.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.2 = unsafe { simd_shuffle!(b.2, b.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.3 = unsafe { simd_shuffle!(b.3, b.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: uint8x8_t = transmute(vtbx4( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + transmute(c), + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx4_p8)"] +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx4_p8(a: poly8x8_t, b: poly8x8x4_t, c: uint8x8_t) -> poly8x8_t { + unsafe { + transmute(vtbx4( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + transmute(c), + )) + } +} +#[doc = "Extended table look-up"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbx4_p8)"] +#[inline] +#[cfg(target_endian = "big")] +#[target_feature(enable = "neon,v7")] +#[cfg(target_arch = "arm")] +#[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +#[cfg_attr(test, assert_instr(vtbx))] +pub fn vtbx4_p8(a: poly8x8_t, b: poly8x8x4_t, c: uint8x8_t) -> poly8x8_t { + let mut b: poly8x8x4_t = b; + let a: poly8x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.0 = unsafe { simd_shuffle!(b.0, b.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.1 = unsafe { simd_shuffle!(b.1, b.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.2 = unsafe { simd_shuffle!(b.2, b.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; + b.3 = unsafe { simd_shuffle!(b.3, b.3, [7, 6, 5, 4, 3, 2, 1, 
0]) }; + let c: uint8x8_t = unsafe { simd_shuffle!(c, c, [7, 6, 5, 4, 3, 2, 1, 0]) }; + unsafe { + let ret_val: poly8x8_t = transmute(vtbx4( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + transmute(c), + )); + simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0]) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vtrn_f16(a: float16x4_t, b: float16x4_t) -> float16x4x2_t { + unsafe { + let a1: float16x4_t = simd_shuffle!(a, b, [0, 4, 2, 6]); + let b1: float16x4_t = simd_shuffle!(a, b, [1, 5, 3, 7]); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrnq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vtrnq_f16(a: float16x8_t, b: float16x8_t) -> float16x8x2_t { + unsafe { + let a1: float16x8_t = simd_shuffle!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]); + let b1: float16x8_t = simd_shuffle!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrn_f32(a: float32x2_t, b: float32x2_t) -> float32x2x2_t { + unsafe { + let a1: float32x2_t = simd_shuffle!(a, b, [0, 2]); + let b1: float32x2_t = simd_shuffle!(a, b, [1, 3]); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrn_s32(a: int32x2_t, b: int32x2_t) -> int32x2x2_t { + unsafe { + let a1: int32x2_t = simd_shuffle!(a, b, [0, 2]); + let b1: int32x2_t = simd_shuffle!(a, b, [1, 3]); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrn_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2x2_t { + unsafe { + let a1: uint32x2_t = simd_shuffle!(a, b, [0, 2]); + let b1: uint32x2_t = simd_shuffle!(a, b, [1, 3]); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrnq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrnq_f32(a: float32x4_t, b: float32x4_t) -> float32x4x2_t { + unsafe { + let a1: float32x4_t = simd_shuffle!(a, b, [0, 4, 2, 6]); + let b1: float32x4_t = simd_shuffle!(a, b, [1, 5, 3, 7]); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrn_s8(a: int8x8_t, b: int8x8_t) -> int8x8x2_t { + unsafe { + let a1: int8x8_t = simd_shuffle!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]); + let b1: int8x8_t = simd_shuffle!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrnq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrnq_s8(a: int8x16_t, b: int8x16_t) -> int8x16x2_t { + unsafe { + let a1: int8x16_t = simd_shuffle!( + a, + b, + [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30] + ); + let b1: int8x16_t = simd_shuffle!( + a, + b, + [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31] + ); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrn_s16(a: int16x4_t, b: int16x4_t) -> int16x4x2_t { + unsafe { + let a1: int16x4_t = simd_shuffle!(a, b, [0, 4, 2, 6]); + let b1: int16x4_t = simd_shuffle!(a, b, [1, 5, 3, 7]); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrnq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrnq_s16(a: int16x8_t, b: int16x8_t) -> int16x8x2_t { + unsafe { + let a1: int16x8_t = simd_shuffle!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]); + let b1: int16x8_t = simd_shuffle!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrnq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrnq_s32(a: int32x4_t, b: int32x4_t) -> int32x4x2_t { + unsafe { + let a1: int32x4_t = simd_shuffle!(a, b, [0, 4, 2, 6]); + let b1: int32x4_t = simd_shuffle!(a, b, [1, 5, 3, 7]); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrn_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8x2_t { + unsafe { + let a1: uint8x8_t = simd_shuffle!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]); + let b1: uint8x8_t = simd_shuffle!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrnq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrnq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16x2_t { + unsafe { + let a1: uint8x16_t = simd_shuffle!( + a, + b, + [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30] + ); + let b1: uint8x16_t = simd_shuffle!( + a, + b, + [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31] + ); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrn_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4x2_t { + unsafe { + let a1: uint16x4_t = simd_shuffle!(a, b, [0, 4, 2, 6]); + let b1: uint16x4_t = simd_shuffle!(a, b, [1, 5, 3, 7]); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrnq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrnq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8x2_t { + unsafe { + let a1: uint16x8_t = simd_shuffle!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]); + let b1: uint16x8_t = simd_shuffle!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrnq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrnq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4x2_t { + unsafe { + let a1: uint32x4_t = simd_shuffle!(a, b, [0, 4, 2, 6]); + let b1: uint32x4_t = simd_shuffle!(a, b, [1, 5, 3, 7]); + transmute((a1, b1)) + } +} +#[doc = "Transpose 
elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrn_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8x2_t { + unsafe { + let a1: poly8x8_t = simd_shuffle!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]); + let b1: poly8x8_t = simd_shuffle!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrnq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrnq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16x2_t { + unsafe { + let a1: poly8x16_t = simd_shuffle!( + a, + b, + [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30] + ); + let b1: poly8x16_t = simd_shuffle!( + a, + b, + [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31] + ); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrn_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4x2_t { + unsafe { + let a1: poly16x4_t = simd_shuffle!(a, b, [0, 4, 2, 6]); + let b1: poly16x4_t = simd_shuffle!(a, b, [1, 5, 3, 7]); + transmute((a1, b1)) + } +} +#[doc = "Transpose elements"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrnq_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(trn) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtrnq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8x2_t { + unsafe { + let a1: poly16x8_t = simd_shuffle!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]); + let b1: poly16x8_t = simd_shuffle!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]); + 
transmute((a1, b1)) + } +} +#[doc = "Signed compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtst_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtst_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + unsafe { + let c: int8x8_t = simd_and(a, b); + let d: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Signed compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtstq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtstq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + unsafe { + let c: int8x16_t = simd_and(a, b); + let d: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Signed compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtst_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtst_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t { + unsafe { + let c: int16x4_t = simd_and(a, b); + let d: i16x4 = i16x4::new(0, 0, 0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Signed compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtstq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtstq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t { + unsafe { + let c: int16x8_t = simd_and(a, b); + let d: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Signed compare bitwise Test bits nonzero"] +#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtst_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtst_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t { + unsafe { + let c: int32x2_t = simd_and(a, b); + let d: i32x2 = i32x2::new(0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Signed compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtstq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtstq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t { + unsafe { + let c: int32x4_t = simd_and(a, b); + let d: i32x4 = i32x4::new(0, 0, 0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Signed compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtst_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtst_p8(a: poly8x8_t, b: poly8x8_t) -> uint8x8_t { + unsafe { + let c: poly8x8_t = simd_and(a, b); + let d: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Signed compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtstq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtstq_p8(a: poly8x16_t, b: poly8x16_t) -> uint8x16_t { + unsafe { + let c: poly8x16_t = simd_and(a, b); + let d: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Signed compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtst_p16)"] +#[inline] +#[target_feature(enable = 
"neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtst_p16(a: poly16x4_t, b: poly16x4_t) -> uint16x4_t { + unsafe { + let c: poly16x4_t = simd_and(a, b); + let d: i16x4 = i16x4::new(0, 0, 0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Signed compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtstq_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtstq_p16(a: poly16x8_t, b: poly16x8_t) -> uint16x8_t { + unsafe { + let c: poly16x8_t = simd_and(a, b); + let d: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Unsigned compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtst_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtst_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + unsafe { + let c: uint8x8_t = simd_and(a, b); + let d: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Unsigned compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtstq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtstq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { + let c: uint8x16_t = simd_and(a, b); + let d: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Unsigned compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtst_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtst_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + unsafe { + let c: uint16x4_t = simd_and(a, b); + let d: u16x4 = u16x4::new(0, 0, 0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Unsigned compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtstq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtstq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + unsafe { + let c: uint16x8_t = simd_and(a, b); + let d: u16x8 = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Unsigned compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtst_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtst_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + unsafe { + let c: uint32x2_t = simd_and(a, b); + let d: u32x2 = u32x2::new(0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Unsigned compare bitwise Test bits nonzero"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtstq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(cmtst) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vtstq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + unsafe { + let c: uint32x4_t = simd_and(a, b); + let d: u32x4 = u32x4::new(0, 0, 0, 0); + simd_ne(c, transmute(d)) + } +} +#[doc = "Dot product index form with unsigned and signed integers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_lane_s32)"] +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = 
"arm64ec")), + assert_instr(usdot, LANE = 0) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_i8mm", issue = "117223") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vusdot_lane_s32<const LANE: i32>(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: int32x2_t = transmute(c); + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vusdot_s32(a, b, transmute(c)) + } +} +#[doc = "Dot product index form with unsigned and signed integers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_lane_s32)"] +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot, LANE = 0))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usdot, LANE = 0) +)] +#[rustc_legacy_const_generics(3)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_i8mm", issue = "117223") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vusdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: uint8x16_t, c: int8x8_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 1); + unsafe { + let c: int32x2_t = transmute(c); + let c: int32x4_t = + simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vusdotq_s32(a, b, transmute(c)) + } +} +#[doc = "Dot product vector form with unsigned and signed integers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_s32)"] +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usdot) +)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_i8mm", issue = "117223") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vusdot_s32(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.usdot.v2i32.v8i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.usdot.v2i32.v8i8")] + fn _vusdot_s32(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t; + } + unsafe { _vusdot_s32(a, b, c) } +} +#[doc = "Dot product vector form with unsigned and signed integers"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_s32)"] +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usdot) +)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_i8mm", issue = "117223") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vusdotq_s32(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t { + unsafe extern "unadjusted" { + 
#[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.usdot.v4i32.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.usdot.v4i32.v16i8")] + fn _vusdotq_s32(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t; + } + unsafe { _vusdotq_s32(a, b, c) } +} +#[doc = "Unsigned and signed 8-bit integer matrix multiply-accumulate"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusmmlaq_s32)"] +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(usmmla) +)] +#[cfg_attr( + not(target_arch = "arm"), + unstable(feature = "stdarch_neon_i8mm", issue = "117223") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vusmmlaq_s32(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t { + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.usmmla.v4i32.v16i8" + )] + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.usmmla.v4i32.v16i8")] + fn _vusmmlaq_s32(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t; + } + unsafe { _vusmmlaq_s32(a, b, c) } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vuzp_f16(a: float16x4_t, b: float16x4_t) -> float16x4x2_t { + unsafe { + let a0: float16x4_t = simd_shuffle!(a, b, [0, 2, 4, 6]); + let b0: float16x4_t = simd_shuffle!(a, b, [1, 3, 5, 7]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzpq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vuzpq_f16(a: float16x8_t, b: float16x8_t) -> float16x8x2_t { + unsafe { + let a0: float16x8_t = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]); + let b0: float16x8_t = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn 
vuzp_f32(a: float32x2_t, b: float32x2_t) -> float32x2x2_t { + unsafe { + let a0: float32x2_t = simd_shuffle!(a, b, [0, 2]); + let b0: float32x2_t = simd_shuffle!(a, b, [1, 3]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzp_s32(a: int32x2_t, b: int32x2_t) -> int32x2x2_t { + unsafe { + let a0: int32x2_t = simd_shuffle!(a, b, [0, 2]); + let b0: int32x2_t = simd_shuffle!(a, b, [1, 3]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzp_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2x2_t { + unsafe { + let a0: uint32x2_t = simd_shuffle!(a, b, [0, 2]); + let b0: uint32x2_t = simd_shuffle!(a, b, [1, 3]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzpq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzpq_f32(a: float32x4_t, b: float32x4_t) -> float32x4x2_t { + unsafe { + let a0: float32x4_t = simd_shuffle!(a, b, [0, 2, 4, 6]); + let b0: float32x4_t = simd_shuffle!(a, b, [1, 3, 5, 7]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzp_s8(a: int8x8_t, b: int8x8_t) -> int8x8x2_t { + unsafe { + let a0: int8x8_t = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]); + let b0: int8x8_t = simd_shuffle!(a, b, [1, 
3, 5, 7, 9, 11, 13, 15]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzpq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzpq_s8(a: int8x16_t, b: int8x16_t) -> int8x16x2_t { + unsafe { + let a0: int8x16_t = simd_shuffle!( + a, + b, + [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] + ); + let b0: int8x16_t = simd_shuffle!( + a, + b, + [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31] + ); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzp_s16(a: int16x4_t, b: int16x4_t) -> int16x4x2_t { + unsafe { + let a0: int16x4_t = simd_shuffle!(a, b, [0, 2, 4, 6]); + let b0: int16x4_t = simd_shuffle!(a, b, [1, 3, 5, 7]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzpq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzpq_s16(a: int16x8_t, b: int16x8_t) -> int16x8x2_t { + unsafe { + let a0: int16x8_t = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]); + let b0: int16x8_t = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzpq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzpq_s32(a: int32x4_t, b: int32x4_t) -> int32x4x2_t { + unsafe { + let a0: int32x4_t = simd_shuffle!(a, b, [0, 2, 4, 6]); + let b0: int32x4_t = simd_shuffle!(a, b, [1, 3, 
5, 7]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzp_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8x2_t { + unsafe { + let a0: uint8x8_t = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]); + let b0: uint8x8_t = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzpq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzpq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16x2_t { + unsafe { + let a0: uint8x16_t = simd_shuffle!( + a, + b, + [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] + ); + let b0: uint8x16_t = simd_shuffle!( + a, + b, + [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31] + ); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzp_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4x2_t { + unsafe { + let a0: uint16x4_t = simd_shuffle!(a, b, [0, 2, 4, 6]); + let b0: uint16x4_t = simd_shuffle!(a, b, [1, 3, 5, 7]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzpq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzpq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8x2_t { + unsafe { + let a0: uint16x8_t = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]); + let b0: uint16x8_t = simd_shuffle!(a, b, 
[1, 3, 5, 7, 9, 11, 13, 15]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzpq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzpq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4x2_t { + unsafe { + let a0: uint32x4_t = simd_shuffle!(a, b, [0, 2, 4, 6]); + let b0: uint32x4_t = simd_shuffle!(a, b, [1, 3, 5, 7]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzp_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8x2_t { + unsafe { + let a0: poly8x8_t = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]); + let b0: poly8x8_t = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzpq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzpq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16x2_t { + unsafe { + let a0: poly8x16_t = simd_shuffle!( + a, + b, + [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] + ); + let b0: poly8x16_t = simd_shuffle!( + a, + b, + [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31] + ); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzp_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzp_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4x2_t { + unsafe { + let a0: poly16x4_t = simd_shuffle!(a, b, [0, 2, 4, 6]); + let b0: poly16x4_t = 
simd_shuffle!(a, b, [1, 3, 5, 7]); + transmute((a0, b0)) + } +} +#[doc = "Unzip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuzpq_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(uzp) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vuzpq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8x2_t { + unsafe { + let a0: poly16x8_t = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]); + let b0: poly16x8_t = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vzip.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vzip_f16(a: float16x4_t, b: float16x4_t) -> float16x4x2_t { + unsafe { + let a0: float16x4_t = simd_shuffle!(a, b, [0, 4, 1, 5]); + let b0: float16x4_t = simd_shuffle!(a, b, [2, 6, 3, 7]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzipq_f16)"] +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vzip.16"))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[target_feature(enable = "neon,fp16")] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub fn vzipq_f16(a: float16x8_t, b: float16x8_t) -> float16x8x2_t { + unsafe { + let a0: float16x8_t = simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]); + let b0: float16x8_t = simd_shuffle!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzip_f32(a: float32x2_t, b: float32x2_t) -> float32x2x2_t { + unsafe { + let a0: float32x2_t = simd_shuffle!(a, b, [0, 2]); + let b0: float32x2_t = simd_shuffle!(a, b, [1, 3]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), 
assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzip_s32(a: int32x2_t, b: int32x2_t) -> int32x2x2_t { + unsafe { + let a0: int32x2_t = simd_shuffle!(a, b, [0, 2]); + let b0: int32x2_t = simd_shuffle!(a, b, [1, 3]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzip_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2x2_t { + unsafe { + let a0: uint32x2_t = simd_shuffle!(a, b, [0, 2]); + let b0: uint32x2_t = simd_shuffle!(a, b, [1, 3]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vzip))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzip_s8(a: int8x8_t, b: int8x8_t) -> int8x8x2_t { + unsafe { + let a0: int8x8_t = simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]); + let b0: int8x8_t = simd_shuffle!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vzip))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzip_s16(a: int16x4_t, b: int16x4_t) -> int16x4x2_t { + unsafe { + let a0: int16x4_t = simd_shuffle!(a, b, [0, 4, 1, 5]); + let b0: int16x4_t = simd_shuffle!(a, b, [2, 6, 3, 7]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vzip))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + 
stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzip_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8x2_t { + unsafe { + let a0: uint8x8_t = simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]); + let b0: uint8x8_t = simd_shuffle!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vzip))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzip_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4x2_t { + unsafe { + let a0: uint16x4_t = simd_shuffle!(a, b, [0, 4, 1, 5]); + let b0: uint16x4_t = simd_shuffle!(a, b, [2, 6, 3, 7]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vzip))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzip_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8x2_t { + unsafe { + let a0: poly8x8_t = simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]); + let b0: poly8x8_t = simd_shuffle!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzip_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vzip))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzip_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4x2_t { + unsafe { + let a0: poly16x4_t = simd_shuffle!(a, b, [0, 4, 1, 5]); + let b0: poly16x4_t = simd_shuffle!(a, b, [2, 6, 3, 7]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzipq_f32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + 
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzipq_f32(a: float32x4_t, b: float32x4_t) -> float32x4x2_t { + unsafe { + let a0: float32x4_t = simd_shuffle!(a, b, [0, 4, 1, 5]); + let b0: float32x4_t = simd_shuffle!(a, b, [2, 6, 3, 7]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzipq_s8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzipq_s8(a: int8x16_t, b: int8x16_t) -> int8x16x2_t { + unsafe { + let a0: int8x16_t = simd_shuffle!( + a, + b, + [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23] + ); + let b0: int8x16_t = simd_shuffle!( + a, + b, + [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31] + ); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzipq_s16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzipq_s16(a: int16x8_t, b: int16x8_t) -> int16x8x2_t { + unsafe { + let a0: int16x8_t = simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]); + let b0: int16x8_t = simd_shuffle!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzipq_s32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzipq_s32(a: int32x4_t, b: int32x4_t) -> int32x4x2_t { + unsafe { + let a0: int32x4_t = simd_shuffle!(a, b, [0, 4, 1, 5]); + let b0: int32x4_t = simd_shuffle!(a, b, [2, 6, 3, 7]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzipq_u8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + 
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzipq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16x2_t { + unsafe { + let a0: uint8x16_t = simd_shuffle!( + a, + b, + [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23] + ); + let b0: uint8x16_t = simd_shuffle!( + a, + b, + [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31] + ); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzipq_u16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzipq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8x2_t { + unsafe { + let a0: uint16x8_t = simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]); + let b0: uint16x8_t = simd_shuffle!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzipq_u32)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzipq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4x2_t { + unsafe { + let a0: uint32x4_t = simd_shuffle!(a, b, [0, 4, 1, 5]); + let b0: uint32x4_t = simd_shuffle!(a, b, [2, 6, 3, 7]); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzipq_p8)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzipq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16x2_t { + unsafe { + let a0: poly8x16_t = simd_shuffle!( + a, + b, + [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23] + ); + let b0: poly8x16_t = simd_shuffle!( + a, + b, + [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31] + ); + transmute((a0, b0)) + } +} +#[doc = "Zip vectors"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vzipq_p16)"] +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] +#[cfg_attr( + all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), + assert_instr(zip) +)] +#[cfg_attr( + not(target_arch 
= "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub fn vzipq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8x2_t { + unsafe { + let a0: poly16x8_t = simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]); + let b0: poly16x8_t = simd_shuffle!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]); + transmute((a0, b0)) + } +} diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/load_tests.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/load_tests.rs new file mode 100644 index 0000000000000..bdf511ecf8819 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/load_tests.rs @@ -0,0 +1,206 @@ +//! Tests for ARM+v7+neon load (vld1) intrinsics. +//! +//! These are included in `{arm, aarch64}::neon`. + +use super::*; + +#[cfg(target_arch = "arm")] +use crate::core_arch::arm::*; + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +use crate::core_arch::aarch64::*; + +use crate::core_arch::simd::*; +use std::mem; +use stdarch_test::simd_test; +#[simd_test(enable = "neon")] +unsafe fn test_vld1_s8() { + let a: [i8; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: i8x8 = transmute(vld1_s8(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_s8() { + let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r: i8x16 = transmute(vld1q_s8(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_s16() { + let a: [i16; 5] = [0, 1, 2, 3, 4]; + let e = i16x4::new(1, 2, 3, 4); + let r: i16x4 = transmute(vld1_s16(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_s16() { + let a: [i16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: i16x8 = transmute(vld1q_s16(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_s32() { + let a: [i32; 3] = [0, 1, 2]; + let e = i32x2::new(1, 2); + let r: i32x2 = transmute(vld1_s32(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_s32() { + let a: [i32; 5] = [0, 1, 2, 3, 4]; + let e = i32x4::new(1, 2, 3, 4); + let r: i32x4 = transmute(vld1q_s32(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_s64() { + let a: [i64; 2] = [0, 1]; + let e = i64x1::new(1); + let r: i64x1 = transmute(vld1_s64(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_s64() { + let a: [i64; 3] = [0, 1, 2]; + let e = i64x2::new(1, 2); + let r: i64x2 = transmute(vld1q_s64(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_u8() { + let a: [u8; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: u8x8 = transmute(vld1_u8(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_u8() { + let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r: u8x16 = transmute(vld1q_u8(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_u16() { + let a: [u16; 5] = [0, 1, 2, 3, 4]; + let e = u16x4::new(1, 2, 3, 4); + let r: u16x4 = 
transmute(vld1_u16(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_u16() { + let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: u16x8 = transmute(vld1q_u16(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_u32() { + let a: [u32; 3] = [0, 1, 2]; + let e = u32x2::new(1, 2); + let r: u32x2 = transmute(vld1_u32(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_u32() { + let a: [u32; 5] = [0, 1, 2, 3, 4]; + let e = u32x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vld1q_u32(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_u64() { + let a: [u64; 2] = [0, 1]; + let e = u64x1::new(1); + let r: u64x1 = transmute(vld1_u64(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_u64() { + let a: [u64; 3] = [0, 1, 2]; + let e = u64x2::new(1, 2); + let r: u64x2 = transmute(vld1q_u64(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_p8() { + let a: [p8; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: u8x8 = transmute(vld1_p8(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_p8() { + let a: [p8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r: u8x16 = transmute(vld1q_p8(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_p16() { + let a: [p16; 5] = [0, 1, 2, 3, 4]; + let e = u16x4::new(1, 2, 3, 4); + let r: u16x4 = transmute(vld1_p16(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_p16() { + let a: [p16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: u16x8 = transmute(vld1q_p16(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon,aes")] +unsafe fn test_vld1_p64() { + let a: [p64; 2] = [0, 1]; + let e = u64x1::new(1); + let r: u64x1 = transmute(vld1_p64(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon,aes")] +unsafe fn test_vld1q_p64() { + let a: [p64; 3] = [0, 1, 2]; + let e = u64x2::new(1, 2); + let r: u64x2 = transmute(vld1q_p64(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_f32() { + let a: [f32; 3] = [0., 1., 2.]; + let e = f32x2::new(1., 2.); + let r: f32x2 = transmute(vld1_f32(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_f32() { + let a: [f32; 5] = [0., 1., 2., 3., 4.]; + let e = f32x4::new(1., 2., 3., 4.); + let r: f32x4 = transmute(vld1q_f32(a[1..].as_ptr())); + assert_eq!(r, e) +} diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs new file mode 100644 index 0000000000000..0683d48ed3271 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs @@ -0,0 +1,5531 @@ +//! 
ARMv7 NEON intrinsics + +#[rustfmt::skip] +mod generated; +#[rustfmt::skip] +#[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] +#[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] +pub use self::generated::*; + +use crate::{core_arch::simd::*, hint::unreachable_unchecked, intrinsics::simd::*, mem::transmute}; +#[cfg(test)] +use stdarch_test::assert_instr; + +pub(crate) trait AsUnsigned { + type Unsigned; + fn as_unsigned(self) -> Self::Unsigned; +} + +pub(crate) trait AsSigned { + type Signed; + fn as_signed(self) -> Self::Signed; +} + +macro_rules! impl_sign_conversions_neon { + ($(($signed:ty, $unsigned:ty))*) => ($( + impl AsUnsigned for $signed { + type Unsigned = $unsigned; + + #[inline(always)] + fn as_unsigned(self) -> $unsigned { + unsafe { transmute(self) } + } + } + + impl AsSigned for $unsigned { + type Signed = $signed; + + #[inline(always)] + fn as_signed(self) -> $signed { + unsafe { transmute(self) } + } + } + )*) +} + +pub(crate) type p8 = u8; +pub(crate) type p16 = u16; +pub(crate) type p64 = u64; +pub(crate) type p128 = u128; + +types! { + #![cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] + #![cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] + + /// Arm-specific 64-bit wide vector of eight packed `i8`. + pub struct int8x8_t(8 x pub(crate) i8); + /// Arm-specific 64-bit wide vector of eight packed `u8`. + pub struct uint8x8_t(8 x pub(crate) u8); + /// Arm-specific 64-bit wide polynomial vector of eight packed `p8`. + pub struct poly8x8_t(8 x pub(crate) p8); + /// Arm-specific 64-bit wide vector of four packed `i16`. + pub struct int16x4_t(4 x pub(crate) i16); + /// Arm-specific 64-bit wide vector of four packed `u16`. + pub struct uint16x4_t(4 x pub(crate) u16); + /// Arm-specific 64-bit wide vector of four packed `p16`. + pub struct poly16x4_t(4 x pub(crate) p16); + /// Arm-specific 64-bit wide vector of two packed `i32`. + pub struct int32x2_t(2 x pub(crate) i32); + /// Arm-specific 64-bit wide vector of two packed `u32`. + pub struct uint32x2_t(2 x pub(crate) u32); + /// Arm-specific 64-bit wide vector of two packed `f32`. + pub struct float32x2_t(2 x pub(crate) f32); + /// Arm-specific 64-bit wide vector of one packed `i64`. + pub struct int64x1_t(1 x pub(crate) i64); + /// Arm-specific 64-bit wide vector of one packed `u64`. + pub struct uint64x1_t(1 x pub(crate) u64); + /// Arm-specific 64-bit wide vector of one packed `p64`. + pub struct poly64x1_t(1 x pub(crate) p64); + + /// Arm-specific 128-bit wide vector of sixteen packed `i8`. + pub struct int8x16_t(16 x pub(crate) i8); + /// Arm-specific 128-bit wide vector of sixteen packed `u8`. + pub struct uint8x16_t(16 x pub(crate) u8); + /// Arm-specific 128-bit wide vector of sixteen packed `p8`. + pub struct poly8x16_t(16 x pub(crate) p8); + /// Arm-specific 128-bit wide vector of eight packed `i16`. + pub struct int16x8_t(8 x pub(crate) i16); + /// Arm-specific 128-bit wide vector of eight packed `u16`. + pub struct uint16x8_t(8 x pub(crate) u16); + /// Arm-specific 128-bit wide vector of eight packed `p16`. + pub struct poly16x8_t(8 x pub(crate) p16); + /// Arm-specific 128-bit wide vector of four packed `i32`. + pub struct int32x4_t(4 x pub(crate) i32); + /// Arm-specific 128-bit wide vector of four packed `u32`. + pub struct uint32x4_t(4 x pub(crate) u32); + /// Arm-specific 128-bit wide vector of four packed `f32`. 
+ pub struct float32x4_t(4 x pub(crate) f32); + /// Arm-specific 128-bit wide vector of two packed `i64`. + pub struct int64x2_t(2 x pub(crate) i64); + /// Arm-specific 128-bit wide vector of two packed `u64`. + pub struct uint64x2_t(2 x pub(crate) u64); + /// Arm-specific 128-bit wide vector of two packed `p64`. + pub struct poly64x2_t(2 x pub(crate) p64); +} + +types! { + #![unstable(feature = "stdarch_neon_f16", issue = "136306")] + + /// Arm-specific 64-bit wide vector of four packed `f16`. + pub struct float16x4_t(4 x pub(crate) f16); + /// Arm-specific 128-bit wide vector of eight packed `f16`. + pub struct float16x8_t(8 x pub(crate) f16); +} + +/// Arm-specific type containing two `int8x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int8x8x2_t(pub int8x8_t, pub int8x8_t); +/// Arm-specific type containing three `int8x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int8x8x3_t(pub int8x8_t, pub int8x8_t, pub int8x8_t); +/// Arm-specific type containing four `int8x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int8x8x4_t(pub int8x8_t, pub int8x8_t, pub int8x8_t, pub int8x8_t); + +/// Arm-specific type containing two `int8x16_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int8x16x2_t(pub int8x16_t, pub int8x16_t); +/// Arm-specific type containing three `int8x16_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int8x16x3_t(pub int8x16_t, pub int8x16_t, pub int8x16_t); +/// Arm-specific type containing four `int8x16_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int8x16x4_t(pub int8x16_t, pub int8x16_t, pub int8x16_t, pub int8x16_t); + +/// Arm-specific type containing two `uint8x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint8x8x2_t(pub uint8x8_t, pub uint8x8_t); +/// Arm-specific type containing three `uint8x8_t` vectors. 
+#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint8x8x3_t(pub uint8x8_t, pub uint8x8_t, pub uint8x8_t); +/// Arm-specific type containing four `uint8x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint8x8x4_t(pub uint8x8_t, pub uint8x8_t, pub uint8x8_t, pub uint8x8_t); + +/// Arm-specific type containing two `uint8x16_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint8x16x2_t(pub uint8x16_t, pub uint8x16_t); +/// Arm-specific type containing three `uint8x16_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint8x16x3_t(pub uint8x16_t, pub uint8x16_t, pub uint8x16_t); +/// Arm-specific type containing four `uint8x16_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint8x16x4_t( + pub uint8x16_t, + pub uint8x16_t, + pub uint8x16_t, + pub uint8x16_t, +); + +/// Arm-specific type containing two `poly8x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly8x8x2_t(pub poly8x8_t, pub poly8x8_t); +/// Arm-specific type containing three `poly8x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly8x8x3_t(pub poly8x8_t, pub poly8x8_t, pub poly8x8_t); +/// Arm-specific type containing four `poly8x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly8x8x4_t(pub poly8x8_t, pub poly8x8_t, pub poly8x8_t, pub poly8x8_t); + +/// Arm-specific type containing two `poly8x16_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly8x16x2_t(pub poly8x16_t, pub poly8x16_t); +/// Arm-specific type containing three `poly8x16_t` vectors. 
+#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly8x16x3_t(pub poly8x16_t, pub poly8x16_t, pub poly8x16_t); +/// Arm-specific type containing four `poly8x16_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly8x16x4_t( + pub poly8x16_t, + pub poly8x16_t, + pub poly8x16_t, + pub poly8x16_t, +); + +/// Arm-specific type containing two `int16x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int16x4x2_t(pub int16x4_t, pub int16x4_t); +/// Arm-specific type containing three `int16x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int16x4x3_t(pub int16x4_t, pub int16x4_t, pub int16x4_t); +/// Arm-specific type containing four `int16x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int16x4x4_t(pub int16x4_t, pub int16x4_t, pub int16x4_t, pub int16x4_t); + +/// Arm-specific type containing two `int16x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int16x8x2_t(pub int16x8_t, pub int16x8_t); +/// Arm-specific type containing three `int16x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int16x8x3_t(pub int16x8_t, pub int16x8_t, pub int16x8_t); +/// Arm-specific type containing four `int16x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int16x8x4_t(pub int16x8_t, pub int16x8_t, pub int16x8_t, pub int16x8_t); + +/// Arm-specific type containing two `uint16x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint16x4x2_t(pub uint16x4_t, pub uint16x4_t); +/// Arm-specific type containing three `uint16x4_t` vectors. 
+#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint16x4x3_t(pub uint16x4_t, pub uint16x4_t, pub uint16x4_t); +/// Arm-specific type containing four `uint16x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint16x4x4_t( + pub uint16x4_t, + pub uint16x4_t, + pub uint16x4_t, + pub uint16x4_t, +); + +/// Arm-specific type containing two `uint16x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint16x8x2_t(pub uint16x8_t, pub uint16x8_t); +/// Arm-specific type containing three `uint16x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint16x8x3_t(pub uint16x8_t, pub uint16x8_t, pub uint16x8_t); +/// Arm-specific type containing four `uint16x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint16x8x4_t( + pub uint16x8_t, + pub uint16x8_t, + pub uint16x8_t, + pub uint16x8_t, +); + +/// Arm-specific type containing two `poly16x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly16x4x2_t(pub poly16x4_t, pub poly16x4_t); +/// Arm-specific type containing three `poly16x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly16x4x3_t(pub poly16x4_t, pub poly16x4_t, pub poly16x4_t); +/// Arm-specific type containing four `poly16x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly16x4x4_t( + pub poly16x4_t, + pub poly16x4_t, + pub poly16x4_t, + pub poly16x4_t, +); + +/// Arm-specific type containing two `poly16x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly16x8x2_t(pub poly16x8_t, pub poly16x8_t); +/// Arm-specific type containing three `poly16x8_t` vectors. 
+#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly16x8x3_t(pub poly16x8_t, pub poly16x8_t, pub poly16x8_t); +/// Arm-specific type containing four `poly16x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly16x8x4_t( + pub poly16x8_t, + pub poly16x8_t, + pub poly16x8_t, + pub poly16x8_t, +); + +/// Arm-specific type containing two `int32x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int32x2x2_t(pub int32x2_t, pub int32x2_t); +/// Arm-specific type containing three `int32x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int32x2x3_t(pub int32x2_t, pub int32x2_t, pub int32x2_t); +/// Arm-specific type containing four `int32x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int32x2x4_t(pub int32x2_t, pub int32x2_t, pub int32x2_t, pub int32x2_t); + +/// Arm-specific type containing two `int32x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int32x4x2_t(pub int32x4_t, pub int32x4_t); +/// Arm-specific type containing three `int32x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int32x4x3_t(pub int32x4_t, pub int32x4_t, pub int32x4_t); +/// Arm-specific type containing four `int32x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int32x4x4_t(pub int32x4_t, pub int32x4_t, pub int32x4_t, pub int32x4_t); + +/// Arm-specific type containing two `uint32x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint32x2x2_t(pub uint32x2_t, pub uint32x2_t); +/// Arm-specific type containing three `uint32x2_t` vectors. 
+#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint32x2x3_t(pub uint32x2_t, pub uint32x2_t, pub uint32x2_t); +/// Arm-specific type containing four `uint32x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint32x2x4_t( + pub uint32x2_t, + pub uint32x2_t, + pub uint32x2_t, + pub uint32x2_t, +); + +/// Arm-specific type containing two `uint32x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint32x4x2_t(pub uint32x4_t, pub uint32x4_t); +/// Arm-specific type containing three `uint32x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint32x4x3_t(pub uint32x4_t, pub uint32x4_t, pub uint32x4_t); +/// Arm-specific type containing four `uint32x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint32x4x4_t( + pub uint32x4_t, + pub uint32x4_t, + pub uint32x4_t, + pub uint32x4_t, +); + +/// Arm-specific type containing two `float16x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub struct float16x4x2_t(pub float16x4_t, pub float16x4_t); + +/// Arm-specific type containing three `float16x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub struct float16x4x3_t(pub float16x4_t, pub float16x4_t, pub float16x4_t); + +/// Arm-specific type containing four `float16x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub struct float16x4x4_t( + pub float16x4_t, + pub float16x4_t, + pub float16x4_t, + pub float16x4_t, +); + +/// Arm-specific type containing two `float16x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub struct float16x8x2_t(pub float16x8_t, pub float16x8_t); + +/// Arm-specific type containing three `float16x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] + +pub struct float16x8x3_t(pub float16x8_t, pub float16x8_t, pub float16x8_t); +/// Arm-specific type containing four `float16x8_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[unstable(feature = "stdarch_neon_f16", issue = "136306")] +pub struct float16x8x4_t( + pub float16x8_t, + pub float16x8_t, + pub float16x8_t, + pub float16x8_t, +); + +/// Arm-specific type containing two `float32x2_t` vectors. 
+#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct float32x2x2_t(pub float32x2_t, pub float32x2_t); +/// Arm-specific type containing three `float32x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct float32x2x3_t(pub float32x2_t, pub float32x2_t, pub float32x2_t); +/// Arm-specific type containing four `float32x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct float32x2x4_t( + pub float32x2_t, + pub float32x2_t, + pub float32x2_t, + pub float32x2_t, +); + +/// Arm-specific type containing two `float32x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct float32x4x2_t(pub float32x4_t, pub float32x4_t); +/// Arm-specific type containing three `float32x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct float32x4x3_t(pub float32x4_t, pub float32x4_t, pub float32x4_t); +/// Arm-specific type containing four `float32x4_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct float32x4x4_t( + pub float32x4_t, + pub float32x4_t, + pub float32x4_t, + pub float32x4_t, +); + +/// Arm-specific type containing two `int64x1_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int64x1x2_t(pub int64x1_t, pub int64x1_t); +/// Arm-specific type containing three `int64x1_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int64x1x3_t(pub int64x1_t, pub int64x1_t, pub int64x1_t); +/// Arm-specific type containing four `int64x1_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int64x1x4_t(pub int64x1_t, pub int64x1_t, pub int64x1_t, pub int64x1_t); + +/// Arm-specific type containing two `int64x2_t` vectors. 
+#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int64x2x2_t(pub int64x2_t, pub int64x2_t); +/// Arm-specific type containing three `int64x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int64x2x3_t(pub int64x2_t, pub int64x2_t, pub int64x2_t); +/// Arm-specific type containing four `int64x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct int64x2x4_t(pub int64x2_t, pub int64x2_t, pub int64x2_t, pub int64x2_t); + +/// Arm-specific type containing two `uint64x1_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint64x1x2_t(pub uint64x1_t, pub uint64x1_t); +/// Arm-specific type containing three `uint64x1_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint64x1x3_t(pub uint64x1_t, pub uint64x1_t, pub uint64x1_t); +/// Arm-specific type containing four `uint64x1_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint64x1x4_t( + pub uint64x1_t, + pub uint64x1_t, + pub uint64x1_t, + pub uint64x1_t, +); + +/// Arm-specific type containing two `uint64x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint64x2x2_t(pub uint64x2_t, pub uint64x2_t); +/// Arm-specific type containing three `uint64x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint64x2x3_t(pub uint64x2_t, pub uint64x2_t, pub uint64x2_t); +/// Arm-specific type containing four `uint64x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct uint64x2x4_t( + pub uint64x2_t, + pub uint64x2_t, + pub uint64x2_t, + pub uint64x2_t, +); + +/// Arm-specific type containing two `poly64x1_t` vectors. 
+#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly64x1x2_t(pub poly64x1_t, pub poly64x1_t); +/// Arm-specific type containing three `poly64x1_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly64x1x3_t(pub poly64x1_t, pub poly64x1_t, pub poly64x1_t); +/// Arm-specific type containing four `poly64x1_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly64x1x4_t( + pub poly64x1_t, + pub poly64x1_t, + pub poly64x1_t, + pub poly64x1_t, +); + +/// Arm-specific type containing two `poly64x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly64x2x2_t(pub poly64x2_t, pub poly64x2_t); +/// Arm-specific type containing three `poly64x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly64x2x3_t(pub poly64x2_t, pub poly64x2_t, pub poly64x2_t); +/// Arm-specific type containing four `poly64x2_t` vectors. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +#[cfg_attr( + not(target_arch = "arm"), + stable(feature = "neon_intrinsics", since = "1.59.0") +)] +#[cfg_attr( + target_arch = "arm", + unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") +)] +pub struct poly64x2x4_t( + pub poly64x2_t, + pub poly64x2_t, + pub poly64x2_t, + pub poly64x2_t, +); + +impl_sign_conversions_neon! 
{ + (i8, u8) + (i16, u16) + (i32, u32) + (i64, u64) + (*const i8, *const u8) + (*const i16, *const u16) + (*const i32, *const u32) + (*const i64, *const u64) + (*mut i8, *mut u8) + (*mut i16, *mut u16) + (*mut i32, *mut u32) + (*mut i64, *mut u64) + (int16x4_t, uint16x4_t) + (int16x8_t, uint16x8_t) + (int32x2_t, uint32x2_t) + (int32x4_t, uint32x4_t) + (int64x1_t, uint64x1_t) + (int64x2_t, uint64x2_t) + (int8x16_t, uint8x16_t) + (int8x8_t, uint8x8_t) + (uint16x4_t, int16x4_t) + (uint16x8_t, int16x8_t) + (uint32x2_t, int32x2_t) + (uint32x4_t, int32x4_t) + (uint64x1_t, int64x1_t) + (uint64x2_t, int64x2_t) + (uint8x16_t, int8x16_t) + (uint8x8_t, int8x8_t) + (int16x4x2_t, uint16x4x2_t) + (int16x4x3_t, uint16x4x3_t) + (int16x4x4_t, uint16x4x4_t) + (int16x8x2_t, uint16x8x2_t) + (int16x8x3_t, uint16x8x3_t) + (int16x8x4_t, uint16x8x4_t) + (int32x2x2_t, uint32x2x2_t) + (int32x2x3_t, uint32x2x3_t) + (int32x2x4_t, uint32x2x4_t) + (int32x4x2_t, uint32x4x2_t) + (int32x4x3_t, uint32x4x3_t) + (int32x4x4_t, uint32x4x4_t) + (int64x1x2_t, uint64x1x2_t) + (int64x1x3_t, uint64x1x3_t) + (int64x1x4_t, uint64x1x4_t) + (int64x2x2_t, uint64x2x2_t) + (int64x2x3_t, uint64x2x3_t) + (int64x2x4_t, uint64x2x4_t) + (int8x16x2_t, uint8x16x2_t) + (int8x16x3_t, uint8x16x3_t) + (int8x16x4_t, uint8x16x4_t) + (int8x8x2_t, uint8x8x2_t) + (int8x8x3_t, uint8x8x3_t) + (int8x8x4_t, uint8x8x4_t) + (uint16x4x2_t, int16x4x2_t) + (uint16x4x3_t, int16x4x3_t) + (uint16x4x4_t, int16x4x4_t) + (uint16x8x2_t, int16x8x2_t) + (uint16x8x3_t, int16x8x3_t) + (uint16x8x4_t, int16x8x4_t) + (uint32x2x2_t, int32x2x2_t) + (uint32x2x3_t, int32x2x3_t) + (uint32x2x4_t, int32x2x4_t) + (uint32x4x2_t, int32x4x2_t) + (uint32x4x3_t, int32x4x3_t) + (uint32x4x4_t, int32x4x4_t) + (uint64x1x2_t, int64x1x2_t) + (uint64x1x3_t, int64x1x3_t) + (uint64x1x4_t, int64x1x4_t) + (uint64x2x2_t, int64x2x2_t) + (uint64x2x3_t, int64x2x3_t) + (uint64x2x4_t, int64x2x4_t) + (uint8x16x2_t, int8x16x2_t) + (uint8x16x3_t, int8x16x3_t) + (uint8x16x4_t, int8x16x4_t) + (uint8x8x2_t, int8x8x2_t) + (uint8x8x3_t, int8x8x3_t) + (uint8x8x4_t, int8x8x4_t) +} + +#[cfg(test)] +mod tests { + use super::*; + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + use crate::core_arch::aarch64::*; + #[cfg(target_arch = "arm")] + use crate::core_arch::arm::*; + use crate::core_arch::arm_shared::test_support::*; + use crate::core_arch::simd::*; + use std::{mem::transmute, vec::Vec}; + use stdarch_test::simd_test; + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_s8() { + let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let elem: i8 = 42; + let e = i8x8::new(0, 1, 2, 3, 4, 5, 6, 42); + let r: i8x8 = transmute(vld1_lane_s8::<7>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_s8() { + let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let elem: i8 = 42; + let e = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 42); + let r: i8x16 = transmute(vld1q_lane_s8::<15>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_s16() { + let a = i16x4::new(0, 1, 2, 3); + let elem: i16 = 42; + let e = i16x4::new(0, 1, 2, 42); + let r: i16x4 = transmute(vld1_lane_s16::<3>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_s16() { + let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let elem: i16 = 42; + let e = i16x8::new(0, 1, 2, 3, 4, 5, 6, 42); + let r: i16x8 = transmute(vld1q_lane_s16::<7>(&elem, 
transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_s32() { + let a = i32x2::new(0, 1); + let elem: i32 = 42; + let e = i32x2::new(0, 42); + let r: i32x2 = transmute(vld1_lane_s32::<1>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_s32() { + let a = i32x4::new(0, 1, 2, 3); + let elem: i32 = 42; + let e = i32x4::new(0, 1, 2, 42); + let r: i32x4 = transmute(vld1q_lane_s32::<3>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_s64() { + let a = i64x1::new(0); + let elem: i64 = 42; + let e = i64x1::new(42); + let r: i64x1 = transmute(vld1_lane_s64::<0>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_s64() { + let a = i64x2::new(0, 1); + let elem: i64 = 42; + let e = i64x2::new(0, 42); + let r: i64x2 = transmute(vld1q_lane_s64::<1>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_u8() { + let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let elem: u8 = 42; + let e = u8x8::new(0, 1, 2, 3, 4, 5, 6, 42); + let r: u8x8 = transmute(vld1_lane_u8::<7>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_u8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let elem: u8 = 42; + let e = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 42); + let r: u8x16 = transmute(vld1q_lane_u8::<15>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_u16() { + let a = u16x4::new(0, 1, 2, 3); + let elem: u16 = 42; + let e = u16x4::new(0, 1, 2, 42); + let r: u16x4 = transmute(vld1_lane_u16::<3>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_u16() { + let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let elem: u16 = 42; + let e = u16x8::new(0, 1, 2, 3, 4, 5, 6, 42); + let r: u16x8 = transmute(vld1q_lane_u16::<7>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_u32() { + let a = u32x2::new(0, 1); + let elem: u32 = 42; + let e = u32x2::new(0, 42); + let r: u32x2 = transmute(vld1_lane_u32::<1>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_u32() { + let a = u32x4::new(0, 1, 2, 3); + let elem: u32 = 42; + let e = u32x4::new(0, 1, 2, 42); + let r: u32x4 = transmute(vld1q_lane_u32::<3>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_u64() { + let a = u64x1::new(0); + let elem: u64 = 42; + let e = u64x1::new(42); + let r: u64x1 = transmute(vld1_lane_u64::<0>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_u64() { + let a = u64x2::new(0, 1); + let elem: u64 = 42; + let e = u64x2::new(0, 42); + let r: u64x2 = transmute(vld1q_lane_u64::<1>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_p8() { + let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let elem: p8 = 42; + let e = u8x8::new(0, 1, 2, 3, 4, 5, 6, 42); + let r: u8x8 = transmute(vld1_lane_p8::<7>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_p8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let elem: p8 = 42; + 
let e = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 42); + let r: u8x16 = transmute(vld1q_lane_p8::<15>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_p16() { + let a = u16x4::new(0, 1, 2, 3); + let elem: p16 = 42; + let e = u16x4::new(0, 1, 2, 42); + let r: u16x4 = transmute(vld1_lane_p16::<3>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_p16() { + let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let elem: p16 = 42; + let e = u16x8::new(0, 1, 2, 3, 4, 5, 6, 42); + let r: u16x8 = transmute(vld1q_lane_p16::<7>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon,aes")] + unsafe fn test_vld1_lane_p64() { + let a = u64x1::new(0); + let elem: u64 = 42; + let e = u64x1::new(42); + let r: u64x1 = transmute(vld1_lane_p64::<0>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon,aes")] + unsafe fn test_vld1q_lane_p64() { + let a = u64x2::new(0, 1); + let elem: u64 = 42; + let e = u64x2::new(0, 42); + let r: u64x2 = transmute(vld1q_lane_p64::<1>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_f32() { + let a = f32x2::new(0., 1.); + let elem: f32 = 42.; + let e = f32x2::new(0., 42.); + let r: f32x2 = transmute(vld1_lane_f32::<1>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_f32() { + let a = f32x4::new(0., 1., 2., 3.); + let elem: f32 = 42.; + let e = f32x4::new(0., 1., 2., 42.); + let r: f32x4 = transmute(vld1q_lane_f32::<3>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_s8() { + let elem: i8 = 42; + let e = i8x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let r: i8x8 = transmute(vld1_dup_s8(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_s8() { + let elem: i8 = 42; + let e = i8x16::new( + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + ); + let r: i8x16 = transmute(vld1q_dup_s8(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_s16() { + let elem: i16 = 42; + let e = i16x4::new(42, 42, 42, 42); + let r: i16x4 = transmute(vld1_dup_s16(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_s16() { + let elem: i16 = 42; + let e = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let r: i16x8 = transmute(vld1q_dup_s16(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_s32() { + let elem: i32 = 42; + let e = i32x2::new(42, 42); + let r: i32x2 = transmute(vld1_dup_s32(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_s32() { + let elem: i32 = 42; + let e = i32x4::new(42, 42, 42, 42); + let r: i32x4 = transmute(vld1q_dup_s32(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_s64() { + let elem: i64 = 42; + let e = i64x1::new(42); + let r: i64x1 = transmute(vld1_dup_s64(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_s64() { + let elem: i64 = 42; + let e = i64x2::new(42, 42); + let r: i64x2 = transmute(vld1q_dup_s64(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_u8() { + let elem: u8 = 42; + let e = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let r: u8x8 = transmute(vld1_dup_u8(&elem)); + assert_eq!(r, e) 
+ } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_u8() { + let elem: u8 = 42; + let e = u8x16::new( + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + ); + let r: u8x16 = transmute(vld1q_dup_u8(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_u16() { + let elem: u16 = 42; + let e = u16x4::new(42, 42, 42, 42); + let r: u16x4 = transmute(vld1_dup_u16(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_u16() { + let elem: u16 = 42; + let e = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let r: u16x8 = transmute(vld1q_dup_u16(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_u32() { + let elem: u32 = 42; + let e = u32x2::new(42, 42); + let r: u32x2 = transmute(vld1_dup_u32(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_u32() { + let elem: u32 = 42; + let e = u32x4::new(42, 42, 42, 42); + let r: u32x4 = transmute(vld1q_dup_u32(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_u64() { + let elem: u64 = 42; + let e = u64x1::new(42); + let r: u64x1 = transmute(vld1_dup_u64(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_u64() { + let elem: u64 = 42; + let e = u64x2::new(42, 42); + let r: u64x2 = transmute(vld1q_dup_u64(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_p8() { + let elem: p8 = 42; + let e = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let r: u8x8 = transmute(vld1_dup_p8(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_p8() { + let elem: p8 = 42; + let e = u8x16::new( + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + ); + let r: u8x16 = transmute(vld1q_dup_p8(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_p16() { + let elem: p16 = 42; + let e = u16x4::new(42, 42, 42, 42); + let r: u16x4 = transmute(vld1_dup_p16(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_p16() { + let elem: p16 = 42; + let e = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let r: u16x8 = transmute(vld1q_dup_p16(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon,aes")] + unsafe fn test_vld1_dup_p64() { + let elem: u64 = 42; + let e = u64x1::new(42); + let r: u64x1 = transmute(vld1_dup_p64(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon,aes")] + unsafe fn test_vld1q_dup_p64() { + let elem: u64 = 42; + let e = u64x2::new(42, 42); + let r: u64x2 = transmute(vld1q_dup_p64(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_f32() { + let elem: f32 = 42.; + let e = f32x2::new(42., 42.); + let r: f32x2 = transmute(vld1_dup_f32(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_f32() { + let elem: f32 = 42.; + let e = f32x4::new(42., 42., 42., 42.); + let r: f32x4 = transmute(vld1q_dup_f32(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_lane_u8() { + let v = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r = vget_lane_u8::<1>(transmute(v)); + assert_eq!(r, 2); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vgetq_lane_u32() { + let v = i32x4::new(1, 2, 3, 4); + let r = vgetq_lane_u32::<1>(transmute(v)); + assert_eq!(r, 2); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vgetq_lane_s32() { + 
let v = i32x4::new(1, 2, 3, 4); + let r = vgetq_lane_s32::<1>(transmute(v)); + assert_eq!(r, 2); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_lane_u64() { + let v: u64 = 1; + let r = vget_lane_u64::<0>(transmute(v)); + assert_eq!(r, 1); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vgetq_lane_u16() { + let v = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r = vgetq_lane_u16::<1>(transmute(v)); + assert_eq!(r, 2); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_lane_s8() { + let v = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = vget_lane_s8::<2>(transmute(v)); + assert_eq!(r, 2); + let r = vget_lane_s8::<4>(transmute(v)); + assert_eq!(r, 4); + let r = vget_lane_s8::<5>(transmute(v)); + assert_eq!(r, 5); + } + #[simd_test(enable = "neon")] + unsafe fn test_vget_lane_p8() { + let v = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = vget_lane_p8::<2>(transmute(v)); + assert_eq!(r, 2); + let r = vget_lane_p8::<3>(transmute(v)); + assert_eq!(r, 3); + let r = vget_lane_p8::<5>(transmute(v)); + assert_eq!(r, 5); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_lane_p16() { + let v = u16x4::new(0, 1, 2, 3); + let r = vget_lane_p16::<2>(transmute(v)); + assert_eq!(r, 2); + let r = vget_lane_p16::<3>(transmute(v)); + assert_eq!(r, 3); + let r = vget_lane_p16::<0>(transmute(v)); + assert_eq!(r, 0); + let r = vget_lane_p16::<1>(transmute(v)); + assert_eq!(r, 1); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_lane_s16() { + let v = i16x4::new(0, 1, 2, 3); + let r = vget_lane_s16::<2>(transmute(v)); + assert_eq!(r, 2); + let r = vget_lane_s16::<3>(transmute(v)); + assert_eq!(r, 3); + let r = vget_lane_s16::<0>(transmute(v)); + assert_eq!(r, 0); + let r = vget_lane_s16::<1>(transmute(v)); + assert_eq!(r, 1); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_lane_u16() { + let v = u16x4::new(0, 1, 2, 3); + let r = vget_lane_u16::<2>(transmute(v)); + assert_eq!(r, 2); + let r = vget_lane_u16::<3>(transmute(v)); + assert_eq!(r, 3); + let r = vget_lane_u16::<0>(transmute(v)); + assert_eq!(r, 0); + let r = vget_lane_u16::<1>(transmute(v)); + assert_eq!(r, 1); + } + #[simd_test(enable = "neon")] + unsafe fn test_vget_lane_f32() { + let v = f32x2::new(0.0, 1.0); + let r = vget_lane_f32::<1>(transmute(v)); + assert_eq!(r, 1.0); + let r = vget_lane_f32::<0>(transmute(v)); + assert_eq!(r, 0.0); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_lane_s32() { + let v = i32x2::new(0, 1); + let r = vget_lane_s32::<1>(transmute(v)); + assert_eq!(r, 1); + let r = vget_lane_s32::<0>(transmute(v)); + assert_eq!(r, 0); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_lane_u32() { + let v = u32x2::new(0, 1); + let r = vget_lane_u32::<1>(transmute(v)); + assert_eq!(r, 1); + let r = vget_lane_u32::<0>(transmute(v)); + assert_eq!(r, 0); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_lane_s64() { + let v = i64x1::new(1); + let r = vget_lane_s64::<0>(transmute(v)); + assert_eq!(r, 1); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_lane_p64() { + let v = u64x1::new(1); + let r = vget_lane_p64::<0>(transmute(v)); + assert_eq!(r, 1); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vgetq_lane_s8() { + let v = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = vgetq_lane_s8::<7>(transmute(v)); + assert_eq!(r, 7); + let r = vgetq_lane_s8::<13>(transmute(v)); + assert_eq!(r, 13); + let r = vgetq_lane_s8::<3>(transmute(v)); + assert_eq!(r, 3); + let r = vgetq_lane_s8::<0>(transmute(v)); + 
assert_eq!(r, 0); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vgetq_lane_p8() { + let v = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = vgetq_lane_p8::<7>(transmute(v)); + assert_eq!(r, 7); + let r = vgetq_lane_p8::<13>(transmute(v)); + assert_eq!(r, 13); + let r = vgetq_lane_p8::<3>(transmute(v)); + assert_eq!(r, 3); + let r = vgetq_lane_p8::<0>(transmute(v)); + assert_eq!(r, 0); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vgetq_lane_u8() { + let v = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = vgetq_lane_u8::<7>(transmute(v)); + assert_eq!(r, 7); + let r = vgetq_lane_u8::<13>(transmute(v)); + assert_eq!(r, 13); + let r = vgetq_lane_u8::<3>(transmute(v)); + assert_eq!(r, 3); + let r = vgetq_lane_u8::<0>(transmute(v)); + assert_eq!(r, 0); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vgetq_lane_s16() { + let v = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = vgetq_lane_s16::<3>(transmute(v)); + assert_eq!(r, 3); + let r = vgetq_lane_s16::<6>(transmute(v)); + assert_eq!(r, 6); + let r = vgetq_lane_s16::<0>(transmute(v)); + assert_eq!(r, 0); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vgetq_lane_p16() { + let v = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = vgetq_lane_p16::<3>(transmute(v)); + assert_eq!(r, 3); + let r = vgetq_lane_p16::<7>(transmute(v)); + assert_eq!(r, 7); + let r = vgetq_lane_p16::<1>(transmute(v)); + assert_eq!(r, 1); + } + #[simd_test(enable = "neon")] + unsafe fn test_vgetq_lane_f32() { + let v = f32x4::new(0.0, 1.0, 2.0, 3.0); + let r = vgetq_lane_f32::<3>(transmute(v)); + assert_eq!(r, 3.0); + let r = vgetq_lane_f32::<0>(transmute(v)); + assert_eq!(r, 0.0); + let r = vgetq_lane_f32::<2>(transmute(v)); + assert_eq!(r, 2.0); + let r = vgetq_lane_f32::<1>(transmute(v)); + assert_eq!(r, 1.0); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vgetq_lane_s64() { + let v = i64x2::new(0, 1); + let r = vgetq_lane_s64::<1>(transmute(v)); + assert_eq!(r, 1); + let r = vgetq_lane_s64::<0>(transmute(v)); + assert_eq!(r, 0); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vgetq_lane_p64() { + let v = u64x2::new(0, 1); + let r = vgetq_lane_p64::<1>(transmute(v)); + assert_eq!(r, 1); + let r = vgetq_lane_p64::<0>(transmute(v)); + assert_eq!(r, 0); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vext_s64() { + let a: i64x1 = i64x1::new(0); + let b: i64x1 = i64x1::new(1); + let e: i64x1 = i64x1::new(0); + let r: i64x1 = transmute(vext_s64::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vext_u64() { + let a: u64x1 = u64x1::new(0); + let b: u64x1 = u64x1::new(1); + let e: u64x1 = u64x1::new(0); + let r: u64x1 = transmute(vext_u64::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_high_s8() { + let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e = i8x8::new(9, 10, 11, 12, 13, 14, 15, 16); + let r: i8x8 = transmute(vget_high_s8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_high_s16() { + let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e = i16x4::new(5, 6, 7, 8); + let r: i16x4 = transmute(vget_high_s16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_high_s32() { + let a = i32x4::new(1, 2, 3, 4); + let e = i32x2::new(3, 4); + let r: i32x2 = transmute(vget_high_s32(transmute(a))); + assert_eq!(r, e); + } + + 
#[simd_test(enable = "neon")] + unsafe fn test_vget_high_s64() { + let a = i64x2::new(1, 2); + let e = i64x1::new(2); + let r: i64x1 = transmute(vget_high_s64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_high_u8() { + let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e = u8x8::new(9, 10, 11, 12, 13, 14, 15, 16); + let r: u8x8 = transmute(vget_high_u8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_high_u16() { + let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e = u16x4::new(5, 6, 7, 8); + let r: u16x4 = transmute(vget_high_u16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_high_u32() { + let a = u32x4::new(1, 2, 3, 4); + let e = u32x2::new(3, 4); + let r: u32x2 = transmute(vget_high_u32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_high_u64() { + let a = u64x2::new(1, 2); + let e = u64x1::new(2); + let r: u64x1 = transmute(vget_high_u64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_high_p8() { + let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e = u8x8::new(9, 10, 11, 12, 13, 14, 15, 16); + let r: u8x8 = transmute(vget_high_p8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_high_p16() { + let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e = u16x4::new(5, 6, 7, 8); + let r: u16x4 = transmute(vget_high_p16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_high_f32() { + let a = f32x4::new(1.0, 2.0, 3.0, 4.0); + let e = f32x2::new(3.0, 4.0); + let r: f32x2 = transmute(vget_high_f32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_low_s8() { + let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: i8x8 = transmute(vget_low_s8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_low_s16() { + let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e = i16x4::new(1, 2, 3, 4); + let r: i16x4 = transmute(vget_low_s16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_low_s32() { + let a = i32x4::new(1, 2, 3, 4); + let e = i32x2::new(1, 2); + let r: i32x2 = transmute(vget_low_s32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_low_s64() { + let a = i64x2::new(1, 2); + let e = i64x1::new(1); + let r: i64x1 = transmute(vget_low_s64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_low_u8() { + let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: u8x8 = transmute(vget_low_u8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_low_u16() { + let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e = u16x4::new(1, 2, 3, 4); + let r: u16x4 = transmute(vget_low_u16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_low_u32() { + let a = u32x4::new(1, 2, 3, 4); + let e = u32x2::new(1, 2); + let r: u32x2 = transmute(vget_low_u32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_low_u64() { + let a = u64x2::new(1, 
2); + let e = u64x1::new(1); + let r: u64x1 = transmute(vget_low_u64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_low_p8() { + let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: u8x8 = transmute(vget_low_p8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_low_p16() { + let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e = u16x4::new(1, 2, 3, 4); + let r: u16x4 = transmute(vget_low_p16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vget_low_f32() { + let a = f32x4::new(1.0, 2.0, 3.0, 4.0); + let e = f32x2::new(1.0, 2.0); + let r: f32x2 = transmute(vget_low_f32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdupq_n_s8() { + let v: i8 = 42; + let e = i8x16::new( + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + ); + let r: i8x16 = transmute(vdupq_n_s8(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdupq_n_s16() { + let v: i16 = 64; + let e = i16x8::new(64, 64, 64, 64, 64, 64, 64, 64); + let r: i16x8 = transmute(vdupq_n_s16(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdupq_n_s32() { + let v: i32 = 64; + let e = i32x4::new(64, 64, 64, 64); + let r: i32x4 = transmute(vdupq_n_s32(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdupq_n_s64() { + let v: i64 = 64; + let e = i64x2::new(64, 64); + let r: i64x2 = transmute(vdupq_n_s64(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdupq_n_u8() { + let v: u8 = 64; + let e = u8x16::new( + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + ); + let r: u8x16 = transmute(vdupq_n_u8(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdupq_n_u16() { + let v: u16 = 64; + let e = u16x8::new(64, 64, 64, 64, 64, 64, 64, 64); + let r: u16x8 = transmute(vdupq_n_u16(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdupq_n_u32() { + let v: u32 = 64; + let e = u32x4::new(64, 64, 64, 64); + let r: u32x4 = transmute(vdupq_n_u32(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdupq_n_u64() { + let v: u64 = 64; + let e = u64x2::new(64, 64); + let r: u64x2 = transmute(vdupq_n_u64(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdupq_n_p8() { + let v: p8 = 64; + let e = u8x16::new( + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + ); + let r: u8x16 = transmute(vdupq_n_p8(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdupq_n_p16() { + let v: p16 = 64; + let e = u16x8::new(64, 64, 64, 64, 64, 64, 64, 64); + let r: u16x8 = transmute(vdupq_n_p16(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdupq_n_f32() { + let v: f32 = 64.0; + let e = f32x4::new(64.0, 64.0, 64.0, 64.0); + let r: f32x4 = transmute(vdupq_n_f32(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdup_n_s8() { + let v: i8 = 64; + let e = i8x8::new(64, 64, 64, 64, 64, 64, 64, 64); + let r: i8x8 = transmute(vdup_n_s8(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdup_n_s16() { + let v: i16 = 64; + let e = i16x4::new(64, 64, 64, 64); + let r: i16x4 = transmute(vdup_n_s16(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = 
"neon")] + unsafe fn test_vdup_n_s32() { + let v: i32 = 64; + let e = i32x2::new(64, 64); + let r: i32x2 = transmute(vdup_n_s32(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdup_n_s64() { + let v: i64 = 64; + let e = i64x1::new(64); + let r: i64x1 = transmute(vdup_n_s64(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdup_n_u8() { + let v: u8 = 64; + let e = u8x8::new(64, 64, 64, 64, 64, 64, 64, 64); + let r: u8x8 = transmute(vdup_n_u8(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdup_n_u16() { + let v: u16 = 64; + let e = u16x4::new(64, 64, 64, 64); + let r: u16x4 = transmute(vdup_n_u16(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdup_n_u32() { + let v: u32 = 64; + let e = u32x2::new(64, 64); + let r: u32x2 = transmute(vdup_n_u32(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdup_n_u64() { + let v: u64 = 64; + let e = u64x1::new(64); + let r: u64x1 = transmute(vdup_n_u64(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdup_n_p8() { + let v: p8 = 64; + let e = u8x8::new(64, 64, 64, 64, 64, 64, 64, 64); + let r: u8x8 = transmute(vdup_n_p8(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdup_n_p16() { + let v: p16 = 64; + let e = u16x4::new(64, 64, 64, 64); + let r: u16x4 = transmute(vdup_n_p16(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vdup_n_f32() { + let v: f32 = 64.0; + let e = f32x2::new(64.0, 64.0); + let r: f32x2 = transmute(vdup_n_f32(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vldrq_p128() { + let v: [p128; 2] = [1, 2]; + let e: p128 = 2; + let r: p128 = vldrq_p128(v[1..].as_ptr()); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vstrq_p128() { + let v: [p128; 2] = [1, 2]; + let e: p128 = 2; + let mut r: p128 = 1; + vstrq_p128(&mut r, v[1]); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmov_n_s8() { + let v: i8 = 64; + let e = i8x8::new(64, 64, 64, 64, 64, 64, 64, 64); + let r: i8x8 = transmute(vmov_n_s8(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmov_n_s16() { + let v: i16 = 64; + let e = i16x4::new(64, 64, 64, 64); + let r: i16x4 = transmute(vmov_n_s16(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmov_n_s32() { + let v: i32 = 64; + let e = i32x2::new(64, 64); + let r: i32x2 = transmute(vmov_n_s32(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmov_n_s64() { + let v: i64 = 64; + let e = i64x1::new(64); + let r: i64x1 = transmute(vmov_n_s64(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmov_n_u8() { + let v: u8 = 64; + let e = u8x8::new(64, 64, 64, 64, 64, 64, 64, 64); + let r: u8x8 = transmute(vmov_n_u8(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmov_n_u16() { + let v: u16 = 64; + let e = u16x4::new(64, 64, 64, 64); + let r: u16x4 = transmute(vmov_n_u16(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmov_n_u32() { + let v: u32 = 64; + let e = u32x2::new(64, 64); + let r: u32x2 = transmute(vmov_n_u32(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmov_n_u64() { + let v: u64 = 64; + let e = u64x1::new(64); + let r: u64x1 = transmute(vmov_n_u64(v)); + assert_eq!(r, e); + } + + 
#[simd_test(enable = "neon")] + unsafe fn test_vmov_n_p8() { + let v: p8 = 64; + let e = u8x8::new(64, 64, 64, 64, 64, 64, 64, 64); + let r: u8x8 = transmute(vmov_n_p8(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmov_n_p16() { + let v: p16 = 64; + let e = u16x4::new(64, 64, 64, 64); + let r: u16x4 = transmute(vmov_n_p16(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmov_n_f32() { + let v: f32 = 64.0; + let e = f32x2::new(64.0, 64.0); + let r: f32x2 = transmute(vmov_n_f32(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovq_n_s8() { + let v: i8 = 64; + let e = i8x16::new( + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + ); + let r: i8x16 = transmute(vmovq_n_s8(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovq_n_s16() { + let v: i16 = 64; + let e = i16x8::new(64, 64, 64, 64, 64, 64, 64, 64); + let r: i16x8 = transmute(vmovq_n_s16(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovq_n_s32() { + let v: i32 = 64; + let e = i32x4::new(64, 64, 64, 64); + let r: i32x4 = transmute(vmovq_n_s32(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovq_n_s64() { + let v: i64 = 64; + let e = i64x2::new(64, 64); + let r: i64x2 = transmute(vmovq_n_s64(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovq_n_u8() { + let v: u8 = 64; + let e = u8x16::new( + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + ); + let r: u8x16 = transmute(vmovq_n_u8(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovq_n_u16() { + let v: u16 = 64; + let e = u16x8::new(64, 64, 64, 64, 64, 64, 64, 64); + let r: u16x8 = transmute(vmovq_n_u16(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovq_n_u32() { + let v: u32 = 64; + let e = u32x4::new(64, 64, 64, 64); + let r: u32x4 = transmute(vmovq_n_u32(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovq_n_u64() { + let v: u64 = 64; + let e = u64x2::new(64, 64); + let r: u64x2 = transmute(vmovq_n_u64(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovq_n_p8() { + let v: p8 = 64; + let e = u8x16::new( + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + ); + let r: u8x16 = transmute(vmovq_n_p8(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovq_n_p16() { + let v: p16 = 64; + let e = u16x8::new(64, 64, 64, 64, 64, 64, 64, 64); + let r: u16x8 = transmute(vmovq_n_p16(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovq_n_f32() { + let v: f32 = 64.0; + let e = f32x4::new(64.0, 64.0, 64.0, 64.0); + let r: f32x4 = transmute(vmovq_n_f32(v)); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vgetq_lane_u64() { + let v = i64x2::new(1, 2); + let r = vgetq_lane_u64::<1>(transmute(v)); + assert_eq!(r, 2); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vadd_s8() { + test_ari_s8( + |i, j| vadd_s8(i, j), + |a: i8, b: i8| -> i8 { a.overflowing_add(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vaddq_s8() { + testq_ari_s8( + |i, j| vaddq_s8(i, j), + |a: i8, b: i8| -> i8 { a.overflowing_add(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vadd_s16() { + test_ari_s16( + |i, j| vadd_s16(i, j), + |a: i16, b: i16| -> i16 { a.overflowing_add(b).0 }, + ); + } + 
#[simd_test(enable = "neon")] + unsafe fn test_vaddq_s16() { + testq_ari_s16( + |i, j| vaddq_s16(i, j), + |a: i16, b: i16| -> i16 { a.overflowing_add(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vadd_s32() { + test_ari_s32( + |i, j| vadd_s32(i, j), + |a: i32, b: i32| -> i32 { a.overflowing_add(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vaddq_s32() { + testq_ari_s32( + |i, j| vaddq_s32(i, j), + |a: i32, b: i32| -> i32 { a.overflowing_add(b).0 }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vadd_u8() { + test_ari_u8( + |i, j| vadd_u8(i, j), + |a: u8, b: u8| -> u8 { a.overflowing_add(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vaddq_u8() { + testq_ari_u8( + |i, j| vaddq_u8(i, j), + |a: u8, b: u8| -> u8 { a.overflowing_add(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vadd_u16() { + test_ari_u16( + |i, j| vadd_u16(i, j), + |a: u16, b: u16| -> u16 { a.overflowing_add(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vaddq_u16() { + testq_ari_u16( + |i, j| vaddq_u16(i, j), + |a: u16, b: u16| -> u16 { a.overflowing_add(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vadd_u32() { + test_ari_u32( + |i, j| vadd_u32(i, j), + |a: u32, b: u32| -> u32 { a.overflowing_add(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vaddq_u32() { + testq_ari_u32( + |i, j| vaddq_u32(i, j), + |a: u32, b: u32| -> u32 { a.overflowing_add(b).0 }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vadd_f32() { + test_ari_f32(|i, j| vadd_f32(i, j), |a: f32, b: f32| -> f32 { a + b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vaddq_f32() { + testq_ari_f32(|i, j| vaddq_f32(i, j), |a: f32, b: f32| -> f32 { a + b }); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_s8() { + let v = i8::MAX; + let a = i8x8::new(v, v, v, v, v, v, v, v); + let v = 2 * (v as i16); + let e = i16x8::new(v, v, v, v, v, v, v, v); + let r: i16x8 = transmute(vaddl_s8(transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_s16() { + let v = i16::MAX; + let a = i16x4::new(v, v, v, v); + let v = 2 * (v as i32); + let e = i32x4::new(v, v, v, v); + let r: i32x4 = transmute(vaddl_s16(transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_s32() { + let v = i32::MAX; + let a = i32x2::new(v, v); + let v = 2 * (v as i64); + let e = i64x2::new(v, v); + let r: i64x2 = transmute(vaddl_s32(transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_u8() { + let v = u8::MAX; + let a = u8x8::new(v, v, v, v, v, v, v, v); + let v = 2 * (v as u16); + let e = u16x8::new(v, v, v, v, v, v, v, v); + let r: u16x8 = transmute(vaddl_u8(transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_u16() { + let v = u16::MAX; + let a = u16x4::new(v, v, v, v); + let v = 2 * (v as u32); + let e = u32x4::new(v, v, v, v); + let r: u32x4 = transmute(vaddl_u16(transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_u32() { + let v = u32::MAX; + let a = u32x2::new(v, v); + let v = 2 * (v as u64); + let e = u64x2::new(v, v); + let r: u64x2 = transmute(vaddl_u32(transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_high_s8() { + let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 
14, 15); + let x = i8::MAX; + let b = i8x16::new(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); + let x = x as i16; + let e = i16x8::new(x + 8, x + 9, x + 10, x + 11, x + 12, x + 13, x + 14, x + 15); + let r: i16x8 = transmute(vaddl_high_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_high_s16() { + let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let x = i16::MAX; + let b = i16x8::new(x, x, x, x, x, x, x, x); + let x = x as i32; + let e = i32x4::new(x + 4, x + 5, x + 6, x + 7); + let r: i32x4 = transmute(vaddl_high_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_high_s32() { + let a = i32x4::new(0, 1, 2, 3); + let x = i32::MAX; + let b = i32x4::new(x, x, x, x); + let x = x as i64; + let e = i64x2::new(x + 2, x + 3); + let r: i64x2 = transmute(vaddl_high_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_high_u8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let x = u8::MAX; + let b = u8x16::new(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); + let x = x as u16; + let e = u16x8::new(x + 8, x + 9, x + 10, x + 11, x + 12, x + 13, x + 14, x + 15); + let r: u16x8 = transmute(vaddl_high_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_high_u16() { + let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let x = u16::MAX; + let b = u16x8::new(x, x, x, x, x, x, x, x); + let x = x as u32; + let e = u32x4::new(x + 4, x + 5, x + 6, x + 7); + let r: u32x4 = transmute(vaddl_high_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_high_u32() { + let a = u32x4::new(0, 1, 2, 3); + let x = u32::MAX; + let b = u32x4::new(x, x, x, x); + let x = x as u64; + let e = u64x2::new(x + 2, x + 3); + let r: u64x2 = transmute(vaddl_high_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_s8() { + let x = i16::MAX; + let a = i16x8::new(x, 1, 2, 3, 4, 5, 6, 7); + let y = i8::MAX; + let b = i8x8::new(y, y, y, y, y, y, y, y); + let y = y as i16; + let e = i16x8::new( + x.wrapping_add(y), + 1 + y, + 2 + y, + 3 + y, + 4 + y, + 5 + y, + 6 + y, + 7 + y, + ); + let r: i16x8 = transmute(vaddw_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_s16() { + let x = i32::MAX; + let a = i32x4::new(x, 1, 2, 3); + let y = i16::MAX; + let b = i16x4::new(y, y, y, y); + let y = y as i32; + let e = i32x4::new(x.wrapping_add(y), 1 + y, 2 + y, 3 + y); + let r: i32x4 = transmute(vaddw_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_s32() { + let x = i64::MAX; + let a = i64x2::new(x, 1); + let y = i32::MAX; + let b = i32x2::new(y, y); + let y = y as i64; + let e = i64x2::new(x.wrapping_add(y), 1 + y); + let r: i64x2 = transmute(vaddw_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_u8() { + let x = u16::MAX; + let a = u16x8::new(x, 1, 2, 3, 4, 5, 6, 7); + let y = u8::MAX; + let b = u8x8::new(y, y, y, y, y, y, y, y); + let y = y as u16; + let e = u16x8::new( + x.wrapping_add(y), + 1 + y, + 2 + y, + 3 + y, + 4 + y, + 5 + y, + 6 + y, + 7 + y, + ); + let r: u16x8 = transmute(vaddw_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + 
#[simd_test(enable = "neon")] + unsafe fn test_vaddw_u16() { + let x = u32::MAX; + let a = u32x4::new(x, 1, 2, 3); + let y = u16::MAX; + let b = u16x4::new(y, y, y, y); + let y = y as u32; + let e = u32x4::new(x.wrapping_add(y), 1 + y, 2 + y, 3 + y); + let r: u32x4 = transmute(vaddw_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_u32() { + let x = u64::MAX; + let a = u64x2::new(x, 1); + let y = u32::MAX; + let b = u32x2::new(y, y); + let y = y as u64; + let e = u64x2::new(x.wrapping_add(y), 1 + y); + let r: u64x2 = transmute(vaddw_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_high_s8() { + let x = i16::MAX; + let a = i16x8::new(x, 1, 2, 3, 4, 5, 6, 7); + let y = i8::MAX; + let b = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, y, y, y, y, y, y, y, y); + let y = y as i16; + let e = i16x8::new( + x.wrapping_add(y), + 1 + y, + 2 + y, + 3 + y, + 4 + y, + 5 + y, + 6 + y, + 7 + y, + ); + let r: i16x8 = transmute(vaddw_high_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_high_s16() { + let x = i32::MAX; + let a = i32x4::new(x, 1, 2, 3); + let y = i16::MAX; + let b = i16x8::new(0, 0, 0, 0, y, y, y, y); + let y = y as i32; + let e = i32x4::new(x.wrapping_add(y), 1 + y, 2 + y, 3 + y); + let r: i32x4 = transmute(vaddw_high_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_high_s32() { + let x = i64::MAX; + let a = i64x2::new(x, 1); + let y = i32::MAX; + let b = i32x4::new(0, 0, y, y); + let y = y as i64; + let e = i64x2::new(x.wrapping_add(y), 1 + y); + let r: i64x2 = transmute(vaddw_high_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_high_u8() { + let x = u16::MAX; + let a = u16x8::new(x, 1, 2, 3, 4, 5, 6, 7); + let y = u8::MAX; + let b = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, y, y, y, y, y, y, y, y); + let y = y as u16; + let e = u16x8::new( + x.wrapping_add(y), + 1 + y, + 2 + y, + 3 + y, + 4 + y, + 5 + y, + 6 + y, + 7 + y, + ); + let r: u16x8 = transmute(vaddw_high_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_high_u16() { + let x = u32::MAX; + let a = u32x4::new(x, 1, 2, 3); + let y = u16::MAX; + let b = u16x8::new(0, 0, 0, 0, y, y, y, y); + let y = y as u32; + let e = u32x4::new(x.wrapping_add(y), 1 + y, 2 + y, 3 + y); + let r: u32x4 = transmute(vaddw_high_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_high_u32() { + let x = u64::MAX; + let a = u64x2::new(x, 1); + let y = u32::MAX; + let b = u32x4::new(0, 0, y, y); + let y = y as u64; + let e = u64x2::new(x.wrapping_add(y), 1 + y); + let r: u64x2 = transmute(vaddw_high_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmvn_s8() { + let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let e = i8x8::new(-1, -2, -3, -4, -5, -6, -7, -8); + let r: i8x8 = transmute(vmvn_s8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmvnq_s8() { + let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e = i8x16::new( + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, + ); + let r: i8x16 = transmute(vmvnq_s8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] 
+ unsafe fn test_vmvn_s16() { + let a = i16x4::new(0, 1, 2, 3); + let e = i16x4::new(-1, -2, -3, -4); + let r: i16x4 = transmute(vmvn_s16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmvnq_s16() { + let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let e = i16x8::new(-1, -2, -3, -4, -5, -6, -7, -8); + let r: i16x8 = transmute(vmvnq_s16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmvn_s32() { + let a = i32x2::new(0, 1); + let e = i32x2::new(-1, -2); + let r: i32x2 = transmute(vmvn_s32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmvnq_s32() { + let a = i32x4::new(0, 1, 2, 3); + let e = i32x4::new(-1, -2, -3, -4); + let r: i32x4 = transmute(vmvnq_s32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmvn_u8() { + let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let e = u8x8::new(255, 254, 253, 252, 251, 250, 249, 248); + let r: u8x8 = transmute(vmvn_u8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmvnq_u8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e = u8x16::new( + 255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240, + ); + let r: u8x16 = transmute(vmvnq_u8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmvn_u16() { + let a = u16x4::new(0, 1, 2, 3); + let e = u16x4::new(65_535, 65_534, 65_533, 65_532); + let r: u16x4 = transmute(vmvn_u16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmvnq_u16() { + let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let e = u16x8::new( + 65_535, 65_534, 65_533, 65_532, 65_531, 65_530, 65_529, 65_528, + ); + let r: u16x8 = transmute(vmvnq_u16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmvn_u32() { + let a = u32x2::new(0, 1); + let e = u32x2::new(4_294_967_295, 4_294_967_294); + let r: u32x2 = transmute(vmvn_u32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmvnq_u32() { + let a = u32x4::new(0, 1, 2, 3); + let e = u32x4::new(4_294_967_295, 4_294_967_294, 4_294_967_293, 4_294_967_292); + let r: u32x4 = transmute(vmvnq_u32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmvn_p8() { + let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let e = u8x8::new(255, 254, 253, 252, 251, 250, 249, 248); + let r: u8x8 = transmute(vmvn_p8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmvnq_p8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e = u8x16::new( + 255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240, + ); + let r: u8x16 = transmute(vmvnq_p8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbic_s8() { + let a = i8x8::new(0, -1, -2, -3, -4, -5, -6, -7); + let b = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1); + let e = i8x8::new(0, -2, -2, -4, -4, -6, -6, -8); + let r: i8x8 = transmute(vbic_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbicq_s8() { + let a = i8x16::new( + 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, + ); + let b = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let e = i8x16::new( + 0, -2, -2, -4, -4, -6, -6, -8, -8, -10, 
-10, -12, -12, -14, -14, -16, + ); + let r: i8x16 = transmute(vbicq_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbic_s16() { + let a = i16x4::new(0, -1, -2, -3); + let b = i16x4::new(1, 1, 1, 1); + let e = i16x4::new(0, -2, -2, -4); + let r: i16x4 = transmute(vbic_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbicq_s16() { + let a = i16x8::new(0, -1, -2, -3, -4, -5, -6, -7); + let b = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1); + let e = i16x8::new(0, -2, -2, -4, -4, -6, -6, -8); + let r: i16x8 = transmute(vbicq_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbic_s32() { + let a = i32x2::new(0, -1); + let b = i32x2::new(1, 1); + let e = i32x2::new(0, -2); + let r: i32x2 = transmute(vbic_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbicq_s32() { + let a = i32x4::new(0, -1, -2, -3); + let b = i32x4::new(1, 1, 1, 1); + let e = i32x4::new(0, -2, -2, -4); + let r: i32x4 = transmute(vbicq_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbic_s64() { + let a = i64x1::new(-1); + let b = i64x1::new(1); + let e = i64x1::new(-2); + let r: i64x1 = transmute(vbic_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbicq_s64() { + let a = i64x2::new(0, -1); + let b = i64x2::new(1, 1); + let e = i64x2::new(0, -2); + let r: i64x2 = transmute(vbicq_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbic_u8() { + let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1); + let e = u8x8::new(0, 0, 2, 2, 4, 4, 6, 6); + let r: u8x8 = transmute(vbic_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbicq_u8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let e = u8x16::new(0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14); + let r: u8x16 = transmute(vbicq_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbic_u16() { + let a = u16x4::new(0, 1, 2, 3); + let b = u16x4::new(1, 1, 1, 1); + let e = u16x4::new(0, 0, 2, 2); + let r: u16x4 = transmute(vbic_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbicq_u16() { + let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1); + let e = u16x8::new(0, 0, 2, 2, 4, 4, 6, 6); + let r: u16x8 = transmute(vbicq_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbic_u32() { + let a = u32x2::new(0, 1); + let b = u32x2::new(1, 1); + let e = u32x2::new(0, 0); + let r: u32x2 = transmute(vbic_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbicq_u32() { + let a = u32x4::new(0, 1, 2, 3); + let b = u32x4::new(1, 1, 1, 1); + let e = u32x4::new(0, 0, 2, 2); + let r: u32x4 = transmute(vbicq_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbic_u64() { + let a = u64x1::new(1); + let b = u64x1::new(1); + let e = u64x1::new(0); + let r: u64x1 = 
transmute(vbic_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbicq_u64() { + let a = u64x2::new(0, 1); + let b = u64x2::new(1, 1); + let e = u64x2::new(0, 0); + let r: u64x2 = transmute(vbicq_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vbsl_s8() { + let a = u8x8::new(u8::MAX, 1, u8::MAX, 2, u8::MAX, 0, u8::MAX, 0); + let b = i8x8::new( + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + ); + let c = i8x8::new( + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + ); + let e = i8x8::new( + i8::MAX, + i8::MIN | 1, + i8::MAX, + i8::MIN | 2, + i8::MAX, + i8::MIN, + i8::MAX, + i8::MIN, + ); + let r: i8x8 = transmute(vbsl_s8(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbsl_s16() { + let a = u16x4::new(u16::MAX, 0, 1, 2); + let b = i16x4::new(i16::MAX, i16::MAX, i16::MAX, i16::MAX); + let c = i16x4::new(i16::MIN, i16::MIN, i16::MIN, i16::MIN); + let e = i16x4::new(i16::MAX, i16::MIN, i16::MIN | 1, i16::MIN | 2); + let r: i16x4 = transmute(vbsl_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbsl_s32() { + let a = u32x2::new(u32::MAX, 1); + let b = i32x2::new(i32::MAX, i32::MAX); + let c = i32x2::new(i32::MIN, i32::MIN); + let e = i32x2::new(i32::MAX, i32::MIN | 1); + let r: i32x2 = transmute(vbsl_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbsl_s64() { + let a = u64x1::new(1); + let b = i64x1::new(i64::MAX); + let c = i64x1::new(i64::MIN); + let e = i64x1::new(i64::MIN | 1); + let r: i64x1 = transmute(vbsl_s64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbsl_u8() { + let a = u8x8::new(u8::MAX, 1, u8::MAX, 2, u8::MAX, 0, u8::MAX, 0); + let b = u8x8::new( + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + ); + let c = u8x8::new( + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + ); + let e = u8x8::new(u8::MAX, 1, u8::MAX, 2, u8::MAX, u8::MIN, u8::MAX, u8::MIN); + let r: u8x8 = transmute(vbsl_u8(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbsl_u16() { + let a = u16x4::new(u16::MAX, 0, 1, 2); + let b = u16x4::new(u16::MAX, u16::MAX, u16::MAX, u16::MAX); + let c = u16x4::new(u16::MIN, u16::MIN, u16::MIN, u16::MIN); + let e = u16x4::new(u16::MAX, 0, 1, 2); + let r: u16x4 = transmute(vbsl_u16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbsl_u32() { + let a = u32x2::new(u32::MAX, 2); + let b = u32x2::new(u32::MAX, u32::MAX); + let c = u32x2::new(u32::MIN, u32::MIN); + let e = u32x2::new(u32::MAX, 2); + let r: u32x2 = transmute(vbsl_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbsl_u64() { + let a = u64x1::new(2); + let b = u64x1::new(u64::MAX); + let c = u64x1::new(u64::MIN); + let e = u64x1::new(2); + let r: u64x1 = transmute(vbsl_u64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbsl_f32() { + let a = u32x2::new(1, 0x80000000); + let b = 
f32x2::new(8388609f32, -1.23f32); + let c = f32x2::new(2097152f32, 2.34f32); + let e = f32x2::new(2097152.25f32, -2.34f32); + let r: f32x2 = transmute(vbsl_f32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbsl_p8() { + let a = u8x8::new(u8::MAX, 1, u8::MAX, 2, u8::MAX, 0, u8::MAX, 0); + let b = u8x8::new( + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + ); + let c = u8x8::new( + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + ); + let e = u8x8::new(u8::MAX, 1, u8::MAX, 2, u8::MAX, u8::MIN, u8::MAX, u8::MIN); + let r: u8x8 = transmute(vbsl_p8(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbsl_p16() { + let a = u16x4::new(u16::MAX, 0, 1, 2); + let b = u16x4::new(u16::MAX, u16::MAX, u16::MAX, u16::MAX); + let c = u16x4::new(u16::MIN, u16::MIN, u16::MIN, u16::MIN); + let e = u16x4::new(u16::MAX, 0, 1, 2); + let r: u16x4 = transmute(vbsl_p16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbslq_s8() { + let a = u8x16::new( + u8::MAX, + 1, + u8::MAX, + 2, + u8::MAX, + 0, + u8::MAX, + 0, + u8::MAX, + 0, + u8::MAX, + 0, + u8::MAX, + 0, + u8::MAX, + 0, + ); + let b = i8x16::new( + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + ); + let c = i8x16::new( + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + i8::MIN, + ); + let e = i8x16::new( + i8::MAX, + i8::MIN | 1, + i8::MAX, + i8::MIN | 2, + i8::MAX, + i8::MIN, + i8::MAX, + i8::MIN, + i8::MAX, + i8::MIN, + i8::MAX, + i8::MIN, + i8::MAX, + i8::MIN, + i8::MAX, + i8::MIN, + ); + let r: i8x16 = transmute(vbslq_s8(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbslq_s16() { + let a = u16x8::new(u16::MAX, 1, u16::MAX, 2, u16::MAX, 0, u16::MAX, 0); + let b = i16x8::new( + i16::MAX, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MAX, + ); + let c = i16x8::new( + i16::MIN, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MIN, + ); + let e = i16x8::new( + i16::MAX, + i16::MIN | 1, + i16::MAX, + i16::MIN | 2, + i16::MAX, + i16::MIN, + i16::MAX, + i16::MIN, + ); + let r: i16x8 = transmute(vbslq_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbslq_s32() { + let a = u32x4::new(u32::MAX, 1, u32::MAX, 2); + let b = i32x4::new(i32::MAX, i32::MAX, i32::MAX, i32::MAX); + let c = i32x4::new(i32::MIN, i32::MIN, i32::MIN, i32::MIN); + let e = i32x4::new(i32::MAX, i32::MIN | 1, i32::MAX, i32::MIN | 2); + let r: i32x4 = transmute(vbslq_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbslq_s64() { + let a = u64x2::new(u64::MAX, 1); + let b = i64x2::new(i64::MAX, i64::MAX); + let c = i64x2::new(i64::MIN, i64::MIN); + let e = i64x2::new(i64::MAX, i64::MIN | 1); + let r: i64x2 = transmute(vbslq_s64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbslq_u8() { + let a = u8x16::new( + u8::MAX, + 1, + u8::MAX, + 
2, + u8::MAX, + 0, + u8::MAX, + 0, + u8::MAX, + 0, + u8::MAX, + 0, + u8::MAX, + 0, + u8::MAX, + 0, + ); + let b = u8x16::new( + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + ); + let c = u8x16::new( + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + ); + let e = u8x16::new( + u8::MAX, + 1, + u8::MAX, + 2, + u8::MAX, + u8::MIN, + u8::MAX, + u8::MIN, + u8::MAX, + u8::MIN, + u8::MAX, + u8::MIN, + u8::MAX, + u8::MIN, + u8::MAX, + u8::MIN, + ); + let r: u8x16 = transmute(vbslq_u8(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbslq_u16() { + let a = u16x8::new(u16::MAX, 1, u16::MAX, 2, u16::MAX, 0, u16::MAX, 0); + let b = u16x8::new( + u16::MAX, + u16::MAX, + u16::MAX, + u16::MAX, + u16::MAX, + u16::MAX, + u16::MAX, + u16::MAX, + ); + let c = u16x8::new( + u16::MIN, + u16::MIN, + u16::MIN, + u16::MIN, + u16::MIN, + u16::MIN, + u16::MIN, + u16::MIN, + ); + let e = u16x8::new( + u16::MAX, + 1, + u16::MAX, + 2, + u16::MAX, + u16::MIN, + u16::MAX, + u16::MIN, + ); + let r: u16x8 = transmute(vbslq_u16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbslq_u32() { + let a = u32x4::new(u32::MAX, 1, u32::MAX, 2); + let b = u32x4::new(u32::MAX, u32::MAX, u32::MAX, u32::MAX); + let c = u32x4::new(u32::MIN, u32::MIN, u32::MIN, u32::MIN); + let e = u32x4::new(u32::MAX, 1, u32::MAX, 2); + let r: u32x4 = transmute(vbslq_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbslq_u64() { + let a = u64x2::new(u64::MAX, 1); + let b = u64x2::new(u64::MAX, u64::MAX); + let c = u64x2::new(u64::MIN, u64::MIN); + let e = u64x2::new(u64::MAX, 1); + let r: u64x2 = transmute(vbslq_u64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbslq_f32() { + let a = u32x4::new(u32::MAX, 0, 1, 0x80000000); + let b = f32x4::new(-1.23f32, -1.23f32, 8388609f32, -1.23f32); + let c = f32x4::new(2.34f32, 2.34f32, 2097152f32, 2.34f32); + let e = f32x4::new(-1.23f32, 2.34f32, 2097152.25f32, -2.34f32); + let r: f32x4 = transmute(vbslq_f32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbslq_p8() { + let a = u8x16::new( + u8::MAX, + 1, + u8::MAX, + 2, + u8::MAX, + 0, + u8::MAX, + 0, + u8::MAX, + 0, + u8::MAX, + 0, + u8::MAX, + 0, + u8::MAX, + 0, + ); + let b = u8x16::new( + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + u8::MAX, + ); + let c = u8x16::new( + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + u8::MIN, + ); + let e = u8x16::new( + u8::MAX, + 1, + u8::MAX, + 2, + u8::MAX, + u8::MIN, + u8::MAX, + u8::MIN, + u8::MAX, + u8::MIN, + u8::MAX, + u8::MIN, + u8::MAX, + u8::MIN, + u8::MAX, + u8::MIN, + ); + let r: u8x16 = transmute(vbslq_p8(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vbslq_p16() { + let a = u16x8::new(u16::MAX, 1, u16::MAX, 2, 
u16::MAX, 0, u16::MAX, 0); + let b = u16x8::new( + u16::MAX, + u16::MAX, + u16::MAX, + u16::MAX, + u16::MAX, + u16::MAX, + u16::MAX, + u16::MAX, + ); + let c = u16x8::new( + u16::MIN, + u16::MIN, + u16::MIN, + u16::MIN, + u16::MIN, + u16::MIN, + u16::MIN, + u16::MIN, + ); + let e = u16x8::new( + u16::MAX, + 1, + u16::MAX, + 2, + u16::MAX, + u16::MIN, + u16::MAX, + u16::MIN, + ); + let r: u16x8 = transmute(vbslq_p16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorn_s8() { + let a = i8x8::new(0, -1, -2, -3, -4, -5, -6, -7); + let b = i8x8::new(-2, -2, -2, -2, -2, -2, -2, -2); + let e = i8x8::new(1, -1, -1, -3, -3, -5, -5, -7); + let r: i8x8 = transmute(vorn_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vornq_s8() { + let a = i8x16::new( + 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, + ); + let b = i8x16::new( + -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, + ); + let e = i8x16::new( + 1, -1, -1, -3, -3, -5, -5, -7, -7, -9, -9, -11, -11, -13, -13, -15, + ); + let r: i8x16 = transmute(vornq_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorn_s16() { + let a = i16x4::new(0, -1, -2, -3); + let b = i16x4::new(-2, -2, -2, -2); + let e = i16x4::new(1, -1, -1, -3); + let r: i16x4 = transmute(vorn_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vornq_s16() { + let a = i16x8::new(0, -1, -2, -3, -4, -5, -6, -7); + let b = i16x8::new(-2, -2, -2, -2, -2, -2, -2, -2); + let e = i16x8::new(1, -1, -1, -3, -3, -5, -5, -7); + let r: i16x8 = transmute(vornq_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorn_s32() { + let a = i32x2::new(0, -1); + let b = i32x2::new(-2, -2); + let e = i32x2::new(1, -1); + let r: i32x2 = transmute(vorn_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vornq_s32() { + let a = i32x4::new(0, -1, -2, -3); + let b = i32x4::new(-2, -2, -2, -2); + let e = i32x4::new(1, -1, -1, -3); + let r: i32x4 = transmute(vornq_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorn_s64() { + let a = i64x1::new(0); + let b = i64x1::new(-2); + let e = i64x1::new(1); + let r: i64x1 = transmute(vorn_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vornq_s64() { + let a = i64x2::new(0, -1); + let b = i64x2::new(-2, -2); + let e = i64x2::new(1, -1); + let r: i64x2 = transmute(vornq_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorn_u8() { + let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let t = u8::MAX - 1; + let b = u8x8::new(t, t, t, t, t, t, t, t); + let e = u8x8::new(1, 1, 3, 3, 5, 5, 7, 7); + let r: u8x8 = transmute(vorn_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vornq_u8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let t = u8::MAX - 1; + let b = u8x16::new(t, t, t, t, t, t, t, t, t, t, t, t, t, t, t, t); + let e = u8x16::new(1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15); + let r: u8x16 = transmute(vornq_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe 
fn test_vorn_u16() { + let a = u16x4::new(0, 1, 2, 3); + let t = u16::MAX - 1; + let b = u16x4::new(t, t, t, t); + let e = u16x4::new(1, 1, 3, 3); + let r: u16x4 = transmute(vorn_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vornq_u16() { + let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let t = u16::MAX - 1; + let b = u16x8::new(t, t, t, t, t, t, t, t); + let e = u16x8::new(1, 1, 3, 3, 5, 5, 7, 7); + let r: u16x8 = transmute(vornq_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorn_u32() { + let a = u32x2::new(0, 1); + let t = u32::MAX - 1; + let b = u32x2::new(t, t); + let e = u32x2::new(1, 1); + let r: u32x2 = transmute(vorn_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vornq_u32() { + let a = u32x4::new(0, 1, 2, 3); + let t = u32::MAX - 1; + let b = u32x4::new(t, t, t, t); + let e = u32x4::new(1, 1, 3, 3); + let r: u32x4 = transmute(vornq_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorn_u64() { + let a = u64x1::new(0); + let t = u64::MAX - 1; + let b = u64x1::new(t); + let e = u64x1::new(1); + let r: u64x1 = transmute(vorn_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vornq_u64() { + let a = u64x2::new(0, 1); + let t = u64::MAX - 1; + let b = u64x2::new(t, t); + let e = u64x2::new(1, 1); + let r: u64x2 = transmute(vornq_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovn_s16() { + let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: i8x8 = transmute(vmovn_s16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovn_s32() { + let a = i32x4::new(1, 2, 3, 4); + let e = i16x4::new(1, 2, 3, 4); + let r: i16x4 = transmute(vmovn_s32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovn_s64() { + let a = i64x2::new(1, 2); + let e = i32x2::new(1, 2); + let r: i32x2 = transmute(vmovn_s64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovn_u16() { + let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: u8x8 = transmute(vmovn_u16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovn_u32() { + let a = u32x4::new(1, 2, 3, 4); + let e = u16x4::new(1, 2, 3, 4); + let r: u16x4 = transmute(vmovn_u32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovn_u64() { + let a = u64x2::new(1, 2); + let e = u32x2::new(1, 2); + let r: u32x2 = transmute(vmovn_u64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovl_s8() { + let e = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: i16x8 = transmute(vmovl_s8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovl_s16() { + let e = i32x4::new(1, 2, 3, 4); + let a = i16x4::new(1, 2, 3, 4); + let r: i32x4 = transmute(vmovl_s16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovl_s32() { + let e = i64x2::new(1, 2); + let a = i32x2::new(1, 2); + let r: i64x2 = transmute(vmovl_s32(transmute(a))); + assert_eq!(r, e); + } + + 
#[simd_test(enable = "neon")] + unsafe fn test_vmovl_u8() { + let e = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: u16x8 = transmute(vmovl_u8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovl_u16() { + let e = u32x4::new(1, 2, 3, 4); + let a = u16x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vmovl_u16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmovl_u32() { + let e = u64x2::new(1, 2); + let a = u32x2::new(1, 2); + let r: u64x2 = transmute(vmovl_u32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vand_s8() { + test_bit_s8(|i, j| vand_s8(i, j), |a: i8, b: i8| -> i8 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vandq_s8() { + testq_bit_s8(|i, j| vandq_s8(i, j), |a: i8, b: i8| -> i8 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vand_s16() { + test_bit_s16(|i, j| vand_s16(i, j), |a: i16, b: i16| -> i16 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vandq_s16() { + testq_bit_s16(|i, j| vandq_s16(i, j), |a: i16, b: i16| -> i16 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vand_s32() { + test_bit_s32(|i, j| vand_s32(i, j), |a: i32, b: i32| -> i32 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vandq_s32() { + testq_bit_s32(|i, j| vandq_s32(i, j), |a: i32, b: i32| -> i32 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vand_s64() { + test_bit_s64(|i, j| vand_s64(i, j), |a: i64, b: i64| -> i64 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vandq_s64() { + testq_bit_s64(|i, j| vandq_s64(i, j), |a: i64, b: i64| -> i64 { a & b }); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vand_u8() { + test_bit_u8(|i, j| vand_u8(i, j), |a: u8, b: u8| -> u8 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vandq_u8() { + testq_bit_u8(|i, j| vandq_u8(i, j), |a: u8, b: u8| -> u8 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vand_u16() { + test_bit_u16(|i, j| vand_u16(i, j), |a: u16, b: u16| -> u16 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vandq_u16() { + testq_bit_u16(|i, j| vandq_u16(i, j), |a: u16, b: u16| -> u16 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vand_u32() { + test_bit_u32(|i, j| vand_u32(i, j), |a: u32, b: u32| -> u32 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vandq_u32() { + testq_bit_u32(|i, j| vandq_u32(i, j), |a: u32, b: u32| -> u32 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vand_u64() { + test_bit_u64(|i, j| vand_u64(i, j), |a: u64, b: u64| -> u64 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vandq_u64() { + testq_bit_u64(|i, j| vandq_u64(i, j), |a: u64, b: u64| -> u64 { a & b }); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_s8() { + test_bit_s8(|i, j| vorr_s8(i, j), |a: i8, b: i8| -> i8 { a | b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_s8() { + testq_bit_s8(|i, j| vorrq_s8(i, j), |a: i8, b: i8| -> i8 { a | b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vorr_s16() { + test_bit_s16(|i, j| vorr_s16(i, j), |a: i16, b: i16| -> i16 { a | b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_s16() { + testq_bit_s16(|i, j| vorrq_s16(i, j), |a: i16, b: i16| -> i16 { a | b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vorr_s32() { + test_bit_s32(|i, j| vorr_s32(i, j), |a: i32, b: i32| -> 
i32 { a | b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_s32() { + testq_bit_s32(|i, j| vorrq_s32(i, j), |a: i32, b: i32| -> i32 { a | b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vorr_s64() { + test_bit_s64(|i, j| vorr_s64(i, j), |a: i64, b: i64| -> i64 { a | b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_s64() { + testq_bit_s64(|i, j| vorrq_s64(i, j), |a: i64, b: i64| -> i64 { a | b }); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_u8() { + test_bit_u8(|i, j| vorr_u8(i, j), |a: u8, b: u8| -> u8 { a | b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_u8() { + testq_bit_u8(|i, j| vorrq_u8(i, j), |a: u8, b: u8| -> u8 { a | b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vorr_u16() { + test_bit_u16(|i, j| vorr_u16(i, j), |a: u16, b: u16| -> u16 { a | b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_u16() { + testq_bit_u16(|i, j| vorrq_u16(i, j), |a: u16, b: u16| -> u16 { a | b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vorr_u32() { + test_bit_u32(|i, j| vorr_u32(i, j), |a: u32, b: u32| -> u32 { a | b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_u32() { + testq_bit_u32(|i, j| vorrq_u32(i, j), |a: u32, b: u32| -> u32 { a | b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vorr_u64() { + test_bit_u64(|i, j| vorr_u64(i, j), |a: u64, b: u64| -> u64 { a | b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_u64() { + testq_bit_u64(|i, j| vorrq_u64(i, j), |a: u64, b: u64| -> u64 { a | b }); + } + + #[simd_test(enable = "neon")] + unsafe fn test_veor_s8() { + test_bit_s8(|i, j| veor_s8(i, j), |a: i8, b: i8| -> i8 { a ^ b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_veorq_s8() { + testq_bit_s8(|i, j| veorq_s8(i, j), |a: i8, b: i8| -> i8 { a ^ b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_veor_s16() { + test_bit_s16(|i, j| veor_s16(i, j), |a: i16, b: i16| -> i16 { a ^ b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_veorq_s16() { + testq_bit_s16(|i, j| veorq_s16(i, j), |a: i16, b: i16| -> i16 { a ^ b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_veor_s32() { + test_bit_s32(|i, j| veor_s32(i, j), |a: i32, b: i32| -> i32 { a ^ b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_veorq_s32() { + testq_bit_s32(|i, j| veorq_s32(i, j), |a: i32, b: i32| -> i32 { a ^ b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_veor_s64() { + test_bit_s64(|i, j| veor_s64(i, j), |a: i64, b: i64| -> i64 { a ^ b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_veorq_s64() { + testq_bit_s64(|i, j| veorq_s64(i, j), |a: i64, b: i64| -> i64 { a ^ b }); + } + + #[simd_test(enable = "neon")] + unsafe fn test_veor_u8() { + test_bit_u8(|i, j| veor_u8(i, j), |a: u8, b: u8| -> u8 { a ^ b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_veorq_u8() { + testq_bit_u8(|i, j| veorq_u8(i, j), |a: u8, b: u8| -> u8 { a ^ b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_veor_u16() { + test_bit_u16(|i, j| veor_u16(i, j), |a: u16, b: u16| -> u16 { a ^ b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_veorq_u16() { + testq_bit_u16(|i, j| veorq_u16(i, j), |a: u16, b: u16| -> u16 { a ^ b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_veor_u32() { + test_bit_u32(|i, j| veor_u32(i, j), |a: u32, b: u32| -> u32 { a ^ b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_veorq_u32() { + testq_bit_u32(|i, j| veorq_u32(i, j), |a: u32, b: u32| -> u32 { a ^ b }); + } + #[simd_test(enable = 
"neon")] + unsafe fn test_veor_u64() { + test_bit_u64(|i, j| veor_u64(i, j), |a: u64, b: u64| -> u64 { a ^ b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_veorq_u64() { + testq_bit_u64(|i, j| veorq_u64(i, j), |a: u64, b: u64| -> u64 { a ^ b }); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceq_s8() { + test_cmp_s8( + |i, j| vceq_s8(i, j), + |a: i8, b: i8| -> u8 { if a == b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vceqq_s8() { + testq_cmp_s8( + |i, j| vceqq_s8(i, j), + |a: i8, b: i8| -> u8 { if a == b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vceq_s16() { + test_cmp_s16( + |i, j| vceq_s16(i, j), + |a: i16, b: i16| -> u16 { if a == b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vceqq_s16() { + testq_cmp_s16( + |i, j| vceqq_s16(i, j), + |a: i16, b: i16| -> u16 { if a == b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vceq_s32() { + test_cmp_s32( + |i, j| vceq_s32(i, j), + |a: i32, b: i32| -> u32 { if a == b { 0xFFFFFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vceqq_s32() { + testq_cmp_s32( + |i, j| vceqq_s32(i, j), + |a: i32, b: i32| -> u32 { if a == b { 0xFFFFFFFF } else { 0 } }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceq_u8() { + test_cmp_u8( + |i, j| vceq_u8(i, j), + |a: u8, b: u8| -> u8 { if a == b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vceqq_u8() { + testq_cmp_u8( + |i, j| vceqq_u8(i, j), + |a: u8, b: u8| -> u8 { if a == b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vceq_u16() { + test_cmp_u16( + |i, j| vceq_u16(i, j), + |a: u16, b: u16| -> u16 { if a == b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vceqq_u16() { + testq_cmp_u16( + |i, j| vceqq_u16(i, j), + |a: u16, b: u16| -> u16 { if a == b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vceq_u32() { + test_cmp_u32( + |i, j| vceq_u32(i, j), + |a: u32, b: u32| -> u32 { if a == b { 0xFFFFFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vceqq_u32() { + testq_cmp_u32( + |i, j| vceqq_u32(i, j), + |a: u32, b: u32| -> u32 { if a == b { 0xFFFFFFFF } else { 0 } }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceq_f32() { + test_cmp_f32( + |i, j| vcge_f32(i, j), + |a: f32, b: f32| -> u32 { if a == b { 0xFFFFFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vceqq_f32() { + testq_cmp_f32( + |i, j| vcgeq_f32(i, j), + |a: f32, b: f32| -> u32 { if a == b { 0xFFFFFFFF } else { 0 } }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcgt_s8() { + test_cmp_s8( + |i, j| vcgt_s8(i, j), + |a: i8, b: i8| -> u8 { if a > b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgtq_s8() { + testq_cmp_s8( + |i, j| vcgtq_s8(i, j), + |a: i8, b: i8| -> u8 { if a > b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgt_s16() { + test_cmp_s16( + |i, j| vcgt_s16(i, j), + |a: i16, b: i16| -> u16 { if a > b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgtq_s16() { + testq_cmp_s16( + |i, j| vcgtq_s16(i, j), + |a: i16, b: i16| -> u16 { if a > b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgt_s32() { + test_cmp_s32( + |i, j| vcgt_s32(i, j), + |a: i32, b: i32| -> u32 { if a > 
b { 0xFFFFFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgtq_s32() { + testq_cmp_s32( + |i, j| vcgtq_s32(i, j), + |a: i32, b: i32| -> u32 { if a > b { 0xFFFFFFFF } else { 0 } }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcgt_u8() { + test_cmp_u8( + |i, j| vcgt_u8(i, j), + |a: u8, b: u8| -> u8 { if a > b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgtq_u8() { + testq_cmp_u8( + |i, j| vcgtq_u8(i, j), + |a: u8, b: u8| -> u8 { if a > b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgt_u16() { + test_cmp_u16( + |i, j| vcgt_u16(i, j), + |a: u16, b: u16| -> u16 { if a > b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgtq_u16() { + testq_cmp_u16( + |i, j| vcgtq_u16(i, j), + |a: u16, b: u16| -> u16 { if a > b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgt_u32() { + test_cmp_u32( + |i, j| vcgt_u32(i, j), + |a: u32, b: u32| -> u32 { if a > b { 0xFFFFFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgtq_u32() { + testq_cmp_u32( + |i, j| vcgtq_u32(i, j), + |a: u32, b: u32| -> u32 { if a > b { 0xFFFFFFFF } else { 0 } }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcgt_f32() { + test_cmp_f32( + |i, j| vcgt_f32(i, j), + |a: f32, b: f32| -> u32 { if a > b { 0xFFFFFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgtq_f32() { + testq_cmp_f32( + |i, j| vcgtq_f32(i, j), + |a: f32, b: f32| -> u32 { if a > b { 0xFFFFFFFF } else { 0 } }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclt_s8() { + test_cmp_s8( + |i, j| vclt_s8(i, j), + |a: i8, b: i8| -> u8 { if a < b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcltq_s8() { + testq_cmp_s8( + |i, j| vcltq_s8(i, j), + |a: i8, b: i8| -> u8 { if a < b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vclt_s16() { + test_cmp_s16( + |i, j| vclt_s16(i, j), + |a: i16, b: i16| -> u16 { if a < b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcltq_s16() { + testq_cmp_s16( + |i, j| vcltq_s16(i, j), + |a: i16, b: i16| -> u16 { if a < b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vclt_s32() { + test_cmp_s32( + |i, j| vclt_s32(i, j), + |a: i32, b: i32| -> u32 { if a < b { 0xFFFFFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcltq_s32() { + testq_cmp_s32( + |i, j| vcltq_s32(i, j), + |a: i32, b: i32| -> u32 { if a < b { 0xFFFFFFFF } else { 0 } }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclt_u8() { + test_cmp_u8( + |i, j| vclt_u8(i, j), + |a: u8, b: u8| -> u8 { if a < b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcltq_u8() { + testq_cmp_u8( + |i, j| vcltq_u8(i, j), + |a: u8, b: u8| -> u8 { if a < b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vclt_u16() { + test_cmp_u16( + |i, j| vclt_u16(i, j), + |a: u16, b: u16| -> u16 { if a < b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcltq_u16() { + testq_cmp_u16( + |i, j| vcltq_u16(i, j), + |a: u16, b: u16| -> u16 { if a < b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vclt_u32() { + test_cmp_u32( + |i, j| vclt_u32(i, j), + |a: u32, b: u32| -> u32 { if a < b { 0xFFFFFFFF } else { 0 } }, + ); + } + 
#[simd_test(enable = "neon")] + unsafe fn test_vcltq_u32() { + testq_cmp_u32( + |i, j| vcltq_u32(i, j), + |a: u32, b: u32| -> u32 { if a < b { 0xFFFFFFFF } else { 0 } }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclt_f32() { + test_cmp_f32( + |i, j| vclt_f32(i, j), + |a: f32, b: f32| -> u32 { if a < b { 0xFFFFFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcltq_f32() { + testq_cmp_f32( + |i, j| vcltq_f32(i, j), + |a: f32, b: f32| -> u32 { if a < b { 0xFFFFFFFF } else { 0 } }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcle_s8() { + test_cmp_s8( + |i, j| vcle_s8(i, j), + |a: i8, b: i8| -> u8 { if a <= b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcleq_s8() { + testq_cmp_s8( + |i, j| vcleq_s8(i, j), + |a: i8, b: i8| -> u8 { if a <= b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcle_s16() { + test_cmp_s16( + |i, j| vcle_s16(i, j), + |a: i16, b: i16| -> u16 { if a <= b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcleq_s16() { + testq_cmp_s16( + |i, j| vcleq_s16(i, j), + |a: i16, b: i16| -> u16 { if a <= b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcle_s32() { + test_cmp_s32( + |i, j| vcle_s32(i, j), + |a: i32, b: i32| -> u32 { if a <= b { 0xFFFFFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcleq_s32() { + testq_cmp_s32( + |i, j| vcleq_s32(i, j), + |a: i32, b: i32| -> u32 { if a <= b { 0xFFFFFFFF } else { 0 } }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcle_u8() { + test_cmp_u8( + |i, j| vcle_u8(i, j), + |a: u8, b: u8| -> u8 { if a <= b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcleq_u8() { + testq_cmp_u8( + |i, j| vcleq_u8(i, j), + |a: u8, b: u8| -> u8 { if a <= b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcle_u16() { + test_cmp_u16( + |i, j| vcle_u16(i, j), + |a: u16, b: u16| -> u16 { if a <= b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcleq_u16() { + testq_cmp_u16( + |i, j| vcleq_u16(i, j), + |a: u16, b: u16| -> u16 { if a <= b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcle_u32() { + test_cmp_u32( + |i, j| vcle_u32(i, j), + |a: u32, b: u32| -> u32 { if a <= b { 0xFFFFFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcleq_u32() { + testq_cmp_u32( + |i, j| vcleq_u32(i, j), + |a: u32, b: u32| -> u32 { if a <= b { 0xFFFFFFFF } else { 0 } }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcle_f32() { + test_cmp_f32( + |i, j| vcle_f32(i, j), + |a: f32, b: f32| -> u32 { if a <= b { 0xFFFFFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcleq_f32() { + testq_cmp_f32( + |i, j| vcleq_f32(i, j), + |a: f32, b: f32| -> u32 { if a <= b { 0xFFFFFFFF } else { 0 } }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcge_s8() { + test_cmp_s8( + |i, j| vcge_s8(i, j), + |a: i8, b: i8| -> u8 { if a >= b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgeq_s8() { + testq_cmp_s8( + |i, j| vcgeq_s8(i, j), + |a: i8, b: i8| -> u8 { if a >= b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcge_s16() { + test_cmp_s16( + |i, j| vcge_s16(i, j), + |a: i16, b: i16| -> u16 { if a >= b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] 
+ unsafe fn test_vcgeq_s16() { + testq_cmp_s16( + |i, j| vcgeq_s16(i, j), + |a: i16, b: i16| -> u16 { if a >= b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcge_s32() { + test_cmp_s32( + |i, j| vcge_s32(i, j), + |a: i32, b: i32| -> u32 { if a >= b { 0xFFFFFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgeq_s32() { + testq_cmp_s32( + |i, j| vcgeq_s32(i, j), + |a: i32, b: i32| -> u32 { if a >= b { 0xFFFFFFFF } else { 0 } }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcge_u8() { + test_cmp_u8( + |i, j| vcge_u8(i, j), + |a: u8, b: u8| -> u8 { if a >= b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgeq_u8() { + testq_cmp_u8( + |i, j| vcgeq_u8(i, j), + |a: u8, b: u8| -> u8 { if a >= b { 0xFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcge_u16() { + test_cmp_u16( + |i, j| vcge_u16(i, j), + |a: u16, b: u16| -> u16 { if a >= b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgeq_u16() { + testq_cmp_u16( + |i, j| vcgeq_u16(i, j), + |a: u16, b: u16| -> u16 { if a >= b { 0xFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcge_u32() { + test_cmp_u32( + |i, j| vcge_u32(i, j), + |a: u32, b: u32| -> u32 { if a >= b { 0xFFFFFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgeq_u32() { + testq_cmp_u32( + |i, j| vcgeq_u32(i, j), + |a: u32, b: u32| -> u32 { if a >= b { 0xFFFFFFFF } else { 0 } }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcge_f32() { + test_cmp_f32( + |i, j| vcge_f32(i, j), + |a: f32, b: f32| -> u32 { if a >= b { 0xFFFFFFFF } else { 0 } }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vcgeq_f32() { + testq_cmp_f32( + |i, j| vcgeq_f32(i, j), + |a: f32, b: f32| -> u32 { if a >= b { 0xFFFFFFFF } else { 0 } }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqsub_s8() { + test_ari_s8( + |i, j| vqsub_s8(i, j), + |a: i8, b: i8| -> i8 { a.saturating_sub(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqsubq_s8() { + testq_ari_s8( + |i, j| vqsubq_s8(i, j), + |a: i8, b: i8| -> i8 { a.saturating_sub(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqsub_s16() { + test_ari_s16( + |i, j| vqsub_s16(i, j), + |a: i16, b: i16| -> i16 { a.saturating_sub(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqsubq_s16() { + testq_ari_s16( + |i, j| vqsubq_s16(i, j), + |a: i16, b: i16| -> i16 { a.saturating_sub(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqsub_s32() { + test_ari_s32( + |i, j| vqsub_s32(i, j), + |a: i32, b: i32| -> i32 { a.saturating_sub(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqsubq_s32() { + testq_ari_s32( + |i, j| vqsubq_s32(i, j), + |a: i32, b: i32| -> i32 { a.saturating_sub(b) }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqsub_u8() { + test_ari_u8( + |i, j| vqsub_u8(i, j), + |a: u8, b: u8| -> u8 { a.saturating_sub(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqsubq_u8() { + testq_ari_u8( + |i, j| vqsubq_u8(i, j), + |a: u8, b: u8| -> u8 { a.saturating_sub(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqsub_u16() { + test_ari_u16( + |i, j| vqsub_u16(i, j), + |a: u16, b: u16| -> u16 { a.saturating_sub(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqsubq_u16() { + testq_ari_u16( + |i, j| vqsubq_u16(i, j), + |a: u16, b: u16| -> u16 { 
a.saturating_sub(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqsub_u32() { + test_ari_u32( + |i, j| vqsub_u32(i, j), + |a: u32, b: u32| -> u32 { a.saturating_sub(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqsubq_u32() { + testq_ari_u32( + |i, j| vqsubq_u32(i, j), + |a: u32, b: u32| -> u32 { a.saturating_sub(b) }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vhadd_s8() { + test_ari_s8(|i, j| vhadd_s8(i, j), |a: i8, b: i8| -> i8 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhaddq_s8() { + testq_ari_s8(|i, j| vhaddq_s8(i, j), |a: i8, b: i8| -> i8 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhadd_s16() { + test_ari_s16(|i, j| vhadd_s16(i, j), |a: i16, b: i16| -> i16 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhaddq_s16() { + testq_ari_s16(|i, j| vhaddq_s16(i, j), |a: i16, b: i16| -> i16 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhadd_s32() { + test_ari_s32(|i, j| vhadd_s32(i, j), |a: i32, b: i32| -> i32 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhaddq_s32() { + testq_ari_s32(|i, j| vhaddq_s32(i, j), |a: i32, b: i32| -> i32 { a & b }); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vhadd_u8() { + test_ari_u8(|i, j| vhadd_u8(i, j), |a: u8, b: u8| -> u8 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhaddq_u8() { + testq_ari_u8(|i, j| vhaddq_u8(i, j), |a: u8, b: u8| -> u8 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhadd_u16() { + test_ari_u16(|i, j| vhadd_u16(i, j), |a: u16, b: u16| -> u16 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhaddq_u16() { + testq_ari_u16(|i, j| vhaddq_u16(i, j), |a: u16, b: u16| -> u16 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhadd_u32() { + test_ari_u32(|i, j| vhadd_u32(i, j), |a: u32, b: u32| -> u32 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhaddq_u32() { + testq_ari_u32(|i, j| vhaddq_u32(i, j), |a: u32, b: u32| -> u32 { a & b }); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vrhadd_s8() { + test_ari_s8(|i, j| vrhadd_s8(i, j), |a: i8, b: i8| -> i8 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrhaddq_s8() { + testq_ari_s8(|i, j| vrhaddq_s8(i, j), |a: i8, b: i8| -> i8 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrhadd_s16() { + test_ari_s16(|i, j| vrhadd_s16(i, j), |a: i16, b: i16| -> i16 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrhaddq_s16() { + testq_ari_s16(|i, j| vrhaddq_s16(i, j), |a: i16, b: i16| -> i16 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrhadd_s32() { + test_ari_s32(|i, j| vrhadd_s32(i, j), |a: i32, b: i32| -> i32 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrhaddq_s32() { + testq_ari_s32(|i, j| vrhaddq_s32(i, j), |a: i32, b: i32| -> i32 { a & b }); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vrhadd_u8() { + test_ari_u8(|i, j| vrhadd_u8(i, j), |a: u8, b: u8| -> u8 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrhaddq_u8() { + testq_ari_u8(|i, j| vrhaddq_u8(i, j), |a: u8, b: u8| -> u8 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrhadd_u16() { + test_ari_u16(|i, j| vrhadd_u16(i, j), |a: u16, b: u16| -> u16 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrhaddq_u16() { + testq_ari_u16(|i, j| vrhaddq_u16(i, j), |a: u16, b: u16| -> u16 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn 
test_vrhadd_u32() { + test_ari_u32(|i, j| vrhadd_u32(i, j), |a: u32, b: u32| -> u32 { a & b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrhaddq_u32() { + testq_ari_u32(|i, j| vrhaddq_u32(i, j), |a: u32, b: u32| -> u32 { a & b }); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqadd_s8() { + test_ari_s8( + |i, j| vqadd_s8(i, j), + |a: i8, b: i8| -> i8 { a.saturating_add(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqaddq_s8() { + testq_ari_s8( + |i, j| vqaddq_s8(i, j), + |a: i8, b: i8| -> i8 { a.saturating_add(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqadd_s16() { + test_ari_s16( + |i, j| vqadd_s16(i, j), + |a: i16, b: i16| -> i16 { a.saturating_add(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqaddq_s16() { + testq_ari_s16( + |i, j| vqaddq_s16(i, j), + |a: i16, b: i16| -> i16 { a.saturating_add(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqadd_s32() { + test_ari_s32( + |i, j| vqadd_s32(i, j), + |a: i32, b: i32| -> i32 { a.saturating_add(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqaddq_s32() { + testq_ari_s32( + |i, j| vqaddq_s32(i, j), + |a: i32, b: i32| -> i32 { a.saturating_add(b) }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqadd_u8() { + test_ari_u8( + |i, j| vqadd_u8(i, j), + |a: u8, b: u8| -> u8 { a.saturating_add(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqaddq_u8() { + testq_ari_u8( + |i, j| vqaddq_u8(i, j), + |a: u8, b: u8| -> u8 { a.saturating_add(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqadd_u16() { + test_ari_u16( + |i, j| vqadd_u16(i, j), + |a: u16, b: u16| -> u16 { a.saturating_add(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqaddq_u16() { + testq_ari_u16( + |i, j| vqaddq_u16(i, j), + |a: u16, b: u16| -> u16 { a.saturating_add(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqadd_u32() { + test_ari_u32( + |i, j| vqadd_u32(i, j), + |a: u32, b: u32| -> u32 { a.saturating_add(b) }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vqaddq_u32() { + testq_ari_u32( + |i, j| vqaddq_u32(i, j), + |a: u32, b: u32| -> u32 { a.saturating_add(b) }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_s8() { + test_ari_s8( + |i, j| vmul_s8(i, j), + |a: i8, b: i8| -> i8 { a.overflowing_mul(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_s8() { + testq_ari_s8( + |i, j| vmulq_s8(i, j), + |a: i8, b: i8| -> i8 { a.overflowing_mul(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vmul_s16() { + test_ari_s16( + |i, j| vmul_s16(i, j), + |a: i16, b: i16| -> i16 { a.overflowing_mul(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_s16() { + testq_ari_s16( + |i, j| vmulq_s16(i, j), + |a: i16, b: i16| -> i16 { a.overflowing_mul(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vmul_s32() { + test_ari_s32( + |i, j| vmul_s32(i, j), + |a: i32, b: i32| -> i32 { a.overflowing_mul(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_s32() { + testq_ari_s32( + |i, j| vmulq_s32(i, j), + |a: i32, b: i32| -> i32 { a.overflowing_mul(b).0 }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_u8() { + test_ari_u8( + |i, j| vmul_u8(i, j), + |a: u8, b: u8| -> u8 { a.overflowing_mul(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_u8() { + testq_ari_u8( + |i, j| vmulq_u8(i, j), + |a: u8, b: u8| -> u8 { a.overflowing_mul(b).0 }, + ); + } + 
#[simd_test(enable = "neon")] + unsafe fn test_vmul_u16() { + test_ari_u16( + |i, j| vmul_u16(i, j), + |a: u16, b: u16| -> u16 { a.overflowing_mul(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_u16() { + testq_ari_u16( + |i, j| vmulq_u16(i, j), + |a: u16, b: u16| -> u16 { a.overflowing_mul(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vmul_u32() { + test_ari_u32( + |i, j| vmul_u32(i, j), + |a: u32, b: u32| -> u32 { a.overflowing_mul(b).0 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_u32() { + testq_ari_u32( + |i, j| vmulq_u32(i, j), + |a: u32, b: u32| -> u32 { a.overflowing_mul(b).0 }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_f32() { + test_ari_f32(|i, j| vmul_f32(i, j), |a: f32, b: f32| -> f32 { a * b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_f32() { + testq_ari_f32(|i, j| vmulq_f32(i, j), |a: f32, b: f32| -> f32 { a * b }); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vsub_s8() { + test_ari_s8(|i, j| vsub_s8(i, j), |a: i8, b: i8| -> i8 { a - b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsubq_s8() { + testq_ari_s8(|i, j| vsubq_s8(i, j), |a: i8, b: i8| -> i8 { a - b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsub_s16() { + test_ari_s16(|i, j| vsub_s16(i, j), |a: i16, b: i16| -> i16 { a - b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsubq_s16() { + testq_ari_s16(|i, j| vsubq_s16(i, j), |a: i16, b: i16| -> i16 { a - b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsub_s32() { + test_ari_s32(|i, j| vsub_s32(i, j), |a: i32, b: i32| -> i32 { a - b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsubq_s32() { + testq_ari_s32(|i, j| vsubq_s32(i, j), |a: i32, b: i32| -> i32 { a - b }); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vsub_u8() { + test_ari_u8(|i, j| vsub_u8(i, j), |a: u8, b: u8| -> u8 { a - b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsubq_u8() { + testq_ari_u8(|i, j| vsubq_u8(i, j), |a: u8, b: u8| -> u8 { a - b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsub_u16() { + test_ari_u16(|i, j| vsub_u16(i, j), |a: u16, b: u16| -> u16 { a - b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsubq_u16() { + testq_ari_u16(|i, j| vsubq_u16(i, j), |a: u16, b: u16| -> u16 { a - b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsub_u32() { + test_ari_u32(|i, j| vsub_u32(i, j), |a: u32, b: u32| -> u32 { a - b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsubq_u32() { + testq_ari_u32(|i, j| vsubq_u32(i, j), |a: u32, b: u32| -> u32 { a - b }); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vsub_f32() { + test_ari_f32(|i, j| vsub_f32(i, j), |a: f32, b: f32| -> f32 { a - b }); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsubq_f32() { + testq_ari_f32(|i, j| vsubq_f32(i, j), |a: f32, b: f32| -> f32 { a - b }); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vhsub_s8() { + test_ari_s8( + |i, j| vhsub_s8(i, j), + |a: i8, b: i8| -> i8 { (((a as i16) - (b as i16)) / 2) as i8 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhsubq_s8() { + testq_ari_s8( + |i, j| vhsubq_s8(i, j), + |a: i8, b: i8| -> i8 { (((a as i16) - (b as i16)) / 2) as i8 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhsub_s16() { + test_ari_s16( + |i, j| vhsub_s16(i, j), + |a: i16, b: i16| -> i16 { (((a as i32) - (b as i32)) / 2) as i16 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhsubq_s16() { + testq_ari_s16( + |i, j| 
vhsubq_s16(i, j), + |a: i16, b: i16| -> i16 { (((a as i32) - (b as i32)) / 2) as i16 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhsub_s32() { + test_ari_s32( + |i, j| vhsub_s32(i, j), + |a: i32, b: i32| -> i32 { (((a as i64) - (b as i64)) / 2) as i32 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhsubq_s32() { + testq_ari_s32( + |i, j| vhsubq_s32(i, j), + |a: i32, b: i32| -> i32 { (((a as i64) - (b as i64)) / 2) as i32 }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vhsub_u8() { + test_ari_u8( + |i, j| vhsub_u8(i, j), + |a: u8, b: u8| -> u8 { (((a as u16) - (b as u16)) / 2) as u8 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhsubq_u8() { + testq_ari_u8( + |i, j| vhsubq_u8(i, j), + |a: u8, b: u8| -> u8 { (((a as u16) - (b as u16)) / 2) as u8 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhsub_u16() { + test_ari_u16( + |i, j| vhsub_u16(i, j), + |a: u16, b: u16| -> u16 { (((a as u16) - (b as u16)) / 2) as u16 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhsubq_u16() { + testq_ari_u16( + |i, j| vhsubq_u16(i, j), + |a: u16, b: u16| -> u16 { (((a as u16) - (b as u16)) / 2) as u16 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhsub_u32() { + test_ari_u32( + |i, j| vhsub_u32(i, j), + |a: u32, b: u32| -> u32 { (((a as u64) - (b as u64)) / 2) as u32 }, + ); + } + #[simd_test(enable = "neon")] + unsafe fn test_vhsubq_u32() { + testq_ari_u32( + |i, j| vhsubq_u32(i, j), + |a: u32, b: u32| -> u32 { (((a as u64) - (b as u64)) / 2) as u32 }, + ); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaba_s8() { + let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1); + let c = i8x8::new(10, 9, 8, 7, 6, 5, 4, 3); + let r: i8x8 = transmute(vaba_s8(transmute(a), transmute(b), transmute(c))); + let e = i8x8::new(10, 10, 10, 10, 10, 10, 10, 10); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vaba_s16() { + let a = i16x4::new(1, 2, 3, 4); + let b = i16x4::new(1, 1, 1, 1); + let c = i16x4::new(10, 9, 8, 7); + let r: i16x4 = transmute(vaba_s16(transmute(a), transmute(b), transmute(c))); + let e = i16x4::new(10, 10, 10, 10); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vaba_s32() { + let a = i32x2::new(1, 2); + let b = i32x2::new(1, 1); + let c = i32x2::new(10, 9); + let r: i32x2 = transmute(vaba_s32(transmute(a), transmute(b), transmute(c))); + let e = i32x2::new(10, 10); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vaba_u8() { + let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1); + let c = u8x8::new(10, 9, 8, 7, 6, 5, 4, 3); + let r: u8x8 = transmute(vaba_u8(transmute(a), transmute(b), transmute(c))); + let e = u8x8::new(10, 10, 10, 10, 10, 10, 10, 10); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vaba_u16() { + let a = u16x4::new(1, 2, 3, 4); + let b = u16x4::new(1, 1, 1, 1); + let c = u16x4::new(10, 9, 8, 7); + let r: u16x4 = transmute(vaba_u16(transmute(a), transmute(b), transmute(c))); + let e = u16x4::new(10, 10, 10, 10); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vaba_u32() { + let a = u32x2::new(1, 2); + let b = u32x2::new(1, 1); + let c = u32x2::new(10, 9); + let r: u32x2 = transmute(vaba_u32(transmute(a), transmute(b), transmute(c))); + let e = u32x2::new(10, 10); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vabaq_s8() { + let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 
8, 9, 8, 7, 6, 5, 4, 3, 2); + let b = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let c = i8x16::new(10, 9, 8, 7, 6, 5, 4, 3, 12, 13, 14, 15, 16, 17, 18, 19); + let r: i8x16 = transmute(vabaq_s8(transmute(a), transmute(b), transmute(c))); + let e = i8x16::new( + 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, + ); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vabaq_s16() { + let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1); + let c = i16x8::new(10, 9, 8, 7, 6, 5, 4, 3); + let r: i16x8 = transmute(vabaq_s16(transmute(a), transmute(b), transmute(c))); + let e = i16x8::new(10, 10, 10, 10, 10, 10, 10, 10); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vabaq_s32() { + let a = i32x4::new(1, 2, 3, 4); + let b = i32x4::new(1, 1, 1, 1); + let c = i32x4::new(10, 9, 8, 7); + let r: i32x4 = transmute(vabaq_s32(transmute(a), transmute(b), transmute(c))); + let e = i32x4::new(10, 10, 10, 10); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vabaq_u8() { + let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 6, 5, 4, 3, 2); + let b = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let c = u8x16::new(10, 9, 8, 7, 6, 5, 4, 3, 12, 13, 14, 15, 16, 17, 18, 19); + let r: u8x16 = transmute(vabaq_u8(transmute(a), transmute(b), transmute(c))); + let e = u8x16::new( + 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, + ); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vabaq_u16() { + let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1); + let c = u16x8::new(10, 9, 8, 7, 6, 5, 4, 3); + let r: u16x8 = transmute(vabaq_u16(transmute(a), transmute(b), transmute(c))); + let e = u16x8::new(10, 10, 10, 10, 10, 10, 10, 10); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vabaq_u32() { + let a = u32x4::new(1, 2, 3, 4); + let b = u32x4::new(1, 1, 1, 1); + let c = u32x4::new(10, 9, 8, 7); + let r: u32x4 = transmute(vabaq_u32(transmute(a), transmute(b), transmute(c))); + let e = u32x4::new(10, 10, 10, 10); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vrev16_s8() { + let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = i8x8::new(1, 0, 3, 2, 5, 4, 7, 6); + let e: i8x8 = transmute(vrev16_s8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev16q_s8() { + let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = i8x16::new(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + let e: i8x16 = transmute(vrev16q_s8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev16_u8() { + let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = u8x8::new(1, 0, 3, 2, 5, 4, 7, 6); + let e: u8x8 = transmute(vrev16_u8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev16q_u8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = u8x16::new(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + let e: u8x16 = transmute(vrev16q_u8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev16_p8() { + let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = i8x8::new(1, 0, 3, 2, 5, 4, 7, 6); + let e: i8x8 = transmute(vrev16_p8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev16q_p8() { + let a = 
u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = u8x16::new(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + let e: u8x16 = transmute(vrev16q_p8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev32_s8() { + let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = i8x8::new(3, 2, 1, 0, 7, 6, 5, 4); + let e: i8x8 = transmute(vrev32_s8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev32q_s8() { + let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = i8x16::new(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + let e: i8x16 = transmute(vrev32q_s8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev32_u8() { + let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = u8x8::new(3, 2, 1, 0, 7, 6, 5, 4); + let e: u8x8 = transmute(vrev32_u8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev32q_u8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = u8x16::new(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + let e: u8x16 = transmute(vrev32q_u8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev32_s16() { + let a = i16x4::new(0, 1, 2, 3); + let r = i16x4::new(1, 0, 3, 2); + let e: i16x4 = transmute(vrev32_s16(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev32q_s16() { + let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = i16x8::new(1, 0, 3, 2, 5, 4, 7, 6); + let e: i16x8 = transmute(vrev32q_s16(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev32_p16() { + let a = i16x4::new(0, 1, 2, 3); + let r = i16x4::new(1, 0, 3, 2); + let e: i16x4 = transmute(vrev32_p16(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev32q_p16() { + let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = i16x8::new(1, 0, 3, 2, 5, 4, 7, 6); + let e: i16x8 = transmute(vrev32q_p16(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev32_u16() { + let a = u16x4::new(0, 1, 2, 3); + let r = u16x4::new(1, 0, 3, 2); + let e: u16x4 = transmute(vrev32_u16(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev32q_u16() { + let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = u16x8::new(1, 0, 3, 2, 5, 4, 7, 6); + let e: u16x8 = transmute(vrev32q_u16(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev32_p8() { + let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = u8x8::new(3, 2, 1, 0, 7, 6, 5, 4); + let e: u8x8 = transmute(vrev32_p8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev32q_p8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = u8x16::new(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + let e: u8x16 = transmute(vrev32q_p8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64_s8() { + let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = i8x8::new(7, 6, 5, 4, 3, 2, 1, 0); + let e: i8x8 = transmute(vrev64_s8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64q_s8() { + let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = i8x16::new(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 
12, 11, 10, 9, 8); + let e: i8x16 = transmute(vrev64q_s8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64_s16() { + let a = i16x4::new(0, 1, 2, 3); + let r = i16x4::new(3, 2, 1, 0); + let e: i16x4 = transmute(vrev64_s16(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64q_s16() { + let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = i16x8::new(3, 2, 1, 0, 7, 6, 5, 4); + let e: i16x8 = transmute(vrev64q_s16(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64_s32() { + let a = i32x2::new(0, 1); + let r = i32x2::new(1, 0); + let e: i32x2 = transmute(vrev64_s32(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64q_s32() { + let a = i32x4::new(0, 1, 2, 3); + let r = i32x4::new(1, 0, 3, 2); + let e: i32x4 = transmute(vrev64q_s32(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64_u8() { + let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = u8x8::new(7, 6, 5, 4, 3, 2, 1, 0); + let e: u8x8 = transmute(vrev64_u8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64q_u8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = u8x16::new(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + let e: u8x16 = transmute(vrev64q_u8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64_u16() { + let a = u16x4::new(0, 1, 2, 3); + let r = u16x4::new(3, 2, 1, 0); + let e: u16x4 = transmute(vrev64_u16(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64q_u16() { + let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = u16x8::new(3, 2, 1, 0, 7, 6, 5, 4); + let e: u16x8 = transmute(vrev64q_u16(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64_u32() { + let a = u32x2::new(0, 1); + let r = u32x2::new(1, 0); + let e: u32x2 = transmute(vrev64_u32(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64q_u32() { + let a = u32x4::new(0, 1, 2, 3); + let r = u32x4::new(1, 0, 3, 2); + let e: u32x4 = transmute(vrev64q_u32(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64_f32() { + let a = f32x2::new(1.0, 2.0); + let r = f32x2::new(2.0, 1.0); + let e: f32x2 = transmute(vrev64_f32(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64q_f32() { + let a = f32x4::new(1.0, 2.0, -2.0, -1.0); + let r = f32x4::new(2.0, 1.0, -1.0, -2.0); + let e: f32x4 = transmute(vrev64q_f32(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64_p8() { + let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = u8x8::new(7, 6, 5, 4, 3, 2, 1, 0); + let e: u8x8 = transmute(vrev64_p8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64q_p8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = u8x16::new(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + let e: u8x16 = transmute(vrev64q_p8(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vrev64_p16() { + let a = u16x4::new(0, 1, 2, 3); + let r = u16x4::new(3, 2, 1, 0); + let e: u16x4 = transmute(vrev64_p16(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn 
test_vrev64q_p16() { + let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r = u16x8::new(3, 2, 1, 0, 7, 6, 5, 4); + let e: u16x8 = transmute(vrev64q_p16(transmute(a))); + assert_eq!(r, e); + } + + macro_rules! test_vcombine { + ($test_id:ident => $fn_id:ident ([$($a:expr),*], [$($b:expr),*])) => { + #[allow(unused_assignments)] + #[simd_test(enable = "neon")] + unsafe fn $test_id() { + let a = [$($a),*]; + let b = [$($b),*]; + let e = [$($a),* $(, $b)*]; + let c = $fn_id(transmute(a), transmute(b)); + let mut d = e; + d = transmute(c); + assert_eq!(d, e); + } + } + } + + test_vcombine!(test_vcombine_s8 => vcombine_s8([3_i8, -4, 5, -6, 7, 8, 9, 10], [13_i8, -14, 15, -16, 17, 18, 19, 110])); + test_vcombine!(test_vcombine_u8 => vcombine_u8([3_u8, 4, 5, 6, 7, 8, 9, 10], [13_u8, 14, 15, 16, 17, 18, 19, 110])); + test_vcombine!(test_vcombine_p8 => vcombine_p8([3_u8, 4, 5, 6, 7, 8, 9, 10], [13_u8, 14, 15, 16, 17, 18, 19, 110])); + + test_vcombine!(test_vcombine_s16 => vcombine_s16([3_i16, -4, 5, -6], [13_i16, -14, 15, -16])); + test_vcombine!(test_vcombine_u16 => vcombine_u16([3_u16, 4, 5, 6], [13_u16, 14, 15, 16])); + test_vcombine!(test_vcombine_p16 => vcombine_p16([3_u16, 4, 5, 6], [13_u16, 14, 15, 16])); + test_vcombine!(test_vcombine_f16 => vcombine_f16([3_f16, 4., 5., 6.], + [13_f16, 14., 15., 16.])); + + test_vcombine!(test_vcombine_s32 => vcombine_s32([3_i32, -4], [13_i32, -14])); + test_vcombine!(test_vcombine_u32 => vcombine_u32([3_u32, 4], [13_u32, 14])); + // note: poly32x4 does not exist, and neither does vcombine_p32 + test_vcombine!(test_vcombine_f32 => vcombine_f32([3_f32, -4.], [13_f32, -14.])); + + test_vcombine!(test_vcombine_s64 => vcombine_s64([-3_i64], [13_i64])); + test_vcombine!(test_vcombine_u64 => vcombine_u64([3_u64], [13_u64])); + test_vcombine!(test_vcombine_p64 => vcombine_p64([3_u64], [13_u64])); + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] + test_vcombine!(test_vcombine_f64 => vcombine_f64([-3_f64], [13_f64])); +} + +#[cfg(all(test, target_arch = "arm"))] +mod table_lookup_tests; + +#[cfg(all(test, target_arch = "arm"))] +mod shift_and_insert_tests; + +#[cfg(all(test, target_arch = "arm"))] +mod load_tests; + +#[cfg(all(test, target_arch = "arm"))] +mod store_tests; diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs new file mode 100644 index 0000000000000..cfb1a2843a31e --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs @@ -0,0 +1,93 @@ +//! Tests for ARM+v7+neon shift and insert (vsli[q]_n, vsri[q]_n) intrinsics. +//! +//! These are included in `{arm, aarch64}::neon`. + +use super::*; + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +use crate::core_arch::aarch64::*; + +#[cfg(target_arch = "arm")] +use crate::core_arch::arm::*; + +use crate::core_arch::simd::*; +use std::mem::transmute; +use stdarch_test::simd_test; + +macro_rules! 
test_vsli { + ($test_id:ident, $t:ty => $fn_id:ident ([$($a:expr),*], [$($b:expr),*], $n:expr)) => { + #[simd_test(enable = "neon")] + #[allow(unused_assignments)] + unsafe fn $test_id() { + let a = [$($a as $t),*]; + let b = [$($b as $t),*]; + let n_bit_mask: $t = (1 << $n) - 1; + let e = [$(($a as $t & n_bit_mask) | (($b as $t) << $n)),*]; + let r = $fn_id::<$n>(transmute(a), transmute(b)); + let mut d = e; + d = transmute(r); + assert_eq!(d, e); + } + } +} +test_vsli!(test_vsli_n_s8, i8 => vsli_n_s8([3, -44, 127, -56, 0, 24, -97, 10], [-128, -14, 125, -77, 27, 8, -1, 110], 5)); +test_vsli!(test_vsliq_n_s8, i8 => vsliq_n_s8([3, -44, 127, -56, 0, 24, -97, 10, -33, 1, -6, -39, 15, 101, -80, -1], [-128, -14, 125, -77, 27, 8, -1, 110, -4, -92, 111, 32, 1, -4, -29, 99], 2)); +test_vsli!(test_vsli_n_s16, i16 => vsli_n_s16([3304, -44, 2300, -546], [-1208, -140, 1225, -707], 7)); +test_vsli!(test_vsliq_n_s16, i16 => vsliq_n_s16([3304, -44, 2300, -20046, 0, 9924, -907, 1190], [-1208, -140, 4225, -707, 2701, 804, -71, 2110], 14)); +test_vsli!(test_vsli_n_s32, i32 => vsli_n_s32([125683, -78901], [-128, -112944], 23)); +test_vsli!(test_vsliq_n_s32, i32 => vsliq_n_s32([125683, -78901, 127, -12009], [-128, -112944, 125, -707], 15)); +test_vsli!(test_vsli_n_s64, i64 => vsli_n_s64([-333333], [1028], 45)); +test_vsli!(test_vsliq_n_s64, i64 => vsliq_n_s64([-333333, -52023], [1028, -99814], 33)); +test_vsli!(test_vsli_n_u8, u8 => vsli_n_u8([3, 44, 127, 56, 0, 24, 97, 10], [127, 14, 125, 77, 27, 8, 1, 110], 5)); +test_vsli!(test_vsliq_n_u8, u8 => vsliq_n_u8([3, 44, 127, 56, 0, 24, 97, 10, 33, 1, 6, 39, 15, 101, 80, 1], [127, 14, 125, 77, 27, 8, 1, 110, 4, 92, 111, 32, 1, 4, 29, 99], 2)); +test_vsli!(test_vsli_n_u16, u16 => vsli_n_u16([3304, 44, 2300, 546], [1208, 140, 1225, 707], 7)); +test_vsli!(test_vsliq_n_u16, u16 => vsliq_n_u16([3304, 44, 2300, 20046, 0, 9924, 907, 1190], [1208, 140, 4225, 707, 2701, 804, 71, 2110], 14)); +test_vsli!(test_vsli_n_u32, u32 => vsli_n_u32([125683, 78901], [128, 112944], 23)); +test_vsli!(test_vsliq_n_u32, u32 => vsliq_n_u32([125683, 78901, 127, 12009], [128, 112944, 125, 707], 15)); +test_vsli!(test_vsli_n_u64, u64 => vsli_n_u64([333333], [1028], 45)); +test_vsli!(test_vsliq_n_u64, u64 => vsliq_n_u64([333333, 52023], [1028, 99814], 33)); +test_vsli!(test_vsli_n_p8, i8 => vsli_n_p8([3, 44, 127, 56, 0, 24, 97, 10], [127, 14, 125, 77, 27, 8, 1, 110], 5)); +test_vsli!(test_vsliq_n_p8, i8 => vsliq_n_p8([3, 44, 127, 56, 0, 24, 97, 10, 33, 1, 6, 39, 15, 101, 80, 1], [127, 14, 125, 77, 27, 8, 1, 110, 4, 92, 111, 32, 1, 4, 29, 99], 2)); +test_vsli!(test_vsli_n_p16, i16 => vsli_n_p16([3304, 44, 2300, 546], [1208, 140, 1225, 707], 7)); +test_vsli!(test_vsliq_n_p16, i16 => vsliq_n_p16([3304, 44, 2300, 20046, 0, 9924, 907, 1190], [1208, 140, 4225, 707, 2701, 804, 71, 2110], 14)); +//test_vsli!(test_vsli_n_p64, i64 => vsli_n_p64([333333], [1028], 45)); +//test_vsli!(test_vsliq_n_p64, i64 => vsliq_n_p64([333333, 52023], [1028, 99814], 33)); + +macro_rules! 
test_vsri { + ($test_id:ident, $t:ty => $fn_id:ident ([$($a:expr),*], [$($b:expr),*], $n:expr)) => { + #[simd_test(enable = "neon")] + #[allow(unused_assignments)] + unsafe fn $test_id() { + let a = [$($a as $t),*]; + let b = [$($b as $t),*]; + let n_bit_mask = (((1 as $t) << $n) - 1).rotate_right($n); + let e = [$(($a as $t & n_bit_mask) | (($b as $t >> $n) & !n_bit_mask)),*]; + let r = $fn_id::<$n>(transmute(a), transmute(b)); + let mut d = e; + d = transmute(r); + assert_eq!(d, e); + } + } +} +test_vsri!(test_vsri_n_s8, i8 => vsri_n_s8([3, -44, 127, -56, 0, 24, -97, 10], [-128, -14, 125, -77, 27, 8, -1, 110], 5)); +test_vsri!(test_vsriq_n_s8, i8 => vsriq_n_s8([3, -44, 127, -56, 0, 24, -97, 10, -33, 1, -6, -39, 15, 101, -80, -1], [-128, -14, 125, -77, 27, 8, -1, 110, -4, -92, 111, 32, 1, -4, -29, 99], 2)); +test_vsri!(test_vsri_n_s16, i16 => vsri_n_s16([3304, -44, 2300, -546], [-1208, -140, 1225, -707], 7)); +test_vsri!(test_vsriq_n_s16, i16 => vsriq_n_s16([3304, -44, 2300, -20046, 0, 9924, -907, 1190], [-1208, -140, 4225, -707, 2701, 804, -71, 2110], 14)); +test_vsri!(test_vsri_n_s32, i32 => vsri_n_s32([125683, -78901], [-128, -112944], 23)); +test_vsri!(test_vsriq_n_s32, i32 => vsriq_n_s32([125683, -78901, 127, -12009], [-128, -112944, 125, -707], 15)); +test_vsri!(test_vsri_n_s64, i64 => vsri_n_s64([-333333], [1028], 45)); +test_vsri!(test_vsriq_n_s64, i64 => vsriq_n_s64([-333333, -52023], [1028, -99814], 33)); +test_vsri!(test_vsri_n_u8, u8 => vsri_n_u8([3, 44, 127, 56, 0, 24, 97, 10], [127, 14, 125, 77, 27, 8, 1, 110], 5)); +test_vsri!(test_vsriq_n_u8, u8 => vsriq_n_u8([3, 44, 127, 56, 0, 24, 97, 10, 33, 1, 6, 39, 15, 101, 80, 1], [127, 14, 125, 77, 27, 8, 1, 110, 4, 92, 111, 32, 1, 4, 29, 99], 2)); +test_vsri!(test_vsri_n_u16, u16 => vsri_n_u16([3304, 44, 2300, 546], [1208, 140, 1225, 707], 7)); +test_vsri!(test_vsriq_n_u16, u16 => vsriq_n_u16([3304, 44, 2300, 20046, 0, 9924, 907, 1190], [1208, 140, 4225, 707, 2701, 804, 71, 2110], 14)); +test_vsri!(test_vsri_n_u32, u32 => vsri_n_u32([125683, 78901], [128, 112944], 23)); +test_vsri!(test_vsriq_n_u32, u32 => vsriq_n_u32([125683, 78901, 127, 12009], [128, 112944, 125, 707], 15)); +test_vsri!(test_vsri_n_u64, u64 => vsri_n_u64([333333], [1028], 45)); +test_vsri!(test_vsriq_n_u64, u64 => vsriq_n_u64([333333, 52023], [1028, 99814], 33)); +test_vsri!(test_vsri_n_p8, i8 => vsri_n_p8([3, 44, 127, 56, 0, 24, 97, 10], [127, 14, 125, 77, 27, 8, 1, 110], 5)); +test_vsri!(test_vsriq_n_p8, i8 => vsriq_n_p8([3, 44, 127, 56, 0, 24, 97, 10, 33, 1, 6, 39, 15, 101, 80, 1], [127, 14, 125, 77, 27, 8, 1, 110, 4, 92, 111, 32, 1, 4, 29, 99], 2)); +test_vsri!(test_vsri_n_p16, i16 => vsri_n_p16([3304, 44, 2300, 546], [1208, 140, 1225, 707], 7)); +test_vsri!(test_vsriq_n_p16, i16 => vsriq_n_p16([3304, 44, 2300, 20046, 0, 9924, 907, 1190], [1208, 140, 4225, 707, 2701, 804, 71, 2110], 14)); +//test_vsri!(test_vsri_n_p64, i64 => vsri_n_p64([333333], [1028], 45)); +//test_vsri!(test_vsriq_n_p64, i64 => vsriq_n_p64([333333, 52023], [1028, 99814], 33)); diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/store_tests.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/store_tests.rs new file mode 100644 index 0000000000000..6b5d4a19ad572 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/store_tests.rs @@ -0,0 +1,389 @@ +//! Tests for ARM+v7+neon store (vst1) intrinsics. +//! +//! These are included in `{arm, aarch64}::neon`. 
+ +use super::*; + +#[cfg(target_arch = "arm")] +use crate::core_arch::arm::*; + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +use crate::core_arch::aarch64::*; + +use crate::core_arch::simd::*; +use stdarch_test::simd_test; + +#[simd_test(enable = "neon")] +unsafe fn test_vst1_s8() { + let mut vals = [0_i8; 9]; + let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + + vst1_s8(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); + assert_eq!(vals[3], 3); + assert_eq!(vals[4], 4); + assert_eq!(vals[5], 5); + assert_eq!(vals[6], 6); + assert_eq!(vals[7], 7); + assert_eq!(vals[8], 8); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1q_s8() { + let mut vals = [0_i8; 17]; + let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + + vst1q_s8(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); + assert_eq!(vals[3], 3); + assert_eq!(vals[4], 4); + assert_eq!(vals[5], 5); + assert_eq!(vals[6], 6); + assert_eq!(vals[7], 7); + assert_eq!(vals[8], 8); + assert_eq!(vals[9], 9); + assert_eq!(vals[10], 10); + assert_eq!(vals[11], 11); + assert_eq!(vals[12], 12); + assert_eq!(vals[13], 13); + assert_eq!(vals[14], 14); + assert_eq!(vals[15], 15); + assert_eq!(vals[16], 16); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1_s16() { + let mut vals = [0_i16; 5]; + let a = i16x4::new(1, 2, 3, 4); + + vst1_s16(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); + assert_eq!(vals[3], 3); + assert_eq!(vals[4], 4); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1q_s16() { + let mut vals = [0_i16; 9]; + let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + + vst1q_s16(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); + assert_eq!(vals[3], 3); + assert_eq!(vals[4], 4); + assert_eq!(vals[5], 5); + assert_eq!(vals[6], 6); + assert_eq!(vals[7], 7); + assert_eq!(vals[8], 8); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1_s32() { + let mut vals = [0_i32; 3]; + let a = i32x2::new(1, 2); + + vst1_s32(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1q_s32() { + let mut vals = [0_i32; 5]; + let a = i32x4::new(1, 2, 3, 4); + + vst1q_s32(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); + assert_eq!(vals[3], 3); + assert_eq!(vals[4], 4); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1_s64() { + let mut vals = [0_i64; 2]; + let a = i64x1::new(1); + + vst1_s64(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1q_s64() { + let mut vals = [0_i64; 3]; + let a = i64x2::new(1, 2); + + vst1q_s64(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1_u8() { + let mut vals = [0_u8; 9]; + let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + + vst1_u8(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); + assert_eq!(vals[3], 3); + assert_eq!(vals[4], 4); + assert_eq!(vals[5], 5); + assert_eq!(vals[6], 6); + assert_eq!(vals[7], 7); + 
assert_eq!(vals[8], 8); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1q_u8() { + let mut vals = [0_u8; 17]; + let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + + vst1q_u8(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); + assert_eq!(vals[3], 3); + assert_eq!(vals[4], 4); + assert_eq!(vals[5], 5); + assert_eq!(vals[6], 6); + assert_eq!(vals[7], 7); + assert_eq!(vals[8], 8); + assert_eq!(vals[9], 9); + assert_eq!(vals[10], 10); + assert_eq!(vals[11], 11); + assert_eq!(vals[12], 12); + assert_eq!(vals[13], 13); + assert_eq!(vals[14], 14); + assert_eq!(vals[15], 15); + assert_eq!(vals[16], 16); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1_u16() { + let mut vals = [0_u16; 5]; + let a = u16x4::new(1, 2, 3, 4); + + vst1_u16(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); + assert_eq!(vals[3], 3); + assert_eq!(vals[4], 4); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1q_u16() { + let mut vals = [0_u16; 9]; + let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + + vst1q_u16(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); + assert_eq!(vals[3], 3); + assert_eq!(vals[4], 4); + assert_eq!(vals[5], 5); + assert_eq!(vals[6], 6); + assert_eq!(vals[7], 7); + assert_eq!(vals[8], 8); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1_u32() { + let mut vals = [0_u32; 3]; + let a = u32x2::new(1, 2); + + vst1_u32(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1q_u32() { + let mut vals = [0_u32; 5]; + let a = u32x4::new(1, 2, 3, 4); + + vst1q_u32(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); + assert_eq!(vals[3], 3); + assert_eq!(vals[4], 4); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1_u64() { + let mut vals = [0_u64; 2]; + let a = u64x1::new(1); + + vst1_u64(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1q_u64() { + let mut vals = [0_u64; 3]; + let a = u64x2::new(1, 2); + + vst1q_u64(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1_p8() { + let mut vals = [0_u8; 9]; + let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + + vst1_p8(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); + assert_eq!(vals[3], 3); + assert_eq!(vals[4], 4); + assert_eq!(vals[5], 5); + assert_eq!(vals[6], 6); + assert_eq!(vals[7], 7); + assert_eq!(vals[8], 8); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1q_p8() { + let mut vals = [0_u8; 17]; + let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + + vst1q_p8(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); + assert_eq!(vals[3], 3); + assert_eq!(vals[4], 4); + assert_eq!(vals[5], 5); + assert_eq!(vals[6], 6); + assert_eq!(vals[7], 7); + assert_eq!(vals[8], 8); + assert_eq!(vals[9], 9); + assert_eq!(vals[10], 10); + assert_eq!(vals[11], 11); + assert_eq!(vals[12], 12); + assert_eq!(vals[13], 13); + assert_eq!(vals[14], 14); + 
assert_eq!(vals[15], 15); + assert_eq!(vals[16], 16); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1_p16() { + let mut vals = [0_u16; 5]; + let a = u16x4::new(1, 2, 3, 4); + + vst1_p16(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); + assert_eq!(vals[3], 3); + assert_eq!(vals[4], 4); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1q_p16() { + let mut vals = [0_u16; 9]; + let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + + vst1q_p16(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); + assert_eq!(vals[3], 3); + assert_eq!(vals[4], 4); + assert_eq!(vals[5], 5); + assert_eq!(vals[6], 6); + assert_eq!(vals[7], 7); + assert_eq!(vals[8], 8); +} + +#[simd_test(enable = "neon,aes")] +unsafe fn test_vst1_p64() { + let mut vals = [0_u64; 2]; + let a = u64x1::new(1); + + vst1_p64(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); +} + +#[simd_test(enable = "neon,aes")] +unsafe fn test_vst1q_p64() { + let mut vals = [0_u64; 3]; + let a = u64x2::new(1, 2); + + vst1q_p64(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0); + assert_eq!(vals[1], 1); + assert_eq!(vals[2], 2); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1_f32() { + let mut vals = [0_f32; 3]; + let a = f32x2::new(1., 2.); + + vst1_f32(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0.); + assert_eq!(vals[1], 1.); + assert_eq!(vals[2], 2.); +} + +#[simd_test(enable = "neon")] +unsafe fn test_vst1q_f32() { + let mut vals = [0_f32; 5]; + let a = f32x4::new(1., 2., 3., 4.); + + vst1q_f32(vals[1..].as_mut_ptr(), transmute(a)); + + assert_eq!(vals[0], 0.); + assert_eq!(vals[1], 1.); + assert_eq!(vals[2], 2.); + assert_eq!(vals[3], 3.); + assert_eq!(vals[4], 4.); +} diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/table_lookup_tests.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/table_lookup_tests.rs new file mode 100644 index 0000000000000..9403855f00e0f --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/table_lookup_tests.rs @@ -0,0 +1,1044 @@ +//! Tests for ARM+v7+neon table lookup (vtbl, vtbx) intrinsics. +//! +//! These are included in `{arm, aarch64}::neon`. + +use super::*; + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +use crate::core_arch::aarch64::*; + +#[cfg(target_arch = "arm")] +use crate::core_arch::arm::*; + +use crate::core_arch::simd::*; +use std::mem; +use stdarch_test::simd_test; + +macro_rules! 
test_vtbl { + ($test_name:ident => $fn_id:ident: + - table[$table_t:ident]: [$($table_v:expr),*] | + $(- ctrl[$ctrl_t:ident]: [$($ctrl_v:expr),*] => [$($exp_v:expr),*])|* + ) => { + #[cfg(target_endian = "little")] + #[simd_test(enable = "neon")] + unsafe fn $test_name() { + // create table as array, and transmute it to + // arm's table type + let table: $table_t = mem::transmute([$($table_v),*]); + + // For each control vector, perform a table lookup and + // verify the result: + $( + { + let ctrl: $ctrl_t = mem::transmute([$($ctrl_v),*]); + let result = $fn_id(table, mem::transmute(ctrl)); + let result: $ctrl_t = mem::transmute(result); + let expected: $ctrl_t = mem::transmute([$($exp_v),*]); + assert_eq!(result, expected); + } + )* + } + } +} + +// ARM+v7+neon and AArch64+neon tests + +test_vtbl!( + test_vtbl1_s8 => vtbl1_s8: + - table[int8x8_t]: [0_i8, -11, 2, 3, 4, 5, 6, 7] | + - ctrl[i8x8]: [3_i8, 4, 1, 6, 0, 2, 7, 5] => [3_i8, 4, -11, 6, 0, 2, 7, 5] | + - ctrl[i8x8]: [3_i8, 8, 1, -9, 10, 2, 15, 5] => [3_i8, 0, -11, 0, 0, 2, 0, 5] +); + +test_vtbl!( + test_vtbl1_u8 => vtbl1_u8: + - table[uint8x8_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7] | + - ctrl[u8x8]: [3_u8, 4, 1, 6, 0, 2, 7, 5] => [3_u8, 4, 1, 6, 0, 2, 7, 5] | + - ctrl[u8x8]: [3_u8, 8, 1, 9, 10, 2, 15, 5] => [3_u8, 0, 1, 0, 0, 2, 0, 5] +); + +test_vtbl!( + test_vtbl1_p8 => vtbl1_p8: + - table[poly8x8_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7] | + - ctrl[u8x8]: [3_u8, 4, 1, 6, 0, 2, 7, 5] => [3_u8, 4, 1, 6, 0, 2, 7, 5] | + - ctrl[u8x8]: [3_u8, 8, 1, 9, 10, 2, 15, 5] => [3_u8, 0, 1, 0, 0, 2, 0, 5] +); + +test_vtbl!( + test_vtbl2_s8 => vtbl2_s8: + - table[int8x8x2_t]: [ + 0_i8, -17, 34, 51, 68, 85, 102, 119, + -106, -93, -84, -117, -104, -116, -72, -121 + ] | + - ctrl[i8x8]: [127_i8, 15, 1, 14, 2, 13, 3, 12] => [0_i8, -121, -17, -72, 34, -116, 51, -104] | + - ctrl[i8x8]: [4_i8, 11, 16, 10, 6, -19, 7, 18] => [68_i8, -117, 0, -84, 102, 0, 119, 0] +); + +test_vtbl!( + test_vtbl2_u8 => vtbl2_u8: + - table[uint8x8x2_t]: [ + 0_u8, 17, 34, 51, 68, 85, 102, 119, + 136, 153, 170, 187, 204, 221, 238, 255 + ] | + - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [0_u8, 255, 17, 238, 34, 221, 51, 204] | + - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 187, 0, 170, 102, 0, 119, 0] +); + +test_vtbl!( + test_vtbl2_p8 => vtbl2_p8: + - table[poly8x8x2_t]: [ + 0_u8, 17, 34, 51, 68, 85, 102, 119, + 136, 153, 170, 187, 204, 221, 238, 255 + ] | + - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [0_u8, 255, 17, 238, 34, 221, 51, 204] | + - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 187, 0, 170, 102, 0, 119, 0] +); + +test_vtbl!( + test_vtbl3_s8 => vtbl3_s8: + - table[int8x8x3_t]: [ + 0_i8, -17, 34, 51, 68, 85, 102, 119, + -106, -93, -84, -117, -104, -116, -72, -121, + 0, 1, -2, 3, 4, -5, 6, 7 + ] | + - ctrl[i8x8]: [127_i8, 15, 1, 19, 2, 13, 21, 12] => [0_i8, -121, -17, 3, 34, -116, -5, -104] | + - ctrl[i8x8]: [4_i8, 11, 16, 10, 6, -27, 7, 18] => [68_i8, -117, 0, -84, 102, 0, 119, -2] +); + +test_vtbl!( + test_vtbl3_u8 => vtbl3_u8: + - table[uint8x8x3_t]: [ + 0_u8, 17, 34, 51, 68, 85, 102, 119, + 136, 153, 170, 187, 204, 221, 238, 255, + 0, 1, 2, 3, 4, 5, 6, 7 + ] | + - ctrl[u8x8]: [127_u8, 15, 1, 19, 2, 13, 21, 12] => [0_u8, 255, 17, 3, 34, 221, 5, 204] | + - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 27, 7, 18] => [68_u8, 187, 0, 170, 102, 0, 119, 2] +); + +test_vtbl!( + test_vtbl3_p8 => vtbl3_p8: + - table[poly8x8x3_t]: [ + 0_u8, 17, 34, 51, 68, 85, 102, 119, + 136, 153, 170, 187, 204, 221, 238, 255, + 0, 1, 2, 3, 4, 5, 6, 7 + ] | + - ctrl[u8x8]: [127_u8, 
15, 1, 19, 2, 13, 21, 12] => [0_u8, 255, 17, 3, 34, 221, 5, 204] | + - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 27, 7, 18] => [68_u8, 187, 0, 170, 102, 0, 119, 2] +); + +test_vtbl!( + test_vtbl4_s8 => vtbl4_s8: + - table[int8x8x4_t]: [ + 0_i8, -17, 34, 51, 68, 85, 102, 119, + -106, -93, -84, -117, -104, -116, -72, -121, + 0, 1, -2, 3, 4, -5, 6, 7, + 8, -9, 10, 11, 12, -13, 14, 15 + ] | + - ctrl[i8x8]: [127_i8, 15, 1, 19, 2, 13, 25, 12] => [0_i8, -121, -17, 3, 34, -116, -9, -104] | + - ctrl[i8x8]: [4_i8, 11, 32, 10, -33, 27, 7, 18] => [68_i8, -117, 0, -84, 0, 11, 119, -2] +); + +test_vtbl!( + test_vtbl4_u8 => vtbl4_u8: + - table[uint8x8x4_t]: [ + 0_u8, 17, 34, 51, 68, 85, 102, 119, + 136, 153, 170, 187, 204, 221, 238, 255, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 + ] | + - ctrl[u8x8]: [127_u8, 15, 1, 19, 2, 13, 21, 12] => [0_u8, 255, 17, 3, 34, 221, 5, 204] | + - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 27, 7, 18] => [68_u8, 187, 0, 170, 102, 11, 119, 2] +); + +test_vtbl!( + test_vtbl4_p8 => vtbl4_p8: + - table[poly8x8x4_t]: [ + 0_u8, 17, 34, 51, 68, 85, 102, 119, + 136, 153, 170, 187, 204, 221, 238, 255, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 + ] | + - ctrl[u8x8]: [127_u8, 15, 1, 19, 2, 13, 21, 12] => [0_u8, 255, 17, 3, 34, 221, 5, 204] | + - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 27, 7, 18] => [68_u8, 187, 0, 170, 102, 11, 119, 2] +); + +macro_rules! test_vtbx { + ($test_name:ident => $fn_id:ident: + - table[$table_t:ident]: [$($table_v:expr),*] | + - ext[$ext_t:ident]: [$($ext_v:expr),*] | + $(- ctrl[$ctrl_t:ident]: [$($ctrl_v:expr),*] => [$($exp_v:expr),*])|* + ) => { + #[cfg(target_endian = "little")] + #[simd_test(enable = "neon")] + unsafe fn $test_name() { + // create table as array, and transmute it to + // arm's table type + let table: $table_t = mem::transmute([$($table_v),*]); + let ext: $ext_t = mem::transmute([$($ext_v),*]); + + // For each control vector, perform a table lookup and + // verify the result: + $( + { + let ctrl: $ctrl_t = mem::transmute([$($ctrl_v),*]); + let result = $fn_id(ext, table, mem::transmute(ctrl)); + let result: $ctrl_t = mem::transmute(result); + let expected: $ctrl_t = mem::transmute([$($exp_v),*]); + assert_eq!(result, expected); + } + )* + } + } +} + +test_vtbx!( + test_vtbx1_s8 => vtbx1_s8: + - table[int8x8_t]: [0_i8, 1, 2, -3, 4, 5, 6, 7] | + - ext[int8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] | + - ctrl[i8x8]: [3_i8, 4, 1, 6, 0, 2, 7, 5] => [-3_i8, 4, 1, 6, 0, 2, 7, 5] | + - ctrl[i8x8]: [3_i8, 8, 1, 9, 10, 2, -15, 5] => [-3_i8, 51, 1, 53, 54, 2, 56, 5] +); + +test_vtbx!( + test_vtbx1_u8 => vtbx1_u8: + - table[uint8x8_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7] | + - ext[uint8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] | + - ctrl[u8x8]: [3_u8, 4, 1, 6, 0, 2, 7, 5] => [3_u8, 4, 1, 6, 0, 2, 7, 5] | + - ctrl[u8x8]: [3_u8, 8, 1, 9, 10, 2, 15, 5] => [3_u8, 51, 1, 53, 54, 2, 56, 5] +); + +test_vtbx!( + test_vtbx1_p8 => vtbx1_p8: + - table[poly8x8_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7] | + - ext[poly8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] | + - ctrl[u8x8]: [3_u8, 4, 1, 6, 0, 2, 7, 5] => [3_u8, 4, 1, 6, 0, 2, 7, 5] | + - ctrl[u8x8]: [3_u8, 8, 1, 9, 10, 2, 15, 5] => [3_u8, 51, 1, 53, 54, 2, 56, 5] +); + +test_vtbx!( + test_vtbx2_s8 => vtbx2_s8: + - table[int8x8x2_t]: [0_i8, 1, 2, -3, 4, 5, 6, 7, 8, 9, -10, 11, 12, -13, 14, 15] | + - ext[int8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] | + - ctrl[i8x8]: [3_i8, 4, 1, 6, 10, 2, 7, 15] => [-3_i8, 4, 1, 6, -10, 2, 7, 15] | + - ctrl[i8x8]: [3_i8, 8, 1, 10, 17, 2, 15, -19] => [-3_i8, 8, 1, -10, 54, 2, 15, 57] +); 
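+
+// Note on the `vtbx*` expectations below: for control indices that are out of
+// range of the table, the result keeps the corresponding lane of `ext`,
+// whereas the `vtbl*` intrinsics return 0 for such lanes.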
+ +test_vtbx!( + test_vtbx2_u8 => vtbx2_u8: + - table[uint8x8x2_t]: [0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] | + - ext[uint8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] | + - ctrl[u8x8]: [3_u8, 4, 1, 6, 10, 2, 7, 15] => [3_i8, 4, 1, 6, 10, 2, 7, 15] | + - ctrl[u8x8]: [3_u8, 8, 1, 10, 17, 2, 15, 19] => [3_i8, 8, 1, 10, 54, 2, 15, 57] +); + +test_vtbx!( + test_vtbx2_p8 => vtbx2_p8: + - table[poly8x8x2_t]: [0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] | + - ext[poly8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] | + - ctrl[u8x8]: [3_u8, 4, 1, 6, 10, 2, 7, 15] => [3_i8, 4, 1, 6, 10, 2, 7, 15] | + - ctrl[u8x8]: [3_u8, 8, 1, 10, 17, 2, 15, 19] => [3_i8, 8, 1, 10, 54, 2, 15, 57] +); + +test_vtbx!( + test_vtbx3_s8 => vtbx3_s8: + - table[int8x8x3_t]: [ + 0_i8, 1, 2, -3, 4, 5, 6, 7, + 8, 9, -10, 11, 12, -13, 14, 15, + 16, -17, 18, 19, 20, 21, 22, 23 ] | + - ext[int8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] | + - ctrl[i8x8]: [3_i8, 4, 17, 22, 10, 2, 7, 15] => [-3_i8, 4, -17, 22, -10, 2, 7, 15] | + - ctrl[i8x8]: [3_i8, 8, 17, 10, 37, 2, 19, -29] => [-3_i8, 8, -17, -10, 54, 2, 19, 57] +); + +test_vtbx!( + test_vtbx3_u8 => vtbx3_u8: + - table[uint8x8x3_t]: [ + 0_i8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23 ] | + - ext[uint8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] | + - ctrl[u8x8]: [3_u8, 4, 17, 22, 10, 2, 7, 15] => [3_i8, 4, 17, 22, 10, 2, 7, 15] | + - ctrl[u8x8]: [3_u8, 8, 17, 10, 37, 2, 19, 29] => [3_i8, 8, 17, 10, 54, 2, 19, 57] +); + +test_vtbx!( + test_vtbx3_p8 => vtbx3_p8: + - table[poly8x8x3_t]: [ + 0_i8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23 ] | + - ext[poly8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] | + - ctrl[u8x8]: [3_u8, 4, 17, 22, 10, 2, 7, 15] => [3_i8, 4, 17, 22, 10, 2, 7, 15] | + - ctrl[u8x8]: [3_u8, 8, 17, 10, 37, 2, 19, 29] => [3_i8, 8, 17, 10, 54, 2, 19, 57] +); + +test_vtbx!( + test_vtbx4_s8 => vtbx4_s8: + - table[int8x8x4_t]: [ + 0_i8, 1, 2, -3, 4, 5, 6, 7, + 8, 9, -10, 11, 12, -13, 14, 15, + 16, -17, 18, 19, 20, 21, 22, 23, + -24, 25, 26, -27, 28, -29, 30, 31] | + - ext[int8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] | + - ctrl[i8x8]: [3_i8, 31, 17, 22, 10, 29, 7, 15] => [-3_i8, 31, -17, 22, -10, -29, 7, 15] | + - ctrl[i8x8]: [3_i8, 8, 17, 10, 37, 2, 19, -42] => [-3_i8, 8, -17, -10, 54, 2, 19, 57] +); + +test_vtbx!( + test_vtbx4_u8 => vtbx4_u8: + - table[uint8x8x4_t]: [ + 0_i8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31] | + - ext[uint8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] | + - ctrl[u8x8]: [3_u8, 31, 17, 22, 10, 29, 7, 15] => [3_i8, 31, 17, 22, 10, 29, 7, 15] | + - ctrl[u8x8]: [3_u8, 8, 17, 10, 37, 2, 19, 42] => [3_i8, 8, 17, 10, 54, 2, 19, 57] +); + +test_vtbx!( + test_vtbx4_p8 => vtbx4_p8: + - table[poly8x8x4_t]: [ + 0_i8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31] | + - ext[poly8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] | + - ctrl[u8x8]: [3_u8, 31, 17, 22, 10, 29, 7, 15] => [3_i8, 31, 17, 22, 10, 29, 7, 15] | + - ctrl[u8x8]: [3_u8, 8, 17, 10, 37, 2, 19, 42] => [3_i8, 8, 17, 10, 54, 2, 19, 57] +); + +// Aarch64 tests + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl1_s8 => vqtbl1_s8: + - table[int8x16_t]: [ + 0_i8, -17, 34, 51, 68, 85, 102, 119, + -106, -93, -84, -117, -104, -116, -72, -121 + ] | + - ctrl[i8x8]: [127_i8, 15, 1, 14, 2, 13, 3, 12] => [0_i8, -121, 
-17, -72, 34, -116, 51, -104] | + - ctrl[i8x8]: [4_i8, 11, 16, 10, 6, 19, 7, 18] => [68_i8, -117, 0, -84, 102, 0, 119, 0] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl1q_s8 => vqtbl1q_s8: + - table[int8x16_t]: [ + 0_i8, -17, 34, 51, 68, 85, 102, 119, + -106, -93, -84, -117, -104, -116, -72, -121 + ] | + - ctrl[i8x16]: [127_i8, 15, 1, 14, 2, 13, 3, 12, 4_i8, 11, 16, 10, 6, 19, 7, 18] + => [0_i8, -121, -17, -72, 34, -116, 51, -104, 68, -117, 0, -84, 102, 0, 119, 0] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl1_u8 => vqtbl1_u8: + - table[uint8x16_t]: [ + 0_u8, 17, 34, 51, 68, 85, 102, 119, + 106, 93, 84, 117, 104, 116, 72, 121 + ] | + - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [0_u8, 121, 17, 72, 34, 116, 51, 104] | + - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 117, 0, 84, 102, 0, 119, 0] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl1q_u8 => vqtbl1q_u8: + - table[uint8x16_t]: [ + 0_u8, 17, 34, 51, 68, 85, 102, 119, + 106, 93, 84, 117, 104, 116, 72, 121 + ] | + - ctrl[u8x16]: [127_u8, 15, 1, 14, 2, 13, 3, 12, 4_u8, 11, 16, 10, 6, 19, 7, 18] + => [0_u8, 121, 17, 72, 34, 116, 51, 104, 68, 117, 0, 84, 102, 0, 119, 0] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl1_p8 => vqtbl1_p8: + - table[poly8x16_t]: [ + 0_u8, 17, 34, 51, 68, 85, 102, 119, + 106, 93, 84, 117, 104, 116, 72, 121 + ] | + - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [0_u8, 121, 17, 72, 34, 116, 51, 104] | + - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 117, 0, 84, 102, 0, 119, 0] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl1q_p8 => vqtbl1q_p8: + - table[poly8x16_t]: [ + 0_u8, 17, 34, 51, 68, 85, 102, 119, + 106, 93, 84, 117, 104, 116, 72, 121 + ] | + - ctrl[u8x16]: [127_u8, 15, 1, 14, 2, 13, 3, 12, 4_u8, 11, 16, 10, 6, 19, 7, 18] + => [0_u8, 121, 17, 72, 34, 116, 51, 104, 68, 117, 0, 84, 102, 0, 119, 0] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl2_s8 => vqtbl2_s8: + - table[int8x16x2_t]: [ + 0_i8, -1, 2, -3, 4, -5, 6, -7, + 8, -9, 10, -11, 12, -13, 14, -15, + 16, -17, 18, -19, 20, -21, 22, -23, + 24, -25, 26, -27, 28, -29, 30, -31 + ] | + - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [0_i8, -15, -1, 24, 2, -13, -3, -29] | + - ctrl[i8x8]: [4_i8, 31, 32, 10, 6, 49, 7, 18] => [4_i8, -31, 0, 10, 6, 0, -7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl2q_s8 => vqtbl2q_s8: + - table[int8x16x2_t]: [ + 0_i8, -1, 2, -3, 4, -5, 6, -7, + 8, -9, 10, -11, 12, -13, 14, -15, + 16, -17, 18, -19, 20, -21, 22, -23, + 24, -25, 26, -27, 28, -29, 30, -31 + ] | + - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 31, 32, 10, 6, 49, 7, 18] + => [0_i8, -15, -1, 24, 2, -13, -3, -29, 4, -31, 0, 10, 6, 0, -7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl2_u8 => vqtbl2_u8: + - table[uint8x16x2_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 + ] | + - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] | + - ctrl[u8x8]: [4_u8, 31, 32, 10, 6, 49, 7, 18] => [4_u8, 31, 0, 10, 6, 0, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl2q_u8 => vqtbl2q_u8: + - 
table[uint8x16x2_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 + ] | + - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 31, 32, 10, 6, 49, 7, 18] + => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 31, 0, 10, 6, 0, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl2_p8 => vqtbl2_p8: + - table[poly8x16x2_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 + ] | + - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] | + - ctrl[u8x8]: [4_u8, 31, 32, 10, 6, 49, 7, 18] => [4_u8, 31, 0, 10, 6, 0, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl2q_p8 => vqtbl2q_p8: + - table[poly8x16x2_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 + ] | + - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 31, 32, 10, 6, 49, 7, 18] + => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 31, 0, 10, 6, 0, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl3_s8 => vqtbl3_s8: + - table[int8x16x3_t]: [ + 0_i8, -1, 2, -3, 4, -5, 6, -7, + 8, -9, 10, -11, 12, -13, 14, -15, + 16, -17, 18, -19, 20, -21, 22, -23, + 24, -25, 26, -27, 28, -29, 30, -31, + 32, -33, 34, -35, 36, -37, 38, -39, + 40, -41, 42, -43, 44, -45, 46, -47 + ] | + - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [0_i8, -15, -1, 24, 2, -13, -3, -29] | + - ctrl[i8x8]: [4_i8, 32, 46, 51, 6, 49, 7, 18] => [4_i8, 32, 46, 0, 6, 0, -7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl3q_s8 => vqtbl3q_s8: + - table[int8x16x3_t]: [ + 0_i8, -1, 2, -3, 4, -5, 6, -7, + 8, -9, 10, -11, 12, -13, 14, -15, + 16, -17, 18, -19, 20, -21, 22, -23, + 24, -25, 26, -27, 28, -29, 30, -31, + 32, -33, 34, -35, 36, -37, 38, -39, + 40, -41, 42, -43, 44, -45, 46, -47 + ] | + - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 32, 46, 51, 6, 49, 7, 18] + => [0_i8, -15, -1, 24, 2, -13, -3, -29, 4, 32, 46, 0, 6, 0, -7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl3_u8 => vqtbl3_u8: + - table[uint8x16x3_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47 + ] | + - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] | + - ctrl[u8x8]: [4_u8, 32, 46, 51, 6, 49, 7, 18] => [4_u8, 32, 46, 0, 6, 0, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl3q_u8 => vqtbl3q_u8: + - table[uint8x16x3_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47 + ] | + - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 32, 46, 51, 6, 49, 7, 18] + => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 32, 46, 0, 6, 0, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl3_p8 => vqtbl3_p8: + - table[poly8x16x3_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 
45, 46, 47 + ] | + - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] | + - ctrl[u8x8]: [4_u8, 32, 46, 51, 6, 49, 7, 18] => [4_u8, 32, 46, 0, 6, 0, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl3q_p8 => vqtbl3q_p8: + - table[poly8x16x3_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47 + ] | + - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 32, 46, 51, 6, 49, 7, 18] + => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 32, 46, 0, 6, 0, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl4_s8 => vqtbl4_s8: + - table[int8x16x4_t]: [ + 0_i8, -1, 2, -3, 4, -5, 6, -7, + 8, -9, 10, -11, 12, -13, 14, -15, + 16, -17, 18, -19, 20, -21, 22, -23, + 24, -25, 26, -27, 28, -29, 30, -31, + 32, -33, 34, -35, 36, -37, 38, -39, + 40, -41, 42, -43, 44, -45, 46, -47, + 48, -49, 50, -51, 52, -53, 54, -55, + 56, -57, 58, -59, 60, -61, 62, -63 + ] | + - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [0_i8, -15, -1, 24, 2, -13, -3, -29] | + - ctrl[i8x8]: [4_i8, 46, 64, 51, 6, 71, 7, 18] => [4_i8, 46, 0, -51, 6, 0, -7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl4q_s8 => vqtbl4q_s8: + - table[int8x16x4_t]: [ + 0_i8, -1, 2, -3, 4, -5, 6, -7, + 8, -9, 10, -11, 12, -13, 14, -15, + 16, -17, 18, -19, 20, -21, 22, -23, + 24, -25, 26, -27, 28, -29, 30, -31, + 32, -33, 34, -35, 36, -37, 38, -39, + 40, -41, 42, -43, 44, -45, 46, -47, + 48, -49, 50, -51, 52, -53, 54, -55, + 56, -57, 58, -59, 60, -61, 62, -63 + ] | + - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 46, 64, 51, 6, 71, 7, 18] + => [0_i8, -15, -1, 24, 2, -13, -3, -29, 4, 46, 0, -51, 6, 0, -7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl4_u8 => vqtbl4_u8: + - table[uint8x16x4_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63 + ] | + - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] | + - ctrl[u8x8]: [4_u8, 46, 64, 51, 6, 71, 7, 18] => [4_u8, 46, 0, 51, 6, 0, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl4q_u8 => vqtbl4q_u8: + - table[uint8x16x4_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63 + ] | + - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 46, 64, 51, 6, 71, 7, 18] + => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 46, 0, 51, 6, 0, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl4_p8 => vqtbl4_p8: + - table[poly8x16x4_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63 + ] | + - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] | + - ctrl[u8x8]: [4_u8, 46, 64, 
51, 6, 71, 7, 18] => [4_u8, 46, 0, 51, 6, 0, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbl!( + test_vqtbl4q_p8 => vqtbl4q_p8: + - table[poly8x16x4_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63 + ] | + - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 46, 64, 51, 6, 71, 7, 18] + => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 46, 0, 51, 6, 0, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx1_s8 => vqtbx1_s8: + - table[int8x16_t]: [ + 0_i8, -17, 34, 51, 68, 85, 102, 119, + -106, -93, -84, -117, -104, -116, -72, -121 + ] | + - ext[int8x8_t]: [100_i8, -101, 102, -103, 104, -105, 106, -107] | + - ctrl[i8x8]: [127_i8, 15, 1, 14, 2, 13, 3, 12] => [100_i8, -121, -17, -72, 34, -116, 51, -104] | + - ctrl[i8x8]: [4_i8, 11, 16, 10, 6, 19, 7, 18] => [68_i8, -117, 102, -84, 102, -105, 119, -107] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx1q_s8 => vqtbx1q_s8: + - table[int8x16_t]: [ + 0_i8, -17, 34, 51, 68, 85, 102, 119, + -106, -93, -84, -117, -104, -116, -72, -121 + ] | + - ext[int8x16_t]: [ + 100_i8, -101, 102, -103, 104, -105, 106, -107, + 108, -109, 110, -111, 112, -113, 114, -115 + ] | + - ctrl[i8x16]: [127_i8, 15, 1, 14, 2, 13, 3, 12, 4_i8, 11, 16, 10, 6, 19, 7, 18] + => [100_i8, -121, -17, -72, 34, -116, 51, -104, 68, -117, 110, -84, 102, -113, 119, -115] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx1_u8 => vqtbx1_u8: + - table[uint8x16_t]: [ + 0_u8, 17, 34, 51, 68, 85, 102, 119, + 106, 93, 84, 117, 104, 116, 72, 121 + ] | + - ext[uint8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] | + - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [100_u8, 121, 17, 72, 34, 116, 51, 104] | + - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 117, 102, 84, 102, 105, 119, 107] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx1q_u8 => vqtbx1q_u8: + - table[uint8x16_t]: [ + 0_u8, 17, 34, 51, 68, 85, 102, 119, + 106, 93, 84, 117, 104, 116, 72, 121 + ] | + - ext[uint8x16_t]: [ + 100_u8, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115 + ] | + - ctrl[u8x16]: [127_u8, 15, 1, 14, 2, 13, 3, 12, 4_u8, 11, 16, 10, 6, 19, 7, 18] + => [100_u8, 121, 17, 72, 34, 116, 51, 104, 68, 117, 110, 84, 102, 113, 119, 115] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx1_p8 => vqtbx1_p8: + - table[poly8x16_t]: [ + 0_u8, 17, 34, 51, 68, 85, 102, 119, + 106, 93, 84, 117, 104, 116, 72, 121 + ] | + - ext[poly8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] | + - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [100_u8, 121, 17, 72, 34, 116, 51, 104] | + - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 117, 102, 84, 102, 105, 119, 107] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx1q_p8 => vqtbx1q_p8: + - table[poly8x16_t]: [ + 0_u8, 17, 34, 51, 68, 85, 102, 119, + 106, 93, 84, 117, 104, 116, 72, 121 + ] | + - ext[poly8x16_t]: [ + 100_u8, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115 + ] | + - ctrl[u8x16]: [127_u8, 15, 1, 14, 2, 13, 3, 12, 4_u8, 11, 16, 10, 6, 19, 7, 18] + => [100_u8, 121, 17, 72, 34, 116, 51, 104, 68, 
117, 110, 84, 102, 113, 119, 115] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx2_s8 => vqtbx2_s8: + - table[int8x16x2_t]: [ + 0_i8, -1, 2, -3, 4, -5, 6, -7, + 8, -9, 10, -11, 12, -13, 14, -15, + 16, -17, 18, -19, 20, -21, 22, -23, + 24, -25, 26, -27, 28, -29, 30, -31 + ] | + - ext[int8x8_t]: [100_i8, -101, 102, -103, 104, -105, 106, -107] | + - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [100_i8, -15, -1, 24, 2, -13, -3, -29] | + - ctrl[i8x8]: [4_i8, 31, 32, 10, 6, 49, 7, 18] => [4_i8, -31, 102, 10, 6, -105, -7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx2q_s8 => vqtbx2q_s8: + - table[int8x16x2_t]: [ + 0_i8, -1, 2, -3, 4, -5, 6, -7, + 8, -9, 10, -11, 12, -13, 14, -15, + 16, -17, 18, -19, 20, -21, 22, -23, + 24, -25, 26, -27, 28, -29, 30, -31 + ] | + - ext[int8x16_t]: [ + 100_i8, -101, 102, -103, 104, -105, 106, -107, + 108, -109, 110, -111, 112, -113, 114, -115 + ] | + - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 31, 32, 10, 6, 49, 7, 18] + => [100_i8, -15, -1, 24, 2, -13, -3, -29, 4, -31, 110, 10, 6, -113, -7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx2_u8 => vqtbx2_u8: + - table[uint8x16x2_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 + ] | + - ext[uint8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] | + - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] | + - ctrl[u8x8]: [4_u8, 31, 32, 10, 6, 49, 7, 18] => [4_u8, 31, 102, 10, 6, 105, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx2q_u8 => vqtbx2q_u8: + - table[uint8x16x2_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 + ] | + - ext[uint8x16_t]: [ + 100_u8, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115 + ] | + - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 31, 32, 10, 6, 49, 7, 18] + => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 31, 110, 10, 6, 113, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx2_p8 => vqtbx2_p8: + - table[poly8x16x2_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 + ] | + - ext[poly8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] | + - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] | + - ctrl[u8x8]: [4_u8, 31, 32, 10, 6, 49, 7, 18] => [4_u8, 31, 102, 10, 6, 105, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx2q_p8 => vqtbx2q_p8: + - table[poly8x16x2_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 + ] | + - ext[poly8x16_t]: [ + 100_u8, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115 + ] | + - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 31, 32, 10, 6, 49, 7, 18] + => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 31, 110, 10, 6, 113, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx3_s8 => vqtbx3_s8: + - table[int8x16x3_t]: [ + 0_i8, -1, 2, -3, 4, -5, 6, -7, + 8, -9, 10, -11, 12, -13, 14, -15, + 16, -17, 18, -19, 20, -21, 22, -23, + 24, -25, 26, -27, 28, -29, 30, -31, + 32, 
-33, 34, -35, 36, -37, 38, -39, + 40, -41, 42, -43, 44, -45, 46, -47 + ] | + - ext[int8x8_t]: [100_i8, -101, 102, -103, 104, -105, 106, -107] | + - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [100_i8, -15, -1, 24, 2, -13, -3, -29] | + - ctrl[i8x8]: [4_i8, 32, 46, 51, 6, 49, 7, 18] => [4_i8, 32, 46, -103, 6, -105, -7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx3q_s8 => vqtbx3q_s8: + - table[int8x16x3_t]: [ + 0_i8, -1, 2, -3, 4, -5, 6, -7, + 8, -9, 10, -11, 12, -13, 14, -15, + 16, -17, 18, -19, 20, -21, 22, -23, + 24, -25, 26, -27, 28, -29, 30, -31, + 32, -33, 34, -35, 36, -37, 38, -39, + 40, -41, 42, -43, 44, -45, 46, -47 + ] | + - ext[int8x16_t]: [ + 100_i8, -101, 102, -103, 104, -105, 106, -107, + 108, -109, 110, -111, 112, -113, 114, -115 + ] | + - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 32, 46, 51, 6, 49, 7, 18] + => [100_i8, -15, -1, 24, 2, -13, -3, -29, 4, 32, 46, -111, 6, -113, -7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx3_u8 => vqtbx3_u8: + - table[uint8x16x3_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47 + ] | + - ext[uint8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] | + - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] | + - ctrl[u8x8]: [4_u8, 32, 46, 51, 6, 49, 7, 18] => [4_u8, 32, 46, 103, 6, 105, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx3q_u8 => vqtbx3q_u8: + - table[uint8x16x3_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47 + ] | + - ext[uint8x16_t]: [ + 100_u8, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115 + ] | + - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 32, 46, 51, 6, 49, 7, 18] + => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 32, 46, 111, 6, 113, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx3_p8 => vqtbx3_p8: + - table[poly8x16x3_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47 + ] | + - ext[poly8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] | + - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] | + - ctrl[u8x8]: [4_u8, 32, 46, 51, 6, 49, 7, 18] => [4_u8, 32, 46, 103, 6, 105, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx3q_p8 => vqtbx3q_p8: + - table[poly8x16x3_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47 + ] | + - ext[poly8x16_t]: [ + 100_u8, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115 + ] | + - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 32, 46, 51, 6, 49, 7, 18] + => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 32, 46, 111, 6, 113, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx4_s8 => vqtbx4_s8: + - table[int8x16x4_t]: [ + 0_i8, -1, 2, -3, 4, -5, 6, -7, + 8, -9, 10, -11, 
12, -13, 14, -15, + 16, -17, 18, -19, 20, -21, 22, -23, + 24, -25, 26, -27, 28, -29, 30, -31, + 32, -33, 34, -35, 36, -37, 38, -39, + 40, -41, 42, -43, 44, -45, 46, -47, + 48, -49, 50, -51, 52, -53, 54, -55, + 56, -57, 58, -59, 60, -61, 62, -63 + ] | + - ext[int8x8_t]: [100_i8, -101, 102, -103, 104, -105, 106, -107] | + - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [100_i8, -15, -1, 24, 2, -13, -3, -29] | + - ctrl[i8x8]: [4_i8, 46, 64, 51, 6, 71, 7, 18] => [4_i8, 46, 102, -51, 6, -105, -7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx4q_s8 => vqtbx4q_s8: + - table[int8x16x4_t]: [ + 0_i8, -1, 2, -3, 4, -5, 6, -7, + 8, -9, 10, -11, 12, -13, 14, -15, + 16, -17, 18, -19, 20, -21, 22, -23, + 24, -25, 26, -27, 28, -29, 30, -31, + 32, -33, 34, -35, 36, -37, 38, -39, + 40, -41, 42, -43, 44, -45, 46, -47, + 48, -49, 50, -51, 52, -53, 54, -55, + 56, -57, 58, -59, 60, -61, 62, -63 + ] | + - ext[int8x16_t]: [ + 100_i8, -101, 102, -103, 104, -105, 106, -107, + 108, -109, 110, -111, 112, -113, 114, -115 + ] | + - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 46, 64, 51, 6, 71, 7, 18] + => [100_i8, -15, -1, 24, 2, -13, -3, -29, 4, 46, 110, -51, 6, -113, -7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx4_u8 => vqtbx4_u8: + - table[uint8x16x4_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63 + ] | + - ext[uint8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] | + - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] | + - ctrl[u8x8]: [4_u8, 46, 64, 51, 6, 71, 7, 18] => [4_u8, 46, 102, 51, 6, 105, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx4q_u8 => vqtbx4q_u8: + - table[uint8x16x4_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63 + ] | + - ext[uint8x16_t]: [ + 100_u8, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115 + ] | + - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 46, 64, 51, 6, 71, 7, 18] + => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 46, 110, 51, 6, 113, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx4_p8 => vqtbx4_p8: + - table[poly8x16x4_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63 + ] | + - ext[poly8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] | + - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] | + - ctrl[u8x8]: [4_u8, 46, 64, 51, 6, 71, 7, 18] => [4_u8, 46, 102, 51, 6, 105, 7, 18] +); + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +test_vtbx!( + test_vqtbx4q_p8 => vqtbx4q_p8: + - table[poly8x16x4_t]: [ + 0_u8, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 
49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63 + ] | + - ext[poly8x16_t]: [ + 100_u8, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115 + ] | + - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 46, 64, 51, 6, 71, 7, 18] + => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 46, 110, 51, 6, 113, 7, 18] +); diff --git a/library/stdarch/crates/core_arch/src/arm_shared/test_support.rs b/library/stdarch/crates/core_arch/src/arm_shared/test_support.rs new file mode 100644 index 0000000000000..e2828f85561df --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm_shared/test_support.rs @@ -0,0 +1,836 @@ +#[cfg(target_arch = "arm")] +use crate::core_arch::arm::*; + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +use crate::core_arch::aarch64::*; + +use crate::core_arch::simd::*; +use std::{mem::transmute, vec::Vec}; + +macro_rules! V_u8 { + () => { + vec![0x00u8, 0x01u8, 0x02u8, 0x0Fu8, 0x80u8, 0xF0u8, 0xFFu8] + }; +} +macro_rules! V_u16 { + () => { + vec![ + 0x0000u16, 0x0101u16, 0x0202u16, 0x0F0Fu16, 0x8000u16, 0xF0F0u16, 0xFFFFu16, + ] + }; +} +macro_rules! V_u32 { + () => { + vec![ + 0x00000000u32, + 0x01010101u32, + 0x02020202u32, + 0x0F0F0F0Fu32, + 0x80000000u32, + 0xF0F0F0F0u32, + 0xFFFFFFFFu32, + ] + }; +} +macro_rules! V_u64 { + () => { + vec![ + 0x0000000000000000u64, + 0x0101010101010101u64, + 0x0202020202020202u64, + 0x0F0F0F0F0F0F0F0Fu64, + 0x8080808080808080u64, + 0xF0F0F0F0F0F0F0F0u64, + 0xFFFFFFFFFFFFFFFFu64, + ] + }; +} + +macro_rules! V_i8 { + () => { + vec![ + 0x00i8, 0x01i8, 0x02i8, 0x0Fi8, -128i8, /* 0x80 */ + -16i8, /* 0xF0 */ + -1i8, /* 0xFF */ + ] + }; +} +macro_rules! V_i16 { + () => { + vec![ + 0x0000i16, 0x0101i16, 0x0202i16, 0x0F0Fi16, -32768i16, /* 0x8000 */ + -3856i16, /* 0xF0F0 */ + -1i16, /* 0xFFF */ + ] + }; +} +macro_rules! V_i32 { + () => { + vec![ + 0x00000000i32, + 0x01010101i32, + 0x02020202i32, + 0x0F0F0F0Fi32, + -2139062144i32, /* 0x80000000 */ + -252645136i32, /* 0xF0F0F0F0 */ + -1i32, /* 0xFFFFFFFF */ + ] + }; +} + +macro_rules! V_i64 { + () => { + vec![ + 0x0000000000000000i64, + 0x0101010101010101i64, + 0x0202020202020202i64, + 0x0F0F0F0F0F0F0F0Fi64, + -9223372036854775808i64, /* 0x8000000000000000 */ + -1152921504606846976i64, /* 0xF000000000000000 */ + -1i64, /* 0xFFFFFFFFFFFFFFFF */ + ] + }; +} + +macro_rules! V_f32 { + () => { + vec![ + 0.0f32, + 1.0f32, + -1.0f32, + 1.2f32, + 2.4f32, + f32::MAX, + f32::MIN, + f32::INFINITY, + f32::NEG_INFINITY, + f32::NAN, + ] + }; +} + +macro_rules! to64 { + ($t : ident) => { + |v: $t| -> u64 { transmute(v) } + }; +} + +macro_rules! to128 { + ($t : ident) => { + |v: $t| -> u128 { transmute(v) } + }; +} + +pub(crate) fn test( + vals: Vec, + fill1: fn(T) -> V, + fill2: fn(U) -> W, + cast: fn(W) -> X, + test_fun: fn(V, V) -> W, + verify_fun: fn(T, T) -> U, +) where + T: Copy + core::fmt::Debug + std::cmp::PartialEq, + U: Copy + core::fmt::Debug + std::cmp::PartialEq, + V: Copy + core::fmt::Debug, + W: Copy + core::fmt::Debug, + X: Copy + core::fmt::Debug + std::cmp::PartialEq, +{ + let pairs = vals.iter().zip(vals.iter()); + + for (i, j) in pairs { + let a: V = fill1(*i); + let b: V = fill1(*j); + + let actual_pre: W = test_fun(a, b); + let expected_pre: W = fill2(verify_fun(*i, *j)); + + let actual: X = cast(actual_pre); + let expected: X = cast(expected_pre); + + assert_eq!( + actual, expected, + "[{:?}:{:?}] :\nf({:?}, {:?}) = {:?}\ng({:?}, {:?}) = {:?}\n", + *i, *j, &a, &b, actual_pre, &a, &b, expected_pre + ); + } +} + +macro_rules! 
gen_test_fn { + ($n: ident, $t: ident, $u: ident, $v: ident, $w: ident, $x: ident, $vals: expr, $fill1: expr, $fill2: expr, $cast: expr) => { + pub(crate) fn $n(test_fun: fn($v, $v) -> $w, verify_fun: fn($t, $t) -> $u) { + unsafe { + test::<$t, $u, $v, $w, $x>($vals, $fill1, $fill2, $cast, test_fun, verify_fun) + }; + } + }; +} + +macro_rules! gen_fill_fn { + ($id: ident, $el_width: expr, $num_els: expr, $in_t : ident, $out_t: ident, $cmp_t: ident) => { + pub(crate) fn $id(val: $in_t) -> $out_t { + let initial: [$in_t; $num_els] = [val; $num_els]; + let result: $cmp_t = unsafe { transmute(initial) }; + let result_out: $out_t = unsafe { transmute(result) }; + + // println!("FILL: {:016x} as {} x {}: {:016x}", val.reverse_bits(), $el_width, $num_els, (result as u64).reverse_bits()); + + result_out + } + }; +} + +gen_fill_fn!(fill_u8, 8, 8, u8, uint8x8_t, u64); +gen_fill_fn!(fill_s8, 8, 8, i8, int8x8_t, u64); +gen_fill_fn!(fillq_u8, 8, 16, u8, uint8x16_t, u128); +gen_fill_fn!(fillq_s8, 8, 16, i8, int8x16_t, u128); + +gen_fill_fn!(fill_u16, 16, 4, u16, uint16x4_t, u64); +gen_fill_fn!(fill_s16, 16, 4, i16, int16x4_t, u64); +gen_fill_fn!(fillq_u16, 16, 8, u16, uint16x8_t, u128); +gen_fill_fn!(fillq_s16, 16, 8, i16, int16x8_t, u128); + +gen_fill_fn!(fill_u32, 32, 2, u32, uint32x2_t, u64); +gen_fill_fn!(fill_s32, 32, 2, i32, int32x2_t, u64); +gen_fill_fn!(fillq_u32, 32, 4, u32, uint32x4_t, u128); +gen_fill_fn!(fillq_s32, 32, 4, i32, int32x4_t, u128); + +gen_fill_fn!(fill_u64, 64, 1, u64, uint64x1_t, u64); +gen_fill_fn!(fill_s64, 64, 1, i64, int64x1_t, u64); +gen_fill_fn!(fillq_u64, 64, 2, u64, uint64x2_t, u128); +gen_fill_fn!(fillq_s64, 64, 2, i64, int64x2_t, u128); + +gen_fill_fn!(fill_f32, 32, 2, f32, float32x2_t, u64); +gen_fill_fn!(fillq_f32, 32, 4, f32, float32x4_t, u128); + +gen_test_fn!( + test_ari_u8, + u8, + u8, + uint8x8_t, + uint8x8_t, + u64, + V_u8!(), + fill_u8, + fill_u8, + to64!(uint8x8_t) +); +gen_test_fn!( + test_bit_u8, + u8, + u8, + uint8x8_t, + uint8x8_t, + u64, + V_u8!(), + fill_u8, + fill_u8, + to64!(uint8x8_t) +); +gen_test_fn!( + test_cmp_u8, + u8, + u8, + uint8x8_t, + uint8x8_t, + u64, + V_u8!(), + fill_u8, + fill_u8, + to64!(uint8x8_t) +); +gen_test_fn!( + testq_ari_u8, + u8, + u8, + uint8x16_t, + uint8x16_t, + u128, + V_u8!(), + fillq_u8, + fillq_u8, + to128!(uint8x16_t) +); +gen_test_fn!( + testq_bit_u8, + u8, + u8, + uint8x16_t, + uint8x16_t, + u128, + V_u8!(), + fillq_u8, + fillq_u8, + to128!(uint8x16_t) +); +gen_test_fn!( + testq_cmp_u8, + u8, + u8, + uint8x16_t, + uint8x16_t, + u128, + V_u8!(), + fillq_u8, + fillq_u8, + to128!(uint8x16_t) +); + +gen_test_fn!( + test_ari_s8, + i8, + i8, + int8x8_t, + int8x8_t, + u64, + V_i8!(), + fill_s8, + fill_s8, + to64!(int8x8_t) +); +gen_test_fn!( + test_bit_s8, + i8, + i8, + int8x8_t, + int8x8_t, + u64, + V_i8!(), + fill_s8, + fill_s8, + to64!(int8x8_t) +); +gen_test_fn!( + test_cmp_s8, + i8, + u8, + int8x8_t, + uint8x8_t, + u64, + V_i8!(), + fill_s8, + fill_u8, + to64!(uint8x8_t) +); +gen_test_fn!( + testq_ari_s8, + i8, + i8, + int8x16_t, + int8x16_t, + u128, + V_i8!(), + fillq_s8, + fillq_s8, + to128!(int8x16_t) +); +gen_test_fn!( + testq_bit_s8, + i8, + i8, + int8x16_t, + int8x16_t, + u128, + V_i8!(), + fillq_s8, + fillq_s8, + to128!(int8x16_t) +); +gen_test_fn!( + testq_cmp_s8, + i8, + u8, + int8x16_t, + uint8x16_t, + u128, + V_i8!(), + fillq_s8, + fillq_u8, + to128!(uint8x16_t) +); + +gen_test_fn!( + test_ari_u16, + u16, + u16, + uint16x4_t, + uint16x4_t, + u64, + V_u16!(), + fill_u16, + fill_u16, + to64!(uint16x4_t) +); 
+gen_test_fn!( + test_bit_u16, + u16, + u16, + uint16x4_t, + uint16x4_t, + u64, + V_u16!(), + fill_u16, + fill_u16, + to64!(uint16x4_t) +); +gen_test_fn!( + test_cmp_u16, + u16, + u16, + uint16x4_t, + uint16x4_t, + u64, + V_u16!(), + fill_u16, + fill_u16, + to64!(uint16x4_t) +); +gen_test_fn!( + testq_ari_u16, + u16, + u16, + uint16x8_t, + uint16x8_t, + u128, + V_u16!(), + fillq_u16, + fillq_u16, + to128!(uint16x8_t) +); +gen_test_fn!( + testq_bit_u16, + u16, + u16, + uint16x8_t, + uint16x8_t, + u128, + V_u16!(), + fillq_u16, + fillq_u16, + to128!(uint16x8_t) +); +gen_test_fn!( + testq_cmp_u16, + u16, + u16, + uint16x8_t, + uint16x8_t, + u128, + V_u16!(), + fillq_u16, + fillq_u16, + to128!(uint16x8_t) +); + +gen_test_fn!( + test_ari_s16, + i16, + i16, + int16x4_t, + int16x4_t, + u64, + V_i16!(), + fill_s16, + fill_s16, + to64!(int16x4_t) +); +gen_test_fn!( + test_bit_s16, + i16, + i16, + int16x4_t, + int16x4_t, + u64, + V_i16!(), + fill_s16, + fill_s16, + to64!(int16x4_t) +); +gen_test_fn!( + test_cmp_s16, + i16, + u16, + int16x4_t, + uint16x4_t, + u64, + V_i16!(), + fill_s16, + fill_u16, + to64!(uint16x4_t) +); +gen_test_fn!( + testq_ari_s16, + i16, + i16, + int16x8_t, + int16x8_t, + u128, + V_i16!(), + fillq_s16, + fillq_s16, + to128!(int16x8_t) +); +gen_test_fn!( + testq_bit_s16, + i16, + i16, + int16x8_t, + int16x8_t, + u128, + V_i16!(), + fillq_s16, + fillq_s16, + to128!(int16x8_t) +); +gen_test_fn!( + testq_cmp_s16, + i16, + u16, + int16x8_t, + uint16x8_t, + u128, + V_i16!(), + fillq_s16, + fillq_u16, + to128!(uint16x8_t) +); + +gen_test_fn!( + test_ari_u32, + u32, + u32, + uint32x2_t, + uint32x2_t, + u64, + V_u32!(), + fill_u32, + fill_u32, + to64!(uint32x2_t) +); +gen_test_fn!( + test_bit_u32, + u32, + u32, + uint32x2_t, + uint32x2_t, + u64, + V_u32!(), + fill_u32, + fill_u32, + to64!(uint32x2_t) +); +gen_test_fn!( + test_cmp_u32, + u32, + u32, + uint32x2_t, + uint32x2_t, + u64, + V_u32!(), + fill_u32, + fill_u32, + to64!(uint32x2_t) +); +gen_test_fn!( + testq_ari_u32, + u32, + u32, + uint32x4_t, + uint32x4_t, + u128, + V_u32!(), + fillq_u32, + fillq_u32, + to128!(uint32x4_t) +); +gen_test_fn!( + testq_bit_u32, + u32, + u32, + uint32x4_t, + uint32x4_t, + u128, + V_u32!(), + fillq_u32, + fillq_u32, + to128!(uint32x4_t) +); +gen_test_fn!( + testq_cmp_u32, + u32, + u32, + uint32x4_t, + uint32x4_t, + u128, + V_u32!(), + fillq_u32, + fillq_u32, + to128!(uint32x4_t) +); + +gen_test_fn!( + test_ari_s32, + i32, + i32, + int32x2_t, + int32x2_t, + u64, + V_i32!(), + fill_s32, + fill_s32, + to64!(int32x2_t) +); +gen_test_fn!( + test_bit_s32, + i32, + i32, + int32x2_t, + int32x2_t, + u64, + V_i32!(), + fill_s32, + fill_s32, + to64!(int32x2_t) +); +gen_test_fn!( + test_cmp_s32, + i32, + u32, + int32x2_t, + uint32x2_t, + u64, + V_i32!(), + fill_s32, + fill_u32, + to64!(uint32x2_t) +); +gen_test_fn!( + testq_ari_s32, + i32, + i32, + int32x4_t, + int32x4_t, + u128, + V_i32!(), + fillq_s32, + fillq_s32, + to128!(int32x4_t) +); +gen_test_fn!( + testq_bit_s32, + i32, + i32, + int32x4_t, + int32x4_t, + u128, + V_i32!(), + fillq_s32, + fillq_s32, + to128!(int32x4_t) +); +gen_test_fn!( + testq_cmp_s32, + i32, + u32, + int32x4_t, + uint32x4_t, + u128, + V_i32!(), + fillq_s32, + fillq_u32, + to128!(uint32x4_t) +); + +gen_test_fn!( + test_ari_u64, + u64, + u64, + uint64x1_t, + uint64x1_t, + u64, + V_u64!(), + fill_u64, + fill_u64, + to64!(uint64x1_t) +); +gen_test_fn!( + test_bit_u64, + u64, + u64, + uint64x1_t, + uint64x1_t, + u64, + V_u64!(), + fill_u64, + fill_u64, + to64!(uint64x1_t) +); +gen_test_fn!( 
+ test_cmp_u64, + u64, + u64, + uint64x1_t, + uint64x1_t, + u64, + V_u64!(), + fill_u64, + fill_u64, + to64!(uint64x1_t) +); +gen_test_fn!( + testq_ari_u64, + u64, + u64, + uint64x2_t, + uint64x2_t, + u128, + V_u64!(), + fillq_u64, + fillq_u64, + to128!(uint64x2_t) +); +gen_test_fn!( + testq_bit_u64, + u64, + u64, + uint64x2_t, + uint64x2_t, + u128, + V_u64!(), + fillq_u64, + fillq_u64, + to128!(uint64x2_t) +); +gen_test_fn!( + testq_cmp_u64, + u64, + u64, + uint64x2_t, + uint64x2_t, + u128, + V_u64!(), + fillq_u64, + fillq_u64, + to128!(uint64x2_t) +); + +gen_test_fn!( + test_ari_s64, + i64, + i64, + int64x1_t, + int64x1_t, + u64, + V_i64!(), + fill_s64, + fill_s64, + to64!(int64x1_t) +); +gen_test_fn!( + test_bit_s64, + i64, + i64, + int64x1_t, + int64x1_t, + u64, + V_i64!(), + fill_s64, + fill_s64, + to64!(int64x1_t) +); +gen_test_fn!( + test_cmp_s64, + i64, + u64, + int64x1_t, + uint64x1_t, + u64, + V_i64!(), + fill_s64, + fill_u64, + to64!(uint64x1_t) +); +gen_test_fn!( + testq_ari_s64, + i64, + i64, + int64x2_t, + int64x2_t, + u128, + V_i64!(), + fillq_s64, + fillq_s64, + to128!(int64x2_t) +); +gen_test_fn!( + testq_bit_s64, + i64, + i64, + int64x2_t, + int64x2_t, + u128, + V_i64!(), + fillq_s64, + fillq_s64, + to128!(int64x2_t) +); +gen_test_fn!( + testq_cmp_s64, + i64, + u64, + int64x2_t, + uint64x2_t, + u128, + V_i64!(), + fillq_s64, + fillq_u64, + to128!(uint64x2_t) +); + +gen_test_fn!( + test_ari_f32, + f32, + f32, + float32x2_t, + float32x2_t, + u64, + V_f32!(), + fill_f32, + fill_f32, + to64!(float32x2_t) +); +gen_test_fn!( + test_cmp_f32, + f32, + u32, + float32x2_t, + uint32x2_t, + u64, + V_f32!(), + fill_f32, + fill_u32, + to64!(uint32x2_t) +); +gen_test_fn!( + testq_ari_f32, + f32, + f32, + float32x4_t, + float32x4_t, + u128, + V_f32!(), + fillq_f32, + fillq_f32, + to128!(float32x4_t) +); +gen_test_fn!( + testq_cmp_f32, + f32, + u32, + float32x4_t, + uint32x4_t, + u128, + V_f32!(), + fillq_f32, + fillq_u32, + to128!(uint32x4_t) +); diff --git a/library/stdarch/crates/core_arch/src/core_arch_docs.md b/library/stdarch/crates/core_arch/src/core_arch_docs.md new file mode 100644 index 0000000000000..bfa1b7228860b --- /dev/null +++ b/library/stdarch/crates/core_arch/src/core_arch_docs.md @@ -0,0 +1,350 @@ +SIMD and vendor intrinsics module. + +This module is intended to be the gateway to architecture-specific +intrinsic functions, typically related to SIMD (but not always!). Each +architecture that Rust compiles to may contain a submodule here, which +means that this is not a portable module! If you're writing a portable +library take care when using these APIs! + +Under this module you'll find an architecture-named module, such as +`x86_64`. Each `#[cfg(target_arch)]` that Rust can compile to may have a +module entry here, only present on that particular target. For example the +`i686-pc-windows-msvc` target will have an `x86` module here, whereas +`x86_64-pc-windows-msvc` has `x86_64`. + +[rfc]: https://github.com/rust-lang/rfcs/pull/2325 +[tracked]: https://github.com/rust-lang/rust/issues/48556 + +# Overview + +This module exposes vendor-specific intrinsics that typically correspond to +a single machine instruction. These intrinsics are not portable: their +availability is architecture-dependent, and not all machines of that +architecture might provide the intrinsic. + +The `arch` module is intended to be a low-level implementation detail for +higher-level APIs. 
Using it correctly can be quite tricky as you need to +ensure at least a few guarantees are upheld: + +* The correct architecture's module is used. For example the `arm` module + isn't available on the `x86_64-unknown-linux-gnu` target. This is + typically done by ensuring that `#[cfg]` is used appropriately when using + this module. +* The CPU the program is currently running on supports the function being + called. For example it is unsafe to call an AVX2 function on a CPU that + doesn't actually support AVX2. + +As a result of the latter of these guarantees all intrinsics in this module +are `unsafe` and extra care needs to be taken when calling them! + +# CPU Feature Detection + +In order to call these APIs in a safe fashion there's a number of +mechanisms available to ensure that the correct CPU feature is available +to call an intrinsic. Let's consider, for example, the `_mm256_add_epi64` +intrinsics on the `x86` and `x86_64` architectures. This function requires +the AVX2 feature as [documented by Intel][intel-dox] so to correctly call +this function we need to (a) guarantee we only call it on `x86`/`x86_64` +and (b) ensure that the CPU feature is available + +[intel-dox]: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64&expand=100 + +## Static CPU Feature Detection + +The first option available to us is to conditionally compile code via the +`#[cfg]` attribute. CPU features correspond to the `target_feature` cfg +available, and can be used like so: + +```ignore +#[cfg( + all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx2" + ) +)] +fn foo() { + #[cfg(target_arch = "x86")] + use std::arch::x86::_mm256_add_epi64; + #[cfg(target_arch = "x86_64")] + use std::arch::x86_64::_mm256_add_epi64; + + unsafe { + _mm256_add_epi64(...); + } +} +``` + +Here we're using `#[cfg(target_feature = "avx2")]` to conditionally compile +this function into our module. This means that if the `avx2` feature is +*enabled statically* then we'll use the `_mm256_add_epi64` function at +runtime. The `unsafe` block here can be justified through the usage of +`#[cfg]` to only compile the code in situations where the safety guarantees +are upheld. + +Statically enabling a feature is typically done with the `-C +target-feature` or `-C target-cpu` flags to the compiler. For example if +your local CPU supports AVX2 then you can compile the above function with: + +```sh +$ RUSTFLAGS='-C target-cpu=native' cargo build +``` + +Or otherwise you can specifically enable just the AVX2 feature: + +```sh +$ RUSTFLAGS='-C target-feature=+avx2' cargo build +``` + +Note that when you compile a binary with a particular feature enabled it's +important to ensure that you only run the binary on systems which satisfy +the required feature set. + +## Dynamic CPU Feature Detection + +Sometimes statically dispatching isn't quite what you want. Instead you +might want to build a portable binary that runs across a variety of CPUs, +but at runtime it selects the most optimized implementation available. This +allows you to build a "least common denominator" binary which has certain +sections more optimized for different CPUs. + +Taking our previous example from before, we're going to compile our binary +*without* AVX2 support, but we'd like to enable it for just one function. 
+We can do that in a manner like:
+
+```ignore
+fn foo() {
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    {
+        if is_x86_feature_detected!("avx2") {
+            return unsafe { foo_avx2() };
+        }
+    }
+
+    // fallback implementation without using AVX2
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[target_feature(enable = "avx2")]
+unsafe fn foo_avx2() {
+    #[cfg(target_arch = "x86")]
+    use std::arch::x86::_mm256_add_epi64;
+    #[cfg(target_arch = "x86_64")]
+    use std::arch::x86_64::_mm256_add_epi64;
+
+    unsafe { _mm256_add_epi64(...); }
+}
+```
+
+There are a couple of components in play here, so let's go through them in
+detail!
+
+* First up we notice the `is_x86_feature_detected!` macro. Provided by
+  the standard library, this macro will perform the necessary runtime detection
+  to determine whether the CPU the program is running on supports the
+  specified feature. In this case the macro will expand to a boolean
+  expression evaluating to whether the local CPU has the AVX2 feature or
+  not.
+
+  Note that this macro, like the `arch` module, is platform-specific. For
+  example, calling `is_x86_feature_detected!("avx2")` on ARM will be a
+  compile-time error. To ensure we don't hit this error, a statement-level
+  `#[cfg]` is used to only compile usage of the macro on `x86`/`x86_64`.
+
+* Next up we see our AVX2-enabled function, `foo_avx2`. This function is
+  decorated with the `#[target_feature]` attribute, which enables a CPU
+  feature for just this one function. Using a compiler flag like
+  `-C target-feature=+avx2` will enable AVX2 for the entire program, but using
+  an attribute will only enable it for the one function. Usage of the
+  `#[target_feature]` attribute currently requires the function to also be
+  `unsafe`, as we see here. This is because the function can only be
+  correctly called on systems which have the AVX2 feature (like the intrinsics
+  themselves).
+
+And with all that we should have a working program! This program will run
+across all machines and it'll use the optimized AVX2 implementation on
+machines where support is detected.
+
+# Ergonomics
+
+It's important to note that using the `arch` module is not the easiest
+thing in the world, so if you're curious to try it out you may want to
+brace yourself for some wordiness!
+
+The primary purpose of this module is to enable stable crates on crates.io
+to build up much more ergonomic abstractions which end up using SIMD under
+the hood. Over time these abstractions may also move into the standard
+library itself, but for now this module is tasked with providing the bare
+minimum necessary to use vendor intrinsics on stable Rust.
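+
+One shape such an abstraction can take is a safe function that performs the
+runtime check once and afterwards dispatches through a plain function
+pointer. The sketch below is illustrative only: the `sum*` functions and the
+`SumFn` alias are hypothetical names, not items provided by this module, and
+the example assumes the same x86/AVX2 setup as above.
+
+```ignore
+use std::sync::OnceLock;
+
+type SumFn = fn(&[u8]) -> u32;
+
+pub fn sum(values: &[u8]) -> u32 {
+    // Resolve the preferred implementation once; later calls reuse the
+    // stored function pointer.
+    static IMPL: OnceLock<SumFn> = OnceLock::new();
+    let f = IMPL.get_or_init(|| {
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        {
+            if is_x86_feature_detected!("avx2") {
+                // Wrap the unsafe, feature-gated function in a safe shim so
+                // it can be stored as an ordinary `fn` pointer.
+                return |values: &[u8]| unsafe { sum_avx2(values) };
+            }
+        }
+        sum_fallback
+    });
+    f(values)
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[target_feature(enable = "avx2")]
+unsafe fn sum_avx2(values: &[u8]) -> u32 {
+    // With AVX2 enabled for this function the fallback loop below can be
+    // auto-vectorized; real code would use the intrinsics directly.
+    sum_fallback(values)
+}
+
+fn sum_fallback(values: &[u8]) -> u32 {
+    values.iter().map(|&v| u32::from(v)).sum()
+}
+```
+
+Whether this indirection is worth it depends on the crate: the
+`is_x86_feature_detected!` macro caches its result internally, so calling it
+directly at each entry point, as in the examples above, is also perfectly
+reasonable.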
+ +# Other architectures + +This documentation is only for one particular architecture, you can find +others at: + +* [`x86`] +* [`x86_64`] +* [`arm`] +* [`aarch64`] +* [`riscv32`] +* [`riscv64`] +* [`mips`] +* [`mips64`] +* [`powerpc`] +* [`powerpc64`] +* [`nvptx`] +* [`wasm32`] +* [`loongarch64`] +* [`s390x`] + +[`x86`]: ../../core/arch/x86/index.html +[`x86_64`]: ../../core/arch/x86_64/index.html +[`arm`]: ../../core/arch/arm/index.html +[`aarch64`]: ../../core/arch/aarch64/index.html +[`riscv32`]: ../../core/arch/riscv32/index.html +[`riscv64`]: ../../core/arch/riscv64/index.html +[`mips`]: ../../core/arch/mips/index.html +[`mips64`]: ../../core/arch/mips64/index.html +[`powerpc`]: ../../core/arch/powerpc/index.html +[`powerpc64`]: ../../core/arch/powerpc64/index.html +[`nvptx`]: ../../core/arch/nvptx/index.html +[`wasm32`]: ../../core/arch/wasm32/index.html +[`loongarch64`]: ../../core/arch/loongarch64/index.html +[`s390x`]: ../../core/arch/s390x/index.html + +# Examples + +First let's take a look at not actually using any intrinsics but instead +using LLVM's auto-vectorization to produce optimized vectorized code for +AVX2 and also for the default platform. + +```rust +fn main() { + let mut dst = [0]; + add_quickly(&[1], &[2], &mut dst); + assert_eq!(dst[0], 3); +} + +fn add_quickly(a: &[u8], b: &[u8], c: &mut [u8]) { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + // Note that this `unsafe` block is safe because we're testing + // that the `avx2` feature is indeed available on our CPU. + if is_x86_feature_detected!("avx2") { + return unsafe { add_quickly_avx2(a, b, c) }; + } + } + + add_quickly_fallback(a, b, c) +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[target_feature(enable = "avx2")] +unsafe fn add_quickly_avx2(a: &[u8], b: &[u8], c: &mut [u8]) { + add_quickly_fallback(a, b, c) // the function below is inlined here +} + +fn add_quickly_fallback(a: &[u8], b: &[u8], c: &mut [u8]) { + for ((a, b), c) in a.iter().zip(b).zip(c) { + *c = *a + *b; + } +} +``` + +Next up let's take a look at an example of manually using intrinsics. Here +we'll be using SSE4.1 features to implement hex encoding. 
+ +``` +fn main() { + let mut dst = [0; 32]; + hex_encode(b"\x01\x02\x03", &mut dst); + assert_eq!(&dst[..6], b"010203"); + + let mut src = [0; 16]; + for i in 0..16 { + src[i] = (i + 1) as u8; + } + hex_encode(&src, &mut dst); + assert_eq!(&dst, b"0102030405060708090a0b0c0d0e0f10"); +} + +pub fn hex_encode(src: &[u8], dst: &mut [u8]) { + let len = src.len().checked_mul(2).unwrap(); + assert!(dst.len() >= len); + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("sse4.1") { + return unsafe { hex_encode_sse41(src, dst) }; + } + } + + hex_encode_fallback(src, dst) +} + +// translated from +// +#[target_feature(enable = "sse4.1")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +unsafe fn hex_encode_sse41(mut src: &[u8], dst: &mut [u8]) { + #[cfg(target_arch = "x86")] + use std::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use std::arch::x86_64::*; + + unsafe { + let ascii_zero = _mm_set1_epi8(b'0' as i8); + let nines = _mm_set1_epi8(9); + let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8); + let and4bits = _mm_set1_epi8(0xf); + + let mut i = 0_isize; + while src.len() >= 16 { + let invec = _mm_loadu_si128(src.as_ptr() as *const _); + + let masked1 = _mm_and_si128(invec, and4bits); + let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits); + + // return 0xff corresponding to the elements > 9, or 0x00 otherwise + let cmpmask1 = _mm_cmpgt_epi8(masked1, nines); + let cmpmask2 = _mm_cmpgt_epi8(masked2, nines); + + // add '0' or the offset depending on the masks + let masked1 = _mm_add_epi8( + masked1, + _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1), + ); + let masked2 = _mm_add_epi8( + masked2, + _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2), + ); + + // interleave masked1 and masked2 bytes + let res1 = _mm_unpacklo_epi8(masked2, masked1); + let res2 = _mm_unpackhi_epi8(masked2, masked1); + + _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1); + _mm_storeu_si128( + dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, + res2, + ); + src = &src[16..]; + i += 16; + } + + let i = i as usize; + hex_encode_fallback(src, &mut dst[i * 2..]); + } +} + +fn hex_encode_fallback(src: &[u8], dst: &mut [u8]) { + fn hex(byte: u8) -> u8 { + static TABLE: &[u8] = b"0123456789abcdef"; + TABLE[byte as usize] + } + + for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) { + slots[0] = hex((*byte >> 4) & 0xf); + slots[1] = hex(*byte & 0xf); + } +} +``` diff --git a/library/stdarch/crates/core_arch/src/lib.rs b/library/stdarch/crates/core_arch/src/lib.rs new file mode 100644 index 0000000000000..3a760fe276e2f --- /dev/null +++ b/library/stdarch/crates/core_arch/src/lib.rs @@ -0,0 +1,97 @@ +#![doc = include_str!("core_arch_docs.md")] +#![allow(improper_ctypes_definitions)] +#![allow(dead_code)] +#![allow(unused_features)] +#![allow(internal_features)] +#![allow(unsafe_op_in_unsafe_fn)] +#![deny(rust_2018_idioms)] +#![feature( + custom_inner_attributes, + link_llvm_intrinsics, + repr_simd, + simd_ffi, + proc_macro_hygiene, + stmt_expr_attributes, + core_intrinsics, + no_core, + fmt_helpers_for_derive, + rustc_attrs, + staged_api, + doc_cfg, + tbm_target_feature, + sse4a_target_feature, + riscv_target_feature, + arm_target_feature, + mips_target_feature, + powerpc_target_feature, + s390x_target_feature, + loongarch_target_feature, + wasm_target_feature, + abi_unadjusted, + rtm_target_feature, + allow_internal_unstable, + decl_macro, + generic_arg_infer, + asm_experimental_arch, + sha512_sm_x86, + x86_amx_intrinsics, + f16, + keylocker_x86, + 
aarch64_unstable_target_feature, + bigint_helper_methods +)] +#![cfg_attr(test, feature(test, abi_vectorcall, stdarch_internal))] +#![deny(clippy::missing_inline_in_public_items)] +#![allow( + clippy::identity_op, + clippy::inline_always, + clippy::too_many_arguments, + clippy::cast_sign_loss, + clippy::cast_lossless, + clippy::cast_possible_wrap, + clippy::cast_possible_truncation, + clippy::cast_precision_loss, + clippy::cognitive_complexity, + clippy::many_single_char_names, + clippy::missing_safety_doc, + clippy::shadow_reuse, + clippy::similar_names, + clippy::unusual_byte_groupings, + clippy::wrong_self_convention +)] +#![cfg_attr(test, allow(unused_imports))] +#![no_std] +#![stable(feature = "stdsimd", since = "1.27.0")] +#![doc( + test(attr(deny(warnings))), + test(attr(allow(dead_code, deprecated, unused_variables, unused_mut))) +)] +#![cfg_attr( + test, + feature( + stdarch_arm_feature_detection, + stdarch_powerpc_feature_detection, + stdarch_s390x_feature_detection + ) +)] + +#[cfg(test)] +#[macro_use] +extern crate std; +#[cfg(test)] +#[macro_use] +extern crate std_detect; +#[path = "mod.rs"] +mod core_arch; + +#[stable(feature = "stdsimd", since = "1.27.0")] +pub mod arch { + #[stable(feature = "stdsimd", since = "1.27.0")] + #[allow(unused_imports)] + pub use crate::core_arch::arch::*; + #[stable(feature = "stdsimd", since = "1.27.0")] + pub use core::arch::asm; +} + +#[allow(unused_imports)] +use core::{array, convert, ffi, fmt, hint, intrinsics, marker, mem, ops, ptr, sync}; diff --git a/library/stdarch/crates/core_arch/src/loongarch64/lasx/generated.rs b/library/stdarch/crates/core_arch/src/loongarch64/lasx/generated.rs new file mode 100644 index 0000000000000..2e56d8fb9b83d --- /dev/null +++ b/library/stdarch/crates/core_arch/src/loongarch64/lasx/generated.rs @@ -0,0 +1,7063 @@ +// This code is automatically generated. DO NOT MODIFY. 
+// +// Instead, modify `crates/stdarch-gen-loongarch/lasx.spec` and run the following command to re-generate this file: +// +// ``` +// OUT_DIR=`pwd`/crates/core_arch cargo run -p stdarch-gen-loongarch -- crates/stdarch-gen-loongarch/lasx.spec +// ``` + +use super::types::*; + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.loongarch.lasx.xvsll.b"] + fn __lasx_xvsll_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsll.h"] + fn __lasx_xvsll_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsll.w"] + fn __lasx_xvsll_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsll.d"] + fn __lasx_xvsll_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvslli.b"] + fn __lasx_xvslli_b(a: v32i8, b: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvslli.h"] + fn __lasx_xvslli_h(a: v16i16, b: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvslli.w"] + fn __lasx_xvslli_w(a: v8i32, b: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvslli.d"] + fn __lasx_xvslli_d(a: v4i64, b: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsra.b"] + fn __lasx_xvsra_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsra.h"] + fn __lasx_xvsra_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsra.w"] + fn __lasx_xvsra_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsra.d"] + fn __lasx_xvsra_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsrai.b"] + fn __lasx_xvsrai_b(a: v32i8, b: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsrai.h"] + fn __lasx_xvsrai_h(a: v16i16, b: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsrai.w"] + fn __lasx_xvsrai_w(a: v8i32, b: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsrai.d"] + fn __lasx_xvsrai_d(a: v4i64, b: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsrar.b"] + fn __lasx_xvsrar_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsrar.h"] + fn __lasx_xvsrar_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsrar.w"] + fn __lasx_xvsrar_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsrar.d"] + fn __lasx_xvsrar_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsrari.b"] + fn __lasx_xvsrari_b(a: v32i8, b: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsrari.h"] + fn __lasx_xvsrari_h(a: v16i16, b: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsrari.w"] + fn __lasx_xvsrari_w(a: v8i32, b: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsrari.d"] + fn __lasx_xvsrari_d(a: v4i64, b: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsrl.b"] + fn __lasx_xvsrl_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsrl.h"] + fn __lasx_xvsrl_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsrl.w"] + fn __lasx_xvsrl_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsrl.d"] + fn __lasx_xvsrl_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsrli.b"] + fn __lasx_xvsrli_b(a: v32i8, b: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsrli.h"] + fn __lasx_xvsrli_h(a: v16i16, b: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsrli.w"] + fn __lasx_xvsrli_w(a: v8i32, b: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsrli.d"] + fn __lasx_xvsrli_d(a: v4i64, b: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsrlr.b"] + fn __lasx_xvsrlr_b(a: v32i8, 
b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsrlr.h"] + fn __lasx_xvsrlr_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsrlr.w"] + fn __lasx_xvsrlr_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsrlr.d"] + fn __lasx_xvsrlr_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsrlri.b"] + fn __lasx_xvsrlri_b(a: v32i8, b: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsrlri.h"] + fn __lasx_xvsrlri_h(a: v16i16, b: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsrlri.w"] + fn __lasx_xvsrlri_w(a: v8i32, b: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsrlri.d"] + fn __lasx_xvsrlri_d(a: v4i64, b: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvbitclr.b"] + fn __lasx_xvbitclr_b(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvbitclr.h"] + fn __lasx_xvbitclr_h(a: v16u16, b: v16u16) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvbitclr.w"] + fn __lasx_xvbitclr_w(a: v8u32, b: v8u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvbitclr.d"] + fn __lasx_xvbitclr_d(a: v4u64, b: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvbitclri.b"] + fn __lasx_xvbitclri_b(a: v32u8, b: u32) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvbitclri.h"] + fn __lasx_xvbitclri_h(a: v16u16, b: u32) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvbitclri.w"] + fn __lasx_xvbitclri_w(a: v8u32, b: u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvbitclri.d"] + fn __lasx_xvbitclri_d(a: v4u64, b: u32) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvbitset.b"] + fn __lasx_xvbitset_b(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvbitset.h"] + fn __lasx_xvbitset_h(a: v16u16, b: v16u16) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvbitset.w"] + fn __lasx_xvbitset_w(a: v8u32, b: v8u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvbitset.d"] + fn __lasx_xvbitset_d(a: v4u64, b: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvbitseti.b"] + fn __lasx_xvbitseti_b(a: v32u8, b: u32) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvbitseti.h"] + fn __lasx_xvbitseti_h(a: v16u16, b: u32) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvbitseti.w"] + fn __lasx_xvbitseti_w(a: v8u32, b: u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvbitseti.d"] + fn __lasx_xvbitseti_d(a: v4u64, b: u32) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvbitrev.b"] + fn __lasx_xvbitrev_b(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvbitrev.h"] + fn __lasx_xvbitrev_h(a: v16u16, b: v16u16) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvbitrev.w"] + fn __lasx_xvbitrev_w(a: v8u32, b: v8u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvbitrev.d"] + fn __lasx_xvbitrev_d(a: v4u64, b: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvbitrevi.b"] + fn __lasx_xvbitrevi_b(a: v32u8, b: u32) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvbitrevi.h"] + fn __lasx_xvbitrevi_h(a: v16u16, b: u32) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvbitrevi.w"] + fn __lasx_xvbitrevi_w(a: v8u32, b: u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvbitrevi.d"] + fn __lasx_xvbitrevi_d(a: v4u64, b: u32) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvadd.b"] + fn __lasx_xvadd_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvadd.h"] + fn __lasx_xvadd_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvadd.w"] + fn __lasx_xvadd_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvadd.d"] + fn 
__lasx_xvadd_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvaddi.bu"] + fn __lasx_xvaddi_bu(a: v32i8, b: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvaddi.hu"] + fn __lasx_xvaddi_hu(a: v16i16, b: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvaddi.wu"] + fn __lasx_xvaddi_wu(a: v8i32, b: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvaddi.du"] + fn __lasx_xvaddi_du(a: v4i64, b: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsub.b"] + fn __lasx_xvsub_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsub.h"] + fn __lasx_xvsub_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsub.w"] + fn __lasx_xvsub_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsub.d"] + fn __lasx_xvsub_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsubi.bu"] + fn __lasx_xvsubi_bu(a: v32i8, b: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsubi.hu"] + fn __lasx_xvsubi_hu(a: v16i16, b: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsubi.wu"] + fn __lasx_xvsubi_wu(a: v8i32, b: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsubi.du"] + fn __lasx_xvsubi_du(a: v4i64, b: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmax.b"] + fn __lasx_xvmax_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvmax.h"] + fn __lasx_xvmax_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmax.w"] + fn __lasx_xvmax_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmax.d"] + fn __lasx_xvmax_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmaxi.b"] + fn __lasx_xvmaxi_b(a: v32i8, b: i32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvmaxi.h"] + fn __lasx_xvmaxi_h(a: v16i16, b: i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmaxi.w"] + fn __lasx_xvmaxi_w(a: v8i32, b: i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmaxi.d"] + fn __lasx_xvmaxi_d(a: v4i64, b: i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmax.bu"] + fn __lasx_xvmax_bu(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvmax.hu"] + fn __lasx_xvmax_hu(a: v16u16, b: v16u16) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvmax.wu"] + fn __lasx_xvmax_wu(a: v8u32, b: v8u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvmax.du"] + fn __lasx_xvmax_du(a: v4u64, b: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvmaxi.bu"] + fn __lasx_xvmaxi_bu(a: v32u8, b: u32) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvmaxi.hu"] + fn __lasx_xvmaxi_hu(a: v16u16, b: u32) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvmaxi.wu"] + fn __lasx_xvmaxi_wu(a: v8u32, b: u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvmaxi.du"] + fn __lasx_xvmaxi_du(a: v4u64, b: u32) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvmin.b"] + fn __lasx_xvmin_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvmin.h"] + fn __lasx_xvmin_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmin.w"] + fn __lasx_xvmin_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmin.d"] + fn __lasx_xvmin_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmini.b"] + fn __lasx_xvmini_b(a: v32i8, b: i32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvmini.h"] + fn __lasx_xvmini_h(a: v16i16, b: i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmini.w"] + fn __lasx_xvmini_w(a: v8i32, b: i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmini.d"] + fn 
__lasx_xvmini_d(a: v4i64, b: i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmin.bu"] + fn __lasx_xvmin_bu(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvmin.hu"] + fn __lasx_xvmin_hu(a: v16u16, b: v16u16) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvmin.wu"] + fn __lasx_xvmin_wu(a: v8u32, b: v8u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvmin.du"] + fn __lasx_xvmin_du(a: v4u64, b: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvmini.bu"] + fn __lasx_xvmini_bu(a: v32u8, b: u32) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvmini.hu"] + fn __lasx_xvmini_hu(a: v16u16, b: u32) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvmini.wu"] + fn __lasx_xvmini_wu(a: v8u32, b: u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvmini.du"] + fn __lasx_xvmini_du(a: v4u64, b: u32) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvseq.b"] + fn __lasx_xvseq_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvseq.h"] + fn __lasx_xvseq_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvseq.w"] + fn __lasx_xvseq_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvseq.d"] + fn __lasx_xvseq_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvseqi.b"] + fn __lasx_xvseqi_b(a: v32i8, b: i32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvseqi.h"] + fn __lasx_xvseqi_h(a: v16i16, b: i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvseqi.w"] + fn __lasx_xvseqi_w(a: v8i32, b: i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvseqi.d"] + fn __lasx_xvseqi_d(a: v4i64, b: i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvslt.b"] + fn __lasx_xvslt_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvslt.h"] + fn __lasx_xvslt_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvslt.w"] + fn __lasx_xvslt_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvslt.d"] + fn __lasx_xvslt_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvslti.b"] + fn __lasx_xvslti_b(a: v32i8, b: i32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvslti.h"] + fn __lasx_xvslti_h(a: v16i16, b: i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvslti.w"] + fn __lasx_xvslti_w(a: v8i32, b: i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvslti.d"] + fn __lasx_xvslti_d(a: v4i64, b: i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvslt.bu"] + fn __lasx_xvslt_bu(a: v32u8, b: v32u8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvslt.hu"] + fn __lasx_xvslt_hu(a: v16u16, b: v16u16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvslt.wu"] + fn __lasx_xvslt_wu(a: v8u32, b: v8u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvslt.du"] + fn __lasx_xvslt_du(a: v4u64, b: v4u64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvslti.bu"] + fn __lasx_xvslti_bu(a: v32u8, b: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvslti.hu"] + fn __lasx_xvslti_hu(a: v16u16, b: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvslti.wu"] + fn __lasx_xvslti_wu(a: v8u32, b: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvslti.du"] + fn __lasx_xvslti_du(a: v4u64, b: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsle.b"] + fn __lasx_xvsle_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsle.h"] + fn __lasx_xvsle_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsle.w"] + fn __lasx_xvsle_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsle.d"] + fn 
__lasx_xvsle_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvslei.b"] + fn __lasx_xvslei_b(a: v32i8, b: i32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvslei.h"] + fn __lasx_xvslei_h(a: v16i16, b: i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvslei.w"] + fn __lasx_xvslei_w(a: v8i32, b: i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvslei.d"] + fn __lasx_xvslei_d(a: v4i64, b: i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsle.bu"] + fn __lasx_xvsle_bu(a: v32u8, b: v32u8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsle.hu"] + fn __lasx_xvsle_hu(a: v16u16, b: v16u16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsle.wu"] + fn __lasx_xvsle_wu(a: v8u32, b: v8u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsle.du"] + fn __lasx_xvsle_du(a: v4u64, b: v4u64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvslei.bu"] + fn __lasx_xvslei_bu(a: v32u8, b: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvslei.hu"] + fn __lasx_xvslei_hu(a: v16u16, b: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvslei.wu"] + fn __lasx_xvslei_wu(a: v8u32, b: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvslei.du"] + fn __lasx_xvslei_du(a: v4u64, b: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsat.b"] + fn __lasx_xvsat_b(a: v32i8, b: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsat.h"] + fn __lasx_xvsat_h(a: v16i16, b: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsat.w"] + fn __lasx_xvsat_w(a: v8i32, b: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsat.d"] + fn __lasx_xvsat_d(a: v4i64, b: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsat.bu"] + fn __lasx_xvsat_bu(a: v32u8, b: u32) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvsat.hu"] + fn __lasx_xvsat_hu(a: v16u16, b: u32) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvsat.wu"] + fn __lasx_xvsat_wu(a: v8u32, b: u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvsat.du"] + fn __lasx_xvsat_du(a: v4u64, b: u32) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvadda.b"] + fn __lasx_xvadda_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvadda.h"] + fn __lasx_xvadda_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvadda.w"] + fn __lasx_xvadda_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvadda.d"] + fn __lasx_xvadda_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsadd.b"] + fn __lasx_xvsadd_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsadd.h"] + fn __lasx_xvsadd_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsadd.w"] + fn __lasx_xvsadd_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsadd.d"] + fn __lasx_xvsadd_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsadd.bu"] + fn __lasx_xvsadd_bu(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvsadd.hu"] + fn __lasx_xvsadd_hu(a: v16u16, b: v16u16) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvsadd.wu"] + fn __lasx_xvsadd_wu(a: v8u32, b: v8u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvsadd.du"] + fn __lasx_xvsadd_du(a: v4u64, b: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvavg.b"] + fn __lasx_xvavg_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvavg.h"] + fn __lasx_xvavg_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvavg.w"] + fn __lasx_xvavg_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvavg.d"] + fn 
__lasx_xvavg_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvavg.bu"] + fn __lasx_xvavg_bu(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvavg.hu"] + fn __lasx_xvavg_hu(a: v16u16, b: v16u16) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvavg.wu"] + fn __lasx_xvavg_wu(a: v8u32, b: v8u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvavg.du"] + fn __lasx_xvavg_du(a: v4u64, b: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvavgr.b"] + fn __lasx_xvavgr_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvavgr.h"] + fn __lasx_xvavgr_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvavgr.w"] + fn __lasx_xvavgr_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvavgr.d"] + fn __lasx_xvavgr_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvavgr.bu"] + fn __lasx_xvavgr_bu(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvavgr.hu"] + fn __lasx_xvavgr_hu(a: v16u16, b: v16u16) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvavgr.wu"] + fn __lasx_xvavgr_wu(a: v8u32, b: v8u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvavgr.du"] + fn __lasx_xvavgr_du(a: v4u64, b: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvssub.b"] + fn __lasx_xvssub_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvssub.h"] + fn __lasx_xvssub_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvssub.w"] + fn __lasx_xvssub_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvssub.d"] + fn __lasx_xvssub_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvssub.bu"] + fn __lasx_xvssub_bu(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvssub.hu"] + fn __lasx_xvssub_hu(a: v16u16, b: v16u16) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvssub.wu"] + fn __lasx_xvssub_wu(a: v8u32, b: v8u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvssub.du"] + fn __lasx_xvssub_du(a: v4u64, b: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvabsd.b"] + fn __lasx_xvabsd_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvabsd.h"] + fn __lasx_xvabsd_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvabsd.w"] + fn __lasx_xvabsd_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvabsd.d"] + fn __lasx_xvabsd_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvabsd.bu"] + fn __lasx_xvabsd_bu(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvabsd.hu"] + fn __lasx_xvabsd_hu(a: v16u16, b: v16u16) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvabsd.wu"] + fn __lasx_xvabsd_wu(a: v8u32, b: v8u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvabsd.du"] + fn __lasx_xvabsd_du(a: v4u64, b: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvmul.b"] + fn __lasx_xvmul_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvmul.h"] + fn __lasx_xvmul_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmul.w"] + fn __lasx_xvmul_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmul.d"] + fn __lasx_xvmul_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmadd.b"] + fn __lasx_xvmadd_b(a: v32i8, b: v32i8, c: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvmadd.h"] + fn __lasx_xvmadd_h(a: v16i16, b: v16i16, c: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmadd.w"] + fn __lasx_xvmadd_w(a: v8i32, 
b: v8i32, c: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmadd.d"] + fn __lasx_xvmadd_d(a: v4i64, b: v4i64, c: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmsub.b"] + fn __lasx_xvmsub_b(a: v32i8, b: v32i8, c: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvmsub.h"] + fn __lasx_xvmsub_h(a: v16i16, b: v16i16, c: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmsub.w"] + fn __lasx_xvmsub_w(a: v8i32, b: v8i32, c: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmsub.d"] + fn __lasx_xvmsub_d(a: v4i64, b: v4i64, c: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvdiv.b"] + fn __lasx_xvdiv_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvdiv.h"] + fn __lasx_xvdiv_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvdiv.w"] + fn __lasx_xvdiv_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvdiv.d"] + fn __lasx_xvdiv_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvdiv.bu"] + fn __lasx_xvdiv_bu(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvdiv.hu"] + fn __lasx_xvdiv_hu(a: v16u16, b: v16u16) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvdiv.wu"] + fn __lasx_xvdiv_wu(a: v8u32, b: v8u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvdiv.du"] + fn __lasx_xvdiv_du(a: v4u64, b: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvhaddw.h.b"] + fn __lasx_xvhaddw_h_b(a: v32i8, b: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvhaddw.w.h"] + fn __lasx_xvhaddw_w_h(a: v16i16, b: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvhaddw.d.w"] + fn __lasx_xvhaddw_d_w(a: v8i32, b: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvhaddw.hu.bu"] + fn __lasx_xvhaddw_hu_bu(a: v32u8, b: v32u8) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvhaddw.wu.hu"] + fn __lasx_xvhaddw_wu_hu(a: v16u16, b: v16u16) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvhaddw.du.wu"] + fn __lasx_xvhaddw_du_wu(a: v8u32, b: v8u32) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvhsubw.h.b"] + fn __lasx_xvhsubw_h_b(a: v32i8, b: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvhsubw.w.h"] + fn __lasx_xvhsubw_w_h(a: v16i16, b: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvhsubw.d.w"] + fn __lasx_xvhsubw_d_w(a: v8i32, b: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvhsubw.hu.bu"] + fn __lasx_xvhsubw_hu_bu(a: v32u8, b: v32u8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvhsubw.wu.hu"] + fn __lasx_xvhsubw_wu_hu(a: v16u16, b: v16u16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvhsubw.du.wu"] + fn __lasx_xvhsubw_du_wu(a: v8u32, b: v8u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmod.b"] + fn __lasx_xvmod_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvmod.h"] + fn __lasx_xvmod_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmod.w"] + fn __lasx_xvmod_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmod.d"] + fn __lasx_xvmod_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmod.bu"] + fn __lasx_xvmod_bu(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvmod.hu"] + fn __lasx_xvmod_hu(a: v16u16, b: v16u16) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvmod.wu"] + fn __lasx_xvmod_wu(a: v8u32, b: v8u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvmod.du"] + fn __lasx_xvmod_du(a: v4u64, b: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvrepl128vei.b"] + fn __lasx_xvrepl128vei_b(a: v32i8, b: u32) 
-> v32i8; + #[link_name = "llvm.loongarch.lasx.xvrepl128vei.h"] + fn __lasx_xvrepl128vei_h(a: v16i16, b: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvrepl128vei.w"] + fn __lasx_xvrepl128vei_w(a: v8i32, b: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvrepl128vei.d"] + fn __lasx_xvrepl128vei_d(a: v4i64, b: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvpickev.b"] + fn __lasx_xvpickev_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvpickev.h"] + fn __lasx_xvpickev_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvpickev.w"] + fn __lasx_xvpickev_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvpickev.d"] + fn __lasx_xvpickev_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvpickod.b"] + fn __lasx_xvpickod_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvpickod.h"] + fn __lasx_xvpickod_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvpickod.w"] + fn __lasx_xvpickod_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvpickod.d"] + fn __lasx_xvpickod_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvilvh.b"] + fn __lasx_xvilvh_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvilvh.h"] + fn __lasx_xvilvh_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvilvh.w"] + fn __lasx_xvilvh_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvilvh.d"] + fn __lasx_xvilvh_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvilvl.b"] + fn __lasx_xvilvl_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvilvl.h"] + fn __lasx_xvilvl_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvilvl.w"] + fn __lasx_xvilvl_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvilvl.d"] + fn __lasx_xvilvl_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvpackev.b"] + fn __lasx_xvpackev_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvpackev.h"] + fn __lasx_xvpackev_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvpackev.w"] + fn __lasx_xvpackev_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvpackev.d"] + fn __lasx_xvpackev_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvpackod.b"] + fn __lasx_xvpackod_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvpackod.h"] + fn __lasx_xvpackod_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvpackod.w"] + fn __lasx_xvpackod_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvpackod.d"] + fn __lasx_xvpackod_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvshuf.b"] + fn __lasx_xvshuf_b(a: v32i8, b: v32i8, c: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvshuf.h"] + fn __lasx_xvshuf_h(a: v16i16, b: v16i16, c: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvshuf.w"] + fn __lasx_xvshuf_w(a: v8i32, b: v8i32, c: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvshuf.d"] + fn __lasx_xvshuf_d(a: v4i64, b: v4i64, c: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvand.v"] + fn __lasx_xvand_v(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvandi.b"] + fn __lasx_xvandi_b(a: v32u8, b: u32) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvor.v"] + fn __lasx_xvor_v(a: v32u8, b: v32u8) -> v32u8; + #[link_name = 
"llvm.loongarch.lasx.xvori.b"] + fn __lasx_xvori_b(a: v32u8, b: u32) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvnor.v"] + fn __lasx_xvnor_v(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvnori.b"] + fn __lasx_xvnori_b(a: v32u8, b: u32) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvxor.v"] + fn __lasx_xvxor_v(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvxori.b"] + fn __lasx_xvxori_b(a: v32u8, b: u32) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvbitsel.v"] + fn __lasx_xvbitsel_v(a: v32u8, b: v32u8, c: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvbitseli.b"] + fn __lasx_xvbitseli_b(a: v32u8, b: v32u8, c: u32) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvshuf4i.b"] + fn __lasx_xvshuf4i_b(a: v32i8, b: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvshuf4i.h"] + fn __lasx_xvshuf4i_h(a: v16i16, b: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvshuf4i.w"] + fn __lasx_xvshuf4i_w(a: v8i32, b: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvreplgr2vr.b"] + fn __lasx_xvreplgr2vr_b(a: i32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvreplgr2vr.h"] + fn __lasx_xvreplgr2vr_h(a: i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvreplgr2vr.w"] + fn __lasx_xvreplgr2vr_w(a: i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvreplgr2vr.d"] + fn __lasx_xvreplgr2vr_d(a: i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvpcnt.b"] + fn __lasx_xvpcnt_b(a: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvpcnt.h"] + fn __lasx_xvpcnt_h(a: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvpcnt.w"] + fn __lasx_xvpcnt_w(a: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvpcnt.d"] + fn __lasx_xvpcnt_d(a: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvclo.b"] + fn __lasx_xvclo_b(a: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvclo.h"] + fn __lasx_xvclo_h(a: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvclo.w"] + fn __lasx_xvclo_w(a: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvclo.d"] + fn __lasx_xvclo_d(a: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvclz.b"] + fn __lasx_xvclz_b(a: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvclz.h"] + fn __lasx_xvclz_h(a: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvclz.w"] + fn __lasx_xvclz_w(a: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvclz.d"] + fn __lasx_xvclz_d(a: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfadd.s"] + fn __lasx_xvfadd_s(a: v8f32, b: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfadd.d"] + fn __lasx_xvfadd_d(a: v4f64, b: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfsub.s"] + fn __lasx_xvfsub_s(a: v8f32, b: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfsub.d"] + fn __lasx_xvfsub_d(a: v4f64, b: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfmul.s"] + fn __lasx_xvfmul_s(a: v8f32, b: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfmul.d"] + fn __lasx_xvfmul_d(a: v4f64, b: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfdiv.s"] + fn __lasx_xvfdiv_s(a: v8f32, b: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfdiv.d"] + fn __lasx_xvfdiv_d(a: v4f64, b: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfcvt.h.s"] + fn __lasx_xvfcvt_h_s(a: v8f32, b: v8f32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvfcvt.s.d"] + fn __lasx_xvfcvt_s_d(a: v4f64, b: v4f64) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfmin.s"] + fn __lasx_xvfmin_s(a: v8f32, b: v8f32) -> v8f32; + 
#[link_name = "llvm.loongarch.lasx.xvfmin.d"] + fn __lasx_xvfmin_d(a: v4f64, b: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfmina.s"] + fn __lasx_xvfmina_s(a: v8f32, b: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfmina.d"] + fn __lasx_xvfmina_d(a: v4f64, b: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfmax.s"] + fn __lasx_xvfmax_s(a: v8f32, b: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfmax.d"] + fn __lasx_xvfmax_d(a: v4f64, b: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfmaxa.s"] + fn __lasx_xvfmaxa_s(a: v8f32, b: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfmaxa.d"] + fn __lasx_xvfmaxa_d(a: v4f64, b: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfclass.s"] + fn __lasx_xvfclass_s(a: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfclass.d"] + fn __lasx_xvfclass_d(a: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfsqrt.s"] + fn __lasx_xvfsqrt_s(a: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfsqrt.d"] + fn __lasx_xvfsqrt_d(a: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfrecip.s"] + fn __lasx_xvfrecip_s(a: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfrecip.d"] + fn __lasx_xvfrecip_d(a: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfrecipe.s"] + fn __lasx_xvfrecipe_s(a: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfrecipe.d"] + fn __lasx_xvfrecipe_d(a: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfrsqrte.s"] + fn __lasx_xvfrsqrte_s(a: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfrsqrte.d"] + fn __lasx_xvfrsqrte_d(a: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfrint.s"] + fn __lasx_xvfrint_s(a: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfrint.d"] + fn __lasx_xvfrint_d(a: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfrsqrt.s"] + fn __lasx_xvfrsqrt_s(a: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfrsqrt.d"] + fn __lasx_xvfrsqrt_d(a: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvflogb.s"] + fn __lasx_xvflogb_s(a: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvflogb.d"] + fn __lasx_xvflogb_d(a: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfcvth.s.h"] + fn __lasx_xvfcvth_s_h(a: v16i16) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfcvth.d.s"] + fn __lasx_xvfcvth_d_s(a: v8f32) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfcvtl.s.h"] + fn __lasx_xvfcvtl_s_h(a: v16i16) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfcvtl.d.s"] + fn __lasx_xvfcvtl_d_s(a: v8f32) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvftint.w.s"] + fn __lasx_xvftint_w_s(a: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvftint.l.d"] + fn __lasx_xvftint_l_d(a: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvftint.wu.s"] + fn __lasx_xvftint_wu_s(a: v8f32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvftint.lu.d"] + fn __lasx_xvftint_lu_d(a: v4f64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvftintrz.w.s"] + fn __lasx_xvftintrz_w_s(a: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvftintrz.l.d"] + fn __lasx_xvftintrz_l_d(a: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvftintrz.wu.s"] + fn __lasx_xvftintrz_wu_s(a: v8f32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvftintrz.lu.d"] + fn __lasx_xvftintrz_lu_d(a: v4f64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvffint.s.w"] + fn __lasx_xvffint_s_w(a: v8i32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvffint.d.l"] + fn __lasx_xvffint_d_l(a: v4i64) -> v4f64; + #[link_name = 
"llvm.loongarch.lasx.xvffint.s.wu"] + fn __lasx_xvffint_s_wu(a: v8u32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvffint.d.lu"] + fn __lasx_xvffint_d_lu(a: v4u64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvreplve.b"] + fn __lasx_xvreplve_b(a: v32i8, b: i32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvreplve.h"] + fn __lasx_xvreplve_h(a: v16i16, b: i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvreplve.w"] + fn __lasx_xvreplve_w(a: v8i32, b: i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvreplve.d"] + fn __lasx_xvreplve_d(a: v4i64, b: i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvpermi.w"] + fn __lasx_xvpermi_w(a: v8i32, b: v8i32, c: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvandn.v"] + fn __lasx_xvandn_v(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvneg.b"] + fn __lasx_xvneg_b(a: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvneg.h"] + fn __lasx_xvneg_h(a: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvneg.w"] + fn __lasx_xvneg_w(a: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvneg.d"] + fn __lasx_xvneg_d(a: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmuh.b"] + fn __lasx_xvmuh_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvmuh.h"] + fn __lasx_xvmuh_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmuh.w"] + fn __lasx_xvmuh_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmuh.d"] + fn __lasx_xvmuh_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmuh.bu"] + fn __lasx_xvmuh_bu(a: v32u8, b: v32u8) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvmuh.hu"] + fn __lasx_xvmuh_hu(a: v16u16, b: v16u16) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvmuh.wu"] + fn __lasx_xvmuh_wu(a: v8u32, b: v8u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvmuh.du"] + fn __lasx_xvmuh_du(a: v4u64, b: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvsllwil.h.b"] + fn __lasx_xvsllwil_h_b(a: v32i8, b: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsllwil.w.h"] + fn __lasx_xvsllwil_w_h(a: v16i16, b: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsllwil.d.w"] + fn __lasx_xvsllwil_d_w(a: v8i32, b: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsllwil.hu.bu"] + fn __lasx_xvsllwil_hu_bu(a: v32u8, b: u32) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvsllwil.wu.hu"] + fn __lasx_xvsllwil_wu_hu(a: v16u16, b: u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvsllwil.du.wu"] + fn __lasx_xvsllwil_du_wu(a: v8u32, b: u32) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvsran.b.h"] + fn __lasx_xvsran_b_h(a: v16i16, b: v16i16) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsran.h.w"] + fn __lasx_xvsran_h_w(a: v8i32, b: v8i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsran.w.d"] + fn __lasx_xvsran_w_d(a: v4i64, b: v4i64) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvssran.b.h"] + fn __lasx_xvssran_b_h(a: v16i16, b: v16i16) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvssran.h.w"] + fn __lasx_xvssran_h_w(a: v8i32, b: v8i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvssran.w.d"] + fn __lasx_xvssran_w_d(a: v4i64, b: v4i64) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvssran.bu.h"] + fn __lasx_xvssran_bu_h(a: v16u16, b: v16u16) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvssran.hu.w"] + fn __lasx_xvssran_hu_w(a: v8u32, b: v8u32) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvssran.wu.d"] + fn __lasx_xvssran_wu_d(a: v4u64, b: v4u64) -> v8u32; + #[link_name = 
"llvm.loongarch.lasx.xvsrarn.b.h"] + fn __lasx_xvsrarn_b_h(a: v16i16, b: v16i16) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsrarn.h.w"] + fn __lasx_xvsrarn_h_w(a: v8i32, b: v8i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsrarn.w.d"] + fn __lasx_xvsrarn_w_d(a: v4i64, b: v4i64) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvssrarn.b.h"] + fn __lasx_xvssrarn_b_h(a: v16i16, b: v16i16) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvssrarn.h.w"] + fn __lasx_xvssrarn_h_w(a: v8i32, b: v8i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvssrarn.w.d"] + fn __lasx_xvssrarn_w_d(a: v4i64, b: v4i64) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvssrarn.bu.h"] + fn __lasx_xvssrarn_bu_h(a: v16u16, b: v16u16) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvssrarn.hu.w"] + fn __lasx_xvssrarn_hu_w(a: v8u32, b: v8u32) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvssrarn.wu.d"] + fn __lasx_xvssrarn_wu_d(a: v4u64, b: v4u64) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvsrln.b.h"] + fn __lasx_xvsrln_b_h(a: v16i16, b: v16i16) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsrln.h.w"] + fn __lasx_xvsrln_h_w(a: v8i32, b: v8i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsrln.w.d"] + fn __lasx_xvsrln_w_d(a: v4i64, b: v4i64) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvssrln.bu.h"] + fn __lasx_xvssrln_bu_h(a: v16u16, b: v16u16) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvssrln.hu.w"] + fn __lasx_xvssrln_hu_w(a: v8u32, b: v8u32) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvssrln.wu.d"] + fn __lasx_xvssrln_wu_d(a: v4u64, b: v4u64) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvsrlrn.b.h"] + fn __lasx_xvsrlrn_b_h(a: v16i16, b: v16i16) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsrlrn.h.w"] + fn __lasx_xvsrlrn_h_w(a: v8i32, b: v8i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsrlrn.w.d"] + fn __lasx_xvsrlrn_w_d(a: v4i64, b: v4i64) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvssrlrn.bu.h"] + fn __lasx_xvssrlrn_bu_h(a: v16u16, b: v16u16) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvssrlrn.hu.w"] + fn __lasx_xvssrlrn_hu_w(a: v8u32, b: v8u32) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvssrlrn.wu.d"] + fn __lasx_xvssrlrn_wu_d(a: v4u64, b: v4u64) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvfrstpi.b"] + fn __lasx_xvfrstpi_b(a: v32i8, b: v32i8, c: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvfrstpi.h"] + fn __lasx_xvfrstpi_h(a: v16i16, b: v16i16, c: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvfrstp.b"] + fn __lasx_xvfrstp_b(a: v32i8, b: v32i8, c: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvfrstp.h"] + fn __lasx_xvfrstp_h(a: v16i16, b: v16i16, c: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvshuf4i.d"] + fn __lasx_xvshuf4i_d(a: v4i64, b: v4i64, c: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvbsrl.v"] + fn __lasx_xvbsrl_v(a: v32i8, b: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvbsll.v"] + fn __lasx_xvbsll_v(a: v32i8, b: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvextrins.b"] + fn __lasx_xvextrins_b(a: v32i8, b: v32i8, c: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvextrins.h"] + fn __lasx_xvextrins_h(a: v16i16, b: v16i16, c: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvextrins.w"] + fn __lasx_xvextrins_w(a: v8i32, b: v8i32, c: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvextrins.d"] + fn __lasx_xvextrins_d(a: v4i64, b: v4i64, c: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmskltz.b"] + fn __lasx_xvmskltz_b(a: v32i8) -> v32i8; + 
#[link_name = "llvm.loongarch.lasx.xvmskltz.h"] + fn __lasx_xvmskltz_h(a: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmskltz.w"] + fn __lasx_xvmskltz_w(a: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmskltz.d"] + fn __lasx_xvmskltz_d(a: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsigncov.b"] + fn __lasx_xvsigncov_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsigncov.h"] + fn __lasx_xvsigncov_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsigncov.w"] + fn __lasx_xvsigncov_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsigncov.d"] + fn __lasx_xvsigncov_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfmadd.s"] + fn __lasx_xvfmadd_s(a: v8f32, b: v8f32, c: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfmadd.d"] + fn __lasx_xvfmadd_d(a: v4f64, b: v4f64, c: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfmsub.s"] + fn __lasx_xvfmsub_s(a: v8f32, b: v8f32, c: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfmsub.d"] + fn __lasx_xvfmsub_d(a: v4f64, b: v4f64, c: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfnmadd.s"] + fn __lasx_xvfnmadd_s(a: v8f32, b: v8f32, c: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfnmadd.d"] + fn __lasx_xvfnmadd_d(a: v4f64, b: v4f64, c: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfnmsub.s"] + fn __lasx_xvfnmsub_s(a: v8f32, b: v8f32, c: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfnmsub.d"] + fn __lasx_xvfnmsub_d(a: v4f64, b: v4f64, c: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvftintrne.w.s"] + fn __lasx_xvftintrne_w_s(a: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvftintrne.l.d"] + fn __lasx_xvftintrne_l_d(a: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvftintrp.w.s"] + fn __lasx_xvftintrp_w_s(a: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvftintrp.l.d"] + fn __lasx_xvftintrp_l_d(a: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvftintrm.w.s"] + fn __lasx_xvftintrm_w_s(a: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvftintrm.l.d"] + fn __lasx_xvftintrm_l_d(a: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvftint.w.d"] + fn __lasx_xvftint_w_d(a: v4f64, b: v4f64) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvffint.s.l"] + fn __lasx_xvffint_s_l(a: v4i64, b: v4i64) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvftintrz.w.d"] + fn __lasx_xvftintrz_w_d(a: v4f64, b: v4f64) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvftintrp.w.d"] + fn __lasx_xvftintrp_w_d(a: v4f64, b: v4f64) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvftintrm.w.d"] + fn __lasx_xvftintrm_w_d(a: v4f64, b: v4f64) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvftintrne.w.d"] + fn __lasx_xvftintrne_w_d(a: v4f64, b: v4f64) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvftinth.l.s"] + fn __lasx_xvftinth_l_s(a: v8f32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvftintl.l.s"] + fn __lasx_xvftintl_l_s(a: v8f32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvffinth.d.w"] + fn __lasx_xvffinth_d_w(a: v8i32) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvffintl.d.w"] + fn __lasx_xvffintl_d_w(a: v8i32) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvftintrzh.l.s"] + fn __lasx_xvftintrzh_l_s(a: v8f32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvftintrzl.l.s"] + fn __lasx_xvftintrzl_l_s(a: v8f32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvftintrph.l.s"] + fn __lasx_xvftintrph_l_s(a: v8f32) -> v4i64; + #[link_name = 
"llvm.loongarch.lasx.xvftintrpl.l.s"] + fn __lasx_xvftintrpl_l_s(a: v8f32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvftintrmh.l.s"] + fn __lasx_xvftintrmh_l_s(a: v8f32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvftintrml.l.s"] + fn __lasx_xvftintrml_l_s(a: v8f32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvftintrneh.l.s"] + fn __lasx_xvftintrneh_l_s(a: v8f32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvftintrnel.l.s"] + fn __lasx_xvftintrnel_l_s(a: v8f32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfrintrne.s"] + fn __lasx_xvfrintrne_s(a: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfrintrne.d"] + fn __lasx_xvfrintrne_d(a: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfrintrz.s"] + fn __lasx_xvfrintrz_s(a: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfrintrz.d"] + fn __lasx_xvfrintrz_d(a: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfrintrp.s"] + fn __lasx_xvfrintrp_s(a: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfrintrp.d"] + fn __lasx_xvfrintrp_d(a: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvfrintrm.s"] + fn __lasx_xvfrintrm_s(a: v8f32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvfrintrm.d"] + fn __lasx_xvfrintrm_d(a: v4f64) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvld"] + fn __lasx_xvld(a: *const i8, b: i32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvst"] + fn __lasx_xvst(a: v32i8, b: *mut i8, c: i32); + #[link_name = "llvm.loongarch.lasx.xvstelm.b"] + fn __lasx_xvstelm_b(a: v32i8, b: *mut i8, c: i32, d: u32); + #[link_name = "llvm.loongarch.lasx.xvstelm.h"] + fn __lasx_xvstelm_h(a: v16i16, b: *mut i8, c: i32, d: u32); + #[link_name = "llvm.loongarch.lasx.xvstelm.w"] + fn __lasx_xvstelm_w(a: v8i32, b: *mut i8, c: i32, d: u32); + #[link_name = "llvm.loongarch.lasx.xvstelm.d"] + fn __lasx_xvstelm_d(a: v4i64, b: *mut i8, c: i32, d: u32); + #[link_name = "llvm.loongarch.lasx.xvinsve0.w"] + fn __lasx_xvinsve0_w(a: v8i32, b: v8i32, c: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvinsve0.d"] + fn __lasx_xvinsve0_d(a: v4i64, b: v4i64, c: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvpickve.w"] + fn __lasx_xvpickve_w(a: v8i32, b: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvpickve.d"] + fn __lasx_xvpickve_d(a: v4i64, b: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvssrlrn.b.h"] + fn __lasx_xvssrlrn_b_h(a: v16i16, b: v16i16) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvssrlrn.h.w"] + fn __lasx_xvssrlrn_h_w(a: v8i32, b: v8i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvssrlrn.w.d"] + fn __lasx_xvssrlrn_w_d(a: v4i64, b: v4i64) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvssrln.b.h"] + fn __lasx_xvssrln_b_h(a: v16i16, b: v16i16) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvssrln.h.w"] + fn __lasx_xvssrln_h_w(a: v8i32, b: v8i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvssrln.w.d"] + fn __lasx_xvssrln_w_d(a: v4i64, b: v4i64) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvorn.v"] + fn __lasx_xvorn_v(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvldi"] + fn __lasx_xvldi(a: i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvldx"] + fn __lasx_xvldx(a: *const i8, b: i64) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvstx"] + fn __lasx_xvstx(a: v32i8, b: *mut i8, c: i64); + #[link_name = "llvm.loongarch.lasx.xvextl.qu.du"] + fn __lasx_xvextl_qu_du(a: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvinsgr2vr.w"] + fn __lasx_xvinsgr2vr_w(a: v8i32, b: i32, c: u32) -> v8i32; + #[link_name = 
"llvm.loongarch.lasx.xvinsgr2vr.d"] + fn __lasx_xvinsgr2vr_d(a: v4i64, b: i64, c: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvreplve0.b"] + fn __lasx_xvreplve0_b(a: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvreplve0.h"] + fn __lasx_xvreplve0_h(a: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvreplve0.w"] + fn __lasx_xvreplve0_w(a: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvreplve0.d"] + fn __lasx_xvreplve0_d(a: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvreplve0.q"] + fn __lasx_xvreplve0_q(a: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.vext2xv.h.b"] + fn __lasx_vext2xv_h_b(a: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.vext2xv.w.h"] + fn __lasx_vext2xv_w_h(a: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.vext2xv.d.w"] + fn __lasx_vext2xv_d_w(a: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.vext2xv.w.b"] + fn __lasx_vext2xv_w_b(a: v32i8) -> v8i32; + #[link_name = "llvm.loongarch.lasx.vext2xv.d.h"] + fn __lasx_vext2xv_d_h(a: v16i16) -> v4i64; + #[link_name = "llvm.loongarch.lasx.vext2xv.d.b"] + fn __lasx_vext2xv_d_b(a: v32i8) -> v4i64; + #[link_name = "llvm.loongarch.lasx.vext2xv.hu.bu"] + fn __lasx_vext2xv_hu_bu(a: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.vext2xv.wu.hu"] + fn __lasx_vext2xv_wu_hu(a: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.vext2xv.du.wu"] + fn __lasx_vext2xv_du_wu(a: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.vext2xv.wu.bu"] + fn __lasx_vext2xv_wu_bu(a: v32i8) -> v8i32; + #[link_name = "llvm.loongarch.lasx.vext2xv.du.hu"] + fn __lasx_vext2xv_du_hu(a: v16i16) -> v4i64; + #[link_name = "llvm.loongarch.lasx.vext2xv.du.bu"] + fn __lasx_vext2xv_du_bu(a: v32i8) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvpermi.q"] + fn __lasx_xvpermi_q(a: v32i8, b: v32i8, c: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvpermi.d"] + fn __lasx_xvpermi_d(a: v4i64, b: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvperm.w"] + fn __lasx_xvperm_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvldrepl.b"] + fn __lasx_xvldrepl_b(a: *const i8, b: i32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvldrepl.h"] + fn __lasx_xvldrepl_h(a: *const i8, b: i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvldrepl.w"] + fn __lasx_xvldrepl_w(a: *const i8, b: i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvldrepl.d"] + fn __lasx_xvldrepl_d(a: *const i8, b: i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvpickve2gr.w"] + fn __lasx_xvpickve2gr_w(a: v8i32, b: u32) -> i32; + #[link_name = "llvm.loongarch.lasx.xvpickve2gr.wu"] + fn __lasx_xvpickve2gr_wu(a: v8i32, b: u32) -> u32; + #[link_name = "llvm.loongarch.lasx.xvpickve2gr.d"] + fn __lasx_xvpickve2gr_d(a: v4i64, b: u32) -> i64; + #[link_name = "llvm.loongarch.lasx.xvpickve2gr.du"] + fn __lasx_xvpickve2gr_du(a: v4i64, b: u32) -> u64; + #[link_name = "llvm.loongarch.lasx.xvaddwev.q.d"] + fn __lasx_xvaddwev_q_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvaddwev.d.w"] + fn __lasx_xvaddwev_d_w(a: v8i32, b: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvaddwev.w.h"] + fn __lasx_xvaddwev_w_h(a: v16i16, b: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvaddwev.h.b"] + fn __lasx_xvaddwev_h_b(a: v32i8, b: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvaddwev.q.du"] + fn __lasx_xvaddwev_q_du(a: v4u64, b: v4u64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvaddwev.d.wu"] + fn __lasx_xvaddwev_d_wu(a: v8u32, b: v8u32) -> v4i64; + 
#[link_name = "llvm.loongarch.lasx.xvaddwev.w.hu"] + fn __lasx_xvaddwev_w_hu(a: v16u16, b: v16u16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvaddwev.h.bu"] + fn __lasx_xvaddwev_h_bu(a: v32u8, b: v32u8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsubwev.q.d"] + fn __lasx_xvsubwev_q_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsubwev.d.w"] + fn __lasx_xvsubwev_d_w(a: v8i32, b: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsubwev.w.h"] + fn __lasx_xvsubwev_w_h(a: v16i16, b: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsubwev.h.b"] + fn __lasx_xvsubwev_h_b(a: v32i8, b: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsubwev.q.du"] + fn __lasx_xvsubwev_q_du(a: v4u64, b: v4u64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsubwev.d.wu"] + fn __lasx_xvsubwev_d_wu(a: v8u32, b: v8u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsubwev.w.hu"] + fn __lasx_xvsubwev_w_hu(a: v16u16, b: v16u16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsubwev.h.bu"] + fn __lasx_xvsubwev_h_bu(a: v32u8, b: v32u8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmulwev.q.d"] + fn __lasx_xvmulwev_q_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmulwev.d.w"] + fn __lasx_xvmulwev_d_w(a: v8i32, b: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmulwev.w.h"] + fn __lasx_xvmulwev_w_h(a: v16i16, b: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmulwev.h.b"] + fn __lasx_xvmulwev_h_b(a: v32i8, b: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmulwev.q.du"] + fn __lasx_xvmulwev_q_du(a: v4u64, b: v4u64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmulwev.d.wu"] + fn __lasx_xvmulwev_d_wu(a: v8u32, b: v8u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmulwev.w.hu"] + fn __lasx_xvmulwev_w_hu(a: v16u16, b: v16u16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmulwev.h.bu"] + fn __lasx_xvmulwev_h_bu(a: v32u8, b: v32u8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvaddwod.q.d"] + fn __lasx_xvaddwod_q_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvaddwod.d.w"] + fn __lasx_xvaddwod_d_w(a: v8i32, b: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvaddwod.w.h"] + fn __lasx_xvaddwod_w_h(a: v16i16, b: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvaddwod.h.b"] + fn __lasx_xvaddwod_h_b(a: v32i8, b: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvaddwod.q.du"] + fn __lasx_xvaddwod_q_du(a: v4u64, b: v4u64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvaddwod.d.wu"] + fn __lasx_xvaddwod_d_wu(a: v8u32, b: v8u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvaddwod.w.hu"] + fn __lasx_xvaddwod_w_hu(a: v16u16, b: v16u16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvaddwod.h.bu"] + fn __lasx_xvaddwod_h_bu(a: v32u8, b: v32u8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsubwod.q.d"] + fn __lasx_xvsubwod_q_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsubwod.d.w"] + fn __lasx_xvsubwod_d_w(a: v8i32, b: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsubwod.w.h"] + fn __lasx_xvsubwod_w_h(a: v16i16, b: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsubwod.h.b"] + fn __lasx_xvsubwod_h_b(a: v32i8, b: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsubwod.q.du"] + fn __lasx_xvsubwod_q_du(a: v4u64, b: v4u64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsubwod.d.wu"] + fn __lasx_xvsubwod_d_wu(a: v8u32, b: v8u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsubwod.w.hu"] + fn 
__lasx_xvsubwod_w_hu(a: v16u16, b: v16u16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsubwod.h.bu"] + fn __lasx_xvsubwod_h_bu(a: v32u8, b: v32u8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmulwod.q.d"] + fn __lasx_xvmulwod_q_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmulwod.d.w"] + fn __lasx_xvmulwod_d_w(a: v8i32, b: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmulwod.w.h"] + fn __lasx_xvmulwod_w_h(a: v16i16, b: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmulwod.h.b"] + fn __lasx_xvmulwod_h_b(a: v32i8, b: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmulwod.q.du"] + fn __lasx_xvmulwod_q_du(a: v4u64, b: v4u64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmulwod.d.wu"] + fn __lasx_xvmulwod_d_wu(a: v8u32, b: v8u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmulwod.w.hu"] + fn __lasx_xvmulwod_w_hu(a: v16u16, b: v16u16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmulwod.h.bu"] + fn __lasx_xvmulwod_h_bu(a: v32u8, b: v32u8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvaddwev.d.wu.w"] + fn __lasx_xvaddwev_d_wu_w(a: v8u32, b: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvaddwev.w.hu.h"] + fn __lasx_xvaddwev_w_hu_h(a: v16u16, b: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvaddwev.h.bu.b"] + fn __lasx_xvaddwev_h_bu_b(a: v32u8, b: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmulwev.d.wu.w"] + fn __lasx_xvmulwev_d_wu_w(a: v8u32, b: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmulwev.w.hu.h"] + fn __lasx_xvmulwev_w_hu_h(a: v16u16, b: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmulwev.h.bu.b"] + fn __lasx_xvmulwev_h_bu_b(a: v32u8, b: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvaddwod.d.wu.w"] + fn __lasx_xvaddwod_d_wu_w(a: v8u32, b: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvaddwod.w.hu.h"] + fn __lasx_xvaddwod_w_hu_h(a: v16u16, b: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvaddwod.h.bu.b"] + fn __lasx_xvaddwod_h_bu_b(a: v32u8, b: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmulwod.d.wu.w"] + fn __lasx_xvmulwod_d_wu_w(a: v8u32, b: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmulwod.w.hu.h"] + fn __lasx_xvmulwod_w_hu_h(a: v16u16, b: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmulwod.h.bu.b"] + fn __lasx_xvmulwod_h_bu_b(a: v32u8, b: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvhaddw.q.d"] + fn __lasx_xvhaddw_q_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvhaddw.qu.du"] + fn __lasx_xvhaddw_qu_du(a: v4u64, b: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvhsubw.q.d"] + fn __lasx_xvhsubw_q_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvhsubw.qu.du"] + fn __lasx_xvhsubw_qu_du(a: v4u64, b: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvmaddwev.q.d"] + fn __lasx_xvmaddwev_q_d(a: v4i64, b: v4i64, c: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmaddwev.d.w"] + fn __lasx_xvmaddwev_d_w(a: v4i64, b: v8i32, c: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmaddwev.w.h"] + fn __lasx_xvmaddwev_w_h(a: v8i32, b: v16i16, c: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmaddwev.h.b"] + fn __lasx_xvmaddwev_h_b(a: v16i16, b: v32i8, c: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmaddwev.q.du"] + fn __lasx_xvmaddwev_q_du(a: v4u64, b: v4u64, c: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvmaddwev.d.wu"] + fn __lasx_xvmaddwev_d_wu(a: v4u64, b: v8u32, c: v8u32) -> v4u64; + 
#[link_name = "llvm.loongarch.lasx.xvmaddwev.w.hu"] + fn __lasx_xvmaddwev_w_hu(a: v8u32, b: v16u16, c: v16u16) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvmaddwev.h.bu"] + fn __lasx_xvmaddwev_h_bu(a: v16u16, b: v32u8, c: v32u8) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvmaddwod.q.d"] + fn __lasx_xvmaddwod_q_d(a: v4i64, b: v4i64, c: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmaddwod.d.w"] + fn __lasx_xvmaddwod_d_w(a: v4i64, b: v8i32, c: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmaddwod.w.h"] + fn __lasx_xvmaddwod_w_h(a: v8i32, b: v16i16, c: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmaddwod.h.b"] + fn __lasx_xvmaddwod_h_b(a: v16i16, b: v32i8, c: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmaddwod.q.du"] + fn __lasx_xvmaddwod_q_du(a: v4u64, b: v4u64, c: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvmaddwod.d.wu"] + fn __lasx_xvmaddwod_d_wu(a: v4u64, b: v8u32, c: v8u32) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvmaddwod.w.hu"] + fn __lasx_xvmaddwod_w_hu(a: v8u32, b: v16u16, c: v16u16) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvmaddwod.h.bu"] + fn __lasx_xvmaddwod_h_bu(a: v16u16, b: v32u8, c: v32u8) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvmaddwev.q.du.d"] + fn __lasx_xvmaddwev_q_du_d(a: v4i64, b: v4u64, c: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmaddwev.d.wu.w"] + fn __lasx_xvmaddwev_d_wu_w(a: v4i64, b: v8u32, c: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmaddwev.w.hu.h"] + fn __lasx_xvmaddwev_w_hu_h(a: v8i32, b: v16u16, c: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmaddwev.h.bu.b"] + fn __lasx_xvmaddwev_h_bu_b(a: v16i16, b: v32u8, c: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvmaddwod.q.du.d"] + fn __lasx_xvmaddwod_q_du_d(a: v4i64, b: v4u64, c: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmaddwod.d.wu.w"] + fn __lasx_xvmaddwod_d_wu_w(a: v4i64, b: v8u32, c: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmaddwod.w.hu.h"] + fn __lasx_xvmaddwod_w_hu_h(a: v8i32, b: v16u16, c: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvmaddwod.h.bu.b"] + fn __lasx_xvmaddwod_h_bu_b(a: v16i16, b: v32u8, c: v32i8) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvrotr.b"] + fn __lasx_xvrotr_b(a: v32i8, b: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvrotr.h"] + fn __lasx_xvrotr_h(a: v16i16, b: v16i16) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvrotr.w"] + fn __lasx_xvrotr_w(a: v8i32, b: v8i32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvrotr.d"] + fn __lasx_xvrotr_d(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvadd.q"] + fn __lasx_xvadd_q(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsub.q"] + fn __lasx_xvsub_q(a: v4i64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvaddwev.q.du.d"] + fn __lasx_xvaddwev_q_du_d(a: v4u64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvaddwod.q.du.d"] + fn __lasx_xvaddwod_q_du_d(a: v4u64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmulwev.q.du.d"] + fn __lasx_xvmulwev_q_du_d(a: v4u64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmulwod.q.du.d"] + fn __lasx_xvmulwod_q_du_d(a: v4u64, b: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvmskgez.b"] + fn __lasx_xvmskgez_b(a: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvmsknz.b"] + fn __lasx_xvmsknz_b(a: v32i8) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvexth.h.b"] + fn __lasx_xvexth_h_b(a: v32i8) -> v16i16; + 
#[link_name = "llvm.loongarch.lasx.xvexth.w.h"] + fn __lasx_xvexth_w_h(a: v16i16) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvexth.d.w"] + fn __lasx_xvexth_d_w(a: v8i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvexth.q.d"] + fn __lasx_xvexth_q_d(a: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvexth.hu.bu"] + fn __lasx_xvexth_hu_bu(a: v32u8) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvexth.wu.hu"] + fn __lasx_xvexth_wu_hu(a: v16u16) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvexth.du.wu"] + fn __lasx_xvexth_du_wu(a: v8u32) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvexth.qu.du"] + fn __lasx_xvexth_qu_du(a: v4u64) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvrotri.b"] + fn __lasx_xvrotri_b(a: v32i8, b: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvrotri.h"] + fn __lasx_xvrotri_h(a: v16i16, b: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvrotri.w"] + fn __lasx_xvrotri_w(a: v8i32, b: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvrotri.d"] + fn __lasx_xvrotri_d(a: v4i64, b: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvextl.q.d"] + fn __lasx_xvextl_q_d(a: v4i64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsrlni.b.h"] + fn __lasx_xvsrlni_b_h(a: v32i8, b: v32i8, c: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsrlni.h.w"] + fn __lasx_xvsrlni_h_w(a: v16i16, b: v16i16, c: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsrlni.w.d"] + fn __lasx_xvsrlni_w_d(a: v8i32, b: v8i32, c: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsrlni.d.q"] + fn __lasx_xvsrlni_d_q(a: v4i64, b: v4i64, c: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsrlrni.b.h"] + fn __lasx_xvsrlrni_b_h(a: v32i8, b: v32i8, c: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsrlrni.h.w"] + fn __lasx_xvsrlrni_h_w(a: v16i16, b: v16i16, c: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsrlrni.w.d"] + fn __lasx_xvsrlrni_w_d(a: v8i32, b: v8i32, c: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsrlrni.d.q"] + fn __lasx_xvsrlrni_d_q(a: v4i64, b: v4i64, c: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvssrlni.b.h"] + fn __lasx_xvssrlni_b_h(a: v32i8, b: v32i8, c: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvssrlni.h.w"] + fn __lasx_xvssrlni_h_w(a: v16i16, b: v16i16, c: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvssrlni.w.d"] + fn __lasx_xvssrlni_w_d(a: v8i32, b: v8i32, c: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvssrlni.d.q"] + fn __lasx_xvssrlni_d_q(a: v4i64, b: v4i64, c: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvssrlni.bu.h"] + fn __lasx_xvssrlni_bu_h(a: v32u8, b: v32i8, c: u32) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvssrlni.hu.w"] + fn __lasx_xvssrlni_hu_w(a: v16u16, b: v16i16, c: u32) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvssrlni.wu.d"] + fn __lasx_xvssrlni_wu_d(a: v8u32, b: v8i32, c: u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvssrlni.du.q"] + fn __lasx_xvssrlni_du_q(a: v4u64, b: v4i64, c: u32) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvssrlrni.b.h"] + fn __lasx_xvssrlrni_b_h(a: v32i8, b: v32i8, c: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvssrlrni.h.w"] + fn __lasx_xvssrlrni_h_w(a: v16i16, b: v16i16, c: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvssrlrni.w.d"] + fn __lasx_xvssrlrni_w_d(a: v8i32, b: v8i32, c: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvssrlrni.d.q"] + fn __lasx_xvssrlrni_d_q(a: v4i64, b: v4i64, c: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvssrlrni.bu.h"] + fn 
__lasx_xvssrlrni_bu_h(a: v32u8, b: v32i8, c: u32) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvssrlrni.hu.w"] + fn __lasx_xvssrlrni_hu_w(a: v16u16, b: v16i16, c: u32) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvssrlrni.wu.d"] + fn __lasx_xvssrlrni_wu_d(a: v8u32, b: v8i32, c: u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvssrlrni.du.q"] + fn __lasx_xvssrlrni_du_q(a: v4u64, b: v4i64, c: u32) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvsrani.b.h"] + fn __lasx_xvsrani_b_h(a: v32i8, b: v32i8, c: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsrani.h.w"] + fn __lasx_xvsrani_h_w(a: v16i16, b: v16i16, c: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsrani.w.d"] + fn __lasx_xvsrani_w_d(a: v8i32, b: v8i32, c: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsrani.d.q"] + fn __lasx_xvsrani_d_q(a: v4i64, b: v4i64, c: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvsrarni.b.h"] + fn __lasx_xvsrarni_b_h(a: v32i8, b: v32i8, c: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvsrarni.h.w"] + fn __lasx_xvsrarni_h_w(a: v16i16, b: v16i16, c: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvsrarni.w.d"] + fn __lasx_xvsrarni_w_d(a: v8i32, b: v8i32, c: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvsrarni.d.q"] + fn __lasx_xvsrarni_d_q(a: v4i64, b: v4i64, c: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvssrani.b.h"] + fn __lasx_xvssrani_b_h(a: v32i8, b: v32i8, c: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvssrani.h.w"] + fn __lasx_xvssrani_h_w(a: v16i16, b: v16i16, c: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvssrani.w.d"] + fn __lasx_xvssrani_w_d(a: v8i32, b: v8i32, c: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvssrani.d.q"] + fn __lasx_xvssrani_d_q(a: v4i64, b: v4i64, c: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvssrani.bu.h"] + fn __lasx_xvssrani_bu_h(a: v32u8, b: v32i8, c: u32) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvssrani.hu.w"] + fn __lasx_xvssrani_hu_w(a: v16u16, b: v16i16, c: u32) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvssrani.wu.d"] + fn __lasx_xvssrani_wu_d(a: v8u32, b: v8i32, c: u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvssrani.du.q"] + fn __lasx_xvssrani_du_q(a: v4u64, b: v4i64, c: u32) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xvssrarni.b.h"] + fn __lasx_xvssrarni_b_h(a: v32i8, b: v32i8, c: u32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvssrarni.h.w"] + fn __lasx_xvssrarni_h_w(a: v16i16, b: v16i16, c: u32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvssrarni.w.d"] + fn __lasx_xvssrarni_w_d(a: v8i32, b: v8i32, c: u32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvssrarni.d.q"] + fn __lasx_xvssrarni_d_q(a: v4i64, b: v4i64, c: u32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvssrarni.bu.h"] + fn __lasx_xvssrarni_bu_h(a: v32u8, b: v32i8, c: u32) -> v32u8; + #[link_name = "llvm.loongarch.lasx.xvssrarni.hu.w"] + fn __lasx_xvssrarni_hu_w(a: v16u16, b: v16i16, c: u32) -> v16u16; + #[link_name = "llvm.loongarch.lasx.xvssrarni.wu.d"] + fn __lasx_xvssrarni_wu_d(a: v8u32, b: v8i32, c: u32) -> v8u32; + #[link_name = "llvm.loongarch.lasx.xvssrarni.du.q"] + fn __lasx_xvssrarni_du_q(a: v4u64, b: v4i64, c: u32) -> v4u64; + #[link_name = "llvm.loongarch.lasx.xbnz.b"] + fn __lasx_xbnz_b(a: v32u8) -> i32; + #[link_name = "llvm.loongarch.lasx.xbnz.d"] + fn __lasx_xbnz_d(a: v4u64) -> i32; + #[link_name = "llvm.loongarch.lasx.xbnz.h"] + fn __lasx_xbnz_h(a: v16u16) -> i32; + #[link_name = "llvm.loongarch.lasx.xbnz.v"] + fn __lasx_xbnz_v(a: v32u8) -> i32; 
+ #[link_name = "llvm.loongarch.lasx.xbnz.w"] + fn __lasx_xbnz_w(a: v8u32) -> i32; + #[link_name = "llvm.loongarch.lasx.xbz.b"] + fn __lasx_xbz_b(a: v32u8) -> i32; + #[link_name = "llvm.loongarch.lasx.xbz.d"] + fn __lasx_xbz_d(a: v4u64) -> i32; + #[link_name = "llvm.loongarch.lasx.xbz.h"] + fn __lasx_xbz_h(a: v16u16) -> i32; + #[link_name = "llvm.loongarch.lasx.xbz.v"] + fn __lasx_xbz_v(a: v32u8) -> i32; + #[link_name = "llvm.loongarch.lasx.xbz.w"] + fn __lasx_xbz_w(a: v8u32) -> i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.caf.d"] + fn __lasx_xvfcmp_caf_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.caf.s"] + fn __lasx_xvfcmp_caf_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.ceq.d"] + fn __lasx_xvfcmp_ceq_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.ceq.s"] + fn __lasx_xvfcmp_ceq_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cle.d"] + fn __lasx_xvfcmp_cle_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cle.s"] + fn __lasx_xvfcmp_cle_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.clt.d"] + fn __lasx_xvfcmp_clt_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.clt.s"] + fn __lasx_xvfcmp_clt_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cne.d"] + fn __lasx_xvfcmp_cne_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cne.s"] + fn __lasx_xvfcmp_cne_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cor.d"] + fn __lasx_xvfcmp_cor_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cor.s"] + fn __lasx_xvfcmp_cor_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cueq.d"] + fn __lasx_xvfcmp_cueq_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cueq.s"] + fn __lasx_xvfcmp_cueq_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cule.d"] + fn __lasx_xvfcmp_cule_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cule.s"] + fn __lasx_xvfcmp_cule_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cult.d"] + fn __lasx_xvfcmp_cult_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cult.s"] + fn __lasx_xvfcmp_cult_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cun.d"] + fn __lasx_xvfcmp_cun_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cune.d"] + fn __lasx_xvfcmp_cune_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cune.s"] + fn __lasx_xvfcmp_cune_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.cun.s"] + fn __lasx_xvfcmp_cun_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.saf.d"] + fn __lasx_xvfcmp_saf_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.saf.s"] + fn __lasx_xvfcmp_saf_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.seq.d"] + fn __lasx_xvfcmp_seq_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.seq.s"] + fn __lasx_xvfcmp_seq_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sle.d"] + fn __lasx_xvfcmp_sle_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sle.s"] + fn __lasx_xvfcmp_sle_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.slt.d"] + 
fn __lasx_xvfcmp_slt_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.slt.s"] + fn __lasx_xvfcmp_slt_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sne.d"] + fn __lasx_xvfcmp_sne_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sne.s"] + fn __lasx_xvfcmp_sne_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sor.d"] + fn __lasx_xvfcmp_sor_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sor.s"] + fn __lasx_xvfcmp_sor_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sueq.d"] + fn __lasx_xvfcmp_sueq_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sueq.s"] + fn __lasx_xvfcmp_sueq_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sule.d"] + fn __lasx_xvfcmp_sule_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sule.s"] + fn __lasx_xvfcmp_sule_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sult.d"] + fn __lasx_xvfcmp_sult_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sult.s"] + fn __lasx_xvfcmp_sult_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sun.d"] + fn __lasx_xvfcmp_sun_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sune.d"] + fn __lasx_xvfcmp_sune_d(a: v4f64, b: v4f64) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sune.s"] + fn __lasx_xvfcmp_sune_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvfcmp.sun.s"] + fn __lasx_xvfcmp_sun_s(a: v8f32, b: v8f32) -> v8i32; + #[link_name = "llvm.loongarch.lasx.xvpickve.d.f"] + fn __lasx_xvpickve_d_f(a: v4f64, b: u32) -> v4f64; + #[link_name = "llvm.loongarch.lasx.xvpickve.w.f"] + fn __lasx_xvpickve_w_f(a: v8f32, b: u32) -> v8f32; + #[link_name = "llvm.loongarch.lasx.xvrepli.b"] + fn __lasx_xvrepli_b(a: i32) -> v32i8; + #[link_name = "llvm.loongarch.lasx.xvrepli.d"] + fn __lasx_xvrepli_d(a: i32) -> v4i64; + #[link_name = "llvm.loongarch.lasx.xvrepli.h"] + fn __lasx_xvrepli_h(a: i32) -> v16i16; + #[link_name = "llvm.loongarch.lasx.xvrepli.w"] + fn __lasx_xvrepli_w(a: i32) -> v8i32; +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsll_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvsll_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsll_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvsll_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsll_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvsll_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsll_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvsll_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslli_b(a: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvslli_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslli_h(a: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvslli_h(a, 
IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslli_w(a: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvslli_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslli_d(a: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvslli_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsra_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvsra_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsra_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvsra_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsra_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvsra_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsra_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvsra_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrai_b(a: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvsrai_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrai_h(a: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvsrai_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrai_w(a: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsrai_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrai_d(a: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvsrai_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrar_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvsrar_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrar_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvsrar_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrar_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvsrar_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrar_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvsrar_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrari_b(a: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvsrari_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = 
"stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrari_h(a: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvsrari_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrari_w(a: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsrari_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrari_d(a: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvsrari_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrl_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvsrl_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrl_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvsrl_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrl_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvsrl_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrl_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvsrl_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrli_b(a: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvsrli_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrli_h(a: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvsrli_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrli_w(a: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsrli_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrli_d(a: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvsrli_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlr_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvsrlr_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlr_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvsrlr_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlr_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvsrlr_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlr_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvsrlr_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlri_b(a: v32i8) -> v32i8 { + 
static_assert_uimm_bits!(IMM3, 3); + __lasx_xvsrlri_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlri_h(a: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvsrlri_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlri_w(a: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsrlri_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlri_d(a: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvsrlri_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitclr_b(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvbitclr_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitclr_h(a: v16u16, b: v16u16) -> v16u16 { + __lasx_xvbitclr_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitclr_w(a: v8u32, b: v8u32) -> v8u32 { + __lasx_xvbitclr_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitclr_d(a: v4u64, b: v4u64) -> v4u64 { + __lasx_xvbitclr_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitclri_b(a: v32u8) -> v32u8 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvbitclri_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitclri_h(a: v16u16) -> v16u16 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvbitclri_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitclri_w(a: v8u32) -> v8u32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvbitclri_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitclri_d(a: v4u64) -> v4u64 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvbitclri_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitset_b(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvbitset_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitset_h(a: v16u16, b: v16u16) -> v16u16 { + __lasx_xvbitset_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitset_w(a: v8u32, b: v8u32) -> v8u32 { + __lasx_xvbitset_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitset_d(a: v4u64, b: v4u64) -> v4u64 { + 
__lasx_xvbitset_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitseti_b(a: v32u8) -> v32u8 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvbitseti_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitseti_h(a: v16u16) -> v16u16 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvbitseti_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitseti_w(a: v8u32) -> v8u32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvbitseti_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitseti_d(a: v4u64) -> v4u64 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvbitseti_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitrev_b(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvbitrev_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitrev_h(a: v16u16, b: v16u16) -> v16u16 { + __lasx_xvbitrev_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitrev_w(a: v8u32, b: v8u32) -> v8u32 { + __lasx_xvbitrev_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitrev_d(a: v4u64, b: v4u64) -> v4u64 { + __lasx_xvbitrev_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitrevi_b(a: v32u8) -> v32u8 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvbitrevi_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitrevi_h(a: v16u16) -> v16u16 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvbitrevi_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitrevi_w(a: v8u32) -> v8u32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvbitrevi_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitrevi_d(a: v4u64) -> v4u64 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvbitrevi_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvadd_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvadd_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvadd_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvadd_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvadd_w(a: v8i32, b: v8i32) -> 
v8i32 { + __lasx_xvadd_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvadd_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvadd_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddi_bu(a: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvaddi_bu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddi_hu(a: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvaddi_hu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddi_wu(a: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvaddi_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddi_du(a: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvaddi_du(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsub_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvsub_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsub_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvsub_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsub_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvsub_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsub_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvsub_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubi_bu(a: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsubi_bu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubi_hu(a: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsubi_hu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubi_wu(a: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsubi_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubi_du(a: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsubi_du(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmax_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvmax_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmax_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvmax_h(a, b) +} + +#[inline] 
+#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmax_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvmax_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmax_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvmax_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaxi_b(a: v32i8) -> v32i8 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvmaxi_b(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaxi_h(a: v16i16) -> v16i16 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvmaxi_h(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaxi_w(a: v8i32) -> v8i32 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvmaxi_w(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaxi_d(a: v4i64) -> v4i64 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvmaxi_d(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmax_bu(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvmax_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmax_hu(a: v16u16, b: v16u16) -> v16u16 { + __lasx_xvmax_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmax_wu(a: v8u32, b: v8u32) -> v8u32 { + __lasx_xvmax_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmax_du(a: v4u64, b: v4u64) -> v4u64 { + __lasx_xvmax_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaxi_bu(a: v32u8) -> v32u8 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvmaxi_bu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaxi_hu(a: v16u16) -> v16u16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvmaxi_hu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaxi_wu(a: v8u32) -> v8u32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvmaxi_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaxi_du(a: v4u64) -> v4u64 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvmaxi_du(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmin_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvmin_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] 
+#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmin_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvmin_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmin_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvmin_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmin_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvmin_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmini_b(a: v32i8) -> v32i8 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvmini_b(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmini_h(a: v16i16) -> v16i16 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvmini_h(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmini_w(a: v8i32) -> v8i32 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvmini_w(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmini_d(a: v4i64) -> v4i64 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvmini_d(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmin_bu(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvmin_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmin_hu(a: v16u16, b: v16u16) -> v16u16 { + __lasx_xvmin_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmin_wu(a: v8u32, b: v8u32) -> v8u32 { + __lasx_xvmin_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmin_du(a: v4u64, b: v4u64) -> v4u64 { + __lasx_xvmin_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmini_bu(a: v32u8) -> v32u8 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvmini_bu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmini_hu(a: v16u16) -> v16u16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvmini_hu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmini_wu(a: v8u32) -> v8u32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvmini_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmini_du(a: v4u64) -> v4u64 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvmini_du(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = 
"stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvseq_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvseq_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvseq_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvseq_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvseq_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvseq_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvseq_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvseq_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvseqi_b(a: v32i8) -> v32i8 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvseqi_b(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvseqi_h(a: v16i16) -> v16i16 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvseqi_h(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvseqi_w(a: v8i32) -> v8i32 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvseqi_w(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvseqi_d(a: v4i64) -> v4i64 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvseqi_d(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslt_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvslt_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslt_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvslt_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslt_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvslt_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslt_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvslt_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslti_b(a: v32i8) -> v32i8 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvslti_b(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslti_h(a: v16i16) -> v16i16 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvslti_h(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslti_w(a: v8i32) -> v8i32 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvslti_w(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslti_d(a: 
v4i64) -> v4i64 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvslti_d(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslt_bu(a: v32u8, b: v32u8) -> v32i8 { + __lasx_xvslt_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslt_hu(a: v16u16, b: v16u16) -> v16i16 { + __lasx_xvslt_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslt_wu(a: v8u32, b: v8u32) -> v8i32 { + __lasx_xvslt_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslt_du(a: v4u64, b: v4u64) -> v4i64 { + __lasx_xvslt_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslti_bu(a: v32u8) -> v32i8 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvslti_bu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslti_hu(a: v16u16) -> v16i16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvslti_hu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslti_wu(a: v8u32) -> v8i32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvslti_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslti_du(a: v4u64) -> v4i64 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvslti_du(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsle_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvsle_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsle_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvsle_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsle_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvsle_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsle_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvsle_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslei_b(a: v32i8) -> v32i8 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvslei_b(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslei_h(a: v16i16) -> v16i16 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvslei_h(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslei_w(a: v8i32) -> v8i32 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvslei_w(a, IMM_S5) +} + +#[inline] 
+#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslei_d(a: v4i64) -> v4i64 { + static_assert_simm_bits!(IMM_S5, 5); + __lasx_xvslei_d(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsle_bu(a: v32u8, b: v32u8) -> v32i8 { + __lasx_xvsle_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsle_hu(a: v16u16, b: v16u16) -> v16i16 { + __lasx_xvsle_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsle_wu(a: v8u32, b: v8u32) -> v8i32 { + __lasx_xvsle_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsle_du(a: v4u64, b: v4u64) -> v4i64 { + __lasx_xvsle_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslei_bu(a: v32u8) -> v32i8 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvslei_bu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslei_hu(a: v16u16) -> v16i16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvslei_hu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslei_wu(a: v8u32) -> v8i32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvslei_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvslei_du(a: v4u64) -> v4i64 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvslei_du(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsat_b(a: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvsat_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsat_h(a: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvsat_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsat_w(a: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsat_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsat_d(a: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvsat_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsat_bu(a: v32u8) -> v32u8 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvsat_bu(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = 
"117427")] +pub unsafe fn lasx_xvsat_hu(a: v16u16) -> v16u16 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvsat_hu(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsat_wu(a: v8u32) -> v8u32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsat_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsat_du(a: v4u64) -> v4u64 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvsat_du(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvadda_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvadda_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvadda_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvadda_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvadda_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvadda_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvadda_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvadda_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsadd_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvsadd_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsadd_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvsadd_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsadd_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvsadd_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsadd_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvsadd_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsadd_bu(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvsadd_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsadd_hu(a: v16u16, b: v16u16) -> v16u16 { + __lasx_xvsadd_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsadd_wu(a: v8u32, b: v8u32) -> v8u32 { + __lasx_xvsadd_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsadd_du(a: v4u64, b: v4u64) -> v4u64 { + __lasx_xvsadd_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvavg_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvavg_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvavg_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvavg_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] 
+pub unsafe fn lasx_xvavg_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvavg_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvavg_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvavg_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvavg_bu(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvavg_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvavg_hu(a: v16u16, b: v16u16) -> v16u16 { + __lasx_xvavg_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvavg_wu(a: v8u32, b: v8u32) -> v8u32 { + __lasx_xvavg_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvavg_du(a: v4u64, b: v4u64) -> v4u64 { + __lasx_xvavg_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvavgr_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvavgr_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvavgr_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvavgr_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvavgr_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvavgr_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvavgr_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvavgr_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvavgr_bu(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvavgr_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvavgr_hu(a: v16u16, b: v16u16) -> v16u16 { + __lasx_xvavgr_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvavgr_wu(a: v8u32, b: v8u32) -> v8u32 { + __lasx_xvavgr_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvavgr_du(a: v4u64, b: v4u64) -> v4u64 { + __lasx_xvavgr_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssub_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvssub_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssub_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvssub_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssub_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvssub_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssub_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvssub_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = 
"stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssub_bu(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvssub_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssub_hu(a: v16u16, b: v16u16) -> v16u16 { + __lasx_xvssub_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssub_wu(a: v8u32, b: v8u32) -> v8u32 { + __lasx_xvssub_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssub_du(a: v4u64, b: v4u64) -> v4u64 { + __lasx_xvssub_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvabsd_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvabsd_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvabsd_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvabsd_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvabsd_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvabsd_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvabsd_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvabsd_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvabsd_bu(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvabsd_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvabsd_hu(a: v16u16, b: v16u16) -> v16u16 { + __lasx_xvabsd_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvabsd_wu(a: v8u32, b: v8u32) -> v8u32 { + __lasx_xvabsd_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvabsd_du(a: v4u64, b: v4u64) -> v4u64 { + __lasx_xvabsd_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmul_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvmul_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmul_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvmul_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmul_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvmul_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmul_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvmul_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmadd_b(a: v32i8, b: v32i8, c: v32i8) -> v32i8 { + __lasx_xvmadd_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmadd_h(a: v16i16, b: v16i16, c: v16i16) -> v16i16 { + __lasx_xvmadd_h(a, b, c) +} + 
+#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmadd_w(a: v8i32, b: v8i32, c: v8i32) -> v8i32 { + __lasx_xvmadd_w(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmadd_d(a: v4i64, b: v4i64, c: v4i64) -> v4i64 { + __lasx_xvmadd_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmsub_b(a: v32i8, b: v32i8, c: v32i8) -> v32i8 { + __lasx_xvmsub_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmsub_h(a: v16i16, b: v16i16, c: v16i16) -> v16i16 { + __lasx_xvmsub_h(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmsub_w(a: v8i32, b: v8i32, c: v8i32) -> v8i32 { + __lasx_xvmsub_w(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmsub_d(a: v4i64, b: v4i64, c: v4i64) -> v4i64 { + __lasx_xvmsub_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvdiv_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvdiv_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvdiv_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvdiv_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvdiv_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvdiv_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvdiv_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvdiv_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvdiv_bu(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvdiv_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvdiv_hu(a: v16u16, b: v16u16) -> v16u16 { + __lasx_xvdiv_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvdiv_wu(a: v8u32, b: v8u32) -> v8u32 { + __lasx_xvdiv_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvdiv_du(a: v4u64, b: v4u64) -> v4u64 { + __lasx_xvdiv_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhaddw_h_b(a: v32i8, b: v32i8) -> v16i16 { + __lasx_xvhaddw_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhaddw_w_h(a: v16i16, b: v16i16) -> v8i32 { + __lasx_xvhaddw_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhaddw_d_w(a: v8i32, b: v8i32) -> v4i64 { + __lasx_xvhaddw_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = 
"stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhaddw_hu_bu(a: v32u8, b: v32u8) -> v16u16 { + __lasx_xvhaddw_hu_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhaddw_wu_hu(a: v16u16, b: v16u16) -> v8u32 { + __lasx_xvhaddw_wu_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhaddw_du_wu(a: v8u32, b: v8u32) -> v4u64 { + __lasx_xvhaddw_du_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhsubw_h_b(a: v32i8, b: v32i8) -> v16i16 { + __lasx_xvhsubw_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhsubw_w_h(a: v16i16, b: v16i16) -> v8i32 { + __lasx_xvhsubw_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhsubw_d_w(a: v8i32, b: v8i32) -> v4i64 { + __lasx_xvhsubw_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhsubw_hu_bu(a: v32u8, b: v32u8) -> v16i16 { + __lasx_xvhsubw_hu_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhsubw_wu_hu(a: v16u16, b: v16u16) -> v8i32 { + __lasx_xvhsubw_wu_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhsubw_du_wu(a: v8u32, b: v8u32) -> v4i64 { + __lasx_xvhsubw_du_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmod_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvmod_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmod_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvmod_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmod_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvmod_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmod_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvmod_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmod_bu(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvmod_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmod_hu(a: v16u16, b: v16u16) -> v16u16 { + __lasx_xvmod_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmod_wu(a: v8u32, b: v8u32) -> v8u32 { + __lasx_xvmod_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmod_du(a: v4u64, b: v4u64) -> v4u64 { + __lasx_xvmod_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn 
lasx_xvrepl128vei_b(a: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvrepl128vei_b(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvrepl128vei_h(a: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvrepl128vei_h(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvrepl128vei_w(a: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM2, 2); + __lasx_xvrepl128vei_w(a, IMM2) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvrepl128vei_d(a: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM1, 1); + __lasx_xvrepl128vei_d(a, IMM1) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickev_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvpickev_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickev_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvpickev_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickev_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvpickev_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickev_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvpickev_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickod_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvpickod_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickod_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvpickod_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickod_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvpickod_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickod_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvpickod_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvilvh_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvilvh_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvilvh_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvilvh_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvilvh_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvilvh_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvilvh_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvilvh_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvilvl_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvilvl_b(a, b) 
+} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvilvl_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvilvl_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvilvl_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvilvl_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvilvl_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvilvl_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpackev_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvpackev_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpackev_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvpackev_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpackev_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvpackev_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpackev_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvpackev_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpackod_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvpackod_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpackod_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvpackod_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpackod_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvpackod_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpackod_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvpackod_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvshuf_b(a: v32i8, b: v32i8, c: v32i8) -> v32i8 { + __lasx_xvshuf_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvshuf_h(a: v16i16, b: v16i16, c: v16i16) -> v16i16 { + __lasx_xvshuf_h(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvshuf_w(a: v8i32, b: v8i32, c: v8i32) -> v8i32 { + __lasx_xvshuf_w(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvshuf_d(a: v4i64, b: v4i64, c: v4i64) -> v4i64 { + __lasx_xvshuf_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvand_v(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvand_v(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvandi_b(a: v32u8) -> v32u8 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvandi_b(a, IMM8) +} + +#[inline] 
+#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvor_v(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvor_v(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvori_b(a: v32u8) -> v32u8 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvori_b(a, IMM8) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvnor_v(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvnor_v(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvnori_b(a: v32u8) -> v32u8 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvnori_b(a, IMM8) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvxor_v(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvxor_v(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvxori_b(a: v32u8) -> v32u8 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvxori_b(a, IMM8) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitsel_v(a: v32u8, b: v32u8, c: v32u8) -> v32u8 { + __lasx_xvbitsel_v(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbitseli_b(a: v32u8, b: v32u8) -> v32u8 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvbitseli_b(a, b, IMM8) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvshuf4i_b(a: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvshuf4i_b(a, IMM8) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvshuf4i_h(a: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvshuf4i_h(a, IMM8) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvshuf4i_w(a: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvshuf4i_w(a, IMM8) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvreplgr2vr_b(a: i32) -> v32i8 { + __lasx_xvreplgr2vr_b(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvreplgr2vr_h(a: i32) -> v16i16 { + __lasx_xvreplgr2vr_h(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvreplgr2vr_w(a: i32) -> v8i32 { + __lasx_xvreplgr2vr_w(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvreplgr2vr_d(a: i64) -> v4i64 { + __lasx_xvreplgr2vr_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub 
unsafe fn lasx_xvpcnt_b(a: v32i8) -> v32i8 { + __lasx_xvpcnt_b(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpcnt_h(a: v16i16) -> v16i16 { + __lasx_xvpcnt_h(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpcnt_w(a: v8i32) -> v8i32 { + __lasx_xvpcnt_w(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpcnt_d(a: v4i64) -> v4i64 { + __lasx_xvpcnt_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvclo_b(a: v32i8) -> v32i8 { + __lasx_xvclo_b(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvclo_h(a: v16i16) -> v16i16 { + __lasx_xvclo_h(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvclo_w(a: v8i32) -> v8i32 { + __lasx_xvclo_w(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvclo_d(a: v4i64) -> v4i64 { + __lasx_xvclo_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvclz_b(a: v32i8) -> v32i8 { + __lasx_xvclz_b(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvclz_h(a: v16i16) -> v16i16 { + __lasx_xvclz_h(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvclz_w(a: v8i32) -> v8i32 { + __lasx_xvclz_w(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvclz_d(a: v4i64) -> v4i64 { + __lasx_xvclz_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfadd_s(a: v8f32, b: v8f32) -> v8f32 { + __lasx_xvfadd_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfadd_d(a: v4f64, b: v4f64) -> v4f64 { + __lasx_xvfadd_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfsub_s(a: v8f32, b: v8f32) -> v8f32 { + __lasx_xvfsub_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfsub_d(a: v4f64, b: v4f64) -> v4f64 { + __lasx_xvfsub_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfmul_s(a: v8f32, b: v8f32) -> v8f32 { + __lasx_xvfmul_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfmul_d(a: v4f64, b: v4f64) -> v4f64 { + __lasx_xvfmul_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfdiv_s(a: v8f32, b: v8f32) -> v8f32 { + __lasx_xvfdiv_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = 
"stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfdiv_d(a: v4f64, b: v4f64) -> v4f64 { + __lasx_xvfdiv_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcvt_h_s(a: v8f32, b: v8f32) -> v16i16 { + __lasx_xvfcvt_h_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcvt_s_d(a: v4f64, b: v4f64) -> v8f32 { + __lasx_xvfcvt_s_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfmin_s(a: v8f32, b: v8f32) -> v8f32 { + __lasx_xvfmin_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfmin_d(a: v4f64, b: v4f64) -> v4f64 { + __lasx_xvfmin_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfmina_s(a: v8f32, b: v8f32) -> v8f32 { + __lasx_xvfmina_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfmina_d(a: v4f64, b: v4f64) -> v4f64 { + __lasx_xvfmina_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfmax_s(a: v8f32, b: v8f32) -> v8f32 { + __lasx_xvfmax_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfmax_d(a: v4f64, b: v4f64) -> v4f64 { + __lasx_xvfmax_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfmaxa_s(a: v8f32, b: v8f32) -> v8f32 { + __lasx_xvfmaxa_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfmaxa_d(a: v4f64, b: v4f64) -> v4f64 { + __lasx_xvfmaxa_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfclass_s(a: v8f32) -> v8i32 { + __lasx_xvfclass_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfclass_d(a: v4f64) -> v4i64 { + __lasx_xvfclass_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfsqrt_s(a: v8f32) -> v8f32 { + __lasx_xvfsqrt_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfsqrt_d(a: v4f64) -> v4f64 { + __lasx_xvfsqrt_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrecip_s(a: v8f32) -> v8f32 { + __lasx_xvfrecip_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrecip_d(a: v4f64) -> v4f64 { + __lasx_xvfrecip_d(a) +} + +#[inline] +#[target_feature(enable = "lasx,frecipe")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrecipe_s(a: v8f32) -> v8f32 { + __lasx_xvfrecipe_s(a) +} + +#[inline] +#[target_feature(enable = "lasx,frecipe")] +#[unstable(feature = "stdarch_loongarch", 
issue = "117427")] +pub unsafe fn lasx_xvfrecipe_d(a: v4f64) -> v4f64 { + __lasx_xvfrecipe_d(a) +} + +#[inline] +#[target_feature(enable = "lasx,frecipe")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrsqrte_s(a: v8f32) -> v8f32 { + __lasx_xvfrsqrte_s(a) +} + +#[inline] +#[target_feature(enable = "lasx,frecipe")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrsqrte_d(a: v4f64) -> v4f64 { + __lasx_xvfrsqrte_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrint_s(a: v8f32) -> v8f32 { + __lasx_xvfrint_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrint_d(a: v4f64) -> v4f64 { + __lasx_xvfrint_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrsqrt_s(a: v8f32) -> v8f32 { + __lasx_xvfrsqrt_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrsqrt_d(a: v4f64) -> v4f64 { + __lasx_xvfrsqrt_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvflogb_s(a: v8f32) -> v8f32 { + __lasx_xvflogb_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvflogb_d(a: v4f64) -> v4f64 { + __lasx_xvflogb_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcvth_s_h(a: v16i16) -> v8f32 { + __lasx_xvfcvth_s_h(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcvth_d_s(a: v8f32) -> v4f64 { + __lasx_xvfcvth_d_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcvtl_s_h(a: v16i16) -> v8f32 { + __lasx_xvfcvtl_s_h(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcvtl_d_s(a: v8f32) -> v4f64 { + __lasx_xvfcvtl_d_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftint_w_s(a: v8f32) -> v8i32 { + __lasx_xvftint_w_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftint_l_d(a: v4f64) -> v4i64 { + __lasx_xvftint_l_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftint_wu_s(a: v8f32) -> v8u32 { + __lasx_xvftint_wu_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftint_lu_d(a: v4f64) -> v4u64 { + __lasx_xvftint_lu_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrz_w_s(a: v8f32) -> v8i32 { + __lasx_xvftintrz_w_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrz_l_d(a: v4f64) -> v4i64 { + __lasx_xvftintrz_l_d(a) +} + 
+#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrz_wu_s(a: v8f32) -> v8u32 { + __lasx_xvftintrz_wu_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrz_lu_d(a: v4f64) -> v4u64 { + __lasx_xvftintrz_lu_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvffint_s_w(a: v8i32) -> v8f32 { + __lasx_xvffint_s_w(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvffint_d_l(a: v4i64) -> v4f64 { + __lasx_xvffint_d_l(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvffint_s_wu(a: v8u32) -> v8f32 { + __lasx_xvffint_s_wu(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvffint_d_lu(a: v4u64) -> v4f64 { + __lasx_xvffint_d_lu(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvreplve_b(a: v32i8, b: i32) -> v32i8 { + __lasx_xvreplve_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvreplve_h(a: v16i16, b: i32) -> v16i16 { + __lasx_xvreplve_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvreplve_w(a: v8i32, b: i32) -> v8i32 { + __lasx_xvreplve_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvreplve_d(a: v4i64, b: i32) -> v4i64 { + __lasx_xvreplve_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpermi_w(a: v8i32, b: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvpermi_w(a, b, IMM8) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvandn_v(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvandn_v(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvneg_b(a: v32i8) -> v32i8 { + __lasx_xvneg_b(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvneg_h(a: v16i16) -> v16i16 { + __lasx_xvneg_h(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvneg_w(a: v8i32) -> v8i32 { + __lasx_xvneg_w(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvneg_d(a: v4i64) -> v4i64 { + __lasx_xvneg_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmuh_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvmuh_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmuh_h(a: v16i16, b: v16i16) -> v16i16 { + 
__lasx_xvmuh_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmuh_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvmuh_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmuh_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvmuh_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmuh_bu(a: v32u8, b: v32u8) -> v32u8 { + __lasx_xvmuh_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmuh_hu(a: v16u16, b: v16u16) -> v16u16 { + __lasx_xvmuh_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmuh_wu(a: v8u32, b: v8u32) -> v8u32 { + __lasx_xvmuh_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmuh_du(a: v4u64, b: v4u64) -> v4u64 { + __lasx_xvmuh_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsllwil_h_b(a: v32i8) -> v16i16 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvsllwil_h_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsllwil_w_h(a: v16i16) -> v8i32 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvsllwil_w_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsllwil_d_w(a: v8i32) -> v4i64 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsllwil_d_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsllwil_hu_bu(a: v32u8) -> v16u16 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvsllwil_hu_bu(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsllwil_wu_hu(a: v16u16) -> v8u32 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvsllwil_wu_hu(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsllwil_du_wu(a: v8u32) -> v4u64 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsllwil_du_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsran_b_h(a: v16i16, b: v16i16) -> v32i8 { + __lasx_xvsran_b_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsran_h_w(a: v8i32, b: v8i32) -> v16i16 { + __lasx_xvsran_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsran_w_d(a: v4i64, b: v4i64) -> v8i32 { + __lasx_xvsran_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", 
issue = "117427")] +pub unsafe fn lasx_xvssran_b_h(a: v16i16, b: v16i16) -> v32i8 { + __lasx_xvssran_b_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssran_h_w(a: v8i32, b: v8i32) -> v16i16 { + __lasx_xvssran_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssran_w_d(a: v4i64, b: v4i64) -> v8i32 { + __lasx_xvssran_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssran_bu_h(a: v16u16, b: v16u16) -> v32u8 { + __lasx_xvssran_bu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssran_hu_w(a: v8u32, b: v8u32) -> v16u16 { + __lasx_xvssran_hu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssran_wu_d(a: v4u64, b: v4u64) -> v8u32 { + __lasx_xvssran_wu_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrarn_b_h(a: v16i16, b: v16i16) -> v32i8 { + __lasx_xvsrarn_b_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrarn_h_w(a: v8i32, b: v8i32) -> v16i16 { + __lasx_xvsrarn_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrarn_w_d(a: v4i64, b: v4i64) -> v8i32 { + __lasx_xvsrarn_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrarn_b_h(a: v16i16, b: v16i16) -> v32i8 { + __lasx_xvssrarn_b_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrarn_h_w(a: v8i32, b: v8i32) -> v16i16 { + __lasx_xvssrarn_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrarn_w_d(a: v4i64, b: v4i64) -> v8i32 { + __lasx_xvssrarn_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrarn_bu_h(a: v16u16, b: v16u16) -> v32u8 { + __lasx_xvssrarn_bu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrarn_hu_w(a: v8u32, b: v8u32) -> v16u16 { + __lasx_xvssrarn_hu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrarn_wu_d(a: v4u64, b: v4u64) -> v8u32 { + __lasx_xvssrarn_wu_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrln_b_h(a: v16i16, b: v16i16) -> v32i8 { + __lasx_xvsrln_b_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrln_h_w(a: v8i32, b: v8i32) -> v16i16 { + __lasx_xvsrln_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn 
lasx_xvsrln_w_d(a: v4i64, b: v4i64) -> v8i32 { + __lasx_xvsrln_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrln_bu_h(a: v16u16, b: v16u16) -> v32u8 { + __lasx_xvssrln_bu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrln_hu_w(a: v8u32, b: v8u32) -> v16u16 { + __lasx_xvssrln_hu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrln_wu_d(a: v4u64, b: v4u64) -> v8u32 { + __lasx_xvssrln_wu_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlrn_b_h(a: v16i16, b: v16i16) -> v32i8 { + __lasx_xvsrlrn_b_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlrn_h_w(a: v8i32, b: v8i32) -> v16i16 { + __lasx_xvsrlrn_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlrn_w_d(a: v4i64, b: v4i64) -> v8i32 { + __lasx_xvsrlrn_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlrn_bu_h(a: v16u16, b: v16u16) -> v32u8 { + __lasx_xvssrlrn_bu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlrn_hu_w(a: v8u32, b: v8u32) -> v16u16 { + __lasx_xvssrlrn_hu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlrn_wu_d(a: v4u64, b: v4u64) -> v8u32 { + __lasx_xvssrlrn_wu_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrstpi_b(a: v32i8, b: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvfrstpi_b(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrstpi_h(a: v16i16, b: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvfrstpi_h(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrstp_b(a: v32i8, b: v32i8, c: v32i8) -> v32i8 { + __lasx_xvfrstp_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrstp_h(a: v16i16, b: v16i16, c: v16i16) -> v16i16 { + __lasx_xvfrstp_h(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvshuf4i_d(a: v4i64, b: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvshuf4i_d(a, b, IMM8) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbsrl_v(a: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvbsrl_v(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] 
+#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvbsll_v(a: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvbsll_v(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvextrins_b(a: v32i8, b: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvextrins_b(a, b, IMM8) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvextrins_h(a: v16i16, b: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvextrins_h(a, b, IMM8) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvextrins_w(a: v8i32, b: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvextrins_w(a, b, IMM8) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvextrins_d(a: v4i64, b: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvextrins_d(a, b, IMM8) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmskltz_b(a: v32i8) -> v32i8 { + __lasx_xvmskltz_b(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmskltz_h(a: v16i16) -> v16i16 { + __lasx_xvmskltz_h(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmskltz_w(a: v8i32) -> v8i32 { + __lasx_xvmskltz_w(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmskltz_d(a: v4i64) -> v4i64 { + __lasx_xvmskltz_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsigncov_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvsigncov_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsigncov_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvsigncov_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsigncov_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvsigncov_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsigncov_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvsigncov_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfmadd_s(a: v8f32, b: v8f32, c: v8f32) -> v8f32 { + __lasx_xvfmadd_s(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfmadd_d(a: v4f64, b: v4f64, c: v4f64) -> v4f64 { + __lasx_xvfmadd_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfmsub_s(a: v8f32, b: v8f32, c: v8f32) -> v8f32 { + __lasx_xvfmsub_s(a, b, 
c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfmsub_d(a: v4f64, b: v4f64, c: v4f64) -> v4f64 { + __lasx_xvfmsub_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfnmadd_s(a: v8f32, b: v8f32, c: v8f32) -> v8f32 { + __lasx_xvfnmadd_s(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfnmadd_d(a: v4f64, b: v4f64, c: v4f64) -> v4f64 { + __lasx_xvfnmadd_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfnmsub_s(a: v8f32, b: v8f32, c: v8f32) -> v8f32 { + __lasx_xvfnmsub_s(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfnmsub_d(a: v4f64, b: v4f64, c: v4f64) -> v4f64 { + __lasx_xvfnmsub_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrne_w_s(a: v8f32) -> v8i32 { + __lasx_xvftintrne_w_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrne_l_d(a: v4f64) -> v4i64 { + __lasx_xvftintrne_l_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrp_w_s(a: v8f32) -> v8i32 { + __lasx_xvftintrp_w_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrp_l_d(a: v4f64) -> v4i64 { + __lasx_xvftintrp_l_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrm_w_s(a: v8f32) -> v8i32 { + __lasx_xvftintrm_w_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrm_l_d(a: v4f64) -> v4i64 { + __lasx_xvftintrm_l_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftint_w_d(a: v4f64, b: v4f64) -> v8i32 { + __lasx_xvftint_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvffint_s_l(a: v4i64, b: v4i64) -> v8f32 { + __lasx_xvffint_s_l(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrz_w_d(a: v4f64, b: v4f64) -> v8i32 { + __lasx_xvftintrz_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrp_w_d(a: v4f64, b: v4f64) -> v8i32 { + __lasx_xvftintrp_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrm_w_d(a: v4f64, b: v4f64) -> v8i32 { + __lasx_xvftintrm_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrne_w_d(a: v4f64, b: v4f64) -> v8i32 { + __lasx_xvftintrne_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] 
+#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftinth_l_s(a: v8f32) -> v4i64 { + __lasx_xvftinth_l_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintl_l_s(a: v8f32) -> v4i64 { + __lasx_xvftintl_l_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvffinth_d_w(a: v8i32) -> v4f64 { + __lasx_xvffinth_d_w(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvffintl_d_w(a: v8i32) -> v4f64 { + __lasx_xvffintl_d_w(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrzh_l_s(a: v8f32) -> v4i64 { + __lasx_xvftintrzh_l_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrzl_l_s(a: v8f32) -> v4i64 { + __lasx_xvftintrzl_l_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrph_l_s(a: v8f32) -> v4i64 { + __lasx_xvftintrph_l_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrpl_l_s(a: v8f32) -> v4i64 { + __lasx_xvftintrpl_l_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrmh_l_s(a: v8f32) -> v4i64 { + __lasx_xvftintrmh_l_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrml_l_s(a: v8f32) -> v4i64 { + __lasx_xvftintrml_l_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrneh_l_s(a: v8f32) -> v4i64 { + __lasx_xvftintrneh_l_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvftintrnel_l_s(a: v8f32) -> v4i64 { + __lasx_xvftintrnel_l_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrintrne_s(a: v8f32) -> v8f32 { + __lasx_xvfrintrne_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrintrne_d(a: v4f64) -> v4f64 { + __lasx_xvfrintrne_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrintrz_s(a: v8f32) -> v8f32 { + __lasx_xvfrintrz_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrintrz_d(a: v4f64) -> v4f64 { + __lasx_xvfrintrz_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrintrp_s(a: v8f32) -> v8f32 { + __lasx_xvfrintrp_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrintrp_d(a: v4f64) -> v4f64 { + __lasx_xvfrintrp_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = 
"117427")] +pub unsafe fn lasx_xvfrintrm_s(a: v8f32) -> v8f32 { + __lasx_xvfrintrm_s(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfrintrm_d(a: v4f64) -> v4f64 { + __lasx_xvfrintrm_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvld(mem_addr: *const i8) -> v32i8 { + static_assert_simm_bits!(IMM_S12, 12); + __lasx_xvld(mem_addr, IMM_S12) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvst(a: v32i8, mem_addr: *mut i8) { + static_assert_simm_bits!(IMM_S12, 12); + __lasx_xvst(a, mem_addr, IMM_S12) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvstelm_b(a: v32i8, mem_addr: *mut i8) { + static_assert_simm_bits!(IMM_S8, 8); + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvstelm_b(a, mem_addr, IMM_S8, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvstelm_h(a: v16i16, mem_addr: *mut i8) { + static_assert_simm_bits!(IMM_S8, 8); + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvstelm_h(a, mem_addr, IMM_S8, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvstelm_w(a: v8i32, mem_addr: *mut i8) { + static_assert_simm_bits!(IMM_S8, 8); + static_assert_uimm_bits!(IMM2, 2); + __lasx_xvstelm_w(a, mem_addr, IMM_S8, IMM2) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvstelm_d(a: v4i64, mem_addr: *mut i8) { + static_assert_simm_bits!(IMM_S8, 8); + static_assert_uimm_bits!(IMM1, 1); + __lasx_xvstelm_d(a, mem_addr, IMM_S8, IMM1) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvinsve0_w(a: v8i32, b: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvinsve0_w(a, b, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvinsve0_d(a: v4i64, b: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM2, 2); + __lasx_xvinsve0_d(a, b, IMM2) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickve_w(a: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvpickve_w(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickve_d(a: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM2, 2); + __lasx_xvpickve_d(a, IMM2) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlrn_b_h(a: v16i16, b: v16i16) -> v32i8 { + __lasx_xvssrlrn_b_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] 
+#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlrn_h_w(a: v8i32, b: v8i32) -> v16i16 { + __lasx_xvssrlrn_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlrn_w_d(a: v4i64, b: v4i64) -> v8i32 { + __lasx_xvssrlrn_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrln_b_h(a: v16i16, b: v16i16) -> v32i8 { + __lasx_xvssrln_b_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrln_h_w(a: v8i32, b: v8i32) -> v16i16 { + __lasx_xvssrln_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrln_w_d(a: v4i64, b: v4i64) -> v8i32 { + __lasx_xvssrln_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvorn_v(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvorn_v(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvldi() -> v4i64 { + static_assert_simm_bits!(IMM_S13, 13); + __lasx_xvldi(IMM_S13) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvldx(mem_addr: *const i8, b: i64) -> v32i8 { + __lasx_xvldx(mem_addr, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvstx(a: v32i8, mem_addr: *mut i8, b: i64) { + __lasx_xvstx(a, mem_addr, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvextl_qu_du(a: v4u64) -> v4u64 { + __lasx_xvextl_qu_du(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvinsgr2vr_w(a: v8i32, b: i32) -> v8i32 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvinsgr2vr_w(a, b, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvinsgr2vr_d(a: v4i64, b: i64) -> v4i64 { + static_assert_uimm_bits!(IMM2, 2); + __lasx_xvinsgr2vr_d(a, b, IMM2) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvreplve0_b(a: v32i8) -> v32i8 { + __lasx_xvreplve0_b(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvreplve0_h(a: v16i16) -> v16i16 { + __lasx_xvreplve0_h(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvreplve0_w(a: v8i32) -> v8i32 { + __lasx_xvreplve0_w(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvreplve0_d(a: v4i64) -> v4i64 { + __lasx_xvreplve0_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvreplve0_q(a: v32i8) -> v32i8 { + 
__lasx_xvreplve0_q(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_vext2xv_h_b(a: v32i8) -> v16i16 { + __lasx_vext2xv_h_b(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_vext2xv_w_h(a: v16i16) -> v8i32 { + __lasx_vext2xv_w_h(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_vext2xv_d_w(a: v8i32) -> v4i64 { + __lasx_vext2xv_d_w(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_vext2xv_w_b(a: v32i8) -> v8i32 { + __lasx_vext2xv_w_b(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_vext2xv_d_h(a: v16i16) -> v4i64 { + __lasx_vext2xv_d_h(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_vext2xv_d_b(a: v32i8) -> v4i64 { + __lasx_vext2xv_d_b(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_vext2xv_hu_bu(a: v32i8) -> v16i16 { + __lasx_vext2xv_hu_bu(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_vext2xv_wu_hu(a: v16i16) -> v8i32 { + __lasx_vext2xv_wu_hu(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_vext2xv_du_wu(a: v8i32) -> v4i64 { + __lasx_vext2xv_du_wu(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_vext2xv_wu_bu(a: v32i8) -> v8i32 { + __lasx_vext2xv_wu_bu(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_vext2xv_du_hu(a: v16i16) -> v4i64 { + __lasx_vext2xv_du_hu(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_vext2xv_du_bu(a: v32i8) -> v4i64 { + __lasx_vext2xv_du_bu(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpermi_q(a: v32i8, b: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvpermi_q(a, b, IMM8) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpermi_d(a: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM8, 8); + __lasx_xvpermi_d(a, IMM8) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvperm_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvperm_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvldrepl_b(mem_addr: *const i8) -> v32i8 { + static_assert_simm_bits!(IMM_S12, 12); + __lasx_xvldrepl_b(mem_addr, IMM_S12) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn 
lasx_xvldrepl_h(mem_addr: *const i8) -> v16i16 { + static_assert_simm_bits!(IMM_S11, 11); + __lasx_xvldrepl_h(mem_addr, IMM_S11) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvldrepl_w(mem_addr: *const i8) -> v8i32 { + static_assert_simm_bits!(IMM_S10, 10); + __lasx_xvldrepl_w(mem_addr, IMM_S10) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvldrepl_d(mem_addr: *const i8) -> v4i64 { + static_assert_simm_bits!(IMM_S9, 9); + __lasx_xvldrepl_d(mem_addr, IMM_S9) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickve2gr_w(a: v8i32) -> i32 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvpickve2gr_w(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickve2gr_wu(a: v8i32) -> u32 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvpickve2gr_wu(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickve2gr_d(a: v4i64) -> i64 { + static_assert_uimm_bits!(IMM2, 2); + __lasx_xvpickve2gr_d(a, IMM2) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickve2gr_du(a: v4i64) -> u64 { + static_assert_uimm_bits!(IMM2, 2); + __lasx_xvpickve2gr_du(a, IMM2) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwev_q_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvaddwev_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwev_d_w(a: v8i32, b: v8i32) -> v4i64 { + __lasx_xvaddwev_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwev_w_h(a: v16i16, b: v16i16) -> v8i32 { + __lasx_xvaddwev_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwev_h_b(a: v32i8, b: v32i8) -> v16i16 { + __lasx_xvaddwev_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwev_q_du(a: v4u64, b: v4u64) -> v4i64 { + __lasx_xvaddwev_q_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwev_d_wu(a: v8u32, b: v8u32) -> v4i64 { + __lasx_xvaddwev_d_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwev_w_hu(a: v16u16, b: v16u16) -> v8i32 { + __lasx_xvaddwev_w_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwev_h_bu(a: v32u8, b: v32u8) -> v16i16 { + __lasx_xvaddwev_h_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", 
issue = "117427")] +pub unsafe fn lasx_xvsubwev_q_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvsubwev_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubwev_d_w(a: v8i32, b: v8i32) -> v4i64 { + __lasx_xvsubwev_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubwev_w_h(a: v16i16, b: v16i16) -> v8i32 { + __lasx_xvsubwev_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubwev_h_b(a: v32i8, b: v32i8) -> v16i16 { + __lasx_xvsubwev_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubwev_q_du(a: v4u64, b: v4u64) -> v4i64 { + __lasx_xvsubwev_q_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubwev_d_wu(a: v8u32, b: v8u32) -> v4i64 { + __lasx_xvsubwev_d_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubwev_w_hu(a: v16u16, b: v16u16) -> v8i32 { + __lasx_xvsubwev_w_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubwev_h_bu(a: v32u8, b: v32u8) -> v16i16 { + __lasx_xvsubwev_h_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwev_q_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvmulwev_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwev_d_w(a: v8i32, b: v8i32) -> v4i64 { + __lasx_xvmulwev_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwev_w_h(a: v16i16, b: v16i16) -> v8i32 { + __lasx_xvmulwev_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwev_h_b(a: v32i8, b: v32i8) -> v16i16 { + __lasx_xvmulwev_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwev_q_du(a: v4u64, b: v4u64) -> v4i64 { + __lasx_xvmulwev_q_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwev_d_wu(a: v8u32, b: v8u32) -> v4i64 { + __lasx_xvmulwev_d_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwev_w_hu(a: v16u16, b: v16u16) -> v8i32 { + __lasx_xvmulwev_w_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwev_h_bu(a: v32u8, b: v32u8) -> v16i16 { + __lasx_xvmulwev_h_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwod_q_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvaddwod_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub 
unsafe fn lasx_xvaddwod_d_w(a: v8i32, b: v8i32) -> v4i64 { + __lasx_xvaddwod_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwod_w_h(a: v16i16, b: v16i16) -> v8i32 { + __lasx_xvaddwod_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwod_h_b(a: v32i8, b: v32i8) -> v16i16 { + __lasx_xvaddwod_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwod_q_du(a: v4u64, b: v4u64) -> v4i64 { + __lasx_xvaddwod_q_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwod_d_wu(a: v8u32, b: v8u32) -> v4i64 { + __lasx_xvaddwod_d_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwod_w_hu(a: v16u16, b: v16u16) -> v8i32 { + __lasx_xvaddwod_w_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwod_h_bu(a: v32u8, b: v32u8) -> v16i16 { + __lasx_xvaddwod_h_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubwod_q_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvsubwod_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubwod_d_w(a: v8i32, b: v8i32) -> v4i64 { + __lasx_xvsubwod_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubwod_w_h(a: v16i16, b: v16i16) -> v8i32 { + __lasx_xvsubwod_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubwod_h_b(a: v32i8, b: v32i8) -> v16i16 { + __lasx_xvsubwod_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubwod_q_du(a: v4u64, b: v4u64) -> v4i64 { + __lasx_xvsubwod_q_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubwod_d_wu(a: v8u32, b: v8u32) -> v4i64 { + __lasx_xvsubwod_d_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubwod_w_hu(a: v16u16, b: v16u16) -> v8i32 { + __lasx_xvsubwod_w_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsubwod_h_bu(a: v32u8, b: v32u8) -> v16i16 { + __lasx_xvsubwod_h_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwod_q_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvmulwod_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwod_d_w(a: v8i32, b: v8i32) -> v4i64 { + __lasx_xvmulwod_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn 
lasx_xvmulwod_w_h(a: v16i16, b: v16i16) -> v8i32 { + __lasx_xvmulwod_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwod_h_b(a: v32i8, b: v32i8) -> v16i16 { + __lasx_xvmulwod_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwod_q_du(a: v4u64, b: v4u64) -> v4i64 { + __lasx_xvmulwod_q_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwod_d_wu(a: v8u32, b: v8u32) -> v4i64 { + __lasx_xvmulwod_d_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwod_w_hu(a: v16u16, b: v16u16) -> v8i32 { + __lasx_xvmulwod_w_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwod_h_bu(a: v32u8, b: v32u8) -> v16i16 { + __lasx_xvmulwod_h_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwev_d_wu_w(a: v8u32, b: v8i32) -> v4i64 { + __lasx_xvaddwev_d_wu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwev_w_hu_h(a: v16u16, b: v16i16) -> v8i32 { + __lasx_xvaddwev_w_hu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwev_h_bu_b(a: v32u8, b: v32i8) -> v16i16 { + __lasx_xvaddwev_h_bu_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwev_d_wu_w(a: v8u32, b: v8i32) -> v4i64 { + __lasx_xvmulwev_d_wu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwev_w_hu_h(a: v16u16, b: v16i16) -> v8i32 { + __lasx_xvmulwev_w_hu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwev_h_bu_b(a: v32u8, b: v32i8) -> v16i16 { + __lasx_xvmulwev_h_bu_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwod_d_wu_w(a: v8u32, b: v8i32) -> v4i64 { + __lasx_xvaddwod_d_wu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwod_w_hu_h(a: v16u16, b: v16i16) -> v8i32 { + __lasx_xvaddwod_w_hu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwod_h_bu_b(a: v32u8, b: v32i8) -> v16i16 { + __lasx_xvaddwod_h_bu_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwod_d_wu_w(a: v8u32, b: v8i32) -> v4i64 { + __lasx_xvmulwod_d_wu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwod_w_hu_h(a: v16u16, b: v16i16) -> v8i32 { + __lasx_xvmulwod_w_hu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = 
"stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwod_h_bu_b(a: v32u8, b: v32i8) -> v16i16 { + __lasx_xvmulwod_h_bu_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhaddw_q_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvhaddw_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhaddw_qu_du(a: v4u64, b: v4u64) -> v4u64 { + __lasx_xvhaddw_qu_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhsubw_q_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvhsubw_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvhsubw_qu_du(a: v4u64, b: v4u64) -> v4u64 { + __lasx_xvhsubw_qu_du(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwev_q_d(a: v4i64, b: v4i64, c: v4i64) -> v4i64 { + __lasx_xvmaddwev_q_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwev_d_w(a: v4i64, b: v8i32, c: v8i32) -> v4i64 { + __lasx_xvmaddwev_d_w(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwev_w_h(a: v8i32, b: v16i16, c: v16i16) -> v8i32 { + __lasx_xvmaddwev_w_h(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwev_h_b(a: v16i16, b: v32i8, c: v32i8) -> v16i16 { + __lasx_xvmaddwev_h_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwev_q_du(a: v4u64, b: v4u64, c: v4u64) -> v4u64 { + __lasx_xvmaddwev_q_du(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwev_d_wu(a: v4u64, b: v8u32, c: v8u32) -> v4u64 { + __lasx_xvmaddwev_d_wu(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwev_w_hu(a: v8u32, b: v16u16, c: v16u16) -> v8u32 { + __lasx_xvmaddwev_w_hu(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwev_h_bu(a: v16u16, b: v32u8, c: v32u8) -> v16u16 { + __lasx_xvmaddwev_h_bu(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwod_q_d(a: v4i64, b: v4i64, c: v4i64) -> v4i64 { + __lasx_xvmaddwod_q_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwod_d_w(a: v4i64, b: v8i32, c: v8i32) -> v4i64 { + __lasx_xvmaddwod_d_w(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwod_w_h(a: v8i32, b: v16i16, c: v16i16) -> v8i32 { + __lasx_xvmaddwod_w_h(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn 
lasx_xvmaddwod_h_b(a: v16i16, b: v32i8, c: v32i8) -> v16i16 { + __lasx_xvmaddwod_h_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwod_q_du(a: v4u64, b: v4u64, c: v4u64) -> v4u64 { + __lasx_xvmaddwod_q_du(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwod_d_wu(a: v4u64, b: v8u32, c: v8u32) -> v4u64 { + __lasx_xvmaddwod_d_wu(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwod_w_hu(a: v8u32, b: v16u16, c: v16u16) -> v8u32 { + __lasx_xvmaddwod_w_hu(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwod_h_bu(a: v16u16, b: v32u8, c: v32u8) -> v16u16 { + __lasx_xvmaddwod_h_bu(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwev_q_du_d(a: v4i64, b: v4u64, c: v4i64) -> v4i64 { + __lasx_xvmaddwev_q_du_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwev_d_wu_w(a: v4i64, b: v8u32, c: v8i32) -> v4i64 { + __lasx_xvmaddwev_d_wu_w(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwev_w_hu_h(a: v8i32, b: v16u16, c: v16i16) -> v8i32 { + __lasx_xvmaddwev_w_hu_h(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwev_h_bu_b(a: v16i16, b: v32u8, c: v32i8) -> v16i16 { + __lasx_xvmaddwev_h_bu_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwod_q_du_d(a: v4i64, b: v4u64, c: v4i64) -> v4i64 { + __lasx_xvmaddwod_q_du_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwod_d_wu_w(a: v4i64, b: v8u32, c: v8i32) -> v4i64 { + __lasx_xvmaddwod_d_wu_w(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwod_w_hu_h(a: v8i32, b: v16u16, c: v16i16) -> v8i32 { + __lasx_xvmaddwod_w_hu_h(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmaddwod_h_bu_b(a: v16i16, b: v32u8, c: v32i8) -> v16i16 { + __lasx_xvmaddwod_h_bu_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvrotr_b(a: v32i8, b: v32i8) -> v32i8 { + __lasx_xvrotr_b(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvrotr_h(a: v16i16, b: v16i16) -> v16i16 { + __lasx_xvrotr_h(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvrotr_w(a: v8i32, b: v8i32) -> v8i32 { + __lasx_xvrotr_w(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn 
lasx_xvrotr_d(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvrotr_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvadd_q(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvadd_q(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsub_q(a: v4i64, b: v4i64) -> v4i64 { + __lasx_xvsub_q(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwev_q_du_d(a: v4u64, b: v4i64) -> v4i64 { + __lasx_xvaddwev_q_du_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvaddwod_q_du_d(a: v4u64, b: v4i64) -> v4i64 { + __lasx_xvaddwod_q_du_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwev_q_du_d(a: v4u64, b: v4i64) -> v4i64 { + __lasx_xvmulwev_q_du_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmulwod_q_du_d(a: v4u64, b: v4i64) -> v4i64 { + __lasx_xvmulwod_q_du_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmskgez_b(a: v32i8) -> v32i8 { + __lasx_xvmskgez_b(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvmsknz_b(a: v32i8) -> v32i8 { + __lasx_xvmsknz_b(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvexth_h_b(a: v32i8) -> v16i16 { + __lasx_xvexth_h_b(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvexth_w_h(a: v16i16) -> v8i32 { + __lasx_xvexth_w_h(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvexth_d_w(a: v8i32) -> v4i64 { + __lasx_xvexth_d_w(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvexth_q_d(a: v4i64) -> v4i64 { + __lasx_xvexth_q_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvexth_hu_bu(a: v32u8) -> v16u16 { + __lasx_xvexth_hu_bu(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvexth_wu_hu(a: v16u16) -> v8u32 { + __lasx_xvexth_wu_hu(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvexth_du_wu(a: v8u32) -> v4u64 { + __lasx_xvexth_du_wu(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvexth_qu_du(a: v4u64) -> v4u64 { + __lasx_xvexth_qu_du(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvrotri_b(a: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvrotri_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] 
+#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvrotri_h<const IMM4: u32>(a: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvrotri_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvrotri_w<const IMM5: u32>(a: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvrotri_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvrotri_d<const IMM6: u32>(a: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvrotri_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvextl_q_d(a: v4i64) -> v4i64 { + __lasx_xvextl_q_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlni_b_h<const IMM4: u32>(a: v32i8, b: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvsrlni_b_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlni_h_w<const IMM5: u32>(a: v16i16, b: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsrlni_h_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlni_w_d<const IMM6: u32>(a: v8i32, b: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvsrlni_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlni_d_q<const IMM7: u32>(a: v4i64, b: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM7, 7); + __lasx_xvsrlni_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlrni_b_h<const IMM4: u32>(a: v32i8, b: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvsrlrni_b_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlrni_h_w<const IMM5: u32>(a: v16i16, b: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsrlrni_h_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlrni_w_d<const IMM6: u32>(a: v8i32, b: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvsrlrni_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrlrni_d_q<const IMM7: u32>(a: v4i64, b: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM7, 7); + __lasx_xvsrlrni_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlni_b_h<const IMM4: u32>(a: v32i8, b: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvssrlni_b_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] 
+#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlni_h_w<const IMM5: u32>(a: v16i16, b: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvssrlni_h_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlni_w_d<const IMM6: u32>(a: v8i32, b: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvssrlni_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlni_d_q<const IMM7: u32>(a: v4i64, b: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM7, 7); + __lasx_xvssrlni_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlni_bu_h<const IMM4: u32>(a: v32u8, b: v32i8) -> v32u8 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvssrlni_bu_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlni_hu_w<const IMM5: u32>(a: v16u16, b: v16i16) -> v16u16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvssrlni_hu_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlni_wu_d<const IMM6: u32>(a: v8u32, b: v8i32) -> v8u32 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvssrlni_wu_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlni_du_q<const IMM7: u32>(a: v4u64, b: v4i64) -> v4u64 { + static_assert_uimm_bits!(IMM7, 7); + __lasx_xvssrlni_du_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlrni_b_h<const IMM4: u32>(a: v32i8, b: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvssrlrni_b_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlrni_h_w<const IMM5: u32>(a: v16i16, b: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvssrlrni_h_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlrni_w_d<const IMM6: u32>(a: v8i32, b: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvssrlrni_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlrni_d_q<const IMM7: u32>(a: v4i64, b: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM7, 7); + __lasx_xvssrlrni_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlrni_bu_h<const IMM4: u32>(a: v32u8, b: v32i8) -> v32u8 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvssrlrni_bu_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn 
lasx_xvssrlrni_hu_w<const IMM5: u32>(a: v16u16, b: v16i16) -> v16u16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvssrlrni_hu_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlrni_wu_d<const IMM6: u32>(a: v8u32, b: v8i32) -> v8u32 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvssrlrni_wu_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrlrni_du_q<const IMM7: u32>(a: v4u64, b: v4i64) -> v4u64 { + static_assert_uimm_bits!(IMM7, 7); + __lasx_xvssrlrni_du_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrani_b_h<const IMM4: u32>(a: v32i8, b: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvsrani_b_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrani_h_w<const IMM5: u32>(a: v16i16, b: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsrani_h_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrani_w_d<const IMM6: u32>(a: v8i32, b: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvsrani_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrani_d_q<const IMM7: u32>(a: v4i64, b: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM7, 7); + __lasx_xvsrani_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrarni_b_h<const IMM4: u32>(a: v32i8, b: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvsrarni_b_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrarni_h_w<const IMM5: u32>(a: v16i16, b: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvsrarni_h_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrarni_w_d<const IMM6: u32>(a: v8i32, b: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvsrarni_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvsrarni_d_q<const IMM7: u32>(a: v4i64, b: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM7, 7); + __lasx_xvsrarni_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrani_b_h<const IMM4: u32>(a: v32i8, b: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvssrani_b_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrani_h_w<const IMM5: u32>(a: v16i16, b: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvssrani_h_w(a, b, IMM5) +} + 
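For orientation, a minimal, hypothetical sketch (not taken from this diff) of how the const-immediate wrappers above are called from Rust: the shift amount travels as a const generic, is range-checked at compile time by `static_assert_uimm_bits!`, and is then forwarded to the corresponding `__lasx_*` builtin. The function name `saturating_narrow` is illustrative only, and the sketch assumes a nightly toolchain with the `stdarch_loongarch` feature enabled and the module's internal vector types (`v16i16`, `v32i8`) in scope.

#[cfg(target_arch = "loongarch64")]
#[target_feature(enable = "lasx")]
unsafe fn saturating_narrow(a: v16i16, b: v16i16) -> v32i8 {
    // Arithmetically shift the 16-bit lanes of both sources right by 4,
    // saturate each result to i8, and pack the narrowed lanes into one
    // 256-bit vector; the immediate must fit in 4 bits (0..=15).
    lasx_xvssrani_b_h::<4>(a, b)
}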
+#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrani_w_d<const IMM6: u32>(a: v8i32, b: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvssrani_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrani_d_q<const IMM7: u32>(a: v4i64, b: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM7, 7); + __lasx_xvssrani_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrani_bu_h<const IMM4: u32>(a: v32u8, b: v32i8) -> v32u8 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvssrani_bu_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrani_hu_w<const IMM5: u32>(a: v16u16, b: v16i16) -> v16u16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvssrani_hu_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrani_wu_d<const IMM6: u32>(a: v8u32, b: v8i32) -> v8u32 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvssrani_wu_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrani_du_q<const IMM7: u32>(a: v4u64, b: v4i64) -> v4u64 { + static_assert_uimm_bits!(IMM7, 7); + __lasx_xvssrani_du_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrarni_b_h<const IMM4: u32>(a: v32i8, b: v32i8) -> v32i8 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvssrarni_b_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrarni_h_w<const IMM5: u32>(a: v16i16, b: v16i16) -> v16i16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvssrarni_h_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrarni_w_d<const IMM6: u32>(a: v8i32, b: v8i32) -> v8i32 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvssrarni_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrarni_d_q<const IMM7: u32>(a: v4i64, b: v4i64) -> v4i64 { + static_assert_uimm_bits!(IMM7, 7); + __lasx_xvssrarni_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrarni_bu_h<const IMM4: u32>(a: v32u8, b: v32i8) -> v32u8 { + static_assert_uimm_bits!(IMM4, 4); + __lasx_xvssrarni_bu_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrarni_hu_w<const IMM5: u32>(a: v16u16, b: v16i16) -> v16u16 { + static_assert_uimm_bits!(IMM5, 5); + __lasx_xvssrarni_hu_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrarni_wu_d<const IMM6: u32>(a: v8u32, b: v8i32) -> v8u32 { + static_assert_uimm_bits!(IMM6, 6); + __lasx_xvssrarni_wu_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvssrarni_du_q<const IMM7: u32>(a: v4u64, b: v4i64) -> v4u64 { + static_assert_uimm_bits!(IMM7, 7); + __lasx_xvssrarni_du_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xbnz_b(a: v32u8) -> i32 { + __lasx_xbnz_b(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xbnz_d(a: v4u64) -> i32 { + __lasx_xbnz_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xbnz_h(a: v16u16) -> i32 { + __lasx_xbnz_h(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xbnz_v(a: v32u8) -> i32 { + __lasx_xbnz_v(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xbnz_w(a: v8u32) -> i32 { + __lasx_xbnz_w(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xbz_b(a: v32u8) -> i32 { + __lasx_xbz_b(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xbz_d(a: v4u64) -> i32 { + __lasx_xbz_d(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xbz_h(a: v16u16) -> i32 { + __lasx_xbz_h(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xbz_v(a: v32u8) -> i32 { + __lasx_xbz_v(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xbz_w(a: v8u32) -> i32 { + __lasx_xbz_w(a) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_caf_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_caf_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_caf_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_caf_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_ceq_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_ceq_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_ceq_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_ceq_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cle_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_cle_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cle_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_cle_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_clt_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_clt_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_clt_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_clt_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cne_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_cne_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cne_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_cne_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cor_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_cor_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cor_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_cor_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cueq_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_cueq_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cueq_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_cueq_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cule_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_cule_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cule_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_cule_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cult_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_cult_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cult_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_cult_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cun_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_cun_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cune_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_cune_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cune_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_cune_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_cun_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_cun_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_saf_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_saf_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = 
"stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_saf_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_saf_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_seq_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_seq_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_seq_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_seq_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_sle_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_sle_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_sle_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_sle_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_slt_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_slt_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_slt_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_slt_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_sne_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_sne_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_sne_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_sne_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_sor_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_sor_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_sor_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_sor_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_sueq_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_sueq_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_sueq_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_sueq_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_sule_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_sule_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_sule_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_sule_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_sult_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_sult_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_sult_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_sult_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] 
+pub unsafe fn lasx_xvfcmp_sun_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_sun_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_sune_d(a: v4f64, b: v4f64) -> v4i64 { + __lasx_xvfcmp_sune_d(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_sune_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_sune_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvfcmp_sun_s(a: v8f32, b: v8f32) -> v8i32 { + __lasx_xvfcmp_sun_s(a, b) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickve_d_f<const IMM2: u32>(a: v4f64) -> v4f64 { + static_assert_uimm_bits!(IMM2, 2); + __lasx_xvpickve_d_f(a, IMM2) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvpickve_w_f<const IMM3: u32>(a: v8f32) -> v8f32 { + static_assert_uimm_bits!(IMM3, 3); + __lasx_xvpickve_w_f(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvrepli_b<const IMM_S10: i32>() -> v32i8 { + static_assert_simm_bits!(IMM_S10, 10); + __lasx_xvrepli_b(IMM_S10) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvrepli_d<const IMM_S10: i32>() -> v4i64 { + static_assert_simm_bits!(IMM_S10, 10); + __lasx_xvrepli_d(IMM_S10) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvrepli_h<const IMM_S10: i32>() -> v16i16 { + static_assert_simm_bits!(IMM_S10, 10); + __lasx_xvrepli_h(IMM_S10) +} + +#[inline] +#[target_feature(enable = "lasx")] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lasx_xvrepli_w<const IMM_S10: i32>() -> v8i32 { + static_assert_simm_bits!(IMM_S10, 10); + __lasx_xvrepli_w(IMM_S10) +} diff --git a/library/stdarch/crates/core_arch/src/loongarch64/lasx/mod.rs b/library/stdarch/crates/core_arch/src/loongarch64/lasx/mod.rs new file mode 100644 index 0000000000000..c3a244e740e9f --- /dev/null +++ b/library/stdarch/crates/core_arch/src/loongarch64/lasx/mod.rs @@ -0,0 +1,21 @@ +//! LoongArch64 LASX intrinsics + +#![allow(non_camel_case_types)] + +#[rustfmt::skip] +mod types; + +#[rustfmt::skip] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub use self::types::*; + +#[rustfmt::skip] +mod generated; + +#[rustfmt::skip] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub use self::generated::*; + +#[rustfmt::skip] +#[cfg(test)] +mod tests; diff --git a/library/stdarch/crates/core_arch/src/loongarch64/lasx/tests.rs b/library/stdarch/crates/core_arch/src/loongarch64/lasx/tests.rs new file mode 100644 index 0000000000000..54771d7b51109 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/loongarch64/lasx/tests.rs @@ -0,0 +1,14758 @@ +// This code is automatically generated. DO NOT MODIFY.
+// See crates/stdarch-gen-loongarch/README.md + +use crate::{ + core_arch::{loongarch64::*, simd::*}, + mem::transmute, +}; +use stdarch_test::simd_test; + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsll_b() { + let a = i8x32::new( + -111, -98, 47, -106, -82, -72, -70, 0, 110, -61, -20, 36, 41, -103, 42, 95, 15, -11, + -25, -5, 40, -63, 56, -39, 43, 127, 86, 75, -48, -32, 72, 69, + ); + let b = i8x32::new( + 64, -127, -78, 84, -102, -98, 45, 43, -78, -108, 25, 29, -65, 91, 36, 33, 61, 47, 69, + -59, -10, 108, 121, -25, -125, 62, -69, 74, 121, -89, -57, 75, + ); + let r = i64x4::new( + 18015190406413457, + -4710544755986517832, + -9191829245651812128, + 2882304449461665880, + ); + + assert_eq!(r, transmute(lasx_xvsll_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsll_h() { + let a = i16x16::new( + 4856, -12188, 28154, -30840, -28949, 18688, -15524, 15161, 5118, 9078, -28997, 27522, + 32276, -26448, -5994, -10720, + ); + let b = i16x16::new( + -489, 29679, -21849, 9497, -19660, -26644, 7745, 5176, 4522, 9574, -4384, 20128, 7874, + -19019, -3312, -26556, + ); + let r = i64x4::new( + 1153199681048706048, + 4107430984994057904, + 7746911246556919808, + 7061899947028838480, + ); + + assert_eq!(r, transmute(lasx_xvsll_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsll_w() { + let a = i32x8::new( + 1510216636, + 213576479, + 1189254660, + -1355467453, + 1294786218, + -1710122153, + -615586704, + -1571284743, + ); + let b = i32x8::new( + -529192780, + 352003269, + -770638911, + 706076772, + -1938691801, + -1503291372, + -471620902, + 769195345, + ); + let r = i64x4::new( + -7539760386422079488, + -913293731912406008, + -5372794352929123072, + 3598939055443673088, + ); + + assert_eq!(r, transmute(lasx_xvsll_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsll_d() { + let a = i64x4::new( + 5587460212497087617, + 8474749651444529729, + 1738438059605040390, + -4067680789859467618, + ); + let b = i64x4::new( + 6741938213225194797, + 5195523862780666814, + -3609057746391313602, + 4479859630248272682, + ); + let r = i64x4::new( + -8101940545267433472, + 4611686018427387904, + -9223372036854775808, + -289787284616642560, + ); + + assert_eq!(r, transmute(lasx_xvsll_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslli_b() { + let a = i8x32::new( + -94, -3, -119, 48, 100, -37, 40, -38, -29, -51, 88, 4, -25, -114, 55, 88, 100, 38, 83, + 104, -128, 126, -102, 105, 5, -72, 101, 124, 38, -108, 10, -44, + ); + let r = i64x4::new( + 7539145145172948104, + 6979515765458220172, + -6599752572338399088, + 5775955139904200724, + ); + + assert_eq!(r, transmute(lasx_xvslli_b::<2>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslli_h() { + let a = i16x16::new( + -28940, -25950, 22837, -4210, -14698, -22498, 27809, 10311, -17231, 19306, 6966, 1632, + -29260, 23078, 2703, -10254, + ); + let r = i64x4::new( + -9223301665963114496, + -4611615647535693824, + 140739635855360, + -9223160928474759168, + ); + + assert_eq!(r, transmute(lasx_xvslli_h::<14>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslli_w() { + let a = i32x8::new( + 1994019050, + -2143307169, + -1465670605, + -1894478348, + 307662278, + 836483069, + 412058602, + -1025645846, + ); + let r = i64x4::new( + 6845471437529022464, + -864691127599497216, + -216172778791895040, + -1585267064908546048, + ); + + assert_eq!(r, 
transmute(lasx_xvslli_w::<24>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslli_d() { + let a = i64x4::new( + 4336457422713836724, + 8560628373228459557, + 7599406461945619908, + -8194824695476258169, + ); + let r = i64x4::new( + -9223372036854775808, + -6917529027641081856, + -9223372036854775808, + -2305843009213693952, + ); + + assert_eq!(r, transmute(lasx_xvslli_d::<61>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsra_b() { + let a = i8x32::new( + 52, 91, -50, -85, -69, -95, -127, 8, 86, -4, -99, 72, 8, -14, 107, -97, -44, 105, 87, + -117, -90, 118, 127, -106, 77, -92, -40, -82, -12, -112, -67, -118, + ); + let b = i8x32::new( + 27, 13, -111, 16, -29, 45, -40, 67, -68, 121, -101, -38, 25, -121, 103, 74, 99, 16, + -21, 6, 56, -24, 30, -89, 114, -108, -46, 9, 2, 53, 100, -76, + ); + let r = i64x4::new( + 108647106216395270, + -1801159457985266171, + -71645659462473222, + -505532365968836077, + ); + + assert_eq!(r, transmute(lasx_xvsra_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsra_h() { + let a = i16x16::new( + -13251, -24270, -27793, -1924, -989, 12103, 27324, 24449, 18911, 19481, -8980, 16617, + 28550, -13690, -1971, 3939, + ); + let b = i16x16::new( + -21726, 27818, 27200, -20739, -19045, -6458, 30141, -312, -15113, -30000, 21700, 17092, + 14409, 3061, -14681, 20631, + ); + let r = i64x4::new( + -119365732601073, + 26740135684866047, + 292450088307458195, + 8725659825471543, + ); + + assert_eq!(r, transmute(lasx_xvsra_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsra_w() { + let a = i32x8::new( + -1962976084, + -1947195007, + -955995895, + -845185028, + 679708613, + -1609457592, + 2012287263, + -279940829, + ); + let b = i32x8::new( + 763303798, + 231194360, + 470062549, + -1292464267, + -359409273, + 1320465704, + -1970959884, + -137912049, + ); + let r = i64x4::new( + -498216206805, + -1730871820744, + -27002218866473201, + -36696200575105, + ); + + assert_eq!(r, transmute(lasx_xvsra_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsra_d() { + let a = i64x4::new( + 1051630801678824769, + -4354070504513252833, + -43346970620111970, + 8876173186758680051, + ); + let b = i64x4::new( + 3011489794605089083, + -9183865802690171879, + 1530248905177224378, + -4896156283978786540, + ); + let r = i64x4::new(1, -129761412875, -1, 8464978396185); + + assert_eq!(r, transmute(lasx_xvsra_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrai_b() { + let a = i8x32::new( + 46, 37, 112, -119, 96, -75, 53, -50, 100, 120, 90, 18, 32, 73, 63, 27, 73, 42, 111, + -33, 12, 3, 108, 70, -108, 97, 15, -88, -9, 32, -126, -58, + ); + let r = i64x4::new( + -287109943871995390, + 72906425621612294, + 289919230257005060, + -218421283493247239, + ); + + assert_eq!(r, transmute(lasx_xvsrai_b::<4>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrai_h() { + let a = i16x16::new( + -30922, -13998, -8176, -18755, 11883, -28383, 17428, 4209, 30936, -20707, -28809, + -5893, 6072, 26622, -29177, 17463, + ); + let r = i64x4::new(-281474976710658, 8589803520, -4295098367, 562941363552256); + + assert_eq!(r, transmute(lasx_xvsrai_h::<14>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrai_w() { + let a = i32x8::new( + -751445431, + 2057508448, + -2111778568, + -33537291, + -1895386689, + 499743663, + 521751715, + -784629424, + ); + 
let r = i64x4::new(68719476730, -16, 17179869169, -25769803773); + + assert_eq!(r, transmute(lasx_xvsrai_w::<27>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrai_d() { + let a = i64x4::new( + -1330027126485395847, + 2853839147873904128, + -6472260273666122769, + -8461705224280067242, + ); + let r = i64x4::new(-2, 2, -6, -8); + + assert_eq!(r, transmute(lasx_xvsrai_d::<60>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrar_b() { + let a = i8x32::new( + -45, 43, -69, -26, -38, 7, -79, 41, -6, -94, 1, 62, -82, -97, -39, 124, -99, 0, -23, + 12, 74, 16, -39, -15, -15, 31, -87, -124, -112, -39, 102, 7, + ); + let b = i8x32::new( + 7, 68, -10, -95, -30, 74, -78, -17, -99, 98, 98, 80, -128, -62, 119, -13, 7, 92, -80, + 88, -70, -115, 81, 99, 110, 14, 7, -60, -89, -109, 97, 81, + ); + let r = i64x4::new( + 66431358477468416, + 1153177339669047552, + -77404437262827265, + 302862676776648704, + ); + + assert_eq!(r, transmute(lasx_xvsrar_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrar_h() { + let a = i16x16::new( + 9840, 12527, -16657, 1341, 1073, -31572, -646, 17766, -16172, -9625, -27578, -20296, + -9439, 19781, 4269, -7939, + ); + let b = i16x16::new( + 29495, 11395, -1796, 26363, 26559, -12537, -23906, 29853, -17327, 20486, -24193, 16816, + -26916, 11389, 8615, 25146, + ); + let r = i64x4::new( + 562932876181581, + 562954232201216, + -5712534652352536470, + -2251658079567874, + ); + + assert_eq!(r, transmute(lasx_xvsrar_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrar_w() { + let a = i32x8::new( + 1944832391, + -1034950307, + -1451047471, + 1427692017, + -938846690, + 1764815474, + -1610593481, + -198860459, + ); + let b = i32x8::new( + -1327964835, + 1934527229, + -13271412, + 1797333888, + 1389622833, + -155405641, + -1581591786, + 335424649, + ); + let r = i64x4::new( + -8589934588, + 6131890526069889068, + 906238092293, + -1668156707832192, + ); + + assert_eq!(r, transmute(lasx_xvsrar_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrar_d() { + let a = i64x4::new( + 5484150993813900402, + 9102605893479197027, + -7628992365150862705, + 407230793930236127, + ); + let b = i64x4::new( + 5977319318978215334, + 4512528532199919670, + 6381392913686620354, + 5222959627777138290, + ); + let r = i64x4::new(19951225, 505, -1907248091287715676, 362); + + assert_eq!(r, transmute(lasx_xvsrar_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrari_b() { + let a = i8x32::new( + 109, 3, -113, -66, 80, 8, -16, -45, 106, 9, 96, 53, 102, 6, -51, -120, -121, -94, -127, + -109, 70, 112, 57, -43, -72, 63, -113, -113, 93, 124, -71, 81, + ); + let r = i64x4::new( + -360849773505150962, + -1010494010926825203, + -358302209459292943, + 790117907428411639, + ); + + assert_eq!(r, transmute(lasx_xvsrari_b::<3>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrari_h() { + let a = i16x16::new( + 10070, -32733, -17965, 31244, -29243, 6071, -3241, 7927, -285, 21152, -3903, 3660, + 13839, -14765, -18197, -22466, + ); + let r = i64x4::new( + 34621125774278695, + 9007143421804430, + 4222060231655423, + -24488623625338826, + ); + + assert_eq!(r, transmute(lasx_xvsrari_h::<8>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrari_w() { + let a = i32x8::new( + -500433597, + -325248258, + -1000460213, + 209976326, + -903490350, + 
-314707005, + -503879914, + -356101505, + ); + let r = i64x4::new(-1, 4294967294, -2, -1); + + assert_eq!(r, transmute(lasx_xvsrari_w::<29>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrari_d() { + let a = i64x4::new( + -3633983878249405921, + 5383874963092799521, + -4872778697398942371, + -2386944079627506318, + ); + let r = i64x4::new(-3228, 4782, -4328, -2120); + + assert_eq!(r, transmute(lasx_xvsrari_d::<50>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrl_b() { + let a = i8x32::new( + -118, -38, -124, -54, 98, -128, 79, -36, 103, -128, -88, -49, -98, 60, 2, -59, -16, -4, + 27, 59, 105, 95, -37, -72, -110, 11, 75, 114, -49, 90, -21, -35, + ); + let b = i8x32::new( + 98, -9, -55, 119, -93, -49, 14, 102, 104, -92, 48, 65, 46, 102, -33, -36, -80, -60, -4, + 56, 90, -121, 20, -53, -94, -28, -92, 39, 83, -100, -7, 114, + ); + let r = i64x4::new( + 216455408162832674, + 864691138784135271, + 1660983950228656112, + 3996105849293766692, + ); + + assert_eq!(r, transmute(lasx_xvsrl_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrl_h() { + let a = i16x16::new( + 10972, 24562, -12521, 26207, -104, -22440, -71, 23995, 14056, -10640, 15949, -18599, + 29813, -7756, 7950, -20154, + ); + let b = i16x16::new( + 7336, 20691, 12756, -11763, -7124, -20665, 2106, -26250, -26129, 24711, -15979, 11749, + -21358, -26257, -4616, 7882, + ); + let r = i64x4::new( + 858654357979178, + 105271911894745103, + 412644454779584512, + 12385032119328029, + ); + + assert_eq!(r, transmute(lasx_xvsrl_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrl_w() { + let a = i32x8::new( + -1772037605, + -1212681339, + 176585315, + -732660743, + -1822623484, + 992734189, + 1682031435, + 1636125097, + ); + let b = i32x8::new( + -938134804, + -1078907146, + -307437339, + -1035019720, + 338751406, + 1059144383, + -1414917923, + -363001284, + ); + let r = i64x4::new(3152506611213, 910538585043, 150899, 25769803779); + + assert_eq!(r, transmute(lasx_xvsrl_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrl_d() { + let a = i64x4::new( + 6435451644778058510, + -9196847159082085602, + -5149048879671131155, + 1388424134264678769, + ); + let b = i64x4::new( + -6322302375543819270, + -4446153186867162446, + -4228232340343120478, + 228185722174108108, + ); + let r = i64x4::new(22, 8215, 774027732, 338970735904462); + + assert_eq!(r, transmute(lasx_xvsrl_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrli_b() { + let a = i8x32::new( + 66, -19, -15, 83, -53, -81, -93, -68, -103, 77, 25, 65, 20, 104, -81, 127, -82, -32, + -11, 48, -83, -94, -74, 5, -117, -34, -28, 19, 13, -40, 68, 51, + ); + let r = i64x4::new( + -4853842685553676990, + 9200686999942024601, + 411695280685441198, + 3694315145030590091, + ); + + assert_eq!(r, transmute(lasx_xvsrli_b::<0>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrli_h() { + let a = i16x16::new( + -5451, -9527, 6137, -13536, -13439, 10877, -29799, 719, -28662, 31471, 20011, 1521, + 1386, -27895, 10040, 24311, + ); + let r = i64x4::new(7036883009470493, 73014771737, 38655688722, 3096241924866048); + + assert_eq!(r, transmute(lasx_xvsrli_h::<11>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrli_w() { + let a = i32x8::new( + -1988432857, + -1485450469, + -951392465, + -21616344, + 741104373, + -605174159, + -393417893, 
+ 356142399, + ); + let r = i64x4::new( + 92058329040061, + 140028818776997, + 120903329388054, + 11669426172998, + ); + + assert_eq!(r, transmute(lasx_xvsrli_w::<17>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrli_d() { + let a = i64x4::new( + 8921700513621232732, + 1019177465435556626, + 2713436842570698733, + -3430716780195672879, + ); + let r = i64x4::new(16617962184, 1898365962, 5054169972, 27969530398); + + assert_eq!(r, transmute(lasx_xvsrli_d::<29>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlr_b() { + let a = i8x32::new( + -109, 126, -8, -44, -19, -72, -121, -116, 21, 24, -60, 73, 76, 95, -106, -89, 56, -82, + -93, 112, -38, -24, -39, -57, -106, -17, -14, 31, 116, 16, 47, 122, + ); + let b = i8x32::new( + 50, -60, 62, 57, -113, -30, -127, -21, -61, -84, -32, -113, -114, 1, 55, -73, 71, -95, + 8, -8, 28, 55, -59, -118, 89, 87, -10, 63, 2, 67, 25, 62, + ); + let r = i64x4::new( + 1316227579002488869, + 72391849897361923, + 3604852287775921920, + 150872911094481483, + ); + + assert_eq!(r, transmute(lasx_xvsrlr_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlr_h() { + let a = i16x16::new( + -18779, 7604, 13987, 29727, 8545, 14399, -23049, 5564, 17277, 27629, -24885, 8060, + -12999, 4495, 32293, -31802, + ); + let b = i16x16::new( + -19412, 3296, -29433, -25702, 19528, -23288, 18964, -13600, -11805, -27841, 14324, + 17650, -2, 18151, -24330, -10882, + ); + let r = i64x4::new( + 8163242974380043, + 1566138173559930913, + 567182991583938672, + 565118914199555, + ); + + assert_eq!(r, transmute(lasx_xvsrlr_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlr_w() { + let a = i32x8::new( + -1025998507, + -796106787, + 2021600494, + 398315156, + 965338474, + -828271652, + -102077533, + -995359010, + ); + let b = i32x8::new( + -2089285463, + 264222581, + -1942623583, + -928385941, + -1125618647, + -149370823, + -1786649473, + -1080417791, + ); + let r = i64x4::new( + 7164011834433, + 835329200199287, + 442383516915, + 7085854838990307330, + ); + + assert_eq!(r, transmute(lasx_xvsrlr_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlr_d() { + let a = i64x4::new( + 2027979514153200323, + 4238639346117886861, + -2310491845939102950, + -4959482478857813602, + ); + let b = i64x4::new( + 965489361978698802, + 4289858003677505067, + -4742704455438896809, + -8773295883299999969, + ); + let r = i64x4::new(1801, 481878, 1923591164085, 6280495597); + + assert_eq!(r, transmute(lasx_xvsrlr_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlri_b() { + let a = i8x32::new( + -73, -25, 49, -12, -91, -46, 0, -44, 48, -66, 31, -39, 50, -103, -78, -38, -126, -47, + -3, 84, 54, 112, -106, -46, 71, 28, 47, 27, -56, -119, -101, -95, + ); + let r = i64x4::new( + 3819110935244323374, + 3975875884220952588, + 3829779379936769057, + 2893318883870770962, + ); + + assert_eq!(r, transmute(lasx_xvsrlri_b::<2>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlri_h() { + let a = i16x16::new( + 6309, -29611, -25831, -4246, 15159, 10847, 16953, 29221, 6201, 24789, -30798, -15953, + 15706, -1900, 10475, -5507, + ); + let r = i64x4::new( + 33777332217315340, + 16044215407804446, + 27303364801855500, + 32932658182619167, + ); + + assert_eq!(r, transmute(lasx_xvsrlri_h::<9>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlri_w() { 
+ let a = i32x8::new( + 828273676, + -644812120, + -857187805, + -176164509, + 981336800, + 1382840349, + -1522792930, + -176015403, + ); + let r = i64x4::new(8589934592, 8589934594, 4294967296, 8589934593); + + assert_eq!(r, transmute(lasx_xvsrlri_w::<31>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlri_d() { + let a = i64x4::new( + -5793930330848080801, + 3293244781940700302, + 1069657060216154101, + -5794364669081104952, + ); + let r = i64x4::new( + 197700214732210481, + 51456949717823442, + 16713391565877408, + 197693428197319479, + ); + + assert_eq!(r, transmute(lasx_xvsrlri_d::<6>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitclr_b() { + let a = u8x32::new( + 190, 161, 30, 161, 194, 88, 175, 219, 144, 202, 22, 193, 212, 153, 191, 196, 137, 221, + 106, 10, 16, 144, 31, 238, 61, 152, 213, 196, 195, 243, 50, 92, + ); + let b = u8x32::new( + 11, 9, 78, 66, 137, 176, 138, 254, 176, 67, 163, 134, 131, 97, 153, 72, 134, 128, 41, + 58, 184, 249, 6, 26, 185, 60, 185, 181, 44, 38, 89, 238, + ); + let r = i64x4::new( + -7229587192453094986, + -4270087733699493232, + -1576382945987863415, + 2031321085346416701, + ); + + assert_eq!(r, transmute(lasx_xvbitclr_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitclr_h() { + let a = u16x16::new( + 7799, 9627, 56384, 27998, 4661, 64335, 54264, 6382, 47409, 49178, 38272, 57390, 35004, + 32388, 62552, 35760, + ); + let b = u16x16::new( + 5291, 30357, 59434, 46615, 64011, 9844, 17102, 63063, 12386, 31313, 20554, 38159, + 54802, 37529, 18767, 51367, + ); + let r = i64x4::new( + 7880974167965374071, + 1760507201925878325, + 6930636858734459185, + -8417099780160452424, + ); + + assert_eq!(r, transmute(lasx_xvbitclr_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitclr_w() { + let a = u32x8::new( + 4257127193, 1617538994, 1062231453, 1690763623, 2766967375, 2604092619, 3654495562, + 101565771, + ); + let b = u32x8::new( + 1233687892, 2875139141, 3243465390, 3012934629, 2446741029, 1858096423, 3334422766, + 437336695, + ); + let r = i64x4::new( + 6947276946051865369, + 7261774329674735005, + -7262251986338409905, + 436221668492520778, + ); + + assert_eq!(r, transmute(lasx_xvbitclr_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitclr_d() { + let a = u64x4::new( + 16927321994427904653, + 2683926075985226749, + 16958486450995068185, + 3668272799860716893, + ); + let b = u64x4::new( + 15133760811038045272, + 12911195625023626617, + 15656282835364509484, + 1632666566472745103, + ); + let r = i64x4::new( + -1519422079281646963, + 2683926075985226749, + -1488257622714483431, + 3668272799860684125, + ); + + assert_eq!(r, transmute(lasx_xvbitclr_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitclri_b() { + let a = u8x32::new( + 141, 68, 55, 244, 88, 222, 227, 17, 167, 11, 144, 254, 176, 224, 143, 139, 254, 1, 83, + 117, 181, 160, 142, 4, 179, 103, 107, 27, 186, 98, 203, 106, + ); + let r = i64x4::new( + 1271033348788520077, + -8390310899796145241, + 328376522984587710, + 3065582154070828979, + ); + + assert_eq!(r, transmute(lasx_xvbitclri_b::<6>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitclri_h() { + let a = u16x16::new( + 38228, 2400, 61493, 22229, 35926, 42301, 55100, 57087, 23321, 21128, 18634, 59029, + 56405, 24055, 11367, 27455, + ); + let r = i64x4::new( + 6257171367882429780, + 
-2378508372711469996, + -1831477648240911591, + 7727381349517352021, + ); + + assert_eq!(r, transmute(lasx_xvbitclri_h::<1>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitclri_w() { + let a = u32x8::new( + 4093464829, 3397035519, 3710215001, 425447773, 2028980386, 1200168081, 1687167090, + 2988462494, + ); + let r = i64x4::new( + -8468273631661829891, + 1827284273827504985, + 542996640125929634, + -5611395396043530126, + ); + + assert_eq!(r, transmute(lasx_xvbitclri_w::<30>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitclri_d() { + let a = u64x4::new( + 11636830919927548139, + 10182450295979110848, + 14581196067604683625, + 18383675221698776393, + ); + let r = i64x4::new( + -6809983522526181141, + -8264364146474618432, + -3865618374849045655, + -63139220754952887, + ); + + assert_eq!(r, transmute(lasx_xvbitclri_d::<46>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitset_b() { + let a = u8x32::new( + 225, 92, 109, 112, 17, 10, 26, 83, 15, 81, 108, 14, 45, 110, 122, 43, 4, 150, 103, 97, + 111, 130, 134, 212, 62, 58, 9, 2, 56, 158, 26, 145, + ); + let b = u8x32::new( + 52, 116, 92, 53, 153, 232, 239, 116, 224, 124, 185, 146, 220, 6, 151, 66, 61, 170, 93, + 190, 38, 252, 85, 37, 106, 174, 206, 83, 194, 190, 144, 114, + ); + let r = i64x4::new( + 6024139629681007857, + 3457196872474448143, + -817805275247962588, + -7702318388235109826, + ); + + assert_eq!(r, transmute(lasx_xvbitset_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitset_h() { + let a = u16x16::new( + 17259, 49211, 15974, 6099, 8663, 62383, 26831, 38552, 3409, 2195, 20043, 5352, 3983, + 31516, 6274, 5947, + ); + let b = u16x16::new( + 53731, 18053, 52835, 11975, 35791, 12348, 45618, 26117, 33156, 26353, 49938, 43656, + 36487, 64856, 49663, 56384, + ); + let r = i64x4::new( + 1716784528350724971, + -7586198329949707817, + 1578597770746596689, + 1674099372676878223, + ); + + assert_eq!(r, transmute(lasx_xvbitset_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitset_w() { + let a = u32x8::new( + 2021234591, 3371814330, 3553513799, 494005311, 250094477, 2516669349, 1444421180, + 3141613342, + ); + let b = u32x8::new( + 3030677440, 3512547286, 2983366759, 1926382844, 3455887892, 2988190229, 2851051202, + 575886239, + ); + let r = i64x4::new( + -3964911796154165345, + 2121736658348822983, + -7628724325403057267, + -4953617511697867204, + ); + + assert_eq!(r, transmute(lasx_xvbitset_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitset_d() { + let a = u64x4::new( + 13787459408145721576, + 16595537902770630413, + 7409136402519495190, + 8641001130845153939, + ); + let b = u64x4::new( + 5192067677796360406, + 648800965073738257, + 18042109477292491586, + 15371630372089390212, + ); + let r = i64x4::new( + -4659284665559635736, + -1851206170938790131, + 7409136402519495190, + 8641001130845153939, + ); + + assert_eq!(r, transmute(lasx_xvbitset_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitseti_b() { + let a = u8x32::new( + 119, 80, 249, 199, 113, 106, 84, 111, 190, 194, 53, 9, 139, 230, 49, 32, 150, 255, 16, + 235, 219, 105, 54, 143, 119, 37, 74, 94, 47, 119, 97, 78, + ); + let r = i64x4::new( + -1165048079419059977, + -6867454469778062658, + -8091022549765259370, + -3539275497407339017, + ); + + assert_eq!(r, transmute(lasx_xvbitseti_b::<7>(transmute(a)))); +} + 
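Every generated test above follows the same recipe: build the operands lane by lane, call the intrinsic through `transmute`, and compare the result reinterpreted as `i64x4`. The same recipe can be written by hand when the expected lanes are easier to state directly. A sketch (not part of the generated suite, with a hypothetical test name) that exercises `lasx_xvbitseti_b::<7>` on all-ones input bytes, assuming `xvbitseti.b` ORs `1 << IMM` into every byte, which is consistent with the generated data above:

```rust
// Sketch only: hand-written variant of the generated checking pattern.
#[simd_test(enable = "lasx")]
unsafe fn test_lasx_xvbitseti_b_all_ones() {
    // Every input byte is 1; setting bit 7 should give 0x81 in every lane.
    let a: v32u8 = transmute([1u8; 32]);
    let r: [u8; 32] = transmute(lasx_xvbitseti_b::<7>(a));
    assert_eq!(r, [0x81u8; 32]);
}
```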
+#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitseti_h() { + let a = u16x16::new( + 3428, 49184, 29775, 38443, 2320, 51224, 40616, 46501, 26758, 21099, 57944, 43971, + 47859, 19503, 41964, 61802, + ); + let r = i64x4::new( + -5320030648396665500, + -5357666549029656304, + -6069759003260655482, + -1050847327214912781, + ); + + assert_eq!(r, transmute(lasx_xvbitseti_h::<13>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitseti_w() { + let a = u32x8::new( + 3638204102, 2069373672, 3681483208, 2380952857, 3881087295, 2378927021, 1601131765, + 3307909931, + ); + let r = i64x4::new( + 8887892248618505926, + -5914786406144738872, + -5923487305849065153, + -1933536090599238411, + ); + + assert_eq!(r, transmute(lasx_xvbitseti_w::<29>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitseti_d() { + let a = u64x4::new( + 9060047554002173201, + 464447178838056277, + 7020364402684265679, + 7640056937583456779, + ); + let r = i64x4::new( + 9060047554002173201, + 464447178838056277, + 7020364402684265679, + 7640056937583456779, + ); + + assert_eq!(r, transmute(lasx_xvbitseti_d::<17>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitrev_b() { + let a = u8x32::new( + 86, 45, 120, 26, 67, 111, 181, 110, 186, 247, 233, 56, 217, 245, 220, 182, 112, 159, + 77, 122, 167, 75, 37, 185, 177, 18, 190, 215, 60, 13, 253, 99, + ); + let b = u8x32::new( + 147, 78, 169, 66, 243, 63, 20, 253, 87, 88, 137, 49, 21, 0, 154, 117, 112, 42, 28, 48, + 22, 139, 165, 183, 96, 228, 17, 98, 218, 192, 92, 92, + ); + let r = i64x4::new( + 5667198812028562782, + -7577037021778282950, + 4108764896531684209, + 8353346322052154032, + ); + + assert_eq!(r, transmute(lasx_xvbitrev_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitrev_h() { + let a = u16x16::new( + 44834, 48985, 47421, 26123, 36975, 54201, 35400, 17963, 44073, 49622, 17677, 24094, + 34507, 53208, 48965, 4380, + ); + let b = u16x16::new( + 3119, 5355, 43390, 6709, 8036, 22161, 7944, 37786, 31676, 17612, 21999, 1550, 37643, + 51935, 23672, 51448, + ); + let r = i64x4::new( + 7362252059331604258, + 4768057775407992959, + 2170388733584915497, + 1161012008856358603, + ); + + assert_eq!(r, transmute(lasx_xvbitrev_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitrev_w() { + let a = u32x8::new( + 1780458127, 1583179777, 1403171735, 3038008548, 1551651469, 1192480700, 40883360, + 521408888, + ); + let b = u32x8::new( + 2551625282, 692446886, 1507542621, 1654251513, 25012964, 1671838513, 1315668038, + 3268446736, + ); + let r = i64x4::new( + 6799705642561938059, + -5254481525065206889, + 5121102659209417373, + 2239715596821320928, + ); + + assert_eq!(r, transmute(lasx_xvbitrev_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitrev_d() { + let a = u64x4::new( + 3534178575908999157, + 3435592769216332161, + 6355029412175758040, + 10622443384676276507, + ); + let b = u64x4::new( + 765862270911233836, + 2594415241338312820, + 11114879593910781230, + 15091508809743360642, + ); + let r = i64x4::new( + 3534196168095043573, + 3440096368843702657, + 6355099780919935704, + -7824300689033275105, + ); + + assert_eq!(r, transmute(lasx_xvbitrev_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitrevi_b() { + let a = u8x32::new( + 112, 47, 201, 157, 172, 239, 255, 219, 200, 1, 134, 120, 144, 4, 15, 114, 35, 84, 237, + 118, 244, 43, 
132, 135, 32, 116, 216, 122, 83, 233, 95, 217, + ); + let r = i64x4::new( + -297290846994624688, + 5921992374835618280, + -6366950966577761277, + -468434338938596352, + ); + + assert_eq!(r, transmute(lasx_xvbitrevi_b::<5>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitrevi_h() { + let a = u16x16::new( + 32769, 5307, 42421, 62367, 28539, 63062, 1989, 15130, 7026, 1542, 27332, 53533, 17199, + 28761, 1428, 12804, + ); + let r = i64x4::new( + -315342455509907455, + 3682272988378851195, + -2801974798966189198, + 4180481285432101679, + ); + + assert_eq!(r, transmute(lasx_xvbitrevi_h::<11>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitrevi_w() { + let a = u32x8::new( + 4260813560, 2237147704, 787609405, 2632090994, 1944569031, 3636389111, 844354358, + 3691914548, + ); + let r = i64x4::new( + -4226581827093603592, + -2530313314094680259, + -7440257783990598457, + -7201777846932221130, + ); + + assert_eq!(r, transmute(lasx_xvbitrevi_w::<30>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitrevi_d() { + let a = u64x4::new( + 5820240183830393881, + 7908556960014755456, + 17094377170254219540, + 17105994065815884924, + ); + let r = i64x4::new( + 5820240183863948313, + 7908556959981201024, + -1352366903421777644, + -1340750007927221124, + ); + + assert_eq!(r, transmute(lasx_xvbitrevi_d::<25>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvadd_b() { + let a = i8x32::new( + -63, 97, -109, 57, -109, 103, -19, 65, 57, -37, 32, 5, -97, -108, 12, -61, -91, 104, + -2, 65, -41, -85, -54, 104, 40, -13, 78, 80, 75, -33, -121, -67, + ); + let b = i8x32::new( + -32, -51, 9, 94, 98, 84, -101, -90, -24, -111, 104, -25, 112, -85, 87, -10, -90, -59, + 96, -43, -67, 16, -8, 83, 126, -13, 58, 116, 73, -90, 6, 67, + ); + let r = i64x4::new( + -1762952590630572383, + -5088153816373105631, + -4917161598430335669, + 39834845715162790, + ); + + assert_eq!(r, transmute(lasx_xvadd_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvadd_h() { + let a = i16x16::new( + 19227, 23953, -4654, -5363, 31202, 4004, -2636, 15810, -18448, 29154, -23642, -23324, + 23716, 21938, -17499, -1447, + ); + let b = i16x16::new( + 10023, 12046, -30915, -30883, 29754, 22142, -11854, 5774, 8790, 19058, -32113, 4500, + 17933, 13821, 19847, 13830, + ); + let r = i64x4::new( + 8244530777499333186, + 6075575139936955932, + -5298442949366588858, + 3485514723534807729, + ); + + assert_eq!(r, transmute(lasx_xvadd_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvadd_w() { + let a = i32x8::new( + 130061221, + 1238983557, + 1050069092, + -1831874224, + -377156607, + 1147824901, + -1862271997, + 91173942, + ); + let b = i32x8::new( + 683768234, + -1042445407, + -327184682, + -1513884019, + 347904368, + 886761024, + -1570339601, + 13462118, + ); + let r = i64x4::new( + 844124927480171855, + 4076821840425015098, + 8738480013042623857, + 449408456544649458, + ); + + assert_eq!(r, transmute(lasx_xvadd_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvadd_d() { + let a = i64x4::new( + -3908230933439843201, + -2965012514388925511, + -336128270114892540, + -637330020659137335, + ); + let b = i64x4::new( + -7034299759176626990, + -361127056732231567, + 4052152376745196186, + -2695706064065117364, + ); + let r = i64x4::new( + 7504213381093081425, + -3326139571121157078, + 3716024106630303646, + -3333036084724254699, + ); + + 
assert_eq!(r, transmute(lasx_xvadd_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddi_bu() { + let a = i8x32::new( + 97, -53, -62, 74, 99, 103, 85, -62, 12, -18, -65, 32, 19, -86, 65, -26, -98, 56, -9, + -49, 4, 57, -22, 9, 93, 38, 124, -2, -121, 70, 125, 21, + ); + let r = i64x4::new( + -4226511262663192988, + -1637994053855153905, + 931466702237612961, + 1765491911008659808, + ); + + assert_eq!(r, transmute(lasx_xvaddi_bu::<3>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddi_hu() { + let a = i16x16::new( + 28186, 30980, -18298, 10584, -13771, -23924, -28546, 30222, -16145, -32706, -20261, + 19828, 22395, -2057, 5657, 15125, + ); + let r = i64x4::new( + 2979615520472788507, + 8507177098988603958, + 5581561774286553328, + 4257614802810591100, + ); + + assert_eq!(r, transmute(lasx_xvaddi_hu::<1>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddi_wu() { + let a = i32x8::new( + 832142867, + -97637134, + 470208227, + -904606685, + -2133615997, + -538764334, + 627855087, + 2056153787, + ); + let r = i64x4::new( + -419348219263615451, + -3885256050038354187, + -2313975115310458219, + 8831113348648816385, + ); + + assert_eq!(r, transmute(lasx_xvaddi_wu::<18>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddi_du() { + let a = i64x4::new( + 2524418528961435407, + -8855335564236661523, + -6695152760024429972, + -4546559236496052098, + ); + let r = i64x4::new( + 2524418528961435431, + -8855335564236661499, + -6695152760024429948, + -4546559236496052074, + ); + + assert_eq!(r, transmute(lasx_xvaddi_du::<24>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsub_b() { + let a = i8x32::new( + 69, 68, 89, -122, -10, 4, 91, -20, -104, 41, -2, 28, -58, 89, 8, 71, 46, 82, -101, 51, + -88, -102, -124, -9, 40, -59, -102, -16, 3, 103, 85, -97, + ); + let b = i8x32::new( + -65, -118, -63, 106, 15, -103, -19, -85, -42, 55, -34, -9, 15, 86, 74, 4, -118, -124, + 43, 2, 17, -82, 112, -28, 76, -58, 103, -48, -26, 27, -97, 14, + ); + let r = i64x4::new( + 4714824500264876678, + 4881343131253011138, + 1374983920368537252, + -7947080804470620196, + ); + + assert_eq!(r, transmute(lasx_xvsub_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsub_h() { + let a = i16x16::new( + 13861, -12177, -9887, -27491, 3957, -5779, -6788, 4221, -12561, 4789, -8335, -24637, + 660, -11584, -22855, 31170, + ); + let b = i16x16::new( + -10247, 15942, -17883, -32294, -13460, -6485, 4553, 25005, -26816, -11045, 312, 22201, + 12797, -7932, -13605, -24793, + ); + let r = i64x4::new( + 1351958658151964204, + -5849943150155381751, + 5263263451968059311, + -2694318201466204009, + ); + + assert_eq!(r, transmute(lasx_xvsub_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsub_w() { + let a = i32x8::new( + 178703054, + -696864732, + 212849982, + -285846503, + -1117046518, + 705292054, + 739892078, + 504545429, + ); + let b = i32x8::new( + 1845948974, + 513755820, + -260175909, + -530928548, + 1413787975, + -1421495822, + 1424414367, + 1652017030, + ); + let r = i64x4::new( + -5199575676077746016, + 1052619368584826211, + 9134484374713436099, + -4928352995773315889, + ); + + assert_eq!(r, transmute(lasx_xvsub_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsub_d() { + let a = i64x4::new( + -7646834273082474631, + -3919573082038908840, + 1242665522125115913, + 
-7118090461806523548, + ); + let b = i64x4::new( + 8536740478963669238, + -5376035241109169794, + -2919045115911617717, + -5820964252152272230, + ); + let r = i64x4::new( + 2263169321663407747, + 1456462159070260954, + 4161710638036733630, + -1297126209654251318, + ); + + assert_eq!(r, transmute(lasx_xvsub_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubi_bu() { + let a = i8x32::new( + -110, 82, -20, -84, 15, -27, -19, -10, 74, -11, 10, -87, 103, -61, 21, -98, -92, -49, + 78, 102, -11, -49, -45, 65, 12, 93, 109, -99, -11, -82, -27, 98, + ); + let r = i64x4::new( + -1594036762305411707, + -7995940638099118019, + 3802941238546645655, + 6185872108420092159, + ); + + assert_eq!(r, transmute(lasx_xvsubi_bu::<13>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubi_hu() { + let a = i16x16::new( + 20553, 24028, -32247, -8607, 12622, -11323, -26896, -27740, -12003, -16731, 2560, + -6936, -6669, -11254, -12625, 5415, + ); + let r = i64x4::new( + -2424482502709784510, + -7809920247766568633, + -1954269795052498666, + 1522443898558080492, + ); + + assert_eq!(r, transmute(lasx_xvsubi_hu::<7>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubi_wu() { + let a = i32x8::new( + 755271012, + 658180721, + -240702681, + -573588257, + -869840064, + -1735073421, + 798270655, + 299197982, + ); + let r = i64x4::new( + 2826864560638821706, + -2463542912799528179, + -7452083707597862106, + 1285045436848317605, + ); + + assert_eq!(r, transmute(lasx_xvsubi_wu::<26>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubi_du() { + let a = i64x4::new( + -6314492083383377124, + -2455352880818468995, + 4567295273188684508, + 4145748346670499022, + ); + let r = i64x4::new( + -6314492083383377136, + -2455352880818469007, + 4567295273188684496, + 4145748346670499010, + ); + + assert_eq!(r, transmute(lasx_xvsubi_du::<12>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmax_b() { + let a = i8x32::new( + 25, 6, 107, -17, 0, 0, -33, -126, -67, -110, -28, -71, 103, -104, 76, -67, -63, 109, + -111, 21, -117, 23, 0, 127, 97, 55, -124, -87, -49, -29, -50, 33, + ); + let b = i8x32::new( + -9, 89, -54, -48, -35, 107, -21, 85, -105, -19, -97, -119, 110, -49, -29, 38, 88, 38, + 43, 117, -99, -12, -56, 125, -117, 87, 98, -75, 64, 37, 116, 118, + ); + let r = i64x4::new( + 6191159764511840537, + 2759808746143411645, + 9151340407859932504, + 8535488153625188193, + ); + + assert_eq!(r, transmute(lasx_xvmax_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmax_h() { + let a = i16x16::new( + 30763, 3415, 26324, -7315, -21080, 18524, -4450, 24816, -15714, -28542, -635, -31873, + -26693, 15869, -3002, -24310, + ); + let b = i16x16::new( + -31234, 12467, 15235, -27825, 27576, -30308, 5780, 15439, 5332, -17912, 27099, -21207, + 26461, -8845, 28810, -15394, + ); + let r = i64x4::new( + -2058876393102280661, + 6985107848176626616, + -5969123438663035692, + -4332902052436023459, + ); + + assert_eq!(r, transmute(lasx_xvmax_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmax_w() { + let a = i32x8::new( + 1577861415, + 918171955, + -750433312, + 187580904, + 2059773788, + -1443991497, + -1216535607, + 1560471573, + ); + let b = i32x8::new( + -1945753238, + -891888859, + -78561680, + 1374400928, + -70918058, + 1356405224, + -371800255, + -244516818, + ); + let r = i64x4::new( + 3943518520407245095, + 
5903007041568456304, + 5825716079263328092, + 6702174376295843649, + ); + + assert_eq!(r, transmute(lasx_xvmax_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmax_d() { + let a = i64x4::new( + -2766896964461117900, + -5078071472767258214, + 9065828085534222331, + -6500758532071144491, + ); + let b = i64x4::new( + -22138921162050098, + -8125932019434035875, + -7840786109368633952, + -880822478913123851, + ); + let r = i64x4::new( + -22138921162050098, + -5078071472767258214, + 9065828085534222331, + -880822478913123851, + ); + + assert_eq!(r, transmute(lasx_xvmax_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaxi_b() { + let a = i8x32::new( + -125, -85, -100, -36, 78, -85, 8, -111, -4, 10, -124, -8, 85, 25, -92, 61, 61, -45, 68, + 58, -5, 10, 121, 74, -100, 75, 78, 36, -81, 0, 21, 82, + ); + let r = i64x4::new( + -790112015120730635, + 4464502462647438076, + 5366332505119323453, + 5914634738497113077, + ); + + assert_eq!(r, transmute(lasx_xvmaxi_b::<-11>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaxi_h() { + let a = i16x16::new( + 10159, 11019, -527, 25779, 18814, -6803, -7822, -21020, 17899, -30211, -21703, -32203, + -17678, -31762, -12745, 15653, + ); + let r = i64x4::new( + 7256424853078222767, + -2814792717481602, + -2814792717482517, + 4406209242478280693, + ); + + assert_eq!(r, transmute(lasx_xvmaxi_h::<-11>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaxi_w() { + let a = i32x8::new( + -1902781562, + -701262116, + 1050694797, + 1927374994, + 2034319488, + 1270402141, + 1507027857, + -2022667122, + ); + let r = i64x4::new( + 21474836485, + 8278012567408891021, + 5456335650397700224, + 22981864337, + ); + + assert_eq!(r, transmute(lasx_xvmaxi_w::<5>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaxi_d() { + let a = i64x4::new( + 1922310852027675403, + 1444112415686500862, + -2217486151251900264, + 2429249725865673045, + ); + let r = i64x4::new( + 1922310852027675403, + 1444112415686500862, + -3, + 2429249725865673045, + ); + + assert_eq!(r, transmute(lasx_xvmaxi_d::<-3>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmax_bu() { + let a = u8x32::new( + 85, 114, 198, 232, 2, 92, 134, 60, 6, 73, 97, 135, 118, 147, 202, 24, 163, 26, 22, 241, + 100, 118, 187, 179, 231, 20, 8, 232, 203, 101, 192, 9, + ); + let b = u8x32::new( + 53, 108, 137, 217, 144, 216, 90, 50, 81, 196, 11, 85, 124, 110, 245, 183, 35, 166, 114, + 134, 174, 222, 3, 134, 149, 130, 39, 166, 182, 16, 44, 58, + ); + let r = i64x4::new( + 4361411406047113813, + -5191080832418069423, + -5495554077319059805, + 4233495576175936231, + ); + + assert_eq!(r, transmute(lasx_xvmax_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmax_hu() { + let a = u16x16::new( + 5749, 55167, 53819, 29245, 38403, 35505, 59653, 25124, 35403, 58917, 5938, 9735, 59292, + 13480, 10576, 54135, + ); + let b = u16x16::new( + 5035, 18828, 58275, 53640, 3989, 38318, 53531, 14719, 27606, 5401, 62928, 12836, 16867, + 7709, 62726, 59945, + ); + let r = i64x4::new( + -3348176030115359115, + 7072033525073876483, + 3613283078621203019, + -1573457187787184228, + ); + + assert_eq!(r, transmute(lasx_xvmax_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmax_wu() { + let a = u32x8::new( + 1479333943, 2676167483, 3836141683, 1561090643, 2383304043, 4050203265, 880499204, + 
1213140090, + ); + let b = u32x8::new( + 3319622969, 1208019942, 2301441769, 3536726941, 665528183, 2671171581, 1912772755, + 2591579616, + ); + let r = i64x4::new( + -6952692252286292679, + -3256617523396288397, + -1051253505998826133, + -7315994376096540525, + ); + + assert_eq!(r, transmute(lasx_xvmax_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmax_du() { + let a = u64x4::new( + 15606303230109259264, + 8116571215893940866, + 8029178663488389518, + 1343606515742555302, + ); + let b = u64x4::new( + 12474736035319899163, + 7894892261694004420, + 3771675238777573447, + 5141420152487342561, + ); + let r = i64x4::new( + -2840440843600292352, + 8116571215893940866, + 8029178663488389518, + 5141420152487342561, + ); + + assert_eq!(r, transmute(lasx_xvmax_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaxi_bu() { + let a = u8x32::new( + 5, 31, 107, 171, 93, 98, 60, 232, 147, 171, 189, 163, 227, 182, 246, 12, 186, 67, 84, + 153, 12, 95, 0, 34, 84, 166, 191, 25, 19, 211, 84, 138, + ); + let r = i64x4::new( + -1712385603860226294, + 934135061546904467, + 2452877454773339066, + -8478920119441971628, + ); + + assert_eq!(r, transmute(lasx_xvmaxi_bu::<10>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaxi_hu() { + let a = u16x16::new( + 48338, 4001, 46491, 35597, 23103, 58140, 58650, 37062, 44161, 23848, 12302, 18312, + 7294, 3406, 24569, 9169, + ); + let r = i64x4::new( + -8426879650153513774, + -8014466583217022401, + 5154422611776154753, + 2580949584734723198, + ); + + assert_eq!(r, transmute(lasx_xvmaxi_hu::<15>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaxi_wu() { + let a = u32x8::new( + 3721611043, 1077683923, 3718582126, 906645810, 3702930805, 3185396072, 3048402980, + 1473444340, + ); + let r = i64x4::new( + 4628617208431593251, + 3894014106724011886, + -4765572115959759499, + 6328395255824707620, + ); + + assert_eq!(r, transmute(lasx_xvmaxi_wu::<12>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaxi_du() { + let a = u64x4::new( + 6545420797271239625, + 14656235662490779697, + 8085422797121321277, + 3280369825537805033, + ); + let r = i64x4::new( + 6545420797271239625, + -3790508411218771919, + 8085422797121321277, + 3280369825537805033, + ); + + assert_eq!(r, transmute(lasx_xvmaxi_du::<18>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmin_b() { + let a = i8x32::new( + 60, -51, 1, -10, 118, -28, -35, 82, -26, -121, -72, 104, 120, -114, -89, 101, -21, + -122, 65, -87, -82, 111, -120, 76, 3, -76, 9, 56, -41, -101, -3, 66, + ); + let b = i8x32::new( + -95, -1, -42, 28, 90, -13, 93, 39, -93, -126, -63, 119, -82, -11, -1, 28, 58, -54, 83, + -38, 50, 121, 99, -78, -10, 115, 116, 63, 20, -24, 81, -7, + ); + let r = i64x4::new( + 2872703216671706529, + 2064775833905037987, + -5582088942171093269, + -433018640497265418, + ); + + assert_eq!(r, transmute(lasx_xvmin_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmin_h() { + let a = i16x16::new( + -11212, 17053, 31831, -17088, -26082, -20339, -29027, -7113, -12378, 23981, -6343, + -15884, -7455, -31741, -26691, 26033, + ); + let b = i16x16::new( + -3990, -653, 31824, -8429, -13156, 20074, -32658, 26465, -31268, -28012, 12849, 11972, + -8106, 16341, 14932, -6230, + ); + let r = i64x4::new( + -4809707714740235212, + -2001990296446068194, + -4470694295613700644, + -1753422264687927210, + ); + + 
assert_eq!(r, transmute(lasx_xvmin_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmin_w() { + let a = i32x8::new( + 545076841, + 427733287, + -1694168270, + 454215425, + 1619909203, + 1120598019, + 1819961244, + -165320673, + ); + let b = i32x8::new( + 440778392, -880154888, 659189867, -948070867, 303440078, 2084920396, -670807717, + 1250241, + ); + let r = i64x4::new( + -3780236458933764456, + -4071933365454566606, + 4812931843870826702, + -710046880263550629, + ); + + assert_eq!(r, transmute(lasx_xvmin_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmin_d() { + let a = i64x4::new( + 2741334847700576739, + -5405962583790843561, + 8459180020282222757, + -1572925480949669194, + ); + let b = i64x4::new( + -5261141090878992044, + -1222006182046777526, + 4309148539181077305, + -3792381296290037631, + ); + let r = i64x4::new( + -5261141090878992044, + -5405962583790843561, + 4309148539181077305, + -3792381296290037631, + ); + + assert_eq!(r, transmute(lasx_xvmin_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmini_b() { + let a = i8x32::new( + -85, 86, -102, -46, -93, 29, -46, 15, 36, -49, 80, -47, -57, 0, 17, 89, 60, 93, 100, + -34, 49, -3, -48, 22, -95, 29, -77, -48, 44, -92, -27, 74, + ); + let r = i64x4::new( + -1093547173093904213, + -1085102769184911376, + -1094109792127880976, + -1088282380739546975, + ); + + assert_eq!(r, transmute(lasx_xvmini_b::<-16>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmini_h() { + let a = i16x16::new( + 29579, 25294, -26291, 17601, 19548, -1571, -3670, -17609, 15721, 11767, 5051, -4718, + 14977, -104, -21933, 11733, + ); + let r = i64x4::new( + 2420355805741064, + -4956227148259196920, + -1327998905760612344, + 2439077560844296, + ); + + assert_eq!(r, transmute(lasx_xvmini_h::<8>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmini_w() { + let a = i32x8::new( + 938211063, + 1582718046, + -710671495, + -1169124073, + 71125607, + 1365032606, + -1290216030, + -736436725, + ); + let r = i64x4::new( + -64424509456, + -5021349654917020807, + -64424509456, + -3162971646443594334, + ); + + assert_eq!(r, transmute(lasx_xvmini_w::<-16>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmini_d() { + let a = i64x4::new( + 6621191429364538735, + 8224746792719035443, + 4688148425230961784, + 823273303261270164, + ); + let r = i64x4::new(-8, -8, -8, -8); + + assert_eq!(r, transmute(lasx_xvmini_d::<-8>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmin_bu() { + let a = u8x32::new( + 21, 215, 240, 12, 207, 254, 97, 176, 94, 73, 182, 18, 231, 216, 171, 39, 221, 31, 171, + 24, 170, 126, 78, 115, 189, 104, 30, 71, 73, 13, 173, 124, + ); + let b = u8x32::new( + 156, 34, 210, 157, 237, 204, 11, 176, 14, 3, 254, 148, 151, 143, 59, 162, 24, 238, 63, + 85, 169, 120, 197, 108, 204, 8, 244, 238, 23, 109, 248, 6, + ); + let r = i64x4::new( + -5761286108645023211, + 2827011070121870094, + 7804307871931244312, + 481055128827070653, + ); + + assert_eq!(r, transmute(lasx_xvmin_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmin_hu() { + let a = u16x16::new( + 38440, 49714, 29557, 49236, 1896, 30340, 23067, 13106, 50372, 7988, 45184, 3030, 64318, + 11696, 24753, 38944, + ); + let b = u16x16::new( + 37782, 2130, 14692, 21829, 22760, 43371, 63045, 45289, 2584, 36405, 12186, 43636, 1930, + 62345, 57746, 
16665, + ); + let r = i64x4::new( + 6144380368416052118, + 3689110118768838504, + 852921518428260888, + 4690886800975071114, + ); + + assert_eq!(r, transmute(lasx_xvmin_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmin_wu() { + let a = u32x8::new( + 2388959959, 3753576755, 2396056833, 1264941814, 1407811024, 4062547104, 3162258102, + 2894799861, + ); + let b = u32x8::new( + 1131111124, 1117231814, 2238242135, 3549614188, 791311618, 4010634425, 445826884, + 195885173, + ); + let r = i64x4::new( + 4798474104311866068, + 5432883724711157079, + -1221200381331475198, + 841320412252129092, + ); + + assert_eq!(r, transmute(lasx_xvmin_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmin_du() { + let a = u64x4::new( + 16262575865555500950, + 9397610354038464998, + 11047831233023881635, + 168959420679376173, + ); + let b = u64x4::new( + 5191113397333195233, + 15218861976244884079, + 15362510705177390571, + 3583188655927147541, + ); + let r = i64x4::new( + 5191113397333195233, + -9049133719671086618, + -7398912840685669981, + 168959420679376173, + ); + + assert_eq!(r, transmute(lasx_xvmin_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmini_bu() { + let a = u8x32::new( + 89, 194, 153, 118, 89, 237, 7, 106, 114, 216, 237, 232, 42, 35, 243, 48, 137, 126, 222, + 196, 191, 34, 53, 34, 63, 196, 193, 56, 2, 174, 6, 34, + ); + let r = i64x4::new( + 1803437771371125017, + 1808504320951916825, + 1808504320951916825, + 1803156197610166553, + ); + + assert_eq!(r, transmute(lasx_xvmini_bu::<25>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmini_hu() { + let a = u16x16::new( + 22785, 53436, 15467, 7600, 19970, 32791, 46922, 27359, 3030, 22997, 38845, 6828, 50455, + 53714, 5069, 34493, + ); + let r = i64x4::new( + 7881419608817692, + 7881419608817692, + 7881419608817692, + 7881419608817692, + ); + + assert_eq!(r, transmute(lasx_xvmini_hu::<28>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmini_wu() { + let a = u32x8::new( + 2549040097, 380059779, 106274074, 1242619380, 2422816304, 2036217770, 2017469655, + 192110697, + ); + let r = i64x4::new(94489280534, 94489280534, 94489280534, 94489280534); + + assert_eq!(r, transmute(lasx_xvmini_wu::<22>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmini_du() { + let a = u64x4::new( + 2554982158549964334, + 5946824623239713063, + 1554570220268300262, + 16460909687025642884, + ); + let r = i64x4::new(18, 18, 18, 18); + + assert_eq!(r, transmute(lasx_xvmini_du::<18>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvseq_b() { + let a = i8x32::new( + -76, -8, 108, 108, 76, 13, -20, -73, -55, 105, 67, -14, 50, 11, -128, 38, -48, 61, -45, + 0, -31, 68, 108, 17, 86, 59, -124, 71, 118, -60, -119, 53, + ); + let b = i8x32::new( + 67, 97, 92, 3, -94, -47, 103, 58, 78, 108, 121, -13, -27, -20, -58, -75, -64, 121, 8, + -31, 56, -8, -43, 119, 10, -100, 50, 122, 34, 124, -65, -92, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvseq_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvseq_h() { + let a = i16x16::new( + 5587, -19681, -31618, -9619, 10724, 19984, 15759, -19212, -10822, 2437, -7916, -32319, + 8472, 25354, -32596, 17629, + ); + let b = i16x16::new( + -14248, 23765, 17541, -22426, -2225, 29478, -18012, -13943, 12940, 20394, 19156, -4063, + -17913, -12088, 8465, 
-31204, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvseq_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvseq_w() { + let a = i32x8::new( + 884869290, + -2032301802, + 1693636022, + 1594721776, + 2082937065, + -1159093260, + -1590139557, + -1882875192, + ); + let b = i32x8::new( + 186525406, + -1399001207, + -1514443895, + -1051577172, + 1585652521, + 90050345, + -1674322849, + 2124996559, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvseq_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvseq_d() { + let a = i64x4::new( + 2669611874067870445, + 1365590924683817055, + 2596664035622609827, + -5919289436914592027, + ); + let b = i64x4::new( + -7435987568868960430, + -3618747286388594676, + 1852961913881539893, + 158448424073614869, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvseq_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvseqi_b() { + let a = i8x32::new( + 8, -28, 17, -71, 11, 26, -79, 95, 102, 106, -100, -83, 116, -105, -72, 60, -64, -39, + -65, -93, -52, 80, 126, 38, 46, 91, -15, 42, -119, -109, 10, 70, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvseqi_b::<-14>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvseqi_h() { + let a = i16x16::new( + 31558, 20053, 8868, 28957, 9939, -14167, 15718, -32625, 24920, 19118, 27698, -19776, + -15714, 14099, 21403, 13371, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvseqi_h::<-8>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvseqi_w() { + let a = i32x8::new( + 1596885720, + 1682548012, + -1583429372, + 1961831515, + -1312514367, + 263282180, + -1647205143, + 409452108, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvseqi_w::<-11>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvseqi_d() { + let a = i64x4::new( + 4860385404364618706, + -866096761684413508, + -6886759413716464738, + -1240694713477982808, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvseqi_d::<-2>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslt_b() { + let a = i8x32::new( + 29, -4, -38, -40, -57, -127, 6, 23, -51, 12, 91, -49, 33, -64, 42, -82, 110, 44, -44, + -115, 78, -111, -13, -67, 97, -30, -44, 35, 108, 49, -20, -60, + ); + let b = i8x32::new( + 120, -26, -121, 12, 72, 65, -5, 75, 16, -1, 116, 18, -94, -26, -104, -66, -38, 101, + -92, 71, -74, 2, 17, -84, 102, 49, -4, -87, 30, -83, -9, -81, + ); + let r = i64x4::new( + -71776119077994241, + -71777214277943041, + 72056498804555520, + 71776119077994495, + ); + + assert_eq!(r, transmute(lasx_xvslt_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslt_h() { + let a = i16x16::new( + -26246, 12525, 27206, -1022, 22747, 18600, -9895, -30775, -29586, 24084, -27504, -8187, + -18487, 5560, 18096, -17473, + ); + let b = i16x16::new( + -25007, 1947, 11331, 32443, 1338, 4043, 6432, 22428, -5023, -29819, -32277, 19148, + -4421, 17327, -30689, 4545, + ); + let r = i64x4::new( + -281474976645121, + -4294967296, + -281474976645121, + -281470681743361, + ); + + assert_eq!(r, transmute(lasx_xvslt_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslt_w() { + let a = i32x8::new( + -343022897, + 2023876173, + 564434564, + 1237034632, + 
563192717, + -1067626766, + 2022145749, + 1215921380, + ); + let b = i32x8::new( + -319278722, + -804141589, + -453029596, + -1367666903, + 1987558200, + 1387908488, + 705912447, + -1635535899, + ); + let r = i64x4::new(4294967295, 0, -1, 0); + + assert_eq!(r, transmute(lasx_xvslt_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslt_d() { + let a = i64x4::new( + 8053537017603706522, + 8148317798642968933, + 661692989904488737, + 5141151145278580641, + ); + let b = i64x4::new( + 6944929519578764358, + -3223671261003932077, + 8970791908210514994, + -3152991651421490245, + ); + let r = i64x4::new(0, 0, -1, 0); + + assert_eq!(r, transmute(lasx_xvslt_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslti_b() { + let a = i8x32::new( + -60, -44, 123, -31, 39, 115, -8, -17, 10, -6, 68, 82, -123, 86, -95, -108, -78, 45, 88, + -6, -82, 69, 96, 13, 79, 14, 43, -72, -35, 27, -30, 54, + ); + let r = i64x4::new( + -72057589759672321, + -280379760050176, + 1095216660735, + 71777218556067840, + ); + + assert_eq!(r, transmute(lasx_xvslti_b::<-16>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslti_h() { + let a = i16x16::new( + -5839, -18013, 17630, 18447, -5550, -28050, -30597, -14016, -985, -1930, 10497, -28472, + -15481, 29582, 19157, 5547, + ); + let r = i64x4::new(4294967295, -1, -281470681743361, 65535); + + assert_eq!(r, transmute(lasx_xvslti_h::<-4>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslti_w() { + let a = i32x8::new( + -1407512371, + -898959054, + 572699307, + 1642426185, + 797353241, + -259466597, + -1199389426, + -1398642331, + ); + let r = i64x4::new(-1, 0, -4294967296, -1); + + assert_eq!(r, transmute(lasx_xvslti_w::<-4>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslti_d() { + let a = i64x4::new( + -2819395691046139625, + 5088541563771000132, + 8992157267117868445, + 3707348005090466869, + ); + let r = i64x4::new(-1, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvslti_d::<1>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslt_bu() { + let a = u8x32::new( + 25, 12, 175, 147, 216, 93, 84, 21, 98, 182, 199, 128, 107, 68, 249, 142, 59, 204, 118, + 136, 201, 137, 11, 155, 238, 201, 130, 187, 247, 151, 109, 109, + ); + let b = u8x32::new( + 231, 122, 213, 181, 40, 150, 168, 103, 114, 67, 58, 96, 9, 131, 109, 87, 228, 98, 233, + 122, 32, 208, 212, 193, 69, 197, 199, 67, 125, 145, 103, 17, + ); + let r = i64x4::new(-1095216660481, 280375465083135, -1099494915841, 16711680); + + assert_eq!(r, transmute(lasx_xvslt_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslt_hu() { + let a = u16x16::new( + 52525, 2955, 54772, 12603, 44380, 34508, 12576, 61085, 25504, 9162, 5951, 6485, 30570, + 47057, 5871, 54003, + ); + let b = u16x16::new( + 40432, 50345, 37115, 20747, 38363, 42964, 2046, 26895, 7013, 23222, 19013, 43373, + 50793, 25948, 61295, 35633, + ); + let r = i64x4::new(-281470681808896, 4294901760, -65536, 281470681808895); + + assert_eq!(r, transmute(lasx_xvslt_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslt_wu() { + let a = u32x8::new( + 645248129, 159156202, 442053255, 3539240300, 2212555000, 3589590552, 594555403, + 303909752, + ); + let b = u32x8::new( + 3201000514, 1412178107, 2697992684, 4141300489, 840057459, 3810448458, 959312926, + 2834332590, + ); + let r = i64x4::new(-1, -1, -4294967296, -1); 
+ + assert_eq!(r, transmute(lasx_xvslt_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslt_du() { + let a = u64x4::new( + 9001861276662418785, + 11243806946003621417, + 16522311710011399892, + 3265452243993188662, + ); + let b = u64x4::new( + 12075582354920739274, + 16153578604538879596, + 2722606569672017936, + 5142428655769651710, + ); + let r = i64x4::new(-1, -1, 0, -1); + + assert_eq!(r, transmute(lasx_xvslt_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslti_bu() { + let a = u8x32::new( + 68, 117, 2, 67, 233, 205, 12, 99, 127, 21, 171, 71, 18, 146, 167, 76, 141, 21, 234, + 150, 135, 213, 231, 122, 22, 117, 124, 46, 149, 74, 11, 213, + ); + let r = i64x4::new(16711680, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvslti_bu::<7>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslti_hu() { + let a = u16x16::new( + 45362, 8378, 15038, 64046, 51883, 25813, 52028, 8730, 1255, 3100, 9043, 37803, 61269, + 5418, 42755, 28604, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvslti_hu::<13>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslti_wu() { + let a = u32x8::new( + 1740233903, 2267221026, 574370304, 3294215750, 3920854673, 2171367380, 3811836140, + 671324390, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvslti_wu::<8>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslti_du() { + let a = u64x4::new( + 7794944984440982613, + 6781669147121119045, + 9839484777866727672, + 2217716842113203908, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvslti_du::<2>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsle_b() { + let a = i8x32::new( + -20, -44, 90, -101, -69, -3, -5, 99, -59, -13, 35, 125, 64, 21, 66, -2, 57, 4, 60, -35, + 57, 37, -74, 54, -55, -125, -28, 64, -60, -10, 111, 91, + ); + let b = i8x32::new( + -44, 127, 36, 48, 36, 79, 56, 54, -123, 29, -105, -117, -46, -9, -30, 97, 3, -5, -10, + 118, -64, -118, -31, -42, 120, -84, -77, 40, 69, -80, 104, 61, + ); + let r = i64x4::new( + 72057594021216000, + -72057594037862656, + 71776123339407360, + 1095216726015, + ); + + assert_eq!(r, transmute(lasx_xvsle_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsle_h() { + let a = i16x16::new( + -22122, -10270, -26549, -14589, 15764, 15351, -8429, 14898, -20819, -8483, -1055, + -5229, -21058, -26881, 1568, -1544, + ); + let b = i16x16::new( + 27196, -6538, 20190, -14481, 4568, 31469, -13818, -16230, -26411, 20205, -4192, -29119, + 11920, 25504, -19817, -370, + ); + let r = i64x4::new(-1, 4294901760, 4294901760, -281470681743361); + + assert_eq!(r, transmute(lasx_xvsle_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsle_w() { + let a = i32x8::new( + -44465502, + -1482288791, + 1430386258, + -837657585, + -294092640, + -1581080100, + -558275350, + -217520013, + ); + let b = i32x8::new( + -251270550, + 1931207536, + -1348623461, + -961792969, + 845442346, + 1529991774, + -2079565201, + 2051352953, + ); + let r = i64x4::new(-4294967296, 0, -1, -4294967296); + + assert_eq!(r, transmute(lasx_xvsle_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsle_d() { + let a = i64x4::new( + -3700065874729391328, + 3324167660406962127, + -431069737981318264, + 4685397384184188250, + ); + let b = i64x4::new( + 
3966484960661616600, + 2732585182508661538, + -1886887956095472452, + 3407078622354590260, + ); + let r = i64x4::new(-1, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvsle_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslei_b() { + let a = i8x32::new( + -34, 34, -112, 113, 77, 45, 109, -125, 31, -88, -1, -53, 72, 39, 39, -99, -47, -45, 4, + 17, -100, -96, 41, -62, -56, -88, 37, 8, 68, -53, 52, 61, + ); + let r = i64x4::new( + -72057594021216001, + -72057589759672576, + -71776123356119041, + 280375465148415, + ); + + assert_eq!(r, transmute(lasx_xvslei_b::<-14>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslei_h() { + let a = i16x16::new( + 11585, -30889, -24807, -28938, -11929, -7, -8205, -24769, -12225, -7956, -26751, 11963, + 30916, -25385, -28797, -6515, + ); + let r = i64x4::new(-65536, -4294901761, 281474976710655, -65536); + + assert_eq!(r, transmute(lasx_xvslei_h::<-15>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslei_w() { + let a = i32x8::new( + 98083171, -839282918, 950280284, 1423312628, -74628250, -400513137, 1893412843, + 1627152567, + ); + let r = i64x4::new(-4294967296, 0, -1, 0); + + assert_eq!(r, transmute(lasx_xvslei_w::<-3>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslei_d() { + let a = i64x4::new( + -4859364474358523407, + 5515090293678524269, + -8825168226110066470, + -1006722941532041773, + ); + let r = i64x4::new(-1, 0, -1, -1); + + assert_eq!(r, transmute(lasx_xvslei_d::<6>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsle_bu() { + let a = u8x32::new( + 158, 49, 59, 206, 238, 37, 129, 237, 128, 170, 238, 175, 10, 110, 43, 210, 223, 144, + 115, 87, 183, 177, 226, 216, 74, 40, 36, 142, 76, 48, 213, 148, + ); + let b = u8x32::new( + 10, 235, 145, 113, 48, 119, 124, 22, 154, 225, 240, 6, 37, 126, 38, 233, 129, 30, 90, + 103, 109, 14, 51, 10, 128, 242, 103, 199, 215, 228, 164, 115, + ); + let r = i64x4::new( + 280375481859840, + -71776123339407361, + 4278190080, + 281474976710655, + ); + + assert_eq!(r, transmute(lasx_xvsle_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsle_hu() { + let a = u16x16::new( + 61722, 23067, 57576, 43934, 56523, 22563, 45126, 9681, 5860, 62938, 40464, 22653, + 53470, 26636, 64060, 22853, + ); + let b = u16x16::new( + 61426, 33539, 62959, 2501, 21021, 20564, 64705, 12707, 6875, 56968, 45402, 15505, + 50807, 25207, 42588, 21407, + ); + let r = i64x4::new(281474976645120, -4294967296, 281470681808895, 0); + + assert_eq!(r, transmute(lasx_xvsle_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsle_wu() { + let a = u32x8::new( + 3492865309, 1162904456, 1212423957, 2856547492, 4084218464, 1751333879, 3162347846, + 990759844, + ); + let b = u32x8::new( + 525215252, 3081836083, 3319970808, 3111004663, 2712599486, 1206390980, 1598064821, + 440769207, + ); + let r = i64x4::new(-4294967296, -1, 0, 0); + + assert_eq!(r, transmute(lasx_xvsle_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsle_du() { + let a = u64x4::new( + 2621502387249005267, + 2893454517032185854, + 7681654086665024795, + 5020934994941644473, + ); + let b = u64x4::new( + 2069393685367888462, + 16283420533139074356, + 5426371663235070936, + 6959847307032735963, + ); + let r = i64x4::new(0, -1, 0, -1); + + assert_eq!(r, transmute(lasx_xvsle_du(transmute(a), transmute(b)))); +} + 
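(Editorial sketch, not part of the vendored file: the compare tests above — xvseq*, xvslt*, xvsle* and their *i immediate forms — all expect the same convention: a lane becomes all ones, read back as -1 in i64x4, when the comparison holds, and all zeros otherwise. A minimal stand-alone model of that rule, checked against the first two lanes asserted in test_lasx_xvslei_d above; the lane helper is illustrative only.)

    // Scalar model of xvslei.d: compare each signed 64-bit lane against the
    // immediate and widen the boolean to an all-ones/all-zeros mask.
    fn slei_d_lane(a: i64, imm: i64) -> i64 {
        if a <= imm { -1 } else { 0 }
    }

    fn main() {
        // First two lanes of test_lasx_xvslei_d with imm = 6.
        assert_eq!(slei_d_lane(-4859364474358523407, 6), -1);
        assert_eq!(slei_d_lane(5515090293678524269, 6), 0);
    }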
+#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslei_bu() { + let a = u8x32::new( + 31, 26, 96, 32, 50, 17, 14, 211, 51, 145, 198, 89, 217, 16, 184, 197, 220, 224, 23, + 208, 243, 188, 17, 240, 237, 207, 250, 185, 88, 127, 104, 96, + ); + let r = i64x4::new(72056494526365440, 280375465082880, 71776119077928960, 0); + + assert_eq!(r, transmute(lasx_xvslei_bu::<29>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslei_hu() { + let a = u16x16::new( + 43587, 14195, 3048, 63749, 62756, 59029, 53861, 44436, 63820, 31431, 3098, 39702, + 37252, 60430, 367, 9201, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvslei_hu::<30>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslei_wu() { + let a = u32x8::new( + 2210674294, 4169142079, 3945251466, 1311516675, 2977874622, 3173129893, 3425645958, + 2905333026, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvslei_wu::<31>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvslei_du() { + let a = u64x4::new( + 16014799523010103844, + 8709196257349731516, + 16077124464953821716, + 14402865276083654462, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvslei_du::<5>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsat_b() { + let a = i8x32::new( + 72, 84, 50, -112, -54, -10, 114, 37, -37, -9, 56, -1, -39, -51, 16, 88, -107, -47, -66, + -81, 83, 50, -69, 103, -46, 17, 121, 43, 8, -121, -113, 27, + ); + let r = i64x4::new( + 2698490476611392584, + 6345798211138549723, + 7474623341563662741, + 1985954429852520914, + ); + + assert_eq!(r, transmute(lasx_xvsat_b::<7>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsat_h() { + let a = i16x16::new( + -22224, 6834, -23483, -28336, -15236, 8349, -30647, -16818, -27867, 17449, -7303, + -20496, -3398, 17074, -14188, 16934, + ); + let r = i64x4::new( + -1152657621547749376, + -1152657621547749376, + -1152657621547749376, + 1152903912689234618, + ); + + assert_eq!(r, transmute(lasx_xvsat_h::<12>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsat_w() { + let a = i32x8::new( + 970917085, + -759322255, + -332118787, + 127481445, + -925804081, + -2116293410, + 240264455, + -1921693726, + ); + let r = i64x4::new(-34359738361, 34359738360, -30064771080, -34359738361); + + assert_eq!(r, transmute(lasx_xvsat_w::<3>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsat_d() { + let a = i64x4::new( + -7987623316798584571, + -7247559336295709650, + -5048248303955768218, + 6102033771404793023, + ); + let r = i64x4::new( + -7987623316798584571, + -7247559336295709650, + -5048248303955768218, + 6102033771404793023, + ); + + assert_eq!(r, transmute(lasx_xvsat_d::<63>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsat_bu() { + let a = u8x32::new( + 25, 84, 86, 237, 15, 25, 247, 37, 97, 77, 124, 211, 71, 31, 112, 78, 71, 3, 68, 103, + 56, 251, 164, 254, 198, 72, 14, 7, 154, 42, 226, 35, + ); + let r = i64x4::new( + 2683891456212418329, + 4557395704426741567, + 4557430858734043967, + 2539795165049929535, + ); + + assert_eq!(r, transmute(lasx_xvsat_bu::<5>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsat_hu() { + let a = u16x16::new( + 50818, 7191, 19885, 24886, 23947, 902, 63438, 16327, 21304, 41986, 6658, 26825, 35878, + 54181, 37442, 24336, + ); + let r = i64x4::new( + 1970354902204423, + 1970354902204423, + 
1970354902204423, + 1970354902204423, + ); + + assert_eq!(r, transmute(lasx_xvsat_hu::<2>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsat_wu() { + let a = u32x8::new( + 2643833778, 2163840459, 3648859312, 2300494776, 1210790323, 4241633778, 1830707970, + 1058612721, + ); + let r = i64x4::new(270582939711, 270582939711, 270582939711, 270582939711); + + assert_eq!(r, transmute(lasx_xvsat_wu::<5>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsat_du() { + let a = u64x4::new( + 8558995131692178872, + 17439570087619166841, + 9621706324971219491, + 6096695286958361953, + ); + let r = i64x4::new(8796093022207, 8796093022207, 8796093022207, 8796093022207); + + assert_eq!(r, transmute(lasx_xvsat_du::<42>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvadda_b() { + let a = i8x32::new( + 25, 59, -110, -62, -36, -22, 27, -104, 32, 127, 92, 19, -127, -111, 2, 41, 37, 108, + -111, 108, -101, 89, -53, -16, 87, -111, 66, 68, 95, -47, 125, 105, + ); + let b = i8x32::new( + -121, 110, -17, 74, 16, -33, -80, 48, -69, 114, 9, -63, -38, 6, -82, -112, -105, 5, 61, + 119, 9, -72, 69, -21, 109, -14, -103, 72, -126, 41, -34, 60, + ); + let r = i64x4::new( + -7463811258668570222, + -7398158934950416027, + 2700648424200237454, + -6512388827583513148, + ); + + assert_eq!(r, transmute(lasx_xvadda_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvadda_h() { + let a = i16x16::new( + -7007, 10506, -11262, 28686, 22120, 22431, 1054, -2239, -28418, 24459, -8927, -15512, + 9064, 22935, 26563, 2466, + ); + let b = i16x16::new( + -1992, -19568, 12795, -27246, 14193, 19953, -3803, -27680, 2139, 30064, -7379, -12284, + 5720, -19123, 21658, -12768, + ); + let r = i64x4::new( + -2703182350329961689, + 8421470691639987673, + 7823948489959372637, + 4288196905584441792, + ); + + assert_eq!(r, transmute(lasx_xvadda_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvadda_w() { + let a = i32x8::new( + 1265529071, + -1075977129, + -583802219, + -13912299, + -172400466, + -972042514, + -260823873, + -1620748450, + ); + let b = i32x8::new( + 489335551, 1611173717, -476611840, -751628752, -192801793, 1467389657, -374333972, + 35803655, + ); + let r = i64x4::new( + -6905519068965954578, + 3287973778850882155, + -7969462678089069741, + 7114837115730115925, + ); + + assert_eq!(r, transmute(lasx_xvadda_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvadda_d() { + let a = i64x4::new( + 7814609303075513348, + -7772522798724755627, + -1147865382247844592, + -7562711493144146696, + ); + let b = i64x4::new( + 3766721551761496817, + 8105329332137326997, + -9194637465570314907, + 7351062589763608413, + ); + let r = i64x4::new( + -6865413218872541451, + -2568891942847468992, + -8104241225891392117, + -3532969990801796507, + ); + + assert_eq!(r, transmute(lasx_xvadda_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsadd_b() { + let a = i8x32::new( + 95, 7, -14, -94, -86, -102, -123, 76, -40, 78, -16, 71, -122, 75, 8, -59, 43, 71, -16, + -38, -67, -40, 97, 101, -45, -28, -58, -99, 48, -111, -128, 118, + ); + let b = i8x32::new( + -86, 59, 75, 107, -90, -1, 114, 4, -60, 20, -8, -67, 58, 47, 100, 122, -75, -106, -118, + -95, -44, 22, 76, 54, 90, 108, 113, 21, -92, -53, 125, -70, + ); + let r = i64x4::new( + 5834300617538748937, + 4570162687008858780, + 9187324073552698848, + 3530119333939728429, + ); + 
+ assert_eq!(r, transmute(lasx_xvsadd_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsadd_h() { + let a = i16x16::new( + 21287, 1075, 1515, 13634, 27666, -29218, 10797, -29531, -16877, -31125, 29749, 23913, + -6583, -15233, 14925, 1745, + ); + let b = i16x16::new( + 9900, -26262, -15712, 25834, 18751, -9376, 8538, -1589, -21802, 18049, 18837, -21370, + -11718, 2110, -13829, -19996, + ); + let r = i64x4::new( + 9223311063848417747, + -8759418229895430145, + 715931602406637568, + -5137195089227040637, + ); + + assert_eq!(r, transmute(lasx_xvsadd_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsadd_w() { + let a = i32x8::new( + 192209429, + -2001895259, + 1526351324, + 940020268, + -971929246, + -265649149, + 126711930, + 1060927451, + ); + let b = i32x8::new( + -1362410074, + 17289452, + 1453224925, + -157303455, + -1002635563, + -153598928, + 1744530306, + 450932350, + ); + let r = i64x4::new( + -8523817033391921221, + 3361743116011831295, + -1800656777305487305, + 6493388403303310332, + ); + + assert_eq!(r, transmute(lasx_xvsadd_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsadd_d() { + let a = i64x4::new( + 7784983044177669725, + 8101097656675707195, + 5701949277844824642, + -9115087610184891150, + ); + let b = i64x4::new( + -7435730805386005247, + -2620412598612541303, + -7972576523543653821, + 7444842305858583495, + ); + let r = i64x4::new( + 349252238791664478, + 5480685058063165892, + -2270627245698829179, + -1670245304326307655, + ); + + assert_eq!(r, transmute(lasx_xvsadd_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsadd_bu() { + let a = u8x32::new( + 25, 97, 235, 222, 176, 210, 161, 94, 48, 209, 231, 48, 45, 90, 187, 6, 29, 48, 193, + 158, 240, 147, 240, 248, 228, 195, 131, 114, 9, 239, 172, 211, + ); + let b = u8x32::new( + 156, 230, 197, 50, 226, 217, 198, 2, 133, 7, 31, 251, 185, 83, 103, 173, 4, 107, 100, + 3, 81, 209, 161, 88, 169, 211, 90, 7, 158, 153, 112, 221, + ); + let r = i64x4::new( + 6989586621679009717, + -5476467414210193227, + -1577084127, + -380207497217, + ); + + assert_eq!(r, transmute(lasx_xvsadd_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsadd_hu() { + let a = u16x16::new( + 18927, 31835, 27291, 15842, 30595, 45554, 31277, 2570, 50726, 18451, 33555, 31286, + 37571, 1090, 50630, 36004, + ); + let b = u16x16::new( + 51573, 3134, 27346, 11433, 45605, 6834, 26138, 61459, 26540, 3859, 63747, 9497, 47455, + 22235, 55919, 64188, + ); + let r = i64x4::new( + 7677464656203087871, + -423936190922293249, + -6967068626374950913, + -2766274561, + ); + + assert_eq!(r, transmute(lasx_xvsadd_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsadd_wu() { + let a = u32x8::new( + 2641259570, 2413939116, 2244295016, 1265788506, 4032439236, 2078944785, 2529147076, + 1095977188, + ); + let b = u32x8::new( + 1074491620, 785068578, 441575896, 2827260071, 654541549, 2711155200, 2667914280, + 1025335263, + ); + let r = i64x4::new( + -4707110644611425002, + -867234291869342912, + -1, + 9110967605937569791, + ); + + assert_eq!(r, transmute(lasx_xvsadd_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsadd_du() { + let a = u64x4::new( + 14430626347567901108, + 8966103699466030320, + 15088600594909856287, + 4617508821066205697, + ); + let b = u64x4::new( + 9949819222347987503, + 
1797352673890553460, + 93407820607851767, + 16329185982288463052, + ); + let r = i64x4::new(-1, -7683287700352967836, -3264735658191843562, -1); + + assert_eq!(r, transmute(lasx_xvsadd_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavg_b() { + let a = i8x32::new( + 1, -7, 51, 121, -46, 91, 117, 56, -128, -103, 77, -124, 47, -81, 71, -97, 9, -22, -45, + 81, 64, -36, 18, -57, 53, -23, -56, -113, 55, -76, -98, -89, + ); + let b = i8x32::new( + 116, 40, 94, 32, 108, -83, -72, 62, 118, 3, 75, 51, -64, 117, 106, -76, 98, 102, -74, + 83, -104, -25, 103, 87, -99, -120, 40, -83, -51, 73, 88, 19, + ); + let r = i64x4::new( + 4257595030195671098, + -6244220027603726597, + 1098000814288676917, + -2451086284962613015, + ); + + assert_eq!(r, transmute(lasx_xvavg_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavg_h() { + let a = i16x16::new( + 22420, 2514, -1496, 12197, 18773, 25141, -11922, 14759, 28272, 9957, -8329, -18095, + 14119, 4453, 29447, -17743, + ); + let b = i16x16::new( + -23665, 8821, -12487, 30493, -29228, -14701, 16266, 5372, 21222, 5396, -495, -4093, + 8979, 15419, 24369, -25475, + ); + let r = i64x4::new( + 6008334822825786769, + 2833054969603877780, + -3122420865543937877, + -6082277202109387491, + ); + + assert_eq!(r, transmute(lasx_xvavg_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavg_w() { + let a = i32x8::new( + 264220248, + 1806183666, + -744175589, + 1149257464, + 649257353, + 1343192175, + -1646288099, + 1777956369, + ); + let b = i32x8::new( + -490550100, + 1650015069, + 602037366, + -115507354, + -1351815309, + -919786860, + 1796894888, + -1823377644, + ); + let r = i64x4::new( + 7422130269685104002, + 2219961461567099464, + 909255992234993790, + -97541447405991454, + ); + + assert_eq!(r, transmute(lasx_xvavg_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavg_d() { + let a = i64x4::new( + -5353831456328489109, + 1116026769917166857, + -6482325223661420741, + 4644114914180465662, + ); + let b = i64x4::new( + -8278784043739101899, + 8898944017823987194, + 162737312931734425, + -3156875890654220898, + ); + let r = i64x4::new( + -6816307750033795504, + 5007485393870577025, + -3159793955364843158, + 743619511763122382, + ); + + assert_eq!(r, transmute(lasx_xvavg_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavg_bu() { + let a = u8x32::new( + 222, 174, 254, 188, 116, 111, 1, 67, 236, 108, 184, 99, 34, 41, 62, 74, 228, 117, 143, + 190, 202, 68, 177, 5, 102, 26, 144, 229, 66, 185, 137, 73, + ); + let b = u8x32::new( + 9, 86, 55, 74, 146, 206, 99, 36, 206, 46, 174, 95, 25, 21, 140, 91, 99, 120, 100, 243, + 231, 197, 230, 158, 188, 38, 162, 58, 130, 77, 72, 87, + ); + let r = i64x4::new( + 3689185332455703155, + 5937185894811520477, + 5893950604224067235, + 5794025379951354001, + ); + + assert_eq!(r, transmute(lasx_xvavg_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavg_hu() { + let a = u16x16::new( + 59347, 14794, 56762, 36383, 41235, 53425, 15726, 15850, 6947, 17893, 10811, 18470, + 35860, 14001, 21530, 58912, + ); + let b = u16x16::new( + 45476, 48517, 33041, 8160, 7865, 37717, 29068, 45168, 12673, 29576, 21, 26212, 20245, + 43416, 16626, 44166, + ); + let r = i64x4::new( + 6268922056724171963, + 8587616261834498022, + 6288455717791082066, + -3939723307751543404, + ); + + assert_eq!(r, 
transmute(lasx_xvavg_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavg_wu() { + let a = u32x8::new( + 1600834277, 4196831994, 2108873255, 518030497, 3166298163, 3812054340, 3824732684, + 1900211486, + ); + let b = u32x8::new( + 1499894424, 568816404, 3212845718, 500610814, 585554707, 2609103780, 7570780, 977655961, + ); + let r = i64x4::new( + -8212592065336791362, + 2187515559063158366, + -4657412007911203421, + 6180173283312674740, + ); + + assert_eq!(r, transmute(lasx_xvavg_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavg_du() { + let a = u64x4::new( + 16716089796022894912, + 10136836254171396504, + 5055029870739857077, + 1722276628667681589, + ); + let b = u64x4::new( + 2981839357822260236, + 4395528145348260085, + 9124113278861486873, + 17073319773492299474, + ); + let r = i64x4::new( + -8597779496786974042, + 7266182199759828294, + 7089571574800671975, + -9048945872629561085, + ); + + assert_eq!(r, transmute(lasx_xvavg_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavgr_b() { + let a = i8x32::new( + 70, 49, 125, -63, -42, -19, 98, -71, -39, -43, 62, -91, -109, -76, -2, 73, -82, -26, + 31, -13, -19, 61, -64, -122, -66, -36, 15, 102, 72, 18, -9, -30, + ); + let b = i8x32::new( + -101, 91, 109, 12, 107, -108, -99, 124, -72, -12, -23, -93, 0, -21, -65, 51, -90, -9, + 94, -109, -17, -42, -4, 45, -18, 41, 13, 6, 79, 39, 60, -14, + ); + let r = i64x4::new( + 1945767390385358577, + 4530569318912812489, + -2675689108017254486, + -1577916506278329386, + ); + + assert_eq!(r, transmute(lasx_xvavgr_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavgr_h() { + let a = i16x16::new( + -23160, -26916, 22577, 3623, -22521, -16865, 13203, 26275, -20646, 12156, -26885, + -1419, -20243, 28347, -3617, -21473, + ); + let b = i16x16::new( + 23255, 16173, -15467, -21396, 14626, -27747, 22216, -25899, 14208, 23641, 23787, 27175, + -6255, -22851, -20976, 28894, + ); + let r = i64x4::new( + -2501171370499178448, + 52993362325598357, + 3625109573325288301, + 1044782302812228671, + ); + + assert_eq!(r, transmute(lasx_xvavgr_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavgr_w() { + let a = i32x8::new( + -500594887, + -775813621, + -892322315, + -1910111140, + 573941213, + 1978372579, + 765765621, + 1237953660, + ); + let b = i32x8::new( + -541556784, + 538719952, + -1163583489, + 56482881, + -978953184, + -804071754, + 1958602350, + 1082613894, + ); + let r = i64x4::new( + -509154771300449403, + -3980636370258710790, + 2521791825760354559, + 4983380877656540978, + ); + + assert_eq!(r, transmute(lasx_xvavgr_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavgr_d() { + let a = i64x4::new( + -560846199430459987, + -6913595054902211026, + 1018627982636790344, + -4796205388927403814, + ); + let b = i64x4::new( + -1503583177859445318, + 2269985815924150324, + 8892159546918356586, + 5254840197509918769, + ); + let r = i64x4::new( + -1032214688644952652, + -2321804619489030351, + 4955393764777573465, + 229317404291257478, + ); + + assert_eq!(r, transmute(lasx_xvavgr_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavgr_bu() { + let a = u8x32::new( + 173, 186, 248, 144, 15, 66, 150, 226, 30, 14, 68, 38, 255, 233, 148, 172, 133, 29, 57, + 83, 110, 70, 253, 31, 175, 67, 167, 162, 54, 221, 53, 188, + ); + let b = 
u8x32::new( + 73, 42, 164, 127, 251, 107, 243, 43, 224, 179, 219, 9, 103, 205, 153, 157, 108, 89, 40, + 102, 99, 142, 142, 155, 155, 170, 95, 233, 116, 68, 9, 47, + ); + let r = i64x4::new( + -8663422077139783045, + -6514496773710388865, + 6757205291683625849, + 8511681618342279077, + ); + + assert_eq!(r, transmute(lasx_xvavgr_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavgr_hu() { + let a = u16x16::new( + 748, 52495, 35014, 19986, 51280, 1137, 33343, 41113, 44125, 44938, 39033, 4840, 8926, + 20195, 61480, 38149, + ); + let b = u16x16::new( + 49450, 21694, 20295, 62811, 50314, 20597, 51590, 51120, 20909, 7005, 34026, 24886, + 1353, 12358, 20971, 58564, + ); + let r = i64x4::new( + -6793842733113449973, + -5465780177655839123, + 4183719475707936517, + -4835281559523879916, + ); + + assert_eq!(r, transmute(lasx_xvavgr_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavgr_wu() { + let a = u32x8::new( + 725985028, 2564620547, 4042355808, 1169637821, 2193709333, 848280370, 2882464312, + 222274907, + ); + let b = u32x8::new( + 3005308642, 568881719, 1868204939, 3839859286, 1155339100, 2594656893, 1645672275, + 936913519, + ); + let r = i64x4::new( + 6729144879071593203, + -7688930946620981258, + 7393651477204383289, + 2489338192049926342, + ); + + assert_eq!(r, transmute(lasx_xvavgr_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvavgr_du() { + let a = u64x4::new( + 2554728288465437854, + 11449711494451353492, + 3273645684131385521, + 10253723919691993285, + ); + let b = u64x4::new( + 7302091036247388883, + 15155026503610587821, + 2157260177986334855, + 2575722548058380647, + ); + let r = i64x4::new( + 4928409662356413369, + -5144375074678580959, + 2715452931058860188, + 6414723233875186966, + ); + + assert_eq!(r, transmute(lasx_xvavgr_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssub_b() { + let a = i8x32::new( + 73, 33, 105, 6, 76, 22, -108, 53, 0, 81, 98, 121, -77, 54, 85, 86, 22, 5, -91, 107, + -24, 31, -120, 60, -115, 78, 110, 39, -112, 112, -39, 29, + ); + let b = i8x32::new( + -83, -99, 27, 4, -80, 31, 26, -29, -50, 39, -93, 6, 26, 105, -109, -36, 65, -14, -120, + 103, -50, -109, -38, -78, -38, 70, -79, -27, 61, 12, 39, 93, + ); + let r = i64x4::new( + 5945023633000660863, + 8826999853620865586, + 9200430838479393749, + -4561472970538678093, + ); + + assert_eq!(r, transmute(lasx_xvssub_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssub_h() { + let a = i16x16::new( + 30107, 16338, 20726, 3737, -28092, -2792, 11304, -3451, 32157, 18332, 16586, 2662, + 17942, -23482, 23033, -833, + ); + let b = i16x16::new( + -212, 2969, -3923, -10268, -14795, -2019, 863, -28427, -5609, 18395, -17614, -2870, + -1551, 14381, 1242, -29426, + ); + let r = i64x4::new( + 3942162916357797487, + 7030163866323241999, + 1557260308647608319, + 8048307602867637285, + ); + + assert_eq!(r, transmute(lasx_xvssub_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssub_w() { + let a = i32x8::new( + -638701442, + 124032353, + -1177957330, + 1822772002, + -624208464, + -690157477, + -752614768, + 1017525230, + ); + let b = i32x8::new( + 932721978, + -1730383729, + 2006657743, + -1118024603, + 1361667737, + -932072815, + -1709865093, + -66403119, + ); + let r = i64x4::new( + 7964656428089998148, + 9223372034707292160, + 1039018467419877143, + 
4655436811119524629, + ); + + assert_eq!(r, transmute(lasx_xvssub_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssub_d() { + let a = i64x4::new( + 8715609043439660533, + 6520891714816295946, + -9200207215764087611, + -4552769804861861814, + ); + let b = i64x4::new( + 8369052152539855925, + 2070139234200116232, + -8565613288638792421, + 6969198225778950763, + ); + let r = i64x4::new( + 346556890899804608, + 4450752480616179714, + -634593927125295190, + -9223372036854775808, + ); + + assert_eq!(r, transmute(lasx_xvssub_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssub_bu() { + let a = u8x32::new( + 194, 170, 115, 69, 137, 47, 83, 232, 208, 7, 239, 24, 252, 237, 181, 153, 99, 109, 110, + 137, 12, 246, 132, 6, 201, 93, 177, 189, 98, 6, 85, 252, + ); + let b = u8x32::new( + 192, 185, 64, 8, 157, 119, 247, 72, 81, 33, 0, 242, 154, 190, 235, 167, 199, 215, 118, + 14, 79, 208, 68, 149, 8, 111, 58, 97, 85, 219, 178, 240, + ); + let r = i64x4::new( + -6917529026614329342, + 52097968963711, + 18056182014935040, + 864691185841012929, + ); + + assert_eq!(r, transmute(lasx_xvssub_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssub_hu() { + let a = u16x16::new( + 32377, 48753, 23359, 60048, 51933, 60261, 16706, 5683, 42654, 19286, 27115, 5230, + 25323, 3004, 59060, 28377, + ); + let b = u16x16::new( + 20524, 46292, 39370, 44869, 11104, 28817, 18216, 21295, 15477, 23627, 5697, 53043, + 24168, 62463, 15113, 55444, + ); + let r = i64x4::new( + 4272508671652343373, + 2060754813, + 91989609572905, + 188750927758467, + ); + + assert_eq!(r, transmute(lasx_xvssub_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssub_wu() { + let a = u32x8::new( + 1657277873, 1330142084, 2851707029, 329302965, 4012116382, 3796717712, 1394210702, + 3853566063, + ); + let b = u32x8::new( + 3002534878, 3166207065, 1567450925, 39925211, 2740035937, 1015422746, 235666751, + 2928176588, + ); + let r = i64x4::new( + 0, + 1242867990904189288, + -6501173152938039235, + 3974517532346153551, + ); + + assert_eq!(r, transmute(lasx_xvssub_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssub_du() { + let a = u64x4::new( + 15530474406792892207, + 11041265010582297193, + 12958884950634485683, + 10554031950250935627, + ); + let b = u64x4::new( + 14455090273467103742, + 13018023957546859856, + 4721944463560386324, + 13428322516292168868, + ); + let r = i64x4::new(1075384133325788465, 0, 8236940487074099359, 0); + + assert_eq!(r, transmute(lasx_xvssub_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvabsd_b() { + let a = i8x32::new( + 77, 34, -55, -6, -27, 106, -19, 107, 7, -43, -15, 64, 88, -60, 98, 5, 123, -72, -69, + -120, -106, -29, -62, 112, -78, -24, 105, -79, 74, 24, -122, -33, + ); + let b = i8x32::new( + 70, -55, 105, 62, 94, -15, 120, -122, -62, 75, -50, -61, -74, -125, 109, 53, -51, -35, + -29, -26, 66, 19, -98, 51, 50, 111, 106, 64, 24, 86, -114, -90, + ); + let r = i64x4::new( + -1906296455511910137, + 3461932904704341573, + 4405699852347385262, + 4109603046844106624, + ); + + assert_eq!(r, transmute(lasx_xvabsd_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvabsd_h() { + let a = i16x16::new( + -3523, -20106, 11040, 6484, 22611, -2497, 28408, 18680, 14501, -17999, -17051, 5091, + 17047, -23076, 3361, 4856, + ); + let b = 
i16x16::new( + 15765, 31104, 9632, 30835, -6611, 20000, -27189, 15641, 6191, 28248, 28092, 28462, + -4315, -1294, -14727, 24445, + ); + let r = i64x4::new( + 6854203208551254872, + 855641242994831910, + 6578545571444236406, + 5513891007581016946, + ); + + assert_eq!(r, transmute(lasx_xvabsd_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvabsd_w() { + let a = i32x8::new( + -516201776, + -1265475612, + -789611388, + -170081681, + 903632669, + -211238418, + -1863976799, + 639146993, + ); + let b = i32x8::new( + 1884052123, + -78957215, + 260861474, + -2114421033, + -1460646598, + -1379633816, + 1900992494, + -2022565365, + ); + let r = i64x4::new( + 5096057713617598411, + 8350873930216305054, + 5018220025571183075, + -7014776540975538355, + ); + + assert_eq!(r, transmute(lasx_xvabsd_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvabsd_d() { + let a = i64x4::new( + -391271937360884965, + -20808483467978826, + 2531375025191050735, + -2026665653248710281, + ); + let b = i64x4::new( + 716104320672255601, + -518451966573772136, + 3032418447389694341, + -6748971658539956270, + ); + let r = i64x4::new( + 1107376258033140566, + 497643483105793310, + 501043422198643606, + 4722306005291245989, + ); + + assert_eq!(r, transmute(lasx_xvabsd_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvabsd_bu() { + let a = u8x32::new( + 167, 63, 182, 73, 179, 226, 126, 48, 51, 89, 114, 98, 233, 151, 164, 141, 121, 82, 125, + 131, 94, 231, 83, 187, 111, 196, 18, 11, 152, 164, 19, 164, + ); + let b = u8x32::new( + 204, 191, 64, 88, 65, 66, 113, 230, 140, 89, 240, 41, 98, 215, 60, 243, 232, 132, 39, + 170, 30, 165, 206, 56, 230, 91, 235, 13, 185, 191, 68, 138, + ); + let r = i64x4::new( + -5328426372363288539, + 7379218938975879257, + -8972504989300280721, + 1887319547440621943, + ); + + assert_eq!(r, transmute(lasx_xvabsd_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvabsd_hu() { + let a = u16x16::new( + 3423, 48528, 56740, 39409, 50360, 13926, 57000, 4567, 4452, 31543, 58373, 9298, 48132, + 51688, 31647, 52056, + ); + let b = u16x16::new( + 4223, 51844, 62479, 1974, 39743, 1068, 23170, 3816, 24418, 43609, 63727, 13263, 6596, + 17773, 11934, 45434, + ); + let r = i64x4::new( + -7909703671511514336, + 211533007095998841, + 1116071278703431166, + 1864011964690965056, + ); + + assert_eq!(r, transmute(lasx_xvabsd_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvabsd_wu() { + let a = u32x8::new( + 596511673, 1656018177, 862222472, 3855869253, 1555502903, 50646434, 688234186, + 2814498786, + ); + let b = u32x8::new( + 2976814235, 296937998, 3274139740, 128554952, 227946291, 3566260080, 3443244200, + 2459204000, + ); + let r = i64x4::new( + 5837204923827128546, + -2438051046589534252, + -3347298437440673788, + 1525979489064328670, + ); + + assert_eq!(r, transmute(lasx_xvabsd_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvabsd_du() { + let a = u64x4::new( + 12734602602054551239, + 14766664927105746582, + 15860998294904895250, + 6219187986984895141, + ); + let b = u64x4::new( + 14337911389010813068, + 18082222857282413983, + 12137634856997955567, + 8346674176989823087, + ); + let r = i64x4::new( + 1603308786956261829, + 3315557930176667401, + 3723363437906939683, + 2127486190004927946, + ); + + assert_eq!(r, transmute(lasx_xvabsd_du(transmute(a), transmute(b)))); +} 
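(Editorial sketch, not part of the vendored file: each expected `r` in these tests is the scalar operation applied per lane, with the 256-bit result then reinterpreted as i64x4 via `transmute` for the comparison. As a sanity check on the last test above, a minimal plain-Rust model of the unsigned-absolute-difference lane math — not the intrinsic itself.)

    fn main() {
        // First lane of test_lasx_xvabsd_du: |a0 - b0| on u64, then viewed as i64.
        let a0: u64 = 12734602602054551239;
        let b0: u64 = 14337911389010813068;
        assert_eq!(a0.abs_diff(b0) as i64, 1603308786956261829);
    }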
+ +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmul_b() { + let a = i8x32::new( + 79, -96, -64, -1, -115, -89, -42, 81, 83, -94, 126, -51, 60, -90, -52, 65, 113, 30, + -64, -32, -115, 18, -120, -103, 68, -52, -106, 124, -90, 23, 39, 46, + ); + let b = i8x32::new( + -85, 53, -41, 89, -85, -87, -95, 98, 86, 91, 64, 121, -108, 74, 124, 103, 27, -110, 66, + -68, -29, -83, -3, -62, 124, 30, -91, 77, -28, 116, -27, 64, + ); + let r = i64x4::new( + 186405908484464837, + 2869070799329859298, + -979486707244065557, + -9159357540886189840, + ); + + assert_eq!(r, transmute(lasx_xvmul_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmul_h() { + let a = i16x16::new( + -4021, 8043, -7726, -25122, -30015, -30658, -18708, -10900, 3772, -3578, -17492, + -13851, -17265, 32476, -4087, 27743, + ); + let b = i16x16::new( + -2689, -26491, 4625, 17707, 7226, 23738, 2364, -25740, 1919, 17707, 29523, -15101, + -9498, -8760, 352, -20751, + ); + let r = i64x4::new( + 6506226959995370549, + 1796983875076656058, + -7588815217799040188, + -7534790044979024262, + ); + + assert_eq!(r, transmute(lasx_xvmul_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmul_w() { + let a = i32x8::new( + 1226983252, + 1810325729, + -263694346, + -895831021, + -666287351, + 1386398263, + -1628946240, + -76075817, + ); + let b = i32x8::new( + 268813984, + 1729713250, + -1600000134, + 160164970, + 1783576517, + -2129626845, + 307974730, + -511240490, + ); + let r = i64x4::new( + 3987350480567897216, + -909423805039995588, + 1476209283271918829, + 1142495638330554240, + ); + + assert_eq!(r, transmute(lasx_xvmul_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmul_d() { + let a = i64x4::new( + 7081580607883685997, + 8110222974893566630, + -8608830426521534350, + 590950945391337126, + ); + let b = i64x4::new( + 5261749457268646376, + -3861654047048473926, + 2264171061650339978, + -2049567854949213368, + ); + let r = i64x4::new( + -9157092306373316664, + -1248560416451753828, + 7374339937678077300, + -3668010491661410128, + ); + + assert_eq!(r, transmute(lasx_xvmul_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmadd_b() { + let a = i8x32::new( + -80, 6, -31, 32, -90, -72, 112, 83, 57, 119, -115, 85, -124, 56, 112, 8, 55, -29, -86, + -43, -88, 94, 98, -85, 111, -93, -82, 53, 79, -43, 14, -67, + ); + let b = i8x32::new( + 86, -88, -20, -70, -85, 89, -29, -112, -123, -89, 29, 42, -11, -125, -93, -49, -27, -7, + 99, 68, 125, -84, -21, -114, 79, -118, 99, -23, 69, 9, -20, -112, + ); + let c = i8x32::new( + -63, 26, 78, 67, 81, 21, 10, -51, 114, -15, 89, -83, 83, -69, -105, -86, 92, 63, -57, + -19, 3, 118, -24, 53, 17, 70, 49, 96, -75, -120, -92, -112, + ); + let r = i64x4::new( + -6679394867387754874, + 9121453853276024435, + 1250494502005582467, + -4810234623069954130, + ); + + assert_eq!( + r, + transmute(lasx_xvmadd_b(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmadd_h() { + let a = i16x16::new( + -18216, 6658, -25854, 27669, 16377, -14455, 1886, -6575, 31234, 14625, 26195, -12640, + 24030, -29160, 29917, -29533, + ); + let b = i16x16::new( + -3405, 23202, -23415, -21889, -9055, -26344, -21723, -29614, -15925, -27403, -3911, + -6313, 18640, 2098, 7776, 25873, + ); + let c = i16x16::new( + -28853, -6876, -18951, -29568, 17346, 756, -1848, -28084, -18031, -29179, -17665, 5467, + -7564, -24294, -5418, -17877, + 
); + let r = i64x4::new( + 2275867314736517193, + 5956455014341383419, + 3282447490748182781, + -2270208808738554850, + ); + + assert_eq!( + r, + transmute(lasx_xvmadd_h(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmadd_w() { + let a = i32x8::new( + 631333548, + -711233206, + -373490054, + -1088004305, + 1976762993, + -1387656422, + -955329396, + -154134074, + ); + let b = i32x8::new( + -1871585382, + 1805289828, + -855267305, + -1685758538, + 1205523204, + -199185288, + 1115810744, + -1091019827, + ); + let c = i32x8::new( + -1280005623, + 719575493, + -616783227, + 1851306944, + 1226448706, + -1988503778, + 998289127, + -1282400946, + ); + let r = i64x4::new( + -2474464942478687466, + 1027640603165319277, + 8552064293631354233, + 4842015271998822292, + ); + + assert_eq!( + r, + transmute(lasx_xvmadd_w(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmadd_d() { + let a = i64x4::new( + -8550317712350613337, + 8202606384933985240, + 5087434227784990050, + -1267807070683885625, + ); + let b = i64x4::new( + 802127189675314302, + 3753081308686166762, + -8729512035384580104, + -6163460252766523953, + ); + let c = i64x4::new( + 9117516500379534748, + 7040045067230881407, + -6924119543016236368, + -3601551888108100797, + ); + let r = i64x4::new( + 74735811180856175, + -6992817346463866386, + -821701661344765982, + -5913164195617334796, + ); + + assert_eq!( + r, + transmute(lasx_xvmadd_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmsub_b() { + let a = i8x32::new( + 41, 66, 49, 41, -31, 101, 127, 22, -98, 62, 39, -62, -91, 97, 100, 46, 4, 17, 71, 25, + 127, 34, 34, -64, 56, -11, 109, -98, 39, -34, -124, -56, + ); + let b = i8x32::new( + -126, 107, 108, -102, -4, -15, -17, -100, 43, 106, -14, -106, -108, 12, 54, 116, -15, + -102, 74, 95, -5, -115, 63, 100, -47, -1, 43, -111, 18, -6, -33, -59, + ); + let c = i8x32::new( + -12, -61, 80, 77, 76, 74, -19, -82, 43, -87, 110, -104, 33, -78, -99, -79, 24, -83, -6, + 122, -25, -80, -114, 88, 127, -19, 122, -59, 54, 43, 103, 122, + ); + let r = i64x4::new( + 1025900500437025089, + -412631794493733787, + 6931094814234771308, + -1816111343100501367, + ); + + assert_eq!( + r, + transmute(lasx_xvmsub_b(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmsub_h() { + let a = i16x16::new( + 26038, 237, 16351, -25337, -23596, 9950, 32416, -11130, -4158, -30128, 4774, -23969, + 18009, 9294, -3126, -30265, + ); + let b = i16x16::new( + -31480, 9797, -14893, 24037, 11613, 4212, 22821, 26358, -744, -21778, -26335, 25179, + -6708, -1235, -24224, 19814, + ); + let c = i16x16::new( + -26405, -560, -18771, -10193, -26133, 18220, 11977, 15766, 19965, 5097, 6382, -14160, + 17216, 29647, -20172, -31904, + ); + let r = i64x4::new( + 2881334304583833566, + -2133902947871987083, + -1454770464836380918, + 5874888860683169625, + ); + + assert_eq!( + r, + transmute(lasx_xvmsub_h(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmsub_w() { + let a = i32x8::new( + -1934260879, + 1181160590, + -1986745, + -225146926, + 599588188, + 1708212146, + -1981989107, + 1701829445, + ); + let b = i32x8::new( + -763566835, + 214100032, + -67293570, + 1596390731, + -1705509662, + -1061894423, + -18782985, + 1095295438, + ); + let c = i32x8::new( + 333156491, + -310224012, + -1373786280, + 699045355, + 
681377550, + -1946631976, + 1564749118, + 996805551, + ); + let r = i64x4::new( + 362284194097715042, + -5652196781102231049, + 243945460745636608, + -6224637193866223557, + ); + + assert_eq!( + r, + transmute(lasx_xvmsub_w(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmsub_d() { + let a = i64x4::new( + -3841665993514658557, + 6022894223412086471, + -8518556207745298564, + -1430476343179717412, + ); + let b = i64x4::new( + 7897629235985733517, + 228540188827833305, + -8463927364436887671, + -8371521766374880332, + ); + let c = i64x4::new( + -4481659901844799958, + -4869069543228428543, + -327735423889799522, + -3356219160756661306, + ); + let r = i64x4::new( + 7809193441161400801, + 2981175878869326830, + 2247972583277073134, + -8100971496301761628, + ); + + assert_eq!( + r, + transmute(lasx_xvmsub_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvdiv_b() { + let a = i8x32::new( + 2, 48, -45, 96, 6, -14, 2, -26, -29, 13, -116, -94, -82, 97, -85, 21, -74, -3, -122, + -75, -114, -79, -14, -42, -40, -66, 107, 72, 117, -23, 55, 11, + ); + let b = i8x32::new( + -113, -102, -25, 23, 113, -81, -87, 61, -8, 115, 14, -87, -39, -62, -33, 117, -111, + 123, 30, 85, -119, -89, 37, 68, 93, 36, 94, 79, -50, 110, -128, -128, + ); + let r = i64x4::new(67174400, 843334041468931, 16515072, 1090921824000); + + assert_eq!(r, transmute(lasx_xvdiv_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvdiv_h() { + let a = i16x16::new( + -12734, -9855, -5625, -19685, -5760, 20073, -4828, 32152, -17118, -23694, 12801, + -32702, -21927, 29064, -255, 24493, + ); + let b = i16x16::new( + 5202, -19363, -28050, 14286, -31733, 14009, 1475, 5279, -16963, -26208, -32414, 583, + -21866, -8394, -11158, -24288, + ); + let r = i64x4::new( + -281474976645122, + 1970311952138240, + -15762598695796735, + -281470681939967, + ); + + assert_eq!(r, transmute(lasx_xvdiv_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvdiv_w() { + let a = i32x8::new( + -1639036870, + 1679737548, + -1853446119, + 1425169187, + 709689254, + 1564169372, + -368472440, + 754854064, + ); + let b = i32x8::new( + 809279458, + -211299601, + 1005342056, + 1721341232, + -194511872, + 199704853, + -196761589, + -1316660885, + ); + let r = i64x4::new(-25769803778, 4294967295, 34359738365, 1); + + assert_eq!(r, transmute(lasx_xvdiv_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvdiv_d() { + let a = i64x4::new( + -7822845930831810797, + 4993735058150674767, + 7948083854887733828, + -5125159230108645154, + ); + let b = i64x4::new( + 2343656432981471704, + -7268480484218017416, + -2152977508876073544, + -6907442353788163718, + ); + let r = i64x4::new(-3, 0, -3, 0); + + assert_eq!(r, transmute(lasx_xvdiv_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvdiv_bu() { + let a = u8x32::new( + 40, 120, 155, 70, 202, 73, 51, 248, 122, 27, 98, 122, 31, 221, 63, 177, 129, 222, 159, + 41, 95, 74, 144, 15, 252, 14, 101, 220, 155, 209, 168, 214, + ); + let b = u8x32::new( + 105, 3, 186, 90, 103, 16, 157, 200, 195, 15, 101, 16, 92, 118, 205, 221, 131, 139, 234, + 115, 14, 110, 40, 173, 4, 100, 228, 49, 164, 68, 238, 100, + ); + let r = i64x4::new( + 72061996379416576, + 1099629068544, + 844450699936000, + 144118486677848127, + ); + + assert_eq!(r, transmute(lasx_xvdiv_bu(transmute(a), transmute(b)))); 
+} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvdiv_hu() { + let a = u16x16::new( + 50698, 15156, 21232, 20163, 45596, 12286, 58595, 95, 55092, 17141, 32523, 54385, 48523, + 48676, 43699, 52279, + ); + let b = u16x16::new( + 11498, 6508, 15832, 27488, 24369, 64684, 6317, 20994, 2748, 14521, 46887, 35685, 40979, + 25137, 94, 32966, + ); + let r = i64x4::new(4295098372, 38654705665, 281474976776212, 283467841601537); + + assert_eq!(r, transmute(lasx_xvdiv_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvdiv_wu() { + let a = u32x8::new( + 2271275962, 1878803191, 1899241851, 435455463, 2545672438, 1798262264, 2100509405, + 2360750144, + ); + let b = u32x8::new( + 4032427811, 1883431317, 1741576561, 2070639342, 54934516, 2950464411, 621309259, + 1280987465, + ); + let r = i64x4::new(0, 1, 46, 4294967299); + + assert_eq!(r, transmute(lasx_xvdiv_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvdiv_du() { + let a = u64x4::new( + 275328165009035219, + 4227696010240224586, + 8090530403053432892, + 18434063998903182990, + ); + let b = u64x4::new( + 5339394187150320758, + 10250881649499684594, + 7311272300344996355, + 2859467035949281895, + ); + let r = i64x4::new(0, 0, 1, 6); + + assert_eq!(r, transmute(lasx_xvdiv_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhaddw_h_b() { + let a = i8x32::new( + -5, 56, 50, 120, 77, -103, 42, -127, 8, 14, 21, 38, 52, -56, 89, 77, 35, -121, 96, + -122, -68, 11, 79, -97, 3, 75, -125, 100, -38, 16, 97, -27, + ); + let b = i8x32::new( + 111, -97, -90, 28, -46, -48, -5, -21, -82, -34, 99, 31, -37, -82, 19, -57, -101, 13, + 47, 8, 125, 38, 118, -109, -122, -71, 47, -65, -74, -3, -41, 82, + ); + let r = i64x4::new( + -36873861897256793, + 27302673318019004, + 5911562916593442, + -18859072538017839, + ); + + assert_eq!(r, transmute(lasx_xvhaddw_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhaddw_w_h() { + let a = i16x16::new( + 503, 16837, 17816, -5134, -2110, 16197, 4755, 25985, 3954, -31560, 16582, 19389, + -15163, 24197, -23773, -18386, + ); + let b = i16x16::new( + -23093, -2745, 8695, 3948, 29248, 22668, 15341, -17908, 18023, -1280, 5749, -6270, + 2684, 12529, 9865, -12718, + ); + let r = i64x4::new( + 15298673502096, + 177493818519941, + 107971182840607, + -36597416302335, + ); + + assert_eq!(r, transmute(lasx_xvhaddw_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhaddw_d_w() { + let a = i32x8::new( + 1750963922, + 584909082, + 1421536823, + -1912125255, + -1415675154, + -950003373, + 85319168, + -762670446, + ); + let b = i32x8::new( + 459045461, + -2028594364, + 1976546319, + -755242326, + -53664060, + 861552329, + 642848731, + -407580162, + ); + let r = i64x4::new(1043954543, 64421064, -1003667433, -119821715); + + assert_eq!(r, transmute(lasx_xvhaddw_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhaddw_hu_bu() { + let a = u8x32::new( + 38, 74, 29, 69, 140, 185, 4, 140, 17, 27, 252, 79, 243, 186, 145, 220, 13, 122, 179, + 16, 98, 184, 199, 160, 74, 126, 80, 155, 7, 140, 148, 161, + ); + let b = u8x32::new( + 133, 115, 144, 226, 30, 38, 232, 188, 154, 67, 7, 165, 19, 149, 99, 178, 168, 65, 209, + 54, 133, 14, 77, 82, 70, 34, 115, 197, 56, 192, 38, 122, + ); + let r = i64x4::new( + 104709614768292047, + 89791398044631221, + 66710930999804194, + 56014362196705476, + ); + + 
assert_eq!(r, transmute(lasx_xvhaddw_hu_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhaddw_wu_hu() { + let a = u16x16::new( + 63778, 40631, 16392, 22225, 8863, 7513, 8207, 22318, 52096, 47974, 5062, 54405, 51728, + 26552, 52537, 29064, + ); + let b = u16x16::new( + 13712, 64264, 56403, 59007, 46671, 35207, 62888, 11353, 49037, 2930, 56459, 32449, + 28370, 14428, 62265, 12050, + ); + let r = i64x4::new( + 337704688604231, + 365956983477160, + 476157254400755, + 392255068231306, + ); + + assert_eq!(r, transmute(lasx_xvhaddw_wu_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhaddw_du_wu() { + let a = u32x8::new( + 3700951359, 1340423021, 2816770908, 613522875, 1598890202, 536370888, 825435814, + 1465472531, + ); + let b = u32x8::new( + 1643146315, 730247298, 3900765507, 744547675, 1943326068, 179507092, 214959309, + 1444692790, + ); + let r = i64x4::new(2983569336, 4514288382, 2479696956, 1680431840); + + assert_eq!(r, transmute(lasx_xvhaddw_du_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhsubw_h_b() { + let a = i8x32::new( + -110, 85, -53, -96, -5, 14, -71, 50, -128, -83, 57, -86, 65, 24, 32, -119, 59, -41, + -85, 22, -67, -124, -126, -18, 54, -36, 103, 81, 116, -79, -55, -52, + ); + let b = i8x32::new( + -15, -92, 68, 76, -101, -42, -21, -32, -36, 23, -114, -76, 40, 19, 111, -124, -29, + -110, -123, -123, 24, 35, 126, 25, -14, 6, -91, 78, 49, -69, 27, -22, + ); + let r = i64x4::new( + 19985221551915108, + -64457838384316463, + -40251557315215372, + -21955597927907350, + ); + + assert_eq!(r, transmute(lasx_xvhsubw_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhsubw_w_h() { + let a = i16x16::new( + 32475, -17580, 4965, -21648, -16988, -15947, 18483, -27381, -26195, 19027, 19784, + -13358, -6180, 27442, 23283, 1155, + ); + let b = i16x16::new( + 7640, 26084, 32525, 1062, -7851, 17013, -8159, 21593, 32263, -22862, 17816, 30577, + -11674, 14875, 26487, -22021, + ); + let r = i64x4::new( + -232666968384132, + -82553566404512, + -133887015531444, + -108800111503156, + ); + + assert_eq!(r, transmute(lasx_xvhsubw_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhsubw_d_w() { + let a = i32x8::new( + 1120555405, 606416783, 1862962829, 65716515, -720291245, 1995296165, 1877873639, + 383778576, + ); + let b = i32x8::new( + -2142481365, + -2015795383, + 110862808, + 1067722925, + 1036379333, + 1746215780, + -901547317, + -304263170, + ); + let r = i64x4::new(2748898148, -45146293, 958916832, 1285325893); + + assert_eq!(r, transmute(lasx_xvhsubw_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhsubw_hu_bu() { + let a = u8x32::new( + 113, 29, 201, 242, 134, 250, 176, 112, 14, 192, 71, 63, 59, 39, 230, 197, 232, 110, 2, + 134, 244, 44, 110, 200, 209, 99, 15, 169, 39, 126, 139, 207, + ); + let b = u8x32::new( + 235, 233, 194, 214, 34, 190, 122, 157, 241, 119, 67, 242, 183, 26, 163, 208, 6, 32, + 249, 49, 62, 56, 64, 107, 68, 140, 184, 157, 27, 232, 174, 226, + ); + let r = i64x4::new( + -2813822050959566, + 9851010004352975, + 38561998787379304, + 9289103727198239, + ); + + assert_eq!(r, transmute(lasx_xvhsubw_hu_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhsubw_wu_hu() { + let a = u16x16::new( + 24627, 1925, 40631, 41120, 48598, 56441, 57360, 63413, 60803, 9134, 1910, 34890, 8361, + 
20497, 16343, 44260, + ); + let b = u16x16::new( + 63771, 7054, 62761, 8243, 13185, 3930, 52006, 48295, 37094, 2357, 31496, 1199, 13321, + 56020, 36805, 30263, + ); + let r = i64x4::new( + -92943092347286, + 48992691988728, + 14581413941960, + 32018981198856, + ); + + assert_eq!(r, transmute(lasx_xvhsubw_wu_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhsubw_du_wu() { + let a = u32x8::new( + 1851655538, 2991049929, 4109504012, 1371213815, 2264711690, 1359668665, 2742473455, + 1279993359, + ); + let b = u32x8::new( + 4047783060, 556492643, 3984363807, 4250070195, 975052988, 1299555592, 2868269900, + 2929723348, + ); + let r = i64x4::new(-1056733131, -2613149992, 384615677, -1588276541); + + assert_eq!(r, transmute(lasx_xvhsubw_du_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmod_b() { + let a = i8x32::new( + -14, -64, -81, 32, -14, -85, 120, 64, 95, 126, -11, 38, 2, -53, 40, 54, -35, 41, 58, + -60, 86, -9, 57, -11, 34, -17, -81, 89, -55, 25, 84, -101, + ); + let b = i8x32::new( + -98, -114, 25, 100, -111, 71, 35, 63, -23, 3, 93, -41, -3, -48, 91, 95, 98, 92, -113, + -82, -81, 121, -35, 73, -83, -95, 75, 65, 26, 60, -124, -5, + ); + let r = i64x4::new( + 76546840437899506, + 3902645063778631683, + -786169480790529571, + -48385121157714142, + ); + + assert_eq!(r, transmute(lasx_xvmod_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmod_h() { + let a = i16x16::new( + 13568, -26495, 27958, 11226, -17868, -9288, -10627, -29659, -16286, -27756, 22645, + -14990, 1109, 782, 5976, -13268, + ); + let b = i16x16::new( + 22907, -30762, -26890, -2623, -3889, -8952, 27558, -27225, -1007, -2649, -19000, -1212, + 3583, -14136, -1124, 6289, + ); + let r = i64x4::new( + 206607222489298176, + -684874256681470216, + -125522180245094574, + -194216204870745003, + ); + + assert_eq!(r, transmute(lasx_xvmod_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmod_w() { + let a = i32x8::new( + 1309045772, + -1137265851, + -1474148809, + -826641461, + 517262391, + -454945903, + -2059227752, + 1033836629, + ); + let b = i32x8::new( + 1742453362, -859625876, 711512169, 963835525, 1823286802, 1062091570, 1215420851, + -845753957, + ); + let r = i64x4::new( + -1192454611378211828, + -3550398036268816631, + -1953977774316925897, + 807808928635455307, + ); + + assert_eq!(r, transmute(lasx_xvmod_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmod_d() { + let a = i64x4::new( + 1878041523555568774, + 1556025246870009445, + 8042729508142516845, + -3048989907394276239, + ); + let b = i64x4::new( + 4139731099187900579, + -5256541293724606275, + -289001035147795771, + -6358290177153594057, + ); + let r = i64x4::new( + 1878041523555568774, + 1556025246870009445, + 239701559152031028, + -3048989907394276239, + ); + + assert_eq!(r, transmute(lasx_xvmod_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmod_bu() { + let a = u8x32::new( + 124, 195, 23, 51, 29, 150, 162, 114, 37, 233, 71, 130, 185, 243, 82, 178, 55, 114, 198, + 194, 51, 128, 183, 135, 254, 147, 93, 254, 157, 231, 225, 75, + ); + let b = u8x32::new( + 4, 234, 86, 5, 151, 127, 208, 171, 229, 154, 21, 203, 87, 142, 153, 152, 109, 75, 195, + 182, 135, 251, 242, 45, 15, 229, 168, 223, 89, 83, 178, 220, + ); + let r = i64x4::new( + 8260190079890735872, + 1896689493177028389, + 51650877471270711, + 5417620637589803790, 
+ ); + + assert_eq!(r, transmute(lasx_xvmod_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmod_hu() { + let a = u16x16::new( + 59302, 64062, 17665, 34634, 39674, 40771, 56476, 39054, 20128, 46806, 28975, 5092, + 32039, 65514, 52991, 10995, + ); + let b = u16x16::new( + 30365, 10559, 8088, 37622, 54157, 864, 21095, 43558, 39181, 49555, 45853, 63130, 49482, + 1077, 5568, 1505, + ); + let r = i64x4::new( + -8698133335059959543, + -7453958975338079494, + 1433395031155560096, + 129490854556368167, + ); + + assert_eq!(r, transmute(lasx_xvmod_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmod_wu() { + let a = u32x8::new( + 2536195964, 1025991305, 145727133, 1179968501, 2535376324, 2624321769, 500804646, + 3445505165, + ); + let b = u32x8::new( + 4283722185, 726568518, 2648066980, 2591107739, 3836915245, 1768721904, 1082904228, + 128214904, + ); + let r = i64x4::new( + 1286011080378369916, + 5067926122250870429, + 3674773441172391364, + 480682694340619302, + ); + + assert_eq!(r, transmute(lasx_xvmod_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmod_du() { + let a = u64x4::new( + 3050922509882516945, + 14221067967600195195, + 8310753426098198776, + 150087784552479859, + ); + let b = u64x4::new( + 9108987739022803721, + 14892726191598876390, + 10175125705243076843, + 8880022576671073801, + ); + let r = i64x4::new( + 3050922509882516945, + -4225676106109356421, + 8310753426098198776, + 150087784552479859, + ); + + assert_eq!(r, transmute(lasx_xvmod_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvrepl128vei_b() { + let a = i8x32::new( + 14, 7, 83, 99, -72, -90, 66, -53, 33, 27, -21, 110, -96, -58, -96, 54, -73, 74, -33, + 51, -15, -108, -39, 124, 124, -74, -17, -17, -41, 84, 46, -73, + ); + let r = i64x4::new( + 2387225703656530209, + 2387225703656530209, + 8970181431921507452, + 8970181431921507452, + ); + + assert_eq!(r, transmute(lasx_xvrepl128vei_b::<8>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvrepl128vei_h() { + let a = i16x16::new( + 2674, -3702, -21458, 12674, 26270, 949, -26647, 9913, 30933, 30654, -32697, -13873, + 16165, -5608, 18102, -20233, + ); + let r = i64x4::new( + 3567468290076979586, + 3567468290076979586, + -3904680457625679409, + -3904680457625679409, + ); + + assert_eq!(r, transmute(lasx_xvrepl128vei_h::<3>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvrepl128vei_w() { + let a = i32x8::new( + -64196701, + 1709481199, + -1911955655, + 1777845271, + 1233260806, + -309058551, + -557473503, + -1179212061, + ); + let r = i64x4::new( + 7342165844541349103, + 7342165844541349103, + -1327396365108239351, + -1327396365108239351, + ); + + assert_eq!(r, transmute(lasx_xvrepl128vei_w::<1>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvrepl128vei_d() { + let a = i64x4::new( + 5505097689447100650, + -5456987454315761481, + 4427502889722976813, + 8082072270131265608, + ); + let r = i64x4::new( + 5505097689447100650, + 5505097689447100650, + 4427502889722976813, + 4427502889722976813, + ); + + assert_eq!(r, transmute(lasx_xvrepl128vei_d::<0>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickev_b() { + let a = i8x32::new( + 68, 32, 62, -48, -57, 81, -17, -49, 89, 83, 84, -17, -84, 27, 125, 34, 45, 22, -76, + -126, -58, -15, 52, 46, -101, -120, -128, -63, 125, -119, 62, -25, + ); + let b = 
i8x32::new( + -18, 6, -55, 4, 74, 5, 59, 34, 92, 70, 29, -38, 91, 22, 15, 54, 5, -31, -103, -121, + -83, 48, -87, -100, 69, 89, -111, -61, 66, 85, 5, 122, + ); + let r = i64x4::new( + 1106510415418542574, + 9055705695986859588, + 379025047038040325, + 4502896606534087725, + ); + + assert_eq!(r, transmute(lasx_xvpickev_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickev_h() { + let a = i16x16::new( + 13779, -9769, -21673, -32164, -29136, -24643, -35, -10237, -15874, -1630, -366, -22027, + -18176, 10211, -7522, 20788, + ); + let b = i16x16::new( + 16573, -27194, 21452, -4952, 10891, -6280, -31016, -14088, -21903, -8934, 20641, 23162, + -12223, 6236, -15855, -20126, + ); + let r = i64x4::new( + -8730181099762990915, + -9695284500679213, + -4462556776803227023, + -2117051360895385090, + ); + + assert_eq!(r, transmute(lasx_xvpickev_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickev_w() { + let a = i32x8::new( + -946752951, + -207147822, + -193366329, + -1481453777, + -750923229, + -575660669, + -1037215364, + 1221718353, + ); + let b = i32x8::new( + -1468110932, + -1007107613, + 1371137124, + 1715394094, + -920814431, + 907354058, + 597912747, + 1796030124, + ); + let r = i64x4::new( + 5888989108738353068, + -830502055854362039, + 2568015697600674977, + -4454806063744691677, + ); + + assert_eq!(r, transmute(lasx_xvpickev_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickev_d() { + let a = i64x4::new( + -94428288198650872, + 4107006669052123351, + 1952973857169882715, + -3468095864189526981, + ); + let b = i64x4::new( + -2104254403922616194, + -5215534061403539132, + 4917599455110663395, + -3171208575864229825, + ); + let r = i64x4::new( + -2104254403922616194, + -94428288198650872, + 4917599455110663395, + 1952973857169882715, + ); + + assert_eq!(r, transmute(lasx_xvpickev_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickod_b() { + let a = i8x32::new( + -56, -8, -6, -10, 108, -8, 122, 120, -75, -26, -47, 2, -35, -87, -61, 70, -24, -48, + 125, 19, -66, 42, -2, -49, -94, -84, -63, 74, -45, -54, -120, 56, + ); + let b = i8x32::new( + -65, -120, -46, -90, -108, -41, -28, -32, -125, -114, -59, 122, -3, 76, -67, -50, -59, + -94, 83, 122, -100, 12, -81, -57, 6, 29, 6, 85, -94, -36, -30, -43, + ); + let r = i64x4::new( + -3581352849590212984, + 5091604042614372088, + -3036458462372660574, + 4092165317489988560, + ); + + assert_eq!(r, transmute(lasx_xvpickod_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickod_h() { + let a = i16x16::new( + -31000, 26625, -24749, -26219, 27675, -16099, 12139, 4936, 17198, 8639, 15258, 14842, + -6785, 3344, 2053, 21006, + ); + let b = i16x16::new( + -1278, -30287, -424, 21484, 7821, 21393, 23139, -7886, 2473, 16757, -29424, 14324, + 15035, 18736, -9314, 7772, + ); + let r = i64x4::new( + -2219619782696859215, + 1389572817918715905, + 2187703990441230709, + 5912677724127371711, + ); + + assert_eq!(r, transmute(lasx_xvpickod_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickod_w() { + let a = i32x8::new( + 2143199982, + -991627533, + 1630737785, + -175139906, + -976073052, + -1793301951, + -834831207, + 3306425, + ); + let b = i32x8::new( + 1564508527, + 626529718, + 264606833, + -1943354886, + 1166719003, + -869473680, + 1896581238, + -1078061273, + ); + let r = i64x4::new( + -8346645679265278538, 
+ -752220165191174413, + -4630237907193634192, + 14200989743342145, + ); + + assert_eq!(r, transmute(lasx_xvpickod_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickod_d() { + let a = i64x4::new( + 4767160600123418734, + 8001080746285135394, + -2817760190229042067, + 3923084493864153244, + ); + let b = i64x4::new( + -3317389585990069371, + 8793937455278562227, + 7703929803523851571, + 5524330706927878132, + ); + let r = i64x4::new( + 8793937455278562227, + 8001080746285135394, + 5524330706927878132, + 3923084493864153244, + ); + + assert_eq!(r, transmute(lasx_xvpickod_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvilvh_b() { + let a = i8x32::new( + -72, 73, -43, 126, -52, 83, 85, -79, -99, 67, 27, 28, 39, -21, -74, -30, 61, 83, 80, + -18, 48, 18, 55, 82, 107, -26, -7, 17, 91, -87, 97, 84, + ); + let b = i8x32::new( + -3, -33, -12, -52, 73, 87, -102, -3, -114, -95, -78, 65, -102, 36, 40, 102, 102, 115, + 48, -41, 109, -110, -6, 9, -8, 86, 119, -37, 25, 96, 23, 62, + ); + let r = i64x4::new( + 2035938959000968590, + -2132817086653388902, + 1286896411905256440, + 6070396101995813657, + ); + + assert_eq!(r, transmute(lasx_xvilvh_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvilvh_h() { + let a = i16x16::new( + -28753, 23947, 10110, -8166, 18168, -1619, 12029, 10309, 22060, -11658, 8123, 22354, + 23552, 27450, -16412, 24672, + ); + let b = i16x16::new( + -31442, 23864, 15251, -12304, -23752, -1685, -10720, 21446, 19318, 27618, 10892, -9393, + -29179, 13870, 16716, 10233, + ); + let r = i64x4::new( + -455433748147035336, + 2901817645567170080, + 7726547683447442949, + 6944594579025051980, + ); + + assert_eq!(r, transmute(lasx_xvilvh_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvilvh_w() { + let a = i32x8::new( + 678797694, + -1852295486, + -632882964, + -375269950, + 1655683337, + 562516909, + -759600517, + 595568887, + ); + let b = i32x8::new( + -2114925053, + 1623015448, + -398485927, + -271020427, + -284878929, + -1558239614, + -902548533, + 1778292534, + ); + let r = i64x4::new( + -2718211628679063975, + -1611772158397608331, + -3262459375147273269, + 2557948893958412086, + ); + + assert_eq!(r, transmute(lasx_xvilvh_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvilvh_d() { + let a = i64x4::new( + -5521345585808929096, + -2494281556296927351, + 2989419257337371241, + -1576924492614617443, + ); + let b = i64x4::new( + -7666029279891695247, + -1067545656448973211, + 7271996920619620214, + -3924745280397255469, + ); + let r = i64x4::new( + -1067545656448973211, + -2494281556296927351, + -3924745280397255469, + -1576924492614617443, + ); + + assert_eq!(r, transmute(lasx_xvilvh_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvilvl_b() { + let a = i8x32::new( + -79, -60, -80, 23, 8, 83, -52, -72, 18, 98, 69, -81, -15, -95, 68, -38, 108, -9, -95, + 110, 63, -24, -106, -24, 78, -109, 117, 10, 36, 13, -9, -70, + ); + let b = i8x32::new( + -4, -37, -54, -19, 91, 52, 111, -6, 23, 24, 50, 18, 58, 109, 35, -89, -55, -31, 21, + -28, 76, 16, -53, -16, 73, 97, -99, 70, 75, -124, 75, 70, + ); + let r = i64x4::new( + 1724228617285382652, + -5117553248043792293, + 7990688754587233481, + -1661662459983806644, + ); + + assert_eq!(r, transmute(lasx_xvilvl_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn 
test_lasx_xvilvl_h() { + let a = i16x16::new( + 16116, 7715, 3432, 24398, -2759, -24490, -19436, 8863, -24282, 23416, -26870, -3179, + -23599, -9862, 20524, 10277, + ); + let b = i16x16::new( + -29120, 15023, -2814, 7040, -19198, -5516, 30715, 18311, -1346, 32030, -17709, -30250, + 21978, 26007, -6093, 28687, + ); + let r = i64x4::new( + 2171643969672613440, + 6867456718581331202, + 6591155625162898110, + -894657396213105965, + ); + + assert_eq!(r, transmute(lasx_xvilvl_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvilvl_w() { + let a = i32x8::new( + 1489997232, + 1342252220, + 136381167, + 288285197, + -1772559171, + 1615944068, + 1604328217, + -70958228, + ); + let b = i32x8::new( + -794555105, + 44816804, + 2089609888, + 313909292, + 2017363432, + -1414750261, + 1773836405, + 138829633, + ); + let r = i64x4::new( + 6399489386070936863, + 5764929387928213924, + -7613083667652508184, + 6940426927105417163, + ); + + assert_eq!(r, transmute(lasx_xvilvl_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvilvl_d() { + let a = i64x4::new( + 2785967349713819381, + 4295622653064831557, + -2688716944239585727, + 1495201372757695383, + ); + let b = i64x4::new( + -6882080563044023861, + 8040350606767129885, + 9211364387423765025, + -7760991016985753125, + ); + let r = i64x4::new( + -6882080563044023861, + 2785967349713819381, + 9211364387423765025, + -2688716944239585727, + ); + + assert_eq!(r, transmute(lasx_xvilvl_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpackev_b() { + let a = i8x32::new( + 34, -14, -37, 93, 107, -43, -84, 47, -2, 72, -44, -4, -21, -45, 91, 44, -67, 47, 78, + -88, -77, 54, -48, -4, -115, 28, 45, -112, -16, -93, -125, 86, + ); + let b = i8x32::new( + 45, -46, 115, 63, -60, -89, 34, 1, -32, 96, -41, -112, 72, 24, 68, 64, 65, -60, 104, + -83, -54, 125, -86, 98, -18, -128, 68, -66, -17, 92, 8, 64, + ); + let r = i64x4::new( + -6043149256738266579, + 6576640053908864736, + -3410716086299476671, + -9004682544879989266, + ); + + assert_eq!(r, transmute(lasx_xvpackev_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpackev_h() { + let a = i16x16::new( + -31926, 14925, 3993, -25807, -28395, 26414, 8241, 24589, -2983, -24679, 19318, 9614, + 10323, 27545, -18762, -18536, + ); + let b = i16x16::new( + -7985, 4641, -22978, 7805, 3248, 14824, -30918, 8002, 2172, -19190, -6029, 4840, 24125, + 16864, 9543, -919, + ); + let r = i64x4::new( + 1124112369426555087, + 2319783968684444848, + 5437789184814811260, + -5280992525495869891, + ); + + assert_eq!(r, transmute(lasx_xvpackev_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpackev_w() { + let a = i32x8::new( + -332151772, + 1303690878, + 1282065842, + -1700272560, + -443102472, + 2142454870, + 78857966, + -1548128347, + ); + let b = i32x8::new( + -804493639, + 452785364, + -1917157806, + -914796730, + -2002581887, + -390090579, + 927546388, + 154785025, + ); + let r = i64x4::new( + -1426580994557974855, + 5506430865086512722, + -1903110623724370303, + 338692385926626324, + ); + + assert_eq!(r, transmute(lasx_xvpackev_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpackev_d() { + let a = i64x4::new( + 6553071732696091666, + 6908931613033995721, + -3601691172781761847, + -4565881074922016381, + ); + let b = i64x4::new( + -4424638855877852796, + -3616236802390284562, + 
-8253892234265412575, + 6668303162003192752, + ); + let r = i64x4::new( + -4424638855877852796, + 6553071732696091666, + -8253892234265412575, + -3601691172781761847, + ); + + assert_eq!(r, transmute(lasx_xvpackev_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpackod_b() { + let a = i8x32::new( + 62, -60, -127, 84, -107, -106, -66, -119, -110, 28, 57, 97, 19, 34, -37, -7, -42, -117, + 104, -27, 81, 106, -19, 80, -20, 127, -104, 54, -37, 108, -37, 51, + ); + let b = i8x32::new( + -126, 96, -65, -4, 53, 69, -10, -33, 102, 21, -35, 115, -63, 15, -13, -3, 25, 100, 22, + -95, -81, 17, -18, 101, -67, -115, 82, 4, 123, -94, 98, 91, + ); + let r = i64x4::new( + -8511919546184186784, + -433152539702911979, + 5793153120781568868, + 3700670962761760653, + ); + + assert_eq!(r, transmute(lasx_xvpackod_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpackod_h() { + let a = i16x16::new( + -15659, -944, 746, -2159, -14115, 32333, 7687, 7300, 16484, -5418, 17483, -23753, + -11433, 8096, 6365, -19623, + ); + let b = i16x16::new( + -16063, 24227, 15870, -31985, -14423, 10575, -5597, -29174, 8408, 3527, 9997, 27250, + 16855, -32478, -12854, 24292, + ); + let r = i64x4::new( + -607560370037432669, + 2054923505707592015, + -6685758080009499193, + -5523279134117035742, + ); + + assert_eq!(r, transmute(lasx_xvpackod_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpackod_w() { + let a = i32x8::new( + -842203551, + -1271389188, + -2068525802, + -1822181077, + -986051686, + -837897746, + 37690010, + -1697819510, + ); + let b = i32x8::new( + 224471764, + -768842241, + -1859806928, + 1498474664, + -223957810, + 2079941216, + -338745357, + -2090020855, + ); + let r = i64x4::new( + -5460574979421870593, + -7826208131606583128, + -3598743414382173600, + -7292079267755798519, + ); + + assert_eq!(r, transmute(lasx_xvpackod_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpackod_d() { + let a = i64x4::new( + -7495668983396862169, + 8274812346114337628, + 4379006400301575850, + -8628096693516187272, + ); + let b = i64x4::new( + -8614497367106654999, + -7004520942966577002, + 5232114663469258860, + 5306174777811604017, + ); + let r = i64x4::new( + -7004520942966577002, + 8274812346114337628, + 5306174777811604017, + -8628096693516187272, + ); + + assert_eq!(r, transmute(lasx_xvpackod_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvshuf_b() { + let a = i8x32::new( + 39, -115, -21, 29, -109, -123, 49, 7, 120, 96, 121, 123, -87, 122, -27, 5, -103, -90, + -93, 98, -37, -100, 93, 27, -86, 15, -22, -80, -5, -16, 124, 124, + ); + let b = i8x32::new( + -102, 106, 26, -77, 48, 65, 21, -98, 122, -73, 124, -79, 94, 69, 52, -84, -21, -99, + -41, 63, -91, 26, -63, 44, -37, -5, -99, 53, -126, -109, -61, -55, + ); + let c = i8x32::new( + 0, 27, 12, 22, 17, 20, 12, 27, 24, 7, 29, 9, 30, 3, 21, 25, 25, 15, 16, 11, 11, 12, 9, + 11, 29, 16, 7, 30, 18, 12, 8, 10, + ); + let r = i64x4::new( + 8889704949103885210, + 6955162998750748280, + 3889845868208703759, + -7071915151180654096, + ); + + assert_eq!( + r, + transmute(lasx_xvshuf_b(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvshuf_h() { + let a = i16x16::new(14, 0, 11, 10, 2, 6, 5, 1, 3, 12, 4, 7, 10, 8, 10, 4); + let b = i16x16::new( + -21254, 15426, -9904, -9348, 19843, 4700, -18790, 16378, -12463, 13093, 1534, -947, 
+ -22603, -31524, -24301, -13577, + ); + let c = i16x16::new( + -6824, -21705, 6609, -73, 752, 8612, -13615, 29408, 31778, -1056, 20474, 23005, -10590, + 8605, -3153, 16014, + ); + let r = i64x4::new( + -2787486839872112998, + -6109377377843734063, + 4507776271131171293, + -2980813411407821314, + ); + + assert_eq!( + r, + transmute(lasx_xvshuf_h(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvshuf_w() { + let a = i32x8::new(6, 4, 1, 5, 3, 0, 3, 2); + let b = i32x8::new( + 112260284, + 143215906, + -519532509, + 2126848278, + -1874926296, + 888441697, + -716493665, + -1989603791, + ); + let c = i32x8::new( + 174486498, + 1186503117, + -1753459384, + 1078106035, + -2055158107, + 2071085725, + 1120609144, + -109951450, + ); + let r = i64x4::new( + 482154252195106851, + 615107633723513293, + -8826836853489252826, + 4812979629263570470, + ); + + assert_eq!( + r, + transmute(lasx_xvshuf_w(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvshuf_d() { + let a = i64x4::new(0, 1, 2, 3); + let b = i64x4::new( + -4818789571452434899, + -1419914372991806078, + -1036924962456047190, + 5694315469710360861, + ); + let c = i64x4::new( + 6580926913588532380, + -6246203397488305553, + -6030997396381573391, + -9089767205636240503, + ); + let r = i64x4::new( + 6580926913588532380, + -6246203397488305553, + -1036924962456047190, + 5694315469710360861, + ); + + assert_eq!( + r, + transmute(lasx_xvshuf_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvand_v() { + let a = u8x32::new( + 90, 203, 15, 155, 63, 105, 53, 48, 190, 209, 178, 76, 210, 20, 95, 140, 100, 15, 124, + 254, 188, 84, 233, 191, 139, 236, 35, 122, 198, 9, 3, 147, + ); + let b = u8x32::new( + 213, 245, 251, 19, 199, 6, 225, 234, 198, 129, 17, 8, 53, 155, 124, 177, 193, 194, 146, + 194, 233, 18, 7, 81, 49, 91, 33, 177, 131, 65, 221, 245, + ); + let r = i64x4::new( + 2315131713829454160, + -9197458677956574842, + 1225278890617864768, + -7998109804568426495, + ); + + assert_eq!(r, transmute(lasx_xvand_v(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvandi_b() { + let a = u8x32::new( + 76, 191, 179, 169, 134, 148, 220, 33, 48, 114, 218, 175, 149, 53, 89, 64, 173, 218, + 209, 46, 131, 153, 196, 101, 69, 5, 138, 207, 219, 29, 3, 11, + ); + let r = i64x4::new( + 2381282727478636300, + 2573978984653344, + 2667266788571548205, + 793492300495455493, + ); + + assert_eq!(r, transmute(lasx_xvandi_b::<47>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvor_v() { + let a = u8x32::new( + 125, 60, 243, 199, 224, 172, 254, 103, 105, 229, 245, 138, 160, 89, 141, 68, 218, 162, + 229, 242, 225, 91, 142, 124, 4, 158, 13, 29, 31, 24, 19, 236, + ); + let b = u8x32::new( + 61, 24, 19, 82, 93, 44, 145, 86, 125, 230, 60, 205, 17, 204, 228, 220, 145, 189, 138, + 34, 184, 52, 178, 93, 142, 223, 59, 0, 197, 149, 61, 209, + ); + let r = i64x4::new( + 8646820015824387197, + -2527120060116506755, + 9060820211815399387, + -198266276987019378, + ); + + assert_eq!(r, transmute(lasx_xvor_v(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvori_b() { + let a = u8x32::new( + 224, 64, 88, 211, 150, 151, 191, 121, 45, 29, 78, 44, 95, 182, 208, 27, 245, 89, 219, + 195, 171, 1, 240, 194, 102, 138, 54, 60, 40, 239, 106, 1, + ); + let r = i64x4::new( + 9079248013888353524, + 9220265364544191869, + 
-651766303824052747, + 8466485259632311926, + ); + + assert_eq!(r, transmute(lasx_xvori_b::<116>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvnor_v() { + let a = u8x32::new( + 76, 54, 61, 63, 251, 146, 243, 33, 217, 111, 210, 198, 26, 170, 74, 175, 96, 81, 208, + 187, 214, 194, 59, 158, 142, 191, 224, 234, 79, 178, 30, 115, + ); + let b = u8x32::new( + 188, 24, 29, 204, 122, 22, 58, 38, 82, 168, 2, 213, 73, 48, 85, 251, 211, 186, 195, 15, + 123, 225, 156, 253, 77, 213, 172, 132, 177, 163, 80, 23, + ); + let r = i64x4::new( + -2881062395696725757, + 45112567624699940, + 18045185911686156, + -8601510250130767824, + ); + + assert_eq!(r, transmute(lasx_xvnor_v(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvnori_b() { + let a = u8x32::new( + 111, 178, 133, 23, 105, 149, 64, 248, 248, 8, 96, 98, 70, 20, 213, 175, 56, 216, 223, + 118, 46, 113, 0, 12, 209, 39, 73, 77, 16, 194, 218, 171, + ); + let r = i64x4::new( + 440871273092500496, + 5767503740212762118, + 5935197095815284294, + 6053994920729270286, + ); + + assert_eq!(r, transmute(lasx_xvnori_b::<161>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvxor_v() { + let a = u8x32::new( + 126, 139, 80, 168, 116, 128, 183, 120, 15, 152, 183, 62, 51, 179, 32, 150, 207, 108, + 88, 207, 22, 73, 189, 112, 204, 236, 216, 24, 10, 70, 249, 168, + ); + let b = u8x32::new( + 3, 89, 57, 121, 152, 63, 89, 15, 254, 77, 130, 223, 192, 140, 229, 207, 202, 154, 208, + 62, 3, 30, 110, 85, 8, 137, 208, 97, 40, 65, 148, 234, + ); + let r = i64x4::new( + 8642055758817120893, + 6468646756475590129, + 2725617951247496709, + 4786489823605581252, + ); + + assert_eq!(r, transmute(lasx_xvxor_v(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvxori_b() { + let a = u8x32::new( + 36, 245, 58, 172, 188, 20, 51, 56, 127, 7, 39, 87, 209, 54, 137, 206, 217, 81, 137, 48, + 141, 135, 84, 138, 252, 157, 45, 234, 89, 34, 196, 168, + ); + let r = i64x4::new( + -8394526022023166313, + 9023671463178450124, + 4172361022876344938, + 1979210996964535887, + ); + + assert_eq!(r, transmute(lasx_xvxori_b::<179>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitsel_v() { + let a = u8x32::new( + 69, 119, 244, 57, 103, 127, 203, 220, 144, 88, 221, 99, 13, 153, 253, 10, 8, 78, 153, + 186, 144, 233, 66, 26, 137, 170, 201, 216, 251, 59, 188, 201, + ); + let b = u8x32::new( + 58, 118, 243, 153, 246, 176, 29, 116, 177, 226, 235, 9, 57, 218, 185, 77, 171, 107, + 162, 224, 75, 59, 187, 183, 56, 33, 90, 30, 188, 49, 190, 107, + ); + let c = u8x32::new( + 8, 253, 144, 97, 31, 113, 95, 153, 184, 212, 7, 183, 120, 52, 43, 202, 55, 34, 46, 82, + 88, 35, 171, 65, 101, 142, 107, 208, 15, 137, 143, 201, + ); + let r = i64x4::new( + 6097098147492034125, + 5259528428215584944, + 2011960906681118251, + 5313741768184438952, + ); + + assert_eq!( + r, + transmute(lasx_xvbitsel_v(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbitseli_b() { + let a = u8x32::new( + 178, 71, 136, 149, 190, 92, 86, 87, 135, 81, 18, 106, 61, 240, 71, 242, 187, 166, 218, + 183, 12, 80, 244, 242, 232, 140, 161, 227, 35, 23, 225, 97, + ); + let b = u8x32::new( + 173, 155, 189, 0, 17, 102, 85, 215, 175, 177, 175, 162, 203, 4, 46, 80, 41, 131, 12, + 130, 254, 191, 191, 230, 198, 211, 197, 37, 29, 13, 108, 138, + ); + let r = i64x4::new( + -7776240335059051363, + -8057901949774876500, + -7737254534663338600, + 
-8463358690923847794, + ); + + assert_eq!( + r, + transmute(lasx_xvbitseli_b::<156>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvshuf4i_b() { + let a = i8x32::new( + 108, -102, 33, -112, -6, -76, 115, -16, 40, -100, -76, 37, -61, -55, -102, 17, 25, 99, + 89, -78, 55, -35, 116, 64, 75, 14, -106, 67, -49, 18, -91, -41, + ); + let r = i64x4::new( + -5408624464691684710, + -3958160729736635236, + -2503757449887849629, + 1357573681433480718, + ); + + assert_eq!(r, transmute(lasx_xvshuf4i_b::<117>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvshuf4i_h() { + let a = i16x16::new( + -6971, -14860, 30437, 17998, 739, 5931, -29626, 13221, 14940, -31006, -17153, -20574, + 19219, 15653, -6222, 26534, + ); + let r = i64x4::new( + -4182640851919387148, + 1669484871499978539, + -8727220014624373022, + 4406041774853078309, + ); + + assert_eq!(r, transmute(lasx_xvshuf4i_h::<125>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvshuf4i_w() { + let a = i32x8::new( + -1698591186, + -189845668, + 1075366445, + -1020663141, + -48015581, + 913540401, + -1408537529, + 218710667, + ); + let r = i64x4::new( + 4618663713566149165, + -7295393590547476946, + -6049622619357221817, + -206225345846487261, + ); + + assert_eq!(r, transmute(lasx_xvshuf4i_w::<10>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvreplgr2vr_b() { + let r = i64x4::new( + 8463800222054970741, + 8463800222054970741, + 8463800222054970741, + 8463800222054970741, + ); + + assert_eq!(r, transmute(lasx_xvreplgr2vr_b(-139770763))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvreplgr2vr_h() { + let r = i64x4::new( + -1100020993973555013, + -1100020993973555013, + -1100020993973555013, + -1100020993973555013, + ); + + assert_eq!(r, transmute(lasx_xvreplgr2vr_h(-111546181))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvreplgr2vr_w() { + let r = i64x4::new( + -8112237653938959659, + -8112237653938959659, + -8112237653938959659, + -8112237653938959659, + ); + + assert_eq!(r, transmute(lasx_xvreplgr2vr_w(-1888777515))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvreplgr2vr_d() { + let r = i64x4::new( + -1472556476011894783, + -1472556476011894783, + -1472556476011894783, + -1472556476011894783, + ); + + assert_eq!(r, transmute(lasx_xvreplgr2vr_d(-1472556476011894783))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpcnt_b() { + let a = i8x32::new( + -78, -95, 2, -80, -45, 8, -113, 34, -100, -34, 69, 126, -9, -4, -51, 89, -32, 120, 99, + 84, 74, -26, -84, 118, -104, -104, -2, -10, 56, 17, 66, 116, + ); + let r = i64x4::new( + 145523683996271364, + 289644378270664196, + 361419380590117891, + 288795538114413315, + ); + + assert_eq!(r, transmute(lasx_xvpcnt_b(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpcnt_h() { + let a = i16x16::new( + 11626, 5283, -7476, -20299, -21862, -7933, -26579, 26723, -24113, 8952, 15751, -20804, + 3834, 23833, -21664, 23370, + ); + let r = i64x4::new( + 2251834173816840, + 1970354902138888, + 2814788422270985, + 2251829878980617, + ); + + assert_eq!(r, transmute(lasx_xvpcnt_h(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpcnt_w() { + let a = i32x8::new( + 769725316, + 1329443403, + 3455051, + -1024015807, + 1113804345, + 533788195, + 1478448269, + 663132689, + ); + let r = i64x4::new(77309411341, 60129542155, 73014444046, 55834574863); + + assert_eq!(r, 
transmute(lasx_xvpcnt_w(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpcnt_d() { + let a = i64x4::new( + -1195667126994002745, + 574485287218873120, + 4359670550805993357, + -166544779870738672, + ); + let r = i64x4::new(33, 31, 29, 33); + + assert_eq!(r, transmute(lasx_xvpcnt_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvclo_b() { + let a = i8x32::new( + -87, -42, 123, 30, -64, -61, 45, 65, 116, 65, 36, 53, -53, 107, 76, 11, -15, -38, -46, + 88, -114, -107, 55, 53, -61, -70, -103, -62, 21, -29, 40, 95, + ); + let r = i64x4::new(2207613190657, 8589934592, 1103806726660, 3298568503554); + + assert_eq!(r, transmute(lasx_xvclo_b(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvclo_h() { + let a = i16x16::new( + -4880, 19940, -15012, -1377, -9664, 29017, 15571, -20185, -11621, 32665, -31110, 32554, + -31842, 20391, -23474, -18820, + ); + let r = i64x4::new( + 1407383473487875, + 281474976710658, + 4294967298, + 281479271677953, + ); + + assert_eq!(r, transmute(lasx_xvclo_h(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvclo_w() { + let a = i32x8::new( + -472837395, + -2135587215, + -2000467762, + 411236038, + -1457849736, + 1672236706, + -1251091450, + -777023005, + ); + let r = i64x4::new(4294967299, 1, 1, 8589934593); + + assert_eq!(r, transmute(lasx_xvclo_w(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvclo_d() { + let a = i64x4::new( + -2662002076602604283, + 1069611961163112747, + -5322946916564324351, + 7672935739349466106, + ); + let r = i64x4::new(2, 0, 1, 0); + + assert_eq!(r, transmute(lasx_xvclo_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvclz_b() { + let a = i8x32::new( + 48, -6, 70, -124, -16, -25, -31, -91, -16, -19, -117, -25, -17, 92, 40, 116, -123, 91, + 22, -73, 100, 103, -72, 27, 14, -67, 118, 82, 90, 31, -83, -15, + ); + let r = i64x4::new(65538, 72621643502977024, 216173885920575744, 3302846693380); + + assert_eq!(r, transmute(lasx_xvclz_b(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvclz_h() { + let a = i16x16::new( + -11088, -2624, 9587, 10227, -21358, -32061, -32593, 20863, -13412, -5184, -28388, + 12581, 27368, 29494, 2214, -12445, + ); + let r = i64x4::new( + 562958543355904, + 281474976710656, + 562949953421312, + 17179934721, + ); + + assert_eq!(r, transmute(lasx_xvclz_h(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvclz_w() { + let a = i32x8::new( + -1816955803, + 631623303, + -844798554, + -571080345, + 439698339, + -377278351, + -2011143491, + 1645796965, + ); + let r = i64x4::new(8589934592, 0, 3, 4294967296); + + assert_eq!(r, transmute(lasx_xvclz_w(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvclz_d() { + let a = i64x4::new( + -3450263516250458188, + -4779789731770767580, + -2256592148267722054, + 4713387490250241941, + ); + let r = i64x4::new(0, 0, 0, 1); + + assert_eq!(r, transmute(lasx_xvclz_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfadd_s() { + let a = u32x8::new( + 1058561863, 1064952952, 1049344074, 1062702316, 1057792746, 1062620339, 1060506486, + 1055219670, + ); + let b = u32x8::new( + 1058369685, 1062538381, 1060953918, 1045575432, 1041469388, 993916160, 1061165480, + 1040806504, + ); + let r = i64x4::new( + 4604781644817557486, + 4577360739647446450, + 4564128465094280925, + 4545553165339792015, + ); + + assert_eq!(r, 
transmute(lasx_xvfadd_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfadd_d() { + let a = u64x4::new( + 4604104186982846811, + 4594101328742252424, + 4601686809902104562, + 4591010495556540480, + ); + let b = u64x4::new( + 4599295489329742538, + 4597621922535438280, + 4568770145289685248, + 4606509170156045614, + ); + let r = i64x4::new( + 4606916121688765120, + 4600365225266215848, + 4601738557736193412, + 4607242424158867483, + ); + + assert_eq!(r, transmute(lasx_xvfadd_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfsub_s() { + let a = u32x8::new( + 1051284612, 1063062529, 1065074933, 1061303845, 1040445544, 1065277127, 1050456038, + 1028474080, + ); + let b = u32x8::new( + 1061323418, 1047742504, 1041252032, 1046362676, 1058536139, 1062234929, 1060266892, + 1051059318, + ); + let r = i64x4::new( + 4548699359865974960, + 4542627446496145733, + 4483806600207662434, + -4716328899074058446, + ); + + assert_eq!(r, transmute(lasx_xvfsub_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfsub_d() { + let a = u64x4::new( + 4600171060344923522, + 4605546915627674696, + 4592595361373027936, + 4605218827740699453, + ); + let b = u64x4::new( + 4605618236610286151, + 4595024973508085836, + 4603596942845220543, + 4598338803059870948, + ); + let r = i64x4::new( + -4621313823233868020, + 4604082677323287093, + -4620839705514447386, + 4602885236169716939, + ); + + assert_eq!(r, transmute(lasx_xvfsub_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfmul_s() { + let a = u32x8::new( + 1052320864, 1047132356, 1062268100, 1046708728, 1041045324, 1063314176, 1059310073, + 1049796536, + ); + let b = u32x8::new( + 1064358048, 1061515003, 1057528231, 1058432998, 1063900744, 1052241494, 1052600868, + 1042517172, + ); + let r = i64x4::new( + 4482332724193798395, + 4469165660137518684, + 4513050635226112077, + 4412217640780718091, + ); + + assert_eq!(r, transmute(lasx_xvfmul_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfmul_d() { + let a = u64x4::new( + 4606629864418855094, + 4605003539487786257, + 4590479879446676128, + 4606513106899913084, + ); + let b = u64x4::new( + 4605920112889960858, + 4598179153756612874, + 4606290518673084028, + 4605164664361830142, + ); + let r = i64x4::new( + 4605444995749970010, + 4596002305251241714, + 4589904028032657573, + 4604645288864682176, + ); + + assert_eq!(r, transmute(lasx_xvfmul_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfdiv_s() { + let a = u32x8::new( + 1057794250, 1042162504, 1058563973, 1059452123, 1050358290, 1044764232, 1058075458, + 1044755920, + ); + let b = u32x8::new( + 1059441919, 1061487805, 1048043892, 1042438684, 1061822186, 1057796721, 1060121466, + 1051587390, + ); + let r = i64x4::new( + 4489379395443175003, + 4648514715526194553, + 4518231675762938086, + 4544549637634302505, + ); + + assert_eq!(r, transmute(lasx_xvfdiv_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfdiv_d() { + let a = u64x4::new( + 4599185246498765334, + 4599944651523203368, + 4605116834688397287, + 4604853047950220214, + ); + let b = u64x4::new( + 4564176709757936128, + 4602766877113246240, + 4596205261335386636, + 4603651841724508284, + ); + let r = i64x4::new( + 4641804750140101849, + 4604327948136618660, + 4616067223277414565, + 4608170208670026319, + ); + + assert_eq!(r, 
transmute(lasx_xvfdiv_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcvt_h_s() { + let a = u32x8::new( + 1058469229, 1050453282, 1035903176, 1054073088, 1063294292, 1008492480, 1057298766, + 1061246000, + ); + let b = u32x8::new( + 1023462464, 1060058935, 1063991271, 1051666694, 1026891648, 1059128978, 1040948004, + 1063761400, + ); + let r = i64x4::new( + 3853176214889572358, + 3935915130522777784, + 4268902673740736937, + 4182498428240214789, + ); + + assert_eq!(r, transmute(lasx_xvfcvt_h_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcvt_s_d() { + let a = u64x4::new( + 4597447768952621592, + 4604521658660448767, + 4602704275491810917, + 4598917842979840742, + ); + let b = u64x4::new( + 4599553378754216492, + 4584512794443142976, + 4602292684825622938, + 4600582838384043714, + ); + let r = i64x4::new( + 4394300226931207022, + 4554371141198369562, + 4522860581064345217, + 4509540616169896248, + ); + + assert_eq!(r, transmute(lasx_xvfcvt_s_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfmin_s() { + let a = u32x8::new( + 1055713836, 1054644052, 1049275150, 1057289061, 1061461229, 1041818012, 1060715063, + 1040785036, + ); + let b = u32x8::new( + 1048823100, 1053139848, 1065067350, 1058425698, 1057910475, 1058359832, 1051231814, + 1042813160, + ); + let r = i64x4::new( + 4523201206323234108, + 4541021940462824206, + 4474574290981646027, + 4470137692837414470, + ); + + assert_eq!(r, transmute(lasx_xvfmin_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfmin_d() { + let a = u64x4::new( + 4594570070884899116, + 4601942383326036568, + 4603863714261060635, + 4604069842204647079, + ); + let b = u64x4::new( + 4597923907797027300, + 4602734374246572404, + 4583371218452703040, + 4596668800324369880, + ); + let r = i64x4::new( + 4594570070884899116, + 4601942383326036568, + 4583371218452703040, + 4596668800324369880, + ); + + assert_eq!(r, transmute(lasx_xvfmin_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfmina_s() { + let a = u32x8::new( + 1051583574, 1048334100, 1008901056, 1048010844, 1058048126, 1046481300, 1034708664, + 1062424645, + ); + let b = u32x8::new( + 1057050977, 1054905968, 1057610003, 1058883162, 1036134312, 1020267520, 1059621961, + 1062129138, + ); + let r = i64x4::new( + 4502560675833177174, + 4501172301842258880, + 4382015632607160232, + 4561809912873379512, + ); + + assert_eq!(r, transmute(lasx_xvfmina_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfmina_d() { + let a = u64x4::new( + 4600343614636459278, + 4586078532026713744, + 4605522001302794605, + 4604680104437291828, + ); + let b = u64x4::new( + 4606967369913508220, + 4606214846243616482, + 4587216688083732016, + 4597161583916257152, + ); + let r = i64x4::new( + 4600343614636459278, + 4586078532026713744, + 4587216688083732016, + 4597161583916257152, + ); + + assert_eq!(r, transmute(lasx_xvfmina_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfmax_s() { + let a = u32x8::new( + 1040557328, 1056374346, 1061211328, 1043258760, 1036675480, 1065222105, 1042177632, + 1023489024, + ); + let b = u32x8::new( + 1030428272, 1047669536, 1035741736, 1064496616, 1062615049, 1064308633, 1058514955, + 1065140306, + ); + let r = i64x4::new( + 4537093269443945744, + 4571978153483881664, + 4575094105013893129, + 4574742780979947531, + 
); + + assert_eq!(r, transmute(lasx_xvfmax_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfmax_d() { + let a = u64x4::new( + 4598455083545818248, + 4600184556479215682, + 4605785336194907924, + 4595051938027720488, + ); + let b = u64x4::new( + 4598044308154343000, + 4602111953345143140, + 4606540384570465960, + 4602928137069840177, + ); + let r = i64x4::new( + 4598455083545818248, + 4602111953345143140, + 4606540384570465960, + 4602928137069840177, + ); + + assert_eq!(r, transmute(lasx_xvfmax_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfmaxa_s() { + let a = u32x8::new( + 1029731152, 1046633312, 1057699093, 1057848545, 1056015154, 1053369950, 1043177732, + 1054203026, + ); + let b = u32x8::new( + 1056523808, 1057137213, 1057627244, 1053365006, 1056989330, 1060333719, 1061877148, + 1001482496, + ); + let r = i64x4::new( + 4540369758276109856, + 4543424905953883413, + 4554098647008043154, + 4527767521076114844, + ); + + assert_eq!(r, transmute(lasx_xvfmaxa_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfmaxa_d() { + let a = u64x4::new( + 4607057953546777183, + 4598029803916303580, + 4606768199731078735, + 4577576246859464512, + ); + let b = u64x4::new( + 4602769751297399272, + 4606575139730018588, + 4600779924965638822, + 4596362093665607644, + ); + let r = i64x4::new( + 4607057953546777183, + 4606575139730018588, + 4606768199731078735, + 4596362093665607644, + ); + + assert_eq!(r, transmute(lasx_xvfmaxa_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfclass_s() { + let a = u32x8::new( + 1055311824, 1052041740, 1046016912, 1053948390, 1064758783, 1058940353, 1054333862, + 1048790772, + ); + let r = i64x4::new(549755814016, 549755814016, 549755814016, 549755814016); + + assert_eq!(r, transmute(lasx_xvfclass_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfclass_d() { + let a = u64x4::new( + 4601866312729243692, + 4603727160924846294, + 4581175864218244800, + 4596173124127472804, + ); + let r = i64x4::new(128, 128, 128, 128); + + assert_eq!(r, transmute(lasx_xvfclass_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfsqrt_s() { + let a = u32x8::new( + 1065040686, 1045332480, 1058748054, 1041454996, 1045312756, 1048325884, 1051863384, + 1061201844, + ); + let r = i64x4::new( + 4532289266943630008, + 4522237574588618202, + 4539089286789972523, + 4566109703441416989, + ); + + assert_eq!(r, transmute(lasx_xvfsqrt_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfsqrt_d() { + let a = u64x4::new( + 4604266936093488453, + 4603635094556032126, + 4604345755115950647, + 4595358066919885688, + ); + let r = i64x4::new( + 4605582601319773315, + 4605187935290824484, + 4605630368329407402, + 4601138545884238765, + ); + + assert_eq!(r, transmute(lasx_xvfsqrt_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrecip_s() { + let a = u32x8::new( + 1060913758, 1057137592, 1056500078, 1053365486, 1052072368, 1058849416, 1061191779, + 1061827646, + ); + let r = i64x4::new( + 4610230120071696079, + 4621525987145000223, + 4598466002793312350, + 4585242601638738136, + ); + + assert_eq!(r, transmute(lasx_xvfrecip_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrecip_d() { + let a = u64x4::new( + 4600818034032403792, + 4605811415521276862, + 4603750608638111426, + 4602783159858591242, + ); + let r = 
i64x4::new( + 4612858666853570563, + 4607990995462358858, + 4609954512138978824, + 4611482062367896141, + ); + + assert_eq!(r, transmute(lasx_xvfrecip_d(transmute(a)))); +} + +#[simd_test(enable = "lasx,frecipe")] +unsafe fn test_lasx_xvfrecipe_s() { + let a = u32x8::new( + 1061538089, 1009467584, 1043164316, 1030910448, 1059062619, 1048927856, 1064915194, + 1028524176, + ); + let r = i64x4::new( + 4809660548434472067, + 4721787188318892829, + 4644815739361740708, + 4728509413412007938, + ); + + assert_eq!(r, transmute(lasx_xvfrecipe_s(transmute(a)))); +} + +#[simd_test(enable = "lasx,frecipe")] +unsafe fn test_lasx_xvfrecipe_d() { + let a = u64x4::new( + 4599514006383746620, + 4607114589130093485, + 4603063439897885463, + 4602774413388259784, + ); + let r = i64x4::new( + 4614125529786744832, + 4607216711966392320, + 4610977572161847296, + 4611499011256352768, + ); + + assert_eq!(r, transmute(lasx_xvfrecipe_d(transmute(a)))); +} + +#[simd_test(enable = "lasx,frecipe")] +unsafe fn test_lasx_xvfrsqrte_s() { + let a = u32x8::new( + 1042369896, 1033402040, 1063640659, 1061099374, 1064617699, 1050687308, 1049602990, + 1047907124, + ); + let r = i64x4::new( + 4641680627989561881, + 4581330281566770462, + 4604034110053345047, + 4612427253546066334, + ); + + assert_eq!(r, transmute(lasx_xvfrsqrte_s(transmute(a)))); +} + +#[simd_test(enable = "lasx,frecipe")] +unsafe fn test_lasx_xvfrsqrte_d() { + let a = u64x4::new( + 4601640737224225970, + 4602882853441572005, + 4594899837086694432, + 4596019513190087348, + ); + let r = i64x4::new( + 4609450077243572224, + 4608908592999825408, + 4612828109287194624, + 4612346183891812352, + ); + + assert_eq!(r, transmute(lasx_xvfrsqrte_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrint_s() { + let a = u32x8::new( + 1043178464, 1038460040, 1061848728, 1058680620, 1058193187, 1046712064, 1061839389, + 1062791786, + ); + let r = i64x4::new(0, 4575657222473777152, 1065353216, 4575657222473777152); + + assert_eq!(r, transmute(lasx_xvfrint_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrint_d() { + let a = u64x4::new( + 4602995275079155807, + 4605303966018459675, + 4604656441302899118, + 4598894354395850360, + ); + let r = i64x4::new( + 4607182418800017408, + 4607182418800017408, + 4607182418800017408, + 0, + ); + + assert_eq!(r, transmute(lasx_xvfrint_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrsqrt_s() { + let a = u32x8::new( + 1061523868, 1058283912, 1058667997, 1055761106, 1039496312, 1051937612, 1064817002, + 1028487648, + ); + let r = i64x4::new( + 4586992255349404714, + 4592512950478375290, + 4600512219702681066, + 4651901116840286347, + ); + + assert_eq!(r, transmute(lasx_xvfrsqrt_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrsqrt_d() { + let a = u64x4::new( + 4605274633765138187, + 4606739923803408012, + 4600049100582648664, + 4595639907624537812, + ); + let r = i64x4::new( + 4607751568495560074, + 4607297292863467031, + 4610247933797877877, + 4612495411087822923, + ); + + assert_eq!(r, transmute(lasx_xvfrsqrt_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvflogb_s() { + let a = u32x8::new( + 1060538931, 1046083924, 1058790721, 1059749771, 1051275772, 1063729353, 1063250692, + 1040020680, + ); + let r = i64x4::new( + -4593671616705069056, + -4647714812233515008, + -4647714812225126400, + -4575657218195587072, + ); + + assert_eq!(r, transmute(lasx_xvflogb_s(transmute(a)))); +} + +#[simd_test(enable 
= "lasx")] +unsafe fn test_lasx_xvflogb_d() { + let a = u64x4::new( + 4595455049368719724, + 4604388813668624941, + 4600944141083734502, + 4606323839843915451, + ); + let r = i64x4::new( + -4609434218613702656, + -4616189618054758400, + -4611686018427387904, + -4616189618054758400, + ); + + assert_eq!(r, transmute(lasx_xvflogb_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcvth_s_h() { + let a = i16x16::new( + 1011, -3094, -23967, -2302, -29675, 24707, 31603, 27606, -10030, -23722, -4960, 8886, + 4716, -14999, -10137, 25474, + ); + let r = i64x4::new( + 4904525550435082240, + 5006525043206676480, + -4562955662106198016, + 4931511963987271680, + ); + + assert_eq!(r, transmute(lasx_xvfcvth_s_h(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcvth_d_s() { + let a = u32x8::new( + 1060080295, 1063430965, 1058931094, 1057151472, 1062318208, 1041069740, 1040628608, + 1062563894, + ); + let r = i64x4::new( + 4603734568304902144, + 4602779141018746880, + 4593908495954214912, + 4605684912954015744, + ); + + assert_eq!(r, transmute(lasx_xvfcvth_d_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcvtl_s_h() { + let a = i16x16::new( + -18572, -3633, 26136, -30442, 5487, 21033, 2005, -18343, 32598, 19034, -13880, 19435, + 17289, 6097, -12500, -28967, + ); + let r = i64x4::new( + -4163050086719389696, + -5106307920098557952, + 4704924606608883712, + 4719033540912152576, + ); + + assert_eq!(r, transmute(lasx_xvfcvtl_s_h(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcvtl_d_s() { + let a = u32x8::new( + 1059008236, 1026243936, 1059912059, 1060873661, 1059957992, 1049687936, 1054458174, + 1049339368, + ); + let r = i64x4::new( + 4603775983600795648, + 4586185783978754048, + 4604285879970693120, + 4598772185639682048, + ); + + assert_eq!(r, transmute(lasx_xvfcvtl_d_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftint_w_s() { + let a = u32x8::new( + 1052778524, 1039011152, 1033877208, 1049693252, 1062408118, 1030474672, 1042423356, + 1038564616, + ); + let r = i64x4::new(0, 0, 1, 0); + + assert_eq!(r, transmute(lasx_xvftint_w_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftint_l_d() { + let a = u64x4::new( + 4592491724896152048, + 4600509745735788044, + 4603560565683465563, + 4606886496010904906, + ); + let r = i64x4::new(0, 0, 1, 1); + + assert_eq!(r, transmute(lasx_xvftint_l_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftint_wu_s() { + let a = u32x8::new( + 1063402225, 1023548352, 1060204123, 1061208993, 1059244058, 1039466608, 1058287960, + 1058024007, + ); + let r = i64x4::new(1, 4294967297, 1, 4294967297); + + assert_eq!(r, transmute(lasx_xvftint_wu_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftint_lu_d() { + let a = u64x4::new( + 4601437466420634120, + 4585269234107004032, + 4602560385055197892, + 4595388119831910552, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvftint_lu_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrz_w_s() { + let a = u32x8::new( + 1045143016, 1048815390, 1047014848, 1055489924, 1060619700, 1055895842, 1061091259, + 1052720902, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvftintrz_w_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrz_l_d() { + let a = u64x4::new( + 4603359584605772664, + 4597259202045947564, + 
4606604696181460379, + 4590200021857252112, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvftintrz_l_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrz_wu_s() { + let a = u32x8::new( + 1063820452, 1055661474, 1056124138, 1058294578, 1014656512, 1017634272, 1061863649, + 1032276584, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvftintrz_wu_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrz_lu_d() { + let a = u64x4::new( + 4593109369482747112, + 4606352005652581516, + 4604267331764801794, + 4603828603416455704, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvftintrz_lu_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvffint_s_w() { + let a = i32x8::new( + -1936685818, + -292241542, + -386041592, + -1489663378, + 1127778163, + -365070454, + -1830468239, + 1453047639, + ); + let r = i64x4::new( + -3635713297473937674, + -3552894890528992200, + -3625938366378905329, + 5669248528000103797, + ); + + assert_eq!(r, transmute(lasx_xvffint_s_w(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvffint_d_l() { + let a = i64x4::new( + -3627358051950006798, + 3291026422392521824, + 9114456262655749128, + -101300809730113961, + ); + let r = i64x4::new( + -4338888956717313783, + 4883826182423482562, + 4890802832263617419, + -4362160337941248997, + ); + + assert_eq!(r, transmute(lasx_xvffint_d_l(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvffint_s_wu() { + let a = u32x8::new( + 1942522276, 3012872942, 4057450175, 3500418877, 3140467966, 1802049055, 2479355692, + 3991791589, + ); + let r = i64x4::new( + 5707068753731621139, + 5715248415876700103, + 5680959067724132285, + 5723492283472660471, + ); + + assert_eq!(r, transmute(lasx_xvffint_s_wu(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvffint_d_lu() { + let a = u64x4::new( + 10285239871254038779, + 10585860489684064217, + 15302850682570301194, + 12001223008770454391, + ); + let r = i64x4::new( + 4891427685477873921, + 4891574472889216707, + 4893877690756836940, + 4892265567869239358, + ); + + assert_eq!(r, transmute(lasx_xvffint_d_lu(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvreplve_b() { + let a = i8x32::new( + -75, -65, 124, 6, 28, -41, 60, 12, -41, 91, 81, -114, 54, 98, -78, -94, 13, 26, 36, + 112, -41, -74, -94, -71, 43, 54, 17, 60, -27, -89, 98, -78, + ); + let r = i64x4::new( + -2893606913523066921, + -2893606913523066921, + -5280832617179597130, + -5280832617179597130, + ); + + assert_eq!(r, transmute(lasx_xvreplve_b(transmute(a), 5))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvreplve_h() { + let a = i16x16::new( + 10589, 16925, 2072, 2556, -20735, 27162, -30076, -21408, 26095, 24700, 11691, -31646, + 14016, 23092, 1827, 2108, + ); + let r = i64x4::new( + 719461018576357884, + 719461018576357884, + -8907411554322709406, + -8907411554322709406, + ); + + assert_eq!(r, transmute(lasx_xvreplve_h(transmute(a), -5))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvreplve_w() { + let a = i32x8::new( + -1943637254, + 265328695, + 1624811313, + -907897952, + 733901407, + -598309268, + -2022404353, + -945690723, + ); + let r = i64x4::new( + 1139578067980687415, + 1139578067980687415, + -2569718735257041300, + -2569718735257041300, + ); + + assert_eq!(r, transmute(lasx_xvreplve_w(transmute(a), 1))); +} + +#[simd_test(enable = "lasx")] +unsafe 
fn test_lasx_xvreplve_d() { + let a = i64x4::new( + 8108160509866679259, + 2816226171091081324, + -7945890434069746992, + 7527726914374549897, + ); + let r = i64x4::new( + 8108160509866679259, + 8108160509866679259, + -7945890434069746992, + -7945890434069746992, + ); + + assert_eq!(r, transmute(lasx_xvreplve_d(transmute(a), -6))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpermi_w() { + let a = i32x8::new( + 1434116256, + 1142162281, + -1871700525, + -394957889, + 382419347, + -785097055, + -1928161383, + -401992430, + ); + let b = i32x8::new( + -1595257764, + 1089333930, + -235320537, + -1276032758, + -803245169, + -82420548, + -1649409266, + 665022456, + ); + let r = i64x4::new( + -1010694009402824022, + -1696331215410035863, + -7084158850976817988, + -1726544336579699039, + ); + + assert_eq!( + r, + transmute(lasx_xvpermi_w::<217>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvandn_v() { + let a = u8x32::new( + 174, 130, 100, 230, 117, 190, 128, 90, 135, 70, 67, 190, 102, 177, 131, 213, 116, 200, + 40, 62, 198, 99, 109, 141, 122, 251, 83, 215, 87, 248, 140, 29, + ); + let b = u8x32::new( + 146, 145, 124, 157, 66, 158, 147, 40, 44, 251, 68, 171, 189, 227, 212, 251, 56, 131, + 99, 225, 136, 245, 154, 179, 245, 155, 220, 217, 4, 18, 19, 17, + ); + let r = i64x4::new( + 2311191042782138640, + 3050136072551184680, + 3644137813819196168, + 5350223724150917, + ); + + assert_eq!(r, transmute(lasx_xvandn_v(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvneg_b() { + let a = i8x32::new( + -41, -111, 119, -69, 55, 67, 126, -127, -123, 59, 34, -93, -12, -33, -35, -11, 89, -72, + -52, 6, 106, 79, 77, 58, -123, -99, 44, 27, 96, -32, -57, 75, + ); + let r = i64x4::new( + 9188114861941944105, + 802521495600285051, + -4128761171367671641, + -5388239603749330053, + ); + + assert_eq!(r, transmute(lasx_xvneg_b(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvneg_h() { + let a = i16x16::new( + -4516, 26216, -27554, -11408, 20653, 18328, -4198, -15292, 23460, 9679, -8566, 23542, + -2503, 31678, 9261, -19575, + ); + let r = i64x4::new( + 3211184880420917668, + 4304333377225928531, + -6626447107371719588, + 5510114370614593991, + ); + + assert_eq!(r, transmute(lasx_xvneg_h(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvneg_w() { + let a = i32x8::new( + 740574678, + 2076342027, + 968647939, + 130194259, + 1872650231, + -1690505081, + -594724042, + 1453048102, + ); + let r = i64x4::new( + -8917821097720956374, + -559180081205634307, + 7260664039039148041, + -6240794077010148150, + ); + + assert_eq!(r, transmute(lasx_xvneg_w(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvneg_d() { + let a = i64x4::new( + -5535082554430398173, + 7802847596802572188, + -4410306127860279470, + 906750919774206543, + ); + let r = i64x4::new( + 5535082554430398173, + -7802847596802572188, + 4410306127860279470, + -906750919774206543, + ); + + assert_eq!(r, transmute(lasx_xvneg_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmuh_b() { + let a = i8x32::new( + -1, 124, 20, 57, 41, 122, 83, 77, 119, 119, 127, 45, 107, 51, 67, 89, 59, 88, 71, -124, + 62, 101, -53, -37, 2, 102, 69, 72, -83, 115, -102, 5, + ); + let b = i8x32::new( + 108, -29, 45, -93, 78, -21, 19, 10, 52, 107, 104, 75, 31, -9, -27, 72, -68, -20, -102, + 95, 106, 38, -79, -7, 42, -112, -7, -41, 40, 124, 115, 91, + ); + let r = i64x4::new( + 218131067805364735, 
+ 1871524972886962456, + 76576697723648496, + 131228860074087168, + ); + + assert_eq!(r, transmute(lasx_xvmuh_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmuh_h() { + let a = i16x16::new( + 16678, 11413, -27848, -7978, -31217, -4869, 11843, 2166, -13263, -23440, 16372, 27675, + 23654, 25588, -21093, 1464, + ); + let b = i16x16::new( + -20021, -19612, 828, 8516, 14133, -5487, -7596, 26880, -23795, 2896, 7031, 19513, + -6376, 6003, -19930, -2328, + ); + let r = i64x4::new( + -291609583629571048, + 250225357332407731, + 2319361354285454031, + -14890625691814142, + ); + + assert_eq!(r, transmute(lasx_xvmuh_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmuh_w() { + let a = i32x8::new( + -833159784, + -1689012066, + 1138643536, + 1201390084, + -1615224698, + -984104182, + -991848752, + -18112020, + ); + let b = i32x8::new( + -846972528, + -848270332, + -2071563046, + -1685604813, + 2085038950, + 696813713, + 2076492369, + -867396671, + ); + let r = i64x4::new( + 1432738825969009962, + -2025068907290430086, + -685737288172100118, + 15710306989437773, + ); + + assert_eq!(r, transmute(lasx_xvmuh_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmuh_d() { + let a = i64x4::new( + -3091297468664313081, + -4254143725647386536, + -6994439148056979459, + 878201001794537760, + ); + let b = i64x4::new( + -2819683255232823594, + 272893378245750433, + -2696341058713804350, + 5752544304986593708, + ); + let r = i64x4::new( + 472521311864415951, + -62934014165103622, + 1022369767923424239, + 273863514955286020, + ); + + assert_eq!(r, transmute(lasx_xvmuh_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmuh_bu() { + let a = u8x32::new( + 252, 82, 157, 236, 123, 56, 117, 92, 87, 103, 53, 123, 55, 40, 186, 21, 199, 125, 151, + 2, 152, 104, 145, 142, 138, 222, 115, 99, 79, 43, 91, 11, + ); + let b = u8x32::new( + 106, 138, 241, 29, 35, 19, 100, 212, 48, 52, 216, 195, 63, 32, 226, 9, 68, 212, 1, 104, + 22, 101, 248, 114, 169, 245, 173, 78, 68, 135, 101, 145, + ); + let r = i64x4::new( + 5489047988046343272, + 46167451136431120, + 4579080056940291892, + 442221464076014683, + ); + + assert_eq!(r, transmute(lasx_xvmuh_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmuh_hu() { + let a = u16x16::new( + 63486, 10379, 4610, 59627, 39525, 8192, 13999, 30090, 39838, 4996, 62860, 23112, 32783, + 45419, 34018, 15191, + ); + let b = u16x16::new( + 50083, 9034, 31705, 24116, 14858, 32357, 59501, 26719, 43788, 29210, 55002, 25980, + 7566, 49006, 61645, 1668, + ); + let r = i64x4::new( + 6175852041879338372, + 3452908124314018560, + 2579100322063607801, + 108786773599653576, + ); + + assert_eq!(r, transmute(lasx_xvmuh_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmuh_wu() { + let a = u32x8::new( + 604375860, 434631772, 87186606, 1568632560, 3782451787, 1385975439, 3741892279, + 2636678075, + ); + let b = u32x8::new( + 2866028752, 733937737, 283660427, 1865216280, 1246451636, 3799448094, 3234768261, + 1243610100, + ); + let r = i64x4::new( + 318992658905816335, + 2925838984554208529, + 5265941737799272199, + 3278999485098399815, + ); + + assert_eq!(r, transmute(lasx_xvmuh_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmuh_du() { + let a = u64x4::new( + 9309142847278954140, + 11105915746381107654, + 776831405492317725, + 
7350193390691079752, + ); + let b = u64x4::new( + 6484084708453170899, + 12483776948923073243, + 16553528344993857967, + 3939779038690448735, + ); + let r = i64x4::new( + 3272191045945883120, + 7515894102360886861, + 697104087242456940, + 1569823798457591419, + ); + + assert_eq!(r, transmute(lasx_xvmuh_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsllwil_h_b() { + let a = i8x32::new( + -80, -11, 36, -88, -14, -123, -124, -15, -73, 95, -109, 108, -41, -128, 74, 81, 42, 54, + -105, 1, -17, -78, 85, 63, -18, 22, -37, 78, -116, -76, -104, -80, + ); + let r = i64x4::new( + -396314289023943936, + -67281036482904288, + 4777859115647648, + 283732621893107440, + ); + + assert_eq!(r, transmute(lasx_xvsllwil_h_b::<4>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsllwil_w_h() { + let a = i16x16::new( + -26490, -6081, 17297, 4860, -12591, -12327, -8532, -26767, -1364, 8756, 17192, -2170, + -9517, -24859, -20497, 19179, + ); + let r = i64x4::new( + -53489037427331072, + 42749012123355136, + 77018594794627072, + -19087521822982144, + ); + + assert_eq!(r, transmute(lasx_xvsllwil_w_h::<11>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsllwil_d_w() { + let a = i32x8::new( + -279919227, + 1520692612, + 58332548, + -1055411175, + -1879666532, + -1328702681, + 2013268804, + 1780320808, + ); + let r = i64x4::new( + -4586196615168, + 24915027755008, + -30796456460288, + -21769464725504, + ); + + assert_eq!(r, transmute(lasx_xvsllwil_d_w::<14>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsllwil_hu_bu() { + let a = u8x32::new( + 166, 242, 65, 29, 16, 173, 110, 19, 218, 174, 141, 254, 161, 96, 39, 227, 221, 101, + 204, 143, 26, 87, 89, 20, 72, 61, 5, 44, 62, 179, 22, 150, + ); + let r = i64x4::new( + 261217712426980544, + 171151904487768576, + 1288057531186289568, + 180156217344131904, + ); + + assert_eq!(r, transmute(lasx_xvsllwil_hu_bu::<5>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsllwil_wu_hu() { + let a = u16x16::new( + 28185, 27375, 29501, 18099, 10709, 55262, 57183, 25962, 46284, 59737, 9967, 49646, + 20816, 18431, 34014, 61614, + ); + let r = i64x4::new( + 1926344372325335040, + 1273603901354885120, + 4203617671699431424, + 3493526673607606272, + ); + + assert_eq!(r, transmute(lasx_xvsllwil_wu_hu::<14>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsllwil_du_wu() { + let a = u32x8::new( + 3871859378, 2804433615, 2931671754, 4116141862, 2330569940, 549563545, 2423689534, + 763790591, + ); + let r = i64x4::new( + 1039344337701306368, + 752809416264253440, + 625607604583792640, + 147522340803051520, + ); + + assert_eq!(r, transmute(lasx_xvsllwil_du_wu::<28>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsran_b_h() { + let a = i16x16::new( + -3209, -6235, 10611, -108, -9326, 31718, 21536, 23681, -6783, -12443, -19057, 16054, + 30697, -5640, -15815, -16666, + ); + let b = i16x16::new( + -29110, -4589, 15031, -23437, 23404, 22985, -4128, -14921, 3799, -12876, -14071, + -20170, -30663, -21093, 2493, -19963, + ); + let r = i64x4::new(-5107013816536599300, 0, -576745268203292981, 0); + + assert_eq!(r, transmute(lasx_xvsran_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsran_h_w() { + let a = i32x8::new( + 596228330, + -1214659999, + 1365164495, + -1509876796, + 191976733, + 887390545, + 1777692712, + -916491986, + ); + let b = i32x8::new( + 
325990384, + 675640582, + 253768478, + -874708050, + -1204136396, + 185722351, + -1391425532, + -614583871, + ); + let r = i64x4::new(-7492863874014043255, 0, -5145548381371170633, 0); + + assert_eq!(r, transmute(lasx_xvsran_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsran_w_d() { + let a = i64x4::new( + 8440735619768910515, + 3831375747389155813, + -7157949860071951471, + 8075321479849390902, + ); + let b = i64x4::new( + -4836402813541090096, + -5722420231286296070, + -8822340179414145626, + 7458838578211487240, + ); + let r = i64x4::new(58054624080, 0, 1863787881113495402, 0); + + assert_eq!(r, transmute(lasx_xvsran_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssran_b_h() { + let a = i16x16::new( + 27446, 31312, 14232, -17034, -2200, 9528, 17283, 22858, -16583, -20644, -19786, -30210, + -15134, -5982, 7374, -10469, + ); + let b = i16x16::new( + 32393, 13397, -26656, -25817, -11729, -3876, 5367, 32237, -5363, 14821, 8454, -2793, + 30922, -19145, -25237, 355, + ); + let r = i64x4::new(179865806513864501, 0, -9222296776751415043, 0); + + assert_eq!(r, transmute(lasx_xvssran_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssran_h_w() { + let a = i32x8::new( + 1069406291, + -421683701, + -1805581192, + 775037443, + 2123240059, + 1014398272, + -968236564, + 1181957260, + ); + let b = i32x8::new( + -313676516, + 794950557, + -1459200584, + -1233298689, + 310419478, + 2115419690, + 370441503, + 353523551, + ); + let r = i64x4::new(281015415144451, 0, 281472829161978, 0); + + assert_eq!(r, transmute(lasx_xvssran_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssran_w_d() { + let a = i64x4::new( + -3959032103812617007, + -6999276452061988148, + 4785867104307053316, + -5846301556546422840, + ); + let b = i64x4::new( + -9038176721428294357, + -7430682151090141786, + 3023804747709575069, + -4263412213075666259, + ); + let r = i64x4::new(-109363692856335914, 0, -713658208354305, 0); + + assert_eq!(r, transmute(lasx_xvssran_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssran_bu_h() { + let a = u16x16::new( + 15557, 60840, 1956, 59995, 38025, 11411, 47465, 2661, 64580, 57024, 5440, 30131, 5746, + 43753, 23484, 38540, + ); + let b = u16x16::new( + 22970, 29096, 60132, 33800, 43597, 36861, 5794, 9818, 31709, 42253, 40665, 26755, + 45611, 14534, 22385, 24914, + ); + let r = i64x4::new(144116287595479055, 0, 71776131929997312, 0); + + assert_eq!(r, transmute(lasx_xvssran_bu_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssran_hu_w() { + let a = u32x8::new( + 2082097075, 1270167653, 972125472, 2358850873, 720341052, 2316145162, 1290262192, + 3046238320, + ); + let b = u32x8::new( + 2086901452, 208185378, 3688640302, 858280348, 2470849871, 2168901411, 1405490695, + 3256489998, + ); + let r = i64x4::new(254837589540863, 0, 281470681765343, 0); + + assert_eq!(r, transmute(lasx_xvssran_hu_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssran_wu_d() { + let a = u64x4::new( + 12808251596834061909, + 18221436405775299246, + 16388143564854988150, + 17532454272773126756, + ); + let b = u64x4::new( + 5233973111979334474, + 11067258236306167045, + 5186189126720253469, + 15129384477845142857, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvssran_wu_d(transmute(a), 
transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrarn_b_h() { + let a = i16x16::new( + 13316, 16982, 17373, -4234, 12579, 29238, 26519, -27768, 29243, -28641, -6034, -30599, + 7597, 22800, -24346, -21360, + ); + let b = i16x16::new( + 2182, -26731, -7280, -21775, 13607, -10194, -26196, 2085, 14341, 30747, 19786, -15409, + 13019, 31558, 333, -15416, + ); + let r = i64x4::new(-7204067930850651184, 0, -5909457163402939758, 0); + + assert_eq!(r, transmute(lasx_xvsrarn_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrarn_h_w() { + let a = i32x8::new( + 1424546002, + -1218125754, + 2040047341, + -1355580190, + 957370543, + -1800756932, + -244296865, + -324211997, + ); + let b = i32x8::new( + -873611939, + -646116137, + -2104124404, + 269272004, + -873453569, + -222623147, + -1684845205, + 1120133990, + ); + let r = i64x4::new(4021320339558432771, 0, -5499970420202995712, 0); + + assert_eq!(r, transmute(lasx_xvsrarn_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrarn_w_d() { + let a = i64x4::new( + 8313795273655551715, + -4571575745587141829, + 3452416880072805381, + -3498451052052081526, + ); + let b = i64x4::new( + 1902594917407971969, + 7038774598204297904, + 1354840157561429239, + 9153650925323248775, + ); + let r = i64x4::new(-69752906595470, 0, -7240468610764767136, 0); + + assert_eq!(r, transmute(lasx_xvsrarn_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrarn_b_h() { + let a = i16x16::new( + 30268, -30574, -1837, 13767, -29475, -25587, -27160, 25225, 4600, 30417, 28, -6434, + -6579, 16114, -5281, -15339, + ); + let b = i16x16::new( + -29433, 6019, 25218, 19636, -20124, 25723, 21788, 20831, 32007, 16431, -14025, 1630, + -8234, 9749, 12924, 11326, + ); + let r = i64x4::new(142413695971000447, 0, -141179869986524, 0); + + assert_eq!(r, transmute(lasx_xvssrarn_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrarn_h_w() { + let a = i32x8::new( + 170943894, + -1558232070, + 1056252926, + -626239215, + -1035289292, + -1714887456, + 869374752, + 1218167748, + ); + let b = i32x8::new( + -541237538, + -280182861, + 655685335, + 1285042104, + -1042547864, + -1616713045, + 901223026, + -913984956, + ); + let r = i64x4::new(-10414028872220672, 0, 9223104806137135104, 0); + + assert_eq!(r, transmute(lasx_xvssrarn_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrarn_w_d() { + let a = i64x4::new( + -7095223716985142210, + -1864464750390939278, + 3939082291268576295, + 652125571964745491, + ); + let b = i64x4::new( + -3290989318705091519, + -1709619047887212993, + 6583279263353400787, + -8657326507673774559, + ); + let r = i64x4::new(2147483648, 0, 326062786704572415, 0); + + assert_eq!(r, transmute(lasx_xvssrarn_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrarn_bu_h() { + let a = u16x16::new( + 26678, 38033, 32719, 23307, 55563, 49876, 43497, 48918, 15082, 47368, 47490, 13865, + 14066, 28158, 29325, 39432, + ); + let b = u16x16::new( + 14063, 62353, 26936, 63778, 59375, 39648, 62782, 47347, 52496, 47247, 21846, 59427, + 51935, 24463, 38090, 55890, + ); + let r = i64x4::new(4286578689, 0, 8163878114427135, 0); + + assert_eq!(r, transmute(lasx_xvssrarn_bu_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrarn_hu_w() { + let a = u32x8::new( + 
2720431924, 4147079016, 3167137960, 1370790237, 4041948877, 3496440502, 1072767482, + 2933895593, + ); + let b = u32x8::new( + 747428871, 338187819, 2081920183, 3557659142, 2646673999, 138734404, 3410962197, + 3574237192, + ); + let r = i64x4::new(-281474976710656, 0, 2199023255552, 0); + + assert_eq!(r, transmute(lasx_xvssrarn_hu_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrarn_wu_d() { + let a = u64x4::new( + 6490501207978917237, + 8209259321665773339, + 14187940483119607818, + 18034167934937299566, + ); + let b = u64x4::new( + 16181569100899671009, + 7894668117654109960, + 16341906792341189640, + 4752425178296070145, + ); + let r = i64x4::new(-3539373509, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvssrarn_wu_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrln_b_h() { + let a = i16x16::new( + -8859, -11711, 4363, -9439, -25357, 1884, 29173, -24389, 21528, -30451, -30750, -2629, + -22379, -10965, 22026, 4187, + ); + let b = i16x16::new( + 21400, -30654, 29959, 14320, 6060, -24401, -522, -8436, 27927, -10967, 11921, 19837, + 3224, 2334, 27694, -1779, + ); + let r = i64x4::new(776589499955319005, 0, 285495199351976, 0); + + assert_eq!(r, transmute(lasx_xvsrln_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrln_h_w() { + let a = i32x8::new( + -741337180, + -1087033752, + 1206017450, + -177254878, + -1655113328, + -889941782, + -267978430, + 1844637616, + ); + let b = i32x8::new( + 196728630, + -568667475, + -273820408, + -1204576979, + -639636375, + 889717098, + 93317070, + -1535736032, + ); + let r = i64x4::new(-6090306652816735409, 0, -1175228277373752196, 0); + + assert_eq!(r, transmute(lasx_xvsrln_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrln_w_d() { + let a = i64x4::new( + -9145728687467639594, + 8409501532987558867, + 4702360266572413762, + -3159959081500746646, + ); + let b = i64x4::new( + 8658043654634750665, + -5736940948870912859, + -8385798465328465883, + -3467766742630042131, + ); + let r = i64x4::new(262796920316080678, 0, 1866060245111069, 0); + + assert_eq!(r, transmute(lasx_xvsrln_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrln_bu_h() { + let a = u16x16::new( + 11222, 49369, 51083, 11755, 50527, 33895, 45751, 48397, 60912, 8893, 53498, 37814, + 34588, 16791, 58737, 47927, + ); + let b = u16x16::new( + 44696, 19424, 49640, 20286, 46891, 46704, 50673, 49527, 19154, 6152, 25954, 33988, + 37143, 16014, 63839, 56839, + ); + let r = i64x4::new(-996419305685, 0, -71773920038018305, 0); + + assert_eq!(r, transmute(lasx_xvssrln_bu_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrln_hu_w() { + let a = u32x8::new( + 2345037823, 2695836952, 4130802340, 2404297034, 295813801, 2039155670, 3495629229, + 1556296817, + ); + let b = u32x8::new( + 294807188, 58363281, 19412242, 562851868, 1581507437, 3738447960, 1843096024, 195940565, + ); + let r = i64x4::new(2319476961249468, 0, 208855326080470286, 0); + + assert_eq!(r, transmute(lasx_xvssrln_hu_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrln_wu_d() { + let a = u64x4::new( + 1202535702403380748, + 15707874870216391550, + 13668879554311196884, + 12302928023198114227, + ); + let b = u64x4::new( + 1500625420116916625, + 18438653662202195541, + 12192242821332678016, + 6891738943843097628, + ); + let r = 
i64x4::new(-1, 0, -1, 0); + + assert_eq!(r, transmute(lasx_xvssrln_wu_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlrn_b_h() { + let a = i16x16::new( + -12342, 30454, 25730, 6015, 26316, -10548, -7973, -11903, 14548, -7939, 27317, -22987, + -25067, -26999, 30994, -21757, + ); + let b = i16x16::new( + 31424, 29919, 27640, 2377, -27671, 6812, -24773, -17881, -24476, -13065, 24935, 4284, + 4227, 20246, -28660, -22488, + ); + let r = i64x4::new(-6693460433276960310, 0, -6122543899663285619, 0); + + assert_eq!(r, transmute(lasx_xvsrlrn_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlrn_h_w() { + let a = i32x8::new( + 48275673, + 2044228048, + 2011304917, + 727641203, + 711821092, + 1084745670, + -1100065176, + 1918073576, + ); + let b = i32x8::new( + -609574414, + 559467902, + -1150013148, + -2027938157, + -294433871, + -690493396, + 1585922176, + 1450222536, + ); + let r = i64x4::new(390723813551243448, 0, 6015496732136052023, 0); + + assert_eq!(r, transmute(lasx_xvsrlrn_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlrn_w_d() { + let a = i64x4::new( + -2014408193554501338, + -6765353383424633305, + 5967977535334656496, + 3402886661353956602, + ); + let b = i64x4::new( + 5950007641993014960, + 2150696278963909567, + -4878722002685010440, + 7186750387494925249, + ); + let r = i64x4::new(4295025675, 0, -3281590872273059757, 0); + + assert_eq!(r, transmute(lasx_xvsrlrn_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlrn_bu_h() { + let a = u16x16::new( + 4000, 26692, 55377, 5068, 29863, 20111, 65511, 27422, 7702, 63753, 34415, 139, 25413, + 7385, 60703, 6991, + ); + let b = u16x16::new( + 60293, 44656, 25351, 5858, 32033, 34410, 41111, 15552, 22567, 60279, 27841, 635, 63102, + 61738, 21315, 12439, + ); + let r = i64x4::new(-258385232527491, 0, 4034951496335359804, 0); + + assert_eq!(r, transmute(lasx_xvssrlrn_bu_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlrn_hu_w() { + let a = u32x8::new( + 1512713352, 3525452897, 3680819492, 4269631286, 1077814176, 4243464555, 472893356, + 2300045605, + ); + let b = u32x8::new( + 677817847, 3453937427, 172488718, 1972766946, 1046876255, 486725940, 1920931524, + 3626282368, + ); + let r = i64x4::new(-3854303052, 0, -4029743103, 0); + + assert_eq!(r, transmute(lasx_xvssrlrn_hu_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlrn_wu_d() { + let a = u64x4::new( + 4599848732973711922, + 15463958724268349352, + 4237045593978887151, + 9203743234400791071, + ); + let b = u64x4::new( + 15971018346755767904, + 235976279705162838, + 15093271767346221587, + 12421981949945891560, + ); + let r = i64x4::new(-3223981555, 0, 35952127557763071, 0); + + assert_eq!(r, transmute(lasx_xvssrlrn_wu_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrstpi_b() { + let a = i8x32::new( + -16, -22, -111, -51, 76, 5, -7, -91, 99, -21, 88, -22, 39, 49, 5, -92, 64, -124, 62, + 98, 108, -72, 96, -71, 50, 121, -20, -59, 69, 86, -45, -4, + ); + let b = i8x32::new( + 34, 105, -73, 60, 0, 99, -75, -90, -92, -86, 97, 72, 28, -72, 89, 120, 9, -116, 91, 83, + -104, 9, -13, -69, -74, 11, 0, -65, -1, -29, -117, -97, + ); + let r = i64x4::new( + -6487147960825943312, + -6627837229100635390, + -5088864803284417472, + -228744298392422143, + ); + + assert_eq!( + r, + 
transmute(lasx_xvfrstpi_b::<24>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrstpi_h() { + let a = i16x16::new( + 20931, 3906, -9803, -1590, 13500, -5932, 24528, -5092, 5805, 13930, 18709, -29274, + -4438, -28349, -16792, -12293, + ); + let b = i16x16::new( + 25543, -11013, -16650, -29925, 4461, 18433, 13374, 9428, 26865, -4164, -13533, -10962, + -8190, -12396, 472, 9930, + ); + let r = i64x4::new( + -447545208418971197, + -1433165230546602820, + -8239898463019854163, + -3459962532381069654, + ); + + assert_eq!( + r, + transmute(lasx_xvfrstpi_h::<10>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrstp_b() { + let a = i8x32::new( + -104, -22, 61, 22, 9, -98, -4, 16, 115, -71, 58, 60, -74, 82, 83, 120, 120, -76, 92, + -20, 37, 35, -57, -10, 47, -90, -97, -3, 27, -117, 77, 75, + ); + let b = i8x32::new( + 29, 125, -59, -37, -90, 2, -50, -85, -72, 9, 38, 58, -122, 62, 66, -25, 27, 108, -84, + 1, -6, 9, -62, 80, 77, 16, 68, 121, -110, -117, -33, 90, + ); + let c = i8x32::new( + 122, -19, -9, 106, -21, 115, -78, 36, -91, -76, 31, -109, -81, -42, 64, 54, -42, 104, + -10, 41, 36, -38, 119, 49, -46, 79, -83, 96, -51, 113, -126, 105, + ); + let r = i64x4::new( + 1224026960602983064, + 8670364650262673779, + -719974344639597448, + 5426146078386791983, + ); + + assert_eq!( + r, + transmute(lasx_xvfrstp_b(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrstp_h() { + let a = i16x16::new( + -9233, 24063, -20305, -23399, -22605, 11453, -986, -31974, 19489, -22401, -5866, + -32108, -8271, 27096, -1449, -1571, + ); + let b = i16x16::new( + -27552, -7496, 14541, 20848, -24250, -18305, -23029, -15273, -2721, -22998, 32468, + 11610, -23627, -30946, 1373, -6292, + ); + let c = i16x16::new( + -14010, 12802, 15942, 32257, 32320, 28150, 20653, -9131, 4498, -8203, 4826, 11234, + -20272, 17945, -15074, 28179, + ); + let r = i64x4::new( + -6586038712809825297, + -8999880904595888205, + -9037598549398827999, + -441921935067521103, + ); + + assert_eq!( + r, + transmute(lasx_xvfrstp_h(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvshuf4i_d() { + let a = i64x4::new( + -8852874241090285557, + -6166977094442369600, + 3546810114463111685, + 2862787957781039790, + ); + let b = i64x4::new( + 7077230945960720129, + -5857643695380455375, + -8499609572374301387, + 9199878426816461564, + ); + let r = i64x4::new( + -5857643695380455375, + -8852874241090285557, + 9199878426816461564, + 3546810114463111685, + ); + + assert_eq!( + r, + transmute(lasx_xvshuf4i_d::<115>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbsrl_v() { + let a = i8x32::new( + 79, 63, 116, -13, 32, -126, 102, -10, -64, 71, -81, -118, -128, -14, 21, 13, 75, 38, 6, + 30, -2, 62, 83, 84, 37, -74, -123, 97, -18, -91, -74, 122, + ); + let r = i64x4::new( + -691722414719746225, + 942926330900465600, + 6076269583399265867, + 8842437361645499941, + ); + + assert_eq!(r, transmute(lasx_xvbsrl_v::<0>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvbsll_v() { + let a = i8x32::new( + -101, -112, 50, 67, 51, 4, 101, -35, 34, 44, 17, -5, -113, 12, 52, 63, -61, 11, -55, + 12, -55, 6, -98, -116, -104, -58, -93, -35, -18, 109, -49, 69, + ); + let r = i64x4::new( + -2493582200462471013, + 4554278935710477346, + -8314200401506661437, + 5030360181484275352, + ); + + assert_eq!(r, 
transmute(lasx_xvbsll_v::<0>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvextrins_b() { + let a = i8x32::new( + 17, -80, 64, 44, -72, 82, -2, 38, -55, -73, 25, 31, 4, -29, -17, -48, 104, -21, -34, + -20, -21, 70, -35, 46, 99, -119, -21, 1, -57, -91, -18, 20, + ); + let b = i8x32::new( + -77, -46, -33, 123, 16, 123, -111, 58, 36, -70, 57, -6, -59, 45, -77, -82, -98, -91, + -44, -27, -123, 108, -117, 80, 118, -39, -48, -95, 85, -53, 92, 73, + ); + let r = i64x4::new( + 2809773906502660113, + -3391242387545540663, + 3376932729242184552, + 1508325199364983139, + ); + + assert_eq!( + r, + transmute(lasx_xvextrins_b::<69>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvextrins_h() { + let a = i16x16::new( + -10446, -20013, -2609, -3677, 25411, -15077, 11399, 31407, -25336, 8187, 17545, 4284, + 14539, -25105, -16568, -899, + ); + let b = i16x16::new( + -17598, -13358, 1810, -11305, -19139, 20824, 10197, 16587, 27552, -14288, 10157, + -25428, -25392, -10580, -28041, 20313, + ); + let r = i64x4::new( + 2870470609909045042, + 8840333555190686531, + -7892764466205713144, + -252835685454628661, + ); + + assert_eq!( + r, + transmute(lasx_xvextrins_h::<190>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvextrins_w() { + let a = i32x8::new( + 538640697, + -1247440870, + 2006632382, + -1215324238, + -1411224161, + -1343292937, + -407107379, + -1849972197, + ); + let b = i32x8::new( + 1928001842, + 817819193, + -1886180706, + -2057556111, + -1558391607, + 1824082297, + -341759024, + 147045346, + ); + let r = i64x4::new( + -5357717739525968327, + -5219777854239488066, + -5769399231538706055, + -7945570080736409395, + ); + + assert_eq!( + r, + transmute(lasx_xvextrins_w::<133>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvextrins_d() { + let a = i64x4::new( + -7415577103741432638, + 9028147385060226899, + 3806483413885303329, + -8139040440396540849, + ); + let b = i64x4::new( + -7025567873801693340, + 8074885789654734557, + -9150208635842546941, + -6790202101278745327, + ); + let r = i64x4::new( + -7415577103741432638, + -7025567873801693340, + 3806483413885303329, + -9150208635842546941, + ); + + assert_eq!( + r, + transmute(lasx_xvextrins_d::<210>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmskltz_b() { + let a = i8x32::new( + 123, 97, -46, 106, -84, -121, 69, 50, 76, -32, -42, 117, -89, 121, 85, 101, 103, 26, + -117, 20, -90, 44, 126, -128, -120, 12, -28, -18, 45, 77, 45, -59, + ); + let r = i64x4::new(5684, 0, 36244, 0); + + assert_eq!(r, transmute(lasx_xvmskltz_b(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmskltz_h() { + let a = i16x16::new( + -9300, 15427, 23501, 8110, 29557, -8385, -18123, -869, 19048, 30280, 32130, 6792, 3533, + -19264, -7144, 21429, + ); + let r = i64x4::new(225, 0, 96, 0); + + assert_eq!(r, transmute(lasx_xvmskltz_h(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmskltz_w() { + let a = i32x8::new( + -1225647162, + 786607282, + -476336095, + -591696091, + 1992561919, + -832745020, + 1971757146, + -1595190261, + ); + let r = i64x4::new(13, 0, 10, 0); + + assert_eq!(r, transmute(lasx_xvmskltz_w(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmskltz_d() { + let a = i64x4::new( + 1070935900765754723, + 8590124656098588796, + 2469446778159209649, + 5778474674811894997, + ); + let r = 
i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvmskltz_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsigncov_b() { + let a = i8x32::new( + 88, -3, -96, 121, 86, -94, 40, 5, -55, -8, 84, 31, -93, -72, -28, 58, -87, 56, 8, 94, + 97, -72, 116, 71, 73, -21, -109, 123, 81, 125, 24, -23, + ); + let b = i8x32::new( + 92, -37, 80, 100, 79, -105, -24, 16, -113, -66, -48, 32, 107, 11, -100, -43, 7, 99, 24, + 38, 84, -40, 55, -73, -112, 84, 59, -88, -102, 83, -65, 87, + ); + let r = i64x4::new( + 1218339488916317532, + -3070059025110384015, + -5244678899168156679, + -6215157037026399088, + ); + + assert_eq!(r, transmute(lasx_xvsigncov_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsigncov_h() { + let a = i16x16::new( + 14096, 7677, -14561, -21692, 19661, -15938, 19461, 3041, -31532, 19690, -2669, -20964, + -23817, -21867, 16694, -15396, + ); + let b = i16x16::new( + -15034, -7726, 181, 30057, -22414, -21472, 21361, 4765, -12995, -32566, 7068, -18429, + -22953, -7497, 14762, -10184, + ); + let r = i64x4::new( + -8460012673615870650, + 1341320010229917810, + 5187553466109276867, + 2866604565619890601, + ); + + assert_eq!(r, transmute(lasx_xvsigncov_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsigncov_w() { + let a = i32x8::new( + -1256172687, + 1338321047, + 354406336, + -462763275, + 187721986, + -940691165, + -1179299422, + -1424929206, + ); + let b = i32x8::new( + -118338197, + 331139357, + 644951541, + -1931633026, + -3454036, + -520396646, + 1909538523, + 41991994, + ); + let r = i64x4::new( + 1422232708851806869, + 8296300675188469237, + 2235086579809602476, + -180354238538399451, + ); + + assert_eq!(r, transmute(lasx_xvsigncov_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsigncov_d() { + let a = i64x4::new( + 3750427451628106019, + -1382697069711266350, + -503292598450220754, + -2919664281580184898, + ); + let b = i64x4::new( + -1642478899758371170, + 4653675866380276086, + -6612106063359352920, + -293290471183495768, + ); + let r = i64x4::new( + -1642478899758371170, + -4653675866380276086, + 6612106063359352920, + 293290471183495768, + ); + + assert_eq!(r, transmute(lasx_xvsigncov_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfmadd_s() { + let a = u32x8::new( + 1062320727, 1052840336, 1056978973, 1021320864, 1047491708, 1057181752, 1065099904, + 1057641824, + ); + let b = u32x8::new( + 1031536608, 1056182872, 1060915258, 1049713234, 1050950720, 1059791774, 1059318083, + 1051234082, + ); + let c = u32x8::new( + 1061252634, 1060194113, 1034936984, 1061661636, 1060064922, 1006614016, 1059417135, + 1050039034, + ); + let r = i64x4::new( + 4566451999453631823, + 4560361667101758314, + 4518113787508851321, + 4535521032267853298, + ); + + assert_eq!( + r, + transmute(lasx_xvfmadd_s(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfmadd_d() { + let a = u64x4::new( + 4602842753634531585, + 4595402401334175048, + 4601214875019142940, + 4604030967498454410, + ); + let b = u64x4::new( + 4598948128295145186, + 4601733706721520294, + 4603769303486824150, + 4604117155996961650, + ); + let c = u64x4::new( + 4580452284864657312, + 4600663302047027414, + 4606609389472923777, + 4596161355449103520, + ); + let r = i64x4::new( + 4595235980529776159, + 4602058356150948088, + 4608067122875931060, + 4603786516863404306, + ); + + 
assert_eq!( + r, + transmute(lasx_xvfmadd_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfmsub_s() { + let a = u32x8::new( + 1053706718, 1064190592, 1065194002, 1049204796, 1058065270, 1054990514, 1052198782, + 1061344475, + ); + let b = u32x8::new( + 1052072326, 1062946662, 1062413428, 1054564788, 1064477491, 1062331484, 1058685254, + 1048115308, + ); + let c = u32x8::new( + 1051545776, 1052538894, 1034162080, 1012676672, 1042769032, 1060397176, 1036487208, + 1047947488, + ); + let r = i64x4::new( + 4529410253708099330, + 4454144102220210572, + -4706385850068449532, + -4799792193244875572, + ); + + assert_eq!( + r, + transmute(lasx_xvfmsub_s(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfmsub_d() { + let a = u64x4::new( + 4600920645370262278, + 4606351881217070920, + 4605318237650453082, + 4606278590304909259, + ); + let b = u64x4::new( + 4587150424227513280, + 4605394922115166652, + 4600659107885415374, + 4603309679459912257, + ); + let c = u64x4::new( + 4599568550479871818, + 4607122878168983077, + 4594751414351299244, + 4606268515473003992, + ); + let r = i64x4::new( + -4624155064942819898, + -4624913073348173037, + 4594667261719455656, + -4622752308912416305, + ); + + assert_eq!( + r, + transmute(lasx_xvfmsub_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfnmadd_s() { + let a = u32x8::new( + 1039663832, 1061072453, 1059429769, 1055008244, 1064943875, 1031669664, 1057273263, + 1059384715, + ); + let b = u32x8::new( + 1048864374, 1058998841, 1057533884, 1058902812, 1062707313, 1041334952, 1042897040, + 1049077472, + ); + let c = u32x8::new( + 1059665677, 1057796240, 1060649005, 1032551792, 1054598086, 1052603136, 1052306030, + 1040847308, + ); + let r = i64x4::new( + -4647271481419416743, + -4706804117592845625, + -4701205915483756606, + -4711770517136945317, + ); + + assert_eq!( + r, + transmute(lasx_xvfnmadd_s(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfnmadd_d() { + let a = u64x4::new( + 4604608697786889945, + 4602612366462296312, + 4601635234875928748, + 4605244074506891174, + ); + let b = u64x4::new( + 4589783027170388200, + 4605787546878420832, + 4591185942485517728, + 4604114400983891746, + ); + let c = u64x4::new( + 4606499207929193159, + 4602090155238640016, + 4605981237511158859, + 4603473909221104351, + ); + let r = i64x4::new( + -4616415827217001188, + -4617209466841496233, + -4617030428660783542, + -4615713336403701073, + ); + + assert_eq!( + r, + transmute(lasx_xvfnmadd_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfnmsub_s() { + let a = u32x8::new( + 1064224098, 1059043256, 1061588698, 1059572349, 1061959798, 1042453224, 1036562968, + 1056461556, + ); + let b = u32x8::new( + 1061205590, 1049560178, 1059192066, 1061005027, 1054917726, 1061034231, 1058796762, + 1061794461, + ); + let c = u32x8::new( + 1025067264, 1063481799, 1058824148, 1061822410, 1057397992, 1059256144, 1059389703, + 1052234474, + ); + let r = i64x4::new( + 4555061808459295114, + 4511379579414633985, + 4540975425961318277, + -4846656492652873586, + ); + + assert_eq!( + r, + transmute(lasx_xvfnmsub_s(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfnmsub_d() { + let a = u64x4::new( + 4585643461608569024, + 4605011746261589541, + 
4602843862374894962, + 4596919096453581616, + ); + let b = u64x4::new( + 4603616678040017345, + 4599749349009999872, + 4603258706135001603, + 4603783118222515934, + ); + let c = u64x4::new( + 4605444602262387771, + 4593682097024038340, + 4599004459823205548, + 4595599337151422272, + ); + let r = i64x4::new( + 4605237590347011909, + -4629492016214849095, + 4570217977506301115, + 4586582751878211231, + ); + + assert_eq!( + r, + transmute(lasx_xvfnmsub_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrne_w_s() { + let a = u32x8::new( + 1064249874, 1024076480, 1048811302, 1045498088, 1062853975, 1050962974, 1062155621, + 1062916560, + ); + let r = i64x4::new(1, 0, 1, 4294967297); + + assert_eq!(r, transmute(lasx_xvftintrne_w_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrne_l_d() { + let a = u64x4::new( + 4591358556337662184, + 4604590073262881231, + 4606169601365380521, + 4596710878897869904, + ); + let r = i64x4::new(0, 1, 1, 0); + + assert_eq!(r, transmute(lasx_xvftintrne_l_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrp_w_s() { + let a = u32x8::new( + 1036136200, 1059809120, 1051167120, 1057100667, 1042968648, 1063707411, 1063195788, + 1061888439, + ); + let r = i64x4::new(4294967297, 4294967297, 4294967297, 4294967297); + + assert_eq!(r, transmute(lasx_xvftintrp_w_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrp_l_d() { + let a = u64x4::new( + 4585505041718488768, + 4601087510575360504, + 4599806583262831052, + 4595165936320641380, + ); + let r = i64x4::new(1, 1, 1, 1); + + assert_eq!(r, transmute(lasx_xvftintrp_l_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrm_w_s() { + let a = u32x8::new( + 1057789434, 1054177120, 1060875884, 1015620960, 1056089726, 1050746790, 1022621568, + 1056386214, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvftintrm_w_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrm_l_d() { + let a = u64x4::new( + 4603222821759326038, + 4603232821889844771, + 4606305215983768062, + 4597476035020392948, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvftintrm_l_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftint_w_d() { + let a = u64x4::new( + 4590993770331821784, + 4601838197892262822, + 4578381772647210176, + 4602974423286505396, + ); + let b = u64x4::new( + 4598764447835256340, + 4585609299219476064, + 4605520309365062132, + 4604323432136071446, + ); + let r = i64x4::new(0, 0, 4294967297, 4294967296); + + assert_eq!(r, transmute(lasx_xvftint_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvffint_s_l() { + let a = i64x4::new( + -4594969696763236122, + -6690984686308779928, + 4592510749553568480, + -8490928078748263946, + ); + let b = i64x4::new( + 7654740714754719601, + 4897940113865969438, + 5957877121068211806, + -7012236593339611923, + ); + let r = i64x4::new( + 6811678997581428276, + -2397684876741504398, + -2395175097567191741, + -2383622820954443903, + ); + + assert_eq!(r, transmute(lasx_xvffint_s_l(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrz_w_d() { + let a = u64x4::new( + 4596886727296090208, + 4602058111141126830, + 4582692816602031424, + 4600921050551730962, + ); + let b = u64x4::new( + 4594050684390877628, + 4605818316975650567, + 
4606490477487570572, + 4599704434038566766, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvftintrz_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrp_w_d() { + let a = u64x4::new( + 4589404978031986168, + 4606941481982333029, + 4594924203912769356, + 4597184562267174648, + ); + let b = u64x4::new( + 4604805957576412467, + 4605348751714663856, + 4603064242276236026, + 4597541345541924472, + ); + let r = i64x4::new(4294967297, 4294967297, 4294967297, 4294967297); + + assert_eq!(r, transmute(lasx_xvftintrp_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrm_w_d() { + let a = u64x4::new( + 4606666486099429909, + 4601456430561276036, + 4591400719822715992, + 4601150269438174040, + ); + let b = u64x4::new( + 4601898131328640396, + 4603752803994862807, + 4602971578268526784, + 4607166074459830797, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvftintrm_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrne_w_d() { + let a = u64x4::new( + 4603578020825687150, + 4602331063342270938, + 4607074154698712999, + 4606049262608662240, + ); + let b = u64x4::new( + 4604303573618654118, + 4605305650790770757, + 4594624155139674016, + 4597424226611516804, + ); + let r = i64x4::new(4294967297, 1, 0, 4294967297); + + assert_eq!( + r, + transmute(lasx_xvftintrne_w_d(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftinth_l_s() { + let a = u32x8::new( + 1060793948, 1047845056, 1008256256, 1062225417, 1052160478, 1061682279, 1017836000, + 1061679812, + ); + let r = i64x4::new(0, 1, 0, 1); + + assert_eq!(r, transmute(lasx_xvftinth_l_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintl_l_s() { + let a = u32x8::new( + 1049069272, 1055517436, 1058463365, 1060600954, 1053028452, 1058398899, 1062375625, + 1064635140, + ); + let r = i64x4::new(0, 0, 0, 1); + + assert_eq!(r, transmute(lasx_xvftintl_l_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvffinth_d_w() { + let a = i32x8::new( + -158173087, + -27800957, + 1158068870, + 278371207, + 106487733, + -1801338365, + -1891310322, + -527557220, + ); + let r = i64x4::new( + 4742644100887478272, + 4733449902607040512, + -4477652498412208128, + -4485741486683455488, + ); + + assert_eq!(r, transmute(lasx_xvffinth_d_w(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvffintl_d_w() { + let a = i32x8::new( + -1977997193, + -1979528264, + 836984862, + -201390618, + 1072540196, + -288815065, + -387961600, + -174426466, + ); + let r = i64x4::new( + -4477288907322425344, + -4477282485545205760, + 4742280327634878464, + -4489746915386195968, + ); + + assert_eq!(r, transmute(lasx_xvffintl_d_w(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrzh_l_s() { + let a = u32x8::new( + 1056351604, 1063464564, 1064583750, 1057296352, 1041896748, 1045603520, 1056628952, + 1057862380, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvftintrzh_l_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrzl_l_s() { + let a = u32x8::new( + 1037928632, 1054629686, 1054996640, 1060820265, 1056507210, 1065161891, 1061180536, + 1053528304, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvftintrzl_l_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn 
test_lasx_xvftintrph_l_s() { + let a = u32x8::new( + 1059417377, 1040833844, 1045894588, 1063338397, 1056670958, 1064221427, 1042275464, + 1040737828, + ); + let r = i64x4::new(1, 1, 1, 1); + + assert_eq!(r, transmute(lasx_xvftintrph_l_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrpl_l_s() { + let a = u32x8::new( + 1050993336, 1043212320, 1055353974, 1052104546, 1049173258, 1052001038, 1062670733, + 1064792601, + ); + let r = i64x4::new(1, 1, 1, 1); + + assert_eq!(r, transmute(lasx_xvftintrpl_l_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrmh_l_s() { + let a = u32x8::new( + 1050100898, 1059826813, 1064587005, 1060468211, 1054982654, 1058930731, 1048352436, + 1059136196, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvftintrmh_l_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrml_l_s() { + let a = u32x8::new( + 1064932806, 1062327525, 1041996288, 1056298428, 1055943822, 1051470160, 1059582897, + 1054164774, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvftintrml_l_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrneh_l_s() { + let a = u32x8::new( + 1064823377, 1059036914, 1061655628, 1036637816, 1061056914, 1057581036, 1048480136, + 1057425421, + ); + let r = i64x4::new(1, 0, 0, 1); + + assert_eq!(r, transmute(lasx_xvftintrneh_l_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvftintrnel_l_s() { + let a = u32x8::new( + 1051117486, 1064733813, 1057650292, 1054601720, 1060065354, 1042171252, 1055495904, + 1060965253, + ); + let r = i64x4::new(0, 1, 1, 0); + + assert_eq!(r, transmute(lasx_xvftintrnel_l_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrintrne_s() { + let a = u32x8::new( + 1042191636, 1057149553, 1054208692, 1059070307, 1043946500, 1058368204, 1065187361, + 1055502338, + ); + let r = i64x4::new( + 4575657221408423936, + 4575657221408423936, + 4575657221408423936, + 1065353216, + ); + + assert_eq!(r, transmute(lasx_xvfrintrne_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrintrne_d() { + let a = u64x4::new( + 4595948761324680740, + 4599917619990044612, + 4603982357523822254, + 4602664966963180606, + ); + let r = i64x4::new(0, 0, 4607182418800017408, 0); + + assert_eq!(r, transmute(lasx_xvfrintrne_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrintrz_s() { + let a = u32x8::new( + 1058076241, 1061463006, 1057120056, 1053378848, 1048357040, 1060603738, 1014341632, + 1064059317, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfrintrz_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrintrz_d() { + let a = u64x4::new( + 4601618692275492658, + 4600007493587145094, + 4605876890989719085, + 4600499427656278116, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfrintrz_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrintrp_s() { + let a = u32x8::new( + 1061637682, 1060303004, 1048139028, 1064254459, 1060496485, 1063015260, 1050062098, + 1060031891, + ); + let r = i64x4::new( + 4575657222473777152, + 4575657222473777152, + 4575657222473777152, + 4575657222473777152, + ); + + assert_eq!(r, transmute(lasx_xvfrintrp_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrintrp_d() { + let a = u64x4::new( + 4596277205079353652, + 
4602920367780564368, + 4605931026619472063, + 4600342272679781386, + ); + let r = i64x4::new( + 4607182418800017408, + 4607182418800017408, + 4607182418800017408, + 4607182418800017408, + ); + + assert_eq!(r, transmute(lasx_xvfrintrp_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrintrm_s() { + let a = u32x8::new( + 1052396158, 1055096688, 1056860582, 1050315636, 1062873063, 1057089721, 1060819485, + 1031018704, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfrintrm_s(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfrintrm_d() { + let a = u64x4::new( + 4593814259274657568, + 4602367426014166064, + 4595326936223928604, + 4605375676692406871, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfrintrm_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvld() { + let a: [i8; 32] = [ + 86, 26, -5, 19, -6, -100, -44, 108, -106, 70, -118, 126, 31, -112, -39, -11, -120, -25, + -62, -45, 43, 83, 3, -116, 87, -28, -69, -91, -68, -126, -96, -88, + ]; + let r = i64x4::new( + 7842065449049856598, + -731394999529617770, + -8357745035768043640, + -6295888532317936553, + ); + + assert_eq!(r, transmute(lasx_xvld::<0>(a.as_ptr()))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvst() { + let a = i8x32::new( + 88, 98, -23, 115, 114, -11, 37, 91, -109, 37, -83, 109, -95, -96, -38, 5, -13, 112, + 113, -80, 90, -37, -112, -76, 57, -113, -52, -109, -125, -124, -52, -18, + ); + let mut o: [i8; 32] = [ + 52, -18, -107, -17, 53, 34, 71, -16, 7, -75, -38, -105, -114, 37, 36, 62, -91, 104, 87, + 85, 74, -94, -53, -98, -77, -7, -17, 107, -9, -78, -64, -68, + ]; + let r = i64x4::new( + 6567925503509488216, + 421826130302805395, + -5435603567682424589, + -1239470096778490055, + ); + + lasx_xvst::<0>(transmute(a), o.as_mut_ptr()); + assert_eq!(r, transmute(o)); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvstelm_b() { + let a = i8x32::new( + -5, -21, 65, 59, 32, 48, -6, 103, 97, 7, 43, -113, -102, 30, -32, -75, 71, 80, 71, -83, + 73, -113, -77, 110, -111, -85, 8, 101, -41, 127, -20, 92, + ); + let mut o: [i8; 32] = [ + -29, -20, -68, -24, 64, 3, -46, 0, -51, -114, 2, 12, 120, -127, -52, 114, -102, -91, + -118, 57, 124, 0, -68, -77, -33, 18, -124, -23, -108, 127, -65, -18, + ]; + let r = i64x4::new( + 59113322426723335, + 8272128968170311373, + -5495516911757515366, + -1243134694581333281, + ); + + lasx_xvstelm_b::<0, 9>(transmute(a), o.as_mut_ptr()); + assert_eq!(r, transmute(o)); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvstelm_h() { + let a = i16x16::new( + -11648, -19047, -15513, 1973, 24885, -9476, 7637, 28480, 13018, 7333, -12654, 16215, + 26055, 26861, -1163, 20219, + ); + let mut o: [i8; 32] = [ + 23, 88, -111, 29, 32, 115, 1, -69, 82, 35, 2, 27, 44, -48, 117, -60, 88, 72, 106, -42, + 73, 79, 56, -63, 58, 55, -84, -49, 124, 26, -123, 64, + ]; + let r = i64x4::new( + -4971565931868119595, + -4290294182150266030, + -4523778647145166760, + 4649151313692342074, + ); + + lasx_xvstelm_h::<0, 6>(transmute(a), o.as_mut_ptr()); + assert_eq!(r, transmute(o)); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvstelm_w() { + let a = i32x8::new( + -1636077495, + -1913212378, + 402520069, + 1598923340, + -615956201, + -719313542, + -1002278595, + -1955360887, + ); + let mut o: [i8; 32] = [ + -111, 55, 4, 18, 52, 121, -113, 36, -50, 17, -101, 124, -119, -45, -16, 64, 57, -59, + -31, 29, -24, 92, 56, -72, 60, 90, 23, -26, -15, -40, -18, 75, + ]; + 
let r = i64x4::new( + 2634457572879213132, + 4679472600292463054, + -5172282020031511239, + 5471549130760739388, + ); + + lasx_xvstelm_w::<0, 3>(transmute(a), o.as_mut_ptr()); + assert_eq!(r, transmute(o)); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvstelm_d() { + let a = i64x4::new( + -7526664681033668234, + 9215683190885160466, + -7392730922884510993, + 8273081902285331784, + ); + let mut o: [i8; 32] = [ + -19, -84, 7, -70, 72, -73, -100, -123, 14, -16, 82, 9, -66, -78, -112, -3, 124, 110, + 103, -66, -1, 109, 69, 70, 103, 8, -6, 99, -125, -94, 100, -56, + ]; + let r = i64x4::new( + -7526664681033668234, + -175443856197488626, + 5063574301226528380, + -4006899083251152793, + ); + + lasx_xvstelm_d::<0, 0>(transmute(a), o.as_mut_ptr()); + assert_eq!(r, transmute(o)); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvinsve0_w() { + let a = i32x8::new( + -1106154721, + 634412656, + -1100544436, + -1769767887, + -1012647261, + 2136829593, + 1072879419, + -1993022923, + ); + let b = i32x8::new( + -2041359214, + -474600924, + 276373021, + 687517976, + -1931658504, + 392817806, + -1316466623, + 736368242, + ); + let r = i64x4::new( + 2724781612877310751, + -7601095192981600692, + -8767571060235945309, + -8559968273390446789, + ); + + assert_eq!( + r, + transmute(lasx_xvinsve0_w::<5>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvinsve0_d() { + let a = i64x4::new( + -3740248607430046939, + 1767794107206960110, + -9137064168958473066, + -7852825851844941424, + ); + let b = i64x4::new( + 431855113748835185, + 3288039304988384340, + -5708126726787922006, + 4289161164888851504, + ); + let r = i64x4::new( + -3740248607430046939, + 1767794107206960110, + -9137064168958473066, + 431855113748835185, + ); + + assert_eq!( + r, + transmute(lasx_xvinsve0_d::<3>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickve_w() { + let a = i32x8::new( + -1564826515, + -458927896, + 1138467779, + 1659848021, + -885088458, + -737326650, + -47750787, + -414548426, + ); + let r = i64x4::new(1138467779, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvpickve_w::<2>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickve_d() { + let a = i64x4::new( + 8402618222187512066, + -7057900739934826301, + -6839567064019939265, + 8714541331515896284, + ); + let r = i64x4::new(8402618222187512066, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvpickve_d::<0>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlrn_b_h() { + let a = i16x16::new( + -798, 1398, -623, -4797, -18857, 26443, 16384, -16263, 21881, -27973, -23498, -9777, + 26657, -16754, 19690, 951, + ); + let b = i16x16::new( + -3568, 18618, 18284, -20348, 30931, -13978, -28022, 30586, 8502, -29737, 27777, 2457, + -24560, 7519, 9137, 13151, + ); + let r = i64x4::new(3463408299017240959, 0, 35748968851799935, 0); + + assert_eq!(r, transmute(lasx_xvssrlrn_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlrn_h_w() { + let a = i32x8::new( + -709437285, + 1569944173, + 840839991, + 1276120983, + -1380474679, + 1717565103, + 1662438257, + 41628460, + ); + let b = i32x8::new( + 1222449199, + -859865335, + -1646420307, + 2051326847, + -1328302771, + -2115559725, + 275103578, + 95546356, + ); + let r = i64x4::new(422210317549567, 0, 11259106657337343, 0); + + assert_eq!(r, transmute(lasx_xvssrlrn_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn 
test_lasx_xvssrlrn_w_d() { + let a = i64x4::new( + 6389812745870818755, + 8763001741694997752, + -1562866978917178065, + 9133752987191586761, + ); + let b = i64x4::new( + -7467566672980641247, + -2330366242646492110, + 7828472137399229278, + 5811058912891800907, + ); + let r = i64x4::new(33428474336875, 0, 9223372034707292159, 0); + + assert_eq!(r, transmute(lasx_xvssrlrn_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrln_b_h() { + let a = i16x16::new( + 1623, -14920, 1170, 12351, -25346, 8330, 32675, 4619, -31613, -16397, 9976, -5234, + 20684, 31015, -27130, 426, + ); + let b = i16x16::new( + 20578, -6736, -13719, -3491, 28139, 17968, -30166, 24185, -29828, 6212, 17476, 15478, + -21520, -14119, -3397, 14549, + ); + let r = i64x4::new(657383790217428863, 0, 941881790371430152, 0); + + assert_eq!(r, transmute(lasx_xvssrln_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrln_h_w() { + let a = i32x8::new( + -1842464126, + -1331342000, + -1187112242, + 453446042, + 960156121, + -1968872136, + -603223901, + -1134334019, + ); + let b = i32x8::new( + -592357508, + 969628508, + 2062627988, + -1366484086, + -1901031633, + 1742501272, + -1277076789, + 2022930291, + ); + let r = i64x4::new(9223103287866884105, 0, 1696871892814295669, 0); + + assert_eq!(r, transmute(lasx_xvssrln_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrln_w_d() { + let a = i64x4::new( + 6056280160463852946, + 3937140140114293823, + -6849002485680852776, + 8030598250493987596, + ); + let b = i64x4::new( + 7030461610430840286, + 3499193251729970464, + 1325445643267409553, + -1126160333119085812, + ); + let r = i64x4::new(3937140138060021759, 0, 9223372034707292159, 0); + + assert_eq!(r, transmute(lasx_xvssrln_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvorn_v() { + let a = i8x32::new( + -112, -60, -62, -15, 46, 34, 52, -37, 122, -78, -19, 95, -80, -17, -47, -38, 49, -4, + -92, -111, 17, 38, 13, -58, -51, -39, -94, -58, -123, -32, 27, -12, + ); + let b = i8x32::new( + 79, -128, 107, 13, 36, -50, 69, -31, 63, 17, -79, 95, -58, 12, 0, 94, -33, -112, -46, + 80, 57, 78, 40, 71, -44, 127, 1, 41, -79, -109, -55, 5, + ); + let r = i64x4::new( + -2324363183275966544, + -288230676800471302, + -81144131007676623, + -126121887133672977, + ); + + assert_eq!(r, transmute(lasx_xvorn_v(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvldi() { + let r = i64x4::new( + -1679332213128, + -1679332213128, + -1679332213128, + -1679332213128, + ); + + assert_eq!(r, transmute(lasx_xvldi::<2680>())); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvldx() { + let a: [i8; 32] = [ + 108, -99, 50, 65, 4, -113, -105, 42, 11, 14, 121, -66, -35, -37, -126, -77, -17, 83, + -77, 28, -33, -105, -107, 20, 119, 103, 51, 7, -108, 37, -15, -93, + ]; + let r = i64x4::new( + 3069078919512759660, + -5511601248518205941, + 1483258636803462127, + -6633479458433833097, + ); + + assert_eq!(r, transmute(lasx_xvldx(a.as_ptr(), 0))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvstx() { + let a = i8x32::new( + -124, -113, -93, 99, -114, 45, -113, 30, 80, -29, 126, 12, -88, -106, -117, -12, 63, + -56, -65, -120, -128, -93, -97, 117, -23, 30, -14, -37, 30, -3, 60, -58, + ); + let mut o: [i8; 32] = [ + 31, -103, -100, 104, 70, 123, -86, -93, -10, 88, 2, 88, 45, -4, 120, -23, -4, 71, -56, + 100, 122, -46, 113, 113, -106, -127, 
-49, 31, -4, -85, 85, -37, + ]; + let r = i64x4::new( + 2202028832387731332, + -825400458184039600, + 8475672796179974207, + -4162173646616256791, + ); + + lasx_xvstx(transmute(a), o.as_mut_ptr(), 0); + assert_eq!(r, transmute(o)); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvextl_qu_du() { + let a = u64x4::new( + 13363392893058409879, + 13062266778638186908, + 4121325568380818738, + 16525525054189099432, + ); + let r = i64x4::new(-5083351180651141737, 0, 4121325568380818738, 0); + + assert_eq!(r, transmute(lasx_xvextl_qu_du(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvinsgr2vr_w() { + let a = i32x8::new( + 37894851, + 6792754, + -1258538001, + -1755752185, + 45667801, + 270850755, + -1397420984, + -643296765, + ); + let r = i64x4::new( + 29174656317668035, + -7540898211419112465, + 1163295138520418131, + -2762938564400051128, + ); + + assert_eq!( + r, + transmute(lasx_xvinsgr2vr_w::<4>(transmute(a), -596457645)) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvinsgr2vr_d() { + let a = i64x4::new( + -8759780246633869569, + 7376911929131157332, + 8748197595361481626, + 15419583081814202, + ); + let r = i64x4::new( + -8759780246633869569, + 7376911929131157332, + 8748197595361481626, + -1262509914, + ); + + assert_eq!( + r, + transmute(lasx_xvinsgr2vr_d::<3>(transmute(a), -1262509914)) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvreplve0_b() { + let a = i8x32::new( + 48, -8, -123, 35, -50, -64, 25, -100, -19, -112, 93, 46, -80, 59, 28, 42, -47, -52, 18, + -55, 50, -48, -25, -127, 97, 19, 71, -24, -71, -21, -114, -110, + ); + let r = i64x4::new( + 3472328296227680304, + 3472328296227680304, + 3472328296227680304, + 3472328296227680304, + ); + + assert_eq!(r, transmute(lasx_xvreplve0_b(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvreplve0_h() { + let a = i16x16::new( + 412, 15338, 12582, -13132, -4679, 11713, 23076, 26826, 14471, -7190, 2282, 29936, + 25689, 11463, -14855, 18183, + ); + let r = i64x4::new( + 115969459958317468, + 115969459958317468, + 115969459958317468, + 115969459958317468, + ); + + assert_eq!(r, transmute(lasx_xvreplve0_h(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvreplve0_w() { + let a = i32x8::new( + 1243734577, + 1718395406, + -1635863561, + 863207308, + 71140354, + 1238191531, + -785900261, + -1886172704, + ); + let r = i64x4::new( + 5341799334363128369, + 5341799334363128369, + 5341799334363128369, + 5341799334363128369, + ); + + assert_eq!(r, transmute(lasx_xvreplve0_w(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvreplve0_d() { + let a = i64x4::new( + -7669512117913941619, + 3607794435492173678, + 6416911432565038933, + 7089802970627232981, + ); + let r = i64x4::new( + -7669512117913941619, + -7669512117913941619, + -7669512117913941619, + -7669512117913941619, + ); + + assert_eq!(r, transmute(lasx_xvreplve0_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvreplve0_q() { + let a = i8x32::new( + 38, -64, -93, 68, 35, 91, 48, -77, 11, -127, -113, -96, -101, 2, -106, -104, 66, 3, + -45, 82, 95, 100, -99, 112, -127, 125, 100, 20, 17, -9, 77, -6, + ); + let r = i64x4::new( + -5534823735004774362, + -7451765666000961269, + -5534823735004774362, + -7451765666000961269, + ); + + assert_eq!(r, transmute(lasx_xvreplve0_q(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_vext2xv_h_b() { + let a = i8x32::new( + -114, -31, -50, -82, -63, -45, 61, -97, -121, 119, 
25, 112, 43, 80, 70, 86, -80, 101, + 109, -126, 58, 103, 8, -108, 124, -29, 93, -96, 26, -11, -63, 58, + ); + let r = i64x4::new( + -22799683568926834, + -27302806455844927, + 31525304773640071, + 24207148650070059, + ); + + assert_eq!(r, transmute(lasx_vext2xv_h_b(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_vext2xv_w_h() { + let a = i16x16::new( + 24818, 30826, -26283, -18137, -18647, -30298, 9378, -8000, 3374, -6396, 3703, 19569, + 25155, 17959, 16236, 26635, + ); + let r = i64x4::new( + 132396661891314, + -77893526906539, + -130124624185559, + -34359738358622, + ); + + assert_eq!(r, transmute(lasx_vext2xv_w_h(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_vext2xv_d_w() { + let a = i32x8::new( + -585251458, + -2113345963, + -1846838006, + -474453663, + -1394782646, + 229470412, + 1572845627, + -904846098, + ); + let r = i64x4::new(-585251458, -2113345963, -1846838006, -474453663); + + assert_eq!(r, transmute(lasx_vext2xv_d_w(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_vext2xv_w_b() { + let a = i8x32::new( + 36, -56, 126, -123, -107, 6, 4, -114, -114, 112, -98, -14, 4, -112, 83, -33, 94, -20, + -123, 85, -34, -65, -73, -33, -84, -29, 9, 42, -76, -59, -84, -18, + ); + let r = i64x4::new(-240518168540, -528280977282, 30064770965, -489626271740); + + assert_eq!(r, transmute(lasx_vext2xv_w_b(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_vext2xv_d_h() { + let a = i16x16::new( + 28568, -25911, 12053, -2728, -19449, -11747, -4351, 8975, -18854, 29749, -13852, 32702, + 6750, 21089, -15985, 20408, + ); + let r = i64x4::new(28568, -25911, 12053, -2728); + + assert_eq!(r, transmute(lasx_vext2xv_d_h(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_vext2xv_d_b() { + let a = i8x32::new( + 18, 112, -36, -67, -20, 76, -103, -91, -114, 14, -121, 115, 35, -36, -123, 13, -107, + -52, 82, 36, 90, 43, -21, 13, -61, -84, 21, -59, 59, -116, -79, -65, + ); + let r = i64x4::new(18, 112, -36, -67); + + assert_eq!(r, transmute(lasx_vext2xv_d_b(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_vext2xv_hu_bu() { + let a = i8x32::new( + 38, -47, -21, -14, 36, 120, -8, -12, 76, 36, 42, 41, -54, 103, 93, 60, -6, -1, 68, -86, + 49, 60, 6, -17, -118, -56, -71, 7, 1, 79, 68, 95, + ); + let r = i64x4::new( + 68117953694990374, + 68680959477153828, + 11540654436122700, + 16888898041348298, + ); + + assert_eq!(r, transmute(lasx_vext2xv_hu_bu(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_vext2xv_wu_hu() { + let a = i16x16::new( + -31465, -19962, 4074, 27214, -1117, 19026, -8469, -13109, 19316, 5127, 15001, -32657, + 4699, 24472, 1480, -18381, + ); + let r = i64x4::new( + 195738839581975, + 116883239997418, + 81716047838115, + 225172250484459, + ); + + assert_eq!(r, transmute(lasx_vext2xv_wu_hu(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_vext2xv_du_wu() { + let a = i32x8::new( + -267466250, + -936328606, + -1799333696, + 1035808674, + -2072455456, + 239819000, + 1616827243, + 740798354, + ); + let r = i64x4::new(4027501046, 3358638690, 2495633600, 1035808674); + + assert_eq!(r, transmute(lasx_vext2xv_du_wu(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_vext2xv_wu_bu() { + let a = i8x32::new( + 54, -26, 32, 112, -121, 62, -95, -28, -103, -110, -103, 110, 127, -48, 101, -81, 35, + -54, -116, 14, -97, 97, -45, 85, -18, 126, 31, 115, -59, 10, -16, -71, + ); + let r = 
i64x4::new(987842478134, 481036337184, 266287972487, 979252543649); + + assert_eq!(r, transmute(lasx_vext2xv_wu_bu(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_vext2xv_du_hu() { + let a = i16x16::new( + -4235, -24126, -30181, 19598, -24220, 19618, -8899, 20393, 31336, -6256, 3392, -18554, + -31864, -32356, -15170, 18814, + ); + let r = i64x4::new(61301, 41410, 35355, 19598); + + assert_eq!(r, transmute(lasx_vext2xv_du_hu(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_vext2xv_du_bu() { + let a = i8x32::new( + 69, 25, 36, -52, -55, 23, -66, 10, 23, 74, 121, 113, 82, 22, 49, -96, -124, 46, -78, + 72, -37, 113, 126, -115, 79, -105, -39, -110, -96, 77, -54, -35, + ); + let r = i64x4::new(69, 25, 36, 204); + + assert_eq!(r, transmute(lasx_vext2xv_du_bu(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpermi_q() { + let a = i8x32::new( + 53, 32, -81, -96, 38, -39, 42, -111, -82, -104, -58, 101, 92, -89, -77, 71, -121, -110, + -125, -48, 97, 91, 90, -120, 44, -98, -107, 3, -85, 64, -45, -14, + ); + let b = i8x32::new( + 23, -5, 51, 85, 46, -5, -102, 2, -73, -121, 18, -2, 113, -122, -117, -20, -47, 84, 117, + -17, -21, -78, -91, 69, 6, 34, -115, 73, -21, 9, -36, 92, + ); + let r = i64x4::new( + 5018614086178788561, + 6691234052521665030, + -8621478060979154297, + -949343993201320404, + ); + + assert_eq!( + r, + transmute(lasx_xvpermi_q::<49>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpermi_d() { + let a = i64x4::new( + 539162827834580224, + 7362188367992869351, + 1609032298240495217, + 1788653247091024267, + ); + let r = i64x4::new( + 7362188367992869351, + 1609032298240495217, + 539162827834580224, + 1609032298240495217, + ); + + assert_eq!(r, transmute(lasx_xvpermi_d::<137>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvperm_w() { + let a = i32x8::new( + -708303872, + -376964930, + -1808535729, + -2054828055, + 71139817, + -306901690, + -1914618818, + -1977032311, + ); + let b = i32x8::new( + 1288050919, 621948080, 1756136778, 1515604090, 408174564, 1809111645, 451808315, + 1595060072, + ); + let r = i64x4::new( + -3042141963552235127, + -7767601807216087217, + -1318132721565990423, + -3042141963630030871, + ); + + assert_eq!(r, transmute(lasx_xvperm_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvldrepl_b() { + let a: [i8; 32] = [ + -37, -75, -9, 68, 120, 101, -40, 41, -16, -103, 89, 95, 83, 50, -109, 30, 72, -8, 21, + -41, -5, -67, -60, -85, 111, 105, 122, -69, -33, -5, 118, -114, + ]; + let r = i64x4::new( + -2604246222170760229, + -2604246222170760229, + -2604246222170760229, + -2604246222170760229, + ); + + assert_eq!(r, transmute(lasx_xvldrepl_b::<0>(a.as_ptr()))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvldrepl_h() { + let a: [i8; 32] = [ + 9, 11, -106, 72, -118, -25, 63, -96, -91, -77, -71, 41, -74, -21, -12, 79, -78, -66, + -20, -66, 5, -116, -88, 0, 7, -59, 7, 36, -83, -122, -42, -71, + ]; + let r = i64x4::new( + 795178942675356425, + 795178942675356425, + 795178942675356425, + 795178942675356425, + ); + + assert_eq!(r, transmute(lasx_xvldrepl_h::<0>(a.as_ptr()))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvldrepl_w() { + let a: [i8; 32] = [ + 42, 19, -74, -120, -24, 115, 114, 79, 108, 51, 109, 64, -123, 115, 4, 60, -127, 78, + -103, 44, 28, 14, 75, 19, 126, 86, -22, -55, -66, 32, -11, 112, + ]; + let r = i64x4::new( + -8595661765386824918, + 
-8595661765386824918, + -8595661765386824918, + -8595661765386824918, + ); + + assert_eq!(r, transmute(lasx_xvldrepl_w::<0>(a.as_ptr()))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvldrepl_d() { + let a: [i8; 32] = [ + -58, -81, 9, -23, -6, 105, 110, 81, 123, -99, -71, 23, 21, 18, 21, -94, 123, 120, -87, + -27, 43, 83, 12, -68, 80, 26, 14, 64, 61, 4, -104, -45, + ]; + let r = i64x4::new( + 5867743890882801606, + 5867743890882801606, + 5867743890882801606, + 5867743890882801606, + ); + + assert_eq!(r, transmute(lasx_xvldrepl_d::<0>(a.as_ptr()))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickve2gr_w() { + let a = i32x8::new( + -171617667, + 1234499290, + -496270783, + 916647463, + 1367768596, + -1156952470, + 172419522, + -1633257882, + ); + let r: i32 = 1367768596; + + assert_eq!(r, transmute(lasx_xvpickve2gr_w::<4>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickve2gr_wu() { + let a = i32x8::new( + -547854042, + 1057749415, + -1081569551, + -1895010720, + -1615052351, + -472405371, + 1482004122, + -1099972589, + ); + let r: u32 = 3194994707; + + assert_eq!(r, transmute(lasx_xvpickve2gr_wu::<7>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickve2gr_d() { + let a = i64x4::new( + 5494820280860382649, + -235896250341393106, + 6739870851682505277, + -2213972721378902369, + ); + let r: i64 = 6739870851682505277; + + assert_eq!(r, transmute(lasx_xvpickve2gr_d::<2>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickve2gr_du() { + let a = i64x4::new( + -3274379179178335548, + -1748909263142723978, + -4272175049937479582, + -8920910898336101981, + ); + let r: u64 = 9525833175373449635; + + assert_eq!(r, transmute(lasx_xvpickve2gr_du::<3>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwev_q_d() { + let a = i64x4::new( + -1487944422194570539, + 6635250509470966842, + -5056614467208325955, + -6125778217946781600, + ); + let b = i64x4::new( + -5984805769944216142, + 5786714665975619996, + -2702111374414975767, + -5035182099645850808, + ); + let r = i64x4::new(-7472750192138786681, -1, -7758725841623301722, -1); + + assert_eq!(r, transmute(lasx_xvaddwev_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwev_d_w() { + let a = i32x8::new( + 675098803, + -75093512, + -81250247, + -121202336, + -1671001294, + -285443775, + 1247275542, + 1556903730, + ); + let b = i32x8::new( + -60118452, + 780831551, + -1865678894, + -1327225627, + -1638401313, + 1476017431, + -1866352749, + -523966227, + ); + let r = i64x4::new(614980351, -1946929141, -3309402607, -619077207); + + assert_eq!(r, transmute(lasx_xvaddwev_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwev_w_h() { + let a = i16x16::new( + 22608, -32211, 15906, -27286, -31014, -22869, -2185, 30553, 0, 12445, 343, -20393, + -7421, 12619, -32283, 25803, + ); + let b = i16x16::new( + -922, 25119, -27975, 3966, 7351, -30447, -29386, 20153, -8260, -10355, 15526, -17976, + 30119, 32034, -21917, 30756, + ); + let r = i64x4::new( + -51835960273738, + -135592117558383, + 68161130979260, + -232787227420502, + ); + + assert_eq!(r, transmute(lasx_xvaddwev_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwev_h_b() { + let a = i8x32::new( + 101, 34, 41, -107, -36, -117, 4, -53, -1, -113, 85, 83, 24, -54, -19, -128, 34, 37, + -45, 11, -78, -60, -13, 10, -97, -34, -128, 8, 88, 107, 
65, -45, + ); + let b = i8x32::new( + -117, -119, -45, -12, -81, 85, -5, -43, 118, 117, 123, -107, 55, -109, 18, 96, -89, + -92, -16, -107, 64, 123, 12, -1, 110, 18, -96, 77, -60, -100, -102, -47, + ); + let r = i64x4::new( + -498216402960, + -281135660662667, + -55838507063, + -10414449598922739, + ); + + assert_eq!(r, transmute(lasx_xvaddwev_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwev_q_du() { + let a = u64x4::new( + 10116771403081209132, + 4409447541453417390, + 898338891308675373, + 2921491360808722992, + ); + let b = u64x4::new( + 13196093984731278668, + 13568223424734996564, + 18446645167103959087, + 1830481894073719508, + ); + let r = i64x4::new(4866121314102936184, 1, 898239984703082844, 1); + + assert_eq!(r, transmute(lasx_xvaddwev_q_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwev_d_wu() { + let a = u32x8::new( + 1198556156, 4098846235, 136525854, 1406990253, 2217403106, 390213570, 1993119836, + 1839111140, + ); + let b = u32x8::new( + 2802853372, 1144229232, 3262242038, 3483335391, 3804489865, 583269177, 2356229233, + 699141534, + ); + let r = i64x4::new(4001409528, 3398767892, 6021892971, 4349349069); + + assert_eq!(r, transmute(lasx_xvaddwev_d_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwev_w_hu() { + let a = u16x16::new( + 6322, 31121, 27313, 37809, 33019, 46908, 8254, 44176, 58710, 48196, 24711, 20406, + 18042, 38301, 32766, 13444, + ); + let b = u16x16::new( + 14794, 51570, 1750, 49106, 762, 47300, 64778, 26934, 42322, 39382, 42708, 58300, 788, + 59906, 54890, 41392, + ); + let r = i64x4::new( + 124824634544764, + 313670051595253, + 289562400230056, + 376479653317006, + ); + + assert_eq!(r, transmute(lasx_xvaddwev_w_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwev_h_bu() { + let a = u8x32::new( + 161, 193, 11, 51, 139, 70, 76, 148, 89, 35, 229, 97, 137, 39, 176, 219, 87, 90, 7, 151, + 124, 135, 127, 143, 231, 76, 225, 208, 193, 51, 197, 27, + ); + let b = u8x32::new( + 60, 218, 230, 194, 245, 20, 179, 100, 21, 163, 236, 184, 84, 87, 122, 61, 25, 209, 185, + 207, 241, 56, 216, 245, 230, 103, 251, 152, 157, 115, 48, 190, + ); + let r = i64x4::new( + 71777768344453341, + 83880492278022254, + 96547484687401072, + 68962872563859917, + ); + + assert_eq!(r, transmute(lasx_xvaddwev_h_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwev_q_d() { + let a = i64x4::new( + -7742993219420546326, + -101212755683599810, + -6868163898247798277, + -8375244535493076926, + ); + let b = i64x4::new( + 2520168195081268699, + 9108054891736382097, + 6081995959065773172, + -7633503910634037993, + ); + let r = i64x4::new(8183582659207736591, -1, 5496584216395980167, -1); + + assert_eq!(r, transmute(lasx_xvsubwev_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwev_d_w() { + let a = i32x8::new( + -331902539, + -410274173, + 61822184, + -21356706, + -1286351195, + 1770474991, + -682957064, + -1751781451, + ); + let b = i32x8::new( + 1613863191, + 982997422, + -1638727663, + -849407734, + -68285193, + 822007285, + 144325628, + 1766216748, + ); + let r = i64x4::new(-1945765730, 1700549847, -1218066002, -827282692); + + assert_eq!(r, transmute(lasx_xvsubwev_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwev_w_h() { + let a = i16x16::new( + 28743, 20624, 
20703, 30472, -4294, 10753, -24932, 2990, 15363, 6155, 32468, -23754, + -2447, 26852, 22688, -14794, + ); + let b = i16x16::new( + 23978, -18333, -16768, 15041, 16101, -22819, -5374, -14505, -14490, -28486, 31912, + -14640, 9360, -7613, -27955, 24096, + ); + let r = i64x4::new( + 160936719553181, + -83996675428267, + 2388001846429, + 217514323726817, + ); + + assert_eq!(r, transmute(lasx_xvsubwev_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwev_h_b() { + let a = i8x32::new( + -15, -3, 45, 48, -83, -44, 39, -105, -84, -28, 100, 105, 92, -27, -25, -10, -66, 81, + -107, 86, -125, 111, 23, -60, -67, -7, -53, 26, 114, -11, -82, -3, + ); + let b = i8x32::new( + -3, -39, 34, -41, 12, -46, 111, -59, 120, -86, -90, -16, -80, 110, 115, -3, 124, 93, + -42, 74, 52, 126, -65, 28, 109, 69, -64, 67, -69, -62, -61, 39, + ); + let r = i64x4::new( + -19985131367563276, + -39405757992599756, + 25050517008809794, + -5910188531122352, + ); + + assert_eq!(r, transmute(lasx_xvsubwev_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwev_q_du() { + let a = u64x4::new( + 4097334132097570986, + 3004224617145960419, + 6567223884870023457, + 342771278501784235, + ); + let b = u64x4::new( + 11278175901218237219, + 17453302179390276683, + 10469031865427428464, + 13567003215182256574, + ); + let r = i64x4::new(-7180841769120666233, -1, -3901807980557405007, -1); + + assert_eq!(r, transmute(lasx_xvsubwev_q_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwev_d_wu() { + let a = u32x8::new( + 1172933923, 3561590261, 603333963, 754041205, 663327014, 1707091866, 2563659074, + 2321081680, + ); + let b = u32x8::new( + 3703975407, 3067249102, 1688677432, 1970014868, 2563703919, 3474073919, 962829505, + 706481691, + ); + let r = i64x4::new(-2531041484, -1085343469, -1900376905, 1600829569); + + assert_eq!(r, transmute(lasx_xvsubwev_d_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwev_w_hu() { + let a = u16x16::new( + 59679, 17198, 28545, 44644, 31522, 21827, 19256, 56166, 8797, 57585, 50535, 47800, + 56204, 43584, 6516, 57953, + ); + let b = u16x16::new( + 12708, 41280, 57347, 58871, 47516, 27619, 53764, 58057, 32314, 65212, 64025, 62782, + 47743, 20389, 33764, 7173, + ); + let r = i64x4::new( + -123703648012421, + -148206436499066, + -57934813879261, + -117029268872947, + ); + + assert_eq!(r, transmute(lasx_xvsubwev_w_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwev_h_bu() { + let a = u8x32::new( + 56, 244, 182, 253, 193, 214, 55, 239, 186, 251, 78, 32, 93, 2, 4, 132, 53, 6, 173, 35, + 84, 227, 58, 79, 196, 41, 163, 128, 246, 219, 120, 87, + ); + let b = u8x32::new( + 90, 193, 215, 114, 199, 50, 46, 90, 225, 253, 111, 26, 28, 238, 131, 245, 47, 87, 30, + 95, 33, 50, 192, 132, 14, 240, 47, 254, 29, 155, 145, 45, + ); + let r = i64x4::new( + 2814728290172894, + -35747038576508967, + -37717427826524154, + -7035942402260810, + ); + + assert_eq!(r, transmute(lasx_xvsubwev_h_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwev_q_d() { + let a = i64x4::new( + -683494492458261228, + -5241422472417437680, + 6650370058493421125, + 4779596395103551457, + ); + let b = i64x4::new( + -1623383963768224463, + 6756255500546970238, + -7555682488592816357, + -7648860611106928873, + ); + let r = i64x4::new( + 5539873801618144468, + 60150126978886031, + 
3692294931598396487, + -2723954123981949807, + ); + + assert_eq!(r, transmute(lasx_xvmulwev_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwev_d_w() { + let a = i32x8::new( + 2140792624, + 1544321576, + 1549060875, + -630248052, + -1129263074, + -73878937, + 521128826, + 22556670, + ); + let b = i32x8::new( + -346749156, + 1202859377, + 1486656968, + 370617591, + 1270867102, + -810144613, + 1735249190, + -1555085961, + ); + let r = i64x4::new( + -742318035543025344, + 2302922143674927000, + -1435143290249991548, + 904288373202150940, + ); + + assert_eq!(r, transmute(lasx_xvmulwev_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwev_w_h() { + let a = i16x16::new( + 14750, -29841, -17709, -8196, 31466, 7862, -25367, -12539, 9353, 10914, -12320, -17148, + -6831, -498, 2288, 29204, + ); + let b = i16x16::new( + -12026, 22388, -5312, 184, 18130, -7473, -25877, 31312, -9813, 24876, 26780, -7436, + -15441, 11581, -22259, 14954, + ); + let r = i64x4::new( + 404028471005501364, + 2819310417355001844, + -1417036837779175293, + -218736636965849761, + ); + + assert_eq!(r, transmute(lasx_xvmulwev_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwev_h_b() { + let a = i8x32::new( + -32, 93, 5, -3, -61, -113, 57, 15, -19, 95, 84, 13, 85, -84, 23, 37, -74, -33, -40, 52, + 9, -63, 21, 55, 68, -20, -70, -53, 117, 50, -31, 80, + ); + let b = i8x32::new( + 7, 32, 85, -70, -87, -72, -87, 1, 26, -19, -128, 116, -6, -98, -11, -79, -19, 4, 90, + 47, 88, 112, -37, -100, -119, -82, 7, 77, -62, 76, 61, -120, + ); + let r = i64x4::new( + -1395811616088785120, + -70933880974017006, + -218702651231042178, + -532018857412992924, + ); + + assert_eq!(r, transmute(lasx_xvmulwev_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwev_q_du() { + let a = u64x4::new( + 3072820657428859233, + 11609640493721306675, + 12008349959063387869, + 5948138397283294636, + ); + let b = u64x4::new( + 10527245875383164815, + 7916669328935928828, + 3031495739290315758, + 13060234924687571269, + ); + let r = i64x4::new( + -1534093344768443345, + 1753606948871441014, + -1876472381986713482, + 1973424773030267173, + ); + + assert_eq!(r, transmute(lasx_xvmulwev_q_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwev_d_wu() { + let a = u32x8::new( + 2949007290, 703271383, 711423165, 1456866992, 3752229871, 2536591346, 2389736494, + 3966991514, + ); + let b = u32x8::new( + 196315048, 1279932854, 2296087324, 1350671471, 2200714021, 3470805434, 130970026, + 3503786742, + ); + let r = i64x4::new( + 578934507688699920, + 1633489711156460460, + 8257584887124721291, + 312983850752328844, + ); + + assert_eq!(r, transmute(lasx_xvmulwev_d_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwev_w_hu() { + let a = u16x16::new( + 47934, 48824, 8863, 27185, 38746, 3540, 44988, 31735, 10219, 30176, 19749, 47625, 9605, + 42752, 51816, 20943, + ); + let b = u16x16::new( + 1352, 35948, 33502, 40543, 34675, 10670, 35261, 56591, 28340, 28503, 7709, 11425, + 35242, 32021, 61306, 37078, + ); + let r = i64x4::new( + 1275297019994103664, + 6813200545333146478, + 653887472362785596, + -4803214827614038190, + ); + + assert_eq!(r, transmute(lasx_xvmulwev_w_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwev_h_bu() { + let a = u8x32::new( + 181, 7, 
169, 169, 172, 103, 102, 36, 203, 92, 62, 74, 182, 211, 40, 13, 241, 11, 168, + 240, 139, 224, 217, 76, 58, 133, 28, 147, 22, 142, 180, 136, + ); + let b = u8x32::new( + 247, 29, 191, 188, 209, 191, 193, 157, 228, 251, 166, 237, 216, 180, 183, 151, 51, 82, + 28, 3, 146, 77, 65, 127, 70, 150, 194, 49, 235, 0, 88, 29, + ); + let r = i64x4::new( + 5541270789125811875, + 2060565673950885068, + 3970291708878401539, + 4458585836433706972, + ); + + assert_eq!(r, transmute(lasx_xvmulwev_h_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwod_q_d() { + let a = i64x4::new( + -4400532975246140561, + 6103963578734860361, + 6538041862964443552, + 9150349465675238484, + ); + let b = i64x4::new( + 8731574776501689511, + 8529056615916614298, + -5177328656834536965, + -8950246356268516094, + ); + let r = i64x4::new(-3813723879058076957, 0, 200103109406722390, 0); + + assert_eq!(r, transmute(lasx_xvaddwod_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwod_d_w() { + let a = i32x8::new( + 107177346, + 1165229099, + -1855482949, + -1506158220, + -530530472, + -1932018412, + 1027697605, + -653089829, + ); + let b = i32x8::new( + 605852783, + 1977495085, + 71767549, + -1079077108, + -1117877219, + 1146297949, + -89842401, + 1580029832, + ); + let r = i64x4::new(3142724184, -2585235328, -785720463, 926940003); + + assert_eq!(r, transmute(lasx_xvaddwod_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwod_w_h() { + let a = i16x16::new( + 8333, 3159, -8340, 2860, -10086, -10705, -22151, 9693, -10758, 24078, -6146, -22105, + -9685, -11464, 1434, -10313, + ); + let b = i16x16::new( + 24703, 26602, -11086, -20999, -31901, 27136, 3427, -26885, 13303, 12337, 32133, 9869, + 13049, -11935, 7268, -24263, + ); + let r = i64x4::new( + -77906411752383, + -73839077736401, + -52553219797441, + -148498494282599, + ); + + assert_eq!(r, transmute(lasx_xvaddwod_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwod_h_b() { + let a = i8x32::new( + 84, -26, 37, -73, 68, -16, -46, 83, -36, 80, -20, 61, 84, -41, 48, 23, 117, 43, -82, + -1, -6, -5, -88, -59, -24, 126, -122, -29, -30, 41, 88, -82, + ); + let b = i8x32::new( + 101, -60, -48, 109, 26, -30, -114, -67, 36, -33, -1, -26, 102, 46, 10, -96, 122, -84, + 121, -64, 14, -41, -110, -120, 7, -54, 69, -95, 24, -112, -75, 47, + ); + let r = i64x4::new( + 4784877038010282, + -20547651822747601, + -50102739132219433, + -9570449863999416, + ); + + assert_eq!(r, transmute(lasx_xvaddwod_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwod_q_du() { + let a = u64x4::new( + 5678527968265482955, + 15561833412025074700, + 6604122729549136851, + 2064090124976043119, + ); + let b = u64x4::new( + 17348958871868652420, + 3636555885647953059, + 13556112850172780139, + 15106752613120000479, + ); + let r = i64x4::new(751645223963476143, 1, -1275901335613508018, 0); + + assert_eq!(r, transmute(lasx_xvaddwod_q_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwod_d_wu() { + let a = u32x8::new( + 1981196003, 503742005, 890731178, 1132725820, 1082789967, 1773388022, 3687035574, + 2761826754, + ); + let b = u32x8::new( + 239559029, 4254142036, 2675411124, 540730773, 3579454499, 389539593, 2282534290, + 2381309647, + ); + let r = i64x4::new(4757884041, 1673456593, 2162927615, 5143136401); + + assert_eq!(r, 
transmute(lasx_xvaddwod_d_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwod_w_hu() { + let a = u16x16::new( + 2281, 18176, 25719, 13571, 60992, 4744, 29330, 13668, 8334, 51018, 34330, 25476, 39478, + 10512, 18653, 36146, + ); + let b = u16x16::new( + 12509, 23819, 52059, 39413, 59587, 22877, 24693, 50088, 16716, 29478, 46962, 20510, + 63245, 56365, 48918, 21693, + ); + let r = i64x4::new( + 227564547253259, + 273829934951397, + 197508366154352, + 248416613500221, + ); + + assert_eq!(r, transmute(lasx_xvaddwod_w_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwod_h_bu() { + let a = u8x32::new( + 60, 80, 117, 71, 182, 90, 20, 252, 34, 80, 102, 107, 49, 1, 75, 51, 175, 113, 29, 130, + 107, 245, 172, 220, 129, 144, 11, 136, 248, 112, 109, 250, + ); + let b = u8x32::new( + 138, 100, 21, 101, 14, 54, 118, 39, 31, 118, 184, 186, 69, 89, 154, 138, 240, 210, 94, + 39, 11, 71, 157, 238, 181, 78, 88, 102, 165, 50, 235, 48, + ); + let r = i64x4::new( + 81909836709363892, + 53199157164572870, + 128916896554221891, + 83880238860075230, + ); + + assert_eq!(r, transmute(lasx_xvaddwod_h_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwod_q_d() { + let a = i64x4::new( + -3945435774433072696, + -5580639112190912700, + -8147998114407044390, + -4275535762638580926, + ); + let b = i64x4::new( + 4407006886911950173, + -7345495209927165189, + -2920599937444079395, + 6487551432709971357, + ); + let r = i64x4::new(1764856097736252489, 0, 7683656878360999333, -1); + + assert_eq!(r, transmute(lasx_xvsubwod_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwod_d_w() { + let a = i32x8::new( + 1480945437, + -383133422, + -450202465, + -1667474532, + 425467038, + 483856367, + 397851792, + 2047398851, + ); + let b = i32x8::new( + -1994579383, + 576791476, + -807849214, + -1675047435, + 1888930513, + -1622703443, + 1826948151, + -1929022406, + ); + let r = i64x4::new(-959924898, 7572903, 2106559810, 3976421257); + + assert_eq!(r, transmute(lasx_xvsubwod_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwod_w_h() { + let a = i16x16::new( + 17856, 7337, -32600, -17170, 20316, -23074, 3419, 31841, -19556, 25126, 32449, -4845, + -4101, -15325, -15552, -29507, + ); + let b = i16x16::new( + -5321, -4306, 7409, -32016, -5351, 21871, 12529, 25151, -16361, 17466, 24705, 14901, + -30601, 20878, 16678, -25393, + ); + let r = i64x4::new( + 63763084488059, + 28737626132591, + -84808424219156, + -17665200524651, + ); + + assert_eq!(r, transmute(lasx_xvsubwod_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwod_h_b() { + let a = i8x32::new( + 18, -21, -84, 117, -114, 12, 106, -85, -51, -119, -70, -63, 118, -92, 124, 114, -40, + -12, 116, 97, 61, 0, 121, 33, 123, 85, 26, -89, 30, 99, 21, 25, + ); + let b = i8x32::new( + 23, 122, -99, -17, -36, -51, -64, 99, 20, -7, 85, 1, 65, -15, -45, 43, -82, 77, 103, + 57, -10, 27, 105, -78, 78, 69, 75, 65, 94, -116, 22, 39, + ); + let r = i64x4::new( + -51791125122973839, + 20265871901523856, + 31525081430163367, + -3939721971105776, + ); + + assert_eq!(r, transmute(lasx_xvsubwod_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwod_q_du() { + let a = u64x4::new( + 14173893774454482457, + 3810444305251451895, + 11573438380633440776, + 14010021571042449665, + ); + let b = 
u64x4::new( + 3850106411190823856, + 9879970351878579373, + 18286343935048656427, + 15814090293156005950, + ); + let r = i64x4::new(-6069526046627127478, -1, -1804068722113556285, -1); + + assert_eq!(r, transmute(lasx_xvsubwod_q_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwod_d_wu() { + let a = u32x8::new( + 3407590693, 1202785013, 1220235957, 847407948, 1753366487, 1588252312, 949725107, + 660365194, + ); + let b = u32x8::new( + 3894489434, 440627342, 2074663244, 1619627426, 1047192238, 3243399158, 5736380, + 2062766786, + ); + let r = i64x4::new(762157671, -772219478, -1655146846, -1402401592); + + assert_eq!(r, transmute(lasx_xvsubwod_d_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwod_w_hu() { + let a = u16x16::new( + 5666, 61402, 18774, 63704, 5634, 763, 10164, 61056, 3316, 2644, 36526, 37166, 39369, + 62637, 25134, 63401, + ); + let b = u16x16::new( + 42490, 58823, 51099, 26297, 14231, 33107, 29618, 35846, 40233, 15170, 7280, 21532, + 43600, 42150, 29384, 25015, + ); + let r = i64x4::new( + 160661841644051, + 108280420467112, + 67151813660434, + 164866614644743, + ); + + assert_eq!(r, transmute(lasx_xvsubwod_w_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsubwod_h_bu() { + let a = u8x32::new( + 52, 64, 145, 201, 179, 240, 245, 105, 232, 134, 159, 238, 112, 26, 116, 151, 98, 187, + 75, 8, 123, 231, 244, 249, 2, 61, 252, 18, 221, 229, 97, 180, + ); + let b = u8x32::new( + 161, 161, 97, 228, 198, 212, 5, 77, 243, 42, 221, 12, 112, 20, 43, 195, 186, 156, 232, + 81, 76, 136, 175, 151, 238, 192, 18, 14, 227, 58, 213, 181, + ); + let r = i64x4::new( + 7881423900245919, + -12384873190653860, + 27584960029720607, + -280740536975491, + ); + + assert_eq!(r, transmute(lasx_xvsubwod_h_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwod_q_d() { + let a = i64x4::new( + -4810434630060465465, + 4688732257687902806, + -4456839103181700987, + -8917453762606400882, + ); + let b = i64x4::new( + 6208173123158669961, + -127816522776177372, + 1052866109299034740, + 233879409784875239, + ); + let r = i64x4::new( + -5178962405540445672, + -32487980047399636, + -4213378220890601950, + -113061080830775254, + ); + + assert_eq!(r, transmute(lasx_xvmulwod_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwod_d_w() { + let a = i32x8::new( + -2055655783, + -830862243, + -847861086, + -336854390, + -1217543653, + -1512465773, + -1029760180, + 696500116, + ); + let b = i32x8::new( + 1867516505, + -867512649, + 533129786, + 1783687399, + -1192533976, + 1399910380, + -1289839662, + -1915471625, + ); + let r = i64x4::new( + 720783505379011707, + -600842930740831610, + -2117316535017423740, + -1334126209007208500, + ); + + assert_eq!(r, transmute(lasx_xvmulwod_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwod_w_h() { + let a = i16x16::new( + -11721, 24971, -11669, 16270, -6825, 11583, 26517, -2001, -9346, -14979, 6799, -913, + 32665, 19801, 21245, 3779, + ); + let b = i16x16::new( + -22224, -12256, 16952, -4627, -11217, 527, 18001, -14755, -27194, 17253, -12454, + -27169, 32549, 32431, 24685, 20780, + ); + let r = i64x4::new( + -323330674561769120, + 126807857153516721, + 106537943419101521, + 337273560374881751, + ); + + assert_eq!(r, transmute(lasx_xvmulwod_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] 
+unsafe fn test_lasx_xvmulwod_h_b() { + let a = i8x32::new( + 95, 23, -127, -44, -50, -2, -107, -94, 28, -90, 111, -51, -6, 84, -14, 63, 28, 31, + -120, 33, -68, -22, 49, 85, -42, 36, -99, -60, 119, -39, 55, -81, + ); + let b = i8x32::new( + -76, -123, 85, -8, 61, 68, -54, 35, 75, 25, -10, 41, -88, 30, 106, 13, -47, 51, 14, 52, + -61, 53, -114, -91, -69, 3, -27, -105, -56, 89, -97, 35, + ); + let r = i64x4::new( + -925771782493768461, + 230538833401607990, + -2176932477699619283, + -797714991416606612, + ); + + assert_eq!(r, transmute(lasx_xvmulwod_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwod_q_du() { + let a = u64x4::new( + 7091632338891003648, + 3739044658401562681, + 17715177360220060439, + 15881729055260995184, + ); + let b = u64x4::new( + 3957896596496566926, + 14072319404382751448, + 8435476695188152907, + 13452684919273724788, + ); + let r = i64x4::new( + 6176011447065373208, + 2852374949748893805, + 5535184026733238976, + -6864651532066967840, + ); + + assert_eq!(r, transmute(lasx_xvmulwod_q_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwod_d_wu() { + let a = u32x8::new( + 2766740249, 1667577703, 3569036313, 1579235215, 3396253061, 2456107502, 1991409426, + 75424938, + ); + let b = u32x8::new( + 3618661585, 2352411935, 3028582487, 1023986068, 3092028317, 3835802450, 3486468402, + 2263667528, + ); + let r = i64x4::new( + 3922829691077085305, + 1617114858254984620, + -9025600900074571716, + 170736982952013264, + ); + + assert_eq!(r, transmute(lasx_xvmulwod_d_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwod_w_hu() { + let a = u16x16::new( + 55236, 28771, 53988, 52341, 33854, 22292, 10394, 61333, 4522, 48545, 32239, 37616, + 60335, 27122, 32053, 14922, + ); + let b = u16x16::new( + 64490, 59642, 2029, 25643, 55072, 32592, 44282, 23992, 17266, 4336, 3878, 44058, 48161, + 63520, 51113, 10126, + ); + let r = i64x4::new( + 5764620336637638830, + 6320050114866848320, + 7117988002098042608, + 648970298882764352, + ); + + assert_eq!(r, transmute(lasx_xvmulwod_w_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwod_h_bu() { + let a = u8x32::new( + 34, 239, 30, 169, 91, 195, 107, 97, 212, 207, 110, 55, 238, 210, 149, 21, 238, 150, 4, + 49, 158, 137, 81, 246, 145, 164, 238, 229, 151, 250, 105, 19, + ); + let b = u8x32::new( + 109, 186, 165, 193, 216, 121, 71, 232, 9, 233, 215, 188, 234, 112, 250, 183, 159, 61, + 140, 67, 64, 225, 148, 142, 58, 178, 120, 106, 37, 216, 186, 161, + ); + let r = i64x4::new( + 6334414217787583910, + 1081809353807543399, + -8614127794670853186, + 861263883582730760, + ); + + assert_eq!(r, transmute(lasx_xvmulwod_h_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwev_d_wu_w() { + let a = u32x8::new( + 1465537318, 1382340624, 1603365560, 1355400303, 145165353, 3595116789, 4194509835, + 314900647, + ); + let b = i32x8::new( + -2079155596, + -637150629, + -1781445929, + -2000249885, + 1523945572, + -1514431741, + -1149336021, + 1501805778, + ); + let r = i64x4::new(-613618278, -178080369, 1669110925, 3045173814); + + assert_eq!( + r, + transmute(lasx_xvaddwev_d_wu_w(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwev_w_hu_h() { + let a = u16x16::new( + 748, 28718, 22726, 4135, 23777, 12746, 33222, 13229, 5619, 33293, 48512, 19489, 24736, + 5690, 53405, 55687, + ); + let b = 
i16x16::new( + 8622, -30951, -14339, -27770, -7815, -8146, 31809, -9126, -16637, 3437, 23015, 376, + -964, 9550, -5336, -25533, + ); + let r = i64x4::new( + 36021890720922, + 279306018242138, + 307210420737270, + 206454782975196, + ); + + assert_eq!( + r, + transmute(lasx_xvaddwev_w_hu_h(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwev_h_bu_b() { + let a = u8x32::new( + 88, 218, 182, 176, 220, 158, 136, 109, 143, 78, 151, 35, 3, 38, 106, 192, 31, 178, 127, + 52, 28, 247, 210, 133, 22, 228, 225, 177, 65, 2, 28, 171, + ); + let b = i8x32::new( + -1, 67, 111, 96, 125, 14, -82, -67, -93, -127, 85, -72, 20, -47, 83, -13, -87, -111, + 27, -75, 125, 39, 93, 89, 25, 66, -76, -14, -52, -50, 43, -81, + ); + let r = i64x4::new( + 15201130525294679, + 53198869398028338, + 85287575083483080, + 19984779190796335, + ); + + assert_eq!( + r, + transmute(lasx_xvaddwev_h_bu_b(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwev_d_wu_w() { + let a = u32x8::new( + 1117566668, 2171866262, 3863150800, 2917715295, 3911708395, 1228484642, 2321269874, + 4261467450, + ); + let b = i32x8::new( + 298065186, + 1000727430, + -1974818719, + -2115019739, + 1124007321, + 786270369, + -898501534, + 600072896, + ); + let r = i64x4::new( + 333107716764820248, + -7629022514159825200, + 4396788873597159795, + -2085664542616986716, + ); + + assert_eq!( + r, + transmute(lasx_xvmulwev_d_wu_w(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwev_w_hu_h() { + let a = u16x16::new( + 22502, 13622, 44730, 46411, 64382, 64178, 62884, 38859, 27367, 39034, 18915, 47916, + 24716, 55834, 5119, 58864, + ); + let b = i16x16::new( + 21292, -10920, 292, 28750, -26856, 28754, -1172, -21835, 20852, -32278, -12338, 25813, + -10142, -19321, -22247, 30137, + ); + let r = i64x4::new( + 56097255526935944, + -316539293307705904, + -1002330561839921236, + -489121149480921704, + ); + + assert_eq!( + r, + transmute(lasx_xvmulwev_w_hu_h(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwev_h_bu_b() { + let a = u8x32::new( + 64, 87, 43, 223, 59, 110, 8, 116, 204, 242, 108, 218, 63, 128, 143, 210, 147, 184, 202, + 200, 78, 84, 158, 241, 147, 241, 17, 99, 53, 113, 83, 131, + ); + let b = i8x32::new( + 59, 34, 117, 84, 8, -46, -24, -51, 38, -14, -14, 47, -52, 32, -19, -121, 65, 44, 108, + -40, -89, 15, -31, 88, -51, 75, 71, -50, -15, -77, -11, -98, + ); + let r = i64x4::new( + -54041167974166848, + -764500102863118776, + -1378412775185308333, + -256708593179958601, + ); + + assert_eq!( + r, + transmute(lasx_xvmulwev_h_bu_b(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwod_d_wu_w() { + let a = u32x8::new( + 2842977577, 726151833, 3624948328, 3635170403, 2399571401, 2980175388, 1959530649, + 2789073224, + ); + let b = i32x8::new( + 1477701582, + -1440126406, + -1077662088, + 60551123, + 287903770, + -1406443306, + 1729475940, + 1185250387, + ); + let r = i64x4::new(-713974573, 3695721526, 1573732082, 3974323611); + + assert_eq!( + r, + transmute(lasx_xvaddwod_d_wu_w(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwod_w_hu_h() { + let a = u16x16::new( + 15858, 62454, 8143, 63292, 12915, 37488, 58571, 3762, 9835, 37317, 31941, 1155, 43404, + 17532, 22889, 49328, + ); + let b = i16x16::new( + -10821, -16732, 3696, -6656, 20270, 19108, -9737, 3921, -19713, 14465, -4985, 8060, + 
19692, -13193, -8849, 8523, + ); + let r = i64x4::new( + 243249767821978, + 32998233791764, + 39578123684422, + 248468153045235, + ); + + assert_eq!( + r, + transmute(lasx_xvaddwod_w_hu_h(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwod_h_bu_b() { + let a = u8x32::new( + 207, 56, 245, 126, 208, 205, 19, 229, 182, 28, 85, 188, 132, 80, 149, 101, 93, 95, 56, + 213, 181, 220, 90, 139, 206, 87, 97, 213, 245, 152, 219, 209, + ); + let b = i8x32::new( + 30, -46, -91, 101, 47, -13, 3, -11, -106, 65, 62, 83, 92, -28, -71, 122, 15, -84, -19, + -97, -128, -82, 28, -105, 111, -73, 119, -25, 7, 76, 54, 72, + ); + let r = i64x4::new( + 61362369571520522, + 62769143162536029, + 9570741921251339, + 79095447720558606, + ); + + assert_eq!( + r, + transmute(lasx_xvaddwod_h_bu_b(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwod_d_wu_w() { + let a = u32x8::new( + 3988094295, 3678296912, 2524886697, 507830363, 60676336, 2042142864, 911246321, + 2627081751, + ); + let b = i32x8::new( + -1423964992, + -300941917, + -1300830690, + 301547719, + -728801849, + 1812067428, + -1853372246, + 1459690332, + ); + let r = i64x4::new( + -1106953723992460304, + 153135087601591997, + 3700500567177033792, + 3834725833308331332, + ); + + assert_eq!( + r, + transmute(lasx_xvmulwod_d_wu_w(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwod_w_hu_h() { + let a = u16x16::new( + 22867, 24578, 38420, 43680, 56323, 53684, 33271, 54214, 382, 37378, 51385, 11786, 9873, + 685, 59607, 7054, + ); + let b = i16x16::new( + 14263, 1867, -4762, 7093, 9219, 14229, 23256, -2657, -24665, -648, 14592, -26979, + 12560, 28471, -30607, 30723, + ); + let r = i64x4::new( + 1330676388419350166, + -618675426746189372, + -1365690048421401872, + 930805492797249067, + ); + + assert_eq!( + r, + transmute(lasx_xvmulwod_w_hu_h(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwod_h_bu_b() { + let a = u8x32::new( + 106, 63, 35, 106, 240, 140, 62, 226, 24, 172, 209, 236, 201, 120, 85, 107, 133, 48, + 166, 220, 124, 12, 206, 73, 77, 93, 122, 44, 170, 245, 79, 125, + ); + let b = i8x32::new( + 49, -59, 51, -69, -83, 90, 118, 66, -127, -31, -92, -123, 22, -96, 127, -91, 103, 27, + 111, -67, 79, 32, 36, 51, -18, -108, -123, -57, -30, 14, -66, -118, + ); + let r = i64x4::new( + 4198534873019773307, + -2740489848885548244, + 1047932990890181904, + -4151741170613692220, + ); + + assert_eq!( + r, + transmute(lasx_xvmulwod_h_bu_b(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhaddw_q_d() { + let a = i64x4::new( + 7195063412416833019, + -7198414538777237107, + 3618874101468146190, + 5075453792844537994, + ); + let b = i64x4::new( + -4177888634615683669, + 159708792916303045, + -493012886919538920, + -3327952250593224264, + ); + let r = i64x4::new(7070440900316630840, -1, 4582440905924999074, 0); + + assert_eq!(r, transmute(lasx_xvhaddw_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhaddw_qu_du() { + let a = u64x4::new( + 14174115972304041760, + 11184692435390355059, + 6036753630285484734, + 16987794702390801127, + ); + let b = u64x4::new( + 919078441558396978, + 520168700921507198, + 13672733098019829533, + 11854214779067813220, + ); + let r = i64x4::new(-6342973196760799579, 0, -6232960347008472572, 1); + + assert_eq!(r, transmute(lasx_xvhaddw_qu_du(transmute(a), transmute(b)))); +} + 
+#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhsubw_q_d() { + let a = i64x4::new( + 671584889846600733, + 8179701147067091777, + 8820752382384406910, + -8816577614727005023, + ); + let b = i64x4::new( + 2862152648469207935, + 4714581857093657849, + 3474818266521795377, + -2843283552126606269, + ); + let r = i64x4::new(5317548498597883842, 0, 6155348192460751216, -1); + + assert_eq!(r, transmute(lasx_xvhsubw_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvhsubw_qu_du() { + let a = u64x4::new( + 15891261469744917624, + 6124172835044839452, + 13470444488722494141, + 514760401991858000, + ); + let b = u64x4::new( + 6113118953514320833, + 14909065838985392334, + 1730613981074135290, + 11653977149369645375, + ); + let r = i64x4::new(11053881530518619, 0, -1215853579082277290, -1); + + assert_eq!(r, transmute(lasx_xvhsubw_qu_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwev_q_d() { + let a = i64x4::new( + 6851852253375557634, + -687859074247996461, + -2847020890783636723, + -3396011480229435207, + ); + let b = i64x4::new( + 4881265308617523092, + -6946920457192015262, + 2620975855235645060, + -3109202070840153061, + ); + let c = i64x4::new( + 8576064979838144125, + 4734381367362523796, + 1223742651533162362, + -6069819910741619678, + ); + let r = i64x4::new( + -8703171595748273338, + 1581487120574302805, + 942353693594667509, + -3222137980934690913, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwev_q_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwev_d_w() { + let a = i64x4::new( + 4283476221971713520, + 5997311160552489534, + -7461538125080812198, + 584666845411625444, + ); + let b = i32x8::new( + -1699017988, + -1597461813, + 1949179714, + -22329469, + -25282868, + -1833476595, + -712935020, + -1228584225, + ); + let c = i32x8::new( + 1933742369, + -902774021, + 1152039469, + -966950160, + -2014121439, + -847909444, + 205263209, + 533619002, + ); + let r = i64x4::new( + 998013152882979948, + 8242843123254621400, + -7410615358602605146, + 438327515397946264, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwev_d_w(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwev_w_h() { + let a = i32x8::new( + -497197979, + 2128466895, + 1827806706, + -1515704287, + 1900959403, + -10679846, + 1566686168, + -747997169, + ); + let b = i16x16::new( + 13631, 27024, -7774, -32582, 29199, 15396, -401, -17852, 10337, 15890, -26044, 11510, + 10732, 3619, 18520, -7838, + ); + let c = i16x16::new( + 24759, -9415, -26783, -18619, 13757, -17352, 16725, -25610, 14981, 21116, 23650, + -18473, 13862, 20053, 3522, -18723, + ); + let r = i64x4::new( + -8410788748874544018, + -6538705505380766203, + -2691314320519116016, + -2932473655038329632, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwev_w_h(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwev_h_b() { + let a = i16x16::new( + -2623, -5568, -5250, 8004, 12247, 20872, 32727, 17906, -11062, -13097, -29604, 32623, + -13541, 1792, -32288, 28892, + ); + let b = i8x32::new( + -8, 40, -69, 8, -104, 45, -81, 60, -52, -13, -3, -37, 77, 20, 76, -82, -102, 112, 71, + -10, -62, 75, 112, 96, 49, -67, 98, 67, -118, -51, -77, 67, + ); + let c = i8x32::new( + -40, 23, 23, 75, -24, -86, -52, -98, 74, -106, -3, -8, -40, 43, 31, -7, -120, -68, + -122, -119, 103, 59, 49, -2, -77, 113, 119, 80, 
101, -6, 116, 33, + ); + let r = i64x4::new( + 3438767965960271617, + 5703373312375201999, + -7719324334317042534, + 5618332147678887006, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwev_h_b(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwev_q_du() { + let a = u64x4::new( + 11023906961007219829, + 13619495672295375563, + 7572980537071490433, + 10145709682911964133, + ); + let b = u64x4::new( + 1145103061481704635, + 2210139848484195129, + 8860436254952346498, + 12573896192036293152, + ); + let c = u64x4::new( + 17650249419725637273, + 9888846271395867734, + 14715851951823475494, + 14739680783109267384, + ); + let r = i64x4::new( + -6602489221663665608, + -3731588670586723767, + 4220731810419531981, + -1232639684640242354, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwev_q_du( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwev_d_wu() { + let a = u64x4::new( + 8055198384779363938, + 9925260815913558465, + 6835430604549063591, + 15441192025398831710, + ); + let b = u32x8::new( + 1867493599, 3245935582, 1629087126, 1061202312, 3389402698, 3034357496, 1394979327, + 2925040328, + ); + let c = u32x8::new( + 1765089209, 2899492783, 2529172711, 2742597877, 1149322351, 3557681406, 3462656435, + 2152082771, + ); + let r = i64x4::new( + -7095252889458714487, + -4401240554875374565, + -7715797191809385027, + 1824782095017799339, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwev_d_wu( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwev_w_hu() { + let a = u32x8::new( + 4068171594, 513777862, 1662628135, 150786756, 3404482708, 1100545508, 1296617840, + 2568385675, + ); + let b = u16x16::new( + 9976, 32227, 62018, 53049, 21882, 59596, 30529, 48620, 19006, 49187, 50174, 12259, + 3616, 50420, 60433, 40578, + ); + let c = u16x16::new( + 34105, 44006, 33269, 34929, 41783, 55207, 10361, 3583, 20219, 63815, 58487, 18415, + 9646, 27639, 14059, 7949, + ); + let r = i64x4::new( + -7378378399913155454, + 2006169455487925341, + -1116240736353519778, + -3766489066592466128, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwev_w_hu( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwev_h_bu() { + let a = u16x16::new( + 54677, 20231, 5485, 25733, 3289, 32970, 11379, 23649, 29852, 32207, 10148, 12942, + 13168, 40138, 12570, 48782, + ); + let b = u8x32::new( + 83, 24, 36, 206, 232, 251, 52, 50, 21, 26, 144, 30, 118, 81, 232, 118, 197, 143, 213, + 244, 155, 125, 186, 64, 225, 178, 192, 14, 230, 216, 201, 105, + ); + let c = u8x32::new( + 66, 75, 68, 238, 158, 103, 71, 149, 162, 2, 116, 125, 70, 2, 36, 29, 7, 16, 38, 243, + 166, 196, 122, 253, 77, 64, 67, 156, 8, 203, 49, 225, + ); + let r = i64x4::new( + 8282582185414224635, + 9007565081835870755, + -8416510656124192257, + -1943522820234774755, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwev_h_bu( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwod_q_d() { + let a = i64x4::new( + 9157238656205642393, + -8031082356106754985, + -4372970903210999763, + -8400782536501424126, + ); + let b = i64x4::new( + -2947828926389048030, + 286858961466620958, + -7198913950768528345, + -4558524846284502477, + ); + let c = i64x4::new( + -8966978539573787816, + 5965781064088812819, + 6785842876481166596, + 8957716835940181125, + ); 
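+ // Expected value: per 128-bit lane, the lane of `a` taken as one 128-bit
+ // integer plus the full 128-bit product of the odd-indexed double-words of
+ // `b` and `c`, stored as low then high i64 halves.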
+ let r = i64x4::new( + 8877886904970852051, + -7938310550223636451, + -7469536578939176788, + 7832347329765892515, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwod_q_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwod_d_w() { + let a = i64x4::new( + 3501241531332783035, + 968696574349111989, + 1223338638204507697, + 5231578199334978816, + ); + let b = i32x8::new( + 1210545902, + 706290701, + -1971714524, + 2103465668, + -305785715, + -218897263, + 280223963, + -838568119, + ); + let c = i32x8::new( + -949605894, + -1724400178, + 172821226, + 2123929230, + -909785648, + 1230257751, + 620207705, + 1402502047, + ); + let r = i64x4::new( + 2283313720808638257, + 5436308790915787629, + 954038583726072184, + 4055484695888539223, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwod_d_w(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwod_w_h() { + let a = i32x8::new( + -598204125, + -531177195, + 1076911560, + 259752194, + -1069455958, + -916568789, + -1193369377, + 1159492541, + ); + let b = i16x16::new( + 10650, -5211, -12808, -28115, -27527, 6937, -16741, 16285, -6142, -7067, -10826, -6660, + -22889, -25629, -3527, -6119, + ); + let c = i16x16::new( + 16852, -6030, -13801, 9261, 24273, 26563, 11733, -28445, 25099, 14402, -23168, -31577, + 25012, 1004, -19731, -30323, + ); + let r = i64x4::new( + -3399682261363746659, + -873916884449488685, + -3033389236009017420, + 5776898425431129891, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwod_w_h(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwod_h_b() { + let a = i16x16::new( + -2829, -7831, 1134, 23799, 31864, -8205, -20884, 2782, -724, -8414, 10611, 31362, + 15971, -25563, 3175, -6328, + ); + let b = i8x32::new( + 112, 116, -120, 74, -42, 25, 1, 19, 51, 102, -40, -73, -28, 14, -45, -57, -17, -77, + -111, -98, -9, 114, -32, -69, 45, -122, -65, 56, -78, 21, 111, -19, + ); + let c = i8x32::new( + 59, 63, -124, -50, -52, 12, 38, 62, 77, -127, 76, -78, 64, -80, -5, 28, 110, -44, -100, + 45, -43, 62, 66, 112, -49, -120, 123, -18, 34, -119, -20, -74, + ); + let r = i64x4::new( + 7030406655824433535, + 334016295025592798, + 6652455533761006184, + -1385416929418315885, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwod_h_b(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwod_q_du() { + let a = u64x4::new( + 1898209592653721751, + 10926860906964806867, + 18361012878168580252, + 14644115162811948975, + ); + let b = u64x4::new( + 1945372576834807415, + 5117230234174825110, + 14390591298317442216, + 9089518245930555118, + ); + let c = u64x4::new( + 17504435078500289086, + 15243444480193333955, + 7810225885258468877, + 13257884975254190749, + ); + let r = i64x4::new( + 1757588433868711129, + -3291266200231780782, + 95766916559772818, + 2730111333315477935, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwod_q_du( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwod_d_wu() { + let a = u64x4::new( + 2715769757208659525, + 2216806074012029777, + 6525838187075271506, + 15876394068735907698, + ); + let b = u32x8::new( + 3928005420, 3020795031, 3881759315, 3226709793, 1296481505, 1362116053, 1131484424, + 3814393787, + ); + let c = u32x8::new( + 2745998525, 4219603367, 1735962907, 3082063756, 2410634838, 3360953922, 2094521244, + 
1329875844, + ); + let r = i64x4::new( + -2984417432676422714, + -6285012695561959331, + -7342896596084770244, + 2502320151861337310, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwod_d_wu( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwod_w_hu() { + let a = u32x8::new( + 2005770472, 418747954, 1467912967, 68663314, 284343496, 1733214400, 2615496661, + 3890476135, + ); + let b = u16x16::new( + 58498, 2430, 4588, 20804, 7171, 26934, 39619, 36043, 59802, 43896, 1388, 64198, 49922, + 4660, 8826, 1254, + ); + let c = u16x16::new( + 17893, 61614, 2263, 35439, 2530, 16965, 34585, 18123, 54862, 61539, 38281, 59547, + 42561, 50393, 65080, 29977, + ); + let r = i64x4::new( + 4965072004097651852, + 3100410633753647765, + 5416148797706570800, + -1575823510936973847, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwod_w_hu( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwod_h_bu() { + let a = u16x16::new( + 36194, 9930, 14883, 39417, 2438, 15023, 58620, 33090, 16572, 36810, 21479, 35773, + 33259, 56285, 62068, 46564, + ); + let b = u8x32::new( + 34, 125, 103, 16, 211, 122, 70, 50, 215, 127, 193, 64, 67, 238, 249, 121, 154, 248, 31, + 26, 187, 25, 188, 191, 248, 214, 207, 40, 155, 190, 91, 127, + ); + let c = u8x32::new( + 67, 32, 89, 53, 76, 235, 37, 230, 178, 122, 2, 56, 126, 94, 210, 6, 69, 2, 54, 188, 23, + 253, 185, 113, 97, 190, 149, 34, 20, 7, 214, 32, + ); + let r = i64x4::new( + -4114695625116050174, + -8928319877028035060, + -2302345889489730900, + -4195956656687996737, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwod_h_bu( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwev_q_du_d() { + let a = i64x4::new( + 7904206285198314726, + -1225358394899025904, + 5806604712820367446, + -4659173034171397511, + ); + let b = u64x4::new( + 6100446525668817642, + 10688882673264876757, + 1423085255226033079, + 13938405669196411480, + ); + let c = i64x4::new( + -8389902415543029131, + -8632894406175228839, + 2642929561135509190, + -3267299416902109004, + ); + let r = i64x4::new( + 6198441987982339544, + -3999948362488217274, + 7993947555517161952, + -4455282630855544942, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwev_q_du_d( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwev_d_wu_w() { + let a = i64x4::new( + -3676091534899840180, + -2004073272115093645, + 5676581346203765904, + 8270698864684440208, + ); + let b = u32x8::new( + 397399052, 3551436848, 2738656943, 743389966, 3499899009, 2260562895, 1875038063, + 133906470, + ); + let c = i32x8::new( + 512699397, + -586471006, + -81269365, + -1769533728, + 2120410562, + -2111545843, + -1045820519, + -2113967596, + ); + let r = i64x4::new( + -3472345280571068536, + -2226642182825544840, + -5348939902888852654, + 6309745584493025511, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwev_d_wu_w( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwev_w_hu_h() { + let a = i32x8::new( + -1456024465, + 1292205813, + -1759432335, + -548381486, + 1089611198, + 478189353, + -1368461698, + -1240728243, + ); + let b = u16x16::new( + 65391, 50824, 24841, 10069, 30833, 20379, 53070, 4097, 15307, 38738, 30453, 47989, + 55589, 23759, 34121, 44875, + ); + let c = i16x16::new( + -28613, 8390, 29884, 18408, 
-17696, -3658, 16755, 18613, 24281, -18, -26803, 16674, + -7826, -21398, -12825, 21830, + ); + let r = i64x4::new( + 8738343996720489220, + 1463752189638585937, + -1451881076969873711, + -7208372751461990044, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwev_w_hu_h( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwev_h_bu_b() { + let a = i16x16::new( + -2899, -28885, 21233, 25414, 18986, 27436, 5272, 11999, -21932, -7709, -1809, -22022, + 19152, 6809, 3926, 23920, + ); + let b = u8x32::new( + 60, 166, 243, 60, 101, 145, 58, 139, 11, 119, 37, 242, 205, 208, 21, 14, 69, 216, 114, + 226, 255, 0, 96, 241, 247, 89, 59, 46, 160, 208, 252, 246, + ); + let c = i8x32::new( + -37, 73, -80, 20, 87, 34, -43, -125, -37, -126, -19, -52, -10, 38, -55, -26, 4, 79, + -59, 3, -48, -97, 73, -126, -122, 84, -108, 90, -73, 123, 53, 51, + ); + let r = i64x4::new( + 6451535402254461953, + 3052328487586973843, + -4225844162003621016, + -7954234670014147302, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwev_h_bu_b( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwod_q_du_d() { + let a = i64x4::new( + 5040785179692297413, + -5698968703706500445, + -731068043920228861, + 3965235820245190976, + ); + let b = u64x4::new( + 10854493275645220911, + 16138982903185851834, + 5339741244155318123, + 14666659343881516356, + ); + let c = i64x4::new( + 3608705967944035653, + 2602681461334264776, + 2583771862194956886, + -8807004962159335926, + ); + let r = i64x4::new( + 2112387857800094741, + -3421893061573111304, + -7025246865660777813, + -3037048219893810668, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwod_q_du_d( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwod_d_wu_w() { + let a = i64x4::new( + -6548782426860122444, + -5512378810555054389, + -8313251399158871596, + -2631108805874731030, + ); + let b = u32x8::new( + 3411181446, 4063156506, 4162056821, 1798829201, 223212533, 2591023005, 958942780, + 723906610, + ); + let c = i32x8::new( + -1601726534, + -337872632, + 396528058, + 691753867, + 2049925652, + -947032016, + -1272465465, + -802105137, + ); + let r = i64x4::new( + -7921611809770266236, + -4268031754690784122, + 7679710934623151940, + -3211758016463986600, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwod_d_wu_w( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwod_w_hu_h() { + let a = i32x8::new( + 29411709, -487241679, -445814375, -898026796, 1702472835, 1332407325, 428234819, + 36330620, + ); + let b = u16x16::new( + 6115, 7084, 54578, 41741, 10808, 9353, 62741, 13372, 25833, 45511, 2751, 162, 49362, + 49913, 10572, 63054, + ); + let c = i16x16::new( + -4084, 19702, -31174, 24313, 27489, -17948, -4193, -6492, -20772, 11511, 15075, -18053, + -30409, 25187, -9190, -5069, + ); + let r = i64x4::new( + 2266055901231345861, + -4229846225082649443, + 5710084886827853700, + -1216721738864979826, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwod_w_hu_h( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmaddwod_h_bu_b() { + let a = i16x16::new( + -31362, 19310, 4398, 21644, -18947, -19503, 21298, 6464, -22249, 24001, 29448, 11657, + -25193, -16348, 5631, 18801, + ); + let b = u8x32::new( + 255, 169, 91, 69, 97, 249, 150, 91, 30, 132, 219, 186, 87, 159, 227, 
164, 250, 45, 9, + 167, 101, 32, 191, 101, 124, 84, 2, 10, 146, 179, 65, 134, + ); + let c = i8x32::new( + -69, 4, -26, 80, -124, 33, 78, -58, -13, -100, 88, -23, 70, 18, 48, -30, -81, 4, 29, + -53, 118, 123, 7, 51, 27, 62, -41, -75, 114, 101, -44, 93, + ); + let r = i64x4::new( + 4606673651486328866, + 434701133187613293, + 4731174792733829579, + 8799854033754305007, + ); + + assert_eq!( + r, + transmute(lasx_xvmaddwod_h_bu_b( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvrotr_b() { + let a = i8x32::new( + -76, -66, -50, 116, 83, -40, -66, 16, 118, -125, 54, 31, 77, -105, -66, 96, 81, -86, + -10, 31, -90, 37, 33, -20, 68, -9, -69, -76, -120, 95, 49, -94, + ); + let b = i8x32::new( + 91, -91, -119, -120, 66, -54, 8, -3, -118, -6, -52, -20, 13, 106, -107, -104, -59, -50, + 31, 106, -25, -35, 115, 62, -31, 120, 59, -89, 7, 35, -100, -87, + ); + let r = i64x4::new( + -9169831505165814378, + 6986742644414341277, + -5538256227715405174, + 5842271601646106402, + ); + + assert_eq!(r, transmute(lasx_xvrotr_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvrotr_h() { + let a = i16x16::new( + -391, -26680, -19180, 8374, -10657, 16157, 18976, -9288, -10450, 9732, 26117, 31925, + 20483, -14847, -1605, 8796, + ); + let b = i16x16::new( + -24978, -7031, 20444, 9930, -18507, -2797, 10351, -20863, 2342, -7299, 397, -8738, + -6411, 11173, 25086, -9162, + ); + let r = i64x4::new( + 3280961714933987815, + 7916365250426044082, + -948799184442377380, + 8109266518466894464, + ); + + assert_eq!(r, transmute(lasx_xvrotr_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvrotr_w() { + let a = i32x8::new( + 807443288, 659305929, 215715568, 461653638, 1156975794, -140043152, 572930522, + -305210344, + ); + let b = i32x8::new( + 425095120, 2007398487, 1779876326, 867842254, -355714240, 1021676577, 2008058921, + -149962463, + ); + let r = i64x4::new( + -7463711091125112800, + 1880373866945277499, + 8922631659077373106, + 8567937817891640092, + ); + + assert_eq!(r, transmute(lasx_xvrotr_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvrotr_d() { + let a = i64x4::new( + 1798291896688439472, + -8678294225084614636, + -3360425612013625394, + 6141382649032010789, + ); + let b = i64x4::new( + -4687895735595482806, + 7366925603772764024, + 113747709542135138, + -4369447114926223278, + ); + let r = i64x4::new( + 3172290282099188988, + -8034032776515152761, + 8319107233083774893, + 4254025119287920211, + ); + + assert_eq!(r, transmute(lasx_xvrotr_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvadd_q() { + let a = i64x4::new( + -2609166907920397576, + 4277631384595295751, + -6908798269010317006, + 5982715628809494048, + ); + let b = i64x4::new( + -8390221664220170851, + 5630840603034329774, + -482468290988389688, + -4276184844647827597, + ); + let r = i64x4::new( + 7447355501568983189, + -8538272086079926090, + -7391266559998706694, + 1706530784161666452, + ); + + assert_eq!(r, transmute(lasx_xvadd_q(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsub_q() { + let a = i64x4::new( + 5635628360514667431, + 8563800808356171400, + -8195308523117763518, + 3653510787018366900, + ); + let b = i64x4::new( + 2471979813421155001, + 4980523206404219656, + 5227116936323454967, + 2410762289023585517, + ); + let r = i64x4::new( + 3163648547093512430, + 
3583277601951951744, + 5024318614268333131, + 1242748497994781383, + ); + + assert_eq!(r, transmute(lasx_xvsub_q(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwev_q_du_d() { + let a = u64x4::new( + 11512774700636858764, + 3877920437650491653, + 5348767768447622976, + 10610828160678410847, + ); + let b = i64x4::new( + 4538357695196601706, + 962354258063947537, + 461386020283085419, + -3214659782190620189, + ); + let r = i64x4::new(-2395611677876091146, 0, 5810153788730708395, 0); + + assert_eq!( + r, + transmute(lasx_xvaddwev_q_du_d(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvaddwod_q_du_d() { + let a = u64x4::new( + 2811249209376266688, + 65866753992142741, + 10134352057937866409, + 17378632901315704999, + ); + let b = i64x4::new( + 771717384571916075, + -6276542900978063061, + -782791668238120654, + -4337892955900394734, + ); + let r = i64x4::new(-6210676146985920320, -1, -5406004128294241351, 0); + + assert_eq!( + r, + transmute(lasx_xvaddwod_q_du_d(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwev_q_du_d() { + let a = u64x4::new( + 1631079386587456680, + 4565265601922112419, + 5351621054404189773, + 12518175210587903555, + ); + let b = i64x4::new( + 7907402685955854803, + -6034016436240875818, + -1692667855436677787, + 857071248435905820, + ); + let r = i64x4::new( + -9215090926608146824, + 699180379527824028, + 8322461491295210849, + -491063186927300825, + ); + + assert_eq!( + r, + transmute(lasx_xvmulwev_q_du_d(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmulwod_q_du_d() { + let a = u64x4::new( + 16516519389168658270, + 11550123424719201061, + 18023411584703351911, + 5733925898426927381, + ); + let b = i64x4::new( + -1630542181497141953, + -8299748862195853267, + -3768558747736596235, + -8223031783298003100, + ); + let r = i64x4::new( + 8208983644526863745, + -5196750351687252927, + 2416926856050984756, + -2556020440107861891, + ); + + assert_eq!( + r, + transmute(lasx_xvmulwod_q_du_d(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmskgez_b() { + let a = i8x32::new( + 3, -116, -122, 1, -82, 30, 73, 60, 22, 102, -51, -22, 59, 125, -61, -78, 89, 25, 31, + 107, 111, 27, -119, -90, 119, 49, -86, -82, 1, -113, -8, -40, + ); + let r = i64x4::new(13289, 0, 4927, 0); + + assert_eq!(r, transmute(lasx_xvmskgez_b(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvmsknz_b() { + let a = i8x32::new( + 52, -33, -37, -47, -126, -26, -42, -37, -96, 90, -32, 25, 62, -95, 114, 53, -88, -66, + -49, -31, -126, -89, -92, 127, -113, -43, 41, 40, -79, 108, -63, -57, + ); + let r = i64x4::new(65535, 0, 65535, 0); + + assert_eq!(r, transmute(lasx_xvmsknz_b(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvexth_h_b() { + let a = i8x32::new( + 86, 82, -64, 55, 99, -98, 18, 55, 53, -101, -88, -23, 101, -32, -7, -69, -92, 77, 92, + -110, 99, 46, 88, -36, 84, 42, 42, -1, -24, -95, -48, -7, + ); + let r = i64x4::new( + -6192823156408267, + -19140324188225435, + -281294585331628, + -1689051729887256, + ); + + assert_eq!(r, transmute(lasx_xvexth_h_b(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvexth_w_h() { + let a = i16x16::new( + -22892, -26139, 11053, 11772, -13928, 20772, 16551, -20590, -10608, 9266, 29842, + -10111, -3519, 29175, 10737, -27281, + ); + let r = i64x4::new( + 89219355625880, + 
-88433376608089, + 125309965824577, + -117171002791439, + ); + + assert_eq!(r, transmute(lasx_xvexth_w_h(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvexth_d_w() { + let a = i32x8::new( + -825627036, + -1996938691, + 78514216, + -1063299454, + 257564527, + -138481584, + -1487536177, + 1875317589, + ); + let r = i64x4::new(78514216, -1063299454, -1487536177, 1875317589); + + assert_eq!(r, transmute(lasx_xvexth_d_w(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvexth_q_d() { + let a = i64x4::new( + 5979507577341197552, + 5196480214883180720, + -8000060569264941491, + 7776492634988202392, + ); + let r = i64x4::new(5196480214883180720, 0, 7776492634988202392, 0); + + assert_eq!(r, transmute(lasx_xvexth_q_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvexth_hu_bu() { + let a = u8x32::new( + 47, 59, 186, 7, 161, 218, 234, 101, 186, 179, 42, 250, 253, 76, 169, 142, 127, 7, 4, + 56, 123, 5, 152, 53, 224, 98, 177, 197, 49, 13, 16, 40, + ); + let r = i64x4::new( + 70368924578021562, + 39970172547367165, + 55451330627633376, + 11259067788754993, + ); + + assert_eq!(r, transmute(lasx_xvexth_hu_bu(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvexth_wu_hu() { + let a = u16x16::new( + 11201, 3109, 64518, 58951, 32582, 32792, 2605, 46256, 28808, 30095, 54960, 26138, + 39952, 56608, 20537, 49215, + ); + let r = i64x4::new( + 140840567603014, + 198668007246381, + 243129508731920, + 211376815493177, + ); + + assert_eq!(r, transmute(lasx_xvexth_wu_hu(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvexth_du_wu() { + let a = u32x8::new( + 1580507769, 1550554068, 3486710391, 717721410, 434913819, 742461632, 1954296323, + 1406265475, + ); + let r = i64x4::new(3486710391, 717721410, 1954296323, 1406265475); + + assert_eq!(r, transmute(lasx_xvexth_du_wu(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvexth_qu_du() { + let a = u64x4::new( + 15671254659731561180, + 6305760528044738869, + 3619266805555730982, + 3857202168052068182, + ); + let r = i64x4::new(6305760528044738869, 0, 3857202168052068182, 0); + + assert_eq!(r, transmute(lasx_xvexth_qu_du(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvrotri_b() { + let a = i8x32::new( + 37, 16, -44, -97, 31, 23, 58, -46, 3, -22, 31, -79, 59, -102, -113, 89, -12, 97, -16, + -83, -69, -115, 127, -110, -107, -36, -16, -51, 26, 48, -58, -4, + ); + let r = i64x4::new( + 3288597436994224466, + -7640170181100982736, + 3024123976131483215, + -3500418816657076903, + ); + + assert_eq!(r, transmute(lasx_xvrotri_b::<4>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvrotri_h() { + let a = i16x16::new( + 8999, -7250, -4236, 2845, 21265, -24726, -14769, -11915, -12193, 28179, 16866, -23983, + -11259, 31467, -30522, 8490, + ); + let r = i64x4::new( + 1601837713137157710, + -6707112604456344030, + 4945941697313284287, + 4779464405959485451, + ); + + assert_eq!(r, transmute(lasx_xvrotri_h::<15>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvrotri_w() { + let a = i32x8::new( + 1273906952, + 1323123989, + -1657206810, + -758313569, + 30529353, + -1084318195, + 470709136, + -1831448763, + ); + let r = i64x4::new( + -6725603050124640824, + -5477967444476451040, + -4487859208269579718, + -1679179889808014898, + ); + + assert_eq!(r, transmute(lasx_xvrotri_w::<11>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn 
test_lasx_xvrotri_d() { + let a = i64x4::new( + -6269890993217399490, + 4900582678319344510, + 4744796290155065976, + 7326839228001128846, + ); + let r = i64x4::new( + 1530846727385147611, + 3134017167653815720, + -5586642937907364280, + -7958311692822812825, + ); + + assert_eq!(r, transmute(lasx_xvrotri_d::<16>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvextl_q_d() { + let a = i64x4::new( + -4167783494125842132, + -8818287186975390348, + 7476993593286219399, + 362651956781912161, + ); + let r = i64x4::new(-4167783494125842132, -1, 7476993593286219399, 0); + + assert_eq!(r, transmute(lasx_xvextl_q_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlni_b_h() { + let a = i8x32::new( + -122, -57, 103, 68, 81, 117, 10, -11, 85, 78, 51, -68, 17, 5, 57, 15, 82, -13, -58, 32, + -126, -109, -28, -108, -90, -102, -13, -26, 80, 87, 44, 12, + ); + let b = i8x32::new( + 107, 49, -98, -36, -98, 81, 126, -15, 96, 112, 83, 75, 70, 12, -92, -96, 119, -26, -75, + 9, -68, 107, 80, 126, -58, 38, -112, 85, 36, -27, 17, -109, + ); + let r = i64x4::new( + 775944073576565014, + -913733859716807048, + 3554001380194360167, + -4434515480828965835, + ); + + assert_eq!( + r, + transmute(lasx_xvsrlni_b_h::<4>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlni_h_w() { + let a = i16x16::new( + 7707, -22772, -29741, -9919, -14059, 17567, -31900, -30801, -21839, 26160, 23241, + -17751, 11400, 21178, -10087, -1621, + ); + let b = i16x16::new( + 26329, -6694, -20485, 30132, 26844, -6674, 8539, 29251, -25304, -9125, -8199, 29075, + 25395, -30076, -29212, -25696, + ); + let r = i64x4::new( + 8233677356103165402, + -8669635304329468148, + -7232628700111184805, + -456179975298914768, + ); + + assert_eq!( + r, + transmute(lasx_xvsrlni_h_w::<16>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlni_w_d() { + let a = i32x8::new( + -406185034, + -895467686, + -717037773, + -469050531, + -1539233593, + -1778247886, + -1546187185, + -2026338244, + ); + let b = i32x8::new( + -446056064, + -1691954961, + -981213165, + -458936270, + -1860231155, + 2056121344, + 1905674092, + 45485615, + ); + let r = i64x4::new( + 2975767411517832185, + 195580534167951033, + -5943753303447109596, + -3593292886058873687, + ); + + assert_eq!( + r, + transmute(lasx_xvsrlni_w_d::<26>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlni_d_q() { + let a = i64x4::new( + 7597626924148193039, + 8987414085353164021, + -3181901883582161412, + -5484978136186304133, + ); + let b = i64x4::new( + 3950632511964740415, + 1415609115522181708, + 3151552885247761103, + -4372710870967542224, + ); + let r = i64x4::new(5149955, 32696021, 51201034, 47154629); + + assert_eq!( + r, + transmute(lasx_xvsrlni_d_q::<102>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlrni_b_h() { + let a = i8x32::new( + 34, -7, -78, 100, -21, -1, 17, 9, -61, -37, -34, -101, 35, -116, 122, -18, -81, -45, + 109, -42, 100, -92, -112, -23, -31, 5, -113, 35, 49, 53, 114, -92, + ); + let b = i8x32::new( + 112, 121, 80, 76, -7, 100, 61, 66, 108, 0, 80, -24, -2, 119, -19, 70, -14, -70, -100, + -17, -108, -13, -85, 119, -8, -115, -56, -35, 14, -83, -84, 17, + ); + let r = i64x4::new( + 5150121261709741177, + -1257457727085451783, + 1345976567149686971, + -6614340865598630188, + ); + + assert_eq!( + r, + transmute(lasx_xvsrlrni_b_h::<8>(transmute(a), transmute(b))) + 
); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlrni_h_w() { + let a = i16x16::new( + 27342, -1239, 27928, 29682, 26896, -14508, -15889, 28618, 8114, -5723, 6531, 16489, + 9888, 9809, 24468, -17705, + ); + let b = i16x16::new( + -4757, 26542, -29532, 16718, -14266, 32474, -3741, 20715, -3284, 22232, -12159, 12153, + 9095, 12312, -9885, 15691, + ); + let r = i64x4::new( + 6884832036274927467, + 6201354748313553750, + 6830765589305542553, + -4972667551599548162, + ); + + assert_eq!( + r, + transmute(lasx_xvsrlrni_h_w::<5>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlrni_w_d() { + let a = i32x8::new( + 289756682, + 172661920, + 1612205654, + 1151400165, + -170063304, + -1551308632, + 700728065, + -2116148576, + ); + let b = i32x8::new( + -2050725054, + 1576856049, + 1261747784, + 550730851, + 956136959, + -2117291501, + 333722873, + 1623097423, + ); + let r = i64x4::new( + 1154968246271901, + 2414660678666580, + 3403881842227606, + 4569312628338973, + ); + + assert_eq!( + r, + transmute(lasx_xvsrlrni_w_d::<43>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrlrni_d_q() { + let a = i64x4::new( + 3267303445176803893, + -4716941928717011909, + -2526932137083513614, + 895449181781228437, + ); + let b = i64x4::new( + 2365189083440669290, + -2671456009299896653, + -5051789062015102943, + 8552962343526201846, + ); + let r = i64x4::new(3, 3, 2, 0); + + assert_eq!( + r, + transmute(lasx_xvsrlrni_d_q::<126>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlni_b_h() { + let a = i8x32::new( + -107, 64, -59, -36, -40, 105, -55, -99, -41, 36, -103, -12, -28, -101, 45, 100, 73, + -21, -30, -52, 105, 47, 41, -81, -123, -14, -118, 97, -35, 59, 106, 86, + ); + let b = i8x32::new( + -73, 37, -91, -37, 7, -55, -86, -122, 88, 17, 59, 126, -32, -53, 61, -110, 23, 50, + -108, -47, 85, 64, -55, -30, 95, 76, -30, -4, -20, 62, 101, 45, + ); + let r = i64x4::new( + 9187201950435737471, + 9187201950435737471, + 9187201950435737471, + 9187201950435737471, + ); + + assert_eq!( + r, + transmute(lasx_xvssrlni_b_h::<4>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlni_h_w() { + let a = i16x16::new( + 5930, 18178, 9007, -17010, -26714, -2479, 7566, 5590, 16536, 7100, -23266, -11745, + -13529, 4421, -4886, -13565, + ); + let b = i16x16::new( + -24390, 15351, 27329, 19807, 29414, -20147, 32425, 16919, -13702, 24649, 12504, 19625, + -21621, -18266, -9493, 32188, + ); + let r = i64x4::new(4294967296, 4295032832, 4294967296, 281474976776192); + + assert_eq!( + r, + transmute(lasx_xvssrlni_h_w::<31>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlni_w_d() { + let a = i32x8::new( + 1916052008, + -1597654810, + -1773664899, + -1047601895, + 1186726373, + -322280569, + -1340612407, + -1064828410, + ); + let b = i32x8::new( + 962275142, + 367045968, + -1148735443, + -1235460518, + 1290051946, + -1409071527, + -1206112029, + -438247212, + ); + let r = i64x4::new( + 9223372034707292159, + 9223372034707292159, + 9223372034707292159, + 9223372034707292159, + ); + + assert_eq!( + r, + transmute(lasx_xvssrlni_w_d::<14>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlni_d_q() { + let a = i64x4::new( + 8566260488262197161, + 7230026431777616732, + 5171247138929999763, + 7672209083386018537, + ); + let b = i64x4::new( + 7413144581871225401, + 
1963917804351928008, + 4461413294595322647, + -319568179542390733, + ); + let r = i64x4::new( + 9223372036854775807, + 9223372036854775807, + 9223372036854775807, + 9223372036854775807, + ); + + assert_eq!( + r, + transmute(lasx_xvssrlni_d_q::<35>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlni_bu_h() { + let a = u8x32::new( + 63, 110, 160, 217, 255, 151, 31, 161, 90, 119, 205, 201, 53, 121, 107, 243, 140, 191, + 5, 109, 173, 46, 21, 136, 126, 162, 107, 116, 221, 46, 104, 127, + ); + let b = i8x32::new( + -19, -66, 94, -50, 114, -125, 71, -72, 91, -112, -36, -97, 0, -113, 63, 124, 21, 67, + -17, 63, -30, -100, -64, 42, 84, 106, 81, 119, 26, 105, -15, 93, + ); + let r = i64x4::new( + 1085669953590270231, + 2165977494045465357, + 796308158196942600, + 1082286764800150807, + ); + + assert_eq!( + r, + transmute(lasx_xvssrlni_bu_h::<11>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlni_hu_w() { + let a = u16x16::new( + 6179, 35983, 31969, 1127, 39823, 7636, 13877, 49933, 49881, 18256, 23272, 43743, 14779, + 42488, 11284, 24455, + ); + let b = i16x16::new( + -2976, 3715, -23929, -18386, 13544, -26884, -14757, 9675, -17650, 8814, 4366, 2063, + 1167, -30247, -25786, 9281, + ); + let r = i64x4::new(4295032832, 281474976710657, 4294967296, 4295032832); + + assert_eq!( + r, + transmute(lasx_xvssrlni_hu_w::<31>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlni_wu_d() { + let a = u32x8::new( + 1435242449, 2536238660, 3898848008, 4040623161, 743412748, 1784708443, 2900988959, + 1523459155, + ); + let b = i32x8::new( + -1925581805, + -241685045, + 745827979, + -811389509, + 834544392, + 1909578565, + 2098160602, + -1160686393, + ); + let r = i64x4::new(-1, -1, -1, -1); + + assert_eq!( + r, + transmute(lasx_xvssrlni_wu_d::<24>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlni_du_q() { + let a = u64x4::new( + 4906311251686769180, + 5252105969529596252, + 3036147848110573085, + 6245591556930524613, + ); + let b = i64x4::new( + -1139822228972687264, + -3655945315912724740, + 6046255801009758548, + -8615916243772089902, + ); + let r = i64x4::new(420379, 149273, 279408, 177510); + + assert_eq!( + r, + transmute(lasx_xvssrlni_du_q::<109>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlrni_b_h() { + let a = i8x32::new( + -47, 51, -118, -97, 65, -7, -102, 38, -97, -64, 87, -87, -10, 84, -105, -80, -8, 81, + -112, -40, -15, 20, -72, -108, -23, -18, 93, -125, -55, 33, 12, -21, + ); + let b = i8x32::new( + 92, 106, -122, -65, -16, 86, 50, -59, 59, -29, -92, -41, 101, 10, 35, 106, -53, 112, + 79, 78, -52, 18, 62, 29, -78, -65, -73, 122, -105, -105, -27, -72, + ); + let r = i64x4::new( + 9157365602904407935, + 9187201949596876648, + 9187201949272276863, + 9170594926804238207, + ); + + assert_eq!( + r, + transmute(lasx_xvssrlrni_b_h::<7>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlrni_h_w() { + let a = i16x16::new( + 27571, 10886, 30311, -21575, -21376, -15868, 15443, -27608, -9760, 16249, 24860, -3987, + 25742, 25311, 2125, -1676, + ); + let b = i16x16::new( + 20889, 11322, -17186, 17589, 10767, 165, 25424, -3527, -16029, -18830, -3174, -27403, + 20745, 19828, 7102, 10767, + ); + let r = i64x4::new( + 9223113262927675391, + 9223231297218904063, + 9223231297218904063, + 9223231297218904063, + ); + + assert_eq!( + r, + 
transmute(lasx_xvssrlrni_h_w::<11>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlrni_w_d() { + let a = i32x8::new( + 13685129, + -1250749430, + -1392632470, + -496387445, + -859105657, + -188800497, + 1260867999, + -2071975844, + ); + let b = i32x8::new( + -2147085485, + -1138150986, + 1740486083, + 129550606, + 761255804, + 107768592, + -897233831, + 2135540054, + ); + let r = i64x4::new( + 9223372034707292159, + 9223372034707292159, + 9223372034707292159, + 9223372034707292159, + ); + + assert_eq!( + r, + transmute(lasx_xvssrlrni_w_d::<27>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlrni_d_q() { + let a = i64x4::new( + 2126435331828238132, + 2988359539712387032, + 2606687986635590409, + -5337426820831497192, + ); + let b = i64x4::new( + 5599657380360976171, + 1936278255544613151, + 4350470739273890826, + 4020807834764701096, + ); + let r = i64x4::new(1803299650, 2783126700, 3744669105, 12209003095); + + assert_eq!( + r, + transmute(lasx_xvssrlrni_d_q::<94>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlrni_bu_h() { + let a = u8x32::new( + 10, 53, 247, 169, 200, 197, 15, 35, 40, 63, 25, 238, 115, 150, 127, 27, 72, 180, 151, + 194, 68, 16, 94, 145, 159, 31, 157, 147, 248, 155, 228, 94, + ); + let b = i8x32::new( + 28, 76, -8, 123, -45, -21, 72, -114, -80, -30, 52, -17, -86, 92, 41, -102, 69, 7, 126, + 112, 93, 90, 52, 3, 85, -11, 40, 64, -22, -54, 38, 100, + ); + let r = i64x4::new(-1, -1, -3422552204, -1); + + assert_eq!( + r, + transmute(lasx_xvssrlrni_bu_h::<4>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlrni_hu_w() { + let a = u16x16::new( + 46345, 65470, 38947, 23932, 57842, 4833, 48042, 40409, 15235, 53592, 48941, 4323, 7891, + 47087, 8916, 53135, + ); + let b = i16x16::new( + -10645, 13954, 25607, -15109, -23253, 24216, -29088, 13185, 29191, -11398, -4777, + -18744, -9822, -25345, 19767, -4882, + ); + let r = i64x4::new( + 3711633057434515075, + -7072319501391495233, + -1373988209909181574, + -3490368948780347048, + ); + + assert_eq!( + r, + transmute(lasx_xvssrlrni_hu_w::<16>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlrni_wu_d() { + let a = u32x8::new( + 2556609468, 306738319, 398886007, 1398704761, 4256553589, 1589981150, 4133102348, + 1371421151, + ); + let b = i32x8::new( + 1653381194, + 1981734587, + -1912314738, + 1375487329, + 900885316, + -1157483971, + 1097724788, + -1431477856, + ); + let r = i64x4::new( + 22535693409672, + 22917945492626, + 46913927786177, + 22471268898737, + ); + + assert_eq!( + r, + transmute(lasx_xvssrlrni_wu_d::<50>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrlrni_du_q() { + let a = u64x4::new( + 10427296977042877275, + 11482184389991123309, + 17526981944466620659, + 4352829566336418219, + ); + let b = i64x4::new( + -2649024960844804464, + -4562273421517696438, + -4420539680558072379, + 3588904051642804143, + ); + let r = i64x4::new(-1, -1, -1, -1); + + assert_eq!( + r, + transmute(lasx_xvssrlrni_du_q::<53>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrani_b_h() { + let a = i8x32::new( + -75, 121, -21, -15, 41, -7, 35, 38, -68, -73, -76, -71, 96, 43, -94, 56, -117, -109, + -28, -15, -125, 90, -42, -48, -12, -96, -55, 4, 32, 81, 64, -29, + ); + let b = i8x32::new( + -57, -25, -35, -108, 14, 83, 114, 
-49, -48, 1, -109, 103, 36, -56, 111, 36, 126, 67, + 32, 11, -52, 28, -69, -5, -2, 118, -85, -104, -45, 106, 32, -56, + ); + let r = i64x4::new( + 2650481638178526439, + 4047532886406590841, + -4005221281806152893, + -2066865665249447533, + ); + + assert_eq!( + r, + transmute(lasx_xvsrani_b_h::<8>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrani_h_w() { + let a = i16x16::new( + -18891, 1637, 13894, -632, 7479, -28444, -346, -630, -10322, -16816, 24786, -20705, + 25886, -25922, 2142, 28477, + ); + let b = i16x16::new( + 21255, -8544, -16076, 8180, -16685, -4813, 15309, -18986, 11259, -27708, -15696, 2064, + -27273, -24407, -22250, 31561, + ); + let r = i64x4::new( + 4309310235152241415, + -97358218970876363, + -6262653890212123653, + 603030581262079918, + ); + + assert_eq!( + r, + transmute(lasx_xvsrani_h_w::<0>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrani_w_d() { + let a = i32x8::new( + 495374215, + -1163373413, + 976054174, + 1739213032, + -1526300426, + -1390196250, + 1721157436, + 191851664, + ); + let b = i32x8::new( + -250032972, + -1792143742, + 873982753, + 1073657849, + -422789767, + -119562076, + 282475947, + -868496874, + ); + let r = i64x4::new( + -5770703783532497, + 8837345064960477617, + -4342418500326127026, + -5262798083402195350, + ); + + assert_eq!( + r, + transmute(lasx_xvsrani_w_d::<28>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrani_d_q() { + let a = i64x4::new( + -5439693678259807595, + -4580626123248901724, + 2865305240006228264, + -8764287857747577448, + ); + let b = i64x4::new( + -5465329153910229449, + -6398397336342204188, + -6140402929126091639, + -227294431853722285, + ); + let r = i64x4::new( + -1599599334085551047, + -1145156530812225431, + -56823607963430572, + -2191071964436894362, + ); + + assert_eq!( + r, + transmute(lasx_xvsrani_d_q::<66>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrarni_b_h() { + let a = i8x32::new( + -127, 9, 115, 3, -36, -14, 60, 5, -69, -24, 124, -51, 64, -85, 106, -22, -9, -70, 26, + -34, 108, 46, 86, 1, -82, -103, 79, 112, -121, 40, 36, 4, + ); + let b = i8x32::new( + 7, 107, -33, 86, -95, -124, 35, 9, -27, 101, -64, 109, 34, 39, 38, -48, 20, 118, -95, + 127, 77, -88, 31, 76, 81, 10, 73, -32, -72, 121, 97, 83, + ); + let r = i64x4::new( + 176445634160258736, + -6362222276348332136, + 3935026383906273889, + 4794087966981481135, + ); + + assert_eq!( + r, + transmute(lasx_xvsrarni_b_h::<4>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrarni_h_w() { + let a = i16x16::new( + 72, 17107, 2659, -22852, 13209, -19338, 29569, 8828, -14716, 1062, 26914, 1211, 14641, + 462, -8884, 7159, + ); + let b = i16x16::new( + -17232, 10103, 3681, -11092, 29619, 422, -25692, 26710, 4183, 27520, 31478, -21569, + -7123, 10033, 12272, -5070, + ); + let r = i64x4::new( + 3120663839319243742, + 4483961363433351552, + 1808363419292516360, + -292761337443576989, + ); + + assert_eq!( + r, + transmute(lasx_xvsrarni_h_w::<9>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrarni_w_d() { + let a = i32x8::new( + 1755618482, 374523356, -792192312, 1238002187, -327197280, 1104823907, 1830966401, + 1692510686, + ); + let b = i32x8::new( + -918051703, + -2012887920, + 1331552048, + -1402691916, + 1043562559, + 2068236941, + -2026755109, + 267314745, + ); + let r = 
i64x4::new(-1, 4294967296, 1, 4294967297); + + assert_eq!( + r, + transmute(lasx_xvsrarni_w_d::<63>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvsrarni_d_q() { + let a = i64x4::new( + 567870545843316755, + -8340148388286757707, + 5574111920016803397, + 4080639718254229578, + ); + let b = i64x4::new( + 2101950651821444783, + -8893233216031885881, + -1626396509648873280, + -8228614332001484946, + ); + let r = i64x4::new(-32353394, -30341283, -29935525, 14845281); + + assert_eq!( + r, + transmute(lasx_xvsrarni_d_q::<102>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrani_b_h() { + let a = i8x32::new( + -9, 79, -25, 24, 113, 13, 74, -64, -92, 21, 94, -9, -20, -54, -92, 20, 108, 43, 104, + -53, 111, -89, -71, -19, 63, 98, -15, 42, 22, 34, -71, -122, + ); + let b = i8x32::new( + 89, 71, -128, -8, -83, 115, -7, -12, 45, -29, 1, 28, 32, 14, -93, 78, 52, -30, -13, 38, + -28, -8, -119, -64, 67, 107, -11, 52, 25, -112, 40, 98, + ); + let r = i64x4::new( + 9183261305727861887, + 9187548296613953407, + 9187483425433943936, + -9187484529219043201, + ); + + assert_eq!( + r, + transmute(lasx_xvssrani_b_h::<5>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrani_h_w() { + let a = i16x16::new( + 22159, -17585, 27907, -32059, 19510, -6875, -20701, -10302, -10451, 21878, 24873, + 29927, 15505, -6217, -18330, 22702, + ); + let b = i16x16::new( + -29049, -3742, -27686, -18440, 8738, 8686, 29608, -1629, -4626, 5557, -6248, 23821, + 29245, 14976, -6969, 13087, + ); + let r = i64x4::new( + -9223231301513871360, + -9223231297218904064, + 9223231297218904063, + 9223231301513871359, + ); + + assert_eq!( + r, + transmute(lasx_xvssrani_h_w::<0>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrani_w_d() { + let a = i32x8::new( + 1147593201, + 1099386066, + 1235877455, + 692249820, + -2135577276, + -886668236, + -2044672723, + 1727555657, + ); + let b = i32x8::new( + -1064236617, + 1620556139, + 1782308008, + -1034014776, + 1536995212, + 533284065, + -1618986886, + 1843302450, + ); + let r = i64x4::new( + -542123656805187, + 362937621548090, + 966419181272650, + 905739883141428, + ); + + assert_eq!( + r, + transmute(lasx_xvssrani_w_d::<45>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrani_d_q() { + let a = i64x4::new( + 1412208151721093534, + -7916875471977804537, + -1917411313405122179, + -2235840015390028939, + ); + let b = i64x4::new( + 1186137621302436836, + 759241008247506430, + -5558106622572300047, + -7286741001002884564, + ); + let r = i64x4::new( + 1482892594233410, + -15462647406206650, + -14231916017583759, + -4366875030058651, + ); + + assert_eq!( + r, + transmute(lasx_xvssrani_d_q::<73>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrani_bu_h() { + let a = u8x32::new( + 1, 205, 104, 217, 117, 189, 143, 23, 134, 233, 247, 251, 129, 173, 74, 226, 108, 34, + 242, 113, 228, 183, 247, 66, 58, 69, 232, 3, 194, 209, 216, 161, + ); + let b = i8x32::new( + 83, 63, -94, -103, -40, -9, 26, 104, 112, -91, 71, 32, 61, 29, 79, -128, 112, -4, -66, + -40, -90, -40, 49, 54, -61, 120, 6, -48, 10, -33, -13, 10, + ); + let r = i64x4::new(283674100629507, 16777216, 30115102720, 17246979842); + + assert_eq!( + r, + transmute(lasx_xvssrani_bu_h::<12>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn 
test_lasx_xvssrani_hu_w() { + let a = u16x16::new( + 19110, 42732, 10660, 61644, 61010, 42962, 42748, 16931, 50634, 1738, 45781, 12001, + 56715, 59669, 23910, 35943, + ); + let b = i16x16::new( + 12924, 5630, 2751, 7961, 14757, 29792, -8632, 13429, -3048, -12501, -25328, 421, 20048, + -3741, -20350, 17918, + ); + let r = i64x4::new(-1, -281474976710656, -281471439994880, 4294967295); + + assert_eq!( + r, + transmute(lasx_xvssrani_hu_w::<9>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrani_wu_d() { + let a = u32x8::new( + 790410016, 1846446414, 17060282, 4137690011, 4225559886, 456167206, 2038191803, + 3549679132, + ); + let b = i32x8::new( + 1124175113, + 194327297, + 1714613, + 1768781089, + 1565600638, + -239088013, + -1330211045, + -142923536, + ); + let r = i64x4::new(7418804384752972, 1803170, 0, 445475); + + assert_eq!( + r, + transmute(lasx_xvssrani_wu_d::<42>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrani_du_q() { + let a = u64x4::new( + 4439635547532985516, + 2617773814700322397, + 11329239143202498931, + 5215941649340885573, + ); + let b = i64x4::new( + 4456285459677935774, + -4219123236529314050, + 2135308934637733797, + -2097442597384769114, + ); + let r = i64x4::new(0, 1162, 0, 2316); + + assert_eq!( + r, + transmute(lasx_xvssrani_du_q::<115>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrarni_b_h() { + let a = i8x32::new( + 37, 100, -16, 92, 65, -5, 44, -80, 109, -99, -15, -22, -16, -48, -109, 81, -4, -31, + -48, -58, 103, -92, -51, -109, 32, -112, 36, 90, 79, 79, 66, -50, + ); + let b = i8x32::new( + -17, 23, 25, -46, 124, -97, -58, -51, -68, -108, -48, 69, -126, 115, 46, 13, 66, 54, + 114, 115, -18, -123, 98, 118, -7, -56, -122, -103, -94, -100, 112, 77, + ); + let r = i64x4::new( + 3854939995940880480, + 9187532907754651519, + 9187484529219108735, + -9187484524924075896, + ); + + assert_eq!( + r, + transmute(lasx_xvssrarni_b_h::<6>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrarni_h_w() { + let a = i16x16::new( + -2373, -24512, -6581, 18622, -28242, 12319, -8850, -19323, 12925, -6513, -5054, 31054, + 6907, -16683, -29917, 16639, + ); + let b = i16x16::new( + 30767, -4399, -21574, -27342, 15257, -2000, -28741, 6286, -10858, -32114, -2565, 11901, + -815, -13930, 31355, 23314, + ); + let r = i64x4::new( + 3659161808928759, + -10695946033365040, + 13229207942856641, + 9288532501594099, + ); + + assert_eq!( + r, + transmute(lasx_xvssrarni_h_w::<25>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrarni_w_d() { + let a = i32x8::new( + -409627486, + 1892097986, + -1750910325, + -1547433679, + -1884419017, + 1579223214, + 1151303281, + -1571586603, + ); + let b = i32x8::new( + 364285131, + 2006347587, + 155571363, + -1533032556, + -1176543806, + 163000547, + 557435884, + -1610070779, + ); + let r = i64x4::new(-12884901884, -12884901884, -12884901888, -12884901885); + + assert_eq!( + r, + transmute(lasx_xvssrarni_w_d::<61>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrarni_d_q() { + let a = i64x4::new( + -3977765823996238174, + 7327308686384468121, + 6534356875603597306, + -6213176538981319905, + ); + let b = i64x4::new( + 3336126315622887836, + -1421822040970831870, + -3632342560101816908, + 6607031745644833811, + ); + let r = i64x4::new(-2, 13, 11, -11); + + assert_eq!( + r, + 
transmute(lasx_xvssrarni_d_q::<123>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrarni_bu_h() { + let a = u8x32::new( + 193, 242, 89, 76, 29, 42, 190, 17, 62, 209, 26, 45, 231, 78, 123, 125, 177, 121, 30, + 205, 85, 184, 30, 54, 64, 91, 228, 123, 242, 32, 245, 116, + ); + let b = i8x32::new( + 5, 94, 72, 83, 78, 23, 5, 51, 110, -86, 74, 70, -99, 111, -112, -94, -89, 1, -14, -17, + 116, -105, 29, 34, -52, 15, 45, 47, -121, -106, 28, 37, + ); + let r = i64x4::new( + 7901090775700760, + 2239427009405719296, + 648531557811748864, + 2091956210793185310, + ); + + assert_eq!( + r, + transmute(lasx_xvssrarni_bu_h::<10>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrarni_hu_w() { + let a = u16x16::new( + 51760, 63593, 22275, 32531, 40741, 58073, 26835, 39742, 8352, 44544, 27074, 30619, + 37450, 62701, 34849, 52300, + ); + let b = i16x16::new( + 4460, -2173, 9587, -13951, -27036, 22540, -29433, 21420, 8161, -13247, -22431, -17918, + 14542, -22571, 29221, -25316, + ); + let r = i64x4::new(281479271677952, 131072, 0, 131072); + + assert_eq!( + r, + transmute(lasx_xvssrarni_hu_w::<30>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrarni_wu_d() { + let a = u32x8::new( + 3607104991, 1691528601, 1646387994, 3297780207, 1308777898, 2787161654, 1384884119, + 2469722276, + ); + let b = i32x8::new( + 1057151305, + 1547571989, + -1438179575, + -674675006, + -1782337903, + -1886071573, + 1398821536, + -842047108, + ); + let r = i64x4::new(3, 3, 0, 0); + + assert_eq!( + r, + transmute(lasx_xvssrarni_wu_d::<61>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvssrarni_du_q() { + let a = u64x4::new( + 17745120891134780613, + 17495926160423737090, + 17172121380293899495, + 9650615204759187347, + ); + let b = i64x4::new( + -1697356653875036425, + 8295898722167744374, + 3345487212441260159, + -6164422872274135032, + ); + let r = i64x4::new(-1, 0, 0, 0); + + assert_eq!( + r, + transmute(lasx_xvssrarni_du_q::<15>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xbnz_b() { + let a = u8x32::new( + 52, 144, 253, 233, 192, 255, 120, 244, 63, 161, 189, 203, 12, 208, 233, 255, 43, 119, + 120, 82, 121, 194, 249, 47, 211, 41, 120, 204, 13, 67, 208, 223, + ); + let r: i32 = 1; + + assert_eq!(r, transmute(lasx_xbnz_b(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xbnz_d() { + let a = u64x4::new( + 1072041358626911785, + 13770317343519767693, + 7609734988530058463, + 15151929908370022007, + ); + let r: i32 = 1; + + assert_eq!(r, transmute(lasx_xbnz_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xbnz_h() { + let a = u16x16::new( + 19391, 20489, 16878, 56279, 52740, 3527, 27948, 60443, 25278, 61969, 6762, 35448, + 28924, 34327, 22427, 5444, + ); + let r: i32 = 1; + + assert_eq!(r, transmute(lasx_xbnz_h(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xbnz_v() { + let a = u8x32::new( + 137, 127, 48, 118, 43, 194, 48, 37, 231, 38, 31, 50, 240, 208, 254, 90, 200, 158, 40, + 38, 192, 180, 105, 245, 102, 149, 53, 213, 112, 215, 100, 152, + ); + let r: i32 = 1; + + assert_eq!(r, transmute(lasx_xbnz_v(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xbnz_w() { + let a = u32x8::new( + 1332660055, 2747714226, 143160005, 119041189, 2584280725, 894305940, 2774463674, + 2502507106, + ); + let r: i32 = 1; + + 
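+ // Editorial sketch (hedged, not from stdarch-gen): `xbnz.{b,h,w,d}` are
+ // expected to return 1 exactly when every element is nonzero, while `xbnz.v`
+ // checks whether any bit of the whole register is set; the `xbz_*` tests
+ // below exercise the complementary "any element zero" / "register all zero"
+ // checks. A scalar cross-check on the literal operands above:
+ let all_nonzero = [
+     1332660055u32, 2747714226, 143160005, 119041189,
+     2584280725, 894305940, 2774463674, 2502507106,
+ ]
+ .iter()
+ .all(|&x| x != 0);
+ assert_eq!(all_nonzero as i32, r);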
assert_eq!(r, transmute(lasx_xbnz_w(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xbz_b() { + let a = u8x32::new( + 156, 147, 147, 177, 127, 216, 32, 152, 55, 208, 206, 60, 244, 31, 57, 39, 72, 181, 147, + 141, 238, 33, 32, 5, 231, 1, 227, 42, 133, 202, 103, 67, + ); + let r: i32 = 0; + + assert_eq!(r, transmute(lasx_xbz_b(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xbz_d() { + let a = u64x4::new( + 6400818938894159638, + 10728379594538160633, + 1581126190179348917, + 18400090329472768228, + ); + let r: i32 = 0; + + assert_eq!(r, transmute(lasx_xbz_d(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xbz_h() { + let a = u16x16::new( + 34066, 39412, 64746, 3863, 50032, 22525, 9079, 56473, 53585, 42778, 58380, 52817, + 62358, 53187, 65430, 56633, + ); + let r: i32 = 0; + + assert_eq!(r, transmute(lasx_xbz_h(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xbz_v() { + let a = u8x32::new( + 163, 229, 46, 44, 39, 89, 56, 38, 233, 178, 116, 135, 122, 191, 3, 141, 240, 213, 178, + 12, 81, 195, 113, 34, 100, 51, 70, 4, 238, 90, 144, 128, + ); + let r: i32 = 0; + + assert_eq!(r, transmute(lasx_xbz_v(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xbz_w() { + let a = u32x8::new( + 1201964702, 3804322072, 2566580464, 1047038968, 3180983430, 3379242404, 4047354705, + 444599201, + ); + let r: i32 = 0; + + assert_eq!(r, transmute(lasx_xbz_w(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_caf_d() { + let a = u64x4::new( + 4606839356548580067, + 4597657891815152040, + 4603435215712027397, + 4604372277177725810, + ); + let b = u64x4::new( + 4603866787258734895, + 4605750987205548493, + 4594271025112584476, + 4604044410019184426, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_caf_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_caf_s() { + let a = u32x8::new( + 1027122768, 1048202064, 1061996851, 1056399152, 1053612728, 1059134546, 1058685361, + 1059303636, + ); + let b = u32x8::new( + 1052329028, 1041170924, 1053459178, 1051113546, 1055408428, 1052614588, 1059435003, + 1062279267, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_caf_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_ceq_d() { + let a = u64x4::new( + 4604351168364659876, + 4598833803415332886, + 4605119133668748091, + 4606763866461983079, + ); + let b = u64x4::new( + 4604789538755812401, + 4598766034813670762, + 4594451263359797256, + 4601380068795295764, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_ceq_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_ceq_s() { + let a = u32x8::new( + 1064654513, 1047582960, 1060336644, 1065079996, 1052824856, 1061207347, 1063892428, + 1001614208, + ); + let b = u32x8::new( + 1044141476, 1021192768, 1060376772, 1050417278, 1061038362, 1056139396, 1057149355, + 1055333616, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_ceq_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cle_d() { + let a = u64x4::new( + 4595367725174333184, + 4596790595174909884, + 4593132764781967144, + 4599038464418852978, + ); + let b = u64x4::new( + 4602705386887165787, + 4606260944252637140, + 4599015506541096164, + 4595819902199976812, + ); + let r = 
i64x4::new(-1, -1, -1, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_cle_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cle_s() { + let a = u32x8::new( + 1062033024, 1059343465, 1055578206, 1041885056, 1044779744, 1062731853, 1043491496, + 1049977384, + ); + let b = u32x8::new( + 1056391070, 1056787090, 1064058770, 1062459426, 1064795941, 1064011655, 1031362688, + 1057735956, + ); + let r = i64x4::new(0, -1, -1, -4294967296); + + assert_eq!(r, transmute(lasx_xvfcmp_cle_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_clt_d() { + let a = u64x4::new( + 4604242319890507255, + 4600980435115810514, + 4605419716078684891, + 4599564622270556718, + ); + let b = u64x4::new( + 4589220872256482592, + 4602715102780925632, + 4604097858141367250, + 4605812683073652447, + ); + let r = i64x4::new(0, -1, 0, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_clt_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_clt_s() { + let a = u32x8::new( + 1051323696, 1049201802, 1005628672, 1056692360, 1044683352, 1052201626, 1058314596, + 1020000992, + ); + let b = u32x8::new( + 1055411522, 1059584260, 1046257332, 1041146612, 1064440240, 1064500639, 1062809438, + 1064342005, + ); + let r = i64x4::new(-1, 4294967295, -1, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_clt_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cne_d() { + let a = u64x4::new( + 4598267260722064680, + 4605603034614740670, + 4604843132364965720, + 4595126942010545664, + ); + let b = u64x4::new( + 4606134769367779594, + 4605453748913122312, + 4599415837069158138, + 4601771367817563314, + ); + let r = i64x4::new(-1, -1, -1, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_cne_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cne_s() { + let a = u32x8::new( + 1042659128, 1065350244, 1032310576, 1061728337, 1062313491, 1063903497, 1063781692, + 1057998506, + ); + let b = u32x8::new( + 1041065828, 1061625246, 1045204740, 1054328432, 1036315496, 1061417737, 1047548872, + 1049890404, + ); + let r = i64x4::new(-1, -1, -1, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_cne_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cor_d() { + let a = u64x4::new( + 4603862490470319449, + 4601565668275439290, + 4606067119428218406, + 4606327024345603527, + ); + let b = u64x4::new( + 4605708081396913008, + 4604379998889664770, + 4584756849579116944, + 4604755606278723296, + ); + let r = i64x4::new(-1, -1, -1, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_cor_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cor_s() { + let a = u32x8::new( + 1058610981, 1045033144, 1052398652, 1063724666, 1043910192, 1059183076, 1058489697, + 1040176728, + ); + let b = u32x8::new( + 1032397784, 1054938542, 1057767324, 1054806424, 1055680194, 1057342938, 1060622406, + 1055092632, + ); + let r = i64x4::new(-1, -1, -1, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_cor_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cueq_d() { + let a = u64x4::new( + 4603200339689238557, + 4602812037576416711, + 4606851174908484583, + 4606385521842539189, + ); + let b = u64x4::new( + 4603085364717668671, + 4606853743461984788, + 4585080339878261296, + 4606053791400332699, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, 
transmute(lasx_xvfcmp_cueq_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cueq_s() { + let a = u32x8::new( + 1057425562, 1063555579, 1046256744, 1022920160, 1065220069, 1052327026, 1014579968, + 1048239780, + ); + let b = u32x8::new( + 1049557526, 1053332678, 1051191726, 1064421754, 1057629639, 1060344219, 1035702088, + 1050028150, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_cueq_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cule_d() { + let a = u64x4::new( + 4604971750650888499, + 4593662226049896016, + 4595869612440915848, + 4601748250340185114, + ); + let b = u64x4::new( + 4591931242171514960, + 4603997046544929558, + 4604974910786711097, + 4594297721205202168, + ); + let r = i64x4::new(0, -1, -1, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_cule_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cule_s() { + let a = u32x8::new( + 1052434396, 1026804576, 1041964148, 1063157036, 1048709802, 1060293833, 1047340196, + 1024531168, + ); + let b = u32x8::new( + 1047645820, 1057293405, 1052020188, 1057942586, 1063407758, 1049107470, 1057298442, + 1048069496, + ); + let r = i64x4::new(-4294967296, 4294967295, 4294967295, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_cule_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cult_d() { + let a = u64x4::new( + 4606775288794066380, + 4598808693757211694, + 4606790379429412870, + 4605939949509363873, + ); + let b = u64x4::new( + 4603717006707963555, + 4603504390160152243, + 4603259926905419449, + 4601857582598168522, + ); + let r = i64x4::new(0, -1, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_cult_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cult_s() { + let a = u32x8::new( + 1040152456, 1054570724, 1057645741, 1059637215, 1036822376, 1036413584, 1003370880, + 1061729841, + ); + let b = u32x8::new( + 1060169565, 1056061318, 1052047112, 1053313212, 1044605328, 1064898859, 1050643938, + 1064626494, + ); + let r = i64x4::new(-1, 0, -1, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_cult_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cun_d() { + let a = u64x4::new( + 4601926997709293092, + 4595132289995141556, + 4600980852994617218, + 4594388740429843072, + ); + let b = u64x4::new( + 4600518403789793172, + 4603476024215184625, + 4605134822967030979, + 4602608048300777812, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_cun_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cune_d() { + let a = u64x4::new( + 4592724877670260624, + 4603613641675881288, + 4597286359527586476, + 4601708681880094032, + ); + let b = u64x4::new( + 4598966797677485150, + 4587297906823272784, + 4604035321505064646, + 4604260243109134356, + ); + let r = i64x4::new(-1, -1, -1, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_cune_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cune_s() { + let a = u32x8::new( + 1064210263, 1059501406, 1055862424, 1054523594, 1059174050, 1050594182, 1052822848, + 1051372950, + ); + let b = u32x8::new( + 1032533328, 1051044268, 1051967492, 1051754540, 1059816024, 1063426731, 1052204618, + 1064439988, + ); + let r = i64x4::new(-1, -1, -1, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_cune_s(transmute(a), 
transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_cun_s() { + let a = u32x8::new( + 1062241044, 1056379734, 1063223413, 1034390344, 1044998176, 1057590594, 1059237612, + 1057447940, + ); + let b = u32x8::new( + 1058046704, 1055331758, 1057614999, 1063039091, 1058229285, 1058774306, 1059987402, + 1033042696, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_cun_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_saf_d() { + let a = u64x4::new( + 4594182209901295828, + 4581464082173647264, + 4590099234403759840, + 4604004273369365130, + ); + let b = u64x4::new( + 4606819328552123016, + 4604091229052796023, + 4604586834115148931, + 4605037320947641934, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_saf_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_saf_s() { + let a = u32x8::new( + 1060812428, 1061245324, 1063578557, 1030594672, 1059247505, 1044611124, 1052152258, + 1054967010, + ); + let b = u32x8::new( + 1036948656, 1051225988, 1058720867, 1032456856, 1051436132, 1041087636, 1047267492, + 1051250362, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_saf_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_seq_d() { + let a = u64x4::new( + 4604294705496916441, + 4593686918662327792, + 4605517303678922516, + 4604494135015023007, + ); + let b = u64x4::new( + 4606394023713761400, + 4604455367892895376, + 4599018364404818718, + 4605980286735586821, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_seq_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_seq_s() { + let a = u32x8::new( + 1044265248, 1058937550, 1056790200, 1052048406, 1059868687, 1051483336, 1046520332, + 1043191144, + ); + let b = u32x8::new( + 1063109529, 1055603330, 1062415892, 1040213636, 1058253673, 1058703239, 1061796632, + 1061413795, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_seq_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sle_d() { + let a = u64x4::new( + 4602408684767598022, + 4594250615798987092, + 4604963756353006013, + 4606163211162118467, + ); + let b = u64x4::new( + 4601947900990812282, + 4593344345788988968, + 4593039683552237328, + 4589469470804985856, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_sle_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sle_s() { + let a = u32x8::new( + 1058066838, 1064865582, 1052694366, 1057408270, 1045092236, 1055900780, 1062509444, + 1031929176, + ); + let b = u32x8::new( + 1034008560, 1055354624, 1065161513, 1050271030, 1063181654, 1057764124, 1061600359, + 1025107040, + ); + let r = i64x4::new(0, 4294967295, -1, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_sle_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_slt_d() { + let a = u64x4::new( + 4604369883332347358, + 4591650558117088944, + 4596580563429877336, + 4602996385956780830, + ); + let b = u64x4::new( + 4600043432599191356, + 4605816405801305324, + 4604195043424640949, + 4599985899346669220, + ); + let r = i64x4::new(0, -1, -1, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_slt_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_slt_s() { + 
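+ // Editorial note (hedged, not from stdarch-gen): the u32 operands in these
+ // single-precision `xvfcmp` tests are IEEE-754 bit patterns, and a comparison
+ // that holds is expected to produce an all-ones 32-bit element mask, which is
+ // why the packed i64 results take values such as 0, -1, 4294967295 and
+ // -4294967296. The `c*` conditions are quiet comparisons, the `s*` conditions
+ // signaling. Scalar cross-check of two element pairs used below:
+ assert!(f32::from_bits(1059565142) >= f32::from_bits(1040414724)); // element 0: not less-than
+ assert!(f32::from_bits(1049794018) < f32::from_bits(1056311634)); // element 3: less-than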
let a = u32x8::new( + 1059565142, 1057830491, 1052849564, 1049794018, 1063910487, 1059818709, 1027439600, + 1057381646, + ); + let b = u32x8::new( + 1040414724, 1040288116, 1043374880, 1056311634, 1065024654, 1056424062, 1057720509, + 1063111390, + ); + let r = i64x4::new(0, -4294967296, 4294967295, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_slt_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sne_d() { + let a = u64x4::new( + 4593560649779963032, + 4604654429289647502, + 4603296524089071766, + 4600835325257043198, + ); + let b = u64x4::new( + 4605487761864572918, + 4605408876521930103, + 4598422649694782656, + 4592189012823412008, + ); + let r = i64x4::new(-1, -1, -1, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_sne_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sne_s() { + let a = u32x8::new( + 1042871300, 1062745184, 1064937837, 1040277356, 1057066266, 1018600128, 1059841200, + 1051941856, + ); + let b = u32x8::new( + 1061164420, 1056972365, 1057052091, 1057171641, 1057154275, 1064004148, 1053173190, + 1062872949, + ); + let r = i64x4::new(-1, -1, -1, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_sne_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sor_d() { + let a = u64x4::new( + 4600032844669681944, + 4594463383805270076, + 4592958727948323240, + 4598474090378898318, + ); + let b = u64x4::new( + 4602979608704078034, + 4606565228276935378, + 4604003678580242406, + 4604391192007326981, + ); + let r = i64x4::new(-1, -1, -1, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_sor_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sor_s() { + let a = u32x8::new( + 1061014415, 1062349523, 1051726058, 1055193302, 1042014376, 1060862292, 1049178518, + 1057703558, + ); + let b = u32x8::new( + 1049131624, 1041520484, 1065237143, 1062513527, 1050805196, 1050889556, 1064403532, + 1054988022, + ); + let r = i64x4::new(-1, -1, -1, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_sor_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sueq_d() { + let a = u64x4::new( + 4603806425689581476, + 4602719352745602774, + 4594235151654053920, + 4598585482869376160, + ); + let b = u64x4::new( + 4597192397006933792, + 4602801475688800384, + 4599539096838817414, + 4603943496423544517, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_sueq_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sueq_s() { + let a = u32x8::new( + 1063023580, 1064528754, 1050308238, 1037288408, 1040252868, 1052571256, 1054474094, + 1060927468, + ); + let b = u32x8::new( + 1046997360, 1061154107, 1053281976, 1040631584, 1047759184, 1060702185, 1058969574, + 1055588604, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_sueq_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sule_d() { + let a = u64x4::new( + 4603957166332235709, + 4606383957649489661, + 4606330328898957118, + 4604578658311008992, + ); + let b = u64x4::new( + 4603539942547513158, + 4603598897708702396, + 4606250921023174648, + 4592187933910963896, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_sule_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sule_s() { + let a = u32x8::new( + 1048433556, 1057438072, 1054557166, 
1065240380, 1060486424, 1064222633, 1065198422, + 1034306768, + ); + let b = u32x8::new( + 1041928380, 1018285056, 1055996038, 1059481010, 1024438512, 1052197062, 1055194940, + 1033264360, + ); + let r = i64x4::new(0, 4294967295, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_sule_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sult_d() { + let a = u64x4::new( + 4605366058991696696, + 4601232121105121062, + 4601996581218373232, + 4602266745451684294, + ); + let b = u64x4::new( + 4599548937774345734, + 4604614363604787867, + 4593970533267593656, + 4605031421622352277, + ); + let r = i64x4::new(0, -1, 0, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_sult_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sult_s() { + let a = u32x8::new( + 1044761596, 1015684704, 1049105674, 1061214845, 1031561696, 1055360952, 1060420352, + 1063461022, + ); + let b = u32x8::new( + 1063585876, 1063262278, 1062673201, 1059017275, 1032877328, 1063558131, 1057454077, + 1062968413, + ); + let r = i64x4::new(-1, 4294967295, -1, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_sult_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sun_d() { + let a = u64x4::new( + 4581684619237043552, + 4604681260167973492, + 4602321601943005466, + 4605768364153053538, + ); + let b = u64x4::new( + 4604919109359715487, + 4606713834219051412, + 4601813019181652070, + 4598024963761131488, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_sun_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sune_d() { + let a = u64x4::new( + 4602842520488526831, + 4586377859926895520, + 4595797380069114560, + 4597668933134490352, + ); + let b = u64x4::new( + 4603719292253049421, + 4601306102929155814, + 4606447272167981658, + 4595752422326832136, + ); + let r = i64x4::new(-1, -1, -1, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_sune_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sune_s() { + let a = u32x8::new( + 1060627725, 1063145029, 1064291001, 1058025149, 1037522088, 1059097656, 1041307400, + 1059437048, + ); + let b = u32x8::new( + 1048507540, 1059109210, 1029412928, 1063377178, 1059646047, 1061716080, 1057060099, + 1040743680, + ); + let r = i64x4::new(-1, -1, -1, -1); + + assert_eq!(r, transmute(lasx_xvfcmp_sune_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvfcmp_sun_s() { + let a = u32x8::new( + 1062269194, 1017878048, 1020862944, 1063553320, 1052587356, 1041348304, 1063597708, + 1046660292, + ); + let b = u32x8::new( + 1053486118, 1028652080, 1057647183, 1051605726, 987074560, 1053988970, 1063915975, + 1039720984, + ); + let r = i64x4::new(0, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvfcmp_sun_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickve_d_f() { + let a = u64x4::new( + 4601462012634722388, + 4605596490350167974, + 4589580703778483496, + 4590176684263748456, + ); + let r = i64x4::new(4605596490350167974, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvpickve_d_f::<1>(transmute(a)))); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvpickve_w_f() { + let a = u32x8::new( + 1050978982, 1040565756, 1052944866, 1048054444, 1050714578, 1048632290, 1064399621, + 1049634380, + ); + let r = i64x4::new(1040565756, 0, 0, 0); + + assert_eq!(r, transmute(lasx_xvpickve_w_f::<1>(transmute(a)))); 
+} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvrepli_b() { + let r = i64x4::new( + -940422246894996750, + -940422246894996750, + -940422246894996750, + -940422246894996750, + ); + + assert_eq!(r, transmute(lasx_xvrepli_b::<498>())); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvrepli_d() { + let r = i64x4::new(169, 169, 169, 169); + + assert_eq!(r, transmute(lasx_xvrepli_d::<169>())); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvrepli_h() { + let r = i64x4::new( + -108650998892986755, + -108650998892986755, + -108650998892986755, + -108650998892986755, + ); + + assert_eq!(r, transmute(lasx_xvrepli_h::<-387>())); +} + +#[simd_test(enable = "lasx")] +unsafe fn test_lasx_xvrepli_w() { + let r = i64x4::new( + -1662152343940, + -1662152343940, + -1662152343940, + -1662152343940, + ); + + assert_eq!(r, transmute(lasx_xvrepli_w::<-388>())); +} diff --git a/library/stdarch/crates/core_arch/src/loongarch64/lasx/types.rs b/library/stdarch/crates/core_arch/src/loongarch64/lasx/types.rs new file mode 100644 index 0000000000000..9611517e6370f --- /dev/null +++ b/library/stdarch/crates/core_arch/src/loongarch64/lasx/types.rs @@ -0,0 +1,33 @@ +types! { + #![unstable(feature = "stdarch_loongarch", issue = "117427")] + + /// LOONGARCH-specific 256-bit wide vector of 32 packed `i8`. + pub struct v32i8(32 x pub(crate) i8); + + /// LOONGARCH-specific 256-bit wide vector of 16 packed `i16`. + pub struct v16i16(16 x pub(crate) i16); + + /// LOONGARCH-specific 256-bit wide vector of 8 packed `i32`. + pub struct v8i32(8 x pub(crate) i32); + + /// LOONGARCH-specific 256-bit wide vector of 4 packed `i64`. + pub struct v4i64(4 x pub(crate) i64); + + /// LOONGARCH-specific 256-bit wide vector of 32 packed `u8`. + pub struct v32u8(32 x pub(crate) u8); + + /// LOONGARCH-specific 256-bit wide vector of 16 packed `u16`. + pub struct v16u16(16 x pub(crate) u16); + + /// LOONGARCH-specific 256-bit wide vector of 8 packed `u32`. + pub struct v8u32(8 x pub(crate) u32); + + /// LOONGARCH-specific 256-bit wide vector of 4 packed `u64`. + pub struct v4u64(4 x pub(crate) u64); + + /// LOONGARCH-specific 256-bit wide vector of 8 packed `f32`. + pub struct v8f32(8 x pub(crate) f32); + + /// LOONGARCH-specific 256-bit wide vector of 4 packed `f64`. + pub struct v4f64(4 x pub(crate) f64); +} diff --git a/library/stdarch/crates/core_arch/src/loongarch64/lsx/generated.rs b/library/stdarch/crates/core_arch/src/loongarch64/lsx/generated.rs new file mode 100644 index 0000000000000..2bc364f3e069e --- /dev/null +++ b/library/stdarch/crates/core_arch/src/loongarch64/lsx/generated.rs @@ -0,0 +1,6879 @@ +// This code is automatically generated. DO NOT MODIFY.
+// +// Instead, modify `crates/stdarch-gen-loongarch/lsx.spec` and run the following command to re-generate this file: +// +// ``` +// OUT_DIR=`pwd`/crates/core_arch cargo run -p stdarch-gen-loongarch -- crates/stdarch-gen-loongarch/lsx.spec +// ``` + +use super::types::*; + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.loongarch.lsx.vsll.b"] + fn __lsx_vsll_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsll.h"] + fn __lsx_vsll_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsll.w"] + fn __lsx_vsll_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsll.d"] + fn __lsx_vsll_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vslli.b"] + fn __lsx_vslli_b(a: v16i8, b: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vslli.h"] + fn __lsx_vslli_h(a: v8i16, b: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vslli.w"] + fn __lsx_vslli_w(a: v4i32, b: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vslli.d"] + fn __lsx_vslli_d(a: v2i64, b: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsra.b"] + fn __lsx_vsra_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsra.h"] + fn __lsx_vsra_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsra.w"] + fn __lsx_vsra_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsra.d"] + fn __lsx_vsra_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsrai.b"] + fn __lsx_vsrai_b(a: v16i8, b: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsrai.h"] + fn __lsx_vsrai_h(a: v8i16, b: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsrai.w"] + fn __lsx_vsrai_w(a: v4i32, b: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsrai.d"] + fn __lsx_vsrai_d(a: v2i64, b: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsrar.b"] + fn __lsx_vsrar_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsrar.h"] + fn __lsx_vsrar_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsrar.w"] + fn __lsx_vsrar_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsrar.d"] + fn __lsx_vsrar_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsrari.b"] + fn __lsx_vsrari_b(a: v16i8, b: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsrari.h"] + fn __lsx_vsrari_h(a: v8i16, b: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsrari.w"] + fn __lsx_vsrari_w(a: v4i32, b: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsrari.d"] + fn __lsx_vsrari_d(a: v2i64, b: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsrl.b"] + fn __lsx_vsrl_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsrl.h"] + fn __lsx_vsrl_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsrl.w"] + fn __lsx_vsrl_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsrl.d"] + fn __lsx_vsrl_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsrli.b"] + fn __lsx_vsrli_b(a: v16i8, b: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsrli.h"] + fn __lsx_vsrli_h(a: v8i16, b: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsrli.w"] + fn __lsx_vsrli_w(a: v4i32, b: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsrli.d"] + fn __lsx_vsrli_d(a: v2i64, b: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsrlr.b"] + fn __lsx_vsrlr_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsrlr.h"] + fn __lsx_vsrlr_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = 
"llvm.loongarch.lsx.vsrlr.w"] + fn __lsx_vsrlr_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsrlr.d"] + fn __lsx_vsrlr_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsrlri.b"] + fn __lsx_vsrlri_b(a: v16i8, b: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsrlri.h"] + fn __lsx_vsrlri_h(a: v8i16, b: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsrlri.w"] + fn __lsx_vsrlri_w(a: v4i32, b: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsrlri.d"] + fn __lsx_vsrlri_d(a: v2i64, b: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vbitclr.b"] + fn __lsx_vbitclr_b(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vbitclr.h"] + fn __lsx_vbitclr_h(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vbitclr.w"] + fn __lsx_vbitclr_w(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vbitclr.d"] + fn __lsx_vbitclr_d(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vbitclri.b"] + fn __lsx_vbitclri_b(a: v16u8, b: u32) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vbitclri.h"] + fn __lsx_vbitclri_h(a: v8u16, b: u32) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vbitclri.w"] + fn __lsx_vbitclri_w(a: v4u32, b: u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vbitclri.d"] + fn __lsx_vbitclri_d(a: v2u64, b: u32) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vbitset.b"] + fn __lsx_vbitset_b(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vbitset.h"] + fn __lsx_vbitset_h(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vbitset.w"] + fn __lsx_vbitset_w(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vbitset.d"] + fn __lsx_vbitset_d(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vbitseti.b"] + fn __lsx_vbitseti_b(a: v16u8, b: u32) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vbitseti.h"] + fn __lsx_vbitseti_h(a: v8u16, b: u32) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vbitseti.w"] + fn __lsx_vbitseti_w(a: v4u32, b: u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vbitseti.d"] + fn __lsx_vbitseti_d(a: v2u64, b: u32) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vbitrev.b"] + fn __lsx_vbitrev_b(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vbitrev.h"] + fn __lsx_vbitrev_h(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vbitrev.w"] + fn __lsx_vbitrev_w(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vbitrev.d"] + fn __lsx_vbitrev_d(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vbitrevi.b"] + fn __lsx_vbitrevi_b(a: v16u8, b: u32) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vbitrevi.h"] + fn __lsx_vbitrevi_h(a: v8u16, b: u32) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vbitrevi.w"] + fn __lsx_vbitrevi_w(a: v4u32, b: u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vbitrevi.d"] + fn __lsx_vbitrevi_d(a: v2u64, b: u32) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vadd.b"] + fn __lsx_vadd_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vadd.h"] + fn __lsx_vadd_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vadd.w"] + fn __lsx_vadd_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vadd.d"] + fn __lsx_vadd_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vaddi.bu"] + fn __lsx_vaddi_bu(a: v16i8, b: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vaddi.hu"] + fn __lsx_vaddi_hu(a: v8i16, b: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vaddi.wu"] + fn 
__lsx_vaddi_wu(a: v4i32, b: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vaddi.du"] + fn __lsx_vaddi_du(a: v2i64, b: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsub.b"] + fn __lsx_vsub_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsub.h"] + fn __lsx_vsub_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsub.w"] + fn __lsx_vsub_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsub.d"] + fn __lsx_vsub_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsubi.bu"] + fn __lsx_vsubi_bu(a: v16i8, b: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsubi.hu"] + fn __lsx_vsubi_hu(a: v8i16, b: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsubi.wu"] + fn __lsx_vsubi_wu(a: v4i32, b: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsubi.du"] + fn __lsx_vsubi_du(a: v2i64, b: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmax.b"] + fn __lsx_vmax_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vmax.h"] + fn __lsx_vmax_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmax.w"] + fn __lsx_vmax_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmax.d"] + fn __lsx_vmax_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmaxi.b"] + fn __lsx_vmaxi_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vmaxi.h"] + fn __lsx_vmaxi_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmaxi.w"] + fn __lsx_vmaxi_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmaxi.d"] + fn __lsx_vmaxi_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmax.bu"] + fn __lsx_vmax_bu(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vmax.hu"] + fn __lsx_vmax_hu(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vmax.wu"] + fn __lsx_vmax_wu(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vmax.du"] + fn __lsx_vmax_du(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vmaxi.bu"] + fn __lsx_vmaxi_bu(a: v16u8, b: u32) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vmaxi.hu"] + fn __lsx_vmaxi_hu(a: v8u16, b: u32) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vmaxi.wu"] + fn __lsx_vmaxi_wu(a: v4u32, b: u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vmaxi.du"] + fn __lsx_vmaxi_du(a: v2u64, b: u32) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vmin.b"] + fn __lsx_vmin_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vmin.h"] + fn __lsx_vmin_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmin.w"] + fn __lsx_vmin_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmin.d"] + fn __lsx_vmin_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmini.b"] + fn __lsx_vmini_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vmini.h"] + fn __lsx_vmini_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmini.w"] + fn __lsx_vmini_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmini.d"] + fn __lsx_vmini_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmin.bu"] + fn __lsx_vmin_bu(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vmin.hu"] + fn __lsx_vmin_hu(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vmin.wu"] + fn __lsx_vmin_wu(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vmin.du"] + fn __lsx_vmin_du(a: v2u64, b: v2u64) -> v2u64; + #[link_name = 
"llvm.loongarch.lsx.vmini.bu"] + fn __lsx_vmini_bu(a: v16u8, b: u32) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vmini.hu"] + fn __lsx_vmini_hu(a: v8u16, b: u32) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vmini.wu"] + fn __lsx_vmini_wu(a: v4u32, b: u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vmini.du"] + fn __lsx_vmini_du(a: v2u64, b: u32) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vseq.b"] + fn __lsx_vseq_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vseq.h"] + fn __lsx_vseq_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vseq.w"] + fn __lsx_vseq_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vseq.d"] + fn __lsx_vseq_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vseqi.b"] + fn __lsx_vseqi_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vseqi.h"] + fn __lsx_vseqi_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vseqi.w"] + fn __lsx_vseqi_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vseqi.d"] + fn __lsx_vseqi_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vslti.b"] + fn __lsx_vslti_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vslt.b"] + fn __lsx_vslt_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vslt.h"] + fn __lsx_vslt_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vslt.w"] + fn __lsx_vslt_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vslt.d"] + fn __lsx_vslt_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vslti.h"] + fn __lsx_vslti_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vslti.w"] + fn __lsx_vslti_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vslti.d"] + fn __lsx_vslti_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vslt.bu"] + fn __lsx_vslt_bu(a: v16u8, b: v16u8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vslt.hu"] + fn __lsx_vslt_hu(a: v8u16, b: v8u16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vslt.wu"] + fn __lsx_vslt_wu(a: v4u32, b: v4u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vslt.du"] + fn __lsx_vslt_du(a: v2u64, b: v2u64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vslti.bu"] + fn __lsx_vslti_bu(a: v16u8, b: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vslti.hu"] + fn __lsx_vslti_hu(a: v8u16, b: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vslti.wu"] + fn __lsx_vslti_wu(a: v4u32, b: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vslti.du"] + fn __lsx_vslti_du(a: v2u64, b: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsle.b"] + fn __lsx_vsle_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsle.h"] + fn __lsx_vsle_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsle.w"] + fn __lsx_vsle_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsle.d"] + fn __lsx_vsle_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vslei.b"] + fn __lsx_vslei_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vslei.h"] + fn __lsx_vslei_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vslei.w"] + fn __lsx_vslei_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vslei.d"] + fn __lsx_vslei_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsle.bu"] + fn __lsx_vsle_bu(a: v16u8, b: v16u8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsle.hu"] + fn __lsx_vsle_hu(a: v8u16, b: v8u16) -> v8i16; + 
#[link_name = "llvm.loongarch.lsx.vsle.wu"] + fn __lsx_vsle_wu(a: v4u32, b: v4u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsle.du"] + fn __lsx_vsle_du(a: v2u64, b: v2u64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vslei.bu"] + fn __lsx_vslei_bu(a: v16u8, b: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vslei.hu"] + fn __lsx_vslei_hu(a: v8u16, b: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vslei.wu"] + fn __lsx_vslei_wu(a: v4u32, b: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vslei.du"] + fn __lsx_vslei_du(a: v2u64, b: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsat.b"] + fn __lsx_vsat_b(a: v16i8, b: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsat.h"] + fn __lsx_vsat_h(a: v8i16, b: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsat.w"] + fn __lsx_vsat_w(a: v4i32, b: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsat.d"] + fn __lsx_vsat_d(a: v2i64, b: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsat.bu"] + fn __lsx_vsat_bu(a: v16u8, b: u32) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vsat.hu"] + fn __lsx_vsat_hu(a: v8u16, b: u32) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vsat.wu"] + fn __lsx_vsat_wu(a: v4u32, b: u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vsat.du"] + fn __lsx_vsat_du(a: v2u64, b: u32) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vadda.b"] + fn __lsx_vadda_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vadda.h"] + fn __lsx_vadda_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vadda.w"] + fn __lsx_vadda_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vadda.d"] + fn __lsx_vadda_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsadd.b"] + fn __lsx_vsadd_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsadd.h"] + fn __lsx_vsadd_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsadd.w"] + fn __lsx_vsadd_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsadd.d"] + fn __lsx_vsadd_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsadd.bu"] + fn __lsx_vsadd_bu(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vsadd.hu"] + fn __lsx_vsadd_hu(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vsadd.wu"] + fn __lsx_vsadd_wu(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vsadd.du"] + fn __lsx_vsadd_du(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vavg.b"] + fn __lsx_vavg_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vavg.h"] + fn __lsx_vavg_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vavg.w"] + fn __lsx_vavg_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vavg.d"] + fn __lsx_vavg_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vavg.bu"] + fn __lsx_vavg_bu(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vavg.hu"] + fn __lsx_vavg_hu(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vavg.wu"] + fn __lsx_vavg_wu(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vavg.du"] + fn __lsx_vavg_du(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vavgr.b"] + fn __lsx_vavgr_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vavgr.h"] + fn __lsx_vavgr_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vavgr.w"] + fn __lsx_vavgr_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vavgr.d"] + fn 
__lsx_vavgr_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vavgr.bu"] + fn __lsx_vavgr_bu(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vavgr.hu"] + fn __lsx_vavgr_hu(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vavgr.wu"] + fn __lsx_vavgr_wu(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vavgr.du"] + fn __lsx_vavgr_du(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vssub.b"] + fn __lsx_vssub_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vssub.h"] + fn __lsx_vssub_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vssub.w"] + fn __lsx_vssub_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vssub.d"] + fn __lsx_vssub_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vssub.bu"] + fn __lsx_vssub_bu(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vssub.hu"] + fn __lsx_vssub_hu(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vssub.wu"] + fn __lsx_vssub_wu(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vssub.du"] + fn __lsx_vssub_du(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vabsd.b"] + fn __lsx_vabsd_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vabsd.h"] + fn __lsx_vabsd_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vabsd.w"] + fn __lsx_vabsd_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vabsd.d"] + fn __lsx_vabsd_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vabsd.bu"] + fn __lsx_vabsd_bu(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vabsd.hu"] + fn __lsx_vabsd_hu(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vabsd.wu"] + fn __lsx_vabsd_wu(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vabsd.du"] + fn __lsx_vabsd_du(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vmul.b"] + fn __lsx_vmul_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vmul.h"] + fn __lsx_vmul_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmul.w"] + fn __lsx_vmul_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmul.d"] + fn __lsx_vmul_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmadd.b"] + fn __lsx_vmadd_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vmadd.h"] + fn __lsx_vmadd_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmadd.w"] + fn __lsx_vmadd_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmadd.d"] + fn __lsx_vmadd_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmsub.b"] + fn __lsx_vmsub_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vmsub.h"] + fn __lsx_vmsub_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmsub.w"] + fn __lsx_vmsub_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmsub.d"] + fn __lsx_vmsub_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vdiv.b"] + fn __lsx_vdiv_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vdiv.h"] + fn __lsx_vdiv_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vdiv.w"] + fn __lsx_vdiv_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vdiv.d"] + fn __lsx_vdiv_d(a: v2i64, b: v2i64) -> v2i64; + 
#[link_name = "llvm.loongarch.lsx.vdiv.bu"] + fn __lsx_vdiv_bu(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vdiv.hu"] + fn __lsx_vdiv_hu(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vdiv.wu"] + fn __lsx_vdiv_wu(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vdiv.du"] + fn __lsx_vdiv_du(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vhaddw.h.b"] + fn __lsx_vhaddw_h_b(a: v16i8, b: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vhaddw.w.h"] + fn __lsx_vhaddw_w_h(a: v8i16, b: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vhaddw.d.w"] + fn __lsx_vhaddw_d_w(a: v4i32, b: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vhaddw.hu.bu"] + fn __lsx_vhaddw_hu_bu(a: v16u8, b: v16u8) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vhaddw.wu.hu"] + fn __lsx_vhaddw_wu_hu(a: v8u16, b: v8u16) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vhaddw.du.wu"] + fn __lsx_vhaddw_du_wu(a: v4u32, b: v4u32) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vhsubw.h.b"] + fn __lsx_vhsubw_h_b(a: v16i8, b: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vhsubw.w.h"] + fn __lsx_vhsubw_w_h(a: v8i16, b: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vhsubw.d.w"] + fn __lsx_vhsubw_d_w(a: v4i32, b: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vhsubw.hu.bu"] + fn __lsx_vhsubw_hu_bu(a: v16u8, b: v16u8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vhsubw.wu.hu"] + fn __lsx_vhsubw_wu_hu(a: v8u16, b: v8u16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vhsubw.du.wu"] + fn __lsx_vhsubw_du_wu(a: v4u32, b: v4u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmod.b"] + fn __lsx_vmod_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vmod.h"] + fn __lsx_vmod_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmod.w"] + fn __lsx_vmod_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmod.d"] + fn __lsx_vmod_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmod.bu"] + fn __lsx_vmod_bu(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vmod.hu"] + fn __lsx_vmod_hu(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vmod.wu"] + fn __lsx_vmod_wu(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vmod.du"] + fn __lsx_vmod_du(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vreplve.b"] + fn __lsx_vreplve_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vreplve.h"] + fn __lsx_vreplve_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vreplve.w"] + fn __lsx_vreplve_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vreplve.d"] + fn __lsx_vreplve_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vreplvei.b"] + fn __lsx_vreplvei_b(a: v16i8, b: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vreplvei.h"] + fn __lsx_vreplvei_h(a: v8i16, b: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vreplvei.w"] + fn __lsx_vreplvei_w(a: v4i32, b: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vreplvei.d"] + fn __lsx_vreplvei_d(a: v2i64, b: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vpickev.b"] + fn __lsx_vpickev_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vpickev.h"] + fn __lsx_vpickev_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vpickev.w"] + fn __lsx_vpickev_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vpickev.d"] + fn __lsx_vpickev_d(a: v2i64, b: v2i64) -> v2i64; + 
#[link_name = "llvm.loongarch.lsx.vpickod.b"] + fn __lsx_vpickod_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vpickod.h"] + fn __lsx_vpickod_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vpickod.w"] + fn __lsx_vpickod_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vpickod.d"] + fn __lsx_vpickod_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vilvh.b"] + fn __lsx_vilvh_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vilvh.h"] + fn __lsx_vilvh_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vilvh.w"] + fn __lsx_vilvh_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vilvh.d"] + fn __lsx_vilvh_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vilvl.b"] + fn __lsx_vilvl_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vilvl.h"] + fn __lsx_vilvl_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vilvl.w"] + fn __lsx_vilvl_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vilvl.d"] + fn __lsx_vilvl_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vpackev.b"] + fn __lsx_vpackev_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vpackev.h"] + fn __lsx_vpackev_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vpackev.w"] + fn __lsx_vpackev_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vpackev.d"] + fn __lsx_vpackev_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vpackod.b"] + fn __lsx_vpackod_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vpackod.h"] + fn __lsx_vpackod_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vpackod.w"] + fn __lsx_vpackod_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vpackod.d"] + fn __lsx_vpackod_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vshuf.h"] + fn __lsx_vshuf_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vshuf.w"] + fn __lsx_vshuf_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vshuf.d"] + fn __lsx_vshuf_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vand.v"] + fn __lsx_vand_v(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vandi.b"] + fn __lsx_vandi_b(a: v16u8, b: u32) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vor.v"] + fn __lsx_vor_v(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vori.b"] + fn __lsx_vori_b(a: v16u8, b: u32) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vnor.v"] + fn __lsx_vnor_v(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vnori.b"] + fn __lsx_vnori_b(a: v16u8, b: u32) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vxor.v"] + fn __lsx_vxor_v(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vxori.b"] + fn __lsx_vxori_b(a: v16u8, b: u32) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vbitsel.v"] + fn __lsx_vbitsel_v(a: v16u8, b: v16u8, c: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vbitseli.b"] + fn __lsx_vbitseli_b(a: v16u8, b: v16u8, c: u32) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vshuf4i.b"] + fn __lsx_vshuf4i_b(a: v16i8, b: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vshuf4i.h"] + fn __lsx_vshuf4i_h(a: v8i16, b: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vshuf4i.w"] + fn __lsx_vshuf4i_w(a: v4i32, b: u32) -> v4i32; + #[link_name = 
"llvm.loongarch.lsx.vreplgr2vr.b"] + fn __lsx_vreplgr2vr_b(a: i32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vreplgr2vr.h"] + fn __lsx_vreplgr2vr_h(a: i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vreplgr2vr.w"] + fn __lsx_vreplgr2vr_w(a: i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vreplgr2vr.d"] + fn __lsx_vreplgr2vr_d(a: i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vpcnt.b"] + fn __lsx_vpcnt_b(a: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vpcnt.h"] + fn __lsx_vpcnt_h(a: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vpcnt.w"] + fn __lsx_vpcnt_w(a: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vpcnt.d"] + fn __lsx_vpcnt_d(a: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vclo.b"] + fn __lsx_vclo_b(a: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vclo.h"] + fn __lsx_vclo_h(a: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vclo.w"] + fn __lsx_vclo_w(a: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vclo.d"] + fn __lsx_vclo_d(a: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vclz.b"] + fn __lsx_vclz_b(a: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vclz.h"] + fn __lsx_vclz_h(a: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vclz.w"] + fn __lsx_vclz_w(a: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vclz.d"] + fn __lsx_vclz_d(a: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vpickve2gr.b"] + fn __lsx_vpickve2gr_b(a: v16i8, b: u32) -> i32; + #[link_name = "llvm.loongarch.lsx.vpickve2gr.h"] + fn __lsx_vpickve2gr_h(a: v8i16, b: u32) -> i32; + #[link_name = "llvm.loongarch.lsx.vpickve2gr.w"] + fn __lsx_vpickve2gr_w(a: v4i32, b: u32) -> i32; + #[link_name = "llvm.loongarch.lsx.vpickve2gr.d"] + fn __lsx_vpickve2gr_d(a: v2i64, b: u32) -> i64; + #[link_name = "llvm.loongarch.lsx.vpickve2gr.bu"] + fn __lsx_vpickve2gr_bu(a: v16i8, b: u32) -> u32; + #[link_name = "llvm.loongarch.lsx.vpickve2gr.hu"] + fn __lsx_vpickve2gr_hu(a: v8i16, b: u32) -> u32; + #[link_name = "llvm.loongarch.lsx.vpickve2gr.wu"] + fn __lsx_vpickve2gr_wu(a: v4i32, b: u32) -> u32; + #[link_name = "llvm.loongarch.lsx.vpickve2gr.du"] + fn __lsx_vpickve2gr_du(a: v2i64, b: u32) -> u64; + #[link_name = "llvm.loongarch.lsx.vinsgr2vr.b"] + fn __lsx_vinsgr2vr_b(a: v16i8, b: i32, c: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vinsgr2vr.h"] + fn __lsx_vinsgr2vr_h(a: v8i16, b: i32, c: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vinsgr2vr.w"] + fn __lsx_vinsgr2vr_w(a: v4i32, b: i32, c: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vinsgr2vr.d"] + fn __lsx_vinsgr2vr_d(a: v2i64, b: i64, c: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfadd.s"] + fn __lsx_vfadd_s(a: v4f32, b: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfadd.d"] + fn __lsx_vfadd_d(a: v2f64, b: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfsub.s"] + fn __lsx_vfsub_s(a: v4f32, b: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfsub.d"] + fn __lsx_vfsub_d(a: v2f64, b: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfmul.s"] + fn __lsx_vfmul_s(a: v4f32, b: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfmul.d"] + fn __lsx_vfmul_d(a: v2f64, b: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfdiv.s"] + fn __lsx_vfdiv_s(a: v4f32, b: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfdiv.d"] + fn __lsx_vfdiv_d(a: v2f64, b: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfcvt.h.s"] + fn __lsx_vfcvt_h_s(a: v4f32, b: v4f32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vfcvt.s.d"] + fn __lsx_vfcvt_s_d(a: v2f64, b: 
v2f64) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfmin.s"] + fn __lsx_vfmin_s(a: v4f32, b: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfmin.d"] + fn __lsx_vfmin_d(a: v2f64, b: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfmina.s"] + fn __lsx_vfmina_s(a: v4f32, b: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfmina.d"] + fn __lsx_vfmina_d(a: v2f64, b: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfmax.s"] + fn __lsx_vfmax_s(a: v4f32, b: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfmax.d"] + fn __lsx_vfmax_d(a: v2f64, b: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfmaxa.s"] + fn __lsx_vfmaxa_s(a: v4f32, b: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfmaxa.d"] + fn __lsx_vfmaxa_d(a: v2f64, b: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfclass.s"] + fn __lsx_vfclass_s(a: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfclass.d"] + fn __lsx_vfclass_d(a: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfsqrt.s"] + fn __lsx_vfsqrt_s(a: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfsqrt.d"] + fn __lsx_vfsqrt_d(a: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfrecip.s"] + fn __lsx_vfrecip_s(a: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfrecip.d"] + fn __lsx_vfrecip_d(a: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfrecipe.s"] + fn __lsx_vfrecipe_s(a: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfrecipe.d"] + fn __lsx_vfrecipe_d(a: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfrsqrte.s"] + fn __lsx_vfrsqrte_s(a: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfrsqrte.d"] + fn __lsx_vfrsqrte_d(a: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfrint.s"] + fn __lsx_vfrint_s(a: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfrint.d"] + fn __lsx_vfrint_d(a: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfrsqrt.s"] + fn __lsx_vfrsqrt_s(a: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfrsqrt.d"] + fn __lsx_vfrsqrt_d(a: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vflogb.s"] + fn __lsx_vflogb_s(a: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vflogb.d"] + fn __lsx_vflogb_d(a: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfcvth.s.h"] + fn __lsx_vfcvth_s_h(a: v8i16) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfcvth.d.s"] + fn __lsx_vfcvth_d_s(a: v4f32) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfcvtl.s.h"] + fn __lsx_vfcvtl_s_h(a: v8i16) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfcvtl.d.s"] + fn __lsx_vfcvtl_d_s(a: v4f32) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vftint.w.s"] + fn __lsx_vftint_w_s(a: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vftint.l.d"] + fn __lsx_vftint_l_d(a: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vftint.wu.s"] + fn __lsx_vftint_wu_s(a: v4f32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vftint.lu.d"] + fn __lsx_vftint_lu_d(a: v2f64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vftintrz.w.s"] + fn __lsx_vftintrz_w_s(a: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vftintrz.l.d"] + fn __lsx_vftintrz_l_d(a: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vftintrz.wu.s"] + fn __lsx_vftintrz_wu_s(a: v4f32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vftintrz.lu.d"] + fn __lsx_vftintrz_lu_d(a: v2f64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vffint.s.w"] + fn __lsx_vffint_s_w(a: v4i32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vffint.d.l"] + fn __lsx_vffint_d_l(a: v2i64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vffint.s.wu"] + 
fn __lsx_vffint_s_wu(a: v4u32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vffint.d.lu"] + fn __lsx_vffint_d_lu(a: v2u64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vandn.v"] + fn __lsx_vandn_v(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vneg.b"] + fn __lsx_vneg_b(a: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vneg.h"] + fn __lsx_vneg_h(a: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vneg.w"] + fn __lsx_vneg_w(a: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vneg.d"] + fn __lsx_vneg_d(a: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmuh.b"] + fn __lsx_vmuh_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vmuh.h"] + fn __lsx_vmuh_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmuh.w"] + fn __lsx_vmuh_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmuh.d"] + fn __lsx_vmuh_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmuh.bu"] + fn __lsx_vmuh_bu(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vmuh.hu"] + fn __lsx_vmuh_hu(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vmuh.wu"] + fn __lsx_vmuh_wu(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vmuh.du"] + fn __lsx_vmuh_du(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vsllwil.h.b"] + fn __lsx_vsllwil_h_b(a: v16i8, b: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsllwil.w.h"] + fn __lsx_vsllwil_w_h(a: v8i16, b: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsllwil.d.w"] + fn __lsx_vsllwil_d_w(a: v4i32, b: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsllwil.hu.bu"] + fn __lsx_vsllwil_hu_bu(a: v16u8, b: u32) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vsllwil.wu.hu"] + fn __lsx_vsllwil_wu_hu(a: v8u16, b: u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vsllwil.du.wu"] + fn __lsx_vsllwil_du_wu(a: v4u32, b: u32) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vsran.b.h"] + fn __lsx_vsran_b_h(a: v8i16, b: v8i16) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsran.h.w"] + fn __lsx_vsran_h_w(a: v4i32, b: v4i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsran.w.d"] + fn __lsx_vsran_w_d(a: v2i64, b: v2i64) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vssran.b.h"] + fn __lsx_vssran_b_h(a: v8i16, b: v8i16) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vssran.h.w"] + fn __lsx_vssran_h_w(a: v4i32, b: v4i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vssran.w.d"] + fn __lsx_vssran_w_d(a: v2i64, b: v2i64) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vssran.bu.h"] + fn __lsx_vssran_bu_h(a: v8u16, b: v8u16) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vssran.hu.w"] + fn __lsx_vssran_hu_w(a: v4u32, b: v4u32) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vssran.wu.d"] + fn __lsx_vssran_wu_d(a: v2u64, b: v2u64) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vsrarn.b.h"] + fn __lsx_vsrarn_b_h(a: v8i16, b: v8i16) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsrarn.h.w"] + fn __lsx_vsrarn_h_w(a: v4i32, b: v4i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsrarn.w.d"] + fn __lsx_vsrarn_w_d(a: v2i64, b: v2i64) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vssrarn.b.h"] + fn __lsx_vssrarn_b_h(a: v8i16, b: v8i16) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vssrarn.h.w"] + fn __lsx_vssrarn_h_w(a: v4i32, b: v4i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vssrarn.w.d"] + fn __lsx_vssrarn_w_d(a: v2i64, b: v2i64) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vssrarn.bu.h"] + fn __lsx_vssrarn_bu_h(a: v8u16, b: v8u16) -> 
v16u8; + #[link_name = "llvm.loongarch.lsx.vssrarn.hu.w"] + fn __lsx_vssrarn_hu_w(a: v4u32, b: v4u32) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vssrarn.wu.d"] + fn __lsx_vssrarn_wu_d(a: v2u64, b: v2u64) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vsrln.b.h"] + fn __lsx_vsrln_b_h(a: v8i16, b: v8i16) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsrln.h.w"] + fn __lsx_vsrln_h_w(a: v4i32, b: v4i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsrln.w.d"] + fn __lsx_vsrln_w_d(a: v2i64, b: v2i64) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vssrln.bu.h"] + fn __lsx_vssrln_bu_h(a: v8u16, b: v8u16) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vssrln.hu.w"] + fn __lsx_vssrln_hu_w(a: v4u32, b: v4u32) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vssrln.wu.d"] + fn __lsx_vssrln_wu_d(a: v2u64, b: v2u64) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vsrlrn.b.h"] + fn __lsx_vsrlrn_b_h(a: v8i16, b: v8i16) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsrlrn.h.w"] + fn __lsx_vsrlrn_h_w(a: v4i32, b: v4i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsrlrn.w.d"] + fn __lsx_vsrlrn_w_d(a: v2i64, b: v2i64) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vssrlrn.bu.h"] + fn __lsx_vssrlrn_bu_h(a: v8u16, b: v8u16) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vssrlrn.hu.w"] + fn __lsx_vssrlrn_hu_w(a: v4u32, b: v4u32) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vssrlrn.wu.d"] + fn __lsx_vssrlrn_wu_d(a: v2u64, b: v2u64) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vfrstpi.b"] + fn __lsx_vfrstpi_b(a: v16i8, b: v16i8, c: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vfrstpi.h"] + fn __lsx_vfrstpi_h(a: v8i16, b: v8i16, c: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vfrstp.b"] + fn __lsx_vfrstp_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vfrstp.h"] + fn __lsx_vfrstp_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vshuf4i.d"] + fn __lsx_vshuf4i_d(a: v2i64, b: v2i64, c: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vbsrl.v"] + fn __lsx_vbsrl_v(a: v16i8, b: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vbsll.v"] + fn __lsx_vbsll_v(a: v16i8, b: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vextrins.b"] + fn __lsx_vextrins_b(a: v16i8, b: v16i8, c: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vextrins.h"] + fn __lsx_vextrins_h(a: v8i16, b: v8i16, c: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vextrins.w"] + fn __lsx_vextrins_w(a: v4i32, b: v4i32, c: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vextrins.d"] + fn __lsx_vextrins_d(a: v2i64, b: v2i64, c: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmskltz.b"] + fn __lsx_vmskltz_b(a: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vmskltz.h"] + fn __lsx_vmskltz_h(a: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmskltz.w"] + fn __lsx_vmskltz_w(a: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmskltz.d"] + fn __lsx_vmskltz_d(a: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsigncov.b"] + fn __lsx_vsigncov_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsigncov.h"] + fn __lsx_vsigncov_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsigncov.w"] + fn __lsx_vsigncov_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsigncov.d"] + fn __lsx_vsigncov_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfmadd.s"] + fn __lsx_vfmadd_s(a: v4f32, b: v4f32, c: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfmadd.d"] + fn __lsx_vfmadd_d(a: v2f64, b: v2f64, c: 
v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfmsub.s"] + fn __lsx_vfmsub_s(a: v4f32, b: v4f32, c: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfmsub.d"] + fn __lsx_vfmsub_d(a: v2f64, b: v2f64, c: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfnmadd.s"] + fn __lsx_vfnmadd_s(a: v4f32, b: v4f32, c: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfnmadd.d"] + fn __lsx_vfnmadd_d(a: v2f64, b: v2f64, c: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfnmsub.s"] + fn __lsx_vfnmsub_s(a: v4f32, b: v4f32, c: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfnmsub.d"] + fn __lsx_vfnmsub_d(a: v2f64, b: v2f64, c: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vftintrne.w.s"] + fn __lsx_vftintrne_w_s(a: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vftintrne.l.d"] + fn __lsx_vftintrne_l_d(a: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vftintrp.w.s"] + fn __lsx_vftintrp_w_s(a: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vftintrp.l.d"] + fn __lsx_vftintrp_l_d(a: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vftintrm.w.s"] + fn __lsx_vftintrm_w_s(a: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vftintrm.l.d"] + fn __lsx_vftintrm_l_d(a: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vftint.w.d"] + fn __lsx_vftint_w_d(a: v2f64, b: v2f64) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vffint.s.l"] + fn __lsx_vffint_s_l(a: v2i64, b: v2i64) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vftintrz.w.d"] + fn __lsx_vftintrz_w_d(a: v2f64, b: v2f64) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vftintrp.w.d"] + fn __lsx_vftintrp_w_d(a: v2f64, b: v2f64) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vftintrm.w.d"] + fn __lsx_vftintrm_w_d(a: v2f64, b: v2f64) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vftintrne.w.d"] + fn __lsx_vftintrne_w_d(a: v2f64, b: v2f64) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vftintl.l.s"] + fn __lsx_vftintl_l_s(a: v4f32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vftinth.l.s"] + fn __lsx_vftinth_l_s(a: v4f32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vffinth.d.w"] + fn __lsx_vffinth_d_w(a: v4i32) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vffintl.d.w"] + fn __lsx_vffintl_d_w(a: v4i32) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vftintrzl.l.s"] + fn __lsx_vftintrzl_l_s(a: v4f32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vftintrzh.l.s"] + fn __lsx_vftintrzh_l_s(a: v4f32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vftintrpl.l.s"] + fn __lsx_vftintrpl_l_s(a: v4f32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vftintrph.l.s"] + fn __lsx_vftintrph_l_s(a: v4f32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vftintrml.l.s"] + fn __lsx_vftintrml_l_s(a: v4f32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vftintrmh.l.s"] + fn __lsx_vftintrmh_l_s(a: v4f32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vftintrnel.l.s"] + fn __lsx_vftintrnel_l_s(a: v4f32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vftintrneh.l.s"] + fn __lsx_vftintrneh_l_s(a: v4f32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfrintrne.s"] + fn __lsx_vfrintrne_s(a: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfrintrne.d"] + fn __lsx_vfrintrne_d(a: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfrintrz.s"] + fn __lsx_vfrintrz_s(a: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfrintrz.d"] + fn __lsx_vfrintrz_d(a: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfrintrp.s"] + fn __lsx_vfrintrp_s(a: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfrintrp.d"] + fn __lsx_vfrintrp_d(a: 
v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vfrintrm.s"] + fn __lsx_vfrintrm_s(a: v4f32) -> v4f32; + #[link_name = "llvm.loongarch.lsx.vfrintrm.d"] + fn __lsx_vfrintrm_d(a: v2f64) -> v2f64; + #[link_name = "llvm.loongarch.lsx.vstelm.b"] + fn __lsx_vstelm_b(a: v16i8, b: *mut i8, c: i32, d: u32); + #[link_name = "llvm.loongarch.lsx.vstelm.h"] + fn __lsx_vstelm_h(a: v8i16, b: *mut i8, c: i32, d: u32); + #[link_name = "llvm.loongarch.lsx.vstelm.w"] + fn __lsx_vstelm_w(a: v4i32, b: *mut i8, c: i32, d: u32); + #[link_name = "llvm.loongarch.lsx.vstelm.d"] + fn __lsx_vstelm_d(a: v2i64, b: *mut i8, c: i32, d: u32); + #[link_name = "llvm.loongarch.lsx.vaddwev.d.w"] + fn __lsx_vaddwev_d_w(a: v4i32, b: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vaddwev.w.h"] + fn __lsx_vaddwev_w_h(a: v8i16, b: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vaddwev.h.b"] + fn __lsx_vaddwev_h_b(a: v16i8, b: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vaddwod.d.w"] + fn __lsx_vaddwod_d_w(a: v4i32, b: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vaddwod.w.h"] + fn __lsx_vaddwod_w_h(a: v8i16, b: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vaddwod.h.b"] + fn __lsx_vaddwod_h_b(a: v16i8, b: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vaddwev.d.wu"] + fn __lsx_vaddwev_d_wu(a: v4u32, b: v4u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vaddwev.w.hu"] + fn __lsx_vaddwev_w_hu(a: v8u16, b: v8u16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vaddwev.h.bu"] + fn __lsx_vaddwev_h_bu(a: v16u8, b: v16u8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vaddwod.d.wu"] + fn __lsx_vaddwod_d_wu(a: v4u32, b: v4u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vaddwod.w.hu"] + fn __lsx_vaddwod_w_hu(a: v8u16, b: v8u16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vaddwod.h.bu"] + fn __lsx_vaddwod_h_bu(a: v16u8, b: v16u8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vaddwev.d.wu.w"] + fn __lsx_vaddwev_d_wu_w(a: v4u32, b: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vaddwev.w.hu.h"] + fn __lsx_vaddwev_w_hu_h(a: v8u16, b: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vaddwev.h.bu.b"] + fn __lsx_vaddwev_h_bu_b(a: v16u8, b: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vaddwod.d.wu.w"] + fn __lsx_vaddwod_d_wu_w(a: v4u32, b: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vaddwod.w.hu.h"] + fn __lsx_vaddwod_w_hu_h(a: v8u16, b: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vaddwod.h.bu.b"] + fn __lsx_vaddwod_h_bu_b(a: v16u8, b: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsubwev.d.w"] + fn __lsx_vsubwev_d_w(a: v4i32, b: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsubwev.w.h"] + fn __lsx_vsubwev_w_h(a: v8i16, b: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsubwev.h.b"] + fn __lsx_vsubwev_h_b(a: v16i8, b: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsubwod.d.w"] + fn __lsx_vsubwod_d_w(a: v4i32, b: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsubwod.w.h"] + fn __lsx_vsubwod_w_h(a: v8i16, b: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsubwod.h.b"] + fn __lsx_vsubwod_h_b(a: v16i8, b: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsubwev.d.wu"] + fn __lsx_vsubwev_d_wu(a: v4u32, b: v4u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsubwev.w.hu"] + fn __lsx_vsubwev_w_hu(a: v8u16, b: v8u16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsubwev.h.bu"] + fn __lsx_vsubwev_h_bu(a: v16u8, b: v16u8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsubwod.d.wu"] + fn __lsx_vsubwod_d_wu(a: v4u32, b: v4u32) 
-> v2i64; + #[link_name = "llvm.loongarch.lsx.vsubwod.w.hu"] + fn __lsx_vsubwod_w_hu(a: v8u16, b: v8u16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsubwod.h.bu"] + fn __lsx_vsubwod_h_bu(a: v16u8, b: v16u8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vaddwev.q.d"] + fn __lsx_vaddwev_q_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vaddwod.q.d"] + fn __lsx_vaddwod_q_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vaddwev.q.du"] + fn __lsx_vaddwev_q_du(a: v2u64, b: v2u64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vaddwod.q.du"] + fn __lsx_vaddwod_q_du(a: v2u64, b: v2u64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsubwev.q.d"] + fn __lsx_vsubwev_q_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsubwod.q.d"] + fn __lsx_vsubwod_q_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsubwev.q.du"] + fn __lsx_vsubwev_q_du(a: v2u64, b: v2u64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsubwod.q.du"] + fn __lsx_vsubwod_q_du(a: v2u64, b: v2u64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vaddwev.q.du.d"] + fn __lsx_vaddwev_q_du_d(a: v2u64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vaddwod.q.du.d"] + fn __lsx_vaddwod_q_du_d(a: v2u64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmulwev.d.w"] + fn __lsx_vmulwev_d_w(a: v4i32, b: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmulwev.w.h"] + fn __lsx_vmulwev_w_h(a: v8i16, b: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmulwev.h.b"] + fn __lsx_vmulwev_h_b(a: v16i8, b: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmulwod.d.w"] + fn __lsx_vmulwod_d_w(a: v4i32, b: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmulwod.w.h"] + fn __lsx_vmulwod_w_h(a: v8i16, b: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmulwod.h.b"] + fn __lsx_vmulwod_h_b(a: v16i8, b: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmulwev.d.wu"] + fn __lsx_vmulwev_d_wu(a: v4u32, b: v4u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmulwev.w.hu"] + fn __lsx_vmulwev_w_hu(a: v8u16, b: v8u16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmulwev.h.bu"] + fn __lsx_vmulwev_h_bu(a: v16u8, b: v16u8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmulwod.d.wu"] + fn __lsx_vmulwod_d_wu(a: v4u32, b: v4u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmulwod.w.hu"] + fn __lsx_vmulwod_w_hu(a: v8u16, b: v8u16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmulwod.h.bu"] + fn __lsx_vmulwod_h_bu(a: v16u8, b: v16u8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmulwev.d.wu.w"] + fn __lsx_vmulwev_d_wu_w(a: v4u32, b: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmulwev.w.hu.h"] + fn __lsx_vmulwev_w_hu_h(a: v8u16, b: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmulwev.h.bu.b"] + fn __lsx_vmulwev_h_bu_b(a: v16u8, b: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmulwod.d.wu.w"] + fn __lsx_vmulwod_d_wu_w(a: v4u32, b: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmulwod.w.hu.h"] + fn __lsx_vmulwod_w_hu_h(a: v8u16, b: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmulwod.h.bu.b"] + fn __lsx_vmulwod_h_bu_b(a: v16u8, b: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmulwev.q.d"] + fn __lsx_vmulwev_q_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmulwod.q.d"] + fn __lsx_vmulwod_q_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmulwev.q.du"] + fn __lsx_vmulwev_q_du(a: v2u64, b: v2u64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmulwod.q.du"] + fn 
__lsx_vmulwod_q_du(a: v2u64, b: v2u64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmulwev.q.du.d"] + fn __lsx_vmulwev_q_du_d(a: v2u64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmulwod.q.du.d"] + fn __lsx_vmulwod_q_du_d(a: v2u64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vhaddw.q.d"] + fn __lsx_vhaddw_q_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vhaddw.qu.du"] + fn __lsx_vhaddw_qu_du(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vhsubw.q.d"] + fn __lsx_vhsubw_q_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vhsubw.qu.du"] + fn __lsx_vhsubw_qu_du(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vmaddwev.d.w"] + fn __lsx_vmaddwev_d_w(a: v2i64, b: v4i32, c: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmaddwev.w.h"] + fn __lsx_vmaddwev_w_h(a: v4i32, b: v8i16, c: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmaddwev.h.b"] + fn __lsx_vmaddwev_h_b(a: v8i16, b: v16i8, c: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmaddwev.d.wu"] + fn __lsx_vmaddwev_d_wu(a: v2u64, b: v4u32, c: v4u32) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vmaddwev.w.hu"] + fn __lsx_vmaddwev_w_hu(a: v4u32, b: v8u16, c: v8u16) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vmaddwev.h.bu"] + fn __lsx_vmaddwev_h_bu(a: v8u16, b: v16u8, c: v16u8) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vmaddwod.d.w"] + fn __lsx_vmaddwod_d_w(a: v2i64, b: v4i32, c: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmaddwod.w.h"] + fn __lsx_vmaddwod_w_h(a: v4i32, b: v8i16, c: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmaddwod.h.b"] + fn __lsx_vmaddwod_h_b(a: v8i16, b: v16i8, c: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmaddwod.d.wu"] + fn __lsx_vmaddwod_d_wu(a: v2u64, b: v4u32, c: v4u32) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vmaddwod.w.hu"] + fn __lsx_vmaddwod_w_hu(a: v4u32, b: v8u16, c: v8u16) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vmaddwod.h.bu"] + fn __lsx_vmaddwod_h_bu(a: v8u16, b: v16u8, c: v16u8) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vmaddwev.d.wu.w"] + fn __lsx_vmaddwev_d_wu_w(a: v2i64, b: v4u32, c: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmaddwev.w.hu.h"] + fn __lsx_vmaddwev_w_hu_h(a: v4i32, b: v8u16, c: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmaddwev.h.bu.b"] + fn __lsx_vmaddwev_h_bu_b(a: v8i16, b: v16u8, c: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmaddwod.d.wu.w"] + fn __lsx_vmaddwod_d_wu_w(a: v2i64, b: v4u32, c: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmaddwod.w.hu.h"] + fn __lsx_vmaddwod_w_hu_h(a: v4i32, b: v8u16, c: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vmaddwod.h.bu.b"] + fn __lsx_vmaddwod_h_bu_b(a: v8i16, b: v16u8, c: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vmaddwev.q.d"] + fn __lsx_vmaddwev_q_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmaddwod.q.d"] + fn __lsx_vmaddwod_q_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmaddwev.q.du"] + fn __lsx_vmaddwev_q_du(a: v2u64, b: v2u64, c: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vmaddwod.q.du"] + fn __lsx_vmaddwod_q_du(a: v2u64, b: v2u64, c: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vmaddwev.q.du.d"] + fn __lsx_vmaddwev_q_du_d(a: v2i64, b: v2u64, c: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmaddwod.q.du.d"] + fn __lsx_vmaddwod_q_du_d(a: v2i64, b: v2u64, c: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vrotr.b"] + fn 
__lsx_vrotr_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vrotr.h"] + fn __lsx_vrotr_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vrotr.w"] + fn __lsx_vrotr_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vrotr.d"] + fn __lsx_vrotr_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vadd.q"] + fn __lsx_vadd_q(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsub.q"] + fn __lsx_vsub_q(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vldrepl.b"] + fn __lsx_vldrepl_b(a: *const i8, b: i32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vldrepl.h"] + fn __lsx_vldrepl_h(a: *const i8, b: i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vldrepl.w"] + fn __lsx_vldrepl_w(a: *const i8, b: i32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vldrepl.d"] + fn __lsx_vldrepl_d(a: *const i8, b: i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vmskgez.b"] + fn __lsx_vmskgez_b(a: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vmsknz.b"] + fn __lsx_vmsknz_b(a: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vexth.h.b"] + fn __lsx_vexth_h_b(a: v16i8) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vexth.w.h"] + fn __lsx_vexth_w_h(a: v8i16) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vexth.d.w"] + fn __lsx_vexth_d_w(a: v4i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vexth.q.d"] + fn __lsx_vexth_q_d(a: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vexth.hu.bu"] + fn __lsx_vexth_hu_bu(a: v16u8) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vexth.wu.hu"] + fn __lsx_vexth_wu_hu(a: v8u16) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vexth.du.wu"] + fn __lsx_vexth_du_wu(a: v4u32) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vexth.qu.du"] + fn __lsx_vexth_qu_du(a: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vrotri.b"] + fn __lsx_vrotri_b(a: v16i8, b: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vrotri.h"] + fn __lsx_vrotri_h(a: v8i16, b: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vrotri.w"] + fn __lsx_vrotri_w(a: v4i32, b: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vrotri.d"] + fn __lsx_vrotri_d(a: v2i64, b: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vextl.q.d"] + fn __lsx_vextl_q_d(a: v2i64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsrlni.b.h"] + fn __lsx_vsrlni_b_h(a: v16i8, b: v16i8, c: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsrlni.h.w"] + fn __lsx_vsrlni_h_w(a: v8i16, b: v8i16, c: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsrlni.w.d"] + fn __lsx_vsrlni_w_d(a: v4i32, b: v4i32, c: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsrlni.d.q"] + fn __lsx_vsrlni_d_q(a: v2i64, b: v2i64, c: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsrlrni.b.h"] + fn __lsx_vsrlrni_b_h(a: v16i8, b: v16i8, c: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsrlrni.h.w"] + fn __lsx_vsrlrni_h_w(a: v8i16, b: v8i16, c: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsrlrni.w.d"] + fn __lsx_vsrlrni_w_d(a: v4i32, b: v4i32, c: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsrlrni.d.q"] + fn __lsx_vsrlrni_d_q(a: v2i64, b: v2i64, c: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vssrlni.b.h"] + fn __lsx_vssrlni_b_h(a: v16i8, b: v16i8, c: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vssrlni.h.w"] + fn __lsx_vssrlni_h_w(a: v8i16, b: v8i16, c: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vssrlni.w.d"] + fn __lsx_vssrlni_w_d(a: v4i32, b: v4i32, c: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vssrlni.d.q"] 
+ fn __lsx_vssrlni_d_q(a: v2i64, b: v2i64, c: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vssrlni.bu.h"] + fn __lsx_vssrlni_bu_h(a: v16u8, b: v16i8, c: u32) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vssrlni.hu.w"] + fn __lsx_vssrlni_hu_w(a: v8u16, b: v8i16, c: u32) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vssrlni.wu.d"] + fn __lsx_vssrlni_wu_d(a: v4u32, b: v4i32, c: u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vssrlni.du.q"] + fn __lsx_vssrlni_du_q(a: v2u64, b: v2i64, c: u32) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vssrlrni.b.h"] + fn __lsx_vssrlrni_b_h(a: v16i8, b: v16i8, c: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vssrlrni.h.w"] + fn __lsx_vssrlrni_h_w(a: v8i16, b: v8i16, c: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vssrlrni.w.d"] + fn __lsx_vssrlrni_w_d(a: v4i32, b: v4i32, c: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vssrlrni.d.q"] + fn __lsx_vssrlrni_d_q(a: v2i64, b: v2i64, c: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vssrlrni.bu.h"] + fn __lsx_vssrlrni_bu_h(a: v16u8, b: v16i8, c: u32) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vssrlrni.hu.w"] + fn __lsx_vssrlrni_hu_w(a: v8u16, b: v8i16, c: u32) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vssrlrni.wu.d"] + fn __lsx_vssrlrni_wu_d(a: v4u32, b: v4i32, c: u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vssrlrni.du.q"] + fn __lsx_vssrlrni_du_q(a: v2u64, b: v2i64, c: u32) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vsrani.b.h"] + fn __lsx_vsrani_b_h(a: v16i8, b: v16i8, c: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsrani.h.w"] + fn __lsx_vsrani_h_w(a: v8i16, b: v8i16, c: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsrani.w.d"] + fn __lsx_vsrani_w_d(a: v4i32, b: v4i32, c: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsrani.d.q"] + fn __lsx_vsrani_d_q(a: v2i64, b: v2i64, c: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vsrarni.b.h"] + fn __lsx_vsrarni_b_h(a: v16i8, b: v16i8, c: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vsrarni.h.w"] + fn __lsx_vsrarni_h_w(a: v8i16, b: v8i16, c: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vsrarni.w.d"] + fn __lsx_vsrarni_w_d(a: v4i32, b: v4i32, c: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vsrarni.d.q"] + fn __lsx_vsrarni_d_q(a: v2i64, b: v2i64, c: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vssrani.b.h"] + fn __lsx_vssrani_b_h(a: v16i8, b: v16i8, c: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vssrani.h.w"] + fn __lsx_vssrani_h_w(a: v8i16, b: v8i16, c: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vssrani.w.d"] + fn __lsx_vssrani_w_d(a: v4i32, b: v4i32, c: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vssrani.d.q"] + fn __lsx_vssrani_d_q(a: v2i64, b: v2i64, c: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vssrani.bu.h"] + fn __lsx_vssrani_bu_h(a: v16u8, b: v16i8, c: u32) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vssrani.hu.w"] + fn __lsx_vssrani_hu_w(a: v8u16, b: v8i16, c: u32) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vssrani.wu.d"] + fn __lsx_vssrani_wu_d(a: v4u32, b: v4i32, c: u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vssrani.du.q"] + fn __lsx_vssrani_du_q(a: v2u64, b: v2i64, c: u32) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vssrarni.b.h"] + fn __lsx_vssrarni_b_h(a: v16i8, b: v16i8, c: u32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vssrarni.h.w"] + fn __lsx_vssrarni_h_w(a: v8i16, b: v8i16, c: u32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vssrarni.w.d"] + fn __lsx_vssrarni_w_d(a: v4i32, b: v4i32, c: u32) -> v4i32; + #[link_name 
= "llvm.loongarch.lsx.vssrarni.d.q"] + fn __lsx_vssrarni_d_q(a: v2i64, b: v2i64, c: u32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vssrarni.bu.h"] + fn __lsx_vssrarni_bu_h(a: v16u8, b: v16i8, c: u32) -> v16u8; + #[link_name = "llvm.loongarch.lsx.vssrarni.hu.w"] + fn __lsx_vssrarni_hu_w(a: v8u16, b: v8i16, c: u32) -> v8u16; + #[link_name = "llvm.loongarch.lsx.vssrarni.wu.d"] + fn __lsx_vssrarni_wu_d(a: v4u32, b: v4i32, c: u32) -> v4u32; + #[link_name = "llvm.loongarch.lsx.vssrarni.du.q"] + fn __lsx_vssrarni_du_q(a: v2u64, b: v2i64, c: u32) -> v2u64; + #[link_name = "llvm.loongarch.lsx.vpermi.w"] + fn __lsx_vpermi_w(a: v4i32, b: v4i32, c: u32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vld"] + fn __lsx_vld(a: *const i8, b: i32) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vst"] + fn __lsx_vst(a: v16i8, b: *mut i8, c: i32); + #[link_name = "llvm.loongarch.lsx.vssrlrn.b.h"] + fn __lsx_vssrlrn_b_h(a: v8i16, b: v8i16) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vssrlrn.h.w"] + fn __lsx_vssrlrn_h_w(a: v4i32, b: v4i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vssrlrn.w.d"] + fn __lsx_vssrlrn_w_d(a: v2i64, b: v2i64) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vssrln.b.h"] + fn __lsx_vssrln_b_h(a: v8i16, b: v8i16) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vssrln.h.w"] + fn __lsx_vssrln_h_w(a: v4i32, b: v4i32) -> v8i16; + #[link_name = "llvm.loongarch.lsx.vssrln.w.d"] + fn __lsx_vssrln_w_d(a: v2i64, b: v2i64) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vorn.v"] + fn __lsx_vorn_v(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vldi"] + fn __lsx_vldi(a: i32) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vshuf.b"] + fn __lsx_vshuf_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vldx"] + fn __lsx_vldx(a: *const i8, b: i64) -> v16i8; + #[link_name = "llvm.loongarch.lsx.vstx"] + fn __lsx_vstx(a: v16i8, b: *mut i8, c: i64); + #[link_name = "llvm.loongarch.lsx.vextl.qu.du"] + fn __lsx_vextl_qu_du(a: v2u64) -> v2u64; + #[link_name = "llvm.loongarch.lsx.bnz.b"] + fn __lsx_bnz_b(a: v16u8) -> i32; + #[link_name = "llvm.loongarch.lsx.bnz.d"] + fn __lsx_bnz_d(a: v2u64) -> i32; + #[link_name = "llvm.loongarch.lsx.bnz.h"] + fn __lsx_bnz_h(a: v8u16) -> i32; + #[link_name = "llvm.loongarch.lsx.bnz.v"] + fn __lsx_bnz_v(a: v16u8) -> i32; + #[link_name = "llvm.loongarch.lsx.bnz.w"] + fn __lsx_bnz_w(a: v4u32) -> i32; + #[link_name = "llvm.loongarch.lsx.bz.b"] + fn __lsx_bz_b(a: v16u8) -> i32; + #[link_name = "llvm.loongarch.lsx.bz.d"] + fn __lsx_bz_d(a: v2u64) -> i32; + #[link_name = "llvm.loongarch.lsx.bz.h"] + fn __lsx_bz_h(a: v8u16) -> i32; + #[link_name = "llvm.loongarch.lsx.bz.v"] + fn __lsx_bz_v(a: v16u8) -> i32; + #[link_name = "llvm.loongarch.lsx.bz.w"] + fn __lsx_bz_w(a: v4u32) -> i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.caf.d"] + fn __lsx_vfcmp_caf_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.caf.s"] + fn __lsx_vfcmp_caf_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.ceq.d"] + fn __lsx_vfcmp_ceq_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.ceq.s"] + fn __lsx_vfcmp_ceq_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.cle.d"] + fn __lsx_vfcmp_cle_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.cle.s"] + fn __lsx_vfcmp_cle_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.clt.d"] + fn __lsx_vfcmp_clt_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.clt.s"] + 
fn __lsx_vfcmp_clt_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.cne.d"] + fn __lsx_vfcmp_cne_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.cne.s"] + fn __lsx_vfcmp_cne_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.cor.d"] + fn __lsx_vfcmp_cor_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.cor.s"] + fn __lsx_vfcmp_cor_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.cueq.d"] + fn __lsx_vfcmp_cueq_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.cueq.s"] + fn __lsx_vfcmp_cueq_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.cule.d"] + fn __lsx_vfcmp_cule_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.cule.s"] + fn __lsx_vfcmp_cule_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.cult.d"] + fn __lsx_vfcmp_cult_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.cult.s"] + fn __lsx_vfcmp_cult_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.cun.d"] + fn __lsx_vfcmp_cun_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.cune.d"] + fn __lsx_vfcmp_cune_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.cune.s"] + fn __lsx_vfcmp_cune_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.cun.s"] + fn __lsx_vfcmp_cun_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.saf.d"] + fn __lsx_vfcmp_saf_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.saf.s"] + fn __lsx_vfcmp_saf_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.seq.d"] + fn __lsx_vfcmp_seq_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.seq.s"] + fn __lsx_vfcmp_seq_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.sle.d"] + fn __lsx_vfcmp_sle_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.sle.s"] + fn __lsx_vfcmp_sle_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.slt.d"] + fn __lsx_vfcmp_slt_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.slt.s"] + fn __lsx_vfcmp_slt_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.sne.d"] + fn __lsx_vfcmp_sne_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.sne.s"] + fn __lsx_vfcmp_sne_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.sor.d"] + fn __lsx_vfcmp_sor_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.sor.s"] + fn __lsx_vfcmp_sor_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.sueq.d"] + fn __lsx_vfcmp_sueq_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.sueq.s"] + fn __lsx_vfcmp_sueq_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.sule.d"] + fn __lsx_vfcmp_sule_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.sule.s"] + fn __lsx_vfcmp_sule_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.sult.d"] + fn __lsx_vfcmp_sult_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.sult.s"] + fn __lsx_vfcmp_sult_s(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.loongarch.lsx.vfcmp.sun.d"] + fn __lsx_vfcmp_sun_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.loongarch.lsx.vfcmp.sune.d"] + fn 
__lsx_vfcmp_sune_d(a: v2f64, b: v2f64) -> v2i64;
+    #[link_name = "llvm.loongarch.lsx.vfcmp.sune.s"]
+    fn __lsx_vfcmp_sune_s(a: v4f32, b: v4f32) -> v4i32;
+    #[link_name = "llvm.loongarch.lsx.vfcmp.sun.s"]
+    fn __lsx_vfcmp_sun_s(a: v4f32, b: v4f32) -> v4i32;
+    #[link_name = "llvm.loongarch.lsx.vrepli.b"]
+    fn __lsx_vrepli_b(a: i32) -> v16i8;
+    #[link_name = "llvm.loongarch.lsx.vrepli.d"]
+    fn __lsx_vrepli_d(a: i32) -> v2i64;
+    #[link_name = "llvm.loongarch.lsx.vrepli.h"]
+    fn __lsx_vrepli_h(a: i32) -> v8i16;
+    #[link_name = "llvm.loongarch.lsx.vrepli.w"]
+    fn __lsx_vrepli_w(a: i32) -> v4i32;
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsll_b(a: v16i8, b: v16i8) -> v16i8 {
+    __lsx_vsll_b(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsll_h(a: v8i16, b: v8i16) -> v8i16 {
+    __lsx_vsll_h(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsll_w(a: v4i32, b: v4i32) -> v4i32 {
+    __lsx_vsll_w(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsll_d(a: v2i64, b: v2i64) -> v2i64 {
+    __lsx_vsll_d(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vslli_b<const IMM3: u32>(a: v16i8) -> v16i8 {
+    static_assert_uimm_bits!(IMM3, 3);
+    __lsx_vslli_b(a, IMM3)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vslli_h<const IMM4: u32>(a: v8i16) -> v8i16 {
+    static_assert_uimm_bits!(IMM4, 4);
+    __lsx_vslli_h(a, IMM4)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vslli_w<const IMM5: u32>(a: v4i32) -> v4i32 {
+    static_assert_uimm_bits!(IMM5, 5);
+    __lsx_vslli_w(a, IMM5)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vslli_d<const IMM6: u32>(a: v2i64) -> v2i64 {
+    static_assert_uimm_bits!(IMM6, 6);
+    __lsx_vslli_d(a, IMM6)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsra_b(a: v16i8, b: v16i8) -> v16i8 {
+    __lsx_vsra_b(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsra_h(a: v8i16, b: v8i16) -> v8i16 {
+    __lsx_vsra_h(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsra_w(a: v4i32, b: v4i32) -> v4i32 {
+    __lsx_vsra_w(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsra_d(a: v2i64, b: v2i64) -> v2i64 {
+    __lsx_vsra_d(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrai_b<const IMM3: u32>(a: v16i8) -> v16i8 {
+    static_assert_uimm_bits!(IMM3, 3);
+    __lsx_vsrai_b(a, IMM3)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrai_h<const IMM4: u32>(a: v8i16) -> v8i16 {
+    static_assert_uimm_bits!(IMM4, 4);
+    __lsx_vsrai_h(a, IMM4)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrai_w<const IMM5: u32>(a: v4i32) -> v4i32 {
+    static_assert_uimm_bits!(IMM5, 5);
+    __lsx_vsrai_w(a, IMM5)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrai_d<const IMM6: u32>(a: v2i64) -> v2i64 {
+    static_assert_uimm_bits!(IMM6, 6);
+    __lsx_vsrai_d(a, IMM6)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrar_b(a: v16i8, b: v16i8) -> v16i8 {
+    __lsx_vsrar_b(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrar_h(a: v8i16, b: v8i16) -> v8i16 {
+    __lsx_vsrar_h(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrar_w(a: v4i32, b: v4i32) -> v4i32 {
+    __lsx_vsrar_w(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrar_d(a: v2i64, b: v2i64) -> v2i64 {
+    __lsx_vsrar_d(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrari_b<const IMM3: u32>(a: v16i8) -> v16i8 {
+    static_assert_uimm_bits!(IMM3, 3);
+    __lsx_vsrari_b(a, IMM3)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrari_h<const IMM4: u32>(a: v8i16) -> v8i16 {
+    static_assert_uimm_bits!(IMM4, 4);
+    __lsx_vsrari_h(a, IMM4)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrari_w<const IMM5: u32>(a: v4i32) -> v4i32 {
+    static_assert_uimm_bits!(IMM5, 5);
+    __lsx_vsrari_w(a, IMM5)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrari_d<const IMM6: u32>(a: v2i64) -> v2i64 {
+    static_assert_uimm_bits!(IMM6, 6);
+    __lsx_vsrari_d(a, IMM6)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrl_b(a: v16i8, b: v16i8) -> v16i8 {
+    __lsx_vsrl_b(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrl_h(a: v8i16, b: v8i16) -> v8i16 {
+    __lsx_vsrl_h(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrl_w(a: v4i32, b: v4i32) -> v4i32 {
+    __lsx_vsrl_w(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrl_d(a: v2i64, b: v2i64) -> v2i64 {
+    __lsx_vsrl_d(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_loongarch", issue = "117427")]
+pub unsafe fn lsx_vsrli_b<const IMM3: u32>(a: v16i8) -> v16i8 {
+    static_assert_uimm_bits!(IMM3, 3);
+    __lsx_vsrli_b(a, IMM3)
+}
+
+#[inline]
+#[target_feature(enable = "lsx")]
+#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrli_h(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vsrli_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrli_w(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vsrli_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrli_d(a: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vsrli_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlr_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vsrlr_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlr_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vsrlr_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlr_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vsrlr_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlr_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vsrlr_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlri_b(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM3, 3); + __lsx_vsrlri_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlri_h(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vsrlri_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlri_w(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vsrlri_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlri_d(a: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vsrlri_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitclr_b(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vbitclr_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitclr_h(a: v8u16, b: v8u16) -> v8u16 { + __lsx_vbitclr_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitclr_w(a: v4u32, b: v4u32) -> v4u32 { + __lsx_vbitclr_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitclr_d(a: v2u64, b: v2u64) -> v2u64 { + __lsx_vbitclr_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitclri_b(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM3, 
3); + __lsx_vbitclri_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitclri_h(a: v8u16) -> v8u16 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vbitclri_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitclri_w(a: v4u32) -> v4u32 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vbitclri_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitclri_d(a: v2u64) -> v2u64 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vbitclri_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitset_b(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vbitset_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitset_h(a: v8u16, b: v8u16) -> v8u16 { + __lsx_vbitset_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitset_w(a: v4u32, b: v4u32) -> v4u32 { + __lsx_vbitset_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitset_d(a: v2u64, b: v2u64) -> v2u64 { + __lsx_vbitset_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitseti_b(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM3, 3); + __lsx_vbitseti_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitseti_h(a: v8u16) -> v8u16 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vbitseti_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitseti_w(a: v4u32) -> v4u32 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vbitseti_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitseti_d(a: v2u64) -> v2u64 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vbitseti_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitrev_b(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vbitrev_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitrev_h(a: v8u16, b: v8u16) -> v8u16 { + __lsx_vbitrev_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitrev_w(a: v4u32, b: v4u32) -> v4u32 { + __lsx_vbitrev_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitrev_d(a: v2u64, b: v2u64) -> v2u64 { + __lsx_vbitrev_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] 
+#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitrevi_b(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM3, 3); + __lsx_vbitrevi_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitrevi_h(a: v8u16) -> v8u16 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vbitrevi_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitrevi_w(a: v4u32) -> v4u32 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vbitrevi_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitrevi_d(a: v2u64) -> v2u64 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vbitrevi_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vadd_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vadd_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vadd_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vadd_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vadd_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vadd_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vadd_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vadd_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddi_bu(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vaddi_bu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddi_hu(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vaddi_hu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddi_wu(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vaddi_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddi_du(a: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vaddi_du(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsub_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vsub_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsub_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vsub_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsub_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vsub_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsub_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vsub_d(a, b) +} + +#[inline] 
+#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubi_bu(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vsubi_bu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubi_hu(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vsubi_hu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubi_wu(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vsubi_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubi_du(a: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vsubi_du(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmax_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vmax_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmax_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vmax_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmax_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vmax_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmax_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vmax_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaxi_b(a: v16i8) -> v16i8 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vmaxi_b(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaxi_h(a: v8i16) -> v8i16 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vmaxi_h(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaxi_w(a: v4i32) -> v4i32 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vmaxi_w(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaxi_d(a: v2i64) -> v2i64 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vmaxi_d(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmax_bu(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vmax_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmax_hu(a: v8u16, b: v8u16) -> v8u16 { + __lsx_vmax_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmax_wu(a: v4u32, b: v4u32) -> v4u32 { + __lsx_vmax_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn 
lsx_vmax_du(a: v2u64, b: v2u64) -> v2u64 { + __lsx_vmax_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaxi_bu(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vmaxi_bu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaxi_hu(a: v8u16) -> v8u16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vmaxi_hu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaxi_wu(a: v4u32) -> v4u32 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vmaxi_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaxi_du(a: v2u64) -> v2u64 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vmaxi_du(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmin_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vmin_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmin_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vmin_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmin_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vmin_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmin_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vmin_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmini_b(a: v16i8) -> v16i8 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vmini_b(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmini_h(a: v8i16) -> v8i16 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vmini_h(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmini_w(a: v4i32) -> v4i32 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vmini_w(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmini_d(a: v2i64) -> v2i64 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vmini_d(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmin_bu(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vmin_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmin_hu(a: v8u16, b: v8u16) -> v8u16 { + __lsx_vmin_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmin_wu(a: v4u32, b: v4u32) -> v4u32 { + __lsx_vmin_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] 
+#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmin_du(a: v2u64, b: v2u64) -> v2u64 { + __lsx_vmin_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmini_bu(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vmini_bu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmini_hu(a: v8u16) -> v8u16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vmini_hu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmini_wu(a: v4u32) -> v4u32 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vmini_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmini_du(a: v2u64) -> v2u64 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vmini_du(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vseq_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vseq_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vseq_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vseq_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vseq_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vseq_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vseq_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vseq_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vseqi_b(a: v16i8) -> v16i8 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vseqi_b(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vseqi_h(a: v8i16) -> v8i16 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vseqi_h(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vseqi_w(a: v4i32) -> v4i32 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vseqi_w(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vseqi_d(a: v2i64) -> v2i64 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vseqi_d(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslti_b(a: v16i8) -> v16i8 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vslti_b(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslt_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vslt_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = 
"117427")] +pub unsafe fn lsx_vslt_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vslt_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslt_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vslt_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslt_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vslt_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslti_h(a: v8i16) -> v8i16 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vslti_h(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslti_w(a: v4i32) -> v4i32 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vslti_w(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslti_d(a: v2i64) -> v2i64 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vslti_d(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslt_bu(a: v16u8, b: v16u8) -> v16i8 { + __lsx_vslt_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslt_hu(a: v8u16, b: v8u16) -> v8i16 { + __lsx_vslt_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslt_wu(a: v4u32, b: v4u32) -> v4i32 { + __lsx_vslt_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslt_du(a: v2u64, b: v2u64) -> v2i64 { + __lsx_vslt_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslti_bu(a: v16u8) -> v16i8 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vslti_bu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslti_hu(a: v8u16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vslti_hu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslti_wu(a: v4u32) -> v4i32 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vslti_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslti_du(a: v2u64) -> v2i64 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vslti_du(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsle_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vsle_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsle_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vsle_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = 
"117427")] +pub unsafe fn lsx_vsle_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vsle_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsle_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vsle_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslei_b(a: v16i8) -> v16i8 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vslei_b(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslei_h(a: v8i16) -> v8i16 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vslei_h(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslei_w(a: v4i32) -> v4i32 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vslei_w(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslei_d(a: v2i64) -> v2i64 { + static_assert_simm_bits!(IMM_S5, 5); + __lsx_vslei_d(a, IMM_S5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsle_bu(a: v16u8, b: v16u8) -> v16i8 { + __lsx_vsle_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsle_hu(a: v8u16, b: v8u16) -> v8i16 { + __lsx_vsle_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsle_wu(a: v4u32, b: v4u32) -> v4i32 { + __lsx_vsle_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsle_du(a: v2u64, b: v2u64) -> v2i64 { + __lsx_vsle_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslei_bu(a: v16u8) -> v16i8 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vslei_bu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslei_hu(a: v8u16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vslei_hu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslei_wu(a: v4u32) -> v4i32 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vslei_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vslei_du(a: v2u64) -> v2i64 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vslei_du(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsat_b(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM3, 3); + __lsx_vsat_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub 
unsafe fn lsx_vsat_h(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vsat_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsat_w(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vsat_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsat_d(a: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vsat_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsat_bu(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM3, 3); + __lsx_vsat_bu(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsat_hu(a: v8u16) -> v8u16 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vsat_hu(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsat_wu(a: v4u32) -> v4u32 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vsat_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsat_du(a: v2u64) -> v2u64 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vsat_du(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vadda_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vadda_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vadda_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vadda_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vadda_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vadda_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vadda_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vadda_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsadd_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vsadd_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsadd_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vsadd_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsadd_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vsadd_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsadd_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vsadd_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsadd_bu(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vsadd_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsadd_hu(a: v8u16, b: v8u16) -> 
v8u16 { + __lsx_vsadd_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsadd_wu(a: v4u32, b: v4u32) -> v4u32 { + __lsx_vsadd_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsadd_du(a: v2u64, b: v2u64) -> v2u64 { + __lsx_vsadd_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavg_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vavg_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavg_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vavg_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavg_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vavg_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavg_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vavg_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavg_bu(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vavg_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavg_hu(a: v8u16, b: v8u16) -> v8u16 { + __lsx_vavg_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavg_wu(a: v4u32, b: v4u32) -> v4u32 { + __lsx_vavg_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavg_du(a: v2u64, b: v2u64) -> v2u64 { + __lsx_vavg_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavgr_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vavgr_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavgr_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vavgr_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavgr_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vavgr_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavgr_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vavgr_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavgr_bu(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vavgr_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavgr_hu(a: v8u16, b: v8u16) -> v8u16 { + __lsx_vavgr_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavgr_wu(a: v4u32, b: v4u32) -> v4u32 { + __lsx_vavgr_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vavgr_du(a: v2u64, b: v2u64) -> v2u64 { + __lsx_vavgr_du(a, b) +} + +#[inline] +#[target_feature(enable = 
"lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssub_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vssub_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssub_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vssub_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssub_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vssub_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssub_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vssub_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssub_bu(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vssub_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssub_hu(a: v8u16, b: v8u16) -> v8u16 { + __lsx_vssub_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssub_wu(a: v4u32, b: v4u32) -> v4u32 { + __lsx_vssub_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssub_du(a: v2u64, b: v2u64) -> v2u64 { + __lsx_vssub_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vabsd_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vabsd_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vabsd_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vabsd_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vabsd_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vabsd_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vabsd_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vabsd_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vabsd_bu(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vabsd_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vabsd_hu(a: v8u16, b: v8u16) -> v8u16 { + __lsx_vabsd_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vabsd_wu(a: v4u32, b: v4u32) -> v4u32 { + __lsx_vabsd_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vabsd_du(a: v2u64, b: v2u64) -> v2u64 { + __lsx_vabsd_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmul_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vmul_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmul_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vmul_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = 
"117427")] +pub unsafe fn lsx_vmul_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vmul_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmul_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vmul_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmadd_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8 { + __lsx_vmadd_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmadd_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 { + __lsx_vmadd_h(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmadd_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 { + __lsx_vmadd_w(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmadd_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64 { + __lsx_vmadd_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmsub_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8 { + __lsx_vmsub_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmsub_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 { + __lsx_vmsub_h(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmsub_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 { + __lsx_vmsub_w(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmsub_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64 { + __lsx_vmsub_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vdiv_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vdiv_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vdiv_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vdiv_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vdiv_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vdiv_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vdiv_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vdiv_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vdiv_bu(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vdiv_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vdiv_hu(a: v8u16, b: v8u16) -> v8u16 { + __lsx_vdiv_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vdiv_wu(a: v4u32, b: v4u32) -> v4u32 { + __lsx_vdiv_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vdiv_du(a: v2u64, b: v2u64) -> v2u64 { + __lsx_vdiv_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = 
"stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhaddw_h_b(a: v16i8, b: v16i8) -> v8i16 { + __lsx_vhaddw_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhaddw_w_h(a: v8i16, b: v8i16) -> v4i32 { + __lsx_vhaddw_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhaddw_d_w(a: v4i32, b: v4i32) -> v2i64 { + __lsx_vhaddw_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhaddw_hu_bu(a: v16u8, b: v16u8) -> v8u16 { + __lsx_vhaddw_hu_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhaddw_wu_hu(a: v8u16, b: v8u16) -> v4u32 { + __lsx_vhaddw_wu_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhaddw_du_wu(a: v4u32, b: v4u32) -> v2u64 { + __lsx_vhaddw_du_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhsubw_h_b(a: v16i8, b: v16i8) -> v8i16 { + __lsx_vhsubw_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhsubw_w_h(a: v8i16, b: v8i16) -> v4i32 { + __lsx_vhsubw_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhsubw_d_w(a: v4i32, b: v4i32) -> v2i64 { + __lsx_vhsubw_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhsubw_hu_bu(a: v16u8, b: v16u8) -> v8i16 { + __lsx_vhsubw_hu_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhsubw_wu_hu(a: v8u16, b: v8u16) -> v4i32 { + __lsx_vhsubw_wu_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhsubw_du_wu(a: v4u32, b: v4u32) -> v2i64 { + __lsx_vhsubw_du_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmod_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vmod_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmod_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vmod_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmod_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vmod_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmod_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vmod_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmod_bu(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vmod_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmod_hu(a: v8u16, b: v8u16) -> v8u16 { + __lsx_vmod_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] 
+#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmod_wu(a: v4u32, b: v4u32) -> v4u32 { + __lsx_vmod_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmod_du(a: v2u64, b: v2u64) -> v2u64 { + __lsx_vmod_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vreplve_b(a: v16i8, b: i32) -> v16i8 { + __lsx_vreplve_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vreplve_h(a: v8i16, b: i32) -> v8i16 { + __lsx_vreplve_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vreplve_w(a: v4i32, b: i32) -> v4i32 { + __lsx_vreplve_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vreplve_d(a: v2i64, b: i32) -> v2i64 { + __lsx_vreplve_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vreplvei_b<const IMM4: u32>(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vreplvei_b(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vreplvei_h<const IMM3: u32>(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM3, 3); + __lsx_vreplvei_h(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vreplvei_w<const IMM2: u32>(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM2, 2); + __lsx_vreplvei_w(a, IMM2) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vreplvei_d<const IMM1: u32>(a: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM1, 1); + __lsx_vreplvei_d(a, IMM1) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickev_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vpickev_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickev_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vpickev_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickev_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vpickev_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickev_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vpickev_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickod_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vpickod_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickod_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vpickod_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickod_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vpickod_w(a, b) +} +
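The `lsx_vreplvei_*` intrinsics above take their lane index as a const generic that is range-checked by `static_assert_uimm_bits!`; because of `#[rustc_legacy_const_generics(1)]` the index can still be written as an ordinary trailing argument, C-intrinsic style. A minimal usage sketch, not taken from this patch, assuming a nightly toolchain with `stdarch_loongarch` enabled and the usual `core::arch::loongarch64` re-exports; the `broadcast_lane2` wrapper is hypothetical:

```rust
#![feature(stdarch_loongarch)]

#[cfg(target_arch = "loongarch64")]
use core::arch::loongarch64::*;

// Broadcast lane 2 of a v4i32 to all four lanes with lsx_vreplvei_w.
#[cfg(target_arch = "loongarch64")]
#[target_feature(enable = "lsx")]
unsafe fn broadcast_lane2(v: v4i32) -> v4i32 {
    // The trailing literal becomes the const generic lane index and is
    // checked against 2 bits at compile time.
    unsafe { lsx_vreplvei_w(v, 2) }
}
```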
+#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickod_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vpickod_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vilvh_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vilvh_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vilvh_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vilvh_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vilvh_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vilvh_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vilvh_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vilvh_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vilvl_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vilvl_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vilvl_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vilvl_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vilvl_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vilvl_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vilvl_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vilvl_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpackev_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vpackev_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpackev_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vpackev_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpackev_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vpackev_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpackev_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vpackev_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpackod_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vpackod_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpackod_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vpackod_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpackod_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vpackod_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpackod_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vpackod_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vshuf_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 { + __lsx_vshuf_h(a, b, c) +} + +#[inline] 
+#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vshuf_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 { + __lsx_vshuf_w(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vshuf_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64 { + __lsx_vshuf_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vand_v(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vand_v(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vandi_b<const IMM8: u32>(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM8, 8); + __lsx_vandi_b(a, IMM8) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vor_v(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vor_v(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vori_b<const IMM8: u32>(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM8, 8); + __lsx_vori_b(a, IMM8) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vnor_v(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vnor_v(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vnori_b<const IMM8: u32>(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM8, 8); + __lsx_vnori_b(a, IMM8) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vxor_v(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vxor_v(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vxori_b<const IMM8: u32>(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM8, 8); + __lsx_vxori_b(a, IMM8) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitsel_v(a: v16u8, b: v16u8, c: v16u8) -> v16u8 { + __lsx_vbitsel_v(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbitseli_b<const IMM8: u32>(a: v16u8, b: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM8, 8); + __lsx_vbitseli_b(a, b, IMM8) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vshuf4i_b<const IMM8: u32>(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM8, 8); + __lsx_vshuf4i_b(a, IMM8) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vshuf4i_h<const IMM8: u32>(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM8, 8); + __lsx_vshuf4i_h(a, IMM8) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vshuf4i_w<const IMM8: u32>(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM8, 8); + __lsx_vshuf4i_w(a, IMM8) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch",
issue = "117427")] +pub unsafe fn lsx_vreplgr2vr_b(a: i32) -> v16i8 { + __lsx_vreplgr2vr_b(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vreplgr2vr_h(a: i32) -> v8i16 { + __lsx_vreplgr2vr_h(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vreplgr2vr_w(a: i32) -> v4i32 { + __lsx_vreplgr2vr_w(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vreplgr2vr_d(a: i64) -> v2i64 { + __lsx_vreplgr2vr_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpcnt_b(a: v16i8) -> v16i8 { + __lsx_vpcnt_b(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpcnt_h(a: v8i16) -> v8i16 { + __lsx_vpcnt_h(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpcnt_w(a: v4i32) -> v4i32 { + __lsx_vpcnt_w(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpcnt_d(a: v2i64) -> v2i64 { + __lsx_vpcnt_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vclo_b(a: v16i8) -> v16i8 { + __lsx_vclo_b(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vclo_h(a: v8i16) -> v8i16 { + __lsx_vclo_h(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vclo_w(a: v4i32) -> v4i32 { + __lsx_vclo_w(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vclo_d(a: v2i64) -> v2i64 { + __lsx_vclo_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vclz_b(a: v16i8) -> v16i8 { + __lsx_vclz_b(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vclz_h(a: v8i16) -> v8i16 { + __lsx_vclz_h(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vclz_w(a: v4i32) -> v4i32 { + __lsx_vclz_w(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vclz_d(a: v2i64) -> v2i64 { + __lsx_vclz_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickve2gr_b(a: v16i8) -> i32 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vpickve2gr_b(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickve2gr_h(a: v8i16) -> i32 { + static_assert_uimm_bits!(IMM3, 3); + __lsx_vpickve2gr_h(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickve2gr_w(a: v4i32) -> i32 { + 
static_assert_uimm_bits!(IMM2, 2); + __lsx_vpickve2gr_w(a, IMM2) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickve2gr_d<const IMM1: u32>(a: v2i64) -> i64 { + static_assert_uimm_bits!(IMM1, 1); + __lsx_vpickve2gr_d(a, IMM1) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickve2gr_bu<const IMM4: u32>(a: v16i8) -> u32 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vpickve2gr_bu(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickve2gr_hu<const IMM3: u32>(a: v8i16) -> u32 { + static_assert_uimm_bits!(IMM3, 3); + __lsx_vpickve2gr_hu(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickve2gr_wu<const IMM2: u32>(a: v4i32) -> u32 { + static_assert_uimm_bits!(IMM2, 2); + __lsx_vpickve2gr_wu(a, IMM2) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpickve2gr_du<const IMM1: u32>(a: v2i64) -> u64 { + static_assert_uimm_bits!(IMM1, 1); + __lsx_vpickve2gr_du(a, IMM1) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vinsgr2vr_b<const IMM4: u32>(a: v16i8, b: i32) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vinsgr2vr_b(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vinsgr2vr_h<const IMM3: u32>(a: v8i16, b: i32) -> v8i16 { + static_assert_uimm_bits!(IMM3, 3); + __lsx_vinsgr2vr_h(a, b, IMM3) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vinsgr2vr_w<const IMM2: u32>(a: v4i32, b: i32) -> v4i32 { + static_assert_uimm_bits!(IMM2, 2); + __lsx_vinsgr2vr_w(a, b, IMM2) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vinsgr2vr_d<const IMM1: u32>(a: v2i64, b: i64) -> v2i64 { + static_assert_uimm_bits!(IMM1, 1); + __lsx_vinsgr2vr_d(a, b, IMM1) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfadd_s(a: v4f32, b: v4f32) -> v4f32 { + __lsx_vfadd_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfadd_d(a: v2f64, b: v2f64) -> v2f64 { + __lsx_vfadd_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfsub_s(a: v4f32, b: v4f32) -> v4f32 { + __lsx_vfsub_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfsub_d(a: v2f64, b: v2f64) -> v2f64 { + __lsx_vfsub_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfmul_s(a: v4f32, b: v4f32) -> v4f32 { + __lsx_vfmul_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature =
"stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfmul_d(a: v2f64, b: v2f64) -> v2f64 { + __lsx_vfmul_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfdiv_s(a: v4f32, b: v4f32) -> v4f32 { + __lsx_vfdiv_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfdiv_d(a: v2f64, b: v2f64) -> v2f64 { + __lsx_vfdiv_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcvt_h_s(a: v4f32, b: v4f32) -> v8i16 { + __lsx_vfcvt_h_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcvt_s_d(a: v2f64, b: v2f64) -> v4f32 { + __lsx_vfcvt_s_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfmin_s(a: v4f32, b: v4f32) -> v4f32 { + __lsx_vfmin_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfmin_d(a: v2f64, b: v2f64) -> v2f64 { + __lsx_vfmin_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfmina_s(a: v4f32, b: v4f32) -> v4f32 { + __lsx_vfmina_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfmina_d(a: v2f64, b: v2f64) -> v2f64 { + __lsx_vfmina_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfmax_s(a: v4f32, b: v4f32) -> v4f32 { + __lsx_vfmax_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfmax_d(a: v2f64, b: v2f64) -> v2f64 { + __lsx_vfmax_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfmaxa_s(a: v4f32, b: v4f32) -> v4f32 { + __lsx_vfmaxa_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfmaxa_d(a: v2f64, b: v2f64) -> v2f64 { + __lsx_vfmaxa_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfclass_s(a: v4f32) -> v4i32 { + __lsx_vfclass_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfclass_d(a: v2f64) -> v2i64 { + __lsx_vfclass_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfsqrt_s(a: v4f32) -> v4f32 { + __lsx_vfsqrt_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfsqrt_d(a: v2f64) -> v2f64 { + __lsx_vfsqrt_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrecip_s(a: v4f32) -> v4f32 { + __lsx_vfrecip_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrecip_d(a: v2f64) -> v2f64 { + __lsx_vfrecip_d(a) +} 
+ +#[inline] +#[target_feature(enable = "lsx,frecipe")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrecipe_s(a: v4f32) -> v4f32 { + __lsx_vfrecipe_s(a) +} + +#[inline] +#[target_feature(enable = "lsx,frecipe")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrecipe_d(a: v2f64) -> v2f64 { + __lsx_vfrecipe_d(a) +} + +#[inline] +#[target_feature(enable = "lsx,frecipe")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrsqrte_s(a: v4f32) -> v4f32 { + __lsx_vfrsqrte_s(a) +} + +#[inline] +#[target_feature(enable = "lsx,frecipe")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrsqrte_d(a: v2f64) -> v2f64 { + __lsx_vfrsqrte_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrint_s(a: v4f32) -> v4f32 { + __lsx_vfrint_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrint_d(a: v2f64) -> v2f64 { + __lsx_vfrint_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrsqrt_s(a: v4f32) -> v4f32 { + __lsx_vfrsqrt_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrsqrt_d(a: v2f64) -> v2f64 { + __lsx_vfrsqrt_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vflogb_s(a: v4f32) -> v4f32 { + __lsx_vflogb_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vflogb_d(a: v2f64) -> v2f64 { + __lsx_vflogb_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcvth_s_h(a: v8i16) -> v4f32 { + __lsx_vfcvth_s_h(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcvth_d_s(a: v4f32) -> v2f64 { + __lsx_vfcvth_d_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcvtl_s_h(a: v8i16) -> v4f32 { + __lsx_vfcvtl_s_h(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcvtl_d_s(a: v4f32) -> v2f64 { + __lsx_vfcvtl_d_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftint_w_s(a: v4f32) -> v4i32 { + __lsx_vftint_w_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftint_l_d(a: v2f64) -> v2i64 { + __lsx_vftint_l_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftint_wu_s(a: v4f32) -> v4u32 { + __lsx_vftint_wu_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftint_lu_d(a: v2f64) -> v2u64 { + __lsx_vftint_lu_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrz_w_s(a: v4f32) -> v4i32 { + __lsx_vftintrz_w_s(a) +} 
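For the float-to-integer conversions just added, `lsx_vftint_w_s` rounds according to the current floating-point rounding mode, while the `rz`-suffixed variants truncate toward zero, roughly what an `as` cast does for in-range values. A sketch under the same assumptions as above:

```rust
#[cfg(target_arch = "loongarch64")]
#[target_feature(enable = "lsx")]
unsafe fn f32x4_to_i32x4_truncating(v: v4f32) -> v4i32 {
    // Truncates each lane toward zero; out-of-range or NaN lanes follow the
    // LSX/FCSR rules, which may differ from Rust's saturating scalar cast.
    unsafe { lsx_vftintrz_w_s(v) }
}
```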
+ +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrz_l_d(a: v2f64) -> v2i64 { + __lsx_vftintrz_l_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrz_wu_s(a: v4f32) -> v4u32 { + __lsx_vftintrz_wu_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrz_lu_d(a: v2f64) -> v2u64 { + __lsx_vftintrz_lu_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vffint_s_w(a: v4i32) -> v4f32 { + __lsx_vffint_s_w(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vffint_d_l(a: v2i64) -> v2f64 { + __lsx_vffint_d_l(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vffint_s_wu(a: v4u32) -> v4f32 { + __lsx_vffint_s_wu(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vffint_d_lu(a: v2u64) -> v2f64 { + __lsx_vffint_d_lu(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vandn_v(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vandn_v(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vneg_b(a: v16i8) -> v16i8 { + __lsx_vneg_b(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vneg_h(a: v8i16) -> v8i16 { + __lsx_vneg_h(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vneg_w(a: v4i32) -> v4i32 { + __lsx_vneg_w(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vneg_d(a: v2i64) -> v2i64 { + __lsx_vneg_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmuh_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vmuh_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmuh_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vmuh_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmuh_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vmuh_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmuh_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vmuh_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmuh_bu(a: v16u8, b: v16u8) -> v16u8 { + __lsx_vmuh_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmuh_hu(a: v8u16, b: v8u16) -> v8u16 { + __lsx_vmuh_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmuh_wu(a: v4u32, b: v4u32) -> v4u32 { + 
__lsx_vmuh_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmuh_du(a: v2u64, b: v2u64) -> v2u64 { + __lsx_vmuh_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsllwil_h_b<const IMM3: u32>(a: v16i8) -> v8i16 { + static_assert_uimm_bits!(IMM3, 3); + __lsx_vsllwil_h_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsllwil_w_h<const IMM4: u32>(a: v8i16) -> v4i32 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vsllwil_w_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsllwil_d_w<const IMM5: u32>(a: v4i32) -> v2i64 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vsllwil_d_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsllwil_hu_bu<const IMM3: u32>(a: v16u8) -> v8u16 { + static_assert_uimm_bits!(IMM3, 3); + __lsx_vsllwil_hu_bu(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsllwil_wu_hu<const IMM4: u32>(a: v8u16) -> v4u32 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vsllwil_wu_hu(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsllwil_du_wu<const IMM5: u32>(a: v4u32) -> v2u64 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vsllwil_du_wu(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsran_b_h(a: v8i16, b: v8i16) -> v16i8 { + __lsx_vsran_b_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsran_h_w(a: v4i32, b: v4i32) -> v8i16 { + __lsx_vsran_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsran_w_d(a: v2i64, b: v2i64) -> v4i32 { + __lsx_vsran_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssran_b_h(a: v8i16, b: v8i16) -> v16i8 { + __lsx_vssran_b_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssran_h_w(a: v4i32, b: v4i32) -> v8i16 { + __lsx_vssran_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssran_w_d(a: v2i64, b: v2i64) -> v4i32 { + __lsx_vssran_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssran_bu_h(a: v8u16, b: v8u16) -> v16u8 { + __lsx_vssran_bu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssran_hu_w(a: v4u32, b: v4u32) -> v8u16 { + __lsx_vssran_hu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn
lsx_vssran_wu_d(a: v2u64, b: v2u64) -> v4u32 { + __lsx_vssran_wu_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrarn_b_h(a: v8i16, b: v8i16) -> v16i8 { + __lsx_vsrarn_b_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrarn_h_w(a: v4i32, b: v4i32) -> v8i16 { + __lsx_vsrarn_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrarn_w_d(a: v2i64, b: v2i64) -> v4i32 { + __lsx_vsrarn_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrarn_b_h(a: v8i16, b: v8i16) -> v16i8 { + __lsx_vssrarn_b_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrarn_h_w(a: v4i32, b: v4i32) -> v8i16 { + __lsx_vssrarn_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrarn_w_d(a: v2i64, b: v2i64) -> v4i32 { + __lsx_vssrarn_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrarn_bu_h(a: v8u16, b: v8u16) -> v16u8 { + __lsx_vssrarn_bu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrarn_hu_w(a: v4u32, b: v4u32) -> v8u16 { + __lsx_vssrarn_hu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrarn_wu_d(a: v2u64, b: v2u64) -> v4u32 { + __lsx_vssrarn_wu_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrln_b_h(a: v8i16, b: v8i16) -> v16i8 { + __lsx_vsrln_b_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrln_h_w(a: v4i32, b: v4i32) -> v8i16 { + __lsx_vsrln_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrln_w_d(a: v2i64, b: v2i64) -> v4i32 { + __lsx_vsrln_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrln_bu_h(a: v8u16, b: v8u16) -> v16u8 { + __lsx_vssrln_bu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrln_hu_w(a: v4u32, b: v4u32) -> v8u16 { + __lsx_vssrln_hu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrln_wu_d(a: v2u64, b: v2u64) -> v4u32 { + __lsx_vssrln_wu_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlrn_b_h(a: v8i16, b: v8i16) -> v16i8 { + __lsx_vsrlrn_b_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlrn_h_w(a: v4i32, b: v4i32) -> v8i16 { + __lsx_vsrlrn_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = 
"stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlrn_w_d(a: v2i64, b: v2i64) -> v4i32 { + __lsx_vsrlrn_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlrn_bu_h(a: v8u16, b: v8u16) -> v16u8 { + __lsx_vssrlrn_bu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlrn_hu_w(a: v4u32, b: v4u32) -> v8u16 { + __lsx_vssrlrn_hu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlrn_wu_d(a: v2u64, b: v2u64) -> v4u32 { + __lsx_vssrlrn_wu_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrstpi_b(a: v16i8, b: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vfrstpi_b(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrstpi_h(a: v8i16, b: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vfrstpi_h(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrstp_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8 { + __lsx_vfrstp_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrstp_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 { + __lsx_vfrstp_h(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vshuf4i_d(a: v2i64, b: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM8, 8); + __lsx_vshuf4i_d(a, b, IMM8) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbsrl_v(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vbsrl_v(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vbsll_v(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vbsll_v(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vextrins_b(a: v16i8, b: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM8, 8); + __lsx_vextrins_b(a, b, IMM8) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vextrins_h(a: v8i16, b: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM8, 8); + __lsx_vextrins_h(a, b, IMM8) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vextrins_w(a: v4i32, b: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM8, 8); + __lsx_vextrins_w(a, b, IMM8) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vextrins_d(a: v2i64, b: v2i64) -> v2i64 { + 
static_assert_uimm_bits!(IMM8, 8); + __lsx_vextrins_d(a, b, IMM8) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmskltz_b(a: v16i8) -> v16i8 { + __lsx_vmskltz_b(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmskltz_h(a: v8i16) -> v8i16 { + __lsx_vmskltz_h(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmskltz_w(a: v4i32) -> v4i32 { + __lsx_vmskltz_w(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmskltz_d(a: v2i64) -> v2i64 { + __lsx_vmskltz_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsigncov_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vsigncov_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsigncov_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vsigncov_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsigncov_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vsigncov_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsigncov_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vsigncov_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfmadd_s(a: v4f32, b: v4f32, c: v4f32) -> v4f32 { + __lsx_vfmadd_s(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfmadd_d(a: v2f64, b: v2f64, c: v2f64) -> v2f64 { + __lsx_vfmadd_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfmsub_s(a: v4f32, b: v4f32, c: v4f32) -> v4f32 { + __lsx_vfmsub_s(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfmsub_d(a: v2f64, b: v2f64, c: v2f64) -> v2f64 { + __lsx_vfmsub_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfnmadd_s(a: v4f32, b: v4f32, c: v4f32) -> v4f32 { + __lsx_vfnmadd_s(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfnmadd_d(a: v2f64, b: v2f64, c: v2f64) -> v2f64 { + __lsx_vfnmadd_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfnmsub_s(a: v4f32, b: v4f32, c: v4f32) -> v4f32 { + __lsx_vfnmsub_s(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfnmsub_d(a: v2f64, b: v2f64, c: v2f64) -> v2f64 { + __lsx_vfnmsub_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrne_w_s(a: v4f32) -> v4i32 { + __lsx_vftintrne_w_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = 
"stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrne_l_d(a: v2f64) -> v2i64 { + __lsx_vftintrne_l_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrp_w_s(a: v4f32) -> v4i32 { + __lsx_vftintrp_w_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrp_l_d(a: v2f64) -> v2i64 { + __lsx_vftintrp_l_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrm_w_s(a: v4f32) -> v4i32 { + __lsx_vftintrm_w_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrm_l_d(a: v2f64) -> v2i64 { + __lsx_vftintrm_l_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftint_w_d(a: v2f64, b: v2f64) -> v4i32 { + __lsx_vftint_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vffint_s_l(a: v2i64, b: v2i64) -> v4f32 { + __lsx_vffint_s_l(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrz_w_d(a: v2f64, b: v2f64) -> v4i32 { + __lsx_vftintrz_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrp_w_d(a: v2f64, b: v2f64) -> v4i32 { + __lsx_vftintrp_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrm_w_d(a: v2f64, b: v2f64) -> v4i32 { + __lsx_vftintrm_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrne_w_d(a: v2f64, b: v2f64) -> v4i32 { + __lsx_vftintrne_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintl_l_s(a: v4f32) -> v2i64 { + __lsx_vftintl_l_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftinth_l_s(a: v4f32) -> v2i64 { + __lsx_vftinth_l_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vffinth_d_w(a: v4i32) -> v2f64 { + __lsx_vffinth_d_w(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vffintl_d_w(a: v4i32) -> v2f64 { + __lsx_vffintl_d_w(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrzl_l_s(a: v4f32) -> v2i64 { + __lsx_vftintrzl_l_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrzh_l_s(a: v4f32) -> v2i64 { + __lsx_vftintrzh_l_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrpl_l_s(a: v4f32) -> v2i64 { + __lsx_vftintrpl_l_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn 
lsx_vftintrph_l_s(a: v4f32) -> v2i64 { + __lsx_vftintrph_l_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrml_l_s(a: v4f32) -> v2i64 { + __lsx_vftintrml_l_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrmh_l_s(a: v4f32) -> v2i64 { + __lsx_vftintrmh_l_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrnel_l_s(a: v4f32) -> v2i64 { + __lsx_vftintrnel_l_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vftintrneh_l_s(a: v4f32) -> v2i64 { + __lsx_vftintrneh_l_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrintrne_s(a: v4f32) -> v4f32 { + __lsx_vfrintrne_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrintrne_d(a: v2f64) -> v2f64 { + __lsx_vfrintrne_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrintrz_s(a: v4f32) -> v4f32 { + __lsx_vfrintrz_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrintrz_d(a: v2f64) -> v2f64 { + __lsx_vfrintrz_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrintrp_s(a: v4f32) -> v4f32 { + __lsx_vfrintrp_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrintrp_d(a: v2f64) -> v2f64 { + __lsx_vfrintrp_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrintrm_s(a: v4f32) -> v4f32 { + __lsx_vfrintrm_s(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfrintrm_d(a: v2f64) -> v2f64 { + __lsx_vfrintrm_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vstelm_b(a: v16i8, mem_addr: *mut i8) { + static_assert_simm_bits!(IMM_S8, 8); + static_assert_uimm_bits!(IMM4, 4); + __lsx_vstelm_b(a, mem_addr, IMM_S8, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vstelm_h(a: v8i16, mem_addr: *mut i8) { + static_assert_simm_bits!(IMM_S8, 8); + static_assert_uimm_bits!(IMM3, 3); + __lsx_vstelm_h(a, mem_addr, IMM_S8, IMM3) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vstelm_w(a: v4i32, mem_addr: *mut i8) { + static_assert_simm_bits!(IMM_S8, 8); + static_assert_uimm_bits!(IMM2, 2); + __lsx_vstelm_w(a, mem_addr, IMM_S8, IMM2) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vstelm_d(a: v2i64, mem_addr: *mut i8) { + 
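// vstelm.d stores a single i64 lane of `a` (selected by the second immediate) to
// `mem_addr` plus a small signed immediate offset; both immediates are const-generic
// parameters and are range-checked by the assertions below before reaching the builtin.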
static_assert_simm_bits!(IMM_S8, 8); + static_assert_uimm_bits!(IMM1, 1); + __lsx_vstelm_d(a, mem_addr, IMM_S8, IMM1) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwev_d_w(a: v4i32, b: v4i32) -> v2i64 { + __lsx_vaddwev_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwev_w_h(a: v8i16, b: v8i16) -> v4i32 { + __lsx_vaddwev_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwev_h_b(a: v16i8, b: v16i8) -> v8i16 { + __lsx_vaddwev_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwod_d_w(a: v4i32, b: v4i32) -> v2i64 { + __lsx_vaddwod_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwod_w_h(a: v8i16, b: v8i16) -> v4i32 { + __lsx_vaddwod_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwod_h_b(a: v16i8, b: v16i8) -> v8i16 { + __lsx_vaddwod_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwev_d_wu(a: v4u32, b: v4u32) -> v2i64 { + __lsx_vaddwev_d_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwev_w_hu(a: v8u16, b: v8u16) -> v4i32 { + __lsx_vaddwev_w_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwev_h_bu(a: v16u8, b: v16u8) -> v8i16 { + __lsx_vaddwev_h_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwod_d_wu(a: v4u32, b: v4u32) -> v2i64 { + __lsx_vaddwod_d_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwod_w_hu(a: v8u16, b: v8u16) -> v4i32 { + __lsx_vaddwod_w_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwod_h_bu(a: v16u8, b: v16u8) -> v8i16 { + __lsx_vaddwod_h_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwev_d_wu_w(a: v4u32, b: v4i32) -> v2i64 { + __lsx_vaddwev_d_wu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwev_w_hu_h(a: v8u16, b: v8i16) -> v4i32 { + __lsx_vaddwev_w_hu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwev_h_bu_b(a: v16u8, b: v16i8) -> v8i16 { + __lsx_vaddwev_h_bu_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwod_d_wu_w(a: v4u32, b: v4i32) -> v2i64 { + __lsx_vaddwod_d_wu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwod_w_hu_h(a: v8u16, b: v8i16) -> v4i32 { + 
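// Per the mnemonic, vaddwod.w.hu.h widens the odd-indexed 16-bit lanes (unsigned from
// `a`, signed from `b`) to 32 bits before adding; the vaddwev counterparts above use
// the even-indexed lanes instead.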
__lsx_vaddwod_w_hu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwod_h_bu_b(a: v16u8, b: v16i8) -> v8i16 { + __lsx_vaddwod_h_bu_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwev_d_w(a: v4i32, b: v4i32) -> v2i64 { + __lsx_vsubwev_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwev_w_h(a: v8i16, b: v8i16) -> v4i32 { + __lsx_vsubwev_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwev_h_b(a: v16i8, b: v16i8) -> v8i16 { + __lsx_vsubwev_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwod_d_w(a: v4i32, b: v4i32) -> v2i64 { + __lsx_vsubwod_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwod_w_h(a: v8i16, b: v8i16) -> v4i32 { + __lsx_vsubwod_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwod_h_b(a: v16i8, b: v16i8) -> v8i16 { + __lsx_vsubwod_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwev_d_wu(a: v4u32, b: v4u32) -> v2i64 { + __lsx_vsubwev_d_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwev_w_hu(a: v8u16, b: v8u16) -> v4i32 { + __lsx_vsubwev_w_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwev_h_bu(a: v16u8, b: v16u8) -> v8i16 { + __lsx_vsubwev_h_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwod_d_wu(a: v4u32, b: v4u32) -> v2i64 { + __lsx_vsubwod_d_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwod_w_hu(a: v8u16, b: v8u16) -> v4i32 { + __lsx_vsubwod_w_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwod_h_bu(a: v16u8, b: v16u8) -> v8i16 { + __lsx_vsubwod_h_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwev_q_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vaddwev_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwod_q_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vaddwod_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwev_q_du(a: v2u64, b: v2u64) -> v2i64 { + __lsx_vaddwev_q_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwod_q_du(a: v2u64, b: v2u64) -> v2i64 { + __lsx_vaddwod_q_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = 
"stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwev_q_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vsubwev_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwod_q_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vsubwod_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwev_q_du(a: v2u64, b: v2u64) -> v2i64 { + __lsx_vsubwev_q_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsubwod_q_du(a: v2u64, b: v2u64) -> v2i64 { + __lsx_vsubwod_q_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwev_q_du_d(a: v2u64, b: v2i64) -> v2i64 { + __lsx_vaddwev_q_du_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vaddwod_q_du_d(a: v2u64, b: v2i64) -> v2i64 { + __lsx_vaddwod_q_du_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwev_d_w(a: v4i32, b: v4i32) -> v2i64 { + __lsx_vmulwev_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwev_w_h(a: v8i16, b: v8i16) -> v4i32 { + __lsx_vmulwev_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwev_h_b(a: v16i8, b: v16i8) -> v8i16 { + __lsx_vmulwev_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwod_d_w(a: v4i32, b: v4i32) -> v2i64 { + __lsx_vmulwod_d_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwod_w_h(a: v8i16, b: v8i16) -> v4i32 { + __lsx_vmulwod_w_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwod_h_b(a: v16i8, b: v16i8) -> v8i16 { + __lsx_vmulwod_h_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwev_d_wu(a: v4u32, b: v4u32) -> v2i64 { + __lsx_vmulwev_d_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwev_w_hu(a: v8u16, b: v8u16) -> v4i32 { + __lsx_vmulwev_w_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwev_h_bu(a: v16u8, b: v16u8) -> v8i16 { + __lsx_vmulwev_h_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwod_d_wu(a: v4u32, b: v4u32) -> v2i64 { + __lsx_vmulwod_d_wu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwod_w_hu(a: v8u16, b: v8u16) -> v4i32 { + __lsx_vmulwod_w_hu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwod_h_bu(a: v16u8, b: v16u8) -> v8i16 { + 
__lsx_vmulwod_h_bu(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwev_d_wu_w(a: v4u32, b: v4i32) -> v2i64 { + __lsx_vmulwev_d_wu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwev_w_hu_h(a: v8u16, b: v8i16) -> v4i32 { + __lsx_vmulwev_w_hu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwev_h_bu_b(a: v16u8, b: v16i8) -> v8i16 { + __lsx_vmulwev_h_bu_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwod_d_wu_w(a: v4u32, b: v4i32) -> v2i64 { + __lsx_vmulwod_d_wu_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwod_w_hu_h(a: v8u16, b: v8i16) -> v4i32 { + __lsx_vmulwod_w_hu_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwod_h_bu_b(a: v16u8, b: v16i8) -> v8i16 { + __lsx_vmulwod_h_bu_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwev_q_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vmulwev_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwod_q_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vmulwod_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwev_q_du(a: v2u64, b: v2u64) -> v2i64 { + __lsx_vmulwev_q_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwod_q_du(a: v2u64, b: v2u64) -> v2i64 { + __lsx_vmulwod_q_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwev_q_du_d(a: v2u64, b: v2i64) -> v2i64 { + __lsx_vmulwev_q_du_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmulwod_q_du_d(a: v2u64, b: v2i64) -> v2i64 { + __lsx_vmulwod_q_du_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhaddw_q_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vhaddw_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhaddw_qu_du(a: v2u64, b: v2u64) -> v2u64 { + __lsx_vhaddw_qu_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhsubw_q_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vhsubw_q_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vhsubw_qu_du(a: v2u64, b: v2u64) -> v2u64 { + __lsx_vhsubw_qu_du(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwev_d_w(a: v2i64, b: v4i32, c: v4i32) -> v2i64 { + __lsx_vmaddwev_d_w(a, b, c) +} + +#[inline] +#[target_feature(enable = 
"lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwev_w_h(a: v4i32, b: v8i16, c: v8i16) -> v4i32 { + __lsx_vmaddwev_w_h(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwev_h_b(a: v8i16, b: v16i8, c: v16i8) -> v8i16 { + __lsx_vmaddwev_h_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwev_d_wu(a: v2u64, b: v4u32, c: v4u32) -> v2u64 { + __lsx_vmaddwev_d_wu(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwev_w_hu(a: v4u32, b: v8u16, c: v8u16) -> v4u32 { + __lsx_vmaddwev_w_hu(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwev_h_bu(a: v8u16, b: v16u8, c: v16u8) -> v8u16 { + __lsx_vmaddwev_h_bu(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwod_d_w(a: v2i64, b: v4i32, c: v4i32) -> v2i64 { + __lsx_vmaddwod_d_w(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwod_w_h(a: v4i32, b: v8i16, c: v8i16) -> v4i32 { + __lsx_vmaddwod_w_h(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwod_h_b(a: v8i16, b: v16i8, c: v16i8) -> v8i16 { + __lsx_vmaddwod_h_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwod_d_wu(a: v2u64, b: v4u32, c: v4u32) -> v2u64 { + __lsx_vmaddwod_d_wu(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwod_w_hu(a: v4u32, b: v8u16, c: v8u16) -> v4u32 { + __lsx_vmaddwod_w_hu(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwod_h_bu(a: v8u16, b: v16u8, c: v16u8) -> v8u16 { + __lsx_vmaddwod_h_bu(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwev_d_wu_w(a: v2i64, b: v4u32, c: v4i32) -> v2i64 { + __lsx_vmaddwev_d_wu_w(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwev_w_hu_h(a: v4i32, b: v8u16, c: v8i16) -> v4i32 { + __lsx_vmaddwev_w_hu_h(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwev_h_bu_b(a: v8i16, b: v16u8, c: v16i8) -> v8i16 { + __lsx_vmaddwev_h_bu_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwod_d_wu_w(a: v2i64, b: v4u32, c: v4i32) -> v2i64 { + __lsx_vmaddwod_d_wu_w(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwod_w_hu_h(a: v4i32, b: v8u16, c: v8i16) -> v4i32 { + __lsx_vmaddwod_w_hu_h(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue 
= "117427")] +pub unsafe fn lsx_vmaddwod_h_bu_b(a: v8i16, b: v16u8, c: v16i8) -> v8i16 { + __lsx_vmaddwod_h_bu_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwev_q_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64 { + __lsx_vmaddwev_q_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwod_q_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64 { + __lsx_vmaddwod_q_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwev_q_du(a: v2u64, b: v2u64, c: v2u64) -> v2u64 { + __lsx_vmaddwev_q_du(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwod_q_du(a: v2u64, b: v2u64, c: v2u64) -> v2u64 { + __lsx_vmaddwod_q_du(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwev_q_du_d(a: v2i64, b: v2u64, c: v2i64) -> v2i64 { + __lsx_vmaddwev_q_du_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmaddwod_q_du_d(a: v2i64, b: v2u64, c: v2i64) -> v2i64 { + __lsx_vmaddwod_q_du_d(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vrotr_b(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vrotr_b(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vrotr_h(a: v8i16, b: v8i16) -> v8i16 { + __lsx_vrotr_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vrotr_w(a: v4i32, b: v4i32) -> v4i32 { + __lsx_vrotr_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vrotr_d(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vrotr_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vadd_q(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vadd_q(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsub_q(a: v2i64, b: v2i64) -> v2i64 { + __lsx_vsub_q(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vldrepl_b(mem_addr: *const i8) -> v16i8 { + static_assert_simm_bits!(IMM_S12, 12); + __lsx_vldrepl_b(mem_addr, IMM_S12) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vldrepl_h(mem_addr: *const i8) -> v8i16 { + static_assert_simm_bits!(IMM_S11, 11); + __lsx_vldrepl_h(mem_addr, IMM_S11) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vldrepl_w(mem_addr: *const i8) -> v4i32 { + static_assert_simm_bits!(IMM_S10, 10); + __lsx_vldrepl_w(mem_addr, IMM_S10) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature 
= "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vldrepl_d(mem_addr: *const i8) -> v2i64 { + static_assert_simm_bits!(IMM_S9, 9); + __lsx_vldrepl_d(mem_addr, IMM_S9) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmskgez_b(a: v16i8) -> v16i8 { + __lsx_vmskgez_b(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vmsknz_b(a: v16i8) -> v16i8 { + __lsx_vmsknz_b(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vexth_h_b(a: v16i8) -> v8i16 { + __lsx_vexth_h_b(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vexth_w_h(a: v8i16) -> v4i32 { + __lsx_vexth_w_h(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vexth_d_w(a: v4i32) -> v2i64 { + __lsx_vexth_d_w(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vexth_q_d(a: v2i64) -> v2i64 { + __lsx_vexth_q_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vexth_hu_bu(a: v16u8) -> v8u16 { + __lsx_vexth_hu_bu(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vexth_wu_hu(a: v8u16) -> v4u32 { + __lsx_vexth_wu_hu(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vexth_du_wu(a: v4u32) -> v2u64 { + __lsx_vexth_du_wu(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vexth_qu_du(a: v2u64) -> v2u64 { + __lsx_vexth_qu_du(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vrotri_b(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM3, 3); + __lsx_vrotri_b(a, IMM3) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vrotri_h(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vrotri_h(a, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vrotri_w(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vrotri_w(a, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vrotri_d(a: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vrotri_d(a, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vextl_q_d(a: v2i64) -> v2i64 { + __lsx_vextl_q_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlni_b_h(a: v16i8, b: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vsrlni_b_h(a, b, IMM4) +} + +#[inline] 
+#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlni_h_w(a: v8i16, b: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vsrlni_h_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlni_w_d(a: v4i32, b: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vsrlni_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlni_d_q(a: v2i64, b: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM7, 7); + __lsx_vsrlni_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlrni_b_h(a: v16i8, b: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vsrlrni_b_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlrni_h_w(a: v8i16, b: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vsrlrni_h_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlrni_w_d(a: v4i32, b: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vsrlrni_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrlrni_d_q(a: v2i64, b: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM7, 7); + __lsx_vsrlrni_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlni_b_h(a: v16i8, b: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vssrlni_b_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlni_h_w(a: v8i16, b: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vssrlni_h_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlni_w_d(a: v4i32, b: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vssrlni_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlni_d_q(a: v2i64, b: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM7, 7); + __lsx_vssrlni_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlni_bu_h(a: v16u8, b: v16i8) -> v16u8 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vssrlni_bu_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlni_hu_w(a: v8u16, b: v8i16) -> v8u16 { + 
static_assert_uimm_bits!(IMM5, 5); + __lsx_vssrlni_hu_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlni_wu_d(a: v4u32, b: v4i32) -> v4u32 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vssrlni_wu_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlni_du_q(a: v2u64, b: v2i64) -> v2u64 { + static_assert_uimm_bits!(IMM7, 7); + __lsx_vssrlni_du_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlrni_b_h(a: v16i8, b: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vssrlrni_b_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlrni_h_w(a: v8i16, b: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vssrlrni_h_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlrni_w_d(a: v4i32, b: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vssrlrni_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlrni_d_q(a: v2i64, b: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM7, 7); + __lsx_vssrlrni_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlrni_bu_h(a: v16u8, b: v16i8) -> v16u8 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vssrlrni_bu_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlrni_hu_w(a: v8u16, b: v8i16) -> v8u16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vssrlrni_hu_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlrni_wu_d(a: v4u32, b: v4i32) -> v4u32 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vssrlrni_wu_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlrni_du_q(a: v2u64, b: v2i64) -> v2u64 { + static_assert_uimm_bits!(IMM7, 7); + __lsx_vssrlrni_du_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrani_b_h(a: v16i8, b: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vsrani_b_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrani_h_w(a: v8i16, b: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vsrani_h_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = 
"stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrani_w_d(a: v4i32, b: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vsrani_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrani_d_q(a: v2i64, b: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM7, 7); + __lsx_vsrani_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrarni_b_h(a: v16i8, b: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vsrarni_b_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrarni_h_w(a: v8i16, b: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vsrarni_h_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrarni_w_d(a: v4i32, b: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vsrarni_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vsrarni_d_q(a: v2i64, b: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM7, 7); + __lsx_vsrarni_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrani_b_h(a: v16i8, b: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vssrani_b_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrani_h_w(a: v8i16, b: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vssrani_h_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrani_w_d(a: v4i32, b: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vssrani_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrani_d_q(a: v2i64, b: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM7, 7); + __lsx_vssrani_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrani_bu_h(a: v16u8, b: v16i8) -> v16u8 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vssrani_bu_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrani_hu_w(a: v8u16, b: v8i16) -> v8u16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vssrani_hu_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrani_wu_d(a: v4u32, b: v4i32) -> v4u32 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vssrani_wu_d(a, b, IMM6) +} + +#[inline] 
+#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrani_du_q(a: v2u64, b: v2i64) -> v2u64 { + static_assert_uimm_bits!(IMM7, 7); + __lsx_vssrani_du_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrarni_b_h(a: v16i8, b: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vssrarni_b_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrarni_h_w(a: v8i16, b: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vssrarni_h_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrarni_w_d(a: v4i32, b: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vssrarni_w_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrarni_d_q(a: v2i64, b: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM7, 7); + __lsx_vssrarni_d_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrarni_bu_h(a: v16u8, b: v16i8) -> v16u8 { + static_assert_uimm_bits!(IMM4, 4); + __lsx_vssrarni_bu_h(a, b, IMM4) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrarni_hu_w(a: v8u16, b: v8i16) -> v8u16 { + static_assert_uimm_bits!(IMM5, 5); + __lsx_vssrarni_hu_w(a, b, IMM5) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrarni_wu_d(a: v4u32, b: v4i32) -> v4u32 { + static_assert_uimm_bits!(IMM6, 6); + __lsx_vssrarni_wu_d(a, b, IMM6) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrarni_du_q(a: v2u64, b: v2i64) -> v2u64 { + static_assert_uimm_bits!(IMM7, 7); + __lsx_vssrarni_du_q(a, b, IMM7) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vpermi_w(a: v4i32, b: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM8, 8); + __lsx_vpermi_w(a, b, IMM8) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vld(mem_addr: *const i8) -> v16i8 { + static_assert_simm_bits!(IMM_S12, 12); + __lsx_vld(mem_addr, IMM_S12) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vst(a: v16i8, mem_addr: *mut i8) { + static_assert_simm_bits!(IMM_S12, 12); + __lsx_vst(a, mem_addr, IMM_S12) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlrn_b_h(a: v8i16, b: v8i16) -> v16i8 { + __lsx_vssrlrn_b_h(a, b) +} + 
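Each wrapper above is a thin `#[inline]` shim that forwards to the corresponding `__lsx_*` LLVM builtin, gated on `#[target_feature(enable = "lsx")]`; immediate operands travel as const generics and are range-checked by `static_assert_simm_bits!`/`static_assert_uimm_bits!` before the call. A minimal usage sketch from user code, assuming a nightly toolchain, an LSX-capable loongarch64 target, and that these items surface under `core::arch::loongarch64` as re-exported by the module file below; the `demo` function and the lane values are purely illustrative:

    // crate root:
    #![feature(stdarch_loongarch)]

    #[cfg(target_arch = "loongarch64")]
    #[target_feature(enable = "lsx")]
    unsafe fn demo() -> [i8; 16] {
        use core::arch::loongarch64::*;
        use core::mem::transmute;

        // Reinterpret two constant byte arrays as 16 x i8 vectors.
        let a: v16i8 = transmute([4i8; 16]);
        let b: v16i8 = transmute([1i8; 16]);

        // Register-operand form: shift each i8 lane of `a` left by the amount in `b`.
        let shifted = lsx_vsll_b(a, b);

        // Immediate form: the rotate amount is a const generic, validated at compile
        // time by static_assert_uimm_bits!(IMM3, 3) inside the wrapper.
        let rotated = lsx_vrotri_b::<3>(shifted);

        // The vector types are plain SIMD structs; transmute back to inspect the lanes.
        transmute(rotated)
    }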
+#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlrn_h_w(a: v4i32, b: v4i32) -> v8i16 { + __lsx_vssrlrn_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrlrn_w_d(a: v2i64, b: v2i64) -> v4i32 { + __lsx_vssrlrn_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrln_b_h(a: v8i16, b: v8i16) -> v16i8 { + __lsx_vssrln_b_h(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrln_h_w(a: v4i32, b: v4i32) -> v8i16 { + __lsx_vssrln_h_w(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vssrln_w_d(a: v2i64, b: v2i64) -> v4i32 { + __lsx_vssrln_w_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vorn_v(a: v16i8, b: v16i8) -> v16i8 { + __lsx_vorn_v(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vldi() -> v2i64 { + static_assert_simm_bits!(IMM_S13, 13); + __lsx_vldi(IMM_S13) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vshuf_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8 { + __lsx_vshuf_b(a, b, c) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vldx(mem_addr: *const i8, b: i64) -> v16i8 { + __lsx_vldx(mem_addr, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vstx(a: v16i8, mem_addr: *mut i8, b: i64) { + __lsx_vstx(a, mem_addr, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vextl_qu_du(a: v2u64) -> v2u64 { + __lsx_vextl_qu_du(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_bnz_b(a: v16u8) -> i32 { + __lsx_bnz_b(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_bnz_d(a: v2u64) -> i32 { + __lsx_bnz_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_bnz_h(a: v8u16) -> i32 { + __lsx_bnz_h(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_bnz_v(a: v16u8) -> i32 { + __lsx_bnz_v(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_bnz_w(a: v4u32) -> i32 { + __lsx_bnz_w(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_bz_b(a: v16u8) -> i32 { + __lsx_bz_b(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_bz_d(a: v2u64) -> i32 { + __lsx_bz_d(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = 
"117427")] +pub unsafe fn lsx_bz_h(a: v8u16) -> i32 { + __lsx_bz_h(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_bz_v(a: v16u8) -> i32 { + __lsx_bz_v(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_bz_w(a: v4u32) -> i32 { + __lsx_bz_w(a) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_caf_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_caf_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_caf_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_caf_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_ceq_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_ceq_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_ceq_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_ceq_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_cle_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_cle_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_cle_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_cle_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_clt_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_clt_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_clt_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_clt_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_cne_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_cne_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_cne_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_cne_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_cor_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_cor_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_cor_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_cor_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_cueq_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_cueq_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_cueq_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_cueq_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_cule_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_cule_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub 
unsafe fn lsx_vfcmp_cule_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_cule_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_cult_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_cult_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_cult_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_cult_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_cun_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_cun_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_cune_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_cune_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_cune_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_cune_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_cun_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_cun_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_saf_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_saf_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_saf_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_saf_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_seq_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_seq_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_seq_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_seq_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sle_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_sle_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sle_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_sle_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_slt_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_slt_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_slt_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_slt_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sne_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_sne_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sne_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_sne_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sor_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_sor_d(a, b) +} + +#[inline] +#[target_feature(enable = 
"lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sor_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_sor_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sueq_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_sueq_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sueq_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_sueq_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sule_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_sule_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sule_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_sule_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sult_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_sult_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sult_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_sult_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sun_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_sun_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sune_d(a: v2f64, b: v2f64) -> v2i64 { + __lsx_vfcmp_sune_d(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sune_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_sune_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vfcmp_sun_s(a: v4f32, b: v4f32) -> v4i32 { + __lsx_vfcmp_sun_s(a, b) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vrepli_b() -> v16i8 { + static_assert_simm_bits!(IMM_S10, 10); + __lsx_vrepli_b(IMM_S10) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vrepli_d() -> v2i64 { + static_assert_simm_bits!(IMM_S10, 10); + __lsx_vrepli_d(IMM_S10) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vrepli_h() -> v8i16 { + static_assert_simm_bits!(IMM_S10, 10); + __lsx_vrepli_h(IMM_S10) +} + +#[inline] +#[target_feature(enable = "lsx")] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lsx_vrepli_w() -> v4i32 { + static_assert_simm_bits!(IMM_S10, 10); + __lsx_vrepli_w(IMM_S10) +} diff --git a/library/stdarch/crates/core_arch/src/loongarch64/lsx/mod.rs b/library/stdarch/crates/core_arch/src/loongarch64/lsx/mod.rs new file mode 100644 index 0000000000000..67a08985a9637 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/loongarch64/lsx/mod.rs @@ -0,0 +1,21 @@ +//! 
LoongArch64 LSX intrinsics + +#![allow(non_camel_case_types)] + +#[rustfmt::skip] +mod types; + +#[rustfmt::skip] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub use self::types::*; + +#[rustfmt::skip] +mod generated; + +#[rustfmt::skip] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub use self::generated::*; + +#[rustfmt::skip] +#[cfg(test)] +mod tests; diff --git a/library/stdarch/crates/core_arch/src/loongarch64/lsx/tests.rs b/library/stdarch/crates/core_arch/src/loongarch64/lsx/tests.rs new file mode 100644 index 0000000000000..5670bd4378a84 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/loongarch64/lsx/tests.rs @@ -0,0 +1,7164 @@ +// This code is automatically generated. DO NOT MODIFY. +// See crates/stdarch-gen-loongarch/README.md + +use crate::{ + core_arch::{loongarch64::*, simd::*}, + mem::transmute, +}; +use stdarch_test::simd_test; + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsll_b() { + let a = i8x16::new( + -96, 33, -12, -39, 82, 20, 52, 0, -99, -60, -50, -85, -6, -83, -52, -23, + ); + let b = i8x16::new( + 50, 37, 88, 105, -45, -52, 119, 2, 19, 109, 95, 116, -101, -126, -104, -119, + ); + let r = i64x2::new(70990221811840, -3257029622096690968); + + assert_eq!(r, transmute(lsx_vsll_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsll_h() { + let a = i16x8::new(2551, -25501, -5868, -8995, 27363, 18426, -10212, -26148); + let b = i16x8::new(-10317, -20778, -9962, -8975, 25298, 12929, -13803, -18669); + let r = i64x2::new(-5063658964307128392, -3539825456407336052); + + assert_eq!(r, transmute(lsx_vsll_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsll_w() { + let a = i32x4::new(1371197240, -1100536513, 781269067, -294302078); + let b = i32x4::new(82237029, -819106294, -96895338, -456101700); + let r = i64x2::new(-7163824029380778240, 2305843009528266752); + + assert_eq!(r, transmute(lsx_vsll_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsll_d() { + let a = i64x2::new(5700293115058898640, 9057986892130087440); + let b = i64x2::new(8592669249977019309, -1379694176202045825); + let r = i64x2::new(1790743801833193472, 0); + + assert_eq!(r, transmute(lsx_vsll_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslli_b() { + let a = i8x16::new( + 90, 123, 29, -67, 120, -106, 104, -39, -62, -56, -92, -75, 113, 123, -120, -52, + ); + let r = i64x2::new(-2780807324588213414, -3708578564830607166); + + assert_eq!(r, transmute(lsx_vslli_b::<0>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslli_h() { + let a = i16x8::new(18469, -14840, 23655, -3474, 7467, 2798, -15418, 26847); + let r = i64x2::new(-7241759886206301888, 4017476402818337472); + + assert_eq!(r, transmute(lsx_vslli_h::<6>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslli_w() { + let a = i32x4::new(20701902, -1777432355, 6349179, 1747667894); + let r = i64x2::new(4189319625752393728, -5967594959501136896); + + assert_eq!(r, transmute(lsx_vslli_w::<10>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslli_d() { + let a = i64x2::new(-5896889635782282086, -8807609320972692839); + let r = i64x2::new(-4233027607937510592, -5142337165482896608); + + assert_eq!(r, transmute(lsx_vslli_d::<5>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsra_b() { + let a = i8x16::new( + 0, 72, -102, -88, 101, -100, 66, -113, 68, -13, 2, 
4, -61, 66, -24, 72, + ); + let b = i8x16::new( + 34, 5, 102, 83, -87, 43, 94, 107, -84, 88, -103, 5, 127, 43, -28, -69, + ); + let r = i64x2::new(-1080315035391229440, 720022881735668484); + + assert_eq!(r, transmute(lsx_vsra_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsra_h() { + let a = i16x8::new(29313, 15702, 30839, 9343, -19597, 5316, -32305, -13755); + let b = i16x8::new(14017, 3796, 23987, -27244, -13363, 21333, -10262, 23633); + let r = i64x2::new(164116464290576704, -1935703552267190275); + + assert_eq!(r, transmute(lsx_vsra_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsra_w() { + let a = i32x4::new(-309802992, -833530117, -1757716660, 1577882592); + let b = i32x4::new(-670772992, 2044335288, -1224858031, 520588790); + let r = i64x2::new(-210763200496, 1619202657181); + + assert_eq!(r, transmute(lsx_vsra_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsra_d() { + let a = i64x2::new(-1372092312892164486, 6937900992858870877); + let b = i64x2::new(4251079558060308329, 4657697142994416829); + let r = i64x2::new(-623956, 3); + + assert_eq!(r, transmute(lsx_vsra_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrai_b() { + let a = i8x16::new( + -4, 92, -7, -110, 81, -20, -18, -113, 43, 110, -105, 53, -101, -100, -56, -120, + ); + let r = i64x2::new(-2018743940785760257, -2093355901512246518); + + assert_eq!(r, transmute(lsx_vsrai_b::<2>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrai_h() { + let a = i16x8::new(-22502, -7299, 19084, -21578, -28082, 20851, 23456, 15524); + let r = i64x2::new(-1688828385492998, 844446405361657); + + assert_eq!(r, transmute(lsx_vsrai_h::<12>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrai_w() { + let a = i32x4::new(743537539, 1831641900, -1639033567, -984629971); + let r = i64x2::new(30008936499988, -16131897170029); + + assert_eq!(r, transmute(lsx_vsrai_w::<18>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrai_d() { + let a = i64x2::new(-8375997486414293750, 1714581574012370587); + let r = i64x2::new(-476121, 97462); + + assert_eq!(r, transmute(lsx_vsrai_d::<44>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrar_b() { + let a = i8x16::new( + 123, 17, -3, 27, 49, 89, -61, 105, -77, 87, 87, 15, -113, 75, -69, 40, + ); + let b = i8x16::new( + 14, 5, 123, -33, 72, -126, -70, -33, -124, -55, -82, -78, -33, -12, -25, -114, + ); + let r = i64x2::new(139917463134404866, 143840305941130491); + + assert_eq!(r, transmute(lsx_vsrar_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrar_h() { + let a = i16x8::new(-25154, -18230, -10510, -29541, 25913, 29143, 21372, 14979); + let b = i16x8::new(-26450, 2176, 31587, 2222, 13726, 30172, 1067, -14273); + let r = i64x2::new(-287115463426050, 42950131714); + + assert_eq!(r, transmute(lsx_vsrar_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrar_w() { + let a = i32x4::new(-139995520, 1671693163, -640570871, 2138298219); + let b = i32x4::new(-1532076758, 940127488, 1781366421, 1497262222); + let r = i64x2::new(7179867468326627830, 560544771735247); + + assert_eq!(r, transmute(lsx_vsrar_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrar_d() { + let a = i64x2::new(-489385672013329488, -1253364580216579403); + let b = 
i64x2::new(3571440266112779495, -725943254065719378); + let r = i64x2::new(-890187, -17811); + + assert_eq!(r, transmute(lsx_vsrar_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrari_b() { + let a = i8x16::new( + -20, 33, -49, -120, -30, -40, 67, 93, -77, -2, 16, -36, 108, -107, 23, -53, + ); + let r = i64x2::new(867219992078845182, -503291487652282122); + + assert_eq!(r, transmute(lsx_vsrari_b::<3>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrari_h() { + let a = i16x8::new(29939, -1699, 12357, 30805, -30883, 31936, 15701, -11818); + let r = i64x2::new(4222154715365391, -1688815499411471); + + assert_eq!(r, transmute(lsx_vsrari_h::<11>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrari_w() { + let a = i32x4::new(588196178, -1058764534, 1325397591, 1169671026); + let r = i64x2::new(-4294967295, 4294967297); + + assert_eq!(r, transmute(lsx_vsrari_w::<30>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrari_d() { + let a = i64x2::new(-2795326946470057100, 6746045132217841338); + let r = i64x2::new(-174707934154378569, 421627820763615084); + + assert_eq!(r, transmute(lsx_vsrari_d::<4>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrl_b() { + let a = i8x16::new( + 73, 74, 66, -104, -30, 25, 93, -107, 105, -89, -115, -22, -94, -36, -55, -28, + ); + let b = i8x16::new( + 81, 13, -9, -46, -24, 0, 91, 123, 90, -52, -24, 56, 64, -4, -66, -17, + ); + let r = i64x2::new(1300161376517358116, 72917012339034650); + + assert_eq!(r, transmute(lsx_vsrl_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrl_h() { + let a = i16x8::new(29049, 13489, 20776, -12268, 25704, -28758, -6146, -27463); + let b = i16x8::new(16605, -13577, -26644, -17739, 11000, -29283, -15971, 20169); + let r = i64x2::new(468374382728249347, 20829178341621860); + + assert_eq!(r, transmute(lsx_vsrl_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrl_w() { + let a = i32x4::new(-2108561731, -402290458, -1418385618, 1489749824); + let b = i32x4::new(1777885221, -1725401090, 1849724045, -1051851102); + let r = i64x2::new(12953227061, 1599606693325790121); + + assert_eq!(r, transmute(lsx_vsrl_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrl_d() { + let a = i64x2::new(2854528248771186187, 804951867404831945); + let b = i64x2::new(-7903128394835365398, 7601347629202818185); + let r = i64x2::new(649044, 1572171616025062); + + assert_eq!(r, transmute(lsx_vsrl_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrli_b() { + let a = i8x16::new( + 84, -108, 98, 45, 126, -124, 105, 108, 0, 61, -29, -31, -75, -41, 114, -33, + ); + let r = i64x2::new(1952909805632365845, 3971107439766933248); + + assert_eq!(r, transmute(lsx_vsrli_b::<2>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrli_h() { + let a = i16x8::new(29545, 354, 27695, 20915, -32766, -24491, 10641, 20310); + let r = i64x2::new(11259230996660281, 10977609996304448); + + assert_eq!(r, transmute(lsx_vsrli_h::<9>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrli_w() { + let a = i32x4::new(627703601, 922874410, -234412645, -1216101872); + let r = i64x2::new(3870813506329215, 12913695352717769); + + assert_eq!(r, transmute(lsx_vsrli_w::<10>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrli_d() { + let a = 
i64x2::new(1407685950714554203, -6076144426076800688); + let r = i64x2::new(9, 85); + + assert_eq!(r, transmute(lsx_vsrli_d::<57>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlr_b() { + let a = i8x16::new( + -79, 91, -123, 112, -84, 70, -78, -74, -104, 27, -94, -46, -49, -78, 113, -2, + ); + let b = i8x16::new( + 23, 4, -120, -11, -13, 103, 84, 58, -108, 121, -66, -9, -81, 91, 71, -33, + ); + let r = i64x2::new(3317746744565237249, 144420860932066826); + + assert_eq!(r, transmute(lsx_vsrlr_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlr_h() { + let a = i16x8::new(14153, -26873, 3115, 28304, 4881, -8446, 28628, 8837); + let b = i16x8::new(19500, -26403, -1282, 12290, -18989, 25105, -24347, 6707); + let r = i64x2::new(1991716935204929539, 311033695131730530); + + assert_eq!(r, transmute(lsx_vsrlr_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlr_w() { + let a = i32x4::new(1997879294, 120007491, -1807289594, -1854395615); + let b = i32x4::new(1830015593, -1452673200, 962662328, -252736055); + let r = i64x2::new(7864089021084, 20473000998469780); + + assert_eq!(r, transmute(lsx_vsrlr_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlr_d() { + let a = i64x2::new(5993546441420611680, 4358546479290416194); + let b = i64x2::new(-1543621369665313706, 8544381131364512650); + let r = i64x2::new(1428972826343, 4256393046182047); + + assert_eq!(r, transmute(lsx_vsrlr_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlri_b() { + let a = i8x16::new( + -41, 87, -43, -35, 79, -10, -103, 1, 52, -35, 8, -17, -116, 84, -91, 51, + ); + let r = i64x2::new(93866580842851436, 1896906350202744602); + + assert_eq!(r, transmute(lsx_vsrlri_b::<1>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlri_h() { + let a = i16x8::new(-18045, 1968, 22966, 3692, 2010, -17108, 3373, -30706); + let r = i64x2::new(1039304252363684227, -8642956144778934310); + + assert_eq!(r, transmute(lsx_vsrlri_h::<0>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlri_w() { + let a = i32x4::new(1306456564, -1401620667, -839707416, -1634862919); + let r = i64x2::new(1553353645217275455, 1428132662790218397); + + assert_eq!(r, transmute(lsx_vsrlri_w::<3>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlri_d() { + let a = i64x2::new(-3683179565838693027, 6160461828074490983); + let r = i64x2::new(205, 85); + + assert_eq!(r, transmute(lsx_vsrlri_d::<56>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitclr_b() { + let a = u8x16::new( + 238, 18, 41, 55, 84, 12, 87, 155, 124, 76, 175, 240, 181, 121, 58, 183, + ); + let b = u8x16::new( + 57, 132, 149, 173, 76, 177, 99, 144, 8, 167, 2, 144, 70, 60, 105, 232, + ); + let r = i64x2::new(-7325372782311046420, -5316383129963115396); + + assert_eq!(r, transmute(lsx_vbitclr_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitclr_h() { + let a = u16x8::new(14340, 59474, 49868, 46012, 53117, 6307, 22589, 53749); + let b = u16x8::new(26587, 57597, 34751, 38678, 23919, 45729, 62569, 5978); + let r = i64x2::new(-5495443997997256700, -3317648531059028099); + + assert_eq!(r, transmute(lsx_vbitclr_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitclr_w() { + let a = u32x4::new(1581022148, 2519245321, 296293885, 127383934); + let b = 
u32x4::new(1968231094, 2827365864, 4097273355, 4016923215); + let r = i64x2::new(-7626667807832507452, 546969093373761021); + + assert_eq!(r, transmute(lsx_vbitclr_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitclr_d() { + let a = u64x2::new(17203892527896963423, 12937109545250696056); + let b = u64x2::new(5723204188033770667, 2981956604140378920); + let r = i64x2::new(-1242851545812588193, -5509634528458855560); + + assert_eq!(r, transmute(lsx_vbitclr_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitclri_b() { + let a = u8x16::new( + 146, 23, 223, 183, 109, 56, 35, 105, 178, 156, 170, 57, 196, 164, 185, 161, + ); + let r = i64x2::new(7503621968728299154, -6865556469255070542); + + assert_eq!(r, transmute(lsx_vbitclri_b::<0>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitclri_h() { + let a = u16x8::new(17366, 58985, 22108, 45942, 27326, 19605, 9632, 32322); + let r = i64x2::new(-5515130134779575338, 8809640793386347198); + + assert_eq!(r, transmute(lsx_vbitclri_h::<10>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitclri_w() { + let a = u32x4::new(718858183, 3771164920, 1842485081, 896350597); + let r = i64x2::new(-2249714073768237625, 3849796501707560281); + + assert_eq!(r, transmute(lsx_vbitclri_w::<9>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitclri_d() { + let a = u64x2::new(10838658690401820648, 3833745076866321369); + let r = i64x2::new(-7608085933063544856, 3833744527110507481); + + assert_eq!(r, transmute(lsx_vbitclri_d::<39>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitset_b() { + let a = u8x16::new( + 229, 230, 162, 180, 94, 215, 193, 145, 28, 90, 35, 171, 225, 7, 84, 128, + ); + let b = u8x16::new( + 209, 178, 73, 112, 118, 233, 139, 239, 2, 23, 209, 152, 236, 51, 195, 75, + ); + let r = i64x2::new(-7941579666116909337, -8620998056061183460); + + assert_eq!(r, transmute(lsx_vbitset_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitset_h() { + let a = u16x8::new(967, 49899, 53264, 29198, 56634, 42461, 51022, 31627); + let b = u16x8::new(64512, 23847, 57770, 47705, 8024, 31966, 14493, 50266); + let r = i64x2::new(8218739538452480967, 9190693790629616954); + + assert_eq!(r, transmute(lsx_vbitset_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitset_w() { + let a = u32x4::new(2899706360, 1274114722, 1170526770, 3308854969); + let b = u32x4::new(3259082048, 1303228302, 1429001720, 209615081); + let r = i64x2::new(5472281065241838073, -4235320193476931022); + + assert_eq!(r, transmute(lsx_vbitset_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitset_d() { + let a = u64x2::new(8117422063017946604, 5026948610774344635); + let b = u64x2::new(12687331714071910183, 1753585392879336372); + let r = i64x2::new(8117422612773760492, 5031452210401715131); + + assert_eq!(r, transmute(lsx_vbitset_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitseti_b() { + let a = u8x16::new( + 163, 123, 56, 129, 159, 111, 214, 85, 141, 240, 190, 190, 175, 215, 20, 81, + ); + let r = i64x2::new(6185254145054243811, 5860546440891134157); + + assert_eq!(r, transmute(lsx_vbitseti_b::<6>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitseti_h() { + let a = u16x8::new(15222, 59961, 52253, 2908, 61562, 41309, 63627, 4191); 
+ let r = i64x2::new(819316619673811830, 1179934905985921146); + + assert_eq!(r, transmute(lsx_vbitseti_h::<1>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitseti_w() { + let a = u32x4::new(3788412756, 1863556832, 1913138259, 1199998627); + let r = i64x2::new(8012922850722617172, 5162962059379878995); + + assert_eq!(r, transmute(lsx_vbitseti_w::<21>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitseti_d() { + let a = u64x2::new(10744510173660993785, 16946223211744108759); + let r = i64x2::new(-7702233900048557831, -1500520861831225129); + + assert_eq!(r, transmute(lsx_vbitseti_d::<27>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitrev_b() { + let a = u8x16::new( + 50, 114, 173, 149, 9, 38, 147, 232, 52, 235, 56, 98, 113, 120, 249, 238, + ); + let b = u8x16::new( + 252, 187, 218, 48, 148, 63, 222, 247, 56, 181, 124, 130, 243, 202, 86, 253, + ); + let r = i64x2::new(7553563628828981794, -3550669970358088907); + + assert_eq!(r, transmute(lsx_vbitrev_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitrev_h() { + let a = u16x8::new(8304, 965, 30335, 58555, 41304, 8461, 30573, 59417); + let b = u16x8::new(21347, 23131, 57157, 13786, 34463, 33445, 23964, 48087); + let r = i64x2::new(-2253077037977362312, -1686202867067838120); + + assert_eq!(r, transmute(lsx_vbitrev_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitrev_w() { + let a = u32x4::new(3821500454, 1067219398, 1766391845, 676798616); + let b = u32x4::new(3330530584, 4153020036, 822570638, 2652744506); + let r = i64x2::new(4583672484591007782, 3195058299616182309); + + assert_eq!(r, transmute(lsx_vbitrev_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitrev_d() { + let a = u64x2::new(16016664040604304047, 18062107512190600767); + let b = u64x2::new(10942298949673565895, 12884740754463765660); + let r = i64x2::new(-2430080033105247697, -384636561250515393); + + assert_eq!(r, transmute(lsx_vbitrev_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitrevi_b() { + let a = u8x16::new( + 184, 147, 93, 34, 212, 175, 25, 125, 50, 34, 160, 241, 228, 231, 77, 110, + ); + let r = i64x2::new(8727320563398842300, 7658903196653594166); + + assert_eq!(r, transmute(lsx_vbitrevi_b::<2>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitrevi_h() { + let a = u16x8::new(15083, 24599, 61212, 12408, 48399, 59833, 45416, 58826); + let r = i64x2::new(8104420064785562347, -6500117680329458417); + + assert_eq!(r, transmute(lsx_vbitrevi_h::<14>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitrevi_w() { + let a = u32x4::new(1200613355, 1418062686, 3847355950, 3312937419); + let r = i64x2::new(6099540060505368555, -4226793400815190482); + + assert_eq!(r, transmute(lsx_vbitrevi_w::<21>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitrevi_d() { + let a = u64x2::new(295858379748270823, 1326723086853575042); + let r = i64x2::new(295858379748254439, 1326723086853591426); + + assert_eq!(r, transmute(lsx_vbitrevi_d::<14>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vadd_b() { + let a = i8x16::new( + 14, -124, 73, 125, 119, 60, 127, -10, 31, 89, 50, -88, 29, -28, -53, -8, + ); + let b = i8x16::new( + 94, -52, -56, 75, -104, 77, 16, 82, 82, 69, -81, -75, 25, -102, -109, 23, + ); + let r = i64x2::new(5228548393274527852, 
1107461330348121713); + + assert_eq!(r, transmute(lsx_vadd_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vadd_h() { + let a = i16x8::new(14051, -27363, -25412, -27329, 25098, 5182, -13698, -15422); + let b = i16x8::new(-25040, 15453, -28080, -31322, -24429, -12453, -18073, 27019); + let r = i64x2::new(1938006946753467667, 3264410328302682781); + + assert_eq!(r, transmute(lsx_vadd_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vadd_w() { + let a = i32x4::new(-724548235, -1051318497, -203352059, 1502361914); + let b = i32x4::new(-1169804484, 389773725, -731843701, -1825112934); + let r = i64x2::new(-2841313158179161935, -1386205072290870384); + + assert_eq!(r, transmute(lsx_vadd_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vadd_d() { + let a = i64x2::new(-7298628992874088690, 8943248591432696479); + let b = i64x2::new(7093939531558864473, 4047047970310912233); + let r = i64x2::new(-204689461315224217, -5456447511965942904); + + assert_eq!(r, transmute(lsx_vadd_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddi_bu() { + let a = i8x16::new( + -126, 4, -123, -78, -37, -26, -41, -119, -16, -82, 33, 59, -110, -98, 26, -6, + ); + let r = i64x2::new(-7790681010872578420, 298548864442153210); + + assert_eq!(r, transmute(lsx_vaddi_bu::<10>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddi_hu() { + let a = i16x8::new(-16986, -28417, 11657, 16608, -30167, 18602, 8897, -854); + let r = i64x2::new(4681541984598867390, -233585914045887935); + + assert_eq!(r, transmute(lsx_vaddi_hu::<24>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddi_wu() { + let a = i32x4::new(1142343549, 56714754, -180143297, 408668191); + let r = i64x2::new(243588023362963327, 1755216527965240129); + + assert_eq!(r, transmute(lsx_vaddi_wu::<2>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddi_du() { + let a = i64x2::new(4516502893749962130, 9158051921593642947); + let r = i64x2::new(4516502893749962139, 9158051921593642956); + + assert_eq!(r, transmute(lsx_vaddi_du::<9>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsub_b() { + let a = i8x16::new( + 125, 95, 56, 31, 69, -81, 65, -123, -72, 14, -43, 81, -12, -107, 106, 3, + ); + let b = i8x16::new( + -80, 10, -21, 84, -99, 8, 125, -66, 79, -71, 123, 61, 61, -31, 41, -118, + ); + let r = i64x2::new(-4051929421319416371, 8737463450488952169); + + assert_eq!(r, transmute(lsx_vsub_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsub_h() { + let a = i16x8::new(-17949, -2606, 1774, 18199, 28344, 28423, 16206, 25414); + let b = i16x8::new(15368, 16207, 9677, 21447, -29583, -22036, 1845, 15671); + let r = i64x2::new(-913983189443969573, 2742472381424198215); + + assert_eq!(r, transmute(lsx_vsub_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsub_w() { + let a = i32x4::new(678216285, 1230738403, -1278396773, -1257816042); + let b = i32x4::new(617176389, -1376778690, 1463940361, 620446698); + let r = i64x2::new(-7247543435452521192, -8067077040042720878); + + assert_eq!(r, transmute(lsx_vsub_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsub_d() { + let a = i64x2::new(7239192343295591267, -5127457864580422409); + let b = i64x2::new(1314101702815749241, 7673634401554993450); + let r = 
i64x2::new(5925090640479842026, 5645651807574135757); + + assert_eq!(r, transmute(lsx_vsub_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubi_bu() { + let a = i8x16::new( + -83, 36, 83, -2, 40, -92, 98, -95, -24, 113, 46, -20, 120, -93, 28, 85, + ); + let r = i64x2::new(-8192169673836457574, 4758493248402185941); + + assert_eq!(r, transmute(lsx_vsubi_bu::<19>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubi_hu() { + let a = i16x8::new(13272, -26858, -235, 16054, 29698, 1377, 4604, -3878); + let r = i64x2::new(4514576075959186376, -1096043853912116238); + + assert_eq!(r, transmute(lsx_vsubi_hu::<16>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubi_wu() { + let a = i32x4::new(1277091145, -2076591216, -1523555105, -945754023); + let r = i64x2::new(-8918891362898748088, -4061982600368986914); + + assert_eq!(r, transmute(lsx_vsubi_wu::<1>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubi_du() { + let a = i64x2::new(-8248876128472283209, -2119651236628000925); + let r = i64x2::new(-8248876128472283234, -2119651236628000950); + + assert_eq!(r, transmute(lsx_vsubi_du::<25>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmax_b() { + let a = i8x16::new( + -120, -51, 13, 82, 100, 7, 127, 17, -89, -95, -45, 121, 64, -60, 89, 105, + ); + let b = i8x16::new( + -47, -64, 96, 41, -30, -122, 3, -7, 123, -96, 68, 36, 14, 31, 74, -22, + ); + let r = i64x2::new(1260734548147228113, 7591133008682590587); + + assert_eq!(r, transmute(lsx_vmax_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmax_h() { + let a = i16x8::new(-14821, -29280, 26700, -12293, 2186, -23309, 13454, -1630); + let b = i16x8::new(25637, -11569, -23103, 6983, -17125, 5183, -709, 5986); + let r = i64x2::new(1965654441534120997, 1684966995419662474); + + assert_eq!(r, transmute(lsx_vmax_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmax_w() { + let a = i32x4::new(-2113940850, -647459228, -686153447, 852904547); + let b = i32x4::new(643859790, -389733899, -1309288060, 1934346522); + let r = i64x2::new(-1673894349703707314, 8307955054730158361); + + assert_eq!(r, transmute(lsx_vmax_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmax_d() { + let a = i64x2::new(-990960773872867733, 6406870358170165030); + let b = i64x2::new(-6137495199657896371, 2160025776787809810); + let r = i64x2::new(-990960773872867733, 6406870358170165030); + + assert_eq!(r, transmute(lsx_vmax_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaxi_b() { + let a = i8x16::new( + -67, 109, 33, -22, -96, 84, -56, 81, 122, 23, -70, -71, -42, 108, -50, 23, + ); + let r = i64x2::new(5908253215318699518, 1728939149412407162); + + assert_eq!(r, transmute(lsx_vmaxi_b::<-2>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaxi_h() { + let a = i16x8::new(-14059, 19536, 15816, 28251, 23079, -10486, -11781, 25565); + let r = i64x2::new(7952017497535807498, 7195907822558272039); + + assert_eq!(r, transmute(lsx_vmaxi_h::<10>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaxi_w() { + let a = i32x4::new(-1136628686, -168033999, -2082324641, -1789957469); + let r = i64x2::new(55834574861, 55834574861); + + assert_eq!(r, transmute(lsx_vmaxi_w::<13>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaxi_d() { + 
let a = i64x2::new(-490958606840895025, -602287987736508723); + let r = i64x2::new(-5, -5); + + assert_eq!(r, transmute(lsx_vmaxi_d::<-5>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmax_bu() { + let a = u8x16::new( + 22, 96, 70, 57, 83, 248, 184, 163, 4, 150, 223, 247, 226, 242, 18, 63, + ); + let b = u8x16::new( + 13, 251, 236, 121, 148, 91, 24, 176, 232, 197, 195, 34, 31, 120, 173, 27, + ); + let r = i64x2::new(-5712542810735052010, 4588590651995571688); + + assert_eq!(r, transmute(lsx_vmax_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmax_hu() { + let a = u16x8::new(1178, 52364, 32269, 22619, 17388, 4159, 51894, 12662); + let b = u16x8::new(61508, 27224, 11696, 15294, 30725, 4809, 55995, 24012); + let r = i64x2::new(6366821095949791300, 6759017637785204741); + + assert_eq!(r, transmute(lsx_vmax_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmax_wu() { + let a = u32x4::new(2081333956, 40837464, 1440470019, 1657093799); + let b = u32x4::new(2856502284, 546582019, 3814541188, 2370198139); + let r = i64x2::new(2347551899043152908, -8266820577849948284); + + assert_eq!(r, transmute(lsx_vmax_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmax_du() { + let a = u64x2::new(17105634039018730835, 11926654155810942548); + let b = u64x2::new(15559502733477870114, 3537017767853389449); + let r = i64x2::new(-1341110034690820781, -6520089917898609068); + + assert_eq!(r, transmute(lsx_vmax_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaxi_bu() { + let a = u8x16::new( + 216, 225, 158, 238, 152, 8, 124, 241, 175, 62, 154, 175, 216, 127, 235, 143, + ); + let r = i64x2::new(-1045930669804428840, -8076220938123067729); + + assert_eq!(r, transmute(lsx_vmaxi_bu::<27>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaxi_hu() { + let a = u16x8::new(56394, 18974, 59, 64239, 15178, 38205, 20044, 21066); + let r = i64x2::new(-365072790147113910, 5929637950214978378); + + assert_eq!(r, transmute(lsx_vmaxi_hu::<23>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaxi_wu() { + let a = u32x4::new(2234002286, 3837532269, 3218694441, 2956128392); + let r = i64x2::new(-1964668478775874706, -5750269304073789143); + + assert_eq!(r, transmute(lsx_vmaxi_wu::<15>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaxi_du() { + let a = u64x2::new(3145066433415682744, 697260191203805367); + let r = i64x2::new(3145066433415682744, 697260191203805367); + + assert_eq!(r, transmute(lsx_vmaxi_du::<15>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmin_b() { + let a = i8x16::new( + -18, -126, -77, 105, 18, -106, -12, 89, 93, 22, -51, -103, -63, -106, -23, -125, + ); + let b = i8x16::new( + -10, 83, 19, -119, -1, 95, 11, 25, -11, 38, -28, -23, -36, -104, 110, 0, + ); + let r = i64x2::new(1870285769536668398, -8941449826914199819); + + assert_eq!(r, transmute(lsx_vmin_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmin_h() { + let a = i16x8::new(7767, 30288, -1525, 24469, 16179, 7042, 6326, 21055); + let b = i16x8::new(-5519, 15267, -28304, -5842, 32145, 6582, -9646, -24918); + let r = i64x2::new(-1644216902720689551, -7013553423522578637); + + assert_eq!(r, transmute(lsx_vmin_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmin_w() { + let a = 
i32x4::new(280954204, 1916591882, 1901481995, 787566518); + let b = i32x4::new(-425011290, -2104111279, 175390640, 571448257); + let r = i64x2::new(-9037089126579775578, 2454351575346593712); + + assert_eq!(r, transmute(lsx_vmin_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmin_d() { + let a = i64x2::new(5262417572890363865, 5296071757031183187); + let b = i64x2::new(7269804448576860985, -2384075780126369706); + let r = i64x2::new(5262417572890363865, -2384075780126369706); + + assert_eq!(r, transmute(lsx_vmin_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmini_b() { + let a = i8x16::new( + -20, 19, 89, -115, 65, 94, -124, -17, 36, -127, -101, -123, -122, -62, 44, 121, + ); + let r = i64x2::new(-1187557278141451540, -940475489144045070); + + assert_eq!(r, transmute(lsx_vmini_b::<-14>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmini_h() { + let a = i16x8::new(26119, -26421, -26720, 11534, 11181, -13024, -9525, -1565); + let r = i64x2::new(-677708916064259, -440267769697468419); + + assert_eq!(r, transmute(lsx_vmini_h::<-3>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmini_w() { + let a = i32x4::new(1937226480, -56354461, -210581139, 118641668); + let r = i64x2::new(-242040566978707451, 25559222637); + + assert_eq!(r, transmute(lsx_vmini_w::<5>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmini_d() { + let a = i64x2::new(-6839357499730806877, 2982085289136510651); + let r = i64x2::new(-6839357499730806877, 11); + + assert_eq!(r, transmute(lsx_vmini_d::<11>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmin_bu() { + let a = u8x16::new( + 72, 253, 194, 62, 100, 41, 53, 50, 53, 249, 47, 215, 113, 227, 189, 66, + ); + let b = u8x16::new( + 20, 165, 214, 231, 201, 17, 81, 203, 41, 209, 98, 88, 135, 118, 100, 83, + ); + let r = i64x2::new(3617816997909406996, 4784078933357220137); + + assert_eq!(r, transmute(lsx_vmin_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmin_hu() { + let a = u16x8::new(45665, 56395, 48109, 47478, 46813, 59058, 42125, 32550); + let b = u16x8::new(30424, 14541, 7654, 46014, 42452, 14971, 14903, 13871); + let r = i64x2::new(-5494921620712753448, 3904403410832303572); + + assert_eq!(r, transmute(lsx_vmin_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmin_wu() { + let a = u32x4::new(1809171870, 3212127932, 1131140001, 2157144340); + let b = u32x4::new(1456829356, 2264966310, 1587887390, 645429404); + let r = i64x2::new(-8718787844260924500, 2772098183187911585); + + assert_eq!(r, transmute(lsx_vmin_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmin_du() { + let a = u64x2::new(6641707046382446478, 5750385968612732680); + let b = u64x2::new(15079551366517035256, 13891052596545854864); + let r = i64x2::new(6641707046382446478, 5750385968612732680); + + assert_eq!(r, transmute(lsx_vmin_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmini_bu() { + let a = u8x16::new( + 14, 244, 217, 183, 206, 234, 5, 185, 152, 22, 4, 35, 30, 177, 252, 137, + ); + let r = i64x2::new(361700864190383365, 361700864190317829); + + assert_eq!(r, transmute(lsx_vmini_bu::<5>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmini_hu() { + let a = u16x8::new(51791, 41830, 16737, 31634, 36341, 58491, 48701, 8690); + let r = 
i64x2::new(5066626891382802, 5066626891382802); + + assert_eq!(r, transmute(lsx_vmini_hu::<18>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmini_wu() { + let a = u32x4::new(1158888991, 2639721369, 556001789, 2902942998); + let r = i64x2::new(77309411346, 77309411346); + + assert_eq!(r, transmute(lsx_vmini_wu::<18>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmini_du() { + let a = u64x2::new(17903595768445663391, 13119300660970895532); + let r = i64x2::new(13, 13); + + assert_eq!(r, transmute(lsx_vmini_du::<13>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vseq_b() { + let a = i8x16::new( + 8, 73, 39, 20, 64, -98, -64, 83, 32, 84, -121, 9, -45, -118, -26, 100, + ); + let b = i8x16::new( + -90, -2, -77, -76, -19, 48, 91, 31, 65, -29, -112, -7, 77, 98, -126, 5, + ); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vseq_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vseq_h() { + let a = i16x8::new(7490, 32190, -24684, 16245, -18425, -12556, 19179, -23230); + let b = i16x8::new(-7387, -24074, 15709, -4629, 30465, -9504, -21403, -30287); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vseq_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vseq_w() { + let a = i32x4::new(-364333737, 833593451, -1047433707, 1224903962); + let b = i32x4::new(-493722413, -522973881, -1254416384, -884207273); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vseq_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vseq_d() { + let a = i64x2::new(8059130761383772313, -728251064129355704); + let b = i64x2::new(3023654898382436999, 1783520577741396523); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vseq_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vseqi_b() { + let a = i8x16::new( + 114, -39, -58, -47, -46, 68, 126, -41, 50, -24, 109, 120, -81, -22, 86, 2, + ); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vseqi_b::<12>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vseqi_h() { + let a = i16x8::new(-3205, 25452, 20774, 22065, -8424, 16590, -15971, -14154); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vseqi_h::<-1>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vseqi_w() { + let a = i32x4::new(199798215, -798304779, -1812193878, -1830438161); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vseqi_w::<11>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vseqi_d() { + let a = i64x2::new(-7376858177879278972, 1947027764115386661); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vseqi_d::<3>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslti_b() { + let a = i8x16::new( + 45, 70, 62, 83, 116, -29, -34, -91, 96, 48, 109, 92, -18, 93, 14, 22, + ); + let r = i64x2::new(-1099511627776, 1095216660480); + + assert_eq!(r, transmute(lsx_vslti_b::<-4>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslt_b() { + let a = i8x16::new( + -68, 126, 28, -97, -24, 118, 61, -9, 5, 115, -122, 5, -40, 107, -98, -93, + ); + let b = i8x16::new( + 22, 124, 33, 93, 0, -81, -62, 63, 1, 35, -64, 23, 61, 9, -56, 89, + ); + let r = i64x2::new(-72056494526365441, -280375465148416); + + assert_eq!(r, transmute(lsx_vslt_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn 
test_lsx_vslt_h() { + let a = i16x8::new(32283, 16403, -32598, 8049, -10290, 21116, 23894, 5619); + let b = i16x8::new(-10624, 12762, 31216, 13253, 2299, -12591, -8652, -22348); + let r = i64x2::new(-4294967296, 65535); + + assert_eq!(r, transmute(lsx_vslt_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslt_w() { + let a = i32x4::new(-158999818, -1928813163, -140040541, 494178107); + let b = i32x4::new(-1849021639, -756143028, 54274044, 646446450); + let r = i64x2::new(-4294967296, -1); + + assert_eq!(r, transmute(lsx_vslt_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslt_d() { + let a = i64x2::new(-179055155347449719, 6182805737835801255); + let b = i64x2::new(1481173131774551907, 270656941607020532); + let r = i64x2::new(-1, 0); + + assert_eq!(r, transmute(lsx_vslt_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslti_h() { + let a = i16x8::new(-8902, 5527, 17224, -27356, 4424, 28839, 29975, 18805); + let r = i64x2::new(-281474976645121, 0); + + assert_eq!(r, transmute(lsx_vslti_h::<14>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslti_w() { + let a = i32x4::new(995282502, -1964668207, -996118772, 1812234755); + let r = i64x2::new(-4294967296, 4294967295); + + assert_eq!(r, transmute(lsx_vslti_w::<14>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslti_d() { + let a = i64x2::new(1441753618400573134, 3878439049744730841); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vslti_d::<14>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslt_bu() { + let a = u8x16::new( + 55, 192, 87, 242, 253, 133, 53, 76, 135, 6, 39, 64, 82, 182, 147, 19, + ); + let b = u8x16::new( + 108, 77, 229, 137, 242, 115, 152, 252, 99, 101, 44, 100, 58, 120, 101, 22, + ); + let r = i64x2::new(-281474959998721, -72057589742960896); + + assert_eq!(r, transmute(lsx_vslt_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslt_hu() { + let a = u16x8::new(16382, 2642, 8944, 48121, 7472, 49176, 63264, 1135); + let b = u16x8::new(513, 13075, 20319, 44422, 12609, 18638, 20227, 21354); + let r = i64x2::new(281474976645120, -281474976645121); + + assert_eq!(r, transmute(lsx_vslt_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslt_wu() { + let a = u32x4::new(137339688, 2061001419, 2322333619, 2113106148); + let b = u32x4::new(1402243125, 1129899238, 2591537060, 4152171743); + let r = i64x2::new(4294967295, -1); + + assert_eq!(r, transmute(lsx_vslt_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslt_du() { + let a = u64x2::new(15914553432791856307, 11132190561956652500); + let b = u64x2::new(835355141719377733, 10472626544222695938); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vslt_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslti_bu() { + let a = u8x16::new( + 215, 70, 65, 148, 249, 56, 59, 18, 118, 56, 250, 53, 144, 189, 98, 56, + ); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vslti_bu::<7>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslti_hu() { + let a = u16x8::new(60550, 12178, 30950, 44771, 25514, 35987, 55940, 21614); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vslti_hu::<2>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslti_wu() { + let a = 
u32x4::new(912580668, 18660032, 3405726641, 4033549497); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vslti_wu::<8>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslti_du() { + let a = u64x2::new(17196150830761730262, 5893061291971214149); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vslti_du::<14>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsle_b() { + let a = i8x16::new( + 16, 13, 47, 41, 9, -73, 92, 108, -77, -106, -115, -20, 107, -101, -54, 16, + ); + let b = i8x16::new( + 71, 43, 24, 28, 83, 69, -109, -33, 81, 71, -126, -61, -45, -11, -105, -70, + ); + let r = i64x2::new(281470681808895, 280375465148415); + + assert_eq!(r, transmute(lsx_vsle_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsle_h() { + let a = i16x8::new(15130, 12644, -27298, 13979, 28696, -28425, 23806, -20696); + let b = i16x8::new(-30602, -9535, 10944, 3343, -1093, 6600, -19453, -4561); + let r = i64x2::new(281470681743360, -281470681808896); + + assert_eq!(r, transmute(lsx_vsle_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsle_w() { + let a = i32x4::new(-549852719, 335768045, 1882235130, 603655976); + let b = i32x4::new(-1810853975, 2021418524, 215198844, 1124361386); + let r = i64x2::new(-4294967296, -4294967296); + + assert_eq!(r, transmute(lsx_vsle_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsle_d() { + let a = i64x2::new(-5807954019703375704, 7802006580674332206); + let b = i64x2::new(71694374951002423, -4307912969104303925); + let r = i64x2::new(-1, 0); + + assert_eq!(r, transmute(lsx_vsle_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslei_b() { + let a = i8x16::new( + 22, -8, 10, 55, 103, -103, -106, 30, 54, 82, 29, 44, 75, -9, 36, 111, + ); + let r = i64x2::new(72056494526365440, 280375465082880); + + assert_eq!(r, transmute(lsx_vslei_b::<3>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslei_h() { + let a = i16x8::new(31276, -16628, -30006, -20587, 2104, -30062, 18261, -6449); + let r = i64x2::new(-65536, -281470681808896); + + assert_eq!(r, transmute(lsx_vslei_h::<-3>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslei_w() { + let a = i32x4::new(-1890390435, 1289536678, 1490122113, 2120063492); + let r = i64x2::new(4294967295, 0); + + assert_eq!(r, transmute(lsx_vslei_w::<-16>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslei_d() { + let a = i64x2::new(-123539898448811963, 8007480165241051883); + let r = i64x2::new(-1, 0); + + assert_eq!(r, transmute(lsx_vslei_d::<8>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsle_bu() { + let a = u8x16::new( + 156, 210, 61, 51, 143, 107, 237, 69, 241, 117, 66, 79, 161, 68, 22, 152, + ); + let b = u8x16::new( + 83, 68, 27, 36, 209, 74, 204, 32, 123, 97, 44, 82, 238, 202, 133, 107, + ); + let r = i64x2::new(1095216660480, 72057594021150720); + + assert_eq!(r, transmute(lsx_vsle_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsle_hu() { + let a = u16x8::new(57583, 52549, 12485, 59674, 7283, 26602, 6409, 58628); + let b = u16x8::new(50529, 35111, 24746, 62465, 21587, 30574, 11054, 11653); + let r = i64x2::new(-4294967296, 281474976710655); + + assert_eq!(r, transmute(lsx_vsle_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsle_wu() { + 
let a = u32x4::new(3325048208, 3863618944, 2967312103, 2626474550); + let b = u32x4::new(1321018603, 1091195011, 3525236625, 4061062671); + let r = i64x2::new(0, -1); + + assert_eq!(r, transmute(lsx_vsle_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsle_du() { + let a = u64x2::new(17131200460153340378, 17148253643287276161); + let b = u64x2::new(16044633718831874991, 3531311371811276914); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vsle_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslei_bu() { + let a = u8x16::new( + 33, 181, 170, 160, 192, 237, 16, 175, 82, 65, 186, 46, 143, 9, 37, 35, + ); + let r = i64x2::new(71776119061217280, 280375465082880); + + assert_eq!(r, transmute(lsx_vslei_bu::<18>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslei_hu() { + let a = u16x8::new(1430, 10053, 35528, 28458, 2394, 22098, 40236, 20853); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vslei_hu::<10>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslei_wu() { + let a = u32x4::new(3289026584, 3653636092, 2919866047, 2895662832); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vslei_wu::<2>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vslei_du() { + let a = u64x2::new(17462377852989253439, 17741928456729041079); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vslei_du::<12>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsat_b() { + let a = i8x16::new( + -66, 2, -76, 126, 9, -44, -37, -42, 8, 68, -72, 10, 113, 70, 58, 44, + ); + let r = i64x2::new(-2964542792447819074, 3186937137643144200); + + assert_eq!(r, transmute(lsx_vsat_b::<7>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsat_h() { + let a = i16x8::new(-22234, -8008, -23350, 13768, 26313, -27447, -3569, 6025); + let r = i64x2::new(576451960371214336, 576451960371152895); + + assert_eq!(r, transmute(lsx_vsat_h::<11>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsat_w() { + let a = i32x4::new(-84179653, 874415975, 1823119516, 1667850968); + let r = i64x2::new(137438953440, 133143986207); + + assert_eq!(r, transmute(lsx_vsat_w::<5>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsat_d() { + let a = i64x2::new(6859869867233872152, 2514172105675226457); + let r = i64x2::new(262143, 262143); + + assert_eq!(r, transmute(lsx_vsat_d::<18>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsat_bu() { + let a = u8x16::new( + 119, 190, 12, 39, 41, 110, 238, 29, 14, 135, 54, 90, 36, 89, 72, 91, + ); + let r = i64x2::new(2125538672170008439, 6577605268441825038); + + assert_eq!(r, transmute(lsx_vsat_bu::<6>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsat_hu() { + let a = u16x8::new(36681, 34219, 6160, 8687, 4544, 20195, 35034, 916); + let r = i64x2::new(287953294993589247, 257835472485549055); + + assert_eq!(r, transmute(lsx_vsat_hu::<9>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsat_wu() { + let a = u32x4::new(1758000759, 4138051566, 2705324001, 3927640324); + let r = i64x2::new(70364449226751, 70364449226751); + + assert_eq!(r, transmute(lsx_vsat_wu::<13>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsat_du() { + let a = u64x2::new(1953136817312581670, 2606878300382729363); + let r = i64x2::new(9007199254740991, 9007199254740991); + + 
assert_eq!(r, transmute(lsx_vsat_du::<52>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vadda_b() { + let a = i8x16::new( + -44, -56, -103, -51, 118, -127, -39, -96, -49, 75, -110, 35, 123, -61, 57, 104, + ); + let b = i8x16::new( + 79, 88, -93, 36, 117, -15, -81, -18, -117, -47, -13, 83, -31, -61, 60, 14, + ); + let r = i64x2::new(8248499858970022011, 8535863472581999270); + + assert_eq!(r, transmute(lsx_vadda_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vadda_h() { + let a = i16x8::new(15992, -5603, -27115, -15673, 11461, -31471, -31137, -2291); + let b = i16x8::new(-21543, 21720, 14529, -19143, -28953, 13450, 8037, 29413); + let r = i64x2::new(-8646732423142600033, 8924050915627474398); + + assert_eq!(r, transmute(lsx_vadda_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vadda_w() { + let a = i32x4::new(1188987464, -1693707744, -1561184997, -104072194); + let b = i32x4::new(287041349, 249467792, 312776520, 1314435078); + let r = i64x2::new(8345875378983299469, 6092442344252138029); + + assert_eq!(r, transmute(lsx_vadda_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vadda_d() { + let a = i64x2::new(1747309060022550268, -6715694127559156035); + let b = i64x2::new(-4324432602362661920, 6402427893748093984); + let r = i64x2::new(6071741662385212188, -5328622052402301597); + + assert_eq!(r, transmute(lsx_vadda_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsadd_b() { + let a = i8x16::new( + 6, -114, -40, 76, -8, 4, -110, -105, -104, 86, -27, 68, -102, 108, 113, 76, + ); + let b = i8x16::new( + -47, 102, 105, 84, -127, 70, -116, 57, 66, 47, 74, -35, 61, -85, 48, -50, + ); + let r = i64x2::new(-3422653801050278697, 1909270979770548186); + + assert_eq!(r, transmute(lsx_vsadd_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsadd_h() { + let a = i16x8::new(-25724, -16509, -25895, 31488, -18727, 16765, 3340, 21218); + let b = i16x8::new(26970, 17131, 15547, -7614, -8479, 22338, 3567, -22299); + let r = i64x2::new(6720170624686097630, -304244782337649222); + + assert_eq!(r, transmute(lsx_vsadd_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsadd_w() { + let a = i32x4::new(-1981320133, -1751087788, 1176481176, 253883202); + let b = i32x4::new(-1026388582, 222487110, 501504960, -1863994162); + let r = i64x2::new(-6565289918505943040, -6915373914453178024); + + assert_eq!(r, transmute(lsx_vsadd_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsadd_d() { + let a = i64x2::new(-1967787987610391555, -8103697759704177767); + let b = i64x2::new(-6599608819082608284, -5088169537193133686); + let r = i64x2::new(-8567396806692999839, -9223372036854775808); + + assert_eq!(r, transmute(lsx_vsadd_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsadd_bu() { + let a = u8x16::new( + 182, 156, 225, 235, 23, 111, 224, 152, 158, 254, 143, 58, 230, 188, 119, 239, + ); + let b = u8x16::new( + 40, 219, 72, 211, 12, 37, 59, 28, 206, 173, 87, 21, 125, 229, 110, 102, + ); + let r = i64x2::new(-5404438145481572386, -7318352348905473); + + assert_eq!(r, transmute(lsx_vsadd_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsadd_hu() { + let a = u16x8::new(52962, 42889, 37893, 55695, 51804, 38647, 13774, 40745); + let b = u16x8::new(31219, 
59227, 25607, 62798, 18845, 3238, 19902, 24978); + let r = i64x2::new(-8740258447361, -136834913009665); + + assert_eq!(r, transmute(lsx_vsadd_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsadd_wu() { + let a = u32x4::new(1617769210, 1445524000, 4168062781, 912440538); + let b = u32x4::new(3676524021, 3894343575, 904432536, 1616820031); + let r = i64x2::new(-1, -7583652642497232897); + + assert_eq!(r, transmute(lsx_vsadd_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsadd_du() { + let a = u64x2::new(3740778533337193809, 14274264382641271168); + let b = u64x2::new(11054638512585704882, 3549000132135395099); + let r = i64x2::new(-3651327027786652925, -623479558932885349); + + assert_eq!(r, transmute(lsx_vsadd_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavg_b() { + let a = i8x16::new( + 117, 127, 54, 98, -91, 42, 42, 76, 29, 63, -21, 26, -77, -7, -81, 78, + ); + let b = i8x16::new( + 30, 62, -76, -20, 127, 89, -99, -82, 69, -114, 84, 80, -78, -102, -107, 43, + ); + let r = i64x2::new(-152206416164856247, 4369276355735447089); + + assert_eq!(r, transmute(lsx_vavg_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavg_h() { + let a = i16x8::new(-12604, -917, -12088, 13367, -2577, -1073, 1365, -25654); + let b = i16x8::new(-3088, -25854, -32552, -8417, 7808, -12495, 22032, -5168); + let r = i64x2::new(696836182083297626, -4337760619710117321); + + assert_eq!(r, transmute(lsx_vavg_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavg_w() { + let a = i32x4::new(826230751, 1801449269, -284345024, 1777295732); + let b = i32x4::new(-324844828, -1580060766, -1909832882, 328273785); + let r = i64x2::new(475428188150908257, 4521676108535152711); + + assert_eq!(r, transmute(lsx_vavg_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavg_d() { + let a = i64x2::new(1486723108337487211, 6178549804180384276); + let b = i64x2::new(3169904420607189220, 5159962511251707672); + let r = i64x2::new(2328313764472338215, 5669256157716045974); + + assert_eq!(r, transmute(lsx_vavg_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavg_bu() { + let a = u8x16::new( + 84, 85, 64, 60, 241, 96, 145, 145, 51, 253, 205, 150, 135, 87, 248, 55, + ); + let b = u8x16::new( + 179, 216, 158, 135, 196, 75, 59, 209, 8, 58, 142, 152, 16, 220, 199, 21, + ); + let r = i64x2::new(-5663745084945885565, 2801126043194071837); + + assert_eq!(r, transmute(lsx_vavg_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavg_hu() { + let a = u16x8::new(46978, 53346, 32276, 58377, 57638, 42860, 43999, 59924); + let b = u16x8::new(44835, 36733, 12115, 42874, 4819, 12201, 27397, 25394); + let r = i64x2::new(-4196978047981735086, -6439149718662907396); + + assert_eq!(r, transmute(lsx_vavg_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavg_wu() { + let a = u32x4::new(529045804, 31575520, 1599127613, 3465214369); + let b = u32x4::new(160886383, 26081142, 459122380, 2523086630); + let r = i64x2::new(123816739188229069, -5586965600173345916); + + assert_eq!(r, transmute(lsx_vavg_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavg_du() { + let a = u64x2::new(11603952465622489487, 9916150703735650033); + let b = u64x2::new(9749063966076740681, 
5963120178993456389); + let r = i64x2::new(-7770235857859936532, 7939635441364553211); + + assert_eq!(r, transmute(lsx_vavg_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavgr_b() { + let a = i8x16::new( + 42, -6, 89, -102, -107, 103, 13, -3, -19, -93, 0, 0, -17, 70, 54, 86, + ); + let b = i8x16::new( + 8, -32, -122, 22, -94, 44, 58, 54, -26, -34, -21, 27, -111, -96, -68, -122, + ); + let r = i64x2::new(1883712581662731545, -1226681417271426582); + + assert_eq!(r, transmute(lsx_vavgr_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavgr_h() { + let a = i16x8::new(-6008, 3940, -4691, -4052, 15265, -7180, 976, 11656); + let b = i16x8::new(-9758, -8332, 20577, 31066, 31120, 14788, -22323, 16722); + let r = i64x2::new(3801916629507170613, 3994084079587580569); + + assert_eq!(r, transmute(lsx_vavgr_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavgr_w() { + let a = i32x4::new(-518881442, 2037406651, -1244322310, -1948025633); + let b = i32x4::new(1278058715, -155858446, -195547847, -750518746); + let r = i64x2::new(4040594005688324125, -5795079921582298726); + + assert_eq!(r, transmute(lsx_vavgr_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavgr_d() { + let a = i64x2::new(-1958143381023430514, 3633380184275298119); + let b = i64x2::new(8758126674980055299, -7441643514470614533); + let r = i64x2::new(3399991646978312393, -1904131665097658207); + + assert_eq!(r, transmute(lsx_vavgr_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavgr_bu() { + let a = u8x16::new( + 205, 114, 125, 237, 6, 194, 197, 217, 10, 191, 130, 30, 247, 116, 199, 100, + ); + let b = u8x16::new( + 6, 139, 195, 209, 115, 27, 109, 34, 91, 48, 166, 147, 170, 83, 9, 65, + ); + let r = i64x2::new(9122444831751176042, 6010164553039771699); + + assert_eq!(r, transmute(lsx_vavgr_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavgr_hu() { + let a = u16x8::new(49326, 55416, 46414, 26192, 61759, 37293, 22943, 26741); + let b = u16x8::new(26111, 34713, 61420, 23702, 29204, 9543, 62786, 7043); + let r = i64x2::new(7022187818705851223, 4754859411904311722); + + assert_eq!(r, transmute(lsx_vavgr_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavgr_wu() { + let a = u32x4::new(3560278529, 2406185766, 3420917939, 1379681517); + let b = u32x4::new(1930150361, 3668628165, 2983921396, 2410913126); + let r = i64x2::new(-5401180487351753235, 8140240017388800980); + + assert_eq!(r, transmute(lsx_vavgr_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vavgr_du() { + let a = u64x2::new(3442342130569215862, 4810216499730807927); + let b = u64x2::new(8650759135311802962, 11380630663742852932); + let r = i64x2::new(6046550632940509412, 8095423581736830430); + + assert_eq!(r, transmute(lsx_vavgr_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssub_b() { + let a = i8x16::new( + 49, 58, 94, 93, 7, 40, -34, 27, 75, -67, -71, 2, -117, -22, 78, -78, + ); + let b = i8x16::new( + -104, 71, -79, -113, 21, 34, 36, 19, 92, 32, -77, 91, 28, -43, -69, 62, + ); + let r = i64x2::new(628822736562549631, -9187601072510296593); + + assert_eq!(r, transmute(lsx_vssub_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssub_h() { + let a = i16x8::new(14676, -4176, 31759, 
-22564, 6643, 20831, 15260, 18518); + let b = i16x8::new(-26027, 6118, -13204, 25080, 12458, 8441, 24701, 11617); + let r = i64x2::new(-9223231300041015297, 1942699741282756937); + + assert_eq!(r, transmute(lsx_vssub_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssub_w() { + let a = i32x4::new(-359085176, -924784873, 1280567100, 1138686008); + let b = i32x4::new(-1808829767, 2144666490, 146236682, 1180114488); + let r = i64x2::new(-9223372035405031217, -177933965588659662); + + assert_eq!(r, transmute(lsx_vssub_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssub_d() { + let a = i64x2::new(628092957162650618, 1527439654680677883); + let b = i64x2::new(-2293337525465880409, 5736255249834646932); + let r = i64x2::new(2921430482628531027, -4208815595153969049); + + assert_eq!(r, transmute(lsx_vssub_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssub_bu() { + let a = u8x16::new( + 198, 146, 80, 65, 122, 45, 61, 106, 212, 129, 170, 111, 183, 102, 130, 148, + ); + let b = u8x16::new( + 16, 110, 145, 170, 113, 220, 82, 86, 9, 255, 200, 230, 204, 22, 213, 203, + ); + let r = i64x2::new(1441151919413273782, 87960930222283); + + assert_eq!(r, transmute(lsx_vssub_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssub_hu() { + let a = u16x8::new(62355, 31259, 41090, 62278, 449, 36606, 38644, 57485); + let b = u16x8::new(50468, 33060, 15257, 59071, 59343, 21993, 42978, 20097); + let r = i64x2::new(902801202201243247, -7922957643493867520); + + assert_eq!(r, transmute(lsx_vssub_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssub_wu() { + let a = u32x4::new(360162968, 3504892941, 1150347916, 2195977376); + let b = u32x4::new(31483972, 3489479082, 152079374, 1875131600); + let r = i64x2::new(66202020638834260, 1378022115978010238); + + assert_eq!(r, transmute(lsx_vssub_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssub_du() { + let a = u64x2::new(14887776146288736271, 417684393846230822); + let b = u64x2::new(6460869225596371206, 16765308520486969885); + let r = i64x2::new(8426906920692365065, 0); + + assert_eq!(r, transmute(lsx_vssub_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vabsd_b() { + let a = i8x16::new( + -80, -35, -110, -126, -9, -18, -111, -50, -68, 115, -53, 79, -35, 102, -85, 68, + ); + let b = i8x16::new( + 85, -87, -91, 4, -102, 47, 70, 8, -16, 86, -14, -127, 2, -58, 10, 39, + ); + let r = i64x2::new(4230359294854509733, 2116586434120326452); + + assert_eq!(r, transmute(lsx_vabsd_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vabsd_h() { + let a = i16x8::new(-9487, 3116, 31071, -3514, -4374, 29502, 15788, 8887); + let b = i16x8::new(9346, 27961, 21592, 10762, -6831, 17219, 14968, -1750); + let r = i64x2::new(4018377481144584593, 2994052849949411737); + + assert_eq!(r, transmute(lsx_vabsd_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vabsd_w() { + let a = i32x4::new(1772435833, -142335623, -905419863, -1391379125); + let b = i32x4::new(-638463360, -1154268425, 818053243, -1766966029); + let r = i64x2::new(4346218292750542585, 1613133471209364690); + + assert_eq!(r, transmute(lsx_vabsd_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vabsd_d() { + let a = 
i64x2::new(-1345697660428932390, -6981332546532147421); + let b = i64x2::new(-8533946706796471089, 1165272962517390961); + let r = i64x2::new(7188249046367538699, 8146605509049538382); + + assert_eq!(r, transmute(lsx_vabsd_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vabsd_bu() { + let a = u8x16::new( + 3, 31, 230, 199, 201, 67, 112, 189, 15, 214, 56, 113, 214, 23, 217, 54, + ); + let b = u8x16::new( + 207, 196, 133, 201, 150, 94, 74, 221, 222, 61, 222, 248, 105, 208, 154, 128, + ); + let r = i64x2::new(2316568964225934796, 5350198762417854927); + + assert_eq!(r, transmute(lsx_vabsd_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vabsd_hu() { + let a = u16x8::new(30314, 20737, 52964, 57347, 14004, 37245, 9170, 22466); + let b = u16x8::new(42102, 40052, 6807, 16289, 29686, 38061, 42843, 26642); + let r = i64x2::new(-6889746235852116468, 1175584127230950722); + + assert_eq!(r, transmute(lsx_vabsd_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vabsd_wu() { + let a = u32x4::new(1481954749, 4094293310, 3199531334, 4211151920); + let b = u32x4::new(3008439409, 976530727, 1726048801, 4235308512); + let r = i64x2::new(-5056055741505581388, 103751774096297765); + + assert_eq!(r, transmute(lsx_vabsd_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vabsd_du() { + let a = u64x2::new(14212221485552223583, 1471016340493959617); + let b = u64x2::new(305704565845198935, 18327726360649467511); + let r = i64x2::new(-4540227154002526968, -1590034053554043722); + + assert_eq!(r, transmute(lsx_vabsd_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmul_b() { + let a = i8x16::new( + -108, -77, -99, -81, 97, 59, -58, 100, 104, -89, -58, -96, -25, 125, 127, -61, + ); + let b = i8x16::new( + 64, 109, -119, -124, -55, -11, -90, -123, 72, -18, 83, 46, 102, -25, -11, 27, + ); + let r = i64x2::new(-836412611799730432, -7959044669412588992); + + assert_eq!(r, transmute(lsx_vmul_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmul_h() { + let a = i16x8::new(20255, 19041, 15158, 5077, -29421, -8508, 6583, -968); + let b = i16x8::new(-18582, -25667, 17674, 8424, -17121, -21798, 28934, -353); + let r = i64x2::new(-7419436171490628650, 3947512047518358605); + + assert_eq!(r, transmute(lsx_vmul_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmul_w() { + let a = i32x4::new(1875532791, -2038975148, 754073945, 1245315915); + let b = i32x4::new(1754730718, 782084571, 894216679, -1895747372); + let r = i64x2::new(6602438528086061106, 4680306660704041039); + + assert_eq!(r, transmute(lsx_vmul_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmul_d() { + let a = i64x2::new(-4093110041189429887, 5371368149814248867); + let b = i64x2::new(8096709215426138432, -5454415917204378153); + let r = i64x2::new(-1062747544199352000, -649255846668983579); + + assert_eq!(r, transmute(lsx_vmul_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmadd_b() { + let a = i8x16::new( + 60, 90, -59, 50, 52, 30, -124, 62, -71, -71, -38, 22, 6, -18, 93, 102, + ); + let b = i8x16::new( + 22, 41, -112, 44, -93, -82, 11, -47, 37, -120, -108, 33, -66, 27, -74, -2, + ); + let c = i8x16::new( + 103, 59, 65, -2, -55, 98, -11, 85, 84, 50, -17, 14, -19, 120, 7, -90, + ); + let r = 
i64x2::new(-6698055306094195434, 1898151712142019037); + + assert_eq!( + r, + transmute(lsx_vmadd_b(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmadd_h() { + let a = i16x8::new(24257, 11879, -5695, -12734, -31748, 30664, 11820, 3259); + let b = i16x8::new(23734, 11732, -14134, -26857, 30756, 2629, 25687, 15749); + let c = i16x8::new(-9000, -804, 10411, 17571, -4985, -22809, -5536, -1762); + let r = i64x2::new(2154858825190408273, -6966693911367840008); + + assert_eq!( + r, + transmute(lsx_vmadd_h(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmadd_w() { + let a = i32x4::new(1344709991, 1633778942, 1825268167, 917193207); + let b = i32x4::new(147354288, -1478483633, -941638228, -173023515); + let c = i32x4::new(-1301057792, -1104623642, -1440212635, -8186971); + let r = i64x2::new(4970798576846304615, -3981205637140381021); + + assert_eq!( + r, + transmute(lsx_vmadd_w(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmadd_d() { + let a = i64x2::new(-7021558423493045864, 7607197079929138141); + let b = i64x2::new(-7461017148544541027, -326746346508808472); + let c = i64x2::new(9019083511238971943, 8084580083589700502); + let r = i64x2::new(-7790478971542305405, -5909066061947936819); + + assert_eq!( + r, + transmute(lsx_vmadd_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmsub_b() { + let a = i8x16::new( + -114, -46, 82, -75, -22, 31, 79, 84, -108, -13, -40, -121, -2, -20, 75, -35, + ); + let b = i8x16::new( + -29, 61, -62, 87, -22, 53, 51, 24, -27, -74, 119, -20, 21, 5, 14, -92, + ); + let c = i8x16::new( + -57, 111, 112, -66, 100, -31, -70, -71, 92, 63, 108, 61, -115, 17, -75, 16, + ); + let r = i64x2::new(-269782211120439527, -7105106341430810296); + + assert_eq!( + r, + transmute(lsx_vmsub_b(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmsub_h() { + let a = i16x8::new(28727, 27408, -23829, -25297, 24892, 31830, -2674, -17919); + let b = i16x8::new(6329, 13060, 18913, 18407, 28125, -26009, -14135, 22627); + let c = i16x8::new(26144, 29029, 6084, 10072, 21090, -4197, 21706, -19485); + let r = i64x2::new(-5420122113954766057, 2393824782223771810); + + assert_eq!( + r, + transmute(lsx_vmsub_h(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmsub_w() { + let a = i32x4::new(385413537, 143148625, 1902013465, -1637986171); + let b = i32x4::new(-1124183308, 1253368192, 1310051041, -750553442); + let c = i32x4::new(921070544, 1408695249, -136396947, -1525372302); + let r = i64x2::new(-9168294401733980319, -6685995888074347700); + + assert_eq!( + r, + transmute(lsx_vmsub_w(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmsub_d() { + let a = i64x2::new(-5022267712807149796, 8788062746333130381); + let b = i64x2::new(594946727227821886, -4907188100068238790); + let c = i64x2::new(-5753096081940451712, 2150588928473907718); + let r = i64x2::new(-734195902542963684, -4942536302810424015); + + assert_eq!( + r, + transmute(lsx_vmsub_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vdiv_b() { + let a = i8x16::new( + 56, 78, 12, -67, -45, -79, 3, -81, 85, 97, 41, -86, 106, -102, 35, 59, + ); + let b = i8x16::new( + 48, -92, -93, -74, -32, 113, 86, -8, -99, -21, 
-14, -19, 124, -113, 29, -120, + ); + let r = i64x2::new(720575944674246657, 281475060530176); + + assert_eq!(r, transmute(lsx_vdiv_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vdiv_h() { + let a = i16x8::new(17409, -1878, -20289, -20815, 23275, 32438, 27688, 29943); + let b = i16x8::new(-11221, 24673, 19931, 3799, -3251, -21373, -13758, -31286); + let r = i64x2::new(-1125904201744385, 281470681743353); + + assert_eq!(r, transmute(lsx_vdiv_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vdiv_w() { + let a = i32x4::new(912619458, 297234237, 1790081728, 1556369143); + let b = i32x4::new(-775731190, 1887886939, 1001718213, 1135075421); + let r = i64x2::new(4294967295, 4294967297); + + assert_eq!(r, transmute(lsx_vdiv_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vdiv_d() { + let a = i64x2::new(8060378764891126625, 720122833079320324); + let b = i64x2::new(-9175012156877545557, -6390704898809702209); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vdiv_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vdiv_bu() { + let a = u8x16::new( + 153, 216, 32, 99, 9, 152, 44, 162, 131, 155, 164, 32, 248, 152, 88, 220, + ); + let b = u8x16::new( + 27, 125, 253, 245, 104, 196, 141, 201, 107, 65, 51, 126, 107, 90, 130, 185, + ); + let r = i64x2::new(261, 72058702139687425); + + assert_eq!(r, transmute(lsx_vdiv_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vdiv_hu() { + let a = u16x8::new(47825, 17349, 21777, 60576, 31104, 31380, 8974, 51905); + let b = u16x8::new(25282, 44917, 13706, 63351, 58837, 46710, 29092, 57823); + let r = i64x2::new(4294967297, 0); + + assert_eq!(r, transmute(lsx_vdiv_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vdiv_wu() { + let a = u32x4::new(1861719625, 952645030, 2402876315, 3695614684); + let b = u32x4::new(1130189258, 1211056894, 2357258312, 3855913706); + let r = i64x2::new(1, 1); + + assert_eq!(r, transmute(lsx_vdiv_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vdiv_du() { + let a = u64x2::new(7958239212167095743, 5349587769754015194); + let b = u64x2::new(14945948123666054968, 10864054932328247404); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vdiv_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhaddw_h_b() { + let a = i8x16::new( + 33, -91, 3, -119, 28, -34, -19, -51, 41, -83, 102, 116, 45, 50, -94, 121, + ); + let b = i8x16::new( + 49, 50, 108, -49, -44, -25, 99, 7, -101, 39, -125, 11, -21, -99, -123, 29, + ); + let r = i64x2::new(13791943145684950, -562821104926904); + + assert_eq!(r, transmute(lsx_vhaddw_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhaddw_w_h() { + let a = i16x8::new(-20323, -26647, 21748, 24233, 27893, -27604, 16391, 14873); + let b = i16x8::new( + -10851, -15249, -11124, -22012, -32205, -17044, 27739, -19038, + ); + let r = i64x2::new(56307021213062, 183021441324639); + + assert_eq!(r, transmute(lsx_vhaddw_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhaddw_d_w() { + let a = i32x4::new(1127296124, -1382562520, -1791538949, 534516309); + let b = i32x4::new(-1119468785, -1334232049, -1752131604, -2016112631); + let r = i64x2::new(-2502031305, -1217615295); + + assert_eq!(r, 
transmute(lsx_vhaddw_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhaddw_hu_bu() { + let a = u8x16::new( + 72, 148, 45, 246, 151, 252, 69, 31, 91, 247, 215, 57, 125, 49, 141, 27, + ); + let b = u8x16::new( + 76, 120, 158, 172, 253, 12, 131, 16, 18, 131, 114, 207, 1, 100, 48, 141, + ); + let r = i64x2::new(45601115212087520, 21110838012870921); + + assert_eq!(r, transmute(lsx_vhaddw_hu_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhaddw_wu_hu() { + let a = u16x8::new(46665, 29041, 34462, 31370, 18289, 12579, 33777, 52188); + let b = u16x8::new(40369, 53005, 64424, 35720, 9231, 19965, 20662, 8208); + let r = i64x2::new(411432097222434, 312888367535410); + + assert_eq!(r, transmute(lsx_vhaddw_wu_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhaddw_du_wu() { + let a = u32x4::new(3058953381, 3443284865, 3364703869, 2180288462); + let b = u32x4::new(728838120, 1267673009, 2659634151, 2264611356); + let r = i64x2::new(4172122985, 4839922613); + + assert_eq!(r, transmute(lsx_vhaddw_du_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhsubw_h_b() { + let a = i8x16::new( + 20, -94, 56, 36, -78, -53, -65, 62, -23, 3, -26, 16, -36, 92, -87, -21, + ); + let b = i8x16::new( + -45, -92, 19, 45, -108, 44, 78, -127, -49, 23, -6, -3, 24, -8, 90, 51, + ); + let r = i64x2::new(-4503363402989617, -31243430355664844); + + assert_eq!(r, transmute(lsx_vhsubw_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhsubw_w_h() { + let a = i16x8::new(-32636, -15640, 17489, 24551, 28768, 8187, -7376, -16756); + let b = i16x8::new(-14204, -13312, 8240, -4455, -6362, -4711, -30790, -15773); + let r = i64x2::new(70059506530916, 60275571046613); + + assert_eq!(r, transmute(lsx_vhsubw_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhsubw_d_w() { + let a = i32x4::new(-1518455529, -1873161613, -1441786902, 713965134); + let b = i32x4::new(-1671723008, 870456702, 264823818, 13322401); + let r = i64x2::new(-201438605, 449141316); + + assert_eq!(r, transmute(lsx_vhsubw_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhsubw_hu_bu() { + let a = u8x16::new( + 67, 78, 163, 156, 17, 58, 245, 19, 180, 161, 166, 207, 240, 5, 221, 157, + ); + let b = u8x16::new( + 122, 131, 70, 56, 162, 5, 241, 241, 43, 5, 7, 236, 195, 26, 6, 17, + ); + let r = i64x2::new(-62206416523952172, 42783380429340790); + + assert_eq!(r, transmute(lsx_vhsubw_hu_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhsubw_wu_hu() { + let a = u16x8::new(48161, 61606, 48243, 42252, 5643, 40672, 13711, 1172); + let b = u16x8::new(5212, 32159, 36502, 59290, 7604, 229, 35511, 47443); + let r = i64x2::new(24696062008394, -147484881944276); + + assert_eq!(r, transmute(lsx_vhsubw_wu_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhsubw_du_wu() { + let a = u32x4::new(2721083043, 781151638, 4268150742, 392308867); + let b = u32x4::new(1383087137, 2403951939, 360532131, 3513614550); + let r = i64x2::new(-601935499, 31776736); + + assert_eq!(r, transmute(lsx_vhsubw_du_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmod_b() { + let a = i8x16::new( + -89, -117, 89, -114, -65, 67, -20, 38, -38, -118, 30, 91, -16, -100, -109, -35, + ); + let b = i8x16::new( + 94, -92, -13, 
26, -6, -121, 39, -114, 74, -108, 95, 108, -65, -21, 67, 92, + ); + let r = i64x2::new(2804691417388804007, -2461515231199824166); + + assert_eq!(r, transmute(lsx_vmod_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmod_h() { + let a = i16x8::new(-29453, 12108, 10947, 28516, 4854, 1994, -30042, -18472); + let b = i16x8::new(1550, 9221, -12080, 14553, -24847, 28286, 1074, 192); + let r = i64x2::new(3930282117007147005, -10982007906888970); + + assert_eq!(r, transmute(lsx_vmod_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmod_w() { + let a = i32x4::new(-2061299866, -1170666395, -1617297141, 594549537); + let b = i32x4::new(344507881, 1692387020, -1397506903, -1257953510); + let r = i64x2::new(-5027973877095011085, 2553570821342119010); + + assert_eq!(r, transmute(lsx_vmod_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmod_d() { + let a = i64x2::new(-6018318621764124581, -5715738494441059378); + let b = i64x2::new(4636642606889723746, -259899475747531088); + let r = i64x2::new(-1381676014874400835, -257849503742906530); + + assert_eq!(r, transmute(lsx_vmod_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmod_bu() { + let a = u8x16::new( + 122, 163, 72, 171, 64, 10, 201, 101, 196, 162, 190, 86, 253, 173, 221, 65, + ); + let b = u8x16::new( + 186, 243, 157, 205, 48, 190, 55, 245, 72, 203, 140, 64, 8, 25, 252, 227, + ); + let r = i64x2::new(7287961163701724026, 4745974892933063220); + + assert_eq!(r, transmute(lsx_vmod_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmod_hu() { + let a = u16x8::new(26509, 32785, 35218, 8560, 18289, 13375, 35585, 60973); + let b = u16x8::new(15317, 24954, 61354, 3720, 21471, 6193, 8193, 35745); + let r = i64x2::new(315403234587388856, 7101062794264266609); + + assert_eq!(r, transmute(lsx_vmod_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmod_wu() { + let a = u32x4::new(3940871454, 2498938081, 2241198148, 777660345); + let b = u32x4::new(49228057, 2249712923, 358897384, 1782599598); + let r = i64x2::new(1070413902953059662, 3340025749258890964); + + assert_eq!(r, transmute(lsx_vmod_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmod_du() { + let a = u64x2::new(7747010922784437137, 16089799939101946183); + let b = u64x2::new(16850073055169051895, 16069565262862467484); + let r = i64x2::new(7747010922784437137, 20234676239478699); + + assert_eq!(r, transmute(lsx_vmod_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vreplve_b() { + let a = i8x16::new( + -62, -110, -89, -84, -11, -37, 90, -28, -41, -37, -53, 123, -55, 22, 20, -80, + ); + let r = i64x2::new(-2893606913523066921, -2893606913523066921); + + assert_eq!(r, transmute(lsx_vreplve_b(transmute(a), -8))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vreplve_h() { + let a = i16x8::new(-29429, -23495, 8705, -7614, -25353, 11887, -25989, -12818); + let r = i64x2::new(-3607719825936298514, -3607719825936298514); + + assert_eq!(r, transmute(lsx_vreplve_h(transmute(a), 7))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vreplve_w() { + let a = i32x4::new(1584940676, 95787593, -1655264847, 682404402); + let r = i64x2::new(411404579393346121, 411404579393346121); + + assert_eq!(r, transmute(lsx_vreplve_w(transmute(a), -3))); +} + +#[simd_test(enable = "lsx")] +unsafe fn 
test_lsx_vreplve_d() { + let a = i64x2::new(7614424214598615675, -7096892795239148002); + let r = i64x2::new(7614424214598615675, 7614424214598615675); + + assert_eq!(r, transmute(lsx_vreplve_d(transmute(a), 0))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vreplvei_b() { + let a = i8x16::new( + 62, -120, 10, 58, 124, -30, 57, -78, -114, 6, -39, 46, 58, -72, -44, 21, + ); + let r = i64x2::new(-2097865012304223518, -2097865012304223518); + + assert_eq!(r, transmute(lsx_vreplvei_b::<5>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vreplvei_h() { + let a = i16x8::new(-15455, -4410, 5029, 25863, -23170, 26570, 27423, -834); + let r = i64x2::new(7719006069021698847, 7719006069021698847); + + assert_eq!(r, transmute(lsx_vreplvei_h::<6>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vreplvei_w() { + let a = i32x4::new(1843143434, 491125746, -328585251, -1996512058); + let r = i64x2::new(7916240772710277898, 7916240772710277898); + + assert_eq!(r, transmute(lsx_vreplvei_w::<0>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vreplvei_d() { + let a = i64x2::new(4333963848299154309, -8310246545782080694); + let r = i64x2::new(-8310246545782080694, -8310246545782080694); + + assert_eq!(r, transmute(lsx_vreplvei_d::<1>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickev_b() { + let a = i8x16::new( + 89, 84, -94, 3, 41, -86, -10, 120, 62, -102, 44, -88, 12, -75, -13, 65, + ); + let b = i8x16::new( + -31, 44, -76, -76, 52, -71, 44, -110, -4, 124, -38, 76, 108, 43, 54, 60, + ); + let r = i64x2::new(3921750152141124833, -933322373843017127); + + assert_eq!(r, transmute(lsx_vpickev_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickev_h() { + let a = i16x8::new(-5994, -14344, -28338, -25788, 5710, 1638, 494, -2554); + let b = i16x8::new(-5248, -1786, -21768, 23214, -4223, 23538, -24936, -32316); + let r = i64x2::new(-7018596679058658432, 139073165196191894); + + assert_eq!(r, transmute(lsx_vpickev_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickev_w() { + let a = i32x4::new(548489620, -968269400, -179106837, -1739507044); + let b = i32x4::new(-1187277846, -787064901, -980229113, 1746235326); + let r = i64x2::new(-4210051979814398998, -769258006856513132); + + assert_eq!(r, transmute(lsx_vpickev_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickev_d() { + let a = i64x2::new(1789073368466131160, 9168587701455881156); + let b = i64x2::new(6574352346370076190, -3979792156310826694); + let r = i64x2::new(6574352346370076190, 1789073368466131160); + + assert_eq!(r, transmute(lsx_vpickev_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickod_b() { + let a = i8x16::new( + -125, 4, -27, 25, 117, 98, -51, -93, -37, 110, -127, 115, 114, -108, 74, -85, + ); + let b = i8x16::new( + 93, -72, 89, 104, 84, 15, 77, 74, 91, -34, 118, -108, 13, 21, 105, 114, + ); + let r = i64x2::new(8220640377280882872, -6083110277645985532); + + assert_eq!(r, transmute(lsx_vpickod_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickod_h() { + let a = i16x8::new(1454, -18740, 13146, 10497, 4897, 31962, 19208, 21910); + let b = i16x8::new(12047, 25024, -10709, -28077, 24357, 19934, 10289, 28546); + let r = i64x2::new(8035070303515402688, 6167254016163165900); + + assert_eq!(r, transmute(lsx_vpickod_h(transmute(a), 
transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickod_w() { + let a = i32x4::new(869069429, -1916930406, 1864611728, -1640302268); + let b = i32x4::new(-99240403, 314407358, 543396756, 1976776696); + let r = i64x2::new(8490191261129341374, -7045044594236590438); + + assert_eq!(r, transmute(lsx_vpickod_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickod_d() { + let a = i64x2::new(7031942541839550339, -7578696032343374601); + let b = i64x2::new(-4197243771252175958, -543692393753629390); + let r = i64x2::new(-543692393753629390, -7578696032343374601); + + assert_eq!(r, transmute(lsx_vpickod_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vilvh_b() { + let a = i8x16::new( + -58, -103, -5, 33, 124, -24, -18, 20, 22, -100, -6, 16, 40, 89, -41, -37, + ); + let b = i8x16::new( + -42, 76, 46, -4, 67, 45, 99, -7, 63, 20, 113, -50, 67, -23, -20, 112, + ); + let r = i64x2::new(1211180715666052671, -2634368371891034045); + + assert_eq!(r, transmute(lsx_vilvh_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vilvh_h() { + let a = i16x8::new(24338, 259, -22693, 16519, -28272, -16751, 1883, 16217); + let b = i16x8::new(23768, -31845, 28689, 14757, 9499, 7795, -13573, -10011); + let r = i64x2::new(-4714953853167983333, 4564918175499275003); + + assert_eq!(r, transmute(lsx_vilvh_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vilvh_w() { + let a = i32x4::new(-968342074, -1976160649, -1249304918, -279518364); + let b = i32x4::new(-737076987, 38515006, 602108871, -63099569); + let r = i64x2::new(-5365723764939852857, -1200522227779556017); + + assert_eq!(r, transmute(lsx_vilvh_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vilvh_d() { + let a = i64x2::new(2505149669372896333, 5375050218784453679); + let b = i64x2::new(-2160658667838026389, 1449429407527660400); + let r = i64x2::new(1449429407527660400, 5375050218784453679); + + assert_eq!(r, transmute(lsx_vilvh_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vilvl_b() { + let a = i8x16::new( + 57, 109, 61, 96, 101, 69, -42, 118, 112, -17, 63, 68, -54, 32, 17, -122, + ); + let b = i8x16::new( + -48, -30, -102, 100, -3, 85, 100, 46, 82, 67, -20, -56, 93, 96, -39, 108, + ); + let r = i64x2::new(6945744258789947856, 8515979671552484861); + + assert_eq!(r, transmute(lsx_vilvl_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vilvl_h() { + let a = i16x8::new(28844, -23308, 4163, -8033, 12472, -16423, 14534, 31242); + let b = i16x8::new(11601, 6788, 3174, -4208, -25999, -25660, -4591, 7133); + let r = i64x2::new(-6560589601043632815, -2260825085889541018); + + assert_eq!(r, transmute(lsx_vilvl_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vilvl_w() { + let a = i32x4::new(-997094955, 1731171907, 1528236839, -646874689); + let b = i32x4::new(486029703, 1245981961, 112180197, 1939621508); + let r = i64x2::new(-4282490222245561977, 7435326725564935433); + + assert_eq!(r, transmute(lsx_vilvl_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vilvl_d() { + let a = i64x2::new(7063413230460842607, -4234618008113981723); + let b = i64x2::new(3142531875873363679, 736682102982019415); + let r = i64x2::new(3142531875873363679, 7063413230460842607); + + assert_eq!(r, 
transmute(lsx_vilvl_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpackev_b() { + let a = i8x16::new( + 63, 38, -47, 98, 19, 68, -27, 1, 108, 65, 108, 31, -102, 37, -27, 50, + ); + let b = i8x16::new( + 59, 11, -44, 73, -74, -15, 61, 17, -37, 117, -39, 28, 38, 49, -34, -86, + ); + let r = i64x2::new(-1928363389519380677, -1882898104368665381); + + assert_eq!(r, transmute(lsx_vpackev_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpackev_h() { + let a = i16x8::new(26574, -30949, 26762, -28439, 5382, -25386, 5192, -9816); + let b = i16x8::new(-9444, 5210, -14402, 17972, 16606, 2450, 5123, 14727); + let r = i64x2::new(7533052947329899292, 1461440082551914718); + + assert_eq!(r, transmute(lsx_vpackev_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpackev_w() { + let a = i32x4::new(1312465803, -1752635324, -1943199176, -362848304); + let b = i32x4::new(-872903277, 1255047449, -2110158279, 682925573); + let r = i64x2::new(5636997704425442707, -8345976908349339079); + + assert_eq!(r, transmute(lsx_vpackev_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpackev_d() { + let a = i64x2::new(7118943335298607169, 3038173153862744209); + let b = i64x2::new(-9119315954224042738, -4563700463464702181); + let r = i64x2::new(-9119315954224042738, 7118943335298607169); + + assert_eq!(r, transmute(lsx_vpackev_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpackod_b() { + let a = i8x16::new( + 94, -48, 43, -58, -47, 27, -33, 60, 50, -38, 41, -41, 76, -46, 103, -60, + ); + let b = i8x16::new( + -117, -11, 72, -9, -99, -52, -102, -22, -7, -8, 8, -65, 101, 29, 86, 27, + ); + let r = i64x2::new(4389351353151377653, -4315624792288929032); + + assert_eq!(r, transmute(lsx_vpackod_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpackod_h() { + let a = i16x8::new(-18827, 19151, 4246, -15752, -1028, 29166, 3421, -32610); + let b = i16x8::new(-23247, 17928, -13353, -20146, 5696, 22071, -10728, -30262); + let r = i64x2::new(-4433598883325590008, -9178747487946648009); + + assert_eq!(r, transmute(lsx_vpackod_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpackod_w() { + let a = i32x4::new(-1183976810, 11929980, -1445863799, 1567314918); + let b = i32x4::new(445270781, 793617340, -1461557030, -22199234); + let r = i64x2::new(51238874735551420, 6731566319615689790); + + assert_eq!(r, transmute(lsx_vpackod_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpackod_d() { + let a = i64x2::new(-4549504442184266063, -4670773907187480618); + let b = i64x2::new(9039771682296134623, -6404442538060227683); + let r = i64x2::new(-6404442538060227683, -4670773907187480618); + + assert_eq!(r, transmute(lsx_vpackod_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vshuf_h() { + let a = i16x8::new(7, 12, 6, 8, 11, 2, 4, 7); + let b = i16x8::new(19221, 5841, 2738, -31394, -31337, -27662, 24655, 28090); + let c = i16x8::new(27835, 20061, 7214, -10489, -14005, -27870, -12303, 14443); + let r = i64x2::new(5410459163590867051, 4065564413064545630); + + assert_eq!( + r, + transmute(lsx_vshuf_h(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vshuf_w() { + let a = i32x4::new(0, 3, 4, 6); + let b = i32x4::new(921730307, -1175025178, 
241337062, 53139449); + let c = i32x4::new(-67250654, 55397321, 1170999941, 1704507894); + let r = i64x2::new(7320805664731551266, 1036534789524454659); + + assert_eq!( + r, + transmute(lsx_vshuf_w(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vshuf_d() { + let a = i64x2::new(1, 2); + let b = i64x2::new(4033696695079994582, -3146912063343863773); + let c = i64x2::new(-4786751363389755273, 1769232540309840996); + let r = i64x2::new(1769232540309840996, 4033696695079994582); + + assert_eq!( + r, + transmute(lsx_vshuf_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vand_v() { + let a = u8x16::new( + 105, 106, 193, 101, 82, 63, 227, 23, 246, 17, 117, 134, 98, 233, 41, 128, + ); + let b = u8x16::new( + 254, 161, 164, 46, 166, 61, 123, 67, 90, 217, 49, 98, 166, 236, 128, 175, + ); + let r = i64x2::new(244105884219744360, -9223116804091473582); + + assert_eq!(r, transmute(lsx_vand_v(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vandi_b() { + let a = u8x16::new( + 167, 0, 108, 41, 255, 45, 24, 175, 229, 222, 89, 15, 63, 15, 187, 213, + ); + let r = i64x2::new(-8135737750142058361, -7666517314596397435); + + assert_eq!(r, transmute(lsx_vandi_b::<159>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vor_v() { + let a = u8x16::new( + 87, 193, 209, 232, 106, 36, 72, 199, 202, 213, 174, 2, 78, 181, 135, 178, + ); + let b = u8x16::new( + 253, 19, 178, 143, 132, 123, 29, 28, 200, 36, 9, 212, 12, 35, 164, 169, + ); + let r = i64x2::new(-2351582766212852737, -4924766118269159990); + + assert_eq!(r, transmute(lsx_vor_v(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vori_b() { + let a = u8x16::new( + 134, 61, 120, 206, 181, 179, 192, 181, 115, 179, 137, 110, 147, 51, 93, 65, + ); + let r = i64x2::new(-589140355308650538, -3179554720060804109); + + assert_eq!(r, transmute(lsx_vori_b::<210>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vnor_v() { + let a = u8x16::new( + 116, 165, 106, 148, 116, 117, 91, 213, 195, 131, 160, 33, 223, 207, 12, 147, + ); + let b = u8x16::new( + 242, 233, 135, 143, 129, 199, 130, 192, 222, 143, 223, 103, 232, 53, 98, 129, + ); + let r = i64x2::new(3036560889408918025, 7823034030269427744); + + assert_eq!(r, transmute(lsx_vnor_v(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vnori_b() { + let a = u8x16::new( + 142, 138, 177, 202, 121, 170, 99, 149, 251, 153, 234, 191, 10, 185, 182, 212, + ); + let r = i64x2::new(5227628601268782144, 596802560304890884); + + assert_eq!(r, transmute(lsx_vnori_b::<51>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vxor_v() { + let a = u8x16::new( + 33, 58, 188, 69, 128, 23, 145, 174, 229, 254, 21, 227, 196, 131, 115, 100, + ); + let b = u8x16::new( + 10, 61, 91, 105, 232, 114, 191, 215, 83, 11, 124, 157, 132, 242, 94, 59, + ); + let r = i64x2::new(8732028225622312747, 6858262329367852470); + + assert_eq!(r, transmute(lsx_vxor_v(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vxori_b() { + let a = u8x16::new( + 27, 105, 197, 119, 145, 141, 167, 209, 51, 206, 89, 42, 45, 215, 239, 160, + ); + let r = i64x2::new(3478586993001400570, 4687744515358339026); + + assert_eq!(r, transmute(lsx_vxori_b::<225>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitsel_v() { + let a = u8x16::new( + 217, 159, 
221, 209, 154, 9, 59, 230, 33, 109, 205, 229, 188, 222, 1, 94, + ); + let b = u8x16::new( + 49, 116, 245, 6, 184, 146, 9, 1, 133, 27, 12, 4, 47, 11, 8, 133, + ); + let c = u8x16::new( + 140, 105, 10, 4, 218, 82, 128, 160, 67, 218, 139, 14, 248, 53, 35, 81, + ); + let r = i64x2::new(5060668949517432401, 1081087304254897953); + + assert_eq!( + r, + transmute(lsx_vbitsel_v(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbitseli_b() { + let a = u8x16::new( + 224, 93, 78, 91, 41, 115, 130, 96, 34, 22, 227, 254, 0, 44, 237, 193, + ); + let b = u8x16::new( + 138, 4, 83, 190, 229, 199, 235, 99, 62, 236, 201, 78, 160, 181, 45, 187, + ); + let r = i64x2::new(4857631126842327370, 8881540057610709020); + + assert_eq!( + r, + transmute(lsx_vbitseli_b::<65>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vshuf4i_b() { + let a = i8x16::new( + -83, 65, -54, 44, -52, -97, -93, 54, 118, -10, -20, -43, -60, -86, -116, -47, + ); + let r = i64x2::new(3937170420478429898, -3347145886530736916); + + assert_eq!(r, transmute(lsx_vshuf4i_b::<234>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vshuf4i_h() { + let a = i16x8::new(27707, -1094, -15784, -28387, 31634, -12323, -30387, -11480); + let r = i64x2::new(-7989953385787032646, -3231104182470389795); + + assert_eq!(r, transmute(lsx_vshuf4i_h::<209>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vshuf4i_w() { + let a = i32x4::new(768986805, -1036149600, -1196682940, -214444511); + let r = i64x2::new(3302773179299516085, -5139714087882845884); + + assert_eq!(r, transmute(lsx_vshuf4i_w::<160>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vreplgr2vr_b() { + let r = i64x2::new(795741901218843403, 795741901218843403); + + assert_eq!(r, transmute(lsx_vreplgr2vr_b(970839819))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vreplgr2vr_h() { + let r = i64x2::new(-6504141532176800324, -6504141532176800324); + + assert_eq!(r, transmute(lsx_vreplgr2vr_h(93693372))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vreplgr2vr_w() { + let r = i64x2::new(-6737078705572473188, -6737078705572473188); + + assert_eq!(r, transmute(lsx_vreplgr2vr_w(-1568598372))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vreplgr2vr_d() { + let r = i64x2::new(5000134708087557572, 5000134708087557572); + + assert_eq!(r, transmute(lsx_vreplgr2vr_d(5000134708087557572))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpcnt_b() { + let a = i8x16::new( + 29, -96, 22, 17, 38, -51, -97, 82, 17, -82, -30, -42, -44, 107, -51, 80, + ); + let r = i64x2::new(217867142450840068, 145528077781566722); + + assert_eq!(r, transmute(lsx_vpcnt_b(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpcnt_h() { + let a = i16x8::new(-512, 10388, -21267, -27094, 1085, -26444, -29360, -11576); + let r = i64x2::new(1970367786975239, 1970350607237126); + + assert_eq!(r, transmute(lsx_vpcnt_h(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpcnt_w() { + let a = i32x4::new(1399276601, -2094725994, -100739325, -1239551533); + let r = i64x2::new(47244640271, 81604378645); + + assert_eq!(r, transmute(lsx_vpcnt_w(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpcnt_d() { + let a = i64x2::new(-4470823169399930539, 3184270543884128372); + let r = i64x2::new(29, 25); + + assert_eq!(r, transmute(lsx_vpcnt_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] 
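The broadcast and bit-count constants in the auto-generated tests above can be reproduced with plain scalar Rust, which is a convenient sanity check when reviewing this vendored file. A minimal sketch (illustrative only, not part of the stdarch diff), assuming `lsx_vreplgr2vr_b` broadcasts the low byte of its general-register argument to every lane and `lsx_vpcnt_d` takes a per-lane population count:

// Scalar cross-check of two expectations from the tests above.
fn main() {
    // lsx_vreplgr2vr_b(970839819): the low byte 0x0B repeated across a 64-bit half.
    let byte = 970839819u32 as u8 as u64;          // 0x0B
    let half = byte * 0x0101_0101_0101_0101;       // 0x0B0B0B0B0B0B0B0B
    assert_eq!(half as i64, 795741901218843403);   // value asserted in test_lsx_vreplgr2vr_b

    // lsx_vpcnt_d: per-lane popcount of the i64x2 input used in test_lsx_vpcnt_d.
    assert_eq!((-4470823169399930539i64).count_ones(), 29);
    assert_eq!(3184270543884128372i64.count_ones(), 25);
}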
+unsafe fn test_lsx_vclo_b() { + let a = i8x16::new( + 94, 66, -88, -43, 113, 10, 5, -96, 96, 78, 3, -30, -24, -29, 20, 115, + ); + let r = i64x2::new(72057594071547904, 3311470116864); + + assert_eq!(r, transmute(lsx_vclo_b(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vclo_h() { + let a = i16x8::new(-5432, 27872, -9150, 27393, 25236, 1028, -21312, -25189); + let r = i64x2::new(8589934595, 281479271677952); + + assert_eq!(r, transmute(lsx_vclo_h(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vclo_w() { + let a = i32x4::new(1214322611, -1755838761, -1222326743, -1511364419); + let r = i64x2::new(4294967296, 4294967297); + + assert_eq!(r, transmute(lsx_vclo_w(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vclo_d() { + let a = i64x2::new(-249299854527467825, -459308653408461862); + let r = i64x2::new(6, 5); + + assert_eq!(r, transmute(lsx_vclo_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vclz_b() { + let a = i8x16::new( + -103, -39, -51, -74, -68, 126, -124, 33, 30, 54, -46, -53, -9, 96, 17, 74, + ); + let r = i64x2::new(144116287587483648, 72903118479688195); + + assert_eq!(r, transmute(lsx_vclz_b(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vclz_h() { + let a = i16x8::new(1222, 32426, 3164, -10763, 10189, -4197, -21841, -28676); + let r = i64x2::new(17179934725, 2); + + assert_eq!(r, transmute(lsx_vclz_h(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vclz_w() { + let a = i32x4::new(-490443689, -1039971379, -217310592, -1921086575); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vclz_w(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vclz_d() { + let a = i64x2::new(4630351532137644314, -6587611980764816064); + let r = i64x2::new(1, 0); + + assert_eq!(r, transmute(lsx_vclz_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickve2gr_b() { + let a = i8x16::new( + 119, 126, -107, -59, 22, -27, -67, 39, -66, -101, 34, -26, -16, 61, 20, 51, + ); + let r: i32 = 51; + + assert_eq!(r, transmute(lsx_vpickve2gr_b::<15>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickve2gr_h() { + let a = i16x8::new(-12924, 31013, 18171, 20404, 21226, 14128, -6255, 26521); + let r: i32 = 21226; + + assert_eq!(r, transmute(lsx_vpickve2gr_h::<4>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickve2gr_w() { + let a = i32x4::new(-1559379275, 2065542381, -1882161334, 1502157419); + let r: i32 = -1882161334; + + assert_eq!(r, transmute(lsx_vpickve2gr_w::<2>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickve2gr_d() { + let a = i64x2::new(-6941380853339482104, 8405634758774935528); + let r: i64 = -6941380853339482104; + + assert_eq!(r, transmute(lsx_vpickve2gr_d::<0>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickve2gr_bu() { + let a = i8x16::new( + 18, -111, 100, 2, -105, 20, 92, -40, -57, 117, 6, -119, -94, 86, -52, 35, + ); + let r: u32 = 199; + + assert_eq!(r, transmute(lsx_vpickve2gr_bu::<8>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickve2gr_hu() { + let a = i16x8::new(25003, 5139, -12977, 7550, -12177, 19294, -2216, 12693); + let r: u32 = 25003; + + assert_eq!(r, transmute(lsx_vpickve2gr_hu::<0>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickve2gr_wu() { + let a = i32x4::new(-295894883, 551663550, -710853968, 82692774); + let r: u32 
= 3999072413; + + assert_eq!(r, transmute(lsx_vpickve2gr_wu::<0>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpickve2gr_du() { + let a = i64x2::new(748282319555413922, -1352335765832355666); + let r: u64 = 748282319555413922; + + assert_eq!(r, transmute(lsx_vpickve2gr_du::<0>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vinsgr2vr_b() { + let a = i8x16::new( + 58, 12, -107, 35, 111, -15, -99, 117, 119, 92, -18, 32, -44, -34, 53, -34, + ); + let r = i64x2::new(8475195533421775930, -2423536021788533641); + + assert_eq!( + r, + transmute(lsx_vinsgr2vr_b::<14>(transmute(a), 1333652061)) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vinsgr2vr_h() { + let a = i16x8::new(-20591, 7819, 25287, -11296, 4604, 28833, -1306, 6418); + let r = i64x2::new(-3179432729573085295, 1806782266980897276); + + assert_eq!(r, transmute(lsx_vinsgr2vr_h::<5>(transmute(a), -987420193))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vinsgr2vr_w() { + let a = i32x4::new(1608179655, 886830932, -621638499, 2021214690); + let r = i64x2::new(3808909851629379527, 8681050995079237782); + + assert_eq!(r, transmute(lsx_vinsgr2vr_w::<2>(transmute(a), -960507754))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vinsgr2vr_d() { + let a = i64x2::new(-6562091001143116290, -2425423285843953307); + let r = i64x2::new(-6562091001143116290, -233659266); + + assert_eq!(r, transmute(lsx_vinsgr2vr_d::<1>(transmute(a), -233659266))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfadd_s() { + let a = u32x4::new(1063501234, 1064367472, 1065334422, 1012846272); + let b = u32x4::new(1050272808, 1054022924, 1064036136, 1063113730); + let r = i64x2::new(4588396142719948771, 4567018621615066847); + + assert_eq!(r, transmute(lsx_vfadd_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfadd_d() { + let a = u64x2::new(4602410992567934854, 4605792798803129629); + let b = u64x2::new(4605819027271079334, 4601207158507578498); + let r = i64x2::new(4608685566198055604, 4608371493448991663); + + assert_eq!(r, transmute(lsx_vfadd_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfsub_s() { + let a = u32x4::new(1064451273, 1059693825, 1036187576, 1050580506); + let b = u32x4::new(1063475462, 1045836432, 1065150677, 1042376676); + let r = i64x2::new(4532926601401089072, 4475386505810184670); + + assert_eq!(r, transmute(lsx_vfsub_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfsub_d() { + let a = u64x2::new(4601910797424251354, 4606993182294978423); + let b = u64x2::new(4605973926398825814, 4600156145303017004); + let r = i64x2::new(-4622342180736116526, 4603750919602422881); + + assert_eq!(r, transmute(lsx_vfsub_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfmul_s() { + let a = u32x4::new(1060566900, 1061147127, 1010818944, 1053672244); + let b = u32x4::new(1065241951, 1044285812, 1050678216, 1009264512); + let r = i64x2::new(4471727895898079441, 4289440988347233543); + + assert_eq!(r, transmute(lsx_vfmul_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfmul_d() { + let a = u64x2::new(4593483834506733144, 4602939512559809908); + let b = u64x2::new(4605208047666947899, 4599634375243914522); + let r = i64x2::new(4591550625791030606, 4595475933048682142); + + assert_eq!(r, transmute(lsx_vfmul_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe 
fn test_lsx_vfdiv_s() { + let a = u32x4::new(1057501460, 1051070718, 1065221347, 1051828876); + let b = u32x4::new(1055538538, 1042248668, 1061233585, 1063649172); + let r = i64x2::new(4613180427594946541, 4523223175100126088); + + assert_eq!(r, transmute(lsx_vfdiv_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfdiv_d() { + let a = u64x2::new(4591718910407182664, 4607068478646496456); + let b = u64x2::new(4606326032528596062, 4601783079746725386); + let r = i64x2::new(4592460108638699314, 4612120084672695832); + + assert_eq!(r, transmute(lsx_vfdiv_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcvt_h_s() { + let a = u32x4::new(1020611712, 1046448896, 1062035346, 1052255382); + let b = u32x4::new(1049501482, 1043939972, 1042291392, 1041250232); + let r = i64x2::new(3495410141992989809, 3873441386606634666); + + assert_eq!(r, transmute(lsx_vfcvt_h_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcvt_s_d() { + let a = u64x2::new(4586066291858051968, 4597324798333789044); + let b = u64x2::new(4600251021237488420, 4593890179408150924); + let r = i64x2::new(4469319308295208818, 4496796258465732597); + + assert_eq!(r, transmute(lsx_vfcvt_s_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfmin_s() { + let a = u32x4::new(1016310272, 1064492378, 1043217948, 1060534856); + let b = u32x4::new(1060093085, 1026130528, 1057322097, 1057646773); + let r = i64x2::new(4407197060203522560, 4542558301798153756); + + assert_eq!(r, transmute(lsx_vfmin_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfmin_d() { + let a = u64x2::new(4603437440563473519, 4603158282529654079); + let b = u64x2::new(4584808359801648672, 4602712060570539582); + let r = i64x2::new(4584808359801648672, 4602712060570539582); + + assert_eq!(r, transmute(lsx_vfmin_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfmina_s() { + let a = u32x4::new(1061417856, 1052257408, 1056830440, 1055199170); + let b = u32x4::new(1049119234, 1058336224, 1057046116, 1029386720); + let r = i64x2::new(4519411155382848002, 4421182298393539560); + + assert_eq!(r, transmute(lsx_vfmina_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfmina_d() { + let a = u64x2::new(4599160304044702024, 4603774209349450318); + let b = u64x2::new(4599088744110071826, 4598732503789588496); + let r = i64x2::new(4599088744110071826, 4598732503789588496); + + assert_eq!(r, transmute(lsx_vfmina_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfmax_s() { + let a = u32x4::new(1054002242, 1061130492, 1034716288, 1064963760); + let b = u32x4::new(1042175760, 1040826492, 1059132266, 1050815434); + let r = i64x2::new(4557520760982391874, 4573984521684325226); + + assert_eq!(r, transmute(lsx_vfmax_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfmax_d() { + let a = u64x2::new(4606275407710467505, 4593284088749839728); + let b = u64x2::new(4593616624275112016, 4605244843740986156); + let r = i64x2::new(4606275407710467505, 4605244843740986156); + + assert_eq!(r, transmute(lsx_vfmax_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfmaxa_s() { + let a = u32x4::new(1059031357, 1043496676, 1044317464, 1055811838); + let b = u32x4::new(1064739422, 1055122552, 1049654310, 1057411362); + let r = 
i64x2::new(4531716855176798814, 4541547219258471462); + + assert_eq!(r, transmute(lsx_vfmaxa_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfmaxa_d() { + let a = u64x2::new(4559235973242941440, 4606304546706191737); + let b = u64x2::new(4603647289310579471, 4603999027307573908); + let r = i64x2::new(4603647289310579471, 4606304546706191737); + + assert_eq!(r, transmute(lsx_vfmaxa_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfclass_s() { + let a = u32x4::new(1059786314, 1058231666, 1061513647, 1038650488); + let r = i64x2::new(549755814016, 549755814016); + + assert_eq!(r, transmute(lsx_vfclass_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfclass_d() { + let a = u64x2::new(4601724705608768104, 4601126152607382566); + let r = i64x2::new(128, 128); + + assert_eq!(r, transmute(lsx_vfclass_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfsqrt_s() { + let a = u32x4::new(1055398716, 1050305974, 995168768, 1064901995); + let r = i64x2::new(4543169501430832482, 4574681629207255333); + + assert_eq!(r, transmute(lsx_vfsqrt_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfsqrt_d() { + let a = u64x2::new(4605784293613801157, 4602267946351406890); + let r = i64x2::new(4606453893731357485, 4604397310232711799); + + assert_eq!(r, transmute(lsx_vfsqrt_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrecip_s() { + let a = u32x4::new(1003452672, 1050811504, 1044295808, 1064402913); + let r = i64x2::new(4632552602764963931, 4577820515916044016); + + assert_eq!(r, transmute(lsx_vfrecip_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrecip_d() { + let a = u64x2::new(4598634931235673106, 4598630619264835010); + let r = i64x2::new(4615355353482170689, 4615362460048142095); + + assert_eq!(r, transmute(lsx_vfrecip_d(transmute(a)))); +} + +#[simd_test(enable = "lsx,frecipe")] +unsafe fn test_lsx_vfrecipe_s() { + let a = u32x4::new(1057583779, 1062308847, 1060089100, 1048454688); + let r = i64x2::new(4583644530211711115, 4647978179615164140); + + assert_eq!(r, transmute(lsx_vfrecipe_s(transmute(a)))); +} + +#[simd_test(enable = "lsx,frecipe")] +unsafe fn test_lsx_vfrecipe_d() { + let a = u64x2::new(4605515926442181274, 4605369703273365674); + let r = i64x2::new(4608204937770303488, 4608317161507651584); + + assert_eq!(r, transmute(lsx_vfrecipe_d(transmute(a)))); +} + +#[simd_test(enable = "lsx,frecipe")] +unsafe fn test_lsx_vfrsqrte_s() { + let a = u32x4::new(1064377488, 1055815904, 1056897740, 1064016656); + let r = i64x2::new(4592421282989204764, 4577184195020153336); + + assert_eq!(r, transmute(lsx_vfrsqrte_s(transmute(a)))); +} + +#[simd_test(enable = "lsx,frecipe")] +unsafe fn test_lsx_vfrsqrte_d() { + let a = u64x2::new(4602766865443628663, 4605323203937791867); + let r = i64x2::new(4608986772678901760, 4607734355383549952); + + assert_eq!(r, transmute(lsx_vfrsqrte_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrint_s() { + let a = u32x4::new(1062138521, 1056849108, 1034089720, 1038314384); + let r = i64x2::new(1065353216, 0); + + assert_eq!(r, transmute(lsx_vfrint_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrint_d() { + let a = u64x2::new(4598620052333442366, 4603262362368837514); + let r = i64x2::new(0, 4607182418800017408); + + assert_eq!(r, transmute(lsx_vfrint_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn 
test_lsx_vfrsqrt_s() { + let a = u32x4::new(1058614029, 1050504950, 1013814976, 1062355001); + let r = i64x2::new(4604601921912011494, 4579384257679777264); + + assert_eq!(r, transmute(lsx_vfrsqrt_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrsqrt_d() { + let a = u64x2::new(4602924191185043139, 4606088351077917251); + let r = i64x2::new(4608881149202581394, 4607483676176768181); + + assert_eq!(r, transmute(lsx_vfrsqrt_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vflogb_s() { + let a = u32x4::new(1053488512, 1061429282, 1064965594, 1061326585); + let r = i64x2::new(-4647714812225126400, -4647714812233515008); + + assert_eq!(r, transmute(lsx_vflogb_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vflogb_d() { + let a = u64x2::new(4589481276789128632, 4599408395082246526); + let r = i64x2::new(-4607182418800017408, -4611686018427387904); + + assert_eq!(r, transmute(lsx_vflogb_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcvth_s_h() { + let a = i16x8::new(29550, -13884, 689, -1546, 24006, -19112, -12769, 1779); + let r = i64x2::new(-4707668984349540352, 4097818267320836096); + + assert_eq!(r, transmute(lsx_vfcvth_s_h(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcvth_d_s() { + let a = u32x4::new(1051543000, 1042275304, 1038283216, 1063876621); + let r = i64x2::new(4592649323212177408, 4606389677895712768); + + assert_eq!(r, transmute(lsx_vfcvth_d_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcvtl_s_h() { + let a = i16x8::new(-21951, -13772, -17190, 9566, -19227, 9682, 13427, -30861); + let r = i64x2::new(-4519784435355738112, 4371798972740354048); + + assert_eq!(r, transmute(lsx_vfcvtl_s_h(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcvtl_d_s() { + let a = u32x4::new(1059809930, 1051084496, 1062618346, 1058273673); + let r = i64x2::new(4604206389789720576, 4599521958080544768); + + assert_eq!(r, transmute(lsx_vfcvtl_d_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftint_w_s() { + let a = u32x4::new(1064738153, 1040181800, 1064331056, 1050732566); + let r = i64x2::new(1, 1); + + assert_eq!(r, transmute(lsx_vftint_w_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftint_l_d() { + let a = u64x2::new(4602244632405616462, 4606437548563176328); + let r = i64x2::new(0, 1); + + assert_eq!(r, transmute(lsx_vftint_l_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftint_wu_s() { + let a = u32x4::new(1051598962, 1051261298, 1059326008, 1057784192); + let r = i64x2::new(0, 4294967297); + + assert_eq!(r, transmute(lsx_vftint_wu_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftint_lu_d() { + let a = u64x2::new(4605561240422589260, 4595241299507769712); + let r = i64x2::new(1, 0); + + assert_eq!(r, transmute(lsx_vftint_lu_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrz_w_s() { + let a = u32x4::new(1027659872, 1064207676, 1058472873, 1055740014); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vftintrz_w_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrz_l_d() { + let a = u64x2::new(4605051539601556532, 4605129242354661923); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vftintrz_l_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrz_wu_s() { + let a = u32x4::new(1060876751, 
1053710034, 1057340881, 1055555596); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vftintrz_wu_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrz_lu_d() { + let a = u64x2::new(4598711097624940956, 4598268778109474002); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vftintrz_lu_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vffint_s_w() { + let a = i32x4::new(81337967, 1396520141, 2124859806, 1655115736); + let r = i64x2::new(5667351778062705614, 5676028806041521555); + + assert_eq!(r, transmute(lsx_vffint_s_w(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vffint_d_l() { + let a = i64x2::new(-1543454772280682525, -7672333112582708041); + let r = i64x2::new(-4344448119835677720, -4333977527979901593); + + assert_eq!(r, transmute(lsx_vffint_d_l(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vffint_s_wu() { + let a = u32x4::new(2224947834, 194720725, 2248289069, 1131100007); + let r = i64x2::new(5564675890493038082, 5658445755393114667); + + assert_eq!(r, transmute(lsx_vffint_s_wu(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vffint_d_lu() { + let a = u64x2::new(11793247389644223387, 1356636411353166515); + let r = i64x2::new(4892164017273962878, 4878194157796724979); + + assert_eq!(r, transmute(lsx_vffint_d_lu(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vandn_v() { + let a = u8x16::new( + 69, 83, 176, 218, 73, 205, 105, 229, 131, 233, 158, 58, 63, 68, 94, 223, + ); + let b = u8x16::new( + 12, 197, 21, 164, 196, 200, 144, 3, 232, 91, 46, 182, 156, 14, 53, 106, + ); + let r = i64x2::new(184648152262214664, 2315143230533931624); + + assert_eq!(r, transmute(lsx_vandn_v(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vneg_b() { + let a = i8x16::new( + -118, -51, 32, 96, -18, 11, -3, 86, 77, 78, -120, 105, -47, 6, -127, -49, + ); + let r = i64x2::new(-6195839201974406282, 3566844512212398771); + + assert_eq!(r, transmute(lsx_vneg_b(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vneg_h() { + let a = i16x8::new(-6540, 25893, -2534, 29805, -28719, -16331, -20168, 14650); + let r = i64x2::new(-8389350794815923828, -4123521786840387537); + + assert_eq!(r, transmute(lsx_vneg_h(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vneg_w() { + let a = i32x4::new(-927815384, -898911982, 716171852, -2025175544); + let r = i64x2::new(3860797565600356056, 8698062733717804468); + + assert_eq!(r, transmute(lsx_vneg_w(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vneg_d() { + let a = i64x2::new(4241851098775470984, 2487122929432859927); + let r = i64x2::new(-4241851098775470984, -2487122929432859927); + + assert_eq!(r, transmute(lsx_vneg_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmuh_b() { + let a = i8x16::new( + -123, 8, -7, 107, 85, 70, 44, 54, -34, -38, 48, 6, -23, 54, 25, -117, + ); + let b = i8x16::new( + 41, -97, -9, -98, 27, 101, -95, 58, 102, -37, -72, -8, 94, -112, -22, -61, + ); + let r = i64x2::new(931993372669836524, 2017024359980467698); + + assert_eq!(r, transmute(lsx_vmuh_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmuh_h() { + let a = i16x8::new(-7394, -18356, -22999, 24389, 5841, 15177, -27319, -19905); + let b = i16x8::new(-446, -16863, 19467, -13578, -9673, -26572, -7864, 9855); + let r = i64x2::new(-1422322400225984462, 
-842721997477184351); + + assert_eq!(r, transmute(lsx_vmuh_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmuh_w() { + let a = i32x4::new(1709346012, -2115891417, -530450121, 975457270); + let b = i32x4::new(-1684820454, 449222301, 1106076122, 431017950); + let r = i64x2::new(-950505610786872114, 420439596918869732); + + assert_eq!(r, transmute(lsx_vmuh_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmuh_d() { + let a = i64x2::new(1852303942214142839, -864913423017390364); + let b = i64x2::new(-1208434038665242614, -6078343251861677818); + let r = i64x2::new(-121343209662433286, 284995587689374477); + + assert_eq!(r, transmute(lsx_vmuh_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmuh_bu() { + let a = u8x16::new( + 7, 62, 97, 52, 145, 32, 36, 208, 81, 215, 70, 254, 95, 229, 130, 220, + ); + let b = u8x16::new( + 220, 110, 97, 25, 127, 138, 167, 150, 128, 32, 130, 157, 177, 237, 123, 244, + ); + let r = i64x2::new(8725461799780227590, -3369022092985820632); + + assert_eq!(r, transmute(lsx_vmuh_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmuh_hu() { + let a = u16x8::new(28423, 34360, 7900, 61040, 62075, 6281, 10041, 37733); + let b = u16x8::new(14769, 6489, 58866, 5997, 46648, 26325, 42186, 26942); + let r = i64x2::new(1572068217944938757, 4366267597274655896); + + assert_eq!(r, transmute(lsx_vmuh_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmuh_wu() { + let a = u32x4::new(1924935822, 3107975337, 289660636, 1367017690); + let b = u32x4::new(1981234883, 1290836259, 1284878577, 702668871); + let r = i64x2::new(4011887256539048298, 960560772888018584); + + assert_eq!(r, transmute(lsx_vmuh_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmuh_du() { + let a = u64x2::new(11605461634325977288, 4587630571657223131); + let b = u64x2::new(14805542397189366587, 10025341254588295994); + let r = i64x2::new(-9132083796568587258, 2493261783600858707); + + assert_eq!(r, transmute(lsx_vmuh_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsllwil_h_b() { + let a = i8x16::new( + -45, 48, 102, -110, 126, -43, 65, 14, 75, 88, 62, 46, -109, 119, -77, 59, + ); + let r = i64x2::new(-990777899147527584, 126109727303143360); + + assert_eq!(r, transmute(lsx_vsllwil_h_b::<5>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsllwil_w_h() { + let a = i16x8::new(25135, -4241, 25399, -32451, 5597, -16847, 3192, -14694); + let r = i64x2::new(-9326057613926912, -71360503652913664); + + assert_eq!(r, transmute(lsx_vsllwil_w_h::<9>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsllwil_d_w() { + let a = i32x4::new(1472328927, -2106442262, 379100488, -607174188); + let r = i64x2::new(6030659284992, -8627987505152); + + assert_eq!(r, transmute(lsx_vsllwil_d_w::<12>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsllwil_hu_bu() { + let a = u8x16::new( + 102, 12, 222, 193, 16, 21, 161, 189, 127, 57, 231, 81, 97, 68, 171, 68, + ); + let r = i64x2::new(6953679870551405312, 6809531147446388736); + + assert_eq!(r, transmute(lsx_vsllwil_hu_bu::<7>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsllwil_wu_hu() { + let a = u16x8::new(370, 47410, 29611, 6206, 10390, 34658, 65264, 5264); + let r = i64x2::new(52127846272954880, 6823569169558272); + + 
assert_eq!(r, transmute(lsx_vsllwil_wu_hu::<8>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsllwil_du_wu() { + let a = u32x4::new(3249798491, 4098547305, 1101510259, 3478509641); + let r = i64x2::new(13630642809995264, 17190553355550720); + + assert_eq!(r, transmute(lsx_vsllwil_du_wu::<22>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsran_b_h() { + let a = i16x8::new(-12554, -869, 6838, -18394, -26140, 20902, -222, -12466); + let b = i16x8::new(-12507, -16997, -17826, 5682, -298, -28572, -8117, -13478); + let r = i64x2::new(-864943573596831881, 0); + + assert_eq!(r, transmute(lsx_vsran_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsran_h_w() { + let a = i32x4::new(-950913431, 1557805031, 693572398, 1180916410); + let b = i32x4::new(-52337348, -677553123, -58200260, -1473338606); + let r = i64x2::new(1267763303694925820, 0); + + assert_eq!(r, transmute(lsx_vsran_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsran_w_d() { + let a = i64x2::new(-1288554130833689959, -11977059487539737); + let b = i64x2::new(-8585295495893484131, -2657141976436452013); + let r = i64x2::new(-5882350952887806270, 0); + + assert_eq!(r, transmute(lsx_vsran_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssran_b_h() { + let a = i16x8::new(-4232, -6038, -25131, -31144, -8955, 30109, -20875, 31748); + let b = i16x8::new(9459, 15241, 22170, 28027, 5348, 14784, 22613, -9469); + let r = i64x2::new(9187483431610086528, 0); + + assert_eq!(r, transmute(lsx_vssran_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssran_h_w() { + let a = i32x4::new(-287861089, -1513011801, -2092611716, -303792243); + let b = i32x4::new(2070726003, -944816867, -160621862, -1222036466); + let r = i64x2::new(-5219109151313101350, 0); + + assert_eq!(r, transmute(lsx_vssran_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssran_w_d() { + let a = i64x2::new(-3241370354549914429, -6946993314161316482); + let b = i64x2::new(-7078666005882550400, -2564990402652718339); + let r = i64x2::new(-15032385536, 0); + + assert_eq!(r, transmute(lsx_vssran_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssran_bu_h() { + let a = u16x8::new(42413, 20386, 34692, 25088, 5477, 58748, 14986, 55598); + let b = u16x8::new(2372, 26267, 4722, 47876, 44857, 55242, 45998, 51450); + let r = i64x2::new(47227865344, 0); + + assert_eq!(r, transmute(lsx_vssran_bu_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssran_hu_w() { + let a = u32x4::new(98545765, 1277336728, 1198651242, 2259455561); + let b = u32x4::new(2085279153, 2679576985, 2935643238, 3797496208); + let r = i64x2::new(281470684234479, 0); + + assert_eq!(r, transmute(lsx_vssran_hu_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssran_wu_d() { + let a = u64x2::new(13769400838855917836, 9078517924805296472); + let b = u64x2::new(3904652404244024971, 4230656884168675704); + let r = i64x2::new(536870912000, 0); + + assert_eq!(r, transmute(lsx_vssran_wu_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrarn_b_h() { + let a = i16x8::new(416, 1571, 19122, -32078, 26657, 3230, 12936, -5041); + let b = i16x8::new(-19071, -903, 11542, -25909, 24111, 14882, -27192, -8283); + let r = 
i64x2::new(7076043428318610384, 0); + + assert_eq!(r, transmute(lsx_vsrarn_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrarn_h_w() { + let a = i32x4::new(-1553871953, -1700232136, 1934164676, -322997351); + let b = i32x4::new(-1571698573, 1467958613, -1857488008, 424713310); + let r = i64x2::new(498163119212, 0); + + assert_eq!(r, transmute(lsx_vsrarn_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrarn_w_d() { + let a = i64x2::new(3489546309777968442, 4424654979674624573); + let b = i64x2::new(-8645668865455529235, -3129277582817496880); + let r = i64x2::new(-8628090759335017621, 0); + + assert_eq!(r, transmute(lsx_vsrarn_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrarn_b_h() { + let a = i16x8::new(18764, -32156, 11073, -19939, -921, -18342, -16600, -13755); + let b = i16x8::new(24298, 2343, 24641, 20910, 3142, -1171, 25850, 15932); + let r = i64x2::new(-148338468081139694, 0); + + assert_eq!(r, transmute(lsx_vssrarn_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrarn_h_w() { + let a = i32x4::new(-319370354, 225260835, 556195246, -699782233); + let b = i32x4::new(1911424854, -931292983, -1710824608, -1179580317); + let r = i64x2::new(-9223231301513904204, 0); + + assert_eq!(r, transmute(lsx_vssrarn_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrarn_w_d() { + let a = i64x2::new(2645407519038125699, -6014465513887172991); + let b = i64x2::new(2843689038926761304, -6830262024912907383); + let r = i64x2::new(-9223372034707292161, 0); + + assert_eq!(r, transmute(lsx_vssrarn_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrarn_bu_h() { + let a = u16x8::new(291, 64545, 16038, 57382, 18088, 10736, 57416, 55855); + let b = u16x8::new(60210, 40155, 14296, 25577, 1550, 1674, 5330, 10645); + let r = i64x2::new(10999415373897, 0); + + assert_eq!(r, transmute(lsx_vssrarn_bu_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrarn_hu_w() { + let a = u32x4::new(2157227758, 1970326245, 1829195047, 4061259315); + let b = u32x4::new(3570029841, 3229468238, 1070101998, 3159433736); + let r = i64x2::new(281474976645120, 0); + + assert_eq!(r, transmute(lsx_vssrarn_hu_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrarn_wu_d() { + let a = u64x2::new(8474558908443232483, 12352412821911429821); + let b = u64x2::new(1112771813772164907, 646071836375127186); + let r = i64x2::new(963446, 0); + + assert_eq!(r, transmute(lsx_vssrarn_wu_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrln_b_h() { + let a = i16x8::new(11215, 29524, -2225, -13955, 13622, 15178, -22920, 29185); + let b = i16x8::new(-11667, 13077, -23656, 5150, -23771, -31329, 20729, 15169); + let r = i64x2::new(23363148983015937, 0); + + assert_eq!(r, transmute(lsx_vsrln_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrln_h_w() { + let a = i32x4::new(273951092, 1016537129, 330941412, 1091816631); + let b = i32x4::new(1775989751, -1602688801, -801213995, -1801759515); + let r = i64x2::new(-7033214568759295968, 0); + + assert_eq!(r, transmute(lsx_vsrln_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrln_w_d() { + let a = i64x2::new(-4929290425724370873, -9113314549902232460); + 
let b = i64x2::new(-1428152872702150626, 3907864416256094744); + let r = i64x2::new(-8718771486483115547, 0); + + assert_eq!(r, transmute(lsx_vsrln_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrln_bu_h() { + let a = u16x8::new(53048, 1006, 61143, 41996, 57058, 25724, 43969, 62847); + let b = u16x8::new(41072, 41125, 44619, 49581, 20733, 905, 47558, 7801); + let r = i64x2::new(8862857593125412863, 0); + + assert_eq!(r, transmute(lsx_vssrln_bu_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrln_hu_w() { + let a = u32x4::new(1889365848, 1818261427, 2701385771, 4063178210); + let b = u32x4::new(1325069171, 1380839173, 3495604120, 2839043866); + let r = i64x2::new(16889194387279379, 0); + + assert_eq!(r, transmute(lsx_vssrln_hu_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrln_wu_d() { + let a = u64x2::new(7819967077464554342, 9878605573134710521); + let b = u64x2::new(3908262745817581251, 17131627096934512209); + let r = i64x2::new(-1, 0); + + assert_eq!(r, transmute(lsx_vssrln_wu_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlrn_b_h() { + let a = i16x8::new(-28299, -15565, -30638, -10884, -2538, 23256, 25217, 14524); + let b = i16x8::new(22830, -27866, -24616, -9547, 11336, 320, 19908, 7056); + let r = i64x2::new(-4888418841542521598, 0); + + assert_eq!(r, transmute(lsx_vsrlrn_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlrn_h_w() { + let a = i32x4::new(-146271143, 1373068571, 1580809863, -915867973); + let b = i32x4::new(1387862348, 119424523, 185407104, 1890720739); + let r = i64x2::new(2222313691660711041, 0); + + assert_eq!(r, transmute(lsx_vsrlrn_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlrn_w_d() { + let a = i64x2::new(-4585118244955419935, -6462467970618862820); + let b = i64x2::new(-8550351213501194562, 7071641301481388656); + let r = i64x2::new(182866822561795, 0); + + assert_eq!(r, transmute(lsx_vsrlrn_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlrn_bu_h() { + let a = u16x8::new(13954, 8090, 46576, 53579, 4322, 20972, 17281, 18603); + let b = u16x8::new(51122, 39148, 45511, 57479, 62603, 43668, 5537, 61004); + let r = i64x2::new(432344477600776959, 0); + + assert_eq!(r, transmute(lsx_vssrlrn_bu_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlrn_hu_w() { + let a = u32x4::new(959062112, 2073250884, 2500149644, 3919033303); + let b = u32x4::new(1618795892, 3678356443, 862445734, 2115250342); + let r = i64x2::new(-4293983341, 0); + + assert_eq!(r, transmute(lsx_vssrlrn_hu_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlrn_wu_d() { + let a = u64x2::new(13828499145464267218, 4059850184169338184); + let b = u64x2::new(13406765083608623828, 7214649593148131096); + let r = i64x2::new(-1, 0); + + assert_eq!(r, transmute(lsx_vssrlrn_wu_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrstpi_b() { + let a = i8x16::new( + 116, 124, 21, 48, 24, 119, -108, 103, -77, -95, 68, -76, 67, -82, -96, 17, + ); + let b = i8x16::new( + -124, -52, -31, -108, 33, 71, -22, 0, -38, -20, -6, -90, 41, -58, -51, -51, + ); + let r = i64x2::new(7463721428229389428, 1270206412966109619); + + assert_eq!( + r, + transmute(lsx_vfrstpi_b::<28>(transmute(a), 
transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrstpi_h() { + let a = i16x8::new(8411, -11473, 30045, -14781, 12135, -6534, -3622, 21173); + let b = i16x8::new(9590, -8044, 15088, 4172, 1721, 27581, -19895, -25679); + let r = i64x2::new(-4160352588467724069, 5959935604366651239); + + assert_eq!(r, transmute(lsx_vfrstpi_h::<1>(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrstp_b() { + let a = i8x16::new( + 41, -46, -4, 113, -42, 96, 62, 9, 12, -71, -82, 3, 4, -42, 43, -57, + ); + let b = i8x16::new( + -123, 108, -25, -29, -60, 41, -50, -93, 33, 99, 43, 36, 41, 88, 125, 27, + ); + let c = i8x16::new( + 94, 2, 35, 33, 56, -117, -67, 85, 48, 94, -20, 112, -92, 47, -13, -80, + ); + let r = i64x2::new(666076269049074217, -4107047547431896820); + + assert_eq!( + r, + transmute(lsx_vfrstp_b(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrstp_h() { + let a = i16x8::new(-23724, -17384, -24117, -29825, -19683, -3257, 18098, 7693); + let b = i16x8::new(-20325, 3010, -32157, -32381, 13895, 10305, -4480, -12994); + let c = i16x8::new(-2897, -31862, -29510, -16688, -12596, -6396, 20900, -22026); + let r = i64x2::new(-8394813283989150892, 77734399685405); + + assert_eq!( + r, + transmute(lsx_vfrstp_h(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vshuf4i_d() { + let a = i64x2::new(358242861525536259, -3448068840836542886); + let b = i64x2::new(-5242415653399550268, -1504319281108156436); + let r = i64x2::new(-3448068840836542886, -5242415653399550268); + + assert_eq!( + r, + transmute(lsx_vshuf4i_d::<153>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbsrl_v() { + let a = i8x16::new( + 67, 57, -68, -24, 50, 58, 127, -80, -9, 17, 119, 81, 4, 110, 63, 56, + ); + let r = i64x2::new(4570595419764160432, 56); + + assert_eq!(r, transmute(lsx_vbsrl_v::<7>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vbsll_v() { + let a = i8x16::new( + -25, -57, 97, -71, 66, 71, -127, 74, -32, -1, 36, 111, 116, 79, 49, -92, + ); + let r = i64x2::new(0, -1801439850948198400); + + assert_eq!(r, transmute(lsx_vbsll_v::<15>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vextrins_b() { + let a = i8x16::new( + 72, 112, -116, 99, 55, 19, 50, -123, -98, -90, 79, -29, 18, -87, 79, 74, + ); + let b = i8x16::new( + -107, 59, -127, 85, -65, -45, 80, 65, 30, -46, -56, -117, 107, 122, 11, -55, + ); + let r = i64x2::new(-8848989189215300792, 5354684380554962590); + + assert_eq!( + r, + transmute(lsx_vextrins_b::<21>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vextrins_h() { + let a = i16x8::new(-8903, 13698, -1855, 30429, -28178, 21171, -17068, -10547); + let b = i16x8::new(-16309, 24895, 7753, 1535, 20205, 23989, 27706, -24274); + let r = i64x2::new(8565108990437154105, -2968508409504886290); + + assert_eq!( + r, + transmute(lsx_vextrins_h::<33>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vextrins_w() { + let a = i32x4::new(1225397826, 1289583478, 1287364839, 1276008188); + let b = i32x4::new(1511106319, -1591171516, -989081993, 1462597836); + let r = i64x2::new(5538718864697333314, -6834029622259375897); + + assert_eq!( + r, + transmute(lsx_vextrins_w::<57>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vextrins_d() { + let a = 
i64x2::new(7112618873032505596, -3605623410483258197); + let b = i64x2::new(-8508848216355653905, -4655572653097801607); + let r = i64x2::new(7112618873032505596, -8508848216355653905); + + assert_eq!( + r, + transmute(lsx_vextrins_d::<62>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmskltz_b() { + let a = i8x16::new( + 94, -6, -27, 108, 33, -86, -64, 68, 68, 9, -92, -83, -61, 99, 103, -77, + ); + let r = i64x2::new(40038, 0); + + assert_eq!(r, transmute(lsx_vmskltz_b(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmskltz_h() { + let a = i16x8::new(16730, 29121, -23447, -8647, -22303, 21817, 30964, -27069); + let r = i64x2::new(156, 0); + + assert_eq!(r, transmute(lsx_vmskltz_h(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmskltz_w() { + let a = i32x4::new(-657282776, -1247210048, 162595942, 949871015); + let r = i64x2::new(3, 0); + + assert_eq!(r, transmute(lsx_vmskltz_w(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmskltz_d() { + let a = i64x2::new(7728638770319849738, 4250984610820351699); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vmskltz_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsigncov_b() { + let a = i8x16::new( + 37, -39, 115, 66, -114, -76, -55, -39, -94, 114, 38, 13, 76, 124, 64, -67, + ); + let b = i8x16::new( + -56, -98, -95, 45, 65, -53, -16, 126, 78, -69, -10, 115, -110, 125, -110, -27, + ); + let r = i64x2::new(-9074694153930972472, 1986788453588057010); + + assert_eq!(r, transmute(lsx_vsigncov_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsigncov_h() { + let a = i16x8::new(-2481, 28461, 27326, -11105, -17659, 25439, 5753, -743); + let b = i16x8::new(27367, 4727, -2962, 14937, 26207, -19075, -26630, 10708); + let r = i64x2::new(-4204122973533661927, -3013866947575178847); + + assert_eq!(r, transmute(lsx_vsigncov_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsigncov_w() { + let a = i32x4::new(-1532048051, -2015529516, -586660708, 727735992); + let b = i32x4::new(-1719915889, 290419288, 202835952, -1715336967); + let r = i64x2::new(-1247341342367689359, -7367316170792699888); + + assert_eq!(r, transmute(lsx_vsigncov_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsigncov_d() { + let a = i64x2::new(150793719457004094, -135856607031921617); + let b = i64x2::new(-7146260093067324952, -4263419240070336957); + let r = i64x2::new(-7146260093067324952, 4263419240070336957); + + assert_eq!(r, transmute(lsx_vsigncov_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfmadd_s() { + let a = u32x4::new(1053592010, 1057663388, 1062706459, 1052867704); + let b = u32x4::new(1058664483, 1064225083, 1063099591, 1054461138); + let c = u32x4::new(1054468004, 1058982987, 1020391296, 1060092638); + let r = i64x2::new(4580180050664125165, 4564646927777478184); + + assert_eq!( + r, + transmute(lsx_vfmadd_s(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfmadd_d() { + let a = u64x2::new(4606327684689705003, 4598694159366762396); + let b = u64x2::new(4605185255799132053, 4599088917574843416); + let c = u64x2::new(4602818020827041428, 4603108774373140110); + let r = i64x2::new(4608172630826345532, 4603863964483257995); + + assert_eq!( + r, + transmute(lsx_vfmadd_d(transmute(a), transmute(b), transmute(c))) + ); +} + 
+#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfmsub_s() { + let a = u32x4::new(1044400636, 1063313520, 1060460798, 1056994960); + let b = u32x4::new(1016037632, 1057190051, 1042434224, 1054669464); + let c = u32x4::new(1063213924, 1047859900, 1063932683, 1059194076); + let r = i64x2::new(4492556612533126096, -4695805165913139817); + + assert_eq!( + r, + transmute(lsx_vfmsub_s(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfmsub_d() { + let a = u64x2::new(4594815360286672212, 4596595309069193244); + let b = u64x2::new(4603027383886900468, 4603059771165364192); + let c = u64x2::new(4602620994011391758, 4604927875076111771); + let r = i64x2::new(-4622272149514797982, -4619451105624653598); + + assert_eq!( + r, + transmute(lsx_vfmsub_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfnmadd_s() { + let a = u32x4::new(1061642899, 1052761434, 1063541119, 1058091924); + let b = u32x4::new(1044610040, 1047755448, 1062197759, 1051199080); + let c = u32x4::new(1061915520, 1064953425, 1057353824, 1063041453); + let r = i64x2::new(-4645363120071402583, -4645972958179775591); + + assert_eq!( + r, + transmute(lsx_vfnmadd_s(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfnmadd_d() { + let a = u64x2::new(4581972604415454304, 4606375442608807393); + let b = u64x2::new(4601574488118710932, 4600732882837014710); + let c = u64x2::new(4598552045727299030, 4597905936756546488); + let r = i64x2::new(-4624646832280694111, -4619798024319766060); + + assert_eq!( + r, + transmute(lsx_vfnmadd_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfnmsub_s() { + let a = u32x4::new(1063347858, 1055637882, 1012264384, 1037368648); + let b = u32x4::new(1054477234, 1065181074, 1060000965, 1061867853); + let c = u32x4::new(1064036393, 1038991248, 1057711476, 1049339888); + let r = i64x2::new(-4706852781727946153, 4486413029030305466); + + assert_eq!( + r, + transmute(lsx_vfnmsub_s(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfnmsub_d() { + let a = u64x2::new(4604322037070318179, 4603593616949749938); + let b = u64x2::new(4598988625246003058, 4600654731040688846); + let c = u64x2::new(4601892672002082676, 4603822465490492305); + let r = i64x2::new(4598264167668253799, 4600765330842720520); + + assert_eq!( + r, + transmute(lsx_vfnmsub_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrne_w_s() { + let a = u32x4::new(1031214064, 1059673230, 1042813024, 1053602874); + let r = i64x2::new(4294967296, 0); + + assert_eq!(r, transmute(lsx_vftintrne_w_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrne_l_d() { + let a = u64x2::new(4606989588359571497, 4604713245380178790); + let r = i64x2::new(1, 1); + + assert_eq!(r, transmute(lsx_vftintrne_l_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrp_w_s() { + let a = u32x4::new(1061716225, 1050491008, 1064711040, 1065018777); + let r = i64x2::new(4294967297, 4294967297); + + assert_eq!(r, transmute(lsx_vftintrp_w_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrp_l_d() { + let a = u64x2::new(4587516915944025472, 4601504548481216392); + let r = i64x2::new(1, 1); + + assert_eq!(r, transmute(lsx_vftintrp_l_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] 
+unsafe fn test_lsx_vftintrm_w_s() { + let a = u32x4::new(1045772456, 1065200707, 1061587478, 1035467272); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vftintrm_w_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrm_l_d() { + let a = u64x2::new(4597123259408216804, 4594399417822716772); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vftintrm_l_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftint_w_d() { + let a = u64x2::new(4602226310642310974, 4598315153561102162); + let b = u64x2::new(4606905060326467647, 4606985586417166381); + let r = i64x2::new(4294967297, 0); + + assert_eq!(r, transmute(lsx_vftint_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vffint_s_l() { + let a = i64x2::new(-958368210120518642, 317739970300630807); + let b = i64x2::new(5814449889729512723, -111756032377486319); + let r = i64x2::new(-2610252963668467161, 6669016150524087533); + + assert_eq!(r, transmute(lsx_vffint_s_l(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrz_w_d() { + let a = u64x2::new(4588311497244995104, 4604793095801710714); + let b = u64x2::new(4599106720144900270, 4600531579473237336); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vftintrz_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrp_w_d() { + let a = u64x2::new(4595926440353149184, 4601703964116560606); + let b = u64x2::new(4606104970322966899, 4595679410565085836); + let r = i64x2::new(4294967297, 4294967297); + + assert_eq!(r, transmute(lsx_vftintrp_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrm_w_d() { + let a = u64x2::new(4603847521361653326, 4600607722530696016); + let b = u64x2::new(4606733822200032543, 4589510164179968984); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vftintrm_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrne_w_d() { + let a = u64x2::new(4601878512717779358, 4597694557130026508); + let b = u64x2::new(4599197176714081204, 4605745859931721980); + let r = i64x2::new(4294967296, 0); + + assert_eq!(r, transmute(lsx_vftintrne_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintl_l_s() { + let a = u32x4::new(1058856635, 1060563398, 1061422616, 1056124918); + let r = i64x2::new(1, 1); + + assert_eq!(r, transmute(lsx_vftintl_l_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftinth_l_s() { + let a = u32x4::new(1045383680, 1040752748, 1061879518, 1054801708); + let r = i64x2::new(1, 0); + + assert_eq!(r, transmute(lsx_vftinth_l_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vffinth_d_w() { + let a = i32x4::new(517100418, -188510766, 949226647, -87467194); + let r = i64x2::new(4741245898611228672, -4497729803343888384); + + assert_eq!(r, transmute(lsx_vffinth_d_w(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vffintl_d_w() { + let a = i32x4::new(1273684401, -2137528906, -2109294912, -1646387998); + let r = i64x2::new(4743129027571613696, -4476619782820462592); + + assert_eq!(r, transmute(lsx_vffintl_d_w(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrzl_l_s() { + let a = u32x4::new(1031186688, 987838976, 1034565688, 1061017371); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vftintrzl_l_s(transmute(a)))); +} + 
+#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrzh_l_s() { + let a = u32x4::new(1049433828, 1048953580, 1060964637, 1059899586); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vftintrzh_l_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrpl_l_s() { + let a = u32x4::new(1061834803, 1064858941, 1060475110, 1063896216); + let r = i64x2::new(1, 1); + + assert_eq!(r, transmute(lsx_vftintrpl_l_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrph_l_s() { + let a = u32x4::new(1059691939, 1065187151, 1059017027, 1061117394); + let r = i64x2::new(1, 1); + + assert_eq!(r, transmute(lsx_vftintrph_l_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrml_l_s() { + let a = u32x4::new(1062985651, 1065211455, 1056421466, 1057373572); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vftintrml_l_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrmh_l_s() { + let a = u32x4::new(1050224290, 1063763666, 1057677270, 1063622234); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vftintrmh_l_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrnel_l_s() { + let a = u32x4::new(1060174609, 1050974638, 1047193308, 1062040876); + let r = i64x2::new(1, 0); + + assert_eq!(r, transmute(lsx_vftintrnel_l_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vftintrneh_l_s() { + let a = u32x4::new(1055675382, 1036879184, 1064176794, 1063791852); + let r = i64x2::new(1, 1); + + assert_eq!(r, transmute(lsx_vftintrneh_l_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrintrne_s() { + let a = u32x4::new(1054667842, 1061395025, 1062986478, 1062529334); + let r = i64x2::new(4575657221408423936, 4575657222473777152); + + assert_eq!(r, transmute(lsx_vfrintrne_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrintrne_d() { + let a = u64x2::new(4603260356641870565, 4601614335120512898); + let r = i64x2::new(4607182418800017408, 0); + + assert_eq!(r, transmute(lsx_vfrintrne_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrintrz_s() { + let a = u32x4::new(1063039577, 1033416832, 1052369306, 1057885024); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfrintrz_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrintrz_d() { + let a = u64x2::new(4601515428088814484, 4604735152905786794); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfrintrz_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrintrp_s() { + let a = u32x4::new(1061968959, 1056597596, 1064869916, 1058742360); + let r = i64x2::new(4575657222473777152, 4575657222473777152); + + assert_eq!(r, transmute(lsx_vfrintrp_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrintrp_d() { + let a = u64x2::new(4603531792479663401, 4587997630530425392); + let r = i64x2::new(4607182418800017408, 4607182418800017408); + + assert_eq!(r, transmute(lsx_vfrintrp_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrintrm_s() { + let a = u32x4::new(1058024441, 1044087184, 1059777964, 1050835426); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfrintrm_s(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfrintrm_d() { + let a = u64x2::new(4589388034824743512, 4606800774570289382); + let r = i64x2::new(0, 0); + + assert_eq!(r, 
transmute(lsx_vfrintrm_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vstelm_b() { + let a = i8x16::new( + -70, -74, -13, -53, -37, -28, -84, -8, 110, -98, -26, 71, 55, 104, -8, -50, + ); + let mut o: [i8; 16] = [ + 97, 16, 51, -123, 4, 14, 108, 36, -40, -53, 29, 67, 102, 63, -15, -39, + ]; + let r = i64x2::new(2624488095427530938, -2742340989646681128); + + lsx_vstelm_b::<0, 0>(transmute(a), o.as_mut_ptr()); + assert_eq!(r, transmute(o)); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vstelm_h() { + let a = i16x8::new(-7427, -5749, 19902, -9799, 28691, -16170, 11920, 24129); + let mut o: [i8; 16] = [ + 123, 19, -3, 118, -43, -40, -48, -81, 23, -114, -72, 26, 117, 98, -43, -112, + ]; + let r = i64x2::new(-5777879910580360821, -8010388107109560809); + + lsx_vstelm_h::<0, 1>(transmute(a), o.as_mut_ptr()); + assert_eq!(r, transmute(o)); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vstelm_w() { + let a = i32x4::new(424092909, 1956922334, -640221305, -164680666); + let mut o: [i8; 16] = [ + -12, -50, 8, 91, 60, -48, 94, -99, -64, -51, 3, -44, 7, -49, 62, -69, + ]; + let r = i64x2::new(-7107014201697162202, -4954294907532227136); + + lsx_vstelm_w::<0, 3>(transmute(a), o.as_mut_ptr()); + assert_eq!(r, transmute(o)); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vstelm_d() { + let a = i64x2::new(2628828971609511929, 9138529437562240974); + let mut o: [i8; 16] = [ + 48, -98, 127, -32, 90, 120, 50, 2, 90, 120, -113, 19, -120, 105, 27, -22, + ]; + let r = i64x2::new(2628828971609511929, -1577551211298588582); + + lsx_vstelm_d::<0, 0>(transmute(a), o.as_mut_ptr()); + assert_eq!(r, transmute(o)); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwev_d_w() { + let a = i32x4::new(-1889902301, 326462140, 1088579813, 626337726); + let b = i32x4::new(-2105551735, -1478351177, 1027048582, -607110700); + let r = i64x2::new(-3995454036, 2115628395); + + assert_eq!(r, transmute(lsx_vaddwev_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwev_w_h() { + let a = i16x8::new(7813, 337, -10949, -8624, 14298, -27002, -12747, 17169); + let b = i16x8::new(-17479, -32614, 24343, 25426, -14077, -12419, 10115, 23013); + let r = i64x2::new(57531086920254, -11304353922851); + + assert_eq!(r, transmute(lsx_vaddwev_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwev_h_b() { + let a = i8x16::new( + -122, -50, 126, -108, 72, 89, -50, -96, -37, -68, 63, -41, -1, -49, 90, 117, + ); + let b = i8x16::new( + -89, 6, -27, 58, 80, -29, 28, 104, 30, 69, -39, 76, 42, 34, 25, -24, + ); + let r = i64x2::new(-6191796646052051, 32369798417022969); + + assert_eq!(r, transmute(lsx_vaddwev_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwod_d_w() { + let a = i32x4::new(-1721333318, -347227654, -936088440, 1975890670); + let b = i32x4::new(420515981, 473447119, 1471756335, 1044924117); + let r = i64x2::new(126219465, 3020814787); + + assert_eq!(r, transmute(lsx_vaddwod_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwod_w_h() { + let a = i16x8::new(13058, 5020, 31112, -31710, 19542, -9009, -21764, -1881); + let b = i16x8::new(-26581, -22301, 18214, -3616, -24489, 12150, -10765, -24232); + let r = i64x2::new(-151719719748481, -112154480997307); + + assert_eq!(r, transmute(lsx_vaddwod_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwod_h_b() { + let a = 
i8x16::new( + -53, 61, 10, -18, -31, 26, 113, -14, -62, 6, 127, -43, 86, 33, 94, 57, + ); + let b = i8x16::new( + 37, 85, -14, -93, 61, -116, -53, -51, -46, 119, 36, -94, 0, -86, 46, -6, + ); + let r = i64x2::new(-18014780768845678, 14636475441676413); + + assert_eq!(r, transmute(lsx_vaddwod_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwev_d_wu() { + let a = u32x4::new(2539947230, 3548211150, 1193982195, 3547334418); + let b = u32x4::new(1482213353, 1001198416, 3345983326, 2244256337); + let r = i64x2::new(4022160583, 4539965521); + + assert_eq!(r, transmute(lsx_vaddwev_d_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwev_w_hu() { + let a = u16x8::new(50844, 55931, 31330, 63416, 32884, 2778, 22874, 13540); + let b = u16x8::new(28483, 24704, 9817, 62062, 47674, 8032, 29897, 62737); + let r = i64x2::new(176725019407839, 226649719257774); + + assert_eq!(r, transmute(lsx_vaddwev_w_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwev_h_bu() { + let a = u8x16::new( + 233, 165, 29, 130, 62, 173, 207, 120, 32, 254, 152, 27, 30, 159, 92, 76, + ); + let b = u8x16::new( + 118, 157, 181, 79, 81, 38, 95, 73, 245, 179, 126, 210, 16, 93, 78, 63, + ); + let r = i64x2::new(85006057160704351, 47850943627526421); + + assert_eq!(r, transmute(lsx_vaddwev_h_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwod_d_wu() { + let a = u32x4::new(342250989, 1651153980, 174227274, 2092816321); + let b = u32x4::new(2782520439, 2496077290, 2678772394, 196273109); + let r = i64x2::new(4147231270, 2289089430); + + assert_eq!(r, transmute(lsx_vaddwod_d_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwod_w_hu() { + let a = u16x8::new(36372, 35690, 49187, 14265, 54130, 40094, 57017, 10670); + let b = u16x8::new(20353, 34039, 21222, 4948, 58293, 4766, 51360, 37497); + let r = i64x2::new(82519206727777, 206875689791292); + + assert_eq!(r, transmute(lsx_vaddwod_w_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwod_h_bu() { + let a = u8x16::new( + 248, 1, 83, 240, 60, 173, 151, 39, 55, 39, 131, 86, 86, 18, 5, 110, + ); + let b = u8x16::new( + 63, 52, 164, 249, 242, 167, 236, 222, 171, 180, 249, 57, 79, 53, 87, 7, + ); + let r = i64x2::new(73466429242409013, 32932877227196635); + + assert_eq!(r, transmute(lsx_vaddwod_h_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwev_d_wu_w() { + let a = u32x4::new(3787058271, 4254502892, 1291509641, 2971162106); + let b = i32x4::new(-1308530150, 1427930358, 1723198474, 1987356336); + let r = i64x2::new(2478528121, 3014708115); + + assert_eq!(r, transmute(lsx_vaddwev_d_wu_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwev_w_hu_h() { + let a = u16x8::new(7742, 2564, 7506, 3394, 6835, 41043, 29153, 7959); + let b = i16x8::new(-11621, -6593, 7431, -1189, -12361, -15174, 16182, -32434); + let r = i64x2::new(64158221463769, 194716637325930); + + assert_eq!(r, transmute(lsx_vaddwev_w_hu_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwev_h_bu_b() { + let a = u8x16::new( + 103, 224, 71, 251, 48, 94, 188, 16, 181, 57, 192, 250, 248, 36, 51, 176, + ); + let b = i8x16::new( + 36, -32, 108, -95, -21, 20, 67, -107, -65, -124, -19, -50, -120, -36, -79, -12, + ); + let r = i64x2::new(71776235037065355, 
-7880749580746636); + + assert_eq!(r, transmute(lsx_vaddwev_h_bu_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwod_d_wu_w() { + let a = u32x4::new(3763905902, 2910980290, 1912906409, 2257280339); + let b = i32x4::new(-1646368557, 586112311, 376247963, 1048800083); + let r = i64x2::new(3497092601, 3306080422); + + assert_eq!(r, transmute(lsx_vaddwod_d_wu_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwod_w_hu_h() { + let a = u16x8::new(53495, 36399, 39536, 12468, 17601, 52919, 14730, 58963); + let b = i16x8::new(31700, 22725, 14068, -14860, -28839, -14513, -1195, 27082); + let r = i64x2::new(-10273561712908, 369560461022726); + + assert_eq!(r, transmute(lsx_vaddwod_w_hu_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwod_h_bu_b() { + let a = u8x16::new( + 191, 183, 244, 200, 83, 191, 111, 82, 210, 150, 228, 182, 45, 23, 145, 159, + ); + let b = i8x16::new( + -34, -59, -104, -58, -78, 90, -117, 93, 76, -23, 37, 44, -62, 60, 119, -91, + ); + let r = i64x2::new(49259327819481212, 19140654913421439); + + assert_eq!(r, transmute(lsx_vaddwod_h_bu_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwev_d_w() { + let a = i32x4::new(1979919903, -1490022083, -1106776488, 2132235386); + let b = i32x4::new(-2090701374, 629564229, -1170676885, 1069800209); + let r = i64x2::new(4070621277, 63900397); + + assert_eq!(r, transmute(lsx_vsubwev_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwev_w_h() { + let a = i16x8::new(1153, -17319, 23560, 30758, -11540, -15757, -5844, -31417); + let b = i16x8::new(-23957, 9416, -29569, -13210, 5333, 8420, 18648, -24201); + let r = i64x2::new(228187317494294, -105188044063209); + + assert_eq!(r, transmute(lsx_vsubwev_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwev_h_b() { + let a = i8x16::new( + 123, 120, -48, 33, 4, -108, -68, -59, 54, 30, 17, -104, -30, -76, -127, -108, + ); + let b = i8x16::new( + -16, 108, -113, 37, -118, 72, 81, 103, 63, -86, -109, -71, -29, 83, -75, 97, + ); + let r = i64x2::new(-41939247539617653, -14355228098887689); + + assert_eq!(r, transmute(lsx_vsubwev_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwod_d_w() { + let a = i32x4::new(-1024625027, -1083407596, 1367079411, 1458097720); + let b = i32x4::new(1436617964, -45524609, 502994793, -2039550077); + let r = i64x2::new(-1037882987, 3497647797); + + assert_eq!(r, transmute(lsx_vsubwod_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwod_w_h() { + let a = i16x8::new(-15137, 29913, 8889, -17237, 31133, 28017, 9070, -18477); + let b = i16x8::new(-1276, 12669, 24115, 19617, -26739, 1910, -757, 23994); + let r = i64x2::new(-158286724709540, -182411556002309); + + assert_eq!(r, transmute(lsx_vsubwod_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwod_h_b() { + let a = i8x16::new( + -25, -19, -117, -1, 9, 24, -16, 93, 9, -77, -36, 75, 0, 126, 74, -106, + ); + let b = i8x16::new( + -91, -3, -112, 5, -88, -14, -1, 8, -100, 65, -26, -24, 41, 124, 17, -108, + ); + let r = i64x2::new(23925540523802608, 562958549909362); + + assert_eq!(r, transmute(lsx_vsubwod_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwev_d_wu() { + let a = u32x4::new(2665672710, 
2360377198, 3032815602, 1049776563); + let b = u32x4::new(1691253880, 1939268473, 1629937431, 2921768539); + let r = i64x2::new(974418830, 1402878171); + + assert_eq!(r, transmute(lsx_vsubwev_d_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwev_w_hu() { + let a = u16x8::new(8298, 25954, 33403, 10264, 36066, 64035, 18750, 26396); + let b = u16x8::new(15957, 42770, 43138, 30319, 50823, 18089, 64120, 18054); + let r = i64x2::new(-41807211666923, -194858371266981); + + assert_eq!(r, transmute(lsx_vsubwev_w_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwev_h_bu() { + let a = u8x16::new( + 128, 1, 20, 37, 75, 38, 156, 224, 7, 26, 190, 76, 144, 59, 175, 99, + ); + let b = u8x16::new( + 141, 113, 141, 61, 31, 32, 161, 158, 220, 37, 240, 180, 56, 229, 5, 26, + ); + let r = i64x2::new(-1407181617889293, 47851128289689387); + + assert_eq!(r, transmute(lsx_vsubwev_h_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwod_d_wu() { + let a = u32x4::new(623751944, 3506098576, 826539449, 2248804942); + let b = u32x4::new(103354715, 19070238, 1662532733, 3761231766); + let r = i64x2::new(3487028338, -1512426824); + + assert_eq!(r, transmute(lsx_vsubwod_d_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwod_w_hu() { + let a = u16x8::new(2891, 21215, 21876, 42023, 37208, 16456, 2023, 54703); + let b = u16x8::new(21739, 45406, 21733, 63910, 6659, 16020, 1211, 637); + let r = i64x2::new(-93999654264447, 232211701825972); + + assert_eq!(r, transmute(lsx_vsubwod_w_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwod_h_bu() { + let a = u8x16::new( + 6, 39, 26, 92, 204, 140, 65, 76, 214, 200, 24, 203, 215, 17, 22, 226, + ); + let b = u8x16::new( + 89, 14, 101, 173, 231, 124, 106, 127, 125, 115, 109, 27, 121, 175, 229, 175, + ); + let r = i64x2::new(-14355150803107815, 14636020195655765); + + assert_eq!(r, transmute(lsx_vsubwod_h_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwev_q_d() { + let a = i64x2::new(-1132117278547342347, -8844779319945501636); + let b = i64x2::new(6738886902337351868, -5985538541381931477); + let r = i64x2::new(5606769623790009521, 0); + + assert_eq!(r, transmute(lsx_vaddwev_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwod_q_d() { + let a = i64x2::new(-8159683400941020659, -1142783567808544783); + let b = i64x2::new(-1244049724346527963, -3275029038845457041); + let r = i64x2::new(-4417812606654001824, -1); + + assert_eq!(r, transmute(lsx_vaddwod_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwev_q_du() { + let a = u64x2::new(16775220860485391359, 8922486068170257729); + let b = u64x2::new(6745766838534849346, 15041258018068294402); + let r = i64x2::new(5074243625310689089, 1); + + assert_eq!(r, transmute(lsx_vaddwev_q_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwod_q_du() { + let a = u64x2::new(17311013772674153390, 11698682577513574290); + let b = u64x2::new(13496765248439164553, 4640846570780442359); + let r = i64x2::new(-2107214925415534967, 0); + + assert_eq!(r, transmute(lsx_vaddwod_q_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwev_q_d() { + let a = i64x2::new(8509296067394123199, 4972040966127046151); + let b = 
i64x2::new(8029026411722387723, -2105201823388787841); + let r = i64x2::new(480269655671735476, 0); + + assert_eq!(r, transmute(lsx_vsubwev_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwod_q_d() { + let a = i64x2::new(-5518792681032609552, -5818770921355494107); + let b = i64x2::new(5758437127240728961, 2933507971643343184); + let r = i64x2::new(-8752278892998837291, -1); + + assert_eq!(r, transmute(lsx_vsubwod_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwev_q_du() { + let a = u64x2::new(15348090063574162992, 4054607174208637377); + let b = u64x2::new(1574118313456291324, 7787456577305510529); + let r = i64x2::new(-4672772323591679948, 0); + + assert_eq!(r, transmute(lsx_vsubwev_q_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsubwod_q_du() { + let a = u64x2::new(7199085452795040192, 586057639195920839); + let b = u64x2::new(5627376085113520030, 12775637764770549815); + let r = i64x2::new(6257163948134922640, -1); + + assert_eq!(r, transmute(lsx_vsubwod_q_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwev_q_du_d() { + let a = u64x2::new(11103722789624608070, 8912888508651245205); + let b = i64x2::new(-1159499132550683978, -4257322329662100669); + let r = i64x2::new(-8502520416635627524, 0); + + assert_eq!(r, transmute(lsx_vaddwev_q_du_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vaddwod_q_du_d() { + let a = u64x2::new(8904095231861536434, 126069624822744729); + let b = i64x2::new(-3902573037873546881, 160140233311333524); + let r = i64x2::new(286209858134078253, 0); + + assert_eq!(r, transmute(lsx_vaddwod_q_du_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwev_d_w() { + let a = i32x4::new(1287102156, 1220933948, 1816088643, -266313269); + let b = i32x4::new(8741677, -276509855, -1214560052, -1338519080); + let r = i64x2::new(11251431313755612, -2205748716678689436); + + assert_eq!(r, transmute(lsx_vmulwev_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwev_w_h() { + let a = i16x8::new(6427, -15587, -29266, -12748, 29941, -16072, -3936, -4131); + let b = i16x8::new(30661, -20472, 1422, -16868, 4256, 9713, -27765, -7287); + let r = i64x2::new(-178740441125036345, 469367082934888736); + + assert_eq!(r, transmute(lsx_vmulwev_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwev_h_b() { + let a = i8x16::new( + -53, -116, -37, -91, -27, -23, 3, -103, -83, 88, 61, -1, 37, 89, -77, -78, + ); + let b = i8x16::new( + 102, -8, -8, -115, -104, 126, 46, 69, -53, 81, -41, 100, -83, -42, -38, -17, + ); + let r = i64x2::new(38855607073696482, 823864071118590255); + + assert_eq!(r, transmute(lsx_vmulwev_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwod_d_w() { + let a = i32x4::new(730217708, -1124949962, -360746398, -1749502167); + let b = i32x4::new(63312847, -1377579771, -2054819244, -1416520586); + let r = i64x2::new(1549708311038418702, 2478205834807109862); + + assert_eq!(r, transmute(lsx_vmulwod_d_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwod_w_h() { + let a = i16x8::new(-16507, -11588, -4739, -32549, -22878, 5561, -6134, -3022); + let b = i16x8::new(23748, 11912, 4946, -23048, 22372, 24702, -24875, -27771); + let r = 
i64x2::new(3222038736804363232, 360450672278114574); + + assert_eq!(r, transmute(lsx_vmulwod_w_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwod_h_b() { + let a = i8x16::new( + -110, 22, -19, -91, 6, 25, -7, 13, 86, -110, -98, -100, -18, -111, 100, 31, + ); + let b = i8x16::new( + 102, 16, -43, -24, -28, 2, 5, -96, 26, 74, -56, 109, -30, 40, -96, 109, + ); + let r = i64x2::new(-351280556043402912, 951366355207905332); + + assert_eq!(r, transmute(lsx_vmulwod_h_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwev_d_wu() { + let a = u32x4::new(2063305123, 761682812, 3318081558, 2848424479); + let b = u32x4::new(1769900227, 2256955703, 2342391995, 2407560006); + let r = i64x2::new(3651844205567962921, 7772247680216328210); + + assert_eq!(r, transmute(lsx_vmulwev_d_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwev_w_hu() { + let a = u16x8::new(9553, 49381, 46053, 13610, 17063, 24513, 41196, 11695); + let b = u16x8::new(20499, 45056, 20580, 12771, 53914, 60742, 45402, 40547); + let r = i64x2::new(4070644332601545987, 8033224333626513014); + + assert_eq!(r, transmute(lsx_vmulwev_w_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwev_h_bu() { + let a = u8x16::new( + 227, 157, 43, 90, 6, 141, 46, 1, 92, 129, 254, 35, 161, 83, 40, 101, + ); + let b = u8x16::new( + 111, 233, 206, 13, 205, 128, 21, 105, 114, 77, 138, 243, 4, 51, 173, 180, + ); + let r = i64x2::new(271910110892810861, 1947809607093856504); + + assert_eq!(r, transmute(lsx_vmulwev_h_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwod_d_wu() { + let a = u32x4::new(2178610550, 1983075871, 1118106927, 2182535205); + let b = u32x4::new(3750239707, 1422851626, 1277923597, 1377279439); + let r = i64x2::new(2821622727533716246, 3005960862740149995); + + assert_eq!(r, transmute(lsx_vmulwod_d_wu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwod_w_hu() { + let a = u16x8::new(63169, 54563, 40593, 32351, 22785, 46152, 51840, 54366); + let b = u16x8::new(38950, 5357, 36233, 17707, 61077, 61518, 5789, 13317); + let r = i64x2::new(2460325445475503463, 3109522059894091248); + + assert_eq!(r, transmute(lsx_vmulwod_w_hu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwod_h_bu() { + let a = u8x16::new( + 143, 18, 19, 120, 134, 160, 86, 206, 25, 26, 241, 198, 207, 50, 233, 169, + ); + let b = u8x16::new( + 244, 115, 210, 167, 103, 242, 182, 127, 214, 208, 47, 86, 54, 81, 161, 139, + ); + let r = i64x2::new(7364114643151226902, 6612146073643521312); + + assert_eq!(r, transmute(lsx_vmulwod_h_bu(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwev_d_wu_w() { + let a = u32x4::new(1829687775, 3948847254, 3506011389, 2834786083); + let b = i32x4::new(1254729285, 1938836163, -1902169358, -257980375); + let r = i64x2::new(2295762833698990875, -6669027432954818262); + + assert_eq!(r, transmute(lsx_vmulwev_d_wu_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwev_w_hu_h() { + let a = u16x8::new(50708, 48173, 47753, 19808, 25837, 56376, 50749, 8070); + let b = i16x8::new(-30477, -10049, 16428, -30668, 21000, 24834, -3219, -9555); + let r = i64x2::new(3369342936690107644, -701630285043265176); + + assert_eq!(r, transmute(lsx_vmulwev_w_hu_h(transmute(a), transmute(b)))); +} + 
+#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwev_h_bu_b() { + let a = u8x16::new( + 196, 15, 88, 70, 49, 17, 144, 62, 34, 164, 51, 69, 162, 88, 100, 31, + ); + let b = i8x16::new( + -92, 119, 90, -113, -83, 119, -28, -14, 57, 93, -21, -38, 42, -105, -67, -73, + ); + let r = i64x2::new(-1134643098233554544, -1885853116779133038); + + assert_eq!(r, transmute(lsx_vmulwev_h_bu_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwod_d_wu_w() { + let a = u32x4::new(3252247725, 3029105766, 3286505645, 1763684728); + let b = i32x4::new(1204047391, -1970001586, 608763444, -2082771896); + let r = i64x2::new(-5967343163181744876, -3673352984882804288); + + assert_eq!(r, transmute(lsx_vmulwod_d_wu_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwod_w_hu_h() { + let a = u16x8::new(38405, 41959, 20449, 33265, 58814, 59003, 64929, 20835); + let b = i16x8::new(-3735, -12972, -4920, 7170, 11577, 9785, 4896, -537); + let r = i64x2::new(1024392868267999948, -48053790042385565); + + assert_eq!(r, transmute(lsx_vmulwod_w_hu_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwod_h_bu_b() { + let a = u8x16::new( + 78, 246, 141, 207, 212, 16, 30, 141, 71, 187, 92, 123, 199, 224, 105, 250, + ); + let b = i8x16::new( + 46, 11, 86, 64, -118, -53, 125, 48, -122, 104, 53, -111, 39, 16, -94, -56, + ); + let r = i64x2::new(1905300476090387090, -3940634277386171400); + + assert_eq!(r, transmute(lsx_vmulwod_h_bu_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwev_q_d() { + let a = i64x2::new(-7300892474466935547, -2126323416087979991); + let b = i64x2::new(7023560313675997328, 4368639658790376608); + let r = i64x2::new(-1409563343912029488, -2779799970834089134); + + assert_eq!(r, transmute(lsx_vmulwev_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwod_q_d() { + let a = i64x2::new(-333821925237206080, -2872872657001472243); + let b = i64x2::new(1734538850547798281, 6505001633960390309); + let r = i64x2::new(655114704133495137, -1013080750363369114); + + assert_eq!(r, transmute(lsx_vmulwod_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwev_q_du() { + let a = u64x2::new(7574912843445409775, 6458810692359816933); + let b = u64x2::new(15048173707940873365, 13594773395779002998); + let r = i64x2::new(-4049323972691826149, 6179334620527225413); + + assert_eq!(r, transmute(lsx_vmulwev_q_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwod_q_du() { + let a = u64x2::new(4945250618288414185, 5836523005600515765); + let b = u64x2::new(16172423495582959833, 11676106279348566952); + let r = i64x2::new(-66293137947075128, 3694303051148166412); + + assert_eq!(r, transmute(lsx_vmulwod_q_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwev_q_du_d() { + let a = u64x2::new(15472635927451755137, 2872062649560660647); + let b = i64x2::new(-7071166739782294817, 8496829998090419991); + let r = i64x2::new(5234431817964974175, -5931105679667820544); + + assert_eq!(r, transmute(lsx_vmulwev_q_du_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmulwod_q_du_d() { + let a = u64x2::new(2980498025260165803, 6347157252532266677); + let b = i64x2::new(-9085162554263782091, -3351642387065053502); + let r = i64x2::new(-3119502026085414102, -1153233394465180223); + 
+ assert_eq!(r, transmute(lsx_vmulwod_q_du_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhaddw_q_d() { + let a = i64x2::new(-7668184096931639781, -2784020394780249366); + let b = i64x2::new(9222966760421493517, -8347454331188625422); + let r = i64x2::new(6438946365641244151, 0); + + assert_eq!(r, transmute(lsx_vhaddw_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhaddw_qu_du() { + let a = u64x2::new(16989728354409608690, 2941626047560944845); + let b = u64x2::new(2141387370256045519, 12417156199252644485); + let r = i64x2::new(5083013417816990364, 0); + + assert_eq!(r, transmute(lsx_vhaddw_qu_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhsubw_q_d() { + let a = i64x2::new(4415650624918824808, -2427685530964051137); + let b = i64x2::new(-3245503809142406078, 8660213762027125085); + let r = i64x2::new(817818278178354941, 0); + + assert_eq!(r, transmute(lsx_vhsubw_q_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vhsubw_qu_du() { + let a = u64x2::new(13300663635362906510, 12554343611316218179); + let b = u64x2::new(3098179646743711521, 11374525358855478565); + let r = i64x2::new(-8990580109137044958, 0); + + assert_eq!(r, transmute(lsx_vhsubw_qu_du(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwev_d_w() { + let a = i64x2::new(7507491558224723369, 7356288879446926343); + let b = i32x4::new(-1410295112, 176083487, 1092174685, 1464381516); + let c = i32x4::new(1610457028, -1105361927, -790658106, -1804307944); + let r = i64x2::new(5236271883550276233, 6492752111583679733); + + assert_eq!( + r, + transmute(lsx_vmaddwev_d_w(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwev_w_h() { + let a = i32x4::new(1210747897, 1541928975, -720014144, -2019635451); + let b = i16x8::new(12181, 16380, -24682, -13729, 12128, -21312, -23449, 17); + let c = i16x8::new(-27087, 21294, 30093, 5456, 28491, -25365, -18595, 14478); + let r = i64x2::new(3432424257664054654, -6801515772302723616); + + assert_eq!( + r, + transmute(lsx_vmaddwev_w_h(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwev_h_b() { + let a = i16x8::new(-26961, 27058, -26746, 7019, 27143, -20720, 20159, -22095); + let b = i8x16::new( + 126, 29, -29, 63, -17, 109, 56, 67, 91, -76, 83, -101, 51, 39, -109, 16, + ); + let c = i8x16::new( + -40, -36, -53, -47, -78, 33, -97, -54, 21, 103, 69, 101, 33, -83, 79, -6, + ); + let r = i64x2::new(446873086821892863, -8642876820889308802); + + assert_eq!( + r, + transmute(lsx_vmaddwev_h_b(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwev_d_wu() { + let a = u64x2::new(3288783601225499701, 17730813816531737481); + let b = u32x4::new(2583154680, 1751994654, 1115446691, 3761972534); + let c = u32x4::new(1143913546, 2487138808, 577997991, 917071165); + let r = i64x2::new(6243689231090794981, -71204310712216354); + + assert_eq!( + r, + transmute(lsx_vmaddwev_d_wu(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwev_w_hu() { + let a = u32x4::new(805734379, 3876931235, 2135371653, 3482539797); + let b = u16x8::new(7507, 65354, 30738, 63434, 34178, 38533, 8774, 9013); + let c = u16x8::new(32752, 10153, 5275, 7485, 55213, 62803, 43040, 42218); + let r = 
i64x2::new(-1099052541965094213, -1867428321461954977); + + assert_eq!( + r, + transmute(lsx_vmaddwev_w_hu(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwev_h_bu() { + let a = u16x8::new(55814, 6276, 42400, 55862, 19175, 17360, 30132, 17253); + let b = u8x16::new( + 148, 50, 79, 199, 193, 25, 144, 93, 18, 182, 102, 150, 226, 222, 254, 1, + ); + let c = u8x16::new( + 141, 28, 169, 93, 60, 134, 117, 80, 43, 12, 75, 85, 174, 176, 62, 94, + ); + let r = i64x2::new(2019533326543170442, -9157771529370317331); + + assert_eq!( + r, + transmute(lsx_vmaddwev_h_bu(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwod_d_w() { + let a = i64x2::new(1296033816549937177, -2404834118264545479); + let b = i32x4::new(-2135765262, -1741194198, -1750008434, -242816495); + let c = i32x4::new(178412146, 887047455, -1630315539, 57253350); + let r = i64x2::new(-248488065446728913, -2418736176038553729); + + assert_eq!( + r, + transmute(lsx_vmaddwod_d_w(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwod_w_h() { + let a = i32x4::new(1810262555, -720984423, 744322940, -172229387); + let b = i16x8::new(27584, -15468, -21544, -11891, -16682, 18538, -7573, -1522); + let c = i16x8::new(-8815, 3268, -32219, -7020, 13853, 26700, -2030, -5667); + let r = i64x2::new(-2738082894011230357, -702674743083530508); + + assert_eq!( + r, + transmute(lsx_vmaddwod_w_h(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwod_h_b() { + let a = i16x8::new(32731, -16929, 397, 14417, 22494, 1416, 1669, -12175); + let b = i8x16::new( + 87, 77, -44, -128, -69, 120, 82, -99, -21, 66, -47, -59, -35, 90, -85, 94, + ); + let c = i8x16::new( + 87, -119, -48, 10, 26, -36, 89, -16, 91, -74, -116, 7, 78, 17, -9, -98, + ); + let r = i64x2::new(4504145731268860944, -6019891587244669750); + + assert_eq!( + r, + transmute(lsx_vmaddwod_h_b(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwod_d_wu() { + let a = u64x2::new(8272899369384595612, 11592257149528470828); + let b = u32x4::new(244745450, 2190106289, 660562971, 1842569843); + let c = u32x4::new(388973541, 2963125445, 520938623, 340863345); + let r = i64x2::new(-3684285032134532399, -6226422404099975953); + + assert_eq!( + r, + transmute(lsx_vmaddwod_d_wu(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwod_w_hu() { + let a = u32x4::new(2163417444, 940670316, 624242075, 3716350419); + let b = u16x8::new(10149, 33560, 21613, 61563, 14556, 33558, 30440, 63972); + let c = u16x8::new(9862, 40610, 42783, 2223, 62194, 15996, 61261, 33667); + let r = i64x2::new(4627934059328104084, 6765125168025305155); + + assert_eq!( + r, + transmute(lsx_vmaddwod_w_hu(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwod_h_bu() { + let a = u16x8::new(17882, 7508, 14715, 47175, 62895, 51393, 34943, 20707); + let b = u8x16::new( + 83, 27, 56, 178, 210, 166, 36, 48, 144, 156, 209, 6, 181, 65, 232, 42, + ); + let c = u8x16::new( + 127, 23, 147, 75, 137, 205, 146, 169, 72, 89, 154, 45, 185, 229, 28, 217, + ); + let r = i64x2::new(-2884627676759701433, 8394079293504695275); + + assert_eq!( + r, + transmute(lsx_vmaddwod_h_bu(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn 
test_lsx_vmaddwev_d_wu_w() { + let a = i64x2::new(-6323015107493705206, -3277448760143472563); + let b = u32x4::new(2331684563, 1941329953, 2983229925, 1155461882); + let c = i32x4::new(-1110134113, -106291268, -391880820, 644991581); + let r = i64x2::new(-8911497681635502825, -4446519349401011063); + + assert_eq!( + r, + transmute(lsx_vmaddwev_d_wu_w( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwev_w_hu_h() { + let a = i32x4::new(1713941452, 1545069267, -1096163566, -573017556); + let b = u16x8::new(28055, 23297, 30225, 2761, 48193, 19269, 2518, 51038); + let c = i16x8::new(-7715, -18819, -4701, -3778, 7207, 5810, -4430, -8060); + let r = i64x2::new(6025759841279147559, -2509000903003100935); + + assert_eq!( + r, + transmute(lsx_vmaddwev_w_hu_h( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwev_h_bu_b() { + let a = i16x8::new(27922, 26192, 14273, -18511, -13090, 27036, 4607, 27830); + let b = u8x16::new( + 85, 234, 241, 30, 218, 135, 230, 175, 34, 217, 231, 43, 159, 81, 198, 89, + ); + let c = i8x16::new( + 82, -91, 49, -114, 60, -32, -30, 17, 3, 82, -73, -55, -31, -106, -23, -44, + ); + let r = i64x2::new(-7152443150463563700, 6551891650581220676); + + assert_eq!( + r, + transmute(lsx_vmaddwev_h_bu_b( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwod_d_wu_w() { + let a = i64x2::new(4995790344325484125, -3678161850757174337); + let b = u32x4::new(770268311, 2190608617, 3264567056, 3912406971); + let c = i32x4::new(1039193627, -382136981, 178615845, -2029105420); + let r = i64x2::new(4158677780872518848, 6829896032850494459); + + assert_eq!( + r, + transmute(lsx_vmaddwod_d_wu_w( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwod_w_hu_h() { + let a = i32x4::new(-1650648862, 112052630, 369411463, -1789144688); + let b = u16x8::new(33326, 2589, 54571, 14483, 51494, 10946, 54991, 11715); + let c = i16x8::new(-13502, 9856, -7830, -1915, 23659, -23776, -29716, 15794); + let r = i64x2::new(362141702219265378, -6889634254326488121); + + assert_eq!( + r, + transmute(lsx_vmaddwod_w_hu_h( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwod_h_bu_b() { + let a = i16x8::new(16717, -21485, 6612, -8821, -31304, -13638, -10878, -27550); + let b = u8x16::new( + 99, 203, 114, 187, 131, 179, 178, 24, 220, 126, 23, 139, 118, 148, 39, 18, + ); + let c = i8x16::new( + 99, -47, 53, -116, 110, -65, -107, 123, -42, -51, -120, -102, 51, -56, -103, -58, + ); + let r = i64x2::new(-1651716735493530616, -8048296323958936418); + + assert_eq!( + r, + transmute(lsx_vmaddwod_h_bu_b( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwev_q_d() { + let a = i64x2::new(-6837031335752177395, -6960992767212208666); + let b = i64x2::new(-4435069404701670756, -2126315287755608563); + let c = i64x2::new(-5551390506600609458, -6711686916497928751); + let r = i64x2::new(-8173734519403794283, -5626296406109360320); + + assert_eq!( + r, + transmute(lsx_vmaddwev_q_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwod_q_d() { + let a = i64x2::new(-1677869231369184389, 8708214911109206592); + let b = i64x2::new(-7813673205639863330, -9004405202552727709); + let c = 
i64x2::new(989988865428690976, 7138926957150547746); + let r = i64x2::new(-1125748635129453663, 5223492036614230927); + + assert_eq!( + r, + transmute(lsx_vmaddwod_q_d(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwev_q_du() { + let a = u64x2::new(17268971871627349752, 17228948998305822956); + let b = u64x2::new(10411505101371540933, 14258056959108407269); + let c = u64x2::new(10083084353835617951, 7442290876599468511); + let r = i64x2::new(4362805751568378451, 4473186691787239539); + + assert_eq!( + r, + transmute(lsx_vmaddwev_q_du(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwod_q_du() { + let a = u64x2::new(14967144687255063091, 6224733010665264496); + let b = u64x2::new(17625137945884588260, 1535023950244313744); + let c = u64x2::new(1841326774698258895, 9587959489663720036); + let r = i64x2::new(1938476888214276723, 7022583698667268618); + + assert_eq!( + r, + transmute(lsx_vmaddwod_q_du(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwev_q_du_d() { + let a = i64x2::new(7413074575332965326, -6131981171876880542); + let b = u64x2::new(7027881729907986450, 9385132453710384328); + let c = i64x2::new(6154882990643114022, 8692307970783152636); + let r = i64x2::new(-8494196038584058246, -3787080112545186901); + + assert_eq!( + r, + transmute(lsx_vmaddwev_q_du_d( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmaddwod_q_du_d() { + let a = i64x2::new(-3567580028466810679, 82284695558926958); + let b = u64x2::new(12724355976909764846, 2153966982409398933); + let c = i64x2::new(-2209580291901273167, -3993952038101553236); + let r = i64x2::new(-613602630799693851, -384076239737958818); + + assert_eq!( + r, + transmute(lsx_vmaddwod_q_du_d( + transmute(a), + transmute(b), + transmute(c) + )) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vrotr_b() { + let a = i8x16::new( + -115, -5, 112, 87, -91, -10, -42, -109, -71, 30, 80, 109, -37, -36, -82, -61, + ); + let b = i8x16::new( + 98, 80, -27, -51, -44, -43, 28, -49, -47, 12, -100, -113, 35, -85, 9, 23, + ); + let r = i64x2::new(2841128540244802403, -8694309599374351908); + + assert_eq!(r, transmute(lsx_vrotr_b(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vrotr_h() { + let a = i16x8::new(29688, -22641, 11287, 9743, 29744, -9683, -24918, 28489); + let b = i16x8::new(-6485, 1418, 8263, -29872, -6491, 3930, -20621, 32531); + let r = i64x2::new(2742461657407651598, 3308267577913279393); + + assert_eq!(r, transmute(lsx_vrotr_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vrotr_w() { + let a = i32x4::new(-232185187, -1057829624, -1428233439, 314333357); + let b = i32x4::new(1956224189, -1858012941, -1889446514, -2130978943); + let r = i64x2::new(6458469860191573231, -8548346292466177157); + + assert_eq!(r, transmute(lsx_vrotr_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vrotr_d() { + let a = i64x2::new(-8694664621869506061, 3293016169868759706); + let b = i64x2::new(4553458262651691654, -5062393334123159235); + let r = i64x2::new(-3594618648537251961, 7897385285240526033); + + assert_eq!(r, transmute(lsx_vrotr_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vadd_q() { + let a = i64x2::new(2423569640801257553, 678073579687698205); + let 
b = i64x2::new(114135477458514099, 3481307531297359399); + let r = i64x2::new(2537705118259771652, 4159381110985057604); + + assert_eq!(r, transmute(lsx_vadd_q(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsub_q() { + let a = i64x2::new(7892977690518598837, -3112927447911510492); + let b = i64x2::new(-8526086848853095438, -1323481969747305966); + let r = i64x2::new(-2027679534337857341, -1789445478164204527); + + assert_eq!(r, transmute(lsx_vsub_q(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vldrepl_b() { + let a: [i8; 16] = [ + -88, 52, -104, -111, 84, -101, -36, 49, 31, 10, 34, -78, 22, 22, 118, 80, + ]; + let r = i64x2::new(-6293595036912670552, -6293595036912670552); + + assert_eq!(r, transmute(lsx_vldrepl_b::<0>(a.as_ptr()))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vldrepl_h() { + let a: [i8; 16] = [ + 29, 81, 114, -8, 70, 29, 100, 46, 105, 38, -10, -58, 2, 66, -104, -43, + ]; + let r = i64x2::new(5844917077753549085, 5844917077753549085); + + assert_eq!(r, transmute(lsx_vldrepl_h::<0>(a.as_ptr()))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vldrepl_w() { + let a: [i8; 16] = [ + -56, -83, -27, -88, 85, -105, 81, -74, 124, -76, -29, 34, 99, 36, 36, 37, + ]; + let r = i64x2::new(-6276419428332229176, -6276419428332229176); + + assert_eq!(r, transmute(lsx_vldrepl_w::<0>(a.as_ptr()))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vldrepl_d() { + let a: [i8; 16] = [ + 90, -84, 7, 91, -2, 32, 74, 2, -4, 119, 62, 98, -112, -127, -109, 101, + ]; + let r = i64x2::new(164980613173455962, 164980613173455962); + + assert_eq!(r, transmute(lsx_vldrepl_d::<0>(a.as_ptr()))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmskgez_b() { + let a = i8x16::new( + -121, 102, -85, -2, -103, 100, 119, -46, 35, -16, -66, -43, -61, 79, 40, -43, + ); + let r = i64x2::new(24930, 0); + + assert_eq!(r, transmute(lsx_vmskgez_b(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vmsknz_b() { + let a = i8x16::new( + -25, 93, 124, 56, -119, -93, -123, 118, -27, 16, -22, 58, -59, 69, 63, -66, + ); + let r = i64x2::new(65535, 0); + + assert_eq!(r, transmute(lsx_vmsknz_b(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vexth_h_b() { + let a = i8x16::new( + -86, 119, 29, -97, -55, -30, 39, -102, 85, 73, 20, -12, -94, 53, 30, 114, + ); + let r = i64x2::new(-3377613816397739, 32088276197572514); + + assert_eq!(r, transmute(lsx_vexth_h_b(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vexth_w_h() { + let a = i16x8::new(14576, -26514, 14165, -15781, 10106, 1864, 23348, 30478); + let r = i64x2::new(8005819049850, 130902013270836); + + assert_eq!(r, transmute(lsx_vexth_w_h(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vexth_d_w() { + let a = i32x4::new(863783254, 799653326, -1122161877, -652869192); + let r = i64x2::new(-1122161877, -652869192); + + assert_eq!(r, transmute(lsx_vexth_d_w(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vexth_q_d() { + let a = i64x2::new(2924262436748867523, 1959694872821330818); + let r = i64x2::new(1959694872821330818, 0); + + assert_eq!(r, transmute(lsx_vexth_q_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vexth_hu_bu() { + let a = u8x16::new( + 88, 245, 152, 181, 22, 122, 243, 162, 170, 115, 212, 217, 148, 176, 60, 214, + ); + let r = i64x2::new(61080980486815914, 60235902725652628); + + assert_eq!(r, 
transmute(lsx_vexth_hu_bu(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vexth_wu_hu() { + let a = u16x8::new(58875, 18924, 17611, 30197, 33869, 53931, 4693, 53025); + let r = i64x2::new(231631881274445, 227740640875093); + + assert_eq!(r, transmute(lsx_vexth_wu_hu(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vexth_du_wu() { + let a = u32x4::new(3499742961, 2840979237, 2082263829, 1096292547); + let r = i64x2::new(2082263829, 1096292547); + + assert_eq!(r, transmute(lsx_vexth_du_wu(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vexth_qu_du() { + let a = u64x2::new(14170556367894986991, 14238702840099699193); + let r = i64x2::new(-4208041233609852423, 0); + + assert_eq!(r, transmute(lsx_vexth_qu_du(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vrotri_b() { + let a = i8x16::new( + 7, 49, -22, -120, -94, 53, -19, 95, -84, -30, 31, -25, 30, -98, -86, -5, + ); + let r = i64x2::new(-2919654548887155519, -96080239582005205); + + assert_eq!(r, transmute(lsx_vrotri_b::<2>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vrotri_h() { + let a = i16x8::new(-14120, -16812, -19570, -990, 24476, -7640, 20329, 8879); + let r = i64x2::new(-556925602567188047, 4998607264501841720); + + assert_eq!(r, transmute(lsx_vrotri_h::<15>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vrotri_w() { + let a = i32x4::new(-1760224525, -1644621284, 1835781046, -1487934110); + let r = i64x2::new(2845787365010917052, -6209343103231659283); + + assert_eq!(r, transmute(lsx_vrotri_w::<2>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vrotri_d() { + let a = i64x2::new(8884634342417174882, 244175985366916345); + let r = i64x2::new(-3963790888197019724, 4020656082573561910); + + assert_eq!(r, transmute(lsx_vrotri_d::<52>(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vextl_q_d() { + let a = i64x2::new(-5110246490938885255, 377414780188285171); + let r = i64x2::new(-5110246490938885255, -1); + + assert_eq!(r, transmute(lsx_vextl_q_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlni_b_h() { + let a = i8x16::new( + -62, -32, -115, -97, -74, 113, -113, -4, 10, 39, 102, -3, 38, 83, -88, 73, + ); + let b = i8x16::new( + 115, 89, -35, 113, -13, 93, -90, -127, -73, -66, -71, 19, 37, 76, -89, 116, + ); + let r = i64x2::new(72339077638193409, 72342367599919619); + + assert_eq!( + r, + transmute(lsx_vsrlni_b_h::<14>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlni_h_w() { + let a = i16x8::new(4205, -10016, 6553, 16160, 26411, 29470, -20643, 30057); + let b = i16x8::new(-20939, 15459, 13368, -29800, -25275, -15723, 30837, 7321); + let r = i64x2::new(1970530997633039, 8162894584676406); + + assert_eq!( + r, + transmute(lsx_vsrlni_h_w::<26>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlni_w_d() { + let a = i32x4::new(1705975377, 322077350, -1922153156, -661241171); + let b = i32x4::new(1098943214, -1567917396, 297055649, -1122208150); + let r = i64x2::new(2133162980935405664, -8022209066041763477); + + assert_eq!( + r, + transmute(lsx_vsrlni_w_d::<18>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlni_d_q() { + let a = i64x2::new(6325216582707926854, -5129479093920978170); + let b = i64x2::new(3985485829689892785, 7685789624553197779); + let r = i64x2::new(7505653930227732, 
13005141581824778); + + assert_eq!( + r, + transmute(lsx_vsrlni_d_q::<74>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlrni_b_h() { + let a = i8x16::new( + -103, -39, -112, -128, -96, 40, -89, 40, -55, 102, 37, -49, 96, -107, 26, 16, + ); + let b = i8x16::new( + -57, 51, 17, 1, 37, 120, -54, 78, -67, 36, 0, -121, -113, 27, -9, 74, + ); + let r = i64x2::new(3201527803797374159, 4635960605099098726); + + assert_eq!( + r, + transmute(lsx_vsrlrni_b_h::<6>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlrni_h_w() { + let a = i16x8::new(16435, -5399, -4992, 1377, -27419, -9060, 28877, -12666); + let b = i16x8::new(30165, -32344, 15225, 17457, -5900, -17127, -30430, 21140); + let r = i64x2::new(5919251242624655831, 1856453178786227457); + + assert_eq!( + r, + transmute(lsx_vsrlrni_h_w::<6>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlrni_w_d() { + let a = i32x4::new(-1783593075, -767627057, 522051412, 1497970809); + let b = i32x4::new(-613709101, 1782777798, -1376237383, -2108949489); + let r = i64x2::new(8955006813860, 6137508269348); + + assert_eq!( + r, + transmute(lsx_vsrlrni_w_d::<52>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrlrni_d_q() { + let a = i64x2::new(-8390257423140334242, -5915059672723228155); + let b = i64x2::new(4065462044175592876, 5861150325027293506); + let r = i64x2::new(42645481, 91180005); + + assert_eq!( + r, + transmute(lsx_vsrlrni_d_q::<101>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlni_b_h() { + let a = i8x16::new( + -126, 26, 50, 111, 24, 36, -59, -44, -12, 82, 16, -39, 10, 27, -76, -81, + ); + let b = i8x16::new( + -72, -74, 3, -16, -50, -40, 17, -39, -88, 33, -11, -74, 27, 104, -56, 35, + ); + let r = i64x2::new(72907520922224389, 360294575950070528); + + assert_eq!( + r, + transmute(lsx_vssrlni_b_h::<13>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlni_h_w() { + let a = i16x8::new(8928, 556, 327, 11357, -32577, 24481, -16101, -875); + let b = i16x8::new(12, -2621, -27458, -24262, 23377, 16952, 19498, -31793); + let r = i64x2::new(74028485831688683, 142145683583401988); + + assert_eq!( + r, + transmute(lsx_vssrlni_h_w::<23>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlni_w_d() { + let a = i32x4::new(1838928968, 1883060425, -990389689, 735664934); + let b = i32x4::new(-971263991, -98050158, 134746673, -49144118); + let r = i64x2::new(9223372034707292159, 9223372034707292159); + + assert_eq!( + r, + transmute(lsx_vssrlni_w_d::<12>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlni_d_q() { + let a = i64x2::new(-5470954942766391223, 2164868713336601834); + let b = i64x2::new(-3507919664178941311, 8800311307152269561); + let r = i64x2::new(524539429375, 129036230643); + + assert_eq!( + r, + transmute(lsx_vssrlni_d_q::<88>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlni_bu_h() { + let a = u8x16::new( + 42, 80, 7, 61, 49, 172, 110, 186, 30, 201, 214, 72, 201, 231, 144, 223, + ); + let b = i8x16::new( + 39, 98, -57, 124, 78, 127, 89, 26, 44, 57, 9, -36, -100, -41, 7, 30, + ); + let r = i64x2::new(1695451225195267, 434318113941815554); + + assert_eq!( + r, + transmute(lsx_vssrlni_bu_h::<13>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = 
"lsx")] +unsafe fn test_lsx_vssrlni_hu_w() { + let a = u16x8::new(47562, 12077, 58166, 40959, 47625, 4449, 45497, 47932); + let b = i16x8::new(25513, -19601, -22702, -15840, 32377, 32023, -4115, 25327); + let r = i64x2::new(-1, -1); + + assert_eq!( + r, + transmute(lsx_vssrlni_hu_w::<9>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlni_wu_d() { + let a = u32x4::new(3924399037, 1624231459, 1033186938, 4207801648); + let b = i32x4::new(-343671492, 63408059, -17420952, -742649266); + let r = i64x2::new(111669149696, 133143986188); + + assert_eq!( + r, + transmute(lsx_vssrlni_wu_d::<59>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlni_du_q() { + let a = u64x2::new(9385373857335523158, 8829548075644432850); + let b = i64x2::new(1935200102096005901, -4336418136884591685); + let r = i64x2::new(-1, -1); + + assert_eq!( + r, + transmute(lsx_vssrlni_du_q::<6>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlrni_b_h() { + let a = i8x16::new( + -118, -53, 124, -32, -8, -106, -30, 125, 80, -118, 111, -49, 2, -54, -109, -63, + ); + let b = i8x16::new( + -128, 104, -60, -21, -28, 47, -78, 125, -65, -31, 111, 127, -102, -50, 87, 102, + ); + let r = i64x2::new(9187201950435737471, 9187201950435737471); + + assert_eq!( + r, + transmute(lsx_vssrlrni_b_h::<0>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlrni_h_w() { + let a = i16x8::new(-6932, -27303, 5931, 1697, 23680, -18344, 21222, 31527); + let b = i16x8::new(16541, 32147, -26353, -15678, -7913, -31777, 12521, -25215); + let r = i64x2::new(2814784127631368, 2251851353292809); + + assert_eq!( + r, + transmute(lsx_vssrlrni_h_w::<28>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlrni_w_d() { + let a = i32x4::new(-528492260, 635780412, 2102955910, -106415932); + let b = i32x4::new(-1062242289, 359654281, 1831754020, 1455206052); + let r = i64x2::new(9223372034707292159, 9223372034707292159); + + assert_eq!( + r, + transmute(lsx_vssrlrni_w_d::<1>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlrni_d_q() { + let a = i64x2::new(-2050671473765220606, -974956007142498603); + let b = i64x2::new(4675761647927162976, -5100418369989582579); + let r = i64x2::new(9223372036854775807, 9223372036854775807); + + assert_eq!( + r, + transmute(lsx_vssrlrni_d_q::<60>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlrni_bu_h() { + let a = u8x16::new( + 100, 79, 212, 163, 219, 225, 100, 84, 1, 173, 146, 41, 33, 251, 175, 18, + ); + let b = i8x16::new( + 104, -36, 123, 103, -26, -37, -104, -46, 107, -89, 120, 33, 117, -54, 107, 105, + ); + let r = i64x2::new(217862753078412039, 74310514888869122); + + assert_eq!( + r, + transmute(lsx_vssrlrni_bu_h::<13>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlrni_hu_w() { + let a = u16x8::new(35722, 45502, 51777, 63215, 9369, 33224, 15844, 23578); + let b = i16x8::new(-18038, 23224, 26314, -15841, 826, -15682, -4109, -24970); + let r = i64x2::new(22236939778326573, 12948128109625433); + + assert_eq!( + r, + transmute(lsx_vssrlrni_hu_w::<25>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlrni_wu_d() { + let a = u32x4::new(1956924769, 1833875292, 1956412037, 426346371); + let b = i32x4::new(-1128409795, 198077570, -1649408138, 
1665566624); + let r = i64x2::new(447097136224200392, 114446481822641014); + + assert_eq!( + r, + transmute(lsx_vssrlrni_wu_d::<36>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlrni_du_q() { + let a = u64x2::new(9048079498548224395, 9603999840623079368); + let b = i64x2::new(-404424089294655868, 5140892317651856748); + let r = i64x2::new(-1, -1); + + assert_eq!( + r, + transmute(lsx_vssrlrni_du_q::<38>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrani_b_h() { + let a = i8x16::new( + 127, 75, -70, 122, 36, 105, 73, 54, -17, 44, 92, -80, 11, -110, 81, 51, + ); + let b = i8x16::new( + -72, 6, 81, -61, -8, -96, 24, 77, 30, -20, 95, -20, 69, -37, -109, 35, + ); + let r = i64x2::new(2079082344186583605, -7309198813337889445); + + assert_eq!( + r, + transmute(lsx_vsrani_b_h::<5>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrani_h_w() { + let a = i16x8::new(17089, -15383, 6606, 11797, -17230, -236, 24622, 14114); + let b = i16x8::new(4129, 30226, -29368, -25031, 7609, -18203, 28351, -1400); + let r = i64x2::new(-8724789849496477438, 2738834860014343212); + + assert_eq!( + r, + transmute(lsx_vsrani_h_w::<4>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrani_w_d() { + let a = i32x4::new(-382819185, 386357255, 35446809, 1387491503); + let b = i32x4::new(934617213, -1024433792, -516094326, 1363620957); + let r = i64x2::new(5130829100463783991, -5516717120280852503); + + assert_eq!( + r, + transmute(lsx_vsrani_w_d::<24>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrani_d_q() { + let a = i64x2::new(-6766658862703543347, -8101175034272755526); + let b = i64x2::new(-6351802365852683233, -7612236351910354649); + let r = i64x2::new(-58076754393848, -61807060503180); + + assert_eq!( + r, + transmute(lsx_vsrani_d_q::<81>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrarni_b_h() { + let a = i8x16::new( + -71, 50, -70, -110, 89, 96, -70, 126, 10, 119, -124, -91, -44, -66, -120, -110, + ); + let b = i8x16::new( + -118, 101, -58, -7, -118, 69, 75, 88, 75, -76, -41, -37, 13, -46, -84, 68, + ); + let r = i64x2::new(-7619391791054112335, 5898503720505399127); + + assert_eq!( + r, + transmute(lsx_vsrarni_b_h::<3>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrarni_h_w() { + let a = i16x8::new(-13195, 28211, 7711, -1401, -1145, -27232, 15206, 23526); + let b = i16x8::new(-21087, 18713, -7401, -30000, 25577, -10794, -28633, -25187); + let r = i64x2::new(4268193831744344627, -5202735902940537752); + + assert_eq!( + r, + transmute(lsx_vsrarni_h_w::<15>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrarni_w_d() { + let a = i32x4::new(-2004832894, -772030708, -2044339682, -161994376); + let b = i32x4::new(-314559979, 1401503238, -738119523, -2036313194); + let r = i64x2::new(-64424509430, -6); + + assert_eq!( + r, + transmute(lsx_vsrarni_w_d::<59>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vsrarni_d_q() { + let a = i64x2::new(2532701208156415278, 7815982649469220899); + let b = i64x2::new(-202407401251467620, 284380589150850504); + let r = i64x2::new(-202407401251467620, 2532701208156415278); + + assert_eq!( + r, + transmute(lsx_vsrarni_d_q::<0>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] 
+unsafe fn test_lsx_vssrani_b_h() { + let a = i8x16::new( + -50, 30, 4, -123, 102, 17, -127, 79, -3, 54, -91, 77, -81, -74, -32, 6, + ); + let b = i8x16::new( + -125, 114, -41, -31, 70, 17, -109, 98, -43, -79, -24, -39, -79, 49, -43, 61, + ); + let r = i64x2::new(9187203054242332799, 9187483425412448383); + + assert_eq!( + r, + transmute(lsx_vssrani_b_h::<0>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrani_h_w() { + let a = i16x8::new(-13653, 21802, 26851, -30910, -21293, -13050, -24174, 29805); + let b = i16x8::new(9604, -27726, -18692, 147, 23503, 3941, -18536, -25864); + let r = i64x2::new(-1970324836909063, 2251786928259077); + + assert_eq!( + r, + transmute(lsx_vssrani_h_w::<28>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrani_w_d() { + let a = i32x4::new(640738652, 568129780, 2099035547, 1750495014); + let b = i32x4::new(2090153020, 2002243310, 567374078, -1386845950); + let r = i64x2::new(-45445048943701, 57359288242414); + + assert_eq!( + r, + transmute(lsx_vssrani_w_d::<49>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrani_d_q() { + let a = i64x2::new(8313689526826187568, -7067970090029512662); + let b = i64x2::new(-7547166008384655380, 9056943104343751836); + let r = i64x2::new(138197984380245, -107848664703820); + + assert_eq!( + r, + transmute(lsx_vssrani_d_q::<80>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrani_bu_h() { + let a = u8x16::new( + 110, 23, 112, 128, 94, 127, 141, 246, 144, 229, 149, 191, 73, 211, 119, 89, + ); + let b = i8x16::new( + 9, -116, 68, -122, 13, -17, -90, 29, -22, -126, 50, 2, -50, -121, 124, -18, + ); + let r = i64x2::new(0, 72057594037993472); + + assert_eq!( + r, + transmute(lsx_vssrani_bu_h::<14>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrani_hu_w() { + let a = u16x8::new(23583, 19333, 39698, 13735, 15385, 8819, 61012, 57430); + let b = i16x8::new(-18676, -5045, 14040, 25346, -27192, -27172, 13333, 12330); + let r = i64x2::new(27021597777199104, 292064788631); + + assert_eq!( + r, + transmute(lsx_vssrani_hu_w::<23>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrani_wu_d() { + let a = u32x4::new(3826341651, 1946901217, 3504547080, 2702234829); + let b = i32x4::new(1013240156, -1783678601, -91667235, 485058283); + let r = i64x2::new(-4294967296, 4294967295); + + assert_eq!( + r, + transmute(lsx_vssrani_wu_d::<13>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrani_du_q() { + let a = u64x2::new(16452622598975149813, 15788367695672970142); + let b = i64x2::new(3271075037846423078, -4777595873776840194); + let r = i64x2::new(0, 0); + + assert_eq!( + r, + transmute(lsx_vssrani_du_q::<33>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrarni_b_h() { + let a = i8x16::new( + -76, 3, 89, 123, 98, -91, 87, 101, 75, 77, -114, 117, -78, 10, -64, 13, + ); + let b = i8x16::new( + 125, 49, 97, -128, -38, 61, 29, 1, -108, 54, 28, -65, -22, -3, 71, -12, + ); + let r = i64x2::new(-9187201955687071617, 9187201950435803007); + + assert_eq!( + r, + transmute(lsx_vssrarni_b_h::<2>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrarni_h_w() { + let a = i16x8::new(-5012, 11989, 5954, -22500, 4485, 31359, 28715, -16160); + let b = i16x8::new(29828, -15046, 
20055, -7703, 18306, -411, -15337, 30957); + let r = i64x2::new(1125904201809918, -562928478781439); + + assert_eq!( + r, + transmute(lsx_vssrarni_h_w::<29>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrarni_w_d() { + let a = i32x4::new(830116125, -782674123, 1854407155, 1495209920); + let b = i32x4::new(2038928041, -944152498, 984207668, -1562095866); + let r = i64x2::new(-9223372034707292160, 9223372034707292160); + + assert_eq!( + r, + transmute(lsx_vssrarni_w_d::<18>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrarni_d_q() { + let a = i64x2::new(6798655171089504447, 7326163030789656624); + let b = i64x2::new(-2977477884402038599, -1140443471327573805); + let r = i64x2::new(-17819429239493341, 114471297356088385); + + assert_eq!( + r, + transmute(lsx_vssrarni_d_q::<70>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrarni_bu_h() { + let a = u8x16::new( + 75, 193, 237, 8, 33, 177, 31, 133, 119, 169, 163, 98, 159, 36, 131, 221, + ); + let b = i8x16::new( + 85, 84, -17, -84, 37, -124, -96, -30, -113, 114, -49, -7, 93, -3, -69, 124, + ); + let r = i64x2::new(144115196665790465, 283673999966208); + + assert_eq!( + r, + transmute(lsx_vssrarni_bu_h::<14>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrarni_hu_w() { + let a = u16x8::new(24614, 57570, 38427, 46010, 4180, 57175, 13134, 32047); + let b = i16x8::new(20333, -10949, -20123, -1525, 14594, -30628, -30604, -29092); + let r = i64x2::new(0, -281474976710656); + + assert_eq!( + r, + transmute(lsx_vssrarni_hu_w::<13>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrarni_wu_d() { + let a = u32x4::new(1854465345, 2301618375, 1724286997, 3204532825); + let b = i32x4::new(-1176670423, -1482282410, 777914585, 87761646); + let r = i64x2::new(-4294967296, 0); + + assert_eq!( + r, + transmute(lsx_vssrarni_wu_d::<15>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrarni_du_q() { + let a = u64x2::new(5657125151084901446, 434040259538460448); + let b = i64x2::new(4567159404230772553, -10612253426094316); + let r = i64x2::new(0, 0); + + assert_eq!( + r, + transmute(lsx_vssrarni_du_q::<126>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vpermi_w() { + let a = i32x4::new(213291370, -674346961, -1480878002, -1600622413); + let b = i32x4::new(-1309240039, 1335257352, 852153543, 1125109318); + let r = i64x2::new(4832307726087017671, -6360322584335202257); + + assert_eq!( + r, + transmute(lsx_vpermi_w::<158>(transmute(a), transmute(b))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vld() { + let a: [i8; 16] = [ + 127, 127, 77, 66, 64, 25, -50, -34, 2, -7, 107, -87, 45, -88, -51, 41, + ]; + let r = i64x2::new(-2391946588306178177, 3012248639850150146); + + assert_eq!(r, transmute(lsx_vld::<0>(a.as_ptr()))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vst() { + let a = i8x16::new( + -27, -57, 84, 27, -46, -85, -92, 57, 15, -67, -44, -89, -88, 84, 22, -29, + ); + let mut o: [i8; 16] = [ + -9, 24, -11, -95, -10, 78, 41, -118, 91, -113, 107, 77, -50, 113, -22, 27, + ]; + let r = i64x2::new(4153633675232462821, -2083384694265299697); + + lsx_vst::<0>(transmute(a), o.as_mut_ptr()); + assert_eq!(r, transmute(o)); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlrn_b_h() { + let a = i16x8::new(-6731, 13740, 8488, -2854, 
-3028, 6907, -57, 5317); + let b = i16x8::new(17437, 9775, -20467, -31838, 5913, 4238, -7458, 2822); + let r = i64x2::new(5981906731171643399, 0); + + assert_eq!(r, transmute(lsx_vssrlrn_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlrn_h_w() { + let a = i32x4::new(1684402804, 1385352714, 1360229118, 928996904); + let b = i32x4::new(-2116426818, 1641049288, 712377342, -1572394121); + let r = i64x2::new(31243728857268226, 0); + + assert_eq!(r, transmute(lsx_vssrlrn_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrlrn_w_d() { + let a = i64x2::new(-6889047968033387497, -1417681658907465534); + let b = i64x2::new(-3890929847852895653, -7819301294522132056); + let r = i64x2::new(66519777023098879, 0); + + assert_eq!(r, transmute(lsx_vssrlrn_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrln_b_h() { + let a = i16x8::new(6474, 27187, -10340, 1859, 23966, -18880, 3680, 9203); + let b = i16x8::new(-14062, -29610, -24609, -8884, -1818, 32133, 29934, -6498); + let r = i64x2::new(140183437672319, 0); + + assert_eq!(r, transmute(lsx_vssrln_b_h(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrln_h_w() { + let a = i32x4::new(-476821436, -709684595, 1401465952, -1429729676); + let b = i32x4::new(-1437891045, 1546371535, -1800954476, -1892390372); + let r = i64x2::new(2820489990832156, 0); + + assert_eq!(r, transmute(lsx_vssrln_h_w(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vssrln_w_d() { + let a = i64x2::new(2563829598589943649, 1915912925013067420); + let b = i64x2::new(2034490755997557661, -3470252066162700534); + let r = i64x2::new(9223372034707292159, 0); + + assert_eq!(r, transmute(lsx_vssrln_w_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vorn_v() { + let a = i8x16::new( + -104, -56, -109, -5, -124, 58, 19, -45, -64, 70, 0, 60, -67, -86, -77, -47, + ); + let b = i8x16::new( + 18, 99, -128, 74, -16, -127, 71, 94, -99, -119, 16, 43, 121, 77, -57, -24, + ); + let r = i64x2::new(-883973744907789059, -2901520201165080862); + + assert_eq!(r, transmute(lsx_vorn_v(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vldi() { + let r = i64x2::new(-404, -404); + + assert_eq!(r, transmute(lsx_vldi::<3692>())); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vshuf_b() { + let a = i8x16::new( + 115, -20, -59, -22, 43, -85, -79, 110, -79, -97, 14, -11, 5, -43, 17, -16, + ); + let b = i8x16::new( + -49, -101, -67, -10, -11, 76, -1, -74, 10, 110, 27, -53, 105, 34, 28, 98, + ); + let c = i8x16::new(3, 10, 3, 20, 23, 29, 7, 23, 3, 3, 4, 15, 3, 10, 21, 27); + let r = i64x2::new(7977798459094080502, -744470568363493642); + + assert_eq!( + r, + transmute(lsx_vshuf_b(transmute(a), transmute(b), transmute(c))) + ); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vldx() { + let a: [i8; 16] = [ + -102, -39, 3, 31, 58, -5, 78, 11, -96, -111, 11, 114, 103, -3, -86, 37, + ]; + let r = i64x2::new(814864809647659418, 2714260346180964768); + + assert_eq!(r, transmute(lsx_vldx(a.as_ptr(), 0))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vstx() { + let a = i8x16::new( + 113, -106, 22, -4, 54, 56, 70, -21, -30, 0, -25, -98, 56, -46, -51, 99, + ); + let mut o: [i8; 16] = [ + -60, -30, -98, 12, 90, 96, 120, -102, -124, 54, -91, -24, 126, -80, 121, -29, + ]; + let r = i64x2::new(-1493444417618012559, 
7191635320606490850); + + lsx_vstx(transmute(a), o.as_mut_ptr(), 0); + assert_eq!(r, transmute(o)); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vextl_qu_du() { + let a = u64x2::new(14708598110732796778, 2132245682694336458); + let r = i64x2::new(-3738145962976754838, 0); + + assert_eq!(r, transmute(lsx_vextl_qu_du(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_bnz_b() { + let a = u8x16::new( + 84, 211, 197, 223, 221, 228, 88, 147, 165, 38, 137, 91, 54, 252, 130, 198, + ); + let r: i32 = 1; + + assert_eq!(r, transmute(lsx_bnz_b(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_bnz_d() { + let a = u64x2::new(2935166648440262530, 9853932033129373129); + let r: i32 = 1; + + assert_eq!(r, transmute(lsx_bnz_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_bnz_h() { + let a = u16x8::new(55695, 60003, 59560, 35123, 25693, 41352, 61626, 42007); + let r: i32 = 1; + + assert_eq!(r, transmute(lsx_bnz_h(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_bnz_v() { + let a = u8x16::new( + 97, 136, 236, 21, 16, 18, 39, 247, 250, 7, 67, 251, 83, 240, 242, 151, + ); + let r: i32 = 1; + + assert_eq!(r, transmute(lsx_bnz_v(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_bnz_w() { + let a = u32x4::new(1172712391, 4211490091, 1954893853, 1606462106); + let r: i32 = 1; + + assert_eq!(r, transmute(lsx_bnz_w(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_bz_b() { + let a = u8x16::new( + 15, 239, 121, 77, 200, 213, 232, 133, 158, 104, 98, 165, 77, 238, 68, 228, + ); + let r: i32 = 0; + + assert_eq!(r, transmute(lsx_bz_b(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_bz_d() { + let a = u64x2::new(6051854163594201075, 9957257179760945130); + let r: i32 = 0; + + assert_eq!(r, transmute(lsx_bz_d(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_bz_h() { + let a = u16x8::new(19470, 29377, 53886, 60432, 20799, 41755, 54479, 52192); + let r: i32 = 0; + + assert_eq!(r, transmute(lsx_bz_h(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_bz_v() { + let a = u8x16::new( + 205, 20, 220, 220, 212, 207, 232, 167, 86, 81, 26, 68, 30, 112, 186, 234, + ); + let r: i32 = 0; + + assert_eq!(r, transmute(lsx_bz_v(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_bz_w() { + let a = u32x4::new(840335855, 1404686204, 628335401, 1171808080); + let r: i32 = 0; + + assert_eq!(r, transmute(lsx_bz_w(transmute(a)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_caf_d() { + let a = u64x2::new(4603762778598497410, 4600578720825355240); + let b = u64x2::new(4594845432849836188, 4605165420863530034); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_caf_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_caf_s() { + let a = u32x4::new(1057450480, 1041717868, 1063383650, 1052061330); + let b = u32x4::new(1058412800, 1058762495, 1028487696, 1027290752); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_caf_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_ceq_d() { + let a = u64x2::new(4605168921160906654, 4594290648143726556); + let b = u64x2::new(4605937250150464526, 4596769502461699132); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_ceq_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_ceq_s() { + let a = 
u32x4::new(1022481472, 1054281004, 1061611781, 1063964926); + let b = u32x4::new(1057471620, 1064008655, 1062698831, 1064822930); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_ceq_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cle_d() { + let a = u64x2::new(4594614911097184960, 4595883006410794928); + let b = u64x2::new(4596931282408842596, 4592481315209481584); + let r = i64x2::new(-1, 0); + + assert_eq!(r, transmute(lsx_vfcmp_cle_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cle_s() { + let a = u32x4::new(1056795676, 1033595408, 1059655467, 1052539946); + let b = u32x4::new(1021993344, 1043028808, 1064182329, 1054794412); + let r = i64x2::new(-4294967296, -1); + + assert_eq!(r, transmute(lsx_vfcmp_cle_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_clt_d() { + let a = u64x2::new(4600913855630793750, 4577092243808815872); + let b = u64x2::new(4603056125735978454, 4595932368389116476); + let r = i64x2::new(-1, -1); + + assert_eq!(r, transmute(lsx_vfcmp_clt_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_clt_s() { + let a = u32x4::new(1056969130, 1052243316, 1061133360, 1024378560); + let b = u32x4::new(1040327468, 1040072248, 1063314103, 1061361061); + let r = i64x2::new(0, -1); + + assert_eq!(r, transmute(lsx_vfcmp_clt_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cne_d() { + let a = u64x2::new(4600626466477018126, 4598733447126827764); + let b = u64x2::new(4602354759349431170, 4598595124838935466); + let r = i64x2::new(-1, -1); + + assert_eq!(r, transmute(lsx_vfcmp_cne_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cne_s() { + let a = u32x4::new(1063546111, 1053175192, 1063179686, 1052800226); + let b = u32x4::new(1063262940, 1058010357, 1052721962, 1061295988); + let r = i64x2::new(-1, -1); + + assert_eq!(r, transmute(lsx_vfcmp_cne_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cor_d() { + let a = u64x2::new(4607018705522720912, 4606390725849766769); + let b = u64x2::new(4606863361114437050, 4600753700959452152); + let r = i64x2::new(-1, -1); + + assert_eq!(r, transmute(lsx_vfcmp_cor_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cor_s() { + let a = u32x4::new(993114880, 1063738833, 1020144864, 1055277186); + let b = u32x4::new(1053615382, 1065255138, 1051565294, 1041776832); + let r = i64x2::new(-1, -1); + + assert_eq!(r, transmute(lsx_vfcmp_cor_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cueq_d() { + let a = u64x2::new(4589986692503775384, 4604350239975880608); + let b = u64x2::new(4603317345052528721, 4586734343919602352); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_cueq_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cueq_s() { + let a = u32x4::new(1049781896, 1063241920, 1063535787, 1062764831); + let b = u32x4::new(1057082822, 1059761998, 1052599998, 1054369118); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_cueq_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cule_d() { + let a = u64x2::new(4600113342137410192, 4586591372067099760); + let b = u64x2::new(4604253448175093958, 4599648167588382448); + let r = 
i64x2::new(-1, -1); + + assert_eq!(r, transmute(lsx_vfcmp_cule_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cule_s() { + let a = u32x4::new(1059878844, 1040845348, 1060450143, 1061437832); + let b = u32x4::new(1051100696, 1062219104, 1064568294, 1032521352); + let r = i64x2::new(-4294967296, 4294967295); + + assert_eq!(r, transmute(lsx_vfcmp_cule_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cult_d() { + let a = u64x2::new(4604916546627232568, 4599229615347667200); + let b = u64x2::new(4602944708025910986, 4606429728449082215); + let r = i64x2::new(0, -1); + + assert_eq!(r, transmute(lsx_vfcmp_cult_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cult_s() { + let a = u32x4::new(1061581945, 1058257026, 1059733857, 1064954284); + let b = u32x4::new(1030808384, 1044268840, 1050761328, 1037308928); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_cult_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cun_d() { + let a = u64x2::new(4603128178250554600, 4601297724275716756); + let b = u64x2::new(4599145506416791474, 4602762942707610466); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_cun_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cune_d() { + let a = u64x2::new(4603159382334199523, 4603135754641654385); + let b = u64x2::new(4602895209237804084, 4598685577984089858); + let r = i64x2::new(-1, -1); + + assert_eq!(r, transmute(lsx_vfcmp_cune_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cune_s() { + let a = u32x4::new(1059907972, 1059391341, 1025259296, 1050646758); + let b = u32x4::new(1049955876, 1032474200, 1023410112, 1050347912); + let r = i64x2::new(-1, -1); + + assert_eq!(r, transmute(lsx_vfcmp_cune_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_cun_s() { + let a = u32x4::new(1054871898, 1059065315, 1037157736, 1056161416); + let b = u32x4::new(1053288920, 1059911123, 1058695573, 1062913175); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_cun_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_saf_d() { + let a = u64x2::new(4585010456558902064, 4598376734249785852); + let b = u64x2::new(4589118818065931376, 4603302333347826011); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_saf_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_saf_s() { + let a = u32x4::new(1039827304, 1062400770, 1052695470, 1056530338); + let b = u32x4::new(1044756936, 1054667546, 1059141760, 1062203553); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_saf_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_seq_d() { + let a = u64x2::new(4604896813051509737, 4596873540510119820); + let b = u64x2::new(4594167956310606988, 4596272126122589228); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_seq_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_seq_s() { + let a = u32x4::new(1060477925, 1048954814, 1059933669, 1053469148); + let b = u32x4::new(1057231588, 1051495460, 1057998997, 1049117328); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_seq_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = 
"lsx")] +unsafe fn test_lsx_vfcmp_sle_d() { + let a = u64x2::new(4605211142905317821, 4601961488287203912); + let b = u64x2::new(4603919005855163252, 4594682846653946884); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_sle_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_sle_s() { + let a = u32x4::new(1053671520, 1055456634, 1063294891, 1059790187); + let b = u32x4::new(1045989468, 1052518900, 1046184640, 1032417352); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_sle_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_slt_d() { + let a = u64x2::new(4601902750800060998, 4605236132294100877); + let b = u64x2::new(4600564867142526828, 4585131890265864544); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_slt_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_slt_s() { + let a = u32x4::new(1054326748, 1059604229, 1060884737, 1022762624); + let b = u32x4::new(1063435026, 1062439603, 1060665555, 1059252630); + let r = i64x2::new(-1, -4294967296); + + assert_eq!(r, transmute(lsx_vfcmp_slt_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_sne_d() { + let a = u64x2::new(4606672121388401433, 4604186491240191582); + let b = u64x2::new(4606789952952688555, 4605380358192261377); + let r = i64x2::new(-1, -1); + + assert_eq!(r, transmute(lsx_vfcmp_sne_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_sne_s() { + let a = u32x4::new(1062253602, 1053568536, 1056615768, 1055754482); + let b = u32x4::new(1055803760, 1063372602, 1062608900, 1054634370); + let r = i64x2::new(-1, -1); + + assert_eq!(r, transmute(lsx_vfcmp_sne_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_sor_d() { + let a = u64x2::new(4595713406002022116, 4604653971232015460); + let b = u64x2::new(4606380175568635560, 4602092067387067462); + let r = i64x2::new(-1, -1); + + assert_eq!(r, transmute(lsx_vfcmp_sor_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_sor_s() { + let a = u32x4::new(1058728243, 1059025743, 1012810944, 1057593472); + let b = u32x4::new(1064534350, 1035771168, 1059142426, 1034677600); + let r = i64x2::new(-1, -1); + + assert_eq!(r, transmute(lsx_vfcmp_sor_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_sueq_d() { + let a = u64x2::new(4605322679929877488, 4603091890812380784); + let b = u64x2::new(4602917609947054533, 4605983209212177197); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_sueq_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_sueq_s() { + let a = u32x4::new(1058057744, 1049762394, 1044222368, 1050250466); + let b = u32x4::new(1064871165, 1059796257, 1055456352, 1058662692); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_sueq_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_sule_d() { + let a = u64x2::new(4606210463692472427, 4576137083667840000); + let b = u64x2::new(4594044173266256632, 4601549551994738386); + let r = i64x2::new(0, -1); + + assert_eq!(r, transmute(lsx_vfcmp_sule_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_sule_s() { + let a = u32x4::new(1054399614, 1064056006, 1040844632, 1022950656); + let b = 
u32x4::new(1061061244, 1051874412, 1041025316, 1056018690); + let r = i64x2::new(4294967295, -1); + + assert_eq!(r, transmute(lsx_vfcmp_sule_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_sult_d() { + let a = u64x2::new(4593772214968107560, 4602360976974434088); + let b = u64x2::new(4603848042095479627, 4605032971316970060); + let r = i64x2::new(-1, -1); + + assert_eq!(r, transmute(lsx_vfcmp_sult_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_sult_s() { + let a = u32x4::new(1055857986, 1049674182, 1050153588, 1054289234); + let b = u32x4::new(1053631630, 1064026599, 1058029398, 1041182304); + let r = i64x2::new(-4294967296, 4294967295); + + assert_eq!(r, transmute(lsx_vfcmp_sult_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_sun_d() { + let a = u64x2::new(4600661687369290390, 4583739657744995904); + let b = u64x2::new(4560681020073292800, 4604624347352815433); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_sun_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_sune_d() { + let a = u64x2::new(4600101879341653256, 4602392889952410448); + let b = u64x2::new(4593947987798339484, 4603656097008761637); + let r = i64x2::new(-1, -1); + + assert_eq!(r, transmute(lsx_vfcmp_sune_d(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_sune_s() { + let a = u32x4::new(1058419193, 1062297121, 1026375712, 1061355356); + let b = u32x4::new(1049327168, 1034635272, 1042258196, 1062844003); + let r = i64x2::new(-1, -1); + + assert_eq!(r, transmute(lsx_vfcmp_sune_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vfcmp_sun_s() { + let a = u32x4::new(1044637928, 1061035459, 1051032716, 1050118110); + let b = u32x4::new(1057442863, 1064573466, 1058086753, 1015993248); + let r = i64x2::new(0, 0); + + assert_eq!(r, transmute(lsx_vfcmp_sun_s(transmute(a), transmute(b)))); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vrepli_b() { + let r = i64x2::new(4340410370284600380, 4340410370284600380); + + assert_eq!(r, transmute(lsx_vrepli_b::<-452>())); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vrepli_d() { + let r = i64x2::new(-330, -330); + + assert_eq!(r, transmute(lsx_vrepli_d::<-330>())); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vrepli_h() { + let r = i64x2::new(39125618772344971, 39125618772344971); + + assert_eq!(r, transmute(lsx_vrepli_h::<139>())); +} + +#[simd_test(enable = "lsx")] +unsafe fn test_lsx_vrepli_w() { + let r = i64x2::new(-468151435374, -468151435374); + + assert_eq!(r, transmute(lsx_vrepli_w::<-110>())); +} diff --git a/library/stdarch/crates/core_arch/src/loongarch64/lsx/types.rs b/library/stdarch/crates/core_arch/src/loongarch64/lsx/types.rs new file mode 100644 index 0000000000000..4097164c2fae5 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/loongarch64/lsx/types.rs @@ -0,0 +1,33 @@ +types! { + #![unstable(feature = "stdarch_loongarch", issue = "117427")] + + /// LOONGARCH-specific 128-bit wide vector of 16 packed `i8`. + pub struct v16i8(16 x pub(crate) i8); + + /// LOONGARCH-specific 128-bit wide vector of 8 packed `i16`. + pub struct v8i16(8 x pub(crate) i16); + + /// LOONGARCH-specific 128-bit wide vector of 4 packed `i32`. + pub struct v4i32(4 x pub(crate) i32); + + /// LOONGARCH-specific 128-bit wide vector of 2 packed `i64`. 
+ pub struct v2i64(2 x pub(crate) i64); + + /// LOONGARCH-specific 128-bit wide vector of 16 packed `u8`. + pub struct v16u8(16 x pub(crate) u8); + + /// LOONGARCH-specific 128-bit wide vector of 8 packed `u16`. + pub struct v8u16(8 x pub(crate) u16); + + /// LOONGARCH-specific 128-bit wide vector of 4 packed `u32`. + pub struct v4u32(4 x pub(crate) u32); + + /// LOONGARCH-specific 128-bit wide vector of 2 packed `u64`. + pub struct v2u64(2 x pub(crate) u64); + + /// LOONGARCH-specific 128-bit wide vector of 4 packed `f32`. + pub struct v4f32(4 x pub(crate) f32); + + /// LOONGARCH-specific 128-bit wide vector of 2 packed `f64`. + pub struct v2f64(2 x pub(crate) f64); +} diff --git a/library/stdarch/crates/core_arch/src/loongarch64/mod.rs b/library/stdarch/crates/core_arch/src/loongarch64/mod.rs new file mode 100644 index 0000000000000..b1704bbb48d4f --- /dev/null +++ b/library/stdarch/crates/core_arch/src/loongarch64/mod.rs @@ -0,0 +1,376 @@ +//! `LoongArch` intrinsics + +mod lasx; +mod lsx; + +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub use self::lasx::*; +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub use self::lsx::*; + +use crate::arch::asm; + +/// Reads the 64-bit stable counter value and the counter ID +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn rdtime_d() -> (i64, isize) { + let val: i64; + let tid: isize; + asm!("rdtime.d {}, {}", out(reg) val, out(reg) tid, options(readonly, nostack)); + (val, tid) +} + +/// Reads the lower 32-bit stable counter value and the counter ID +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn rdtimel_w() -> (i32, isize) { + let val: i32; + let tid: isize; + asm!("rdtimel.w {}, {}", out(reg) val, out(reg) tid, options(readonly, nostack)); + (val, tid) +} + +/// Reads the upper 32-bit stable counter value and the counter ID +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn rdtimeh_w() -> (i32, isize) { + let val: i32; + let tid: isize; + asm!("rdtimeh.w {}, {}", out(reg) val, out(reg) tid, options(readonly, nostack)); + (val, tid) +} + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.loongarch.crc.w.b.w"] + fn __crc_w_b_w(a: i32, b: i32) -> i32; + #[link_name = "llvm.loongarch.crc.w.h.w"] + fn __crc_w_h_w(a: i32, b: i32) -> i32; + #[link_name = "llvm.loongarch.crc.w.w.w"] + fn __crc_w_w_w(a: i32, b: i32) -> i32; + #[link_name = "llvm.loongarch.crc.w.d.w"] + fn __crc_w_d_w(a: i64, b: i32) -> i32; + #[link_name = "llvm.loongarch.crcc.w.b.w"] + fn __crcc_w_b_w(a: i32, b: i32) -> i32; + #[link_name = "llvm.loongarch.crcc.w.h.w"] + fn __crcc_w_h_w(a: i32, b: i32) -> i32; + #[link_name = "llvm.loongarch.crcc.w.w.w"] + fn __crcc_w_w_w(a: i32, b: i32) -> i32; + #[link_name = "llvm.loongarch.crcc.w.d.w"] + fn __crcc_w_d_w(a: i64, b: i32) -> i32; + #[link_name = "llvm.loongarch.cacop.d"] + fn __cacop(a: i64, b: i64, c: i64); + #[link_name = "llvm.loongarch.dbar"] + fn __dbar(a: i32); + #[link_name = "llvm.loongarch.ibar"] + fn __ibar(a: i32); + #[link_name = "llvm.loongarch.movgr2fcsr"] + fn __movgr2fcsr(a: i32, b: i32); + #[link_name = "llvm.loongarch.movfcsr2gr"] + fn __movfcsr2gr(a: i32) -> i32; + #[link_name = "llvm.loongarch.csrrd.d"] + fn __csrrd(a: i32) -> i64; + #[link_name = "llvm.loongarch.csrwr.d"] + fn __csrwr(a: i64, b: i32) -> i64; + #[link_name = "llvm.loongarch.csrxchg.d"] + fn __csrxchg(a: i64, b: i64, c: i32) -> i64; + #[link_name = "llvm.loongarch.iocsrrd.b"] 
+ fn __iocsrrd_b(a: i32) -> i32; + #[link_name = "llvm.loongarch.iocsrrd.h"] + fn __iocsrrd_h(a: i32) -> i32; + #[link_name = "llvm.loongarch.iocsrrd.w"] + fn __iocsrrd_w(a: i32) -> i32; + #[link_name = "llvm.loongarch.iocsrrd.d"] + fn __iocsrrd_d(a: i32) -> i64; + #[link_name = "llvm.loongarch.iocsrwr.b"] + fn __iocsrwr_b(a: i32, b: i32); + #[link_name = "llvm.loongarch.iocsrwr.h"] + fn __iocsrwr_h(a: i32, b: i32); + #[link_name = "llvm.loongarch.iocsrwr.w"] + fn __iocsrwr_w(a: i32, b: i32); + #[link_name = "llvm.loongarch.iocsrwr.d"] + fn __iocsrwr_d(a: i64, b: i32); + #[link_name = "llvm.loongarch.break"] + fn __break(a: i32); + #[link_name = "llvm.loongarch.cpucfg"] + fn __cpucfg(a: i32) -> i32; + #[link_name = "llvm.loongarch.syscall"] + fn __syscall(a: i32); + #[link_name = "llvm.loongarch.asrtle.d"] + fn __asrtle(a: i64, b: i64); + #[link_name = "llvm.loongarch.asrtgt.d"] + fn __asrtgt(a: i64, b: i64); + #[link_name = "llvm.loongarch.lddir.d"] + fn __lddir(a: i64, b: i64) -> i64; + #[link_name = "llvm.loongarch.ldpte.d"] + fn __ldpte(a: i64, b: i64); + #[link_name = "llvm.loongarch.frecipe.s"] + fn __frecipe_s(a: f32) -> f32; + #[link_name = "llvm.loongarch.frecipe.d"] + fn __frecipe_d(a: f64) -> f64; + #[link_name = "llvm.loongarch.frsqrte.s"] + fn __frsqrte_s(a: f32) -> f32; + #[link_name = "llvm.loongarch.frsqrte.d"] + fn __frsqrte_d(a: f64) -> f64; +} + +/// Calculate the CRC value using the IEEE 802.3 polynomial (0xEDB88320) +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn crc_w_b_w(a: i32, b: i32) -> i32 { + __crc_w_b_w(a, b) +} + +/// Calculate the CRC value using the IEEE 802.3 polynomial (0xEDB88320) +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn crc_w_h_w(a: i32, b: i32) -> i32 { + __crc_w_h_w(a, b) +} + +/// Calculate the CRC value using the IEEE 802.3 polynomial (0xEDB88320) +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn crc_w_w_w(a: i32, b: i32) -> i32 { + __crc_w_w_w(a, b) +} + +/// Calculate the CRC value using the IEEE 802.3 polynomial (0xEDB88320) +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn crc_w_d_w(a: i64, b: i32) -> i32 { + __crc_w_d_w(a, b) +} + +/// Calculate the CRC value using the Castagnoli polynomial (0x82F63B78) +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn crcc_w_b_w(a: i32, b: i32) -> i32 { + __crcc_w_b_w(a, b) +} + +/// Calculate the CRC value using the Castagnoli polynomial (0x82F63B78) +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn crcc_w_h_w(a: i32, b: i32) -> i32 { + __crcc_w_h_w(a, b) +} + +/// Calculate the CRC value using the Castagnoli polynomial (0x82F63B78) +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn crcc_w_w_w(a: i32, b: i32) -> i32 { + __crcc_w_w_w(a, b) +} + +/// Calculate the CRC value using the Castagnoli polynomial (0x82F63B78) +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn crcc_w_d_w(a: i64, b: i32) -> i32 { + __crcc_w_d_w(a, b) +}
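Usage sketch (editorial addition, not part of the patch): a minimal way to drive the CRC intrinsics above. It assumes the unstable `stdarch_loongarch` feature and the `core::arch::loongarch64` path added by this series; the operand order (new data first, accumulated CRC second) and the usual pre/post inversion for CRC-32C follow the common `larchintrin.h` convention and should be checked against the LoongArch manual.

    use core::arch::loongarch64::crcc_w_b_w;

    /// Folds `data` into a running CRC-32C value one byte at a time.
    fn crc32c_update(mut crc: u32, data: &[u8]) -> u32 {
        crc = !crc; // conventional pre-inversion of the accumulator
        for &byte in data {
            // crcc.w.b.w consumes the low 8 bits of its first operand and the
            // 32-bit accumulated CRC in its second operand.
            crc = unsafe { crcc_w_b_w(byte as i32, crc as i32) } as u32;
        }
        !crc // conventional post-inversion
    }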
+ +/// Generates the memory barrier instruction +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn dbar<const IMM15: i32>() { + static_assert_uimm_bits!(IMM15, 15); + __dbar(IMM15); +} + +/// Generates the instruction-fetch barrier instruction +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn ibar<const IMM15: i32>() { + static_assert_uimm_bits!(IMM15, 15); + __ibar(IMM15); +} + +/// Moves data from a GPR to the FCSR +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn movgr2fcsr<const IMM5: i32>(a: i32) { + static_assert_uimm_bits!(IMM5, 5); + __movgr2fcsr(IMM5, a); +} + +/// Moves data from an FCSR to the GPR +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn movfcsr2gr<const IMM5: i32>() -> i32 { + static_assert_uimm_bits!(IMM5, 5); + __movfcsr2gr(IMM5) +} + +/// Reads the CSR +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn csrrd<const IMM14: i32>() -> i64 { + static_assert_uimm_bits!(IMM14, 14); + __csrrd(IMM14) +} + +/// Writes the CSR +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn csrwr<const IMM14: i32>(a: i64) -> i64 { + static_assert_uimm_bits!(IMM14, 14); + __csrwr(a, IMM14) +} + +/// Exchanges the CSR +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn csrxchg<const IMM14: i32>(a: i64, b: i64) -> i64 { + static_assert_uimm_bits!(IMM14, 14); + __csrxchg(a, b, IMM14) +} + +/// Reads the 8-bit IO-CSR +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn iocsrrd_b(a: i32) -> i32 { + __iocsrrd_b(a) +} + +/// Reads the 16-bit IO-CSR +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn iocsrrd_h(a: i32) -> i32 { + __iocsrrd_h(a) +} + +/// Reads the 32-bit IO-CSR +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn iocsrrd_w(a: i32) -> i32 { + __iocsrrd_w(a) +} + +/// Reads the 64-bit IO-CSR +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn iocsrrd_d(a: i32) -> i64 { + __iocsrrd_d(a) +} + +/// Writes the 8-bit IO-CSR +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn iocsrwr_b(a: i32, b: i32) { + __iocsrwr_b(a, b) +} + +/// Writes the 16-bit IO-CSR +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn iocsrwr_h(a: i32, b: i32) { + __iocsrwr_h(a, b) +} + +/// Writes the 32-bit IO-CSR +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn iocsrwr_w(a: i32, b: i32) { + __iocsrwr_w(a, b) +} + +/// Writes the 64-bit IO-CSR +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn iocsrwr_d(a: i64, b: i32) { + __iocsrwr_d(a, b) +} + +/// Generates the breakpoint instruction +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn brk<const IMM15: i32>() { + static_assert_uimm_bits!(IMM15, 15); + __break(IMM15); +} + +/// Reads the CPU configuration register +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn cpucfg(a: i32) -> i32 { + __cpucfg(a) +} + +/// Generates the syscall instruction +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn syscall<const IMM15: i32>() { + static_assert_uimm_bits!(IMM15, 15); + __syscall(IMM15); +} + +/// Generates the less-than-or-equal assertion instruction +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn asrtle(a: i64, b: i64) { + __asrtle(a, b); +}
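Illustrative sketch (editorial, not part of the patch): the barrier and CSR wrappers above take their immediate as a const generic that `static_assert_uimm_bits!` range-checks at compile time, so call sites pass it with turbofish syntax. The hint value `0`, the CSR index `0x1`, and the `core::arch::loongarch64` path are example assumptions; CSR access normally requires privileged mode.

// Sketch: supplying the const-generic immediates at a call site.
// Assumes nightly Rust with `#![feature(stdarch_loongarch)]` enabled.
#[cfg(target_arch = "loongarch64")]
unsafe fn barrier_then_read_csr() -> i64 {
    use core::arch::loongarch64::{csrrd, dbar};
    dbar::<0>(); // hint 0 requests a full memory barrier; must fit in 15 bits
    csrrd::<0x1>() // read the CSR at example index 0x1 (privileged operation)
}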
+ +/// Generates the greater-than assertion instruction +#[inline] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn asrtgt(a: i64, b: i64) { + __asrtgt(a, b); +} + +/// Loads the page table directory entry +#[inline] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn lddir<const B: i64>(a: i64) -> i64 { + __lddir(a, B) +} + +/// Loads the page table entry +#[inline] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn ldpte<const B: i64>(a: i64) { + __ldpte(a, B) +} + +/// Calculate the approximate single-precision result of 1.0 divided by `a` +#[inline] +#[target_feature(enable = "frecipe")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn frecipe_s(a: f32) -> f32 { + __frecipe_s(a) +} + +/// Calculate the approximate double-precision result of 1.0 divided by `a` +#[inline] +#[target_feature(enable = "frecipe")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn frecipe_d(a: f64) -> f64 { + __frecipe_d(a) +} + +/// Calculate the approximate single-precision result of dividing 1.0 by the square root of `a` +#[inline] +#[target_feature(enable = "frecipe")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn frsqrte_s(a: f32) -> f32 { + __frsqrte_s(a) +} + +/// Calculate the approximate double-precision result of dividing 1.0 by the square root of `a` +#[inline] +#[target_feature(enable = "frecipe")] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +pub unsafe fn frsqrte_d(a: f64) -> f64 { + __frsqrte_d(a) +} diff --git a/library/stdarch/crates/core_arch/src/macros.rs b/library/stdarch/crates/core_arch/src/macros.rs new file mode 100644 index 0000000000000..e00b43353679e --- /dev/null +++ b/library/stdarch/crates/core_arch/src/macros.rs @@ -0,0 +1,165 @@ +//! Utility macros. + +#[allow(unused)] +macro_rules! static_assert { + ($e:expr) => { + const { + assert!($e); + } + }; + ($e:expr, $msg:expr) => { + const { + assert!($e, $msg); + } + }; +} + +#[allow(unused_macros)] +macro_rules! static_assert_uimm_bits { + ($imm:ident, $bits:expr) => { + // `0 <= $imm` produces a warning if the immediate has an unsigned type + #[allow(unused_comparisons)] + { + static_assert!( + 0 <= $imm && $imm < (1 << $bits), + concat!( + stringify!($imm), + " doesn't fit in ", + stringify!($bits), + " bits", + ) + ) + } + }; +} + +#[allow(unused_macros)] +macro_rules! static_assert_simm_bits { + ($imm:ident, $bits:expr) => { + static_assert!( + (-1 << ($bits - 1)) - 1 < $imm && $imm < (1 << ($bits - 1)), + concat!( + stringify!($imm), + " doesn't fit in ", + stringify!($bits), + " bits", + ) + ) + }; +} + +#[allow(unused)] +macro_rules! types { + ( + #![$stability_first:meta] + $( + #![$stability_more:meta] + )* + + $( + $(#[$doc:meta])* + $(stability: [$stability_already: meta])* + pub struct $name:ident($len:literal x $v:vis $elem_type:ty); + )* + ) => (types! { + $( + #![$stability_more] + )* + + $( + $(#[$doc])* + $(stability: [$stability_already])* + stability: [$stability_first] + pub struct $name($len x $v $elem_type); + )* + }); + + ( + $( + $(#[$doc:meta])* + $(stability: [$stability: meta])+ + pub struct $name:ident($len:literal x $v:vis $elem_type:ty); + )* + ) => ($( + $(#[$doc])* + $(#[$stability])+ + #[derive(Copy, Clone)] + #[allow(non_camel_case_types)] + #[repr(simd)] + #[allow(clippy::missing_inline_in_public_items)] + pub struct $name($v [$elem_type; $len]); + + impl $name { + /// Using `my_simd([x; N])` seemingly fails tests, + /// so use this internal helper for it instead.
+ #[inline(always)] + $v fn splat(value: $elem_type) -> $name { + #[derive(Copy, Clone)] + #[repr(simd)] + struct JustOne([$elem_type; 1]); + let one = JustOne([value]); + // SAFETY: 0 is always in-bounds because we're shuffling + // a simd type with exactly one element. + unsafe { simd_shuffle!(one, one, [0; $len]) } + } + + /// Returns an array reference containing the entire SIMD vector. + $v const fn as_array(&self) -> &[$elem_type; $len] { + // SAFETY: this type is just an overaligned `[T; N]` with + // potential padding at the end, so pointer casting to a + // `&[T; N]` is safe. + // + // NOTE: This deliberately doesn't just use `&self.0` because it may soon be banned + // see https://github.com/rust-lang/compiler-team/issues/838 + unsafe { &*(self as *const Self as *const [$elem_type; $len]) } + + } + + /// Returns a mutable array reference containing the entire SIMD vector. + #[inline] + $v fn as_mut_array(&mut self) -> &mut [$elem_type; $len] { + // SAFETY: this type is just an overaligned `[T; N]` with + // potential padding at the end, so pointer casting to a + // `&mut [T; N]` is safe. + // + // NOTE: This deliberately doesn't just use `&mut self.0` because it may soon be banned + // see https://github.com/rust-lang/compiler-team/issues/838 + unsafe { &mut *(self as *mut Self as *mut [$elem_type; $len]) } + } + } + + $(#[$stability])+ + impl crate::fmt::Debug for $name { + #[inline] + fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result { + crate::core_arch::simd::debug_simd_finish(f, stringify!($name), self.as_array()) + } + } + )*); +} + +#[allow(unused)] +#[repr(simd)] +pub(crate) struct SimdShuffleIdx(pub(crate) [u32; LEN]); + +#[allow(unused)] +macro_rules! simd_shuffle { + ($x:expr, $y:expr, $idx:expr $(,)?) => {{ + $crate::intrinsics::simd::simd_shuffle( + $x, + $y, + const { $crate::core_arch::macros::SimdShuffleIdx($idx) }, + ) + }}; +} + +#[allow(unused)] +macro_rules! simd_insert { + ($x:expr, $idx:expr, $val:expr $(,)?) => {{ $crate::intrinsics::simd::simd_insert($x, const { $idx }, $val) }}; +} + +#[allow(unused)] +macro_rules! simd_extract { + ($x:expr, $idx:expr $(,)?) => {{ $crate::intrinsics::simd::simd_extract($x, const { $idx }) }}; + ($x:expr, $idx:expr, $ty:ty $(,)?) => {{ $crate::intrinsics::simd::simd_extract::<_, $ty>($x, const { $idx }) }}; +} diff --git a/library/stdarch/crates/core_arch/src/mips/mod.rs b/library/stdarch/crates/core_arch/src/mips/mod.rs new file mode 100644 index 0000000000000..1de3ffd03d1f0 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/mips/mod.rs @@ -0,0 +1,20 @@ +//! MIPS + +// Building this module (even if unused) for non-fp64 targets fails with an LLVM +// error. +#[cfg(target_feature = "fp64")] +mod msa; +#[cfg(target_feature = "fp64")] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub use self::msa::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Generates the trap instruction `BREAK` +#[cfg_attr(test, assert_instr(break))] +#[inline] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn break_() -> ! { + crate::intrinsics::abort() +} diff --git a/library/stdarch/crates/core_arch/src/mips/msa.rs b/library/stdarch/crates/core_arch/src/mips/msa.rs new file mode 100644 index 0000000000000..563e121a7badb --- /dev/null +++ b/library/stdarch/crates/core_arch/src/mips/msa.rs @@ -0,0 +1,18398 @@ +//! MIPS SIMD Architecture intrinsics +//! +//! The reference is [MIPS Architecture for Programmers Volume IV-j: The +//! 
MIPS32 SIMD Architecture Module Revision 1.12][msa_ref]. +//! +//! [msa_ref]: http://cdn2.imgtec.com/documentation/MD00866-2B-MSA32-AFP-01.12.pdf + +#[cfg(test)] +use stdarch_test::assert_instr; + +use crate::mem; + +types! { + #![unstable(feature = "stdarch_mips", issue = "111198")] + + /// MIPS-specific 128-bit wide vector of 16 packed `i8`. + pub struct v16i8(16 x i8); + + /// MIPS-specific 128-bit wide vector of 8 packed `i16`. + pub struct v8i16(8 x i16); + + /// MIPS-specific 128-bit wide vector of 4 packed `i32`. + pub struct v4i32(4 x i32); + + /// MIPS-specific 128-bit wide vector of 2 packed `i64`. + pub struct v2i64(2 x i64); + + /// MIPS-specific 128-bit wide vector of 16 packed `u8`. + pub struct v16u8(16 x u8); + + /// MIPS-specific 128-bit wide vector of 8 packed `u16`. + pub struct v8u16(8 x u16); + + /// MIPS-specific 128-bit wide vector of 4 packed `u32`. + pub struct v4u32(4 x u32); + + /// MIPS-specific 128-bit wide vector of 2 packed `u64`. + pub struct v2u64(2 x u64); + + // / MIPS-specific 128-bit wide vector of 4 packed `f32`. + pub struct v4f32(4 x f32); + + /// MIPS-specific 128-bit wide vector of 2 packed `f64`. + pub struct v2f64(2 x f64); +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.mips.add.a.b"] + fn msa_add_a_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.add.a.h"] + fn msa_add_a_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.add.a.w"] + fn msa_add_a_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.add.a.d"] + fn msa_add_a_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.adds.a.b"] + fn msa_adds_a_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.adds.a.h"] + fn msa_adds_a_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.adds.a.w"] + fn msa_adds_a_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.adds.a.d"] + fn msa_adds_a_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.adds.s.b"] + fn msa_adds_s_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.adds.s.h"] + fn msa_adds_s_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.adds.s.w"] + fn msa_adds_s_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.adds.s.d"] + fn msa_adds_s_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.adds.u.b"] + fn msa_adds_u_b(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.adds.u.h"] + fn msa_adds_u_h(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.mips.adds.u.w"] + fn msa_adds_u_w(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.mips.adds.u.d"] + fn msa_adds_u_d(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.mips.addv.b"] + fn msa_addv_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.addv.h"] + fn msa_addv_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.addv.w"] + fn msa_addv_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.addv.d"] + fn msa_addv_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.addvi.b"] + fn msa_addvi_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.mips.addvi.h"] + fn msa_addvi_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.mips.addvi.w"] + fn msa_addvi_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.mips.addvi.d"] + fn msa_addvi_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.mips.and.v"] + fn msa_and_v(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.andi.b"] + fn msa_andi_b(a: v16u8, b: i32) -> v16u8; + #[link_name = "llvm.mips.asub.s.b"] + fn msa_asub_s_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.asub.s.h"] + fn 
msa_asub_s_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.asub.s.w"] + fn msa_asub_s_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.asub.s.d"] + fn msa_asub_s_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.asub.u.b"] + fn msa_asub_u_b(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.asub.u.h"] + fn msa_asub_u_h(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.mips.asub.u.w"] + fn msa_asub_u_w(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.mips.asub.u.d"] + fn msa_asub_u_d(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.mips.ave.s.b"] + fn msa_ave_s_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.ave.s.h"] + fn msa_ave_s_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.ave.s.w"] + fn msa_ave_s_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.ave.s.d"] + fn msa_ave_s_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.ave.u.b"] + fn msa_ave_u_b(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.ave.u.h"] + fn msa_ave_u_h(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.mips.ave.u.w"] + fn msa_ave_u_w(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.mips.ave.u.d"] + fn msa_ave_u_d(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.mips.aver.s.b"] + fn msa_aver_s_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.aver.s.h"] + fn msa_aver_s_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.aver.s.w"] + fn msa_aver_s_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.aver.s.d"] + fn msa_aver_s_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.aver.u.b"] + fn msa_aver_u_b(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.aver.u.h"] + fn msa_aver_u_h(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.mips.aver.u.w"] + fn msa_aver_u_w(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.mips.aver.u.d"] + fn msa_aver_u_d(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.mips.bclr.b"] + fn msa_bclr_b(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.bclr.h"] + fn msa_bclr_h(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.mips.bclr.w"] + fn msa_bclr_w(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.mips.bclr.d"] + fn msa_bclr_d(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.mips.bclri.b"] + fn msa_bclri_b(a: v16u8, b: i32) -> v16u8; + #[link_name = "llvm.mips.bclri.h"] + fn msa_bclri_h(a: v8u16, b: i32) -> v8u16; + #[link_name = "llvm.mips.bclri.w"] + fn msa_bclri_w(a: v4u32, b: i32) -> v4u32; + #[link_name = "llvm.mips.bclri.d"] + fn msa_bclri_d(a: v2u64, b: i32) -> v2u64; + #[link_name = "llvm.mips.binsl.b"] + fn msa_binsl_b(a: v16u8, b: v16u8, c: v16u8) -> v16u8; + #[link_name = "llvm.mips.binsl.h"] + fn msa_binsl_h(a: v8u16, b: v8u16, c: v8u16) -> v8u16; + #[link_name = "llvm.mips.binsl.w"] + fn msa_binsl_w(a: v4u32, b: v4u32, c: v4u32) -> v4u32; + #[link_name = "llvm.mips.binsl.d"] + fn msa_binsl_d(a: v2u64, b: v2u64, c: v2u64) -> v2u64; + #[link_name = "llvm.mips.binsli.b"] + fn msa_binsli_b(a: v16u8, b: v16u8, c: i32) -> v16u8; + #[link_name = "llvm.mips.binsli.h"] + fn msa_binsli_h(a: v8u16, b: v8u16, c: i32) -> v8u16; + #[link_name = "llvm.mips.binsli.w"] + fn msa_binsli_w(a: v4u32, b: v4u32, c: i32) -> v4u32; + #[link_name = "llvm.mips.binsli.d"] + fn msa_binsli_d(a: v2u64, b: v2u64, c: i32) -> v2u64; + #[link_name = "llvm.mips.binsr.b"] + fn msa_binsr_b(a: v16u8, b: v16u8, c: v16u8) -> v16u8; + #[link_name = "llvm.mips.binsr.h"] + fn msa_binsr_h(a: v8u16, b: v8u16, c: v8u16) -> v8u16; + #[link_name = "llvm.mips.binsr.w"] + 
fn msa_binsr_w(a: v4u32, b: v4u32, c: v4u32) -> v4u32; + #[link_name = "llvm.mips.binsr.d"] + fn msa_binsr_d(a: v2u64, b: v2u64, c: v2u64) -> v2u64; + #[link_name = "llvm.mips.binsri.b"] + fn msa_binsri_b(a: v16u8, b: v16u8, c: i32) -> v16u8; + #[link_name = "llvm.mips.binsri.h"] + fn msa_binsri_h(a: v8u16, b: v8u16, c: i32) -> v8u16; + #[link_name = "llvm.mips.binsri.w"] + fn msa_binsri_w(a: v4u32, b: v4u32, c: i32) -> v4u32; + #[link_name = "llvm.mips.binsri.d"] + fn msa_binsri_d(a: v2u64, b: v2u64, c: i32) -> v2u64; + #[link_name = "llvm.mips.bmnz.v"] + fn msa_bmnz_v(a: v16u8, b: v16u8, c: v16u8) -> v16u8; + #[link_name = "llvm.mips.bmnzi.b"] + fn msa_bmnzi_b(a: v16u8, b: v16u8, c: i32) -> v16u8; + #[link_name = "llvm.mips.bmz.v"] + fn msa_bmz_v(a: v16u8, b: v16u8, c: v16u8) -> v16u8; + #[link_name = "llvm.mips.bmzi.b"] + fn msa_bmzi_b(a: v16u8, b: v16u8, c: i32) -> v16u8; + #[link_name = "llvm.mips.bneg.b"] + fn msa_bneg_b(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.bneg.h"] + fn msa_bneg_h(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.mips.bneg.w"] + fn msa_bneg_w(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.mips.bneg.d"] + fn msa_bneg_d(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.mips.bnegi.b"] + fn msa_bnegi_b(a: v16u8, b: i32) -> v16u8; + #[link_name = "llvm.mips.bnegi.h"] + fn msa_bnegi_h(a: v8u16, b: i32) -> v8u16; + #[link_name = "llvm.mips.bnegi.w"] + fn msa_bnegi_w(a: v4u32, b: i32) -> v4u32; + #[link_name = "llvm.mips.bnegi.d"] + fn msa_bnegi_d(a: v2u64, b: i32) -> v2u64; + #[link_name = "llvm.mips.bnz.b"] + fn msa_bnz_b(a: v16u8) -> i32; + #[link_name = "llvm.mips.bnz.h"] + fn msa_bnz_h(a: v8u16) -> i32; + #[link_name = "llvm.mips.bnz.w"] + fn msa_bnz_w(a: v4u32) -> i32; + #[link_name = "llvm.mips.bnz.d"] + fn msa_bnz_d(a: v2u64) -> i32; + #[link_name = "llvm.mips.bnz.v"] + fn msa_bnz_v(a: v16u8) -> i32; + #[link_name = "llvm.mips.bsel.v"] + fn msa_bsel_v(a: v16u8, b: v16u8, c: v16u8) -> v16u8; + #[link_name = "llvm.mips.bseli.b"] + fn msa_bseli_b(a: v16u8, b: v16u8, c: i32) -> v16u8; + #[link_name = "llvm.mips.bset.b"] + fn msa_bset_b(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.bset.h"] + fn msa_bset_h(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.mips.bset.w"] + fn msa_bset_w(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.mips.bset.d"] + fn msa_bset_d(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.mips.bseti.b"] + fn msa_bseti_b(a: v16u8, b: i32) -> v16u8; + #[link_name = "llvm.mips.bseti.h"] + fn msa_bseti_h(a: v8u16, b: i32) -> v8u16; + #[link_name = "llvm.mips.bseti.w"] + fn msa_bseti_w(a: v4u32, b: i32) -> v4u32; + #[link_name = "llvm.mips.bseti.d"] + fn msa_bseti_d(a: v2u64, b: i32) -> v2u64; + #[link_name = "llvm.mips.bz.b"] + fn msa_bz_b(a: v16u8) -> i32; + #[link_name = "llvm.mips.bz.h"] + fn msa_bz_h(a: v8u16) -> i32; + #[link_name = "llvm.mips.bz.w"] + fn msa_bz_w(a: v4u32) -> i32; + #[link_name = "llvm.mips.bz.d"] + fn msa_bz_d(a: v2u64) -> i32; + #[link_name = "llvm.mips.bz.v"] + fn msa_bz_v(a: v16u8) -> i32; + #[link_name = "llvm.mips.ceq.b"] + fn msa_ceq_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.ceq.h"] + fn msa_ceq_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.ceq.w"] + fn msa_ceq_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.ceq.d"] + fn msa_ceq_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.ceqi.b"] + fn msa_ceqi_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.mips.ceqi.h"] + fn msa_ceqi_h(a: v8i16, b: i32) -> v8i16; + #[link_name = 
"llvm.mips.ceqi.w"] + fn msa_ceqi_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.mips.ceqi.d"] + fn msa_ceqi_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.mips.cfcmsa"] + fn msa_cfcmsa(a: i32) -> i32; + #[link_name = "llvm.mips.cle.s.b"] + fn msa_cle_s_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.cle.s.h"] + fn msa_cle_s_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.cle.s.w"] + fn msa_cle_s_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.cle.s.d"] + fn msa_cle_s_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.cle.u.b"] + fn msa_cle_u_b(a: v16u8, b: v16u8) -> v16i8; + #[link_name = "llvm.mips.cle.u.h"] + fn msa_cle_u_h(a: v8u16, b: v8u16) -> v8i16; + #[link_name = "llvm.mips.cle.u.w"] + fn msa_cle_u_w(a: v4u32, b: v4u32) -> v4i32; + #[link_name = "llvm.mips.cle.u.d"] + fn msa_cle_u_d(a: v2u64, b: v2u64) -> v2i64; + #[link_name = "llvm.mips.clei.s.b"] + fn msa_clei_s_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.mips.clei.s.h"] + fn msa_clei_s_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.mips.clei.s.w"] + fn msa_clei_s_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.mips.clei.s.d"] + fn msa_clei_s_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.mips.clei.u.b"] + fn msa_clei_u_b(a: v16u8, b: i32) -> v16i8; + #[link_name = "llvm.mips.clei.u.h"] + fn msa_clei_u_h(a: v8u16, b: i32) -> v8i16; + #[link_name = "llvm.mips.clei.u.w"] + fn msa_clei_u_w(a: v4u32, b: i32) -> v4i32; + #[link_name = "llvm.mips.clei.u.d"] + fn msa_clei_u_d(a: v2u64, b: i32) -> v2i64; + #[link_name = "llvm.mips.clt.s.b"] + fn msa_clt_s_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.clt.s.h"] + fn msa_clt_s_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.clt.s.w"] + fn msa_clt_s_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.clt.s.d"] + fn msa_clt_s_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.clt.u.b"] + fn msa_clt_u_b(a: v16u8, b: v16u8) -> v16i8; + #[link_name = "llvm.mips.clt.u.h"] + fn msa_clt_u_h(a: v8u16, b: v8u16) -> v8i16; + #[link_name = "llvm.mips.clt.u.w"] + fn msa_clt_u_w(a: v4u32, b: v4u32) -> v4i32; + #[link_name = "llvm.mips.clt.u.d"] + fn msa_clt_u_d(a: v2u64, b: v2u64) -> v2i64; + #[link_name = "llvm.mips.clti.s.b"] + fn msa_clti_s_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.mips.clti.s.h"] + fn msa_clti_s_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.mips.clti.s.w"] + fn msa_clti_s_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.mips.clti.s.d"] + fn msa_clti_s_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.mips.clti.u.b"] + fn msa_clti_u_b(a: v16u8, b: i32) -> v16i8; + #[link_name = "llvm.mips.clti.u.h"] + fn msa_clti_u_h(a: v8u16, b: i32) -> v8i16; + #[link_name = "llvm.mips.clti.u.w"] + fn msa_clti_u_w(a: v4u32, b: i32) -> v4i32; + #[link_name = "llvm.mips.clti.u.d"] + fn msa_clti_u_d(a: v2u64, b: i32) -> v2i64; + #[link_name = "llvm.mips.copy.s.b"] + fn msa_copy_s_b(a: v16i8, b: i32) -> i32; + #[link_name = "llvm.mips.copy.s.h"] + fn msa_copy_s_h(a: v8i16, b: i32) -> i32; + #[link_name = "llvm.mips.copy.s.w"] + fn msa_copy_s_w(a: v4i32, b: i32) -> i32; + #[link_name = "llvm.mips.copy.s.d"] + fn msa_copy_s_d(a: v2i64, b: i32) -> i64; + #[link_name = "llvm.mips.copy.u.b"] + fn msa_copy_u_b(a: v16i8, b: i32) -> u32; + #[link_name = "llvm.mips.copy.u.h"] + fn msa_copy_u_h(a: v8i16, b: i32) -> u32; + #[link_name = "llvm.mips.copy.u.w"] + fn msa_copy_u_w(a: v4i32, b: i32) -> u32; + #[link_name = "llvm.mips.copy.u.d"] + fn msa_copy_u_d(a: v2i64, b: i32) -> 
u64; + #[link_name = "llvm.mips.ctcmsa"] + fn msa_ctcmsa(imm5: i32, a: i32) -> (); + #[link_name = "llvm.mips.div.s.b"] + fn msa_div_s_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.div.s.h"] + fn msa_div_s_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.div.s.w"] + fn msa_div_s_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.div.s.d"] + fn msa_div_s_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.div.u.b"] + fn msa_div_u_b(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.div.u.h"] + fn msa_div_u_h(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.mips.div.u.w"] + fn msa_div_u_w(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.mips.div.u.d"] + fn msa_div_u_d(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.mips.dotp.s.h"] + fn msa_dotp_s_h(a: v16i8, b: v16i8) -> v8i16; + #[link_name = "llvm.mips.dotp.s.w"] + fn msa_dotp_s_w(a: v8i16, b: v8i16) -> v4i32; + #[link_name = "llvm.mips.dotp.s.d"] + fn msa_dotp_s_d(a: v4i32, b: v4i32) -> v2i64; + #[link_name = "llvm.mips.dotp.u.h"] + fn msa_dotp_u_h(a: v16u8, b: v16u8) -> v8u16; + #[link_name = "llvm.mips.dotp.u.w"] + fn msa_dotp_u_w(a: v8u16, b: v8u16) -> v4u32; + #[link_name = "llvm.mips.dotp.u.d"] + fn msa_dotp_u_d(a: v4u32, b: v4u32) -> v2u64; + #[link_name = "llvm.mips.dpadd.s.h"] + fn msa_dpadd_s_h(a: v8i16, b: v16i8, c: v16i8) -> v8i16; + #[link_name = "llvm.mips.dpadd.s.w"] + fn msa_dpadd_s_w(a: v4i32, b: v8i16, c: v8i16) -> v4i32; + #[link_name = "llvm.mips.dpadd.s.d"] + fn msa_dpadd_s_d(a: v2i64, b: v4i32, c: v4i32) -> v2i64; + #[link_name = "llvm.mips.dpadd.u.h"] + fn msa_dpadd_u_h(a: v8u16, b: v16u8, c: v16u8) -> v8u16; + #[link_name = "llvm.mips.dpadd.u.w"] + fn msa_dpadd_u_w(a: v4u32, b: v8u16, c: v8u16) -> v4u32; + #[link_name = "llvm.mips.dpadd.u.d"] + fn msa_dpadd_u_d(a: v2u64, b: v4u32, c: v4u32) -> v2u64; + #[link_name = "llvm.mips.dpsub.s.h"] + fn msa_dpsub_s_h(a: v8i16, b: v16i8, c: v16i8) -> v8i16; + #[link_name = "llvm.mips.dpsub.s.w"] + fn msa_dpsub_s_w(a: v4i32, b: v8i16, c: v8i16) -> v4i32; + #[link_name = "llvm.mips.dpsub.s.d"] + fn msa_dpsub_s_d(a: v2i64, b: v4i32, c: v4i32) -> v2i64; + #[link_name = "llvm.mips.dpsub.u.h"] + fn msa_dpsub_u_h(a: v8i16, b: v16u8, c: v16u8) -> v8i16; + #[link_name = "llvm.mips.dpsub.u.w"] + fn msa_dpsub_u_w(a: v4i32, b: v8u16, c: v8u16) -> v4i32; + #[link_name = "llvm.mips.dpsub.u.d"] + fn msa_dpsub_u_d(a: v2i64, b: v4u32, c: v4u32) -> v2i64; + #[link_name = "llvm.mips.fadd.w"] + fn msa_fadd_w(a: v4f32, b: v4f32) -> v4f32; + #[link_name = "llvm.mips.fadd.d"] + fn msa_fadd_d(a: v2f64, b: v2f64) -> v2f64; + #[link_name = "llvm.mips.fcaf.w"] + fn msa_fcaf_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fcaf.d"] + fn msa_fcaf_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fceq.w"] + fn msa_fceq_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fceq.d"] + fn msa_fceq_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fclass.w"] + fn msa_fclass_w(a: v4f32) -> v4i32; + #[link_name = "llvm.mips.fclass.d"] + fn msa_fclass_d(a: v2f64) -> v2i64; + #[link_name = "llvm.mips.fcle.w"] + fn msa_fcle_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fcle.d"] + fn msa_fcle_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fclt.w"] + fn msa_fclt_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fclt.d"] + fn msa_fclt_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fcne.w"] + fn msa_fcne_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fcne.d"] + fn 
msa_fcne_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fcor.w"] + fn msa_fcor_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fcor.d"] + fn msa_fcor_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fcueq.w"] + fn msa_fcueq_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fcueq.d"] + fn msa_fcueq_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fcule.w"] + fn msa_fcule_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fcule.d"] + fn msa_fcule_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fcult.w"] + fn msa_fcult_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fcult.d"] + fn msa_fcult_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fcun.w"] + fn msa_fcun_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fcun.d"] + fn msa_fcun_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fcune.w"] + fn msa_fcune_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fcune.d"] + fn msa_fcune_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fdiv.w"] + fn msa_fdiv_w(a: v4f32, b: v4f32) -> v4f32; + #[link_name = "llvm.mips.fdiv.d"] + fn msa_fdiv_d(a: v2f64, b: v2f64) -> v2f64; + // FIXME: 16-bit floats + // #[link_name = "llvm.mips.fexdo.h"] + // fn msa_fexdo_h(a: v4f32, b: v4f32) -> f16x8; + #[link_name = "llvm.mips.fexdo.w"] + fn msa_fexdo_w(a: v2f64, b: v2f64) -> v4f32; + #[link_name = "llvm.mips.fexp2.w"] + fn msa_fexp2_w(a: v4f32, b: v4i32) -> v4f32; + #[link_name = "llvm.mips.fexp2.d"] + fn msa_fexp2_d(a: v2f64, b: v2i64) -> v2f64; + // FIXME: 16-bit floats + // #[link_name = "llvm.mips.fexupl.w"] + // fn msa_fexupl_w(a: f16x8) -> v4f32; + #[link_name = "llvm.mips.fexupl.d"] + fn msa_fexupl_d(a: v4f32) -> v2f64; + // FIXME: 16-bit floats + // #[link_name = "llvm.mips.fexupr.w"] + // fn msa_fexupr_w(a: f16x8) -> v4f32; + #[link_name = "llvm.mips.fexupr.d"] + fn msa_fexupr_d(a: v4f32) -> v2f64; + #[link_name = "llvm.mips.ffint.s.w"] + fn msa_ffint_s_w(a: v4i32) -> v4f32; + #[link_name = "llvm.mips.ffint.s.d"] + fn msa_ffint_s_d(a: v2i64) -> v2f64; + #[link_name = "llvm.mips.ffint.u.w"] + fn msa_ffint_u_w(a: v4u32) -> v4f32; + #[link_name = "llvm.mips.ffint.u.d"] + fn msa_ffint_u_d(a: v2u64) -> v2f64; + #[link_name = "llvm.mips.ffql.w"] + fn msa_ffql_w(a: v8i16) -> v4f32; + #[link_name = "llvm.mips.ffql.d"] + fn msa_ffql_d(a: v4i32) -> v2f64; + #[link_name = "llvm.mips.ffqr.w"] + fn msa_ffqr_w(a: v8i16) -> v4f32; + #[link_name = "llvm.mips.ffqr.d"] + fn msa_ffqr_d(a: v4i32) -> v2f64; + #[link_name = "llvm.mips.fill.b"] + fn msa_fill_b(a: i32) -> v16i8; + #[link_name = "llvm.mips.fill.h"] + fn msa_fill_h(a: i32) -> v8i16; + #[link_name = "llvm.mips.fill.w"] + fn msa_fill_w(a: i32) -> v4i32; + #[link_name = "llvm.mips.fill.d"] + fn msa_fill_d(a: i64) -> v2i64; + #[link_name = "llvm.mips.flog2.w"] + fn msa_flog2_w(a: v4f32) -> v4f32; + #[link_name = "llvm.mips.flog2.d"] + fn msa_flog2_d(a: v2f64) -> v2f64; + #[link_name = "llvm.mips.fmadd.w"] + fn msa_fmadd_w(a: v4f32, b: v4f32, c: v4f32) -> v4f32; + #[link_name = "llvm.mips.fmadd.d"] + fn msa_fmadd_d(a: v2f64, b: v2f64, c: v2f64) -> v2f64; + #[link_name = "llvm.mips.fmax.w"] + fn msa_fmax_w(a: v4f32, b: v4f32) -> v4f32; + #[link_name = "llvm.mips.fmax.d"] + fn msa_fmax_d(a: v2f64, b: v2f64) -> v2f64; + #[link_name = "llvm.mips.fmax.a.w"] + fn msa_fmax_a_w(a: v4f32, b: v4f32) -> v4f32; + #[link_name = "llvm.mips.fmax.a.d"] + fn msa_fmax_a_d(a: v2f64, b: v2f64) -> v2f64; + #[link_name = "llvm.mips.fmin.w"] + fn msa_fmin_w(a: v4f32, 
b: v4f32) -> v4f32; + #[link_name = "llvm.mips.fmin.d"] + fn msa_fmin_d(a: v2f64, b: v2f64) -> v2f64; + #[link_name = "llvm.mips.fmin.a.w"] + fn msa_fmin_a_w(a: v4f32, b: v4f32) -> v4f32; + #[link_name = "llvm.mips.fmin.a.d"] + fn msa_fmin_a_d(a: v2f64, b: v2f64) -> v2f64; + #[link_name = "llvm.mips.fmsub.w"] + fn msa_fmsub_w(a: v4f32, b: v4f32, c: v4f32) -> v4f32; + #[link_name = "llvm.mips.fmsub.d"] + fn msa_fmsub_d(a: v2f64, b: v2f64, c: v2f64) -> v2f64; + #[link_name = "llvm.mips.fmul.w"] + fn msa_fmul_w(a: v4f32, b: v4f32) -> v4f32; + #[link_name = "llvm.mips.fmul.d"] + fn msa_fmul_d(a: v2f64, b: v2f64) -> v2f64; + #[link_name = "llvm.mips.frint.w"] + fn msa_frint_w(a: v4f32) -> v4f32; + #[link_name = "llvm.mips.frint.d"] + fn msa_frint_d(a: v2f64) -> v2f64; + #[link_name = "llvm.mips.frcp.w"] + fn msa_frcp_w(a: v4f32) -> v4f32; + #[link_name = "llvm.mips.frcp.d"] + fn msa_frcp_d(a: v2f64) -> v2f64; + #[link_name = "llvm.mips.frsqrt.w"] + fn msa_frsqrt_w(a: v4f32) -> v4f32; + #[link_name = "llvm.mips.frsqrt.d"] + fn msa_frsqrt_d(a: v2f64) -> v2f64; + #[link_name = "llvm.mips.fsaf.w"] + fn msa_fsaf_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fsaf.d"] + fn msa_fsaf_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fseq.w"] + fn msa_fseq_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fseq.d"] + fn msa_fseq_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fsle.w"] + fn msa_fsle_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fsle.d"] + fn msa_fsle_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fslt.w"] + fn msa_fslt_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fslt.d"] + fn msa_fslt_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fsne.w"] + fn msa_fsne_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fsne.d"] + fn msa_fsne_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fsor.w"] + fn msa_fsor_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fsor.d"] + fn msa_fsor_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fsqrt.w"] + fn msa_fsqrt_w(a: v4f32) -> v4f32; + #[link_name = "llvm.mips.fsqrt.d"] + fn msa_fsqrt_d(a: v2f64) -> v2f64; + #[link_name = "llvm.mips.fsub.w"] + fn msa_fsub_w(a: v4f32, b: v4f32) -> v4f32; + #[link_name = "llvm.mips.fsub.d"] + fn msa_fsub_d(a: v2f64, b: v2f64) -> v2f64; + #[link_name = "llvm.mips.fsueq.w"] + fn msa_fsueq_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fsueq.d"] + fn msa_fsueq_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fsule.w"] + fn msa_fsule_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fsule.d"] + fn msa_fsule_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fsult.w"] + fn msa_fsult_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fsult.d"] + fn msa_fsult_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fsun.w"] + fn msa_fsun_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fsun.d"] + fn msa_fsun_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.fsune.w"] + fn msa_fsune_w(a: v4f32, b: v4f32) -> v4i32; + #[link_name = "llvm.mips.fsune.d"] + fn msa_fsune_d(a: v2f64, b: v2f64) -> v2i64; + #[link_name = "llvm.mips.ftint.s.w"] + fn msa_ftint_s_w(a: v4f32) -> v4i32; + #[link_name = "llvm.mips.ftint.s.d"] + fn msa_ftint_s_d(a: v2f64) -> v2i64; + #[link_name = "llvm.mips.ftint.u.w"] + fn msa_ftint_u_w(a: v4f32) -> v4u32; + #[link_name = "llvm.mips.ftint.u.d"] + fn msa_ftint_u_d(a: v2f64) -> v2u64; + #[link_name = "llvm.mips.ftq.h"] + fn 
msa_ftq_h(a: v4f32, b: v4f32) -> v8i16; + #[link_name = "llvm.mips.ftq.w"] + fn msa_ftq_w(a: v2f64, b: v2f64) -> v4i32; + #[link_name = "llvm.mips.ftrunc.s.w"] + fn msa_ftrunc_s_w(a: v4f32) -> v4i32; + #[link_name = "llvm.mips.ftrunc.s.d"] + fn msa_ftrunc_s_d(a: v2f64) -> v2i64; + #[link_name = "llvm.mips.ftrunc.u.w"] + fn msa_ftrunc_u_w(a: v4f32) -> v4u32; + #[link_name = "llvm.mips.ftrunc.u.d"] + fn msa_ftrunc_u_d(a: v2f64) -> v2u64; + #[link_name = "llvm.mips.hadd.s.h"] + fn msa_hadd_s_h(a: v16i8, b: v16i8) -> v8i16; + #[link_name = "llvm.mips.hadd.s.w"] + fn msa_hadd_s_w(a: v8i16, b: v8i16) -> v4i32; + #[link_name = "llvm.mips.hadd.s.d"] + fn msa_hadd_s_d(a: v4i32, b: v4i32) -> v2i64; + #[link_name = "llvm.mips.hadd.u.h"] + fn msa_hadd_u_h(a: v16u8, b: v16u8) -> v8u16; + #[link_name = "llvm.mips.hadd.u.w"] + fn msa_hadd_u_w(a: v8u16, b: v8u16) -> v4u32; + #[link_name = "llvm.mips.hadd.u.d"] + fn msa_hadd_u_d(a: v4u32, b: v4u32) -> v2u64; + #[link_name = "llvm.mips.hsub.s.h"] + fn msa_hsub_s_h(a: v16i8, b: v16i8) -> v8i16; + #[link_name = "llvm.mips.hsub.s.w"] + fn msa_hsub_s_w(a: v8i16, b: v8i16) -> v4i32; + #[link_name = "llvm.mips.hsub.s.d"] + fn msa_hsub_s_d(a: v4i32, b: v4i32) -> v2i64; + #[link_name = "llvm.mips.hsub.u.h"] + fn msa_hsub_u_h(a: v16u8, b: v16u8) -> v8i16; + #[link_name = "llvm.mips.hsub.u.w"] + fn msa_hsub_u_w(a: v8u16, b: v8u16) -> v4i32; + #[link_name = "llvm.mips.hsub.u.d"] + fn msa_hsub_u_d(a: v4u32, b: v4u32) -> v2i64; + #[link_name = "llvm.mips.ilvev.b"] + fn msa_ilvev_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.ilvev.h"] + fn msa_ilvev_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.ilvev.w"] + fn msa_ilvev_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.ilvev.d"] + fn msa_ilvev_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.ilvl.b"] + fn msa_ilvl_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.ilvl.h"] + fn msa_ilvl_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.ilvl.w"] + fn msa_ilvl_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.ilvl.d"] + fn msa_ilvl_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.ilvod.b"] + fn msa_ilvod_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.ilvod.h"] + fn msa_ilvod_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.ilvod.w"] + fn msa_ilvod_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.ilvod.d"] + fn msa_ilvod_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.ilvr.b"] + fn msa_ilvr_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.ilvr.h"] + fn msa_ilvr_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.ilvr.w"] + fn msa_ilvr_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.ilvr.d"] + fn msa_ilvr_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.insert.b"] + fn msa_insert_b(a: v16i8, b: i32, c: i32) -> v16i8; + #[link_name = "llvm.mips.insert.h"] + fn msa_insert_h(a: v8i16, b: i32, c: i32) -> v8i16; + #[link_name = "llvm.mips.insert.w"] + fn msa_insert_w(a: v4i32, b: i32, c: i32) -> v4i32; + #[link_name = "llvm.mips.insert.d"] + fn msa_insert_d(a: v2i64, b: i32, c: i64) -> v2i64; + #[link_name = "llvm.mips.insve.b"] + fn msa_insve_b(a: v16i8, b: i32, c: v16i8) -> v16i8; + #[link_name = "llvm.mips.insve.h"] + fn msa_insve_h(a: v8i16, b: i32, c: v8i16) -> v8i16; + #[link_name = "llvm.mips.insve.w"] + fn msa_insve_w(a: v4i32, b: i32, c: v4i32) -> v4i32; + #[link_name = "llvm.mips.insve.d"] + fn msa_insve_d(a: v2i64, b: i32, c: v2i64) -> v2i64; + #[link_name = 
"llvm.mips.ld.b"] + fn msa_ld_b(mem_addr: *mut u8, b: i32) -> v16i8; + #[link_name = "llvm.mips.ld.h"] + fn msa_ld_h(mem_addr: *mut u8, b: i32) -> v8i16; + #[link_name = "llvm.mips.ld.w"] + fn msa_ld_w(mem_addr: *mut u8, b: i32) -> v4i32; + #[link_name = "llvm.mips.ld.d"] + fn msa_ld_d(mem_addr: *mut u8, b: i32) -> v2i64; + #[link_name = "llvm.mips.ldi.b"] + fn msa_ldi_b(a: i32) -> v16i8; + #[link_name = "llvm.mips.ldi.h"] + fn msa_ldi_h(a: i32) -> v8i16; + #[link_name = "llvm.mips.ldi.w"] + fn msa_ldi_w(a: i32) -> v4i32; + #[link_name = "llvm.mips.ldi.d"] + fn msa_ldi_d(a: i32) -> v2i64; + #[link_name = "llvm.mips.madd.q.h"] + fn msa_madd_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16; + #[link_name = "llvm.mips.madd.q.w"] + fn msa_madd_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32; + #[link_name = "llvm.mips.maddr.q.h"] + fn msa_maddr_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16; + #[link_name = "llvm.mips.maddr.q.w"] + fn msa_maddr_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32; + #[link_name = "llvm.mips.maddv.b"] + fn msa_maddv_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8; + #[link_name = "llvm.mips.maddv.h"] + fn msa_maddv_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16; + #[link_name = "llvm.mips.maddv.w"] + fn msa_maddv_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32; + #[link_name = "llvm.mips.maddv.d"] + fn msa_maddv_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64; + #[link_name = "llvm.mips.max.a.b"] + fn msa_max_a_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.max.a.h"] + fn msa_max_a_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.max.a.w"] + fn msa_max_a_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.max.a.d"] + fn msa_max_a_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.max.s.b"] + fn msa_max_s_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.max.s.h"] + fn msa_max_s_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.max.s.w"] + fn msa_max_s_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.max.s.d"] + fn msa_max_s_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.max.u.b"] + fn msa_max_u_b(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.max.u.h"] + fn msa_max_u_h(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.mips.max.u.w"] + fn msa_max_u_w(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.mips.max.u.d"] + fn msa_max_u_d(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.mips.maxi.s.b"] + fn msa_maxi_s_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.mips.maxi.s.h"] + fn msa_maxi_s_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.mips.maxi.s.w"] + fn msa_maxi_s_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.mips.maxi.s.d"] + fn msa_maxi_s_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.mips.maxi.u.b"] + fn msa_maxi_u_b(a: v16u8, b: i32) -> v16u8; + #[link_name = "llvm.mips.maxi.u.h"] + fn msa_maxi_u_h(a: v8u16, b: i32) -> v8u16; + #[link_name = "llvm.mips.maxi.u.w"] + fn msa_maxi_u_w(a: v4u32, b: i32) -> v4u32; + #[link_name = "llvm.mips.maxi.u.d"] + fn msa_maxi_u_d(a: v2u64, b: i32) -> v2u64; + #[link_name = "llvm.mips.min.a.b"] + fn msa_min_a_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.min.a.h"] + fn msa_min_a_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.min.a.w"] + fn msa_min_a_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.min.a.d"] + fn msa_min_a_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.min.s.b"] + fn msa_min_s_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.min.s.h"] + fn msa_min_s_h(a: v8i16, b: v8i16) -> v8i16; + 
#[link_name = "llvm.mips.min.s.w"] + fn msa_min_s_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.min.s.d"] + fn msa_min_s_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.min.u.b"] + fn msa_min_u_b(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.min.u.h"] + fn msa_min_u_h(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.mips.min.u.w"] + fn msa_min_u_w(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.mips.min.u.d"] + fn msa_min_u_d(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.mips.mini.s.b"] + fn msa_mini_s_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.mips.mini.s.h"] + fn msa_mini_s_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.mips.mini.s.w"] + fn msa_mini_s_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.mips.mini.s.d"] + fn msa_mini_s_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.mips.mini.u.b"] + fn msa_mini_u_b(a: v16u8, b: i32) -> v16u8; + #[link_name = "llvm.mips.mini.u.h"] + fn msa_mini_u_h(a: v8u16, b: i32) -> v8u16; + #[link_name = "llvm.mips.mini.u.w"] + fn msa_mini_u_w(a: v4u32, b: i32) -> v4u32; + #[link_name = "llvm.mips.mini.u.d"] + fn msa_mini_u_d(a: v2u64, b: i32) -> v2u64; + #[link_name = "llvm.mips.mod.s.b"] + fn msa_mod_s_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.mod.s.h"] + fn msa_mod_s_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.mod.s.w"] + fn msa_mod_s_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.mod.s.d"] + fn msa_mod_s_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.mod.u.b"] + fn msa_mod_u_b(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.mod.u.h"] + fn msa_mod_u_h(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.mips.mod.u.w"] + fn msa_mod_u_w(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.mips.mod.u.d"] + fn msa_mod_u_d(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.mips.move.v"] + fn msa_move_v(a: v16i8) -> v16i8; + #[link_name = "llvm.mips.msub.q.h"] + fn msa_msub_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16; + #[link_name = "llvm.mips.msub.q.w"] + fn msa_msub_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32; + #[link_name = "llvm.mips.msubr.q.h"] + fn msa_msubr_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16; + #[link_name = "llvm.mips.msubr.q.w"] + fn msa_msubr_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32; + #[link_name = "llvm.mips.msubv.b"] + fn msa_msubv_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8; + #[link_name = "llvm.mips.msubv.h"] + fn msa_msubv_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16; + #[link_name = "llvm.mips.msubv.w"] + fn msa_msubv_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32; + #[link_name = "llvm.mips.msubv.d"] + fn msa_msubv_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64; + #[link_name = "llvm.mips.mul.q.h"] + fn msa_mul_q_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.mul.q.w"] + fn msa_mul_q_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.mulr.q.h"] + fn msa_mulr_q_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.mulr.q.w"] + fn msa_mulr_q_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.mulv.b"] + fn msa_mulv_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.mulv.h"] + fn msa_mulv_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.mulv.w"] + fn msa_mulv_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.mulv.d"] + fn msa_mulv_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.nloc.b"] + fn msa_nloc_b(a: v16i8) -> v16i8; + #[link_name = "llvm.mips.nloc.h"] + fn msa_nloc_h(a: v8i16) -> v8i16; + #[link_name = "llvm.mips.nloc.w"] + fn msa_nloc_w(a: 
v4i32) -> v4i32; + #[link_name = "llvm.mips.nloc.d"] + fn msa_nloc_d(a: v2i64) -> v2i64; + #[link_name = "llvm.mips.nlzc.b"] + fn msa_nlzc_b(a: v16i8) -> v16i8; + #[link_name = "llvm.mips.nlzc.h"] + fn msa_nlzc_h(a: v8i16) -> v8i16; + #[link_name = "llvm.mips.nlzc.w"] + fn msa_nlzc_w(a: v4i32) -> v4i32; + #[link_name = "llvm.mips.nlzc.d"] + fn msa_nlzc_d(a: v2i64) -> v2i64; + #[link_name = "llvm.mips.nor.v"] + fn msa_nor_v(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.nori.b"] + fn msa_nori_b(a: v16u8, b: i32) -> v16u8; + #[link_name = "llvm.mips.or.v"] + fn msa_or_v(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.ori.b"] + fn msa_ori_b(a: v16u8, b: i32) -> v16u8; + #[link_name = "llvm.mips.pckev.b"] + fn msa_pckev_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.pckev.h"] + fn msa_pckev_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.pckev.w"] + fn msa_pckev_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.pckev.d"] + fn msa_pckev_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.pckod.b"] + fn msa_pckod_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.pckod.h"] + fn msa_pckod_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.pckod.w"] + fn msa_pckod_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.pckod.d"] + fn msa_pckod_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.pcnt.b"] + fn msa_pcnt_b(a: v16i8) -> v16i8; + #[link_name = "llvm.mips.pcnt.h"] + fn msa_pcnt_h(a: v8i16) -> v8i16; + #[link_name = "llvm.mips.pcnt.w"] + fn msa_pcnt_w(a: v4i32) -> v4i32; + #[link_name = "llvm.mips.pcnt.d"] + fn msa_pcnt_d(a: v2i64) -> v2i64; + #[link_name = "llvm.mips.sat.s.b"] + fn msa_sat_s_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.mips.sat.s.h"] + fn msa_sat_s_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.mips.sat.s.w"] + fn msa_sat_s_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.mips.sat.s.d"] + fn msa_sat_s_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.mips.sat.u.b"] + fn msa_sat_u_b(a: v16u8, b: i32) -> v16u8; + #[link_name = "llvm.mips.sat.u.h"] + fn msa_sat_u_h(a: v8u16, b: i32) -> v8u16; + #[link_name = "llvm.mips.sat.u.w"] + fn msa_sat_u_w(a: v4u32, b: i32) -> v4u32; + #[link_name = "llvm.mips.sat.u.d"] + fn msa_sat_u_d(a: v2u64, b: i32) -> v2u64; + #[link_name = "llvm.mips.shf.b"] + fn msa_shf_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.mips.shf.h"] + fn msa_shf_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.mips.shf.w"] + fn msa_shf_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.mips.sld.b"] + fn msa_sld_b(a: v16i8, b: v16i8, c: i32) -> v16i8; + #[link_name = "llvm.mips.sld.h"] + fn msa_sld_h(a: v8i16, b: v8i16, c: i32) -> v8i16; + #[link_name = "llvm.mips.sld.w"] + fn msa_sld_w(a: v4i32, b: v4i32, c: i32) -> v4i32; + #[link_name = "llvm.mips.sld.d"] + fn msa_sld_d(a: v2i64, b: v2i64, c: i32) -> v2i64; + #[link_name = "llvm.mips.sldi.b"] + fn msa_sldi_b(a: v16i8, b: v16i8, c: i32) -> v16i8; + #[link_name = "llvm.mips.sldi.h"] + fn msa_sldi_h(a: v8i16, b: v8i16, c: i32) -> v8i16; + #[link_name = "llvm.mips.sldi.w"] + fn msa_sldi_w(a: v4i32, b: v4i32, c: i32) -> v4i32; + #[link_name = "llvm.mips.sldi.d"] + fn msa_sldi_d(a: v2i64, b: v2i64, c: i32) -> v2i64; + #[link_name = "llvm.mips.sll.b"] + fn msa_sll_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.sll.h"] + fn msa_sll_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.sll.w"] + fn msa_sll_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.sll.d"] + fn msa_sll_d(a: v2i64, 
b: v2i64) -> v2i64; + #[link_name = "llvm.mips.slli.b"] + fn msa_slli_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.mips.slli.h"] + fn msa_slli_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.mips.slli.w"] + fn msa_slli_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.mips.slli.d"] + fn msa_slli_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.mips.splat.b"] + fn msa_splat_b(a: v16i8, c: i32) -> v16i8; + #[link_name = "llvm.mips.splat.h"] + fn msa_splat_h(a: v8i16, c: i32) -> v8i16; + #[link_name = "llvm.mips.splat.w"] + fn msa_splat_w(a: v4i32, w: i32) -> v4i32; + #[link_name = "llvm.mips.splat.d"] + fn msa_splat_d(a: v2i64, c: i32) -> v2i64; + #[link_name = "llvm.mips.splati.b"] + fn msa_splati_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.mips.splati.h"] + fn msa_splati_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.mips.splati.w"] + fn msa_splati_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.mips.splati.d"] + fn msa_splati_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.mips.sra.b"] + fn msa_sra_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.sra.h"] + fn msa_sra_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.sra.w"] + fn msa_sra_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.sra.d"] + fn msa_sra_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.srai.b"] + fn msa_srai_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.mips.srai.h"] + fn msa_srai_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.mips.srai.w"] + fn msa_srai_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.mips.srai.d"] + fn msa_srai_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.mips.srar.b"] + fn msa_srar_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.srar.h"] + fn msa_srar_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.srar.w"] + fn msa_srar_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.srar.d"] + fn msa_srar_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.srari.b"] + fn msa_srari_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.mips.srari.h"] + fn msa_srari_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.mips.srari.w"] + fn msa_srari_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.mips.srari.d"] + fn msa_srari_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.mips.srl.b"] + fn msa_srl_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.srl.h"] + fn msa_srl_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.srl.w"] + fn msa_srl_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.srl.d"] + fn msa_srl_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.srli.b"] + fn msa_srli_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.mips.srli.h"] + fn msa_srli_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.mips.srli.w"] + fn msa_srli_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.mips.srli.d"] + fn msa_srli_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.mips.srlr.b"] + fn msa_srlr_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.srlr.h"] + fn msa_srlr_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.srlr.w"] + fn msa_srlr_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.srlr.d"] + fn msa_srlr_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.srlri.b"] + fn msa_srlri_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.mips.srlri.h"] + fn msa_srlri_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.mips.srlri.w"] + fn msa_srlri_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.mips.srlri.d"] + 
fn msa_srlri_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.mips.st.b"] + fn msa_st_b(a: v16i8, mem_addr: *mut u8, imm_s10: i32) -> (); + #[link_name = "llvm.mips.st.h"] + fn msa_st_h(a: v8i16, mem_addr: *mut u8, imm_s11: i32) -> (); + #[link_name = "llvm.mips.st.w"] + fn msa_st_w(a: v4i32, mem_addr: *mut u8, imm_s12: i32) -> (); + #[link_name = "llvm.mips.st.d"] + fn msa_st_d(a: v2i64, mem_addr: *mut u8, imm_s13: i32) -> (); + #[link_name = "llvm.mips.subs.s.b"] + fn msa_subs_s_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.subs.s.h"] + fn msa_subs_s_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.subs.s.w"] + fn msa_subs_s_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.subs.s.d"] + fn msa_subs_s_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.subs.u.b"] + fn msa_subs_u_b(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.subs.u.h"] + fn msa_subs_u_h(a: v8u16, b: v8u16) -> v8u16; + #[link_name = "llvm.mips.subs.u.w"] + fn msa_subs_u_w(a: v4u32, b: v4u32) -> v4u32; + #[link_name = "llvm.mips.subs.u.d"] + fn msa_subs_u_d(a: v2u64, b: v2u64) -> v2u64; + #[link_name = "llvm.mips.subsus.u.b"] + fn msa_subsus_u_b(a: v16u8, b: v16i8) -> v16u8; + #[link_name = "llvm.mips.subsus.u.h"] + fn msa_subsus_u_h(a: v8u16, b: v8i16) -> v8u16; + #[link_name = "llvm.mips.subsus.u.w"] + fn msa_subsus_u_w(a: v4u32, b: v4i32) -> v4u32; + #[link_name = "llvm.mips.subsus.u.d"] + fn msa_subsus_u_d(a: v2u64, b: v2i64) -> v2u64; + #[link_name = "llvm.mips.subsuu.s.b"] + fn msa_subsuu_s_b(a: v16u8, b: v16u8) -> v16i8; + #[link_name = "llvm.mips.subsuu.s.h"] + fn msa_subsuu_s_h(a: v8u16, b: v8u16) -> v8i16; + #[link_name = "llvm.mips.subsuu.s.w"] + fn msa_subsuu_s_w(a: v4u32, b: v4u32) -> v4i32; + #[link_name = "llvm.mips.subsuu.s.d"] + fn msa_subsuu_s_d(a: v2u64, b: v2u64) -> v2i64; + #[link_name = "llvm.mips.subv.b"] + fn msa_subv_b(a: v16i8, b: v16i8) -> v16i8; + #[link_name = "llvm.mips.subv.h"] + fn msa_subv_h(a: v8i16, b: v8i16) -> v8i16; + #[link_name = "llvm.mips.subv.w"] + fn msa_subv_w(a: v4i32, b: v4i32) -> v4i32; + #[link_name = "llvm.mips.subv.d"] + fn msa_subv_d(a: v2i64, b: v2i64) -> v2i64; + #[link_name = "llvm.mips.subvi.b"] + fn msa_subvi_b(a: v16i8, b: i32) -> v16i8; + #[link_name = "llvm.mips.subvi.h"] + fn msa_subvi_h(a: v8i16, b: i32) -> v8i16; + #[link_name = "llvm.mips.subvi.w"] + fn msa_subvi_w(a: v4i32, b: i32) -> v4i32; + #[link_name = "llvm.mips.subvi.d"] + fn msa_subvi_d(a: v2i64, b: i32) -> v2i64; + #[link_name = "llvm.mips.vshf.b"] + fn msa_vshf_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8; + #[link_name = "llvm.mips.vshf.h"] + fn msa_vshf_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16; + #[link_name = "llvm.mips.vshf.w"] + fn msa_vshf_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32; + #[link_name = "llvm.mips.vshf.d"] + fn msa_vshf_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64; + #[link_name = "llvm.mips.xor.v"] + fn msa_xor_v(a: v16u8, b: v16u8) -> v16u8; + #[link_name = "llvm.mips.xori.b"] + fn msa_xori_b(a: v16u8, b: i32) -> v16u8; +} + +/// Vector Add Absolute Values. +/// +/// The absolute values of the elements in vector in `a` (sixteen signed 8-bit integer numbers) +/// are added to the absolute values of the elements in vector `b` (sixteen signed 8-bit integer numbers). +/// The result is written to vector (sixteen signed 8-bit integer numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(add_a.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_add_a_b(a: v16i8, b: v16i8) -> v16i8 { + msa_add_a_b(a, mem::transmute(b)) +} + +/// Vector Add Absolute Values +/// +/// The absolute values of the elements in vector in `a` (eight signed 16-bit integer numbers) +/// are added to the absolute values of the elements in vector `b` (eight signed 16-bit integer numbers). +/// The result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(add_a.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_add_a_h(a: v8i16, b: v8i16) -> v8i16 { + msa_add_a_h(a, mem::transmute(b)) +} + +/// Vector Add Absolute Values +/// +/// The absolute values of the elements in vector in `a` (four signed 32-bit integer numbers) +/// are added to the absolute values of the elements in vector `b` (four signed 32-bit integer numbers). +/// The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(add_a.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_add_a_w(a: v4i32, b: v4i32) -> v4i32 { + msa_add_a_w(a, mem::transmute(b)) +} + +/// Vector Add Absolute Values +/// +/// The absolute values of the elements in vector in `a` (two signed 64-bit integer numbers) +/// are added to the absolute values of the elements in vector `b` (two signed 64-bit integer numbers). +/// The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(add_a.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_add_a_d(a: v2i64, b: v2i64) -> v2i64 { + msa_add_a_d(a, mem::transmute(b)) +} + +/// Signed Saturated Vector Saturated Add of Absolute Values +/// +/// The absolute values of the elements in vector in `a` (sixteen signed 8-bit integer numbers) +/// are added to the absolute values of the elements in vector `b` (sixteen signed 8-bit integer numbers). +/// The saturated signed result is written to vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(adds_a.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_adds_a_b(a: v16i8, b: v16i8) -> v16i8 { + msa_adds_a_b(a, mem::transmute(b)) +} + +/// Vector Saturated Add of Absolute Values +/// +/// The absolute values of the elements in vector in `a` (eight signed 16-bit integer numbers) +/// are added to the absolute values of the elements in vector `b` (eight signed 16-bit integer numbers). +/// The saturated signed result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(adds_a.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_adds_a_h(a: v8i16, b: v8i16) -> v8i16 { + msa_adds_a_h(a, mem::transmute(b)) +} + +/// Vector Saturated Add of Absolute Values +/// +/// The absolute values of the elements in vector in `a` (four signed 32-bit integer numbers) +/// are added to the absolute values of the elements in vector `b` (four signed 32-bit integer numbers). +/// The saturated signed result is written to vector (four signed 32-bit integer numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(adds_a.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_adds_a_w(a: v4i32, b: v4i32) -> v4i32 { + msa_adds_a_w(a, mem::transmute(b)) +} + +/// Vector Saturated Add of Absolute Values +/// +/// The absolute values of the elements in vector in `a` (two signed 64-bit integer numbers) +/// are added to the absolute values of the elements in vector `b` (two signed 64-bit integer numbers). +/// The saturated signed result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(adds_a.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_adds_a_d(a: v2i64, b: v2i64) -> v2i64 { + msa_adds_a_d(a, mem::transmute(b)) +} + +/// Vector Signed Saturated Add of Signed Values +/// +/// The elements in vector in `a` (sixteen signed 8-bit integer numbers) +/// are added to the elements in vector `b` (sixteen signed 8-bit integer numbers). +/// Signed arithmetic is performed and overflows clamp to the largest and/or smallest +/// representable signed values before writing the result to vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(adds_s.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_adds_s_b(a: v16i8, b: v16i8) -> v16i8 { + msa_adds_s_b(a, mem::transmute(b)) +} + +/// Vector Signed Saturated Add of Signed Values +/// +/// The elements in vector in `a` (eight signed 16-bit integer numbers) +/// are added to the elements in vector `b` (eight signed 16-bit integer numbers). +/// Signed arithmetic is performed and overflows clamp to the largest and/or smallest +/// representable signed values before writing the result to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(adds_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_adds_s_h(a: v8i16, b: v8i16) -> v8i16 { + msa_adds_s_h(a, mem::transmute(b)) +} + +/// Vector Signed Saturated Add of Signed Values +/// +/// The elements in vector in `a` (four signed 32-bit integer numbers) +/// are added to the elements in vector `b` (four signed 32-bit integer numbers). +/// Signed arithmetic is performed and overflows clamp to the largest and/or smallest +/// representable signed values before writing the result to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(adds_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_adds_s_w(a: v4i32, b: v4i32) -> v4i32 { + msa_adds_s_w(a, mem::transmute(b)) +} + +/// Vector Signed Saturated Add of Signed Values +/// +/// The elements in vector in `a` (two signed 64-bit integer numbers) +/// are added to the elements in vector `b` (two signed 64-bit integer numbers). +/// Signed arithmetic is performed and overflows clamp to the largest and/or smallest +/// representable signed values before writing the result to vector (two signed 64-bit integer numbers). 
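As a rough per-lane model of the saturating signed add (`adds_s.*`) described above, not the intrinsic implementation:

// One lane of adds_s.b: signed addition that clamps instead of wrapping.
fn adds_s_lane(a: i8, b: i8) -> i8 {
    a.saturating_add(b)
}

fn main() {
    assert_eq!(adds_s_lane(120, 100), i8::MAX);  // clamps at 127
    assert_eq!(adds_s_lane(-120, -100), i8::MIN); // clamps at -128
}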
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_s.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_adds_s_d(a: v2i64, b: v2i64) -> v2i64 {
+    msa_adds_s_d(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Saturated Add of Unsigned Values
+///
+/// The elements in vector `a` (sixteen unsigned 8-bit integer numbers)
+/// are added to the elements in vector `b` (sixteen unsigned 8-bit integer numbers).
+/// Unsigned arithmetic is performed and overflows clamp to the largest
+/// representable unsigned values before writing the result to vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_u.b))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_adds_u_b(a: v16u8, b: v16u8) -> v16u8 {
+    msa_adds_u_b(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Saturated Add of Unsigned Values
+///
+/// The elements in vector `a` (eight unsigned 16-bit integer numbers)
+/// are added to the elements in vector `b` (eight unsigned 16-bit integer numbers).
+/// Unsigned arithmetic is performed and overflows clamp to the largest
+/// representable unsigned values before writing the result to vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_u.h))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_adds_u_h(a: v8u16, b: v8u16) -> v8u16 {
+    msa_adds_u_h(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Saturated Add of Unsigned Values
+///
+/// The elements in vector `a` (four unsigned 32-bit integer numbers)
+/// are added to the elements in vector `b` (four unsigned 32-bit integer numbers).
+/// Unsigned arithmetic is performed and overflows clamp to the largest
+/// representable unsigned values before writing the result to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_u.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_adds_u_w(a: v4u32, b: v4u32) -> v4u32 {
+    msa_adds_u_w(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Saturated Add of Unsigned Values
+///
+/// The elements in vector `a` (two unsigned 64-bit integer numbers)
+/// are added to the elements in vector `b` (two unsigned 64-bit integer numbers).
+/// Unsigned arithmetic is performed and overflows clamp to the largest
+/// representable unsigned values before writing the result to vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_u.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_adds_u_d(a: v2u64, b: v2u64) -> v2u64 {
+    msa_adds_u_d(a, mem::transmute(b))
+}
+
+/// Vector Add
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are added to the elements in vector `b` (sixteen signed 8-bit integer numbers).
+/// The result is written to vector (sixteen signed 8-bit integer numbers).
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(addv.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_addv_b(a: v16i8, b: v16i8) -> v16i8 { + msa_addv_b(a, mem::transmute(b)) +} + +/// Vector Add +/// +/// The elements in vector in `a` (eight signed 16-bit integer numbers) +/// are added to the elements in vector `b` (eight signed 16-bit integer numbers). +/// The result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(addv.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_addv_h(a: v8i16, b: v8i16) -> v8i16 { + msa_addv_h(a, mem::transmute(b)) +} + +/// Vector Add +/// +/// The elements in vector in `a` (four signed 32-bit integer numbers) +/// are added to the elements in vector `b` (four signed 32-bit integer numbers). +/// The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(addv.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_addv_w(a: v4i32, b: v4i32) -> v4i32 { + msa_addv_w(a, mem::transmute(b)) +} + +/// Vector Add +/// +/// The elements in vector in `a` (two signed 64-bit integer numbers) +/// are added to the elements in vector `b` (two signed 64-bit integer numbers). +/// The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(addv.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_addv_d(a: v2i64, b: v2i64) -> v2i64 { + msa_addv_d(a, mem::transmute(b)) +} + +/// Immediate Add +/// +/// The 5-bit immediate unsigned value `imm5` is added to the elements +/// vector in `a` (sixteen signed 8-bit integer numbers). +/// The result is written to vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(addvi.b, imm5 = 0b10111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_addvi_b(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM5, 5); + msa_addvi_b(a, IMM5) +} + +/// Immediate Add +/// +/// The 5-bit immediate unsigned value `imm5` is added to the elements +/// vector in `a` (eight signed 16-bit integer numbers). +/// The result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(addvi.h, imm5 = 0b10111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_addvi_h(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + msa_addvi_h(a, IMM5) +} + +/// Immediate Add +/// +/// The 5-bit immediate unsigned value `imm5` is added to the elements +/// vector in `a` (four signed 32-bit integer numbers). +/// The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(addvi.w, imm5 = 0b10111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_addvi_w(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM5, 5); + msa_addvi_w(a, IMM5) +} + +/// Immediate Add +/// +/// The 5-bit immediate unsigned value `imm5` is added to the elements +/// vector in `a` (two signed 64-bit integer numbers). 
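A hypothetical call site for the immediate form above, assuming a nightly compiler, a MIPS target with MSA enabled, and the `v16i8` type plus intrinsic in scope (e.g. from `core::arch::mips`); `#[rustc_legacy_const_generics(1)]` lets the constant be written as an ordinary second argument:

#![feature(stdarch_mips)]

#[cfg(all(target_arch = "mips", target_feature = "msa"))]
use core::arch::mips::{__msa_addvi_b, v16i8};

#[cfg(all(target_arch = "mips", target_feature = "msa"))]
unsafe fn add_three(a: v16i8) -> v16i8 {
    // The immediate must be a constant in 0..=31 (enforced by static_assert_uimm_bits!).
    __msa_addvi_b(a, 3)
}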
+/// The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(addvi.d, imm5 = 0b10111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_addvi_d(a: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM5, 5); + msa_addvi_d(a, IMM5) +} + +/// Vector Logical And +/// +/// Each bit of vector `a` (sixteen unsigned 8-bit integer numbers) +/// is combined with the corresponding bit of vector `b` (sixteen unsigned 8-bit integer numbers) +/// in a bitwise logical AND operation. +/// The result is written to vector (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(and.v))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_and_v(a: v16u8, b: v16u8) -> v16u8 { + msa_and_v(a, mem::transmute(b)) +} + +/// Immediate Logical And +/// +/// Each byte element of vector `a` (sixteen unsigned 8-bit integer numbers) +/// is combined with the 8-bit immediate i8 (signed 8-bit integer number) in a bitwise logical AND operation. +/// The result is written to vector (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(andi.b, imm8 = 0b10010111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_andi_b(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM8, 8); + msa_andi_b(a, IMM8) +} + +/// Vector Absolute Values of Signed Subtract +/// +/// The signed elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are subtracted from the signed elements in vector `b` (sixteen signed 8-bit integer numbers). +/// The absolute value of the signed result is written to vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(asub_s.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_asub_s_b(a: v16i8, b: v16i8) -> v16i8 { + msa_asub_s_b(a, mem::transmute(b)) +} + +/// Vector Absolute Values of Signed Subtract +/// +/// The signed elements in vector `a` (eight signed 16-bit integer numbers) +/// are subtracted from the signed elements in vector `b` (eight signed 16-bit integer numbers). +/// The absolute value of the signed result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(asub_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_asub_s_h(a: v8i16, b: v8i16) -> v8i16 { + msa_asub_s_h(a, mem::transmute(b)) +} + +/// Vector Absolute Values of Signed Subtract +/// +/// The signed elements in vector `a` (four signed 32-bit integer numbers) +/// are subtracted from the signed elements in vector `b` (four signed 32-bit integer numbers). +/// The absolute value of the signed result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(asub_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_asub_s_w(a: v4i32, b: v4i32) -> v4i32 { + msa_asub_s_w(a, mem::transmute(b)) +} + +/// Vector Absolute Values of Signed Subtract +/// +/// The signed elements in vector `a` (two signed 64-bit integer numbers) +/// are subtracted from the signed elements in vector `b` (two signed 64-bit integer numbers). 
+/// The absolute value of the signed result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(asub_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_asub_s_d(a: v2i64, b: v2i64) -> v2i64 { + msa_asub_s_d(a, mem::transmute(b)) +} + +/// Vector Absolute Values of Unsigned Subtract +/// +/// The unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers) +/// are subtracted from the unsigned elements in vector `b` (sixteen unsigned 8-bit integer numbers). +/// The absolute value of the unsigned result is written to vector (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(asub_u.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_asub_u_b(a: v16u8, b: v16u8) -> v16u8 { + msa_asub_u_b(a, mem::transmute(b)) +} + +/// Vector Absolute Values of Unsigned Subtract +/// +/// The unsigned elements in vector `a` (eight unsigned 16-bit integer numbers) +/// are subtracted from the unsigned elements in vector `b` (eight unsigned 16-bit integer numbers). +/// The absolute value of the unsigned result is written to vector (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(asub_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_asub_u_h(a: v8u16, b: v8u16) -> v8u16 { + msa_asub_u_h(a, mem::transmute(b)) +} + +/// Vector Absolute Values of Unsigned Subtract +/// +/// The unsigned elements in vector `a` (four unsigned 32-bit integer numbers) +/// are subtracted from the unsigned elements in vector `b` (four unsigned 32-bit integer numbers). +/// The absolute value of the unsigned result is written to vector (four unsigned 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(asub_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_asub_u_w(a: v4u32, b: v4u32) -> v4u32 { + msa_asub_u_w(a, mem::transmute(b)) +} + +/// Vector Absolute Values of Unsigned Subtract +/// +/// The unsigned elements in vector `a` (two unsigned 64-bit integer numbers) +/// are subtracted from the unsigned elements in vector `b` (two unsigned 64-bit integer numbers). +/// The absolute value of the unsigned result is written to vector (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(asub_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_asub_u_d(a: v2u64, b: v2u64) -> v2u64 { + msa_asub_u_d(a, mem::transmute(b)) +} + +/// Vector Signed Average +/// +/// The elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are added to the elements in vector `b` (sixteen signed 8-bit integer numbers). +/// The addition is done signed with full precision, i.e. the result has one extra bit. +/// Signed division by 2 (or arithmetic shift right by one bit) is performed before +/// writing the result to vector (sixteen signed 8-bit integer numbers). 
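Per lane, the unsigned absolute-difference behaviour of `asub_u.*` described above corresponds to `abs_diff` (a sketch, not the intrinsic):

// One lane of asub_u.b: |a - b| on unsigned values.
fn asub_u_lane(a: u8, b: u8) -> u8 {
    a.abs_diff(b)
}

fn main() {
    assert_eq!(asub_u_lane(3, 10), 7);
}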
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ave_s.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ave_s_b(a: v16i8, b: v16i8) -> v16i8 { + msa_ave_s_b(a, mem::transmute(b)) +} + +/// Vector Signed Average +/// +/// The elements in vector `a` (eight signed 16-bit integer numbers) +/// are added to the elements in vector `b` (eight signed 16-bit integer numbers). +/// The addition is done signed with full precision, i.e. the result has one extra bit. +/// Signed division by 2 (or arithmetic shift right by one bit) is performed before +/// writing the result to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ave_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ave_s_h(a: v8i16, b: v8i16) -> v8i16 { + msa_ave_s_h(a, mem::transmute(b)) +} + +/// Vector Signed Average +/// +/// The elements in vector `a` (four signed 32-bit integer numbers) +/// are added to the elements in vector `b` (four signed 32-bit integer numbers). +/// The addition is done signed with full precision, i.e. the result has one extra bit. +/// Signed division by 2 (or arithmetic shift right by one bit) is performed before +/// writing the result to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ave_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ave_s_w(a: v4i32, b: v4i32) -> v4i32 { + msa_ave_s_w(a, mem::transmute(b)) +} + +/// Vector Signed Average +/// +/// The elements in vector `a` (two signed 64-bit integer numbers) +/// are added to the elements in vector `b` (two signed 64-bit integer numbers). +/// The addition is done signed with full precision, i.e. the result has one extra bit. +/// Signed division by 2 (or arithmetic shift right by one bit) is performed before +/// writing the result to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ave_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ave_s_d(a: v2i64, b: v2i64) -> v2i64 { + msa_ave_s_d(a, mem::transmute(b)) +} + +/// Vector Unsigned Average +/// +/// The elements in vector `a` (sixteen unsigned 8-bit integer numbers) +/// are added to the elements in vector `b` (sixteen unsigned 8-bit integer numbers). +/// The addition is done unsigned with full precision, i.e. the result has one extra bit. +/// Unsigned division by 2 (or logical shift right by one bit) is performed before +/// writing the result to vector (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ave_u.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ave_u_b(a: v16u8, b: v16u8) -> v16u8 { + msa_ave_u_b(a, mem::transmute(b)) +} + +/// Vector Unsigned Average +/// +/// The elements in vector `a` (eight unsigned 16-bit integer numbers) +/// are added to the elements in vector `b` (eight unsigned 16-bit integer numbers). +/// The addition is done unsigned with full precision, i.e. the result has one extra bit. +/// Unsigned division by 2 (or logical shift right by one bit) is performed before +/// writing the result to vector (eight unsigned 16-bit integer numbers). 
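A scalar sketch of the signed average described above: the sum is formed with one extra bit of precision, then arithmetically shifted right by one (truncating toward negative infinity):

// One lane of ave_s.b: full-precision add, then arithmetic shift right by one.
fn ave_s_lane(a: i8, b: i8) -> i8 {
    ((a as i16 + b as i16) >> 1) as i8
}

fn main() {
    assert_eq!(ave_s_lane(-3, 4), 0);   // 1 >> 1
    assert_eq!(ave_s_lane(-3, -4), -4); // -7 >> 1 rounds toward -infinity
}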
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ave_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ave_u_h(a: v8u16, b: v8u16) -> v8u16 { + msa_ave_u_h(a, mem::transmute(b)) +} + +/// Vector Unsigned Average +/// +/// The elements in vector `a` (four unsigned 32-bit integer numbers) +/// are added to the elements in vector `b` (four unsigned 32-bit integer numbers). +/// The addition is done unsigned with full precision, i.e. the result has one extra bit. +/// Unsigned division by 2 (or logical shift right by one bit) is performed before +/// writing the result to vector (four unsigned 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ave_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ave_u_w(a: v4u32, b: v4u32) -> v4u32 { + msa_ave_u_w(a, mem::transmute(b)) +} + +/// Vector Unsigned Average +/// +/// The elements in vector `a` (two unsigned 64-bit integer numbers) +/// are added to the elements in vector `b` (two unsigned 64-bit integer numbers). +/// The addition is done unsigned with full precision, i.e. the result has one extra bit. +/// Unsigned division by 2 (or logical shift right by one bit) is performed before +/// writing the result to vector (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ave_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ave_u_d(a: v2u64, b: v2u64) -> v2u64 { + msa_ave_u_d(a, mem::transmute(b)) +} + +/// Vector Signed Average Rounded +/// +/// The elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are added to the elements in vector `b` (sixteen signed 8-bit integer numbers). +/// The addition of the elements plus 1 (for rounding) is done signed with full precision, +/// i.e. the result has one extra bit. +/// Signed division by 2 (or arithmetic shift right by one bit) is performed before +/// writing the result to vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(aver_s.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_aver_s_b(a: v16i8, b: v16i8) -> v16i8 { + msa_aver_s_b(a, mem::transmute(b)) +} + +/// Vector Signed Average Rounded +/// +/// The elements in vector `a` (eight signed 16-bit integer numbers) +/// are added to the elements in vector `b` (eight signed 16-bit integer numbers). +/// The addition of the elements plus 1 (for rounding) is done signed with full precision, +/// i.e. the result has one extra bit. +/// Signed division by 2 (or arithmetic shift right by one bit) is performed before +/// writing the result to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(aver_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_aver_s_h(a: v8i16, b: v8i16) -> v8i16 { + msa_aver_s_h(a, mem::transmute(b)) +} + +/// Vector Signed Average Rounded +/// +/// The elements in vector `a` (four signed 32-bit integer numbers) +/// are added to the elements in vector `b` (four signed 32-bit integer numbers). +/// The addition of the elements plus 1 (for rounding) is done signed with full precision, +/// i.e. the result has one extra bit. 
+/// Signed division by 2 (or arithmetic shift right by one bit) is performed before +/// writing the result to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(aver_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_aver_s_w(a: v4i32, b: v4i32) -> v4i32 { + msa_aver_s_w(a, mem::transmute(b)) +} + +/// Vector Signed Average Rounded +/// +/// The elements in vector `a` (two signed 64-bit integer numbers) +/// are added to the elements in vector `b` (two signed 64-bit integer numbers). +/// The addition of the elements plus 1 (for rounding) is done signed with full precision, +/// i.e. the result has one extra bit. +/// Signed division by 2 (or arithmetic shift right by one bit) is performed before +/// writing the result to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(aver_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_aver_s_d(a: v2i64, b: v2i64) -> v2i64 { + msa_aver_s_d(a, mem::transmute(b)) +} + +/// Vector Unsigned Average Rounded +/// +/// The elements in vector `a` (sixteen unsigned 8-bit integer numbers) +/// are added to the elements in vector `b` (sixteen unsigned 8-bit integer numbers). +/// The addition of the elements plus 1 (for rounding) is done unsigned with full precision, +/// i.e. the result has one extra bit. +/// Unsigned division by 2 (or logical shift right by one bit) is performed before +/// writing the result to vector (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(aver_u.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_aver_u_b(a: v16u8, b: v16u8) -> v16u8 { + msa_aver_u_b(a, mem::transmute(b)) +} + +/// Vector Unsigned Average Rounded +/// +/// The elements in vector `a` (eight unsigned 16-bit integer numbers) +/// are added to the elements in vector `b` (eight unsigned 16-bit integer numbers). +/// The addition of the elements plus 1 (for rounding) is done unsigned with full precision, +/// i.e. the result has one extra bit. +/// Unsigned division by 2 (or logical shift right by one bit) is performed before +/// writing the result to vector (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(aver_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_aver_u_h(a: v8u16, b: v8u16) -> v8u16 { + msa_aver_u_h(a, mem::transmute(b)) +} + +/// Vector Unsigned Average Rounded +/// +/// The elements in vector `a` (four unsigned 32-bit integer numbers) +/// are added to the elements in vector `b` (four unsigned 32-bit integer numbers). +/// The addition of the elements plus 1 (for rounding) is done unsigned with full precision, +/// i.e. the result has one extra bit. +/// Unsigned division by 2 (or logical shift right by one bit) is performed before +/// writing the result to vector (four unsigned 32-bit integer numbers). 
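The rounded variant adds 1 before the halving shift; per lane, the unsigned case looks like this (a sketch):

// One lane of aver_u.b: (a + b + 1) >> 1, computed without intermediate overflow.
fn aver_u_lane(a: u8, b: u8) -> u8 {
    ((a as u16 + b as u16 + 1) >> 1) as u8
}

fn main() {
    assert_eq!(aver_u_lane(3, 4), 4);
    assert_eq!(aver_u_lane(u8::MAX, u8::MAX), u8::MAX);
}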
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(aver_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_aver_u_w(a: v4u32, b: v4u32) -> v4u32 { + msa_aver_u_w(a, mem::transmute(b)) +} + +/// Vector Unsigned Average Rounded +/// +/// The elements in vector `a` (two unsigned 64-bit integer numbers) +/// are added to the elements in vector `b` (two unsigned 64-bit integer numbers). +/// The addition of the elements plus 1 (for rounding) is done unsigned with full precision, +/// i.e. the result has one extra bit. +/// Unsigned division by 2 (or logical shift right by one bit) is performed before +/// writing the result to vector (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(aver_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_aver_u_d(a: v2u64, b: v2u64) -> v2u64 { + msa_aver_u_d(a, mem::transmute(b)) +} + +/// Vector Bit Clear +/// +/// Clear (set to 0) one bit in each element of vector `a` (sixteen unsigned 8-bit integer numbers). +/// The bit position is given by the elements in `b` (sixteen unsigned 8-bit integer numbers) +/// modulo the size of the element in bits. +/// The result is written to vector (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bclr.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bclr_b(a: v16u8, b: v16u8) -> v16u8 { + msa_bclr_b(a, mem::transmute(b)) +} + +/// Vector Bit Clear +/// +/// Clear (set to 0) one bit in each element of vector `a` (eight unsigned 16-bit integer numbers). +/// The bit position is given by the elements in `b` (eight unsigned 16-bit integer numbers) +/// modulo the size of the element in bits. +/// The result is written to vector (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bclr.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bclr_h(a: v8u16, b: v8u16) -> v8u16 { + msa_bclr_h(a, mem::transmute(b)) +} + +/// Vector Bit Clear +/// +/// Clear (set to 0) one bit in each element of vector `a` (four unsigned 32-bit integer numbers). +/// The bit position is given by the elements in `b` (four unsigned 32-bit integer numbers) +/// modulo the size of the element in bits. +/// The result is written to vector (four unsigned 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bclr.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bclr_w(a: v4u32, b: v4u32) -> v4u32 { + msa_bclr_w(a, mem::transmute(b)) +} + +/// Vector Bit Clear +/// +/// Clear (set to 0) one bit in each element of vector `a` (two unsigned 64-bit integer numbers). +/// The bit position is given by the elements in `b` (two unsigned 64-bit integer numbers) +/// modulo the size of the element in bits. +/// The result is written to vector (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bclr.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bclr_d(a: v2u64, b: v2u64) -> v2u64 { + msa_bclr_d(a, mem::transmute(b)) +} + +/// Immediate Bit Clear +/// +/// Clear (set to 0) one bit in each element of vector `a` (sixteen unsigned 8-bit integer numbers). 
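Per lane, the bit-clear operation described above reduces to masking out one bit position taken modulo the element width (a sketch for the byte form):

// One lane of bclr.b: clear the bit selected by `b % 8`.
fn bclr_lane(a: u8, b: u8) -> u8 {
    a & !(1u8 << (b % 8))
}

fn main() {
    assert_eq!(bclr_lane(0b1111_1111, 3), 0b1111_0111);
}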
+/// The bit position is given by the immediate `m` modulo the size of the element in bits. +/// The result is written to vector (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bclri.b, imm3 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bclri_b(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM3, 3); + msa_bclri_b(a, IMM3) +} + +/// Immediate Bit Clear +/// +/// Clear (set to 0) one bit in each element of vector `a` (eight unsigned 16-bit integer numbers). +/// The bit position is given by the immediate `m` modulo the size of the element in bits. +/// The result is written to vector (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bclri.h, imm4 = 0b1111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bclri_h(a: v8u16) -> v8u16 { + static_assert_uimm_bits!(IMM4, 4); + msa_bclri_h(a, IMM4) +} + +/// Immediate Bit Clear +/// +/// Clear (set to 0) one bit in each element of vector `a` (four unsigned 32-bit integer numbers). +/// The bit position is given by the immediate `m` modulo the size of the element in bits. +/// The result is written to vector (four unsigned 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bclri.w, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bclri_w(a: v4u32) -> v4u32 { + static_assert_uimm_bits!(IMM5, 5); + msa_bclri_w(a, IMM5) +} + +/// Immediate Bit Clear +/// +/// Clear (set to 0) one bit in each element of vector `a` (two unsigned 64-bit integer numbers). +/// The bit position is given by the immediate `m` modulo the size of the element in bits. +/// The result is written to vector (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bclri.d, imm6 = 0b111111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bclri_d(a: v2u64) -> v2u64 { + static_assert_uimm_bits!(IMM6, 6); + msa_bclri_d(a, IMM6) +} + +/// Vector Bit Insert Left +/// +/// Copy most significant (left) bits in each element of vector `b` (sixteen unsigned 8-bit integer numbers) +/// to elements in vector `a` (sixteen unsigned 8-bit integer numbers) while preserving the least significant (right) bits. +/// The number of bits to copy is given by the elements in vector `c` (sixteen unsigned 8-bit integer numbers) +/// modulo the size of the element in bits plus 1. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(binsl.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_binsl_b(a: v16u8, b: v16u8, c: v16u8) -> v16u8 { + msa_binsl_b(a, mem::transmute(b), c) +} + +/// Vector Bit Insert Left +/// +/// Copy most significant (left) bits in each element of vector `b` (eight unsigned 16-bit integer numbers) +/// to elements in vector `a` (eight unsigned 16-bit integer numbers) while preserving the least significant (right) bits. +/// The number of bits to copy is given by the elements in vector `c` (eight unsigned 16-bit integer numbers) +/// modulo the size of the element in bits plus 1. 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(binsl.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_binsl_h(a: v8u16, b: v8u16, c: v8u16) -> v8u16 { + msa_binsl_h(a, mem::transmute(b), c) +} + +/// Vector Bit Insert Left +/// +/// Copy most significant (left) bits in each element of vector `b` (four unsigned 32-bit integer numbers) +/// to elements in vector `a` (four unsigned 32-bit integer numbers) while preserving the least significant (right) bits. +/// The number of bits to copy is given by the elements in vector `c` (four unsigned 32-bit integer numbers) +/// modulo the size of the element in bits plus 1. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(binsl.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_binsl_w(a: v4u32, b: v4u32, c: v4u32) -> v4u32 { + msa_binsl_w(a, mem::transmute(b), c) +} + +/// Vector Bit Insert Left +/// +/// Copy most significant (left) bits in each element of vector `b` (two unsigned 64-bit integer numbers) +/// to elements in vector `a` (two unsigned 64-bit integer numbers) while preserving the least significant (right) bits. +/// The number of bits to copy is given by the elements in vector `c` (two unsigned 64-bit integer numbers) +/// modulo the size of the element in bits plus 1. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(binsl.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_binsl_d(a: v2u64, b: v2u64, c: v2u64) -> v2u64 { + msa_binsl_d(a, mem::transmute(b), c) +} + +/// Immediate Bit Insert Left +/// +/// Copy most significant (left) bits in each element of vector `b` (sixteen unsigned 8-bit integer numbers) +/// to elements in vector `a` (sixteen unsigned 8-bit integer numbers) while preserving the least significant (right) bits. +/// The number of bits to copy is given by the immediate `imm3` modulo the size of the element in bits plus 1. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(binsli.b, imm3 = 0b111))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_binsli_b(a: v16u8, b: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM3, 3); + msa_binsli_b(a, mem::transmute(b), IMM3) +} + +/// Immediate Bit Insert Left +/// +/// Copy most significant (left) bits in each element of vector `b` (eight unsigned 16-bit integer numbers) +/// to elements in vector `a` (eight unsigned 16-bit integer numbers) while preserving the least significant (right) bits. +/// The number of bits to copy is given by the immediate `imm4` modulo the size of the element in bits plus 1. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(binsli.h, imm4 = 0b1111))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_binsli_h(a: v8u16, b: v8u16) -> v8u16 { + static_assert_uimm_bits!(IMM4, 4); + msa_binsli_h(a, mem::transmute(b), IMM4) +} + +/// Immediate Bit Insert Left +/// +/// Copy most significant (left) bits in each element of vector `b` (four unsigned 32-bit integer numbers) +/// to elements in vector `a` (four unsigned 32-bit integer numbers) while preserving the least significant (right) bits. +/// The number of bits to copy is given by the immediate `imm5` modulo the size of the element in bits plus 1. 
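For the bit-insert-left family above, a per-lane sketch of the byte form: the `(c % 8) + 1` most significant bits come from `b`, the rest stay from `a` (the mask construction is an illustration derived from the description, not the hardware algorithm):

// One lane of binsl.b: take the top (c % 8) + 1 bits from `b`, keep the rest of `a`.
fn binsl_lane(a: u8, b: u8, c: u8) -> u8 {
    let k = (c % 8) + 1;                // number of bits to copy, 1..=8
    let mask = !((0xFFu16 >> k) as u8); // top k bits set; k = 8 copies the whole element
    (b & mask) | (a & !mask)
}

fn main() {
    assert_eq!(binsl_lane(0x0F, 0xA0, 3), 0xAF); // top 4 bits from b
}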
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsli.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_binsli_w<const IMM5: i32>(a: v4u32, b: v4u32) -> v4u32 {
+    static_assert_uimm_bits!(IMM5, 5);
+    msa_binsli_w(a, mem::transmute(b), IMM5)
+}
+
+/// Immediate Bit Insert Left
+///
+/// Copy most significant (left) bits in each element of vector `b` (two unsigned 64-bit integer numbers)
+/// to elements in vector `a` (two unsigned 64-bit integer numbers) while preserving the least significant (right) bits.
+/// The number of bits to copy is given by the immediate `imm6` modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsli.d, imm6 = 0b111111))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_binsli_d<const IMM6: i32>(a: v2u64, b: v2u64) -> v2u64 {
+    static_assert_uimm_bits!(IMM6, 6);
+    msa_binsli_d(a, mem::transmute(b), IMM6)
+}
+
+/// Vector Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (sixteen unsigned 8-bit integer numbers)
+/// to elements in vector `a` (sixteen unsigned 8-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the elements in vector `c` (sixteen unsigned 8-bit integer numbers)
+/// modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsr.b))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_binsr_b(a: v16u8, b: v16u8, c: v16u8) -> v16u8 {
+    msa_binsr_b(a, mem::transmute(b), c)
+}
+
+/// Vector Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (eight unsigned 16-bit integer numbers)
+/// to elements in vector `a` (eight unsigned 16-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the elements in vector `c` (eight unsigned 16-bit integer numbers)
+/// modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsr.h))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_binsr_h(a: v8u16, b: v8u16, c: v8u16) -> v8u16 {
+    msa_binsr_h(a, mem::transmute(b), c)
+}
+
+/// Vector Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (four unsigned 32-bit integer numbers)
+/// to elements in vector `a` (four unsigned 32-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the elements in vector `c` (four unsigned 32-bit integer numbers)
+/// modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsr.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_binsr_w(a: v4u32, b: v4u32, c: v4u32) -> v4u32 {
+    msa_binsr_w(a, mem::transmute(b), c)
+}
+
+/// Vector Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (two unsigned 64-bit integer numbers)
+/// to elements in vector `a` (two unsigned 64-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the elements in vector `c` (two unsigned 64-bit integer numbers)
+/// modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsr.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_binsr_d(a: v2u64, b: v2u64, c: v2u64) -> v2u64 {
+    msa_binsr_d(a, mem::transmute(b), c)
+}
+
+/// Immediate Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (sixteen unsigned 8-bit integer numbers)
+/// to elements in vector `a` (sixteen unsigned 8-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the immediate `imm3` modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsri.b, imm3 = 0b111))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_binsri_b<const IMM3: i32>(a: v16u8, b: v16u8) -> v16u8 {
+    static_assert_uimm_bits!(IMM3, 3);
+    msa_binsri_b(a, mem::transmute(b), IMM3)
+}
+
+/// Immediate Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (eight unsigned 16-bit integer numbers)
+/// to elements in vector `a` (eight unsigned 16-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the immediate `imm4` modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsri.h, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_binsri_h<const IMM4: i32>(a: v8u16, b: v8u16) -> v8u16 {
+    static_assert_uimm_bits!(IMM4, 4);
+    msa_binsri_h(a, mem::transmute(b), IMM4)
+}
+
+/// Immediate Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (four unsigned 32-bit integer numbers)
+/// to elements in vector `a` (four unsigned 32-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the immediate `imm5` modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsri.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_binsri_w<const IMM5: i32>(a: v4u32, b: v4u32) -> v4u32 {
+    static_assert_uimm_bits!(IMM5, 5);
+    msa_binsri_w(a, mem::transmute(b), IMM5)
+}
+
+/// Immediate Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (two unsigned 64-bit integer numbers)
+/// to elements in vector `a` (two unsigned 64-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the immediate `imm6` modulo the size of the element in bits plus 1.
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(binsri.d, imm6 = 0b111111))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_binsri_d(a: v2u64, b: v2u64) -> v2u64 { + static_assert_uimm_bits!(IMM6, 6); + msa_binsri_d(a, mem::transmute(b), IMM6) +} + +/// Vector Bit Move If Not Zero +/// +/// Copy to destination vector `a` (sixteen unsigned 8-bit integer numbers) all bits from source vector +/// `b` (sixteen unsigned 8-bit integer numbers) for which the corresponding bits from target vector `c` +/// (sixteen unsigned 8-bit integer numbers) are 1 and leaves unchanged all destination bits +/// for which the corresponding target bits are 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bmnz.v))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bmnz_v(a: v16u8, b: v16u8, c: v16u8) -> v16u8 { + msa_bmnz_v(a, mem::transmute(b), c) +} + +/// Immediate Bit Move If Not Zero +/// +/// Copy to destination vector `a` (sixteen unsigned 8-bit integer numbers) all bits from source vector +/// `b` (sixteen unsigned 8-bit integer numbers) for which the corresponding bits from immediate `imm8` +/// are 1 and leaves unchanged all destination bits for which the corresponding target bits are 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bmnzi.b, imm8 = 0b11111111))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bmnzi_b(a: v16u8, b: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM8, 8); + msa_bmnzi_b(a, mem::transmute(b), IMM8) +} + +/// Vector Bit Move If Zero +/// +/// Copy to destination vector `a` (sixteen unsigned 8-bit integer numbers) all bits from source vector +/// `b` (sixteen unsigned 8-bit integer numbers) for which the corresponding bits from target vector `c` +/// (sixteen unsigned 8-bit integer numbers) are 0 and leaves unchanged all destination bits +/// for which the corresponding target bits are 1. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bmz.v))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bmz_v(a: v16u8, b: v16u8, c: v16u8) -> v16u8 { + msa_bmz_v(a, mem::transmute(b), c) +} + +/// Immediate Bit Move If Zero +/// +/// Copy to destination vector `a` (sixteen unsigned 8-bit integer numbers) all bits from source vector +/// `b` (sixteen unsigned 8-bit integer numbers) for which the corresponding bits from immediate `imm8` +/// are 0 and leaves unchanged all destination bits for which the corresponding immediate bits are 1. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bmzi.b, imm8 = 0b11111111))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bmzi_b(a: v16u8, b: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM8, 8); + msa_bmzi_b(a, mem::transmute(b), IMM8) +} + +/// Vector Bit Negate +/// +/// Negate (complement) one bit in each element of vector `a` (sixteen unsigned 8-bit integer numbers). +/// The bit position is given by the elements in vector `b` (sixteen unsigned 8-bit integer numbers) +/// modulo the size of the element in bits. +/// The result is written to vector (sixteen unsigned 8-bit integer numbers). 
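The bit-move/select operations above are plain bitwise blends; per byte, `bmnz.v` behaves like this (a sketch):

// One lane of bmnz.v: where the mask bit in `c` is 1 take the bit from `b`, else keep `a`.
fn bmnz_lane(a: u8, b: u8, c: u8) -> u8 {
    (b & c) | (a & !c)
}

fn main() {
    assert_eq!(bmnz_lane(0b0000_0000, 0b1111_1111, 0b1010_1010), 0b1010_1010);
}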
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bneg.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bneg_b(a: v16u8, b: v16u8) -> v16u8 { + msa_bneg_b(a, mem::transmute(b)) +} + +/// Vector Bit Negate +/// +/// Negate (complement) one bit in each element of vector `a` (eight unsigned 16-bit integer numbers). +/// The bit position is given by the elements in vector `b` (eight unsigned 16-bit integer numbers) +/// modulo the size of the element in bits. +/// The result is written to vector (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bneg.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bneg_h(a: v8u16, b: v8u16) -> v8u16 { + msa_bneg_h(a, mem::transmute(b)) +} + +/// Vector Bit Negate +/// +/// Negate (complement) one bit in each element of vector `a` (four unsigned 32-bit integer numbers). +/// The bit position is given by the elements in vector `b` (four unsigned 32-bit integer numbers) +/// modulo the size of the element in bits. +/// The result is written to vector (four unsigned 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bneg.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bneg_w(a: v4u32, b: v4u32) -> v4u32 { + msa_bneg_w(a, mem::transmute(b)) +} + +/// Vector Bit Negate +/// +/// Negate (complement) one bit in each element of vector `a` (two unsigned 64-bit integer numbers). +/// The bit position is given by the elements in vector `b` (two unsigned 64-bit integer numbers) +/// modulo the size of the element in bits. +/// The result is written to vector (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bneg.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bneg_d(a: v2u64, b: v2u64) -> v2u64 { + msa_bneg_d(a, mem::transmute(b)) +} + +/// Immediate Bit Negate +/// +/// Negate (complement) one bit in each element of vector `a` (sixteen unsigned 8-bit integer numbers). +/// The bit position is given by immediate `imm3` modulo the size of the element in bits. +/// The result is written to vector (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bnegi.b, imm3 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bnegi_b(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM3, 3); + msa_bnegi_b(a, IMM3) +} + +/// Immediate Bit Negate +/// +/// Negate (complement) one bit in each element of vector `a` (eight unsigned 16-bit integer numbers). +/// The bit position is given by immediate `imm4` modulo the size of the element in bits. +/// The result is written to vector (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bnegi.h, imm4 = 0b1111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bnegi_h(a: v8u16) -> v8u16 { + static_assert_uimm_bits!(IMM4, 4); + msa_bnegi_h(a, IMM4) +} + +/// Immediate Bit Negate +/// +/// Negate (complement) one bit in each element of vector `a` (four unsigned 32-bit integer numbers). +/// The bit position is given by immediate `imm5` modulo the size of the element in bits. 
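Per lane, the bit-negate operation above is an XOR with a single bit selected modulo the element width (a sketch of the byte form):

// One lane of bneg.b: flip the bit selected by `b % 8`.
fn bneg_lane(a: u8, b: u8) -> u8 {
    a ^ (1u8 << (b % 8))
}

fn main() {
    assert_eq!(bneg_lane(0b0000_0001, 0), 0b0000_0000);
}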
+/// The result is written to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnegi.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_bnegi_w<const IMM5: i32>(a: v4u32) -> v4u32 {
+    static_assert_uimm_bits!(IMM5, 5);
+    msa_bnegi_w(a, IMM5)
+}
+
+/// Immediate Bit Negate
+///
+/// Negate (complement) one bit in each element of vector `a` (two unsigned 64-bit integer numbers).
+/// The bit position is given by immediate `imm6` modulo the size of the element in bits.
+/// The result is written to vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnegi.d, imm6 = 0b111111))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_bnegi_d<const IMM6: i32>(a: v2u64) -> v2u64 {
+    static_assert_uimm_bits!(IMM6, 6);
+    msa_bnegi_d(a, IMM6)
+}
+
+/// Immediate Branch If All Elements Are Not Zero
+///
+/// PC-relative branch if all elements in `a` (sixteen unsigned 8-bit integer numbers) are not zero.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnz.b))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_bnz_b(a: v16u8) -> i32 {
+    msa_bnz_b(a)
+}
+
+/// Immediate Branch If All Elements Are Not Zero
+///
+/// PC-relative branch if all elements in `a` (eight unsigned 16-bit integer numbers) are not zero.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnz.h))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_bnz_h(a: v8u16) -> i32 {
+    msa_bnz_h(a)
+}
+
+/// Immediate Branch If All Elements Are Not Zero
+///
+/// PC-relative branch if all elements in `a` (four unsigned 32-bit integer numbers) are not zero.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnz.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_bnz_w(a: v4u32) -> i32 {
+    msa_bnz_w(a)
+}
+
+/// Immediate Branch If All Elements Are Not Zero
+///
+/// PC-relative branch if all elements in `a` (two unsigned 64-bit integer numbers) are not zero.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnz.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_bnz_d(a: v2u64) -> i32 {
+    msa_bnz_d(a)
+}
+
+/// Immediate Branch If Not Zero (At Least One Element of Any Format Is Not Zero)
+///
+/// PC-relative branch if at least one bit in `a` (sixteen unsigned 8-bit integer numbers) is not zero,
+/// i.e. at least one element is not zero regardless of the data format.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnz.v))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_bnz_v(a: v16u8) -> i32 {
+    msa_bnz_v(a)
+}
+
+/// Vector Bit Select
+///
+/// Selectively copy bits from the source vectors `b` (sixteen unsigned 8-bit integer numbers)
+/// and `c` (sixteen unsigned 8-bit integer numbers)
+/// into destination vector `a` (sixteen unsigned 8-bit integer numbers) based on the corresponding bit in `a`:
+/// if 0 copies the bit from `b`, if 1 copies the bit from `c`.
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bsel.v))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bsel_v(a: v16u8, b: v16u8, c: v16u8) -> v16u8 { + msa_bsel_v(a, mem::transmute(b), c) +} + +/// Immediate Bit Select +/// +/// Selectively copy bits from the 8-bit immediate `imm8` and `c` (eight unsigned 16-bit integer numbers) +/// into destination vector `a` (eight unsigned 16-bit integer numbers) based on the corresponding bit in `a`: +/// if 0 copies the bit from `b`, if 1 copies the bit from `c`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bseli.b, imm8 = 0b11111111))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bseli_b(a: v16u8, b: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM8, 8); + msa_bseli_b(a, mem::transmute(b), IMM8) +} + +/// Vector Bit Set +/// +/// Set to 1 one bit in each element of vector `a` (sixteen unsigned 8-bit integer numbers). +/// The bit position is given by the elements in vector `b` (sixteen unsigned 8-bit integer numbers) +/// modulo the size of the element in bits. +/// The result is written to vector (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bset.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bset_b(a: v16u8, b: v16u8) -> v16u8 { + msa_bset_b(a, mem::transmute(b)) +} + +/// Vector Bit Set +/// +/// Set to 1 one bit in each element of vector `a` (eight unsigned 16-bit integer numbers). +/// The bit position is given by the elements in vector `b` (eight unsigned 16-bit integer numbers) +/// modulo the size of the element in bits. +/// The result is written to vector (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bset.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bset_h(a: v8u16, b: v8u16) -> v8u16 { + msa_bset_h(a, mem::transmute(b)) +} + +/// Vector Bit Set +/// +/// Set to 1 one bit in each element of vector `a` (four unsigned 32-bit integer numbers). +/// The bit position is given by the elements in vector `b` (four unsigned 32-bit integer numbers) +/// modulo the size of the element in bits. +/// The result is written to vector (four unsigned 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bset.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bset_w(a: v4u32, b: v4u32) -> v4u32 { + msa_bset_w(a, mem::transmute(b)) +} + +/// Vector Bit Set +/// +/// Set to 1 one bit in each element of vector `a` (two unsigned 64-bit integer numbers). +/// The bit position is given by the elements in vector `b` (two unsigned 64-bit integer numbers) +/// modulo the size of the element in bits. +/// The result is written to vector (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bset.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bset_d(a: v2u64, b: v2u64) -> v2u64 { + msa_bset_d(a, mem::transmute(b)) +} + +/// Immediate Bit Set +/// +/// Set to 1 one bit in each element of vector `a` (sixteen unsigned 8-bit integer numbers). +/// The bit position is given by immediate `imm3`. +/// The result is written to vector `a` (sixteen unsigned 8-bit integer numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bseti.b, imm3 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bseti_b(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM3, 3); + msa_bseti_b(a, IMM3) +} + +/// Immediate Bit Set +/// +/// Set to 1 one bit in each element of vector `a` (eight unsigned 16-bit integer numbers). +/// The bit position is given by immediate `imm4`. +/// The result is written to vector `a` (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bseti.h, imm4 = 0b1111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bseti_h(a: v8u16) -> v8u16 { + static_assert_uimm_bits!(IMM4, 4); + msa_bseti_h(a, IMM4) +} + +/// Immediate Bit Set +/// +/// Set to 1 one bit in each element of vector `a` (four unsigned 32-bit integer numbers). +/// The bit position is given by immediate `imm5`. +/// The result is written to vector `a` (four unsigned 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bseti.w, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bseti_w(a: v4u32) -> v4u32 { + static_assert_uimm_bits!(IMM5, 5); + msa_bseti_w(a, IMM5) +} + +/// Immediate Bit Set +/// +/// Set to 1 one bit in each element of vector `a` (two unsigned 64-bit integer numbers). +/// The bit position is given by immediate `imm6`. +/// The result is written to vector `a` (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bseti.d, imm6 = 0b111111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bseti_d(a: v2u64) -> v2u64 { + static_assert_uimm_bits!(IMM6, 6); + msa_bseti_d(a, IMM6) +} + +/// Immediate Branch If At Least One Element Is Zero +/// +/// PC-relative branch if at least one element in `a` (sixteen unsigned 8-bit integer numbers) is zero. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bz.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bz_b(a: v16u8) -> i32 { + msa_bz_b(a) +} + +/// Immediate Branch If At Least One Element Is Zero +/// +/// PC-relative branch if at least one element in `a` (eight unsigned 16-bit integer numbers) is zero. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bz.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bz_h(a: v8u16) -> i32 { + msa_bz_h(a) +} + +/// Immediate Branch If At Least One Element Is Zero +/// +/// PC-relative branch if at least one element in `a` (four unsigned 32-bit integer numbers) is zero. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bz.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bz_w(a: v4u32) -> i32 { + msa_bz_w(a) +} + +/// Immediate Branch If At Least One Element Is Zero +/// +/// PC-relative branch if at least one element in `a` (two unsigned 64-bit integer numbers) is zero. 
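Editorial note, not part of the patch: a minimal usage sketch of the immediate bit-set and branch-condition intrinsics added above. It assumes a nightly toolchain with `#![feature(stdarch_mips)]`, a MIPS target with MSA available, the usual `core::arch::mips` re-export, and that the `bz`/`bnz` intrinsics return the branch condition as a 0/1 integer; the function name and the array-transmute construction of `v16u8` are illustrative only.

```rust
#[cfg(target_arch = "mips")]
#[target_feature(enable = "msa")]
unsafe fn bseti_and_bz_demo(bytes: [u8; 16]) -> ([u8; 16], bool) {
    use core::arch::mips::*;
    // Build a 16-byte MSA vector from a plain byte array.
    let v: v16u8 = core::mem::transmute(bytes);
    // The bit position is a const generic (legacy-const-generics form),
    // so this sets bit 2 in every byte lane.
    let with_bit2: v16u8 = __msa_bseti_b::<2>(v);
    // `__msa_bz_b` evaluates "at least one element is zero" on the input.
    let any_zero_lane = __msa_bz_b(v) != 0;
    (core::mem::transmute(with_bit2), any_zero_lane)
}
```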
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bz.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bz_d(a: v2u64) -> i32 { + msa_bz_d(a) +} + +/// Immediate Branch If Zero (All Elements of Any Format Are Zero) +/// +/// PC-relative branch if all elements in `a` (sixteen unsigned 8-bit integer numbers) bits are zero, +/// i.e. all elements are zero regardless of the data format. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(bz.v))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_bz_v(a: v16u8) -> i32 { + msa_bz_v(a) +} + +/// Vector Compare Equal +/// +/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements +/// if the corresponding `a` (sixteen signed 8-bit integer numbers) and `b` (sixteen signed 8-bit integer numbers) +/// elements are equal, otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ceq.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ceq_b(a: v16i8, b: v16i8) -> v16i8 { + msa_ceq_b(a, mem::transmute(b)) +} + +/// Vector Compare Equal +/// +/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements +/// if the corresponding `a` (eight signed 16-bit integer numbers) and `b` (eight signed 16-bit integer numbers) +/// elements are equal, otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ceq.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ceq_h(a: v8i16, b: v8i16) -> v8i16 { + msa_ceq_h(a, mem::transmute(b)) +} + +/// Vector Compare Equal +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements +/// if the corresponding `a` (four signed 32-bit integer numbers) and `b` (four signed 32-bit integer numbers) +/// elements are equal, otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ceq.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ceq_w(a: v4i32, b: v4i32) -> v4i32 { + msa_ceq_w(a, mem::transmute(b)) +} + +/// Vector Compare Equal +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements +/// if the corresponding `a` (two signed 64-bit integer numbers) and `b` (two signed 64-bit integer numbers) +/// elements are equal, otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ceq.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ceq_d(a: v2i64, b: v2i64) -> v2i64 { + msa_ceq_d(a, mem::transmute(b)) +} + +/// Immediate Compare Equal +/// +/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements +/// if the corresponding `a` (sixteen signed 8-bit integer numbers) the 5-bit signed immediate imm_s5 +/// are equal, otherwise set all bits to 0. 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ceqi.b, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ceqi_b(a: v16i8) -> v16i8 { + static_assert_simm_bits!(IMM_S5, 5); + msa_ceqi_b(a, IMM_S5) +} + +/// Immediate Compare Equal +/// +/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements +/// if the corresponding `a` (eight signed 16-bit integer numbers) the 5-bit signed immediate imm_s5 +/// are equal, otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ceqi.h, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ceqi_h(a: v8i16) -> v8i16 { + static_assert_simm_bits!(IMM_S5, 5); + msa_ceqi_h(a, IMM_S5) +} + +/// Immediate Compare Equal +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements +/// if the corresponding `a` (four signed 32-bit integer numbers) the 5-bit signed immediate imm_s5 +/// are equal, otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ceqi.w, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ceqi_w(a: v4i32) -> v4i32 { + static_assert_simm_bits!(IMM_S5, 5); + msa_ceqi_w(a, IMM_S5) +} + +/// Immediate Compare Equal +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements +/// if the corresponding `a` (two signed 64-bit integer numbers) the 5-bit signed immediate imm_s5 +/// are equal, otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ceqi.d, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ceqi_d(a: v2i64) -> v2i64 { + static_assert_simm_bits!(IMM_S5, 5); + msa_ceqi_d(a, IMM_S5) +} + +/// GPR Copy from MSA Control Register +/// +/// The sign extended content of MSA control register cs is copied to GPR rd. +/// +/// Can not be tested in user mode +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(cfcmsa, imm5 = 0b11111))] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_cfcmsa() -> i32 { + static_assert_uimm_bits!(IMM5, 5); + msa_cfcmsa(IMM5) +} + +/// Vector Compare Signed Less Than or Equal +/// +/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements +/// if the corresponding `a` (sixteen signed 8-bit integer numbers) element +/// are signed less than or equal to `b` (sixteen signed 8-bit integer numbers) element. +/// Otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(cle_s.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_cle_s_b(a: v16i8, b: v16i8) -> v16i8 { + msa_cle_s_b(a, mem::transmute(b)) +} + +/// Vector Compare Signed Less Than or Equal +/// +/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements +/// if the corresponding `a` (eight signed 16-bit integer numbers) element +/// are signed less than or equal to `b` (eight signed 16-bit integer numbers) element. +/// Otherwise set all bits to 0. 
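Editorial sketch (not part of the diff): the compare intrinsics above produce per-lane masks, all ones on a match and all zeros otherwise, which can then drive `__msa_bsel_v` or further mask arithmetic. Assumptions as before (nightly, `stdarch_mips` feature, `core::arch::mips` path); the helper name is hypothetical.

```rust
#[cfg(target_arch = "mips")]
#[target_feature(enable = "msa")]
unsafe fn equal_masks(a: [i8; 16], b: [i8; 16]) -> ([i8; 16], [i8; 16]) {
    use core::arch::mips::*;
    let (va, vb): (v16i8, v16i8) = (core::mem::transmute(a), core::mem::transmute(b));
    // Lane is all ones (-1) where va[i] == vb[i], all zeros otherwise.
    let eq = __msa_ceq_b(va, vb);
    // Immediate form: compare each lane of `va` against a constant that must
    // fit in the 5-bit signed immediate (here 0).
    let eq_zero = __msa_ceqi_b::<0>(va);
    (core::mem::transmute(eq), core::mem::transmute(eq_zero))
}
```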
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(cle_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_cle_s_h(a: v8i16, b: v8i16) -> v8i16 { + msa_cle_s_h(a, mem::transmute(b)) +} + +/// Vector Compare Signed Less Than or Equal +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements +/// if the corresponding `a` (four signed 32-bit integer numbers) element +/// are signed less than or equal to `b` (four signed 32-bit integer numbers) element. +/// Otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(cle_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_cle_s_w(a: v4i32, b: v4i32) -> v4i32 { + msa_cle_s_w(a, mem::transmute(b)) +} + +/// Vector Compare Signed Less Than or Equal +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements +/// if the corresponding `a` (two signed 64-bit integer numbers) element +/// are signed less than or equal to `b` (two signed 64-bit integer numbers) element. +/// Otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(cle_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_cle_s_d(a: v2i64, b: v2i64) -> v2i64 { + msa_cle_s_d(a, mem::transmute(b)) +} + +/// Vector Compare Unsigned Less Than or Equal +/// +/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements +/// if the corresponding `a` (sixteen unsigned 8-bit integer numbers) element +/// are unsigned less than or equal to `b` (sixteen unsigned 8-bit integer numbers) element. +/// Otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(cle_u.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_cle_u_b(a: v16u8, b: v16u8) -> v16i8 { + msa_cle_u_b(a, mem::transmute(b)) +} + +/// Vector Compare Unsigned Less Than or Equal +/// +/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements +/// if the corresponding `a` (eight unsigned 16-bit integer numbers) element +/// are unsigned less than or equal to `b` (eight unsigned 16-bit integer numbers) element. +/// Otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(cle_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_cle_u_h(a: v8u16, b: v8u16) -> v8i16 { + msa_cle_u_h(a, mem::transmute(b)) +} + +/// Vector Compare Unsigned Less Than or Equal +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements +/// if the corresponding `a` (four unsigned 32-bit integer numbers) element +/// are unsigned less than or equal to `b` (four unsigned 32-bit integer numbers) element. +/// Otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(cle_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_cle_u_w(a: v4u32, b: v4u32) -> v4i32 { + msa_cle_u_w(a, mem::transmute(b)) +} + +/// Vector Compare Unsigned Less Than or Equal +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements +/// if the corresponding `a` (two unsigned 64-bit integer numbers) element +/// are unsigned less than or equal to `b` (two unsigned 64-bit integer numbers) element. +/// Otherwise set all bits to 0. 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(cle_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_cle_u_d(a: v2u64, b: v2u64) -> v2i64 { + msa_cle_u_d(a, mem::transmute(b)) +} + +/// Immediate Compare Signed Less Than or Equal +/// +/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements +/// if the corresponding `a` (sixteen signed 8-bit integer numbers) element +/// is less than or equal to the 5-bit signed immediate imm_s5, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clei_s.b, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clei_s_b(a: v16i8) -> v16i8 { + static_assert_simm_bits!(IMM_S5, 5); + msa_clei_s_b(a, IMM_S5) +} + +/// Immediate Compare Signed Less Than or Equal +/// +/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements +/// if the corresponding `a` (eight signed 16-bit integer numbers) element +/// is less than or equal to the 5-bit signed immediate imm_s5, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clei_s.h, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clei_s_h(a: v8i16) -> v8i16 { + static_assert_simm_bits!(IMM_S5, 5); + msa_clei_s_h(a, IMM_S5) +} + +/// Immediate Compare Signed Less Than or Equal +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements +/// if the corresponding `a` (four signed 32-bit integer numbers) element +/// is less than or equal to the 5-bit signed immediate imm_s5, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clei_s.w, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clei_s_w(a: v4i32) -> v4i32 { + static_assert_simm_bits!(IMM_S5, 5); + msa_clei_s_w(a, IMM_S5) +} + +/// Immediate Compare Signed Less Than or Equal +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements +/// if the corresponding `a` (two signed 64-bit integer numbers) element +/// is less than or equal to the 5-bit signed immediate imm_s5, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clei_s.d, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clei_s_d(a: v2i64) -> v2i64 { + static_assert_simm_bits!(IMM_S5, 5); + msa_clei_s_d(a, IMM_S5) +} + +/// Immediate Compare Unsigned Less Than or Equal +/// +/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements +/// if the corresponding `a` (sixteen unsigned 8-bit integer numbers) element +/// is unsigned less than or equal to the 5-bit unsigned immediate `imm5`, +/// otherwise set all bits to 0. 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clei_u.b, imm5 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clei_u_b(a: v16u8) -> v16i8 { + static_assert_uimm_bits!(IMM5, 5); + msa_clei_u_b(a, IMM5) +} + +/// Immediate Compare Unsigned Less Than or Equal +/// +/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements +/// if the corresponding `a` (eight unsigned 16-bit integer numbers) element +/// is unsigned less than or equal to the 5-bit unsigned immediate `imm5`, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clei_u.h, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clei_u_h(a: v8u16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + msa_clei_u_h(a, IMM5) +} + +/// Immediate Compare Unsigned Less Than or Equal +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements +/// if the corresponding `a` (four unsigned 32-bit integer numbers) element +/// is unsigned less than or equal to the 5-bit unsigned immediate `imm5`, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clei_u.w, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clei_u_w(a: v4u32) -> v4i32 { + static_assert_uimm_bits!(IMM5, 5); + msa_clei_u_w(a, IMM5) +} + +/// Immediate Compare Unsigned Less Than or Equal +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements +/// if the corresponding `a` (two unsigned 64-bit integer numbers) element +/// is unsigned less than or equal to the 5-bit unsigned immediate `imm5`, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clei_u.d, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clei_u_d(a: v2u64) -> v2i64 { + static_assert_uimm_bits!(IMM5, 5); + msa_clei_u_d(a, IMM5) +} + +/// Vector Compare Signed Less Than +/// +/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements +/// if the corresponding `a` (sixteen signed 8-bit integer numbers) element +/// are signed less than `b` (sixteen signed 8-bit integer numbers) element. +/// Otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clt_s.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clt_s_b(a: v16i8, b: v16i8) -> v16i8 { + msa_clt_s_b(a, mem::transmute(b)) +} + +/// Vector Compare Signed Less Than +/// +/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements +/// if the corresponding `a` (eight signed 16-bit integer numbers) element +/// are signed less than `b` (eight signed 16-bit integer numbers) element. +/// Otherwise set all bits to 0. 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clt_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clt_s_h(a: v8i16, b: v8i16) -> v8i16 { + msa_clt_s_h(a, mem::transmute(b)) +} + +/// Vector Compare Signed Less Than +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements +/// if the corresponding `a` (four signed 32-bit integer numbers) element +/// are signed less than `b` (four signed 32-bit integer numbers) element. +/// Otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clt_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clt_s_w(a: v4i32, b: v4i32) -> v4i32 { + msa_clt_s_w(a, mem::transmute(b)) +} + +/// Vector Compare Signed Less Than +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements +/// if the corresponding `a` (two signed 64-bit integer numbers) element +/// are signed less than `b` (two signed 64-bit integer numbers) element. +/// Otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clt_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clt_s_d(a: v2i64, b: v2i64) -> v2i64 { + msa_clt_s_d(a, mem::transmute(b)) +} + +/// Vector Compare Unsigned Less Than +/// +/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements +/// if the corresponding `a` (sixteen unsigned 8-bit integer numbers) element +/// are unsigned less than `b` (sixteen unsigned 8-bit integer numbers) element. +/// Otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clt_u.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clt_u_b(a: v16u8, b: v16u8) -> v16i8 { + msa_clt_u_b(a, mem::transmute(b)) +} + +/// Vector Compare Unsigned Less Than +/// +/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements +/// if the corresponding `a` (eight unsigned 16-bit integer numbers) element +/// are unsigned less than `b` (eight unsigned 16-bit integer numbers) element. +/// Otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clt_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clt_u_h(a: v8u16, b: v8u16) -> v8i16 { + msa_clt_u_h(a, mem::transmute(b)) +} + +/// Vector Compare Unsigned Less Than +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements +/// if the corresponding `a` (four unsigned 32-bit integer numbers) element +/// are unsigned less than `b` (four unsigned 32-bit integer numbers) element. +/// Otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clt_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clt_u_w(a: v4u32, b: v4u32) -> v4i32 { + msa_clt_u_w(a, mem::transmute(b)) +} + +/// Vector Compare Unsigned Less Than +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements +/// if the corresponding `a` (two unsigned 64-bit integer numbers) element +/// are unsigned less than `b` (two unsigned 64-bit integer numbers) element. +/// Otherwise set all bits to 0. 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clt_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clt_u_d(a: v2u64, b: v2u64) -> v2i64 { + msa_clt_u_d(a, mem::transmute(b)) +} + +/// Immediate Compare Signed Less Than +/// +/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements +/// if the corresponding `a` (sixteen signed 8-bit integer numbers) element +/// is less than the 5-bit signed immediate imm_s5, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clti_s.b, imm_s5 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clti_s_b(a: v16i8) -> v16i8 { + static_assert_simm_bits!(IMM_S5, 5); + msa_clti_s_b(a, IMM_S5) +} + +/// Immediate Compare Signed Less Than +/// +/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements +/// if the corresponding `a` (eight signed 16-bit integer numbers) element +/// is less than the 5-bit signed immediate imm_s5, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clti_s.h, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clti_s_h(a: v8i16) -> v8i16 { + static_assert_simm_bits!(IMM_S5, 5); + msa_clti_s_h(a, IMM_S5) +} + +/// Immediate Compare Signed Less Than +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements +/// if the corresponding `a` (four signed 32-bit integer numbers) element +/// is less than the 5-bit signed immediate imm_s5, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clti_s.w, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clti_s_w(a: v4i32) -> v4i32 { + static_assert_simm_bits!(IMM_S5, 5); + msa_clti_s_w(a, IMM_S5) +} + +/// Immediate Compare Signed Less Than +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements +/// if the corresponding `a` (two signed 64-bit integer numbers) element +/// is less than the 5-bit signed immediate imm_s5, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clti_s.d, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clti_s_d(a: v2i64) -> v2i64 { + static_assert_simm_bits!(IMM_S5, 5); + msa_clti_s_d(a, IMM_S5) +} + +/// Immediate Compare Unsigned Less Than +/// +/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements +/// if the corresponding `a` (sixteen unsigned 8-bit integer numbers) element +/// is unsigned less than the 5-bit unsigned immediate `imm5`, +/// otherwise set all bits to 0. 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clti_u.b, imm5 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clti_u_b(a: v16u8) -> v16i8 { + static_assert_uimm_bits!(IMM5, 5); + msa_clti_u_b(a, IMM5) +} + +/// Immediate Compare Unsigned Less Than +/// +/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements +/// if the corresponding `a` (eight unsigned 16-bit integer numbers) element +/// is unsigned less than the 5-bit unsigned immediate `imm5`, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clti_u.h, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clti_u_h(a: v8u16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + msa_clti_u_h(a, IMM5) +} + +/// Immediate Compare Unsigned Less Than +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements +/// if the corresponding `a` (four unsigned 32-bit integer numbers) element +/// is unsigned less than the 5-bit unsigned immediate `imm5`, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clti_u.w, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clti_u_w(a: v4u32) -> v4i32 { + static_assert_uimm_bits!(IMM5, 5); + msa_clti_u_w(a, IMM5) +} + +/// Immediate Compare Unsigned Less Than +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements +/// if the corresponding `a` (two unsigned 64-bit integer numbers) element +/// is unsigned less than the 5-bit unsigned immediate `imm5`, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(clti_u.d, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_clti_u_d(a: v2u64) -> v2i64 { + static_assert_uimm_bits!(IMM5, 5); + msa_clti_u_d(a, IMM5) +} + +/// Element Copy to GPR Signed +/// +/// Sign-extend element `imm4` of vector `a` (sixteen signed 8-bit integer numbers) +/// and copy the result to GPR rd. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(copy_s.b, imm4 = 0b1111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_copy_s_b(a: v16i8) -> i32 { + static_assert_uimm_bits!(IMM4, 4); + msa_copy_s_b(a, IMM4) +} + +/// Element Copy to GPR Signed +/// +/// Sign-extend element `imm3` of vector `a` (eight signed 16-bit integer numbers) +/// and copy the result to GPR rd. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(copy_s.h, imm3 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_copy_s_h(a: v8i16) -> i32 { + static_assert_uimm_bits!(IMM3, 3); + msa_copy_s_h(a, IMM3) +} + +/// Element Copy to GPR Signed +/// +/// Sign-extend element `imm2` of vector `a` (four signed 32-bit integer numbers) +/// and copy the result to GPR rd. 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(copy_s.w, imm2 = 0b11))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_copy_s_w(a: v4i32) -> i32 { + static_assert_uimm_bits!(IMM2, 2); + msa_copy_s_w(a, IMM2) +} + +/// Element Copy to GPR Signed +/// +/// Sign-extend element `imm1` of vector `a` (two signed 64-bit integer numbers) +/// and copy the result to GPR rd. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(copy_s.d, imm1 = 0b1))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_copy_s_d(a: v2i64) -> i64 { + static_assert_uimm_bits!(IMM1, 1); + msa_copy_s_d(a, IMM1) +} + +/// Element Copy to GPR Unsigned +/// +/// Zero-extend element `imm4` of vector `a` (sixteen signed 8-bit integer numbers) +/// and copy the result to GPR rd. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(copy_u.b, imm4 = 0b1111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_copy_u_b(a: v16i8) -> u32 { + static_assert_uimm_bits!(IMM4, 4); + msa_copy_u_b(a, IMM4) +} + +/// Element Copy to GPR Unsigned +/// +/// Zero-extend element `imm3` of vector `a` (eight signed 16-bit integer numbers) +/// and copy the result to GPR rd. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(copy_u.h, imm3 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_copy_u_h(a: v8i16) -> u32 { + static_assert_uimm_bits!(IMM3, 3); + msa_copy_u_h(a, IMM3) +} + +/// Element Copy to GPR Unsigned +/// +/// Zero-extend element `imm2` of vector `a` (four signed 32-bit integer numbers) +/// and copy the result to GPR rd. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(copy_u.w, imm2 = 0b11))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_copy_u_w(a: v4i32) -> u32 { + static_assert_uimm_bits!(IMM2, 2); + msa_copy_u_w(a, IMM2) +} + +/// Element Copy to GPR Unsigned +/// +/// Zero-extend element `imm1` of vector `a` (two signed 64-bit integer numbers) +/// and copy the result to GPR rd. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(copy_u.d, imm1 = 0b1))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_copy_u_d(a: v2i64) -> u64 { + static_assert_uimm_bits!(IMM1, 1); + msa_copy_u_d(a, IMM1) +} + +/// GPR Copy to MSA Control Register +/// +/// The content of the least significant 31 bits of GPR `imm1` is copied to +/// MSA control register cd. +/// +/// Can not be tested in user mode +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ctcmsa, imm1 = 0b1))] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ctcmsa(a: i32) -> () { + static_assert_uimm_bits!(IMM5, 5); + msa_ctcmsa(IMM5, a) +} + +/// Vector Signed Divide +/// +/// The signed integer elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are divided by signed integer elements in vector `b` (sixteen signed 8-bit integer numbers). +/// The result is written to vector (sixteen signed 8-bit integer numbers). 
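Editorial sketch (not part of the diff): the `copy_*` intrinsics above move a single lane into a general-purpose register, with the element index supplied as a const generic that is range-checked at compile time (2 bits for a four-element vector). Same toolchain assumptions as the earlier sketches; the function name is illustrative.

```rust
#[cfg(target_arch = "mips")]
#[target_feature(enable = "msa")]
unsafe fn first_and_last(words: [i32; 4]) -> (i32, u32) {
    use core::arch::mips::*;
    let v: v4i32 = core::mem::transmute(words);
    // Sign-extended copy of element 0, zero-extended copy of element 3.
    (__msa_copy_s_w::<0>(v), __msa_copy_u_w::<3>(v))
}
```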
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(div_s.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_div_s_b(a: v16i8, b: v16i8) -> v16i8 { + msa_div_s_b(a, mem::transmute(b)) +} + +/// Vector Signed Divide +/// +/// The signed integer elements in vector `a` (eight signed 16-bit integer numbers) +/// are divided by signed integer elements in vector `b` (eight signed 16-bit integer numbers). +/// The result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(div_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_div_s_h(a: v8i16, b: v8i16) -> v8i16 { + msa_div_s_h(a, mem::transmute(b)) +} + +/// Vector Signed Divide +/// +/// The signed integer elements in vector `a` (four signed 32-bit integer numbers) +/// are divided by signed integer elements in vector `b` (four signed 32-bit integer numbers). +/// The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(div_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_div_s_w(a: v4i32, b: v4i32) -> v4i32 { + msa_div_s_w(a, mem::transmute(b)) +} + +/// Vector Signed Divide +/// +/// The signed integer elements in vector `a` (two signed 64-bit integer numbers) +/// are divided by signed integer elements in vector `b` (two signed 64-bit integer numbers). +/// The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(div_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_div_s_d(a: v2i64, b: v2i64) -> v2i64 { + msa_div_s_d(a, mem::transmute(b)) +} + +/// Vector Unsigned Divide +/// +/// The unsigned integer elements in vector `a` (sixteen unsigned 8-bit integer numbers) +/// are divided by unsigned integer elements in vector `b` (sixteen unsigned 8-bit integer numbers). +/// The result is written to vector (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(div_u.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_div_u_b(a: v16u8, b: v16u8) -> v16u8 { + msa_div_u_b(a, mem::transmute(b)) +} + +/// Vector Unsigned Divide +/// +/// The unsigned integer elements in vector `a` (eight unsigned 16-bit integer numbers) +/// are divided by unsigned integer elements in vector `b` (eight unsigned 16-bit integer numbers). +/// The result is written to vector (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(div_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_div_u_h(a: v8u16, b: v8u16) -> v8u16 { + msa_div_u_h(a, mem::transmute(b)) +} + +/// Vector Unsigned Divide +/// +/// The unsigned integer elements in vector `a` (four unsigned 32-bit integer numbers) +/// are divided by unsigned integer elements in vector `b` (four unsigned 32-bit integer numbers). +/// The result is written to vector (four unsigned 32-bit integer numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(div_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_div_u_w(a: v4u32, b: v4u32) -> v4u32 { + msa_div_u_w(a, mem::transmute(b)) +} + +/// Vector Unsigned Divide +/// +/// The unsigned integer elements in vector `a` (two unsigned 64-bit integer numbers) +/// are divided by unsigned integer elements in vector `b` (two unsigned 64-bit integer numbers). +/// The result is written to vector (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(div_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_div_u_d(a: v2u64, b: v2u64) -> v2u64 { + msa_div_u_d(a, mem::transmute(b)) +} + +/// Vector Signed Dot Product +/// +/// The signed integer elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are multiplied by signed integer elements in vector `b` (sixteen signed 8-bit integer numbers) +/// producing a result the size of the input operands. The multiplication results of +/// adjacent odd/even elements are added and stored to the destination +/// vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dotp_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dotp_s_h(a: v16i8, b: v16i8) -> v8i16 { + msa_dotp_s_h(a, mem::transmute(b)) +} + +/// Vector Signed Dot Product +/// +/// The signed integer elements in vector `a` (eight signed 16-bit integer numbers) +/// are multiplied by signed integer elements in vector `b` (eight signed 16-bit integer numbers) +/// producing a result the size of the input operands. The multiplication results of +/// adjacent odd/even elements are added and stored to the destination +/// vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dotp_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dotp_s_w(a: v8i16, b: v8i16) -> v4i32 { + msa_dotp_s_w(a, mem::transmute(b)) +} + +/// Vector Signed Dot Product +/// +/// The signed integer elements in vector `a` (four signed 32-bit integer numbers) +/// are multiplied by signed integer elements in vector `b` (four signed 32-bit integer numbers) +/// producing a result the size of the input operands. The multiplication results of +/// adjacent odd/even elements are added and stored to the destination +/// vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dotp_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dotp_s_d(a: v4i32, b: v4i32) -> v2i64 { + msa_dotp_s_d(a, mem::transmute(b)) +} + +/// Vector Unsigned Dot Product +/// +/// The unsigned integer elements in vector `a` (sixteen unsigned 8-bit integer numbers) +/// are multiplied by unsigned integer elements in vector `b` (sixteen unsigned 8-bit integer numbers) +/// producing a result the size of the input operands. The multiplication results of +/// adjacent odd/even elements are added and stored to the destination +/// vector (eight unsigned 16-bit integer numbers). 
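Editorial sketch (not part of the diff): the dot-product intrinsics are widening, so sixteen 8-bit lanes pair up into eight 16-bit results. A minimal wrapper under the same assumptions as above:

```rust
#[cfg(target_arch = "mips")]
#[target_feature(enable = "msa")]
unsafe fn widening_dot(a: [i8; 16], b: [i8; 16]) -> [i16; 8] {
    use core::arch::mips::*;
    let (va, vb): (v16i8, v16i8) = (core::mem::transmute(a), core::mem::transmute(b));
    // Each i16 result lane holds a[2i]*b[2i] + a[2i+1]*b[2i+1].
    core::mem::transmute(__msa_dotp_s_h(va, vb))
}
```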
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dotp_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dotp_u_h(a: v16u8, b: v16u8) -> v8u16 { + msa_dotp_u_h(a, mem::transmute(b)) +} + +/// Vector Unsigned Dot Product +/// +/// The unsigned integer elements in vector `a` (eight unsigned 16-bit integer numbers) +/// are multiplied by unsigned integer elements in vector `b` (eight unsigned 16-bit integer numbers) +/// producing a result the size of the input operands. The multiplication results of +/// adjacent odd/even elements are added and stored to the destination +/// vector (four unsigned 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dotp_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dotp_u_w(a: v8u16, b: v8u16) -> v4u32 { + msa_dotp_u_w(a, mem::transmute(b)) +} + +/// Vector Unsigned Dot Product +/// +/// The unsigned integer elements in vector `a` (four unsigned 32-bit integer numbers) +/// are multiplied by unsigned integer elements in vector `b` (four unsigned 32-bit integer numbers) +/// producing a result the size of the input operands. The multiplication results of +/// adjacent odd/even elements are added and stored to the destination +/// vector (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dotp_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dotp_u_d(a: v4u32, b: v4u32) -> v2u64 { + msa_dotp_u_d(a, mem::transmute(b)) +} + +/// Vector Signed Dot Product and Add +/// +/// The signed integer elements in vector `b` (sixteen signed 8-bit integer numbers) +/// are multiplied by signed integer elements in vector `c` (sixteen signed 8-bit integer numbers) +/// producing a result twice the size of the input operands. The multiplication results +/// of adjacent odd/even elements are added to the vector `a` (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dpadd_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dpadd_s_h(a: v8i16, b: v16i8, c: v16i8) -> v8i16 { + msa_dpadd_s_h(a, mem::transmute(b), c) +} + +/// Vector Signed Dot Product and Add +/// +/// The signed integer elements in vector `b` (eight signed 16-bit integer numbers) +/// are multiplied by signed integer elements in vector `c` (eight signed 16-bit integer numbers) +/// producing a result twice the size of the input operands. The multiplication results +/// of adjacent odd/even elements are added to the vector `a` (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dpadd_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dpadd_s_w(a: v4i32, b: v8i16, c: v8i16) -> v4i32 { + msa_dpadd_s_w(a, mem::transmute(b), c) +} + +/// Vector Signed Dot Product and Add +/// +/// The signed integer elements in vector `b` (four signed 32-bit integer numbers) +/// are multiplied by signed integer elements in vector `c` (four signed 32-bit integer numbers) +/// producing a result twice the size of the input operands. The multiplication results +/// of adjacent odd/even elements are added to the vector `a` (two signed 64-bit integer numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dpadd_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dpadd_s_d(a: v2i64, b: v4i32, c: v4i32) -> v2i64 { + msa_dpadd_s_d(a, mem::transmute(b), c) +} + +/// Vector Unsigned Dot Product and Add +/// +/// The unsigned integer elements in vector `b` (sixteen unsigned 8-bit integer numbers) +/// are multiplied by unsigned integer elements in vector `c` (sixteen unsigned 8-bit integer numbers) +/// producing a result twice the size of the input operands. The multiplication results +/// of adjacent odd/even elements are added to the vector `a` (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dpadd_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dpadd_u_h(a: v8u16, b: v16u8, c: v16u8) -> v8u16 { + msa_dpadd_u_h(a, mem::transmute(b), c) +} + +/// Vector Unsigned Dot Product and Add +/// +/// The unsigned integer elements in vector `b` (eight unsigned 16-bit integer numbers) +/// are multiplied by unsigned integer elements in vector `c` (eight unsigned 16-bit integer numbers) +/// producing a result twice the size of the input operands. The multiplication results +/// of adjacent odd/even elements are added to the vector `a` (four unsigned 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dpadd_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dpadd_u_w(a: v4u32, b: v8u16, c: v8u16) -> v4u32 { + msa_dpadd_u_w(a, mem::transmute(b), c) +} + +/// Vector Unsigned Dot Product and Add +/// +/// The unsigned integer elements in vector `b` (four unsigned 32-bit integer numbers) +/// are multiplied by unsigned integer elements in vector `c` (four unsigned 32-bit integer numbers) +/// producing a result twice the size of the input operands. The multiplication results +/// of adjacent odd/even elements are added to the vector `a` (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dpadd_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dpadd_u_d(a: v2u64, b: v4u32, c: v4u32) -> v2u64 { + msa_dpadd_u_d(a, mem::transmute(b), c) +} + +/// Vector Signed Dot Product and Add +/// +/// The signed integer elements in vector `b` (sixteen signed 8-bit integer numbers) +/// are multiplied by signed integer elements in vector `c` (sixteen signed 8-bit integer numbers) +/// producing a result twice the size of the input operands. The multiplication results +/// of adjacent odd/even elements are subtracted from the integer elements in vector `a` +/// (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dpsub_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dpsub_s_h(a: v8i16, b: v16i8, c: v16i8) -> v8i16 { + msa_dpsub_s_h(a, mem::transmute(b), c) +} + +/// Vector Signed Dot Product and Add +/// +/// The signed integer elements in vector `b` (eight signed 16-bit integer numbers) +/// are multiplied by signed integer elements in vector `c` (eight signed 16-bit integer numbers) +/// producing a result twice the size of the input operands. The multiplication results +/// of adjacent odd/even elements are subtracted from the integer elements in vector `a` +/// (four signed 32-bit integer numbers). 
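Editorial sketch (not part of the diff): `dpadd_*` combines the widening dot product with an accumulator, which is the usual building block for integer multiply-accumulate loops. Same assumptions as the earlier sketches; the function name is illustrative.

```rust
#[cfg(target_arch = "mips")]
#[target_feature(enable = "msa")]
unsafe fn dot_accumulate(acc: [i16; 8], a: [i8; 16], b: [i8; 16]) -> [i16; 8] {
    use core::arch::mips::*;
    let acc: v8i16 = core::mem::transmute(acc);
    let (va, vb): (v16i8, v16i8) = (core::mem::transmute(a), core::mem::transmute(b));
    // For each of the eight i16 lanes: acc[i] + a[2i]*b[2i] + a[2i+1]*b[2i+1].
    core::mem::transmute(__msa_dpadd_s_h(acc, va, vb))
}
```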
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dpsub_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dpsub_s_w(a: v4i32, b: v8i16, c: v8i16) -> v4i32 { + msa_dpsub_s_w(a, mem::transmute(b), c) +} + +/// Vector Signed Dot Product and Add +/// +/// The signed integer elements in vector `b` (four signed 32-bit integer numbers) +/// are multiplied by signed integer elements in vector `c` (four signed 32-bit integer numbers) +/// producing a result twice the size of the input operands. The multiplication results +/// of adjacent odd/even elements are subtracted from the integer elements in vector `a` +/// (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dpsub_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dpsub_s_d(a: v2i64, b: v4i32, c: v4i32) -> v2i64 { + msa_dpsub_s_d(a, mem::transmute(b), c) +} + +/// Vector Unsigned Dot Product and Add +/// +/// The unsigned integer elements in vector `b` (sixteen unsigned 8-bit integer numbers) +/// are multiplied by unsigned integer elements in vector `c` (sixteen unsigned 8-bit integer numbers) +/// producing a result twice the size of the input operands. The multiplication results +/// of adjacent odd/even elements are subtracted from the integer elements in vector `a` +/// (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dpsub_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dpsub_u_h(a: v8i16, b: v16u8, c: v16u8) -> v8i16 { + msa_dpsub_u_h(a, mem::transmute(b), c) +} + +/// Vector Unsigned Dot Product and Add +/// +/// The unsigned integer elements in vector `b` (eight unsigned 16-bit integer numbers) +/// are multiplied by unsigned integer elements in vector `c` (eight unsigned 16-bit integer numbers) +/// producing a result twice the size of the input operands. The multiplication results +/// of adjacent odd/even elements are subtracted from the integer elements in vector `a` +/// (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dpsub_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dpsub_u_w(a: v4i32, b: v8u16, c: v8u16) -> v4i32 { + msa_dpsub_u_w(a, mem::transmute(b), c) +} + +/// Vector Unsigned Dot Product and Add +/// +/// The unsigned integer elements in vector `b` (four unsigned 32-bit integer numbers) +/// are multiplied by unsigned integer elements in vector `c` (four unsigned 32-bit integer numbers) +/// producing a result twice the size of the input operands. The multiplication results +/// of adjacent odd/even elements are subtracted from the integer elements in vector `a` +/// (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(dpsub_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_dpsub_u_d(a: v2i64, b: v4u32, c: v4u32) -> v2i64 { + msa_dpsub_u_d(a, mem::transmute(b), c) +} + +/// Vector Floating-Point Addition +/// +/// The floating-point elements in vector `a` (four 32-bit floating point numbers) +/// are added to the floating-point elements in `bc` (four 32-bit floating point numbers). +/// The result is written to vector (four 32-bit floating point numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fadd.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fadd_w(a: v4f32, b: v4f32) -> v4f32 { + msa_fadd_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Addition +/// +/// The floating-point elements in vector `a` (two 64-bit floating point numbers) +/// are added to the floating-point elements in `bc` (two 64-bit floating point numbers). +/// The result is written to vector (two 64-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fadd.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fadd_d(a: v2f64, b: v2f64) -> v2f64 { + msa_fadd_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Always False +/// +/// Set all bits to 0 in vector (four signed 32-bit integer numbers). +/// Signaling NaN elements in `a` (four 32-bit floating point numbers) +/// or `b` (four 32-bit floating point numbers) signal Invalid Operation exception. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcaf.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcaf_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fcaf_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Always False +/// +/// Set all bits to 0 in vector (two signed 64-bit integer numbers). +/// Signaling NaN elements in `a` (two 64-bit floating point numbers) +/// or `b` (two 64-bit floating point numbers) signal Invalid Operation exception. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcaf.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcaf_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fcaf_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Equal +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) +/// elements if the corresponding in `a` (four 32-bit floating point numbers) +/// and `b` (four 32-bit floating point numbers) elements are ordered and equal, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fceq.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fceq_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fceq_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Equal +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) +/// elements if the corresponding in `a` (two 64-bit floating point numbers) +/// and `b` (two 64-bit floating point numbers) elements are ordered and equal, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fceq.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fceq_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fceq_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Class Mask +/// +/// Store in each element of vector (four signed 32-bit integer numbers) +/// a bit mask reflecting the floating-point class of the corresponding element of vector +/// `a` (four 32-bit floating point numbers). +/// The mask has 10 bits as follows. Bits 0 and 1 indicate NaN values: signaling NaN (bit 0) and quiet NaN (bit 1). +/// Bits 2, 3, 4, 5 classify negative values: infinity (bit 2), normal (bit 3), subnormal (bit 4), and zero (bit 5). 
+/// Bits 6, 7, 8, 9 classify positive values: infinity (bit 6), normal (bit 7), subnormal (bit 8), and zero (bit 9). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fclass.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fclass_w(a: v4f32) -> v4i32 { + msa_fclass_w(a) +} + +/// Vector Floating-Point Class Mask +/// +/// Store in each element of vector (two signed 64-bit integer numbers) +/// a bit mask reflecting the floating-point class of the corresponding element of vector +/// `a` (two 64-bit floating point numbers). +/// The mask has 10 bits as follows. Bits 0 and 1 indicate NaN values: signaling NaN (bit 0) and quiet NaN (bit 1). +/// Bits 2, 3, 4, 5 classify negative values: infinity (bit 2), normal (bit 3), subnormal (bit 4), and zero (bit 5). +/// Bits 6, 7, 8, 9 classify positive values: infinity (bit 6), normal (bit 7), subnormal (bit 8), and zero (bit 9). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fclass.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fclass_d(a: v2f64) -> v2i64 { + msa_fclass_d(a) +} + +/// Vector Floating-Point Quiet Compare Less or Equal +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) +/// elements if the corresponding `a` (four 32-bit floating point numbers) elements are ordered +/// and either less than or equal to `b` (four 32-bit floating point numbers) elements, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcle.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcle_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fcle_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Less or Equal +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) +/// elements if the corresponding `a` (two 64-bit floating point numbers) elements are ordered +/// and either less than or equal to `b` (two 64-bit floating point numbers) elements, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcle.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcle_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fcle_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Less Than +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) +/// elements if the corresponding `a` (four 32-bit floating point numbers) elements are ordered +/// and less than `b` (four 32-bit floating point numbers) elements, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fclt.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fclt_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fclt_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Less Than +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) +/// elements if the corresponding `a` (two 64-bit floating point numbers) elements are ordered +/// and less than `b` (two 64-bit floating point numbers) elements, +/// otherwise set all bits to 0. 
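Editorial sketch (not part of the diff): one way the class mask described above might be consumed, combining `__msa_fclass_w` with a lane extraction. Bits 0 and 1 of each lane flag signaling and quiet NaN per the documentation; everything else here (name, transmute construction) is illustrative.

```rust
#[cfg(target_arch = "mips")]
#[target_feature(enable = "msa")]
unsafe fn lane0_is_nan(a: [f32; 4]) -> bool {
    use core::arch::mips::*;
    let v: v4f32 = core::mem::transmute(a);
    // 10-bit class mask per lane; bits 0 and 1 are the NaN classes.
    let class = __msa_fclass_w(v);
    (__msa_copy_s_w::<0>(class) & 0b11) != 0
}
```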
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fclt.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fclt_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fclt_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Not Equal +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) +/// elements if the corresponding `a` (four 32-bit floating point numbers) and +/// `b` (four 32-bit floating point numbers) elements are ordered and not equal, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcne.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcne_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fcne_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Not Equal +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) +/// elements if the corresponding `a` (two 64-bit floating point numbers) and +/// `b` (two 64-bit floating point numbers) elements are ordered and not equal, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcne.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcne_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fcne_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Ordered +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) +/// elements if the corresponding `a` (four 32-bit floating point numbers) and +/// `b` (four 32-bit floating point numbers) elements are ordered, i.e. both elements are not NaN values, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcor.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcor_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fcor_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Ordered +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) +/// elements if the corresponding `a` (two 64-bit floating point numbers) and +/// `b` (two 64-bit floating point numbers) elements are ordered, i.e. both elements are not NaN values, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcor.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcor_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fcor_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Unordered or Equal +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) +/// elements if the corresponding `a` (four 32-bit floating point numbers) and +/// `b` (four 32-bit floating point numbers) elements are unordered or equal, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcueq.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcueq_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fcueq_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Unordered or Equal +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) +/// elements if the corresponding `a` (two 64-bit floating point numbers) and +/// `b` (two 64-bit floating point numbers) elements are unordered or equal, +/// otherwise set all bits to 0. 
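Editorial sketch (not part of the diff): the quiet floating-point compares return integer lane masks, and the ordered variants (`fclt`, `fcle`, `fcor`) only report true when neither operand is NaN, unlike the `fcu*` family. A minimal wrapper under the same assumptions as the earlier sketches:

```rust
#[cfg(target_arch = "mips")]
#[target_feature(enable = "msa")]
unsafe fn less_than_mask(a: [f32; 4], b: [f32; 4]) -> [i32; 4] {
    use core::arch::mips::*;
    let (va, vb): (v4f32, v4f32) = (core::mem::transmute(a), core::mem::transmute(b));
    // All-ones lane only where both operands are non-NaN (ordered) and va < vb.
    core::mem::transmute(__msa_fclt_w(va, vb))
}
```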
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcueq.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcueq_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fcueq_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Unordered or Less or Equal +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) +/// elements if the corresponding elements in `a` (four 32-bit floating point numbers) +/// are unordered or less than or equal to `b` (four 32-bit floating point numbers) elements, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcule.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcule_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fcule_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Unordered or Less or Equal +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) +/// elements if the corresponding elements in `a` (two 64-bit floating point numbers) +/// are unordered or less than or equal to `b` (two 64-bit floating point numbers) elements, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcule.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcule_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fcule_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Unordered or Less Than +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) +/// elements if the corresponding elements in `a` (four 32-bit floating point numbers) +/// are unordered or less than `b` (four 32-bit floating point numbers) elements, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcult.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcult_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fcult_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Unordered or Less Than +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) +/// elements if the corresponding elements in `a` (two 64-bit floating point numbers) +/// are unordered or less than `b` (two 64-bit floating point numbers) elements, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcult.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcult_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fcult_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Unordered +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) +/// elements if the corresponding `a` (four 32-bit floating point numbers) +/// and `b` (four 32-bit floating point numbers) elements are unordered, +/// i.e. at least one element is a NaN value, otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcun.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcun_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fcun_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Unordered +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) +/// elements if the corresponding `a` (two 64-bit floating point numbers) +/// and `b` (two 64-bit floating point numbers) elements are unordered, +/// i.e. 
at least one element is a NaN value, otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcun.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcun_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fcun_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Unordered or Not Equal +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) +/// elements if the corresponding `a` (four 32-bit floating point numbers) +/// and `b` (four 32-bit floating point numbers) elements are unordered or not equal, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcune.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcune_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fcune_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Quiet Compare Unordered or Not Equal +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) +/// elements if the corresponding `a` (two 64-bit floating point numbers) +/// and `b` (two 64-bit floating point numbers) elements are unordered or not equal, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fcune.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fcune_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fcune_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Division +/// +/// The floating-point elements in vector `a` (four 32-bit floating point numbers) +/// are divided by the floating-point elements in vector `b` (four 32-bit floating point numbers). +/// The result is written to vector (four 32-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fdiv.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fdiv_w(a: v4f32, b: v4f32) -> v4f32 { + msa_fdiv_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Division +/// +/// The floating-point elements in vector `a` (two 64-bit floating point numbers) +/// are divided by the floating-point elements in vector `b` (two 64-bit floating point numbers). +/// The result is written to vector (two 64-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fdiv.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fdiv_d(a: v2f64, b: v2f64) -> v2f64 { + msa_fdiv_d(a, mem::transmute(b)) +} + +/* FIXME: 16-bit float +/// Vector Floating-Point Down-Convert Interchange Format +/// +/// The floating-point elements in vector `a` (four 64-bit floating point numbers) +/// and vector `b` (four 64-bit floating point numbers) are down-converted +/// to a smaller interchange format, i.e. from 64-bit to 32-bit, or from 32-bit to 16-bit. +/// The result is written to vector (8 16-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fexdo.h))] + #[unstable(feature = "stdarch_mips", issue = "111198")] pub unsafe fn __msa_fexdo_h(a: v4f32, b: v4f32) -> f16x8 { + msa_fexdo_h(a, mem::transmute(b)) +}*/ + +/// Vector Floating-Point Down-Convert Interchange Format +/// +/// The floating-point elements in vector `a` (two 64-bit floating point numbers) +/// and vector `b` (two 64-bit floating point numbers) are down-converted +/// to a smaller interchange format, i.e. 
from 64-bit to 32-bit, or from 32-bit to 16-bit.
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fexdo.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fexdo_w(a: v2f64, b: v2f64) -> v4f32 {
+    msa_fexdo_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Base 2 Exponentiation
+///
+/// The floating-point elements in vector `a` (four 32-bit floating point numbers)
+/// are scaled, i.e. multiplied, by 2 to the power of integer elements in vector `b`
+/// (four signed 32-bit integer numbers).
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fexp2.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fexp2_w(a: v4f32, b: v4i32) -> v4f32 {
+    msa_fexp2_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Base 2 Exponentiation
+///
+/// The floating-point elements in vector `a` (two 64-bit floating point numbers)
+/// are scaled, i.e. multiplied, by 2 to the power of integer elements in vector `b`
+/// (two signed 64-bit integer numbers).
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fexp2.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fexp2_d(a: v2f64, b: v2i64) -> v2f64 {
+    msa_fexp2_d(a, mem::transmute(b))
+}
+
+/* FIXME: 16-bit float
+/// Vector Floating-Point Up-Convert Interchange Format Left
+///
+/// The left half floating-point elements in vector `a` (two 16-bit floating point numbers)
+/// are up-converted to a larger interchange format,
+/// i.e. from 16-bit to 32-bit, or from 32-bit to 64-bit.
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fexupl.w))]
+ #[unstable(feature = "stdarch_mips", issue = "111198")] pub unsafe fn __msa_fexupl_w(a: f16x8) -> v4f32 {
+    msa_fexupl_w(a)
+}*/
+
+/// Vector Floating-Point Up-Convert Interchange Format Left
+///
+/// The left half floating-point elements in vector `a` (four 32-bit floating point numbers)
+/// are up-converted to a larger interchange format,
+/// i.e. from 16-bit to 32-bit, or from 32-bit to 64-bit.
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fexupl.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fexupl_d(a: v4f32) -> v2f64 {
+    msa_fexupl_d(a)
+}
+
+/* FIXME: 16-bit float
+/// Vector Floating-Point Up-Convert Interchange Format Right
+///
+/// The right half floating-point elements in vector `a` (two 16-bit floating point numbers)
+/// are up-converted to a larger interchange format,
+/// i.e. from 16-bit to 32-bit, or from 32-bit to 64-bit.
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fexupr.w))]
+ #[unstable(feature = "stdarch_mips", issue = "111198")] pub unsafe fn __msa_fexupr_w(a: f16x8) -> v4f32 {
+    msa_fexupr_w(a)
+} */
+
+/// Vector Floating-Point Up-Convert Interchange Format Right
+///
+/// The right half floating-point elements in vector `a` (four 32-bit floating point numbers)
+/// are up-converted to a larger interchange format,
+/// i.e. from 16-bit to 32-bit, or from 32-bit to 64-bit.
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fexupr.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fexupr_d(a: v4f32) -> v2f64 {
+    msa_fexupr_d(a)
+}
+
+/// Vector Floating-Point Round and Convert from Signed Integer
+///
+/// The signed integer elements in vector `a` (four signed 32-bit integer numbers)
+/// are converted to floating-point values.
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffint_s.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_ffint_s_w(a: v4i32) -> v4f32 {
+    msa_ffint_s_w(a)
+}
+
+/// Vector Floating-Point Round and Convert from Signed Integer
+///
+/// The signed integer elements in vector `a` (two signed 64-bit integer numbers)
+/// are converted to floating-point values.
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffint_s.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_ffint_s_d(a: v2i64) -> v2f64 {
+    msa_ffint_s_d(a)
+}
+
+/// Vector Floating-Point Round and Convert from Unsigned Integer
+///
+/// The unsigned integer elements in vector `a` (four unsigned 32-bit integer numbers)
+/// are converted to floating-point values.
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffint_u.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_ffint_u_w(a: v4u32) -> v4f32 {
+    msa_ffint_u_w(a)
+}
+
+/// Vector Floating-Point Round and Convert from Unsigned Integer
+///
+/// The unsigned integer elements in vector `a` (two unsigned 64-bit integer numbers)
+/// are converted to floating-point values.
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffint_u.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_ffint_u_d(a: v2u64) -> v2f64 {
+    msa_ffint_u_d(a)
+}
+
+/// Vector Floating-Point Convert from Fixed-Point Left
+///
+/// The left half fixed-point elements in vector `a` (eight signed 16-bit integer numbers)
+/// are up-converted to floating-point data format,
+/// i.e. from 16-bit Q15 to 32-bit floating-point, or from 32-bit Q31 to 64-bit floating-point.
+/// The result is written to vector (four 32-bit floating point numbers).
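+///
+/// A minimal sketch of the Q15 scaling (hypothetical values; a Q15 integer `q`
+/// represents the real number `q / 32768`):
+///
+/// ```ignore
+/// // on an MSA-enabled target, inside an `unsafe` block:
+/// let a: v8i16 = mem::transmute([16384i16, -32768, 8192, 0, 16384, -32768, 8192, 0]);
+/// // both halves hold the same Q15 values, so the selected (left) half
+/// // converts to 0.5, -1.0, 0.25 and 0.0
+/// let r: v4f32 = __msa_ffql_w(a);
+/// ```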
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffql.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_ffql_w(a: v8i16) -> v4f32 {
+    msa_ffql_w(a)
+}
+
+/// Vector Floating-Point Convert from Fixed-Point Left
+///
+/// The left half fixed-point elements in vector `a` (four signed 32-bit integer numbers)
+/// are up-converted to floating-point data format,
+/// i.e. from 16-bit Q15 to 32-bit floating-point, or from 32-bit Q31 to 64-bit floating-point.
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffql.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_ffql_d(a: v4i32) -> v2f64 {
+    msa_ffql_d(a)
+}
+
+/// Vector Floating-Point Convert from Fixed-Point Right
+///
+/// The right half fixed-point elements in vector `a` (eight signed 16-bit integer numbers)
+/// are up-converted to floating-point data format,
+/// i.e. from 16-bit Q15 to 32-bit floating-point, or from 32-bit Q31 to 64-bit floating-point.
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffqr.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_ffqr_w(a: v8i16) -> v4f32 {
+    msa_ffqr_w(a)
+}
+
+/// Vector Floating-Point Convert from Fixed-Point Right
+///
+/// The right half fixed-point elements in vector `a` (four signed 32-bit integer numbers)
+/// are up-converted to floating-point data format,
+/// i.e. from 16-bit Q15 to 32-bit floating-point, or from 32-bit Q31 to 64-bit floating-point.
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffqr.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_ffqr_d(a: v4i32) -> v2f64 {
+    msa_ffqr_d(a)
+}
+
+/// Vector Fill from GPR
+///
+/// Replicate GPR rs value to all elements in vector (sixteen signed 8-bit integer numbers).
+/// If the source GPR is wider than the destination data format, the destination's elements
+/// will be set to the least significant bits of the GPR.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fill.b))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fill_b(a: i32) -> v16i8 {
+    msa_fill_b(a)
+}
+
+/// Vector Fill from GPR
+///
+/// Replicate GPR rs value to all elements in vector (eight signed 16-bit integer numbers).
+/// If the source GPR is wider than the destination data format, the destination's elements
+/// will be set to the least significant bits of the GPR.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fill.h))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fill_h(a: i32) -> v8i16 {
+    msa_fill_h(a)
+}
+
+/// Vector Fill from GPR
+///
+/// Replicate GPR rs value to all elements in vector (four signed 32-bit integer numbers).
+/// If the source GPR is wider than the destination data format, the destination's elements
+/// will be set to the least significant bits of the GPR.
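+///
+/// A minimal usage sketch (hypothetical value):
+///
+/// ```ignore
+/// // on an MSA-enabled target, inside an `unsafe` block:
+/// let r: v4i32 = __msa_fill_w(7);
+/// // all four lanes of `r` now hold 7
+/// ```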
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fill.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fill_w(a: i32) -> v4i32 { + msa_fill_w(a) +} + +/// Vector Fill from GPR +/// +/// Replicate GPR rs value to all elements in vector (two signed 64-bit integer numbers). +/// If the source GPR is wider than the destination data format, the destination's elements +/// will be set to the least significant bits of the GPR. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fill.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fill_d(a: i64) -> v2i64 { + msa_fill_d(a) +} + +/// Vector Floating-Point Base 2 Logarithm +/// +/// The signed integral base 2 exponents of floating-point elements in vector `a` +/// (four 32-bit floating point numbers) are written as floating-point values to vector elements +/// (four 32-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(flog2.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_flog2_w(a: v4f32) -> v4f32 { + msa_flog2_w(a) +} + +/// Vector Floating-Point Base 2 Logarithm +/// +/// The signed integral base 2 exponents of floating-point elements in vector `a` +/// (two 64-bit floating point numbers) are written as floating-point values to vector elements +/// (two 64-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(flog2.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_flog2_d(a: v2f64) -> v2f64 { + msa_flog2_d(a) +} + +/// Vector Floating-Point Multiply-Add +/// +/// The floating-point elements in vector `b` (four 32-bit floating point numbers) +/// multiplied by floating-point elements in vector `c` (four 32-bit floating point numbers) +/// are added to the floating-point elements in vector `a` (four 32-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fmadd.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fmadd_w(a: v4f32, b: v4f32, c: v4f32) -> v4f32 { + msa_fmadd_w(a, mem::transmute(b), c) +} + +/// Vector Floating-Point Multiply-Add +/// +/// The floating-point elements in vector `b` (two 64-bit floating point numbers) +/// multiplied by floating-point elements in vector `c` (two 64-bit floating point numbers) +/// are added to the floating-point elements in vector `a` (two 64-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fmadd.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fmadd_d(a: v2f64, b: v2f64, c: v2f64) -> v2f64 { + msa_fmadd_d(a, mem::transmute(b), c) +} + +/// Vector Floating-Point Maximum +/// +/// The largest values between corresponding floating-point elements in vector `a` +/// (four 32-bit floating point numbers) and vector `b` (four 32-bit floating point numbers) +/// are written to vector (four 32-bit floating point numbers). 
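+///
+/// A minimal usage sketch (hypothetical, NaN-free lane values):
+///
+/// ```ignore
+/// // on an MSA-enabled target, inside an `unsafe` block:
+/// let a: v4f32 = mem::transmute([1.0f32, -4.0, 0.5, 2.0]);
+/// let b: v4f32 = mem::transmute([3.0f32, -5.0, 0.25, 2.0]);
+/// let r: v4f32 = __msa_fmax_w(a, b);
+/// // element-wise maxima: [3.0, -4.0, 0.5, 2.0]
+/// ```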
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fmax.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fmax_w(a: v4f32, b: v4f32) -> v4f32 { + msa_fmax_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Maximum +/// +/// The largest values between corresponding floating-point elements in vector `a` +/// (two 64-bit floating point numbers) and vector `b` (two 64-bit floating point numbers) +/// are written to vector (two 64-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fmax.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fmax_d(a: v2f64, b: v2f64) -> v2f64 { + msa_fmax_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Maximum Based on Absolute Values +/// +/// The value with the largest magnitude, i.e. absolute value, between corresponding +/// floating-point elements in vector `a` (four 32-bit floating point numbers) +/// and vector `b` (four 32-bit floating point numbers) +/// are written to vector (four 32-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fmax_a.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fmax_a_w(a: v4f32, b: v4f32) -> v4f32 { + msa_fmax_a_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Maximum Based on Absolute Values +/// +/// The value with the largest magnitude, i.e. absolute value, between corresponding +/// floating-point elements in vector `a` (two 64-bit floating point numbers) +/// and vector `b` (two 64-bit floating point numbers) +/// are written to vector (two 64-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fmax_a.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fmax_a_d(a: v2f64, b: v2f64) -> v2f64 { + msa_fmax_a_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Minimum +/// +/// The smallest values between corresponding floating-point elements in vector `a` +/// (four 32-bit floating point numbers) and vector `b` (four 32-bit floating point numbers) +/// are written to vector (four 32-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fmin.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fmin_w(a: v4f32, b: v4f32) -> v4f32 { + msa_fmin_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Minimum +/// +/// The smallest values between corresponding floating-point elements in vector `a` +/// (two 64-bit floating point numbers) and vector `b` (two 64-bit floating point numbers) +/// are written to vector (two 64-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fmin.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fmin_d(a: v2f64, b: v2f64) -> v2f64 { + msa_fmin_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Minimum Based on Absolute Values +/// +/// The value with the smallest magnitude, i.e. absolute value, between corresponding +/// floating-point elements in vector `a` (four 32-bit floating point numbers) +/// and vector `b` (four 32-bit floating point numbers) +/// are written to vector (four 32-bit floating point numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fmin_a.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fmin_a_w(a: v4f32, b: v4f32) -> v4f32 { + msa_fmin_a_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Minimum Based on Absolute Values +/// +/// The value with the smallest magnitude, i.e. absolute value, between corresponding +/// floating-point elements in vector `a` (two 64-bit floating point numbers) +/// and vector `b` (two 64-bit floating point numbers) +/// are written to vector (two 64-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fmin_a.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fmin_a_d(a: v2f64, b: v2f64) -> v2f64 { + msa_fmin_a_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Multiply-Sub +/// +/// The floating-point elements in vector `b` (four 32-bit floating point numbers) +/// multiplied by floating-point elements in vector `c` (four 32-bit floating point numbers) +/// are subtracted from the floating-point elements in vector `a` (four 32-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fmsub.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fmsub_w(a: v4f32, b: v4f32, c: v4f32) -> v4f32 { + msa_fmsub_w(a, mem::transmute(b), c) +} + +/// Vector Floating-Point Multiply-Sub +/// +/// The floating-point elements in vector `b` (two 64-bit floating point numbers) +/// multiplied by floating-point elements in vector `c` (two 64-bit floating point numbers) +/// are subtracted from the floating-point elements in vector `a` (two 64-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fmsub.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fmsub_d(a: v2f64, b: v2f64, c: v2f64) -> v2f64 { + msa_fmsub_d(a, mem::transmute(b), c) +} + +/// Vector Floating-Point Multiplication +/// +/// The floating-point elements in vector `a` (four 32-bit floating point numbers) are +/// multiplied by floating-point elements in vector `b` (four 32-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fmul.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fmul_w(a: v4f32, b: v4f32) -> v4f32 { + msa_fmul_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Multiplication +/// +/// The floating-point elements in vector `a` (two 64-bit floating point numbers) are +/// multiplied by floating-point elements in vector `b` (two 64-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fmul.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fmul_d(a: v2f64, b: v2f64) -> v2f64 { + msa_fmul_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Round to Integer +/// +/// The floating-point elements in vector `a` (four 32-bit floating point numbers) +/// are rounded to an integral valued floating-point number in the same format based +/// on the rounding mode bits RM in MSA Control and Status Register MSACSR. 
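+///
+/// A minimal sketch (hypothetical values, assuming the MSACSR RM field is left at
+/// its usual round-to-nearest, ties-to-even setting):
+///
+/// ```ignore
+/// // on an MSA-enabled target, inside an `unsafe` block:
+/// let a: v4f32 = mem::transmute([2.5f32, 3.5, -1.25, 7.0]);
+/// let r: v4f32 = __msa_frint_w(a);
+/// // round-to-nearest-even gives [2.0, 4.0, -1.0, 7.0]
+/// ```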
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(frint.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_frint_w(a: v4f32) -> v4f32 { + msa_frint_w(a) +} + +/// Vector Floating-Point Round to Integer +/// +/// The floating-point elements in vector `a` (two 64-bit floating point numbers) +/// are rounded to an integral valued floating-point number in the same format based +/// on the rounding mode bits RM in MSA Control and Status Register MSACSR. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(frint.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_frint_d(a: v2f64) -> v2f64 { + msa_frint_d(a) +} + +/// Vector Approximate Floating-Point Reciprocal +/// +/// The reciprocals of floating-point elements in vector `a` (four 32-bit floating point numbers) +/// are calculated and the result is written to vector (four 32-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(frcp.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_frcp_w(a: v4f32) -> v4f32 { + msa_frcp_w(a) +} + +/// Vector Approximate Floating-Point Reciprocal +/// +/// The reciprocals of floating-point elements in vector `a` (two 64-bit floating point numbers) +/// are calculated and the result is written to vector (two 64-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(frcp.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_frcp_d(a: v2f64) -> v2f64 { + msa_frcp_d(a) +} + +/// Vector Approximate Floating-Point Reciprocal of Square Root +/// +/// The reciprocals of the square roots of floating-point elements in vector `a` (four 32-bit floating point numbers) +/// are calculated and the result is written to vector (four 32-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(frsqrt.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_frsqrt_w(a: v4f32) -> v4f32 { + msa_frsqrt_w(a) +} + +/// Vector Approximate Floating-Point Reciprocal of Square Root +/// +/// The reciprocals of the square roots of floating-point elements in vector `a` (two 64-bit floating point numbers) +/// are calculated and the result is written to vector (two 64-bit floating point numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(frsqrt.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_frsqrt_d(a: v2f64) -> v2f64 { + msa_frsqrt_d(a) +} + +/// Vector Floating-Point Signaling Compare Always False +/// +/// Set all bits to 0 in vector (four signed 32-bit integer numbers) elements. +/// Signaling and quiet NaN elements in vector `a` (four 32-bit floating point numbers) +/// or `b` (four 32-bit floating point numbers) signal Invalid Operation exception. +/// In case of a floating-point exception, the default result has all bits set to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fsaf.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fsaf_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fsaf_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Signaling Compare Always False +/// +/// Set all bits to 0 in vector (two signed 64-bit integer numbers) elements. 
+/// Signaling and quiet NaN elements in vector `a` (two 64-bit floating point numbers) +/// or `b` (two 64-bit floating point numbers) signal Invalid Operation exception. +/// In case of a floating-point exception, the default result has all bits set to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fsaf.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fsaf_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fsaf_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Signaling Compare Equal +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements +/// if the corresponding `a` (four 32-bit floating point numbers) +/// and `b` (four 32-bit floating point numbers) elements are equal, otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fseq.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fseq_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fseq_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Signaling Compare Equal +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements +/// if the corresponding `a` (two 64-bit floating point numbers) +/// and `b` (two 64-bit floating point numbers) elements are equal, otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fseq.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fseq_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fseq_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Signaling Compare Less or Equal +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements +/// if the corresponding `a` (four 32-bit floating point numbers) elements +/// are less than or equal to `b` (four 32-bit floating point numbers) elements, otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fsle.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fsle_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fsle_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Signaling Compare Less or Equal +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements +/// if the corresponding `a` (two 64-bit floating point numbers) elements +/// are less than or equal to `b` (two 64-bit floating point numbers) elements, otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fsle.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fsle_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fsle_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Signaling Compare Less Than +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements +/// if the corresponding `a` (four 32-bit floating point numbers) elements +/// are less than `b` (four 32-bit floating point numbers) elements, otherwise set all bits to 0. 
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fslt.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fslt_w(a: v4f32, b: v4f32) -> v4i32 {
+    msa_fslt_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Less Than
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two 64-bit floating point numbers) elements
+/// are less than `b` (two 64-bit floating point numbers) elements, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fslt.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fslt_d(a: v2f64, b: v2f64) -> v2i64 {
+    msa_fslt_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Not Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four 32-bit floating point numbers) and
+/// `b` (four 32-bit floating point numbers) elements are not equal, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsne.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fsne_w(a: v4f32, b: v4f32) -> v4i32 {
+    msa_fsne_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Not Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two 64-bit floating point numbers) and
+/// `b` (two 64-bit floating point numbers) elements are not equal, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsne.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fsne_d(a: v2f64, b: v2f64) -> v2i64 {
+    msa_fsne_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Ordered
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four 32-bit floating point numbers) and
+/// `b` (four 32-bit floating point numbers) elements are ordered,
+/// i.e. both elements are not NaN values, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsor.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fsor_w(a: v4f32, b: v4f32) -> v4i32 {
+    msa_fsor_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Ordered
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two 64-bit floating point numbers) and
+/// `b` (two 64-bit floating point numbers) elements are ordered,
+/// i.e. both elements are not NaN values, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsor.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fsor_d(a: v2f64, b: v2f64) -> v2i64 {
+    msa_fsor_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Square Root
+///
+/// The square roots of floating-point elements in vector `a`
+/// (four 32-bit floating point numbers) are written to vector
+/// (four 32-bit floating point numbers).
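+///
+/// A minimal usage sketch (hypothetical values):
+///
+/// ```ignore
+/// // on an MSA-enabled target, inside an `unsafe` block:
+/// let a: v4f32 = mem::transmute([4.0f32, 9.0, 2.25, 0.0]);
+/// let r: v4f32 = __msa_fsqrt_w(a);
+/// // r holds [2.0, 3.0, 1.5, 0.0]
+/// ```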
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsqrt.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fsqrt_w(a: v4f32) -> v4f32 {
+    msa_fsqrt_w(a)
+}
+
+/// Vector Floating-Point Square Root
+///
+/// The square roots of floating-point elements in vector `a`
+/// (two 64-bit floating point numbers) are written to vector
+/// (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsqrt.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fsqrt_d(a: v2f64) -> v2f64 {
+    msa_fsqrt_d(a)
+}
+
+/// Vector Floating-Point Subtraction
+///
+/// The floating-point elements in vector `b` (four 32-bit floating point numbers)
+/// are subtracted from the floating-point elements in vector `a`
+/// (four 32-bit floating point numbers).
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsub.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fsub_w(a: v4f32, b: v4f32) -> v4f32 {
+    msa_fsub_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Subtraction
+///
+/// The floating-point elements in vector `b` (two 64-bit floating point numbers)
+/// are subtracted from the floating-point elements in vector `a`
+/// (two 64-bit floating point numbers).
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsub.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fsub_d(a: v2f64, b: v2f64) -> v2f64 {
+    msa_fsub_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Unordered or Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four 32-bit floating point numbers) and
+/// `b` (four 32-bit floating point numbers) elements are unordered or equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsueq.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fsueq_w(a: v4f32, b: v4f32) -> v4i32 {
+    msa_fsueq_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Unordered or Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two 64-bit floating point numbers) and
+/// `b` (two 64-bit floating point numbers) elements are unordered or equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsueq.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fsueq_d(a: v2f64, b: v2f64) -> v2i64 {
+    msa_fsueq_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Unordered or Less or Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four 32-bit floating point numbers) elements are
+/// unordered or less than or equal to `b` (four 32-bit floating point numbers) elements,
+/// otherwise set all bits to 0.
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fsule.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fsule_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fsule_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Signaling Compare Unordered or Less or Equal +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements +/// if the corresponding `a` (two 64-bit floating point numbers) elements are +/// unordered or less than or equal to `b` (two 64-bit floating point numbers) elements, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fsule.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fsule_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fsule_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Signaling Compare Unordered or Less Than +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements +/// if the corresponding `a` (four 32-bit floating point numbers) elements +/// are unordered or less than `b` (four 32-bit floating point numbers) elements, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fsult.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fsult_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fsult_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Signaling Compare Unordered or Less Than +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements +/// if the corresponding `a` (two 64-bit floating point numbers) elements +/// are unordered or less than `b` (two 64-bit floating point numbers) elements, +/// otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fsult.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fsult_d(a: v2f64, b: v2f64) -> v2i64 { + msa_fsult_d(a, mem::transmute(b)) +} + +/// Vector Floating-Point Signaling Compare Unordered +/// +/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements +/// if the corresponding `a` (four 32-bit floating point numbers) and +/// `b` (four 32-bit floating point numbers) elements are unordered, +/// i.e. at least one element is a NaN value, otherwise set all bits to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(fsun.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_fsun_w(a: v4f32, b: v4f32) -> v4i32 { + msa_fsun_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Signaling Compare Unordered +/// +/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements +/// if the corresponding `a` (two 64-bit floating point numbers) and +/// `b` (two 64-bit floating point numbers) elements are unordered, +/// i.e. at least one element is a NaN value, otherwise set all bits to 0. 
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsun.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fsun_d(a: v2f64, b: v2f64) -> v2i64 {
+    msa_fsun_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Unordered or Not Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four 32-bit floating point numbers) and
+/// `b` (four 32-bit floating point numbers) elements are unordered or not equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsune.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fsune_w(a: v4f32, b: v4f32) -> v4i32 {
+    msa_fsune_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Unordered or Not Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two 64-bit floating point numbers) and
+/// `b` (two 64-bit floating point numbers) elements are unordered or not equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsune.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_fsune_d(a: v2f64, b: v2f64) -> v2i64 {
+    msa_fsune_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Convert to Signed Integer
+///
+/// The elements in vector `a` (four 32-bit floating point numbers)
+/// are rounded and converted to signed integer values based on the
+/// rounding mode bits RM in MSA Control and Status Register MSACSR.
+/// The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ftint_s.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_ftint_s_w(a: v4f32) -> v4i32 {
+    msa_ftint_s_w(a)
+}
+
+/// Vector Floating-Point Convert to Signed Integer
+///
+/// The elements in vector `a` (two 64-bit floating point numbers)
+/// are rounded and converted to signed integer values based on the
+/// rounding mode bits RM in MSA Control and Status Register MSACSR.
+/// The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ftint_s.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_ftint_s_d(a: v2f64) -> v2i64 {
+    msa_ftint_s_d(a)
+}
+
+/// Vector Floating-Point Convert to Unsigned Integer
+///
+/// The elements in vector `a` (four 32-bit floating point numbers)
+/// are rounded and converted to unsigned integer values based on the
+/// rounding mode bits RM in MSA Control and Status Register MSACSR.
+/// The result is written to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ftint_u.w))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_ftint_u_w(a: v4f32) -> v4u32 {
+    msa_ftint_u_w(a)
+}
+
+/// Vector Floating-Point Convert to Unsigned Integer
+///
+/// The elements in vector `a` (two 64-bit floating point numbers)
+/// are rounded and converted to unsigned integer values based on the
+/// rounding mode bits RM in MSA Control and Status Register MSACSR.
+/// The result is written to vector (two unsigned 64-bit integer numbers).
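+///
+/// A minimal sketch (hypothetical values, assuming the default round-to-nearest mode;
+/// compare with `__msa_ftrunc_u_d`, which always rounds toward zero):
+///
+/// ```ignore
+/// // on an MSA-enabled target, inside an `unsafe` block:
+/// let a: v2f64 = mem::transmute([2.7f64, 10.0]);
+/// let r: v2u64 = __msa_ftint_u_d(a);  // [3, 10] under round-to-nearest
+/// let t: v2u64 = __msa_ftrunc_u_d(a); // [2, 10] regardless of the rounding mode
+/// ```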
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ftint_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ftint_u_d(a: v2f64) -> v2u64 { + msa_ftint_u_d(a) +} + +/// Vector Floating-Point Convert to Fixed-Point +/// +/// The elements in vector `a` (four 32-bit floating point numbers) +/// and `b` (four 32-bit floating point numbers) are down-converted to a fixed-point +/// representation, i.e. from 64-bit floating-point to 32-bit Q31 fixed-point +/// representation, or from 32-bit floating-point to 16-bit Q15 fixed-point representation. +/// The result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ftq.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ftq_h(a: v4f32, b: v4f32) -> v8i16 { + msa_ftq_h(a, mem::transmute(b)) +} + +/// Vector Floating-Point Convert to Fixed-Point +/// +/// The elements in vector `a` (two 64-bit floating point numbers) +/// and `b` (two 64-bit floating point numbers) are down-converted to a fixed-point +/// representation, i.e. from 64-bit floating-point to 32-bit Q31 fixed-point +/// representation, or from 32-bit floating-point to 16-bit Q15 fixed-point representation. +/// The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ftq.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ftq_w(a: v2f64, b: v2f64) -> v4i32 { + msa_ftq_w(a, mem::transmute(b)) +} + +/// Vector Floating-Point Truncate and Convert to Signed Integer +/// +/// The elements in vector `a` (four 32-bit floating point numbers) +/// are truncated, i.e. rounded toward zero, to signed integer values. +/// The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ftrunc_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ftrunc_s_w(a: v4f32) -> v4i32 { + msa_ftrunc_s_w(a) +} + +/// Vector Floating-Point Truncate and Convert to Signed Integer +/// +/// The elements in vector `a` (two 64-bit floating point numbers) +/// are truncated, i.e. rounded toward zero, to signed integer values. +/// The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ftrunc_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ftrunc_s_d(a: v2f64) -> v2i64 { + msa_ftrunc_s_d(a) +} + +/// Vector Floating-Point Truncate and Convert to Unsigned Integer +/// +/// The elements in vector `a` (four 32-bit floating point numbers) +/// are truncated, i.e. rounded toward zero, to unsigned integer values. +/// The result is written to vector (four unsigned 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ftrunc_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ftrunc_u_w(a: v4f32) -> v4u32 { + msa_ftrunc_u_w(a) +} + +/// Vector Floating-Point Truncate and Convert to Unsigned Integer +/// +/// The elements in vector `a` (two 64-bit floating point numbers) +/// are truncated, i.e. rounded toward zero, to unsigned integer values. +/// The result is written to vector (two unsigned 64-bit integer numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ftrunc_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ftrunc_u_d(a: v2f64) -> v2u64 { + msa_ftrunc_u_d(a) +} + +/// Vector Signed Horizontal Add +/// +/// The sign-extended odd elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are added to the sign-extended even elements in vector `b` (sixteen signed 8-bit integer numbers) +/// producing a result twice the size of the input operands. +/// The result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(hadd_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_hadd_s_h(a: v16i8, b: v16i8) -> v8i16 { + msa_hadd_s_h(a, mem::transmute(b)) +} + +/// Vector Signed Horizontal Add +/// +/// The sign-extended odd elements in vector `a` (eight signed 16-bit integer numbers) +/// are added to the sign-extended even elements in vector `b` (eight signed 16-bit integer numbers) +/// producing a result twice the size of the input operands. +/// The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(hadd_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_hadd_s_w(a: v8i16, b: v8i16) -> v4i32 { + msa_hadd_s_w(a, mem::transmute(b)) +} + +/// Vector Signed Horizontal Add +/// +/// The sign-extended odd elements in vector `a` (four signed 32-bit integer numbers) +/// are added to the sign-extended even elements in vector `b` (four signed 32-bit integer numbers) +/// producing a result twice the size of the input operands. +/// The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(hadd_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_hadd_s_d(a: v4i32, b: v4i32) -> v2i64 { + msa_hadd_s_d(a, mem::transmute(b)) +} + +/// Vector Unsigned Horizontal Add +/// +/// The zero-extended odd elements in vector `a` (sixteen unsigned 8-bit integer numbers) +/// are added to the zero-extended even elements in vector `b` (sixteen unsigned 8-bit integer numbers) +/// producing a result twice the size of the input operands. +/// The result is written to vector (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(hadd_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_hadd_u_h(a: v16u8, b: v16u8) -> v8u16 { + msa_hadd_u_h(a, mem::transmute(b)) +} + +/// Vector Unsigned Horizontal Add +/// +/// The zero-extended odd elements in vector `a` (eight unsigned 16-bit integer numbers) +/// are added to the zero-extended even elements in vector `b` (eight unsigned 16-bit integer numbers) +/// producing a result twice the size of the input operands. +/// The result is written to vector (four unsigned 32-bit integer numbers). 
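+///
+/// A minimal sketch (hypothetical values; the pairing shown assumes the i-th result
+/// lane combines the i-th odd element of `a` with the i-th even element of `b`):
+///
+/// ```ignore
+/// // on an MSA-enabled target, inside an `unsafe` block:
+/// let a: v8u16 = mem::transmute([0u16, 1, 0, 2, 0, 3, 0, 4]);
+/// let b: v8u16 = mem::transmute([10u16, 0, 20, 0, 30, 0, 40, 0]);
+/// let r: v4u32 = __msa_hadd_u_w(a, b);
+/// // r holds [11, 22, 33, 44], i.e. a[1]+b[0], a[3]+b[2], a[5]+b[4], a[7]+b[6]
+/// ```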
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(hadd_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_hadd_u_w(a: v8u16, b: v8u16) -> v4u32 { + msa_hadd_u_w(a, mem::transmute(b)) +} + +/// Vector Unsigned Horizontal Add +/// +/// The zero-extended odd elements in vector `a` (four unsigned 32-bit integer numbers) +/// are added to the zero-extended even elements in vector `b` (four unsigned 32-bit integer numbers) +/// producing a result twice the size of the input operands. +/// The result is written to vector (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(hadd_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_hadd_u_d(a: v4u32, b: v4u32) -> v2u64 { + msa_hadd_u_d(a, mem::transmute(b)) +} + +/// Vector Signed Horizontal Subtract +/// +/// The sign-extended odd elements in vector `b` (sixteen signed 8-bit integer numbers) +/// are subtracted from the sign-extended elements in vector `a` (sixteen signed 8-bit integer numbers) +/// producing a result twice the size of the input operands. +/// The result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(hsub_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_hsub_s_h(a: v16i8, b: v16i8) -> v8i16 { + msa_hsub_s_h(a, mem::transmute(b)) +} + +/// Vector Signed Horizontal Subtract +/// +/// The sign-extended odd elements in vector `b` (eight signed 16-bit integer numbers) +/// are subtracted from the sign-extended elements in vector `a` (eight signed 16-bit integer numbers) +/// producing a result twice the size of the input operands. +/// The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(hsub_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_hsub_s_w(a: v8i16, b: v8i16) -> v4i32 { + msa_hsub_s_w(a, mem::transmute(b)) +} + +/// Vector Signed Horizontal Subtract +/// +/// The sign-extended odd elements in vector `b` (four signed 32-bit integer numbers) +/// are subtracted from the sign-extended elements in vector `a` (four signed 32-bit integer numbers) +/// producing a result twice the size of the input operands. +/// The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(hsub_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_hsub_s_d(a: v4i32, b: v4i32) -> v2i64 { + msa_hsub_s_d(a, mem::transmute(b)) +} + +/// Vector Unsigned Horizontal Subtract +/// +/// The zero-extended odd elements in vector `b` (sixteen unsigned 8-bit integer numbers) +/// are subtracted from the zero-extended elements in vector `a` (sixteen unsigned 8-bit integer numbers) +/// producing a result twice the size of the input operands. +/// The result is written to vector (eight signed 16-bit integer numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(hsub_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_hsub_u_h(a: v16u8, b: v16u8) -> v8i16 { + msa_hsub_u_h(a, mem::transmute(b)) +} + +/// Vector Unsigned Horizontal Subtract +/// +/// The zero-extended odd elements in vector `b` (eight unsigned 16-bit integer numbers) +/// are subtracted from the zero-extended elements in vector `a` (eight unsigned 16-bit integer numbers) +/// producing a result twice the size of the input operands. +/// The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(hsub_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_hsub_u_w(a: v8u16, b: v8u16) -> v4i32 { + msa_hsub_u_w(a, mem::transmute(b)) +} + +/// Vector Unsigned Horizontal Subtract +/// +/// The zero-extended odd elements in vector `b` (four unsigned 32-bit integer numbers) +/// are subtracted from the zero-extended elements in vector `a` (four unsigned 32-bit integer numbers) +/// producing a result twice the size of the input operands. +/// The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(hsub_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_hsub_u_d(a: v4u32, b: v4u32) -> v2i64 { + msa_hsub_u_d(a, mem::transmute(b)) +} + +/// Vector Interleave Even +/// +/// Even elements in vectors `a` (sixteen signed 8-bit integer numbers) +/// and vector `b` (sixteen signed 8-bit integer numbers) are copied to the result +/// (sixteen signed 8-bit integer numbers) +/// alternating one element from `a` with one element from `b`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvev.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvev_b(a: v16i8, b: v16i8) -> v16i8 { + msa_ilvev_b(a, mem::transmute(b)) +} + +/// Vector Interleave Even +/// +/// Even elements in vectors `a` (eight signed 16-bit integer numbers) +/// and vector `b` (eight signed 16-bit integer numbers) are copied to the result +/// (eight signed 16-bit integer numbers) +/// alternating one element from `a` with one element from `b`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvev.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvev_h(a: v8i16, b: v8i16) -> v8i16 { + msa_ilvev_h(a, mem::transmute(b)) +} + +/// Vector Interleave Even +/// +/// Even elements in vectors `a` (four signed 32-bit integer numbers) +/// and vector `b` (four signed 32-bit integer numbers) are copied to the result +/// (four signed 32-bit integer numbers) +/// alternating one element from `a` with one element from `b`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvev.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvev_w(a: v4i32, b: v4i32) -> v4i32 { + msa_ilvev_w(a, mem::transmute(b)) +} + +/// Vector Interleave Even +/// +/// Even elements in vectors `a` (two signed 64-bit integer numbers) +/// and vector `b` (two signed 64-bit integer numbers) are copied to the result +/// (two signed 64-bit integer numbers) +/// alternating one element from `a` with one element from `b`. 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvev.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvev_d(a: v2i64, b: v2i64) -> v2i64 { + msa_ilvev_d(a, mem::transmute(b)) +} + +/// Vector Interleave Left +/// +/// The left half elements in vectors `a` (sixteen signed 8-bit integer numbers) +/// and vector `b` (sixteen signed 8-bit integer numbers) are copied to the result +/// (sixteen signed 8-bit integer numbers) +/// alternating one element from `a` with one element from `b`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvl.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvl_b(a: v16i8, b: v16i8) -> v16i8 { + msa_ilvl_b(a, mem::transmute(b)) +} + +/// Vector Interleave Left +/// +/// The left half elements in vectors `a` (eight signed 16-bit integer numbers) +/// and vector `b` (eight signed 16-bit integer numbers) are copied to the result +/// (eight signed 16-bit integer numbers) +/// alternating one element from `a` with one element from `b`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvl.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvl_h(a: v8i16, b: v8i16) -> v8i16 { + msa_ilvl_h(a, mem::transmute(b)) +} + +/// Vector Interleave Left +/// +/// The left half elements in vectors `a` (four signed 32-bit integer numbers) +/// and vector `b` (four signed 32-bit integer numbers) are copied to the result +/// (four signed 32-bit integer numbers) +/// alternating one element from `a` with one element from `b`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvl.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvl_w(a: v4i32, b: v4i32) -> v4i32 { + msa_ilvl_w(a, mem::transmute(b)) +} + +/// Vector Interleave Left +/// +/// The left half elements in vectors `a` (two signed 64-bit integer numbers) +/// and vector `b` (two signed 64-bit integer numbers) are copied to the result +/// (two signed 64-bit integer numbers) +/// alternating one element from `a` with one element from `b`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvl.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvl_d(a: v2i64, b: v2i64) -> v2i64 { + msa_ilvl_d(a, mem::transmute(b)) +} + +/// Vector Interleave Odd +/// +/// Odd elements in vectors `a` (sixteen signed 8-bit integer numbers) +/// and vector `b` (sixteen signed 8-bit integer numbers) are copied to the result +/// (sixteen signed 8-bit integer numbers) +/// alternating one element from `a` with one element from `b`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvod.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvod_b(a: v16i8, b: v16i8) -> v16i8 { + msa_ilvod_b(a, mem::transmute(b)) +} + +/// Vector Interleave Odd +/// +/// Odd elements in vectors `a` (eight signed 16-bit integer numbers) +/// and vector `b` (eight signed 16-bit integer numbers) are copied to the result +/// (eight signed 16-bit integer numbers) +/// alternating one element from `a` with one element from `b`. 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvod.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvod_h(a: v8i16, b: v8i16) -> v8i16 { + msa_ilvod_h(a, mem::transmute(b)) +} + +/// Vector Interleave Odd +/// +/// Odd elements in vectors `a` (four signed 32-bit integer numbers) +/// and vector `b` (four signed 32-bit integer numbers) are copied to the result +/// (four signed 32-bit integer numbers) +/// alternating one element from `a` with one element from `b`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvod.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvod_w(a: v4i32, b: v4i32) -> v4i32 { + msa_ilvod_w(a, mem::transmute(b)) +} + +/// Vector Interleave Odd +/// +/// Odd elements in vectors `a` (two signed 64-bit integer numbers) +/// and vector `b` (two signed 64-bit integer numbers) are copied to the result +/// (two signed 64-bit integer numbers) +/// alternating one element from `a` with one element from `b`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvod.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvod_d(a: v2i64, b: v2i64) -> v2i64 { + msa_ilvod_d(a, mem::transmute(b)) +} + +/// Vector Interleave Right +/// +/// The right half elements in vectors `a` (sixteen signed 8-bit integer numbers) +/// and vector `b` (sixteen signed 8-bit integer numbers) are copied to the result +/// (sixteen signed 8-bit integer numbers) +/// alternating one element from `a` with one element from `b`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvr.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvr_b(a: v16i8, b: v16i8) -> v16i8 { + msa_ilvr_b(a, mem::transmute(b)) +} + +/// Vector Interleave Right +/// +/// The right half elements in vectors `a` (eight signed 16-bit integer numbers) +/// and vector `b` (eight signed 16-bit integer numbers) are copied to the result +/// (eight signed 16-bit integer numbers) +/// alternating one element from `a` with one element from `b`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvr.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvr_h(a: v8i16, b: v8i16) -> v8i16 { + msa_ilvr_h(a, mem::transmute(b)) +} + +/// Vector Interleave Right +/// +/// The right half elements in vectors `a` (four signed 32-bit integer numbers) +/// and vector `b` (four signed 32-bit integer numbers) are copied to the result +/// (four signed 32-bit integer numbers) +/// alternating one element from `a` with one element from `b`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvr.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvr_w(a: v4i32, b: v4i32) -> v4i32 { + msa_ilvr_w(a, mem::transmute(b)) +} + +/// Vector Interleave Right +/// +/// The right half elements in vectors `a` (two signed 64-bit integer numbers) +/// and vector `b` (two signed 64-bit integer numbers) are copied to the result +/// (two signed 64-bit integer numbers) +/// alternating one element from `a` with one element from `b`. 
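The interleave family (`ilvev`, `ilvod`, `ilvl`, `ilvr`) differs only in which source lanes are picked before the alternating copy. The doc comments do not say which operand feeds the even result lanes, so the sketch below follows the MSA reference manual's pseudocode (`b` in the even lanes, `a` in the odd lanes); treat that ordering as an assumption to verify, not as part of the Rust-level contract. Shown on eight lanes for brevity:

    fn ilvev(a: [i8; 8], b: [i8; 8]) -> [i8; 8] {
        let mut r = [0i8; 8];
        for i in 0..4 {
            r[2 * i] = b[2 * i];     // even lanes of `b`
            r[2 * i + 1] = a[2 * i]; // even lanes of `a`
        }
        r
    }

    fn ilvr(a: [i8; 8], b: [i8; 8]) -> [i8; 8] {
        let mut r = [0i8; 8];
        for i in 0..4 {
            r[2 * i] = b[i];         // right (low) half of `b`
            r[2 * i + 1] = a[i];     // right (low) half of `a`
        }
        r
    }

    fn main() {
        let a = [10, 11, 12, 13, 14, 15, 16, 17];
        let b = [20, 21, 22, 23, 24, 25, 26, 27];
        assert_eq!(ilvev(a, b), [20, 10, 22, 12, 24, 14, 26, 16]);
        assert_eq!(ilvr(a, b), [20, 10, 21, 11, 22, 12, 23, 13]);
    }

`ilvod` and `ilvl` follow the same shape, reading the odd lanes and the left (high) halves respectively.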
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ilvr.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ilvr_d(a: v2i64, b: v2i64) -> v2i64 { + msa_ilvr_d(a, mem::transmute(b)) +} + +/// GPR Insert Element +/// +/// Set element `imm4` in vector `a` (sixteen signed 8-bit integer numbers) to GPR `c` value. +/// All other elements in vector `a` are unchanged. If the source GPR is wider than the +/// destination data format, the destination's elements will be set to the least significant bits of the GPR. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(insert.b, imm4 = 0b1111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_insert_b(a: v16i8, c: i32) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + msa_insert_b(a, IMM4, c) +} + +/// GPR Insert Element +/// +/// Set element `imm3` in vector `a` (eight signed 16-bit integer numbers) to GPR `c` value. +/// All other elements in vector `a` are unchanged. If the source GPR is wider than the +/// destination data format, the destination's elements will be set to the least significant bits of the GPR. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(insert.h, imm3 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_insert_h(a: v8i16, c: i32) -> v8i16 { + static_assert_uimm_bits!(IMM3, 3); + msa_insert_h(a, IMM3, c) +} + +/// GPR Insert Element +/// +/// Set element `imm2` in vector `a` (four signed 32-bit integer numbers) to GPR `c` value. +/// All other elements in vector `a` are unchanged. If the source GPR is wider than the +/// destination data format, the destination's elements will be set to the least significant bits of the GPR. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(insert.w, imm2 = 0b11))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_insert_w(a: v4i32, c: i32) -> v4i32 { + static_assert_uimm_bits!(IMM2, 2); + msa_insert_w(a, IMM2, c) +} + +/// GPR Insert Element +/// +/// Set element `imm1` in vector `a` (two signed 64-bit integer numbers) to GPR `c` value. +/// All other elements in vector `a` are unchanged. If the source GPR is wider than the +/// destination data format, the destination's elements will be set to the least significant bits of the GPR. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(insert.d, imm1 = 0b1))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_insert_d(a: v2i64, c: i64) -> v2i64 { + static_assert_uimm_bits!(IMM1, 1); + msa_insert_d(a, IMM1, c) +} + +/// Element Insert Element +/// +/// Set element `imm1` in the result vector `a` (sixteen signed 8-bit integer numbers) to element 0 +/// in vector `c` (sixteen signed 8-bit integer numbers) value. +/// All other elements in vector `a` are unchanged. 
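For the GPR insert forms just described, a sketch of the lane replacement on plain arrays (the const-generic helper mirrors the intrinsic's compile-time immediate but is illustrative only, not part of the crate):

    // Lane `IMM4` of `a` is replaced with the least significant 8 bits of the
    // general-purpose-register value `c`; every other lane is kept unchanged.
    fn insert_b_model<const IMM4: usize>(a: [i8; 16], c: i32) -> [i8; 16] {
        assert!(IMM4 < 16); // the real intrinsic checks the immediate at compile time
        let mut r = a;
        r[IMM4] = c as i8;  // the cast keeps only the low 8 bits
        r
    }

    fn main() {
        let r = insert_b_model::<3>([0i8; 16], 0x1234_5678);
        assert_eq!(r[3], 0x78);
        assert_eq!(r[0], 0);
    }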
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(insve.b, imm4 = 0b1111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_insve_b(a: v16i8, c: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + msa_insve_b(a, IMM4, c) +} + +/// Element Insert Element +/// +/// Set element `imm1` in the result vector `a` (eight signed 16-bit integer numbers) to element 0 +/// in vector `c` (eight signed 16-bit integer numbers) value. +/// All other elements in vector `a` are unchanged. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(insve.h, imm3 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_insve_h(a: v8i16, c: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM3, 3); + msa_insve_h(a, IMM3, c) +} + +/// Element Insert Element +/// +/// Set element `imm1` in the result vector `a` (four signed 32-bit integer numbers) to element 0 +/// in vector `c` (four signed 32-bit integer numbers) value. +/// All other elements in vector `a` are unchanged. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(insve.w, imm2 = 0b11))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_insve_w(a: v4i32, c: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM2, 2); + msa_insve_w(a, IMM2, c) +} + +/// Element Insert Element +/// +/// Set element `imm1` in the result vector `a` (two signed 64-bit integer numbers) to element 0 +/// in vector `c` (two signed 64-bit integer numbers) value. +/// All other elements in vector `a` are unchanged. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(insve.d, imm1 = 0b1))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_insve_d(a: v2i64, c: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM1, 1); + msa_insve_d(a, IMM1, c) +} + +/// Vector Load +/// +/// The WRLEN / 8 bytes at the effective memory location addressed by the base +/// `mem_addr` and the 10-bit signed immediate offset `imm_s10` are fetched and placed in +/// the vector (sixteen signed 8-bit integer numbers) value. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ld.b, imm_s10 = 0b1111111111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ld_b(mem_addr: *mut u8) -> v16i8 { + static_assert_simm_bits!(IMM_S10, 10); + msa_ld_b(mem_addr, IMM_S10) +} + +/// Vector Load +/// +/// The WRLEN / 8 bytes at the effective memory location addressed by the base +/// `mem_addr` and the 10-bit signed immediate offset `imm_s11` are fetched and placed in +/// the vector (eight signed 16-bit integer numbers) value. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ld.h, imm_s11 = 0b11111111111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ld_h(mem_addr: *mut u8) -> v8i16 { + static_assert_simm_bits!(IMM_S11, 11); + static_assert!(IMM_S11 % 2 == 0); + msa_ld_h(mem_addr, IMM_S11) +} + +/// Vector Load +/// +/// The WRLEN / 8 bytes at the effective memory location addressed by the base +/// `mem_addr` and the 10-bit signed immediate offset `imm_s12` are fetched and placed in +/// the vector (four signed 32-bit integer numbers) value. 
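The `insve` forms copy a single lane from another vector rather than from a GPR. A plain-Rust sketch with illustrative names:

    // Lane `IMM4` of `a` is overwritten with element 0 of `c`; all other lanes
    // of `a` pass through unchanged.
    fn insve_b_model<const IMM4: usize>(a: [i8; 16], c: [i8; 16]) -> [i8; 16] {
        assert!(IMM4 < 16);
        let mut r = a;
        r[IMM4] = c[0];
        r
    }

    fn main() {
        let a = [1i8; 16];
        let mut c = [0i8; 16];
        c[0] = 42;
        assert_eq!(insve_b_model::<5>(a, c)[5], 42);
    }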
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ld.w, imm_s12 = 0b111111111111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ld_w(mem_addr: *mut u8) -> v4i32 { + static_assert_simm_bits!(IMM_S12, 12); + static_assert!(IMM_S12 % 4 == 0); + msa_ld_w(mem_addr, IMM_S12) +} + +/// Vector Load +/// +/// The WRLEN / 8 bytes at the effective memory location addressed by the base +/// `mem_addr` and the 10-bit signed immediate offset `imm_s13` are fetched and placed in +/// the vector (two signed 64-bit integer numbers) value. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ld.d, imm_s13 = 0b1111111111111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ld_d(mem_addr: *mut u8) -> v2i64 { + static_assert_simm_bits!(IMM_S13, 13); + static_assert!(IMM_S13 % 8 == 0); + msa_ld_d(mem_addr, IMM_S13) +} + +/// Immediate Load +/// +/// The signed immediate imm_s10 is replicated in all vector +/// (sixteen signed 8-bit integer numbers) elements. For byte elements, +/// only the least significant 8 bits of imm_s10 will be used. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ldi.b, imm_s10 = 0b1111111111))] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ldi_b() -> v16i8 { + static_assert_simm_bits!(IMM_S10, 10); + msa_ldi_b(IMM_S10) +} + +/// Immediate Load +/// +/// The signed immediate imm_s10 is replicated in all vector +/// (eight signed 16-bit integer numbers) elements. For byte elements, +/// only the least significant 8 bits of imm_s10 will be used. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ldi.h, imm_s10 = 0b1111111111))] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ldi_h() -> v8i16 { + static_assert_simm_bits!(IMM_S10, 10); + msa_ldi_h(IMM_S10) +} + +/// Immediate Load +/// +/// The signed immediate imm_s10 is replicated in all vector +/// (four signed 32-bit integer numbers) elements. For byte elements, +/// only the least significant 8 bits of imm_s10 will be used. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ldi.w, imm_s10 = 0b1111111111))] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ldi_w() -> v4i32 { + static_assert_simm_bits!(IMM_S10, 10); + msa_ldi_w(IMM_S10) +} + +/// Immediate Load +/// +/// The signed immediate imm_s10 is replicated in all vector +/// (two signed 64-bit integer numbers) elements. For byte elements, +/// only the least significant 8 bits of imm_s10 will be used. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ldi.d, imm_s10 = 0b1111111111))] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ldi_d() -> v2i64 { + static_assert_simm_bits!(IMM_S10, 10); + msa_ldi_d(IMM_S10) +} + +/// Vector Fixed-Point Multiply and Add +/// +/// The products of fixed-point elements in `b` (eight signed 16-bit integer numbers) +/// by fixed-point elements in vector `c` (eight signed 16-bit integer numbers) +/// are added to the fixed-point elements in vector `a` (eight signed 16-bit integer numbers). +/// The multiplication result is not saturated, i.e. 
exact (-1) * (-1) = 1 is added to the destination. +/// The saturated fixed-point results are stored to vector `a`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(madd_q.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_madd_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 { + msa_madd_q_h(a, mem::transmute(b), c) +} + +/// Vector Fixed-Point Multiply and Add +/// +/// The products of fixed-point elements in `b` (four signed 32-bit integer numbers) +/// by fixed-point elements in vector `c` (four signed 32-bit integer numbers) +/// are added to the fixed-point elements in vector `a` (four signed 32-bit integer numbers). +/// The multiplication result is not saturated, i.e. exact (-1) * (-1) = 1 is added to the destination. +/// The saturated fixed-point results are stored to vector `a`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(madd_q.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_madd_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 { + msa_madd_q_w(a, mem::transmute(b), c) +} + +/// Vector Fixed-Point Multiply and Add Rounded +/// +/// The products of fixed-point elements in `b` (eight signed 16-bit integer numbers) +/// by fixed-point elements in vector `c` (eight signed 16-bit integer numbers) +/// are added to the fixed-point elements in vector `a` (eight signed 16-bit integer numbers). +/// The multiplication result is not saturated, i.e. exact (-1) * (-1) = 1 is added to the destination. +/// The rounded and saturated fixed-point results are stored to vector `a`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(maddr_q.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_maddr_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 { + msa_maddr_q_h(a, mem::transmute(b), c) +} + +/// Vector Fixed-Point Multiply and Add Rounded +/// +/// The products of fixed-point elements in `b` (four signed 32-bit integer numbers) +/// by fixed-point elements in vector `c` (four signed 32-bit integer numbers) +/// are added to the fixed-point elements in vector `a` (four signed 32-bit integer numbers). +/// The multiplication result is not saturated, i.e. exact (-1) * (-1) = 1 is added to the destination. +/// The rounded and saturated fixed-point results are stored to vector `a`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(maddr_q.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_maddr_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 { + msa_maddr_q_w(a, mem::transmute(b), c) +} + +/// Vector Multiply and Add +/// +/// The integer elements in vector `b` (sixteen signed 8-bit integer numbers) +/// are multiplied by integer elements in vector `c` (sixteen signed 8-bit integer numbers) +/// and added to the integer elements in vector `a` (sixteen signed 8-bit integer numbers). +/// The most significant half of the multiplication result is discarded. 
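Before the plain integer multiply-and-add forms, one plausible scalar model of the Q15 `madd_q` accumulation described above. This is a sketch under stated assumptions, not a bit-exact reimplementation: the exact internal width and truncation point are defined by the MSA manual. The product is kept exact, `a` is aligned to the same Q30 scale, and only the final sum is shifted back and saturated; `maddr_q` would add a rounding constant of `1 << 14` before that final shift.

    fn madd_q_h_model(a: i16, b: i16, c: i16) -> i16 {
        let acc = ((a as i64) << 15) + (b as i64) * (c as i64); // Q30 accumulator
        let shifted = acc >> 15;                                // back to Q15
        shifted.clamp(i16::MIN as i64, i16::MAX as i64) as i16  // saturate
    }

    fn main() {
        // (-1.0) * (-1.0) contributes exactly +1.0, which is not representable
        // in Q15, so the final result saturates to the largest positive value.
        let minus_one = i16::MIN;
        assert_eq!(madd_q_h_model(0, minus_one, minus_one), i16::MAX);
    }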
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(maddv.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_maddv_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8 { + msa_maddv_b(a, mem::transmute(b), c) +} + +/// Vector Multiply and Add +/// +/// The integer elements in vector `b` (eight signed 16-bit integer numbers) +/// are multiplied by integer elements in vector `c` (eight signed 16-bit integer numbers) +/// and added to the integer elements in vector `a` (eight signed 16-bit integer numbers). +/// The most significant half of the multiplication result is discarded. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(maddv.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_maddv_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 { + msa_maddv_h(a, mem::transmute(b), c) +} + +/// Vector Multiply and Add +/// +/// The integer elements in vector `b` (four signed 32-bit integer numbers) +/// are multiplied by integer elements in vector `c` (four signed 32-bit integer numbers) +/// and added to the integer elements in vector `a` (four signed 32-bit integer numbers). +/// The most significant half of the multiplication result is discarded. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(maddv.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_maddv_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 { + msa_maddv_w(a, mem::transmute(b), c) +} + +/// Vector Multiply and Add +/// +/// The integer elements in vector `b` (two signed 64-bit integer numbers) +/// are multiplied by integer elements in vector `c` (two signed 64-bit integer numbers) +/// and added to the integer elements in vector `a` (two signed 64-bit integer numbers). +/// The most significant half of the multiplication result is discarded. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(maddv.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_maddv_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64 { + msa_maddv_d(a, mem::transmute(b), c) +} + +/// Vector Maximum Based on Absolute Values +/// +/// The value with the largest magnitude, i.e. absolute value, between corresponding +/// signed elements in vector `a` (sixteen signed 8-bit integer numbers) and +/// `b` (sixteen signed 8-bit integer numbers) are written to vector +/// (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(max_a.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_max_a_b(a: v16i8, b: v16i8) -> v16i8 { + msa_max_a_b(a, mem::transmute(b)) +} + +/// Vector Maximum Based on Absolute Values +/// +/// The value with the largest magnitude, i.e. absolute value, between corresponding +/// signed elements in vector `a` (eight signed 16-bit integer numbers) and +/// `b` (eight signed 16-bit integer numbers) are written to vector +/// (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(max_a.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_max_a_h(a: v8i16, b: v8i16) -> v8i16 { + msa_max_a_h(a, mem::transmute(b)) +} + +/// Vector Maximum Based on Absolute Values +/// +/// The value with the largest magnitude, i.e. 
absolute value, between corresponding +/// signed elements in vector `a` (four signed 32-bit integer numbers) and +/// `b` (four signed 32-bit integer numbers) are written to vector +/// (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(max_a.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_max_a_w(a: v4i32, b: v4i32) -> v4i32 { + msa_max_a_w(a, mem::transmute(b)) +} + +/// Vector Maximum Based on Absolute Values +/// +/// The value with the largest magnitude, i.e. absolute value, between corresponding +/// signed elements in vector `a` (two signed 64-bit integer numbers) and +/// `b` (two signed 64-bit integer numbers) are written to vector +/// (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(max_a.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_max_a_d(a: v2i64, b: v2i64) -> v2i64 { + msa_max_a_d(a, mem::transmute(b)) +} + +/// Vector Signed Maximum +/// +/// Maximum values between signed elements in vector `a` (sixteen signed 8-bit integer numbers) +/// and signed elements in vector `b` (sixteen signed 8-bit integer numbers) are written to vector +/// (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(max_s.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_max_s_b(a: v16i8, b: v16i8) -> v16i8 { + msa_max_s_b(a, mem::transmute(b)) +} + +/// Vector Signed Maximum +/// +/// Maximum values between signed elements in vector `a` (eight signed 16-bit integer numbers) +/// and signed elements in vector `b` (eight signed 16-bit integer numbers) are written to vector +/// (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(max_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_max_s_h(a: v8i16, b: v8i16) -> v8i16 { + msa_max_s_h(a, mem::transmute(b)) +} + +/// Vector Signed Maximum +/// +/// Maximum values between signed elements in vector `a` (four signed 32-bit integer numbers) +/// and signed elements in vector `b` (four signed 32-bit integer numbers) are written to vector +/// (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(max_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_max_s_w(a: v4i32, b: v4i32) -> v4i32 { + msa_max_s_w(a, mem::transmute(b)) +} + +/// Vector Signed Maximum +/// +/// Maximum values between signed elements in vector `a` (two signed 64-bit integer numbers) +/// and signed elements in vector `b` (two signed 64-bit integer numbers) are written to vector +/// (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(max_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_max_s_d(a: v2i64, b: v2i64) -> v2i64 { + msa_max_s_d(a, mem::transmute(b)) +} + +/// Vector Unsigned Maximum +/// +/// Maximum values between unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers) +/// and unsigned elements in vector `b` (sixteen unsigned 8-bit integer numbers) are written to vector +/// (sixteen unsigned 8-bit integer numbers). 
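One lane of the "maximum based on absolute values" comparison, as a scalar sketch. Widening to `i32` avoids the overflow of `i8::MIN.abs()`; how the hardware breaks a tie between equal magnitudes (for example -3 versus 3) is not spelled out in the doc comments, so the choice below is an assumption of the sketch:

    fn max_a_b_lane(x: i8, y: i8) -> i8 {
        if (x as i32).abs() >= (y as i32).abs() { x } else { y }
    }

    fn main() {
        assert_eq!(max_a_b_lane(-7, 5), -7);    // larger magnitude wins, sign is kept
        assert_eq!(max_a_b_lane(2, -128), -128);
    }

The `min_a` forms further down mirror this with the comparison reversed.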
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(max_u.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_max_u_b(a: v16u8, b: v16u8) -> v16u8 { + msa_max_u_b(a, mem::transmute(b)) +} + +/// Vector Unsigned Maximum +/// +/// Maximum values between unsigned elements in vector `a` (eight unsigned 16-bit integer numbers) +/// and unsigned elements in vector `b` (eight unsigned 16-bit integer numbers) are written to vector +/// (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(max_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_max_u_h(a: v8u16, b: v8u16) -> v8u16 { + msa_max_u_h(a, mem::transmute(b)) +} + +/// Vector Unsigned Maximum +/// +/// Maximum values between unsigned elements in vector `a` (four unsigned 32-bit integer numbers) +/// and unsigned elements in vector `b` (four unsigned 32-bit integer numbers) are written to vector +/// (four unsigned 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(max_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_max_u_w(a: v4u32, b: v4u32) -> v4u32 { + msa_max_u_w(a, mem::transmute(b)) +} + +/// Vector Unsigned Maximum +/// +/// Maximum values between unsigned elements in vector `a` (two unsigned 64-bit integer numbers) +/// and unsigned elements in vector `b` (two unsigned 64-bit integer numbers) are written to vector +/// (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(max_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_max_u_d(a: v2u64, b: v2u64) -> v2u64 { + msa_max_u_d(a, mem::transmute(b)) +} + +/// Immediate Signed Maximum +/// +/// Maximum values between signed elements in vector `a` (sixteen signed 8-bit integer numbers) +/// and the 5-bit signed immediate imm_s5 are written to vector +/// (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(maxi_s.b, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_maxi_s_b(a: v16i8) -> v16i8 { + static_assert_simm_bits!(IMM_S5, 5); + msa_maxi_s_b(a, IMM_S5) +} + +/// Immediate Signed Maximum +/// +/// Maximum values between signed elements in vector `a` (eight signed 16-bit integer numbers) +/// and the 5-bit signed immediate imm_s5 are written to vector +/// (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(maxi_s.h, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_maxi_s_h(a: v8i16) -> v8i16 { + static_assert_simm_bits!(IMM_S5, 5); + msa_maxi_s_h(a, IMM_S5) +} + +/// Immediate Signed Maximum +/// +/// Maximum values between signed elements in vector `a` (four signed 32-bit integer numbers) +/// and the 5-bit signed immediate imm_s5 are written to vector +/// (four signed 32-bit integer numbers). 
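The immediate maximum forms behave as if the 5-bit signed immediate (range -16..=15) were replicated across every lane before an elementwise signed maximum. A sketch with an illustrative const-generic helper; the range check here is a runtime assert, whereas the real intrinsic enforces it at compile time through `static_assert_simm_bits!`:

    fn maxi_s_b_model<const IMM_S5: i32>(a: [i8; 16]) -> [i8; 16] {
        assert!((-16..=15).contains(&IMM_S5));
        let mut r = a;
        for lane in r.iter_mut() {
            *lane = (*lane).max(IMM_S5 as i8);
        }
        r
    }

    fn main() {
        let a = [-20i8, -1, 0, 7, 120, -16, 3, -3, 5, -5, 1, -1, 2, -2, 0, 0];
        let r = maxi_s_b_model::<{ -4 }>(a);
        assert_eq!(r[0], -4);  // -20 raised to the immediate
        assert_eq!(r[4], 120); // already larger, unchanged
    }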
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(maxi_s.w, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_maxi_s_w(a: v4i32) -> v4i32 { + static_assert_simm_bits!(IMM_S5, 5); + msa_maxi_s_w(a, IMM_S5) +} + +/// Immediate Signed Maximum +/// +/// Maximum values between signed elements in vector `a` (two signed 64-bit integer numbers) +/// and the 5-bit signed immediate imm_s5 are written to vector +/// (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(maxi_s.d, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_maxi_s_d(a: v2i64) -> v2i64 { + static_assert_simm_bits!(IMM_S5, 5); + msa_maxi_s_d(a, IMM_S5) +} + +/// Immediate Unsigned Maximum +/// +/// Maximum values between unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers) +/// and the 5-bit unsigned immediate `imm5` are written to vector +/// (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(maxi_u.b, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_maxi_u_b(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM5, 5); + msa_maxi_u_b(a, IMM5) +} + +/// Immediate Unsigned Maximum +/// +/// Maximum values between unsigned elements in vector `a` (eight unsigned 16-bit integer numbers) +/// and the 5-bit unsigned immediate `imm5` are written to vector +/// (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(maxi_u.h, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_maxi_u_h(a: v8u16) -> v8u16 { + static_assert_uimm_bits!(IMM5, 5); + msa_maxi_u_h(a, IMM5) +} + +/// Immediate Unsigned Maximum +/// +/// Maximum values between unsigned elements in vector `a` (four unsigned 32-bit integer numbers) +/// and the 5-bit unsigned immediate `imm5` are written to vector +/// (four unsigned 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(maxi_u.w, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_maxi_u_w(a: v4u32) -> v4u32 { + static_assert_uimm_bits!(IMM5, 5); + msa_maxi_u_w(a, IMM5) +} + +/// Immediate Unsigned Maximum +/// +/// Maximum values between unsigned elements in vector `a` (two unsigned 64-bit integer numbers) +/// and the 5-bit unsigned immediate `imm5` are written to vector +/// (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(maxi_u.d, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_maxi_u_d(a: v2u64) -> v2u64 { + static_assert_uimm_bits!(IMM5, 5); + msa_maxi_u_d(a, IMM5) +} + +/// Vector Minimum Based on Absolute Value +/// +/// The value with the smallest magnitude, i.e. absolute value, between corresponding +/// signed elements in vector `a` (sixteen signed 8-bit integer numbers) and +/// `b` (sixteen signed 8-bit integer numbers) are written to vector +/// (sixteen signed 8-bit integer numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(min_a.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_min_a_b(a: v16i8, b: v16i8) -> v16i8 { + msa_min_a_b(a, mem::transmute(b)) +} + +/// Vector Minimum Based on Absolute Value +/// +/// The value with the smallest magnitude, i.e. absolute value, between corresponding +/// signed elements in vector `a` (eight signed 16-bit integer numbers) and +/// `b` (eight signed 16-bit integer numbers) are written to vector +/// (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(min_a.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_min_a_h(a: v8i16, b: v8i16) -> v8i16 { + msa_min_a_h(a, mem::transmute(b)) +} + +/// Vector Minimum Based on Absolute Value +/// +/// The value with the smallest magnitude, i.e. absolute value, between corresponding +/// signed elements in vector `a` (four signed 32-bit integer numbers) and +/// `b` (four signed 32-bit integer numbers) are written to vector +/// (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(min_a.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_min_a_w(a: v4i32, b: v4i32) -> v4i32 { + msa_min_a_w(a, mem::transmute(b)) +} + +/// Vector Minimum Based on Absolute Value +/// +/// The value with the smallest magnitude, i.e. absolute value, between corresponding +/// signed elements in vector `a` (two signed 64-bit integer numbers) and +/// `b` (two signed 64-bit integer numbers) are written to vector +/// (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(min_a.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_min_a_d(a: v2i64, b: v2i64) -> v2i64 { + msa_min_a_d(a, mem::transmute(b)) +} + +/// Vector Signed Minimum +/// +/// Minimum values between signed elements in vector `a` (sixteen signed 8-bit integer numbers) +/// and signed elements in vector `b` (sixteen signed 8-bit integer numbers) are written to vector +/// (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(min_s.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_min_s_b(a: v16i8, b: v16i8) -> v16i8 { + msa_min_s_b(a, mem::transmute(b)) +} + +/// Vector Signed Minimum +/// +/// Minimum values between signed elements in vector `a` (eight signed 16-bit integer numbers) +/// and signed elements in vector `b` (eight signed 16-bit integer numbers) are written to vector +/// (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(min_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_min_s_h(a: v8i16, b: v8i16) -> v8i16 { + msa_min_s_h(a, mem::transmute(b)) +} + +/// Vector Signed Minimum +/// +/// Minimum values between signed elements in vector `a` (four signed 32-bit integer numbers) +/// and signed elements in vector `b` (four signed 32-bit integer numbers) are written to vector +/// (four signed 32-bit integer numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(min_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_min_s_w(a: v4i32, b: v4i32) -> v4i32 { + msa_min_s_w(a, mem::transmute(b)) +} + +/// Vector Signed Minimum +/// +/// Minimum values between signed elements in vector `a` (two signed 64-bit integer numbers) +/// and signed elements in vector `b` (two signed 64-bit integer numbers) are written to vector +/// (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(min_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_min_s_d(a: v2i64, b: v2i64) -> v2i64 { + msa_min_s_d(a, mem::transmute(b)) +} + +/// Immediate Signed Minimum +/// +/// Minimum values between signed elements in vector `a` (sixteen signed 8-bit integer numbers) +/// and the 5-bit signed immediate imm_s5 are written to vector +/// (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mini_s.b, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mini_s_b(a: v16i8) -> v16i8 { + static_assert_simm_bits!(IMM_S5, 5); + msa_mini_s_b(a, IMM_S5) +} + +/// Immediate Signed Minimum +/// +/// Minimum values between signed elements in vector `a` (eight signed 16-bit integer numbers) +/// and the 5-bit signed immediate imm_s5 are written to vector +/// (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mini_s.h, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mini_s_h(a: v8i16) -> v8i16 { + static_assert_simm_bits!(IMM_S5, 5); + msa_mini_s_h(a, IMM_S5) +} + +/// Immediate Signed Minimum +/// +/// Minimum values between signed elements in vector `a` (four signed 32-bit integer numbers) +/// and the 5-bit signed immediate imm_s5 are written to vector +/// (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mini_s.w, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mini_s_w(a: v4i32) -> v4i32 { + static_assert_simm_bits!(IMM_S5, 5); + msa_mini_s_w(a, IMM_S5) +} + +/// Immediate Signed Minimum +/// +/// Minimum values between signed elements in vector `a` (two signed 64-bit integer numbers) +/// and the 5-bit signed immediate imm_s5 are written to vector +/// (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mini_s.d, imm_s5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mini_s_d(a: v2i64) -> v2i64 { + static_assert_simm_bits!(IMM_S5, 5); + msa_mini_s_d(a, IMM_S5) +} + +/// Vector Unsigned Minimum +/// +/// Minimum values between unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers) +/// and unsigned elements in vector `b` (sixteen unsigned 8-bit integer numbers) are written to vector +/// (sixteen unsigned 8-bit integer numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(min_u.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_min_u_b(a: v16u8, b: v16u8) -> v16u8 { + msa_min_u_b(a, mem::transmute(b)) +} + +/// Vector Unsigned Minimum +/// +/// Minimum values between unsigned elements in vector `a` (eight unsigned 16-bit integer numbers) +/// and unsigned elements in vector `b` (eight unsigned 16-bit integer numbers) are written to vector +/// (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(min_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_min_u_h(a: v8u16, b: v8u16) -> v8u16 { + msa_min_u_h(a, mem::transmute(b)) +} + +/// Vector Unsigned Minimum +/// +/// Minimum values between unsigned elements in vector `a` (four unsigned 32-bit integer numbers) +/// and unsigned elements in vector `b` (four unsigned 32-bit integer numbers) are written to vector +/// (four unsigned 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(min_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_min_u_w(a: v4u32, b: v4u32) -> v4u32 { + msa_min_u_w(a, mem::transmute(b)) +} + +/// Vector Unsigned Minimum +/// +/// Minimum values between unsigned elements in vector `a` (two unsigned 64-bit integer numbers) +/// and unsigned elements in vector `b` (two unsigned 64-bit integer numbers) are written to vector +/// (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(min_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_min_u_d(a: v2u64, b: v2u64) -> v2u64 { + msa_min_u_d(a, mem::transmute(b)) +} + +/// Immediate Unsigned Minimum +/// +/// Minimum values between unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers) +/// and the 5-bit unsigned immediate `imm5` are written to vector +/// (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mini_u.b, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mini_u_b(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM5, 5); + msa_mini_u_b(a, IMM5) +} + +/// Immediate Unsigned Minimum +/// +/// Minimum values between unsigned elements in vector `a` (eight unsigned 16-bit integer numbers) +/// and the 5-bit unsigned immediate `imm5` are written to vector +/// (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mini_u.h, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mini_u_h(a: v8u16) -> v8u16 { + static_assert_uimm_bits!(IMM5, 5); + msa_mini_u_h(a, IMM5) +} + +/// Immediate Unsigned Minimum +/// +/// Minimum values between unsigned elements in vector `a` (four unsigned 32-bit integer numbers) +/// and the 5-bit unsigned immediate `imm5` are written to vector +/// (four unsigned 32-bit integer numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mini_u.w, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mini_u_w(a: v4u32) -> v4u32 { + static_assert_uimm_bits!(IMM5, 5); + msa_mini_u_w(a, IMM5) +} + +/// Immediate Unsigned Minimum +/// +/// Minimum values between unsigned elements in vector `a` (two unsigned 64-bit integer numbers) +/// and the 5-bit unsigned immediate `imm5` are written to vector +/// (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mini_u.d, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mini_u_d(a: v2u64) -> v2u64 { + static_assert_uimm_bits!(IMM5, 5); + msa_mini_u_d(a, IMM5) +} + +/// Vector Signed Modulo +/// +/// The signed integer elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are divided by signed integer elements in vector `b` (sixteen signed 8-bit integer numbers). +/// The remainder of the same sign as the dividend is written to vector +/// (sixteen signed 8-bit integer numbers). If a divisor element vector `b` is zero, +/// the result value is UNPREDICTABLE. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mod_s.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mod_s_b(a: v16i8, b: v16i8) -> v16i8 { + msa_mod_s_b(a, mem::transmute(b)) +} + +/// Vector Signed Modulo +/// +/// The signed integer elements in vector `a` (eight signed 16-bit integer numbers) +/// are divided by signed integer elements in vector `b` (eight signed 16-bit integer numbers). +/// The remainder of the same sign as the dividend is written to vector +/// (eight signed 16-bit integer numbers). If a divisor element vector `b` is zero, +/// the result value is UNPREDICTABLE. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mod_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mod_s_h(a: v8i16, b: v8i16) -> v8i16 { + msa_mod_s_h(a, mem::transmute(b)) +} + +/// Vector Signed Modulo +/// +/// The signed integer elements in vector `a` (four signed 32-bit integer numbers) +/// are divided by signed integer elements in vector `b` (four signed 32-bit integer numbers). +/// The remainder of the same sign as the dividend is written to vector +/// (four signed 32-bit integer numbers). If a divisor element vector `b` is zero, +/// the result value is UNPREDICTABLE. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mod_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mod_s_w(a: v4i32, b: v4i32) -> v4i32 { + msa_mod_s_w(a, mem::transmute(b)) +} + +/// Vector Signed Modulo +/// +/// The signed integer elements in vector `a` (two signed 64-bit integer numbers) +/// are divided by signed integer elements in vector `b` (two signed 64-bit integer numbers). +/// The remainder of the same sign as the dividend is written to vector +/// (two signed 64-bit integer numbers). If a divisor element vector `b` is zero, +/// the result value is UNPREDICTABLE. 
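One lane of the signed modulo, sketched in plain Rust: the remainder takes the sign of the dividend, which is exactly what Rust's `%` on integers does (truncating division). A zero divisor is UNPREDICTABLE on the hardware, so the model refuses it rather than guessing a value:

    fn mod_s_lane(dividend: i16, divisor: i16) -> i16 {
        assert!(divisor != 0, "a zero divisor is UNPREDICTABLE for mod_s");
        dividend % divisor
    }

    fn main() {
        assert_eq!(mod_s_lane(-7, 3), -1); // sign follows the dividend
        assert_eq!(mod_s_lane(7, -3), 1);
    }

The unsigned `mod_u` forms below are the same picture with unsigned lane types.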
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mod_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mod_s_d(a: v2i64, b: v2i64) -> v2i64 { + msa_mod_s_d(a, mem::transmute(b)) +} + +/// Vector Unsigned Modulo +/// +/// The unsigned integer elements in vector `a` (sixteen unsigned 8-bit integer numbers) +/// are divided by unsigned integer elements in vector `b` (sixteen unsigned 8-bit integer numbers). +/// The remainder of the same sign as the dividend is written to vector +/// (sixteen unsigned 8-bit integer numbers). If a divisor element vector `b` is zero, +/// the result value is UNPREDICTABLE. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mod_u.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mod_u_b(a: v16u8, b: v16u8) -> v16u8 { + msa_mod_u_b(a, mem::transmute(b)) +} + +/// Vector Unsigned Modulo +/// +/// The unsigned integer elements in vector `a` (eight unsigned 16-bit integer numbers) +/// are divided by unsigned integer elements in vector `b` (eight unsigned 16-bit integer numbers). +/// The remainder of the same sign as the dividend is written to vector +/// (eight unsigned 16-bit integer numbers). If a divisor element vector `b` is zero, +/// the result value is UNPREDICTABLE. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mod_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mod_u_h(a: v8u16, b: v8u16) -> v8u16 { + msa_mod_u_h(a, mem::transmute(b)) +} + +/// Vector Unsigned Modulo +/// +/// The unsigned integer elements in vector `a` (four unsigned 32-bit integer numbers) +/// are divided by unsigned integer elements in vector `b` (four unsigned 32-bit integer numbers). +/// The remainder of the same sign as the dividend is written to vector +/// (four unsigned 32-bit integer numbers). If a divisor element vector `b` is zero, +/// the result value is UNPREDICTABLE. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mod_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mod_u_w(a: v4u32, b: v4u32) -> v4u32 { + msa_mod_u_w(a, mem::transmute(b)) +} + +/// Vector Unsigned Modulo +/// +/// The unsigned integer elements in vector `a` (two unsigned 64-bit integer numbers) +/// are divided by unsigned integer elements in vector `b` (two unsigned 64-bit integer numbers). +/// The remainder of the same sign as the dividend is written to vector +/// (two unsigned 64-bit integer numbers). If a divisor element vector `b` is zero, +/// the result value is UNPREDICTABLE. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mod_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mod_u_d(a: v2u64, b: v2u64) -> v2u64 { + msa_mod_u_d(a, mem::transmute(b)) +} + +/// Vector Move +/// +/// Copy all WRLEN bits in vector `a` (eight signed 16-bit integer numbers) +/// to vector (eight signed 16-bit integer numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(move.v))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_move_v(a: v16i8) -> v16i8 { + msa_move_v(a) +} + +/// Vector Fixed-Point Multiply and Subtract +/// +/// The product of fixed-point elements in vector `c` (eight signed 16-bit integer numbers) +/// by fixed-point elements in vector `b` (eight signed 16-bit integer numbers) +/// are subtracted from the fixed-point elements in vector `a` +/// (eight signed 16-bit integer numbers). The multiplication result is not saturated, +/// i.e. exact (-1) * (-1) = 1 is subtracted from the destination. +/// The saturated fixed-point results are stored back to vector `a`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(msub_q.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_msub_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 { + msa_msub_q_h(a, mem::transmute(b), c) +} + +/// Vector Fixed-Point Multiply and Subtract +/// +/// The product of fixed-point elements in vector `c` (four signed 32-bit integer numbers) +/// by fixed-point elements in vector `b` (four signed 32-bit integer numbers) +/// are subtracted from the fixed-point elements in vector `a` +/// (four signed 32-bit integer numbers). The multiplication result is not saturated, +/// i.e. exact (-1) * (-1) = 1 is subtracted from the destination. +/// The saturated fixed-point results are stored back to vector `a`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(msub_q.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_msub_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 { + msa_msub_q_w(a, mem::transmute(b), c) +} + +/// Vector Fixed-Point Multiply and Subtract Rounded +/// +/// The product of fixed-point elements in vector `c` (eight signed 16-bit integer numbers) +/// by fixed-point elements in vector `b` (eight signed 16-bit integer numbers) +/// are subtracted from the fixed-point elements in vector `a` +/// (eight signed 16-bit integer numbers). The multiplication result is not saturated, +/// i.e. exact (-1) * (-1) = 1 is subtracted from the destination. +/// The rounded and saturated fixed-point results are stored back to vector `a`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(msubr_q.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_msubr_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 { + msa_msubr_q_h(a, mem::transmute(b), c) +} + +/// Vector Fixed-Point Multiply and Subtract Rounded +/// +/// The product of fixed-point elements in vector `c` (four signed 32-bit integer numbers) +/// by fixed-point elements in vector `b` (four signed 32-bit integer numbers) +/// are subtracted from the fixed-point elements in vector `a` +/// (four signed 32-bit integer numbers). The multiplication result is not saturated, +/// i.e. exact (-1) * (-1) = 1 is subtracted from the destination. +/// The rounded and saturated fixed-point results are stored back to vector `a`. 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(msubr_q.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_msubr_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 { + msa_msubr_q_w(a, mem::transmute(b), c) +} + +/// Vector Multiply and Subtract +/// +/// The integer elements in vector `c` (sixteen signed 8-bit integer numbers) +/// are multiplied by integer elements in vector `b` (sixteen signed 8-bit integer numbers) +/// and subtracted from the integer elements in vector `a` (sixteen signed 8-bit integer numbers). +/// The most significant half of the multiplication result is discarded. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(msubv.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_msubv_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8 { + msa_msubv_b(a, mem::transmute(b), c) +} + +/// Vector Multiply and Subtract +/// +/// The integer elements in vector `c` (eight signed 16-bit integer numbers) +/// are multiplied by integer elements in vector `b` (eight signed 16-bit integer numbers) +/// and subtracted from the integer elements in vector `a` (eight signed 16-bit integer numbers). +/// The most significant half of the multiplication result is discarded. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(msubv.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_msubv_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 { + msa_msubv_h(a, mem::transmute(b), c) +} + +/// Vector Multiply and Subtract +/// +/// The integer elements in vector `c` (four signed 32-bit integer numbers) +/// are multiplied by integer elements in vector `b` (four signed 32-bit integer numbers) +/// and subtracted from the integer elements in vector `a` (four signed 32-bit integer numbers). +/// The most significant half of the multiplication result is discarded. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(msubv.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_msubv_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 { + msa_msubv_w(a, mem::transmute(b), c) +} + +/// Vector Multiply and Subtract +/// +/// The integer elements in vector `c` (two signed 64-bit integer numbers) +/// are multiplied by integer elements in vector `b` (two signed 64-bit integer numbers) +/// and subtracted from the integer elements in vector `a` (two signed 64-bit integer numbers). +/// The most significant half of the multiplication result is discarded. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(msubv.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_msubv_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64 { + msa_msubv_d(a, mem::transmute(b), c) +} + +/// Vector Fixed-Point Multiply +/// +/// The fixed-point elements in vector `a` (eight signed 16-bit integer numbers) +/// multiplied by fixed-point elements in vector `b` (eight signed 16-bit integer numbers). +/// The result is written to vector (eight signed 16-bit integer numbers). 
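For the integer multiply-and-subtract family, "the most significant half of the multiplication result is discarded" amounts to a wrapping (low-half) multiply at the Rust level, followed by a wrapping subtract from the accumulator. A one-lane sketch:

    fn msubv_b_lane(a: i8, b: i8, c: i8) -> i8 {
        a.wrapping_sub(b.wrapping_mul(c))
    }

    fn main() {
        assert_eq!(msubv_b_lane(10, 3, 4), -2); // 10 - 12
        assert_eq!(msubv_b_lane(0, 16, 16), 0); // 16 * 16 wraps to 0 in 8 bits
    }

`maddv` earlier in the file is the same shape with `wrapping_add` in place of the subtraction.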
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mul_q.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mul_q_h(a: v8i16, b: v8i16) -> v8i16 { + msa_mul_q_h(a, mem::transmute(b)) +} + +/// Vector Fixed-Point Multiply +/// +/// The fixed-point elements in vector `a` (four signed 32-bit integer numbers) +/// multiplied by fixed-point elements in vector `b` (four signed 32-bit integer numbers). +/// The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mul_q.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mul_q_w(a: v4i32, b: v4i32) -> v4i32 { + msa_mul_q_w(a, mem::transmute(b)) +} + +/// Vector Fixed-Point Multiply Rounded +/// +/// The fixed-point elements in vector `a` (eight signed 16-bit integer numbers) +/// multiplied by fixed-point elements in vector `b` (eight signed 16-bit integer numbers). +/// The rounded result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mulr_q.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mulr_q_h(a: v8i16, b: v8i16) -> v8i16 { + msa_mulr_q_h(a, mem::transmute(b)) +} + +/// Vector Fixed-Point Multiply Rounded +/// +/// The fixed-point elements in vector `a` (four signed 32-bit integer numbers) +/// multiplied by fixed-point elements in vector `b` (four signed 32-bit integer numbers). +/// The rounded result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mulr_q.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mulr_q_w(a: v4i32, b: v4i32) -> v4i32 { + msa_mulr_q_w(a, mem::transmute(b)) +} + +/// Vector Multiply +/// +/// The integer elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are multiplied by integer elements in vector `b` (sixteen signed 8-bit integer numbers). +/// The result is written to vector (sixteen signed 8-bit integer numbers). +/// The most significant half of the multiplication result is discarded. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mulv.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mulv_b(a: v16i8, b: v16i8) -> v16i8 { + msa_mulv_b(a, mem::transmute(b)) +} + +/// Vector Multiply +/// +/// The integer elements in vector `a` (eight signed 16-bit integer numbers) +/// are multiplied by integer elements in vector `b` (eight signed 16-bit integer numbers). +/// The result is written to vector (eight signed 16-bit integer numbers). +/// The most significant half of the multiplication result is discarded. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mulv.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mulv_h(a: v8i16, b: v8i16) -> v8i16 { + msa_mulv_h(a, mem::transmute(b)) +} + +/// Vector Multiply +/// +/// The integer elements in vector `a` (four signed 32-bit integer numbers) +/// are multiplied by integer elements in vector `b` (four signed 32-bit integer numbers). +/// The result is written to vector (four signed 32-bit integer numbers). +/// The most significant half of the multiplication result is discarded. 
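The two Q15 multiply forms differ only in rounding. A plausible scalar contrast, not a bit-exact model: both keep the full 32-bit product, shift it back down to Q15 (the rounded form adds `1 << 14` first) and saturate so that (-1.0) * (-1.0) stays at the largest positive Q15 value:

    fn sat_q15(x: i32) -> i16 {
        x.clamp(i16::MIN as i32, i16::MAX as i32) as i16
    }

    fn mul_q_h_model(a: i16, b: i16) -> i16 {
        sat_q15((a as i32 * b as i32) >> 15)
    }

    fn mulr_q_h_model(a: i16, b: i16) -> i16 {
        sat_q15((a as i32 * b as i32 + (1 << 14)) >> 15)
    }

    fn main() {
        let a = 0x4000; // 0.5 in Q15
        let b = 0x0001; // the smallest positive Q15 value
        assert_eq!(mul_q_h_model(a, b), 0);  // truncated away
        assert_eq!(mulr_q_h_model(a, b), 1); // rounded up
        assert_eq!(mul_q_h_model(i16::MIN, i16::MIN), i16::MAX); // (-1)*(-1) saturates
    }

The plain `mulv` forms keep only the low half of the product, i.e. they are ordinary wrapping multiplication.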
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mulv.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mulv_w(a: v4i32, b: v4i32) -> v4i32 { + msa_mulv_w(a, mem::transmute(b)) +} + +/// Vector Multiply +/// +/// The integer elements in vector `a` (two signed 64-bit integer numbers) +/// are multiplied by integer elements in vector `b` (two signed 64-bit integer numbers). +/// The result is written to vector (two signed 64-bit integer numbers). +/// The most significant half of the multiplication result is discarded. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(mulv.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_mulv_d(a: v2i64, b: v2i64) -> v2i64 { + msa_mulv_d(a, mem::transmute(b)) +} + +/// Vector Leading Ones Count +/// +/// The number of leading ones for elements in vector `a` (sixteen signed 8-bit integer numbers) +/// is stored to the elements in vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(nloc.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_nloc_b(a: v16i8) -> v16i8 { + msa_nloc_b(a) +} + +/// Vector Leading Ones Count +/// +/// The number of leading ones for elements in vector `a` (eight signed 16-bit integer numbers) +/// is stored to the elements in vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(nloc.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_nloc_h(a: v8i16) -> v8i16 { + msa_nloc_h(a) +} + +/// Vector Leading Ones Count +/// +/// The number of leading ones for elements in vector `a` (four signed 32-bit integer numbers) +/// is stored to the elements in vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(nloc.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_nloc_w(a: v4i32) -> v4i32 { + msa_nloc_w(a) +} + +/// Vector Leading Ones Count +/// +/// The number of leading ones for elements in vector `a` (two signed 64-bit integer numbers) +/// is stored to the elements in vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(nloc.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_nloc_d(a: v2i64) -> v2i64 { + msa_nloc_d(a) +} + +/// Vector Leading Zeros Count +/// +/// The number of leading zeros for elements in vector `a` (sixteen signed 8-bit integer numbers) +/// is stored to the elements in vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(nlzc.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_nlzc_b(a: v16i8) -> v16i8 { + msa_nlzc_b(a) +} + +/// Vector Leading Zeros Count +/// +/// The number of leading zeros for elements in vector `a` (eight signed 16-bit integer numbers) +/// is stored to the elements in vector (eight signed 16-bit integer numbers). 
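+///
+/// A minimal usage sketch (illustrative only; the input values are hypothetical):
+///
+/// ```ignore
+/// // assumes an unsafe block with core::arch::mips::* and core::mem in scope
+/// let a: v8i16 = mem::transmute([0x00FFi16, 0x0001, 0x0100, -1, 0, 0, 0, 0]);
+/// let r = __msa_nlzc_h(a); // lanes: 8, 15, 7, 0, 16, 16, 16, 16
+/// ```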
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(nlzc.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_nlzc_h(a: v8i16) -> v8i16 { + msa_nlzc_h(a) +} + +/// Vector Leading Zeros Count +/// +/// The number of leading zeros for elements in vector `a` (four signed 32-bit integer numbers) +/// is stored to the elements in vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(nlzc.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_nlzc_w(a: v4i32) -> v4i32 { + msa_nlzc_w(a) +} + +/// Vector Leading Zeros Count +/// +/// The number of leading zeros for elements in vector `a` (two signed 64-bit integer numbers) +/// is stored to the elements in vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(nlzc.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_nlzc_d(a: v2i64) -> v2i64 { + msa_nlzc_d(a) +} + +/// Vector Logical Negated Or +/// +/// Each bit of vector `a` (sixteen unsigned 8-bit integer numbers) +/// is combined with the corresponding bit of vector `b` (sixteen unsigned 8-bit integer numbers) +/// in a bitwise logical NOR operation. The result is written to vector +/// (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(nor.v))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_nor_v(a: v16u8, b: v16u8) -> v16u8 { + msa_nor_v(a, mem::transmute(b)) +} + +/// Immediate Logical Negated Or +/// +/// Each bit of vector `a` (sixteen unsigned 8-bit integer numbers) +/// is combined with the 8-bit immediate `imm8` +/// in a bitwise logical NOR operation. The result is written to vector +/// (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(nori.b, imm8 = 0b11111111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_nori_b(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM8, 8); + msa_nori_b(a, IMM8) +} + +/// Vector Logical Or +/// +/// Each bit of vector `a` (sixteen unsigned 8-bit integer numbers) +/// is combined with the corresponding bit of vector `b` (sixteen unsigned 8-bit integer numbers) +/// in a bitwise logical OR operation. The result is written to vector +/// (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(or.v))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_or_v(a: v16u8, b: v16u8) -> v16u8 { + msa_or_v(a, mem::transmute(b)) +} + +/// Immediate Logical Or +/// +/// Each bit of vector `a` (sixteen unsigned 8-bit integer numbers) +/// is combined with the 8-bit immediate `imm8` +/// in a bitwise logical OR operation. The result is written to vector +/// (sixteen unsigned 8-bit integer numbers). 
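+///
+/// A minimal usage sketch (illustrative only; the immediate and input values are hypothetical):
+///
+/// ```ignore
+/// // assumes an unsafe block with core::arch::mips::* and core::mem in scope
+/// let a: v16u8 = mem::transmute([0x0Fu8; 16]);
+/// let r = __msa_ori_b(a, 0b1111_0000); // every byte becomes 0xFF
+/// ```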
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(ori.b, imm8 = 0b11111111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_ori_b(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM8, 8); + msa_ori_b(a, IMM8) +} + +/// Vector Pack Even +/// +/// Even elements in vectors `a` (sixteen signed 8-bit integer numbers) +/// are copied to the left half of the result vector and even elements in vector `b` +/// (sixteen signed 8-bit integer numbers) are copied to the right half of the result vector. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(pckev.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_pckev_b(a: v16i8, b: v16i8) -> v16i8 { + msa_pckev_b(a, mem::transmute(b)) +} + +/// Vector Pack Even +/// +/// Even elements in vectors `a` (eight signed 16-bit integer numbers) +/// are copied to the left half of the result vector and even elements in vector `b` +/// (eight signed 16-bit integer numbers) are copied to the right half of the result vector. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(pckev.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_pckev_h(a: v8i16, b: v8i16) -> v8i16 { + msa_pckev_h(a, mem::transmute(b)) +} + +/// Vector Pack Even +/// +/// Even elements in vectors `a` (four signed 32-bit integer numbers) +/// are copied to the left half of the result vector and even elements in vector `b` +/// (four signed 32-bit integer numbers) are copied to the right half of the result vector. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(pckev.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_pckev_w(a: v4i32, b: v4i32) -> v4i32 { + msa_pckev_w(a, mem::transmute(b)) +} + +/// Vector Pack Even +/// +/// Even elements in vectors `a` (two signed 64-bit integer numbers) +/// are copied to the left half of the result vector and even elements in vector `b` +/// (two signed 64-bit integer numbers) are copied to the right half of the result vector. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(pckev.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_pckev_d(a: v2i64, b: v2i64) -> v2i64 { + msa_pckev_d(a, mem::transmute(b)) +} + +/// Vector Pack Odd +/// +/// Odd elements in vectors `a` (sixteen signed 8-bit integer numbers) +/// are copied to the left half of the result vector and odd elements in vector `b` +/// (sixteen signed 8-bit integer numbers) are copied to the right half of the result vector. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(pckod.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_pckod_b(a: v16i8, b: v16i8) -> v16i8 { + msa_pckod_b(a, mem::transmute(b)) +} + +/// Vector Pack Odd +/// +/// Odd elements in vectors `a` (eight signed 16-bit integer numbers) +/// are copied to the left half of the result vector and odd elements in vector `b` +/// (eight signed 16-bit integer numbers) are copied to the right half of the result vector. 
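+///
+/// A minimal usage sketch (illustrative only; the input values are hypothetical
+/// and the expected lanes follow the description above):
+///
+/// ```ignore
+/// // assumes an unsafe block with core::arch::mips::* and core::mem in scope
+/// let a: v8i16 = mem::transmute([10i16, 11, 12, 13, 14, 15, 16, 17]);
+/// let b: v8i16 = mem::transmute([20i16, 21, 22, 23, 24, 25, 26, 27]);
+/// // odd elements of `b` fill the right half, odd elements of `a` the left half:
+/// let r = __msa_pckod_h(a, b); // lanes: 21, 23, 25, 27, 11, 13, 15, 17
+/// ```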
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(pckod.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_pckod_h(a: v8i16, b: v8i16) -> v8i16 { + msa_pckod_h(a, mem::transmute(b)) +} + +/// Vector Pack Odd +/// +/// Odd elements in vectors `a` (four signed 32-bit integer numbers) +/// are copied to the left half of the result vector and odd elements in vector `b` +/// (four signed 32-bit integer numbers) are copied to the right half of the result vector. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(pckod.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_pckod_w(a: v4i32, b: v4i32) -> v4i32 { + msa_pckod_w(a, mem::transmute(b)) +} + +/// Vector Pack Odd +/// +/// Odd elements in vectors `a` (two signed 64-bit integer numbers) +/// are copied to the left half of the result vector and odd elements in vector `b` +/// (two signed 64-bit integer numbers) are copied to the right half of the result vector. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(pckod.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_pckod_d(a: v2i64, b: v2i64) -> v2i64 { + msa_pckod_d(a, mem::transmute(b)) +} + +/// Vector Population Count +/// +/// The number of bits set to 1 for elements in vector `a` (sixteen signed 8-bit integer numbers) +/// is stored to the elements in the result vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(pcnt.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_pcnt_b(a: v16i8) -> v16i8 { + msa_pcnt_b(a) +} + +/// Vector Population Count +/// +/// The number of bits set to 1 for elements in vector `a` (eight signed 16-bit integer numbers) +/// is stored to the elements in the result vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(pcnt.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_pcnt_h(a: v8i16) -> v8i16 { + msa_pcnt_h(a) +} + +/// Vector Population Count +/// +/// The number of bits set to 1 for elements in vector `a` (four signed 32-bit integer numbers) +/// is stored to the elements in the result vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(pcnt.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_pcnt_w(a: v4i32) -> v4i32 { + msa_pcnt_w(a) +} + +/// Vector Population Count +/// +/// The number of bits set to 1 for elements in vector `a` (two signed 64-bit integer numbers) +/// is stored to the elements in the result vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(pcnt.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_pcnt_d(a: v2i64) -> v2i64 { + msa_pcnt_d(a) +} + +/// Immediate Signed Saturate +/// +/// Signed elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are saturated to signed values of `imm3+1` bits without changing the data width. +/// The result is stored in the vector (sixteen signed 8-bit integer numbers). 
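+///
+/// A minimal usage sketch (illustrative only; the immediate and input values are hypothetical):
+///
+/// ```ignore
+/// // assumes an unsafe block with core::arch::mips::* and core::mem in scope
+/// let a: v16i8 = mem::transmute([100i8, -100, 5, -5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+/// // saturate to 3 + 1 = 4 signed bits, i.e. the range [-8, 7]:
+/// let r = __msa_sat_s_b(a, 3); // lanes: 7, -8, 5, -5, 0, ...
+/// ```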
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sat_s.b, imm4 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sat_s_b(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM3, 3); + msa_sat_s_b(a, IMM3) +} + +/// Immediate Signed Saturate +/// +/// Signed elements in vector `a` (eight signed 16-bit integer numbers) +/// are saturated to signed values of `imm4+1` bits without changing the data width. +/// The result is stored in the vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sat_s.h, imm3 = 0b1111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sat_s_h(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM4, 4); + msa_sat_s_h(a, IMM4) +} + +/// Immediate Signed Saturate +/// +/// Signed elements in vector `a` (four signed 32-bit integer numbers) +/// are saturated to signed values of `imm5+1` bits without changing the data width. +/// The result is stored in the vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sat_s.w, imm2 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sat_s_w(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM5, 5); + msa_sat_s_w(a, IMM5) +} + +/// Immediate Signed Saturate +/// +/// Signed elements in vector `a` (two signed 64-bit integer numbers) +/// are saturated to signed values of `imm6+1` bits without changing the data width. +/// The result is stored in the vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sat_s.d, imm1 = 0b111111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sat_s_d(a: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM6, 6); + msa_sat_s_d(a, IMM6) +} + +/// Immediate Unsigned Saturate +/// +/// Unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers) +/// are saturated to unsigned values of `imm3+1` bits without changing the data width. +/// The result is stored in the vector (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sat_u.b, imm4 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sat_u_b(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM3, 3); + msa_sat_u_b(a, IMM3) +} + +/// Immediate Unsigned Saturate +/// +/// Unsigned elements in vector `a` (eight unsigned 16-bit integer numbers) +/// are saturated to unsigned values of `imm4+1` bits without changing the data width. +/// The result is stored in the vector (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sat_u.h, imm3 = 0b1111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sat_u_h(a: v8u16) -> v8u16 { + static_assert_uimm_bits!(IMM4, 4); + msa_sat_u_h(a, IMM4) +} + +/// Immediate Unsigned Saturate +/// +/// Unsigned elements in vector `a` (four unsigned 32-bit integer numbers) +/// are saturated to unsigned values of `imm5+1` bits without changing the data width. 
+/// The result is stored in the vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sat_u.w, imm2 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_sat_u_w<const IMM5: i32>(a: v4u32) -> v4u32 {
+    static_assert_uimm_bits!(IMM5, 5);
+    msa_sat_u_w(a, IMM5)
+}
+
+/// Immediate Unsigned Saturate
+///
+/// Unsigned elements in vector `a` (two unsigned 64-bit integer numbers)
+/// are saturated to unsigned values of `imm6+1` bits without changing the data width.
+/// The result is stored in the vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sat_u.d, imm1 = 0b111111))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_sat_u_d<const IMM6: i32>(a: v2u64) -> v2u64 {
+    static_assert_uimm_bits!(IMM6, 6);
+    msa_sat_u_d(a, IMM6)
+}
+
+/// Immediate Set Shuffle Elements
+///
+/// The set shuffle instruction works on 4-element sets.
+/// All sets are shuffled in the same way: the element of `a`
+/// (sixteen signed 8-bit integer numbers) whose index is given by bits 2i+1..2i
+/// of the immediate `imm8` is copied over the element i in the result vector
+/// (sixteen signed 8-bit integer numbers), where i is 0, 1, 2, 3.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(shf.b, imm8 = 0b11111111))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_shf_b<const IMM8: i32>(a: v16i8) -> v16i8 {
+    static_assert_uimm_bits!(IMM8, 8);
+    msa_shf_b(a, IMM8)
+}
+
+/// Immediate Set Shuffle Elements
+///
+/// The set shuffle instruction works on 4-element sets.
+/// All sets are shuffled in the same way: the element of `a`
+/// (eight signed 16-bit integer numbers) whose index is given by bits 2i+1..2i
+/// of the immediate `imm8` is copied over the element i in the result vector
+/// (eight signed 16-bit integer numbers), where i is 0, 1, 2, 3.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(shf.h, imm8 = 0b11111111))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_shf_h<const IMM8: i32>(a: v8i16) -> v8i16 {
+    static_assert_uimm_bits!(IMM8, 8);
+    msa_shf_h(a, IMM8)
+}
+
+/// Immediate Set Shuffle Elements
+///
+/// The set shuffle instruction works on 4-element sets.
+/// All sets are shuffled in the same way: the element of `a`
+/// (four signed 32-bit integer numbers) whose index is given by bits 2i+1..2i
+/// of the immediate `imm8` is copied over the element i in the result vector
+/// (four signed 32-bit integer numbers), where i is 0, 1, 2, 3.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(shf.w, imm8 = 0b11111111))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_shf_w<const IMM8: i32>(a: v4i32) -> v4i32 {
+    static_assert_uimm_bits!(IMM8, 8);
+    msa_shf_w(a, IMM8)
+}
+
+/// GPR Columns Slide
+///
+/// Vector registers `a` (sixteen signed 8-bit integer numbers) and `b`
+/// (sixteen signed 8-bit integer numbers) contain 2-dimensional byte arrays (rectangles)
+/// stored row-wise with as many rows as bytes in integer data format df.
+/// The two source rectangles `b` and `a` are concatenated horizontally in the order
+/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination
+/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b`
+/// by the number of columns given in GPR `c`.
+/// The result is written to vector (sixteen signed 8-bit integer numbers). +/// GPR `c` value is interpreted modulo the number of columns in destination rectangle, +/// or equivalently, the number of data format df elements in the destination vector. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sld.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sld_b(a: v16i8, b: v16i8, c: i32) -> v16i8 { + msa_sld_b(a, mem::transmute(b), c) +} + +/// GPR Columns Slide +/// +/// Vector registers `a` (eight signed 16-bit integer numbers) and `b` +/// (eight signed 16-bit integer numbers) contain 2-dimensional byte arrays (rectangles) +/// stored row-wise with as many rows as bytes in integer data format df. +/// The two source rectangles `b` and `a` are concatenated horizontally in the order +/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination +/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b` +/// by the number of columns given in GPR `c`. +/// The result is written to vector (eight signed 16-bit integer numbers). +/// GPR `c` value is interpreted modulo the number of columns in destination rectangle, +/// or equivalently, the number of data format df elements in the destination vector. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sld.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sld_h(a: v8i16, b: v8i16, c: i32) -> v8i16 { + msa_sld_h(a, mem::transmute(b), c) +} + +/// GPR Columns Slide +/// +/// Vector registers `a` (four signed 32-bit integer numbers) and `b` +/// (four signed 32-bit integer numbers) contain 2-dimensional byte arrays (rectangles) +/// stored row-wise with as many rows as bytes in integer data format df. +/// The two source rectangles `b` and `a` are concatenated horizontally in the order +/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination +/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b` +/// by the number of columns given in GPR `c`. +/// The result is written to vector (four signed 32-bit integer numbers). +/// GPR `c` value is interpreted modulo the number of columns in destination rectangle, +/// or equivalently, the number of data format df elements in the destination vector. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sld.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sld_w(a: v4i32, b: v4i32, c: i32) -> v4i32 { + msa_sld_w(a, mem::transmute(b), c) +} + +/// GPR Columns Slide +/// +/// Vector registers `a` (two signed 64-bit integer numbers) and `b` +/// (two signed 64-bit integer numbers) contain 2-dimensional byte arrays (rectangles) +/// stored row-wise with as many rows as bytes in integer data format df. +/// The two source rectangles `b` and `a` are concatenated horizontally in the order +/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination +/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b` +/// by the number of columns given in GPR `c`. +/// The result is written to vector (two signed 64-bit integer numbers). +/// GPR `c` value is interpreted modulo the number of columns in destination rectangle, +/// or equivalently, the number of data format df elements in the destination vector. 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sld.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sld_d(a: v2i64, b: v2i64, c: i32) -> v2i64 { + msa_sld_d(a, mem::transmute(b), c) +} + +/// Immediate Columns Slide +/// +/// Vector registers `a` (sixteen signed 8-bit integer numbers) and `b` +/// (sixteen signed 8-bit integer numbers) contain 2-dimensional byte arrays (rectangles) +/// stored row-wise with as many rows as bytes in integer data format df. +/// The two source rectangles `b` and `a` are concatenated horizontally in the order +/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination +/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b` +/// by `imm1` columns. +/// The result is written to vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sldi.b, imm4 = 0b1111))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sldi_b(a: v16i8, b: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + msa_sldi_b(a, mem::transmute(b), IMM4) +} + +/// Immediate Columns Slide +/// +/// Vector registers `a` (eight signed 16-bit integer numbers) and `b` +/// (eight signed 16-bit integer numbers) contain 2-dimensional byte arrays (rectangles) +/// stored row-wise with as many rows as bytes in integer data format df. +/// The two source rectangles `b` and `a` are concatenated horizontally in the order +/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination +/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b` +/// by `imm1` columns. +/// The result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sldi.h, imm3 = 0b111))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sldi_h(a: v8i16, b: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM3, 3); + msa_sldi_h(a, mem::transmute(b), IMM3) +} + +/// Immediate Columns Slide +/// +/// Vector registers `a` (four signed 32-bit integer numbers) and `b` +/// (four signed 32-bit integer numbers) contain 2-dimensional byte arrays (rectangles) +/// stored row-wise with as many rows as bytes in integer data format df. +/// The two source rectangles `b` and `a` are concatenated horizontally in the order +/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination +/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b` +/// by `imm1` columns. +/// The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sldi.w, imm2 = 0b11))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sldi_w(a: v4i32, b: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM2, 2); + msa_sldi_w(a, mem::transmute(b), IMM2) +} + +/// Immediate Columns Slide +/// +/// Vector registers `a` (two signed 64-bit integer numbers) and `b` +/// (two signed 64-bit integer numbers) contain 2-dimensional byte arrays (rectangles) +/// stored row-wise with as many rows as bytes in integer data format df. 
+/// The two source rectangles `b` and `a` are concatenated horizontally in the order +/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination +/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b` +/// by `imm1` columns. +/// The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sldi.d, imm1 = 0b1))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sldi_d(a: v2i64, b: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM1, 1); + msa_sldi_d(a, mem::transmute(b), IMM1) +} + +/// Vector Shift Left +/// +/// The elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are shifted left by the number of bits the elements in vector `b` +/// (sixteen signed 8-bit integer numbers) specify modulo the size of the +/// element in bits. The result is written to vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sll.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sll_b(a: v16i8, b: v16i8) -> v16i8 { + msa_sll_b(a, mem::transmute(b)) +} + +/// Vector Shift Left +/// +/// The elements in vector `a` (eight signed 16-bit integer numbers) +/// are shifted left by the number of bits the elements in vector `b` +/// (eight signed 16-bit integer numbers) specify modulo the size of the +/// element in bits. The result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sll.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sll_h(a: v8i16, b: v8i16) -> v8i16 { + msa_sll_h(a, mem::transmute(b)) +} + +/// Vector Shift Left +/// +/// The elements in vector `a` (four signed 32-bit integer numbers) +/// are shifted left by the number of bits the elements in vector `b` +/// (four signed 32-bit integer numbers) specify modulo the size of the +/// element in bits. The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sll.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sll_w(a: v4i32, b: v4i32) -> v4i32 { + msa_sll_w(a, mem::transmute(b)) +} + +/// Vector Shift Left +/// +/// The elements in vector `a` (two signed 64-bit integer numbers) +/// are shifted left by the number of bits the elements in vector `b` +/// (two signed 64-bit integer numbers) specify modulo the size of the +/// element in bits. The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sll.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sll_d(a: v2i64, b: v2i64) -> v2i64 { + msa_sll_d(a, mem::transmute(b)) +} + +/// Immediate Shift Left +/// +/// The elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are shifted left by `imm4` bits. +/// The result is written to vector (sixteen signed 8-bit integer numbers). 
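+///
+/// A minimal usage sketch (illustrative only; the shift amount and input values are hypothetical):
+///
+/// ```ignore
+/// // assumes an unsafe block with core::arch::mips::* and core::mem in scope
+/// let a: v16i8 = mem::transmute([0x01i8; 16]);
+/// let r = __msa_slli_b(a, 3); // every byte becomes 0x08
+/// ```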
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(slli.b, imm4 = 0b1111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_slli_b(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + msa_slli_b(a, IMM4) +} + +/// Immediate Shift Left +/// +/// The elements in vector `a` (eight signed 16-bit integer numbers) +/// are shifted left by `imm3` bits. +/// The result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(slli.h, imm3 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_slli_h(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM3, 3); + msa_slli_h(a, IMM3) +} + +/// Immediate Shift Left +/// +/// The elements in vector `a` (four signed 32-bit integer numbers) +/// are shifted left by `imm2` bits. +/// The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(slli.w, imm2 = 0b11))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_slli_w(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM2, 2); + msa_slli_w(a, IMM2) +} + +/// Immediate Shift Left +/// +/// The elements in vector `a` (two signed 64-bit integer numbers) +/// are shifted left by `imm1` bits. +/// The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(slli.d, imm1 = 0b1))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_slli_d(a: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM1, 1); + msa_slli_d(a, IMM1) +} + +/// GPR Element Splat +/// +/// Replicate vector `a` (sixteen signed 8-bit integer numbers) +/// element with index given by GPR `b` to all elements in vector +/// (sixteen signed 8-bit integer numbers) GPR `b` value is interpreted +/// modulo the number of data format df elements in the destination vector. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(splat.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_splat_b(a: v16i8, b: i32) -> v16i8 { + msa_splat_b(a, mem::transmute(b)) +} + +/// GPR Element Splat +/// +/// Replicate vector `a` (eight signed 16-bit integer numbers) +/// element with index given by GPR `b` to all elements in vector +/// (eight signed 16-bit integer numbers) GPR `b` value is interpreted +/// modulo the number of data format df elements in the destination vector. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(splat.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_splat_h(a: v8i16, b: i32) -> v8i16 { + msa_splat_h(a, mem::transmute(b)) +} + +/// GPR Element Splat +/// +/// Replicate vector `a` (four signed 32-bit integer numbers) +/// element with index given by GPR `b` to all elements in vector +/// (four signed 32-bit integer numbers) GPR `b` value is interpreted +/// modulo the number of data format df elements in the destination vector. 
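+///
+/// A minimal usage sketch (illustrative only; the index and input values are hypothetical):
+///
+/// ```ignore
+/// // assumes an unsafe block with core::arch::mips::* and core::mem in scope
+/// let a: v4i32 = mem::transmute([10i32, 20, 30, 40]);
+/// // the index is taken modulo 4, so 6 % 4 == 2 selects the value 30:
+/// let r = __msa_splat_w(a, 6); // lanes: 30, 30, 30, 30
+/// ```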
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(splat.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_splat_w(a: v4i32, b: i32) -> v4i32 { + msa_splat_w(a, mem::transmute(b)) +} + +/// GPR Element Splat +/// +/// Replicate vector `a` (two signed 64-bit integer numbers) +/// element with index given by GPR `b` to all elements in vector +/// (two signed 64-bit integer numbers) GPR `b` value is interpreted +/// modulo the number of data format df elements in the destination vector. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(splat.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_splat_d(a: v2i64, b: i32) -> v2i64 { + msa_splat_d(a, mem::transmute(b)) +} + +/// Immediate Element Splat +/// +/// Replicate element `imm4` in vector `a` (sixteen signed 8-bit integer numbers) +/// to all elements in vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(splati.b, imm4 = 0b1111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_splati_b(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + msa_splati_b(a, IMM4) +} + +/// Immediate Element Splat +/// +/// Replicate element `imm3` in vector `a` (eight signed 16-bit integer numbers) +/// to all elements in vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(splati.h, imm3 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_splati_h(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM3, 3); + msa_splati_h(a, IMM3) +} + +/// Immediate Element Splat +/// +/// Replicate element `imm2` in vector `a` (four signed 32-bit integer numbers) +/// to all elements in vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(splati.w, imm2 = 0b11))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_splati_w(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM2, 2); + msa_splati_w(a, IMM2) +} + +/// Immediate Element Splat +/// +/// Replicate element `imm1` in vector `a` (two signed 64-bit integer numbers) +/// to all elements in vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(splati.d, imm1 = 0b1))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_splati_d(a: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM1, 1); + msa_splati_d(a, IMM1) +} + +/// Vector Shift Right Arithmetic +/// +/// The elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are shifted right arithmetic by the number of bits the elements in vector `b` +/// (sixteen signed 8-bit integer numbers) specify modulo the size of the +/// element in bits.The result is written to vector (sixteen signed 8-bit integer numbers). 
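+///
+/// A minimal usage sketch (illustrative only; the input values are hypothetical):
+///
+/// ```ignore
+/// // assumes an unsafe block with core::arch::mips::* and core::mem in scope
+/// let a: v16i8 = mem::transmute([-16i8; 16]);
+/// let b: v16i8 = mem::transmute([2i8; 16]);
+/// let r = __msa_sra_b(a, b); // every lane is -4 (the sign bit is replicated)
+/// ```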
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sra.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sra_b(a: v16i8, b: v16i8) -> v16i8 { + msa_sra_b(a, mem::transmute(b)) +} + +/// Vector Shift Right Arithmetic +/// +/// The elements in vector `a` (eight signed 16-bit integer numbers) +/// are shifted right arithmetic by the number of bits the elements in vector `b` +/// (eight signed 16-bit integer numbers) specify modulo the size of the +/// element in bits.The result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sra.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sra_h(a: v8i16, b: v8i16) -> v8i16 { + msa_sra_h(a, mem::transmute(b)) +} + +/// Vector Shift Right Arithmetic +/// +/// The elements in vector `a` (four signed 32-bit integer numbers) +/// are shifted right arithmetic by the number of bits the elements in vector `b` +/// (four signed 32-bit integer numbers) specify modulo the size of the +/// element in bits.The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sra.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sra_w(a: v4i32, b: v4i32) -> v4i32 { + msa_sra_w(a, mem::transmute(b)) +} + +/// Vector Shift Right Arithmetic +/// +/// The elements in vector `a` (two signed 64-bit integer numbers) +/// are shifted right arithmetic by the number of bits the elements in vector `b` +/// (two signed 64-bit integer numbers) specify modulo the size of the +/// element in bits.The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(sra.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_sra_d(a: v2i64, b: v2i64) -> v2i64 { + msa_sra_d(a, mem::transmute(b)) +} + +/// Immediate Shift Right Arithmetic +/// +/// The elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are shifted right arithmetic by `imm3` bits. +/// The result is written to vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srai.b, imm3 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srai_b(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM3, 3); + msa_srai_b(a, IMM3) +} + +/// Immediate Shift Right Arithmetic +/// +/// The elements in vector `a` (eight signed 16-bit integer numbers) +/// are shifted right arithmetic by `imm4` bits. +/// The result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srai.h, imm4 = 0b1111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srai_h(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM4, 4); + msa_srai_h(a, IMM4) +} + +/// Immediate Shift Right Arithmetic +/// +/// The elements in vector `a` (four signed 32-bit integer numbers) +/// are shifted right arithmetic by `imm5` bits. +/// The result is written to vector (four signed 32-bit integer numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srai.w, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srai_w(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM5, 5); + msa_srai_w(a, IMM5) +} + +/// Immediate Shift Right Arithmetic +/// +/// The elements in vector `a` (two signed 64-bit integer numbers) +/// are shifted right arithmetic by `imm6` bits. +/// The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srai.d, imm6 = 0b111111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srai_d(a: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM6, 6); + msa_srai_d(a, IMM6) +} + +/// Vector Shift Right Arithmetic Rounded +/// +/// The elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are shifted right arithmetic by the number of bits the elements in vector `b` +/// (sixteen signed 8-bit integer numbers) specify modulo the size of the +/// element in bits.The most significant discarded bit is added to the shifted +/// value (for rounding) and the result is written to vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srar.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srar_b(a: v16i8, b: v16i8) -> v16i8 { + msa_srar_b(a, mem::transmute(b)) +} + +/// Vector Shift Right Arithmetic Rounded +/// +/// The elements in vector `a` (eight signed 16-bit integer numbers) +/// are shifted right arithmetic by the number of bits the elements in vector `b` +/// (eight signed 16-bit integer numbers) specify modulo the size of the +/// element in bits.The most significant discarded bit is added to the shifted +/// value (for rounding) and the result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srar.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srar_h(a: v8i16, b: v8i16) -> v8i16 { + msa_srar_h(a, mem::transmute(b)) +} + +/// Vector Shift Right Arithmetic Rounded +/// +/// The elements in vector `a` (four signed 32-bit integer numbers) +/// are shifted right arithmetic by the number of bits the elements in vector `b` +/// (four signed 32-bit integer numbers) specify modulo the size of the +/// element in bits.The most significant discarded bit is added to the shifted +/// value (for rounding) and the result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srar.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srar_w(a: v4i32, b: v4i32) -> v4i32 { + msa_srar_w(a, mem::transmute(b)) +} + +/// Vector Shift Right Arithmetic Rounded +/// +/// The elements in vector `a` (two signed 64-bit integer numbers) +/// are shifted right arithmetic by the number of bits the elements in vector `b` +/// (two signed 64-bit integer numbers) specify modulo the size of the +/// element in bits.The most significant discarded bit is added to the shifted +/// value (for rounding) and the result is written to vector (two signed 64-bit integer numbers). 
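+///
+/// A minimal usage sketch (illustrative only; the input values are hypothetical):
+///
+/// ```ignore
+/// // assumes an unsafe block with core::arch::mips::* and core::mem in scope
+/// let a: v2i64 = mem::transmute([7i64, -7]);
+/// let b: v2i64 = mem::transmute([1i64, 1]);
+/// // 7 >> 1 == 3 with a discarded bit of 1, rounded to 4;
+/// // -7 >> 1 == -4 with a discarded bit of 1, rounded to -3.
+/// let r = __msa_srar_d(a, b);
+/// ```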
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srar.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srar_d(a: v2i64, b: v2i64) -> v2i64 { + msa_srar_d(a, mem::transmute(b)) +} + +/// Immediate Shift Right Arithmetic Rounded +/// +/// The elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are shifted right arithmetic by `imm3` bits.The most significant +/// discarded bit is added to the shifted value (for rounding) and +/// the result is written to vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srari.b, imm3 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srari_b(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM3, 3); + msa_srari_b(a, IMM3) +} + +/// Immediate Shift Right Arithmetic Rounded +/// +/// The elements in vector `a` (eight signed 16-bit integer numbers) +/// are shifted right arithmetic by `imm4` bits.The most significant +/// discarded bit is added to the shifted value (for rounding) and +/// the result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srari.h, imm4 = 0b1111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srari_h(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM4, 4); + msa_srari_h(a, IMM4) +} + +/// Immediate Shift Right Arithmetic Rounded +/// +/// The elements in vector `a` (four signed 32-bit integer numbers) +/// are shifted right arithmetic by `imm5` bits.The most significant +/// discarded bit is added to the shifted value (for rounding) and +/// the result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srari.w, imm5 = 0b11111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srari_w(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM5, 5); + msa_srari_w(a, IMM5) +} + +/// Immediate Shift Right Arithmetic Rounded +/// +/// The elements in vector `a` (two signed 64-bit integer numbers) +/// are shifted right arithmetic by `imm6` bits.The most significant +/// discarded bit is added to the shifted value (for rounding) and +/// the result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srari.d, imm6 = 0b111111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srari_d(a: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM6, 6); + msa_srari_d(a, IMM6) +} + +/// Vector Shift Right Logical +/// +/// The elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are shifted right logical by the number of bits the elements in vector `b` +/// (sixteen signed 8-bit integer numbers) specify modulo the size of the +/// element in bits.The result is written to vector (sixteen signed 8-bit integer numbers). 
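+///
+/// A minimal usage sketch (illustrative only; the input values are hypothetical):
+///
+/// ```ignore
+/// // assumes an unsafe block with core::arch::mips::* and core::mem in scope
+/// let a: v16i8 = mem::transmute([-128i8; 16]); // bit pattern 0x80
+/// let b: v16i8 = mem::transmute([4i8; 16]);
+/// let r = __msa_srl_b(a, b); // every lane is 0x08 (zero bits are shifted in)
+/// ```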
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srl.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srl_b(a: v16i8, b: v16i8) -> v16i8 { + msa_srl_b(a, mem::transmute(b)) +} + +/// Vector Shift Right Logical +/// +/// The elements in vector `a` (eight signed 16-bit integer numbers) +/// are shifted right logical by the number of bits the elements in vector `b` +/// (eight signed 16-bit integer numbers) specify modulo the size of the +/// element in bits.The result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srl.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srl_h(a: v8i16, b: v8i16) -> v8i16 { + msa_srl_h(a, mem::transmute(b)) +} + +/// Vector Shift Right Logical +/// +/// The elements in vector `a` (four signed 32-bit integer numbers) +/// are shifted right logical by the number of bits the elements in vector `b` +/// (four signed 32-bit integer numbers) specify modulo the size of the +/// element in bits.The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srl.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srl_w(a: v4i32, b: v4i32) -> v4i32 { + msa_srl_w(a, mem::transmute(b)) +} + +/// Vector Shift Right Logical +/// +/// The elements in vector `a` (two signed 64-bit integer numbers) +/// are shifted right logical by the number of bits the elements in vector `b` +/// (two signed 64-bit integer numbers) specify modulo the size of the +/// element in bits.The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srl.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srl_d(a: v2i64, b: v2i64) -> v2i64 { + msa_srl_d(a, mem::transmute(b)) +} + +/// Immediate Shift Right Logical +/// +/// The elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are shifted right logical by `imm4` bits. +/// The result is written to vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srli.b, imm4 = 0b1111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srli_b(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM4, 4); + msa_srli_b(a, IMM4) +} + +/// Immediate Shift Right Logical +/// +/// The elements in vector `a` (eight signed 16-bit integer numbers) +/// are shifted right logical by `imm3` bits. +/// The result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srli.h, imm3 = 0b111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srli_h(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM3, 3); + msa_srli_h(a, IMM3) +} + +/// Immediate Shift Right Logical +/// +/// The elements in vector `a` (four signed 32-bit integer numbers) +/// are shifted right logical by `imm2` bits. +/// The result is written to vector (four signed 32-bit integer numbers). 
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srli.w, imm2 = 0b11))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srli_w(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM2, 2); + msa_srli_w(a, IMM2) +} + +/// Immediate Shift Right Logical +/// +/// The elements in vector `a` (two signed 64-bit integer numbers) +/// are shifted right logical by `imm1` bits. +/// The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srli.d, imm1 = 0b1))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srli_d(a: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM1, 1); + msa_srli_d(a, IMM1) +} + +/// Vector Shift Right Logical Rounded +/// +/// The elements in vector `a` (sixteen signed 8-bit integer numbers) +/// are shifted right logical by the number of bits the elements in vector `b` +/// (sixteen signed 8-bit integer numbers) specify modulo the size of the +/// element in bits.The most significant discarded bit is added to the shifted +/// value (for rounding) and the result is written to vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srlr.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srlr_b(a: v16i8, b: v16i8) -> v16i8 { + msa_srlr_b(a, mem::transmute(b)) +} + +/// Vector Shift Right Logical Rounded +/// +/// The elements in vector `a` (eight signed 16-bit integer numbers) +/// are shifted right logical by the number of bits the elements in vector `b` +/// (eight signed 16-bit integer numbers) specify modulo the size of the +/// element in bits.The most significant discarded bit is added to the shifted +/// value (for rounding) and the result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srlr.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srlr_h(a: v8i16, b: v8i16) -> v8i16 { + msa_srlr_h(a, mem::transmute(b)) +} + +/// Vector Shift Right Logical Rounded +/// +/// The elements in vector `a` (four signed 32-bit integer numbers) +/// are shifted right logical by the number of bits the elements in vector `b` +/// (four signed 32-bit integer numbers) specify modulo the size of the +/// element in bits.The most significant discarded bit is added to the shifted +/// value (for rounding) and the result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(srlr.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_srlr_w(a: v4i32, b: v4i32) -> v4i32 { + msa_srlr_w(a, mem::transmute(b)) +} + +/// Vector Shift Right Logical Rounded +/// +/// The elements in vector `a` (two signed 64-bit integer numbers) +/// are shifted right logical by the number of bits the elements in vector `b` +/// (two signed 64-bit integer numbers) specify modulo the size of the +/// element in bits.The most significant discarded bit is added to the shifted +/// value (for rounding) and the result is written to vector (two signed 64-bit integer numbers). 
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srlr.d))]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_srlr_d(a: v2i64, b: v2i64) -> v2i64 {
+    msa_srlr_d(a, mem::transmute(b))
+}
+
+/// Immediate Shift Right Logical Rounded
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are shifted right logical by `imm3` bits. The most significant
+/// discarded bit is added to the shifted value (for rounding) and
+/// the result is written to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srlri.b, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_srlri_b<const IMM3: i32>(a: v16i8) -> v16i8 {
+    static_assert_uimm_bits!(IMM3, 3);
+    msa_srlri_b(a, IMM3)
+}
+
+/// Immediate Shift Right Logical Rounded
+///
+/// The elements in vector `a` (eight signed 16-bit integer numbers)
+/// are shifted right logical by `imm4` bits. The most significant
+/// discarded bit is added to the shifted value (for rounding) and
+/// the result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srlri.h, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_srlri_h<const IMM4: i32>(a: v8i16) -> v8i16 {
+    static_assert_uimm_bits!(IMM4, 4);
+    msa_srlri_h(a, IMM4)
+}
+
+/// Immediate Shift Right Logical Rounded
+///
+/// The elements in vector `a` (four signed 32-bit integer numbers)
+/// are shifted right logical by `imm5` bits. The most significant
+/// discarded bit is added to the shifted value (for rounding) and
+/// the result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srlri.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_srlri_w<const IMM5: i32>(a: v4i32) -> v4i32 {
+    static_assert_uimm_bits!(IMM5, 5);
+    msa_srlri_w(a, IMM5)
+}
+
+/// Immediate Shift Right Logical Rounded
+///
+/// The elements in vector `a` (two signed 64-bit integer numbers)
+/// are shifted right logical by `imm6` bits. The most significant
+/// discarded bit is added to the shifted value (for rounding) and
+/// the result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srlri.d, imm6 = 0b111111))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_mips", issue = "111198")]
+pub unsafe fn __msa_srlri_d<const IMM6: i32>(a: v2i64) -> v2i64 {
+    static_assert_uimm_bits!(IMM6, 6);
+    msa_srlri_d(a, IMM6)
+}
+
+/// Vector Store
+///
+/// The WRLEN / 8 bytes in vector `a` (sixteen signed 8-bit integer numbers)
+/// are stored as elements of data format df at the effective memory location
+/// addressed by the base `mem_addr` and the 10-bit signed immediate offset `imm_s10`.
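+///
+/// A minimal usage sketch (illustrative only; the buffer, offset and values are hypothetical):
+///
+/// ```ignore
+/// // assumes an unsafe block with core::arch::mips::* and core::mem in scope
+/// let mut buf = [0u8; 32];
+/// let a: v16i8 = mem::transmute([1i8; 16]);
+/// // store the 16 bytes of `a` at buf[16..32] (base pointer plus immediate offset 16):
+/// __msa_st_b(a, buf.as_mut_ptr(), 16);
+/// ```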
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(st.b, imm_s10 = 0b1111111111))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_st_b(a: v16i8, mem_addr: *mut u8) -> () { + static_assert_simm_bits!(IMM_S10, 10); + msa_st_b(a, mem_addr, IMM_S10) +} + +/// Vector Store +/// +/// The WRLEN / 8 bytes in vector `a` (eight signed 16-bit integer numbers) +/// are stored as elements of data format df at the effective memory location +/// addressed by the base `mem_addr` and the 11-bit signed immediate offset `imm_s11`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(st.h, imm_s11 = 0b11111111111))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_st_h(a: v8i16, mem_addr: *mut u8) -> () { + static_assert_simm_bits!(IMM_S11, 11); + static_assert!(IMM_S11 % 2 == 0); + msa_st_h(a, mem_addr, IMM_S11) +} + +/// Vector Store +/// +/// The WRLEN / 8 bytes in vector `a` (four signed 32-bit integer numbers) +/// are stored as elements of data format df at the effective memory location +/// addressed by the base `mem_addr` and the 12-bit signed immediate offset `imm_s12`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(st.w, imm_s12 = 0b111111111111))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_st_w(a: v4i32, mem_addr: *mut u8) -> () { + static_assert_simm_bits!(IMM_S12, 12); + static_assert!(IMM_S12 % 4 == 0); + msa_st_w(a, mem_addr, IMM_S12) +} + +/// Vector Store +/// +/// The WRLEN / 8 bytes in vector `a` (two signed 64-bit integer numbers) +/// are stored as elements of data format df at the effective memory location +/// addressed by the base `mem_addr` and the 13-bit signed immediate offset `imm_s13`. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(st.d, imm_s13 = 0b1111111111111))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_st_d(a: v2i64, mem_addr: *mut u8) -> () { + static_assert_simm_bits!(IMM_S13, 13); + static_assert!(IMM_S13 % 8 == 0); + msa_st_d(a, mem_addr, IMM_S13) +} + +/// Vector Signed Saturated Subtract of Signed Values +/// +/// The elements in vector `b` (sixteen signed 8-bit integer numbers) +/// are subtracted from the elements in vector `a` (sixteen signed 8-bit integer numbers). +/// Signed arithmetic is performed and overflows clamp to the largest and/or smallest +/// representable signed values before writing the result to vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subs_s.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subs_s_b(a: v16i8, b: v16i8) -> v16i8 { + msa_subs_s_b(a, mem::transmute(b)) +} + +/// Vector Signed Saturated Subtract of Signed Values +/// +/// The elements in vector `b` (eight signed 16-bit integer numbers) +/// are subtracted from the elements in vector `a` (eight signed 16-bit integer numbers). +/// Signed arithmetic is performed and overflows clamp to the largest and/or smallest +/// representable signed values before writing the result to vector (eight signed 16-bit integer numbers). 
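+///
+/// A minimal usage sketch (illustrative only; the input values are hypothetical):
+///
+/// ```ignore
+/// // assumes an unsafe block with core::arch::mips::* and core::mem in scope
+/// let a: v8i16 = mem::transmute([i16::MIN, 100, 0, 0, 0, 0, 0, 0]);
+/// let b: v8i16 = mem::transmute([1i16, 300, 0, 0, 0, 0, 0, 0]);
+/// let r = __msa_subs_s_h(a, b); // lanes: i16::MIN (clamped), -200, 0, ...
+/// ```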
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subs_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subs_s_h(a: v8i16, b: v8i16) -> v8i16 { + msa_subs_s_h(a, mem::transmute(b)) +} + +/// Vector Signed Saturated Subtract of Signed Values +/// +/// The elements in vector `b` (four signed 32-bit integer numbers) +/// are subtracted from the elements in vector `a` (four signed 32-bit integer numbers). +/// Signed arithmetic is performed and overflows clamp to the largest and/or smallest +/// representable signed values before writing the result to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subs_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subs_s_w(a: v4i32, b: v4i32) -> v4i32 { + msa_subs_s_w(a, mem::transmute(b)) +} + +/// Vector Signed Saturated Subtract of Signed Values +/// +/// The elements in vector `b` (two signed 64-bit integer numbers) +/// are subtracted from the elements in vector `a` (two signed 64-bit integer numbers). +/// Signed arithmetic is performed and overflows clamp to the largest and/or smallest +/// representable signed values before writing the result to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subs_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subs_s_d(a: v2i64, b: v2i64) -> v2i64 { + msa_subs_s_d(a, mem::transmute(b)) +} + +/// Vector Unsigned Saturated Subtract of Unsigned Values +/// +/// The elements in vector `b` (sixteen unsigned 8-bit integer numbers) +/// are subtracted from the elements in vector `a` (sixteen unsigned 8-bit integer numbers). +/// Unsigned arithmetic is performed and under-flows clamp to 0 before writing +/// the result to vector (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subs_u.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subs_u_b(a: v16u8, b: v16u8) -> v16u8 { + msa_subs_u_b(a, mem::transmute(b)) +} + +/// Vector Unsigned Saturated Subtract of Unsigned Values +/// +/// The elements in vector `b` (eight unsigned 16-bit integer numbers) +/// are subtracted from the elements in vector `a` (eight unsigned 16-bit integer numbers). +/// Unsigned arithmetic is performed and under-flows clamp to 0 before writing +/// the result to vector (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subs_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subs_u_h(a: v8u16, b: v8u16) -> v8u16 { + msa_subs_u_h(a, mem::transmute(b)) +} + +/// Vector Unsigned Saturated Subtract of Unsigned Values +/// +/// The elements in vector `b` (four unsigned 32-bit integer numbers) +/// are subtracted from the elements in vector `a` (four unsigned 32-bit integer numbers). +/// Unsigned arithmetic is performed and under-flows clamp to 0 before writing +/// the result to vector (four unsigned 32-bit integer numbers). 
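// Editorial sketch (not part of the original patch): the saturating behaviour
// described for subs_s.* and subs_u.* matches Rust's built-in saturating
// integer arithmetic, modelled here per 8-bit lane.
//
//     fn subs_s_b_lane(a: i8, b: i8) -> i8 {
//         a.saturating_sub(b) // clamps to i8::MIN / i8::MAX instead of wrapping
//     }
//
//     fn subs_u_b_lane(a: u8, b: u8) -> u8 {
//         a.saturating_sub(b) // clamps to 0 on underflow
//     }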
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subs_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subs_u_w(a: v4u32, b: v4u32) -> v4u32 { + msa_subs_u_w(a, mem::transmute(b)) +} + +/// Vector Unsigned Saturated Subtract of Unsigned Values +/// +/// The elements in vector `b` (two unsigned 64-bit integer numbers) +/// are subtracted from the elements in vector `a` (two unsigned 64-bit integer numbers). +/// Unsigned arithmetic is performed and under-flows clamp to 0 before writing +/// the result to vector (two unsigned 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subs_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subs_u_d(a: v2u64, b: v2u64) -> v2u64 { + msa_subs_u_d(a, mem::transmute(b)) +} + +/// Vector Unsigned Saturated Subtract of Signed from Unsigned +/// +/// The signed elements in vector `b` (sixteen signed 8-bit integer numbers) +/// are subtracted from the unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers). +/// The signed result is unsigned saturated and written to +/// vector (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subsus_u.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subsus_u_b(a: v16u8, b: v16i8) -> v16u8 { + msa_subsus_u_b(a, mem::transmute(b)) +} + +/// Vector Unsigned Saturated Subtract of Signed from Unsigned +/// +/// The signed elements in vector `b` (eight signed 16-bit integer numbers) +/// are subtracted from the unsigned elements in vector `a` (eight unsigned 16-bit integer numbers). +/// The signed result is unsigned saturated and written to +/// vector (eight unsigned 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subsus_u.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subsus_u_h(a: v8u16, b: v8i16) -> v8u16 { + msa_subsus_u_h(a, mem::transmute(b)) +} + +/// Vector Unsigned Saturated Subtract of Signed from Unsigned +/// +/// The signed elements in vector `b` (four signed 32-bit integer numbers) +/// are subtracted from the unsigned elements in vector `a` (four unsigned 32-bit integer numbers). +/// The signed result is unsigned saturated and written to +/// vector (four unsigned 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subsus_u.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subsus_u_w(a: v4u32, b: v4i32) -> v4u32 { + msa_subsus_u_w(a, mem::transmute(b)) +} + +/// Vector Unsigned Saturated Subtract of Signed from Unsigned +/// +/// The signed elements in vector `b` (two signed 64-bit integer numbers) +/// are subtracted from the unsigned elements in vector `a` (two unsigned 64-bit integer numbers). +/// The signed result is unsigned saturated and written to +/// vector (two unsigned 64-bit integer numbers).
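// Editorial sketch (not part of the original patch): a per-lane model of
// "subtract signed from unsigned with unsigned saturation" for the 8-bit case;
// the intermediate difference is widened so it cannot overflow before clamping.
//
//     fn subsus_u_b_lane(a: u8, b: i8) -> u8 {
//         (a as i16 - b as i16).clamp(0, u8::MAX as i16) as u8
//     }
//
//     // subsus_u_b_lane(10, -20) == 30; subsus_u_b_lane(10, 20) == 0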
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subsus_u.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subsus_u_d(a: v2u64, b: v2i64) -> v2u64 { + msa_subsus_u_d(a, mem::transmute(b)) +} + +/// Vector Signed Saturated Subtract of Unsigned Values +/// +/// The unsigned elements in vector `b` (sixteen unsigned 8-bit integer numbers) +/// are subtracted from the unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers). +/// The signed result is signed saturated and written to +/// vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subsuu_s.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subsuu_s_b(a: v16u8, b: v16u8) -> v16i8 { + msa_subsuu_s_b(a, mem::transmute(b)) +} + +/// Vector Signed Saturated Subtract of Unsigned Values +/// +/// The unsigned elements in vector `b` (eight unsigned 16-bit integer numbers) +/// are subtracted from the unsigned elements in vector `a` (eight unsigned 16-bit integer numbers). +/// The signed result is signed saturated and written to +/// vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subsuu_s.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subsuu_s_h(a: v8u16, b: v8u16) -> v8i16 { + msa_subsuu_s_h(a, mem::transmute(b)) +} + +/// Vector Signed Saturated Subtract of Unsigned Values +/// +/// The unsigned elements in vector `b` (four unsigned 32-bit integer numbers) +/// are subtracted from the unsigned elements in vector `a` (four unsigned 32-bit integer numbers). +/// The signed result is signed saturated and written to +/// vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subsuu_s.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subsuu_s_w(a: v4u32, b: v4u32) -> v4i32 { + msa_subsuu_s_w(a, mem::transmute(b)) +} + +/// Vector Signed Saturated Subtract of Unsigned Values +/// +/// The unsigned elements in vector `b` (two unsigned 64-bit integer numbers) +/// are subtracted from the unsigned elements in vector `a` (two unsigned 64-bit integer numbers). +/// The signed result is signed saturated and written to +/// vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subsuu_s.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subsuu_s_d(a: v2u64, b: v2u64) -> v2i64 { + msa_subsuu_s_d(a, mem::transmute(b)) +} + +/// Vector Subtract +/// +/// The elements in vector `b` (sixteen signed 8-bit integer numbers) +/// are subtracted from the elements in vector `a` (sixteen signed 8-bit integer numbers). +/// The result is written to vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subv.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subv_b(a: v16i8, b: v16i8) -> v16i8 { + msa_subv_b(a, mem::transmute(b)) +} + +/// Vector Subtract +/// +/// The elements in vector `b` (eight signed 16-bit integer numbers) +/// are subtracted from the elements in vector `a` (eight signed 16-bit integer numbers). +/// The result is written to vector (eight signed 16-bit integer numbers).
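// Editorial sketch (not part of the original patch): a per-lane model of
// "unsigned minus unsigned with signed saturation" for the 8-bit case.
//
//     fn subsuu_s_b_lane(a: u8, b: u8) -> i8 {
//         (a as i16 - b as i16).clamp(i8::MIN as i16, i8::MAX as i16) as i8
//     }
//
//     // subsuu_s_b_lane(0, 255) == -128; subsuu_s_b_lane(255, 0) == 127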
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subv.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subv_h(a: v8i16, b: v8i16) -> v8i16 { + msa_subv_h(a, mem::transmute(b)) +} + +/// Vector Subtract +/// +/// The elements in vector `b` (four signed 32-bit integer numbers) +/// are subtracted from the elements in vector `a` (four signed 32-bit integer numbers). +/// The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subv.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subv_w(a: v4i32, b: v4i32) -> v4i32 { + msa_subv_w(a, mem::transmute(b)) +} + +/// Vector Subtract +/// +/// The elements in vector `b` (two signed 64-bit integer numbers) +/// are subtracted from the elements in vector `a` (two signed 64-bit integer numbers). +/// The result is written to vector (two signed 64-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subv.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subv_d(a: v2i64, b: v2i64) -> v2i64 { + msa_subv_d(a, mem::transmute(b)) +} + +/// Immediate Subtract +/// +/// The 5-bit immediate unsigned value `imm5` +/// is subtracted from the elements in vector `a` (sixteen signed 8-bit integer numbers). +/// The result is written to vector (sixteen signed 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subvi.b, imm5 = 0b10111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subvi_b(a: v16i8) -> v16i8 { + static_assert_uimm_bits!(IMM5, 5); + msa_subvi_b(a, IMM5) +} + +/// Immediate Subtract +/// +/// The 5-bit immediate unsigned value `imm5` +/// is subtracted from the elements in vector `a` (eight signed 16-bit integer numbers). +/// The result is written to vector (eight signed 16-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subvi.h, imm5 = 0b10111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subvi_h(a: v8i16) -> v8i16 { + static_assert_uimm_bits!(IMM5, 5); + msa_subvi_h(a, IMM5) +} + +/// Immediate Subtract +/// +/// The 5-bit immediate unsigned value `imm5` +/// is subtracted from the elements in vector `a` (four signed 32-bit integer numbers). +/// The result is written to vector (four signed 32-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subvi.w, imm5 = 0b10111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subvi_w(a: v4i32) -> v4i32 { + static_assert_uimm_bits!(IMM5, 5); + msa_subvi_w(a, IMM5) +} + +/// Immediate Subtract +/// +/// The 5-bit immediate unsigned value `imm5` +/// is subtracted from the elements in vector `a` (two signed 64-bit integer numbers). +/// The result is written to vector (two signed 64-bit integer numbers).
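// Editorial sketch (not part of the original patch): subv.*/subvi.* perform
// plain modular (wrapping) subtraction per lane; the immediate form takes its
// operand as a compile-time constant checked by static_assert_uimm_bits! above.
//
//     fn subv_b_lane(a: i8, b: i8) -> i8 {
//         a.wrapping_sub(b)
//     }
//
//     // Hypothetical call of the immediate form (the constant must fit in 5 bits):
//     // let r = __msa_subvi_b(a, 5);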
+/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(subvi.d, imm5 = 0b10111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_subvi_d(a: v2i64) -> v2i64 { + static_assert_uimm_bits!(IMM5, 5); + msa_subvi_d(a, IMM5) +} + +/// Vector Data Preserving Shuffle +/// +/// The vector shuffle instructions selectively copy data elements from the +/// concatenation of vectors `b` (sixteen signed 8-bit integer numbers) +/// and `c` (sixteen signed 8-bit integer numbers) into vector `a` +/// (sixteen signed 8-bit integer numbers) based on the corresponding control element in `a`. +/// The least significant 6 bits of each control element in `a`, taken modulo the number of elements in +/// the concatenated vectors `b` and `c`, specify the index of the source element. +/// If bit 6 or bit 7 is 1, there will be no copy, but rather the destination element is set to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(vshf.b))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_vshf_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8 { + msa_vshf_b(a, mem::transmute(b), c) +} + +/// Vector Data Preserving Shuffle +/// +/// The vector shuffle instructions selectively copy data elements from the +/// concatenation of vectors `b` (eight signed 16-bit integer numbers) +/// and `c` (eight signed 16-bit integer numbers) into vector `a` +/// (eight signed 16-bit integer numbers) based on the corresponding control element in `a`. +/// The least significant 6 bits of each control element in `a`, taken modulo the number of elements in +/// the concatenated vectors `b` and `c`, specify the index of the source element. +/// If bit 6 or bit 7 is 1, there will be no copy, but rather the destination element is set to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(vshf.h))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_vshf_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 { + msa_vshf_h(a, mem::transmute(b), c) +} + +/// Vector Data Preserving Shuffle +/// +/// The vector shuffle instructions selectively copy data elements from the +/// concatenation of vectors `b` (four signed 32-bit integer numbers) +/// and `c` (four signed 32-bit integer numbers) into vector `a` +/// (four signed 32-bit integer numbers) based on the corresponding control element in `a`. +/// The least significant 6 bits of each control element in `a`, taken modulo the number of elements in +/// the concatenated vectors `b` and `c`, specify the index of the source element. +/// If bit 6 or bit 7 is 1, there will be no copy, but rather the destination element is set to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(vshf.w))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_vshf_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 { + msa_vshf_w(a, mem::transmute(b), c) +} + +/// Vector Data Preserving Shuffle +/// +/// The vector shuffle instructions selectively copy data elements from the +/// concatenation of vectors `b` (two signed 64-bit integer numbers) +/// and `c` (two signed 64-bit integer numbers) into vector `a` +/// (two signed 64-bit integer numbers) based on the corresponding control element in `a`. +/// The least significant 6 bits of each control element in `a`, taken modulo the number of elements in +/// the concatenated vectors `b` and `c`, specify the index of the source element.
+/// If bit 6 or bit 7 is 1, there will be no copy, but rather the destination element is set to 0. +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(vshf.d))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_vshf_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64 { + msa_vshf_d(a, mem::transmute(b), c) +} + +/// Vector Logical Exclusive Or +/// +/// Each bit of vector `a` (sixteen unsigned 8-bit integer numbers) +/// is combined with the corresponding bit of vector `b` (sixteen unsigned 8-bit integer numbers) +/// in a bitwise logical XOR operation. The result is written to vector +/// (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(xor.v))] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_xor_v(a: v16u8, b: v16u8) -> v16u8 { + msa_xor_v(a, mem::transmute(b)) +} + +/// Immediate Logical Exclusive Or +/// +/// Each byte of vector `a` (sixteen unsigned 8-bit integer numbers) +/// is combined with the 8-bit immediate `imm8` +/// in a bitwise logical XOR operation. The result is written to vector +/// (sixteen unsigned 8-bit integer numbers). +/// +#[inline] +#[target_feature(enable = "msa")] +#[cfg_attr(test, assert_instr(xori.b, imm8 = 0b11111111))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_mips", issue = "111198")] +pub unsafe fn __msa_xori_b(a: v16u8) -> v16u8 { + static_assert_uimm_bits!(IMM8, 8); + msa_xori_b(a, IMM8) +} + +#[cfg(test)] +mod tests { + use crate::{ + core_arch::{mips::msa::*, simd::*}, + mem, + }; + use std::{f32, f64}; + use stdarch_test::simd_test; + + #[simd_test(enable = "msa")] + unsafe fn test_msa_add_a_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i8x16::new( + -4, -3, -2, -1, + -4, -3, -2, -1, + -4, -3, -2, -1, + -4, -3, -2, -1 + ); + #[rustfmt::skip] + let r = i8x16::new( + 5, 5, 5, 5, + 5, 5, 5, 5, + 5, 5, 5, 5, + 5, 5, 5, 5 + ); + + assert_eq!( + r, + mem::transmute(__msa_add_a_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_add_a_h() { + #[rustfmt::skip] + let a = i16x8::new(1, 2, 3, 4, 1, 2, 3, 4); + #[rustfmt::skip] + let b = i16x8::new(-4, -3, -2, -1, -4, -3, -2, -1); + #[rustfmt::skip] + let r = i16x8::new(5, 5, 5, 5, 5, 5, 5, 5); + + assert_eq!( + r, + mem::transmute(__msa_add_a_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_add_a_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = i32x4::new(-4, -3, -2, -1); + #[rustfmt::skip] + let r = i32x4::new(5, 5, 5, 5); + + assert_eq!( + r, + mem::transmute(__msa_add_a_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_add_a_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + let b = i64x2::new(-4, -3); + #[rustfmt::skip] + let r = i64x2::new(5, 5); + + assert_eq!( + r, + mem::transmute(__msa_add_a_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_adds_a_b() { + #[rustfmt::skip] + let a = i8x16::new( + 100, i8::MAX, 100, i8::MAX, + 100, i8::MAX, 100, i8::MAX, + 100, i8::MAX, 100, i8::MAX, + 100, i8::MAX, 100, i8::MAX + ); + #[rustfmt::skip] + let b = i8x16::new( + -4, -3, -2, -100, + -4, -3, -2, -100, + -4, -3, -2, -100, + -4, -3, -2, -100 + ); + #[rustfmt::skip] + let r = 
i8x16::new( + 104, 127, 102, 127, + 104, 127, 102, 127, + 104, 127, 102, 127, + 104, 127, 102, 127 + ); + + assert_eq!( + r, + mem::transmute(__msa_adds_a_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_adds_a_h() { + #[rustfmt::skip] + let a = i16x8::new( + 100, i16::MAX, 100, i16::MAX, + 100, i16::MAX, 100, i16::MAX + ); + #[rustfmt::skip] + let b = i16x8::new(-4, -3, -2, -1, -4, -3, -2, -1); + #[rustfmt::skip] + let r = i16x8::new( + 104, i16::MAX, 102, i16::MAX, + 104, i16::MAX, 102, i16::MAX + ); + + assert_eq!( + r, + mem::transmute(__msa_adds_a_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_adds_a_w() { + #[rustfmt::skip] + let a = i32x4::new(100, i32::MAX, 100, i32::MAX); + #[rustfmt::skip] + let b = i32x4::new(-4, -3, -2, -1); + #[rustfmt::skip] + let r = i32x4::new(104, i32::MAX, 102, i32::MAX); + + assert_eq!( + r, + mem::transmute(__msa_adds_a_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_adds_a_d() { + #[rustfmt::skip] + let a = i64x2::new(100, i64::MAX); + #[rustfmt::skip] + let b = i64x2::new(-4, -3); + #[rustfmt::skip] + let r = i64x2::new(104, i64::MAX); + + assert_eq!( + r, + mem::transmute(__msa_adds_a_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_adds_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + 100, i8::MIN, 100, i8::MAX, + 100, i8::MIN, 100, i8::MAX, + 100, i8::MIN, 100, i8::MAX, + 100, i8::MIN, 100, i8::MAX + ); + #[rustfmt::skip] + let b = i8x16::new( + -4, -3, -2, 100, + -4, -3, -2, 100, + -4, -3, -2, 100, + -4, -3, -2, 100 + ); + #[rustfmt::skip] + let r = i8x16::new( + 96, i8::MIN, 98, i8::MAX, + 96, i8::MIN, 98, i8::MAX, + 96, i8::MIN, 98, i8::MAX, + 96, i8::MIN, 98, i8::MAX + ); + + assert_eq!( + r, + mem::transmute(__msa_adds_s_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_adds_s_h() { + #[rustfmt::skip] + let a = i16x8::new( + 100, i16::MIN, 100, i16::MAX, + 100, i16::MIN, 100, i16::MAX + ); + #[rustfmt::skip] + let b = i16x8::new(-4, -3, -2, 1, -4, -3, -2, 1); + #[rustfmt::skip] + let r = i16x8::new( + 96, i16::MIN, 98, i16::MAX, + 96, i16::MIN, 98, i16::MAX + ); + + assert_eq!( + r, + mem::transmute(__msa_adds_s_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_adds_s_w() { + #[rustfmt::skip] + let a = i32x4::new(100, i32::MAX, 100, i32::MIN); + #[rustfmt::skip] + let b = i32x4::new(-4, 3, -2, -1); + #[rustfmt::skip] + let r = i32x4::new(96, i32::MAX, 98, i32::MIN); + + assert_eq!( + r, + mem::transmute(__msa_adds_s_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_adds_s_d() { + #[rustfmt::skip] + let a = i64x2::new(100, i64::MIN); + #[rustfmt::skip] + let b = i64x2::new(-4, -3); + #[rustfmt::skip] + let r = i64x2::new(96, i64::MIN); + + assert_eq!( + r, + mem::transmute(__msa_adds_s_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_adds_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + 100, u8::MAX, 100, u8::MAX, + 100, u8::MAX, 100, u8::MAX, + 100, u8::MAX, 100, u8::MAX, + 100, u8::MAX, 100, u8::MAX + ); + #[rustfmt::skip] + let b = u8x16::new( + 4, 3, 2, 100, + 4, 3, 2, 100, + 4, 3, 2, 100, + 4, 3, 2, 100 + ); + #[rustfmt::skip] + let r = u8x16::new( + 104, u8::MAX, 102, u8::MAX, + 104, u8::MAX, 102, 
u8::MAX, + 104, u8::MAX, 102, u8::MAX, + 104, u8::MAX, 102, u8::MAX + ); + + assert_eq!( + r, + mem::transmute(__msa_adds_u_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_adds_u_h() { + #[rustfmt::skip] + let a = u16x8::new( + 100, u16::MAX, 100, u16::MAX, + 100, u16::MAX, 100, u16::MAX + ); + #[rustfmt::skip] + let b = u16x8::new(4, 3, 2, 1, 4, 3, 2, 1); + #[rustfmt::skip] + let r = u16x8::new( + 104, u16::MAX, 102, u16::MAX, + 104, u16::MAX, 102, u16::MAX + ); + + assert_eq!( + r, + mem::transmute(__msa_adds_u_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_adds_u_w() { + #[rustfmt::skip] + let a = u32x4::new(100, u32::MAX, 100, u32::MAX); + #[rustfmt::skip] + let b = u32x4::new(4, 3, 2, 1); + #[rustfmt::skip] + let r = u32x4::new(104, u32::MAX, 102, u32::MAX); + + assert_eq!( + r, + mem::transmute(__msa_adds_u_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_adds_u_d() { + #[rustfmt::skip] + let a = u64x2::new(100, u64::MAX); + #[rustfmt::skip] + let b = u64x2::new(4, 3); + #[rustfmt::skip] + let r = u64x2::new(104, u64::MAX); + + assert_eq!( + r, + mem::transmute(__msa_adds_u_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_addv_b() { + #[rustfmt::skip] + let a = i8x16::new( + 100, i8::MIN, 100, i8::MAX, + 100, i8::MIN, 100, i8::MAX, + 100, i8::MIN, 100, i8::MAX, + 100, i8::MIN, 100, i8::MAX + ); + #[rustfmt::skip] + let b = i8x16::new( + -4, -3, -2, 100, + -4, -3, -2, 100, + -4, -3, -2, 100, + -4, -3, -2, 100 + ); + #[rustfmt::skip] + let r = i8x16::new( + 96, 125, 98, -29, + 96, 125, 98, -29, + 96, 125, 98, -29, + 96, 125, 98, -29 + ); + + assert_eq!( + r, + mem::transmute(__msa_addv_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_addv_h() { + #[rustfmt::skip] + let a = i16x8::new( + 100, i16::MIN, 100, i16::MAX, + 100, i16::MIN, 100, i16::MAX + ); + #[rustfmt::skip] + let b = i16x8::new(-4, -3, -2, 1, -4, -3, -2, 1); + #[rustfmt::skip] + let r = i16x8::new(96, 32765, 98, -32768, 96, 32765, 98, -32768); + + assert_eq!( + r, + mem::transmute(__msa_addv_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_addv_w() { + #[rustfmt::skip] + let a = i32x4::new(100, i32::MAX, 100, i32::MIN); + #[rustfmt::skip] + let b = i32x4::new(-4, 3, -2, -1); + #[rustfmt::skip] + let r = i32x4::new(96, -2147483646, 98, 2147483647); + + assert_eq!( + r, + mem::transmute(__msa_addv_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_addv_d() { + #[rustfmt::skip] + let a = i64x2::new(100, i64::MIN); + #[rustfmt::skip] + let b = i64x2::new(-4, -3); + #[rustfmt::skip] + let r = i64x2::new(96, 9223372036854775805); + + assert_eq!( + r, + mem::transmute(__msa_addv_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_addvi_b() { + #[rustfmt::skip] + let a = i8x16::new( + 100, i8::MAX, 100, i8::MAX, + 100, i8::MAX, 100, i8::MAX, + 100, i8::MAX, 100, i8::MAX, + 100, i8::MAX, 100, i8::MAX + ); + #[rustfmt::skip] + let r = i8x16::new( + 103, -126, 103, -126, + 103, -126, 103, -126, + 103, -126, 103, -126, + 103, -126, 103, -126 + ); + + assert_eq!(r, mem::transmute(__msa_addvi_b(mem::transmute(a), 67))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_addvi_h() { + #[rustfmt::skip] 
+ let a = i16x8::new( + i16::MAX, 3276, -100, -127, + i16::MAX, 3276, -100, -127 + ); + #[rustfmt::skip] + let r = i16x8::new( + -32766, 3279, -97, -124, + -32766, 3279, -97, -124 + ); + + assert_eq!(r, mem::transmute(__msa_addvi_h(mem::transmute(a), 67))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_addvi_w() { + #[rustfmt::skip] + let a = i32x4::new(100, i32::MAX, 100, i32::MIN); + #[rustfmt::skip] + let r = i32x4::new(103, -2147483646, 103, -2147483645); + + assert_eq!(r, mem::transmute(__msa_addvi_w(mem::transmute(a), 67))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_addvi_d() { + #[rustfmt::skip] + let a = i64x2::new(100, i64::MIN); + #[rustfmt::skip] + let r = i64x2::new(117, -9223372036854775791); + + assert_eq!(r, mem::transmute(__msa_addvi_d(mem::transmute(a), 17))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_and_v() { + #[rustfmt::skip] + let a = u8x16::new( + 100, u8::MAX, 100, u8::MAX, + 100, u8::MAX, 100, u8::MAX, + 100, u8::MAX, 100, u8::MAX, + 100, u8::MAX, 100, u8::MAX + ); + #[rustfmt::skip] + let b = u8x16::new( + 4, 3, 2, 100, + 4, 3, 2, 100, + 4, 3, 2, 100, + 4, 3, 2, 100 + ); + #[rustfmt::skip] + let r = u8x16::new( + 4, 3, 0, 100, + 4, 3, 0, 100, + 4, 3, 0, 100, + 4, 3, 0, 100 + ); + + assert_eq!( + r, + mem::transmute(__msa_and_v(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_andi_b() { + #[rustfmt::skip] + let a = u8x16::new( + 100, u8::MAX, 100, u8::MAX, + 100, u8::MAX, 100, u8::MAX, + 100, u8::MAX, 100, u8::MAX, + 100, u8::MAX, 100, u8::MAX + ); + #[rustfmt::skip] + let r = u8x16::new( + 4, 5, 4, 5, + 4, 5, 4, 5, + 4, 5, 4, 5, + 4, 5, 4, 5 + ); + + assert_eq!(r, mem::transmute(__msa_andi_b(mem::transmute(a), 5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_asub_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + -1, -2, -3, -4, + -1, -2, -3, -4, + -1, -2, -3, -4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let b = i8x16::new( + -6, -7, -8, -9, + -6, -7, -8, -9, + -6, -7, -8, -9, + -6, -7, -8, -9 + ); + #[rustfmt::skip] + let r = i8x16::new( + 5, 5, 5, 5, + 5, 5, 5, 5, + 5, 5, 5, 5, + 5, 5, 5, 5 + ); + + assert_eq!( + r, + mem::transmute(__msa_asub_s_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_asub_s_h() { + #[rustfmt::skip] + let a = i16x8::new(-1, -2, -3, -4, -1, -2, -3, -4); + #[rustfmt::skip] + let b = i16x8::new(-6, -7, -8, -9, -6, -7, -8, -9); + #[rustfmt::skip] + let r = i16x8::new(5, 5, 5, 5, 5, 5, 5, 5); + + assert_eq!( + r, + mem::transmute(__msa_asub_s_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_asub_s_w() { + #[rustfmt::skip] + let a = i32x4::new(-1, -2, -3, -4); + #[rustfmt::skip] + let b = i32x4::new(-6, -7, -8, -9); + #[rustfmt::skip] + let r = i32x4::new(5, 5, 5, 5); + + assert_eq!( + r, + mem::transmute(__msa_asub_s_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_asub_s_d() { + #[rustfmt::skip] + let a = i64x2::new(-1, -2); + #[rustfmt::skip] + let b = i64x2::new(-6, -7); + #[rustfmt::skip] + let r = i64x2::new(5, 5); + + assert_eq!( + r, + mem::transmute(__msa_asub_s_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_asub_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 
6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = u8x16::new( + 5, 5, 5, 5, + 5, 5, 5, 5, + 5, 5, 5, 5, + 5, 5, 5, 5 + ); + + assert_eq!( + r, + mem::transmute(__msa_asub_u_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_asub_u_h() { + #[rustfmt::skip] + let a = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4); + #[rustfmt::skip] + let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9); + #[rustfmt::skip] + let r = u16x8::new(5, 5, 5, 5, 5, 5, 5, 5); + + assert_eq!( + r, + mem::transmute(__msa_asub_u_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_asub_u_w() { + #[rustfmt::skip] + let a = u32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = u32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = u32x4::new(5, 5, 5, 5); + + assert_eq!( + r, + mem::transmute(__msa_asub_u_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_asub_u_d() { + #[rustfmt::skip] + let a = u64x2::new(1, 2); + #[rustfmt::skip] + let b = u64x2::new(6, 7); + #[rustfmt::skip] + let r = u64x2::new(5, 5); + + assert_eq!( + r, + mem::transmute(__msa_asub_u_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ave_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + -1, -2, -3, -4, + -1, -2, -3, -4, + -1, -2, -3, -4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let b = i8x16::new( + 6, -7, 8, -9, + 6, -7, 8, -9, + 6, -7, 8, -9, + 6, -7, 8, -9 + ); + #[rustfmt::skip] + let r = i8x16::new( + 2, -5, 2, -7, + 2, -5, 2, -7, + 2, -5, 2, -7, + 2, -5, 2, -7 + ); + + assert_eq!( + r, + mem::transmute(__msa_ave_s_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ave_s_h() { + #[rustfmt::skip] + let a = i16x8::new(-1, -2, -3, -4, -1, -2, -3, -4); + #[rustfmt::skip] + let b = i16x8::new(6, -7, 8, -9, 6, -7, 8, -9); + #[rustfmt::skip] + let r = i16x8::new(2, -5, 2, -7, 2, -5, 2, -7); + + assert_eq!( + r, + mem::transmute(__msa_ave_s_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ave_s_w() { + #[rustfmt::skip] + let a = i32x4::new(-1, -2, -3, -4); + #[rustfmt::skip] + let b = i32x4::new(6, -7, 8, -9); + #[rustfmt::skip] + let r = i32x4::new(2, -5, 2, -7); + + assert_eq!( + r, + mem::transmute(__msa_ave_s_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ave_s_d() { + #[rustfmt::skip] + let a = i64x2::new(-1, -2); + #[rustfmt::skip] + let b = i64x2::new(-6, -7); + #[rustfmt::skip] + let r = i64x2::new(-4, -5); + + assert_eq!( + r, + mem::transmute(__msa_ave_s_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ave_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = u8x16::new( + 3, 4, 5, 6, + 3, 4, 5, 6, + 3, 4, 5, 6, + 3, 4, 5, 6 + ); + + assert_eq!( + r, + mem::transmute(__msa_ave_u_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ave_u_h() { + #[rustfmt::skip] + let a = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4); + #[rustfmt::skip] + let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9); + #[rustfmt::skip] + let r = u16x8::new(3, 4, 5, 6, 3, 4, 5, 6); + + assert_eq!( + r, + mem::transmute(__msa_ave_u_h(mem::transmute(a), 
mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ave_u_w() { + #[rustfmt::skip] + let a = u32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = u32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = u32x4::new(3, 4, 5, 6); + + assert_eq!( + r, + mem::transmute(__msa_ave_u_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ave_u_d() { + #[rustfmt::skip] + let a = u64x2::new(1, 2); + #[rustfmt::skip] + let b = u64x2::new(6, 7); + #[rustfmt::skip] + let r = u64x2::new(3, 4); + + assert_eq!( + r, + mem::transmute(__msa_ave_u_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_aver_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + -1, -2, 3, -4, + -1, -2, 3, -4, + -1, -2, 3, -4, + -1, -2, 3, -4 + ); + #[rustfmt::skip] + let b = i8x16::new( + -6, 7, -8, -9, + -6, 7, -8, -9, + -6, 7, -8, -9, + -6, 7, -8, -9 + ); + #[rustfmt::skip] + let r = i8x16::new( + -3, 3, -2, -6, + -3, 3, -2, -6, + -3, 3, -2, -6, + -3, 3, -2, -6 + ); + + assert_eq!( + r, + mem::transmute(__msa_aver_s_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_aver_s_h() { + #[rustfmt::skip] + let a = i16x8::new(-1, -2, 3, -4, -1, -2, 3, -4); + #[rustfmt::skip] + let b = i16x8::new(-6, 7, -8, -9, -6, 7, -8, -9); + #[rustfmt::skip] + let r = i16x8::new(-3, 3, -2, -6, -3, 3, -2, -6); + + assert_eq!( + r, + mem::transmute(__msa_aver_s_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_aver_s_w() { + #[rustfmt::skip] + let a = i32x4::new(-1, -2, 3, -4); + #[rustfmt::skip] + let b = i32x4::new(-6, 7, -8, -9); + #[rustfmt::skip] + let r = i32x4::new(-3, 3, -2, -6); + + assert_eq!( + r, + mem::transmute(__msa_aver_s_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_aver_s_d() { + #[rustfmt::skip] + let a = i64x2::new(-1, -2); + #[rustfmt::skip] + let b = i64x2::new(-6, -7); + #[rustfmt::skip] + let r = i64x2::new(-3, -4); + + assert_eq!( + r, + mem::transmute(__msa_aver_s_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_aver_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = u8x16::new( + 4, 5, 6, 7, + 4, 5, 6, 7, + 4, 5, 6, 7, + 4, 5, 6, 7 + ); + + assert_eq!( + r, + mem::transmute(__msa_aver_u_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_aver_u_h() { + #[rustfmt::skip] + let a = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4); + #[rustfmt::skip] + let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9); + #[rustfmt::skip] + let r = u16x8::new(4, 5, 6, 7, 4, 5, 6, 7); + + assert_eq!( + r, + mem::transmute(__msa_aver_u_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_aver_u_w() { + #[rustfmt::skip] + let a = u32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = u32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = u32x4::new(4, 5, 6, 7); + + assert_eq!( + r, + mem::transmute(__msa_aver_u_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_aver_u_d() { + #[rustfmt::skip] + let a = u64x2::new(1, 2); + #[rustfmt::skip] + let b = u64x2::new(6, 7); + #[rustfmt::skip] + let r = 
u64x2::new(4, 5); + + assert_eq!( + r, + mem::transmute(__msa_aver_u_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bclr_b() { + #[rustfmt::skip] + let a = u8x16::new( + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = u8x16::new( + 191, 27, 54, 1, + 191, 27, 54, 1, + 191, 27, 54, 1, + 191, 27, 54, 1 + ); + + assert_eq!( + r, + mem::transmute(__msa_bclr_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bclr_h() { + #[rustfmt::skip] + let a = u16x8::new(255, 155, 55, 1, 255, 155, 55, 1); + #[rustfmt::skip] + let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9); + #[rustfmt::skip] + let r = u16x8::new(191, 27, 55, 1, 191, 27, 55, 1); + + assert_eq!( + r, + mem::transmute(__msa_bclr_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bclr_w() { + #[rustfmt::skip] + let a = u32x4::new(255, 155, 55, 1); + #[rustfmt::skip] + let b = u32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = u32x4::new(191, 27, 55, 1); + + assert_eq!( + r, + mem::transmute(__msa_bclr_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bclr_d() { + #[rustfmt::skip] + let a = u64x2::new(255, 155); + #[rustfmt::skip] + let b = u64x2::new(6, 7); + #[rustfmt::skip] + let r = u64x2::new(191, 27); + + assert_eq!( + r, + mem::transmute(__msa_bclr_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bclri_b() { + #[rustfmt::skip] + let a = u8x16::new( + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1 + ); + #[rustfmt::skip] + let r = u8x16::new( + 247, 147, 55, 1, + 247, 147, 55, 1, + 247, 147, 55, 1, + 247, 147, 55, 1 + ); + + assert_eq!(r, mem::transmute(__msa_bclri_b(mem::transmute(a), 3))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bclri_h() { + #[rustfmt::skip] + let a = u16x8::new(2155, 1155, 155, 1, 2155, 1155, 155, 1); + #[rustfmt::skip] + let r = u16x8::new(107, 1155, 155, 1, 107, 1155, 155, 1); + + assert_eq!(r, mem::transmute(__msa_bclri_h(mem::transmute(a), 11))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bclri_w() { + #[rustfmt::skip] + let a = u32x4::new(211111155, 111111155, 11111155, 1); + #[rustfmt::skip] + let r = u32x4::new(202722547, 102722547, 2722547, 1); + + assert_eq!(r, mem::transmute(__msa_bclri_w(mem::transmute(a), 23))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bclri_d() { + #[rustfmt::skip] + let a = u64x2::new(211111111155, 11111111111111155); + #[rustfmt::skip] + let r = u64x2::new(73672157683, 11110973672157683); + + assert_eq!(r, mem::transmute(__msa_bclri_d(mem::transmute(a), 37))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_binsl_b() { + #[rustfmt::skip] + let a = u8x16::new( + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let c = u8x16::new( + 1, 3, 5, 9, + 1, 3, 5, 9, + 1, 3, 5, 9, + 1, 3, 5, 9 + ); + #[rustfmt::skip] + let r = u8x16::new( + 63, 11, 11, 1, + 63, 11, 11, 1, + 63, 11, 11, 1, + 63, 11, 11, 1 + ); + + assert_eq!( + r, + mem::transmute(__msa_binsl_b( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + 
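    // Editorial sketch (not part of the original patch): BINSL copies the
    // (c mod width) + 1 most significant bits of each lane of `b` into the
    // corresponding lane of `a`, which is what the expected vectors in the
    // binsl tests above and below encode. Per-lane model for the 8-bit case:
    #[allow(dead_code)]
    fn binsl_b_lane_model(a: u8, b: u8, c: u8) -> u8 {
        let n = (c % 8) + 1; // number of bits taken from the most significant end
        let mask = (0xffu16 << (8 - n)) as u8;
        (b & mask) | (a & !mask)
    }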
#[simd_test(enable = "msa")] + unsafe fn test_msa_binsl_h() { + #[rustfmt::skip] + let a = u16x8::new( + 32767, 16384, 8192, 4096, + 32767, 16384, 8192, 4096 + ); + #[rustfmt::skip] + let b = u16x8::new( + 21656, 5273, 7081, 2985, + 21656, 5273, 7081, 2985 + ); + #[rustfmt::skip] + let c = u16x8::new( + 3, 7, 9, 13, + 15, 17, 21, 23 + ); + #[rustfmt::skip] + let r = u16x8::new( + 24575, 5120, 7040, 2984, + 21656, 0, 6144, 2816 + ); + + assert_eq!( + r, + mem::transmute(__msa_binsl_h( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_binsl_w() { + #[rustfmt::skip] + let a = u32x4::new(2147483647, 536870912, 67108864, 8388608); + #[rustfmt::skip] + let b = u32x4::new(1036372536, 259093134, 78219975, 1119499719); + #[rustfmt::skip] + let c = u32x4::new(11, 15, 31, 37); + #[rustfmt::skip] + let r = u32x4::new(1037041663, 259063808, 78219975, 1082130432); + + assert_eq!( + r, + mem::transmute(__msa_binsl_w( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_binsl_d() { + #[rustfmt::skip] + let a = u64x2::new(8006399338, 2882303762); + #[rustfmt::skip] + let b = u64x2::new(9223372036854775805, 536870912); + #[rustfmt::skip] + let c = u64x2::new(12, 48); + #[rustfmt::skip] + let r = u64x2::new(9221120245047489898, 536901394); + + assert_eq!( + r, + mem::transmute(__msa_binsl_d( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_binsli_b() { + #[rustfmt::skip] + let a = u8x16::new( + u8::MAX, 155, 55, 1, + u8::MAX, 155, 55, 1, + u8::MAX, 155, 55, 1, + u8::MAX, 155, 55, 1 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = u8x16::new( + 7, 7, 11, 9, + 7, 7, 11, 9, + 7, 7, 11, 9, + 7, 7, 11, 9 + ); + + assert_eq!( + r, + mem::transmute(__msa_binsli_b(mem::transmute(a), mem::transmute(b), 5)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_binsli_h() { + #[rustfmt::skip] + let a = u16x8::new( + 32767, 16384, 8192, 4096, + 32767, 16384, 8192, 4096 + ); + #[rustfmt::skip] + let b = u16x8::new( + 21656, 5273, 7081, 2985, + 21656, 5273, 7081, 2985 + ); + #[rustfmt::skip] + let r = u16x8::new( + 21659, 5272, 7080, 2984, + 21659, 5272, 7080, 2984 + ); + + assert_eq!( + r, + mem::transmute(__msa_binsli_h(mem::transmute(a), mem::transmute(b), 13)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_binsli_w() { + #[rustfmt::skip] + let a = u32x4::new(2147483647, 536870912, 67108864, 8388608); + #[rustfmt::skip] + let b = u32x4::new(1036372536, 259093134, 78219975, 1119499719); + #[rustfmt::skip] + let r = u32x4::new(1036386303, 259080192, 78217216, 1119485952); + + assert_eq!( + r, + mem::transmute(__msa_binsli_w(mem::transmute(a), mem::transmute(b), 17)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_binsli_d() { + #[rustfmt::skip] + let a = u64x2::new(8006399338, 2882303762); + #[rustfmt::skip] + let b = u64x2::new(9223372036854775805, 536870912); + #[rustfmt::skip] + let r = u64x2::new(9223372036854773098, 536901394); + + assert_eq!( + r, + mem::transmute(__msa_binsli_d(mem::transmute(a), mem::transmute(b), 48)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_binsr_b() { + #[rustfmt::skip] + let a = u8x16::new( + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 
7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let c = u8x16::new( + 1, 3, 5, 9, + 1, 3, 5, 9, + 1, 3, 5, 9, + 1, 3, 5, 9 + ); + #[rustfmt::skip] + let r = u8x16::new( + 254, 151, 8, 1, + 254, 151, 8, 1, + 254, 151, 8, 1, + 254, 151, 8, 1 + ); + + assert_eq!( + r, + mem::transmute(__msa_binsr_b( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_binsr_h() { + #[rustfmt::skip] + let a = u16x8::new( + 32767, 16384, 8192, 4096, + 32767, 16384, 8192, 4096 + ); + #[rustfmt::skip] + let b = u16x8::new( + 21656, 5273, 7081, 2985, + 21656, 5273, 7081, 2985 + ); + #[rustfmt::skip] + let c = u16x8::new( + 3, 7, 9, 13, + 15, 17, 21, 23 + ); + #[rustfmt::skip] + let r = u16x8::new( + 32760, 16537, 9129, 2985, + 21656, 16385, 8233, 4265 + ); + + assert_eq!( + r, + mem::transmute(__msa_binsr_h( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_binsr_w() { + #[rustfmt::skip] + let a = u32x4::new(2147483647, 536870912, 67108864, 8388608); + #[rustfmt::skip] + let b = u32x4::new(1036372536, 259093134, 78219975, 1119499719); + #[rustfmt::skip] + let c = u32x4::new(11, 15, 31, 37); + #[rustfmt::skip] + let r = u32x4::new(2147482168, 536900238, 78219975, 8388615); + + assert_eq!( + r, + mem::transmute(__msa_binsr_w( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_binsr_d() { + #[rustfmt::skip] + let a = u64x2::new(8006399338, 2882303762); + #[rustfmt::skip] + let b = u64x2::new(9223372036854775805, 536870912); + #[rustfmt::skip] + let c = u64x2::new(12, 48); + #[rustfmt::skip] + let r = u64x2::new(8006402045, 536870912); + + assert_eq!( + r, + mem::transmute(__msa_binsr_d( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_binsri_b() { + #[rustfmt::skip] + let a = u8x16::new( + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = u8x16::new( + 198, 135, 8, 9, + 198, 135, 8, 9, + 198, 135, 8, 9, + 198, 135, 8, 9 + ); + + assert_eq!( + r, + mem::transmute(__msa_binsri_b(mem::transmute(a), mem::transmute(b), 5)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_binsri_h() { + #[rustfmt::skip] + let a = u16x8::new( + 32767, 16384, 8192, 4096, + 32767, 16384, 8192, 4096 + ); + #[rustfmt::skip] + let b = u16x8::new( + 21656, 5273, 7081, 2985, + 21656, 5273, 7081, 2985 + ); + #[rustfmt::skip] + let r = u16x8::new( + 21656, 21657, 7081, 2985, + 21656, 21657, 7081, 2985 + ); + + assert_eq!( + r, + mem::transmute(__msa_binsri_h(mem::transmute(a), mem::transmute(b), 13)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_binsri_w() { + #[rustfmt::skip] + let a = u32x4::new(2147483647, 536870912, 67108864, 8388608); + #[rustfmt::skip] + let b = u32x4::new(1036372536, 259093134, 78219975, 1119499719); + #[rustfmt::skip] + let r = u32x4::new(2147338808, 536965774, 67209927, 8533447); + + assert_eq!( + r, + mem::transmute(__msa_binsri_w(mem::transmute(a), mem::transmute(b), 17)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_binsri_d() { + #[rustfmt::skip] + let a = u64x2::new(8006399338, 2882303762); + #[rustfmt::skip] + let b = u64x2::new(9223372036854775805, 536870912); + 
#[rustfmt::skip] + let r = u64x2::new(562949953421309, 536870912); + + assert_eq!( + r, + mem::transmute(__msa_binsri_d(mem::transmute(a), mem::transmute(b), 48)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bmnz_v() { + #[rustfmt::skip] + let a = u8x16::new( + u8::MAX, 155, 55, 1, + u8::MAX, 155, 55, 1, + u8::MAX, 155, 55, 1, + u8::MAX, 155, 55, 1 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + ); + #[rustfmt::skip] + let c = u8x16::new( + 3, 5, 7, 1, + 3, 5, 7, 1, + 3, 5, 7, 1, + 3, 5, 7, 1 + ); + #[rustfmt::skip] + let r = u8x16::new( + 254, 159, 48, 1, + 254, 159, 48, 1, + 254, 159, 48, 1, + 254, 159, 48, 1 + ); + + assert_eq!( + r, + mem::transmute(__msa_bmnz_v( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bmnzi_b() { + #[rustfmt::skip] + let a = u8x16::new( + u8::MAX, 155, 55, 1, + u8::MAX, 155, 55, 1, + u8::MAX, 155, 55, 1, + u8::MAX, 155, 55, 1 + ); + #[rustfmt::skip] + let b = u8x16::new( + 1, u8::MAX, 155, 55, + 1, u8::MAX, 155, 55, + 1, u8::MAX, 155, 55, + 1, u8::MAX, 155, 55 + ); + #[rustfmt::skip] + let r = u8x16::new( + 249, 159, 51, 7, + 249, 159, 51, 7, + 249, 159, 51, 7, + 249, 159, 51, 7 + ); + + assert_eq!( + r, + mem::transmute(__msa_bmnzi_b(mem::transmute(a), mem::transmute(b), 7)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bmz_v() { + #[rustfmt::skip] + let a = u8x16::new( + u8::MAX, 155, 55, 1, + u8::MAX, 155, 55, 1, + u8::MAX, 155, 55, 1, + u8::MAX, 155, 55, 1 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let c = u8x16::new( + 3, 5, 7, 1, + 3, 5, 7, 1, + 3, 5, 7, 1, + 3, 5, 7, 1 + ); + #[rustfmt::skip] + let r = u8x16::new( + 7, 3, 15, 9, + 7, 3, 15, 9, + 7, 3, 15, 9, + 7, 3, 15, 9 + ); + + assert_eq!( + r, + mem::transmute(__msa_bmz_v( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bmzi_b() { + #[rustfmt::skip] + let a = u8x16::new( + u8::MAX, 155, 55, 1, + u8::MAX, 155, 55, 1, + u8::MAX, 155, 55, 1, + u8::MAX, 155, 55, 1 + ); + #[rustfmt::skip] + let b = u8x16::new( + 1, 255, 155, 55, + 1, 255, 155, 55, + 1, 255, 155, 55, + 1, 255, 155, 55 + ); + #[rustfmt::skip] + let r = u8x16::new( + 7, 251, 159, 49, + 7, 251, 159, 49, + 7, 251, 159, 49, + 7, 251, 159, 49 + ); + + assert_eq!( + r, + mem::transmute(__msa_bmzi_b(mem::transmute(a), mem::transmute(b), 7)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bneg_b() { + #[rustfmt::skip] + let a = u8x16::new( + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = u8x16::new( + 191, 27, 54, 3, + 191, 27, 54, 3, + 191, 27, 54, 3, + 191, 27, 54, 3 + ); + + assert_eq!( + r, + mem::transmute(__msa_bneg_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bneg_h() { + #[rustfmt::skip] + let a = u16x8::new(255, 155, 55, 1, 255, 155, 55, 1); + #[rustfmt::skip] + let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9); + #[rustfmt::skip] + let r = u16x8::new(191, 27, 311, 513, 191, 27, 311, 513); + + assert_eq!( + r, + mem::transmute(__msa_bneg_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bneg_w() { + #[rustfmt::skip] + let a = 
u32x4::new(255, 155, 55, 1); + #[rustfmt::skip] + let b = u32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = u32x4::new(191, 27, 311, 513); + + assert_eq!( + r, + mem::transmute(__msa_bneg_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bneg_d() { + #[rustfmt::skip] + let a = u64x2::new(255, 155); + #[rustfmt::skip] + let b = u64x2::new(6, 7); + #[rustfmt::skip] + let r = u64x2::new(191, 27); + + assert_eq!( + r, + mem::transmute(__msa_bneg_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bnegi_b() { + #[rustfmt::skip] + let a = u8x16::new( + 50, 100, 127, u8::MAX, + 50, 100, 127, u8::MAX, + 50, 100, 127, u8::MAX, + 50, 100, 127, u8::MAX + ); + #[rustfmt::skip] + let r = u8x16::new( + 34, 116, 111, 239, + 34, 116, 111, 239, + 34, 116, 111, 239, + 34, 116, 111, 239 + ); + + assert_eq!(r, mem::transmute(__msa_bnegi_b(mem::transmute(a), 4))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bnegi_h() { + #[rustfmt::skip] + let a = u16x8::new( + 32767, 3276, 100, 127, + 32767, 3276, 100, 127 + ); + #[rustfmt::skip] + let r = u16x8::new( + 30719, 1228, 2148, 2175, + 30719, 1228, 2148, 2175 + ); + + assert_eq!(r, mem::transmute(__msa_bnegi_h(mem::transmute(a), 11))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bnegi_w() { + #[rustfmt::skip] + let a = u32x4::new(100, 2147483647, 100, 2147483648); + #[rustfmt::skip] + let r = u32x4::new(16777316, 2130706431, 16777316, 2164260864); + + assert_eq!(r, mem::transmute(__msa_bnegi_w(mem::transmute(a), 24))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bnegi_d() { + #[rustfmt::skip] + let a = u64x2::new(100, 9223372036854775808); + #[rustfmt::skip] + let r = u64x2::new(4398046511204, 9223376434901286912); + + assert_eq!(r, mem::transmute(__msa_bnegi_d(mem::transmute(a), 42))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bnz_b() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 1, 1, 1, + 1, 1, 1, 1, + 2, 2, 2, 2, + 4, 4, 0, 4, + ); + let r = 0 as i32; + + assert_eq!(r, mem::transmute(__msa_bnz_b(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bnz_h() { + #[rustfmt::skip] + let a = u16x8::new( + 32767, 3276, 100, 127, + 32767, 0, 100, 127 + ); + let r = 0 as i32; + + assert_eq!(r, mem::transmute(__msa_bnz_h(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bnz_w() { + #[rustfmt::skip] + let a = u32x4::new(100, 2147483647, 0, 2147483648); + let r = 0 as i32; + + assert_eq!(r, mem::transmute(__msa_bnz_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bnz_d() { + #[rustfmt::skip] + let a = u64x2::new(100, 9223372036854775808); + #[rustfmt::skip] + let r = 1 as i32; + + assert_eq!(r, mem::transmute(__msa_bnz_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bnz_v() { + #[rustfmt::skip] + let a = u8x16::new( + 0, 0, 0, 1, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + let r = 1 as i32; + + assert_eq!(r, mem::transmute(__msa_bnz_v(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bsel_v() { + #[rustfmt::skip] + let a = u8x16::new( + 3, 5, 7, 1, + 3, 5, 7, 1, + 3, 5, 7, 1, + 3, 5, 7, 1 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let c = u8x16::new( + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1 + ); + #[rustfmt::skip] + let r = u8x16::new( 
+ 7, 3, 15, 9, + 7, 3, 15, 9, + 7, 3, 15, 9, + 7, 3, 15, 9 + ); + + assert_eq!( + r, + mem::transmute(__msa_bsel_v( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bseli_b() { + #[rustfmt::skip] + let a = u8x16::new( + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = u8x16::new( + 121, 29, 57, 9, + 121, 29, 57, 9, + 121, 29, 57, 9, + 121, 29, 57, 9 + ); + + assert_eq!( + r, + mem::transmute(__msa_bseli_b(mem::transmute(a), mem::transmute(b), 121)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bset_b() { + #[rustfmt::skip] + let a = u8x16::new( + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = u8x16::new( + 255, 155, 55, 3, + 255, 155, 55, 3, + 255, 155, 55, 3, + 255, 155, 55, 3 + ); + + assert_eq!( + r, + mem::transmute(__msa_bset_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bset_h() { + #[rustfmt::skip] + let a = u16x8::new(255, 155, 55, 1, 255, 155, 55, 1); + #[rustfmt::skip] + let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9); + #[rustfmt::skip] + let r = u16x8::new(255, 155, 311, 513, 255, 155, 311, 513); + + assert_eq!( + r, + mem::transmute(__msa_bset_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bset_w() { + #[rustfmt::skip] + let a = u32x4::new(255, 155, 55, 1); + #[rustfmt::skip] + let b = u32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = u32x4::new(255, 155, 311, 513); + + assert_eq!( + r, + mem::transmute(__msa_bset_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bset_d() { + #[rustfmt::skip] + let a = u64x2::new(255, 155); + #[rustfmt::skip] + let b = u64x2::new(6, 7); + #[rustfmt::skip] + let r = u64x2::new(255, 155); + + assert_eq!( + r, + mem::transmute(__msa_bset_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bseti_b() { + #[rustfmt::skip] + let a = u8x16::new( + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1 + ); + #[rustfmt::skip] + let r = u8x16::new( + 255, 159, 55, 5, + 255, 159, 55, 5, + 255, 159, 55, 5, + 255, 159, 55, 5 + ); + + assert_eq!(r, mem::transmute(__msa_bseti_b(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bseti_h() { + #[rustfmt::skip] + let a = u16x8::new(255, 155, 55, 1, 255, 155, 55, 1); + #[rustfmt::skip] + let r = u16x8::new(255, 159, 55, 5, 255, 159, 55, 5); + + assert_eq!(r, mem::transmute(__msa_bseti_h(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bseti_w() { + #[rustfmt::skip] + let a = u32x4::new(255, 155, 55, 1); + #[rustfmt::skip] + let r = u32x4::new(255, 159, 55, 5); + + assert_eq!(r, mem::transmute(__msa_bseti_w(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bseti_d() { + #[rustfmt::skip] + let a = u64x2::new(255, 155); + #[rustfmt::skip] + let r = u64x2::new(255, 159); + + assert_eq!(r, mem::transmute(__msa_bseti_d(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bz_b() { + #[rustfmt::skip] + let a = u8x16::new( + 255, 155, 55, 1, + 255, 
155, 55, 1, + 255, 155, 55, 1, + 255, 155, 55, 1 + ); + let r = 0 as i32; + + assert_eq!(r, mem::transmute(__msa_bz_b(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bz_h() { + #[rustfmt::skip] + let a = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0); + let r = 1 as i32; + + assert_eq!(r, mem::transmute(__msa_bz_h(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bz_w() { + #[rustfmt::skip] + let a = u32x4::new(255, 0, 55, 1); + let r = 1 as i32; + + assert_eq!(r, mem::transmute(__msa_bz_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bz_d() { + #[rustfmt::skip] + let a = u64x2::new(255, 0); + let r = 1 as i32; + + assert_eq!(r, mem::transmute(__msa_bz_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_bz_v() { + #[rustfmt::skip] + let a = u8x16::new( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 + ); + let r = 1 as i32; + + assert_eq!(r, mem::transmute(__msa_bz_v(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ceq_b() { + #[rustfmt::skip] + let a = i8x16::new( + -128, 127, 55, 1, + -128, 127, 55, 1, + -128, 127, 55, 1, + -128, 127, 55, 1 + ); + #[rustfmt::skip] + let b = i8x16::new( + -128, 126, 55, 1, + -128, 126, 55, 1, + -128, 126, 55, 1, + -128, 126, 55, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + -1, 0, -1, -1, + -1, 0, -1, -1, + -1, 0, -1, -1, + -1, 0, -1, -1 + ); + + assert_eq!( + r, + mem::transmute(__msa_ceq_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ceq_h() { + #[rustfmt::skip] + let a = i16x8::new(255, 155, 55, 1, 255, 155, 55, 1); + #[rustfmt::skip] + let b = i16x8::new(255, 155, 56, 1, 255, 155, 56, 1); + #[rustfmt::skip] + let r = i16x8::new(-1, -1, 0, -1, -1, -1, 0, -1); + + assert_eq!( + r, + mem::transmute(__msa_ceq_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ceq_w() { + #[rustfmt::skip] + let a = i32x4::new(255, 155, 55, 1); + #[rustfmt::skip] + let b = i32x4::new(255, 156, 55, 1); + #[rustfmt::skip] + let r = i32x4::new(-1, 0, -1, -1); + + assert_eq!( + r, + mem::transmute(__msa_ceq_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ceq_d() { + #[rustfmt::skip] + let a = i64x2::new(255, 155); + #[rustfmt::skip] + let b = i64x2::new(255, 156); + #[rustfmt::skip] + let r = i64x2::new(-1, 0); + + assert_eq!( + r, + mem::transmute(__msa_ceq_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ceqi_b() { + #[rustfmt::skip] + let a = i8x16::new( + 100, -1, -4, 15, + 100, -1, -4, 15, + 100, -1, -4, 15, + 100, -1, -4, 15 + ); + #[rustfmt::skip] + let r = i8x16::new( + 0, 0, -1, 0, + 0, 0, -1, 0, + 0, 0, -1, 0, + 0, 0, -1, 0 + ); + + assert_eq!(r, mem::transmute(__msa_ceqi_b(mem::transmute(a), -4))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ceqi_h() { + #[rustfmt::skip] + let a = i16x8::new( + 32767, 3276, 100, -11, + 32767, 3276, 100, -11 + ); + #[rustfmt::skip] + let r = i16x8::new(0, 0, 0, -1, 0, 0, 0, -1); + + assert_eq!(r, mem::transmute(__msa_ceqi_h(mem::transmute(a), -11))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ceqi_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 3, 5, -3); + #[rustfmt::skip] + let r = i32x4::new(0, 0, -1, 0); + + assert_eq!(r, mem::transmute(__msa_ceqi_w(mem::transmute(a), 5))); + } + + // FIXME: https://reviews.llvm.org/D59884 + // If 
target type is i64, negative immediate loses the sign + // Test passes if 4294967293 is used instead of -3 in vector `a` + // #[simd_test(enable = "msa")] + // unsafe fn test_msa_ceqi_d() { + // #[rustfmt::skip] + // let a = i64x2::new(-3, 2); + // #[rustfmt::skip] + // let r = i64x2::new(-1, 0); + + // assert_eq!(r, mem::transmute(__msa_ceqi_d(mem::transmute(a), -3))); + // } + + // Can not be tested in user mode + // #[simd_test(enable = "msa")] + // unsafe fn test_msa_cfcmsa() { + // let r = 5; + + // assert_eq!(r, mem::transmute(__msa_cfcmsa(5))); + // } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_cle_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + -128, 127, 55, 2, + -128, 127, 55, 2, + -128, 127, 55, 2, + -128, 127, 55, 2 + ); + #[rustfmt::skip] + let b = i8x16::new( + -128, 126, 55, 1, + -128, 126, 55, 1, + -128, 126, 55, 1, + -128, 126, 55, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + -1, 0, -1, 0, + -1, 0, -1, 0, + -1, 0, -1, 0, + -1, 0, -1, 0 + ); + + assert_eq!( + r, + mem::transmute(__msa_cle_s_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_cle_s_h() { + #[rustfmt::skip] + let a = i16x8::new(255, 155, 55, 2, 255, 155, 55, 2); + #[rustfmt::skip] + let b = i16x8::new(255, 155, 56, 1, 255, 155, 56, 1); + #[rustfmt::skip] + let r = i16x8::new(-1, -1, -1, 0, -1, -1, -1, 0); + + assert_eq!( + r, + mem::transmute(__msa_cle_s_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_cle_s_w() { + #[rustfmt::skip] + let a = i32x4::new(255, 155, 55, 2); + #[rustfmt::skip] + let b = i32x4::new(255, 156, 55, 1); + #[rustfmt::skip] + let r = i32x4::new(-1, -1, -1, 0); + + assert_eq!( + r, + mem::transmute(__msa_cle_s_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_cle_s_d() { + #[rustfmt::skip] + let a = i64x2::new(255, 155); + #[rustfmt::skip] + let b = i64x2::new(255, 156); + #[rustfmt::skip] + let r = i64x2::new(-1, -1); + + assert_eq!( + r, + mem::transmute(__msa_cle_s_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_cle_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + u8::MAX, 127, 55, 2, + u8::MAX, 127, 55, 2, + u8::MAX, 127, 55, 2, + u8::MAX, 127, 55, 2 + ); + #[rustfmt::skip] + let b = u8x16::new( + u8::MAX, 126, 55, 1, + u8::MAX, 126, 55, 1, + u8::MAX, 126, 55, 1, + u8::MAX, 126, 55, 1 + ); + #[rustfmt::skip] + let r = i8x16::new(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); + + assert_eq!( + r, + mem::transmute(__msa_cle_u_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_cle_u_h() { + #[rustfmt::skip] + let a = u16x8::new( + u16::MAX, 155, 55, 2, + u16::MAX, 155, 55, 2 + ); + #[rustfmt::skip] + let b = u16x8::new( + u16::MAX, 155, 56, 1, + u16::MAX, 155, 56, 1 + ); + #[rustfmt::skip] + let r = i16x8::new(-1, -1, -1, 0, -1, -1, -1, 0); + + assert_eq!( + r, + mem::transmute(__msa_cle_u_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_cle_u_w() { + #[rustfmt::skip] + let a = u32x4::new(u32::MAX, 155, 55, 2); + #[rustfmt::skip] + let b = u32x4::new(u32::MAX, 156, 55, 1); + #[rustfmt::skip] + let r = i32x4::new(-1, -1, -1, 0); + + assert_eq!( + r, + mem::transmute(__msa_cle_u_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_cle_u_d() { + #[rustfmt::skip] + let a = u64x2::new(u64::MAX,
155); + #[rustfmt::skip] + let b = u64x2::new(u64::MAX, 156); + #[rustfmt::skip] + let r = i64x2::new(-1, -1); + + assert_eq!( + r, + mem::transmute(__msa_cle_u_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clei_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + -2, -127, 100, -127, + -2, -127, 100, -127, + -2, -127, 100, -127, + -2, -127, 100, -127 + ); + #[rustfmt::skip] + let r = i8x16::new(-1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0, -1); + + assert_eq!(r, mem::transmute(__msa_clei_s_b(mem::transmute(a), -2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clei_s_h() { + #[rustfmt::skip] + let a = i16x8::new( + 32767, 3276, 10, -1, + 32767, 3276, 10, -1, + ); + #[rustfmt::skip] + let r = i16x8::new(0, 0, 0, -1, 0, 0, 0, -1); + + assert_eq!(r, mem::transmute(__msa_clei_s_h(mem::transmute(a), -1))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clei_s_w() { + #[rustfmt::skip] + let a = i32x4::new(100, 2147483647, 6, 2147483647); + #[rustfmt::skip] + let r = i32x4::new(0, 0, -1, 0); + + assert_eq!(r, mem::transmute(__msa_clei_s_w(mem::transmute(a), 6))); + } + + // FIXME: https://reviews.llvm.org/D59884 + // If target type is i64, negative immediate loses the sign + // -3 is represented as 4294967293 + // #[simd_test(enable = "msa")] + // unsafe fn test_msa_clei_s_d() { + // #[rustfmt::skip] + // let a = i64x2::new(-3, 11); + // #[rustfmt::skip] + // let r = i64x2::new(-1, 0); + + // assert_eq!(r, mem::transmute(__msa_clei_s_d(mem::transmute(a), -3))); + // } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clei_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + 2, 127, 100, 127, + 2, 127, 100, 127, + 2, 127, 100, 127, + 2, 127, 100, 127, + ); + #[rustfmt::skip] + let r = i8x16::new( + -1, 0, 0, 0, + -1, 0, 0, 0, + -1, 0, 0, 0, + -1, 0, 0, 0 + ); + + assert_eq!(r, mem::transmute(__msa_clei_u_b(mem::transmute(a), 25))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clei_u_h() { + #[rustfmt::skip] + let a = u16x8::new( + 1, 26, 15, 36, + 1, 26, 15, 36 + ); + #[rustfmt::skip] + let r = i16x8::new(-1, 0, -1, 0, -1, 0, -1, 0); + + assert_eq!(r, mem::transmute(__msa_clei_u_h(mem::transmute(a), 25))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clei_u_w() { + #[rustfmt::skip] + let a = u32x4::new(25, 32, 25, 32); + #[rustfmt::skip] + let r = i32x4::new(-1, 0, -1, 0); + + assert_eq!(r, mem::transmute(__msa_clei_u_w(mem::transmute(a), 31))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clei_u_d() { + #[rustfmt::skip] + let a = u64x2::new(10, 26); + #[rustfmt::skip] + let r = i64x2::new(-1, 0); + + assert_eq!(r, mem::transmute(__msa_clei_u_d(mem::transmute(a), 25))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clt_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + -128, 127, 55, 2, + -128, 127, 55, 2, + -128, 127, 55, 2, + -128, 127, 55, 2 + ); + #[rustfmt::skip] + let b = i8x16::new( + -127, 126, 56, 1, + -127, 126, 56, 1, + -127, 126, 56, 1, + -127, 126, 56, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + -1, 0, -1, 0, + -1, 0, -1, 0, + -1, 0, -1, 0, + -1, 0, -1, 0 + ); + + assert_eq!( + r, + mem::transmute(__msa_clt_s_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clt_s_h() { + #[rustfmt::skip] + let a = i16x8::new(-255, 155, 55, 2, -255, 155, 55, 2); + #[rustfmt::skip] + let b = i16x8::new(255, 156, 56, 1, 255, 156, 56, 1); + #[rustfmt::skip] + let r = i16x8::new(-1, -1, -1, 0, -1, -1, 
-1, 0); + + assert_eq!( + r, + mem::transmute(__msa_clt_s_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clt_s_w() { + #[rustfmt::skip] + let a = i32x4::new(-255, 155, 55, 2); + #[rustfmt::skip] + let b = i32x4::new(255, 156, 55, 1); + #[rustfmt::skip] + let r = i32x4::new(-1, -1, 0, 0); + + assert_eq!( + r, + mem::transmute(__msa_clt_s_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clt_s_d() { + #[rustfmt::skip] + let a = i64x2::new(-255, 155); + #[rustfmt::skip] + let b = i64x2::new(255, 156); + #[rustfmt::skip] + let r = i64x2::new(-1, -1); + + assert_eq!( + r, + mem::transmute(__msa_clt_s_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clt_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + 128, 127, 55, 2, + 128, 127, 55, 2, + 128, 127, 55, 2, + 128, 127, 55, 2 + ); + #[rustfmt::skip] + let b = u8x16::new( + 127, 126, 56, 1, + 127, 126, 56, 1, + 127, 126, 56, 1, + 127, 126, 56, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + 0, 0, -1, 0, + 0, 0, -1, 0, + 0, 0, -1, 0, + 0, 0, -1, 0 + ); + + assert_eq!( + r, + mem::transmute(__msa_clt_u_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clt_u_h() { + #[rustfmt::skip] + let a = u16x8::new(255, 155, 55, 2, 255, 155, 55, 2); + #[rustfmt::skip] + let b = u16x8::new(255, 156, 56, 1, 255, 156, 56, 1); + #[rustfmt::skip] + let r = i16x8::new(0, -1, -1, 0, 0, -1, -1, 0); + + assert_eq!( + r, + mem::transmute(__msa_clt_u_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clt_u_w() { + #[rustfmt::skip] + let a = u32x4::new(255, 155, 55, 2); + #[rustfmt::skip] + let b = u32x4::new(255, 156, 55, 1); + #[rustfmt::skip] + let r = i32x4::new(0, -1, 0, 0); + + assert_eq!( + r, + mem::transmute(__msa_clt_u_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clt_u_d() { + #[rustfmt::skip] + let a = u64x2::new(255, 155); + #[rustfmt::skip] + let b = u64x2::new(255, 156); + #[rustfmt::skip] + let r = i64x2::new(0, -1); + + assert_eq!( + r, + mem::transmute(__msa_clt_u_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clti_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + 2, -127, -5, 127, + 2, -127, -5, 127, + 2, -127, -5, 127, + 2, -127, -5, 127 + ); + #[rustfmt::skip] + let r = i8x16::new( + 0, -1, 0, 0, + 0, -1, 0, 0, + 0, -1, 0, 0, + 0, -1, 0, 0 + ); + + assert_eq!(r, mem::transmute(__msa_clti_s_b(mem::transmute(a), -5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clti_s_h() { + #[rustfmt::skip] + let a = i16x8::new( + -1024, 3276, 15, 127, + -1024, 3276, 15, 127 + ); + #[rustfmt::skip] + let r = i16x8::new(-1, 0, 0, 0, -1, 0, 0, 0); + + assert_eq!(r, mem::transmute(__msa_clti_s_h(mem::transmute(a), 15))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clti_s_w() { + #[rustfmt::skip] + let a = i32x4::new(-15, 2147483647, -15, 2147483647); + #[rustfmt::skip] + let r = i32x4::new(-1, 0, -1, 0); + + assert_eq!(r, mem::transmute(__msa_clti_s_w(mem::transmute(a), -10))); + } + + // FIXME: https://reviews.llvm.org/D59884 + // If target type is i64, negative immediate loses the sign + // -3 is represented as 4294967293 + // #[simd_test(enable = "msa")] + // unsafe fn test_msa_clti_s_d() { + // #[rustfmt::skip] + // let a = i64x2::new(-5, -2); + // 
#[rustfmt::skip] + // let r = i64x2::new(-1, 0); + + // assert_eq!(r, mem::transmute(__msa_clti_s_d(mem::transmute(a), -3))); + // } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clti_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + 2, 127, 49, 127, + 2, 127, 49, 127, + 2, 127, 49, 127, + 2, 127, 49, 127, + ); + #[rustfmt::skip] + let r = i8x16::new( + -1, 0, 0, 0, + -1, 0, 0, 0, + -1, 0, 0, 0, + -1, 0, 0, 0 + ); + + assert_eq!(r, mem::transmute(__msa_clti_u_b(mem::transmute(a), 50))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clti_u_h() { + #[rustfmt::skip] + let a = u16x8::new( + 327, 3276, 100, 127, + 327, 3276, 100, 127 + ); + #[rustfmt::skip] + let r = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0); + + assert_eq!(r, mem::transmute(__msa_clti_u_h(mem::transmute(a), 30))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clti_u_w() { + #[rustfmt::skip] + let a = u32x4::new(100, 2147483647, 100, 2147483647); + #[rustfmt::skip] + let r = i32x4::new(0, 0, 0, 0); + + assert_eq!(r, mem::transmute(__msa_clti_u_w(mem::transmute(a), 10))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_clti_u_d() { + #[rustfmt::skip] + let a = u64x2::new(1, 9223372036854775807); + #[rustfmt::skip] + let r = i64x2::new(-1, 0); + + assert_eq!(r, mem::transmute(__msa_clti_u_d(mem::transmute(a), 10))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_copy_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + -100, 127, 4, 127, + -100, 127, 4, 127, + -100, 127, 4, 127, + -100, 127, 4, 127 + ); + #[rustfmt::skip] + let r = -100 as i32; + + assert_eq!(r, mem::transmute(__msa_copy_s_b(mem::transmute(a), 12))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_copy_s_h() { + #[rustfmt::skip] + let a = i16x8::new( + 32767, 3276, 100, 11, + 32767, 3276, 100, 11 + ); + #[rustfmt::skip] + let r = 32767 as i32; + + assert_eq!(r, mem::transmute(__msa_copy_s_h(mem::transmute(a), 4))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_copy_s_w() { + #[rustfmt::skip] + let a = i32x4::new(100, 2147483647, 5, -2147483647); + let r = 2147483647 as i32; + + assert_eq!(r, mem::transmute(__msa_copy_s_w(mem::transmute(a), 1))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_copy_s_d() { + #[rustfmt::skip] + let a = i64x2::new(3, 9223372036854775807); + #[rustfmt::skip] + let r = 9223372036854775807 as i64; + + assert_eq!(r, mem::transmute(__msa_copy_s_d(mem::transmute(a), 1))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_copy_u_b() { + #[rustfmt::skip] + let a = i8x16::new( + 100, 127, 4, 127, + 100, 127, 4, 127, + 100, 127, 4, 127, + 100, 127, 4, 127 + ); + #[rustfmt::skip] + let r = 100 as u32; + + assert_eq!(r, mem::transmute(__msa_copy_u_b(mem::transmute(a), 12))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_copy_u_h() { + #[rustfmt::skip] + let a = i16x8::new( + 32767, 3276, 100, 11, + 32767, 3276, 100, 11 + ); + #[rustfmt::skip] + let r = 32767 as u32; + + assert_eq!(r, mem::transmute(__msa_copy_u_h(mem::transmute(a), 4))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_copy_u_w() { + #[rustfmt::skip] + let a = i32x4::new(100, 2147483647, 5, 2147483647); + #[rustfmt::skip] + let r = 2147483647 as u32; + + assert_eq!(r, mem::transmute(__msa_copy_u_w(mem::transmute(a), 1))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_copy_u_d() { + #[rustfmt::skip] + let a = i64x2::new(3, i64::MAX); + #[rustfmt::skip] + let r = 9223372036854775807 as u64; + + assert_eq!(r, mem::transmute(__msa_copy_u_d(mem::transmute(a), 1))); + 
} + + // Can not be tested in user mode + // #[simd_test(enable = "msa")] + // unsafe fn test_msa_ctcmsa() { + // } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_div_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + -6, -7, -8, -9, + -6, -7, -8, -9, + -6, -7, -8, -9, + -6, -7, -8, -9 + ); + #[rustfmt::skip] + let b = i8x16::new( + -1, -2, -3, -4, + -1, -2, -3, -4, + -1, -2, -3, -4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let r = i8x16::new( + 6, 3, 2, 2, + 6, 3, 2, 2, + 6, 3, 2, 2, + 6, 3, 2, 2 + ); + + assert_eq!( + r, + mem::transmute(__msa_div_s_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_div_s_h() { + #[rustfmt::skip] + let a = i16x8::new(-6, -7, -8, -9, 6, 7, 8, 9); + #[rustfmt::skip] + let b = i16x8::new(-1, -2, -3, -4, -1, -2, -3, -4); + #[rustfmt::skip] + let r = i16x8::new(6, 3, 2, 2, -6, -3, -2, -2); + + assert_eq!( + r, + mem::transmute(__msa_div_s_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_div_s_w() { + #[rustfmt::skip] + let a = i32x4::new(-6, -7, 8, 9); + #[rustfmt::skip] + let b = i32x4::new(-1, -2, -3, -4); + #[rustfmt::skip] + let r = i32x4::new(6, 3, -2, -2); + + assert_eq!( + r, + mem::transmute(__msa_div_s_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_div_s_d() { + #[rustfmt::skip] + let a = i64x2::new(-6, 7); + #[rustfmt::skip] + let b = i64x2::new(-1, -2); + #[rustfmt::skip] + let r = i64x2::new(6, -3); + + assert_eq!( + r, + mem::transmute(__msa_div_s_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_div_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let b = u8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let r = u8x16::new( + 6, 3, 2, 2, + 6, 3, 2, 2, + 6, 3, 2, 2, + 6, 3, 2, 2 + ); + + assert_eq!( + r, + mem::transmute(__msa_div_u_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_div_u_h() { + #[rustfmt::skip] + let a = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9); + #[rustfmt::skip] + let b = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4); + #[rustfmt::skip] + let r = u16x8::new(6, 3, 2, 2, 6, 3, 2, 2); + + assert_eq!( + r, + mem::transmute(__msa_div_u_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_div_u_w() { + #[rustfmt::skip] + let a = u32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let b = u32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let r = u32x4::new(6, 3, 2, 2); + + assert_eq!( + r, + mem::transmute(__msa_div_u_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_div_u_d() { + #[rustfmt::skip] + let a = u64x2::new(6, 7); + #[rustfmt::skip] + let b = u64x2::new(1, 2); + #[rustfmt::skip] + let r = u64x2::new(6, 3); + + assert_eq!( + r, + mem::transmute(__msa_div_u_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dotp_s_h() { + #[rustfmt::skip] + let a = i8x16::new( + -1, -2, -3, 4, + -1, -2, -3, -4, + -1, -2, -3, 4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let b = i8x16::new( + -6, -7, -8, -9, + -6, -7, -8, -9, + -6, -7, -8, -9, + -6, -7, -8, -9 + ); + #[rustfmt::skip] + let r = i16x8::new(20, -12, 20, 60, 20, -12, 20, 60); + + assert_eq!( + r, + mem::transmute(__msa_dotp_s_h(mem::transmute(a), 
mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dotp_s_w() { + #[rustfmt::skip] + let a = i16x8::new(-1, -2, -3, -4, -1, -2, -3, 4); + #[rustfmt::skip] + let b = i16x8::new(-6, -7, -8, -9, -6, -7, -8, -9); + #[rustfmt::skip] + let r = i32x4::new(20, 60, 20, -12); + + assert_eq!( + r, + mem::transmute(__msa_dotp_s_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dotp_s_d() { + #[rustfmt::skip] + let a = i32x4::new(-1, -2, -3, 4); + #[rustfmt::skip] + let b = i32x4::new(-6, -7, -8, -9); + #[rustfmt::skip] + let r = i64x2::new(20, -12); + + assert_eq!( + r, + mem::transmute(__msa_dotp_s_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dotp_u_h() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = u16x8::new(20, 60, 20, 60, 20, 60, 20, 60); + + assert_eq!( + r, + mem::transmute(__msa_dotp_u_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dotp_u_w() { + #[rustfmt::skip] + let a = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4); + #[rustfmt::skip] + let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9); + #[rustfmt::skip] + let r = u32x4::new(20, 60, 20, 60); + + assert_eq!( + r, + mem::transmute(__msa_dotp_u_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dotp_u_d() { + #[rustfmt::skip] + let a = u32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = u32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = u64x2::new(20, 60); + + assert_eq!( + r, + mem::transmute(__msa_dotp_u_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dpadd_s_h() { + #[rustfmt::skip] + let a = i16x8::new(-1, -2, -3, -4, -1, -2, -3, 4); + #[rustfmt::skip] + let b = i8x16::new( + -1, -2, -3, 4, + -1, -2, -3, -4, + -1, -2, -3, 4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let c = i8x16::new( + -6, -7, -8, -9, + -6, -7, -8, -9, + -6, -7, -8, -9, + -6, -7, -8, -9 + ); + #[rustfmt::skip] + let r = i16x8::new(19, -14, 17, 56, 19, -14, 17, 64); + + assert_eq!( + r, + mem::transmute(__msa_dpadd_s_h( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dpadd_s_w() { + #[rustfmt::skip] + let a = i32x4::new(-1, -2, -3, -4); + #[rustfmt::skip] + let b = i16x8::new( + -1, -2, -3, 4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let c = i16x8::new( + -6, -7, -8, -9, + -6, -7, -8, -9 + ); + #[rustfmt::skip] + let r = i32x4::new(19, -14, 17, 56); + + assert_eq!( + r, + mem::transmute(__msa_dpadd_s_w( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dpadd_s_d() { + #[rustfmt::skip] + let a = i64x2::new(-1, -2); + #[rustfmt::skip] + let b = i32x4::new(-1, -2, -3, 4); + #[rustfmt::skip] + let c = i32x4::new(-6, -7, -8, -9); + #[rustfmt::skip] + let r = i64x2::new(19, -14); + + assert_eq!( + r, + mem::transmute(__msa_dpadd_s_d( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dpadd_u_h() { + #[rustfmt::skip] + let a = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4); + #[rustfmt::skip] + let b = u8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + 
#[rustfmt::skip] + let c = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = u16x8::new(21, 62, 23, 64, 21, 62, 23, 64); + + assert_eq!( + r, + mem::transmute(__msa_dpadd_u_h( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dpadd_u_w() { + #[rustfmt::skip] + let a = u32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = u16x8::new( + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let c = u16x8::new( + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = u32x4::new(21, 62, 23, 64); + + assert_eq!( + r, + mem::transmute(__msa_dpadd_u_w( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dpadd_u_d() { + #[rustfmt::skip] + let a = u64x2::new(1, 2); + #[rustfmt::skip] + let b = u32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let c = u32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = u64x2::new(21, 62); + + assert_eq!( + r, + mem::transmute(__msa_dpadd_u_d( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dpsub_s_h() { + #[rustfmt::skip] + let a = i16x8::new(-1, -2, -3, -4, -1, -2, -3, 4); + #[rustfmt::skip] + let b = i8x16::new( + -1, -2, -3, 4, + -1, -2, -3, -4, + -1, -2, -3, 4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let c = i8x16::new( + -6, -7, -8, -9, + -6, -7, -8, -9, + -6, -7, -8, -9, + -6, -7, -8, -9 + ); + #[rustfmt::skip] + let r = i16x8::new(-21, 10, -23, -64, -21, 10, -23, -56); + + assert_eq!( + r, + mem::transmute(__msa_dpsub_s_h( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dpsub_s_w() { + #[rustfmt::skip] + let a = i32x4::new(-1, -2, -3, -4); + #[rustfmt::skip] + let b = i16x8::new( + -1, -2, -3, 4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let c = i16x8::new( + -6, -7, -8, -9, + -6, -7, -8, -9 + ); + #[rustfmt::skip] + let r = i32x4::new(-21, 10, -23, -64); + + assert_eq!( + r, + mem::transmute(__msa_dpsub_s_w( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dpsub_s_d() { + #[rustfmt::skip] + let a = i64x2::new(-1, -2); + #[rustfmt::skip] + let b = i32x4::new(-1, -2, -3, 4); + #[rustfmt::skip] + let c = i32x4::new(-6, -7, -8, -9); + #[rustfmt::skip] + let r = i64x2::new(-21, 10); + + assert_eq!( + r, + mem::transmute(__msa_dpsub_s_d( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dpsub_u_h() { + #[rustfmt::skip] + let a = i16x8::new(1, -2, 3, -4, -1, 2,-3, 4); + #[rustfmt::skip] + let b = u8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let c = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = i16x8::new(-19, -62, -17, -64, -21, -58, -23, -56); + + assert_eq!( + r, + mem::transmute(__msa_dpsub_u_h( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dpsub_u_w() { + #[rustfmt::skip] + let a = i32x4::new(1, -2, 3, -4); + #[rustfmt::skip] + let b = u16x8::new( + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let c = u16x8::new( + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = i32x4::new(-19, -62, -17, -64); + + assert_eq!( + r, + 
mem::transmute(__msa_dpsub_u_w( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_dpsub_u_d() { + #[rustfmt::skip] + let a = i64x2::new(1, -2); + #[rustfmt::skip] + let b = u32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let c = u32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = i64x2::new(-19, -62); + + assert_eq!( + r, + mem::transmute(__msa_dpsub_u_d( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fadd_w() { + #[rustfmt::skip] + let a = f32x4::new(1.1, -2.2, 3.3, -4.4); + #[rustfmt::skip] + let b = f32x4::new(4.4, -3.3, 2.2, -1.1); + #[rustfmt::skip] + let r = f32x4::new(5.5, -5.5, 5.5, -5.5); + + assert_eq!( + r, + mem::transmute(__msa_fadd_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fadd_d() { + #[rustfmt::skip] + let a = f64x2::new(1.1, -2.2); + #[rustfmt::skip] + let b = f64x2::new(4.4, -3.3); + #[rustfmt::skip] + let r = f64x2::new(5.5, -5.5); + + assert_eq!( + r, + mem::transmute(__msa_fadd_d(mem::transmute(a), mem::transmute(b))) + ); + } + + // The only observable behaviour should be a SIGFPE signal + // Can not be tested + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcaf_w() { + #[rustfmt::skip] + let a = f32x4::new(1.1, -2.2, 3.3, -4.4); + #[rustfmt::skip] + let b = f32x4::new(0.0, -1.2, 3.3, f32::NAN); + #[rustfmt::skip] + let r = i32x4::new(0, 0, 0, 0); + + assert_eq!( + r, + mem::transmute(__msa_fcaf_w(mem::transmute(a), mem::transmute(b))) + ); + } + + // The only observable behaviour should be a SIGFPE signal + // Can not be tested + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcaf_d() { + #[rustfmt::skip] + let a = f64x2::new(1.1, -2.2); + #[rustfmt::skip] + let b = f64x2::new(-2.2, 1.1); + #[rustfmt::skip] + let r = i64x2::new(0, 0); + + assert_eq!( + r, + mem::transmute(__msa_fcaf_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fceq_w() { + #[rustfmt::skip] + let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN); + #[rustfmt::skip] + let b = f32x4::new(-4.4, -2.2, 3.3, f32::NAN); + #[rustfmt::skip] + let r = i32x4::new(0, -1, -1, 0); + + assert_eq!( + r, + mem::transmute(__msa_fceq_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fceq_d() { + #[rustfmt::skip] + let a = f64x2::new(1.1, -2.2); + #[rustfmt::skip] + let b = f64x2::new(1.1, 1.1); + #[rustfmt::skip] + let r = i64x2::new(-1, 0); + + assert_eq!( + r, + mem::transmute(__msa_fceq_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fclass_w() { + #[rustfmt::skip] + let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN); + #[rustfmt::skip] + let r = i32x4::new(128, 8, 128, 2); + + assert_eq!(r, mem::transmute(__msa_fclass_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fclass_d() { + #[rustfmt::skip] + let a = f64x2::new(1.1, -2.2); + #[rustfmt::skip] + let r = i64x2::new(128, 8); + + assert_eq!(r, mem::transmute(__msa_fclass_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcle_w() { + #[rustfmt::skip] + let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN); + #[rustfmt::skip] + let b = f32x4::new(-4.4, -1.2, 3.3, f32::NAN); + #[rustfmt::skip] + let r = i32x4::new(0, -1, -1, 0); + + assert_eq!( + r, + mem::transmute(__msa_fcle_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable =
"msa")] + unsafe fn test_msa_fcle_d() { + #[rustfmt::skip] + let a = f64x2::new(1.1, -2.2); + #[rustfmt::skip] + let b = f64x2::new(1.1, 1.1); + #[rustfmt::skip] + let r = i64x2::new(-1, -1); + + assert_eq!( + r, + mem::transmute(__msa_fcle_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fclt_w() { + #[rustfmt::skip] + let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN); + #[rustfmt::skip] + let b = f32x4::new(-4.4, -1.2, 3.3, f32::NAN); + #[rustfmt::skip] + let r = i32x4::new(0, -1, 0, 0); + + assert_eq!( + r, + mem::transmute(__msa_fclt_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fclt_d() { + #[rustfmt::skip] + let a = f64x2::new(1.1, -2.2); + #[rustfmt::skip] + let b = f64x2::new(1.1, 1.1); + #[rustfmt::skip] + let r = i64x2::new(0, -1); + + assert_eq!( + r, + mem::transmute(__msa_fclt_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcne_w() { + #[rustfmt::skip] + let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN); + #[rustfmt::skip] + let b = f32x4::new(-4.4, -1.2, 3.3, f32::NAN); + #[rustfmt::skip] + let r = i32x4::new(-1, -1, 0, 0); + + assert_eq!( + r, + mem::transmute(__msa_fcne_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcne_d() { + #[rustfmt::skip] + let a = f64x2::new(1.1, -2.2); + #[rustfmt::skip] + let b = f64x2::new(1.1, 1.1); + #[rustfmt::skip] + let r = i64x2::new(0, -1); + + assert_eq!( + r, + mem::transmute(__msa_fcne_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcor_w() { + #[rustfmt::skip] + let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN); + #[rustfmt::skip] + let b = f32x4::new(f32::NAN, -1.2, 3.3, f32::NAN); + #[rustfmt::skip] + let r = i32x4::new(0, -1, -1, 0); + + assert_eq!( + r, + mem::transmute(__msa_fcor_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcor_d() { + #[rustfmt::skip] + let a = f64x2::new(1.1, f64::NAN); + #[rustfmt::skip] + let b = f64x2::new(1.1, 1.1); + #[rustfmt::skip] + let r = i64x2::new(-1, 0); + + assert_eq!( + r, + mem::transmute(__msa_fcor_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcueq_w() { + #[rustfmt::skip] + let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN); + #[rustfmt::skip] + let b = f32x4::new(f32::NAN, -1.2, 3.3, f32::NAN); + #[rustfmt::skip] + let r = i32x4::new(-1, 0, -1, -1); + + assert_eq!( + r, + mem::transmute(__msa_fcueq_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcueq_d() { + #[rustfmt::skip] + let a = f64x2::new(1.1, f64::NAN); + #[rustfmt::skip] + let b = f64x2::new(1.1, 1.1); + #[rustfmt::skip] + let r = i64x2::new(-1, -1); + + assert_eq!( + r, + mem::transmute(__msa_fcueq_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcule_w() { + #[rustfmt::skip] + let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN); + #[rustfmt::skip] + let b = f32x4::new(f32::NAN, -1.2, 3.3, f32::NAN); + #[rustfmt::skip] + let r = i32x4::new(-1, -1, -1, -1); + + assert_eq!( + r, + mem::transmute(__msa_fcule_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcule_d() { + #[rustfmt::skip] + let a = f64x2::new(1.1, f64::NAN); + #[rustfmt::skip] + let b = f64x2::new(1.1, 1.1); + #[rustfmt::skip] + let r = 
i64x2::new(-1, -1); + + assert_eq!( + r, + mem::transmute(__msa_fcule_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcult_w() { + #[rustfmt::skip] + let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN); + #[rustfmt::skip] + let b = f32x4::new(f32::NAN, -1.2, 3.3, f32::NAN); + #[rustfmt::skip] + let r = i32x4::new(-1, -1, 0, -1); + + assert_eq!( + r, + mem::transmute(__msa_fcult_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcult_d() { + #[rustfmt::skip] + let a = f64x2::new(1.1, f64::NAN); + #[rustfmt::skip] + let b = f64x2::new(1.1, 1.1); + #[rustfmt::skip] + let r = i64x2::new(0, -1); + + assert_eq!( + r, + mem::transmute(__msa_fcult_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcun_w() { + #[rustfmt::skip] + let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN); + #[rustfmt::skip] + let b = f32x4::new(f32::NAN, -1.2, 3.3, f32::NAN); + #[rustfmt::skip] + let r = i32x4::new(-1, 0, 0, -1); + + assert_eq!( + r, + mem::transmute(__msa_fcun_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcun_d() { + #[rustfmt::skip] + let a = f64x2::new(1.1, f64::NAN); + #[rustfmt::skip] + let b = f64x2::new(1.1, 1.1); + #[rustfmt::skip] + let r = i64x2::new(0, -1); + + assert_eq!( + r, + mem::transmute(__msa_fcun_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcune_w() { + #[rustfmt::skip] + let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN); + #[rustfmt::skip] + let b = f32x4::new(f32::NAN, -1.2, 3.3, f32::NAN); + #[rustfmt::skip] + let r = i32x4::new(-1, -1, 0, -1); + + assert_eq!( + r, + mem::transmute(__msa_fcune_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fcune_d() { + #[rustfmt::skip] + let a = f64x2::new(1.1, f64::NAN); + #[rustfmt::skip] + let b = f64x2::new(1.1, 1.1); + #[rustfmt::skip] + let r = i64x2::new(0, -1); + + assert_eq!( + r, + mem::transmute(__msa_fcune_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fdiv_w() { + #[rustfmt::skip] + let a = f32x4::new(5.25, -20.2, 333.333, -425.0); + #[rustfmt::skip] + let b = f32x4::new(4.0, -2.1, 11.11, 8.2); + #[rustfmt::skip] + let r = f32x4::new(1.3125, 9.619048, 30.002972, -51.82927); + + assert_eq!( + r, + mem::transmute(__msa_fdiv_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fdiv_d() { + #[rustfmt::skip] + let a = f64x2::new(1111.11, -222222.2); + #[rustfmt::skip] + let b = f64x2::new(-4.85, 3.33); + #[rustfmt::skip] + let r = f64x2::new(-229.09484536082473, -66733.3933933934); + + assert_eq!( + r, + mem::transmute(__msa_fdiv_d(mem::transmute(a), mem::transmute(b))) + ); + } + + /*// FIXME: 16-bit floats + #[simd_test(enable = "msa")] + unsafe fn test_msa_fexdo_h() { + #[rustfmt::skip] + let a = f32x4::new(20.5, 2.3, 4.5, 5.4); + #[rustfmt::skip] + let b = f32x4::new(1.1, 1.0, 1.0, 1.0); + let r = i16x8::new(1, 9, 30, 51, 1, 9, 30, 51); + + assert_eq!(r, mem::transmute(__msa_fexdo_h(mem::transmute(a), mem::transmute(b)))); + }*/ + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fexdo_w() { + #[rustfmt::skip] + let a = f64x2::new(2000005.5, 2.3); + #[rustfmt::skip] + let b = f64x2::new(1235689784512.1, 2147483649998.5); + #[rustfmt::skip] + let r = f32x4::new( + 1235689800000.0, 2147483600000.0, + 2000005.5, 2.3 
+ ); + + assert_eq!( + r, + mem::transmute(__msa_fexdo_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fexp2_w() { + #[rustfmt::skip] + let a = f32x4::new(1.1, -2.2, 3.3, -4.4); + #[rustfmt::skip] + let b = i32x4::new(4, -3, 2, 1); + #[rustfmt::skip] + let r = f32x4::new(17.6, -0.275, 13.2, -8.8); + + assert_eq!( + r, + mem::transmute(__msa_fexp2_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fexp2_d() { + #[rustfmt::skip] + let a = f64x2::new(1.1, -2.2); + #[rustfmt::skip] + let b = i64x2::new(-4, 3); + #[rustfmt::skip] + let r = f64x2::new(0.06875, -17.6); + + assert_eq!( + r, + mem::transmute(__msa_fexp2_d(mem::transmute(a), mem::transmute(b))) + ); + } + + // FIXME: 16-bit floats + // #[simd_test(enable = "msa")] + // unsafe fn test_msa_fexupl_w() { + // #[rustfmt::skip] + // let a = f16x8(1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5); + // #[rustfmt::skip] + // let r = f32x4::new(5.5, 6.5, 7.5, 8.5); + + // assert_eq!(r, mem::transmute(__msa_fexupl_w(mem::transmute(a)))); + // } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fexupl_d() { + #[rustfmt::skip] + let a = f32x4::new(5.5, 6.5, 7.5, 8.5); + #[rustfmt::skip] + let r = f64x2::new(7.5, 8.5); + + assert_eq!(r, mem::transmute(__msa_fexupl_d(mem::transmute(a)))); + } + + // FIXME: 16-bit floats + // #[simd_test(enable = "msa")] + // unsafe fn test_msa_fexupr_w() { + // #[rustfmt::skip] + // let a = f16x8(1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5); + // #[rustfmt::skip] + // let r = f32x4::new(1.5, 2.5, 3.5, 4.5); + + // assert_eq!(r, mem::transmute(__msa_fexupr_w(mem::transmute(a)))); + // } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fexupr_d() { + #[rustfmt::skip] + let a = f32x4::new(5.5, 6.5, 7.5, 8.5); + #[rustfmt::skip] + let r = f64x2::new(5.5, 6.5); + + assert_eq!(r, mem::transmute(__msa_fexupr_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ffint_s_w() { + #[rustfmt::skip] + let a = i32x4::new(-1, 2, -3, 4); + #[rustfmt::skip] + let r = f32x4::new(-1.0, 2.0, -3.0, 4.0); + + assert_eq!(r, mem::transmute(__msa_ffint_s_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ffint_s_d() { + #[rustfmt::skip] + let a = i64x2::new(-1, 2); + #[rustfmt::skip] + let r = f64x2::new(-1.0, 2.0); + + assert_eq!(r, mem::transmute(__msa_ffint_s_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ffint_u_w() { + #[rustfmt::skip] + let a = u32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let r = f32x4::new(1.0, 2.0, 3.0, 4.0); + + assert_eq!(r, mem::transmute(__msa_ffint_u_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ffint_u_d() { + #[rustfmt::skip] + let a = u64x2::new(1, 2); + #[rustfmt::skip] + let r = f64x2::new(1.0, 2.0); + + assert_eq!(r, mem::transmute(__msa_ffint_u_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ffql_w() { + #[rustfmt::skip] + let a = i16x8::new(11, 25, 33, 47, 11, 25, 33, 47); + #[rustfmt::skip] + let r = f32x4::new( + 0.00033569336, 0.00076293945, + 0.0010070801, 0.0014343262 + ); + + assert_eq!(r, mem::transmute(__msa_ffql_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ffql_d() { + #[rustfmt::skip] + let a = i32x4::new(1111, 2222, 3333, 4444); + #[rustfmt::skip] + let r = f64x2::new( + 0.000001552049070596695, + 0.0000020693987607955933 + ); + + assert_eq!(r, 
mem::transmute(__msa_ffql_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ffqr_w() { + #[rustfmt::skip] + let a = i16x8::new(12, 26, 34, 48, 11, 25, 33, 47); + #[rustfmt::skip] + let r = f32x4::new( + 0.00036621094, 0.00079345703, + 0.0010375977, 0.0014648438 + ); + + assert_eq!(r, mem::transmute(__msa_ffqr_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ffqr_d() { + #[rustfmt::skip] + let a = i32x4::new(1111, 2555, 3333, 475); + #[rustfmt::skip] + let r = f64x2::new( + 0.0000005173496901988983, + 0.0000011897645890712738 + ); + + assert_eq!(r, mem::transmute(__msa_ffqr_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fill_b() { + #[rustfmt::skip] + let r = i8x16::new( + 2, 2, 2, 2, + 2, 2, 2, 2, + 2, 2, 2, 2, + 2, 2, 2, 2 + ); + + assert_eq!(r, mem::transmute(__msa_fill_b(2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fill_h() { + #[rustfmt::skip] + let r = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + + assert_eq!(r, mem::transmute(__msa_fill_h(2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fill_w() { + #[rustfmt::skip] + let r = i32x4::new(2, 2, 2, 2); + + assert_eq!(r, mem::transmute(__msa_fill_w(2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fill_d() { + #[rustfmt::skip] + let r = i64x2::new(2, 2); + + assert_eq!(r, mem::transmute(__msa_fill_d(2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_flog2_w() { + #[rustfmt::skip] + let a = f32x4::new(8.0, 16.0, 32.0, 64.0); + #[rustfmt::skip] + let r = f32x4::new(3.0, 4.0, 5.0, 6.0); + + assert_eq!(r, mem::transmute(__msa_flog2_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_flog2_d() { + #[rustfmt::skip] + let a = f64x2::new(8.0, 16.0); + #[rustfmt::skip] + let r = f64x2::new(3.0, 4.0); + + assert_eq!(r, mem::transmute(__msa_flog2_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fmadd_w() { + #[rustfmt::skip] + let a = f32x4::new(1.0, 2.0, 3.0, 4.0); + #[rustfmt::skip] + let b = f32x4::new(5.0, 6.0, 7.0, 8.0); + #[rustfmt::skip] + let c = f32x4::new(9.0, 10.0, 11.0, 12.0); + #[rustfmt::skip] + let r = f32x4::new(46.0, 62.0, 80.0, 100.0); + + assert_eq!( + r, + mem::transmute(__msa_fmadd_w( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fmadd_d() { + #[rustfmt::skip] + let a = f64x2::new(1.0, 2.0); + #[rustfmt::skip] + let b = f64x2::new(3.0, 4.0); + #[rustfmt::skip] + let c = f64x2::new(5.0, 6.0); + #[rustfmt::skip] + let r = f64x2::new(16.0, 26.0); + + assert_eq!( + r, + mem::transmute(__msa_fmadd_d( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fmax_w() { + #[rustfmt::skip] + let a = f32x4::new(1.0, -6.0, 7.0, 8.0); + #[rustfmt::skip] + let b = f32x4::new(5.0, -2.0, 3.0, 4.0); + #[rustfmt::skip] + let r = f32x4::new(5.0, -2.0, 7.0, 8.0); + + assert_eq!( + r, + mem::transmute(__msa_fmax_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fmax_d() { + #[rustfmt::skip] + let a = f64x2::new(1.0, 4.0); + #[rustfmt::skip] + let b = f64x2::new(3.0, 2.0); + #[rustfmt::skip] + let r = f64x2::new(3.0, 4.0); + + assert_eq!( + r, + mem::transmute(__msa_fmax_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fmax_a_w() { + #[rustfmt::skip] + let a = 
f32x4::new(1.0, -6.0, -7.0, -8.0); + #[rustfmt::skip] + let b = f32x4::new(5.0, -2.0, 3.0, 4.0); + #[rustfmt::skip] + let r = f32x4::new(5.0, -6.0, -7.0, -8.0); + + assert_eq!( + r, + mem::transmute(__msa_fmax_a_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fmax_a_d() { + #[rustfmt::skip] + let a = f64x2::new(1.0, -4.0); + #[rustfmt::skip] + let b = f64x2::new(3.0, 2.0); + #[rustfmt::skip] + let r = f64x2::new(3.0, -4.0); + + assert_eq!( + r, + mem::transmute(__msa_fmax_a_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fmin_w() { + #[rustfmt::skip] + let a = f32x4::new(1.0, -6.0, 7.0, 8.0); + #[rustfmt::skip] + let b = f32x4::new(5.0, -2.0, 3.0, 4.0); + #[rustfmt::skip] + let r = f32x4::new(1.0, -6.0, 3.0, 4.0); + + assert_eq!( + r, + mem::transmute(__msa_fmin_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fmin_d() { + #[rustfmt::skip] + let a = f64x2::new(1.0, 4.0); + #[rustfmt::skip] + let b = f64x2::new(3.0, 2.0); + #[rustfmt::skip] + let r = f64x2::new(1.0, 2.0); + + assert_eq!( + r, + mem::transmute(__msa_fmin_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fmin_a_w() { + #[rustfmt::skip] + let a = f32x4::new(1.0, -6.0, -7.0, -8.0); + #[rustfmt::skip] + let b = f32x4::new(5.0, -2.0, 3.0, 4.0); + #[rustfmt::skip] + let r = f32x4::new(1.0, -2.0, 3.0, 4.0); + + assert_eq!( + r, + mem::transmute(__msa_fmin_a_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fmin_a_d() { + #[rustfmt::skip] + let a = f64x2::new(1.0, -4.0); + #[rustfmt::skip] + let b = f64x2::new(3.0, 2.0); + #[rustfmt::skip] + let r = f64x2::new(1.0, 2.0); + + assert_eq!( + r, + mem::transmute(__msa_fmin_a_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fmsub_w() { + #[rustfmt::skip] + let a = f32x4::new(1.0, 2.0, 3.0, 4.0); + #[rustfmt::skip] + let b = f32x4::new(5.0, 6.0, 7.0, 8.0); + #[rustfmt::skip] + let c = f32x4::new(9.0, 10.0, 11.0, 12.0); + #[rustfmt::skip] + let r = f32x4::new(-44.0, -58.0, -74.0, -92.0); + + assert_eq!( + r, + mem::transmute(__msa_fmsub_w( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fmsub_d() { + #[rustfmt::skip] + let a = f64x2::new(1.0, 2.0); + #[rustfmt::skip] + let b = f64x2::new(3.0, 4.0); + #[rustfmt::skip] + let c = f64x2::new(5.0, 6.0); + #[rustfmt::skip] + let r = f64x2::new(-14.0, -22.0); + + assert_eq!( + r, + mem::transmute(__msa_fmsub_d( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fmul_w() { + #[rustfmt::skip] + let a = f32x4::new(1.1, -2.2, 3.3, 4.4); + #[rustfmt::skip] + let b = f32x4::new(4.4, 3.3, 2.2, -1.1); + #[rustfmt::skip] + let r = f32x4::new(4.84, -7.26, 7.26, -4.84); + + assert_eq!( + r, + mem::transmute(__msa_fmul_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fmul_d() { + #[rustfmt::skip] + let a = f64x2::new(1.1, -2.2); + #[rustfmt::skip] + let b = f64x2::new(4.0, -3.3); + #[rustfmt::skip] + let r = f64x2::new(4.4, 7.26); + + assert_eq!( + r, + mem::transmute(__msa_fmul_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_frint_w() { + #[rustfmt::skip] + let a = 
f32x4::new(2.6, -2.7, 1.3, -1.7); + #[rustfmt::skip] + let r = f32x4::new(3.0, -3.0, 1.0, -2.0); + + assert_eq!(r, mem::transmute(__msa_frint_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_frint_d() { + #[rustfmt::skip] + let a = f64x2::new(2.6, 1.3); + #[rustfmt::skip] + let r = f64x2::new(3.0, 1.0); + + assert_eq!(r, mem::transmute(__msa_frint_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_frcp_w() { + #[rustfmt::skip] + let a = f32x4::new(2.6, -2.7, 1.3, -1.7); + #[rustfmt::skip] + let r = f32x4::new( + 0.3846154, -0.37037036, + 0.7692308, -0.58823526 + ); + + assert_eq!(r, mem::transmute(__msa_frcp_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_frcp_d() { + #[rustfmt::skip] + let a = f64x2::new(2.6, 1.3); + #[rustfmt::skip] + let r = f64x2::new(0.3846153846153846, 0.7692307692307692); + + assert_eq!(r, mem::transmute(__msa_frcp_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_frsqrt_w() { + #[rustfmt::skip] + let a = f32x4::new(2.6, 2.7, 1.3, 1.7); + #[rustfmt::skip] + let r = f32x4::new( + 0.6201737, 0.6085806, + 0.87705797, 0.766965 + ); + + assert_eq!(r, mem::transmute(__msa_frsqrt_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_frsqrt_d() { + #[rustfmt::skip] + let a = f64x2::new(2.6, 1.3); + #[rustfmt::skip] + let r = f64x2::new(0.6201736729460422, 0.8770580193070292); + + assert_eq!(r, mem::transmute(__msa_frsqrt_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsaf_w() { + #[rustfmt::skip] + let a = f32x4::new(-5.5, 5.5, 5.5, 5.5); + #[rustfmt::skip] + let b = f32x4::new(-5.5, 5.5, 5.5, 5.5); + #[rustfmt::skip] + let r = i32x4::new(0, 0, 0, 0); + + assert_eq!( + r, + mem::transmute(__msa_fsaf_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsaf_d() { + #[rustfmt::skip] + let a = f64x2::new(-125.5, 5.5); + #[rustfmt::skip] + let b = f64x2::new(125.5, 3.3); + #[rustfmt::skip] + let r = i64x2::new(0, 0); + + assert_eq!( + r, + mem::transmute(__msa_fsaf_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fseq_w() { + #[rustfmt::skip] + let a = f32x4::new(-5.5, -3.3, f32::NAN, f32::NAN); + #[rustfmt::skip] + let b = f32x4::new(5.5, -3.3, f32::NAN, 1.1); + #[rustfmt::skip] + let r = i32x4::new(0, -1, 0, 0); + + assert_eq!( + r, + mem::transmute(__msa_fseq_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fseq_d() { + #[rustfmt::skip] + let a = f64x2::new(-125.5, 5.5); + #[rustfmt::skip] + let b = f64x2::new(125.5, 5.5); + #[rustfmt::skip] + let r = i64x2::new(0, -1); + + assert_eq!( + r, + mem::transmute(__msa_fseq_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsle_w() { + #[rustfmt::skip] + let a = f32x4::new(5.5, 5.5, 5.5, f32::NAN); + #[rustfmt::skip] + let b = f32x4::new(-5.5, 3.3, 5.5, f32::NAN); + #[rustfmt::skip] + let r = i32x4::new(0, 0, -1, 0); + + assert_eq!( + r, + mem::transmute(__msa_fsle_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsle_d() { + #[rustfmt::skip] + let a = f64x2::new(-125.5, 5.5); + #[rustfmt::skip] + let b = f64x2::new(125.5, 3.3); + #[rustfmt::skip] + let r = i64x2::new(-1, 0); + + assert_eq!( + r, + mem::transmute(__msa_fsle_d(mem::transmute(a), mem::transmute(b))) + ); + } + + 
#[simd_test(enable = "msa")] + unsafe fn test_msa_fslt_w() { + #[rustfmt::skip] + let a = f32x4::new(-5.5, 5.5, 5.5, 5.5); + #[rustfmt::skip] + let b = f32x4::new(5.5, 3.3, 5.5, 1.1); + #[rustfmt::skip] + let r = i32x4::new(-1, 0, 0, 0); + + assert_eq!( + r, + mem::transmute(__msa_fslt_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fslt_d() { + #[rustfmt::skip] + let a = f64x2::new(-125.5, 5.5); + #[rustfmt::skip] + let b = f64x2::new(125.5, 3.3); + #[rustfmt::skip] + let r = i64x2::new(-1, 0); + + assert_eq!( + r, + mem::transmute(__msa_fslt_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsne_w() { + #[rustfmt::skip] + let a = f32x4::new(-5.5, 5.5, 5.5, 5.5); + #[rustfmt::skip] + let b = f32x4::new(5.5, 3.3, 5.5, 1.1); + #[rustfmt::skip] + let r = i32x4::new(-1, -1, 0, -1); + + assert_eq!( + r, + mem::transmute(__msa_fsne_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsne_d() { + #[rustfmt::skip] + let a = f64x2::new(-125.5, 5.5); + #[rustfmt::skip] + let b = f64x2::new(125.5, 5.5); + #[rustfmt::skip] + let r = i64x2::new(-1, 0); + + assert_eq!( + r, + mem::transmute(__msa_fsne_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsor_w() { + #[rustfmt::skip] + let a = f32x4::new(-5.5, f32::NAN, 5.5, 5.5); + #[rustfmt::skip] + let b = f32x4::new(5.5, 3.3, 5.5, 1.1); + #[rustfmt::skip] + let r = i32x4::new(-1, 0, -1, -1); + + assert_eq!( + r, + mem::transmute(__msa_fsor_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsor_d() { + #[rustfmt::skip] + let a = f64x2::new(-125.5, 5.5); + #[rustfmt::skip] + let b = f64x2::new(125.5, f64::NAN); + #[rustfmt::skip] + let r = i64x2::new(-1, 0); + + assert_eq!( + r, + mem::transmute(__msa_fsor_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsqrt_w() { + #[rustfmt::skip] + let a = f32x4::new(9.0, 81.0, 1089.0, 10000.0); + #[rustfmt::skip] + let r = f32x4::new(3.0, 9.0, 33.0, 100.0); + + assert_eq!(r, mem::transmute(__msa_fsqrt_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsqrt_d() { + #[rustfmt::skip] + let a = f64x2::new(81.0, 10000.0); + #[rustfmt::skip] + let r = f64x2::new(9.0, 100.0); + + assert_eq!(r, mem::transmute(__msa_fsqrt_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsub_w() { + #[rustfmt::skip] + let a = f32x4::new(5.5, 6.5, 7.5, 8.5); + #[rustfmt::skip] + let b = f32x4::new(1.25, 1.75, 2.25, 2.75); + #[rustfmt::skip] + let r = f32x4::new(4.25, 4.75, 5.25, 5.75); + + assert_eq!( + r, + mem::transmute(__msa_fsub_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsub_d() { + #[rustfmt::skip] + let a = f64x2::new(555.5, 55.5); + #[rustfmt::skip] + let b = f64x2::new(4.25, 3.25); + #[rustfmt::skip] + let r = f64x2::new(551.25, 52.25); + + assert_eq!( + r, + mem::transmute(__msa_fsub_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsueq_w() { + #[rustfmt::skip] + let a = f32x4::new(5.5, f32::NAN, 5.5, 5.5); + #[rustfmt::skip] + let b = f32x4::new(5.5, 5.5, -5.5, 5.5); + #[rustfmt::skip] + let r = i32x4::new(-1, -1, 0, -1); + + assert_eq!( + r, + mem::transmute(__msa_fsueq_w(mem::transmute(a), mem::transmute(b))) + ); + } + + 
#[simd_test(enable = "msa")] + unsafe fn test_msa_fsueq_d() { + #[rustfmt::skip] + let a = f64x2::new(-5.5, 5.5); + #[rustfmt::skip] + let b = f64x2::new(5.5, f64::NAN); + #[rustfmt::skip] + let r = i64x2::new(0, -1); + + assert_eq!( + r, + mem::transmute(__msa_fsueq_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsule_w() { + #[rustfmt::skip] + let a = f32x4::new(5.7, 5.8, 5.9, f32::NAN); + #[rustfmt::skip] + let b = f32x4::new(5.6, 5.9, 5.9, f32::NAN); + #[rustfmt::skip] + let r = i32x4::new(0, -1, -1, -1); + + assert_eq!( + r, + mem::transmute(__msa_fsule_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsule_d() { + #[rustfmt::skip] + let a = f64x2::new(5.5, 5.5); + #[rustfmt::skip] + let b = f64x2::new(5.5, 5.5); + #[rustfmt::skip] + let r = i64x2::new(-1, -1); + + assert_eq!( + r, + mem::transmute(__msa_fsule_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsult_w() { + #[rustfmt::skip] + let a = f32x4::new(5.5, 5.5, 5.5, 5.5); + #[rustfmt::skip] + let b = f32x4::new(5.6, f32::NAN, 2.2, 1.1); + #[rustfmt::skip] + let r = i32x4::new(-1, -1, 0, 0); + + assert_eq!( + r, + mem::transmute(__msa_fsult_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsult_d() { + #[rustfmt::skip] + let a = f64x2::new(5.5, f64::NAN); + #[rustfmt::skip] + let b = f64x2::new(4.4, 3.3); + #[rustfmt::skip] + let r = i64x2::new(0, -1); + + assert_eq!( + r, + mem::transmute(__msa_fsult_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsun_w() { + #[rustfmt::skip] + let a = f32x4::new(5.5, 5.5, f32::NAN, 5.5); + #[rustfmt::skip] + let b = f32x4::new(4.4, 3.3, 2.2, f32::NAN); + #[rustfmt::skip] + let r = i32x4::new(0, 0, -1, -1); + + assert_eq!( + r, + mem::transmute(__msa_fsun_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsun_d() { + #[rustfmt::skip] + let a = f64x2::new(5.5, f64::NAN); + #[rustfmt::skip] + let b = f64x2::new(4.4, 3.3); + #[rustfmt::skip] + let r = i64x2::new(0, -1); + + assert_eq!( + r, + mem::transmute(__msa_fsun_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsune_w() { + #[rustfmt::skip] + let a = f32x4::new(5.5, 5.5, f32::NAN, 5.5); + #[rustfmt::skip] + let b = f32x4::new(4.4, 3.3, 2.2, 5.5); + #[rustfmt::skip] + let r = i32x4::new(-1, -1, -1, 0); + + assert_eq!( + r, + mem::transmute(__msa_fsune_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_fsune_d() { + #[rustfmt::skip] + let a = f64x2::new(5.5, f64::NAN); + #[rustfmt::skip] + let b = f64x2::new(5.5, 3.3); + #[rustfmt::skip] + let r = i64x2::new(0, -1); + + assert_eq!( + r, + mem::transmute(__msa_fsune_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ftint_s_w() { + #[rustfmt::skip] + let a = f32x4::new(-5.5, 75.6, -1000.7, 1219.3); + #[rustfmt::skip] + let r = i32x4::new(-6, 76, -1001, 1219); + + assert_eq!(r, mem::transmute(__msa_ftint_s_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ftint_s_d() { + #[rustfmt::skip] + let a = f64x2::new(-5.5, 25656.4); + #[rustfmt::skip] + let r = i64x2::new(-6, 25656); + + assert_eq!(r, mem::transmute(__msa_ftint_s_d(mem::transmute(a)))); + } + + #[simd_test(enable = 
"msa")] + unsafe fn test_msa_ftint_u_w() { + #[rustfmt::skip] + let a = f32x4::new(-5.5, 75.6, -1000.7, 1219.3); + #[rustfmt::skip] + let r = u32x4::new(0, 76, 0, 1219); + + assert_eq!(r, mem::transmute(__msa_ftint_u_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ftint_u_d() { + #[rustfmt::skip] + let a = f64x2::new(5.5, -25656.4); + #[rustfmt::skip] + let r = u64x2::new(6, 0); + + assert_eq!(r, mem::transmute(__msa_ftint_u_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ftq_h() { + #[rustfmt::skip] + let a = f32x4::new(0.00001, 0.0002, 0.00001, -0.0002); + #[rustfmt::skip] + let b = f32x4::new(0.0001, -0.002, 0.0001, 0.002); + #[rustfmt::skip] + let r = i16x8::new(3, -66, 3, 66, 0, 7, 0, -7); + + assert_eq!( + r, + mem::transmute(__msa_ftq_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ftq_w() { + #[rustfmt::skip] + let a = f64x2::new(0.00001, -0.0002); + #[rustfmt::skip] + let b = f64x2::new(0.00000045, 0.000015); + #[rustfmt::skip] + let r = i32x4::new(966, 32212, 21475, -429497); + + assert_eq!( + r, + mem::transmute(__msa_ftq_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ftrunc_s_w() { + #[rustfmt::skip] + let a = f32x4::new(-5.5, 75.6, -1000.7, 1219.3); + #[rustfmt::skip] + let r = i32x4::new(-5, 75, -1000, 1219); + + assert_eq!(r, mem::transmute(__msa_ftrunc_s_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ftrunc_s_d() { + #[rustfmt::skip] + let a = f64x2::new(-5.5, 25656.4); + #[rustfmt::skip] + let r = i64x2::new(-5, 25656); + + assert_eq!(r, mem::transmute(__msa_ftrunc_s_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ftrunc_u_w() { + #[rustfmt::skip] + let a = f32x4::new(-5.5, 75.6, -1000.7, 1219.3); + #[rustfmt::skip] + let r = u32x4::new(0, 75, 0, 1219); + + assert_eq!(r, mem::transmute(__msa_ftrunc_u_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ftrunc_u_d() { + #[rustfmt::skip] + let a = f64x2::new(5.5, -25656.4); + #[rustfmt::skip] + let r = u64x2::new(5, 0); + + assert_eq!(r, mem::transmute(__msa_ftrunc_u_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_hadd_s_h() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + -1, -2, -3, -4, + 1, 2, 3, 4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let b = i8x16::new( + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i16x8::new(6, 6, 2, -2, 6, 6, 2, -2); + + assert_eq!( + r, + mem::transmute(__msa_hadd_s_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_hadd_s_w() { + #[rustfmt::skip] + let a = i16x8::new( + 1, 2, 3, 4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let b = i16x8::new( + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i32x4::new(6, 6, 2, -2); + + assert_eq!( + r, + mem::transmute(__msa_hadd_s_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_hadd_s_d() { + #[rustfmt::skip] + let a = i32x4::new(1, -2, 3, -4); + #[rustfmt::skip] + let b = i32x4::new(4, 3, 2, 1); + #[rustfmt::skip] + let r = i64x2::new(2, -2); + + assert_eq!( + r, + mem::transmute(__msa_hadd_s_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_hadd_u_h() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 
2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = u8x16::new( + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = u16x8::new(6, 6, 6, 6, 6, 6, 6, 6); + + assert_eq!( + r, + mem::transmute(__msa_hadd_u_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_hadd_u_w() { + #[rustfmt::skip] + let a = u16x8::new( + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = u16x8::new( + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = u32x4::new(6, 6, 6, 6); + + assert_eq!( + r, + mem::transmute(__msa_hadd_u_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_hadd_u_d() { + #[rustfmt::skip] + let a = u32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = u32x4::new(4, 3, 2, 1); + #[rustfmt::skip] + let r = u64x2::new(6, 6); + + assert_eq!( + r, + mem::transmute(__msa_hadd_u_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_hsub_s_h() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + -1, -2, -3, -4, + 1, 2, 3, 4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let b = i8x16::new( + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i16x8::new(-2, 2, -6, -6, -2, 2, -6, -6); + + assert_eq!( + r, + mem::transmute(__msa_hsub_s_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_hsub_s_w() { + #[rustfmt::skip] + let a = i16x8::new( + 1, 2, 3, 4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let b = i16x8::new( + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i32x4::new(-2, 2, -6, -6); + + assert_eq!( + r, + mem::transmute(__msa_hsub_s_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_hsub_s_d() { + #[rustfmt::skip] + let a = i32x4::new(1, -2, 3, -4); + #[rustfmt::skip] + let b = i32x4::new(4, 3, 2, 1); + #[rustfmt::skip] + let r = i64x2::new(-6, -6); + + assert_eq!( + r, + mem::transmute(__msa_hsub_s_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_hsub_u_h() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = u8x16::new( + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i16x8::new(-2, 2, -2, 2, -2, 2, -2, 2); + + assert_eq!( + r, + mem::transmute(__msa_hsub_u_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_hsub_u_w() { + #[rustfmt::skip] + let a = u16x8::new( + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = u16x8::new( + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i32x4::new(-2, 2, -2, 2); + + assert_eq!( + r, + mem::transmute(__msa_hsub_u_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_hsub_u_d() { + #[rustfmt::skip] + let a = u32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = u32x4::new(4, 3, 2, 1); + #[rustfmt::skip] + let r = i64x2::new(-2, 2); + + assert_eq!( + r, + mem::transmute(__msa_hsub_u_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ilvev_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i8x16::new( + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r 
= i8x16::new( + 4, 1, 2, 3, + 4, 1, 2, 3, + 4, 1, 2, 3, + 4, 1, 2, 3 + ); + + assert_eq!( + r, + mem::transmute(__msa_ilvev_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ilvev_h() { + #[rustfmt::skip] + let a = i16x8::new( + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i16x8::new( + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i16x8::new(4, 1, 2, 3, 4, 1, 2, 3); + + assert_eq!( + r, + mem::transmute(__msa_ilvev_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ilvev_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = i32x4::new(4, 3, 2, 1); + #[rustfmt::skip] + let r = i32x4::new(4, 1, 2, 3); + + assert_eq!( + r, + mem::transmute(__msa_ilvev_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ilvev_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + let b = i64x2::new(4, 3); + #[rustfmt::skip] + let r = i64x2::new(4, 1); + + assert_eq!( + r, + mem::transmute(__msa_ilvev_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ilvl_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 + ); + #[rustfmt::skip] + let b = i8x16::new( + 16, 15, 14, 13, + 12, 11, 10, 9, + 8, 7, 6, 5, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + 8, 9, 7, 10, + 6, 11, 5, 12, + 4, 13, 3, 14, + 2, 15, 1, 16 + ); + + assert_eq!( + r, + mem::transmute(__msa_ilvl_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ilvl_h() { + #[rustfmt::skip] + let a = i16x8::new( + 1, 2, 3, 4, + 5, 6, 7, 8 + ); + #[rustfmt::skip] + let b = i16x8::new( + 8, 7, 6, 5, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i16x8::new(4, 5, 3, 6, 2, 7, 1, 8); + + assert_eq!( + r, + mem::transmute(__msa_ilvl_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ilvl_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = i32x4::new(4, 3, 2, 1); + #[rustfmt::skip] + let r = i32x4::new(2, 3, 1, 4); + + assert_eq!( + r, + mem::transmute(__msa_ilvl_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ilvl_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + let b = i64x2::new(2, 1); + #[rustfmt::skip] + let r = i64x2::new(1, 2); + + assert_eq!( + r, + mem::transmute(__msa_ilvl_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ilvod_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 + ); + #[rustfmt::skip] + let b = i8x16::new( + 16, 15, 14, 13, + 12, 11, 10, 9, + 8, 7, 6, 5, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + 15, 2, 13, 4, + 11, 6, 9, 8, + 7, 10, 5, 12, + 3, 14, 1, 16 + ); + + assert_eq!( + r, + mem::transmute(__msa_ilvod_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ilvod_h() { + #[rustfmt::skip] + let a = i16x8::new( + 1, 2, 3, 4, + 5, 6, 7, 8 + ); + #[rustfmt::skip] + let b = i16x8::new( + 8, 7, 6, 5, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i16x8::new(7, 2, 5, 4, 3, 6, 1, 8); + + assert_eq!( + r, + mem::transmute(__msa_ilvod_h(mem::transmute(a), mem::transmute(b))) + ); + } + + 
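+    // The interleave (`ilv*`) tests around here share one pattern, inferred
+    // from their expected vectors: `ilvev` pairs the even-indexed lanes of
+    // the two inputs, `ilvod` the odd-indexed lanes, `ilvr` the low (right)
+    // halves and `ilvl` the high (left) halves, always placing the lane taken
+    // from `b` in the even output position and the lane taken from `a` in the
+    // odd one. Worked example for `test_msa_ilvev_w` above, with
+    // a = (1, 2, 3, 4) and b = (4, 3, 2, 1):
+    // result = (b[0], a[0], b[2], a[2]) = (4, 1, 2, 3).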
#[simd_test(enable = "msa")] + unsafe fn test_msa_ilvod_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = i32x4::new(4, 3, 2, 1); + #[rustfmt::skip] + let r = i32x4::new(3, 2, 1, 4); + + assert_eq!( + r, + mem::transmute(__msa_ilvod_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ilvod_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + let b = i64x2::new(2, 1); + #[rustfmt::skip] + let r = i64x2::new(1, 2); + + assert_eq!( + r, + mem::transmute(__msa_ilvod_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ilvr_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 + ); + #[rustfmt::skip] + let b = i8x16::new( + 16, 15, 14, 13, + 12, 11, 10, 9, + 8, 7, 6, 5, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + 16, 1, 15, 2, + 14, 3, 13, 4, + 12, 5, 11, 6, + 10, 7, 9, 8 + ); + + assert_eq!( + r, + mem::transmute(__msa_ilvr_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ilvr_h() { + #[rustfmt::skip] + let a = i16x8::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + ); + #[rustfmt::skip] + let b = i16x8::new( + 8, 7, 6, 5, + 4, 3, 2, 1, + ); + #[rustfmt::skip] + let r = i16x8::new(8, 1, 7, 2, 6, 3, 5, 4); + + assert_eq!( + r, + mem::transmute(__msa_ilvr_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ilvr_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = i32x4::new(4, 3, 2, 1); + #[rustfmt::skip] + let r = i32x4::new(4, 1, 3, 2); + + assert_eq!( + r, + mem::transmute(__msa_ilvr_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ilvr_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + let b = i64x2::new(2, 1); + #[rustfmt::skip] + let r = i64x2::new(2, 1); + + assert_eq!( + r, + mem::transmute(__msa_ilvr_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_insert_b() { + #[rustfmt::skip] + let a = i8x16::new( + -100, 127, 4, 127, + -100, 127, 4, 127, + -100, 127, 4, 127, + -100, 127, 4, 127 + ); + #[rustfmt::skip] + let r = i8x16::new( + -100, 127, 4, 127, + -100, 127, 4, 127, + -100, 127, 4, 127, + 5, 127, 4, 127 + ); + + assert_eq!(r, mem::transmute(__msa_insert_b(mem::transmute(a), 12, 5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_insert_h() { + #[rustfmt::skip] + let a = i16x8::new( + 32767, 3276, 100, 11, + 32767, 3276, 100, 11 + ); + #[rustfmt::skip] + let r = i16x8::new( + 32767, 3276, 100, 11, + 5, 3276, 100, 11 + ); + + assert_eq!(r, mem::transmute(__msa_insert_h(mem::transmute(a), 4, 5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_insert_w() { + #[rustfmt::skip] + let a = i32x4::new(100, 2147483647, 5, -2147483647); + #[rustfmt::skip] + let r = i32x4::new(100, 7, 5, -2147483647); + + assert_eq!(r, mem::transmute(__msa_insert_w(mem::transmute(a), 1, 7))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_insert_d() { + #[rustfmt::skip] + let a = i64x2::new(3, i64::MAX); + #[rustfmt::skip] + let r = i64x2::new(3, 100); + + assert_eq!(r, mem::transmute(__msa_insert_d(mem::transmute(a), 1, 100))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_insve_b() { + #[rustfmt::skip] + let a = i8x16::new( + -100, i8::MAX, 4, i8::MAX, + -100, i8::MAX, 4, 
i8::MAX, + -100, i8::MAX, 4, i8::MAX, + -100, i8::MAX, 4, i8::MAX + ); + #[rustfmt::skip] + let b = i8x16::new( + 5, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let r = i8x16::new( + -100, 127, 4, 127, + -100, 127, 4, 127, + -100, 127, 4, 127, + 5, 127, 4, 127 + ); + + assert_eq!( + r, + mem::transmute(__msa_insve_b(mem::transmute(a), 12, mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_insve_h() { + #[rustfmt::skip] + let a = i16x8::new( + i16::MAX, 3276, 100, 11, + i16::MAX, 3276, 100, 11 + ); + #[rustfmt::skip] + let b = i16x8::new( + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let r = i16x8::new( + 32767, 3276, 100, 11, + 1, 3276, 100, 11 + ); + + assert_eq!( + r, + mem::transmute(__msa_insve_h(mem::transmute(a), 4, mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_insve_w() { + #[rustfmt::skip] + let a = i32x4::new(100, 2147483647, 5, -2147483647); + #[rustfmt::skip] + let b = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let r = i32x4::new(100, 2147483647, 5, 1); + + assert_eq!( + r, + mem::transmute(__msa_insve_w(mem::transmute(a), 3, mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_insve_d() { + #[rustfmt::skip] + let a = i64x2::new(3, i64::MAX); + #[rustfmt::skip] + let b = i64x2::new(1, 2); + #[rustfmt::skip] + let r = i64x2::new(3, 1); + + assert_eq!( + r, + mem::transmute(__msa_insve_d(mem::transmute(a), 1, mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ld_b() { + #[rustfmt::skip] + let mut a : [i8; 32] = [ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 + ]; + let p = &mut a[4] as *mut _ as *mut u8; + #[rustfmt::skip] + let r = i8x16::new( + 13, 14, 15, 16, + 17, 18, 19, 20, + 21, 22, 23, 24, + 25, 26, 27, 28 + ); + + assert_eq!(r, mem::transmute(__msa_ld_b(p, 9))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ld_h() { + #[rustfmt::skip] + let mut a : [i16; 16] = [ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 + ]; + let p = &mut a[4] as *mut _ as *mut u8; + #[rustfmt::skip] + let r = i16x8::new(3, 4, 5, 6, 7, 8, 9, 10); + + assert_eq!(r, mem::transmute(__msa_ld_h(p, -2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ld_w() { + #[rustfmt::skip] + let mut a : [i32; 8] = [0, 1, 2, 3, 4, 5, 6, 7]; + let p = &mut a[3] as *mut _ as *mut u8; + #[rustfmt::skip] + let r = i32x4::new(2, 3, 4, 5); + + assert_eq!(r, mem::transmute(__msa_ld_w(p, -4))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ld_d() { + #[rustfmt::skip] + let mut a : [i64; 8] = [0, 1, 2, 3, 4, 5, 6, 7]; + let p = &mut a[4] as *mut _ as *mut u8; + #[rustfmt::skip] + let r = i64x2::new(0, 1); + + assert_eq!(r, mem::transmute(__msa_ld_d(p, -32))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ldi_b() { + #[rustfmt::skip] + let r = i8x16::new( + -20, -20, -20, -20, + -20, -20, -20, -20, + -20, -20, -20, -20, + -20, -20, -20, -20 + ); + + assert_eq!(r, mem::transmute(__msa_ldi_b(-20))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ldi_h() { + #[rustfmt::skip] + let r = i16x8::new( + 255, 255, 255, 255, + 255, 255, 255, 255 + ); + + assert_eq!(r, mem::transmute(__msa_ldi_h(255))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ldi_w() { + #[rustfmt::skip] + let r = i32x4::new(-509, -509, -509, -509); + + assert_eq!(r, mem::transmute(__msa_ldi_w(-509))); + } + + // FIXME: 
https://reviews.llvm.org/D59884 + // If target type is i64, negative immediate loses the sign + // Test passes if 4294967185 is used instead -111 in vector `r` + // #[simd_test(enable = "msa")] + // unsafe fn test_msa_ldi_d() { + // let r = i64x2::new(-111, -111); + + // assert_eq!(r, mem::transmute(__msa_ldi_d(-111))); + // } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_madd_q_h() { + #[rustfmt::skip] + let a = i16x8::new( + i16::MAX, 1024, i16::MIN, -1024, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i16x8::new( + 1024, 1024, 1024, 1024, + 1024, 1024, 1024, 1024 + ); + #[rustfmt::skip] + let c = i16x8::new( + i16::MAX, i16::MAX, 1, -1, + 33, 66, 99, 132 + ); + #[rustfmt::skip] + let r = i16x8::new(32767, 2047, -32768, -1025, 2, 4, 6, 8); + + assert_eq!( + r, + mem::transmute(__msa_madd_q_h( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_madd_q_w() { + #[rustfmt::skip] + let a = i32x4::new(i32::MAX, i32::MIN, 1, 2); + #[rustfmt::skip] + let b = i32x4::new(102401, 102401, 102401, 102401); + #[rustfmt::skip] + let c = i32x4::new(10240, 20480, 30720, 40960); + #[rustfmt::skip] + let r = i32x4::new(2147483647, -2147483648, 2, 3); + + assert_eq!( + r, + mem::transmute(__msa_madd_q_w( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_maddr_q_h() { + #[rustfmt::skip] + let a = i16x8::new( + 32767, 1024, -32768, -1024, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i16x8::new( + 1024, 1024, 1024, 1024, + 1024, 1024, 1024, 1024 + ); + #[rustfmt::skip] + let c = i16x8::new( + 32767, 32767, 32767, 32767, + 33, 66, 99, 132 + ); + #[rustfmt::skip] + let r = i16x8::new(32767, 2048, -31744, 0, 2, 4, 6, 8); + + assert_eq!( + r, + mem::transmute(__msa_maddr_q_h( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_maddr_q_w() { + #[rustfmt::skip] + let a = i32x4::new(i32::MAX, i32::MIN, 1, 2); + #[rustfmt::skip] + let b = i32x4::new(102401, 102401, 102401, 102401); + #[rustfmt::skip] + let c = i32x4::new(10240, 20480, 30720, 40960); + #[rustfmt::skip] + let r = i32x4::new(2147483647, -2147483647, 2, 4); + + assert_eq!( + r, + mem::transmute(__msa_maddr_q_w( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_maddv_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i8x16::new( + 5, 6, 7, 8, + 5, 6, 7, 8, + 5, 6, 7, 8, + 5, 6, 7, 8 + ); + #[rustfmt::skip] + let c = i8x16::new( + 9, 10, 11, 12, + 9, 10, 11, 12, + 9, 10, 11, 12, + 9, 10, 11, 12 + ); + #[rustfmt::skip] + let r = i8x16::new( + 46, 62, 80, 100, + 46, 62, 80, 100, + 46, 62, 80, 100, + 46, 62, 80, 100 + ); + + assert_eq!( + r, + mem::transmute(__msa_maddv_b( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_maddv_h() { + #[rustfmt::skip] + let a = i16x8::new(1, 2, 3, 4, 1, 2, 3, 4); + #[rustfmt::skip] + let b = i16x8::new(5, 6, 7, 8, 5, 6, 7, 8); + #[rustfmt::skip] + let c = i16x8::new(9, 10, 11, 12, 9, 10, 11, 12); + #[rustfmt::skip] + let r = i16x8::new(46, 62, 80, 100, 46, 62, 80, 100); + + assert_eq!( + r, + mem::transmute(__msa_maddv_h( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn 
test_msa_maddv_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 1, 2); + #[rustfmt::skip] + let b = i32x4::new(3, 4, 3, 4); + #[rustfmt::skip] + let c = i32x4::new(5, 6, 5, 6); + #[rustfmt::skip] + let r = i32x4::new(16, 26, 16, 26); + + assert_eq!( + r, + mem::transmute(__msa_maddv_w( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_maddv_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + let b = i64x2::new(3, 4); + #[rustfmt::skip] + let c = i64x2::new(5, 6); + #[rustfmt::skip] + let r = i64x2::new(16, 26); + + assert_eq!( + r, + mem::transmute(__msa_maddv_d( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_max_a_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + -1, -2, -3, -4, + 1, 2, 3, 4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let b = i8x16::new( + -6, -7, -8, -9, + 6, 7, 8, 9, + -6, -7, -8, -9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = i8x16::new( + -6, -7, -8, -9, + 6, 7, 8, 9, + -6, -7, -8, -9, + 6, 7, 8, 9 + ); + + assert_eq!( + r, + mem::transmute(__msa_max_a_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_max_a_h() { + #[rustfmt::skip] + let a = i16x8::new(1, -2, 3, -4, 1, -2, 3, -4); + #[rustfmt::skip] + let b = i16x8::new(-6, 7, -8, 9, -6, 7, -8, 9); + #[rustfmt::skip] + let r = i16x8::new(-6, 7, -8, 9, -6, 7, -8, 9); + + assert_eq!( + r, + mem::transmute(__msa_max_a_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_max_a_w() { + #[rustfmt::skip] + let a = i32x4::new(1, -2, 3, -4); + #[rustfmt::skip] + let b = i32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = i32x4::new(6, 7, 8, 9); + + assert_eq!( + r, + mem::transmute(__msa_max_a_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_max_a_d() { + #[rustfmt::skip] + let a = i64x2::new(-1, 2); + #[rustfmt::skip] + let b = i64x2::new(6, -7); + #[rustfmt::skip] + let r = i64x2::new(6, -7); + + assert_eq!( + r, + mem::transmute(__msa_max_a_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_max_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + -1, -2, -3, -4, + 1, 2, 3, 4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let b = i8x16::new( + -6, -7, -8, -9, + 6, 7, 8, 9, + -6, -7, -8, -9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = i8x16::new( + 1, 2, 3, 4, + 6, 7, 8, 9, + 1, 2, 3, 4, + 6, 7, 8, 9 + ); + + assert_eq!( + r, + mem::transmute(__msa_max_s_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_max_s_h() { + #[rustfmt::skip] + let a = i16x8::new(1, -2, 3, -4, 1, -2, 3, -4); + #[rustfmt::skip] + let b = i16x8::new(-6, 7, -8, 9, -6, 7, -8, 9); + #[rustfmt::skip] + let r = i16x8::new(1, 7, 3, 9, 1, 7, 3, 9); + + assert_eq!( + r, + mem::transmute(__msa_max_s_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_max_s_w() { + #[rustfmt::skip] + let a = i32x4::new(1, -2, 3, -4); + #[rustfmt::skip] + let b = i32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = i32x4::new(6, 7, 8, 9); + + assert_eq!( + r, + mem::transmute(__msa_max_s_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_max_s_d() { + #[rustfmt::skip] + let a = i64x2::new(-1, 2); + 
#[rustfmt::skip] + let b = i64x2::new(6, -7); + #[rustfmt::skip] + let r = i64x2::new(6, 2); + + assert_eq!( + r, + mem::transmute(__msa_max_s_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_max_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + + assert_eq!( + r, + mem::transmute(__msa_max_u_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_max_u_h() { + #[rustfmt::skip] + let a = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4); + #[rustfmt::skip] + let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9); + #[rustfmt::skip] + let r = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9); + + assert_eq!( + r, + mem::transmute(__msa_max_u_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_max_u_w() { + #[rustfmt::skip] + let a = u32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = u32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = u32x4::new(6, 7, 8, 9); + + assert_eq!( + r, + mem::transmute(__msa_max_u_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_max_u_d() { + #[rustfmt::skip] + let a = u64x2::new(1, 2); + #[rustfmt::skip] + let b = u64x2::new(6, 7); + #[rustfmt::skip] + let r = u64x2::new(6, 7); + + assert_eq!( + r, + mem::transmute(__msa_max_u_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_maxi_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, -20, -6, 8, + 1, -20, -6, 8, + 1, -20, -6, 8, + 1, -20, -6, 8 + ); + #[rustfmt::skip] + let r = i8x16::new( + 1, -16, -6, 8, + 1, -16, -6, 8, + 1, -16, -6, 8, + 1, -16, -6, 8 + ); + + assert_eq!(r, mem::transmute(__msa_maxi_s_b(mem::transmute(a), -16))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_maxi_s_h() { + #[rustfmt::skip] + let a = i16x8::new(1, 3, -60, -8, 1, 3, -6, -8); + #[rustfmt::skip] + let r = i16x8::new(15, 15, 15, 15, 15, 15, 15, 15); + + assert_eq!(r, mem::transmute(__msa_maxi_s_h(mem::transmute(a), 15))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_maxi_s_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 3, -6, -8); + #[rustfmt::skip] + let r = i32x4::new(1, 3, -5, -5); + + assert_eq!(r, mem::transmute(__msa_maxi_s_w(mem::transmute(a), -5))); + } + + // FIXME: https://reviews.llvm.org/D59884 + // If target type is i64, negative immediate loses the sign + // Test passes if 4294967293 is used instead -3 in vector `r` + // #[simd_test(enable = "msa")] + // unsafe fn test_msa_maxi_s_d() { + // #[rustfmt::skip] + // let a = i64x2::new(1, -8); + // #[rustfmt::skip] + // let r = i64x2::new(-3, -3); + + // assert_eq!(r, mem::transmute(__msa_maxi_s_d(mem::transmute(a), -3))); + // } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_maxi_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 3, 6, 8, + 1, 3, 6, 8, + 1, 3, 6, 8, + 1, 3, 6, 8 + ); + #[rustfmt::skip] + let r = u8x16::new( + 5, 5, 6, 8, + 5, 5, 6, 8, + 5, 5, 6, 8, + 5, 5, 6, 8 + ); + + assert_eq!(r, mem::transmute(__msa_maxi_u_b(mem::transmute(a), 5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_maxi_u_h() { + #[rustfmt::skip] + let a = u16x8::new(1, 3, 6, 8, 1, 3, 6, 8); + #[rustfmt::skip] + let r = u16x8::new(5, 5, 6, 8, 5, 5, 6, 8); + + 
assert_eq!(r, mem::transmute(__msa_maxi_u_h(mem::transmute(a), 5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_maxi_u_w() { + #[rustfmt::skip] + let a = u32x4::new(1, 3, 6, 8); + #[rustfmt::skip] + let r = u32x4::new(5, 5, 6, 8); + + assert_eq!(r, mem::transmute(__msa_maxi_u_w(mem::transmute(a), 5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_maxi_u_d() { + #[rustfmt::skip] + let a = u64x2::new(1, 8); + #[rustfmt::skip] + let r = u64x2::new(5, 8); + + assert_eq!(r, mem::transmute(__msa_maxi_u_d(mem::transmute(a), 5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_min_a_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + -1, -2, -3, -4, + 1, 2, 3, 4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let b = i8x16::new( + -6, -7, -8, -9, + 6, 7, 8, 9, + -6, -7, -8, -9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = i8x16::new( + 1, 2, 3, 4, + -1, -2, -3, -4, + 1, 2, 3, 4, + -1, -2, -3, -4 + ); + + assert_eq!( + r, + mem::transmute(__msa_min_a_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_min_a_h() { + #[rustfmt::skip] + let a = i16x8::new(1, -2, 3, -4, 1, -2, 3, -4); + #[rustfmt::skip] + let b = i16x8::new(-6, 7, -8, 9, -6, 7, -8, 9); + #[rustfmt::skip] + let r = i16x8::new(1, -2, 3, -4, 1, -2, 3, -4); + + assert_eq!( + r, + mem::transmute(__msa_min_a_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_min_a_w() { + #[rustfmt::skip] + let a = i32x4::new(1, -2, 3, -4); + #[rustfmt::skip] + let b = i32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = i32x4::new(1, -2, 3, -4); + + assert_eq!( + r, + mem::transmute(__msa_min_a_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_min_a_d() { + #[rustfmt::skip] + let a = i64x2::new(-1, 2); + #[rustfmt::skip] + let b = i64x2::new(6, -7); + #[rustfmt::skip] + let r = i64x2::new(-1, 2); + + assert_eq!( + r, + mem::transmute(__msa_min_a_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_min_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + -1, -2, -3, -4, + 1, 2, 3, 4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let b = i8x16::new( + -6, -7, -8, -9, + 6, 7, 8, 9, + -6, -7, -8, -9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = i8x16::new( + -6, -7, -8, -9, + -1, -2, -3, -4, + -6, -7, -8, -9, + -1, -2, -3, -4 + ); + + assert_eq!( + r, + mem::transmute(__msa_min_s_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_min_s_h() { + #[rustfmt::skip] + let a = i16x8::new(1, -2, 3, -4, 1, -2, 3, -4); + #[rustfmt::skip] + let b = i16x8::new(-6, 7, -8, 9, -6, 7, -8, 9); + #[rustfmt::skip] + let r = i16x8::new(-6, -2, -8, -4, -6, -2, -8, -4); + + assert_eq!( + r, + mem::transmute(__msa_min_s_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_min_s_w() { + #[rustfmt::skip] + let a = i32x4::new(1, -2, 3, -4); + #[rustfmt::skip] + let b = i32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = i32x4::new(1, -2, 3, -4); + + assert_eq!( + r, + mem::transmute(__msa_min_s_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_min_s_d() { + #[rustfmt::skip] + let a = i64x2::new(-1, 2); + #[rustfmt::skip] + let b = i64x2::new(6, -7); + #[rustfmt::skip] + let r = i64x2::new(-1, -7); + + assert_eq!( + r, + mem::transmute(__msa_min_s_d(mem::transmute(a), 
mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mini_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + -1, -2, -3, -4, + 1, 2, 3, 4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let r = i8x16::new( + -10, -10, -10, -10, + -10, -10, -10, -10, + -10, -10, -10, -10, + -10, -10, -10, -10 + ); + + assert_eq!(r, mem::transmute(__msa_mini_s_b(mem::transmute(a), -10))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mini_s_h() { + #[rustfmt::skip] + let a = i16x8::new(1, -2, 3, -4, 1, -2, 3, -4); + #[rustfmt::skip] + let r = i16x8::new(-3, -3, -3, -4, -3, -3, -3, -4); + + assert_eq!(r, mem::transmute(__msa_mini_s_h(mem::transmute(a), -3))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mini_s_w() { + #[rustfmt::skip] + let a = i32x4::new(1, -2, 3, -4); + #[rustfmt::skip] + let r = i32x4::new(-3, -3, -3, -4); + + assert_eq!(r, mem::transmute(__msa_mini_s_w(mem::transmute(a), -3))); + } + + // FIXME: https://reviews.llvm.org/D59884 + // If target type is i64, negative immediate loses the sign + // -3 is represented as 4294967293 + // #[simd_test(enable = "msa")] + // unsafe fn test_msa_mini_s_d() { + // #[rustfmt::skip] + // let a = i64x2::new(-3, 2); + // #[rustfmt::skip] + // let r = i64x2::new(-1, -3); + + // assert_eq!(r, mem::transmute(__msa_mini_s_d(mem::transmute(a), -3))); + // } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_min_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = u8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + + assert_eq!( + r, + mem::transmute(__msa_min_u_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_min_u_h() { + #[rustfmt::skip] + let a = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4); + #[rustfmt::skip] + let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9); + #[rustfmt::skip] + let r = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4,); + + assert_eq!( + r, + mem::transmute(__msa_min_u_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_min_u_w() { + #[rustfmt::skip] + let a = u32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = u32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = u32x4::new(1, 2, 3, 4,); + + assert_eq!( + r, + mem::transmute(__msa_min_u_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_min_u_d() { + #[rustfmt::skip] + let a = u64x2::new(1, 2); + #[rustfmt::skip] + let b = u64x2::new(6, 7); + #[rustfmt::skip] + let r = u64x2::new(1, 2,); + + assert_eq!( + r, + mem::transmute(__msa_min_u_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mini_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 3, 6, 8, + 1, 3, 6, 8, + 1, 3, 6, 8, + 1, 3, 6, 8 + ); + #[rustfmt::skip] + let r = u8x16::new( + 1, 3, 5, 5, + 1, 3, 5, 5, + 1, 3, 5, 5, + 1, 3, 5, 5 + ); + + assert_eq!(r, mem::transmute(__msa_mini_u_b(mem::transmute(a), 5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mini_u_h() { + #[rustfmt::skip] + let a = u16x8::new(1, 3, 6, 8, 1, 3, 6, 8); + #[rustfmt::skip] + let r = u16x8::new(1, 3, 5, 5, 1, 3, 5, 5); + + assert_eq!(r, mem::transmute(__msa_mini_u_h(mem::transmute(a), 5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mini_u_w() { + #[rustfmt::skip] + let a = 
u32x4::new(1, 3, 6, 8); + #[rustfmt::skip] + let r = u32x4::new(1, 3, 5, 5); + + assert_eq!(r, mem::transmute(__msa_mini_u_w(mem::transmute(a), 5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mini_u_d() { + #[rustfmt::skip] + let a = u64x2::new(1, 8); + #[rustfmt::skip] + let r = u64x2::new(1, 5); + + assert_eq!(r, mem::transmute(__msa_mini_u_d(mem::transmute(a), 5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mod_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + -6, -7, -8, -9, + 6, 7, 8, 9, + -6, -7, -8, -9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let b = i8x16::new( + 1, 2, 3, 4, + -1, -2, -3, -4, + 1, 2, 3, 4, + -1, -2, -3, -4 + ); + #[rustfmt::skip] + let r = i8x16::new( + 0, -1, -2, -1, + 0, 1, 2, 1, + 0, -1, -2, -1, + 0, 1, 2, 1 + ); + + assert_eq!( + r, + mem::transmute(__msa_mod_s_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mod_s_h() { + #[rustfmt::skip] + let a = i16x8::new(-6, 7, -8, 9, -6, 7, -8, 9); + #[rustfmt::skip] + let b = i16x8::new(1, -2, 3, -4, 1, -2, 3, -4); + #[rustfmt::skip] + let r = i16x8::new(0, 1, -2, 1, 0, 1, -2, 1); + + assert_eq!( + r, + mem::transmute(__msa_mod_s_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mod_s_w() { + #[rustfmt::skip] + let a = i32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let b = i32x4::new(1, -2, 3, -4); + #[rustfmt::skip] + let r = i32x4::new(0, 1, 2, 1); + + assert_eq!( + r, + mem::transmute(__msa_mod_s_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mod_s_d() { + #[rustfmt::skip] + let a = i64x2::new(6, -7); + #[rustfmt::skip] + let b = i64x2::new(-1, 2); + #[rustfmt::skip] + let r = i64x2::new(0, -1); + + assert_eq!( + r, + mem::transmute(__msa_mod_s_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mod_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let b = u8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let r = u8x16::new( + 0, 1, 2, 1, + 0, 1, 2, 1, + 0, 1, 2, 1, + 0, 1, 2, 1 + ); + + assert_eq!( + r, + mem::transmute(__msa_mod_u_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mod_u_h() { + #[rustfmt::skip] + let a = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9); + #[rustfmt::skip] + let b = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4); + #[rustfmt::skip] + let r = u16x8::new(0, 1, 2, 1, 0, 1, 2, 1); + + assert_eq!( + r, + mem::transmute(__msa_mod_u_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mod_u_w() { + #[rustfmt::skip] + let a = u32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let b = u32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let r = u32x4::new(0, 1, 2, 1); + + assert_eq!( + r, + mem::transmute(__msa_mod_u_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mod_u_d() { + #[rustfmt::skip] + let a = u64x2::new(6, 7); + #[rustfmt::skip] + let b = u64x2::new(1, 2); + #[rustfmt::skip] + let r = u64x2::new(0, 1); + + assert_eq!( + r, + mem::transmute(__msa_mod_u_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_move_v() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 1, 2, 3, 4, + 5, 6, 7, 8 + ); + #[rustfmt::skip] + let r = 
i8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 1, 2, 3, 4, + 5, 6, 7, 8 + ); + + assert_eq!(r, mem::transmute(__msa_move_v(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_msub_q_h() { + #[rustfmt::skip] + let a = i16x8::new( + 1024, -1024, 1024, -1024, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i16x8::new( + 1025, 1025, 1025, 1025, + 1025, 1025, 1025, 1025 + ); + #[rustfmt::skip] + let c = i16x8::new( + 1024, 2048, 3072, 4096, + 1024, 2048, 3072, 4096 + ); + #[rustfmt::skip] + let r = i16x8::new(991, -1089, 927, -1153, -32, -63, -94, -125); + + assert_eq!( + r, + mem::transmute(__msa_msub_q_h( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_msub_q_w() { + #[rustfmt::skip] + let a = i32x4::new(2147483647, -2147483647, 1, 2); + #[rustfmt::skip] + let b = i32x4::new(10240, 10240, 10240, 10240); + #[rustfmt::skip] + let c = i32x4::new(10240, 20480, 30720, 40960); + #[rustfmt::skip] + let r = i32x4::new(2147483646, -2147483648, 0, 1); + + assert_eq!( + r, + mem::transmute(__msa_msub_q_w( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_msubr_q_h() { + #[rustfmt::skip] + let a = i16x8::new( + 1024, -1024, 1024, -1024, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i16x8::new( + 1025, 1025, 1025, 1025, + 1025, 1025, 1025, 1025 + ); + #[rustfmt::skip] + let c = i16x8::new( + 1024, 2048, 3072, 4096, + 1024, 2048, 3072, 4096 + ); + #[rustfmt::skip] + let r = i16x8::new(992, -1088, 928, -1152, -31, -62, -93, -124); + + assert_eq!( + r, + mem::transmute(__msa_msubr_q_h( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_msubr_q_w() { + #[rustfmt::skip] + let a = i32x4::new(i32::MAX, -2147483647, 1, 2); + #[rustfmt::skip] + let b = i32x4::new(10240, 10240, 10240, 10240); + #[rustfmt::skip] + let c = i32x4::new(10240, 20480, 30720, 40960); + #[rustfmt::skip] + let r = i32x4::new(2147483647, -2147483647, 1, 2); + + assert_eq!( + r, + mem::transmute(__msa_msubr_q_w( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_msubv_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i8x16::new( + 5, 6, 7, 8, + 5, 6, 7, 8, + 5, 6, 7, 8, + 5, 6, 7, 8 + ); + #[rustfmt::skip] + let c = i8x16::new( + 9, 10, 11, 12, + 9, 10, 11, 12, + 9, 10, 11, 12, + 9, 10, 11, 12 + ); + #[rustfmt::skip] + let r = i8x16::new( + -44, -58, -74, -92, + -44, -58, -74, -92, + -44, -58, -74, -92, + -44, -58, -74, -92 + ); + + assert_eq!( + r, + mem::transmute(__msa_msubv_b( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_msubv_h() { + #[rustfmt::skip] + let a = i16x8::new(1, 2, 3, 4, 1, 2, 3, 4); + #[rustfmt::skip] + let b = i16x8::new(5, 6, 7, 8, 5, 6, 7, 8); + #[rustfmt::skip] + let c = i16x8::new(9, 10, 11, 12, 9, 10, 11, 12); + #[rustfmt::skip] + let r = i16x8::new(-44, -58, -74, -92, -44, -58, -74, -92); + + assert_eq!( + r, + mem::transmute(__msa_msubv_h( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_msubv_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 1, 2); + #[rustfmt::skip] + let b = i32x4::new(3, 4, 3, 4); + #[rustfmt::skip] + let c = 
i32x4::new(5, 6, 5, 6); + #[rustfmt::skip] + let r = i32x4::new(-14, -22, -14, -22); + + assert_eq!( + r, + mem::transmute(__msa_msubv_w( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_msubv_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + let b = i64x2::new(3, 4); + #[rustfmt::skip] + let c = i64x2::new(5, 6); + #[rustfmt::skip] + let r = i64x2::new(-14, -22); + + assert_eq!( + r, + mem::transmute(__msa_msubv_d( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mul_q_h() { + #[rustfmt::skip] + let a = i16x8::new( + 12500, -20, -300, 400, + 12500, 20, 300, 400 + ); + #[rustfmt::skip] + let b = i16x8::new( + 1250, 10240, -7585, 8456, + 1250, 10240, -7585, 8456 + ); + #[rustfmt::skip] + let r = i16x8::new(476, -7, 69, 103, 476, 6, -70, 103); + + assert_eq!( + r, + mem::transmute(__msa_mul_q_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mul_q_w() { + #[rustfmt::skip] + let a = i32x4::new( + i32::MAX, i32::MAX, + i32::MIN, i32::MIN + ); + #[rustfmt::skip] + let b = i32x4::new(30, 60, 30, 60); + #[rustfmt::skip] + let r = i32x4::new(29, 59, -30, -60); + + assert_eq!( + r, + mem::transmute(__msa_mul_q_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mulr_q_h() { + #[rustfmt::skip] + let a = i16x8::new( + 12500, -20, -300, 400, + 12500, 20, 300, 400 + ); + #[rustfmt::skip] + let b = i16x8::new( + 1250, 10240, -7585, 8456, + 1250, 10240, -7585, 8456 + ); + #[rustfmt::skip] + let r = i16x8::new(477, -6, 69, 103, 477, 6, -69, 103); + + assert_eq!( + r, + mem::transmute(__msa_mulr_q_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mulr_q_w() { + #[rustfmt::skip] + let a = i32x4::new( + i32::MAX, i32::MAX, + i32::MIN, i32::MIN + ); + #[rustfmt::skip] + let b = i32x4::new(30, 60, 30, 60); + #[rustfmt::skip] + let r = i32x4::new(30, 60, -30, -60); + + assert_eq!( + r, + mem::transmute(__msa_mulr_q_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mulv_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 + ); + #[rustfmt::skip] + let b = i8x16::new( + 16, 15, 14, 13, + 12, 11, 10, 9, + 8, 7, 6, 5, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + 16, 30, 42, 52, + 60, 66, 70, 72, + 72, 70, 66, 60, + 52, 42, 30, 16 + ); + + assert_eq!( + r, + mem::transmute(__msa_mulv_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mulv_h() { + #[rustfmt::skip] + let a = i16x8::new( + 1, 2, 3, 4, + 5, 6, 7, 8 + ); + #[rustfmt::skip] + let b = i16x8::new( + 8, 7, 6, 5, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i16x8::new(8, 14, 18, 20, 20, 18, 14, 8); + + assert_eq!( + r, + mem::transmute(__msa_mulv_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mulv_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = i32x4::new(4, 3, 2, 1); + #[rustfmt::skip] + let r = i32x4::new(4, 6, 6, 4); + + assert_eq!( + r, + mem::transmute(__msa_mulv_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_mulv_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + 
let b = i64x2::new(2, 1); + #[rustfmt::skip] + let r = i64x2::new(2, 2); + + assert_eq!( + r, + mem::transmute(__msa_mulv_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_nloc_b() { + #[rustfmt::skip] + let a = i8x16::new( + -128, -64, -32, -16, + -8, -4, -2, -1, + 1, 2, 4, 8, + 16, 32, 64, 127 + ); + #[rustfmt::skip] + let r = i8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 0, 0, 0, 0, + 0, 0, 0, 0 + ); + + assert_eq!(r, mem::transmute(__msa_nloc_b(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_nloc_h() { + #[rustfmt::skip] + let a = i16x8::new( + -32768, -16384, -8192, -4096, + 4096, 8192, 16384, 32767 + ); + #[rustfmt::skip] + let r = i16x8::new(1, 2, 3, 4, 0, 0, 0, 0); + + assert_eq!(r, mem::transmute(__msa_nloc_h(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_nloc_w() { + #[rustfmt::skip] + let a = i32x4::new( + i32::MIN, -1073741824, + 1073741824, i32::MAX + ); + #[rustfmt::skip] + let r = i32x4::new(1, 2, 0, 0); + + assert_eq!(r, mem::transmute(__msa_nloc_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_nloc_d() { + #[rustfmt::skip] + let a = i64x2::new(i64::MIN, i64::MAX); + #[rustfmt::skip] + let r = i64x2::new(1, 0); + + assert_eq!(r, mem::transmute(__msa_nloc_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_nlzc_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 + ); + #[rustfmt::skip] + let r = i8x16::new( + 7, 6, 6, 5, + 5, 5, 5, 4, + 4, 4, 4, 4, + 4, 4, 4, 3 + ); + + assert_eq!(r, mem::transmute(__msa_nlzc_b(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_nlzc_h() { + #[rustfmt::skip] + let a = i16x8::new( + 1, 2, 3, 4, + 5, 6, 7, 8 + ); + #[rustfmt::skip] + let r = i16x8::new(15, 14, 14, 13, 13, 13, 13, 12); + + assert_eq!(r, mem::transmute(__msa_nlzc_h(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_nlzc_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let r = i32x4::new(31, 30, 30, 29); + + assert_eq!(r, mem::transmute(__msa_nlzc_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_nlzc_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + let r = i64x2::new(63, 62); + + assert_eq!(r, mem::transmute(__msa_nlzc_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_nor_v() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 + ); + #[rustfmt::skip] + let b = u8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 + ); + #[rustfmt::skip] + let r = u8x16::new( + 254, 253, 252, 251, + 250, 249, 248, 247, + 246, 245, 244, 243, + 242, 241, 240, 239 + ); + + assert_eq!( + r, + mem::transmute(__msa_nor_v(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_nori_b() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 + ); + #[rustfmt::skip] + let r = u8x16::new( + 250, 249, 248, 251, + 250, 249, 248, 243, + 242, 241, 240, 243, + 242, 241, 240, 235 + ); + + assert_eq!(r, mem::transmute(__msa_nori_b(mem::transmute(a), 4))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_or_v() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 + ); + 
#[rustfmt::skip] + let b = u8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 + ); + #[rustfmt::skip] + let r = u8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 + ); + + assert_eq!( + r, + mem::transmute(__msa_or_v(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_ori_b() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 + ); + #[rustfmt::skip] + let r = u8x16::new( + 5, 6, 7, 4, + 5, 6, 7, 12, + 13, 14, 15, 12, + 13, 14, 15, 20 + ); + + assert_eq!(r, mem::transmute(__msa_ori_b(mem::transmute(a), 4))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_pckev_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i8x16::new( + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + 4, 2, 4, 2, + 4, 2, 4, 2, + 1, 3, 1, 3, + 1, 3, 1, 3 + ); + + assert_eq!( + r, + mem::transmute(__msa_pckev_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_pckev_h() { + #[rustfmt::skip] + let a = i16x8::new(1, 2, 3, 4, 1, 2, 3, 4); + #[rustfmt::skip] + let b = i16x8::new(4, 3, 2, 1, 4, 3, 2, 1); + #[rustfmt::skip] + let r = i16x8::new(4, 2, 4, 2, 1, 3, 1, 3); + + assert_eq!( + r, + mem::transmute(__msa_pckev_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_pckev_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = i32x4::new(4, 3, 2, 1); + #[rustfmt::skip] + let r = i32x4::new(4, 2, 1, 3); + + assert_eq!( + r, + mem::transmute(__msa_pckev_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_pckev_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + let b = i64x2::new(4, 3); + #[rustfmt::skip] + let r = i64x2::new(4, 1); + + assert_eq!( + r, + mem::transmute(__msa_pckev_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_pckod_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i8x16::new( + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + 3, 1, 3, 1, + 3, 1, 3, 1, + 2, 4, 2, 4, + 2, 4, 2, 4 + ); + + assert_eq!( + r, + mem::transmute(__msa_pckod_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_pckod_h() { + #[rustfmt::skip] + let a = i16x8::new(1, 2, 3, 4, 1, 2, 3, 4); + #[rustfmt::skip] + let b = i16x8::new(4, 3, 2, 1, 4, 3, 2, 1); + #[rustfmt::skip] + let r = i16x8::new(3, 1, 3, 1, 2, 4, 2, 4); + + assert_eq!( + r, + mem::transmute(__msa_pckod_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_pckod_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = i32x4::new(4, 3, 2, 1); + #[rustfmt::skip] + let r = i32x4::new(3, 1, 2, 4); + + assert_eq!( + r, + mem::transmute(__msa_pckod_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_pckod_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + let b = i64x2::new(4, 3); + #[rustfmt::skip] + let r = i64x2::new(3, 2); + + assert_eq!( + r, + 
mem::transmute(__msa_pckod_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_pcnt_b() { + #[rustfmt::skip] + let a = i8x16::new( + -128, -64, -32, -16, + -8, -4, -2, -1, + 1, 2, 4, 8, + 16, 32, 64, 127 + ); + #[rustfmt::skip] + let r = i8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 1, 1, 1, 1, + 1, 1, 1, 7 + ); + + assert_eq!(r, mem::transmute(__msa_pcnt_b(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_pcnt_h() { + #[rustfmt::skip] + let a = i16x8::new( + -32768, -16384, -8192, -4096, + 4096, 8192, 16384, 32767 + ); + #[rustfmt::skip] + let r = i16x8::new(1, 2, 3, 4, 1, 1, 1, 15); + + assert_eq!(r, mem::transmute(__msa_pcnt_h(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_pcnt_w() { + #[rustfmt::skip] + let a = i32x4::new( + i32::MIN, -1073741824, + 1073741824, i32::MAX + ); + #[rustfmt::skip] + let r = i32x4::new(1, 2, 1, 31); + + assert_eq!(r, mem::transmute(__msa_pcnt_w(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_pcnt_d() { + #[rustfmt::skip] + let a = i64x2::new(-2147483648, 2147483647); + #[rustfmt::skip] + let r = i64x2::new(33, 31); + + assert_eq!(r, mem::transmute(__msa_pcnt_d(mem::transmute(a)))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sat_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + i8::MAX, 105, 30, 1, + i8::MAX, 105, 30, 1, + i8::MAX, 105, 30, 1, + i8::MAX, 105, 30, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + 3, 3, 3, 1, + 3, 3, 3, 1, + 3, 3, 3, 1, + 3, 3, 3, 1 + ); + + assert_eq!(r, mem::transmute(__msa_sat_s_b(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sat_s_h() { + #[rustfmt::skip] + let a = i16x8::new( + i16::MAX, 1155, 155, 1, + i16::MAX, 1155, 155, 1 + ); + #[rustfmt::skip] + let r = i16x8::new(127, 127, 127, 1, 127, 127, 127, 1); + + assert_eq!(r, mem::transmute(__msa_sat_s_h(mem::transmute(a), 7))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sat_s_w() { + #[rustfmt::skip] + let a = i32x4::new(i32::MAX, 111111155, i32::MAX, 1); + #[rustfmt::skip] + let r = i32x4::new(131071, 131071, 131071, 1); + + assert_eq!(r, mem::transmute(__msa_sat_s_w(mem::transmute(a), 17))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sat_s_d() { + #[rustfmt::skip] + let a = i64x2::new(i64::MAX, 1); + #[rustfmt::skip] + let r = i64x2::new(137438953471, 1); + + assert_eq!(r, mem::transmute(__msa_sat_s_d(mem::transmute(a), 37))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sat_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + u8::MAX, 105, 30, 1, + u8::MAX, 105, 30, 1, + u8::MAX, 105, 30, 1, + u8::MAX, 105, 30, 1 + ); + #[rustfmt::skip] + let r = u8x16::new( + 7, 7, 7, 1, + 7, 7, 7, 1, + 7, 7, 7, 1, + 7, 7, 7, 1 + ); + + assert_eq!(r, mem::transmute(__msa_sat_u_b(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sat_u_h() { + #[rustfmt::skip] + let a = u16x8::new( + u16::MAX, 1155, 155, 1, + u16::MAX, 1155, 155, 1 + ); + #[rustfmt::skip] + let r = u16x8::new(255, 255, 155, 1, 255, 255, 155, 1); + + assert_eq!(r, mem::transmute(__msa_sat_u_h(mem::transmute(a), 7))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sat_u_w() { + #[rustfmt::skip] + let a = u32x4::new(u32::MAX, 111111155, u32::MAX, 1); + #[rustfmt::skip] + let r = u32x4::new(262143, 262143, 262143, 1); + + assert_eq!(r, mem::transmute(__msa_sat_u_w(mem::transmute(a), 17))); + } + + #[simd_test(enable = "msa")] + unsafe fn 
test_msa_sat_u_d() { + #[rustfmt::skip] + let a = u64x2::new(u64::MAX, 1); + #[rustfmt::skip] + let r = u64x2::new(274877906943, 1); + + assert_eq!(r, mem::transmute(__msa_sat_u_d(mem::transmute(a), 37))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_shf_b() { + #[rustfmt::skip] + let a = i8x16::new( + 11, 12, 3, 4, + 11, 12, 3, 4, + 11, 12, 3, 4, + 11, 12, 3, 4 + ); + #[rustfmt::skip] + let r = i8x16::new( + 11, 3, 4, 12, + 11, 3, 4, 12, + 11, 3, 4, 12, + 11, 3, 4, 12 + ); + + assert_eq!(r, mem::transmute(__msa_shf_b(mem::transmute(a), 120))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_shf_h() { + #[rustfmt::skip] + let a = i16x8::new( + 11, 12, 13, 14, + 11, 12, 13, 14 + ); + #[rustfmt::skip] + let r = i16x8::new(11, 14, 12, 13, 11, 14, 12, 13); + + assert_eq!(r, mem::transmute(__msa_shf_h(mem::transmute(a), 156))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_shf_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let r = i32x4::new(1, 3, 2, 4); + + assert_eq!(r, mem::transmute(__msa_shf_w(mem::transmute(a), 216))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sld_b() { + #[rustfmt::skip] + let a = i8x16::new( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15 + ); + #[rustfmt::skip] + let b = i8x16::new( + 16, 17, 18, 19, + 20, 21, 22, 23, + 24, 25, 26, 27, + 28, 29, 30, 31 + ); + #[rustfmt::skip] + let r = i8x16::new( + 21, 22, 23, 24, + 25, 26, 27, 28, + 29, 30, 31, 0, + 1, 2, 3, 4 + ); + + assert_eq!( + r, + mem::transmute(__msa_sld_b(mem::transmute(a), mem::transmute(b), 5)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sld_h() { + #[rustfmt::skip] + let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + #[rustfmt::skip] + let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15); + // let c = 5 as i32; + let r = i16x8::new(9, 10, 11, 0, 13, 14, 15, 4); + + assert_eq!( + r, + mem::transmute(__msa_sld_h(mem::transmute(a), mem::transmute(b), 2)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sld_w() { + #[rustfmt::skip] + let a = i32x4::new(0, 1, 2, 3); + #[rustfmt::skip] + let b = i32x4::new(4, 5, 6, 7); + #[rustfmt::skip] + let r = i32x4::new(4, 5, 6, 7); + + assert_eq!( + r, + mem::transmute(__msa_sld_w(mem::transmute(a), mem::transmute(b), 4)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sld_d() { + #[rustfmt::skip] + let a = i64x2::new(0, 1); + #[rustfmt::skip] + let b = i64x2::new(2, 3); + #[rustfmt::skip] + let r = i64x2::new(2, 3); + + assert_eq!( + r, + mem::transmute(__msa_sld_d(mem::transmute(a), mem::transmute(b), 2)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sldi_b() { + #[rustfmt::skip] + let a = i8x16::new( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15 + ); + #[rustfmt::skip] + let b = i8x16::new( + 16, 17, 18, 19, + 20, 21, 22, 23, + 24, 25, 26, 27, + 28, 29, 30, 31 + ); + #[rustfmt::skip] + let r = i8x16::new( + 21, 22, 23, 24, + 25, 26, 27, 28, + 29, 30, 31, 0, + 1, 2, 3, 4 + ); + + assert_eq!( + r, + mem::transmute(__msa_sldi_b(mem::transmute(a), mem::transmute(b), 5)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sldi_h() { + #[rustfmt::skip] + let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + #[rustfmt::skip] + let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15); + // let c = 5 as i32; + let r = i16x8::new(9, 10, 11, 0, 13, 14, 15, 4); + + assert_eq!( + r, + mem::transmute(__msa_sldi_h(mem::transmute(a), mem::transmute(b), 2)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn 
test_msa_sldi_w() { + #[rustfmt::skip] + let a = i32x4::new(0, 1, 2, 3); + #[rustfmt::skip] + let b = i32x4::new(4, 5, 6, 7); + #[rustfmt::skip] + let r = i32x4::new(4, 5, 6, 7); + + assert_eq!( + r, + mem::transmute(__msa_sldi_w(mem::transmute(a), mem::transmute(b), 4)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sldi_d() { + #[rustfmt::skip] + let a = i64x2::new(0, 1); + #[rustfmt::skip] + let b = i64x2::new(2, 3); + #[rustfmt::skip] + let r = i64x2::new(2, 3); + + assert_eq!( + r, + mem::transmute(__msa_sldi_d(mem::transmute(a), mem::transmute(b), 2)) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sll_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i8x16::new( + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + 16, 16, 12, 8, + 16, 16, 12, 8, + 16, 16, 12, 8, + 16, 16, 12, 8 + ); + + assert_eq!( + r, + mem::transmute(__msa_sll_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sll_h() { + #[rustfmt::skip] + let a = i16x8::new(1, 2, 3, 4, 1, 2, 3, 4); + #[rustfmt::skip] + let b = i16x8::new(4, 3, 2, 1, 4, 3, 2, 1); + #[rustfmt::skip] + let r = i16x8::new(16, 16, 12, 8, 16, 16, 12, 8); + + assert_eq!( + r, + mem::transmute(__msa_sll_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sll_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = i32x4::new(4, 3, 2, 1); + #[rustfmt::skip] + let r = i32x4::new(16, 16, 12, 8); + + assert_eq!( + r, + mem::transmute(__msa_sll_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sll_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + let b = i64x2::new(4, 3); + #[rustfmt::skip] + let r = i64x2::new(16, 16); + + assert_eq!( + r, + mem::transmute(__msa_sll_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_slli_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let r = i8x16::new( + 4, 8, 12, 16, + 4, 8, 12, 16, + 4, 8, 12, 16, + 4, 8, 12, 16 + ); + + assert_eq!(r, mem::transmute(__msa_slli_b(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_slli_h() { + #[rustfmt::skip] + let a = i16x8::new( + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let r = i16x8::new(4, 8, 12, 16, 4, 8, 12, 16); + + assert_eq!(r, mem::transmute(__msa_slli_h(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_slli_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let r = i32x4::new(4, 8, 12, 16); + + assert_eq!(r, mem::transmute(__msa_slli_w(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_slli_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + let r = i64x2::new(2, 4); + + assert_eq!(r, mem::transmute(__msa_slli_d(mem::transmute(a), 1))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_splat_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let r = i8x16::new( + 4, 4, 4, 4, + 4, 4, 4, 4, + 4, 4, 4, 4, + 4, 4, 4, 4 + ); + + assert_eq!(r, mem::transmute(__msa_splat_b(mem::transmute(a), 3))); + } + + #[simd_test(enable = "msa")] + 
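// __msa_splat_h broadcasts the lane selected by its (runtime) index argument
// across every lane; the test below picks lane 3 of `a`, so all eight lanes of
// the result are 4.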
unsafe fn test_msa_splat_h() { + #[rustfmt::skip] + let a = i16x8::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + ); + #[rustfmt::skip] + let r = i16x8::new(4, 4, 4, 4, 4, 4, 4, 4); + + assert_eq!(r, mem::transmute(__msa_splat_h(mem::transmute(a), 3))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_splat_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let r = i32x4::new(4, 4, 4, 4); + + assert_eq!(r, mem::transmute(__msa_splat_w(mem::transmute(a), 3))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_splat_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + let r = i64x2::new(2, 2); + + assert_eq!(r, mem::transmute(__msa_splat_d(mem::transmute(a), 3))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_splati_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let r = i8x16::new( + 3, 3, 3, 3, + 3, 3, 3, 3, + 3, 3, 3, 3, + 3, 3, 3, 3 + ); + + assert_eq!(r, mem::transmute(__msa_splati_b(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_splati_h() { + #[rustfmt::skip] + let a = i16x8::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + ); + #[rustfmt::skip] + let r = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3); + + assert_eq!(r, mem::transmute(__msa_splati_h(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_splati_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let r = i32x4::new(3, 3, 3, 3); + + assert_eq!(r, mem::transmute(__msa_splati_w(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_splati_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + let r = i64x2::new(2, 2); + + assert_eq!(r, mem::transmute(__msa_splati_d(mem::transmute(a), 1))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sra_b() { + #[rustfmt::skip] + let a = i8x16::new( + -128, -64, -32, -16, + -8, -4, -2, -1, + 1, 2, 4, 8, + 16, 32, 64, 127 + ); + #[rustfmt::skip] + let b = i8x16::new( + 8, 7, 6, 5, + 4, 3, 2, 1, + 8, 7, 6, 5, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + -128, -1, -1, -1, + -1, -1, -1, -1, + 1, 0, 0, 0, + 1, 4, 16, 63 + ); + + assert_eq!( + r, + mem::transmute(__msa_sra_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sra_h() { + #[rustfmt::skip] + let a = i16x8::new( + -32768, -16384, -8192, -4096, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i16x8::new( + 15, 14, 13, 12, + 12, 13, 14, 15 + ); + #[rustfmt::skip] + let r = i16x8::new( + -1, -1, -1, -1, + 0, 0, 0, 0 + ); + + assert_eq!( + r, + mem::transmute(__msa_sra_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sra_w() { + #[rustfmt::skip] + let a = i32x4::new(i32::MIN, -1073741824, 1, 2); + #[rustfmt::skip] + let b = i32x4::new(16, 15, 16, 15); + #[rustfmt::skip] + let r = i32x4::new(-32768, -32768, 0, 0); + + assert_eq!( + r, + mem::transmute(__msa_sra_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_sra_d() { + #[rustfmt::skip] + let a = i64x2::new(i64::MIN, i64::MAX); + #[rustfmt::skip] + let b = i64x2::new(32, 31); + #[rustfmt::skip] + let r = i64x2::new(-2147483648, 4294967295); + + assert_eq!( + r, + mem::transmute(__msa_sra_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srai_b() { + #[rustfmt::skip] + let a = i8x16::new( + 
i8::MAX, 125, 55, 1, + i8::MAX, 125, 55, 1, + i8::MAX, 125, 55, 1, + i8::MAX, 125, 55, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + 31, 31, 13, 0, + 31, 31, 13, 0, + 31, 31, 13, 0, + 31, 31, 13, 0 + ); + + assert_eq!(r, mem::transmute(__msa_srai_b(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srai_h() { + #[rustfmt::skip] + let a = i16x8::new( + i16::MAX, 125, 55, 1, + i16::MAX, 125, 55, 1 + ); + #[rustfmt::skip] + let r = i16x8::new(8191, 31, 13, 0, 8191, 31, 13, 0); + + assert_eq!(r, mem::transmute(__msa_srai_h(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srai_w() { + #[rustfmt::skip] + let a = i32x4::new(i32::MAX, 125, 55, 1); + let r = i32x4::new(536870911, 31, 13, 0); + + assert_eq!(r, mem::transmute(__msa_srai_w(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srai_d() { + #[rustfmt::skip] + let a = i64x2::new(i64::MAX, 55); + #[rustfmt::skip] + let r = i64x2::new(2305843009213693951, 13); + + assert_eq!(r, mem::transmute(__msa_srai_d(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srar_b() { + #[rustfmt::skip] + let a = i8x16::new( + -128, -64, -32, -16, + -8, -4, -2, -1, + 1, 2, 4, 8, + 16, 32, 64, 127 + ); + #[rustfmt::skip] + let b = i8x16::new( + 4, 3, 2, 1, + 4, 3, 2, 1, + 8, 7, 6, 5, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + -8, -8, -8, -8, + 0, 0, 0, 0, + 1, 0, 0, 0, + 1, 4, 16, 64 + ); + + assert_eq!( + r, + mem::transmute(__msa_srar_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srar_h() { + #[rustfmt::skip] + let a = i16x8::new( + i16::MIN, -16384, -8192, -4096, + 150, 50, 25, 15 + ); + #[rustfmt::skip] + let b = i16x8::new( + 4, 3, 2, 1, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let r = i16x8::new( + -2048, -2048, -2048, -2048, + 75, 13, 3, 1 + ); + + assert_eq!( + r, + mem::transmute(__msa_srar_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srar_w() { + #[rustfmt::skip] + let a = i32x4::new(i32::MIN, -1073741824, 100, 50); + #[rustfmt::skip] + let b = i32x4::new(16, 15, 1, 2); + #[rustfmt::skip] + let r = i32x4::new(-32768, -32768, 50, 13); + + assert_eq!( + r, + mem::transmute(__msa_srar_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srar_d() { + #[rustfmt::skip] + let a = i64x2::new(i64::MIN, i64::MAX); + #[rustfmt::skip] + let b = i64x2::new(32, 31); + #[rustfmt::skip] + let r = i64x2::new(-2147483648, 4294967296); + + assert_eq!( + r, + mem::transmute(__msa_srar_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srari_b() { + #[rustfmt::skip] + let a = i8x16::new( + 125, i8::MAX, 55, 1, + 125, i8::MAX, 55, 1, + 125, i8::MAX, 55, 1, + 125, i8::MAX, 55, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + 31, 32, 14, 0, + 31, 32, 14, 0, + 31, 32, 14, 0, + 31, 32, 14, 0 + ); + + assert_eq!(r, mem::transmute(__msa_srari_b(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srari_h() { + #[rustfmt::skip] + let a = i16x8::new(2155, 1155, 155, 1, 2155, 1155, 155, 1); + #[rustfmt::skip] + let r = i16x8::new(539, 289, 39, 0, 539, 289, 39, 0); + + assert_eq!(r, mem::transmute(__msa_srari_h(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srari_w() { + #[rustfmt::skip] + let a = i32x4::new(211111155, 111111155, 11111155, 1); + 
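// __msa_srari_w is an arithmetic shift right by the immediate with rounding,
// effectively (x + (1 << (imm - 1))) >> imm; with imm = 2 the first lane becomes
// (211111155 + 2) >> 2 = 52777789, which is the `r` checked below.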
#[rustfmt::skip] + let r = i32x4::new(52777789, 27777789, 2777789, 0); + + assert_eq!(r, mem::transmute(__msa_srari_w(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srari_d() { + #[rustfmt::skip] + let a = i64x2::new(211111111155, 111111111155); + #[rustfmt::skip] + let r = i64x2::new(52777777789, 27777777789); + + assert_eq!(r, mem::transmute(__msa_srari_d(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srl_b() { + #[rustfmt::skip] + let a = i8x16::new( + -128, -64, -32, -16, + -8, -4, -2, -1, + 1, 2, 4, 8, + 16, 32, 64, 127 + ); + #[rustfmt::skip] + let b = i8x16::new( + 8, 7, 6, 5, + 4, 3, 2, 1, + 8, 7, 6, 5, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + -128, 1, 3, 7, + 15, 31, 63, 127, + 1, 0, 0, 0, + 1, 4, 16, 63 + ); + + assert_eq!( + r, + mem::transmute(__msa_srl_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srl_h() { + #[rustfmt::skip] + let a = i16x8::new( + -32768, -16384, -8192, -4096, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i16x8::new( + 15, 14, 13, 12, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i16x8::new(1, 3, 7, 15, 0, 0, 0, 2); + + assert_eq!( + r, + mem::transmute(__msa_srl_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srl_w() { + #[rustfmt::skip] + let a = i32x4::new(i32::MIN, -1073741824, 1, 2); + #[rustfmt::skip] + let b = i32x4::new(16, 15, 16, 15); + #[rustfmt::skip] + let r = i32x4::new(32768, 98304, 0, 0); + + assert_eq!( + r, + mem::transmute(__msa_srl_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srl_d() { + #[rustfmt::skip] + let a = i64x2::new(i64::MIN, i64::MAX); + #[rustfmt::skip] + let b = i64x2::new(32, 31); + #[rustfmt::skip] + let r = i64x2::new(2147483648, 4294967295); + + assert_eq!( + r, + mem::transmute(__msa_srl_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srli_b() { + #[rustfmt::skip] + let a = i8x16::new( + 25, 50, 100, 127, + 25, 50, 100, 127, + 25, 50, 100, 127, + 25, 50, 100, 127 + ); + #[rustfmt::skip] + let r = i8x16::new( + 6, 12, 25, 31, + 6, 12, 25, 31, + 6, 12, 25, 31, + 6, 12, 25, 31 + ); + + assert_eq!(r, mem::transmute(__msa_srli_b(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srli_h() { + #[rustfmt::skip] + let a = i16x8::new( + i16::MAX, 3276, 100, 127, + i16::MAX, 3276, 100, 127 + ); + #[rustfmt::skip] + let r = i16x8::new( + 8191, 819, 25, 31, + 8191, 819, 25, 31 + ); + + assert_eq!(r, mem::transmute(__msa_srli_h(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srli_w() { + #[rustfmt::skip] + let a = i32x4::new(100, i32::MAX, 100, i32::MAX); + #[rustfmt::skip] + let r = i32x4::new(25, 536870911, 25, 536870911); + + assert_eq!(r, mem::transmute(__msa_srli_w(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srli_d() { + #[rustfmt::skip] + let a = i64x2::new(100, i64::MAX); + #[rustfmt::skip] + let r = i64x2::new(50, 4611686018427387903); + + assert_eq!(r, mem::transmute(__msa_srli_d(mem::transmute(a), 1))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srlr_b() { + #[rustfmt::skip] + let a = i8x16::new( + -128, -64, -32, -16, + -8, -4, -2, -1, + 1, 2, 4, 8, + 16, 32, 64, 127 + ); + #[rustfmt::skip] + let b = i8x16::new( + 8, 7, 6, 5, + 4, 3, 2, 1, + 8, 7, 6, 5, + 4, 3, 2, 1 + ); + #[rustfmt::skip] 
+ let r = i8x16::new( + -128, 2, 4, 8, + 16, 32, 64, -128, + 1, 0, 0, 0, + 1, 4, 16, 64 + ); + + assert_eq!( + r, + mem::transmute(__msa_srlr_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srlr_h() { + #[rustfmt::skip] + let a = i16x8::new( + -32768, -16384, -8192, -4096, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i16x8::new( + 15, 14, 13, 12, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i16x8::new(1, 3, 7, 15, 0, 0, 1, 2); + + assert_eq!( + r, + mem::transmute(__msa_srlr_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srlr_w() { + #[rustfmt::skip] + let a = i32x4::new(i32::MIN, -1073741824, 1, 2); + #[rustfmt::skip] + let b = i32x4::new(16, 15, 16, 15); + let r = i32x4::new(32768, 98304, 0, 0); + + assert_eq!( + r, + mem::transmute(__msa_srlr_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srlr_d() { + #[rustfmt::skip] + let a = i64x2::new(i64::MIN, i64::MAX); + #[rustfmt::skip] + let b = i64x2::new(32, 31); + #[rustfmt::skip] + let r = i64x2::new(2147483648, 4294967296); + + assert_eq!( + r, + mem::transmute(__msa_srlr_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srlri_b() { + #[rustfmt::skip] + let a = i8x16::new( + 25, 50, 100, i8::MAX, + 25, 50, 100, i8::MAX, + 25, 50, 100, i8::MAX, + 25, 50, 100, i8::MAX + ); + #[rustfmt::skip] + let r = i8x16::new( + 6, 13, 25, 32, + 6, 13, 25, 32, + 6, 13, 25, 32, + 6, 13, 25, 32 + ); + + assert_eq!(r, mem::transmute(__msa_srlri_b(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srlri_h() { + #[rustfmt::skip] + let a = i16x8::new( + i16::MAX, 3276, 100, 127, + i16::MAX, 3276, 100, 127 + ); + let r = i16x8::new(8192, 819, 25, 32, 8192, 819, 25, 32); + + assert_eq!(r, mem::transmute(__msa_srlri_h(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srlri_w() { + #[rustfmt::skip] + let a = i32x4::new(100, 150, 200, i32::MAX); + #[rustfmt::skip] + let r = i32x4::new(25, 38, 50, 536870912); + + assert_eq!(r, mem::transmute(__msa_srlri_w(mem::transmute(a), 2))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_srlri_d() { + #[rustfmt::skip] + let a = i64x2::new(100, i64::MAX); + #[rustfmt::skip] + let r = i64x2::new(50, 4611686018427387904); + + assert_eq!(r, mem::transmute(__msa_srlri_d(mem::transmute(a), 1))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_st_b() { + #[rustfmt::skip] + let a = i8x16::new( + 13, 14, 15, 16, + 17, 18, 19, 20, + 21, 22, 23, 24, + 25, 26, 27, 28 + ); + #[rustfmt::skip] + let mut arr : [i8; 16] = [ + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 + ]; + #[rustfmt::skip] + let r : [i8; 16] = [ + 13, 14, 15, 16, + 17, 18, 19, 20, + 21, 22, 23, 24, + 25, 26, 27, 28 + ]; + __msa_st_b(mem::transmute(a), arr.as_mut_ptr() as *mut u8, 0); + assert_eq!(arr, r); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_st_h() { + #[rustfmt::skip] + let a = i16x8::new(13, 14, 15, 16, 17, 18, 19, 20); + let mut arr: [i16; 8] = [0, 0, 0, 0, 0, 0, 0, 0]; + #[rustfmt::skip] + let r : [i16; 8] = [13, 14, 15, 16, 17, 18, 19, 20]; + __msa_st_h(mem::transmute(a), arr.as_mut_ptr() as *mut u8, 0); + assert_eq!(arr, r); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_st_w() { + #[rustfmt::skip] + let a = i32x4::new(13, 14, 15, 16); + let mut arr: [i32; 4] = [0, 0, 0, 0]; + #[rustfmt::skip] + let r : [i32; 4] = [13, 
14, 15, 16]; + __msa_st_w(mem::transmute(a), arr.as_mut_ptr() as *mut u8, 0); + assert_eq!(arr, r); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_st_d() { + #[rustfmt::skip] + let a = i64x2::new(13, 14); + let mut arr: [i64; 2] = [0, 0]; + #[rustfmt::skip] + let r : [i64; 2] = [13, 14]; + __msa_st_d(mem::transmute(a), arr.as_mut_ptr() as *mut u8, 0); + assert_eq!(arr, r); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subs_s_b() { + #[rustfmt::skip] + let a = i8x16::new( + i8::MIN, -2, -3, -4, + i8::MIN, -2, -3, -4, + i8::MIN, -2, -3, -4, + i8::MIN, -2, -3, -4 + ); + #[rustfmt::skip] + let b = i8x16::new( + 6, -7, 8, -9, + 6, -7, 8, -9, + 6, -7, 8, -9, + 6, -7, 8, -9 + ); + #[rustfmt::skip] + let r = i8x16::new( + i8::MIN, 5, -11, 5, + i8::MIN, 5, -11, 5, + i8::MIN, 5, -11, 5, + i8::MIN, 5, -11, 5 + ); + + assert_eq!( + r, + mem::transmute(__msa_subs_s_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subs_s_h() { + #[rustfmt::skip] + let a = i16x8::new( + i16::MIN, -2, -3, -4, + i16::MIN, -2, -3, -4 + ); + #[rustfmt::skip] + let b = i16x8::new(6, -7, 8, -9, 6, -7, 8, -9); + #[rustfmt::skip] + let r = i16x8::new( + i16::MIN, 5, -11, 5, + i16::MIN, 5, -11, 5 + ); + + assert_eq!( + r, + mem::transmute(__msa_subs_s_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subs_s_w() { + #[rustfmt::skip] + let a = i32x4::new(i32::MIN, -2, -3, -4); + #[rustfmt::skip] + let b = i32x4::new(6, -7, 8, -9); + #[rustfmt::skip] + let r = i32x4::new(i32::MIN, 5, -11, 5); + + assert_eq!( + r, + mem::transmute(__msa_subs_s_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subs_s_d() { + #[rustfmt::skip] + let a = i64x2::new(i64::MIN, -2); + #[rustfmt::skip] + let b = i64x2::new(6, -7); + #[rustfmt::skip] + let r = i64x2::new(i64::MIN, 5); + + assert_eq!( + r, + mem::transmute(__msa_subs_s_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subs_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + u8::MAX, 2, 3, 4, + u8::MAX, 2, 3, 4, + u8::MAX, 2, 3, 4, + u8::MAX, 2, 3, 4 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9, + 6, 7, 8, 9 + ); + #[rustfmt::skip] + let r = u8x16::new( + 249, 0, 0, 0, + 249, 0, 0, 0, + 249, 0, 0, 0, + 249, 0, 0, 0 + ); + + assert_eq!( + r, + mem::transmute(__msa_subs_u_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subs_u_h() { + #[rustfmt::skip] + let a = u16x8::new( + u16::MAX, 2, 3, 4, + u16::MAX, 2, 3, 4 + ); + #[rustfmt::skip] + let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9); + #[rustfmt::skip] + let r = u16x8::new(65529, 0, 0, 0, 65529, 0, 0, 0); + + assert_eq!( + r, + mem::transmute(__msa_subs_u_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subs_u_w() { + #[rustfmt::skip] + let a = u32x4::new(u32::MAX, 2, 3, 4); + #[rustfmt::skip] + let b = u32x4::new(6, 7, 8, 9); + #[rustfmt::skip] + let r = u32x4::new(4294967289, 0, 0, 0); + + assert_eq!( + r, + mem::transmute(__msa_subs_u_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subs_u_d() { + #[rustfmt::skip] + let a = u64x2::new(u64::MAX, 2); + #[rustfmt::skip] + let b = u64x2::new(6, 7); + #[rustfmt::skip] + let r = u64x2::new(18446744073709551609, 0); + + assert_eq!( + r, + 
mem::transmute(__msa_subs_u_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subsus_u_b() { + #[rustfmt::skip] + let a = u8x16::new( + u8::MAX, 2, 3, 4, + u8::MAX, 2, 3, 4, + u8::MAX, 2, 3, 4, + u8::MAX, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i8x16::new( + -6, -7, -8, -9, + -6, -7, -8, -9, + -6, -7, -8, -9, + -6, -7, -8, -9 + ); + #[rustfmt::skip] + let r = u8x16::new( + 255, 9, 11, 13, + 255, 9, 11, 13, + 255, 9, 11, 13, + 255, 9, 11, 13 + ); + + assert_eq!( + r, + mem::transmute(__msa_subsus_u_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subsus_u_h() { + #[rustfmt::skip] + let a = u16x8::new( + u16::MAX, 2, 3, 4, + u16::MAX, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i16x8::new(-6, -7, -8, -9, -6, -7, -8, -9); + #[rustfmt::skip] + let r = u16x8::new(65535, 9, 11, 13, 65535, 9, 11, 13); + + assert_eq!( + r, + mem::transmute(__msa_subsus_u_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subsus_u_w() { + #[rustfmt::skip] + let a = u32x4::new(u32::MAX, 2, 3, 4); + #[rustfmt::skip] + let b = i32x4::new(-6, -7, -8, -9); + #[rustfmt::skip] + let r = u32x4::new(4294967295, 9, 11, 13); + + assert_eq!( + r, + mem::transmute(__msa_subsus_u_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subsus_u_d() { + #[rustfmt::skip] + let a = u64x2::new(u64::MAX, 2); + #[rustfmt::skip] + let b = i64x2::new(-6, -7); + #[rustfmt::skip] + let r = u64x2::new(18446744073709551615, 9); + + assert_eq!( + r, + mem::transmute(__msa_subsus_u_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subsuu_s_b() { + #[rustfmt::skip] + let a = u8x16::new( + u8::MAX, 2, 3, 4, + u8::MAX, 2, 3, 4, + u8::MAX, 2, 3, 4, + u8::MAX, 2, 3, 4 + ); + #[rustfmt::skip] + let b = u8x16::new( + 6, 7, 8, u8::MAX, + 6, 7, 8, u8::MAX, + 6, 7, 8, u8::MAX, + 6, 7, 8, u8::MAX + ); + #[rustfmt::skip] + let r = i8x16::new( + 127, -5, -5, -128, + 127, -5, -5, -128, + 127, -5, -5, -128, + 127, -5, -5, -128 + ); + + assert_eq!( + r, + mem::transmute(__msa_subsuu_s_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subsuu_s_h() { + #[rustfmt::skip] + let a = u16x8::new( + u16::MAX, 2, 3, + 4, u16::MAX, 2, 3, 4 + ); + #[rustfmt::skip] + let b = u16x8::new(6, 7, 8, 65535, 6, 7, 8, 65535); + #[rustfmt::skip] + let r = i16x8::new(32767, -5, -5, -32768, 32767, -5, -5, -32768); + + assert_eq!( + r, + mem::transmute(__msa_subsuu_s_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subsuu_s_w() { + #[rustfmt::skip] + let a = u32x4::new(u32::MAX, 2, 3, 4); + #[rustfmt::skip] + let b = u32x4::new(6, 7, 8, 4294967295); + #[rustfmt::skip] + let r = i32x4::new(2147483647, -5, -5, -2147483648); + + assert_eq!( + r, + mem::transmute(__msa_subsuu_s_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subsuu_s_d() { + #[rustfmt::skip] + let a = u64x2::new(u64::MAX, 2); + #[rustfmt::skip] + let b = u64x2::new(6, 7); + #[rustfmt::skip] + let r = i64x2::new(i64::MAX, -5); + + assert_eq!( + r, + mem::transmute(__msa_subsuu_s_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subv_b() { + #[rustfmt::skip] + let a = i8x16::new( + i8::MIN, -2, -3, -4, + i8::MIN, -2, -3, -4, + i8::MIN, -2, -3, -4, + 
i8::MIN, -2, -3, -4 + ); + #[rustfmt::skip] + let b = i8x16::new( + 6, -7, 8, -9, + 6, -7, 8, -9, + 6, -7, 8, -9, + 6, -7, 8, -9 + ); + #[rustfmt::skip] + let r = i8x16::new( + 122, 5, -11, 5, + 122, 5, -11, 5, + 122, 5, -11, 5, + 122, 5, -11, 5 + ); + + assert_eq!( + r, + mem::transmute(__msa_subv_b(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subv_h() { + #[rustfmt::skip] + let a = i16x8::new( + i16::MIN, -2, -3, -4, + i16::MIN, -2, -3, -4 + ); + #[rustfmt::skip] + let b = i16x8::new(6, -7, 8, -9, 6, -7, 8, -9); + #[rustfmt::skip] + let r = i16x8::new(32762, 5, -11, 5, 32762, 5, -11, 5); + + assert_eq!( + r, + mem::transmute(__msa_subv_h(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subv_w() { + #[rustfmt::skip] + let a = i32x4::new(i32::MIN, -2, -3, -4); + #[rustfmt::skip] + let b = i32x4::new(6, -7, 8, -9); + #[rustfmt::skip] + let r = i32x4::new(2147483642, 5, -11, 5); + + assert_eq!( + r, + mem::transmute(__msa_subv_w(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subv_d() { + #[rustfmt::skip] + let a = i64x2::new(i64::MAX, -2); + #[rustfmt::skip] + let b = i64x2::new(6, -7); + #[rustfmt::skip] + let r = i64x2::new(9223372036854775801, 5); + + assert_eq!( + r, + mem::transmute(__msa_subv_d(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subvi_b() { + #[rustfmt::skip] + let a = i8x16::new( + 100, i8::MAX, 50, i8::MIN, + 100, i8::MAX, 50, i8::MIN, + 100, i8::MAX, 50, i8::MIN, + 100, i8::MAX, 50, i8::MIN + ); + #[rustfmt::skip] + let r = i8x16::new( + 95, 122, 45, 123, + 95, 122, 45, 123, + 95, 122, 45, 123, + 95, 122, 45, 123 + ); + + assert_eq!(r, mem::transmute(__msa_subvi_b(mem::transmute(a), 5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subvi_h() { + #[rustfmt::skip] + let a = i16x8::new( + i16::MAX, 3276, -100, i16::MIN, + i16::MAX, 3276, -100, i16::MIN + ); + #[rustfmt::skip] + let r = i16x8::new( + 32762, 3271, -105, 32763, + 32762, 3271, -105, 32763 + ); + + assert_eq!(r, mem::transmute(__msa_subvi_h(mem::transmute(a), 5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subvi_w() { + #[rustfmt::skip] + let a = i32x4::new(100, 150, 200, i32::MAX); + #[rustfmt::skip] + let r = i32x4::new(95, 145, 195, 2147483642); + + assert_eq!(r, mem::transmute(__msa_subvi_w(mem::transmute(a), 5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_subvi_d() { + #[rustfmt::skip] + let a = i64x2::new(100, i64::MAX); + #[rustfmt::skip] + let r = i64x2::new(95, 9223372036854775802); + + assert_eq!(r, mem::transmute(__msa_subvi_d(mem::transmute(a), 5))); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_vshf_b() { + #[rustfmt::skip] + let a = i8x16::new( + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i8x16::new( + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let c = i8x16::new( + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = i8x16::new( + 3, 2, 1, 4, + 3, 2, 1, 4, + 3, 2, 1, 4, + 3, 2, 1, 4 + ); + + assert_eq!( + r, + mem::transmute(__msa_vshf_b( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_vshf_h() { + #[rustfmt::skip] + let a = i16x8::new( + 1, 2, 3, 4, + 1, 2, 3, 4 + ); + #[rustfmt::skip] + let b = i16x8::new( + 4, 3, 2, 1, + 4, 3, 2, 
1 + ); + #[rustfmt::skip] + let c = i16x8::new( + 4, 3, 2, 1, + 4, 3, 2, 1 + ); + let r = i16x8::new(3, 2, 1, 4, 3, 2, 1, 4); + + assert_eq!( + r, + mem::transmute(__msa_vshf_h( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_vshf_w() { + #[rustfmt::skip] + let a = i32x4::new(1, 2, 3, 4); + #[rustfmt::skip] + let b = i32x4::new(4, 3, 2, 1); + #[rustfmt::skip] + let c = i32x4::new(4, 3, 2, 1); + #[rustfmt::skip] + let r = i32x4::new(3, 2, 1, 4); + + assert_eq!( + r, + mem::transmute(__msa_vshf_w( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_vshf_d() { + #[rustfmt::skip] + let a = i64x2::new(1, 2); + #[rustfmt::skip] + let b = i64x2::new(4, 3); + #[rustfmt::skip] + let c = i64x2::new(4, 3); + #[rustfmt::skip] + let r = i64x2::new(3, 4); + + assert_eq!( + r, + mem::transmute(__msa_vshf_d( + mem::transmute(a), + mem::transmute(b), + mem::transmute(c) + )) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_xor_v() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 + ); + #[rustfmt::skip] + let b = u8x16::new( + 16, 15, 14, 13, + 12, 11, 10, 9, + 8, 7, 6, 5, + 4, 3, 2, 1 + ); + #[rustfmt::skip] + let r = u8x16::new( + 17, 13, 13, 9, + 9, 13, 13, 1, + 1, 13, 13, 9, + 9, 13, 13, 17 + ); + + assert_eq!( + r, + mem::transmute(__msa_xor_v(mem::transmute(a), mem::transmute(b))) + ); + } + + #[simd_test(enable = "msa")] + unsafe fn test_msa_xori_b() { + #[rustfmt::skip] + let a = u8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 + ); + #[rustfmt::skip] + let r = u8x16::new( + 5, 6, 7, 0, + 1, 2, 3, 12, + 13, 14, 15, 8, + 9, 10, 11, 20 + ); + + assert_eq!(r, mem::transmute(__msa_xori_b(mem::transmute(a), 4))); + } +} diff --git a/library/stdarch/crates/core_arch/src/mod.rs b/library/stdarch/crates/core_arch/src/mod.rs new file mode 100644 index 0000000000000..f6e959efd47cb --- /dev/null +++ b/library/stdarch/crates/core_arch/src/mod.rs @@ -0,0 +1,343 @@ +//! `core_arch` + +#![allow(unknown_lints, unnecessary_transmutes)] + +#[macro_use] +mod macros; + +#[cfg(any(target_arch = "riscv32", target_arch = "riscv64", doc))] +mod riscv_shared; + +#[cfg(any( + target_arch = "arm", + target_arch = "aarch64", + target_arch = "arm64ec", + doc +))] +mod arm_shared; + +mod simd; + +#[doc = include_str!("core_arch_docs.md")] +#[stable(feature = "simd_arch", since = "1.27.0")] +pub mod arch { + /// Platform-specific intrinsics for the `x86` platform. + /// + /// See the [module documentation](../index.html) for more details. + #[cfg(any(target_arch = "x86", doc))] + #[doc(cfg(target_arch = "x86"))] + #[stable(feature = "simd_x86", since = "1.27.0")] + pub mod x86 { + #[stable(feature = "simd_x86", since = "1.27.0")] + pub use crate::core_arch::x86::*; + } + + /// Platform-specific intrinsics for the `x86_64` platform. + /// + /// See the [module documentation](../index.html) for more details. + #[cfg(any(target_arch = "x86_64", doc))] + #[doc(cfg(target_arch = "x86_64"))] + #[stable(feature = "simd_x86", since = "1.27.0")] + pub mod x86_64 { + #[stable(feature = "simd_x86", since = "1.27.0")] + pub use crate::core_arch::x86::*; + #[stable(feature = "simd_x86", since = "1.27.0")] + pub use crate::core_arch::x86_64::*; + } + + /// Platform-specific intrinsics for the `arm` platform. + /// + /// See the [module documentation](../index.html) for more details. 
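As a consumer-side illustration of the `x86`/`x86_64` re-exports above, a minimal sketch assuming `std` is available for runtime feature detection; the intrinsic names are the stable `core::arch::x86_64` ones, while `add_i32x8` is only an illustrative helper, not part of this module.

```rust
#[cfg(target_arch = "x86_64")]
fn add_i32x8(a: [i32; 8], b: [i32; 8]) -> [i32; 8] {
    // Takes the AVX2 path only when the CPU reports support at runtime,
    // otherwise falls back to wrapping scalar adds (matching the wrapping
    // behaviour of `_mm256_add_epi32`).
    #[target_feature(enable = "avx2")]
    unsafe fn add_avx2(a: [i32; 8], b: [i32; 8]) -> [i32; 8] {
        use core::arch::x86_64::{
            __m256i, _mm256_add_epi32, _mm256_loadu_si256, _mm256_storeu_si256,
        };
        let va = _mm256_loadu_si256(a.as_ptr() as *const __m256i);
        let vb = _mm256_loadu_si256(b.as_ptr() as *const __m256i);
        let mut out = [0i32; 8];
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, _mm256_add_epi32(va, vb));
        out
    }
    if std::arch::is_x86_feature_detected!("avx2") {
        // SAFETY: the `avx2` feature was just verified at runtime.
        unsafe { add_avx2(a, b) }
    } else {
        core::array::from_fn(|i| a[i].wrapping_add(b[i]))
    }
}
```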
+ #[cfg(any(target_arch = "arm", doc))] + #[doc(cfg(target_arch = "arm"))] + #[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] + pub mod arm { + #[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] + pub use crate::core_arch::arm::*; + } + + /// Platform-specific intrinsics for the `aarch64` platform. + /// + /// See the [module documentation](../index.html) for more details. + #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec", doc))] + #[doc(cfg(any(target_arch = "aarch64", target_arch = "arm64ec")))] + #[stable(feature = "neon_intrinsics", since = "1.59.0")] + pub mod aarch64 { + #[stable(feature = "neon_intrinsics", since = "1.59.0")] + pub use crate::core_arch::aarch64::*; + } + + /// Platform-specific intrinsics for the `riscv32` platform. + /// + /// See the [module documentation](../index.html) for more details. + #[cfg(any(target_arch = "riscv32", doc))] + #[doc(cfg(any(target_arch = "riscv32")))] + #[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] + pub mod riscv32 { + pub use crate::core_arch::riscv_shared::*; + pub use crate::core_arch::riscv32::*; + } + + /// Platform-specific intrinsics for the `riscv64` platform. + /// + /// See the [module documentation](../index.html) for more details. + #[cfg(any(target_arch = "riscv64", doc))] + #[doc(cfg(any(target_arch = "riscv64")))] + #[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] + pub mod riscv64 { + pub use crate::core_arch::riscv64::*; + // RISC-V RV64 supports all RV32 instructions as well in current specifications (2022-01-05). + // Module `riscv_shared` includes instructions available under all RISC-V platforms, + // i.e. RISC-V RV32 instructions. + pub use crate::core_arch::riscv_shared::*; + } + + /// Platform-specific intrinsics for the `wasm32` platform. + /// + /// This module provides intrinsics specific to the WebAssembly + /// architecture. Here you'll find intrinsics specific to WebAssembly that + /// aren't otherwise surfaced somewhere in a cross-platform abstraction of + /// `std`, and you'll also find functions for leveraging WebAssembly + /// proposals such as [atomics] and [simd]. + /// + /// Intrinsics in the `wasm32` module are modeled after the WebAssembly + /// instructions that they represent. Most functions are named after the + /// instruction they intend to correspond to, and the arguments/results + /// correspond to the type signature of the instruction itself. Stable + /// WebAssembly instructions are [documented online][instrdoc]. + /// + /// [instrdoc]: https://webassembly.github.io/spec/core/valid/instructions.html + /// + /// If a proposal is not yet stable in WebAssembly itself then the functions + /// within this function may be unstable and require the nightly channel of + /// Rust to use. As the proposal itself stabilizes the intrinsics in this + /// module should stabilize as well. + /// + /// [atomics]: https://github.com/webassembly/threads + /// [simd]: https://github.com/webassembly/simd + /// + /// See the [module documentation](../index.html) for general information + /// about the `arch` module and platform intrinsics. + /// + /// ## Atomics + /// + /// The [threads proposal][atomics] for WebAssembly adds a number of + /// instructions for dealing with multithreaded programs. Most instructions + /// added in the [atomics] proposal are exposed in Rust through the + /// `std::sync::atomic` module. Some instructions, however, don't have + /// direct equivalents in Rust so they're exposed here instead. 
+ /// + /// Note that the instructions added in the [atomics] proposal can work in + /// either a context with a shared wasm memory and without. These intrinsics + /// are always available in the standard library, but you likely won't be + /// able to use them too productively unless you recompile the standard + /// library (and all your code) with `-Ctarget-feature=+atomics`. + /// + /// It's also worth pointing out that multi-threaded WebAssembly and its + /// story in Rust is still in a somewhat "early days" phase as of the time + /// of this writing. Pieces should mostly work but it generally requires a + /// good deal of manual setup. At this time it's not as simple as "just call + /// `std::thread::spawn`", but it will hopefully get there one day! + /// + /// ## SIMD + /// + /// The [simd proposal][simd] for WebAssembly added a new `v128` type for a + /// 128-bit SIMD register. It also added a large array of instructions to + /// operate on the `v128` type to perform data processing. Using SIMD on + /// wasm is intended to be similar to as you would on `x86_64`, for example. + /// You'd write a function such as: + /// + /// ```rust,ignore + /// #[cfg(target_arch = "wasm32")] + /// #[target_feature(enable = "simd128")] + /// unsafe fn uses_simd() { + /// use std::arch::wasm32::*; + /// // ... + /// } + /// ``` + /// + /// Unlike `x86_64`, however, WebAssembly does not currently have dynamic + /// detection at runtime as to whether SIMD is supported (this is one of the + /// motivators for the [conditional sections][condsections] and [feature + /// detection] proposals, but that is still pretty early days). This means + /// that your binary will either have SIMD and can only run on engines + /// which support SIMD, or it will not have SIMD at all. For compatibility + /// the standard library itself does not use any SIMD internally. + /// Determining how best to ship your WebAssembly binary with SIMD is + /// largely left up to you as it can be pretty nuanced depending on + /// your situation. + /// + /// [condsections]: https://github.com/webassembly/conditional-sections + /// [feature detection]: https://github.com/WebAssembly/feature-detection + /// + /// To enable SIMD support at compile time you need to do one of two things: + /// + /// * First you can annotate functions with `#[target_feature(enable = + /// "simd128")]`. This causes just that one function to have SIMD support + /// available to it, and intrinsics will get inlined as usual in this + /// situation. + /// + /// * Second you can compile your program with `-Ctarget-feature=+simd128`. + /// This compilation flag blanket enables SIMD support for your entire + /// compilation. Note that this does not include the standard library + /// unless you [recompile the standard library][buildstd]. + /// + /// [buildstd]: https://doc.rust-lang.org/nightly/cargo/reference/unstable.html#build-std + /// + /// If you enable SIMD via either of these routes then you'll have a + /// WebAssembly binary that uses SIMD instructions, and you'll need to ship + /// that accordingly. Also note that if you call SIMD intrinsics but don't + /// enable SIMD via either of these mechanisms, you'll still have SIMD + /// generated in your program. This means to generate a binary without SIMD + /// you'll need to avoid both options above plus calling into any intrinsics + /// in this module. 
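Complementing the two routes described above, a minimal sketch assuming the whole crate is built with `-Ctarget-feature=+simd128`; the `i32x4*` names are the stable intrinsics re-exported below, while `dot4` itself is only illustrative.

```rust
#[cfg(target_arch = "wasm32")]
fn dot4(a: [i32; 4], b: [i32; 4]) -> i32 {
    // Assumes `-Ctarget-feature=+simd128` at build time, so no per-function
    // `#[target_feature]` attribute is needed here.
    use core::arch::wasm32::{i32x4, i32x4_extract_lane, i32x4_mul};
    let v = i32x4_mul(i32x4(a[0], a[1], a[2], a[3]), i32x4(b[0], b[1], b[2], b[3]));
    i32x4_extract_lane::<0>(v)
        + i32x4_extract_lane::<1>(v)
        + i32x4_extract_lane::<2>(v)
        + i32x4_extract_lane::<3>(v)
}
```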
+ #[cfg(any(target_arch = "wasm32", doc))] + #[doc(cfg(target_arch = "wasm32"))] + #[stable(feature = "simd_wasm32", since = "1.33.0")] + pub mod wasm32 { + #[stable(feature = "simd_wasm32", since = "1.33.0")] + pub use crate::core_arch::wasm32::*; + } + + /// Platform-specific intrinsics for the `wasm64` platform. + /// + /// See the [module documentation](../index.html) for more details. + #[cfg(any(target_arch = "wasm64", doc))] + #[doc(cfg(target_arch = "wasm64"))] + #[unstable(feature = "simd_wasm64", issue = "90599")] + pub mod wasm64 { + #[unstable(feature = "simd_wasm64", issue = "90599")] + pub use crate::core_arch::wasm32::*; + } + + /// Platform-specific intrinsics for the `wasm` target family. + /// + /// See the [module documentation](../index.html) for more details. + #[cfg(any(target_family = "wasm", doc))] + #[doc(cfg(target_family = "wasm"))] + #[unstable(feature = "simd_wasm64", issue = "90599")] + pub mod wasm { + #[unstable(feature = "simd_wasm64", issue = "90599")] + pub use crate::core_arch::wasm32::*; + } + + /// Platform-specific intrinsics for the `mips` platform. + /// + /// See the [module documentation](../index.html) for more details. + #[cfg(any(target_arch = "mips", doc))] + #[doc(cfg(target_arch = "mips"))] + #[unstable(feature = "stdarch_mips", issue = "111198")] + pub mod mips { + pub use crate::core_arch::mips::*; + } + + /// Platform-specific intrinsics for the `mips64` platform. + /// + /// See the [module documentation](../index.html) for more details. + #[cfg(any(target_arch = "mips64", doc))] + #[doc(cfg(target_arch = "mips64"))] + #[unstable(feature = "stdarch_mips", issue = "111198")] + pub mod mips64 { + pub use crate::core_arch::mips::*; + } + + /// Platform-specific intrinsics for the `PowerPC` platform. + /// + /// See the [module documentation](../index.html) for more details. + #[cfg(any(target_arch = "powerpc", doc))] + #[doc(cfg(target_arch = "powerpc"))] + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub mod powerpc { + pub use crate::core_arch::powerpc::*; + } + + /// Platform-specific intrinsics for the `PowerPC64` platform. + /// + /// See the [module documentation](../index.html) for more details. + #[cfg(any(target_arch = "powerpc64", doc))] + #[doc(cfg(target_arch = "powerpc64"))] + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub mod powerpc64 { + pub use crate::core_arch::powerpc64::*; + } + + /// Platform-specific intrinsics for the `NVPTX` platform. + /// + /// See the [module documentation](../index.html) for more details. + #[cfg(any(target_arch = "nvptx64", doc))] + #[doc(cfg(target_arch = "nvptx64"))] + #[unstable(feature = "stdarch_nvptx", issue = "111199")] + pub mod nvptx { + pub use crate::core_arch::nvptx::*; + } + + /// Platform-specific intrinsics for the `loongarch` platform. + /// + /// See the [module documentation](../index.html) for more details. + #[cfg(any(target_arch = "loongarch64", doc))] + #[doc(cfg(target_arch = "loongarch64"))] + #[unstable(feature = "stdarch_loongarch", issue = "117427")] + pub mod loongarch64 { + pub use crate::core_arch::loongarch64::*; + } + + /// Platform-specific intrinsics for the `s390x` platform. + /// + /// See the [module documentation](../index.html) for more details. 
+ #[cfg(any(target_arch = "s390x", doc))] + #[doc(cfg(target_arch = "s390x"))] + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub mod s390x { + pub use crate::core_arch::s390x::*; + } +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64", doc))] +#[doc(cfg(any(target_arch = "x86", target_arch = "x86_64")))] +mod x86; +#[cfg(any(target_arch = "x86_64", doc))] +#[doc(cfg(target_arch = "x86_64"))] +mod x86_64; + +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec", doc))] +#[doc(cfg(any(target_arch = "aarch64", target_arch = "arm64ec")))] +mod aarch64; +#[cfg(any(target_arch = "arm", doc))] +#[doc(cfg(any(target_arch = "arm")))] +mod arm; + +#[cfg(any(target_arch = "riscv32", doc))] +#[doc(cfg(any(target_arch = "riscv32")))] +mod riscv32; + +#[cfg(any(target_arch = "riscv64", doc))] +#[doc(cfg(any(target_arch = "riscv64")))] +mod riscv64; + +#[cfg(any(target_family = "wasm", doc))] +#[doc(cfg(target_family = "wasm"))] +mod wasm32; + +#[cfg(any(target_arch = "mips", target_arch = "mips64", doc))] +#[doc(cfg(any(target_arch = "mips", target_arch = "mips64")))] +mod mips; + +#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64", doc))] +#[doc(cfg(any(target_arch = "powerpc", target_arch = "powerpc64")))] +mod powerpc; + +#[cfg(any(target_arch = "powerpc64", doc))] +#[doc(cfg(target_arch = "powerpc64"))] +mod powerpc64; + +#[cfg(any(target_arch = "nvptx64", doc))] +#[doc(cfg(target_arch = "nvptx64"))] +mod nvptx; + +#[cfg(any(target_arch = "loongarch64", doc))] +#[doc(cfg(target_arch = "loongarch64"))] +mod loongarch64; + +#[cfg(any(target_arch = "s390x", doc))] +#[doc(cfg(target_arch = "s390x"))] +mod s390x; diff --git a/library/stdarch/crates/core_arch/src/nvptx/mod.rs b/library/stdarch/crates/core_arch/src/nvptx/mod.rs new file mode 100644 index 0000000000000..8d16dfb53d433 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/nvptx/mod.rs @@ -0,0 +1,236 @@ +//! NVPTX intrinsics (experimental) +//! +//! These intrinsics form the foundation of the CUDA +//! programming model. +//! +//! The reference is the [CUDA C Programming Guide][cuda_c]. Relevant is also +//! the [LLVM NVPTX Backend documentation][llvm_docs]. +//! +//! [cuda_c]: +//! http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html +//! [llvm_docs]: +//! https://llvm.org/docs/NVPTXUsage.html + +use crate::ffi::c_void; + +mod packed; + +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub use packed::*; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.nvvm.barrier0"] + fn syncthreads() -> (); + #[link_name = "llvm.nvvm.read.ptx.sreg.ntid.x"] + fn block_dim_x() -> i32; + #[link_name = "llvm.nvvm.read.ptx.sreg.ntid.y"] + fn block_dim_y() -> i32; + #[link_name = "llvm.nvvm.read.ptx.sreg.ntid.z"] + fn block_dim_z() -> i32; + #[link_name = "llvm.nvvm.read.ptx.sreg.ctaid.x"] + fn block_idx_x() -> i32; + #[link_name = "llvm.nvvm.read.ptx.sreg.ctaid.y"] + fn block_idx_y() -> i32; + #[link_name = "llvm.nvvm.read.ptx.sreg.ctaid.z"] + fn block_idx_z() -> i32; + #[link_name = "llvm.nvvm.read.ptx.sreg.nctaid.x"] + fn grid_dim_x() -> i32; + #[link_name = "llvm.nvvm.read.ptx.sreg.nctaid.y"] + fn grid_dim_y() -> i32; + #[link_name = "llvm.nvvm.read.ptx.sreg.nctaid.z"] + fn grid_dim_z() -> i32; + #[link_name = "llvm.nvvm.read.ptx.sreg.tid.x"] + fn thread_idx_x() -> i32; + #[link_name = "llvm.nvvm.read.ptx.sreg.tid.y"] + fn thread_idx_y() -> i32; + #[link_name = "llvm.nvvm.read.ptx.sreg.tid.z"] + fn thread_idx_z() -> i32; +} + +/// Synchronizes all threads in the block. 
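A minimal sketch of how kernel code typically combines the index wrappers defined below (nightly only, behind the unstable `stdarch_nvptx` feature); `global_thread_idx_x` is an illustrative name, not part of the module.

```rust
#[cfg(target_arch = "nvptx64")]
unsafe fn global_thread_idx_x() -> i32 {
    use core::arch::nvptx::{_block_dim_x, _block_idx_x, _thread_idx_x};
    // The CUDA-style global index: `blockIdx.x * blockDim.x + threadIdx.x`.
    _block_idx_x() * _block_dim_x() + _thread_idx_x()
}
```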
+#[inline] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn _syncthreads() -> () { + syncthreads() +} + +/// x-th thread-block dimension. +#[inline] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn _block_dim_x() -> i32 { + block_dim_x() +} + +/// y-th thread-block dimension. +#[inline] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn _block_dim_y() -> i32 { + block_dim_y() +} + +/// z-th thread-block dimension. +#[inline] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn _block_dim_z() -> i32 { + block_dim_z() +} + +/// x-th thread-block index. +#[inline] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn _block_idx_x() -> i32 { + block_idx_x() +} + +/// y-th thread-block index. +#[inline] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn _block_idx_y() -> i32 { + block_idx_y() +} + +/// z-th thread-block index. +#[inline] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn _block_idx_z() -> i32 { + block_idx_z() +} + +/// x-th block-grid dimension. +#[inline] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn _grid_dim_x() -> i32 { + grid_dim_x() +} + +/// y-th block-grid dimension. +#[inline] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn _grid_dim_y() -> i32 { + grid_dim_y() +} + +/// z-th block-grid dimension. +#[inline] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn _grid_dim_z() -> i32 { + grid_dim_z() +} + +/// x-th thread index. +#[inline] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn _thread_idx_x() -> i32 { + thread_idx_x() +} + +/// y-th thread index. +#[inline] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn _thread_idx_y() -> i32 { + thread_idx_y() +} + +/// z-th thread index. +#[inline] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn _thread_idx_z() -> i32 { + thread_idx_z() +} + +/// Generates the trap instruction `TRAP` +#[inline] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn trap() -> ! { + crate::intrinsics::abort() +} + +// Basic CUDA syscall declarations. +unsafe extern "C" { + /// Print formatted output from a kernel to a host-side output stream. + /// + /// Syscall arguments: + /// * `status`: The status value that is returned by `vprintf`. + /// * `format`: A pointer to the format specifier input (uses common `printf` format). + /// * `valist`: A pointer to the valist input. + /// + /// ``` + /// #[repr(C)] + /// struct PrintArgs(f32, f32, f32, i32); + /// + /// vprintf( + /// "int(%f + %f) = int(%f) = %d\n".as_ptr(), + /// transmute(&PrintArgs(a, b, a + b, (a + b) as i32)), + /// ); + /// ``` + /// + /// Sources: + /// [Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#formatted-output), + /// [PTX Interoperability](https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls). + #[unstable(feature = "stdarch_nvptx", issue = "111199")] + pub fn vprintf(format: *const u8, valist: *const c_void) -> i32; + + /// Allocate memory dynamically from a fixed-size heap in global memory. + /// + /// The CUDA in-kernel `malloc()` function allocates at least `size` bytes + /// from the device heap and returns a pointer to the allocated memory + /// or `NULL` if insufficient memory exists to fulfill the request. 
+ /// + /// The returned pointer is guaranteed to be aligned to a 16-byte boundary. + /// + /// The memory allocated by a given CUDA thread via `malloc()` remains allocated + /// for the lifetime of the CUDA context, or until it is explicitly released + /// by a call to `free()`. It can be used by any other CUDA threads + /// even from subsequent kernel launches. + /// + /// Sources: + /// [Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dynamic-global-memory-allocation-and-operations), + /// [PTX Interoperability](https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls). + // FIXME(denzp): assign `malloc` and `nothrow` attributes. + #[unstable(feature = "stdarch_nvptx", issue = "111199")] + pub fn malloc(size: usize) -> *mut c_void; + + /// Free previously dynamically allocated memory. + /// + /// The CUDA in-kernel `free()` function deallocates the memory pointed to by `ptr`, + /// which must have been returned by a previous call to `malloc()`. If `ptr` is NULL, + /// the call to `free()` is ignored. + /// + /// Any CUDA thread may free memory allocated by another thread, but care should be taken + /// to ensure that the same pointer is not freed more than once. Repeated calls to `free()` + /// with the same `ptr` has undefined behavior. + /// + /// Sources: + /// [Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dynamic-global-memory-allocation-and-operations), + /// [PTX Interoperability](https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls). + // FIXME(denzp): assign `nothrow` attribute. + #[unstable(feature = "stdarch_nvptx", issue = "111199")] + pub fn free(ptr: *mut c_void); + + // Internal declaration of the syscall. Exported variant has + // the `char_size` parameter set to `1` (single char size in bytes). + fn __assertfail( + message: *const u8, + file: *const u8, + line: u32, + function: *const u8, + char_size: usize, + ); +} + +/// Syscall to be used whenever the *assert expression produces a `false` value*. +/// +/// Syscall arguments: +/// * `message`: The pointer to the string that should be output. +/// * `file`: The pointer to the file name string associated with the assert. +/// * `line`: The line number associated with the assert. +/// * `function`: The pointer to the function name string associated with the assert. +/// +/// Source: +/// [PTX Interoperability](https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls). +#[inline] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn __assert_fail(message: *const u8, file: *const u8, line: u32, function: *const u8) { + __assertfail(message, file, line, function, 1) +} diff --git a/library/stdarch/crates/core_arch/src/nvptx/packed.rs b/library/stdarch/crates/core_arch/src/nvptx/packed.rs new file mode 100644 index 0000000000000..856aeea4b686c --- /dev/null +++ b/library/stdarch/crates/core_arch/src/nvptx/packed.rs @@ -0,0 +1,139 @@ +//! NVPTX Packed data types (SIMD) +//! +//! Packed Data Types is what PTX calls SIMD types. See [PTX ISA (Packed Data Types)](https://docs.nvidia.com/cuda/parallel-thread-execution/#packed-data-types) for a full reference. + +// Note: #[assert_instr] tests are not actually being run on nvptx due to being a `no_std` target incapable of running tests. Something like FileCheck would be appropriate for verifying the correct instruction is used. 
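A sketch under several assumptions: it needs nightly with the unstable `f16` primitive and the `stdarch_nvptx` feature, it builds the packed value by reinterpreting a same-sized `[f16; 2]` (not a constructor documented here), and `halves_fma` is an illustrative name; `f16x2_fma` is the intrinsic declared later in this file.

```rust
#[cfg(target_arch = "nvptx64")]
fn halves_fma(a: [f16; 2], b: [f16; 2], c: [f16; 2]) -> [f16; 2] {
    use core::arch::nvptx::{f16x2, f16x2_fma};
    use core::mem::transmute;
    // SAFETY: `[f16; 2]` and `f16x2` are both 32 bits wide; the transmutes only
    // reinterpret the bit pattern (an assumption of this sketch rather than a
    // guarantee documented by the crate).
    unsafe {
        let (va, vb, vc): (f16x2, f16x2, f16x2) = (transmute(a), transmute(b), transmute(c));
        transmute(f16x2_fma(va, vb, vc))
    }
}
```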
+ +use crate::intrinsics::simd::*; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.minimum.v2f16"] + fn llvm_f16x2_minimum(a: f16x2, b: f16x2) -> f16x2; + #[link_name = "llvm.maximum.v2f16"] + fn llvm_f16x2_maximum(a: f16x2, b: f16x2) -> f16x2; +} + +types! { + #![unstable(feature = "stdarch_nvptx", issue = "111199")] + + /// PTX-specific 32-bit wide floating point (f16 x 2) vector type + pub struct f16x2(2 x f16); + +} + +/// Add two values, round to nearest even +/// +/// +/// +/// Corresponds to the CUDA C intrinsics: +/// - [`__hadd2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g921c795176eaa31265bd80ef4fe4b8e6) +/// - [`__hadd2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g6cd8ddb2c3d670e1a10c3eb2e7644f82) +#[inline] +#[cfg_attr(test, assert_instr(add.rn.f16x22))] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn f16x2_add(a: f16x2, b: f16x2) -> f16x2 { + simd_add(a, b) +} + +/// Subtract two values, round to nearest even +/// +/// +/// +/// Corresponds to the CUDA C intrinsics: +/// - [`__hsub2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1ga5536c9c3d853d8c8b9de60e18b41e54) +/// - [`__hsub2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g8adc164c68d553354f749f0f0645a874) +#[inline] +#[cfg_attr(test, assert_instr(sub.rn.f16x2))] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn f16x2_sub(a: f16x2, b: f16x2) -> f16x2 { + simd_sub(a, b) +} + +/// Multiply two values, round to nearest even +/// +/// +/// +/// Corresponds to the CUDA C intrinsics: +/// - [`__hmul2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g70de3f2ee48babe4e0969397ac17708e) +/// - [`__hmul2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g99f8fe23a4b4c6898d6faf999afaa76e) +#[inline] +#[cfg_attr(test, assert_instr(mul.rn.f16x2))] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn f16x2_mul(a: f16x2, b: f16x2) -> f16x2 { + simd_mul(a, b) +} + +/// Fused multiply-add, round to nearest even +/// +/// +/// +/// Corresponds to the CUDA C intrinsics: +/// - [`__fma2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g43628ba21ded8b1e188a367348008dab) +/// - [`__fma2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g43628ba21ded8b1e188a367348008dab) +#[inline] +#[cfg_attr(test, assert_instr(fma.rn.f16x2))] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn f16x2_fma(a: f16x2, b: f16x2, c: f16x2) -> f16x2 { + simd_fma(a, b, c) +} + +/// Arithmetic negate +/// +/// +/// +/// Corresponds to the CUDA C intrinsic [`__hmin2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g9e17a33f96061804166f3fbd395422b6) +#[inline] +#[cfg_attr(test, assert_instr(neg.f16x2))] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn f16x2_neg(a: f16x2) -> f16x2 { + simd_neg(a) +} + +/// 
Find the minimum of two values +/// +/// +/// +/// Corresponds to the CUDA C intrinsic [`__hmin2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g9e17a33f96061804166f3fbd395422b6) +#[inline] +#[cfg_attr(test, assert_instr(min.f16x2))] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn f16x2_min(a: f16x2, b: f16x2) -> f16x2 { + simd_fmin(a, b) +} + +/// Find the minimum of two values, NaNs pass through. +/// +/// +/// +/// Corresponds to the CUDA C intrinsic [`__hmin2_nan`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g8bb8f58e9294cc261d2f42c4d5aecd6b) +#[inline] +#[cfg_attr(test, assert_instr(min.NaN.f16x2))] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn f16x2_min_nan(a: f16x2, b: f16x2) -> f16x2 { + llvm_f16x2_minimum(a, b) +} + +/// Find the maximum of two values +/// +/// +/// +/// Corresponds to the CUDA C intrinsic [`__hmax2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g59fc7fc7975d8127b202444a05e57e3d) +#[inline] +#[cfg_attr(test, assert_instr(max.f16x2))] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn f16x2_max(a: f16x2, b: f16x2) -> f16x2 { + simd_fmax(a, b) +} + +/// Find the maximum of two values, NaNs pass through. +/// +/// +/// +/// Corresponds to the CUDA C intrinsic [`__hmax2_nan`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g41623db7850e3074fd9daa80a14c3897) +#[inline] +#[cfg_attr(test, assert_instr(max.NaN.f16x2))] +#[unstable(feature = "stdarch_nvptx", issue = "111199")] +pub unsafe fn f16x2_max_nan(a: f16x2, b: f16x2) -> f16x2 { + llvm_f16x2_maximum(a, b) +} diff --git a/library/stdarch/crates/core_arch/src/powerpc/altivec.rs b/library/stdarch/crates/core_arch/src/powerpc/altivec.rs new file mode 100644 index 0000000000000..2deeb53c20995 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/powerpc/altivec.rs @@ -0,0 +1,6692 @@ +//! PowerPC AltiVec intrinsics. +//! +//! AltiVec is a brandname trademarked by Freescale (previously Motorola) for +//! the standard `Category:Vector` part of the Power ISA v.2.03 specification. +//! This Category is also known as VMX (used by IBM), and "Velocity Engine" (a +//! brand name previously used by Apple). +//! +//! The references are: [POWER ISA v2.07B (for POWER8 & POWER8 with NVIDIA +//! NVlink)] and [POWER ISA v3.0B (for POWER9)]. +//! +//! [POWER ISA v2.07B (for POWER8 & POWER8 with NVIDIA NVlink)]: https://ibm.box.com/s/jd5w15gz301s5b5dt375mshpq9c3lh4u +//! [POWER ISA v3.0B (for POWER9)]: https://ibm.box.com/s/1hzcwkwf8rbju5h9iyf44wm94amnlcrv + +#![allow(non_camel_case_types)] + +use crate::{core_arch::simd::*, intrinsics::simd::*, mem, mem::transmute}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +use super::macros::*; + +types! 
{ + #![unstable(feature = "stdarch_powerpc", issue = "111145")] + + /// PowerPC-specific 128-bit wide vector of sixteen packed `i8` + pub struct vector_signed_char(16 x i8); + /// PowerPC-specific 128-bit wide vector of sixteen packed `u8` + pub struct vector_unsigned_char(16 x u8); + + /// PowerPC-specific 128-bit wide vector mask of sixteen packed elements + pub struct vector_bool_char(16 x i8); + /// PowerPC-specific 128-bit wide vector of eight packed `i16` + pub struct vector_signed_short(8 x i16); + /// PowerPC-specific 128-bit wide vector of eight packed `u16` + pub struct vector_unsigned_short(8 x u16); + /// PowerPC-specific 128-bit wide vector mask of eight packed elements + pub struct vector_bool_short(8 x i16); + // pub struct vector_pixel(???); + /// PowerPC-specific 128-bit wide vector of four packed `i32` + pub struct vector_signed_int(4 x i32); + /// PowerPC-specific 128-bit wide vector of four packed `u32` + pub struct vector_unsigned_int(4 x u32); + /// PowerPC-specific 128-bit wide vector mask of four packed elements + pub struct vector_bool_int(4 x i32); + /// PowerPC-specific 128-bit wide vector of four packed `f32` + pub struct vector_float(4 x f32); +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.ppc.altivec.lvx"] + fn lvx(p: *const i8) -> vector_unsigned_int; + + #[link_name = "llvm.ppc.altivec.lvebx"] + fn lvebx(p: *const i8) -> vector_signed_char; + #[link_name = "llvm.ppc.altivec.lvehx"] + fn lvehx(p: *const i8) -> vector_signed_short; + #[link_name = "llvm.ppc.altivec.lvewx"] + fn lvewx(p: *const i8) -> vector_signed_int; + + #[link_name = "llvm.ppc.altivec.lvxl"] + fn lvxl(p: *const i8) -> vector_unsigned_int; + + #[link_name = "llvm.ppc.altivec.stvx"] + fn stvx(a: vector_signed_int, p: *const i8); + + #[link_name = "llvm.ppc.altivec.stvebx"] + fn stvebx(a: vector_signed_char, p: *const i8); + #[link_name = "llvm.ppc.altivec.stvehx"] + fn stvehx(a: vector_signed_short, p: *const i8); + #[link_name = "llvm.ppc.altivec.stvewx"] + fn stvewx(a: vector_signed_int, p: *const i8); + + #[link_name = "llvm.ppc.altivec.stvxl"] + fn stvxl(a: vector_signed_int, p: *const i8); + + #[link_name = "llvm.ppc.altivec.vperm"] + fn vperm( + a: vector_signed_int, + b: vector_signed_int, + c: vector_unsigned_char, + ) -> vector_signed_int; + #[link_name = "llvm.ppc.altivec.vmhaddshs"] + fn vmhaddshs( + a: vector_signed_short, + b: vector_signed_short, + c: vector_signed_short, + ) -> vector_signed_short; + #[link_name = "llvm.ppc.altivec.vmhraddshs"] + fn vmhraddshs( + a: vector_signed_short, + b: vector_signed_short, + c: vector_signed_short, + ) -> vector_signed_short; + #[link_name = "llvm.ppc.altivec.vmsumuhs"] + fn vmsumuhs( + a: vector_unsigned_short, + b: vector_unsigned_short, + c: vector_unsigned_int, + ) -> vector_unsigned_int; + #[link_name = "llvm.ppc.altivec.vmsumshs"] + fn vmsumshs( + a: vector_signed_short, + b: vector_signed_short, + c: vector_signed_int, + ) -> vector_signed_int; + #[link_name = "llvm.ppc.altivec.vmsumubm"] + fn vmsumubm( + a: vector_unsigned_char, + b: vector_unsigned_char, + c: vector_unsigned_int, + ) -> vector_unsigned_int; + #[link_name = "llvm.ppc.altivec.vmsummbm"] + fn vmsummbm( + a: vector_signed_char, + b: vector_unsigned_char, + c: vector_signed_int, + ) -> vector_signed_int; + #[link_name = "llvm.ppc.altivec.vmsumuhm"] + fn vmsumuhm( + a: vector_unsigned_short, + b: vector_unsigned_short, + c: vector_unsigned_int, + ) -> vector_unsigned_int; + #[link_name = "llvm.ppc.altivec.vmsumshm"] + fn vmsumshm( + a: 
vector_signed_short, + b: vector_signed_short, + c: vector_signed_int, + ) -> vector_signed_int; + #[link_name = "llvm.ppc.altivec.vnmsubfp"] + fn vnmsubfp(a: vector_float, b: vector_float, c: vector_float) -> vector_float; + #[link_name = "llvm.ppc.altivec.vsum2sws"] + fn vsum2sws(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int; + #[link_name = "llvm.ppc.altivec.vsum4ubs"] + fn vsum4ubs(a: vector_unsigned_char, b: vector_unsigned_int) -> vector_unsigned_int; + #[link_name = "llvm.ppc.altivec.vsum4sbs"] + fn vsum4sbs(a: vector_signed_char, b: vector_signed_int) -> vector_signed_int; + #[link_name = "llvm.ppc.altivec.vsum4shs"] + fn vsum4shs(a: vector_signed_short, b: vector_signed_int) -> vector_signed_int; + #[link_name = "llvm.ppc.altivec.vmuleub"] + fn vmuleub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_short; + #[link_name = "llvm.ppc.altivec.vmulesb"] + fn vmulesb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short; + #[link_name = "llvm.ppc.altivec.vmuleuh"] + fn vmuleuh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_int; + #[link_name = "llvm.ppc.altivec.vmulesh"] + fn vmulesh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int; + #[link_name = "llvm.ppc.altivec.vmuloub"] + fn vmuloub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_short; + #[link_name = "llvm.ppc.altivec.vmulosb"] + fn vmulosb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short; + #[link_name = "llvm.ppc.altivec.vmulouh"] + fn vmulouh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_int; + #[link_name = "llvm.ppc.altivec.vmulosh"] + fn vmulosh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int; + + #[link_name = "llvm.smax.v16i8"] + fn vmaxsb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char; + #[link_name = "llvm.smax.v8i16"] + fn vmaxsh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short; + #[link_name = "llvm.smax.v4i32"] + fn vmaxsw(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int; + + #[link_name = "llvm.umax.v16i8"] + fn vmaxub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char; + #[link_name = "llvm.umax.v8i16"] + fn vmaxuh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.umax.v4i32"] + fn vmaxuw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int; + + #[link_name = "llvm.smin.v16i8"] + fn vminsb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char; + #[link_name = "llvm.smin.v8i16"] + fn vminsh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short; + #[link_name = "llvm.smin.v4i32"] + fn vminsw(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int; + + #[link_name = "llvm.umin.v16i8"] + fn vminub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char; + #[link_name = "llvm.umin.v8i16"] + fn vminuh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.umin.v4i32"] + fn vminuw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int; + + #[link_name = "llvm.ppc.altivec.vsubsbs"] + fn vsubsbs(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char; + #[link_name = "llvm.ppc.altivec.vsubshs"] + fn vsubshs(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short; + #[link_name = "llvm.ppc.altivec.vsubsws"] + fn vsubsws(a: vector_signed_int, b: 
vector_signed_int) -> vector_signed_int; + + #[link_name = "llvm.ppc.altivec.vsububs"] + fn vsububs(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char; + #[link_name = "llvm.ppc.altivec.vsubuhs"] + fn vsubuhs(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.ppc.altivec.vsubuws"] + fn vsubuws(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int; + + #[link_name = "llvm.ppc.altivec.vsubcuw"] + fn vsubcuw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int; + + #[link_name = "llvm.ppc.altivec.vaddcuw"] + fn vaddcuw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int; + + #[link_name = "llvm.ppc.altivec.vaddsbs"] + fn vaddsbs(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char; + #[link_name = "llvm.ppc.altivec.vaddshs"] + fn vaddshs(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short; + #[link_name = "llvm.ppc.altivec.vaddsws"] + fn vaddsws(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int; + + #[link_name = "llvm.ppc.altivec.vaddubs"] + fn vaddubs(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char; + #[link_name = "llvm.ppc.altivec.vadduhs"] + fn vadduhs(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.ppc.altivec.vadduws"] + fn vadduws(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int; + + #[link_name = "llvm.ppc.altivec.vavgsb"] + fn vavgsb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char; + #[link_name = "llvm.ppc.altivec.vavgsh"] + fn vavgsh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short; + #[link_name = "llvm.ppc.altivec.vavgsw"] + fn vavgsw(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int; + + #[link_name = "llvm.ppc.altivec.vavgub"] + fn vavgub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char; + #[link_name = "llvm.ppc.altivec.vavguh"] + fn vavguh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.ppc.altivec.vavguw"] + fn vavguw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int; + + #[link_name = "llvm.ppc.altivec.vcmpbfp"] + fn vcmpbfp(a: vector_float, b: vector_float) -> vector_signed_int; + + #[link_name = "llvm.ppc.altivec.vcmpequb"] + fn vcmpequb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_bool_char; + #[link_name = "llvm.ppc.altivec.vcmpequh"] + fn vcmpequh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_bool_short; + #[link_name = "llvm.ppc.altivec.vcmpequw"] + fn vcmpequw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_bool_int; + + #[link_name = "llvm.ppc.altivec.vcmpneb"] + fn vcmpneb(a: vector_signed_char, b: vector_signed_char) -> vector_bool_char; + #[link_name = "llvm.ppc.altivec.vcmpneh"] + fn vcmpneh(a: vector_signed_short, b: vector_signed_short) -> vector_bool_short; + #[link_name = "llvm.ppc.altivec.vcmpnew"] + fn vcmpnew(a: vector_signed_int, b: vector_signed_int) -> vector_bool_int; + + #[link_name = "llvm.ppc.altivec.vcmpgefp"] + fn vcmpgefp(a: vector_float, b: vector_float) -> vector_bool_int; + + #[link_name = "llvm.ppc.altivec.vcmpgtub"] + fn vcmpgtub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_bool_char; + #[link_name = "llvm.ppc.altivec.vcmpgtuh"] + fn vcmpgtuh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_bool_short; + #[link_name = "llvm.ppc.altivec.vcmpgtuw"] + 
fn vcmpgtuw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_bool_int; + + #[link_name = "llvm.ppc.altivec.vcmpgtsb"] + fn vcmpgtsb(a: vector_signed_char, b: vector_signed_char) -> vector_bool_char; + #[link_name = "llvm.ppc.altivec.vcmpgtsh"] + fn vcmpgtsh(a: vector_signed_short, b: vector_signed_short) -> vector_bool_short; + #[link_name = "llvm.ppc.altivec.vcmpgtsw"] + fn vcmpgtsw(a: vector_signed_int, b: vector_signed_int) -> vector_bool_int; + + #[link_name = "llvm.ppc.altivec.vexptefp"] + fn vexptefp(a: vector_float) -> vector_float; + + #[link_name = "llvm.ppc.altivec.vcmpequb.p"] + fn vcmpequb_p(cr: i32, a: vector_unsigned_char, b: vector_unsigned_char) -> i32; + #[link_name = "llvm.ppc.altivec.vcmpequh.p"] + fn vcmpequh_p(cr: i32, a: vector_unsigned_short, b: vector_unsigned_short) -> i32; + #[link_name = "llvm.ppc.altivec.vcmpequw.p"] + fn vcmpequw_p(cr: i32, a: vector_unsigned_int, b: vector_unsigned_int) -> i32; + + #[link_name = "llvm.ppc.altivec.vcmpeqfp.p"] + fn vcmpeqfp_p(cr: i32, a: vector_float, b: vector_float) -> i32; + + #[link_name = "llvm.ppc.altivec.vcmpgtub.p"] + fn vcmpgtub_p(cr: i32, a: vector_unsigned_char, b: vector_unsigned_char) -> i32; + #[link_name = "llvm.ppc.altivec.vcmpgtuh.p"] + fn vcmpgtuh_p(cr: i32, a: vector_unsigned_short, b: vector_unsigned_short) -> i32; + #[link_name = "llvm.ppc.altivec.vcmpgtuw.p"] + fn vcmpgtuw_p(cr: i32, a: vector_unsigned_int, b: vector_unsigned_int) -> i32; + #[link_name = "llvm.ppc.altivec.vcmpgtsb.p"] + fn vcmpgtsb_p(cr: i32, a: vector_signed_char, b: vector_signed_char) -> i32; + #[link_name = "llvm.ppc.altivec.vcmpgtsh.p"] + fn vcmpgtsh_p(cr: i32, a: vector_signed_short, b: vector_signed_short) -> i32; + #[link_name = "llvm.ppc.altivec.vcmpgtsw.p"] + fn vcmpgtsw_p(cr: i32, a: vector_signed_int, b: vector_signed_int) -> i32; + + #[link_name = "llvm.ppc.altivec.vcmpgefp.p"] + fn vcmpgefp_p(cr: i32, a: vector_float, b: vector_float) -> i32; + #[link_name = "llvm.ppc.altivec.vcmpgtfp.p"] + fn vcmpgtfp_p(cr: i32, a: vector_float, b: vector_float) -> i32; + #[link_name = "llvm.ppc.altivec.vcmpbfp.p"] + fn vcmpbfp_p(cr: i32, a: vector_float, b: vector_float) -> i32; + + #[link_name = "llvm.ppc.altivec.vcfsx"] + fn vcfsx(a: vector_signed_int, b: i32) -> vector_float; + #[link_name = "llvm.ppc.altivec.vcfux"] + fn vcfux(a: vector_unsigned_int, b: i32) -> vector_float; + + #[link_name = "llvm.ppc.altivec.vctsxs"] + fn vctsxs(a: vector_float, b: i32) -> vector_signed_int; + #[link_name = "llvm.ppc.altivec.vctuxs"] + fn vctuxs(a: vector_float, b: i32) -> vector_unsigned_int; + + #[link_name = "llvm.ppc.altivec.vpkshss"] + fn vpkshss(a: vector_signed_short, b: vector_signed_short) -> vector_signed_char; + #[link_name = "llvm.ppc.altivec.vpkshus"] + fn vpkshus(a: vector_signed_short, b: vector_signed_short) -> vector_unsigned_char; + #[link_name = "llvm.ppc.altivec.vpkuhus"] + fn vpkuhus(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_char; + #[link_name = "llvm.ppc.altivec.vpkswss"] + fn vpkswss(a: vector_signed_int, b: vector_signed_int) -> vector_signed_short; + #[link_name = "llvm.ppc.altivec.vpkswus"] + fn vpkswus(a: vector_signed_int, b: vector_signed_int) -> vector_unsigned_short; + #[link_name = "llvm.ppc.altivec.vpkuwus"] + fn vpkuwus(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_short; + + #[link_name = "llvm.ppc.altivec.vupkhsb"] + fn vupkhsb(a: vector_signed_char) -> vector_signed_short; + #[link_name = "llvm.ppc.altivec.vupklsb"] + fn vupklsb(a: vector_signed_char) -> 
vector_signed_short; + + #[link_name = "llvm.ppc.altivec.vupkhsh"] + fn vupkhsh(a: vector_signed_short) -> vector_signed_int; + #[link_name = "llvm.ppc.altivec.vupklsh"] + fn vupklsh(a: vector_signed_short) -> vector_signed_int; + + #[link_name = "llvm.ppc.altivec.mfvscr"] + fn mfvscr() -> vector_unsigned_short; + + #[link_name = "llvm.ppc.altivec.vlogefp"] + fn vlogefp(a: vector_float) -> vector_float; + + #[link_name = "llvm.ppc.altivec.vsl"] + fn vsl(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int; + #[link_name = "llvm.ppc.altivec.vslo"] + fn vslo(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int; + + #[link_name = "llvm.ppc.altivec.vsrab"] + fn vsrab(a: vector_signed_char, b: vector_unsigned_char) -> vector_signed_char; + #[link_name = "llvm.ppc.altivec.vsrah"] + fn vsrah(a: vector_signed_short, b: vector_unsigned_short) -> vector_signed_short; + #[link_name = "llvm.ppc.altivec.vsraw"] + fn vsraw(a: vector_signed_int, b: vector_unsigned_int) -> vector_signed_int; + + #[link_name = "llvm.ppc.altivec.vsr"] + fn vsr(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int; + #[link_name = "llvm.ppc.altivec.vsro"] + fn vsro(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int; + + #[link_name = "llvm.ppc.altivec.vslv"] + fn vslv(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char; + #[link_name = "llvm.ppc.altivec.vsrv"] + fn vsrv(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char; + + #[link_name = "llvm.fshl.v16i8"] + fn fshlb( + a: vector_unsigned_char, + b: vector_unsigned_char, + c: vector_unsigned_char, + ) -> vector_unsigned_char; + #[link_name = "llvm.fshl.v8i16"] + fn fshlh( + a: vector_unsigned_short, + b: vector_unsigned_short, + c: vector_unsigned_short, + ) -> vector_unsigned_short; + #[link_name = "llvm.fshl.v4i32"] + fn fshlw( + a: vector_unsigned_int, + b: vector_unsigned_int, + c: vector_unsigned_int, + ) -> vector_unsigned_int; + + #[link_name = "llvm.nearbyint.v4f32"] + fn vrfin(a: vector_float) -> vector_float; +} + +impl_from! { i8x16, u8x16, i16x8, u16x8, i32x4, u32x4, f32x4 } + +impl_neg! { i8x16 : 0 } +impl_neg! { i16x8 : 0 } +impl_neg! { i32x4 : 0 } +impl_neg! { f32x4 : 0f32 } + +#[macro_use] +mod sealed { + use super::*; + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorInsert { + type Scalar; + unsafe fn vec_insert(self, s: Self::Scalar) -> Self; + } + + const fn idx_in_vec() -> u32 { + IDX & (16 / crate::mem::size_of::() as u32) + } + + macro_rules! impl_vec_insert { + ($ty:ident) => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorInsert for t_t_l!($ty) { + type Scalar = $ty; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_insert(self, s: Self::Scalar) -> Self { + simd_insert(self, const { idx_in_vec::() }, s) + } + } + }; + } + + impl_vec_insert! { i8 } + impl_vec_insert! { u8 } + impl_vec_insert! { i16 } + impl_vec_insert! { u16 } + impl_vec_insert! { i32 } + impl_vec_insert! { u32 } + impl_vec_insert! { f32 } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorExtract { + type Scalar; + unsafe fn vec_extract(self) -> Self::Scalar; + } + + macro_rules! 
impl_vec_extract { + ($ty:ident) => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorExtract for t_t_l!($ty) { + type Scalar = $ty; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_extract(self) -> Self::Scalar { + simd_extract(self, const { idx_in_vec::() }) + } + } + }; + } + + impl_vec_extract! { i8 } + impl_vec_extract! { u8 } + impl_vec_extract! { i16 } + impl_vec_extract! { u16 } + impl_vec_extract! { i32 } + impl_vec_extract! { u32 } + impl_vec_extract! { f32 } + + macro_rules! impl_vec_cmp { + ([$Trait:ident $m:ident] ($b:ident, $h:ident, $w:ident)) => { + impl_vec_cmp! { [$Trait $m] ($b, $b, $h, $h, $w, $w) } + }; + ([$Trait:ident $m:ident] ($ub:ident, $sb:ident, $uh:ident, $sh:ident, $uw:ident, $sw:ident)) => { + impl_vec_trait!{ [$Trait $m] $ub (vector_unsigned_char, vector_unsigned_char) -> vector_bool_char } + impl_vec_trait!{ [$Trait $m] $sb (vector_signed_char, vector_signed_char) -> vector_bool_char } + impl_vec_trait!{ [$Trait $m] $uh (vector_unsigned_short, vector_unsigned_short) -> vector_bool_short } + impl_vec_trait!{ [$Trait $m] $sh (vector_signed_short, vector_signed_short) -> vector_bool_short } + impl_vec_trait!{ [$Trait $m] $uw (vector_unsigned_int, vector_unsigned_int) -> vector_bool_int } + impl_vec_trait!{ [$Trait $m] $sw (vector_signed_int, vector_signed_int) -> vector_bool_int } + } + } + + macro_rules! impl_vec_any_all { + ([$Trait:ident $m:ident] ($b:ident, $h:ident, $w:ident)) => { + impl_vec_any_all! { [$Trait $m] ($b, $b, $h, $h, $w, $w) } + }; + ([$Trait:ident $m:ident] ($ub:ident, $sb:ident, $uh:ident, $sh:ident, $uw:ident, $sw:ident)) => { + impl_vec_trait!{ [$Trait $m] $ub (vector_unsigned_char, vector_unsigned_char) -> bool } + impl_vec_trait!{ [$Trait $m] $sb (vector_signed_char, vector_signed_char) -> bool } + impl_vec_trait!{ [$Trait $m] $uh (vector_unsigned_short, vector_unsigned_short) -> bool } + impl_vec_trait!{ [$Trait $m] $sh (vector_signed_short, vector_signed_short) -> bool } + impl_vec_trait!{ [$Trait $m] $uw (vector_unsigned_int, vector_unsigned_int) -> bool } + impl_vec_trait!{ [$Trait $m] $sw (vector_signed_int, vector_signed_int) -> bool } + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorLd { + type Result; + unsafe fn vec_ld(self, off: isize) -> Self::Result; + unsafe fn vec_ldl(self, off: isize) -> Self::Result; + } + + macro_rules! impl_vec_ld { + ($fun:ident $fun_lru:ident $ty:ident) => { + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(lvx))] + pub unsafe fn $fun(off: isize, p: *const $ty) -> t_t_l!($ty) { + let addr = (p as *const i8).offset(off); + transmute(lvx(addr)) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(lvxl))] + pub unsafe fn $fun_lru(off: isize, p: *const $ty) -> t_t_l!($ty) { + let addr = (p as *const i8).offset(off); + transmute(lvxl(addr)) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorLd for *const $ty { + type Result = t_t_l!($ty); + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_ld(self, off: isize) -> Self::Result { + $fun(off, self) + } + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_ldl(self, off: isize) -> Self::Result { + $fun_lru(off, self) + } + } + }; + } + + impl_vec_ld! { vec_ld_u8 vec_ldl_u8 u8 } + impl_vec_ld! { vec_ld_i8 vec_ldl_i8 i8 } + + impl_vec_ld! { vec_ld_u16 vec_ldl_u16 u16 } + impl_vec_ld! { vec_ld_i16 vec_ldl_i16 i16 } + + impl_vec_ld! 
{ vec_ld_u32 vec_ldl_u32 u32 } + impl_vec_ld! { vec_ld_i32 vec_ldl_i32 i32 } + + impl_vec_ld! { vec_ld_f32 vec_ldl_f32 f32 } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorLde { + type Result; + unsafe fn vec_lde(self, a: isize) -> Self::Result; + } + + macro_rules! impl_vec_lde { + ($fun:ident $instr:ident $ty:ident) => { + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr($instr))] + pub unsafe fn $fun(a: isize, b: *const $ty) -> t_t_l!($ty) { + let addr = b.byte_offset(a).cast::(); + transmute($instr(addr)) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorLde for *const $ty { + type Result = t_t_l!($ty); + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_lde(self, a: isize) -> Self::Result { + $fun(a, self) + } + } + }; + } + + impl_vec_lde! { vec_lde_u8 lvebx u8 } + impl_vec_lde! { vec_lde_i8 lvebx i8 } + + impl_vec_lde! { vec_lde_u16 lvehx u16 } + impl_vec_lde! { vec_lde_i16 lvehx i16 } + + impl_vec_lde! { vec_lde_u32 lvewx u32 } + impl_vec_lde! { vec_lde_i32 lvewx i32 } + + impl_vec_lde! { vec_lde_f32 lvewx f32 } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSt { + type Target; + unsafe fn vec_st(self, off: isize, p: Self::Target); + unsafe fn vec_stl(self, off: isize, p: Self::Target); + } + + macro_rules! impl_vec_st { + ($fun:ident $fun_lru:ident $ty:ident) => { + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(stvx))] + pub unsafe fn $fun(a: t_t_l!($ty), off: isize, p: *const $ty) { + let addr = (p as *const i8).offset(off); + stvx(transmute(a), addr) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(stvxl))] + pub unsafe fn $fun_lru(a: t_t_l!($ty), off: isize, p: *const $ty) { + let addr = (p as *const i8).offset(off as isize); + stvxl(transmute(a), addr) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorSt for t_t_l!($ty) { + type Target = *const $ty; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_st(self, off: isize, p: Self::Target) { + $fun(self, off, p) + } + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_stl(self, off: isize, p: Self::Target) { + $fun(self, off, p) + } + } + }; + } + + impl_vec_st! { vec_st_u8 vec_stl_u8 u8 } + impl_vec_st! { vec_st_i8 vec_stl_i8 i8 } + + impl_vec_st! { vec_st_u16 vec_stl_u16 u16 } + impl_vec_st! { vec_st_i16 vec_stl_i16 i16 } + + impl_vec_st! { vec_st_u32 vec_stl_u32 u32 } + impl_vec_st! { vec_st_i32 vec_stl_i32 i32 } + + impl_vec_st! { vec_st_f32 vec_stl_f32 f32 } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSte { + type Target; + unsafe fn vec_ste(self, off: isize, p: Self::Target); + } + + macro_rules! impl_vec_ste { + ($fun:ident $instr:ident $ty:ident) => { + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr($instr))] + pub unsafe fn $fun(a: t_t_l!($ty), off: isize, p: *const $ty) { + let addr = (p as *const i8).offset(off); + $instr(transmute(a), addr) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorSte for t_t_l!($ty) { + type Target = *const $ty; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_ste(self, off: isize, p: Self::Target) { + $fun(self, off, p) + } + } + }; + } + + impl_vec_ste! { vec_ste_u8 stvebx u8 } + impl_vec_ste! { vec_ste_i8 stvebx i8 } + + impl_vec_ste! { vec_ste_u16 stvehx u16 } + impl_vec_ste! 
{ vec_ste_i16 stvehx i16 } + + impl_vec_ste! { vec_ste_u32 stvewx u32 } + impl_vec_ste! { vec_ste_i32 stvewx i32 } + + impl_vec_ste! { vec_ste_f32 stvewx f32 } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorXl { + type Result; + unsafe fn vec_xl(self, a: isize) -> Self::Result; + } + + macro_rules! impl_vec_xl { + ($fun:ident $notpwr9:ident / $pwr9:ident $ty:ident) => { + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr( + all(test, not(target_feature = "power9-altivec")), + assert_instr($notpwr9) + )] + #[cfg_attr(all(test, target_feature = "power9-altivec"), assert_instr($pwr9))] + pub unsafe fn $fun(a: isize, b: *const $ty) -> t_t_l!($ty) { + let addr = (b as *const u8).offset(a); + + let mut r = mem::MaybeUninit::uninit(); + + crate::ptr::copy_nonoverlapping( + addr, + r.as_mut_ptr() as *mut u8, + mem::size_of::(), + ); + + r.assume_init() + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorXl for *const $ty { + type Result = t_t_l!($ty); + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_xl(self, a: isize) -> Self::Result { + $fun(a, self) + } + } + }; + } + + impl_vec_xl! { vec_xl_i8 lxvd2x / lxv i8 } + impl_vec_xl! { vec_xl_u8 lxvd2x / lxv u8 } + impl_vec_xl! { vec_xl_i16 lxvd2x / lxv i16 } + impl_vec_xl! { vec_xl_u16 lxvd2x / lxv u16 } + impl_vec_xl! { vec_xl_i32 lxvd2x / lxv i32 } + impl_vec_xl! { vec_xl_u32 lxvd2x / lxv u32 } + impl_vec_xl! { vec_xl_f32 lxvd2x / lxv f32 } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorXst { + type Out; + unsafe fn vec_xst(self, a: isize, p: Self::Out); + } + + macro_rules! impl_vec_xst { + ($fun:ident $notpwr9:ident / $pwr9:ident $ty:ident) => { + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr( + all(test, not(target_feature = "power9-altivec")), + assert_instr($notpwr9) + )] + #[cfg_attr(all(test, target_feature = "power9-altivec"), assert_instr($pwr9))] + pub unsafe fn $fun(s: t_t_l!($ty), a: isize, b: *mut $ty) { + let addr = (b as *mut u8).offset(a); + + crate::ptr::copy_nonoverlapping( + &s as *const _ as *const u8, + addr, + mem::size_of::(), + ); + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorXst for t_t_l!($ty) { + type Out = *mut $ty; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_xst(self, a: isize, b: Self::Out) { + $fun(self, a, b) + } + } + }; + } + + impl_vec_xst! { vec_xst_i8 stxvd2x / stxv i8 } + impl_vec_xst! { vec_xst_u8 stxvd2x / stxv u8 } + impl_vec_xst! { vec_xst_i16 stxvd2x / stxv i16 } + impl_vec_xst! { vec_xst_u16 stxvd2x / stxv u16 } + impl_vec_xst! { vec_xst_i32 stxvd2x / stxv i32 } + impl_vec_xst! { vec_xst_u32 stxvd2x / stxv u32 } + impl_vec_xst! { vec_xst_f32 stxvd2x / stxv f32 } + + test_impl! { vec_floor(a: vector_float) -> vector_float [ simd_floor, vrfim / xvrspim ] } + + test_impl! { vec_vexptefp(a: vector_float) -> vector_float [ vexptefp, vexptefp ] } + + test_impl! { vec_vcmpgtub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_bool_char [ vcmpgtub, vcmpgtub ] } + test_impl! { vec_vcmpgtuh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_bool_short [ vcmpgtuh, vcmpgtuh ] } + test_impl! { vec_vcmpgtuw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_bool_int [ vcmpgtuw, vcmpgtuw ] } + + test_impl! { vec_vcmpgtsb(a: vector_signed_char, b: vector_signed_char) -> vector_bool_char [ vcmpgtsb, vcmpgtsb ] } + test_impl! 
{ vec_vcmpgtsh(a: vector_signed_short, b: vector_signed_short) -> vector_bool_short [ vcmpgtsh, vcmpgtsh ] } + test_impl! { vec_vcmpgtsw(a: vector_signed_int, b: vector_signed_int) -> vector_bool_int [ vcmpgtsw, vcmpgtsw ] } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorCmpGt { + type Result; + unsafe fn vec_cmpgt(self, b: Other) -> Self::Result; + } + + impl_vec_cmp! { [VectorCmpGt vec_cmpgt] ( vec_vcmpgtub, vec_vcmpgtsb, vec_vcmpgtuh, vec_vcmpgtsh, vec_vcmpgtuw, vec_vcmpgtsw ) } + + test_impl! { vec_vcmpgefp(a: vector_float, b: vector_float) -> vector_bool_int [ vcmpgefp, vcmpgefp ] } + + test_impl! { vec_vcmpequb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_bool_char [ vcmpequb, vcmpequb ] } + test_impl! { vec_vcmpequh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_bool_short [ vcmpequh, vcmpequh ] } + test_impl! { vec_vcmpequw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_bool_int [ vcmpequw, vcmpequw ] } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorCmpEq { + type Result; + unsafe fn vec_cmpeq(self, b: Other) -> Self::Result; + } + + impl_vec_cmp! { [VectorCmpEq vec_cmpeq] (vec_vcmpequb, vec_vcmpequh, vec_vcmpequw) } + + macro_rules! impl_cmpne { + ($fun:ident ($ty:ident) -> $r:ident $([ $pwr9:ident ])? ) => { + #[inline] + #[target_feature(enable = "altivec")] + $( #[cfg_attr(all(test, target_feature = "power9-altivec"), assert_instr($pwr9))] )? + unsafe fn $fun(a: $ty, b: $ty) -> $r { + $( if cfg!(target_feature = "power9-altivec") { + transmute($pwr9(transmute(a), transmute(b))) + } else )? { + let zero = transmute(i32x4::new(0, 0, 0, 0)); + vec_nor(vec_cmpeq(a, b), zero) + } + } + }; + } + + impl_cmpne! { vec_vcmpneb(vector_signed_char) -> vector_bool_char [ vcmpneb ] } + impl_cmpne! { vec_vcmpneh(vector_signed_short) -> vector_bool_short [ vcmpneh ] } + impl_cmpne! { vec_vcmpnew(vector_signed_int) -> vector_bool_int [ vcmpnew ] } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorCmpNe { + type Result; + unsafe fn vec_cmpne(self, b: Other) -> Self::Result; + } + + impl_vec_cmp! { [VectorCmpNe vec_cmpne] (vec_vcmpneb, vec_vcmpneh, vec_vcmpnew) } + + test_impl! 
{ vec_vcmpbfp(a: vector_float, b: vector_float) -> vector_signed_int [vcmpbfp, vcmpbfp] } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpequb.))] + unsafe fn vcmpequb_all(a: vector_unsigned_char, b: vector_unsigned_char) -> bool { + vcmpequb_p(2, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpequb.))] + unsafe fn vcmpequb_any(a: vector_unsigned_char, b: vector_unsigned_char) -> bool { + vcmpequb_p(1, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpequh.))] + unsafe fn vcmpequh_all(a: vector_unsigned_short, b: vector_unsigned_short) -> bool { + vcmpequh_p(2, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpequh.))] + unsafe fn vcmpequh_any(a: vector_unsigned_short, b: vector_unsigned_short) -> bool { + vcmpequh_p(1, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpequw.))] + unsafe fn vcmpequw_all(a: vector_unsigned_int, b: vector_unsigned_int) -> bool { + vcmpequw_p(2, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpequw.))] + unsafe fn vcmpequw_any(a: vector_unsigned_int, b: vector_unsigned_int) -> bool { + vcmpequw_p(1, a, b) != 0 + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAllEq { + type Result; + unsafe fn vec_all_eq(self, b: Other) -> Self::Result; + } + + impl_vec_any_all! { [VectorAllEq vec_all_eq] (vcmpequb_all, vcmpequh_all, vcmpequw_all) } + + // TODO: vsx encoding + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpeqfp.))] + unsafe fn vcmpeqfp_all(a: vector_float, b: vector_float) -> bool { + vcmpeqfp_p(2, a, b) != 0 + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAllEq for vector_float { + type Result = bool; + #[inline] + unsafe fn vec_all_eq(self, b: vector_float) -> Self::Result { + vcmpeqfp_all(self, b) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAnyEq { + type Result; + unsafe fn vec_any_eq(self, b: Other) -> Self::Result; + } + + impl_vec_any_all! 
{ [VectorAnyEq vec_any_eq] (vcmpequb_any, vcmpequh_any, vcmpequw_any) } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpeqfp.))] + unsafe fn vcmpeqfp_any(a: vector_float, b: vector_float) -> bool { + vcmpeqfp_p(1, a, b) != 0 + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAnyEq for vector_float { + type Result = bool; + #[inline] + unsafe fn vec_any_eq(self, b: vector_float) -> Self::Result { + vcmpeqfp_any(self, b) + } + } + + // All/Any GreaterEqual + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtsb.))] + unsafe fn vcmpgesb_all(a: vector_signed_char, b: vector_signed_char) -> bool { + vcmpgtsb_p(0, b, a) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtsb.))] + unsafe fn vcmpgesb_any(a: vector_signed_char, b: vector_signed_char) -> bool { + vcmpgtsb_p(3, b, a) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtsh.))] + unsafe fn vcmpgesh_all(a: vector_signed_short, b: vector_signed_short) -> bool { + vcmpgtsh_p(0, b, a) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtsh.))] + unsafe fn vcmpgesh_any(a: vector_signed_short, b: vector_signed_short) -> bool { + vcmpgtsh_p(3, b, a) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtsw.))] + unsafe fn vcmpgesw_all(a: vector_signed_int, b: vector_signed_int) -> bool { + vcmpgtsw_p(0, b, a) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtsw.))] + unsafe fn vcmpgesw_any(a: vector_signed_int, b: vector_signed_int) -> bool { + vcmpgtsw_p(3, b, a) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtub.))] + unsafe fn vcmpgeub_all(a: vector_unsigned_char, b: vector_unsigned_char) -> bool { + vcmpgtub_p(0, b, a) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtub.))] + unsafe fn vcmpgeub_any(a: vector_unsigned_char, b: vector_unsigned_char) -> bool { + vcmpgtub_p(3, b, a) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtuh.))] + unsafe fn vcmpgeuh_all(a: vector_unsigned_short, b: vector_unsigned_short) -> bool { + vcmpgtuh_p(0, b, a) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtuh.))] + unsafe fn vcmpgeuh_any(a: vector_unsigned_short, b: vector_unsigned_short) -> bool { + vcmpgtuh_p(3, b, a) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtuw.))] + unsafe fn vcmpgeuw_all(a: vector_unsigned_int, b: vector_unsigned_int) -> bool { + vcmpgtuw_p(0, b, a) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtuw.))] + unsafe fn vcmpgeuw_any(a: vector_unsigned_int, b: vector_unsigned_int) -> bool { + vcmpgtuw_p(3, b, a) != 0 + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAllGe { + type Result; + unsafe fn vec_all_ge(self, b: Other) -> Self::Result; + } + + impl_vec_any_all! 
{ [VectorAllGe vec_all_ge] ( + vcmpgeub_all, vcmpgesb_all, + vcmpgeuh_all, vcmpgesh_all, + vcmpgeuw_all, vcmpgesw_all + ) } + + // TODO: vsx encoding + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgefp.))] + unsafe fn vcmpgefp_all(a: vector_float, b: vector_float) -> bool { + vcmpgefp_p(2, a, b) != 0 + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAllGe for vector_float { + type Result = bool; + #[inline] + unsafe fn vec_all_ge(self, b: vector_float) -> Self::Result { + vcmpgefp_all(self, b) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAnyGe { + type Result; + unsafe fn vec_any_ge(self, b: Other) -> Self::Result; + } + + impl_vec_any_all! { [VectorAnyGe vec_any_ge] ( + vcmpgeub_any, vcmpgesb_any, + vcmpgeuh_any, vcmpgesh_any, + vcmpgeuw_any, vcmpgesw_any + ) } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgefp.))] + unsafe fn vcmpgefp_any(a: vector_float, b: vector_float) -> bool { + vcmpgefp_p(1, a, b) != 0 + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAnyGe for vector_float { + type Result = bool; + #[inline] + unsafe fn vec_any_ge(self, b: vector_float) -> Self::Result { + vcmpgefp_any(self, b) + } + } + + // All/Any Greater Than + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtsb.))] + unsafe fn vcmpgtsb_all(a: vector_signed_char, b: vector_signed_char) -> bool { + vcmpgtsb_p(2, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtsb.))] + unsafe fn vcmpgtsb_any(a: vector_signed_char, b: vector_signed_char) -> bool { + vcmpgtsb_p(1, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtsh.))] + unsafe fn vcmpgtsh_all(a: vector_signed_short, b: vector_signed_short) -> bool { + vcmpgtsh_p(2, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtsh.))] + unsafe fn vcmpgtsh_any(a: vector_signed_short, b: vector_signed_short) -> bool { + vcmpgtsh_p(1, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtsw.))] + unsafe fn vcmpgtsw_all(a: vector_signed_int, b: vector_signed_int) -> bool { + vcmpgtsw_p(2, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtsw.))] + unsafe fn vcmpgtsw_any(a: vector_signed_int, b: vector_signed_int) -> bool { + vcmpgtsw_p(1, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtub.))] + unsafe fn vcmpgtub_all(a: vector_unsigned_char, b: vector_unsigned_char) -> bool { + vcmpgtub_p(2, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtub.))] + unsafe fn vcmpgtub_any(a: vector_unsigned_char, b: vector_unsigned_char) -> bool { + vcmpgtub_p(1, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtuh.))] + unsafe fn vcmpgtuh_all(a: vector_unsigned_short, b: vector_unsigned_short) -> bool { + vcmpgtuh_p(2, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtuh.))] + unsafe fn vcmpgtuh_any(a: vector_unsigned_short, b: vector_unsigned_short) -> bool { + vcmpgtuh_p(1, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, 
assert_instr(vcmpgtuw.))] + unsafe fn vcmpgtuw_all(a: vector_unsigned_int, b: vector_unsigned_int) -> bool { + vcmpgtuw_p(2, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtuw.))] + unsafe fn vcmpgtuw_any(a: vector_unsigned_int, b: vector_unsigned_int) -> bool { + vcmpgtuw_p(1, a, b) != 0 + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAllGt { + type Result; + unsafe fn vec_all_gt(self, b: Other) -> Self::Result; + } + + impl_vec_any_all! { [VectorAllGt vec_all_gt] ( + vcmpgtub_all, vcmpgtsb_all, + vcmpgtuh_all, vcmpgtsh_all, + vcmpgtuw_all, vcmpgtsw_all + ) } + + // TODO: vsx encoding + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtfp.))] + unsafe fn vcmpgtfp_all(a: vector_float, b: vector_float) -> bool { + vcmpgtfp_p(2, a, b) != 0 + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAllGt for vector_float { + type Result = bool; + #[inline] + unsafe fn vec_all_gt(self, b: vector_float) -> Self::Result { + vcmpgtfp_all(self, b) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAnyGt { + type Result; + unsafe fn vec_any_gt(self, b: Other) -> Self::Result; + } + + impl_vec_any_all! { [VectorAnyGt vec_any_gt] ( + vcmpgtub_any, vcmpgtsb_any, + vcmpgtuh_any, vcmpgtsh_any, + vcmpgtuw_any, vcmpgtsw_any + ) } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpgtfp.))] + unsafe fn vcmpgtfp_any(a: vector_float, b: vector_float) -> bool { + vcmpgtfp_p(1, a, b) != 0 + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAnyGt for vector_float { + type Result = bool; + #[inline] + unsafe fn vec_any_gt(self, b: vector_float) -> Self::Result { + vcmpgtfp_any(self, b) + } + } + + // All/Any Elements Not Equal + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpequb.))] + unsafe fn vcmpneub_all(a: vector_unsigned_char, b: vector_unsigned_char) -> bool { + vcmpequb_p(0, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpequb.))] + unsafe fn vcmpneub_any(a: vector_unsigned_char, b: vector_unsigned_char) -> bool { + vcmpequb_p(3, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpequh.))] + unsafe fn vcmpneuh_all(a: vector_unsigned_short, b: vector_unsigned_short) -> bool { + vcmpequh_p(0, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpequh.))] + unsafe fn vcmpneuh_any(a: vector_unsigned_short, b: vector_unsigned_short) -> bool { + vcmpequh_p(3, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpequw.))] + unsafe fn vcmpneuw_all(a: vector_unsigned_int, b: vector_unsigned_int) -> bool { + vcmpequw_p(0, a, b) != 0 + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpequw.))] + unsafe fn vcmpneuw_any(a: vector_unsigned_int, b: vector_unsigned_int) -> bool { + vcmpequw_p(3, a, b) != 0 + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAllNe { + type Result; + unsafe fn vec_all_ne(self, b: Other) -> Self::Result; + } + + impl_vec_any_all! 
{ [VectorAllNe vec_all_ne] (vcmpneub_all, vcmpneuh_all, vcmpneuw_all) } + + // TODO: vsx encoding + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpeqfp.))] + unsafe fn vcmpnefp_all(a: vector_float, b: vector_float) -> bool { + vcmpeqfp_p(0, a, b) != 0 + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAllNe for vector_float { + type Result = bool; + #[inline] + unsafe fn vec_all_ne(self, b: vector_float) -> Self::Result { + vcmpnefp_all(self, b) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAnyNe { + type Result; + unsafe fn vec_any_ne(self, b: Other) -> Self::Result; + } + + impl_vec_any_all! { [VectorAnyNe vec_any_ne] (vcmpneub_any, vcmpneuh_any, vcmpneuw_any) } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcmpeqfp.))] + unsafe fn vcmpnefp_any(a: vector_float, b: vector_float) -> bool { + vcmpeqfp_p(3, a, b) != 0 + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAnyNe for vector_float { + type Result = bool; + #[inline] + unsafe fn vec_any_ne(self, b: vector_float) -> Self::Result { + vcmpnefp_any(self, b) + } + } + + test_impl! { vec_vceil(a: vector_float) -> vector_float [simd_ceil, vrfip / xvrspip ] } + + test_impl! { vec_vavgsb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [ vavgsb, vavgsb ] } + test_impl! { vec_vavgsh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [ vavgsh, vavgsh ] } + test_impl! { vec_vavgsw(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [ vavgsw, vavgsw ] } + test_impl! { vec_vavgub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [ vavgub, vavgub ] } + test_impl! { vec_vavguh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [ vavguh, vavguh ] } + test_impl! { vec_vavguw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [ vavguw, vavguw ] } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAvg { + type Result; + unsafe fn vec_avg(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorAvg vec_avg] 2 (vec_vavgub, vec_vavgsb, vec_vavguh, vec_vavgsh, vec_vavguw, vec_vavgsw) } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(all(test, not(target_feature = "vsx")), assert_instr(vandc))] + #[cfg_attr(all(test, target_feature = "vsx"), assert_instr(xxlandc))] + unsafe fn andc(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char { + let a = transmute(a); + let b = transmute(b); + transmute(simd_and(simd_xor(u8x16::splat(0xff), b), a)) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAndc { + type Result; + unsafe fn vec_andc(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorAndc vec_andc]+ 2b (andc) } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(all(test, not(target_feature = "vsx")), assert_instr(vorc))] + #[cfg_attr(all(test, target_feature = "vsx"), assert_instr(xxlorc))] + unsafe fn orc(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char { + let a = transmute(a); + let b = transmute(b); + transmute(simd_or(simd_xor(u8x16::splat(0xff), b), a)) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorOrc { + type Result; + unsafe fn vec_orc(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorOrc vec_orc]+ 2b (orc) } + + test_impl! 
{ vec_vand(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [ simd_and, vand / xxland ] } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAnd { + type Result; + unsafe fn vec_and(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorAnd vec_and] ~(simd_and) } + + test_impl! { vec_vaddsbs(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [ vaddsbs, vaddsbs ] } + test_impl! { vec_vaddshs(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [ vaddshs, vaddshs ] } + test_impl! { vec_vaddsws(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [ vaddsws, vaddsws ] } + test_impl! { vec_vaddubs(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [ vaddubs, vaddubs ] } + test_impl! { vec_vadduhs(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [ vadduhs, vadduhs ] } + test_impl! { vec_vadduws(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [ vadduws, vadduws ] } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAdds { + type Result; + unsafe fn vec_adds(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorAdds vec_adds] ~(vaddubs, vaddsbs, vadduhs, vaddshs, vadduws, vaddsws) } + + test_impl! { vec_vaddcuw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [vaddcuw, vaddcuw] } + + test_impl! { vec_vsubsbs(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [ vsubsbs, vsubsbs ] } + test_impl! { vec_vsubshs(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [ vsubshs, vsubshs ] } + test_impl! { vec_vsubsws(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [ vsubsws, vsubsws ] } + test_impl! { vec_vsububs(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [ vsububs, vsububs ] } + test_impl! { vec_vsubuhs(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [ vsubuhs, vsubuhs ] } + test_impl! { vec_vsubuws(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [ vsubuws, vsubuws ] } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSubs { + type Result; + unsafe fn vec_subs(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorSubs vec_subs] ~(vsububs, vsubsbs, vsubuhs, vsubshs, vsubuws, vsubsws) } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAbs { + unsafe fn vec_abs(self) -> Self; + } + + macro_rules! impl_abs { + ($name:ident, $ty: ident) => { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn $name(v: s_t_l!($ty)) -> s_t_l!($ty) { + v.vec_max(-v) + } + + impl_vec_trait! { [VectorAbs vec_abs] $name (s_t_l!($ty)) } + }; + } + + impl_abs! { vec_abs_i8, i8x16 } + impl_abs! { vec_abs_i16, i16x8 } + impl_abs! { vec_abs_i32, i32x4 } + + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_abs_f32(v: vector_float) -> vector_float { + let v: u32x4 = transmute(v); + + transmute(simd_and(v, u32x4::splat(0x7FFFFFFF))) + } + + impl_vec_trait! { [VectorAbs vec_abs] vec_abs_f32 (vector_float) } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAbss { + unsafe fn vec_abss(self) -> Self; + } + + macro_rules! 
impl_abss { + ($name:ident, $ty: ident) => { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn $name(v: s_t_l!($ty)) -> s_t_l!($ty) { + let zero: s_t_l!($ty) = transmute(0u8.vec_splats()); + v.vec_max(zero.vec_subs(v)) + } + + impl_vec_trait! { [VectorAbss vec_abss] $name (s_t_l!($ty)) } + }; + } + + impl_abss! { vec_abss_i8, i8x16 } + impl_abss! { vec_abss_i16, i16x8 } + impl_abss! { vec_abss_i32, i32x4 } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vspltb, IMM4 = 15))] + unsafe fn vspltb(a: vector_signed_char) -> vector_signed_char { + static_assert_uimm_bits!(IMM4, 4); + simd_shuffle(a, a, const { u32x16::from_array([IMM4; 16]) }) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vsplth, IMM3 = 7))] + unsafe fn vsplth(a: vector_signed_short) -> vector_signed_short { + static_assert_uimm_bits!(IMM3, 3); + simd_shuffle(a, a, const { u32x8::from_array([IMM3; 8]) }) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(all(test, not(target_feature = "vsx")), assert_instr(vspltw, IMM2 = 3))] + #[cfg_attr(all(test, target_feature = "vsx"), assert_instr(xxspltw, IMM2 = 3))] + unsafe fn vspltw(a: vector_signed_int) -> vector_signed_int { + static_assert_uimm_bits!(IMM2, 2); + simd_shuffle(a, a, const { u32x4::from_array([IMM2; 4]) }) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSplat { + unsafe fn vec_splat(self) -> Self; + } + + macro_rules! impl_vec_splat { + ($ty:ty, $fun:ident) => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorSplat for $ty { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_splat(self) -> Self { + transmute($fun::(transmute(self))) + } + } + }; + } + + impl_vec_splat! { vector_signed_char, vspltb } + impl_vec_splat! { vector_unsigned_char, vspltb } + impl_vec_splat! { vector_bool_char, vspltb } + impl_vec_splat! { vector_signed_short, vsplth } + impl_vec_splat! { vector_unsigned_short, vsplth } + impl_vec_splat! { vector_bool_short, vsplth } + impl_vec_splat! { vector_signed_int, vspltw } + impl_vec_splat! { vector_unsigned_int, vspltw } + impl_vec_splat! { vector_bool_int, vspltw } + + macro_rules! splat { + ($name:ident, $v:ident, $r:ident [$instr_altivec:ident / $instr_pwr9:ident, $doc:literal]) => { + #[doc = $doc] + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr( + all(test, not(target_feature = "vsx")), + assert_instr($instr_altivec, IMM5 = 1) + )] + #[cfg_attr( + all(test, target_feature = "power9-vector"), + assert_instr($instr_pwr9, IMM5 = 1) + )] + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub unsafe fn $name() -> s_t_l!($r) { + static_assert_simm_bits!(IMM5, 5); + transmute($r::splat(IMM5 as $v)) + } + }; + ($name:ident, $v:ident, $r:ident [$instr:ident, $doc:literal]) => { + splat! { $name, $v, $r [$instr / $instr, $doc] } + }; + } + + macro_rules! splats { + ($name:ident, $v:ident, $r:ident) => { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn $name(v: $v) -> s_t_l!($r) { + transmute($r::splat(v)) + } + }; + } + + splats! { splats_u8, u8, u8x16 } + splats! { splats_u16, u16, u16x8 } + splats! { splats_u32, u32, u32x4 } + splats! { splats_i8, i8, i8x16 } + splats! { splats_i16, i16, i16x8 } + splats! { splats_i32, i32, i32x4 } + splats! { splats_f32, f32, f32x4 } + + test_impl! { vec_splats_u8 (v: u8) -> vector_unsigned_char [splats_u8, vspltb] } + test_impl! 
{ vec_splats_u16 (v: u16) -> vector_unsigned_short [splats_u16, vsplth] } + test_impl! { vec_splats_u32 (v: u32) -> vector_unsigned_int [splats_u32, vspltw / xxspltw / mtvsrws] } + test_impl! { vec_splats_i8 (v: i8) -> vector_signed_char [splats_i8, vspltb] } + test_impl! { vec_splats_i16 (v: i16) -> vector_signed_short [splats_i16, vsplth] } + test_impl! { vec_splats_i32 (v: i32) -> vector_signed_int [splats_i32, vspltw / xxspltw / mtvsrws] } + test_impl! { vec_splats_f32 (v: f32) -> vector_float [splats_f32, vspltw / xxspltw / mtvsrws] } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSplats { + type Result; + unsafe fn vec_splats(self) -> Self::Result; + } + + macro_rules! impl_vec_splats { + ($(($fn:ident ($ty:ty) -> $r:ty)),*) => { + $( + impl_vec_trait!{ [VectorSplats vec_splats] $fn ($ty) -> $r } + )* + } + } + + impl_vec_splats! { + (vec_splats_u8 (u8) -> vector_unsigned_char), + (vec_splats_i8 (i8) -> vector_signed_char), + (vec_splats_u16 (u16) -> vector_unsigned_short), + (vec_splats_i16 (i16) -> vector_signed_short), + (vec_splats_u32 (u32) -> vector_unsigned_int), + (vec_splats_i32 (i32) -> vector_signed_int), + (vec_splats_f32 (f32) -> vector_float) + } + + test_impl! { vec_vsububm (a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [simd_sub, vsububm] } + test_impl! { vec_vsubuhm (a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [simd_sub, vsubuhm] } + test_impl! { vec_vsubuwm (a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [simd_sub, vsubuwm] } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSub { + type Result; + unsafe fn vec_sub(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorSub vec_sub] ~(simd_sub, simd_sub, simd_sub, simd_sub, simd_sub, simd_sub) } + impl_vec_trait! { [VectorSub vec_sub] simd_sub(vector_float, vector_float) -> vector_float } + + test_impl! { vec_vsubcuw (a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [vsubcuw, vsubcuw] } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSubc { + type Result; + unsafe fn vec_subc(self, b: Other) -> Self::Result; + } + + impl_vec_trait! {[VectorSubc vec_subc]+ vec_vsubcuw(vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_int } + impl_vec_trait! {[VectorSubc vec_subc]+ vec_vsubcuw(vector_signed_int, vector_signed_int) -> vector_signed_int } + + test_impl! { vec_vminsb (a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [vminsb, vminsb] } + test_impl! { vec_vminsh (a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [vminsh, vminsh] } + test_impl! { vec_vminsw (a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [vminsw, vminsw] } + + test_impl! { vec_vminub (a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [vminub, vminub] } + test_impl! { vec_vminuh (a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [vminuh, vminuh] } + test_impl! { vec_vminuw (a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [vminuw, vminuw] } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorMin { + type Result; + unsafe fn vec_min(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorMin vec_min] ~(vminub, vminsb, vminuh, vminsh, vminuw, vminsw) } + + test_impl! 
{ vec_vmaxsb (a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [vmaxsb, vmaxsb] } + test_impl! { vec_vmaxsh (a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [vmaxsh, vmaxsh] } + test_impl! { vec_vmaxsw (a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [vmaxsw, vmaxsw] } + + test_impl! { vec_vmaxub (a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [vmaxub, vmaxub] } + test_impl! { vec_vmaxuh (a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [vmaxuh, vmaxuh] } + test_impl! { vec_vmaxuw (a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [vmaxuw, vmaxuw] } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorMax { + type Result; + unsafe fn vec_max(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorMax vec_max] ~(vmaxub, vmaxsb, vmaxuh, vmaxsh, vmaxuw, vmaxsw) } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmuleub))] + unsafe fn vec_vmuleub( + a: vector_unsigned_char, + b: vector_unsigned_char, + ) -> vector_unsigned_short { + vmuleub(a, b) + } + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmulesb))] + unsafe fn vec_vmulesb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short { + vmulesb(a, b) + } + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmuleuh))] + unsafe fn vec_vmuleuh( + a: vector_unsigned_short, + b: vector_unsigned_short, + ) -> vector_unsigned_int { + vmuleuh(a, b) + } + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmulesh))] + unsafe fn vec_vmulesh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int { + vmulesh(a, b) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorMul { + unsafe fn vec_mul(self, b: Self) -> Self; + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmuluwm))] + unsafe fn vec_vmuluwm(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int { + transmute(simd_mul::(transmute(a), transmute(b))) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(xvmulsp))] + unsafe fn vec_xvmulsp(a: vector_float, b: vector_float) -> vector_float { + transmute(simd_mul::(transmute(a), transmute(b))) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMul for vector_signed_int { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_mul(self, b: Self) -> Self { + vec_vmuluwm(self, b) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMul for vector_unsigned_int { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_mul(self, b: Self) -> Self { + transmute(simd_mul::(transmute(self), transmute(b))) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMul for vector_float { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_mul(self, b: Self) -> Self { + vec_xvmulsp(self, b) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorMule { + unsafe fn vec_mule(self, b: Self) -> Result; + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMule for vector_unsigned_char { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_mule(self, b: Self) -> vector_unsigned_short { + vmuleub(self, b) + } + } + 
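The `VectorMule` impls here (they continue just below for the remaining element types, followed by the analogous `VectorMulo` family) wrap AltiVec's even/odd widening multiplies: `vec_mule` multiplies the even-indexed lanes of its inputs, `vec_mulo` the odd-indexed lanes, and both produce lanes twice as wide (`u8` to `u16`, `i16` to `i32`, and so on). The standalone sketch below models the unsigned byte case on plain arrays in big-endian element order; it is illustrative only: it does not use the unstable `stdarch_powerpc` API, the helper names `mule_u8`/`mulo_u8` are invented for the example, and on little-endian targets the C-level `vec_mule`/`vec_mulo` conventionally swap roles.

fn mule_u8(a: [u8; 16], b: [u8; 16]) -> [u16; 8] {
    // Even-indexed lanes (0, 2, 4, ...) multiplied into double-width results.
    core::array::from_fn(|i| u16::from(a[2 * i]) * u16::from(b[2 * i]))
}

fn mulo_u8(a: [u8; 16], b: [u8; 16]) -> [u16; 8] {
    // Odd-indexed lanes (1, 3, 5, ...) multiplied into double-width results.
    core::array::from_fn(|i| u16::from(a[2 * i + 1]) * u16::from(b[2 * i + 1]))
}

fn main() {
    let a: [u8; 16] = core::array::from_fn(|i| i as u8); // 0, 1, 2, ..., 15
    let b: [u8; 16] = [3; 16];
    assert_eq!(mule_u8(a, b)[1], 6); // lane 1 of the even product uses a[2] * b[2]
    assert_eq!(mulo_u8(a, b)[1], 9); // lane 1 of the odd product uses a[3] * b[3]
}

The wrappers in the diff simply forward to the `llvm.ppc.altivec.vmule*`/`vmulo*` intrinsics declared in the `extern "C"` block near the top of this file, which is why the `Result` type in `VectorMule`/`VectorMulo` is the double-width vector rather than `Self`.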
#[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMule for vector_signed_char { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_mule(self, b: Self) -> vector_signed_short { + vmulesb(self, b) + } + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMule for vector_unsigned_short { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_mule(self, b: Self) -> vector_unsigned_int { + vmuleuh(self, b) + } + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMule for vector_signed_short { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_mule(self, b: Self) -> vector_signed_int { + vmulesh(self, b) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmuloub))] + unsafe fn vec_vmuloub( + a: vector_unsigned_char, + b: vector_unsigned_char, + ) -> vector_unsigned_short { + vmuloub(a, b) + } + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmulosb))] + unsafe fn vec_vmulosb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short { + vmulosb(a, b) + } + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmulouh))] + unsafe fn vec_vmulouh( + a: vector_unsigned_short, + b: vector_unsigned_short, + ) -> vector_unsigned_int { + vmulouh(a, b) + } + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmulosh))] + unsafe fn vec_vmulosh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int { + vmulosh(a, b) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorMulo { + unsafe fn vec_mulo(self, b: Self) -> Result; + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMulo for vector_unsigned_char { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_mulo(self, b: Self) -> vector_unsigned_short { + vmuloub(self, b) + } + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMulo for vector_signed_char { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_mulo(self, b: Self) -> vector_signed_short { + vmulosb(self, b) + } + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMulo for vector_unsigned_short { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_mulo(self, b: Self) -> vector_unsigned_int { + vmulouh(self, b) + } + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMulo for vector_signed_short { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_mulo(self, b: Self) -> vector_signed_int { + vmulosh(self, b) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vsum4ubs))] + unsafe fn vec_vsum4ubs(a: vector_unsigned_char, b: vector_unsigned_int) -> vector_unsigned_int { + vsum4ubs(a, b) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vsum4sbs))] + unsafe fn vec_vsum4sbs(a: vector_signed_char, b: vector_signed_int) -> vector_signed_int { + vsum4sbs(a, b) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vsum4shs))] + unsafe fn vec_vsum4shs(a: vector_signed_short, b: vector_signed_int) -> vector_signed_int { + vsum4shs(a, b) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSum4s { + unsafe fn vec_sum4s(self, b: Other) -> Other; + } + + #[unstable(feature = "stdarch_powerpc", 
issue = "111145")] + impl VectorSum4s for vector_unsigned_char { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_sum4s(self, b: vector_unsigned_int) -> vector_unsigned_int { + vsum4ubs(self, b) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorSum4s for vector_signed_char { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_sum4s(self, b: vector_signed_int) -> vector_signed_int { + vsum4sbs(self, b) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorSum4s for vector_signed_short { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_sum4s(self, b: vector_signed_int) -> vector_signed_int { + vsum4shs(self, b) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vsum2sws))] + unsafe fn vec_vsum2sws(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int { + vsum2sws(a, b) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vnmsubfp))] + unsafe fn vec_vnmsubfp(a: vector_float, b: vector_float, c: vector_float) -> vector_float { + vnmsubfp(a, b, c) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(xvmaddasp))] + pub unsafe fn vec_vmaddfp(a: vector_float, b: vector_float, c: vector_float) -> vector_float { + simd_fma(a, b, c) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmsumubm))] + unsafe fn vec_vmsumubm( + a: vector_unsigned_char, + b: vector_unsigned_char, + c: vector_unsigned_int, + ) -> vector_unsigned_int { + vmsumubm(a, b, c) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmsummbm))] + unsafe fn vec_vmsummbm( + a: vector_signed_char, + b: vector_unsigned_char, + c: vector_signed_int, + ) -> vector_signed_int { + vmsummbm(a, b, c) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmsumuhm))] + unsafe fn vec_vmsumuhm( + a: vector_unsigned_short, + b: vector_unsigned_short, + c: vector_unsigned_int, + ) -> vector_unsigned_int { + vmsumuhm(a, b, c) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmsumshm))] + unsafe fn vec_vmsumshm( + a: vector_signed_short, + b: vector_signed_short, + c: vector_signed_int, + ) -> vector_signed_int { + vmsumshm(a, b, c) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorMsum { + unsafe fn vec_msum(self, b: B, c: Other) -> Other; + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMsum for vector_unsigned_char { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_msum( + self, + b: vector_unsigned_char, + c: vector_unsigned_int, + ) -> vector_unsigned_int { + vmsumubm(self, b, c) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMsum for vector_signed_char { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_msum( + self, + b: vector_unsigned_char, + c: vector_signed_int, + ) -> vector_signed_int { + vmsummbm(self, b, c) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMsum for vector_unsigned_short { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_msum( + self, + b: vector_unsigned_short, + c: vector_unsigned_int, + ) -> vector_unsigned_int { + vmsumuhm(self, b, c) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMsum for 
vector_signed_short { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_msum( + self, + b: vector_signed_short, + c: vector_signed_int, + ) -> vector_signed_int { + vmsumshm(self, b, c) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmsumuhs))] + unsafe fn vec_vmsumuhs( + a: vector_unsigned_short, + b: vector_unsigned_short, + c: vector_unsigned_int, + ) -> vector_unsigned_int { + vmsumuhs(a, b, c) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmsumshs))] + unsafe fn vec_vmsumshs( + a: vector_signed_short, + b: vector_signed_short, + c: vector_signed_int, + ) -> vector_signed_int { + vmsumshs(a, b, c) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorMsums { + unsafe fn vec_msums(self, b: Self, c: Other) -> Other; + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMsums for vector_unsigned_short { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_msums(self, b: Self, c: vector_unsigned_int) -> vector_unsigned_int { + vmsumuhs(self, b, c) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMsums for vector_signed_short { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_msums(self, b: Self, c: vector_signed_int) -> vector_signed_int { + vmsumshs(self, b, c) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vperm))] + unsafe fn vec_vperm( + a: vector_signed_int, + b: vector_signed_int, + c: vector_unsigned_char, + ) -> vector_signed_int { + vperm(a, b, c) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorPerm { + unsafe fn vec_vperm(self, b: Self, c: vector_unsigned_char) -> Self; + } + + macro_rules! vector_perm { + {$impl: ident} => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorPerm for $impl { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_vperm(self, b: Self, c: vector_unsigned_char) -> Self { + transmute(vec_vperm(transmute(self), transmute(b), c)) + } + } + } + } + + vector_perm! { vector_signed_char } + vector_perm! { vector_unsigned_char } + vector_perm! { vector_bool_char } + + vector_perm! { vector_signed_short } + vector_perm! { vector_unsigned_short } + vector_perm! { vector_bool_short } + + vector_perm! { vector_signed_int } + vector_perm! { vector_unsigned_int } + vector_perm! { vector_bool_int } + + vector_perm! 
{ vector_float } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAdd { + type Result; + unsafe fn vec_add(self, other: Other) -> Self::Result; + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vaddubm))] + pub unsafe fn vec_add_bc_sc(a: vector_bool_char, b: vector_signed_char) -> vector_signed_char { + simd_add(transmute(a), b) + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_bool_char { + type Result = vector_signed_char; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_signed_char) -> Self::Result { + vec_add_bc_sc(self, other) + } + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_signed_char { + type Result = vector_signed_char; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_bool_char) -> Self::Result { + other.vec_add(self) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vaddubm))] + pub unsafe fn vec_add_sc_sc( + a: vector_signed_char, + b: vector_signed_char, + ) -> vector_signed_char { + simd_add(a, b) + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_signed_char { + type Result = vector_signed_char; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_signed_char) -> Self::Result { + vec_add_sc_sc(self, other) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vaddubm))] + pub unsafe fn vec_add_bc_uc( + a: vector_bool_char, + b: vector_unsigned_char, + ) -> vector_unsigned_char { + simd_add(transmute(a), b) + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_bool_char { + type Result = vector_unsigned_char; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_unsigned_char) -> Self::Result { + vec_add_bc_uc(self, other) + } + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_unsigned_char { + type Result = vector_unsigned_char; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_bool_char) -> Self::Result { + other.vec_add(self) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vaddubm))] + pub unsafe fn vec_add_uc_uc( + a: vector_unsigned_char, + b: vector_unsigned_char, + ) -> vector_unsigned_char { + simd_add(a, b) + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_unsigned_char { + type Result = vector_unsigned_char; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_unsigned_char) -> Self::Result { + vec_add_uc_uc(self, other) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vadduhm))] + pub unsafe fn vec_add_bs_ss( + a: vector_bool_short, + b: vector_signed_short, + ) -> vector_signed_short { + let a: i16x8 = transmute(a); + let a: vector_signed_short = simd_cast(a); + simd_add(a, b) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_bool_short { + type Result = vector_signed_short; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_signed_short) -> Self::Result { + vec_add_bs_ss(self, other) + } + } + #[unstable(feature = "stdarch_powerpc", issue = 
"111145")] + impl VectorAdd for vector_signed_short { + type Result = vector_signed_short; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_bool_short) -> Self::Result { + other.vec_add(self) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vadduhm))] + pub unsafe fn vec_add_ss_ss( + a: vector_signed_short, + b: vector_signed_short, + ) -> vector_signed_short { + simd_add(a, b) + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_signed_short { + type Result = vector_signed_short; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_signed_short) -> Self::Result { + vec_add_ss_ss(self, other) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vadduhm))] + pub unsafe fn vec_add_bs_us( + a: vector_bool_short, + b: vector_unsigned_short, + ) -> vector_unsigned_short { + let a: i16x8 = transmute(a); + let a: vector_unsigned_short = simd_cast(a); + simd_add(a, b) + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_bool_short { + type Result = vector_unsigned_short; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_unsigned_short) -> Self::Result { + vec_add_bs_us(self, other) + } + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_unsigned_short { + type Result = vector_unsigned_short; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_bool_short) -> Self::Result { + other.vec_add(self) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vadduhm))] + pub unsafe fn vec_add_us_us( + a: vector_unsigned_short, + b: vector_unsigned_short, + ) -> vector_unsigned_short { + simd_add(a, b) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_unsigned_short { + type Result = vector_unsigned_short; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_unsigned_short) -> Self::Result { + vec_add_us_us(self, other) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vadduwm))] + pub unsafe fn vec_add_bi_si(a: vector_bool_int, b: vector_signed_int) -> vector_signed_int { + let a: i32x4 = transmute(a); + let a: vector_signed_int = simd_cast(a); + simd_add(a, b) + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_bool_int { + type Result = vector_signed_int; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_signed_int) -> Self::Result { + vec_add_bi_si(self, other) + } + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_signed_int { + type Result = vector_signed_int; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_bool_int) -> Self::Result { + other.vec_add(self) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vadduwm))] + pub unsafe fn vec_add_si_si(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int { + simd_add(a, b) + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_signed_int { + type Result = vector_signed_int; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: 
vector_signed_int) -> Self::Result { + vec_add_si_si(self, other) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vadduwm))] + pub unsafe fn vec_add_bi_ui(a: vector_bool_int, b: vector_unsigned_int) -> vector_unsigned_int { + let a: i32x4 = transmute(a); + let a: vector_unsigned_int = simd_cast(a); + simd_add(a, b) + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_bool_int { + type Result = vector_unsigned_int; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_unsigned_int) -> Self::Result { + vec_add_bi_ui(self, other) + } + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_unsigned_int { + type Result = vector_unsigned_int; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_bool_int) -> Self::Result { + other.vec_add(self) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vadduwm))] + pub unsafe fn vec_add_ui_ui( + a: vector_unsigned_int, + b: vector_unsigned_int, + ) -> vector_unsigned_int { + simd_add(a, b) + } + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_unsigned_int { + type Result = vector_unsigned_int; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_unsigned_int) -> Self::Result { + vec_add_ui_ui(self, other) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(xvaddsp))] + pub unsafe fn vec_add_float_float(a: vector_float, b: vector_float) -> vector_float { + simd_add(a, b) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdd for vector_float { + type Result = vector_float; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_add(self, other: vector_float) -> Self::Result { + vec_add_float_float(self, other) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorAdde { + unsafe fn vec_adde(self, b: Self, c: Self) -> Self; + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdde for vector_unsigned_int { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_adde(self, b: Self, c: Self) -> Self { + let mask: vector_unsigned_int = transmute(u32x4::new(1, 1, 1, 1)); + let carry = vec_and(c, mask); + vec_add(vec_add(self, b), carry) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorAdde for vector_signed_int { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_adde(self, b: Self, c: Self) -> Self { + let mask: vector_signed_int = transmute(i32x4::new(1, 1, 1, 1)); + let carry = vec_and(c, mask); + vec_add(vec_add(self, b), carry) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorMladd { + type Result; + unsafe fn vec_mladd(self, b: Other, c: Other) -> Self::Result; + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vmladduhm))] + unsafe fn mladd( + a: vector_signed_short, + b: vector_signed_short, + c: vector_signed_short, + ) -> vector_signed_short { + let a: i16x8 = transmute(a); + let b: i16x8 = transmute(b); + let c: i16x8 = transmute(c); + transmute(simd_add(simd_mul(a, b), c)) + } + + macro_rules! 
vector_mladd { + ($a: ident, $bc: ident, $d: ident) => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMladd<$bc> for $a { + type Result = $d; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_mladd(self, b: $bc, c: $bc) -> Self::Result { + let a = transmute(self); + let b = transmute(b); + let c = transmute(c); + + transmute(mladd(a, b, c)) + } + } + }; + } + + vector_mladd! { vector_unsigned_short, vector_unsigned_short, vector_unsigned_short } + vector_mladd! { vector_unsigned_short, vector_signed_short, vector_signed_short } + vector_mladd! { vector_signed_short, vector_unsigned_short, vector_signed_short } + vector_mladd! { vector_signed_short, vector_signed_short, vector_signed_short } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorOr { + type Result; + unsafe fn vec_or(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorOr vec_or] ~(simd_or) } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorXor { + type Result; + unsafe fn vec_xor(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorXor vec_xor] ~(simd_xor) } + + macro_rules! vector_vnor { + ($fun:ident $ty:ident) => { + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(all(test, not(target_feature = "vsx")), assert_instr(vnor))] + #[cfg_attr(all(test, target_feature = "vsx"), assert_instr(xxlnor))] + pub unsafe fn $fun(a: t_t_l!($ty), b: t_t_l!($ty)) -> t_t_l!($ty) { + let o = vec_splats(!0 as $ty); + vec_xor(vec_or(a, b), o) + } + }; + } + + vector_vnor! { vec_vnorsb i8 } + vector_vnor! { vec_vnorsh i16 } + vector_vnor! { vec_vnorsw i32 } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorNor { + type Result; + unsafe fn vec_nor(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorNor vec_nor]+ 2b (vec_vnorsb, vec_vnorsh, vec_vnorsw) } + + macro_rules! vector_vnand { + ($fun:ident $ty:ident) => { + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(all(test, not(target_feature = "vsx")), assert_instr(vnand))] + #[cfg_attr(all(test, target_feature = "vsx"), assert_instr(xxlnand))] + pub unsafe fn $fun(a: t_t_l!($ty), b: t_t_l!($ty)) -> t_t_l!($ty) { + let o = vec_splats(!0 as $ty); + vec_xor(vec_and(a, b), o) + } + }; + } + + vector_vnand! { vec_vnandsb i8 } + vector_vnand! { vec_vnandsh i16 } + vector_vnand! { vec_vnandsw i32 } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorNand { + type Result; + unsafe fn vec_nand(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorNand vec_nand]+ 2b (vec_vnandsb, vec_vnandsh, vec_vnandsw) } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(all(test, not(target_feature = "vsx")), assert_instr(vsel))] + #[cfg_attr(all(test, target_feature = "vsx"), assert_instr(xxsel))] + pub unsafe fn vec_vsel( + a: vector_signed_char, + b: vector_signed_char, + c: vector_signed_char, + ) -> vector_signed_char { + let a: i8x16 = transmute(a); + let b: i8x16 = transmute(b); + let c: i8x16 = transmute(c); + let not_c = simd_xor(c, i8x16::splat(!0)); + + transmute(simd_or(simd_and(a, not_c), simd_and(b, c))) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSel { + unsafe fn vec_sel(self, b: Self, c: Mask) -> Self; + } + + macro_rules! 
vector_sel { + ($ty: ty, $m: ty) => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorSel<$m> for $ty { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_sel(self, b: Self, c: $m) -> Self { + let a = transmute(self); + let b = transmute(b); + let c = transmute(c); + + transmute(vec_vsel(a, b, c)) + } + } + }; + ($ty: ident) => { + vector_sel! { $ty, t_b!{ $ty } } + vector_sel! { $ty, t_u!{ $ty } } + vector_sel! { t_u!{ $ty }, t_b!{ $ty } } + vector_sel! { t_u!{ $ty }, t_u!{ $ty } } + vector_sel! { t_b!{ $ty }, t_b!{ $ty } } + vector_sel! { t_b!{ $ty }, t_u!{ $ty } } + }; + (- $ty: ident) => { + vector_sel! { $ty, t_b!{ $ty } } + vector_sel! { $ty, t_u!{ $ty } } + }; + } + + vector_sel! { vector_signed_char } + vector_sel! { vector_signed_short } + vector_sel! { vector_signed_int } + vector_sel! {- vector_float } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcfsx, IMM5 = 1))] + unsafe fn vec_ctf_i32(a: vector_signed_int) -> vector_float { + static_assert_uimm_bits!(IMM5, 5); + vcfsx(a, IMM5) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vcfux, IMM5 = 1))] + unsafe fn vec_ctf_u32(a: vector_unsigned_int) -> vector_float { + static_assert_uimm_bits!(IMM5, 5); + vcfux(a, IMM5) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorCtf { + unsafe fn vec_ctf(self) -> vector_float; + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorCtf for vector_signed_int { + unsafe fn vec_ctf(self) -> vector_float { + vec_ctf_i32::(self) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorCtf for vector_unsigned_int { + unsafe fn vec_ctf(self) -> vector_float { + vec_ctf_u32::(self) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(all(test, target_endian = "little"), assert_instr(vmrghb))] + #[cfg_attr(all(test, target_endian = "big"), assert_instr(vmrglb))] + unsafe fn vec_vmrglb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char { + let mergel_perm = transmute(u8x16::new( + 0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, + 0x0F, 0x1F, + )); + vec_perm(a, b, mergel_perm) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(all(test, target_endian = "little"), assert_instr(vmrghh))] + #[cfg_attr(all(test, target_endian = "big"), assert_instr(vmrglh))] + unsafe fn vec_vmrglh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short { + let mergel_perm = transmute(u8x16::new( + 0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, + 0x1E, 0x1F, + )); + vec_perm(a, b, mergel_perm) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr( + all(test, target_endian = "little", not(target_feature = "vsx")), + assert_instr(vmrghw) + )] + #[cfg_attr( + all(test, target_endian = "little", target_feature = "vsx"), + assert_instr(xxmrghw) + )] + #[cfg_attr( + all(test, target_endian = "big", not(target_feature = "vsx")), + assert_instr(vmrglw) + )] + #[cfg_attr( + all(test, target_endian = "big", target_feature = "vsx"), + assert_instr(xxmrglw) + )] + unsafe fn vec_vmrglw(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int { + let mergel_perm = transmute(u8x16::new( + 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, + 0x1E, 0x1F, + )); + vec_perm(a, b, mergel_perm) + } + + #[inline] + #[target_feature(enable 
= "altivec")] + #[cfg_attr(all(test, target_endian = "little"), assert_instr(vmrglb))] + #[cfg_attr(all(test, target_endian = "big"), assert_instr(vmrghb))] + unsafe fn vec_vmrghb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char { + let mergel_perm = transmute(u8x16::new( + 0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 0x04, 0x14, 0x05, 0x15, 0x06, 0x16, + 0x07, 0x17, + )); + vec_perm(a, b, mergel_perm) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(all(test, target_endian = "little"), assert_instr(vmrglh))] + #[cfg_attr(all(test, target_endian = "big"), assert_instr(vmrghh))] + unsafe fn vec_vmrghh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short { + let mergel_perm = transmute(u8x16::new( + 0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, + 0x16, 0x17, + )); + vec_perm(a, b, mergel_perm) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr( + all(test, target_endian = "little", not(target_feature = "vsx")), + assert_instr(vmrglw) + )] + #[cfg_attr( + all(test, target_endian = "little", target_feature = "vsx"), + assert_instr(xxmrglw) + )] + #[cfg_attr( + all(test, target_endian = "big", not(target_feature = "vsx")), + assert_instr(vmrghw) + )] + #[cfg_attr( + all(test, target_endian = "big", target_feature = "vsx"), + assert_instr(xxmrghw) + )] + unsafe fn vec_vmrghw(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int { + let mergel_perm = transmute(u8x16::new( + 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, + 0x16, 0x17, + )); + vec_perm(a, b, mergel_perm) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorMergeh { + type Result; + unsafe fn vec_mergeh(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorMergeh vec_mergeh]+ 2b (vec_vmrghb, vec_vmrghh, vec_vmrghw) } + impl_vec_trait! { [VectorMergeh vec_mergeh]+ vec_vmrghw (vector_float, vector_float) -> vector_float } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorMergel { + type Result; + unsafe fn vec_mergel(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorMergel vec_mergel]+ 2b (vec_vmrglb, vec_vmrglh, vec_vmrglw) } + impl_vec_trait! 
{ [VectorMergel vec_mergel]+ vec_vmrglw (vector_float, vector_float) -> vector_float } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vpkuhum))] + unsafe fn vec_vpkuhum(a: vector_signed_short, b: vector_signed_short) -> vector_signed_char { + let pack_perm = if cfg!(target_endian = "little") { + transmute(u8x16::new( + 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, + 0x1C, 0x1E, + )) + } else { + transmute(u8x16::new( + 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, + 0x1D, 0x1F, + )) + }; + + transmute(vec_perm(a, b, pack_perm)) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vpkuwum))] + unsafe fn vec_vpkuwum(a: vector_signed_int, b: vector_signed_int) -> vector_signed_short { + let pack_perm = if cfg!(target_endian = "little") { + transmute(u8x16::new( + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D, 0x10, 0x11, 0x14, 0x15, 0x18, 0x19, + 0x1C, 0x1D, + )) + } else { + transmute(u8x16::new( + 0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F, 0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, + 0x1E, 0x1F, + )) + }; + + transmute(vec_perm(a, b, pack_perm)) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorPack { + type Result; + unsafe fn vec_pack(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorPack vec_pack]+ vec_vpkuhum (vector_signed_short, vector_signed_short) -> vector_signed_char } + impl_vec_trait! { [VectorPack vec_pack]+ vec_vpkuhum (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_char } + impl_vec_trait! { [VectorPack vec_pack]+ vec_vpkuhum (vector_bool_short, vector_bool_short) -> vector_bool_char } + impl_vec_trait! { [VectorPack vec_pack]+ vec_vpkuwum (vector_signed_int, vector_signed_int) -> vector_signed_short } + impl_vec_trait! { [VectorPack vec_pack]+ vec_vpkuwum (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_short } + impl_vec_trait! 
{ [VectorPack vec_pack]+ vec_vpkuwum (vector_bool_int, vector_bool_int) -> vector_bool_short } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vpkshss))] + unsafe fn vec_vpkshss(a: vector_signed_short, b: vector_signed_short) -> vector_signed_char { + if cfg!(target_endian = "little") { + vpkshss(b, a) + } else { + vpkshss(a, b) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vpkshus))] + unsafe fn vec_vpkshus(a: vector_signed_short, b: vector_signed_short) -> vector_unsigned_char { + if cfg!(target_endian = "little") { + vpkshus(b, a) + } else { + vpkshus(a, b) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vpkuhus))] + unsafe fn vec_vpkuhus( + a: vector_unsigned_short, + b: vector_unsigned_short, + ) -> vector_unsigned_char { + if cfg!(target_endian = "little") { + vpkuhus(b, a) + } else { + vpkuhus(a, b) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vpkswss))] + unsafe fn vec_vpkswss(a: vector_signed_int, b: vector_signed_int) -> vector_signed_short { + if cfg!(target_endian = "little") { + vpkswss(b, a) + } else { + vpkswss(a, b) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vpkswus))] + unsafe fn vec_vpkswus(a: vector_signed_int, b: vector_signed_int) -> vector_unsigned_short { + if cfg!(target_endian = "little") { + vpkswus(b, a) + } else { + vpkswus(a, b) + } + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vpkuwus))] + unsafe fn vec_vpkuwus(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_short { + if cfg!(target_endian = "little") { + vpkuwus(b, a) + } else { + vpkuwus(a, b) + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorPacks { + type Result; + unsafe fn vec_packs(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorPacks vec_packs] vec_vpkshss (vector_signed_short, vector_signed_short) -> vector_signed_char } + impl_vec_trait! { [VectorPacks vec_packs] vec_vpkuhus (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_char } + impl_vec_trait! { [VectorPacks vec_packs] vec_vpkswss (vector_signed_int, vector_signed_int) -> vector_signed_short } + impl_vec_trait! { [VectorPacks vec_packs] vec_vpkuwus (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_short } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorPacksu { + type Result; + unsafe fn vec_packsu(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorPacksu vec_packsu] vec_vpkshus (vector_signed_short, vector_signed_short) -> vector_unsigned_char } + impl_vec_trait! { [VectorPacksu vec_packsu] vec_vpkuhus (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_char } + impl_vec_trait! { [VectorPacksu vec_packsu] vec_vpkswus (vector_signed_int, vector_signed_int) -> vector_unsigned_short } + impl_vec_trait! { [VectorPacksu vec_packsu] vec_vpkuwus (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_short } + + macro_rules! 
impl_vec_unpack { + ($fun:ident ($a:ident) -> $r:ident [$little:ident, $big:ident]) => { + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(all(test, target_endian = "little"), assert_instr($little))] + #[cfg_attr(all(test, target_endian = "big"), assert_instr($big))] + unsafe fn $fun(a: $a) -> $r { + if cfg!(target_endian = "little") { + $little(a) + } else { + $big(a) + } + } + }; + } + + impl_vec_unpack! { vec_vupkhsb (vector_signed_char) -> vector_signed_short [vupklsb, vupkhsb] } + impl_vec_unpack! { vec_vupklsb (vector_signed_char) -> vector_signed_short [vupkhsb, vupklsb] } + impl_vec_unpack! { vec_vupkhsh (vector_signed_short) -> vector_signed_int [vupklsh, vupkhsh] } + impl_vec_unpack! { vec_vupklsh (vector_signed_short) -> vector_signed_int [vupkhsh, vupklsh] } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorUnpackh { + type Result; + unsafe fn vec_unpackh(self) -> Self::Result; + } + + impl_vec_trait! { [VectorUnpackh vec_unpackh] vec_vupkhsb (vector_signed_char) -> vector_signed_short } + impl_vec_trait! { [VectorUnpackh vec_unpackh]+ vec_vupkhsb (vector_bool_char) -> vector_bool_short } + impl_vec_trait! { [VectorUnpackh vec_unpackh] vec_vupkhsh (vector_signed_short) -> vector_signed_int } + impl_vec_trait! { [VectorUnpackh vec_unpackh]+ vec_vupkhsh (vector_bool_short) -> vector_bool_int } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorUnpackl { + type Result; + unsafe fn vec_unpackl(self) -> Self::Result; + } + + impl_vec_trait! { [VectorUnpackl vec_unpackl] vec_vupklsb (vector_signed_char) -> vector_signed_short } + impl_vec_trait! { [VectorUnpackl vec_unpackl]+ vec_vupklsb (vector_bool_char) -> vector_bool_short } + impl_vec_trait! { [VectorUnpackl vec_unpackl] vec_vupklsh (vector_signed_short) -> vector_signed_int } + impl_vec_trait! { [VectorUnpackl vec_unpackl]+ vec_vupklsh (vector_bool_short) -> vector_bool_int } + + macro_rules! impl_vec_shift { + ([$Trait:ident $m:ident] ($b:ident, $h:ident, $w:ident)) => { + impl_vec_trait!{ [$Trait $m]+ $b (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char } + impl_vec_trait!{ [$Trait $m]+ $b (vector_signed_char, vector_unsigned_char) -> vector_signed_char } + impl_vec_trait!{ [$Trait $m]+ $h (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_short } + impl_vec_trait!{ [$Trait $m]+ $h (vector_signed_short, vector_unsigned_short) -> vector_signed_short } + impl_vec_trait!{ [$Trait $m]+ $w (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_int } + impl_vec_trait!{ [$Trait $m]+ $w (vector_signed_int, vector_unsigned_int) -> vector_signed_int } + }; + } + + macro_rules! impl_shift { + ($fun:ident $intr:ident $ty:ident) => { + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr($fun))] + unsafe fn $fun(a: t_t_l!($ty), b: t_t_l!($ty)) -> t_t_l!($ty) { + let a = transmute(a); + let b = simd_rem( + transmute(b), + ::splat(mem::size_of::<$ty>() as $ty * $ty::BITS as $ty), + ); + + transmute($intr(a, b)) + } + }; + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSl { + type Result; + unsafe fn vec_sl(self, b: Other) -> Self::Result; + } + + impl_shift! { vslb simd_shl u8 } + impl_shift! { vslh simd_shl u16 } + impl_shift! { vslw simd_shl u32 } + + impl_vec_shift! 
{ [VectorSl vec_sl] (vslb, vslh, vslw) } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSr { + type Result; + unsafe fn vec_sr(self, b: Other) -> Self::Result; + } + + impl_shift! { vsrb simd_shr u8 } + impl_shift! { vsrh simd_shr u16 } + impl_shift! { vsrw simd_shr u32 } + + impl_vec_shift! { [VectorSr vec_sr] (vsrb, vsrh, vsrw) } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSra { + type Result; + unsafe fn vec_sra(self, b: Other) -> Self::Result; + } + + impl_vec_shift! { [VectorSra vec_sra] (vsrab, vsrah, vsraw) } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSld { + unsafe fn vec_sld(self, b: Self) -> Self; + unsafe fn vec_sldw(self, b: Self) -> Self; + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(vsldoi, UIMM4 = 1))] + unsafe fn vsldoi( + a: vector_unsigned_char, + b: vector_unsigned_char, + ) -> vector_unsigned_char { + static_assert_uimm_bits!(UIMM4, 4); + let d = UIMM4 as u8; + if cfg!(target_endian = "little") { + let perm = u8x16::new( + 16 - d, + 17 - d, + 18 - d, + 19 - d, + 20 - d, + 21 - d, + 22 - d, + 23 - d, + 24 - d, + 25 - d, + 26 - d, + 27 - d, + 28 - d, + 29 - d, + 30 - d, + 31 - d, + ); + + vec_perm(b, a, transmute(perm)) + } else { + let perm = u8x16::new( + d, + d + 1, + d + 2, + d + 3, + d + 4, + d + 5, + d + 6, + d + 7, + d + 8, + d + 9, + d + 10, + d + 11, + d + 12, + d + 13, + d + 14, + d + 15, + ); + vec_perm(a, b, transmute(perm)) + } + } + + // TODO: collapse the two once generic_const_exprs are usable. + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr(xxsldwi, UIMM2 = 1))] + unsafe fn xxsldwi( + a: vector_unsigned_char, + b: vector_unsigned_char, + ) -> vector_unsigned_char { + static_assert_uimm_bits!(UIMM2, 2); + let d = (UIMM2 << 2) as u8; + if cfg!(target_endian = "little") { + let perm = u8x16::new( + 16 - d, + 17 - d, + 18 - d, + 19 - d, + 20 - d, + 21 - d, + 22 - d, + 23 - d, + 24 - d, + 25 - d, + 26 - d, + 27 - d, + 28 - d, + 29 - d, + 30 - d, + 31 - d, + ); + + vec_perm(b, a, transmute(perm)) + } else { + let perm = u8x16::new( + d, + d + 1, + d + 2, + d + 3, + d + 4, + d + 5, + d + 6, + d + 7, + d + 8, + d + 9, + d + 10, + d + 11, + d + 12, + d + 13, + d + 14, + d + 15, + ); + vec_perm(a, b, transmute(perm)) + } + } + + macro_rules! impl_vec_sld { + ($($ty:ident),+) => { $( + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorSld for $ty { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_sld(self, b: Self) -> Self { + transmute(vsldoi::(transmute(self), transmute(b))) + } + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_sldw(self, b: Self) -> Self { + transmute(xxsldwi::(transmute(self), transmute(b))) + } + } + )+ }; + } + + impl_vec_sld! { vector_bool_char, vector_signed_char, vector_unsigned_char } + impl_vec_sld! { vector_bool_short, vector_signed_short, vector_unsigned_short } + impl_vec_sld! { vector_bool_int, vector_signed_int, vector_unsigned_int } + impl_vec_sld! { vector_float } + + macro_rules! 
impl_vec_shift_long { + ([$Trait:ident $m:ident] ($f:ident)) => { + impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char } + impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_char, vector_unsigned_char) -> vector_signed_char } + impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_short, vector_unsigned_char) -> vector_unsigned_short } + impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_short, vector_unsigned_char) -> vector_signed_short } + impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_int, vector_unsigned_char) -> vector_unsigned_int } + impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_int, vector_unsigned_char) -> vector_signed_int } + }; + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSll { + type Result; + unsafe fn vec_sll(self, b: Other) -> Self::Result; + } + + impl_vec_shift_long! { [VectorSll vec_sll] (vsl) } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSrl { + type Result; + unsafe fn vec_srl(self, b: Other) -> Self::Result; + } + + impl_vec_shift_long! { [VectorSrl vec_srl] (vsr) } + + macro_rules! impl_vec_shift_octect { + ([$Trait:ident $m:ident] ($f:ident)) => { + impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_char, vector_signed_char) -> vector_unsigned_char } + impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_char, vector_signed_char) -> vector_signed_char } + impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_short, vector_signed_char) -> vector_unsigned_short } + impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_short, vector_signed_char) -> vector_signed_short } + impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_int, vector_signed_char) -> vector_unsigned_int } + impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_int, vector_signed_char) -> vector_signed_int } + impl_vec_trait!{ [$Trait $m]+ $f (vector_float, vector_signed_char) -> vector_float } + impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char } + impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_char, vector_unsigned_char) -> vector_signed_char } + impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_short, vector_unsigned_char) -> vector_unsigned_short } + impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_short, vector_unsigned_char) -> vector_signed_short } + impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_int, vector_unsigned_char) -> vector_unsigned_int } + impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_int, vector_unsigned_char) -> vector_signed_int } + impl_vec_trait!{ [$Trait $m]+ $f (vector_float, vector_unsigned_char) -> vector_float } + }; + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSlo { + type Result; + unsafe fn vec_slo(self, b: Other) -> Self::Result; + } + + impl_vec_shift_octect! { [VectorSlo vec_slo] (vslo) } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorSro { + type Result; + unsafe fn vec_sro(self, b: Other) -> Self::Result; + } + + impl_vec_shift_octect! { [VectorSro vec_sro] (vsro) } + + test_impl! { vec_vcntlzb(a: vector_signed_char) -> vector_signed_char [simd_ctlz, vclzb] } + test_impl! { vec_vcntlzh(a: vector_signed_short) -> vector_signed_short [simd_ctlz, vclzh] } + test_impl! { vec_vcntlzw(a: vector_signed_int) -> vector_signed_int [simd_ctlz, vclzw] } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorCntlz { + unsafe fn vec_cntlz(self) -> Self; + } + + macro_rules! 
impl_vec_cntlz { + ($fun:ident ($a:ty)) => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorCntlz for $a { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_cntlz(self) -> Self { + transmute($fun(transmute(self))) + } + } + }; + } + + impl_vec_cntlz! { vec_vcntlzb(vector_signed_char) } + impl_vec_cntlz! { vec_vcntlzb(vector_unsigned_char) } + impl_vec_cntlz! { vec_vcntlzh(vector_signed_short) } + impl_vec_cntlz! { vec_vcntlzh(vector_unsigned_short) } + impl_vec_cntlz! { vec_vcntlzw(vector_signed_int) } + impl_vec_cntlz! { vec_vcntlzw(vector_unsigned_int) } + + macro_rules! impl_vrl { + ($fun:ident $intr:ident $ty:ident) => { + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr($fun))] + unsafe fn $fun(a: t_t_l!($ty), b: t_t_l!($ty)) -> t_t_l!($ty) { + transmute($intr(transmute(a), transmute(a), transmute(b))) + } + }; + } + + impl_vrl! { vrlb fshlb u8 } + impl_vrl! { vrlh fshlh u16 } + impl_vrl! { vrlw fshlw u32 } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorRl { + type Shift; + unsafe fn vec_rl(self, b: Self::Shift) -> Self; + } + + macro_rules! impl_vec_rl { + ($fun:ident ($a:ident)) => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorRl for $a { + type Shift = t_u!($a); + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_rl(self, b: Self::Shift) -> Self { + transmute($fun(transmute(self), b)) + } + } + }; + } + + impl_vec_rl! { vrlb(vector_signed_char) } + impl_vec_rl! { vrlh(vector_signed_short) } + impl_vec_rl! { vrlw(vector_signed_int) } + impl_vec_rl! { vrlb(vector_unsigned_char) } + impl_vec_rl! { vrlh(vector_unsigned_short) } + impl_vec_rl! { vrlw(vector_unsigned_int) } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorRound { + unsafe fn vec_round(self) -> Self; + } + + test_impl! { vec_vrfin(a: vector_float) -> vector_float [vrfin, xvrspic] } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorRound for vector_float { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_round(self) -> Self { + vec_vrfin(self) + } + } +} + +/// Vector Insert +/// +/// ## Purpose +/// Returns a copy of vector b with element c replaced by the value of a. +/// +/// ## Result value +/// r contains a copy of vector b with element c replaced by the value of a. +/// This function uses modular arithmetic on c to determine the element number. +/// For example, if c is out of range, the compiler uses c modulo the number of +/// elements in the vector to determine the element position. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_insert(a: T, b: ::Scalar) -> T +where + T: sealed::VectorInsert, +{ + a.vec_insert::(b) +} + +/// Vector Extract +/// +/// ## Purpose +/// Returns the value of the bth element of vector a. +/// +/// ## Result value +/// The value of each element of r is the element of a at position b modulo the number of +/// elements of a. 
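+/// ## Example
+/// An illustrative sketch added for clarity (not part of the original
+/// documentation); it assumes a PowerPC target with AltiVec enabled and the
+/// unstable `stdarch_powerpc` feature, with the element index passed as the
+/// const generic parameter:
+/// ```ignore
+/// use core::arch::powerpc::*;
+///
+/// unsafe {
+///     let v = vec_splats(7i32); // [7, 7, 7, 7]
+///     // An out-of-range index would be reduced modulo the number of lanes,
+///     // as described above; here lane 2 is read directly.
+///     assert_eq!(vec_extract::<_, 2>(v), 7);
+/// }
+/// ```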
+#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_extract(a: T) -> ::Scalar +where + T: sealed::VectorExtract, +{ + a.vec_extract::() +} + +/// Vector Merge Low +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_mergel(a: T, b: U) -> >::Result +where + T: sealed::VectorMergel, +{ + a.vec_mergel(b) +} + +/// Vector Merge High +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_mergeh(a: T, b: U) -> >::Result +where + T: sealed::VectorMergeh, +{ + a.vec_mergeh(b) +} + +/// Vector Pack +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_pack(a: T, b: U) -> >::Result +where + T: sealed::VectorPack, +{ + a.vec_pack(b) +} + +/// Vector Pack Saturated +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_packs(a: T, b: U) -> >::Result +where + T: sealed::VectorPacks, +{ + a.vec_packs(b) +} + +/// Vector Pack Saturated Unsigned +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_packsu(a: T, b: U) -> >::Result +where + T: sealed::VectorPacksu, +{ + a.vec_packsu(b) +} + +/// Vector Unpack High +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_unpackh(a: T) -> ::Result +where + T: sealed::VectorUnpackh, +{ + a.vec_unpackh() +} + +/// Vector Unpack Low +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_unpackl(a: T) -> ::Result +where + T: sealed::VectorUnpackl, +{ + a.vec_unpackl() +} + +/// Vector Shift Left +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_sl(a: T, b: U) -> >::Result +where + T: sealed::VectorSl, +{ + a.vec_sl(b) +} + +/// Vector Shift Right +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_sr(a: T, b: U) -> >::Result +where + T: sealed::VectorSr, +{ + a.vec_sr(b) +} + +/// Vector Shift Right Algebraic +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_sra(a: T, b: U) -> >::Result +where + T: sealed::VectorSra, +{ + a.vec_sra(b) +} + +/// Vector Shift Left Double +/// +/// ## Endian considerations +/// +/// This intrinsic is not endian-neutral, so uses of vec_sld in +/// big-endian code must be rewritten for little-endian targets. +/// +/// Historically, vec_sld could be used to shift by amounts not a multiple of the element size +/// for most types, in which case the purpose of the shift is difficult to determine and difficult +/// to automatically rewrite efficiently for little endian. +/// +/// So the concatenation of a and b is done in big-endian fashion (left to right), and the shift is +/// always to the left. This will generally produce surprising results for little-endian targets. 
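+/// ## Example
+/// An illustrative sketch added for clarity (not from the original
+/// documentation), assuming an AltiVec-enabled PowerPC target and that the
+/// shift amount is supplied as the const generic parameter:
+/// ```ignore
+/// use core::arch::powerpc::*;
+///
+/// unsafe {
+///     let a = vec_splats(1u8);
+///     let b = vec_splats(2u8);
+///     // Viewed in big-endian element order, the result is bytes 3..19 of the
+///     // 32-byte concatenation a || b: thirteen 1s followed by three 2s.
+///     let r: vector_unsigned_char = vec_sld::<_, 3>(a, b);
+/// }
+/// ```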
+#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_sld(a: T, b: T) -> T +where + T: sealed::VectorSld, +{ + a.vec_sld::(b) +} + +/// Vector Shift Left Double by Words +/// +/// ## Endian considerations +/// +/// This intrinsic is not endian-neutral, so uses of vec_sldw in +/// big-endian code must be rewritten for little-endian targets. +/// +/// The concatenation of a and b is done in big-endian fashion (left to right), and the shift is +/// always to the left. This will generally produce surprising results for little- endian targets. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_sldw(a: T, b: T) -> T +where + T: sealed::VectorSld, +{ + a.vec_sldw::(b) +} + +/// Vector Shift Left Long +/// +/// ## Endian considerations +/// This intrinsic is not endian-neutral, so uses of vec_sll in big-endian +/// code must be rewritten for little-endian targets. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_sll(a: T, b: U) -> >::Result +where + T: sealed::VectorSll, +{ + a.vec_sll(b) +} + +/// Vector Shift Right Long +/// +/// ## Endian considerations +/// This intrinsic is not endian-neutral, so uses of vec_srl in big-endian +/// code must be rewritten for little-endian targets. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_srl(a: T, b: U) -> >::Result +where + T: sealed::VectorSrl, +{ + a.vec_srl(b) +} + +/// Vector Shift Left by Octets +/// +/// ## Endian considerations +/// This intrinsic is not endian-neutral, so uses of vec_slo in big-endian code must be rewritten +/// for little-endian targets. The shift count is in element 15 of b for big-endian, but in element +/// 0 of b for little-endian. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_slo(a: T, b: U) -> >::Result +where + T: sealed::VectorSlo, +{ + a.vec_slo(b) +} + +/// Vector Shift Right by Octets +/// +/// ## Endian considerations +/// This intrinsic is not endian-neutral, so uses of vec_sro in big-endian code must be rewritten +/// for little-endian targets. The shift count is in element 15 of b for big-endian, but in element +/// 0 of b for little-endian. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_sro(a: T, b: U) -> >::Result +where + T: sealed::VectorSro, +{ + a.vec_sro(b) +} + +/// Vector Shift Left Variable +/// +/// ## Result value +/// Let v be a 17-byte vector formed from a in bytes `[0:15]` and a zero byte in element 16. +/// Then each byte element i of r is determined as follows. The start bit sb is +/// obtained from bits 5:7 of byte element i of b. Then the contents of bits sb:sb+7 of the +/// halfword in byte elements i:i+1 of v are placed into byte element i of r. +/// +/// ## Endian considerations +/// All bit and byte element numbers are specified in big-endian order. This intrinsic is not +/// endian-neutral. 
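+/// ## Example
+/// A hedged sketch of the per-byte behaviour described above (an addition, not
+/// part of the original documentation), assuming a POWER9 target: byte `i` of
+/// the result is taken from the 16-bit value formed by `a[i]` and `a[i + 1]`
+/// (with `a[16]` treated as zero), starting `k` bits from the most significant
+/// end, where `k` is the low three bits of `b[i]`.
+/// ```ignore
+/// use core::arch::powerpc::*;
+///
+/// unsafe {
+///     // With every per-byte shift count zero, the result is just `a`.
+///     let a = vec_splats(0x81u8);
+///     let r = vec_slv(a, vec_splats(0u8));
+/// }
+/// ```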
+#[inline] +#[target_feature(enable = "power9-altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_slv(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char { + vslv(a, b) +} + +/// Vector Shift Right Variable +/// +/// ## Result value +/// Let v be a 17-byte vector formed from a zero byte in element 0 and the elements of +/// a in bytes `[1:16]`. Then each byte element i of r is determined as follows. The start bit sb is +/// obtained from bits 5:7 of byte element i of b. Then the contents of bits (8 – sb):(15 – sb) of +/// the halfword in byte elements i:i+1 of v are placed into byte element i of r. +/// +/// ## Endian considerations +/// All bit and byte element numbers are specified in big-endian order. This intrinsic is not +/// endian-neutral. +#[inline] +#[target_feature(enable = "power9-altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_srv(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char { + vsrv(a, b) +} + +/// Vector Load Indexed. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_ld(off: isize, p: T) -> ::Result +where + T: sealed::VectorLd, +{ + p.vec_ld(off) +} + +/// Vector Load Indexed Least Recently Used. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_ldl(off: isize, p: T) -> ::Result +where + T: sealed::VectorLd, +{ + p.vec_ldl(off) +} + +/// Vector Load Element Indexed. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_lde(off: isize, p: T) -> ::Result +where + T: sealed::VectorLde, +{ + p.vec_lde(off) +} + +/// Vector Store Indexed +/// +/// ## Purpose +/// Stores a 16-byte vector into memory at the address specified by a displacement and a +/// pointer, ignoring the four low-order bits of the calculated address. +/// +/// ## Operation +/// A memory address is obtained by adding b and c, and masking off the four low-order +/// bits of the result. The 16-byte vector in a is stored to the resultant memory address. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_st(a: T, off: isize, c: ::Target) +where + T: sealed::VectorSt, +{ + a.vec_st(off, c) +} + +/// Vector Store Indexed Least Recently Used +/// +/// ## Purpose +/// Stores a 16-byte vector into memory at the address specified by a displacement and +/// a pointer, ignoring the four low-order bits of the calculated address, and marking the cache +/// line containing the address as least frequently used. +/// +/// ## Operation +/// A memory address is obtained by adding b and c, and masking off the four +/// low-order bits of the result. The 16-byte vector in a is stored to the resultant memory +/// address, and the containing cache line is marked as least frequently used. +/// +/// ## Notes +/// This intrinsic can be used to indicate the last access to a portion of memory, as a hint to the +/// data cache controller that the associated cache line can be replaced without performance loss. 
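+/// ## Example
+/// A minimal sketch added for clarity (not from the original documentation). It
+/// assumes an AltiVec-enabled target and that the pointer (`Target`) type
+/// paired with `vector_unsigned_char` is `*mut u8`, as for `vec_st`:
+/// ```ignore
+/// use core::arch::powerpc::*;
+///
+/// unsafe {
+///     #[repr(align(16))]
+///     struct Aligned([u8; 16]);
+///
+///     let mut buf = Aligned([0u8; 16]);
+///     let v = vec_splats(0x7Fu8);
+///     // The effective address (pointer + displacement) is truncated to a
+///     // 16-byte boundary before the store, and the containing cache line is
+///     // marked as a good candidate for replacement.
+///     vec_stl(v, 0, buf.0.as_mut_ptr());
+/// }
+/// ```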
+#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_stl(a: T, off: isize, c: ::Target) +where + T: sealed::VectorSt, +{ + a.vec_stl(off, c) +} + +/// Vector Store Element Indexed +/// +/// ## Purpose +/// Stores a single element from a 16-byte vector into memory at the address specified by +/// a displacement and a pointer, aligned to the element size. +/// +/// ## Operation +/// The integer value b is added to the pointer value c. The resulting address is +/// rounded down to the nearest address that is a multiple of es, where es is 1 for char pointers, +/// 2 for short pointers, and 4 for float or int pointers. An element offset eo is calculated by +/// taking the resultant address modulo 16. The vector element of a at offset eo is stored to the +/// resultant address. +/// +/// ## Notes +/// Be careful to note that the address (b+c) is aligned to an element boundary. Do not attempt +/// to store unaligned data with this intrinsic. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_ste(a: T, off: isize, c: ::Target) +where + T: sealed::VectorSte, +{ + a.vec_ste(off, c) +} + +/// VSX Unaligned Load +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_xl(off: isize, p: T) -> ::Result +where + T: sealed::VectorXl, +{ + p.vec_xl(off) +} + +/// VSX Unaligned Store +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_xst(v: T, off: isize, p: ::Out) +where + T: sealed::VectorXst, +{ + v.vec_xst(off, p) +} + +/// Vector Base-2 Logarithm Estimate +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr(vlogefp))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_loge(a: vector_float) -> vector_float { + vlogefp(a) +} + +/// Vector floor. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_floor(a: vector_float) -> vector_float { + sealed::vec_floor(a) +} + +/// Vector expte. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_expte(a: vector_float) -> vector_float { + sealed::vec_vexptefp(a) +} + +/// Vector cmplt. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_cmplt(a: U, b: T) -> >::Result +where + T: sealed::VectorCmpGt, +{ + vec_cmpgt(b, a) +} + +/// Vector cmple. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_cmple(a: vector_float, b: vector_float) -> vector_bool_int { + vec_cmpge(b, a) +} + +/// Vector cmpgt. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_cmpgt(a: T, b: U) -> >::Result +where + T: sealed::VectorCmpGt, +{ + a.vec_cmpgt(b) +} + +/// Vector cmpge. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_cmpge(a: vector_float, b: vector_float) -> vector_bool_int { + sealed::vec_vcmpgefp(a, b) +} + +/// Vector cmpeq. 
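
As the definitions above show, vec_cmplt and vec_cmple are simply vec_cmpgt and vec_cmpge with the operands swapped. The comparisons yield AltiVec boolean vectors, where a lane is all ones when the predicate holds and all zeros otherwise; this is how the m8x16/m32x4 expectations in the tests further down are encoded. A per-lane sketch with hypothetical helpers:

    // One i32 lane of vec_cmpgt: all ones when a > b, all zeros otherwise.
    fn cmpgt_lane(a: i32, b: i32) -> u32 {
        if a > b { u32::MAX } else { 0 }
    }

    // vec_cmplt is defined above by swapping the operands of vec_cmpgt.
    fn cmplt_lane(a: i32, b: i32) -> u32 {
        cmpgt_lane(b, a)
    }

    fn main() {
        assert_eq!(cmpgt_lane(1, 0), u32::MAX);
        assert_eq!(cmplt_lane(1, 0), 0);
    }
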
+#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_cmpeq(a: T, b: U) -> >::Result +where + T: sealed::VectorCmpEq, +{ + a.vec_cmpeq(b) +} + +/// Vector Compare Not Equal +/// +/// ## Result value +/// For each element of r, the value of each bit is 1 if the corresponding elements +/// of a and b are not equal. Otherwise, the value of each bit is 0. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_cmpne(a: T, b: U) -> >::Result +where + T: sealed::VectorCmpNe, +{ + a.vec_cmpne(b) +} + +/// Vector cmpb. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_cmpb(a: vector_float, b: vector_float) -> vector_signed_int { + sealed::vec_vcmpbfp(a, b) +} + +/// Vector ceil. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_ceil(a: vector_float) -> vector_float { + sealed::vec_vceil(a) +} + +/// Vector avg. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_avg(a: T, b: U) -> >::Result +where + T: sealed::VectorAvg, +{ + a.vec_avg(b) +} + +/// Vector andc. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_andc(a: T, b: U) -> >::Result +where + T: sealed::VectorAndc, +{ + a.vec_andc(b) +} + +/// Vector OR with Complement +/// +/// ## Purpose +/// Performs a bitwise OR of the first vector with the bitwise-complemented second vector. +/// +/// ## Result value +/// r is the bitwise OR of a and the bitwise complement of b. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_orc(a: T, b: U) -> >::Result +where + T: sealed::VectorOrc, +{ + a.vec_orc(b) +} + +/// Vector and. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_and(a: T, b: U) -> >::Result +where + T: sealed::VectorAnd, +{ + a.vec_and(b) +} + +/// Vector or. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_or(a: T, b: U) -> >::Result +where + T: sealed::VectorOr, +{ + a.vec_or(b) +} + +/// Vector NAND +/// +/// ## Purpose +/// Performs a bitwise NAND of two vectors. +/// +/// ## Result value +/// r is the bitwise NAND of a and b. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_nand(a: T, b: U) -> >::Result +where + T: sealed::VectorNand, +{ + a.vec_nand(b) +} + +/// Vector nor. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_nor(a: T, b: U) -> >::Result +where + T: sealed::VectorNor, +{ + a.vec_nor(b) +} + +/// Vector xor. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_xor(a: T, b: U) -> >::Result +where + T: sealed::VectorXor, +{ + a.vec_xor(b) +} + +/// Vector adds. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_adds(a: T, b: U) -> >::Result +where + T: sealed::VectorAdds, +{ + a.vec_adds(b) +} + +/// Vector addc. 
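
vec_adds saturates instead of wrapping, while vec_addc (defined next) keeps only the carry out of the unsaturated addition. The per-lane models below are hypothetical helpers for a single u32 lane, written to match the test_vec_adds and test_vec_addc expectations later in this file.

    // Saturating add: clamps at u32::MAX instead of wrapping.
    fn adds_u32(a: u32, b: u32) -> u32 {
        a.saturating_add(b)
    }

    // Carry out of the plain (modular) addition: 1 when a + b overflows.
    fn addc_u32(a: u32, b: u32) -> u32 {
        a.checked_add(b).is_none() as u32
    }

    fn main() {
        assert_eq!(adds_u32(u32::MAX, 2), u32::MAX); // saturates
        assert_eq!(addc_u32(u32::MAX, 1), 1); // carry produced
        assert_eq!(addc_u32(1, 2), 0); // no carry
    }
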
+#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_addc(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int { + sealed::vec_vaddcuw(a, b) +} + +/// Vector abs. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_abs(a: T) -> T +where + T: sealed::VectorAbs, +{ + a.vec_abs() +} + +/// Vector abss. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_abss(a: T) -> T +where + T: sealed::VectorAbss, +{ + a.vec_abss() +} + +/// Vector Rotate Left +/// +/// ## Purpose +/// Rotates each element of a vector left by a given number of bits. +/// +/// ## Result value +/// Each element of r is obtained by rotating the corresponding element of a left by +/// the number of bits specified by the corresponding element of b. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_rl(a: T, b: ::Shift) -> T +where + T: sealed::VectorRl, +{ + a.vec_rl(b) +} + +/// Vector Round +/// +/// ## Purpose +/// Returns a vector containing the rounded values of the corresponding elements of the +/// source vector. +/// +/// ## Result value +/// Each element of r contains the value of the corresponding element of a, rounded +/// to the nearest representable floating-point integer, using IEEE round-to-nearest +/// rounding. +/// The current floating-point rounding mode is ignored. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_round(a: T) -> T +where + T: sealed::VectorRound, +{ + a.vec_round() +} + +/// Vector Splat +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_splat(a: T) -> T +where + T: sealed::VectorSplat, +{ + a.vec_splat::() +} + +splat! { vec_splat_u8, u8, u8x16 [vspltisb / xxspltib, "Vector Splat to Unsigned Byte"] } +splat! { vec_splat_s8, i8, i8x16 [vspltisb / xxspltib, "Vector Splat to Signed Byte"] } +splat! { vec_splat_u16, u16, u16x8 [vspltish, "Vector Splat to Unsigned Halfword"] } +splat! { vec_splat_s16, i16, i16x8 [vspltish, "Vector Splat to Signed Halfword"] } +splat! { vec_splat_u32, u32, u32x4 [vspltisw, "Vector Splat to Unsigned Word"] } +splat! { vec_splat_s32, i32, i32x4 [vspltisw, "Vector Splat to Signed Word"] } + +/// Vector splats. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_splats(a: T) -> ::Result +where + T: sealed::VectorSplats, +{ + a.vec_splats() +} + +/// Vector sub. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_sub(a: T, b: U) -> >::Result +where + T: sealed::VectorSub, +{ + a.vec_sub(b) +} + +/// Vector Subtract Carryout +/// +/// ## Purpose +/// Returns a vector wherein each element contains the carry produced by subtracting the +/// corresponding elements of the two source vectors. +/// +/// ## Result value +/// The value of each element of r is the complement of the carry produced by subtracting the +/// value of the corresponding element of b from the value of the corresponding element of a. The +/// value is 0 if a borrow occurred, or 1 if no borrow occurred.
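
A per-lane sketch of the borrow rule just described, using a hypothetical helper for a single u32 lane: the lane is the complement of the borrow out of a - b, so it is 0 when b is larger than a and 1 otherwise.

    // Complement of the borrow produced by a - b, as described above.
    fn subc_u32(a: u32, b: u32) -> u32 {
        let borrow = a.checked_sub(b).is_none(); // true when the subtraction borrows
        (!borrow) as u32 // 0 if a borrow occurred, 1 if no borrow occurred
    }

    fn main() {
        assert_eq!(subc_u32(5, 3), 1); // no borrow
        assert_eq!(subc_u32(3, 5), 0); // borrow
    }
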
+#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_subc(a: T, b: U) -> >::Result +where + T: sealed::VectorSubc, +{ + a.vec_subc(b) +} + +/// Vector subs. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_subs(a: T, b: U) -> >::Result +where + T: sealed::VectorSubs, +{ + a.vec_subs(b) +} + +/// Vector min. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_min(a: T, b: U) -> >::Result +where + T: sealed::VectorMin, +{ + a.vec_min(b) +} + +/// Vector max. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_max(a: T, b: U) -> >::Result +where + T: sealed::VectorMax, +{ + a.vec_max(b) +} + +/// Move From Vector Status and Control Register. +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr(mfvscr))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_mfvscr() -> vector_unsigned_short { + mfvscr() +} + +/// Vector add. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_add(a: T, b: U) -> >::Result +where + T: sealed::VectorAdd, +{ + a.vec_add(b) +} + +/// Vector Add Extended +/// +/// ## Result value +/// The value of each element of r is produced by adding the corresponding elements of +/// a and b with a carry specified in the corresponding element of c (1 if there is a carry, 0 +/// otherwise). +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_adde(a: T, b: T, c: T) -> T +where + T: sealed::VectorAdde, +{ + a.vec_adde(b, c) +} + +/// Vector Convert to Floating-Point +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_ctf(a: T) -> vector_float +where + T: sealed::VectorCtf, +{ + a.vec_ctf::() +} + +/// Vector Convert to Signed Integer +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr(vctsxs, IMM5 = 1))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_cts(a: vector_float) -> vector_signed_int { + static_assert_uimm_bits!(IMM5, 5); + + vctsxs(a, IMM5) +} + +/// Vector Convert to Unsigned Integer +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr(vctuxs, IMM5 = 1))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_ctu(a: vector_float) -> vector_unsigned_int { + static_assert_uimm_bits!(IMM5, 5); + + vctuxs(a, IMM5) +} + +/// Endian-biased intrinsics +#[cfg(target_endian = "little")] +mod endian { + use super::*; + /// Vector permute. 
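
The little-endian vec_perm implementation that follows compensates for the big-endian bias of vperm by complementing the selector and swapping the two input vectors (see the "vperm has big-endian bias" comment below). A scalar sketch of plain big-endian vperm selection makes it easier to see what is being corrected; the helper is hypothetical, and it assumes that only the low five bits of each selector byte are significant.

    // Big-endian vperm selection: each selector byte picks one byte out of the
    // 32-byte concatenation of a and b (hypothetical scalar model).
    fn vperm_model(a: [u8; 16], b: [u8; 16], c: [u8; 16]) -> [u8; 16] {
        let mut cat = [0u8; 32];
        cat[..16].copy_from_slice(&a);
        cat[16..].copy_from_slice(&b);
        let mut r = [0u8; 16];
        for i in 0..16 {
            r[i] = cat[usize::from(c[i] & 0x1f)]; // only the low five selector bits are used
        }
        r
    }

    fn main() {
        let a = *b"ABCDEFGHIJKLMNOP";
        let b = *b"abcdefghijklmnop";
        let mut c = [0u8; 16];
        c[0] = 0x10; // selector 16 picks the first byte of b
        assert_eq!(vperm_model(a, b, c)[0], b'a');
        assert_eq!(vperm_model(a, b, c)[1], b'A'); // selector 0 picks the first byte of a
    }

Complementing c maps index k to 31 - k in this 32-byte table, and swapping a and b keeps each lookup inside the operand the original selector pointed at, which is the adjustment the little-endian vec_perm below performs.
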
+ #[inline] + #[target_feature(enable = "altivec")] + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub unsafe fn vec_perm(a: T, b: T, c: vector_unsigned_char) -> T + where + T: sealed::VectorPerm, + { + // vperm has big-endian bias + // + // Xor the mask and flip the arguments + let d = transmute(u8x16::new( + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + )); + let c = simd_xor(c, d); + + b.vec_vperm(a, c) + } + + /// Vector Sum Across Partial (1/2) Saturated + #[inline] + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + #[target_feature(enable = "altivec")] + pub unsafe fn vec_sum2s(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int { + // vsum2sws has big-endian bias + // + // swap the even b elements with the odd ones + let flip = transmute(u8x16::new( + 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11, + )); + let b = vec_perm(b, b, flip); + let c = vsum2sws(a, b); + + vec_perm(c, c, flip) + } + + // Even and Odd are swapped in little-endian + /// Vector Multiply Even + #[inline] + #[target_feature(enable = "altivec")] + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub unsafe fn vec_mule(a: T, b: T) -> U + where + T: sealed::VectorMulo, + { + a.vec_mulo(b) + } + /// Vector Multiply Odd + #[inline] + #[target_feature(enable = "altivec")] + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub unsafe fn vec_mulo(a: T, b: T) -> U + where + T: sealed::VectorMule, + { + a.vec_mule(b) + } +} + +/// Vector Multiply +/// +/// ## Purpose +/// Compute the products of corresponding elements of two vectors. +/// +/// ## Result value +/// Each element of r receives the product of the corresponding elements of a and b. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_mul(a: T, b: T) -> T +where + T: sealed::VectorMul, +{ + a.vec_mul(b) +} + +/// Vector Multiply Add Saturated +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr(vmhaddshs))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_madds( + a: vector_signed_short, + b: vector_signed_short, + c: vector_signed_short, +) -> vector_signed_short { + vmhaddshs(a, b, c) +} + +/// Vector Multiply Low and Add Unsigned Half Word +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_mladd(a: T, b: U, c: U) -> >::Result +where + T: sealed::VectorMladd, +{ + a.vec_mladd(b, c) +} + +/// Vector Multiply Round and Add Saturated +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr(vmhraddshs))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_mradds( + a: vector_signed_short, + b: vector_signed_short, + c: vector_signed_short, +) -> vector_signed_short { + vmhraddshs(a, b, c) +} + +/// Vector Multiply Sum +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_msum(a: T, b: B, c: U) -> U +where + T: sealed::VectorMsum, +{ + a.vec_msum(b, c) +} + +/// Vector Multiply Sum Saturated +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_msums(a: T, b: T, c: U) -> U +where + T: sealed::VectorMsums, +{ + a.vec_msums(b, c) +} + +/// Vector Multiply Add +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = 
"111145")] +pub unsafe fn vec_madd(a: vector_float, b: vector_float, c: vector_float) -> vector_float { + sealed::vec_vmaddfp(a, b, c) +} + +/// Vector Negative Multiply Subtract +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_nmsub(a: vector_float, b: vector_float, c: vector_float) -> vector_float { + vnmsubfp(a, b, c) +} + +/// Vector Select +/// +/// ## Purpose +/// Returns a vector selecting bits from two source vectors depending on the corresponding +/// bit values of a third source vector. +/// +/// ## Result value +/// Each bit of r has the value of the corresponding bit of a if the corresponding +/// bit of c is 0. Otherwise, the bit of r has the value of the corresponding bit of b. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_sel(a: T, b: T, c: U) -> T +where + T: sealed::VectorSel, +{ + a.vec_sel(b, c) +} + +/// Vector Sum Across Partial (1/4) Saturated +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_sum4s(a: T, b: U) -> U +where + T: sealed::VectorSum4s, +{ + a.vec_sum4s(b) +} + +/// Vector All Elements Equal +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_all_eq(a: T, b: U) -> >::Result +where + T: sealed::VectorAllEq, +{ + a.vec_all_eq(b) +} + +/// Vector All Elements Equal +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_any_eq(a: T, b: U) -> >::Result +where + T: sealed::VectorAnyEq, +{ + a.vec_any_eq(b) +} + +/// Vector All Elements Greater or Equal +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_all_ge(a: T, b: U) -> >::Result +where + T: sealed::VectorAllGe, +{ + a.vec_all_ge(b) +} + +/// Vector Any Element Greater or Equal +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_any_ge(a: T, b: U) -> >::Result +where + T: sealed::VectorAnyGe, +{ + a.vec_any_ge(b) +} + +/// Vector All Elements Greater Than +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_all_gt(a: T, b: U) -> >::Result +where + T: sealed::VectorAllGt, +{ + a.vec_all_gt(b) +} + +/// Vector Any Element Greater Than +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_any_gt(a: T, b: U) -> >::Result +where + T: sealed::VectorAnyGt, +{ + a.vec_any_gt(b) +} + +/// Vector All In +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr("vcmpbfp."))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_all_in(a: vector_float, b: vector_float) -> bool { + vcmpbfp_p(0, a, b) != 0 +} + +/// Vector All Elements Less Than or Equal +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_all_le(a: U, b: T) -> >::Result +where + T: sealed::VectorAllGe, +{ + b.vec_all_ge(a) +} + +/// Vector Any Element Less Than or Equal +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_any_le(a: U, b: T) -> >::Result +where + T: 
sealed::VectorAnyGe, +{ + b.vec_any_ge(a) +} + +/// Vector All Elements Less Than +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_all_lt(a: U, b: T) -> >::Result +where + T: sealed::VectorAllGt, +{ + b.vec_all_gt(a) +} + +/// Vector Any Element Less Than +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_any_lt(a: U, b: T) -> >::Result +where + T: sealed::VectorAnyGt, +{ + b.vec_any_gt(a) +} + +/// All Elements Not a Number +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr("vcmpeqfp."))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_all_nan(a: vector_float) -> bool { + vcmpeqfp_p(0, a, a) != 0 +} + +/// Any Elements Not a Number +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr("vcmpeqfp."))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_any_nan(a: vector_float) -> bool { + vcmpeqfp_p(3, a, a) != 0 +} + +/// Vector All Elements Not Equal +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_all_ne(a: T, b: U) -> >::Result +where + T: sealed::VectorAllNe, +{ + a.vec_all_ne(b) +} + +/// Vector Any Elements Not Equal +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_any_ne(a: T, b: U) -> >::Result +where + T: sealed::VectorAnyNe, +{ + a.vec_any_ne(b) +} + +/// All Elements Not Greater Than or Equal +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr("vcmpgefp."))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_all_nge(a: vector_float, b: vector_float) -> bool { + vcmpgefp_p(0, a, b) != 0 +} + +/// All Elements Not Greater Than +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr("vcmpgtfp."))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_all_ngt(a: vector_float, b: vector_float) -> bool { + vcmpgtfp_p(0, a, b) != 0 +} + +/// All Elements Not Less Than or Equal +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr("vcmpgefp."))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_all_nle(a: vector_float, b: vector_float) -> bool { + vcmpgefp_p(0, b, a) != 0 +} + +/// All Elements Not Less Than +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr("vcmpgtfp."))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_all_nlt(a: vector_float, b: vector_float) -> bool { + vcmpgtfp_p(0, b, a) != 0 +} + +/// All Elements Numeric +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr("vcmpgefp."))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_all_numeric(a: vector_float) -> bool { + vcmpgefp_p(2, a, a) != 0 +} + +/// Any Elements Not Greater Than or Equal +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr("vcmpgefp."))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_any_nge(a: vector_float, b: vector_float) -> bool { + vcmpgefp_p(3, a, b) != 0 +} + +/// Any Elements Not Greater Than +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr("vcmpgtfp."))] +#[unstable(feature = "stdarch_powerpc", issue = 
"111145")] +pub unsafe fn vec_any_ngt(a: vector_float, b: vector_float) -> bool { + vcmpgtfp_p(3, a, b) != 0 +} + +/// Any Elements Not Less Than or Equal +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr("vcmpgefp."))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_any_nle(a: vector_float, b: vector_float) -> bool { + vcmpgefp_p(3, b, a) != 0 +} + +/// Any Elements Not Less Than +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr("vcmpgtfp."))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_any_nlt(a: vector_float, b: vector_float) -> bool { + vcmpgtfp_p(3, b, a) != 0 +} + +/// Any Elements Numeric +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr("vcmpgefp."))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_any_numeric(a: vector_float) -> bool { + vcmpgefp_p(1, a, a) != 0 +} + +/// Vector Count Leading Zeros +/// +/// ## Purpose +/// Returns a vector containing the number of most-significant bits equal to zero of each +/// corresponding element of the source vector. +/// +/// ## Result value +/// The value of each element of r is set to the number of leading zeros of the +/// corresponding element of a. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_cntlz(a: T) -> T +where + T: sealed::VectorCntlz, +{ + a.vec_cntlz() +} + +/// Any Element Out of Bounds +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr("vcmpeqfp."))] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_any_out(a: vector_float) -> bool { + vcmpeqfp_p(1, a, a) != 0 +} + +#[cfg(target_endian = "big")] +mod endian { + use super::*; + /// Vector permute. + #[inline] + #[target_feature(enable = "altivec")] + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub unsafe fn vec_perm(a: T, b: T, c: vector_unsigned_char) -> T + where + T: sealed::VectorPerm, + { + a.vec_vperm(b, c) + } + + /// Vector Sum Across Partial (1/2) Saturated + #[inline] + #[target_feature(enable = "altivec")] + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub unsafe fn vec_sum2s(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int { + vsum2sws(a, b) + } + + /// Vector Multiply Even + #[inline] + #[target_feature(enable = "altivec")] + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub unsafe fn vec_mule(a: T, b: T) -> U + where + T: sealed::VectorMule, + { + a.vec_mule(b) + } + /// Vector Multiply Odd + #[inline] + #[target_feature(enable = "altivec")] + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub unsafe fn vec_mulo(a: T, b: T) -> U + where + T: sealed::VectorMulo, + { + a.vec_mulo(b) + } +} + +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub use self::endian::*; + +#[cfg(test)] +mod tests { + use super::*; + + use std::mem::transmute; + + use crate::core_arch::simd::*; + use stdarch_test::simd_test; + + macro_rules! test_vec_2 { + { $name: ident, $fn:ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => { + test_vec_2! 
{ $name, $fn, $ty -> $ty, [$($a),+], [$($b),+], [$($d),+] } + }; + { $name: ident, $fn:ident, $ty: ident -> $ty_out: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => { + #[simd_test(enable = "altivec")] + unsafe fn $name() { + let a: s_t_l!($ty) = transmute($ty::new($($a),+)); + let b: s_t_l!($ty) = transmute($ty::new($($b),+)); + + let d = $ty_out::new($($d),+); + let r : $ty_out = transmute($fn(a, b)); + assert_eq!(d, r); + } + }; + { $name: ident, $fn:ident, $ty: ident -> $ty_out: ident, [$($a:expr),+], [$($b:expr),+], $d:expr } => { + #[simd_test(enable = "altivec")] + unsafe fn $name() { + let a: s_t_l!($ty) = transmute($ty::new($($a),+)); + let b: s_t_l!($ty) = transmute($ty::new($($b),+)); + + let r : $ty_out = transmute($fn(a, b)); + assert_eq!($d, r); + } + } + } + + macro_rules! test_vec_1 { + { $name: ident, $fn:ident, f32x4, [$($a:expr),+], ~[$($d:expr),+] } => { + #[simd_test(enable = "altivec")] + unsafe fn $name() { + let a: vector_float = transmute(f32x4::new($($a),+)); + + let d: vector_float = transmute(f32x4::new($($d),+)); + let r = transmute(vec_cmple(vec_abs(vec_sub($fn(a), d)), vec_splats(f32::EPSILON))); + let e = m32x4::new(true, true, true, true); + assert_eq!(e, r); + } + }; + { $name: ident, $fn:ident, $ty: ident, [$($a:expr),+], [$($d:expr),+] } => { + test_vec_1! { $name, $fn, $ty -> $ty, [$($a),+], [$($d),+] } + }; + { $name: ident, $fn:ident, $ty: ident -> $ty_out: ident, [$($a:expr),+], [$($d:expr),+] } => { + #[simd_test(enable = "altivec")] + unsafe fn $name() { + let a: s_t_l!($ty) = transmute($ty::new($($a),+)); + + let d = $ty_out::new($($d),+); + let r : $ty_out = transmute($fn(a)); + assert_eq!(d, r); + } + } + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_ld() { + let pat = [ + u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + u8x16::new( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + ]; + + for off in 0..16 { + let v: u8x16 = transmute(vec_ld(0, (pat.as_ptr() as *const u8).offset(off))); + assert_eq!( + v, + u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + ); + } + for off in 16..32 { + let v: u8x16 = transmute(vec_ld(0, (pat.as_ptr() as *const u8).offset(off))); + assert_eq!( + v, + u8x16::new( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ) + ); + } + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_xl() { + let pat = [ + u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + u8x16::new( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + ]; + + for off in 0..16 { + let val: u8x16 = transmute(vec_xl(0, (pat.as_ptr() as *const u8).offset(off))); + for i in 0..16 { + let v = val.extract(i); + assert_eq!(off as usize + i, v as usize); + } + } + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_xst() { + let v: vector_unsigned_char = transmute(u8x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + )); + + for off in 0..16 { + let mut buf = [0u8; 32]; + vec_xst(v, 0, (buf.as_mut_ptr() as *mut u8).offset(off)); + for i in 0..16 { + assert_eq!(i as u8, buf[off as usize..][i]); + } + } + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_ldl() { + let pat = [ + u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + u8x16::new( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + ]; + + for off in 0..16 { + let v: u8x16 = transmute(vec_ldl(0, (pat.as_ptr() as *const u8).offset(off))); + assert_eq!( + v, + u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15) + ); + } + for off in 16..32 { + let v: u8x16 = transmute(vec_ldl(0, (pat.as_ptr() as *const u8).offset(off))); + assert_eq!( + v, + u8x16::new( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ) + ); + } + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_lde_u8() { + let pat = [u8x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + )]; + for off in 0..16 { + let v: u8x16 = transmute(vec_lde(off, pat.as_ptr() as *const u8)); + assert_eq!(off as u8, v.extract(off as _)); + } + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_lde_u16() { + let pat = [u16x8::new(0, 1, 2, 3, 4, 5, 6, 7)]; + for off in 0..8 { + let v: u16x8 = transmute(vec_lde(off * 2, pat.as_ptr() as *const u16)); + assert_eq!(off as u16, v.extract(off as _)); + } + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_lde_u32() { + let pat = [u32x4::new(0, 1, 2, 3)]; + for off in 0..4 { + let v: u32x4 = transmute(vec_lde(off * 4, pat.as_ptr() as *const u32)); + assert_eq!(off as u32, v.extract(off as _)); + } + } + + test_vec_1! { test_vec_floor, vec_floor, f32x4, + [1.1, 1.9, -0.5, -0.9], + [1.0, 1.0, -1.0, -1.0] + } + + test_vec_1! { test_vec_expte, vec_expte, f32x4, + [0.0, 2.0, 2.0, -1.0], + ~[1.0, 4.0, 4.0, 0.5] + } + + test_vec_2! { test_vec_cmpgt_i8, vec_cmpgt, i8x16 -> m8x16, + [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [true, false, true, false, false, false, false, false, false, false, false, false, false, false, false, false] + } + + test_vec_2! { test_vec_cmpgt_u8, vec_cmpgt, u8x16 -> m8x16, + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [true, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false] + } + + test_vec_2! { test_vec_cmpgt_i16, vec_cmpgt, i16x8 -> m16x8, + [1, -1, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0], + [true, false, true, false, false, false, false, false] + } + + test_vec_2! { test_vec_cmpgt_u16, vec_cmpgt, u16x8 -> m16x8, + [1, 255, 0, 0, 0, 0, 0, 0], + [0, 0, 255, 1, 0, 0, 0, 0], + [true, true, false, false, false, false, false, false] + } + + test_vec_2! { test_vec_cmpgt_i32, vec_cmpgt, i32x4 -> m32x4, + [1, -1, 0, 0], + [0, -1, 0, 1], + [true, false, false, false] + } + + test_vec_2! { test_vec_cmpgt_u32, vec_cmpgt, u32x4 -> m32x4, + [1, 255, 0, 0], + [0, 255, 0, 1], + [true, false, false, false] + } + + test_vec_2! { test_vec_cmpge, vec_cmpge, f32x4 -> m32x4, + [0.1, -0.1, 0.0, 0.99], + [0.1, 0.0, 0.1, 1.0], + [true, false, false, false] + } + + test_vec_2! { test_vec_cmpeq_i8, vec_cmpeq, i8x16 -> m8x16, + [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [false, false, false, false, true, true, true, true, true, true, true, true, true, true, true, true] + } + + test_vec_2! { test_vec_cmpeq_u8, vec_cmpeq, u8x16 -> m8x16, + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [false, false, false, false, true, true, true, true, true, true, true, true, true, true, true, true] + } + + test_vec_2! { test_vec_cmpeq_i16, vec_cmpeq, i16x8 -> m16x8, + [1, -1, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0], + [false, false, false, false, true, true, true, true] + } + + test_vec_2! 
{ test_vec_cmpeq_u16, vec_cmpeq, u16x8 -> m16x8, + [1, 255, 0, 0, 0, 0, 0, 0], + [0, 0, 255, 1, 0, 0, 0, 0], + [false, false, false, false, true, true, true, true] + } + + test_vec_2! { test_vec_cmpeq_i32, vec_cmpeq, i32x4 -> m32x4, + [1, -1, 0, 0], + [0, -1, 0, 1], + [false, true, true, false] + } + + test_vec_2! { test_vec_cmpeq_u32, vec_cmpeq, u32x4 -> m32x4, + [1, 255, 0, 0], + [0, 255, 0, 1], + [false, true, true, false] + } + + test_vec_2! { test_vec_cmpne_i8, vec_cmpne, i8x16 -> m8x16, + [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false] + } + + test_vec_2! { test_vec_cmpne_u8, vec_cmpne, u8x16 -> m8x16, + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false] + } + + test_vec_2! { test_vec_cmpne_i16, vec_cmpne, i16x8 -> m16x8, + [1, -1, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0], + [true, true, true, true, false, false, false, false] + } + + test_vec_2! { test_vec_cmpne_u16, vec_cmpne, u16x8 -> m16x8, + [1, 255, 0, 0, 0, 0, 0, 0], + [0, 0, 255, 1, 0, 0, 0, 0], + [true, true, true, true, false, false, false, false] + } + + test_vec_2! { test_vec_cmpne_i32, vec_cmpne, i32x4 -> m32x4, + [1, -1, 0, 0], + [0, -1, 0, 1], + [true, false, false, true] + } + + test_vec_2! { test_vec_cmpne_u32, vec_cmpne, u32x4 -> m32x4, + [1, 255, 0, 0], + [0, 255, 0, 1], + [true, false, false, true] + } + + test_vec_2! { test_vec_all_eq_i8_false, vec_all_eq, i8x16 -> bool, + [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_eq_u8_false, vec_all_eq, u8x16 -> bool, + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_eq_i16_false, vec_all_eq, i16x8 -> bool, + [1, -1, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_eq_u16_false, vec_all_eq, u16x8 -> bool, + [1, 255, 0, 0, 0, 0, 0, 0], + [0, 0, 255, 1, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_eq_i32_false, vec_all_eq, i32x4 -> bool, + [1, -1, 0, 0], + [0, -1, 0, 1], + false + } + + test_vec_2! { test_vec_all_eq_u32_false, vec_all_eq, u32x4 -> bool, + [1, 255, 0, 0], + [0, 255, 0, 1], + false + } + + test_vec_2! { test_vec_all_eq_i8_true, vec_all_eq, i8x16 -> bool, + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_eq_u8_true, vec_all_eq, u8x16 -> bool, + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_eq_i16_true, vec_all_eq, i16x8 -> bool, + [1, -1, 1, 0, 0, 0, 0, 0], + [1, -1, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_eq_u16_true, vec_all_eq, u16x8 -> bool, + [1, 255, 1, 0, 0, 0, 0, 0], + [1, 255, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_eq_i32_true, vec_all_eq, i32x4 -> bool, + [1, -1, 0, 1], + [1, -1, 0, 1], + true + } + + test_vec_2! { test_vec_all_eq_u32_true, vec_all_eq, u32x4 -> bool, + [1, 255, 0, 1], + [1, 255, 0, 1], + true + } + + test_vec_2! 
{ test_vec_any_eq_i8_false, vec_any_eq, i8x16 -> bool, + [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + false + } + + test_vec_2! { test_vec_any_eq_u8_false, vec_any_eq, u8x16 -> bool, + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 255, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + false + } + + test_vec_2! { test_vec_any_eq_i16_false, vec_any_eq, i16x8 -> bool, + [1, -1, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 1, 1, 1, 1], + false + } + + test_vec_2! { test_vec_any_eq_u16_false, vec_any_eq, u16x8 -> bool, + [1, 255, 0, 0, 0, 0, 0, 0], + [0, 0, 255, 1, 1, 1, 1, 1], + false + } + + test_vec_2! { test_vec_any_eq_i32_false, vec_any_eq, i32x4 -> bool, + [1, -1, 0, 0], + [0, -2, 1, 1], + false + } + + test_vec_2! { test_vec_any_eq_u32_false, vec_any_eq, u32x4 -> bool, + [1, 2, 1, 0], + [0, 255, 0, 1], + false + } + + test_vec_2! { test_vec_any_eq_i8_true, vec_any_eq, i8x16 -> bool, + [1, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_eq_u8_true, vec_any_eq, u8x16 -> bool, + [0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_eq_i16_true, vec_any_eq, i16x8 -> bool, + [0, -1, 1, 0, 0, 0, 0, 0], + [1, -1, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_eq_u16_true, vec_any_eq, u16x8 -> bool, + [0, 255, 1, 0, 0, 0, 0, 0], + [1, 255, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_eq_i32_true, vec_any_eq, i32x4 -> bool, + [0, -1, 0, 1], + [1, -1, 0, 1], + true + } + + test_vec_2! { test_vec_any_eq_u32_true, vec_any_eq, u32x4 -> bool, + [0, 255, 0, 1], + [1, 255, 0, 1], + true + } + + test_vec_2! { test_vec_all_ge_i8_false, vec_all_ge, i8x16 -> bool, + [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_ge_u8_false, vec_all_ge, u8x16 -> bool, + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_ge_i16_false, vec_all_ge, i16x8 -> bool, + [1, -1, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_ge_u16_false, vec_all_ge, u16x8 -> bool, + [1, 255, 0, 0, 0, 0, 0, 0], + [0, 0, 255, 1, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_ge_i32_false, vec_all_ge, i32x4 -> bool, + [1, -1, 0, 0], + [0, -1, 0, 1], + false + } + + test_vec_2! { test_vec_all_ge_u32_false, vec_all_ge, u32x4 -> bool, + [1, 255, 0, 0], + [0, 255, 1, 1], + false + } + + test_vec_2! { test_vec_all_ge_i8_true, vec_all_ge, i8x16 -> bool, + [0, 0, -1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_ge_u8_true, vec_all_ge, u8x16 -> bool, + [1, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_ge_i16_true, vec_all_ge, i16x8 -> bool, + [1, -1, 42, 0, 0, 0, 0, 0], + [1, -5, 2, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_ge_u16_true, vec_all_ge, u16x8 -> bool, + [42, 255, 1, 0, 0, 0, 0, 0], + [2, 255, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_ge_i32_true, vec_all_ge, i32x4 -> bool, + [1, -1, 0, 1], + [0, -1, 0, 1], + true + } + + test_vec_2! 
{ test_vec_all_ge_u32_true, vec_all_ge, u32x4 -> bool, + [1, 255, 0, 1], + [1, 254, 0, 0], + true + } + + test_vec_2! { test_vec_any_ge_i8_false, vec_any_ge, i8x16 -> bool, + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + false + } + + test_vec_2! { test_vec_any_ge_u8_false, vec_any_ge, u8x16 -> bool, + [1, 254, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [42, 255, 255, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + false + } + + test_vec_2! { test_vec_any_ge_i16_false, vec_any_ge, i16x8 -> bool, + [1, -1, -2, 0, 0, 0, 0, 0], + [2, 0, -1, 1, 1, 1, 1, 1], + false + } + + test_vec_2! { test_vec_any_ge_u16_false, vec_any_ge, u16x8 -> bool, + [1, 2, 0, 0, 0, 0, 0, 0], + [2, 42, 255, 1, 1, 1, 1, 1], + false + } + + test_vec_2! { test_vec_any_ge_i32_false, vec_any_ge, i32x4 -> bool, + [1, -1, 0, 0], + [2, 0, 1, 1], + false + } + + test_vec_2! { test_vec_any_ge_u32_false, vec_any_ge, u32x4 -> bool, + [1, 2, 1, 0], + [4, 255, 4, 1], + false + } + + test_vec_2! { test_vec_any_ge_i8_true, vec_any_ge, i8x16 -> bool, + [1, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_ge_u8_true, vec_any_ge, u8x16 -> bool, + [0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_ge_i16_true, vec_any_ge, i16x8 -> bool, + [0, -1, 1, 0, 0, 0, 0, 0], + [1, -1, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_ge_u16_true, vec_any_ge, u16x8 -> bool, + [0, 255, 1, 0, 0, 0, 0, 0], + [1, 255, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_ge_i32_true, vec_any_ge, i32x4 -> bool, + [0, -1, 0, 1], + [1, -1, 0, 1], + true + } + + test_vec_2! { test_vec_any_ge_u32_true, vec_any_ge, u32x4 -> bool, + [0, 255, 0, 1], + [1, 255, 0, 1], + true + } + + test_vec_2! { test_vec_all_gt_i8_false, vec_all_gt, i8x16 -> bool, + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_gt_u8_false, vec_all_gt, u8x16 -> bool, + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_gt_i16_false, vec_all_gt, i16x8 -> bool, + [1, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_gt_u16_false, vec_all_gt, u16x8 -> bool, + [1, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 255, 1, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_gt_i32_false, vec_all_gt, i32x4 -> bool, + [1, -1, 0, 0], + [0, -1, 0, 1], + false + } + + test_vec_2! { test_vec_all_gt_u32_false, vec_all_gt, u32x4 -> bool, + [1, 255, 0, 0], + [0, 255, 1, 1], + false + } + + test_vec_2! { test_vec_all_gt_i8_true, vec_all_gt, i8x16 -> bool, + [2, 1, -1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -2, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + true + } + + test_vec_2! { test_vec_all_gt_u8_true, vec_all_gt, u8x16 -> bool, + [1, 255, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [0, 254, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_gt_i16_true, vec_all_gt, i16x8 -> bool, + [1, -1, 42, 1, 1, 1, 1, 1], + [0, -5, 2, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_gt_u16_true, vec_all_gt, u16x8 -> bool, + [42, 255, 1, 1, 1, 1, 1, 1], + [2, 254, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! 
{ test_vec_all_gt_i32_true, vec_all_gt, i32x4 -> bool, + [1, -1, 1, 1], + [0, -2, 0, 0], + true + } + + test_vec_2! { test_vec_all_gt_u32_true, vec_all_gt, u32x4 -> bool, + [1, 255, 1, 1], + [0, 254, 0, 0], + true + } + + test_vec_2! { test_vec_any_gt_i8_false, vec_any_gt, i8x16 -> bool, + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + false + } + + test_vec_2! { test_vec_any_gt_u8_false, vec_any_gt, u8x16 -> bool, + [1, 254, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [42, 255, 255, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + false + } + + test_vec_2! { test_vec_any_gt_i16_false, vec_any_gt, i16x8 -> bool, + [1, -1, -2, 0, 0, 0, 0, 0], + [2, 0, -1, 1, 1, 1, 1, 1], + false + } + + test_vec_2! { test_vec_any_gt_u16_false, vec_any_gt, u16x8 -> bool, + [1, 2, 0, 0, 0, 0, 0, 0], + [2, 42, 255, 1, 1, 1, 1, 1], + false + } + + test_vec_2! { test_vec_any_gt_i32_false, vec_any_gt, i32x4 -> bool, + [1, -1, 0, 0], + [2, 0, 1, 1], + false + } + + test_vec_2! { test_vec_any_gt_u32_false, vec_any_gt, u32x4 -> bool, + [1, 2, 1, 0], + [4, 255, 4, 1], + false + } + + test_vec_2! { test_vec_any_gt_i8_true, vec_any_gt, i8x16 -> bool, + [1, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_gt_u8_true, vec_any_gt, u8x16 -> bool, + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_gt_i16_true, vec_any_gt, i16x8 -> bool, + [1, -1, 1, 0, 0, 0, 0, 0], + [0, -1, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_gt_u16_true, vec_any_gt, u16x8 -> bool, + [1, 255, 1, 0, 0, 0, 0, 0], + [0, 255, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_gt_i32_true, vec_any_gt, i32x4 -> bool, + [1, -1, 0, 1], + [0, -1, 0, 1], + true + } + + test_vec_2! { test_vec_any_gt_u32_true, vec_any_gt, u32x4 -> bool, + [1, 255, 0, 1], + [0, 255, 0, 1], + true + } + + test_vec_2! { test_vec_all_in_true, vec_all_in, f32x4 -> bool, + [0.0, -0.1, 0.0, 0.0], + [0.1, 0.2, 0.0, 0.0], + true + } + + test_vec_2! { test_vec_all_in_false, vec_all_in, f32x4 -> bool, + [0.5, 0.4, -0.5, 0.8], + [0.1, 0.4, -0.5, 0.8], + false + } + + test_vec_2! { test_vec_all_le_i8_false, vec_all_le, i8x16 -> bool, + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_le_u8_false, vec_all_le, u8x16 -> bool, + [0, 0, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_le_i16_false, vec_all_le, i16x8 -> bool, + [0, 0, -1, 1, 0, 0, 0, 0], + [1, -1, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_le_u16_false, vec_all_le, u16x8 -> bool, + [0, 0, 255, 1, 0, 0, 0, 0], + [1, 255, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_le_i32_false, vec_all_le, i32x4 -> bool, + [0, -1, 0, 1], + [1, -1, 0, 0], + false + } + + test_vec_2! { test_vec_all_le_u32_false, vec_all_le, u32x4 -> bool, + [0, 255, 1, 1], + [1, 255, 0, 0], + false + } + + test_vec_2! { test_vec_all_le_i8_true, vec_all_le, i8x16 -> bool, + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! 
{ test_vec_all_le_u8_true, vec_all_le, u8x16 -> bool, + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_le_i16_true, vec_all_le, i16x8 -> bool, + [1, -5, 2, 0, 0, 0, 0, 0], + [1, -1, 42, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_le_u16_true, vec_all_le, u16x8 -> bool, + [2, 255, 1, 0, 0, 0, 0, 0], + [42, 255, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_le_i32_true, vec_all_le, i32x4 -> bool, + [0, -1, 0, 1], + [1, -1, 0, 1], + true + } + + test_vec_2! { test_vec_all_le_u32_true, vec_all_le, u32x4 -> bool, + [1, 254, 0, 0], + [1, 255, 0, 1], + true + } + + test_vec_2! { test_vec_any_le_i8_false, vec_any_le, i8x16 -> bool, + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_any_le_u8_false, vec_any_le, u8x16 -> bool, + [42, 255, 255, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 254, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_any_le_i16_false, vec_any_le, i16x8 -> bool, + [2, 0, -1, 1, 1, 1, 1, 1], + [1, -1, -2, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_any_le_u16_false, vec_any_le, u16x8 -> bool, + [2, 42, 255, 1, 1, 1, 1, 1], + [1, 2, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_any_le_i32_false, vec_any_le, i32x4 -> bool, + [2, 0, 1, 1], + [1, -1, 0, 0], + false + } + + test_vec_2! { test_vec_any_le_u32_false, vec_any_le, u32x4 -> bool, + [4, 255, 4, 1], + [1, 2, 1, 0], + false + } + + test_vec_2! { test_vec_any_le_i8_true, vec_any_le, i8x16 -> bool, + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_le_u8_true, vec_any_le, u8x16 -> bool, + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_le_i16_true, vec_any_le, i16x8 -> bool, + [1, -1, 1, 0, 0, 0, 0, 0], + [0, -1, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_le_u16_true, vec_any_le, u16x8 -> bool, + [1, 255, 1, 0, 0, 0, 0, 0], + [0, 255, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_le_i32_true, vec_any_le, i32x4 -> bool, + [1, -1, 0, 1], + [0, -1, 0, 1], + true + } + + test_vec_2! { test_vec_any_le_u32_true, vec_any_le, u32x4 -> bool, + [1, 255, 0, 1], + [0, 255, 0, 1], + true + } + + test_vec_2! { test_vec_all_lt_i8_false, vec_all_lt, i8x16 -> bool, + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_lt_u8_false, vec_all_lt, u8x16 -> bool, + [0, 0, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_lt_i16_false, vec_all_lt, i16x8 -> bool, + [0, 0, -1, 1, 0, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_lt_u16_false, vec_all_lt, u16x8 -> bool, + [0, 0, 255, 1, 0, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_lt_i32_false, vec_all_lt, i32x4 -> bool, + [0, -1, 0, 1], + [1, -1, 0, 0], + false + } + + test_vec_2! { test_vec_all_lt_u32_false, vec_all_lt, u32x4 -> bool, + [0, 255, 1, 1], + [1, 255, 0, 0], + false + } + + test_vec_2! 
{ test_vec_all_lt_i8_true, vec_all_lt, i8x16 -> bool, + [0, 0, -2, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + [2, 1, -1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_lt_u8_true, vec_all_lt, u8x16 -> bool, + [0, 254, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 255, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + true + } + + test_vec_2! { test_vec_all_lt_i16_true, vec_all_lt, i16x8 -> bool, + [0, -5, 2, 0, 0, 0, 0, 0], + [1, -1, 42, 1, 1, 1, 1, 1], + true + } + + test_vec_2! { test_vec_all_lt_u16_true, vec_all_lt, u16x8 -> bool, + [2, 254, 0, 0, 0, 0, 0, 0], + [42, 255, 1, 1, 1, 1, 1, 1], + true + } + + test_vec_2! { test_vec_all_lt_i32_true, vec_all_lt, i32x4 -> bool, + [0, -2, 0, 0], + [1, -1, 1, 1], + true + } + + test_vec_2! { test_vec_all_lt_u32_true, vec_all_lt, u32x4 -> bool, + [0, 254, 0, 0], + [1, 255, 1, 1], + true + } + + test_vec_2! { test_vec_any_lt_i8_false, vec_any_lt, i8x16 -> bool, + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_any_lt_u8_false, vec_any_lt, u8x16 -> bool, + [42, 255, 255, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 254, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_any_lt_i16_false, vec_any_lt, i16x8 -> bool, + [2, 0, -1, 1, 1, 1, 1, 1], + [1, -1, -2, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_any_lt_u16_false, vec_any_lt, u16x8 -> bool, + [2, 42, 255, 1, 1, 1, 1, 1], + [1, 2, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_any_lt_i32_false, vec_any_lt, i32x4 -> bool, + [2, 0, 1, 1], + [1, -1, 0, 0], + false + } + + test_vec_2! { test_vec_any_lt_u32_false, vec_any_lt, u32x4 -> bool, + [4, 255, 4, 1], + [1, 2, 1, 0], + false + } + + test_vec_2! { test_vec_any_lt_i8_true, vec_any_lt, i8x16 -> bool, + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_lt_u8_true, vec_any_lt, u8x16 -> bool, + [0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_lt_i16_true, vec_any_lt, i16x8 -> bool, + [0, -1, 1, 0, 0, 0, 0, 0], + [1, -1, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_lt_u16_true, vec_any_lt, u16x8 -> bool, + [0, 255, 1, 0, 0, 0, 0, 0], + [1, 255, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_lt_i32_true, vec_any_lt, i32x4 -> bool, + [0, -1, 0, 1], + [1, -1, 0, 1], + true + } + + test_vec_2! { test_vec_any_lt_u32_true, vec_any_lt, u32x4 -> bool, + [0, 255, 0, 1], + [1, 255, 0, 1], + true + } + + test_vec_2! { test_vec_all_ne_i8_false, vec_all_ne, i8x16 -> bool, + [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_ne_u8_false, vec_all_ne, u8x16 -> bool, + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_ne_i16_false, vec_all_ne, i16x8 -> bool, + [1, -1, 0, 0, 0, 0, 0, 0], + [0, -1, 1, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_ne_u16_false, vec_all_ne, u16x8 -> bool, + [1, 255, 0, 0, 0, 0, 0, 0], + [0, 255, 0, 1, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_all_ne_i32_false, vec_all_ne, i32x4 -> bool, + [1, -1, 0, 0], + [0, -1, 0, 1], + false + } + + test_vec_2! 
{ test_vec_all_ne_u32_false, vec_all_ne, u32x4 -> bool, + [1, 255, 0, 0], + [0, 255, 0, 1], + false + } + + test_vec_2! { test_vec_all_ne_i8_true, vec_all_ne, i8x16 -> bool, + [0, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_ne_u8_true, vec_all_ne, u8x16 -> bool, + [0, 254, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_ne_i16_true, vec_all_ne, i16x8 -> bool, + [2, -2, 0, 1, 1, 1, 1, 1], + [1, -1, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_all_ne_u16_true, vec_all_ne, u16x8 -> bool, + [0, 254, 1, 1, 0, 0, 1, 0], + [1, 255, 0, 0, 1, 1, 0, 1], + true + } + + test_vec_2! { test_vec_all_ne_i32_true, vec_all_ne, i32x4 -> bool, + [0, -2, 0, 0], + [1, -1, 1, 1], + true + } + + test_vec_2! { test_vec_all_ne_u32_true, vec_all_ne, u32x4 -> bool, + [1, 255, 0, 0], + [0, 254, 1, 1], + true + } + + test_vec_2! { test_vec_any_ne_i8_false, vec_any_ne, i8x16 -> bool, + [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_any_ne_u8_false, vec_any_ne, u8x16 -> bool, + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_any_ne_i16_false, vec_any_ne, i16x8 -> bool, + [1, -1, 0, 0, 0, 0, 0, 0], + [1, -1, 0, 0, 0, 0, 0, 0], + false + } + + test_vec_2! { test_vec_any_ne_u16_false, vec_any_ne, u16x8 -> bool, + [1, 255, 1, 1, 1, 1, 1, 0], + [1, 255, 1, 1, 1, 1, 1, 0], + false + } + + test_vec_2! { test_vec_any_ne_i32_false, vec_any_ne, i32x4 -> bool, + [0, -1, 1, 1], + [0, -1, 1, 1], + false + } + + test_vec_2! { test_vec_any_ne_u32_false, vec_any_ne, u32x4 -> bool, + [1, 2, 1, 255], + [1, 2, 1, 255], + false + } + + test_vec_2! { test_vec_any_ne_i8_true, vec_any_ne, i8x16 -> bool, + [1, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_ne_u8_true, vec_any_ne, u8x16 -> bool, + [0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_ne_i16_true, vec_any_ne, i16x8 -> bool, + [0, -1, 1, 0, 0, 0, 0, 0], + [1, -1, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_ne_u16_true, vec_any_ne, u16x8 -> bool, + [0, 255, 1, 0, 0, 0, 0, 0], + [1, 255, 1, 0, 0, 0, 0, 0], + true + } + + test_vec_2! { test_vec_any_ne_i32_true, vec_any_ne, i32x4 -> bool, + [0, -1, 0, 1], + [1, -1, 0, 1], + true + } + + test_vec_2! { test_vec_any_ne_u32_true, vec_any_ne, u32x4 -> bool, + [0, 255, 0, 1], + [1, 255, 0, 1], + true + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_cmpb() { + let a: vector_float = transmute(f32x4::new(0.1, 0.5, 0.6, 0.9)); + let b: vector_float = transmute(f32x4::new(-0.1, 0.5, -0.6, 0.9)); + let d = i32x4::new( + -0b10000000000000000000000000000000, + 0, + -0b10000000000000000000000000000000, + 0, + ); + + assert_eq!(d, transmute(vec_cmpb(a, b))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_ceil() { + let a: vector_float = transmute(f32x4::new(0.1, 0.5, 0.6, 0.9)); + let d = f32x4::new(1.0, 1.0, 1.0, 1.0); + + assert_eq!(d, transmute(vec_ceil(a))); + } + + test_vec_2! 
{ test_vec_andc, vec_andc, i32x4, + [0b11001100, 0b11001100, 0b11001100, 0b11001100], + [0b00110011, 0b11110011, 0b00001100, 0b10000000], + [0b11001100, 0b00001100, 0b11000000, 0b01001100] } + + test_vec_2! { test_vec_and, vec_and, i32x4, + [0b11001100, 0b11001100, 0b11001100, 0b11001100], + [0b00110011, 0b11110011, 0b00001100, 0b00000000], + [0b00000000, 0b11000000, 0b00001100, 0b00000000] } + + macro_rules! test_vec_avg { + { $name: ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => { + test_vec_2! {$name, vec_avg, $ty, [$($a),+], [$($b),+], [$($d),+] } + } + } + + test_vec_avg! { test_vec_avg_i32x4, i32x4, + [i32::MIN, i32::MAX, 1, -1], + [-1, 1, 1, -1], + [-1073741824, 1073741824, 1, -1] } + + test_vec_avg! { test_vec_avg_u32x4, u32x4, + [u32::MAX, 0, 1, 2], + [2, 1, 0, 0], + [2147483649, 1, 1, 1] } + + test_vec_avg! { test_vec_avg_i16x8, i16x8, + [i16::MIN, i16::MAX, 1, -1, 0, 0, 0, 0], + [-1, 1, 1, -1, 0, 0, 0, 0], + [-16384, 16384, 1, -1, 0, 0, 0, 0] } + + test_vec_avg! { test_vec_avg_u16x8, u16x8, + [u16::MAX, 0, 1, 2, 0, 0, 0, 0], + [2, 1, 0, 0, 0, 0, 0, 0], + [32769, 1, 1, 1, 0, 0, 0, 0] } + + test_vec_avg! { test_vec_avg_i8x16, i8x16, + [i8::MIN, i8::MAX, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [-1, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [-64, 64, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] } + + test_vec_avg! { test_vec_avg_u8x16, u8x16, + [u8::MAX, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [129, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] } + + macro_rules! test_vec_adds { + { $name: ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => { + test_vec_2! {$name, vec_adds, $ty, [$($a),+], [$($b),+], [$($d),+] } + } + } + + test_vec_adds! { test_vec_adds_i32x4, i32x4, + [i32::MIN, i32::MAX, 1, -1], + [-1, 1, 1, -1], + [i32::MIN, i32::MAX, 2, -2] } + + test_vec_adds! { test_vec_adds_u32x4, u32x4, + [u32::MAX, 0, 1, 2], + [2, 1, 0, 0], + [u32::MAX, 1, 1, 2] } + + test_vec_adds! { test_vec_adds_i16x8, i16x8, + [i16::MIN, i16::MAX, 1, -1, 0, 0, 0, 0], + [-1, 1, 1, -1, 0, 0, 0, 0], + [i16::MIN, i16::MAX, 2, -2, 0, 0, 0, 0] } + + test_vec_adds! { test_vec_adds_u16x8, u16x8, + [u16::MAX, 0, 1, 2, 0, 0, 0, 0], + [2, 1, 0, 0, 0, 0, 0, 0], + [u16::MAX, 1, 1, 2, 0, 0, 0, 0] } + + test_vec_adds! { test_vec_adds_i8x16, i8x16, + [i8::MIN, i8::MAX, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [-1, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [i8::MIN, i8::MAX, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] } + + test_vec_adds! { test_vec_adds_u8x16, u8x16, + [u8::MAX, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [u8::MAX, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] } + + test_vec_2! { test_vec_addc, vec_addc, u32x4, [u32::MAX, 0, 0, 0], [1, 1, 1, 1], [1, 0, 0, 0] } + + macro_rules! test_vec_abs { + { $name: ident, $ty: ident, $a: expr, $d: expr } => { + #[simd_test(enable = "altivec")] + unsafe fn $name() { + let a = vec_splats($a); + let a: s_t_l!($ty) = vec_abs(a); + let d = $ty::splat($d); + assert_eq!(d, transmute(a)); + } + } + } + + test_vec_abs! { test_vec_abs_i8, i8x16, -42i8, 42i8 } + test_vec_abs! { test_vec_abs_i16, i16x8, -42i16, 42i16 } + test_vec_abs! { test_vec_abs_i32, i32x4, -42i32, 42i32 } + test_vec_abs! { test_vec_abs_f32, f32x4, -42f32, 42f32 } + + macro_rules! 
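// Annotation, not part of the patch: scalar sketches of the two behaviours
// pinned down by the `test_vec_avg` and `test_vec_adds` cases above, namely a
// rounding (round-half-up) average and lane-wise saturating addition.
fn avg_i32_sketch(a: i32, b: i32) -> i32 {
    ((a as i64 + b as i64 + 1) >> 1) as i32 // e.g. avg(i32::MAX, 1) == 1073741824
}
fn adds_i32_sketch(a: i32, b: i32) -> i32 {
    a.saturating_add(b) // e.g. adds(i32::MIN, -1) == i32::MIN
}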
test_vec_abss { + { $name: ident, $ty: ident, $a: expr, $d: expr } => { + #[simd_test(enable = "altivec")] + unsafe fn $name() { + let a = vec_splats($a); + let a: s_t_l!($ty) = vec_abss(a); + let d = $ty::splat($d); + assert_eq!(d, transmute(a)); + } + } + } + + test_vec_abss! { test_vec_abss_i8, i8x16, -127i8, 127i8 } + test_vec_abss! { test_vec_abss_i16, i16x8, -42i16, 42i16 } + test_vec_abss! { test_vec_abss_i32, i32x4, -42i32, 42i32 } + + macro_rules! test_vec_splats { + { $name: ident, $ty: ident, $a: expr } => { + #[simd_test(enable = "altivec")] + unsafe fn $name() { + let a: s_t_l!($ty) = vec_splats($a); + let d = $ty::splat($a); + assert_eq!(d, transmute(a)); + } + } + } + + test_vec_splats! { test_vec_splats_u8, u8x16, 42u8 } + test_vec_splats! { test_vec_splats_u16, u16x8, 42u16 } + test_vec_splats! { test_vec_splats_u32, u32x4, 42u32 } + test_vec_splats! { test_vec_splats_i8, i8x16, 42i8 } + test_vec_splats! { test_vec_splats_i16, i16x8, 42i16 } + test_vec_splats! { test_vec_splats_i32, i32x4, 42i32 } + test_vec_splats! { test_vec_splats_f32, f32x4, 42f32 } + + macro_rules! test_vec_splat { + { $name: ident, $fun: ident, $ty: ident, $a: expr, $b: expr} => { + #[simd_test(enable = "altivec")] + unsafe fn $name() { + let a = $fun::<$a>(); + let d = $ty::splat($b); + assert_eq!(d, transmute(a)); + } + } + } + + test_vec_splat! { test_vec_splat_u8, vec_splat_u8, u8x16, -1, u8::MAX } + test_vec_splat! { test_vec_splat_u16, vec_splat_u16, u16x8, -1, u16::MAX } + test_vec_splat! { test_vec_splat_u32, vec_splat_u32, u32x4, -1, u32::MAX } + test_vec_splat! { test_vec_splat_s8, vec_splat_s8, i8x16, -1, -1 } + test_vec_splat! { test_vec_splat_s16, vec_splat_s16, i16x8, -1, -1 } + test_vec_splat! { test_vec_splat_s32, vec_splat_s32, i32x4, -1, -1 } + + macro_rules! test_vec_sub { + { $name: ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => { + test_vec_2! {$name, vec_sub, $ty, [$($a),+], [$($b),+], [$($d),+] } + } + } + + test_vec_sub! { test_vec_sub_f32x4, f32x4, + [-1.0, 0.0, 1.0, 2.0], + [2.0, 1.0, -1.0, -2.0], + [-3.0, -1.0, 2.0, 4.0] } + + test_vec_sub! { test_vec_sub_i32x4, i32x4, + [-1, 0, 1, 2], + [2, 1, -1, -2], + [-3, -1, 2, 4] } + + test_vec_sub! { test_vec_sub_u32x4, u32x4, + [0, 0, 1, 2], + [2, 1, 0, 0], + [4294967294, 4294967295, 1, 2] } + + test_vec_sub! { test_vec_sub_i16x8, i16x8, + [-1, 0, 1, 2, -1, 0, 1, 2], + [2, 1, -1, -2, 2, 1, -1, -2], + [-3, -1, 2, 4, -3, -1, 2, 4] } + + test_vec_sub! { test_vec_sub_u16x8, u16x8, + [0, 0, 1, 2, 0, 0, 1, 2], + [2, 1, 0, 0, 2, 1, 0, 0], + [65534, 65535, 1, 2, 65534, 65535, 1, 2] } + + test_vec_sub! { test_vec_sub_i8x16, i8x16, + [-1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2], + [2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2], + [-3, -1, 2, 4, -3, -1, 2, 4, -3, -1, 2, 4, -3, -1, 2, 4] } + + test_vec_sub! { test_vec_sub_u8x16, u8x16, + [0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2], + [2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0], + [254, 255, 1, 2, 254, 255, 1, 2, 254, 255, 1, 2, 254, 255, 1, 2] } + + macro_rules! test_vec_subs { + { $name: ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => { + test_vec_2! {$name, vec_subs, $ty, [$($a),+], [$($b),+], [$($d),+] } + } + } + + test_vec_subs! { test_vec_subs_i32x4, i32x4, + [-1, 0, 1, 2], + [2, 1, -1, -2], + [-3, -1, 2, 4] } + + test_vec_subs! { test_vec_subs_u32x4, u32x4, + [0, 0, 1, 2], + [2, 1, 0, 0], + [0, 0, 1, 2] } + + test_vec_subs! 
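// Annotation, not part of the patch: the `test_vec_splats` cases above check
// a plain broadcast, and the `test_vec_sub` cases rely on two's-complement
// wrap-around. Scalar sketch:
fn splats_sketch(x: u32) -> [u32; 4] {
    [x; 4] // every lane receives the same value
}
fn sub_u32_sketch(a: u32, b: u32) -> u32 {
    a.wrapping_sub(b) // e.g. 0u32 minus 2 wraps to 4294967294, as tested above
}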
{ test_vec_subs_i16x8, i16x8, + [-1, 0, 1, 2, -1, 0, 1, 2], + [2, 1, -1, -2, 2, 1, -1, -2], + [-3, -1, 2, 4, -3, -1, 2, 4] } + + test_vec_subs! { test_vec_subs_u16x8, u16x8, + [0, 0, 1, 2, 0, 0, 1, 2], + [2, 1, 0, 0, 2, 1, 0, 0], + [0, 0, 1, 2, 0, 0, 1, 2] } + + test_vec_subs! { test_vec_subs_i8x16, i8x16, + [-1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2], + [2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2], + [-3, -1, 2, 4, -3, -1, 2, 4, -3, -1, 2, 4, -3, -1, 2, 4] } + + test_vec_subs! { test_vec_subs_u8x16, u8x16, + [0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2], + [2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0], + [0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2] } + + macro_rules! test_vec_min { + { $name: ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => { + #[simd_test(enable = "altivec")] + unsafe fn $name() { + let a: s_t_l!($ty) = transmute($ty::new($($a),+)); + let b: s_t_l!($ty) = transmute($ty::new($($b),+)); + + let d = $ty::new($($d),+); + let r : $ty = transmute(vec_min(a, b)); + assert_eq!(d, r); + } + } + } + + test_vec_min! { test_vec_min_i32x4, i32x4, + [-1, 0, 1, 2], + [2, 1, -1, -2], + [-1, 0, -1, -2] } + + test_vec_min! { test_vec_min_u32x4, u32x4, + [0, 0, 1, 2], + [2, 1, 0, 0], + [0, 0, 0, 0] } + + test_vec_min! { test_vec_min_i16x8, i16x8, + [-1, 0, 1, 2, -1, 0, 1, 2], + [2, 1, -1, -2, 2, 1, -1, -2], + [-1, 0, -1, -2, -1, 0, -1, -2] } + + test_vec_min! { test_vec_min_u16x8, u16x8, + [0, 0, 1, 2, 0, 0, 1, 2], + [2, 1, 0, 0, 2, 1, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0] } + + test_vec_min! { test_vec_min_i8x16, i8x16, + [-1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2], + [2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2], + [-1, 0, -1, -2, -1, 0, -1, -2, -1, 0, -1, -2, -1, 0, -1, -2] } + + test_vec_min! { test_vec_min_u8x16, u8x16, + [0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2], + [2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] } + + macro_rules! test_vec_max { + { $name: ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => { + #[simd_test(enable = "altivec")] + unsafe fn $name() { + let a: s_t_l!($ty) = transmute($ty::new($($a),+)); + let b: s_t_l!($ty) = transmute($ty::new($($b),+)); + + let d = $ty::new($($d),+); + let r : $ty = transmute(vec_max(a, b)); + assert_eq!(d, r); + } + } + } + + test_vec_max! { test_vec_max_i32x4, i32x4, + [-1, 0, 1, 2], + [2, 1, -1, -2], + [2, 1, 1, 2] } + + test_vec_max! { test_vec_max_u32x4, u32x4, + [0, 0, 1, 2], + [2, 1, 0, 0], + [2, 1, 1, 2] } + + test_vec_max! { test_vec_max_i16x8, i16x8, + [-1, 0, 1, 2, -1, 0, 1, 2], + [2, 1, -1, -2, 2, 1, -1, -2], + [2, 1, 1, 2, 2, 1, 1, 2] } + + test_vec_max! { test_vec_max_u16x8, u16x8, + [0, 0, 1, 2, 0, 0, 1, 2], + [2, 1, 0, 0, 2, 1, 0, 0], + [2, 1, 1, 2, 2, 1, 1, 2] } + + test_vec_max! { test_vec_max_i8x16, i8x16, + [-1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2], + [2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2], + [2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2] } + + test_vec_max! { test_vec_max_u8x16, u8x16, + [0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2], + [2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0], + [2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2] } + + macro_rules! 
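// Annotation, not part of the patch: the `test_vec_subs`, `test_vec_min` and
// `test_vec_max` cases above all have direct scalar counterparts, applied
// lane by lane.
fn subs_u8_sketch(a: u8, b: u8) -> u8 {
    a.saturating_sub(b) // 0u8 minus 2 saturates to 0 instead of wrapping
}
fn min_max_sketch(a: i32, b: i32) -> (i32, i32) {
    (a.min(b), a.max(b)) // lane-wise minimum and maximum
}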
test_vec_perm { + {$name:ident, + $shorttype:ident, $longtype:ident, + [$($a:expr),+], [$($b:expr),+], [$($c:expr),+], [$($d:expr),+]} => { + #[simd_test(enable = "altivec")] + unsafe fn $name() { + let a: $longtype = transmute($shorttype::new($($a),+)); + let b: $longtype = transmute($shorttype::new($($b),+)); + let c: vector_unsigned_char = transmute(u8x16::new($($c),+)); + let d = $shorttype::new($($d),+); + + let r: $shorttype = transmute(vec_perm(a, b, c)); + assert_eq!(d, r); + } + } + } + + test_vec_perm! {test_vec_perm_u8x16, + u8x16, vector_unsigned_char, + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 107]} + test_vec_perm! {test_vec_perm_i8x16, + i8x16, vector_signed_char, + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 107]} + + test_vec_perm! {test_vec_perm_m8x16, + m8x16, vector_bool_char, + [false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false], + [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [false, false, true, true, false, false, true, true, false, false, true, true, false, false, true, true]} + test_vec_perm! {test_vec_perm_u16x8, + u16x8, vector_unsigned_short, + [0, 1, 2, 3, 4, 5, 6, 7], + [10, 11, 12, 13, 14, 15, 16, 17], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [0, 10, 1, 11, 2, 12, 3, 13]} + test_vec_perm! {test_vec_perm_i16x8, + i16x8, vector_signed_short, + [0, 1, 2, 3, 4, 5, 6, 7], + [10, 11, 12, 13, 14, 15, 16, 17], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [0, 10, 1, 11, 2, 12, 3, 13]} + test_vec_perm! {test_vec_perm_m16x8, + m16x8, vector_bool_short, + [false, false, false, false, false, false, false, false], + [true, true, true, true, true, true, true, true], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [false, true, false, true, false, true, false, true]} + + test_vec_perm! {test_vec_perm_u32x4, + u32x4, vector_unsigned_int, + [0, 1, 2, 3], + [10, 11, 12, 13], + [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], + [0, 10, 1, 11]} + test_vec_perm! {test_vec_perm_i32x4, + i32x4, vector_signed_int, + [0, 1, 2, 3], + [10, 11, 12, 13], + [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], + [0, 10, 1, 11]} + test_vec_perm! {test_vec_perm_m32x4, + m32x4, vector_bool_int, + [false, false, false, false], + [true, true, true, true], + [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], + [false, true, false, true]} + test_vec_perm! 
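// Annotation, not part of the patch: a byte-level sketch of the permute that
// the `test_vec_perm` cases above rely on. Each control byte selects, via its
// low five bits, one byte out of the 32-byte concatenation of `a` and `b`,
// using the element order of the `u8x16::new` constructors in these tests.
fn perm_sketch(a: [u8; 16], b: [u8; 16], c: [u8; 16]) -> [u8; 16] {
    let mut r = [0u8; 16];
    for i in 0..16 {
        let idx = (c[i] & 0x1f) as usize;
        r[i] = if idx < 16 { a[idx] } else { b[idx - 16] };
    }
    r
}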
{test_vec_perm_f32x4, + f32x4, vector_float, + [0.0, 1.0, 2.0, 3.0], + [1.0, 1.1, 1.2, 1.3], + [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], + [0.0, 1.0, 1.0, 1.1]} + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_madds() { + let a: vector_signed_short = transmute(i16x8::new( + 0 * 256, + 1 * 256, + 2 * 256, + 3 * 256, + 4 * 256, + 5 * 256, + 6 * 256, + 7 * 256, + )); + let b: vector_signed_short = transmute(i16x8::new(256, 256, 256, 256, 256, 256, 256, 256)); + let c: vector_signed_short = transmute(i16x8::new(0, 1, 2, 3, 4, 5, 6, 7)); + + let d = i16x8::new(0, 3, 6, 9, 12, 15, 18, 21); + + assert_eq!(d, transmute(vec_madds(a, b, c))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_madd_float() { + let a: vector_float = transmute(f32x4::new(0.1, 0.2, 0.3, 0.4)); + let b: vector_float = transmute(f32x4::new(0.1, 0.2, 0.3, 0.4)); + let c: vector_float = transmute(f32x4::new(0.1, 0.2, 0.3, 0.4)); + let d = f32x4::new( + 0.1 * 0.1 + 0.1, + 0.2 * 0.2 + 0.2, + 0.3 * 0.3 + 0.3, + 0.4 * 0.4 + 0.4, + ); + + assert_eq!(d, transmute(vec_madd(a, b, c))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_nmsub_float() { + let a: vector_float = transmute(f32x4::new(0.1, 0.2, 0.3, 0.4)); + let b: vector_float = transmute(f32x4::new(0.1, 0.2, 0.3, 0.4)); + let c: vector_float = transmute(f32x4::new(0.1, 0.2, 0.3, 0.4)); + let d = f32x4::new( + -(0.1 * 0.1 - 0.1), + -(0.2 * 0.2 - 0.2), + -(0.3 * 0.3 - 0.3), + -(0.4 * 0.4 - 0.4), + ); + assert_eq!(d, transmute(vec_nmsub(a, b, c))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_mradds() { + let a: vector_signed_short = transmute(i16x8::new( + 0 * 256, + 1 * 256, + 2 * 256, + 3 * 256, + 4 * 256, + 5 * 256, + 6 * 256, + 7 * 256, + )); + let b: vector_signed_short = transmute(i16x8::new(256, 256, 256, 256, 256, 256, 256, 256)); + let c: vector_signed_short = transmute(i16x8::new(0, 1, 2, 3, 4, 5, 6, i16::MAX - 1)); + + let d = i16x8::new(0, 3, 6, 9, 12, 15, 18, i16::MAX); + + assert_eq!(d, transmute(vec_mradds(a, b, c))); + } + + macro_rules! test_vec_mladd { + {$name:ident, $sa:ident, $la:ident, $sbc:ident, $lbc:ident, $sd:ident, + [$($a:expr),+], [$($b:expr),+], [$($c:expr),+], [$($d:expr),+]} => { + #[simd_test(enable = "altivec")] + unsafe fn $name() { + let a: $la = transmute($sa::new($($a),+)); + let b: $lbc = transmute($sbc::new($($b),+)); + let c = transmute($sbc::new($($c),+)); + let d = $sd::new($($d),+); + + assert_eq!(d, transmute(vec_mladd(a, b, c))); + } + } + } + + test_vec_mladd! { test_vec_mladd_u16x8_u16x8, u16x8, vector_unsigned_short, u16x8, vector_unsigned_short, u16x8, + [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 6, 12, 20, 30, 42, 56] + } + test_vec_mladd! { test_vec_mladd_u16x8_i16x8, u16x8, vector_unsigned_short, i16x8, vector_unsigned_short, i16x8, + [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 6, 12, 20, 30, 42, 56] + } + test_vec_mladd! { test_vec_mladd_i16x8_u16x8, i16x8, vector_signed_short, u16x8, vector_unsigned_short, i16x8, + [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 6, 12, 20, 30, 42, 56] + } + test_vec_mladd! 
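// Annotation, not part of the patch: scalar sketches of the multiply-add
// variants exercised above. `vec_mladd` keeps the low 16 bits of a*b and adds
// c; `vec_madds` takes the product's upper 17 bits (a*b >> 15) and adds c
// with saturation (`vec_mradds` additionally rounds before the shift).
fn mladd_sketch(a: i16, b: i16, c: i16) -> i16 {
    a.wrapping_mul(b).wrapping_add(c) // e.g. 7*7 + 7 == 56, as in the tests above
}
fn madds_sketch(a: i16, b: i16, c: i16) -> i16 {
    let t = ((a as i32 * b as i32) >> 15) + c as i32;
    t.clamp(i16::MIN as i32, i16::MAX as i32) as i16
}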
{ test_vec_mladd_i16x8_i16x8, i16x8, vector_signed_short, i16x8, vector_unsigned_short, i16x8, + [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 6, 12, 20, 30, 42, 56] + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_msum_unsigned_char() { + let a: vector_unsigned_char = + transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7)); + let b: vector_unsigned_char = transmute(u8x16::new( + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + )); + let c: vector_unsigned_int = transmute(u32x4::new(0, 1, 2, 3)); + let d = u32x4::new( + (0 + 1 + 2 + 3) * 255 + 0, + (4 + 5 + 6 + 7) * 255 + 1, + (0 + 1 + 2 + 3) * 255 + 2, + (4 + 5 + 6 + 7) * 255 + 3, + ); + + assert_eq!(d, transmute(vec_msum(a, b, c))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_msum_signed_char() { + let a: vector_signed_char = transmute(i8x16::new( + 0, -1, 2, -3, 1, -1, 1, -1, 0, 1, 2, 3, 4, -5, -6, -7, + )); + let b: vector_unsigned_char = + transmute(i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)); + let c: vector_signed_int = transmute(u32x4::new(0, 1, 2, 3)); + let d = i32x4::new( + (0 - 1 + 2 - 3) + 0, + (0) + 1, + (0 + 1 + 2 + 3) + 2, + (4 - 5 - 6 - 7) + 3, + ); + + assert_eq!(d, transmute(vec_msum(a, b, c))); + } + #[simd_test(enable = "altivec")] + unsafe fn test_vec_msum_unsigned_short() { + let a: vector_unsigned_short = transmute(u16x8::new( + 0 * 256, + 1 * 256, + 2 * 256, + 3 * 256, + 4 * 256, + 5 * 256, + 6 * 256, + 7 * 256, + )); + let b: vector_unsigned_short = + transmute(u16x8::new(256, 256, 256, 256, 256, 256, 256, 256)); + let c: vector_unsigned_int = transmute(u32x4::new(0, 1, 2, 3)); + let d = u32x4::new( + (0 + 1) * 256 * 256 + 0, + (2 + 3) * 256 * 256 + 1, + (4 + 5) * 256 * 256 + 2, + (6 + 7) * 256 * 256 + 3, + ); + + assert_eq!(d, transmute(vec_msum(a, b, c))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_msum_signed_short() { + let a: vector_signed_short = transmute(i16x8::new( + 0 * 256, + -1 * 256, + 2 * 256, + -3 * 256, + 4 * 256, + -5 * 256, + 6 * 256, + -7 * 256, + )); + let b: vector_signed_short = transmute(i16x8::new(256, 256, 256, 256, 256, 256, 256, 256)); + let c: vector_signed_int = transmute(i32x4::new(0, 1, 2, 3)); + let d = i32x4::new( + (0 - 1) * 256 * 256 + 0, + (2 - 3) * 256 * 256 + 1, + (4 - 5) * 256 * 256 + 2, + (6 - 7) * 256 * 256 + 3, + ); + + assert_eq!(d, transmute(vec_msum(a, b, c))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_msums_unsigned() { + let a: vector_unsigned_short = transmute(u16x8::new( + 0 * 256, + 1 * 256, + 2 * 256, + 3 * 256, + 4 * 256, + 5 * 256, + 6 * 256, + 7 * 256, + )); + let b: vector_unsigned_short = + transmute(u16x8::new(256, 256, 256, 256, 256, 256, 256, 256)); + let c: vector_unsigned_int = transmute(u32x4::new(0, 1, 2, 3)); + let d = u32x4::new( + (0 + 1) * 256 * 256 + 0, + (2 + 3) * 256 * 256 + 1, + (4 + 5) * 256 * 256 + 2, + (6 + 7) * 256 * 256 + 3, + ); + + assert_eq!(d, transmute(vec_msums(a, b, c))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_msums_signed() { + let a: vector_signed_short = transmute(i16x8::new( + 0 * 256, + -1 * 256, + 2 * 256, + -3 * 256, + 4 * 256, + -5 * 256, + 6 * 256, + -7 * 256, + )); + let b: vector_signed_short = transmute(i16x8::new(256, 256, 256, 256, 256, 256, 256, 256)); + let c: vector_signed_int = transmute(i32x4::new(0, 1, 2, 3)); + let d = i32x4::new( + (0 - 1) * 256 * 256 + 0, + (2 - 3) * 256 * 256 + 1, + (4 - 5) * 256 * 256 + 2, + (6 - 
7) * 256 * 256 + 3, + ); + + assert_eq!(d, transmute(vec_msums(a, b, c))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_sum2s() { + let a: vector_signed_int = transmute(i32x4::new(0, 1, 2, 3)); + let b: vector_signed_int = transmute(i32x4::new(0, 1, 2, 3)); + let d = i32x4::new(0, 0 + 1 + 1, 0, 2 + 3 + 3); + + assert_eq!(d, transmute(vec_sum2s(a, b))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_sum4s_unsigned_char() { + let a: vector_unsigned_char = + transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7)); + let b: vector_unsigned_int = transmute(u32x4::new(0, 1, 2, 3)); + let d = u32x4::new( + 0 + 1 + 2 + 3 + 0, + 4 + 5 + 6 + 7 + 1, + 0 + 1 + 2 + 3 + 2, + 4 + 5 + 6 + 7 + 3, + ); + + assert_eq!(d, transmute(vec_sum4s(a, b))); + } + #[simd_test(enable = "altivec")] + unsafe fn test_vec_sum4s_signed_char() { + let a: vector_signed_char = + transmute(i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7)); + let b: vector_signed_int = transmute(i32x4::new(0, 1, 2, 3)); + let d = i32x4::new( + 0 + 1 + 2 + 3 + 0, + 4 + 5 + 6 + 7 + 1, + 0 + 1 + 2 + 3 + 2, + 4 + 5 + 6 + 7 + 3, + ); + + assert_eq!(d, transmute(vec_sum4s(a, b))); + } + #[simd_test(enable = "altivec")] + unsafe fn test_vec_sum4s_signed_short() { + let a: vector_signed_short = transmute(i16x8::new(0, 1, 2, 3, 4, 5, 6, 7)); + let b: vector_signed_int = transmute(i32x4::new(0, 1, 2, 3)); + let d = i32x4::new(0 + 1 + 0, 2 + 3 + 1, 4 + 5 + 2, 6 + 7 + 3); + + assert_eq!(d, transmute(vec_sum4s(a, b))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_mule_unsigned_char() { + let a: vector_unsigned_char = + transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7)); + let d = u16x8::new(0 * 0, 2 * 2, 4 * 4, 6 * 6, 0 * 0, 2 * 2, 4 * 4, 6 * 6); + + assert_eq!(d, transmute(vec_mule(a, a))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_mule_signed_char() { + let a: vector_signed_char = transmute(i8x16::new( + 0, 1, -2, 3, -4, 5, -6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + )); + let d = i16x8::new(0 * 0, 2 * 2, 4 * 4, 6 * 6, 0 * 0, 2 * 2, 4 * 4, 6 * 6); + + assert_eq!(d, transmute(vec_mule(a, a))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_mule_unsigned_short() { + let a: vector_unsigned_short = transmute(u16x8::new(0, 1, 2, 3, 4, 5, 6, 7)); + let d = u32x4::new(0 * 0, 2 * 2, 4 * 4, 6 * 6); + + assert_eq!(d, transmute(vec_mule(a, a))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_mule_signed_short() { + let a: vector_signed_short = transmute(i16x8::new(0, 1, -2, 3, -4, 5, -6, 7)); + let d = i32x4::new(0 * 0, 2 * 2, 4 * 4, 6 * 6); + + assert_eq!(d, transmute(vec_mule(a, a))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_mulo_unsigned_char() { + let a: vector_unsigned_char = + transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7)); + let d = u16x8::new(1 * 1, 3 * 3, 5 * 5, 7 * 7, 1 * 1, 3 * 3, 5 * 5, 7 * 7); + + assert_eq!(d, transmute(vec_mulo(a, a))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_mulo_signed_char() { + let a: vector_signed_char = transmute(i8x16::new( + 0, 1, -2, 3, -4, 5, -6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + )); + let d = i16x8::new(1 * 1, 3 * 3, 5 * 5, 7 * 7, 1 * 1, 3 * 3, 5 * 5, 7 * 7); + + assert_eq!(d, transmute(vec_mulo(a, a))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_mulo_unsigned_short() { + let a: vector_unsigned_short = transmute(u16x8::new(0, 1, 2, 3, 4, 5, 6, 7)); + let d = u32x4::new(1 * 1, 3 * 3, 5 * 5, 7 * 7); + + assert_eq!(d, 
transmute(vec_mulo(a, a))); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_mulo_signed_short() { + let a: vector_signed_short = transmute(i16x8::new(0, 1, -2, 3, -4, 5, -6, 7)); + let d = i32x4::new(1 * 1, 3 * 3, 5 * 5, 7 * 7); + + assert_eq!(d, transmute(vec_mulo(a, a))); + } + + #[simd_test(enable = "altivec")] + unsafe fn vec_add_i32x4_i32x4() { + let x = i32x4::new(1, 2, 3, 4); + let y = i32x4::new(4, 3, 2, 1); + let x: vector_signed_int = transmute(x); + let y: vector_signed_int = transmute(y); + let z = vec_add(x, y); + assert_eq!(i32x4::splat(5), transmute(z)); + } + + #[simd_test(enable = "altivec")] + unsafe fn vec_ctf_u32() { + let v: vector_unsigned_int = transmute(u32x4::new(u32::MIN, u32::MAX, u32::MAX, 42)); + let v2 = vec_ctf::<1, _>(v); + let r2: vector_float = transmute(f32x4::new(0.0, 2147483600.0, 2147483600.0, 21.0)); + let v4 = vec_ctf::<2, _>(v); + let r4: vector_float = transmute(f32x4::new(0.0, 1073741800.0, 1073741800.0, 10.5)); + let v8 = vec_ctf::<3, _>(v); + let r8: vector_float = transmute(f32x4::new(0.0, 536870900.0, 536870900.0, 5.25)); + + let check = |a, b| { + let r = transmute(vec_cmple(vec_abs(vec_sub(a, b)), vec_splats(f32::EPSILON))); + let e = m32x4::new(true, true, true, true); + assert_eq!(e, r); + }; + + check(v2, r2); + check(v4, r4); + check(v8, r8); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_ctu() { + let v = u32x4::new(u32::MIN, u32::MAX, u32::MAX, 42); + let v2: u32x4 = transmute(vec_ctu::<1>(transmute(f32x4::new( + 0.0, + 2147483600.0, + 2147483600.0, + 21.0, + )))); + let v4: u32x4 = transmute(vec_ctu::<2>(transmute(f32x4::new( + 0.0, + 1073741800.0, + 1073741800.0, + 10.5, + )))); + let v8: u32x4 = transmute(vec_ctu::<3>(transmute(f32x4::new( + 0.0, + 536870900.0, + 536870900.0, + 5.25, + )))); + + assert_eq!(v2, v); + assert_eq!(v4, v); + assert_eq!(v8, v); + } + + #[simd_test(enable = "altivec")] + unsafe fn vec_ctf_i32() { + let v: vector_signed_int = transmute(i32x4::new(i32::MIN, i32::MAX, i32::MAX - 42, 42)); + let v2 = vec_ctf::<1, _>(v); + let r2: vector_float = + transmute(f32x4::new(-1073741800.0, 1073741800.0, 1073741800.0, 21.0)); + let v4 = vec_ctf::<2, _>(v); + let r4: vector_float = transmute(f32x4::new(-536870900.0, 536870900.0, 536870900.0, 10.5)); + let v8 = vec_ctf::<3, _>(v); + let r8: vector_float = transmute(f32x4::new(-268435460.0, 268435460.0, 268435460.0, 5.25)); + + let check = |a, b| { + let r = transmute(vec_cmple(vec_abs(vec_sub(a, b)), vec_splats(f32::EPSILON))); + println!("{:?} {:?}", a, b); + let e = m32x4::new(true, true, true, true); + assert_eq!(e, r); + }; + + check(v2, r2); + check(v4, r4); + check(v8, r8); + } + + #[simd_test(enable = "altivec")] + unsafe fn test_vec_cts() { + let v = i32x4::new(i32::MIN, i32::MAX, i32::MAX, 42); + let v2: i32x4 = transmute(vec_cts::<1>(transmute(f32x4::new( + -1073741800.0, + 1073741800.0, + 1073741800.0, + 21.0, + )))); + let v4: i32x4 = transmute(vec_cts::<2>(transmute(f32x4::new( + -536870900.0, + 536870900.0, + 536870900.0, + 10.5, + )))); + let v8: i32x4 = transmute(vec_cts::<3>(transmute(f32x4::new( + -268435460.0, + 268435460.0, + 268435460.0, + 5.25, + )))); + + assert_eq!(v2, v); + assert_eq!(v4, v); + assert_eq!(v8, v); + } + + test_vec_2! 
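// Annotation, not part of the patch: the conversion tests above all encode
// the same fixed-point scale factor. `vec_ctf::<N>` converts integer lanes to
// float and divides by 2^N, while `vec_cts::<N>` and `vec_ctu::<N>` multiply
// by 2^N before converting back, saturating to the target integer range.
fn ctf_sketch(v: i32, n: u32) -> f32 {
    v as f32 / (1u64 << n) as f32 // e.g. i32::MAX with N = 1 is roughly 1073741800.0
}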
{ test_vec_rl, vec_rl, u32x4, + [0x12345678, 0x9ABCDEF0, 0x0F0F0F0F, 0x12345678], + [4, 8, 12, 68], + [0x23456781, 0xBCDEF09A, 0xF0F0F0F0, 0x23456781] + } +} diff --git a/library/stdarch/crates/core_arch/src/powerpc/macros.rs b/library/stdarch/crates/core_arch/src/powerpc/macros.rs new file mode 100644 index 0000000000000..af47494e8fb40 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/powerpc/macros.rs @@ -0,0 +1,315 @@ +macro_rules! test_impl { + ($fun:ident ($($v:ident : $ty:ty),*) -> $r:ty [$call:ident, $instr:ident]) => { + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(test, assert_instr($instr))] + pub unsafe fn $fun ($($v : $ty),*) -> $r { + $call ($($v),*) + } + }; + ($fun:ident ($($v:ident : $ty:ty),*) -> $r:ty [$call:ident, $instr_altivec:ident / $instr_vsx:ident]) => { + test_impl! { $fun ($($v : $ty),*) -> $r [$call, $instr_altivec / $instr_vsx / $instr_vsx] } + }; + ($fun:ident ($($v:ident : $ty:ty),*) -> $r:ty [$call:ident, $instr_altivec:ident / $instr_vsx:ident / $instr_pwr9:ident]) => { + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr(all(test, not(target_feature="vsx"), not(target_feature = "power9-vector")), assert_instr($instr_altivec))] + #[cfg_attr(all(test, target_feature="vsx", not(target_feature = "power9-vector")), assert_instr($instr_vsx))] + #[cfg_attr(all(test, not(target_feature="vsx"), target_feature = "power9-vector"), assert_instr($instr_pwr9))] + pub unsafe fn $fun ($($v : $ty),*) -> $r { + $call ($($v),*) + } + } +} + +#[allow(unknown_lints, unused_macro_rules)] +macro_rules! impl_vec_trait { + ([$Trait:ident $m:ident] $fun:ident ($a:ty)) => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl $Trait for $a { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn $m(self) -> Self { + $fun(transmute(self)) + } + } + }; + ([$Trait:ident $m:ident] $fun:ident ($a:ty) -> $r:ty) => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl $Trait for $a { + type Result = $r; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn $m(self) -> Self::Result { + $fun(transmute(self)) + } + } + }; + ([$Trait:ident $m:ident]+ $fun:ident ($a:ty) -> $r:ty) => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl $Trait for $a { + type Result = $r; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn $m(self) -> Self::Result { + transmute($fun(transmute(self))) + } + } + }; + ([$Trait:ident $m:ident] 1 ($ub:ident, $sb:ident, $uh:ident, $sh:ident, $uw:ident, $sw:ident, $sf: ident)) => { + impl_vec_trait!{ [$Trait $m] $ub (vector_unsigned_char) -> vector_unsigned_char } + impl_vec_trait!{ [$Trait $m] $sb (vector_signed_char) -> vector_signed_char } + impl_vec_trait!{ [$Trait $m] $uh (vector_unsigned_short) -> vector_unsigned_short } + impl_vec_trait!{ [$Trait $m] $sh (vector_signed_short) -> vector_signed_short } + impl_vec_trait!{ [$Trait $m] $uw (vector_unsigned_int) -> vector_unsigned_int } + impl_vec_trait!{ [$Trait $m] $sw (vector_signed_int) -> vector_signed_int } + impl_vec_trait!{ [$Trait $m] $sf (vector_float) -> vector_float } + }; + ([$Trait:ident $m:ident] $fun:ident ($a:ty, $b:ty) -> $r:ty) => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl $Trait<$b> for $a { + type Result = $r; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn $m(self, b: $b) -> Self::Result { + $fun(transmute(self), transmute(b)) + } + } + }; + ([$Trait:ident $m:ident]+ $fun:ident ($a:ty, $b:ty) -> $r:ty) => { + #[unstable(feature = 
"stdarch_powerpc", issue = "111145")] + impl $Trait<$b> for $a { + type Result = $r; + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn $m(self, b: $b) -> Self::Result { + transmute($fun(transmute(self), transmute(b))) + } + } + }; + ([$Trait:ident $m:ident] $fun:ident ($a:ty, ~$b:ty) -> $r:ty) => { + impl_vec_trait!{ [$Trait $m] $fun ($a, $a) -> $r } + impl_vec_trait!{ [$Trait $m] $fun ($a, $b) -> $r } + impl_vec_trait!{ [$Trait $m] $fun ($b, $a) -> $r } + }; + ([$Trait:ident $m:ident] ~($ub:ident, $sb:ident, $uh:ident, $sh:ident, $uw:ident, $sw:ident)) => { + impl_vec_trait!{ [$Trait $m] $ub (vector_unsigned_char, ~vector_bool_char) -> vector_unsigned_char } + impl_vec_trait!{ [$Trait $m] $sb (vector_signed_char, ~vector_bool_char) -> vector_signed_char } + impl_vec_trait!{ [$Trait $m] $uh (vector_unsigned_short, ~vector_bool_short) -> vector_unsigned_short } + impl_vec_trait!{ [$Trait $m] $sh (vector_signed_short, ~vector_bool_short) -> vector_signed_short } + impl_vec_trait!{ [$Trait $m] $uw (vector_unsigned_int, ~vector_bool_int) -> vector_unsigned_int } + impl_vec_trait!{ [$Trait $m] $sw (vector_signed_int, ~vector_bool_int) -> vector_signed_int } + }; + ([$Trait:ident $m:ident] ~($fn:ident)) => { + impl_vec_trait!{ [$Trait $m] ~($fn, $fn, $fn, $fn, $fn, $fn) } + }; + ([$Trait:ident $m:ident] 2 ($ub:ident, $sb:ident, $uh:ident, $sh:ident, $uw:ident, $sw:ident)) => { + impl_vec_trait!{ [$Trait $m] $ub (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char } + impl_vec_trait!{ [$Trait $m] $sb (vector_signed_char, vector_signed_char) -> vector_signed_char } + impl_vec_trait!{ [$Trait $m] $uh (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_short } + impl_vec_trait!{ [$Trait $m] $sh (vector_signed_short, vector_signed_short) -> vector_signed_short } + impl_vec_trait!{ [$Trait $m] $uw (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_int } + impl_vec_trait!{ [$Trait $m] $sw (vector_signed_int, vector_signed_int) -> vector_signed_int } + }; + ([$Trait:ident $m:ident] 2 ($fn:ident)) => { + impl_vec_trait!{ [$Trait $m] ($fn, $fn, $fn, $fn, $fn, $fn) } + }; + ([$Trait:ident $m:ident]+ 2b ($b:ident, $h:ident, $w:ident)) => { + impl_vec_trait!{ [$Trait $m]+ $b (vector_bool_char, vector_bool_char) -> vector_bool_char } + impl_vec_trait!{ [$Trait $m]+ $b (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char } + impl_vec_trait!{ [$Trait $m]+ $b (vector_signed_char, vector_signed_char) -> vector_signed_char } + impl_vec_trait!{ [$Trait $m]+ $h (vector_bool_short, vector_bool_short) -> vector_bool_short } + impl_vec_trait!{ [$Trait $m]+ $h (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_short } + impl_vec_trait!{ [$Trait $m]+ $h (vector_signed_short, vector_signed_short) -> vector_signed_short } + impl_vec_trait!{ [$Trait $m]+ $w (vector_bool_int, vector_bool_int) -> vector_bool_int } + impl_vec_trait!{ [$Trait $m]+ $w (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_int } + impl_vec_trait!{ [$Trait $m]+ $w (vector_signed_int, vector_signed_int) -> vector_signed_int } + }; + ([$Trait:ident $m:ident]+ 2b ($fn:ident)) => { + impl_vec_trait!{ [$Trait $m]+ 2b ($fn, $fn, $fn) } + }; +} + +macro_rules! 
s_t_l { + (i32x4) => { + vector_signed_int + }; + (i16x8) => { + vector_signed_short + }; + (i8x16) => { + vector_signed_char + }; + + (u32x4) => { + vector_unsigned_int + }; + (u16x8) => { + vector_unsigned_short + }; + (u8x16) => { + vector_unsigned_char + }; + + (f32x4) => { + vector_float + }; +} + +macro_rules! t_t_l { + (i32) => { + vector_signed_int + }; + (i16) => { + vector_signed_short + }; + (i8) => { + vector_signed_char + }; + + (u32) => { + vector_unsigned_int + }; + (u16) => { + vector_unsigned_short + }; + (u8) => { + vector_unsigned_char + }; + + (f32) => { + vector_float + }; +} + +macro_rules! t_t_s { + (i32) => { + i32x4 + }; + (i16) => { + i16x8 + }; + (i8) => { + i8x16 + }; + + (u32) => { + u32x4 + }; + (u16) => { + u16x8 + }; + (u8) => { + u8x16 + }; + + (f32) => { + f32x4 + }; +} + +macro_rules! t_u { + (vector_bool_char) => { + vector_unsigned_char + }; + (vector_bool_short) => { + vector_unsigned_short + }; + (vector_bool_int) => { + vector_unsigned_int + }; + (vector_unsigned_char) => { + vector_unsigned_char + }; + (vector_unsigned_short) => { + vector_unsigned_short + }; + (vector_unsigned_int) => { + vector_unsigned_int + }; + (vector_signed_char) => { + vector_unsigned_char + }; + (vector_signed_short) => { + vector_unsigned_short + }; + (vector_signed_int) => { + vector_unsigned_int + }; + (vector_float) => { + vector_unsigned_int + }; +} + +macro_rules! t_b { + (vector_bool_char) => { + vector_bool_char + }; + (vector_bool_short) => { + vector_bool_short + }; + (vector_bool_int) => { + vector_bool_int + }; + (vector_signed_char) => { + vector_bool_char + }; + (vector_signed_short) => { + vector_bool_short + }; + (vector_signed_int) => { + vector_bool_int + }; + (vector_unsigned_char) => { + vector_bool_char + }; + (vector_unsigned_short) => { + vector_bool_short + }; + (vector_unsigned_int) => { + vector_bool_int + }; + (vector_float) => { + vector_bool_int + }; +} + +macro_rules! impl_from { + ($s: ident) => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl From<$s> for s_t_l!($s) { + fn from (v: $s) -> Self { + unsafe { + transmute(v) + } + } + } + }; + ($($s: ident),*) => { + $( + impl_from! { $s } + )* + }; +} + +macro_rules! impl_neg { + ($s: ident : $zero: expr) => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl crate::ops::Neg for s_t_l!($s) { + type Output = s_t_l!($s); + fn neg(self) -> Self::Output { + unsafe { simd_neg(self) } + } + } + }; +} + +pub(crate) use impl_from; +pub(crate) use impl_neg; +pub(crate) use impl_vec_trait; +pub(crate) use s_t_l; +pub(crate) use t_b; +pub(crate) use t_t_l; +pub(crate) use t_t_s; +pub(crate) use t_u; +pub(crate) use test_impl; diff --git a/library/stdarch/crates/core_arch/src/powerpc/mod.rs b/library/stdarch/crates/core_arch/src/powerpc/mod.rs new file mode 100644 index 0000000000000..53227215d946c --- /dev/null +++ b/library/stdarch/crates/core_arch/src/powerpc/mod.rs @@ -0,0 +1,22 @@ +//! PowerPC intrinsics + +pub(crate) mod macros; + +mod altivec; +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub use self::altivec::*; + +mod vsx; +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub use self::vsx::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Generates the trap instruction `TRAP` +#[cfg_attr(test, assert_instr(trap))] +#[inline] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn trap() -> ! 
{ + crate::intrinsics::abort() +} diff --git a/library/stdarch/crates/core_arch/src/powerpc/vsx.rs b/library/stdarch/crates/core_arch/src/powerpc/vsx.rs new file mode 100644 index 0000000000000..ca9fcaabe8b22 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/powerpc/vsx.rs @@ -0,0 +1,240 @@ +//! PowerPC Vector Scalar eXtensions (VSX) intrinsics. +//! +//! The references are: [POWER ISA v2.07B (for POWER8 & POWER8 with NVIDIA +//! NVlink)] and [POWER ISA v3.0B (for POWER9)]. +//! +//! [POWER ISA v2.07B (for POWER8 & POWER8 with NVIDIA NVlink)]: https://ibm.box.com/s/jd5w15gz301s5b5dt375mshpq9c3lh4u +//! [POWER ISA v3.0B (for POWER9)]: https://ibm.box.com/s/1hzcwkwf8rbju5h9iyf44wm94amnlcrv + +#![allow(non_camel_case_types)] + +use crate::core_arch::powerpc::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +use crate::mem::transmute; + +types! { + #![unstable(feature = "stdarch_powerpc", issue = "111145")] + + // pub struct vector_Float16 = f16x8; + /// PowerPC-specific 128-bit wide vector of two packed `i64` + pub struct vector_signed_long(2 x i64); + /// PowerPC-specific 128-bit wide vector of two packed `u64` + pub struct vector_unsigned_long(2 x u64); + /// PowerPC-specific 128-bit wide vector mask of two `i64` + pub struct vector_bool_long(2 x i64); + /// PowerPC-specific 128-bit wide vector of two packed `f64` + pub struct vector_double(2 x f64); + // pub struct vector_signed_long_long = vector_signed_long; + // pub struct vector_unsigned_long_long = vector_unsigned_long; + // pub struct vector_bool_long_long = vector_bool_long; + // pub struct vector_signed___int128 = i128x1; + // pub struct vector_unsigned___int128 = i128x1; +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.ppc.altivec.vperm"] + fn vperm( + a: vector_signed_int, + b: vector_signed_int, + c: vector_unsigned_char, + ) -> vector_signed_int; +} + +mod sealed { + use super::*; + use crate::core_arch::simd::*; + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorPermDI { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + unsafe fn vec_xxpermdi(self, b: Self, dm: u8) -> Self; + } + + // xxpermdi has an big-endian bias and extended mnemonics + #[inline] + #[target_feature(enable = "vsx")] + #[cfg_attr(all(test, target_endian = "little"), assert_instr(xxmrgld, dm = 0x0))] + #[cfg_attr(all(test, target_endian = "big"), assert_instr(xxspltd, dm = 0x0))] + unsafe fn xxpermdi(a: vector_signed_long, b: vector_signed_long, dm: u8) -> vector_signed_long { + let a: i64x2 = transmute(a); + let b: i64x2 = transmute(b); + let r: i64x2 = match dm & 0b11 { + 0 => simd_shuffle!(a, b, [0b00, 0b10]), + 1 => simd_shuffle!(a, b, [0b01, 0b10]), + 2 => simd_shuffle!(a, b, [0b00, 0b11]), + _ => simd_shuffle!(a, b, [0b01, 0b11]), + }; + transmute(r) + } + + macro_rules! vec_xxpermdi { + {$impl: ident} => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorPermDI for $impl { + #[inline] + #[target_feature(enable = "vsx")] + unsafe fn vec_xxpermdi(self, b: Self, dm: u8) -> Self { + transmute(xxpermdi(transmute(self), transmute(b), dm)) + } + } + } + } + + vec_xxpermdi! { vector_unsigned_long } + vec_xxpermdi! { vector_signed_long } + vec_xxpermdi! { vector_bool_long } + vec_xxpermdi! 
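// Annotation, not part of the patch: the two `dm` bits of `xxpermdi` pick one
// doubleword from each input, exactly as the `simd_shuffle!` arms above
// spell out. Sketch over plain arrays:
fn xxpermdi_sketch(a: [i64; 2], b: [i64; 2], dm: u8) -> [i64; 2] {
    match dm & 0b11 {
        0 => [a[0], b[0]],
        1 => [a[1], b[0]],
        2 => [a[0], b[1]],
        _ => [a[1], b[1]],
    }
}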
{ vector_double } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorMergeEo { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + unsafe fn vec_mergee(self, b: Self) -> Self; + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + unsafe fn vec_mergeo(self, b: Self) -> Self; + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr( + all(test, target_endian = "little", target_feature = "power8-vector"), + assert_instr(vmrgow) + )] + #[cfg_attr( + all(test, target_endian = "big", target_feature = "power8-vector"), + assert_instr(vmrgew) + )] + unsafe fn mergee(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int { + let p = transmute(u8x16::new( + 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, + 0x1A, 0x1B, + )); + vec_perm(a, b, p) + } + + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr( + all(test, target_endian = "little", target_feature = "power8-vector"), + assert_instr(vmrgew) + )] + #[cfg_attr( + all(test, target_endian = "big", target_feature = "power8-vector"), + assert_instr(vmrgow) + )] + unsafe fn mergeo(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int { + let p = transmute(u8x16::new( + 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, + 0x1E, 0x1F, + )); + vec_perm(a, b, p) + } + + macro_rules! vec_mergeeo { + { $impl: ident, $even: ident, $odd: ident } => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorMergeEo for $impl { + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_mergee(self, b: Self) -> Self { + transmute(mergee(transmute(self), transmute(b))) + } + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_mergeo(self, b: Self) -> Self { + transmute(mergeo(transmute(self), transmute(b))) + } + } + } + } + + vec_mergeeo! { vector_signed_int, mergee, mergeo } + vec_mergeeo! { vector_unsigned_int, mergee, mergeo } + vec_mergeeo! { vector_bool_int, mergee, mergeo } + vec_mergeeo! { vector_float, mergee, mergeo } +} + +/// Vector permute. +#[inline] +#[target_feature(enable = "vsx")] +//#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_xxpermdi(a: T, b: T) -> T +where + T: sealed::VectorPermDI, +{ + static_assert_uimm_bits!(DM, 2); + a.vec_xxpermdi(b, DM as u8) +} + +/// Vector Merge Even +/// +/// ## Purpose +/// Merges the even-numbered values from two vectors. +/// +/// ## Result value +/// The even-numbered elements of a are stored into the even-numbered elements of r. +/// The even-numbered elements of b are stored into the odd-numbered elements of r. +#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_mergee(a: T, b: T) -> T +where + T: sealed::VectorMergeEo, +{ + a.vec_mergee(b) +} + +/// Vector Merge Odd +/// +/// ## Purpose +/// Merges the odd-numbered values from two vectors. +/// +/// ## Result value +/// The odd-numbered elements of a are stored into the even-numbered elements of r. +/// The odd-numbered elements of b are stored into the odd-numbered elements of r. 
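// Annotation, not part of the patch: an element-level sketch of the two
// merges documented above, using plain arrays in the element order of the
// portable `u32x4`-style constructors.
fn mergee_sketch(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
    [a[0], b[0], a[2], b[2]] // even-numbered elements of a and b, interleaved
}
fn mergeo_sketch(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
    [a[1], b[1], a[3], b[3]] // odd-numbered elements of a and b, interleaved
}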
+#[inline] +#[target_feature(enable = "altivec")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_mergeo(a: T, b: T) -> T +where + T: sealed::VectorMergeEo, +{ + a.vec_mergeo(b) +} + +#[cfg(test)] +mod tests { + #[cfg(target_arch = "powerpc")] + use crate::core_arch::arch::powerpc::*; + + #[cfg(target_arch = "powerpc64")] + use crate::core_arch::arch::powerpc64::*; + + use crate::core_arch::simd::*; + use crate::mem::transmute; + use stdarch_test::simd_test; + + macro_rules! test_vec_xxpermdi { + {$name:ident, $shorttype:ident, $longtype:ident, [$($a:expr),+], [$($b:expr),+], [$($c:expr),+], [$($d:expr),+]} => { + #[simd_test(enable = "vsx")] + unsafe fn $name() { + let a: $longtype = transmute($shorttype::new($($a),+, $($b),+)); + let b = transmute($shorttype::new($($c),+, $($d),+)); + + assert_eq!($shorttype::new($($a),+, $($c),+), transmute(vec_xxpermdi::<_, 0>(a, b))); + assert_eq!($shorttype::new($($b),+, $($c),+), transmute(vec_xxpermdi::<_, 1>(a, b))); + assert_eq!($shorttype::new($($a),+, $($d),+), transmute(vec_xxpermdi::<_, 2>(a, b))); + assert_eq!($shorttype::new($($b),+, $($d),+), transmute(vec_xxpermdi::<_, 3>(a, b))); + } + } + } + + test_vec_xxpermdi! {test_vec_xxpermdi_u64x2, u64x2, vector_unsigned_long, [0], [1], [2], [3]} + test_vec_xxpermdi! {test_vec_xxpermdi_i64x2, i64x2, vector_signed_long, [0], [-1], [2], [-3]} + test_vec_xxpermdi! {test_vec_xxpermdi_m64x2, m64x2, vector_bool_long, [false], [true], [false], [true]} + test_vec_xxpermdi! {test_vec_xxpermdi_f64x2, f64x2, vector_double, [0.0], [1.0], [2.0], [3.0]} +} diff --git a/library/stdarch/crates/core_arch/src/powerpc64/mod.rs b/library/stdarch/crates/core_arch/src/powerpc64/mod.rs new file mode 100644 index 0000000000000..e361c55a9071f --- /dev/null +++ b/library/stdarch/crates/core_arch/src/powerpc64/mod.rs @@ -0,0 +1,14 @@ +//! PowerPC 64 +//! +//! The reference is the [64-Bit ELF V2 ABI Specification - Power +//! Architecture]. +//! +//! [64-Bit ELF V2 ABI Specification - Power Architecture]: http://openpowerfoundation.org/wp-content/uploads/resources/leabi/leabi-20170510.pdf + +mod vsx; + +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub use crate::core_arch::powerpc::*; + +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub use self::vsx::*; diff --git a/library/stdarch/crates/core_arch/src/powerpc64/vsx.rs b/library/stdarch/crates/core_arch/src/powerpc64/vsx.rs new file mode 100644 index 0000000000000..7b42be8653c55 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/powerpc64/vsx.rs @@ -0,0 +1,156 @@ +//! PowerPC Vector Scalar eXtensions (VSX) intrinsics. +//! +//! The references are: [POWER ISA v2.07B (for POWER8 & POWER8 with NVIDIA +//! NVlink)] and [POWER ISA v3.0B (for POWER9)]. +//! +//! [POWER ISA v2.07B (for POWER8 & POWER8 with NVIDIA NVlink)]: https://ibm.box.com/s/jd5w15gz301s5b5dt375mshpq9c3lh4u +//! 
[POWER ISA v3.0B (for POWER9)]: https://ibm.box.com/s/1hzcwkwf8rbju5h9iyf44wm94amnlcrv + +#![allow(non_camel_case_types)] + +use crate::core_arch::powerpc::macros::*; +use crate::core_arch::powerpc::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +use crate::mem::transmute; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.ppc.vsx.lxvl"] + fn lxvl(a: *const u8, l: usize) -> vector_signed_int; + + #[link_name = "llvm.ppc.vsx.stxvl"] + fn stxvl(v: vector_signed_int, a: *mut u8, l: usize); +} + +mod sealed { + use super::*; + + #[inline] + #[target_feature(enable = "power9-vector")] + #[cfg_attr(test, assert_instr(lxvl))] + unsafe fn vec_lxvl(p: *const u8, l: usize) -> vector_signed_int { + lxvl(p, l << 56) + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorXloads { + type Result; + unsafe fn vec_xl_len(self, l: usize) -> Self::Result; + } + + macro_rules! impl_vsx_loads { + ($ty:ident) => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorXloads for *const $ty { + type Result = t_t_l!($ty); + #[inline] + #[target_feature(enable = "power9-vector")] + unsafe fn vec_xl_len(self, l: usize) -> Self::Result { + transmute(vec_lxvl(self as *const u8, l)) + } + } + }; + } + + impl_vsx_loads! { i8 } + impl_vsx_loads! { u8 } + impl_vsx_loads! { i16 } + impl_vsx_loads! { u16 } + impl_vsx_loads! { i32 } + impl_vsx_loads! { u32 } + impl_vsx_loads! { f32 } + + #[inline] + #[target_feature(enable = "power9-vector")] + #[cfg_attr(test, assert_instr(stxvl))] + unsafe fn vec_stxvl(v: vector_signed_int, a: *mut u8, l: usize) { + stxvl(v, a, l << 56); + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorXstores { + type Out; + unsafe fn vec_xst_len(self, p: Self::Out, l: usize); + } + + macro_rules! impl_stores { + ($ty:ident) => { + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + impl VectorXstores for t_t_l!($ty) { + type Out = *mut $ty; + #[inline] + #[target_feature(enable = "power9-vector")] + unsafe fn vec_xst_len(self, a: Self::Out, l: usize) { + stxvl(transmute(self), a as *mut u8, l) + } + } + }; + } + + impl_stores! { i8 } + impl_stores! { u8 } + impl_stores! { i16 } + impl_stores! { u16 } + impl_stores! { i32 } + impl_stores! { u32 } + impl_stores! { f32 } +} + +/// Vector Load with Length +/// +/// ## Purpose +/// Loads a vector of a specified byte length. +/// +/// ## Result value +/// Loads the number of bytes specified by b from the address specified in a. +/// Initializes elements in order from the byte stream (as defined by the endianness of the +/// target). Any bytes of elements that cannot be initialized from the number of loaded bytes have +/// a zero value. +/// +/// Between 0 and 16 bytes, inclusive, will be loaded. The length is specified by the +/// least-significant byte of b, as min (b mod 256, 16). The behavior is undefined if the length +/// argument is outside of the range 0–255, or if it is not a multiple of the vector element size. +/// +/// ## Notes +/// vec_xl_len should not be used to load from cache-inhibited memory. +#[inline] +#[target_feature(enable = "power9-vector")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_xl_len(p: T, len: usize) -> ::Result +where + T: sealed::VectorXloads, +{ + p.vec_xl_len(len) +} + +/// Vector Store with Length +/// +/// ## Purpose +/// +/// Stores a vector of a specified byte length. 
+/// +/// ## Operation +/// +/// Stores the number of bytes specified by c of the vector a to the address specified +/// in b. The bytes are obtained starting from the lowest-numbered byte of the lowest-numbered +/// element (as defined by the endianness of the target). All bytes of an element are accessed +/// before proceeding to the next higher element. +/// +/// Between 0 and 16 bytes, inclusive, will be stored. The length is specified by the +/// least-significant byte of c, as min (c mod 256, 16). The behavior is undefined if the length +/// argument is outside of the range 0–255, or if it is not a multiple of the vector element size. +/// +/// ## Notes +/// vec_xst_len should not be used to store to cache-inhibited memory. +#[inline] +#[target_feature(enable = "power9-vector")] +#[unstable(feature = "stdarch_powerpc", issue = "111145")] +pub unsafe fn vec_xst_len(v: T, a: ::Out, l: usize) +where + T: sealed::VectorXstores, +{ + v.vec_xst_len(a, l) +} diff --git a/library/stdarch/crates/core_arch/src/riscv32/mod.rs b/library/stdarch/crates/core_arch/src/riscv32/mod.rs new file mode 100644 index 0000000000000..7ff871227b503 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/riscv32/mod.rs @@ -0,0 +1,6 @@ +//! RISC-V RV32 specific intrinsics + +mod zk; + +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub use zk::*; diff --git a/library/stdarch/crates/core_arch/src/riscv32/zk.rs b/library/stdarch/crates/core_arch/src/riscv32/zk.rs new file mode 100644 index 0000000000000..054bcfe955b7d --- /dev/null +++ b/library/stdarch/crates/core_arch/src/riscv32/zk.rs @@ -0,0 +1,331 @@ +#[cfg(test)] +use stdarch_test::assert_instr; + +unsafe extern "unadjusted" { + #[link_name = "llvm.riscv.aes32esi"] + fn _aes32esi(rs1: i32, rs2: i32, bs: i32) -> i32; + + #[link_name = "llvm.riscv.aes32esmi"] + fn _aes32esmi(rs1: i32, rs2: i32, bs: i32) -> i32; + + #[link_name = "llvm.riscv.aes32dsi"] + fn _aes32dsi(rs1: i32, rs2: i32, bs: i32) -> i32; + + #[link_name = "llvm.riscv.aes32dsmi"] + fn _aes32dsmi(rs1: i32, rs2: i32, bs: i32) -> i32; + + #[link_name = "llvm.riscv.zip.i32"] + fn _zip(rs1: i32) -> i32; + + #[link_name = "llvm.riscv.unzip.i32"] + fn _unzip(rs1: i32) -> i32; + + #[link_name = "llvm.riscv.sha512sig0h"] + fn _sha512sig0h(rs1: i32, rs2: i32) -> i32; + + #[link_name = "llvm.riscv.sha512sig0l"] + fn _sha512sig0l(rs1: i32, rs2: i32) -> i32; + + #[link_name = "llvm.riscv.sha512sig1h"] + fn _sha512sig1h(rs1: i32, rs2: i32) -> i32; + + #[link_name = "llvm.riscv.sha512sig1l"] + fn _sha512sig1l(rs1: i32, rs2: i32) -> i32; + + #[link_name = "llvm.riscv.sha512sum0r"] + fn _sha512sum0r(rs1: i32, rs2: i32) -> i32; + + #[link_name = "llvm.riscv.sha512sum1r"] + fn _sha512sum1r(rs1: i32, rs2: i32) -> i32; +} + +/// AES final round encryption instruction for RV32. +/// +/// This instruction sources a single byte from rs2 according to bs. To this it applies the +/// forward AES SBox operation, before XOR’ing the result with rs1. This instruction must +/// always be implemented such that its execution latency does not depend on the data being +/// operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.3 +/// +/// # Note +/// +/// The `BS` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are +/// used. 
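// Annotation, not part of the patch: a hypothetical usage sketch of the
// intrinsic documented above, assuming an RV32 target with the Zkne
// extension and the elided `const BS` parameter implied by the
// `static_assert!(BS < 4)` in its body. Chaining the four byte positions
// applies the forward SBox to every byte of `col` and XORs the results into
// an accumulator seeded with the round-key word; `rk` and `col` are
// placeholder names.
#[target_feature(enable = "zkne")]
fn aes32_subword_xor_sketch(rk: u32, col: u32) -> u32 {
    let t = aes32esi::<0>(rk, col);
    let t = aes32esi::<1>(t, col);
    let t = aes32esi::<2>(t, col);
    aes32esi::<3>(t, col)
}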
+#[target_feature(enable = "zkne")] +#[rustc_legacy_const_generics(2)] +// See #1464 +// #[cfg_attr(test, assert_instr(aes32esi, BS = 0))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn aes32esi(rs1: u32, rs2: u32) -> u32 { + static_assert!(BS < 4); + + unsafe { _aes32esi(rs1 as i32, rs2 as i32, BS as i32) as u32 } +} + +/// AES middle round encryption instruction for RV32 with. +/// +/// This instruction sources a single byte from rs2 according to bs. To this it applies the +/// forward AES SBox operation, and a partial forward MixColumn, before XOR’ing the result with +/// rs1. This instruction must always be implemented such that its execution latency does not +/// depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.4 +/// +/// # Note +/// +/// The `bs` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are +/// used. +#[target_feature(enable = "zkne")] +#[rustc_legacy_const_generics(2)] +// See #1464 +// #[cfg_attr(test, assert_instr(aes32esmi, BS = 0))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn aes32esmi(rs1: u32, rs2: u32) -> u32 { + static_assert!(BS < 4); + + unsafe { _aes32esmi(rs1 as i32, rs2 as i32, BS as i32) as u32 } +} + +/// AES final round decryption instruction for RV32. +/// +/// This instruction sources a single byte from rs2 according to bs. To this it applies the +/// inverse AES SBox operation, and XOR’s the result with rs1. This instruction must always be +/// implemented such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.1 +/// +/// # Note +/// +/// The `BS` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are +/// used. +#[target_feature(enable = "zknd")] +#[rustc_legacy_const_generics(2)] +// See #1464 +// #[cfg_attr(test, assert_instr(aes32dsi, BS = 0))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn aes32dsi(rs1: u32, rs2: u32) -> u32 { + static_assert!(BS < 4); + + unsafe { _aes32dsi(rs1 as i32, rs2 as i32, BS as i32) as u32 } +} + +/// AES middle round decryption instruction for RV32. +/// +/// This instruction sources a single byte from rs2 according to bs. To this it applies the +/// inverse AES SBox operation, and a partial inverse MixColumn, before XOR’ing the result with +/// rs1. This instruction must always be implemented such that its execution latency does not +/// depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.2 +/// +/// # Note +/// +/// The `BS` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are +/// used. +#[target_feature(enable = "zknd")] +#[rustc_legacy_const_generics(2)] +// See #1464 +// #[cfg_attr(test, assert_instr(aes32dsmi, BS = 0))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn aes32dsmi(rs1: u32, rs2: u32) -> u32 { + static_assert!(BS < 4); + + unsafe { _aes32dsmi(rs1 as i32, rs2 as i32, BS as i32) as u32 } +} + +/// Place upper/lower halves of the source register into odd/even bits of the destination +/// respectivley. 
+/// +/// This instruction places bits in the low half of the source register into the even bit +/// positions of the destination, and bits in the high half of the source register into the odd +/// bit positions of the destination. It is the inverse of the unzip instruction. This +/// instruction is available only on RV32. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.49 +#[target_feature(enable = "zbkb")] +// See #1464 +// #[cfg_attr(test, assert_instr(zip))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn zip(rs: u32) -> u32 { + unsafe { _zip(rs as i32) as u32 } +} + +/// Place odd and even bits of the source word into upper/lower halves of the destination. +/// +/// This instruction places the even bits of the source register into the low half of the +/// destination, and the odd bits of the source into the high bits of the destination. It is +/// the inverse of the zip instruction. This instruction is available only on RV32. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.45 +#[target_feature(enable = "zbkb")] +#[cfg_attr(test, assert_instr(unzip))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn unzip(rs: u32) -> u32 { + unsafe { _unzip(rs as i32) as u32 } +} + +/// Implements the high half of the Sigma0 transformation, as used in the SHA2-512 hash +/// function \[49\] (Section 4.1.3). +/// +/// This instruction is implemented on RV32 only. Used to compute the Sigma0 transform of the +/// SHA2-512 hash function in conjunction with the sha512sig0l instruction. The transform is a +/// 64-bit to 64-bit function, so the input and output are each represented by two 32-bit +/// registers. This instruction must always be implemented such that its execution latency does +/// not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.31 +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha512sig0h))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sha512sig0h(rs1: u32, rs2: u32) -> u32 { + unsafe { _sha512sig0h(rs1 as i32, rs2 as i32) as u32 } +} + +/// Implements the low half of the Sigma0 transformation, as used in the SHA2-512 hash function +/// \[49\] (Section 4.1.3). +/// +/// This instruction is implemented on RV32 only. Used to compute the Sigma0 transform of the +/// SHA2-512 hash function in conjunction with the sha512sig0h instruction. The transform is a +/// 64-bit to 64-bit function, so the input and output are each represented by two 32-bit +/// registers. This instruction must always be implemented such that its execution latency does +/// not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.32 +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha512sig0l))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sha512sig0l(rs1: u32, rs2: u32) -> u32 { + unsafe { _sha512sig0l(rs1 as i32, rs2 as i32) as u32 } +} + +/// Implements the high half of the Sigma1 transformation, as used in the SHA2-512 hash +/// function \[49\] (Section 4.1.3). 
+/// +/// This instruction is implemented on RV32 only. Used to compute the Sigma1 transform of the +/// SHA2-512 hash function in conjunction with the sha512sig1l instruction. The transform is a +/// 64-bit to 64-bit function, so the input and output are each represented by two 32-bit +/// registers. This instruction must always be implemented such that its execution latency does +/// not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.33 +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha512sig1h))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sha512sig1h(rs1: u32, rs2: u32) -> u32 { + unsafe { _sha512sig1h(rs1 as i32, rs2 as i32) as u32 } +} + +/// Implements the low half of the Sigma1 transformation, as used in the SHA2-512 hash function +/// \[49\] (Section 4.1.3). +/// +/// This instruction is implemented on RV32 only. Used to compute the Sigma1 transform of the +/// SHA2-512 hash function in conjunction with the sha512sig1h instruction. The transform is a +/// 64-bit to 64-bit function, so the input and output are each represented by two 32-bit +/// registers. This instruction must always be implemented such that its execution latency does +/// not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.34 +#[target_feature(enable = "zknh")] +#[cfg_attr(test, assert_instr(sha512sig1l))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sha512sig1l(rs1: u32, rs2: u32) -> u32 { + unsafe { _sha512sig1l(rs1 as i32, rs2 as i32) as u32 } +} + +/// Implements the Sum0 transformation, as used in the SHA2-512 hash function \[49\] (Section +/// 4.1.3). +/// +/// This instruction is implemented on RV32 only. Used to compute the Sum0 transform of the +/// SHA2-512 hash function. The transform is a 64-bit to 64-bit function, so the input and +/// output is represented by two 32-bit registers. This instruction must always be implemented +/// such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.35 +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha512sum0r))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sha512sum0r(rs1: u32, rs2: u32) -> u32 { + unsafe { _sha512sum0r(rs1 as i32, rs2 as i32) as u32 } +} + +/// Implements the Sum1 transformation, as used in the SHA2-512 hash function \[49\] (Section +/// 4.1.3). +/// +/// This instruction is implemented on RV32 only. Used to compute the Sum1 transform of the +/// SHA2-512 hash function. The transform is a 64-bit to 64-bit function, so the input and +/// output is represented by two 32-bit registers. This instruction must always be implemented +/// such that its execution latency does not depend on the data being operated on. 
+/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.36 +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha512sum1r))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sha512sum1r(rs1: u32, rs2: u32) -> u32 { + unsafe { _sha512sum1r(rs1 as i32, rs2 as i32) as u32 } +} diff --git a/library/stdarch/crates/core_arch/src/riscv64/mod.rs b/library/stdarch/crates/core_arch/src/riscv64/mod.rs new file mode 100644 index 0000000000000..0e860f6f2ad2f --- /dev/null +++ b/library/stdarch/crates/core_arch/src/riscv64/mod.rs @@ -0,0 +1,57 @@ +//! RISC-V RV64 specific intrinsics +use crate::arch::asm; + +mod zk; + +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub use zk::*; + +/// Loads virtual machine memory by unsigned word integer +/// +/// This instruction performs an explicit memory access as though `V=1`; +/// i.e., with the address translation and protection, and the endianness, that apply to memory +/// accesses in either VS-mode or VU-mode. +/// +/// This operation is not available under RV32 base instruction set. +/// +/// This function is unsafe for it accesses the virtual supervisor or user via a `HLV.WU` +/// instruction which is effectively a dereference to any memory address. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hlv_wu(src: *const u32) -> u32 { + let value: u32; + asm!(".insn i 0x73, 0x4, {}, {}, 0x681", out(reg) value, in(reg) src, options(readonly, nostack)); + value +} + +/// Loads virtual machine memory by double integer +/// +/// This instruction performs an explicit memory access as though `V=1`; +/// i.e., with the address translation and protection, and the endianness, that apply to memory +/// accesses in either VS-mode or VU-mode. +/// +/// This operation is not available under RV32 base instruction set. +/// +/// This function is unsafe for it accesses the virtual supervisor or user via a `HLV.D` +/// instruction which is effectively a dereference to any memory address. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hlv_d(src: *const i64) -> i64 { + let value: i64; + asm!(".insn i 0x73, 0x4, {}, {}, 0x6C0", out(reg) value, in(reg) src, options(readonly, nostack)); + value +} + +/// Stores virtual machine memory by double integer +/// +/// This instruction performs an explicit memory access as though `V=1`; +/// i.e., with the address translation and protection, and the endianness, that apply to memory +/// accesses in either VS-mode or VU-mode. +/// +/// This function is unsafe for it accesses the virtual supervisor or user via a `HSV.D` +/// instruction which is effectively a dereference to any memory address. 
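// A hedged sketch of how the half-register SHA-512 intrinsics documented in
// crates/core_arch/src/riscv32/zk.rs above compose into the full 64-bit Sigma0 transform.
// Not from the diff: the wrapper name is ours, the argument ordering follows our reading of
// the scalar crypto spec, and a nightly toolchain with `riscv_ext_intrinsics` is assumed.
#[cfg(target_arch = "riscv32")]
#[target_feature(enable = "zknh")]
fn sha512_sigma0_rv32(x: u64) -> u64 {
    use core::arch::riscv32::{sha512sig0h, sha512sig0l};
    let lo = x as u32;
    let hi = (x >> 32) as u32;
    // Low result word takes (low, high); high result word takes (high, low).
    let out_lo = sha512sig0l(lo, hi);
    let out_hi = sha512sig0h(hi, lo);
    ((out_hi as u64) << 32) | (out_lo as u64)
}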
+#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hsv_d(dst: *mut i64, src: i64) { + asm!(".insn r 0x73, 0x4, 0x37, x0, {}, {}", in(reg) dst, in(reg) src, options(nostack)); +} diff --git a/library/stdarch/crates/core_arch/src/riscv64/zk.rs b/library/stdarch/crates/core_arch/src/riscv64/zk.rs new file mode 100644 index 0000000000000..c6af750bbc570 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/riscv64/zk.rs @@ -0,0 +1,265 @@ +#[cfg(test)] +use stdarch_test::assert_instr; + +unsafe extern "unadjusted" { + #[link_name = "llvm.riscv.aes64es"] + fn _aes64es(rs1: i64, rs2: i64) -> i64; + + #[link_name = "llvm.riscv.aes64esm"] + fn _aes64esm(rs1: i64, rs2: i64) -> i64; + + #[link_name = "llvm.riscv.aes64ds"] + fn _aes64ds(rs1: i64, rs2: i64) -> i64; + + #[link_name = "llvm.riscv.aes64dsm"] + fn _aes64dsm(rs1: i64, rs2: i64) -> i64; + + #[link_name = "llvm.riscv.aes64ks1i"] + fn _aes64ks1i(rs1: i64, rnum: i32) -> i64; + + #[link_name = "llvm.riscv.aes64ks2"] + fn _aes64ks2(rs1: i64, rs2: i64) -> i64; + + #[link_name = "llvm.riscv.aes64im"] + fn _aes64im(rs1: i64) -> i64; + + #[link_name = "llvm.riscv.sha512sig0"] + fn _sha512sig0(rs1: i64) -> i64; + + #[link_name = "llvm.riscv.sha512sig1"] + fn _sha512sig1(rs1: i64) -> i64; + + #[link_name = "llvm.riscv.sha512sum0"] + fn _sha512sum0(rs1: i64) -> i64; + + #[link_name = "llvm.riscv.sha512sum1"] + fn _sha512sum1(rs1: i64) -> i64; +} + +/// AES final round encryption instruction for RV64. +/// +/// Uses the two 64-bit source registers to represent the entire AES state, and produces half +/// of the next round output, applying the ShiftRows and SubBytes steps. This instruction must +/// always be implemented such that its execution latency does not depend on the data being +/// operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.7 +#[target_feature(enable = "zkne")] +#[cfg_attr(test, assert_instr(aes64es))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn aes64es(rs1: u64, rs2: u64) -> u64 { + unsafe { _aes64es(rs1 as i64, rs2 as i64) as u64 } +} + +/// AES middle round encryption instruction for RV64. +/// +/// Uses the two 64-bit source registers to represent the entire AES state, and produces half +/// of the next round output, applying the ShiftRows, SubBytes and MixColumns steps. This +/// instruction must always be implemented such that its execution latency does not depend on +/// the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.8 +#[target_feature(enable = "zkne")] +#[cfg_attr(test, assert_instr(aes64esm))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn aes64esm(rs1: u64, rs2: u64) -> u64 { + unsafe { _aes64esm(rs1 as i64, rs2 as i64) as u64 } +} + +/// AES final round decryption instruction for RV64. +/// +/// Uses the two 64-bit source registers to represent the entire AES state, and produces half +/// of the next round output, applying the Inverse ShiftRows and SubBytes steps. This +/// instruction must always be implemented such that its execution latency does not depend on +/// the data being operated on. 
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.5
+#[target_feature(enable = "zknd")]
+#[cfg_attr(test, assert_instr(aes64ds))]
+#[inline]
+#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
+pub fn aes64ds(rs1: u64, rs2: u64) -> u64 {
+    unsafe { _aes64ds(rs1 as i64, rs2 as i64) as u64 }
+}
+
+/// AES middle round decryption instruction for RV64.
+///
+/// Uses the two 64-bit source registers to represent the entire AES state, and produces half
+/// of the next round output, applying the Inverse ShiftRows, SubBytes and MixColumns steps.
+/// This instruction must always be implemented such that its execution latency does not depend
+/// on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.6
+#[target_feature(enable = "zknd")]
+#[cfg_attr(test, assert_instr(aes64dsm))]
+#[inline]
+#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
+pub fn aes64dsm(rs1: u64, rs2: u64) -> u64 {
+    unsafe { _aes64dsm(rs1 as i64, rs2 as i64) as u64 }
+}
+
+/// This instruction implements part of the KeySchedule operation for the AES Block cipher
+/// involving the SBox operation.
+///
+/// This instruction implements the rotation, SubBytes and Round Constant addition steps of the
+/// AES block cipher Key Schedule. This instruction must always be implemented such that its
+/// execution latency does not depend on the data being operated on. Note that rnum must be in
+/// the range 0x0..0xA. The values 0xB..0xF are reserved.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.10
+///
+/// # Note
+///
+/// The `RNUM` parameter is expected to be a constant value inside the range of `0..=10`.
+#[target_feature(enable = "zkne", enable = "zknd")]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(aes64ks1i, RNUM = 0))]
+#[inline]
+#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
+pub fn aes64ks1i<const RNUM: u8>(rs1: u64) -> u64 {
+    static_assert!(RNUM <= 10);
+
+    unsafe { _aes64ks1i(rs1 as i64, RNUM as i32) as u64 }
+}
+
+/// This instruction implements part of the KeySchedule operation for the AES Block cipher.
+///
+/// This instruction implements the additional XOR’ing of key words as part of the AES block
+/// cipher Key Schedule. This instruction must always be implemented such that its execution
+/// latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.11
+#[target_feature(enable = "zkne", enable = "zknd")]
+#[cfg_attr(test, assert_instr(aes64ks2))]
+#[inline]
+#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
+pub fn aes64ks2(rs1: u64, rs2: u64) -> u64 {
+    unsafe { _aes64ks2(rs1 as i64, rs2 as i64) as u64 }
+}
+
+/// This instruction accelerates the inverse MixColumns step of the AES Block Cipher, and is used to aid creation of
+/// the decryption KeySchedule.
+///
+/// The instruction applies the inverse MixColumns transformation to two columns of the state array, packed
+/// into a single 64-bit register. It is used to create the inverse cipher KeySchedule, according to the equivalent
+/// inverse cipher construction in (Page 23, Section 5.3.5).
This instruction must always be implemented +/// such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.9 +#[target_feature(enable = "zkne", enable = "zknd")] +#[cfg_attr(test, assert_instr(aes64im))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn aes64im(rs1: u64) -> u64 { + unsafe { _aes64im(rs1 as i64) as u64 } +} + +/// Implements the Sigma0 transformation function as used in the SHA2-512 hash function \[49\] +/// (Section 4.1.3). +/// +/// This instruction is supported for the RV64 base architecture. It implements the Sigma0 +/// transform of the SHA2-512 hash function. \[49\]. This instruction must always be +/// implemented such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.37 +#[target_feature(enable = "zknh")] +#[cfg_attr(test, assert_instr(sha512sig0))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sha512sig0(rs1: u64) -> u64 { + unsafe { _sha512sig0(rs1 as i64) as u64 } +} + +/// Implements the Sigma1 transformation function as used in the SHA2-512 hash function \[49\] +/// (Section 4.1.3). +/// +/// This instruction is supported for the RV64 base architecture. It implements the Sigma1 +/// transform of the SHA2-512 hash function. \[49\]. This instruction must always be +/// implemented such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.38 +#[target_feature(enable = "zknh")] +#[cfg_attr(test, assert_instr(sha512sig1))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sha512sig1(rs1: u64) -> u64 { + unsafe { _sha512sig1(rs1 as i64) as u64 } +} + +/// Implements the Sum0 transformation function as used in the SHA2-512 hash function \[49\] +/// (Section 4.1.3). +/// +/// This instruction is supported for the RV64 base architecture. It implements the Sum0 +/// transform of the SHA2-512 hash function. \[49\]. This instruction must always be +/// implemented such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.39 +#[target_feature(enable = "zknh")] +#[cfg_attr(test, assert_instr(sha512sum0))] +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sha512sum0(rs1: u64) -> u64 { + unsafe { _sha512sum0(rs1 as i64) as u64 } +} + +/// Implements the Sum1 transformation function as used in the SHA2-512 hash function \[49\] +/// (Section 4.1.3). +/// +/// This instruction is supported for the RV64 base architecture. It implements the Sum1 +/// transform of the SHA2-512 hash function. \[49\]. This instruction must always be +/// implemented such that its execution latency does not depend on the data being operated on. 
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.40
+#[target_feature(enable = "zknh")]
+#[cfg_attr(test, assert_instr(sha512sum1))]
+#[inline]
+#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
+pub fn sha512sum1(rs1: u64) -> u64 {
+    unsafe { _sha512sum1(rs1 as i64) as u64 }
+}
diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs b/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs
new file mode 100644
index 0000000000000..3ce24324de2e7
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs
@@ -0,0 +1,579 @@
+//! Shared RISC-V intrinsics
+//!
+//! ## Missing floating-point register instructions
+//!
+//! We are deliberately *not* providing instructions that could change the floating-point rounding
+//! mode or exception behavior or read the accrued exceptions flags: `frcsr`, `fscsr`, `fsrm`,
+//! `frflags`, `fsflags`.
+//!
+//! Rust makes no guarantees whatsoever about the contents of the accrued exceptions register: Rust
+//! floating-point operations may or may not result in this register getting updated with exception
+//! state, and the register can change between two invocations of this function even when no
+//! floating-point operations appear in the source code (since floating-point operations appearing
+//! earlier or later can be reordered).
+//!
+//! Modifying the rounding mode leads to **immediate Undefined Behavior**: Rust assumes that the
+//! default rounding mode is always set and will optimize accordingly. This even applies when the
+//! rounding mode is altered and later reset to its original value without any floating-point
+//! operations appearing in the source code between those operations (since floating-point
+//! operations appearing earlier or later can be reordered).
+//!
+//! If you need to perform some floating-point operations and check whether they raised an
+//! exception, use a single inline assembly block for the entire sequence of operations.
+//!
+//! If you need to perform some floating-point operations under a different rounding mode, use a
+//! single inline assembly block and make sure to restore the original rounding mode before the end
+//! of the block.
+mod p;
+mod zb;
+mod zk;
+
+#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
+pub use p::*;
+#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
+pub use zb::*;
+#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
+pub use zk::*;
+
+use crate::arch::asm;
+
+/// Generates the `PAUSE` instruction
+///
+/// The PAUSE instruction is a HINT that indicates the current hart's rate of instruction retirement
+/// should be temporarily reduced or paused. The duration of its effect must be bounded and may be zero.
+#[inline]
+#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
+pub fn pause() {
+    unsafe { asm!(".insn i 0x0F, 0, x0, x0, 0x010", options(nomem, nostack)) }
+}
+
+/// Generates the `NOP` instruction
+///
+/// The NOP instruction does not change any architecturally visible state, except for
+/// advancing the `pc` and incrementing any applicable performance counters.
+#[inline]
+#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
+pub fn nop() {
+    unsafe { asm!("nop", options(nomem, nostack)) }
+}
+
+/// Generates the `WFI` instruction
+///
+/// The WFI instruction provides a hint to the implementation that the current hart can be stalled
+/// until an interrupt might need servicing.
This instruction is a hint, +/// and a legal implementation is to simply implement WFI as a NOP. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn wfi() { + asm!("wfi", options(nomem, nostack)) +} + +/// Generates the `FENCE.I` instruction +/// +/// A FENCE.I instruction ensures that a subsequent instruction fetch on a RISC-V hart will see +/// any previous data stores already visible to the same RISC-V hart. +/// +/// FENCE.I does not ensure that other RISC-V harts' instruction fetches will observe the +/// local hart's stores in a multiprocessor system. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn fence_i() { + asm!("fence.i", options(nostack)) +} + +/// Supervisor memory management fence for given virtual address and address space +/// +/// The fence orders only reads and writes made to leaf page table entries corresponding to +/// the virtual address in parameter `vaddr`, for the address space identified by integer parameter +/// `asid`. Accesses to global mappings are not ordered. The fence also invalidates all +/// address-translation cache entries that contain leaf page table entries corresponding to the +/// virtual address in parameter `vaddr` and that match the address space identified by integer +/// parameter `asid`, except for entries containing global mappings. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn sfence_vma(vaddr: usize, asid: usize) { + asm!("sfence.vma {}, {}", in(reg) vaddr, in(reg) asid, options(nostack)) +} + +/// Supervisor memory management fence for given virtual address +/// +/// The fence orders only reads and writes made to leaf page table entries corresponding to +/// the virtual address in parameter `vaddr`, for all address spaces. +/// The fence also invalidates all address-translation cache entries that contain leaf page +/// table entries corresponding to the virtual address in parameter `vaddr`, for all address spaces. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn sfence_vma_vaddr(vaddr: usize) { + asm!("sfence.vma {}, x0", in(reg) vaddr, options(nostack)) +} + +/// Supervisor memory management fence for given address space +/// +/// The fence orders all reads and writes made to any level of the page tables, +/// but only for the address space identified by integer parameter `asid`. +/// +/// Accesses to global mappings are not ordered. The fence also invalidates all +/// address-translation cache entries matching the address space identified by integer +/// parameter `asid`, except for entries containing global mappings. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn sfence_vma_asid(asid: usize) { + asm!("sfence.vma x0, {}", in(reg) asid, options(nostack)) +} + +/// Supervisor memory management fence for all address spaces and virtual addresses +/// +/// The fence orders all reads and writes made to any level of the page +/// tables, for all address spaces. The fence also invalidates all address-translation cache entries, +/// for all address spaces. 
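// A hedged sketch (hypothetical supervisor-mode helper, not from the diff): after updating a
// leaf PTE for `va` in address space `asid`, the targeted fence documented above suffices and
// the global `sfence_vma_all` is not needed. Assumes an RV64 kernel and the unstable
// `riscv_ext_intrinsics` feature re-exported through `core::arch::riscv64`.
#[cfg(target_arch = "riscv64")]
unsafe fn flush_one_mapping(va: usize, asid: usize) {
    use core::arch::riscv64::sfence_vma;
    // Orders the preceding PTE store and drops stale cached translations for (va, asid).
    unsafe { sfence_vma(va, asid) };
}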
+#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn sfence_vma_all() { + asm!("sfence.vma", options(nostack)) +} + +/// Invalidate supervisor translation cache for given virtual address and address space +/// +/// This instruction invalidates any address-translation cache entries that an +/// `SFENCE.VMA` instruction with the same values of `vaddr` and `asid` would invalidate. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn sinval_vma(vaddr: usize, asid: usize) { + // asm!("sinval.vma {}, {}", in(reg) vaddr, in(reg) asid, options(nostack)) + asm!(".insn r 0x73, 0, 0x0B, x0, {}, {}", in(reg) vaddr, in(reg) asid, options(nostack)) +} + +/// Invalidate supervisor translation cache for given virtual address +/// +/// This instruction invalidates any address-translation cache entries that an +/// `SFENCE.VMA` instruction with the same values of `vaddr` and `asid` would invalidate. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn sinval_vma_vaddr(vaddr: usize) { + asm!(".insn r 0x73, 0, 0x0B, x0, {}, x0", in(reg) vaddr, options(nostack)) +} + +/// Invalidate supervisor translation cache for given address space +/// +/// This instruction invalidates any address-translation cache entries that an +/// `SFENCE.VMA` instruction with the same values of `vaddr` and `asid` would invalidate. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn sinval_vma_asid(asid: usize) { + asm!(".insn r 0x73, 0, 0x0B, x0, x0, {}", in(reg) asid, options(nostack)) +} + +/// Invalidate supervisor translation cache for all address spaces and virtual addresses +/// +/// This instruction invalidates any address-translation cache entries that an +/// `SFENCE.VMA` instruction with the same values of `vaddr` and `asid` would invalidate. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn sinval_vma_all() { + asm!(".insn r 0x73, 0, 0x0B, x0, x0, x0", options(nostack)) +} + +/// Generates the `SFENCE.W.INVAL` instruction +/// +/// This instruction guarantees that any previous stores already visible to the current RISC-V hart +/// are ordered before subsequent `SINVAL.VMA` instructions executed by the same hart. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn sfence_w_inval() { + // asm!("sfence.w.inval", options(nostack)) + asm!(".insn i 0x73, 0, x0, x0, 0x180", options(nostack)) +} + +/// Generates the `SFENCE.INVAL.IR` instruction +/// +/// This instruction guarantees that any previous SINVAL.VMA instructions executed by the current hart +/// are ordered before subsequent implicit references by that hart to the memory-management data structures. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn sfence_inval_ir() { + // asm!("sfence.inval.ir", options(nostack)) + asm!(".insn i 0x73, 0, x0, x0, 0x181", options(nostack)) +} + +/// Loads virtual machine memory by signed byte integer +/// +/// This instruction performs an explicit memory access as though `V=1`; +/// i.e., with the address translation and protection, and the endianness, that apply to memory +/// accesses in either VS-mode or VU-mode. +/// +/// This function is unsafe for it accesses the virtual supervisor or user via a `HLV.B` +/// instruction which is effectively a dereference to any memory address. 
+#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hlv_b(src: *const i8) -> i8 { + let value: i8; + asm!(".insn i 0x73, 0x4, {}, {}, 0x600", out(reg) value, in(reg) src, options(readonly, nostack)); + value +} + +/// Loads virtual machine memory by unsigned byte integer +/// +/// This instruction performs an explicit memory access as though `V=1`; +/// i.e., with the address translation and protection, and the endianness, that apply to memory +/// accesses in either VS-mode or VU-mode. +/// +/// This function is unsafe for it accesses the virtual supervisor or user via a `HLV.BU` +/// instruction which is effectively a dereference to any memory address. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hlv_bu(src: *const u8) -> u8 { + let value: u8; + asm!(".insn i 0x73, 0x4, {}, {}, 0x601", out(reg) value, in(reg) src, options(readonly, nostack)); + value +} + +/// Loads virtual machine memory by signed half integer +/// +/// This instruction performs an explicit memory access as though `V=1`; +/// i.e., with the address translation and protection, and the endianness, that apply to memory +/// accesses in either VS-mode or VU-mode. +/// +/// This function is unsafe for it accesses the virtual supervisor or user via a `HLV.H` +/// instruction which is effectively a dereference to any memory address. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hlv_h(src: *const i16) -> i16 { + let value: i16; + asm!(".insn i 0x73, 0x4, {}, {}, 0x640", out(reg) value, in(reg) src, options(readonly, nostack)); + value +} + +/// Loads virtual machine memory by unsigned half integer +/// +/// This instruction performs an explicit memory access as though `V=1`; +/// i.e., with the address translation and protection, and the endianness, that apply to memory +/// accesses in either VS-mode or VU-mode. +/// +/// This function is unsafe for it accesses the virtual supervisor or user via a `HLV.HU` +/// instruction which is effectively a dereference to any memory address. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hlv_hu(src: *const u16) -> u16 { + let value: u16; + asm!(".insn i 0x73, 0x4, {}, {}, 0x641", out(reg) value, in(reg) src, options(readonly, nostack)); + value +} + +/// Accesses virtual machine instruction by unsigned half integer +/// +/// This instruction performs an explicit memory access as though `V=1`; +/// the memory being read must be executable in both stages of address translation, +/// but read permission is not required. +/// +/// This function is unsafe for it accesses the virtual supervisor or user via a `HLVX.HU` +/// instruction which is effectively a dereference to any memory address. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hlvx_hu(src: *const u16) -> u16 { + let insn: u16; + asm!(".insn i 0x73, 0x4, {}, {}, 0x643", out(reg) insn, in(reg) src, options(readonly, nostack)); + insn +} + +/// Loads virtual machine memory by signed word integer +/// +/// This instruction performs an explicit memory access as though `V=1`; +/// i.e., with the address translation and protection, and the endianness, that apply to memory +/// accesses in either VS-mode or VU-mode. +/// +/// This function is unsafe for it accesses the virtual supervisor or user via a `HLV.W` +/// instruction which is effectively a dereference to any memory address. 
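// A hedged sketch (hypothetical hypervisor helper, not from the diff): using HLVX.HU, as
// documented above, to fetch the first halfword of a trapped guest instruction and tell a
// compressed 16-bit encoding from a full 32-bit one. Assumes HS-mode privilege, an RV64 host,
// and the unstable `riscv_ext_intrinsics` feature.
#[cfg(target_arch = "riscv64")]
unsafe fn guest_insn_is_compressed(guest_pc: *const u16) -> bool {
    use core::arch::riscv64::hlvx_hu;
    let low_half = unsafe { hlvx_hu(guest_pc) };
    // In the base encoding, both low bits set (0b11) marks a full 32-bit instruction.
    (low_half & 0b11) != 0b11
}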
+#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hlv_w(src: *const i32) -> i32 { + let value: i32; + asm!(".insn i 0x73, 0x4, {}, {}, 0x680", out(reg) value, in(reg) src, options(readonly, nostack)); + value +} + +/// Accesses virtual machine instruction by unsigned word integer +/// +/// This instruction performs an explicit memory access as though `V=1`; +/// the memory being read must be executable in both stages of address translation, +/// but read permission is not required. +/// +/// This function is unsafe for it accesses the virtual supervisor or user via a `HLVX.WU` +/// instruction which is effectively a dereference to any memory address. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hlvx_wu(src: *const u32) -> u32 { + let insn: u32; + asm!(".insn i 0x73, 0x4, {}, {}, 0x683", out(reg) insn, in(reg) src, options(readonly, nostack)); + insn +} + +/// Stores virtual machine memory by byte integer +/// +/// This instruction performs an explicit memory access as though `V=1`; +/// i.e., with the address translation and protection, and the endianness, that apply to memory +/// accesses in either VS-mode or VU-mode. +/// +/// This function is unsafe for it accesses the virtual supervisor or user via a `HSV.B` +/// instruction which is effectively a dereference to any memory address. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hsv_b(dst: *mut i8, src: i8) { + asm!(".insn r 0x73, 0x4, 0x31, x0, {}, {}", in(reg) dst, in(reg) src, options(nostack)); +} + +/// Stores virtual machine memory by half integer +/// +/// This instruction performs an explicit memory access as though `V=1`; +/// i.e., with the address translation and protection, and the endianness, that apply to memory +/// accesses in either VS-mode or VU-mode. +/// +/// This function is unsafe for it accesses the virtual supervisor or user via a `HSV.H` +/// instruction which is effectively a dereference to any memory address. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hsv_h(dst: *mut i16, src: i16) { + asm!(".insn r 0x73, 0x4, 0x33, x0, {}, {}", in(reg) dst, in(reg) src, options(nostack)); +} + +/// Stores virtual machine memory by word integer +/// +/// This instruction performs an explicit memory access as though `V=1`; +/// i.e., with the address translation and protection, and the endianness, that apply to memory +/// accesses in either VS-mode or VU-mode. +/// +/// This function is unsafe for it accesses the virtual supervisor or user via a `HSV.W` +/// instruction which is effectively a dereference to any memory address. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hsv_w(dst: *mut i32, src: i32) { + asm!(".insn r 0x73, 0x4, 0x35, x0, {}, {}", in(reg) dst, in(reg) src, options(nostack)); +} + +/// Hypervisor memory management fence for given guest virtual address and guest address space +/// +/// Guarantees that any previous stores already visible to the current hart are ordered before all +/// implicit reads by that hart done for VS-stage address translation for instructions that: +/// - are subsequent to the `HFENCE.VVMA`, and +/// - execute when `hgatp.VMID` has the same setting as it did when `HFENCE.VVMA` executed. +/// +/// This fence specifies a single guest virtual address, and a single guest address-space identifier. 
+#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hfence_vvma(vaddr: usize, asid: usize) { + // asm!("hfence.vvma {}, {}", in(reg) vaddr, in(reg) asid) + asm!(".insn r 0x73, 0, 0x11, x0, {}, {}", in(reg) vaddr, in(reg) asid, options(nostack)) +} + +/// Hypervisor memory management fence for given guest virtual address +/// +/// Guarantees that any previous stores already visible to the current hart are ordered before all +/// implicit reads by that hart done for VS-stage address translation for instructions that: +/// - are subsequent to the `HFENCE.VVMA`, and +/// - execute when `hgatp.VMID` has the same setting as it did when `HFENCE.VVMA` executed. +/// +/// This fence specifies a single guest virtual address. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hfence_vvma_vaddr(vaddr: usize) { + asm!(".insn r 0x73, 0, 0x11, x0, {}, x0", in(reg) vaddr, options(nostack)) +} + +/// Hypervisor memory management fence for given guest address space +/// +/// Guarantees that any previous stores already visible to the current hart are ordered before all +/// implicit reads by that hart done for VS-stage address translation for instructions that: +/// - are subsequent to the `HFENCE.VVMA`, and +/// - execute when `hgatp.VMID` has the same setting as it did when `HFENCE.VVMA` executed. +/// +/// This fence specifies a single guest address-space identifier. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hfence_vvma_asid(asid: usize) { + asm!(".insn r 0x73, 0, 0x11, x0, x0, {}", in(reg) asid, options(nostack)) +} + +/// Hypervisor memory management fence for all guest address spaces and guest virtual addresses +/// +/// Guarantees that any previous stores already visible to the current hart are ordered before all +/// implicit reads by that hart done for VS-stage address translation for instructions that: +/// - are subsequent to the `HFENCE.VVMA`, and +/// - execute when `hgatp.VMID` has the same setting as it did when `HFENCE.VVMA` executed. +/// +/// This fence applies to any guest address spaces and guest virtual addresses. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hfence_vvma_all() { + asm!(".insn r 0x73, 0, 0x11, x0, x0, x0", options(nostack)) +} + +/// Hypervisor memory management fence for guest physical address and virtual machine +/// +/// Guarantees that any previous stores already visible to the current hart are ordered before all implicit reads +/// by that hart done for G-stage address translation for instructions that follow the HFENCE.GVMA. +/// +/// This fence specifies a single guest physical address, **shifted right by 2 bits**, and a single virtual machine +/// by virtual machine identifier (VMID). +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hfence_gvma(gaddr: usize, vmid: usize) { + // asm!("hfence.gvma {}, {}", in(reg) gaddr, in(reg) vmid, options(nostack)) + asm!(".insn r 0x73, 0, 0x31, x0, {}, {}", in(reg) gaddr, in(reg) vmid, options(nostack)) +} + +/// Hypervisor memory management fence for guest physical address +/// +/// Guarantees that any previous stores already visible to the current hart are ordered before all implicit reads +/// by that hart done for G-stage address translation for instructions that follow the HFENCE.GVMA. +/// +/// This fence specifies a single guest physical address; **the physical address should be shifted right by 2 bits**. 
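// A hedged sketch (hypothetical helper, not from the diff): HFENCE.GVMA expects the guest
// physical address already shifted right by 2 bits, as stressed above, so shift before passing
// it. Assumes an RV64 host and the unstable `riscv_ext_intrinsics` feature.
#[cfg(target_arch = "riscv64")]
unsafe fn fence_guest_page(guest_phys: usize, vmid: usize) {
    use core::arch::riscv64::hfence_gvma;
    unsafe { hfence_gvma(guest_phys >> 2, vmid) };
}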
+#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hfence_gvma_gaddr(gaddr: usize) { + asm!(".insn r 0x73, 0, 0x31, x0, {}, x0", in(reg) gaddr, options(nostack)) +} + +/// Hypervisor memory management fence for given virtual machine +/// +/// Guarantees that any previous stores already visible to the current hart are ordered before all implicit reads +/// by that hart done for G-stage address translation for instructions that follow the HFENCE.GVMA. +/// +/// This fence specifies a single virtual machine by virtual machine identifier (VMID). +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hfence_gvma_vmid(vmid: usize) { + asm!(".insn r 0x73, 0, 0x31, x0, x0, {}", in(reg) vmid, options(nostack)) +} + +/// Hypervisor memory management fence for all virtual machines and guest physical addresses +/// +/// Guarantees that any previous stores already visible to the current hart are ordered before all implicit reads +/// by that hart done for G-stage address translation for instructions that follow the HFENCE.GVMA. +/// +/// This fence specifies all guest physical addresses and all virtual machines. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hfence_gvma_all() { + asm!(".insn r 0x73, 0, 0x31, x0, x0, x0", options(nostack)) +} + +/// Invalidate hypervisor translation cache for given guest virtual address and guest address space +/// +/// This instruction invalidates any address-translation cache entries that an +/// `HFENCE.VVMA` instruction with the same values of `vaddr` and `asid` would invalidate. +/// +/// This fence specifies a single guest virtual address, and a single guest address-space identifier. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hinval_vvma(vaddr: usize, asid: usize) { + // asm!("hinval.vvma {}, {}", in(reg) vaddr, in(reg) asid, options(nostack)) + asm!(".insn r 0x73, 0, 0x13, x0, {}, {}", in(reg) vaddr, in(reg) asid, options(nostack)) +} + +/// Invalidate hypervisor translation cache for given guest virtual address +/// +/// This instruction invalidates any address-translation cache entries that an +/// `HFENCE.VVMA` instruction with the same values of `vaddr` and `asid` would invalidate. +/// +/// This fence specifies a single guest virtual address. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hinval_vvma_vaddr(vaddr: usize) { + asm!(".insn r 0x73, 0, 0x13, x0, {}, x0", in(reg) vaddr, options(nostack)) +} + +/// Invalidate hypervisor translation cache for given guest address space +/// +/// This instruction invalidates any address-translation cache entries that an +/// `HFENCE.VVMA` instruction with the same values of `vaddr` and `asid` would invalidate. +/// +/// This fence specifies a single guest address-space identifier. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hinval_vvma_asid(asid: usize) { + asm!(".insn r 0x73, 0, 0x13, x0, x0, {}", in(reg) asid, options(nostack)) +} + +/// Invalidate hypervisor translation cache for all guest address spaces and guest virtual addresses +/// +/// This instruction invalidates any address-translation cache entries that an +/// `HFENCE.VVMA` instruction with the same values of `vaddr` and `asid` would invalidate. +/// +/// This fence applies to any guest address spaces and guest virtual addresses. 
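// A hedged sketch (hypothetical helper, not from the diff): bracketing a targeted HINVAL
// between the SFENCE.W.INVAL and SFENCE.INVAL.IR fences documented earlier in this file, which
// is our reading of how the Svinval ordering rules are meant to be combined. Assumes an RV64
// host and the unstable `riscv_ext_intrinsics` feature.
#[cfg(target_arch = "riscv64")]
unsafe fn invalidate_guest_vaddr(vaddr: usize, asid: usize) {
    use core::arch::riscv64::{hinval_vvma, sfence_inval_ir, sfence_w_inval};
    unsafe {
        sfence_w_inval(); // order prior page-table stores before the invalidation
        hinval_vvma(vaddr, asid); // drop matching VS-stage translations
        sfence_inval_ir(); // order the invalidation before later implicit fetches
    }
}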
+#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hinval_vvma_all() { + asm!(".insn r 0x73, 0, 0x13, x0, x0, x0", options(nostack)) +} + +/// Invalidate hypervisor translation cache for guest physical address and virtual machine +/// +/// This instruction invalidates any address-translation cache entries that an +/// `HFENCE.GVMA` instruction with the same values of `gaddr` and `vmid` would invalidate. +/// +/// This fence specifies a single guest physical address, **shifted right by 2 bits**, and a single virtual machine +/// by virtual machine identifier (VMID). +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hinval_gvma(gaddr: usize, vmid: usize) { + // asm!("hinval.gvma {}, {}", in(reg) gaddr, in(reg) vmid, options(nostack)) + asm!(".insn r 0x73, 0, 0x33, x0, {}, {}", in(reg) gaddr, in(reg) vmid, options(nostack)) +} + +/// Invalidate hypervisor translation cache for guest physical address +/// +/// This instruction invalidates any address-translation cache entries that an +/// `HFENCE.GVMA` instruction with the same values of `gaddr` and `vmid` would invalidate. +/// +/// This fence specifies a single guest physical address; **the physical address should be shifted right by 2 bits**. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hinval_gvma_gaddr(gaddr: usize) { + asm!(".insn r 0x73, 0, 0x33, x0, {}, x0", in(reg) gaddr, options(nostack)) +} + +/// Invalidate hypervisor translation cache for given virtual machine +/// +/// This instruction invalidates any address-translation cache entries that an +/// `HFENCE.GVMA` instruction with the same values of `gaddr` and `vmid` would invalidate. +/// +/// This fence specifies a single virtual machine by virtual machine identifier (VMID). +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hinval_gvma_vmid(vmid: usize) { + asm!(".insn r 0x73, 0, 0x33, x0, x0, {}", in(reg) vmid, options(nostack)) +} + +/// Invalidate hypervisor translation cache for all virtual machines and guest physical addresses +/// +/// This instruction invalidates any address-translation cache entries that an +/// `HFENCE.GVMA` instruction with the same values of `gaddr` and `vmid` would invalidate. +/// +/// This fence specifies all guest physical addresses and all virtual machines. +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub unsafe fn hinval_gvma_all() { + asm!(".insn r 0x73, 0, 0x33, x0, x0, x0", options(nostack)) +} + +/// Reads the floating-point rounding mode register `frm` +/// +/// According to "F" Standard Extension for Single-Precision Floating-Point, Version 2.2, +/// the rounding mode field is defined as listed in the table below: +/// +/// | Rounding Mode | Mnemonic | Meaning | +/// |:-------------|:----------|:---------| +/// | 000 | RNE | Round to Nearest, ties to Even | +/// | 001 | RTZ | Round towards Zero | +/// | 010 | RDN | Round Down (towards −∞) | +/// | 011 | RUP | Round Up (towards +∞) | +/// | 100 | RMM | Round to Nearest, ties to Max Magnitude | +/// | 101 | | _Reserved for future use._ | +/// | 110 | | _Reserved for future use._ | +/// | 111 | DYN | In Rounding Mode register, _reserved_. 
| +#[inline] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn frrm() -> u32 { + let value: u32; + unsafe { asm!("frrm {}", out(reg) value, options(nomem, nostack)) }; + value +} diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/p.rs b/library/stdarch/crates/core_arch/src/riscv_shared/p.rs new file mode 100644 index 0000000000000..c76a0ec4b8f26 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/riscv_shared/p.rs @@ -0,0 +1,1283 @@ +//! RISC-V Packed SIMD intrinsics; shared part. +//! +//! RV64 only part is placed in riscv64 folder. +use crate::arch::asm; +#[cfg(test)] +use stdarch_test::assert_instr; + +// FIXME: Currently the P extension is still unratified, so there is no support +// for it in the upstream LLVM for now, and thus no LLVM built-in functions or +// serialization of instructions are provided. +// +// We add `assert_instr(unknown)` to each function so that we can at least make +// sure they compile. Since there is no serialization yet, we can only write +// "unknown" here, so that if LLVM upstream provides support for the P extension +// at some point in the future, we can know in time and then update our +// implementation. + +/// Adds packed 16-bit signed numbers, discarding overflow bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn add16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x20, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves the sum of packed 16-bit signed numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn radd16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x00, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves the sum of packed 16-bit unsigned numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn uradd16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x10, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Adds packed 16-bit signed numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn kadd16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x08, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Adds packed 16-bit unsigned numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ukadd16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x18, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Subtracts packed 16-bit signed numbers, discarding overflow bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sub16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x21, {}, {}, {}", lateout(reg) value, 
in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves the subtraction result of packed 16-bit signed numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn rsub16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x01, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves the subtraction result of packed 16-bit unsigned numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ursub16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x11, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Subtracts packed 16-bit signed numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ksub16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x09, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Subtracts packed 16-bit unsigned numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn uksub16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x19, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross adds and subtracts packed 16-bit signed numbers, discarding overflow bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn cras16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x22, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross halves of adds and subtracts packed 16-bit signed numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn rcras16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x02, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross halves of adds and subtracts packed 16-bit unsigned numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn urcras16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x12, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross adds and subtracts packed 16-bit signed numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn kcras16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x0A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross adds and subtracts packed 16-bit unsigned numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, 
assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ukcras16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x1A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross subtracts and adds packed 16-bit signed numbers, discarding overflow bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn crsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x23, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross halves of subtracts and adds packed 16-bit signed numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn rcrsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x03, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross halves of subtracts and adds packed 16-bit unsigned numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn urcrsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x13, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross subtracts and adds packed 16-bit signed numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn kcrsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x0B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross subtracts and adds packed 16-bit unsigned numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ukcrsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x1B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight adds and subtracts packed 16-bit signed numbers, discarding overflow bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn stas16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x7A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight halves of adds and subtracts packed 16-bit signed numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn rstas16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x5A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight halves of adds and subtracts packed 16-bit unsigned numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn urstas16(a: usize, b: usize) -> usize { + let value: usize; + unsafe 
{ + asm!(".insn r 0x77, 0x0, 0x6A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight adds and subtracts packed 16-bit signed numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn kstas16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x62, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight adds and subtracts packed 16-bit unsigned numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ukstas16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x72, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight subtracts and adds packed 16-bit signed numbers, discarding overflow bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn stsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x7B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight halves of subtracts and adds packed 16-bit signed numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn rstsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x5B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight halves of subtracts and adds packed 16-bit unsigned numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn urstsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x6B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight subtracts and adds packed 16-bit signed numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn kstsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x63, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight subtracts and adds packed 16-bit unsigned numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ukstsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x73, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Adds packed 8-bit signed numbers, discarding overflow bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn add8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x24, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves the sum of packed 
8-bit signed numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn radd8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x04, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves the sum of packed 8-bit unsigned numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn uradd8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x14, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Adds packed 8-bit signed numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn kadd8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x0C, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Adds packed 8-bit unsigned numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ukadd8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x1C, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Subtracts packed 8-bit signed numbers, discarding overflow bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sub8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x25, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves the subtraction result of packed 8-bit signed numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn rsub8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x05, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves the subtraction result of packed 8-bit unsigned numbers, dropping least bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ursub8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x15, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Subtracts packed 8-bit signed numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ksub8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x0D, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Subtracts packed 8-bit unsigned numbers, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn uksub8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x1D, {}, {}, {}", 
lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Arithmetic right shift packed 16-bit elements without rounding up +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sra16(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x28, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Arithmetic right shift packed 16-bit elements with rounding up +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sra16u(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x30, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical right shift packed 16-bit elements without rounding up +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn srl16(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x29, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical right shift packed 16-bit elements with rounding up +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn srl16u(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x31, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical left shift packed 16-bit elements, discarding overflow bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sll16(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x2A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical left shift packed 16-bit elements, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ksll16(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x32, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical saturating left then arithmetic right shift packed 16-bit elements +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn kslra16(a: usize, b: i32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x2B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical saturating left then arithmetic right shift packed 16-bit elements +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn kslra16u(a: usize, b: i32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x33, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Arithmetic right shift packed 8-bit elements without rounding up +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sra8(a: usize, b: u32) -> usize { + let value: 
usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x2C, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Arithmetic right shift packed 8-bit elements with rounding up +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sra8u(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x34, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical right shift packed 8-bit elements without rounding up +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn srl8(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x2D, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical right shift packed 8-bit elements with rounding up +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn srl8u(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x35, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical left shift packed 8-bit elements, discarding overflow bits +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sll8(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x2E, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical left shift packed 8-bit elements, saturating at the numeric bounds +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ksll8(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x36, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical saturating left then arithmetic right shift packed 8-bit elements +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn kslra8(a: usize, b: i32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x2F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical saturating left then arithmetic right shift packed 8-bit elements +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn kslra8u(a: usize, b: i32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x37, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare equality for packed 16-bit elements +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn cmpeq16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x26, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare whether 16-bit packed signed integers are less than the others +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn scmplt16(a: 
usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x06, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare whether 16-bit packed signed integers are less than or equal to the others +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn scmple16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x0E, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare whether 16-bit packed unsigned integers are less than the others +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ucmplt16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x16, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare whether 16-bit packed unsigned integers are less than or equal to the others +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ucmple16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x1E, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare equality for packed 8-bit elements +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn cmpeq8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x27, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare whether 8-bit packed signed integers are less than the others +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn scmplt8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x07, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare whether 8-bit packed signed integers are less than or equal to the others +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn scmple8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x0F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare whether 8-bit packed unsigned integers are less than the others +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ucmplt8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x17, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare whether 8-bit packed unsigned integers are less than or equal to the others +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ucmple8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x1F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Get minimum values from 16-bit packed signed integers +#[inline] 
+#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn smin16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x40, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Get minimum values from 16-bit packed unsigned integers +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn umin16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x48, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Get maximum values from 16-bit packed signed integers +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn smax16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x41, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Get maximum values from 16-bit packed unsigned integers +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn umax16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x49, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/* todo: sclip16, uclip16 */ + +/// Compute the absolute value of packed 16-bit signed integers +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn kabs16(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xAD1)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Count the number of redundant sign bits of the packed 16-bit elements +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn clrs16(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xAE8)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Count the number of leading zero bits of the packed 16-bit elements +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn clz16(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xAE9)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Swap the 16-bit halfwords within each 32-bit word of a register +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn swap16(a: usize) -> usize { + let value: usize; + // this instruction is an alias for `pkbt rd, rs1, rs1`. 
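+    // For illustration (assuming RV32/XLEN = 32): an input of 0x1234_5678 becomes
+    // 0x5678_1234, i.e. the two 16-bit halfwords of the 32-bit word trade places.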
+ unsafe { + asm!(".insn r 0x77, 0x0, 0x0F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Get minimum values from 8-bit packed signed integers +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn smin8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x44, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Get minimum values from 8-bit packed unsigned integers +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn umin8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x4C, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Get maximum values from 8-bit packed signed integers +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn smax8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x45, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Get maximum values from 8-bit packed unsigned integers +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn umax8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x4D, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/* todo: sclip8, uclip8 */ + +/// Compute the absolute value of packed 8-bit signed integers +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn kabs8(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xAD0)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Count the number of redundant sign bits of the packed 8-bit elements +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn clrs8(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xAE0)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Count the number of leading zero bits of the packed 8-bit elements +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn clz8(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xAE1)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Swap the 8-bit bytes within each 16-bit halfword of a register. 
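+///
+/// A plain-Rust sketch of the effect (illustrative only, assuming RV32/XLEN = 32):
+///
+/// ```
+/// fn swap8_model(x: u32) -> u32 {
+///     // Move the low byte of each halfword up and the high byte down.
+///     ((x & 0x00FF_00FF) << 8) | ((x & 0xFF00_FF00) >> 8)
+/// }
+/// assert_eq!(swap8_model(0x1234_5678), 0x3412_7856);
+/// ```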
+#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn swap8(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xAD8)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack first and zeroth into two 16-bit signed halfwords in each 32-bit chunk +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sunpkd810(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xAC8)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack second and zeroth into two 16-bit signed halfwords in each 32-bit chunk +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sunpkd820(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xAC9)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack third and zeroth into two 16-bit signed halfwords in each 32-bit chunk +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sunpkd830(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xACA)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack third and first into two 16-bit signed halfwords in each 32-bit chunk +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sunpkd831(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xACB)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack third and second into two 16-bit signed halfwords in each 32-bit chunk +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn sunpkd832(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xAD3)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack first and zeroth into two 16-bit unsigned halfwords in each 32-bit chunk +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn zunpkd810(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xACC)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack second and zeroth into two 16-bit unsigned halfwords in each 32-bit chunk +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn zunpkd820(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xACD)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack third and zeroth into two 16-bit unsigned halfwords in each 32-bit chunk +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn zunpkd830(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xACE)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack third and first into two 16-bit unsigned 
halfwords in each 32-bit chunk +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn zunpkd831(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xACF)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack third and second into two 16-bit unsigned halfwords in each 32-bit chunk +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn zunpkd832(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xAD7)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +// todo: pkbb16, pktt16 + +/// Pack two 16-bit data from bottom and top half from 32-bit chunks +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn pkbt16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x1, 0x0F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Pack two 16-bit data from top and bottom half from 32-bit chunks +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn pktb16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x1, 0x1F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Count the number of redundant sign bits of the packed 32-bit elements +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn clrs32(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xAF8)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Count the number of leading zero bits of the packed 32-bit elements +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn clz32(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, %lo(0xAF9)", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Calculate the sum of absolute difference of unsigned 8-bit data elements +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn pbsad(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x7E, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Calculate and accumulate the sum of absolute difference of unsigned 8-bit data elements +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn pbsada(t: usize, a: usize, b: usize) -> usize { + let mut value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x7F, {}, {}, {}", inlateout(reg) t => value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Multiply signed 8-bit elements and add 16-bit elements on results for packed 32-bit chunks +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn smaqa(t: usize, a: usize, b: usize) -> usize { + let mut value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x64, {}, {}, {}", 
inlateout(reg) t => value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Multiply unsigned 8-bit elements and add 16-bit elements on results for packed 32-bit chunks +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn umaqa(t: usize, a: usize, b: usize) -> usize { + let mut value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x66, {}, {}, {}", inlateout(reg) t => value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Multiply signed to unsigned 8-bit and add 16-bit elements on results for packed 32-bit chunks +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn smaqasu(t: usize, a: usize, b: usize) -> usize { + let mut value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x65, {}, {}, {}", inlateout(reg) t => value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Adds signed lower 16-bit content of two registers with Q15 saturation +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn kaddh(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x1, 0x02, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Subtracts signed lower 16-bit content of two registers with Q15 saturation +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ksubh(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x1, 0x03, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Adds signed lower 16-bit content of two registers with U16 saturation +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn ukaddh(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x1, 0x0A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Subtracts signed lower 16-bit content of two registers with U16 saturation +#[inline] +#[cfg_attr(test, assert_instr(unknown))] +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +pub fn uksubh(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x1, 0x0B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs b/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs new file mode 100644 index 0000000000000..9472e3c8be9f6 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs @@ -0,0 +1,134 @@ +#[cfg(test)] +use stdarch_test::assert_instr; + +#[cfg(target_arch = "riscv32")] +unsafe extern "unadjusted" { + #[link_name = "llvm.riscv.orc.b.i32"] + fn _orc_b_32(rs: i32) -> i32; + + #[link_name = "llvm.riscv.clmul.i32"] + fn _clmul_32(rs1: i32, rs2: i32) -> i32; + + #[link_name = "llvm.riscv.clmulh.i32"] + fn _clmulh_32(rs1: i32, rs2: i32) -> i32; + + #[link_name = "llvm.riscv.clmulr.i32"] + fn _clmulr_32(rs1: i32, rs2: i32) -> i32; +} + +#[cfg(target_arch = "riscv64")] +unsafe extern "unadjusted" { + #[link_name = "llvm.riscv.orc.b.i64"] + fn _orc_b_64(rs1: i64) -> i64; + + #[link_name = "llvm.riscv.clmul.i64"] + fn _clmul_64(rs1: 
i64, rs2: i64) -> i64; + + #[link_name = "llvm.riscv.clmulh.i64"] + fn _clmulh_64(rs1: i64, rs2: i64) -> i64; + + #[link_name = "llvm.riscv.clmulr.i64"] + fn _clmulr_64(rs1: i64, rs2: i64) -> i64; +} + +/// Bitwise OR-Combine, byte granule +/// +/// Combines the bits within every byte through a reciprocal bitwise logical OR. This sets the bits of each byte in +/// the result rd to all zeros if no bit within the respective byte of rs is set, or to all ones if any bit within the +/// respective byte of rs is set. +/// +/// Source: RISC-V Bit-Manipulation ISA-extensions +/// +/// Version: v1.0.0 +/// +/// Section: 2.24 +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +#[target_feature(enable = "zbb")] +#[cfg_attr(test, assert_instr(orc.b))] +#[inline] +pub fn orc_b(rs: usize) -> usize { + #[cfg(target_arch = "riscv32")] + unsafe { + _orc_b_32(rs as i32) as usize + } + + #[cfg(target_arch = "riscv64")] + unsafe { + _orc_b_64(rs as i64) as usize + } +} + +/// Carry-less multiply (low-part) +/// +/// clmul produces the lower half of the 2·XLEN carry-less product. +/// +/// Source: RISC-V Bit-Manipulation ISA-extensions +/// +/// Version: v1.0.0 +/// +/// Section: 2.11 +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +#[target_feature(enable = "zbc")] +#[cfg_attr(test, assert_instr(clmul))] +#[inline] +pub fn clmul(rs1: usize, rs2: usize) -> usize { + #[cfg(target_arch = "riscv32")] + unsafe { + _clmul_32(rs1 as i32, rs2 as i32) as usize + } + + #[cfg(target_arch = "riscv64")] + unsafe { + _clmul_64(rs1 as i64, rs2 as i64) as usize + } +} + +/// Carry-less multiply (high-part) +/// +/// clmulh produces the upper half of the 2·XLEN carry-less product. +/// +/// Source: RISC-V Bit-Manipulation ISA-extensions +/// +/// Version: v1.0.0 +/// +/// Section: 2.12 +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +#[target_feature(enable = "zbc")] +#[cfg_attr(test, assert_instr(clmulh))] +#[inline] +pub fn clmulh(rs1: usize, rs2: usize) -> usize { + #[cfg(target_arch = "riscv32")] + unsafe { + _clmulh_32(rs1 as i32, rs2 as i32) as usize + } + + #[cfg(target_arch = "riscv64")] + unsafe { + _clmulh_64(rs1 as i64, rs2 as i64) as usize + } +} + +/// Carry-less multiply (reversed) +/// +/// clmulr produces bits 2·XLEN−2:XLEN-1 of the 2·XLEN carry-less product. 
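+///
+/// The relationship between `clmul`, `clmulh` and `clmulr` can be sketched with a
+/// plain-Rust model of the full 2·XLEN-bit product (illustrative only, XLEN = 64):
+///
+/// ```
+/// // Carry-less product of two 64-bit values, widened to 128 bits.
+/// fn clmul_wide(a: u64, b: u64) -> u128 {
+///     let mut acc = 0u128;
+///     for i in 0..64 {
+///         if (b >> i) & 1 == 1 {
+///             acc ^= (a as u128) << i;
+///         }
+///     }
+///     acc
+/// }
+/// let p = clmul_wide(1 << 63, 0b11);
+/// assert_eq!(p as u64, 1 << 63);      // bits XLEN-1..0        -> clmul
+/// assert_eq!((p >> 64) as u64, 1);    // bits 2·XLEN-1..XLEN   -> clmulh
+/// assert_eq!((p >> 63) as u64, 0b11); // bits 2·XLEN-2..XLEN-1 -> clmulr
+/// ```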
+///
+/// Source: RISC-V Bit-Manipulation ISA-extensions
+///
+/// Version: v1.0.0
+///
+/// Section: 2.13
+#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
+#[target_feature(enable = "zbc")]
+#[cfg_attr(test, assert_instr(clmulr))]
+#[inline]
+pub fn clmulr(rs1: usize, rs2: usize) -> usize {
+    #[cfg(target_arch = "riscv32")]
+    unsafe {
+        _clmulr_32(rs1 as i32, rs2 as i32) as usize
+    }
+
+    #[cfg(target_arch = "riscv64")]
+    unsafe {
+        _clmulr_64(rs1 as i64, rs2 as i64) as usize
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs b/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs
new file mode 100644
index 0000000000000..b1e633d170223
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs
@@ -0,0 +1,422 @@
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+unsafe extern "unadjusted" {
+    #[link_name = "llvm.riscv.sm4ed"]
+    fn _sm4ed(rs1: i32, rs2: i32, bs: i32) -> i32;
+
+    #[link_name = "llvm.riscv.sm4ks"]
+    fn _sm4ks(rs1: i32, rs2: i32, bs: i32) -> i32;
+
+    #[link_name = "llvm.riscv.sm3p0"]
+    fn _sm3p0(rs1: i32) -> i32;
+
+    #[link_name = "llvm.riscv.sm3p1"]
+    fn _sm3p1(rs1: i32) -> i32;
+
+    #[link_name = "llvm.riscv.sha256sig0"]
+    fn _sha256sig0(rs1: i32) -> i32;
+
+    #[link_name = "llvm.riscv.sha256sig1"]
+    fn _sha256sig1(rs1: i32) -> i32;
+
+    #[link_name = "llvm.riscv.sha256sum0"]
+    fn _sha256sum0(rs1: i32) -> i32;
+
+    #[link_name = "llvm.riscv.sha256sum1"]
+    fn _sha256sum1(rs1: i32) -> i32;
+}
+
+#[cfg(target_arch = "riscv32")]
+unsafe extern "unadjusted" {
+    #[link_name = "llvm.riscv.xperm8.i32"]
+    fn _xperm8_32(rs1: i32, rs2: i32) -> i32;
+
+    #[link_name = "llvm.riscv.xperm4.i32"]
+    fn _xperm4_32(rs1: i32, rs2: i32) -> i32;
+}
+
+#[cfg(target_arch = "riscv64")]
+unsafe extern "unadjusted" {
+    #[link_name = "llvm.riscv.xperm8.i64"]
+    fn _xperm8_64(rs1: i64, rs2: i64) -> i64;
+
+    #[link_name = "llvm.riscv.xperm4.i64"]
+    fn _xperm4_64(rs1: i64, rs2: i64) -> i64;
+}
+
+/// Byte-wise lookup of indices into a vector in registers.
+///
+/// The xperm8 instruction operates on bytes. The rs1 register contains a vector of XLEN/8
+/// 8-bit elements. The rs2 register contains a vector of XLEN/8 8-bit indexes. The result is
+/// each element in rs2 replaced by the indexed element in rs1, or zero if the index into rs2
+/// is out of bounds.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.47
+#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
+#[target_feature(enable = "zbkx")]
+#[cfg_attr(test, assert_instr(xperm8))]
+#[inline]
+pub fn xperm8(rs1: usize, rs2: usize) -> usize {
+    #[cfg(target_arch = "riscv32")]
+    unsafe {
+        _xperm8_32(rs1 as i32, rs2 as i32) as usize
+    }
+
+    #[cfg(target_arch = "riscv64")]
+    unsafe {
+        _xperm8_64(rs1 as i64, rs2 as i64) as usize
+    }
+}
+
+/// Nibble-wise lookup of indices into a vector.
+///
+/// The xperm4 instruction operates on nibbles. The rs1 register contains a vector of XLEN/4
+/// 4-bit elements. The rs2 register contains a vector of XLEN/4 4-bit indexes. The result is
+/// each element in rs2 replaced by the indexed element in rs1, or zero if the index into rs2
+/// is out of bounds.
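+///
+/// A plain-Rust model of the nibble lookup (illustrative only, assuming XLEN = 32):
+///
+/// ```
+/// fn xperm4_model(rs1: u32, rs2: u32) -> u32 {
+///     let mut result = 0u32;
+///     for j in 0..8 {
+///         let index = (rs2 >> (4 * j)) & 0xF;
+///         // Indexes beyond the 8 nibbles of rs1 select zero.
+///         let element = if index < 8 { (rs1 >> (4 * index)) & 0xF } else { 0 };
+///         result |= element << (4 * j);
+///     }
+///     result
+/// }
+/// // rs1 holds the identity permutation (nibble k contains k).
+/// assert_eq!(xperm4_model(0x7654_3210, 0x0000_F231), 0x0000_0231);
+/// ```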
+/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.48 +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +#[target_feature(enable = "zbkx")] +#[cfg_attr(test, assert_instr(xperm4))] +#[inline] +pub fn xperm4(rs1: usize, rs2: usize) -> usize { + #[cfg(target_arch = "riscv32")] + unsafe { + _xperm4_32(rs1 as i32, rs2 as i32) as usize + } + + #[cfg(target_arch = "riscv64")] + unsafe { + _xperm4_64(rs1 as i64, rs2 as i64) as usize + } +} + +/// Implements the Sigma0 transformation function as used in the SHA2-256 hash function \[49\] +/// (Section 4.1.2). +/// +/// This instruction is supported for both RV32 and RV64 base architectures. For RV32, the +/// entire XLEN source register is operated on. For RV64, the low 32 bits of the source +/// register are operated on, and the result sign extended to XLEN bits. Though named for +/// SHA2-256, the instruction works for both the SHA2-224 and SHA2-256 parameterisations as +/// described in \[49\]. This instruction must always be implemented such that its execution +/// latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.27 +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +#[target_feature(enable = "zknh")] +#[cfg_attr(test, assert_instr(sha256sig0))] +#[inline] +pub fn sha256sig0(rs1: u32) -> u32 { + unsafe { _sha256sig0(rs1 as i32) as u32 } +} + +/// Implements the Sigma1 transformation function as used in the SHA2-256 hash function \[49\] +/// (Section 4.1.2). +/// +/// This instruction is supported for both RV32 and RV64 base architectures. For RV32, the +/// entire XLEN source register is operated on. For RV64, the low 32 bits of the source +/// register are operated on, and the result sign extended to XLEN bits. Though named for +/// SHA2-256, the instruction works for both the SHA2-224 and SHA2-256 parameterisations as +/// described in \[49\]. This instruction must always be implemented such that its execution +/// latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.28 +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +#[target_feature(enable = "zknh")] +#[cfg_attr(test, assert_instr(sha256sig1))] +#[inline] +pub fn sha256sig1(rs1: u32) -> u32 { + unsafe { _sha256sig1(rs1 as i32) as u32 } +} + +/// Implements the Sum0 transformation function as used in the SHA2-256 hash function \[49\] +/// (Section 4.1.2). +/// +/// This instruction is supported for both RV32 and RV64 base architectures. For RV32, the +/// entire XLEN source register is operated on. For RV64, the low 32 bits of the source +/// register are operated on, and the result sign extended to XLEN bits. Though named for +/// SHA2-256, the instruction works for both the SHA2-224 and SHA2-256 parameterisations as +/// described in \[49\]. This instruction must always be implemented such that its execution +/// latency does not depend on the data being operated on. 
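+///
+/// For reference, FIPS 180-4 defines Sum0 as `ROTR^2(x) ^ ROTR^13(x) ^ ROTR^22(x)`;
+/// a plain-Rust sketch (illustrative only):
+///
+/// ```
+/// fn sum0_model(x: u32) -> u32 {
+///     x.rotate_right(2) ^ x.rotate_right(13) ^ x.rotate_right(22)
+/// }
+/// assert_eq!(sum0_model(1), 0x4008_0400);
+/// ```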
+/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.29 +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +#[target_feature(enable = "zknh")] +#[cfg_attr(test, assert_instr(sha256sum0))] +#[inline] +pub fn sha256sum0(rs1: u32) -> u32 { + unsafe { _sha256sum0(rs1 as i32) as u32 } +} + +/// Implements the Sum1 transformation function as used in the SHA2-256 hash function \[49\] +/// (Section 4.1.2). +/// +/// This instruction is supported for both RV32 and RV64 base architectures. For RV32, the +/// entire XLEN source register is operated on. For RV64, the low 32 bits of the source +/// register are operated on, and the result sign extended to XLEN bits. Though named for +/// SHA2-256, the instruction works for both the SHA2-224 and SHA2-256 parameterisations as +/// described in \[49\]. This instruction must always be implemented such that its execution +/// latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.30 +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +#[target_feature(enable = "zknh")] +#[cfg_attr(test, assert_instr(sha256sum1))] +#[inline] +pub fn sha256sum1(rs1: u32) -> u32 { + unsafe { _sha256sum1(rs1 as i32) as u32 } +} + +/// Accelerates the block encrypt/decrypt operation of the SM4 block cipher \[5, 31\]. +/// +/// Implements a T-tables in hardware style approach to accelerating the SM4 round function. A +/// byte is extracted from rs2 based on bs, to which the SBox and linear layer transforms are +/// applied, before the result is XOR’d with rs1 and written back to rd. This instruction +/// exists on RV32 and RV64 base architectures. On RV64, the 32-bit result is sign extended to +/// XLEN bits. This instruction must always be implemented such that its execution latency does +/// not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.43 +/// +/// # Note +/// +/// The `BS` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are +/// used. +/// +/// # Details +/// +/// Accelerates the round function `F` in the SM4 block cipher algorithm +/// +/// This instruction is included in extension `Zksed`. It's defined as: +/// +/// ```text +/// SM4ED(x, a, BS) = x ⊕ T(ai) +/// ... where +/// ai = a.bytes[BS] +/// T(ai) = L(τ(ai)) +/// bi = τ(ai) = SM4-S-Box(ai) +/// ci = L(bi) = bi ⊕ (bi ≪ 2) ⊕ (bi ≪ 10) ⊕ (bi ≪ 18) ⊕ (bi ≪ 24) +/// SM4ED = (ci ≪ (BS * 8)) ⊕ x +/// ``` +/// +/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits. +/// As is defined above, `T` is a combined transformation of non linear S-Box transform `τ` +/// and linear layer transform `L`. +/// +/// In the SM4 algorithm, the round function `F` is defined as: +/// +/// ```text +/// F(x0, x1, x2, x3, rk) = x0 ⊕ T(x1 ⊕ x2 ⊕ x3 ⊕ rk) +/// ... 
where
+/// T(A) = L(τ(A))
+/// B = τ(A) = (SM4-S-Box(a0), SM4-S-Box(a1), SM4-S-Box(a2), SM4-S-Box(a3))
+/// C = L(B) = B ⊕ (B ≪ 2) ⊕ (B ≪ 10) ⊕ (B ≪ 18) ⊕ (B ≪ 24)
+/// ```
+///
+/// It can be implemented by `sm4ed` instruction like:
+///
+/// ```no_run
+/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
+/// # fn round_function(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 {
+/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ed;
+/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ed;
+/// let a = x1 ^ x2 ^ x3 ^ rk;
+/// let c0 = sm4ed(x0, a, 0);
+/// let c1 = sm4ed(c0, a, 1); // c1 represents c[0..=1], etc.
+/// let c2 = sm4ed(c1, a, 2);
+/// let c3 = sm4ed(c2, a, 3);
+/// return c3; // c3 represents c[0..=3]
+/// # }
+/// ```
+#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
+#[target_feature(enable = "zksed")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(sm4ed, BS = 0))]
+#[inline]
+pub fn sm4ed<const BS: u8>(rs1: u32, rs2: u32) -> u32 {
+    static_assert!(BS < 4);
+
+    unsafe { _sm4ed(rs1 as i32, rs2 as i32, BS as i32) as u32 }
+}
+
+/// Accelerates the Key Schedule operation of the SM4 block cipher \[5, 31\] with `bs=0`.
+///
+/// Implements a T-tables in hardware style approach to accelerating the SM4 Key Schedule. A
+/// byte is extracted from rs2 based on bs, to which the SBox and linear layer transforms are
+/// applied, before the result is XOR’d with rs1 and written back to rd. This instruction
+/// exists on RV32 and RV64 base architectures. On RV64, the 32-bit result is sign extended to
+/// XLEN bits. This instruction must always be implemented such that its execution latency does
+/// not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.44
+///
+/// # Note
+///
+/// The `BS` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are
+/// used.
+///
+/// # Details
+///
+/// Accelerates the round function `F` in the SM4 block cipher algorithm
+///
+/// This instruction is included in extension `Zksed`. It's defined as:
+///
+/// ```text
+/// SM4ED(x, a, BS) = x ⊕ T(ai)
+/// ... where
+/// ai = a.bytes[BS]
+/// T(ai) = L(τ(ai))
+/// bi = τ(ai) = SM4-S-Box(ai)
+/// ci = L(bi) = bi ⊕ (bi ≪ 2) ⊕ (bi ≪ 10) ⊕ (bi ≪ 18) ⊕ (bi ≪ 24)
+/// SM4ED = (ci ≪ (BS * 8)) ⊕ x
+/// ```
+///
+/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits.
+/// As is defined above, `T` is a combined transformation of non linear S-Box transform `τ`
+/// and linear layer transform `L`.
+///
+/// In the SM4 algorithm, the round function `F` is defined as:
+///
+/// ```text
+/// F(x0, x1, x2, x3, rk) = x0 ⊕ T(x1 ⊕ x2 ⊕ x3 ⊕ rk)
+/// ... where
+/// T(A) = L(τ(A))
+/// B = τ(A) = (SM4-S-Box(a0), SM4-S-Box(a1), SM4-S-Box(a2), SM4-S-Box(a3))
+/// C = L(B) = B ⊕ (B ≪ 2) ⊕ (B ≪ 10) ⊕ (B ≪ 18) ⊕ (B ≪ 24)
+/// ```
+///
+/// It can be implemented by `sm4ed` instruction like:
+///
+/// ```no_run
+/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
+/// # fn round_function(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 {
+/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ed;
+/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ed;
+/// let a = x1 ^ x2 ^ x3 ^ rk;
+/// let c0 = sm4ed(x0, a, 0);
+/// let c1 = sm4ed(c0, a, 1); // c1 represents c[0..=1], etc.
+/// let c2 = sm4ed(c1, a, 2);
+/// let c3 = sm4ed(c2, a, 3);
+/// return c3; // c3 represents c[0..=3]
+/// # }
+/// ```
+#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
+#[target_feature(enable = "zksed")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(sm4ks, BS = 0))]
+#[inline]
+pub fn sm4ks<const BS: u8>(rs1: u32, rs2: u32) -> u32 {
+    static_assert!(BS < 4);
+
+    unsafe { _sm4ks(rs1 as i32, rs2 as i32, BS as i32) as u32 }
+}
+
+/// Implements the P0 transformation function as used in the SM3 hash function [4, 30].
+///
+/// This instruction is supported for the RV32 and RV64 base architectures. It implements the
+/// P0 transform of the SM3 hash function [4, 30]. This instruction must always be implemented
+/// such that its execution latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.41
+///
+/// # Details
+///
+/// `P0` transformation function as is used in the SM3 hash algorithm
+///
+/// This function is included in `Zksh` extension. It's defined as:
+///
+/// ```text
+/// P0(X) = X ⊕ (X ≪ 9) ⊕ (X ≪ 17)
+/// ```
+///
+/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits.
+///
+/// In the SM3 algorithm, the `P0` transformation is used as `E ← P0(TT2)` when the
+/// compression function `CF` uses the intermediate value `TT2` to calculate
+/// the variable `E` in one iteration for subsequent processes.
+#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
+#[target_feature(enable = "zksh")]
+#[cfg_attr(test, assert_instr(sm3p0))]
+#[inline]
+pub fn sm3p0(rs1: u32) -> u32 {
+    unsafe { _sm3p0(rs1 as i32) as u32 }
+}
+
+/// Implements the P1 transformation function as used in the SM3 hash function [4, 30].
+///
+/// This instruction is supported for the RV32 and RV64 base architectures. It implements the
+/// P1 transform of the SM3 hash function [4, 30]. This instruction must always be implemented
+/// such that its execution latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.42
+///
+/// # Details
+///
+/// `P1` transformation function as is used in the SM3 hash algorithm
+///
+/// This function is included in `Zksh` extension. It's defined as:
+///
+/// ```text
+/// P1(X) = X ⊕ (X ≪ 15) ⊕ (X ≪ 23)
+/// ```
+///
+/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits.
+///
+/// In the SM3 algorithm, the `P1` transformation is used to expand message,
+/// where expanded word `Wj` can be generated from the previous words.
+/// The whole process can be described as the following pseudocode: +/// +/// ```text +/// FOR j=16 TO 67 +/// Wj ← P1(Wj−16 ⊕ Wj−9 ⊕ (Wj−3 ≪ 15)) ⊕ (Wj−13 ≪ 7) ⊕ Wj−6 +/// ENDFOR +/// ``` +#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] +#[target_feature(enable = "zksh")] +#[cfg_attr(test, assert_instr(sm3p1))] +#[inline] +pub fn sm3p1(rs1: u32) -> u32 { + unsafe { _sm3p1(rs1 as i32) as u32 } +} diff --git a/library/stdarch/crates/core_arch/src/s390x/macros.rs b/library/stdarch/crates/core_arch/src/s390x/macros.rs new file mode 100644 index 0000000000000..4f0f84ec912b7 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/s390x/macros.rs @@ -0,0 +1,473 @@ +#![allow(unused_macros)] // FIXME remove when more tests are added +#![allow(unused_imports)] // FIXME remove when more tests are added + +macro_rules! test_impl { + ($fun:ident ($($v:ident : $ty:ty),*) -> $r:ty [$call:ident, _]) => { + #[inline] + #[target_feature(enable = "vector")] + pub unsafe fn $fun ($($v : $ty),*) -> $r { + $call ($($v),*) + } + }; + ($fun:ident +($($v:ident : $ty:ty),*) -> $r:ty [$call:ident, $instr:ident]) => { + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($instr))] + pub unsafe fn $fun ($($v : $ty),*) -> $r { + transmute($call ($($v),*)) + } + }; + ($fun:ident +($($v:ident : $ty:ty),*) -> $r:ty [$call:ident, $tf:literal $instr:ident]) => { + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(all(test, target_feature = $tf), assert_instr($instr))] + pub unsafe fn $fun ($($v : $ty),*) -> $r { + transmute($call ($($v),*)) + } + }; + ($fun:ident ($($v:ident : $ty:ty),*) -> $r:ty [$call:ident, $tf:literal $instr:ident]) => { + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(all(test, target_feature = $tf), assert_instr($instr))] + pub unsafe fn $fun ($($v : $ty),*) -> $r { + $call ($($v),*) + } + }; + ($fun:ident ($($v:ident : $ty:ty),*) -> $r:ty [$call:ident, $instr:ident]) => { + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($instr))] + pub unsafe fn $fun ($($v : $ty),*) -> $r { + $call ($($v),*) + } + }; +} + +#[allow(unknown_lints, unused_macro_rules)] +macro_rules! 
impl_vec_trait { + ([$Trait:ident $m:ident] $fun:ident ($a:ty)) => { + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl $Trait for $a { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn $m(self) -> Self { + $fun(transmute(self)) + } + } + }; + ([$Trait:ident $m:ident]+ $fun:ident ($a:ty)) => { + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl $Trait for $a { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn $m(self) -> Self { + transmute($fun(transmute(self))) + } + } + }; + ([$Trait:ident $m:ident] $fun:ident ($a:ty) -> $r:ty) => { + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl $Trait for $a { + type Result = $r; + #[inline] + #[target_feature(enable = "vector")] + unsafe fn $m(self) -> Self::Result { + $fun(transmute(self)) + } + } + }; + ([$Trait:ident $m:ident]+ $fun:ident ($a:ty) -> $r:ty) => { + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl $Trait for $a { + type Result = $r; + #[inline] + #[target_feature(enable = "vector")] + unsafe fn $m(self) -> Self::Result { + transmute($fun(transmute(self))) + } + } + }; + ([$Trait:ident $m:ident] 1 ($ub:ident, $sb:ident, $uh:ident, $sh:ident, $uw:ident, $sw:ident, $sf: ident)) => { + impl_vec_trait!{ [$Trait $m] $ub (vector_unsigned_char) -> vector_unsigned_char } + impl_vec_trait!{ [$Trait $m] $sb (vector_signed_char) -> vector_signed_char } + impl_vec_trait!{ [$Trait $m] $uh (vector_unsigned_short) -> vector_unsigned_short } + impl_vec_trait!{ [$Trait $m] $sh (vector_signed_short) -> vector_signed_short } + impl_vec_trait!{ [$Trait $m] $uw (vector_unsigned_int) -> vector_unsigned_int } + impl_vec_trait!{ [$Trait $m] $sw (vector_signed_int) -> vector_signed_int } + impl_vec_trait!{ [$Trait $m] $uw (vector_unsigned_long_long) -> vector_unsigned_long_long } + impl_vec_trait!{ [$Trait $m] $sw (vector_signed_long_long) -> vector_signed_long_long } + impl_vec_trait!{ [$Trait $m] $sf (vector_float) -> vector_float } + impl_vec_trait!{ [$Trait $m] $sf (vector_double) -> vector_double } + }; + ([$Trait:ident $m:ident] $fun:ident ($a:ty, $b:ty) -> $r:ty) => { + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl $Trait<$b> for $a { + type Result = $r; + #[inline] + #[target_feature(enable = "vector")] + unsafe fn $m(self, b: $b) -> Self::Result { + $fun(transmute(self), transmute(b)) + } + } + }; + ([$Trait:ident $m:ident]+ $fun:ident ($a:ty, $b:ty) -> $r:ty) => { + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl $Trait<$b> for $a { + type Result = $r; + #[inline] + #[target_feature(enable = "vector")] + unsafe fn $m(self, b: $b) -> Self::Result { + transmute($fun(transmute(self), transmute(b))) + } + } + }; + ([$Trait:ident $m:ident] $fun:ident ($a:ty, ~$b:ty) -> $r:ty) => { + impl_vec_trait!{ [$Trait $m] $fun ($a, $a) -> $r } + impl_vec_trait!{ [$Trait $m] $fun ($a, $b) -> $r } + impl_vec_trait!{ [$Trait $m] $fun ($b, $a) -> $r } + }; + ([$Trait:ident $m:ident] ~($ub:ident, $sb:ident, $uh:ident, $sh:ident, $uw:ident, $sw:ident, $ug:ident, $sg:ident)) => { + impl_vec_trait!{ [$Trait $m] $ub (vector_unsigned_char, ~vector_bool_char) -> vector_unsigned_char } + impl_vec_trait!{ [$Trait $m] $sb (vector_signed_char, ~vector_bool_char) -> vector_signed_char } + impl_vec_trait!{ [$Trait $m] $uh (vector_unsigned_short, ~vector_bool_short) -> vector_unsigned_short } + impl_vec_trait!{ [$Trait $m] $sh (vector_signed_short, ~vector_bool_short) -> vector_signed_short } + impl_vec_trait!{ [$Trait $m] $uw (vector_unsigned_int, 
~vector_bool_int) -> vector_unsigned_int } + impl_vec_trait!{ [$Trait $m] $sw (vector_signed_int, ~vector_bool_int) -> vector_signed_int } + impl_vec_trait!{ [$Trait $m] $ug (vector_unsigned_long_long, ~vector_bool_long_long) -> vector_unsigned_long_long } + impl_vec_trait!{ [$Trait $m] $sg (vector_signed_long_long, ~vector_bool_long_long) -> vector_signed_long_long } + }; + ([$Trait:ident $m:ident] ~($fn:ident)) => { + impl_vec_trait!{ [$Trait $m] ~($fn, $fn, $fn, $fn, $fn, $fn, $fn, $fn) } + }; + ([$Trait:ident $m:ident] 2 ($ub:ident, $sb:ident, $uh:ident, $sh:ident, $uw:ident, $sw:ident, $ug:ident, $sg:ident)) => { + impl_vec_trait!{ [$Trait $m] $ub (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char } + impl_vec_trait!{ [$Trait $m] $sb (vector_signed_char, vector_signed_char) -> vector_signed_char } + impl_vec_trait!{ [$Trait $m] $uh (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_short } + impl_vec_trait!{ [$Trait $m] $sh (vector_signed_short, vector_signed_short) -> vector_signed_short } + impl_vec_trait!{ [$Trait $m] $uw (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_int } + impl_vec_trait!{ [$Trait $m] $sw (vector_signed_int, vector_signed_int) -> vector_signed_int } + impl_vec_trait!{ [$Trait $m] $ug (vector_unsigned_long_long, vector_unsigned_long_long) -> vector_unsigned_long_long } + impl_vec_trait!{ [$Trait $m] $sg (vector_signed_long_long, vector_signed_long_long) -> vector_signed_long_long } + }; + ([$Trait:ident $m:ident] 2 ($fn:ident)) => { + impl_vec_trait!{ [$Trait $m] ($fn, $fn, $fn, $fn, $fn, $fn, $fn, $fn) } + }; + ([$Trait:ident $m:ident]+ 2b ($b:ident, $h:ident, $w:ident, $g:ident)) => { + impl_vec_trait!{ [$Trait $m]+ $b (vector_bool_char, vector_bool_char) -> vector_bool_char } + impl_vec_trait!{ [$Trait $m]+ $b (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char } + impl_vec_trait!{ [$Trait $m]+ $b (vector_signed_char, vector_signed_char) -> vector_signed_char } + impl_vec_trait!{ [$Trait $m]+ $h (vector_bool_short, vector_bool_short) -> vector_bool_short } + impl_vec_trait!{ [$Trait $m]+ $h (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_short } + impl_vec_trait!{ [$Trait $m]+ $h (vector_signed_short, vector_signed_short) -> vector_signed_short } + impl_vec_trait!{ [$Trait $m]+ $w (vector_bool_int, vector_bool_int) -> vector_bool_int } + impl_vec_trait!{ [$Trait $m]+ $w (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_int } + impl_vec_trait!{ [$Trait $m]+ $w (vector_signed_int, vector_signed_int) -> vector_signed_int } + impl_vec_trait!{ [$Trait $m]+ $g (vector_unsigned_long_long, vector_unsigned_long_long) -> vector_unsigned_long_long } + impl_vec_trait!{ [$Trait $m]+ $g (vector_signed_long_long, vector_signed_long_long) -> vector_signed_long_long } + }; + ([$Trait:ident $m:ident]+ 2b ($fn:ident)) => { + impl_vec_trait!{ [$Trait $m]+ 2b ($fn, $fn, $fn, $fn) } + }; + ([$Trait:ident $m:ident]+ 2c ($b:ident, $h:ident, $w:ident, $g:ident, $s:ident, $d:ident)) => { + impl_vec_trait!{ [$Trait $m]+ $b (vector_bool_char, vector_bool_char) -> vector_bool_char } + impl_vec_trait!{ [$Trait $m]+ $b (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char } + impl_vec_trait!{ [$Trait $m]+ $b (vector_signed_char, vector_signed_char) -> vector_signed_char } + impl_vec_trait!{ [$Trait $m]+ $h (vector_bool_short, vector_bool_short) -> vector_bool_short } + impl_vec_trait!{ [$Trait $m]+ $h (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_short } + 
impl_vec_trait!{ [$Trait $m]+ $h (vector_signed_short, vector_signed_short) -> vector_signed_short } + impl_vec_trait!{ [$Trait $m]+ $w (vector_bool_int, vector_bool_int) -> vector_bool_int } + impl_vec_trait!{ [$Trait $m]+ $w (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_int } + impl_vec_trait!{ [$Trait $m]+ $w (vector_signed_int, vector_signed_int) -> vector_signed_int } + impl_vec_trait!{ [$Trait $m]+ $g (vector_unsigned_long_long, vector_unsigned_long_long) -> vector_unsigned_long_long } + impl_vec_trait!{ [$Trait $m]+ $g (vector_signed_long_long, vector_signed_long_long) -> vector_signed_long_long } + impl_vec_trait!{ [$Trait $m]+ $s (vector_float, vector_float) -> vector_float } + impl_vec_trait!{ [$Trait $m]+ $d (vector_double, vector_double) -> vector_double } + }; + ([$Trait:ident $m:ident]+ 2c ($fn:ident)) => { + impl_vec_trait!{ [$Trait $m]+ 2c ($fn, $fn, $fn, $fn, $fn, $fn) } + }; +} + +macro_rules! s_t_l { + (i64x2) => { + vector_signed_long_long + }; + (i32x4) => { + vector_signed_int + }; + (i16x8) => { + vector_signed_short + }; + (i8x16) => { + vector_signed_char + }; + + (u64x2) => { + vector_unsigned_long_long + }; + (u32x4) => { + vector_unsigned_int + }; + (u16x8) => { + vector_unsigned_short + }; + (u8x16) => { + vector_unsigned_char + }; + + (f32x4) => { + vector_float + }; + (f64x2) => { + vector_double + }; +} + +macro_rules! l_t_t { + (vector_signed_long_long) => { + i64 + }; + (vector_signed_int) => { + i32 + }; + (vector_signed_short) => { + i16 + }; + (vector_signed_char) => { + i8 + }; + + (vector_unsigned_long_long ) => { + u64 + }; + (vector_unsigned_int ) => { + u32 + }; + (vector_unsigned_short ) => { + u16 + }; + (vector_unsigned_char ) => { + u8 + }; + + (vector_bool_long_long ) => { + u64 + }; + (vector_bool_int ) => { + u32 + }; + (vector_bool_short ) => { + u16 + }; + (vector_bool_char ) => { + u8 + }; + + (vector_float) => { + f32 + }; + (vector_double) => { + f64 + }; +} + +macro_rules! t_t_l { + (i64) => { + vector_signed_long_long + }; + (i32) => { + vector_signed_int + }; + (i16) => { + vector_signed_short + }; + (i8) => { + vector_signed_char + }; + + (u64) => { + vector_unsigned_long_long + }; + (u32) => { + vector_unsigned_int + }; + (u16) => { + vector_unsigned_short + }; + (u8) => { + vector_unsigned_char + }; + + (f32) => { + vector_float + }; + (f64) => { + vector_double + }; +} + +macro_rules! t_t_s { + (i64) => { + i64x2 + }; + (i32) => { + i32x4 + }; + (i16) => { + i16x8 + }; + (i8) => { + i8x16 + }; + + (u64) => { + u64x2 + }; + (u32) => { + u32x4 + }; + (u16) => { + u16x8 + }; + (u8) => { + u8x16 + }; + + (f32) => { + f32x4 + }; + (f64) => { + f64x2 + }; +} + +macro_rules! t_u { + (vector_bool_char) => { + vector_unsigned_char + }; + (vector_bool_short) => { + vector_unsigned_short + }; + (vector_bool_int) => { + vector_unsigned_int + }; + (vector_bool_long_long) => { + vector_unsigned_long_long + }; + (vector_unsigned_char) => { + vector_unsigned_char + }; + (vector_unsigned_short) => { + vector_unsigned_short + }; + (vector_unsigned_int) => { + vector_unsigned_int + }; + (vector_unsigned_long_long) => { + vector_unsigned_long_long + }; + (vector_signed_char) => { + vector_unsigned_char + }; + (vector_signed_short) => { + vector_unsigned_short + }; + (vector_signed_int) => { + vector_unsigned_int + }; + (vector_signed_long_long) => { + vector_unsigned_long_long + }; + (vector_float) => { + vector_unsigned_int + }; + (vector_double) => { + vector_unsigned_long_long + }; +} + +macro_rules! 
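The `impl_vec_trait!` arms above all generate the same shape of code: a trait with an associated `Result` type whose method forwards to a free function, with `transmute` used to reuse one implementation across same-layout vector types. A minimal, self-contained sketch of that pattern, using a hypothetical plain-array type instead of the real vector types:

```rust
// Hand-expanded sketch of what one `impl_vec_trait!` arm produces (illustrative names).
trait VecAnd<Other> {
    type Result;
    fn vec_and(self, b: Other) -> Self::Result;
}

#[derive(Clone, Copy, Debug, PartialEq)]
struct I32x4([i32; 4]);

// the free function the generated impl forwards to
fn and_i32x4(a: I32x4, b: I32x4) -> I32x4 {
    I32x4(core::array::from_fn(|i| a.0[i] & b.0[i]))
}

impl VecAnd<I32x4> for I32x4 {
    type Result = I32x4;
    fn vec_and(self, b: I32x4) -> Self::Result {
        and_i32x4(self, b)
    }
}

fn main() {
    let a = I32x4([0b1100; 4]);
    let b = I32x4([0b1010; 4]);
    assert_eq!(a.vec_and(b), I32x4([0b1000; 4]));
}
```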
t_b { + (vector_bool_char) => { + vector_bool_char + }; + (vector_bool_short) => { + vector_bool_short + }; + (vector_bool_int) => { + vector_bool_int + }; + (vector_bool_long_long) => { + vector_bool_long_long + }; + (vector_signed_char) => { + vector_bool_char + }; + (vector_signed_short) => { + vector_bool_short + }; + (vector_signed_int) => { + vector_bool_int + }; + (vector_signed_long_long) => { + vector_bool_long_long + }; + (vector_unsigned_char) => { + vector_bool_char + }; + (vector_unsigned_short) => { + vector_bool_short + }; + (vector_unsigned_int) => { + vector_bool_int + }; + (vector_unsigned_long_long) => { + vector_bool_long_long + }; + (vector_float) => { + vector_bool_int + }; + (vector_double) => { + vector_bool_long_long + }; +} + +macro_rules! impl_from { + ($s: ident) => { + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl From<$s> for s_t_l!($s) { + fn from (v: $s) -> Self { + unsafe { + transmute(v) + } + } + } + }; + ($($s: ident),*) => { + $( + impl_from! { $s } + )* + }; +} + +macro_rules! impl_neg { + ($s: ident : $zero: expr) => { + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl crate::ops::Neg for s_t_l!($s) { + type Output = s_t_l!($s); + fn neg(self) -> Self::Output { + unsafe { simd_neg(self) } + } + } + }; +} + +pub(crate) use impl_from; +pub(crate) use impl_neg; +pub(crate) use impl_vec_trait; +pub(crate) use l_t_t; +pub(crate) use s_t_l; +pub(crate) use t_b; +pub(crate) use t_t_l; +pub(crate) use t_t_s; +pub(crate) use t_u; +pub(crate) use test_impl; diff --git a/library/stdarch/crates/core_arch/src/s390x/mod.rs b/library/stdarch/crates/core_arch/src/s390x/mod.rs new file mode 100644 index 0000000000000..7d3b3f2d99aae --- /dev/null +++ b/library/stdarch/crates/core_arch/src/s390x/mod.rs @@ -0,0 +1,7 @@ +//! `s390x` intrinsics + +pub(crate) mod macros; + +mod vector; +#[unstable(feature = "stdarch_s390x", issue = "130869")] +pub use self::vector::*; diff --git a/library/stdarch/crates/core_arch/src/s390x/vector.rs b/library/stdarch/crates/core_arch/src/s390x/vector.rs new file mode 100644 index 0000000000000..ae5c37ce0178b --- /dev/null +++ b/library/stdarch/crates/core_arch/src/s390x/vector.rs @@ -0,0 +1,7382 @@ +//! s390x vector intrinsics. +//! +//! For more info see the [Reference Summary] or the online [IBM docs]. +//! +//! [Reference Summary]: https://www.ibm.com/support/pages/sites/default/files/2021-05/SA22-7871-10.pdf +//! [IBM docs]: https://www.ibm.com/docs/en/zos/2.4.0?topic=support-vector-built-in-functions + +#![allow(non_camel_case_types)] + +use crate::{core_arch::simd::*, intrinsics::simd::*, mem::MaybeUninit, mem::transmute}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +use super::macros::*; + +types! 
{ + #![unstable(feature = "stdarch_s390x", issue = "135681")] + + /// s390x-specific 128-bit wide vector of sixteen packed `i8` + pub struct vector_signed_char(16 x i8); + /// s390x-specific 128-bit wide vector of sixteen packed `u8` + pub struct vector_unsigned_char(16 x u8); + /// s390x-specific 128-bit wide vector mask of sixteen packed elements + pub struct vector_bool_char(16 x i8); + + /// s390x-specific 128-bit wide vector of eight packed `i16` + pub struct vector_signed_short(8 x i16); + /// s390x-specific 128-bit wide vector of eight packed `u16` + pub struct vector_unsigned_short(8 x u16); + /// s390x-specific 128-bit wide vector mask of eight packed elements + pub struct vector_bool_short(8 x i16); + + /// s390x-specific 128-bit wide vector of four packed `i32` + pub struct vector_signed_int(4 x i32); + /// s390x-specific 128-bit wide vector of four packed `u32` + pub struct vector_unsigned_int(4 x u32); + /// s390x-specific 128-bit wide vector mask of four packed elements + pub struct vector_bool_int(4 x i32); + + /// s390x-specific 128-bit wide vector of two packed `i64` + pub struct vector_signed_long_long(2 x i64); + /// s390x-specific 128-bit wide vector of two packed `u64` + pub struct vector_unsigned_long_long(2 x u64); + /// s390x-specific 128-bit wide vector mask of two packed elements + pub struct vector_bool_long_long(2 x i64); + + /// s390x-specific 128-bit wide vector of four packed `f32` + pub struct vector_float(4 x f32); + /// s390x-specific 128-bit wide vector of two packed `f64` + pub struct vector_double(2 x f64); +} + +#[repr(packed)] +struct PackedTuple { + x: T, + y: U, +} + +#[allow(improper_ctypes)] +#[rustfmt::skip] +unsafe extern "unadjusted" { + #[link_name = "llvm.smax.v16i8"] fn vmxb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char; + #[link_name = "llvm.smax.v8i16"] fn vmxh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short; + #[link_name = "llvm.smax.v4i32"] fn vmxf(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int; + #[link_name = "llvm.smax.v2i64"] fn vmxg(a: vector_signed_long_long, b: vector_signed_long_long) -> vector_signed_long_long; + + #[link_name = "llvm.umax.v16i8"] fn vmxlb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char; + #[link_name = "llvm.umax.v8i16"] fn vmxlh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.umax.v4i32"] fn vmxlf(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int; + #[link_name = "llvm.umax.v2i64"] fn vmxlg(a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long; + + #[link_name = "llvm.smin.v16i8"] fn vmnb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char; + #[link_name = "llvm.smin.v8i16"] fn vmnh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short; + #[link_name = "llvm.smin.v4i32"] fn vmnf(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int; + #[link_name = "llvm.smin.v2i64"] fn vmng(a: vector_signed_long_long, b: vector_signed_long_long) -> vector_signed_long_long; + + #[link_name = "llvm.umin.v16i8"] fn vmnlb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char; + #[link_name = "llvm.umin.v8i16"] fn vmnlh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.umin.v4i32"] fn vmnlf(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int; + #[link_name = "llvm.umin.v2i64"] fn vmnlg(a: 
vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long; + + #[link_name = "llvm.nearbyint.v4f32"] fn nearbyint_v4f32(a: vector_float) -> vector_float; + #[link_name = "llvm.nearbyint.v2f64"] fn nearbyint_v2f64(a: vector_double) -> vector_double; + + #[link_name = "llvm.rint.v4f32"] fn rint_v4f32(a: vector_float) -> vector_float; + #[link_name = "llvm.rint.v2f64"] fn rint_v2f64(a: vector_double) -> vector_double; + + #[link_name = "llvm.roundeven.v4f32"] fn roundeven_v4f32(a: vector_float) -> vector_float; + #[link_name = "llvm.roundeven.v2f64"] fn roundeven_v2f64(a: vector_double) -> vector_double; + + #[link_name = "llvm.s390.vsra"] fn vsra(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char; + #[link_name = "llvm.s390.vsrl"] fn vsrl(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char; + #[link_name = "llvm.s390.vsl"] fn vsl(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char; + + #[link_name = "llvm.s390.vsrab"] fn vsrab(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char; + #[link_name = "llvm.s390.vsrlb"] fn vsrlb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char; + #[link_name = "llvm.s390.vslb"] fn vslb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char; + + #[link_name = "llvm.s390.vsldb"] fn vsldb(a: i8x16, b: i8x16, c: u32) -> i8x16; + #[link_name = "llvm.s390.vsld"] fn vsld(a: i8x16, b: i8x16, c: u32) -> i8x16; + #[link_name = "llvm.s390.vsrd"] fn vsrd(a: i8x16, b: i8x16, c: u32) -> i8x16; + + #[link_name = "llvm.fshl.v16i8"] fn fshlb(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_char) -> vector_unsigned_char; + #[link_name = "llvm.fshl.v8i16"] fn fshlh(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.fshl.v4i32"] fn fshlf(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_int) -> vector_unsigned_int; + #[link_name = "llvm.fshl.v2i64"] fn fshlg(a: vector_unsigned_long_long, b: vector_unsigned_long_long, c: vector_unsigned_long_long) -> vector_unsigned_long_long; + + #[link_name = "llvm.s390.verimb"] fn verimb(a: vector_signed_char, b: vector_signed_char, c: vector_signed_char, d: i32) -> vector_signed_char; + #[link_name = "llvm.s390.verimh"] fn verimh(a: vector_signed_short, b: vector_signed_short, c: vector_signed_short, d: i32) -> vector_signed_short; + #[link_name = "llvm.s390.verimf"] fn verimf(a: vector_signed_int, b: vector_signed_int, c: vector_signed_int, d: i32) -> vector_signed_int; + #[link_name = "llvm.s390.verimg"] fn verimg(a: vector_signed_long_long, b: vector_signed_long_long, c: vector_signed_long_long, d: i32) -> vector_signed_long_long; + + #[link_name = "llvm.s390.vperm"] fn vperm(a: vector_signed_char, b: vector_signed_char, c: vector_unsigned_char) -> vector_signed_char; + + #[link_name = "llvm.s390.vsumb"] fn vsumb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_int; + #[link_name = "llvm.s390.vsumh"] fn vsumh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_int; + + #[link_name = "llvm.s390.vsumgh"] fn vsumgh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_long_long; + #[link_name = "llvm.s390.vsumgf"] fn vsumgf(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_long_long; + + #[link_name = "llvm.s390.vsumqf"] fn vsumqf(a: vector_unsigned_int, b: vector_unsigned_int) -> u128; + #[link_name = "llvm.s390.vsumqg"] fn vsumqg(a: 
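The `llvm.fshl.*` declarations above are LLVM's funnel shifts, the usual way element rotates are expressed. A scalar sketch of the 32-bit case (helper name ours), showing that feeding the same value to both inputs yields a rotate:

```rust
// Scalar model of LLVM fshl: concatenate a:b, shift left by c mod 32, keep the high half.
fn fshl32(a: u32, b: u32, c: u32) -> u32 {
    let c = c % 32;
    if c == 0 { a } else { (a << c) | (b >> (32 - c)) }
}

fn main() {
    for c in 0..64u32 {
        // with both inputs equal, a funnel shift degenerates to a rotate
        assert_eq!(fshl32(0x1234_5678, 0x1234_5678, c), 0x1234_5678u32.rotate_left(c));
    }
}
```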
vector_unsigned_long_long, b: vector_unsigned_long_long) -> u128; + + #[link_name = "llvm.s390.vscbiq"] fn vscbiq(a: u128, b: u128) -> u128; + #[link_name = "llvm.s390.vsbiq"] fn vsbiq(a: u128, b: u128, c: u128) -> u128; + #[link_name = "llvm.s390.vsbcbiq"] fn vsbcbiq(a: u128, b: u128, c: u128) -> u128; + + #[link_name = "llvm.s390.vacq"] fn vacq(a: u128, b: u128, c: u128) -> u128; + + #[link_name = "llvm.s390.vscbib"] fn vscbib(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char; + #[link_name = "llvm.s390.vscbih"] fn vscbih(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.s390.vscbif"] fn vscbif(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int; + #[link_name = "llvm.s390.vscbig"] fn vscbig(a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long; + + #[link_name = "llvm.s390.vfaeb"] fn vfaeb(a: vector_signed_char, b: vector_signed_char, c: i32) -> vector_signed_char; + #[link_name = "llvm.s390.vfaeh"] fn vfaeh(a: vector_signed_short, b: vector_signed_short, c: i32) -> vector_signed_short; + #[link_name = "llvm.s390.vfaef"] fn vfaef(a: vector_signed_int, b: vector_signed_int, c: i32) -> vector_signed_int; + + #[link_name = "llvm.s390.vfaezb"] fn vfaezb(a: vector_signed_char, b: vector_signed_char, c: i32) -> vector_signed_char; + #[link_name = "llvm.s390.vfaezh"] fn vfaezh(a: vector_signed_short, b: vector_signed_short, c: i32) -> vector_signed_short; + #[link_name = "llvm.s390.vfaezf"] fn vfaezf(a: vector_signed_int, b: vector_signed_int, c: i32) -> vector_signed_int; + + #[link_name = "llvm.s390.vfaebs"] fn vfaebs(a: vector_signed_char, b: vector_signed_char, c: i32) -> PackedTuple; + #[link_name = "llvm.s390.vfaehs"] fn vfaehs(a: vector_signed_short, b: vector_signed_short, c: i32) -> PackedTuple; + #[link_name = "llvm.s390.vfaefs"] fn vfaefs(a: vector_signed_int, b: vector_signed_int, c: i32) -> PackedTuple; + + #[link_name = "llvm.s390.vfaezbs"] fn vfaezbs(a: vector_signed_char, b: vector_signed_char, c: i32) -> PackedTuple; + #[link_name = "llvm.s390.vfaezhs"] fn vfaezhs(a: vector_signed_short, b: vector_signed_short, c: i32) -> PackedTuple; + #[link_name = "llvm.s390.vfaezfs"] fn vfaezfs(a: vector_signed_int, b: vector_signed_int, c: i32) -> PackedTuple; + + #[link_name = "llvm.s390.vll"] fn vll(a: u32, b: *const u8) -> vector_signed_char; + #[link_name = "llvm.s390.vstl"] fn vstl(a: vector_signed_char, b: u32, c: *mut u8); + + #[link_name = "llvm.s390.vlrl"] fn vlrl(a: u32, b: *const u8) -> vector_unsigned_char; + #[link_name = "llvm.s390.vstrl"] fn vstrl(a: vector_unsigned_char, b: u32, c: *mut u8); + + #[link_name = "llvm.s390.lcbb"] fn lcbb(a: *const u8, b: u32) -> u32; + #[link_name = "llvm.s390.vlbb"] fn vlbb(a: *const u8, b: u32) -> MaybeUninit; + + #[link_name = "llvm.s390.vpksh"] fn vpksh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_char; + #[link_name = "llvm.s390.vpksf"] fn vpksf(a: vector_signed_int, b: vector_signed_int) -> vector_signed_short; + #[link_name = "llvm.s390.vpksg"] fn vpksg(a: vector_signed_long_long, b: vector_signed_long_long) -> vector_signed_int; + + #[link_name = "llvm.s390.vpklsh"] fn vpklsh(a: vector_signed_short, b: vector_signed_short) -> vector_unsigned_char; + #[link_name = "llvm.s390.vpklsf"] fn vpklsf(a: vector_signed_int, b: vector_signed_int) -> vector_unsigned_short; + #[link_name = "llvm.s390.vpklsg"] fn vpklsg(a: vector_signed_long_long, b: vector_signed_long_long) -> vector_unsigned_int; + + 
#[link_name = "llvm.s390.vpkshs"] fn vpkshs(a: vector_signed_short, b: vector_signed_short) -> PackedTuple; + #[link_name = "llvm.s390.vpksfs"] fn vpksfs(a: vector_signed_int, b: vector_signed_int) -> PackedTuple; + #[link_name = "llvm.s390.vpksgs"] fn vpksgs(a: vector_signed_long_long, b: vector_signed_long_long) -> PackedTuple; + + #[link_name = "llvm.s390.vpklshs"] fn vpklshs(a: vector_unsigned_short, b: vector_unsigned_short) -> PackedTuple; + #[link_name = "llvm.s390.vpklsfs"] fn vpklsfs(a: vector_unsigned_int, b: vector_unsigned_int) -> PackedTuple; + #[link_name = "llvm.s390.vpklsgs"] fn vpklsgs(a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> PackedTuple; + + #[link_name = "llvm.s390.vuplb"] fn vuplb (a: vector_signed_char) -> vector_signed_short; + #[link_name = "llvm.s390.vuplhw"] fn vuplhw (a: vector_signed_short) -> vector_signed_int; + #[link_name = "llvm.s390.vuplf"] fn vuplf (a: vector_signed_int) -> vector_signed_long_long; + #[link_name = "llvm.s390.vupllb"] fn vupllb (a: vector_unsigned_char) -> vector_unsigned_short; + #[link_name = "llvm.s390.vupllh"] fn vupllh (a: vector_unsigned_short) -> vector_unsigned_int; + #[link_name = "llvm.s390.vupllf"] fn vupllf (a: vector_unsigned_int) -> vector_unsigned_long_long; + + #[link_name = "llvm.s390.vavgb"] fn vavgb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char; + #[link_name = "llvm.s390.vavgh"] fn vavgh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short; + #[link_name = "llvm.s390.vavgf"] fn vavgf(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int; + #[link_name = "llvm.s390.vavgg"] fn vavgg(a: vector_signed_long_long, b: vector_signed_long_long) -> vector_signed_long_long; + + #[link_name = "llvm.s390.vavglb"] fn vavglb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char; + #[link_name = "llvm.s390.vavglh"] fn vavglh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.s390.vavglf"] fn vavglf(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int; + #[link_name = "llvm.s390.vavglg"] fn vavglg(a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long; + + #[link_name = "llvm.s390.vcksm"] fn vcksm(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int; + + #[link_name = "llvm.s390.vmeb"] fn vmeb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short; + #[link_name = "llvm.s390.vmeh"] fn vmeh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int; + #[link_name = "llvm.s390.vmef"] fn vmef(a: vector_signed_int, b: vector_signed_int) -> vector_signed_long_long; + + #[link_name = "llvm.s390.vmleb"] fn vmleb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_short; + #[link_name = "llvm.s390.vmleh"] fn vmleh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_int; + #[link_name = "llvm.s390.vmlef"] fn vmlef(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_long_long; + + #[link_name = "llvm.s390.vmob"] fn vmob(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short; + #[link_name = "llvm.s390.vmoh"] fn vmoh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int; + #[link_name = "llvm.s390.vmof"] fn vmof(a: vector_signed_int, b: vector_signed_int) -> vector_signed_long_long; + + #[link_name = "llvm.s390.vmlob"] fn vmlob(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_short; + #[link_name = "llvm.s390.vmloh"] 
fn vmloh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_int; + #[link_name = "llvm.s390.vmlof"] fn vmlof(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_long_long; + + #[link_name = "llvm.s390.vmhb"] fn vmhb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char; + #[link_name = "llvm.s390.vmhh"] fn vmhh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short; + #[link_name = "llvm.s390.vmhf"] fn vmhf(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int; + + #[link_name = "llvm.s390.vmlhb"] fn vmlhb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char; + #[link_name = "llvm.s390.vmlhh"] fn vmlhh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.s390.vmlhf"] fn vmlhf(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int; + + #[link_name = "llvm.s390.vmaeb"] fn vmaeb(a: vector_signed_char, b: vector_signed_char, c: vector_signed_short) -> vector_signed_short; + #[link_name = "llvm.s390.vmaeh"] fn vmaeh(a: vector_signed_short, b: vector_signed_short, c: vector_signed_int) -> vector_signed_int; + #[link_name = "llvm.s390.vmaef"] fn vmaef(a: vector_signed_int, b: vector_signed_int, c: vector_signed_long_long) -> vector_signed_long_long; + + #[link_name = "llvm.s390.vmaleb"] fn vmaleb(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.s390.vmaleh"] fn vmaleh(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_int) -> vector_unsigned_int; + #[link_name = "llvm.s390.vmalef"] fn vmalef(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_long_long) -> vector_unsigned_long_long; + + #[link_name = "llvm.s390.vmaob"] fn vmaob(a: vector_signed_char, b: vector_signed_char, c: vector_signed_short) -> vector_signed_short; + #[link_name = "llvm.s390.vmaoh"] fn vmaoh(a: vector_signed_short, b: vector_signed_short, c: vector_signed_int) -> vector_signed_int; + #[link_name = "llvm.s390.vmaof"] fn vmaof(a: vector_signed_int, b: vector_signed_int, c: vector_signed_long_long) -> vector_signed_long_long; + + #[link_name = "llvm.s390.vmalob"] fn vmalob(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.s390.vmaloh"] fn vmaloh(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_int) -> vector_unsigned_int; + #[link_name = "llvm.s390.vmalof"] fn vmalof(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_long_long) -> vector_unsigned_long_long; + + #[link_name = "llvm.s390.vmahb"] fn vmahb(a: vector_signed_char, b: vector_signed_char, c: vector_signed_char) -> vector_signed_char; + #[link_name = "llvm.s390.vmahh"] fn vmahh(a: vector_signed_short, b: vector_signed_short, c: vector_signed_short) -> vector_signed_short; + #[link_name = "llvm.s390.vmahf"] fn vmahf(a: vector_signed_int, b: vector_signed_int, c: vector_signed_int) -> vector_signed_int; + + #[link_name = "llvm.s390.vmalhb"] fn vmalhb(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_char) -> vector_unsigned_char; + #[link_name = "llvm.s390.vmalhh"] fn vmalhh(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.s390.vmalhf"] fn vmalhf(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_int) -> vector_unsigned_int; + + #[link_name = "llvm.s390.vmalb"] fn vmalb(a: 
vector_signed_char, b: vector_signed_char, c: vector_signed_char) -> vector_signed_char; + #[link_name = "llvm.s390.vmalh"] fn vmalh(a: vector_signed_short, b: vector_signed_short, c: vector_signed_short) -> vector_signed_short; + #[link_name = "llvm.s390.vmalf"] fn vmalf(a: vector_signed_int, b: vector_signed_int, c: vector_signed_int) -> vector_signed_int; + + #[link_name = "llvm.s390.vmallb"] fn vmallb(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_char) -> vector_unsigned_char; + #[link_name = "llvm.s390.vmallh"] fn vmallh(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.s390.vmallf"] fn vmallf(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_int) -> vector_unsigned_int; + + #[link_name = "llvm.s390.vgfmb"] fn vgfmb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_short; + #[link_name = "llvm.s390.vgfmh"] fn vgfmh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_int; + #[link_name = "llvm.s390.vgfmf"] fn vgfmf(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_long_long; + #[link_name = "llvm.s390.vgfmg"] fn vgfmg(a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> u128; + + #[link_name = "llvm.s390.vgfmab"] fn vgfmab(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.s390.vgfmah"] fn vgfmah(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_int) -> vector_unsigned_int; + #[link_name = "llvm.s390.vgfmaf"] fn vgfmaf(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_long_long) -> vector_unsigned_long_long; + #[link_name = "llvm.s390.vgfmag"] fn vgfmag(a: vector_unsigned_long_long, b: vector_unsigned_long_long, c: u128) -> u128; + + #[link_name = "llvm.s390.vbperm"] fn vbperm(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_long_long; + + #[link_name = "llvm.s390.vftcisb"] fn vftcisb(a: vector_float, b: u32) -> PackedTuple; + #[link_name = "llvm.s390.vftcidb"] fn vftcidb(a: vector_double, b: u32) -> PackedTuple; + + #[link_name = "llvm.s390.vtm"] fn vtm(a: i8x16, b: i8x16) -> i32; + + #[link_name = "llvm.s390.vstrsb"] fn vstrsb(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_char) -> PackedTuple; + #[link_name = "llvm.s390.vstrsh"] fn vstrsh(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_char) -> PackedTuple; + #[link_name = "llvm.s390.vstrsf"] fn vstrsf(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_char) -> PackedTuple; + + #[link_name = "llvm.s390.vstrszb"] fn vstrszb(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_char) -> PackedTuple; + #[link_name = "llvm.s390.vstrszh"] fn vstrszh(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_char) -> PackedTuple; + #[link_name = "llvm.s390.vstrszf"] fn vstrszf(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_char) -> PackedTuple; + + #[link_name = "llvm.s390.vistrb"] fn vistrb(a: vector_unsigned_char) -> vector_unsigned_char; + #[link_name = "llvm.s390.vistrh"] fn vistrh(a: vector_unsigned_short) -> vector_unsigned_short; + #[link_name = "llvm.s390.vistrf"] fn vistrf(a: vector_unsigned_int) -> vector_unsigned_int; + + #[link_name = "llvm.s390.vistrbs"] fn vistrbs(a: vector_unsigned_char) -> PackedTuple; + #[link_name = "llvm.s390.vistrhs"] fn vistrhs(a: vector_unsigned_short) -> PackedTuple; + #[link_name = 
"llvm.s390.vistrfs"] fn vistrfs(a: vector_unsigned_int) -> PackedTuple; + + #[link_name = "llvm.s390.vmslg"] fn vmslg(a: vector_unsigned_long_long, b: vector_unsigned_long_long, c: u128, d: u32) -> u128; + + #[link_name = "llvm.s390.vstrcb"] fn vstrcb(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_char, d: u32) -> vector_bool_char; + #[link_name = "llvm.s390.vstrch"] fn vstrch(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_short, d: u32) -> vector_bool_short; + #[link_name = "llvm.s390.vstrcf"] fn vstrcf(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_int, d: u32) -> vector_bool_int; + + #[link_name = "llvm.s390.vstrcbs"] fn vstrcbs(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_char, d: u32) -> PackedTuple; + #[link_name = "llvm.s390.vstrchs"] fn vstrchs(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_short, d: u32) -> PackedTuple; + #[link_name = "llvm.s390.vstrcfs"] fn vstrcfs(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_int, d: u32) -> PackedTuple; + + #[link_name = "llvm.s390.vstrczb"] fn vstrczb(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_char, d: u32) -> vector_bool_char; + #[link_name = "llvm.s390.vstrczh"] fn vstrczh(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_short, d: u32) -> vector_bool_short; + #[link_name = "llvm.s390.vstrczf"] fn vstrczf(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_int, d: u32) -> vector_bool_int; + + #[link_name = "llvm.s390.vstrczbs"] fn vstrczbs(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_char, d: u32) -> PackedTuple; + #[link_name = "llvm.s390.vstrczhs"] fn vstrczhs(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_short, d: u32) -> PackedTuple; + #[link_name = "llvm.s390.vstrczfs"] fn vstrczfs(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_int, d: u32) -> PackedTuple; + + #[link_name = "llvm.s390.vfeeb"] fn vfeeb(a: i8x16, b: i8x16) -> i8x16; + #[link_name = "llvm.s390.vfeeh"] fn vfeeh(a: i16x8, b: i16x8) -> i16x8; + #[link_name = "llvm.s390.vfeef"] fn vfeef(a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.s390.vfeezb"] fn vfeezb(a: i8x16, b: i8x16) -> i8x16; + #[link_name = "llvm.s390.vfeezh"] fn vfeezh(a: i16x8, b: i16x8) -> i16x8; + #[link_name = "llvm.s390.vfeezf"] fn vfeezf(a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.s390.vfeebs"] fn vfeebs(a: i8x16, b: i8x16) -> PackedTuple; + #[link_name = "llvm.s390.vfeehs"] fn vfeehs(a: i16x8, b: i16x8) -> PackedTuple; + #[link_name = "llvm.s390.vfeefs"] fn vfeefs(a: i32x4, b: i32x4) -> PackedTuple; + + #[link_name = "llvm.s390.vfeezbs"] fn vfeezbs(a: i8x16, b: i8x16) -> PackedTuple; + #[link_name = "llvm.s390.vfeezhs"] fn vfeezhs(a: i16x8, b: i16x8) -> PackedTuple; + #[link_name = "llvm.s390.vfeezfs"] fn vfeezfs(a: i32x4, b: i32x4) -> PackedTuple; + + #[link_name = "llvm.s390.vfeneb"] fn vfeneb(a: i8x16, b: i8x16) -> i8x16; + #[link_name = "llvm.s390.vfeneh"] fn vfeneh(a: i16x8, b: i16x8) -> i16x8; + #[link_name = "llvm.s390.vfenef"] fn vfenef(a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.s390.vfenezb"] fn vfenezb(a: i8x16, b: i8x16) -> i8x16; + #[link_name = "llvm.s390.vfenezh"] fn vfenezh(a: i16x8, b: i16x8) -> i16x8; + #[link_name = "llvm.s390.vfenezf"] fn vfenezf(a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.s390.vfenebs"] fn vfenebs(a: i8x16, b: i8x16) -> PackedTuple; + #[link_name = "llvm.s390.vfenehs"] fn 
vfenehs(a: i16x8, b: i16x8) -> PackedTuple; + #[link_name = "llvm.s390.vfenefs"] fn vfenefs(a: i32x4, b: i32x4) -> PackedTuple; + + #[link_name = "llvm.s390.vfenezbs"] fn vfenezbs(a: i8x16, b: i8x16) -> PackedTuple; + #[link_name = "llvm.s390.vfenezhs"] fn vfenezhs(a: i16x8, b: i16x8) -> PackedTuple; + #[link_name = "llvm.s390.vfenezfs"] fn vfenezfs(a: i32x4, b: i32x4) -> PackedTuple; +} + +impl_from! { i8x16, u8x16, i16x8, u16x8, i32x4, u32x4, i64x2, u64x2, f32x4, f64x2 } + +impl_neg! { i8x16 : 0 } +impl_neg! { i16x8 : 0 } +impl_neg! { i32x4 : 0 } +impl_neg! { i64x2 : 0 } +impl_neg! { f32x4 : 0f32 } +impl_neg! { f64x2 : 0f64 } + +#[repr(simd)] +struct ShuffleMask([u32; N]); + +impl ShuffleMask { + const fn reverse() -> Self { + let mut index = [0; N]; + let mut i = 0; + while i < N { + index[i] = (N - i - 1) as u32; + i += 1; + } + ShuffleMask(index) + } + + const fn merge_low() -> Self { + let mut mask = [0; N]; + let mut i = N / 2; + let mut index = 0; + while index < N { + mask[index] = i as u32; + mask[index + 1] = (i + N) as u32; + + i += 1; + index += 2; + } + ShuffleMask(mask) + } + + const fn merge_high() -> Self { + let mut mask = [0; N]; + let mut i = 0; + let mut index = 0; + while index < N { + mask[index] = i as u32; + mask[index + 1] = (i + N) as u32; + + i += 1; + index += 2; + } + ShuffleMask(mask) + } + + const fn pack() -> Self { + let mut mask = [0; N]; + let mut i = 1; + let mut index = 0; + while index < N { + mask[index] = i as u32; + + i += 2; + index += 1; + } + ShuffleMask(mask) + } + + const fn unpack_low() -> Self { + let mut mask = [0; N]; + let mut i = 0; + while i < N { + mask[i] = (N + i) as u32; + i += 1; + } + ShuffleMask(mask) + } + + const fn unpack_high() -> Self { + let mut mask = [0; N]; + let mut i = 0; + while i < N { + mask[i] = i as u32; + i += 1; + } + ShuffleMask(mask) + } +} + +const fn genmask() -> [u8; 16] { + let mut bits = MASK; + let mut elements = [0u8; 16]; + + let mut i = 0; + while i < 16 { + elements[i] = match bits & (1u16 << 15) { + 0 => 0, + _ => 0xFF, + }; + + bits <<= 1; + i += 1; + } + + elements +} + +const fn genmasks(bit_width: u32, a: u8, b: u8) -> u64 { + let bit_width = bit_width as u8; + let a = a % bit_width; + let mut b = b % bit_width; + if a > b { + b = bit_width - 1; + } + + // of course these indices start from the left + let a = (bit_width - 1) - a; + let b = (bit_width - 1) - b; + + ((1u64.wrapping_shl(a as u32 + 1)) - 1) & !((1u64.wrapping_shl(b as u32)) - 1) +} + +const fn validate_block_boundary(block_boundary: u16) -> u32 { + assert!( + block_boundary.is_power_of_two() && block_boundary >= 64 && block_boundary <= 4096, + "block boundary must be a constant power of 2 from 64 to 4096", + ); + + // so that 64 is encoded as 0, 128 as 1, ect. + block_boundary as u32 >> 7 +} + +enum FindImm { + Eq = 4, + Ne = 12, + EqIdx = 0, + NeIdx = 8, +} + +#[macro_use] +mod sealed { + use super::*; + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorAdd { + type Result; + unsafe fn vec_add(self, other: Other) -> Self::Result; + } + + macro_rules! 
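Standalone (non-const) mirrors of `genmask` and `genmasks` above, with a couple of worked values showing the MSB-first bit-to-byte mapping and the left-indexed bit ranges they produce; these copies are for illustration only:

```rust
fn genmask(mask: u16) -> [u8; 16] {
    // byte i of the result is 0xFF iff bit (15 - i) of the mask is set,
    // i.e. the mask is consumed most-significant bit first
    core::array::from_fn(|i| if mask & (1u16 << (15 - i)) != 0 { 0xFF } else { 0x00 })
}

fn genmasks(bit_width: u32, a: u8, b: u8) -> u64 {
    let bw = bit_width as u8;
    let (a, mut b) = (a % bw, b % bw);
    if a > b {
        b = bw - 1;
    }
    // bit positions count from the most significant bit of the element
    let (a, b) = ((bw - 1) - a, (bw - 1) - b);
    (1u64.wrapping_shl(a as u32 + 1) - 1) & !(1u64.wrapping_shl(b as u32) - 1)
}

fn main() {
    let m = genmask(0b1000_0000_0000_0011);
    assert_eq!(m[0], 0xFF); // bit 15
    assert_eq!(m[1], 0x00);
    assert_eq!(m[14], 0xFF); // bit 1
    assert_eq!(m[15], 0xFF); // bit 0
    // ones at bit positions 2..=5, counted from the left of an 8-bit element
    assert_eq!(genmasks(8, 2, 5), 0b0011_1100);
}
```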
impl_add { + ($name:ident, $a:ty, $instr:ident) => { + impl_add!($name, $a, $a, $a, $instr); + }; + ($name:ident, $a:ty, $b:ty, $c:ty, $instr:ident) => { + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($instr))] + pub unsafe fn $name(a: $a, b: $b) -> $c { + transmute(simd_add(transmute(a), b)) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorAdd<$b> for $a { + type Result = $c; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_add(self, other: $b) -> Self::Result { + $name(self, other) + } + } + }; + } + + #[rustfmt::skip] + mod impl_add { + use super::*; + + impl_add!(va_sc, vector_signed_char, vab); + impl_add!(va_uc, vector_unsigned_char, vab); + impl_add!(va_sh, vector_signed_short, vah); + impl_add!(va_uh, vector_unsigned_short, vah); + impl_add!(va_sf, vector_signed_int, vaf); + impl_add!(va_uf, vector_unsigned_int, vaf); + impl_add!(va_sg, vector_signed_long_long, vag); + impl_add!(va_ug, vector_unsigned_long_long, vag); + + impl_add!(va_sc_bc, vector_signed_char, vector_bool_char, vector_signed_char, vab); + impl_add!(va_uc_bc, vector_unsigned_char, vector_bool_char, vector_unsigned_char, vab); + impl_add!(va_sh_bh, vector_signed_short, vector_bool_short, vector_signed_short, vah); + impl_add!(va_uh_bh, vector_unsigned_short, vector_bool_short, vector_unsigned_short, vah); + impl_add!(va_sf_bf, vector_signed_int, vector_bool_int, vector_signed_int, vaf); + impl_add!(va_uf_bf, vector_unsigned_int, vector_bool_int, vector_unsigned_int, vaf); + impl_add!(va_sg_bg, vector_signed_long_long, vector_bool_long_long, vector_signed_long_long, vag); + impl_add!(va_ug_bg, vector_unsigned_long_long, vector_bool_long_long, vector_unsigned_long_long, vag); + + impl_add!(va_bc_sc, vector_bool_char, vector_signed_char, vector_signed_char, vab); + impl_add!(va_bc_uc, vector_bool_char, vector_unsigned_char, vector_unsigned_char, vab); + impl_add!(va_bh_sh, vector_bool_short, vector_signed_short, vector_signed_short, vah); + impl_add!(va_bh_uh, vector_bool_short, vector_unsigned_short, vector_unsigned_short, vah); + impl_add!(va_bf_sf, vector_bool_int, vector_signed_int, vector_signed_int, vaf); + impl_add!(va_bf_uf, vector_bool_int, vector_unsigned_int, vector_unsigned_int, vaf); + impl_add!(va_bg_sg, vector_bool_long_long, vector_signed_long_long, vector_signed_long_long, vag); + impl_add!(va_bg_ug, vector_bool_long_long, vector_unsigned_long_long, vector_unsigned_long_long, vag); + + impl_add!(va_double, vector_double, vfadb); + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(all(test, target_feature = "vector-enhancements-1"), assert_instr(vfasb))] + pub unsafe fn va_float(a: vector_float, b: vector_float) -> vector_float { + transmute(simd_add(a, b)) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorAdd for vector_float { + type Result = Self; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_add(self, other: Self) -> Self::Result { + va_float(self, other) + } + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSub { + type Result; + unsafe fn vec_sub(self, other: Other) -> Self::Result; + } + + macro_rules! 
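A scalar model of the lane behaviour the `impl_add!` expansions provide for the integer vector types: element-wise addition that truncates to the lane width, matching `simd_add`'s wrapping semantics on integers. The helper name is ours:

```rust
fn vec_add_i32x4(a: [i32; 4], b: [i32; 4]) -> [i32; 4] {
    // element-wise, wrapping on overflow
    core::array::from_fn(|i| a[i].wrapping_add(b[i]))
}

fn main() {
    assert_eq!(
        vec_add_i32x4([1, 2, 3, i32::MAX], [10, 20, 30, 1]),
        [11, 22, 33, i32::MIN]
    );
}
```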
impl_sub { + ($name:ident, $a:ty, $instr:ident) => { + impl_sub!($name, $a, $a, $a, $instr); + }; + ($name:ident, $a:ty, $b:ty, $c:ty, $instr:ident) => { + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($instr))] + pub unsafe fn $name(a: $a, b: $b) -> $c { + transmute(simd_sub(transmute(a), b)) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSub<$b> for $a { + type Result = $c; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_sub(self, other: $b) -> Self::Result { + $name(self, other) + } + } + }; + } + + #[rustfmt::skip] + mod impl_sub { + use super::*; + + impl_sub!(vs_sc, vector_signed_char, vsb); + impl_sub!(vs_uc, vector_unsigned_char, vsb); + impl_sub!(vs_sh, vector_signed_short, vsh); + impl_sub!(vs_uh, vector_unsigned_short, vsh); + impl_sub!(vs_sf, vector_signed_int, vsf); + impl_sub!(vs_uf, vector_unsigned_int, vsf); + impl_sub!(vs_sg, vector_signed_long_long, vsg); + impl_sub!(vs_ug, vector_unsigned_long_long, vsg); + + impl_sub!(vs_sc_bc, vector_signed_char, vector_bool_char, vector_signed_char, vsb); + impl_sub!(vs_uc_bc, vector_unsigned_char, vector_bool_char, vector_unsigned_char, vsb); + impl_sub!(vs_sh_bh, vector_signed_short, vector_bool_short, vector_signed_short, vsh); + impl_sub!(vs_uh_bh, vector_unsigned_short, vector_bool_short, vector_unsigned_short, vsh); + impl_sub!(vs_sf_bf, vector_signed_int, vector_bool_int, vector_signed_int, vsf); + impl_sub!(vs_uf_bf, vector_unsigned_int, vector_bool_int, vector_unsigned_int, vsf); + impl_sub!(vs_sg_bg, vector_signed_long_long, vector_bool_long_long, vector_signed_long_long, vsg); + impl_sub!(vs_ug_bg, vector_unsigned_long_long, vector_bool_long_long, vector_unsigned_long_long, vsg); + + impl_sub!(vs_bc_sc, vector_bool_char, vector_signed_char, vector_signed_char, vsb); + impl_sub!(vs_bc_uc, vector_bool_char, vector_unsigned_char, vector_unsigned_char, vsb); + impl_sub!(vs_bh_sh, vector_bool_short, vector_signed_short, vector_signed_short, vsh); + impl_sub!(vs_bh_uh, vector_bool_short, vector_unsigned_short, vector_unsigned_short, vsh); + impl_sub!(vs_bf_sf, vector_bool_int, vector_signed_int, vector_signed_int, vsf); + impl_sub!(vs_bf_uf, vector_bool_int, vector_unsigned_int, vector_unsigned_int, vsf); + impl_sub!(vs_bg_sg, vector_bool_long_long, vector_signed_long_long, vector_signed_long_long, vsg); + impl_sub!(vs_bg_ug, vector_bool_long_long, vector_unsigned_long_long, vector_unsigned_long_long, vsg); + + impl_sub!(vs_double, vector_double, vfsdb); + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(all(test, target_feature = "vector-enhancements-1"), assert_instr(vfssb))] + pub unsafe fn vs_float(a: vector_float, b: vector_float) -> vector_float { + transmute(simd_sub(a, b)) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSub for vector_float { + type Result = Self; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_sub(self, other: Self) -> Self::Result { + vs_float(self, other) + } + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorMul { + unsafe fn vec_mul(self, b: Self) -> Self; + } + + macro_rules! 
impl_mul { + ($name:ident, $a:ty, std_simd) => { + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorMul for $a { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_mul(self, other: Self) -> Self { + transmute(simd_mul(transmute(self), other)) + } + } + }; + ($name:ident, $a:ty, $instr:ident) => { + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($instr))] + pub unsafe fn $name(a: $a, b: $a) -> $a { + transmute(simd_mul(transmute(a), b)) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorMul for $a { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_mul(self, other: Self) -> Self { + $name(self, other) + } + } + }; + } + + #[rustfmt::skip] + mod impl_mul { + use super::*; + + impl_mul!(vml_sc, vector_signed_char, vmlb); + impl_mul!(vml_uc, vector_unsigned_char, vmlb); + impl_mul!(vml_sh, vector_signed_short, vmlhw); + impl_mul!(vml_uh, vector_unsigned_short, vmlhw); + impl_mul!(vml_sf, vector_signed_int, vmlf); + impl_mul!(vml_uf, vector_unsigned_int, vmlf); + impl_mul!(vml_sg, vector_signed_long_long, std_simd); + impl_mul!(vml_ug, vector_unsigned_long_long, std_simd); + + impl_mul!(vml_float, vector_float, std_simd); + impl_mul!(vml_double, vector_double, vfmdb); + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorMax { + type Result; + unsafe fn vec_max(self, b: Other) -> Self::Result; + } + + test_impl! { vec_vmxsb (a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [vmxb, vmxb] } + test_impl! { vec_vmxsh (a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [vmxh, vmxh] } + test_impl! { vec_vmxsf (a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [vmxf, vmxf] } + test_impl! { vec_vmxsg (a: vector_signed_long_long, b: vector_signed_long_long) -> vector_signed_long_long [vmxg, vmxg] } + + test_impl! { vec_vmxslb (a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [vmxlb, vmxlb] } + test_impl! { vec_vmxslh (a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [vmxlh, vmxlh] } + test_impl! { vec_vmxslf (a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [vmxlf, vmxlf] } + test_impl! { vec_vmxslg (a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long [vmxlg, vmxlg] } + + impl_vec_trait! { [VectorMax vec_max] ~(vmxlb, vmxb, vmxlh, vmxh, vmxlf, vmxf, vmxlg, vmxg) } + + test_impl! { vec_vfmaxsb (a: vector_float, b: vector_float) -> vector_float [simd_fmax, "vector-enhancements-1" vfmaxsb ] } + test_impl! { vec_vfmaxdb (a: vector_double, b: vector_double) -> vector_double [simd_fmax, "vector-enhancements-1" vfmaxdb] } + + impl_vec_trait!([VectorMax vec_max] vec_vfmaxsb (vector_float, vector_float) -> vector_float); + impl_vec_trait!([VectorMax vec_max] vec_vfmaxdb (vector_double, vector_double) -> vector_double); + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorMin { + type Result; + unsafe fn vec_min(self, b: Other) -> Self::Result; + } + + test_impl! { vec_vmnsb (a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [vmnb, vmnb] } + test_impl! { vec_vmnsh (a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [vmnh, vmnh] } + test_impl! { vec_vmnsf (a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [vmnf, vmnf] } + test_impl! 
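A scalar sketch of the integer `vec_mul` lane behaviour: the product is truncated to the element width, i.e. wrapping multiplication per lane. The helper name is ours:

```rust
fn vec_mul_i16x8(a: [i16; 8], b: [i16; 8]) -> [i16; 8] {
    core::array::from_fn(|i| a[i].wrapping_mul(b[i]))
}

fn main() {
    // 300 * 300 = 90_000, which truncates to 90_000 - 65_536 = 24_464 in 16 bits
    assert_eq!(vec_mul_i16x8([300; 8], [300; 8]), [24_464i16; 8]);
}
```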
{ vec_vmnsg (a: vector_signed_long_long, b: vector_signed_long_long) -> vector_signed_long_long [vmng, vmng] } + + test_impl! { vec_vmnslb (a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [vmnlb, vmnlb] } + test_impl! { vec_vmnslh (a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [vmnlh, vmnlh] } + test_impl! { vec_vmnslf (a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [vmnlf, vmnlf] } + test_impl! { vec_vmnslg (a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long [vmnlg, vmnlg] } + + impl_vec_trait! { [VectorMin vec_min] ~(vmxlb, vmxb, vmxlh, vmxh, vmxlf, vmxf, vmxlg, vmxg) } + + test_impl! { vec_vfminsb (a: vector_float, b: vector_float) -> vector_float [simd_fmin, "vector-enhancements-1" vfminsb] } + test_impl! { vec_vfmindb (a: vector_double, b: vector_double) -> vector_double [simd_fmin, "vector-enhancements-1" vfmindb] } + + impl_vec_trait!([VectorMin vec_min] vec_vfminsb (vector_float, vector_float) -> vector_float); + impl_vec_trait!([VectorMin vec_min] vec_vfmindb (vector_double, vector_double) -> vector_double); + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorAbs { + unsafe fn vec_abs(self) -> Self; + } + + macro_rules! impl_abs { + ($name:ident, $ty:ident) => { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn $name(v: s_t_l!($ty)) -> s_t_l!($ty) { + v.vec_max(-v) + } + + impl_vec_trait! { [VectorAbs vec_abs] $name (s_t_l!($ty)) } + }; + } + + impl_abs! { vec_abs_i8, i8x16 } + impl_abs! { vec_abs_i16, i16x8 } + impl_abs! { vec_abs_i32, i32x4 } + impl_abs! { vec_abs_i64, i64x2 } + + test_impl! { vec_abs_f32 (v: vector_float) -> vector_float [ simd_fabs, "vector-enhancements-1" vflpsb ] } + test_impl! { vec_abs_f64 (v: vector_double) -> vector_double [ simd_fabs, vflpdb ] } + + impl_vec_trait! { [VectorAbs vec_abs] vec_abs_f32 (vector_float) } + impl_vec_trait! { [VectorAbs vec_abs] vec_abs_f64 (vector_double) } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorNabs { + unsafe fn vec_nabs(self) -> Self; + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr( + all(test, target_feature = "vector-enhancements-1"), + assert_instr(vflnsb) + )] + unsafe fn vec_nabs_f32(a: vector_float) -> vector_float { + simd_neg(simd_fabs(a)) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vflndb))] + unsafe fn vec_nabs_f64(a: vector_double) -> vector_double { + simd_neg(simd_fabs(a)) + } + + impl_vec_trait! { [VectorNabs vec_nabs] vec_nabs_f32 (vector_float) } + impl_vec_trait! 
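Why `vec_abs` above can be written as `v.vec_max(-v)`: per lane, `max(x, -x)` is `|x|`, with the usual two's-complement caveat that negating the minimum value wraps back to itself (assuming wrapping negation, as with `wrapping_abs`). A scalar sketch with our own helper name:

```rust
fn abs_via_max(x: i8) -> i8 {
    x.max(x.wrapping_neg())
}

fn main() {
    assert_eq!(abs_via_max(-7), 7);
    assert_eq!(abs_via_max(42), 42);
    // the minimum value has no positive counterpart, so it stays negative
    assert_eq!(abs_via_max(i8::MIN), i8::MIN);
    assert_eq!(i8::MIN.wrapping_abs(), i8::MIN);
}
```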
{ [VectorNabs vec_nabs] vec_nabs_f64 (vector_double) } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorNmsub { + unsafe fn vec_nmsub(self, b: Self, c: Self) -> Self; + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr( + all(test, target_feature = "vector-enhancements-2"), + assert_instr(vfnmssb) + )] + unsafe fn vec_nmsub_f32(a: vector_float, b: vector_float, c: vector_float) -> vector_float { + simd_neg(simd_fma(a, b, simd_neg(c))) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorNmsub for vector_float { + #[target_feature(enable = "vector")] + unsafe fn vec_nmsub(self, b: Self, c: Self) -> Self { + vec_nmsub_f32(self, b, c) + } + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr( + all(test, target_feature = "vector-enhancements-2"), + assert_instr(vfnmsdb) + )] + unsafe fn vec_nmsub_f64(a: vector_double, b: vector_double, c: vector_double) -> vector_double { + simd_neg(simd_fma(a, b, simd_neg(c))) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorNmsub for vector_double { + #[target_feature(enable = "vector")] + unsafe fn vec_nmsub(self, b: Self, c: Self) -> Self { + vec_nmsub_f64(self, b, c) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorNmadd { + unsafe fn vec_nmadd(self, b: Self, c: Self) -> Self; + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr( + all(test, target_feature = "vector-enhancements-2"), + assert_instr(vfnmasb) + )] + unsafe fn vec_nmadd_f32(a: vector_float, b: vector_float, c: vector_float) -> vector_float { + simd_neg(simd_fma(a, b, c)) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorNmadd for vector_float { + #[target_feature(enable = "vector")] + unsafe fn vec_nmadd(self, b: Self, c: Self) -> Self { + vec_nmadd_f32(self, b, c) + } + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr( + all(test, target_feature = "vector-enhancements-2"), + assert_instr(vfnmadb) + )] + unsafe fn vec_nmadd_f64(a: vector_double, b: vector_double, c: vector_double) -> vector_double { + simd_neg(simd_fma(a, b, c)) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorNmadd for vector_double { + #[target_feature(enable = "vector")] + unsafe fn vec_nmadd(self, b: Self, c: Self) -> Self { + vec_nmadd_f64(self, b, c) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSplat { + unsafe fn vec_splat(self) -> Self; + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vrepb, IMM2 = 1))] + unsafe fn vrepb(a: vector_signed_char) -> vector_signed_char { + static_assert_uimm_bits!(IMM2, 4); + simd_shuffle(a, a, const { u32x16::from_array([IMM2; 16]) }) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vreph, IMM2 = 1))] + unsafe fn vreph(a: vector_signed_short) -> vector_signed_short { + static_assert_uimm_bits!(IMM2, 3); + simd_shuffle(a, a, const { u32x8::from_array([IMM2; 8]) }) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vrepf, IMM2 = 1))] + unsafe fn vrepf(a: vector_signed_int) -> vector_signed_int { + static_assert_uimm_bits!(IMM2, 2); + simd_shuffle(a, a, const { u32x4::from_array([IMM2; 4]) }) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vrepg, IMM2 = 1))] + unsafe fn vrepg(a: vector_signed_long_long) -> vector_signed_long_long { + 
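A scalar model of the negated fused multiply implementations above: `vec_nmadd(a, b, c) = -(a*b + c)` and `vec_nmsub(a, b, c) = -(a*b - c)`, each with a single rounding (hence `mul_add`); helper names are ours:

```rust
fn nmadd(a: f64, b: f64, c: f64) -> f64 {
    -a.mul_add(b, c) // -(a*b + c), fused
}

fn nmsub(a: f64, b: f64, c: f64) -> f64 {
    -a.mul_add(b, -c) // -(a*b - c), fused
}

fn main() {
    assert_eq!(nmadd(2.0, 3.0, 1.0), -7.0);
    assert_eq!(nmsub(2.0, 3.0, 1.0), -5.0);
}
```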
static_assert_uimm_bits!(IMM2, 1); + simd_shuffle(a, a, const { u32x2::from_array([IMM2; 2]) }) + } + + macro_rules! impl_vec_splat { + ($ty:ty, $fun:ident) => { + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSplat for $ty { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_splat(self) -> Self { + transmute($fun::(transmute(self))) + } + } + }; + } + + impl_vec_splat! { vector_signed_char, vrepb } + impl_vec_splat! { vector_unsigned_char, vrepb } + impl_vec_splat! { vector_bool_char, vrepb } + impl_vec_splat! { vector_signed_short, vreph } + impl_vec_splat! { vector_unsigned_short, vreph } + impl_vec_splat! { vector_bool_short, vreph } + impl_vec_splat! { vector_signed_int, vrepf } + impl_vec_splat! { vector_unsigned_int, vrepf } + impl_vec_splat! { vector_bool_int, vrepf } + impl_vec_splat! { vector_signed_long_long, vrepg } + impl_vec_splat! { vector_unsigned_long_long, vrepg } + impl_vec_splat! { vector_bool_long_long, vrepg } + + impl_vec_splat! { vector_float, vrepf } + impl_vec_splat! { vector_double, vrepg } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSplats { + unsafe fn vec_splats(self) -> Output; + } + + macro_rules! impl_vec_splats { + ($(($fn:ident ($ty:ty, $shortty:tt) $instr:ident)),*) => { + $( + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($instr))] + pub unsafe fn $fn(v: $ty) -> s_t_l!($shortty) { + transmute($shortty::splat(v)) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSplats for $ty { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_splats(self) -> s_t_l!($shortty) { + $fn (self) + } + } + )* + } + } + + impl_vec_splats! { + (vec_splats_u8 (u8, u8x16) vrepb), + (vec_splats_i8 (i8, i8x16) vrepb), + (vec_splats_u16 (u16, u16x8) vreph), + (vec_splats_i16 (i16, i16x8) vreph), + (vec_splats_u32 (u32, u32x4) vrepf), + (vec_splats_i32 (i32, i32x4) vrepf), + (vec_splats_u64 (u64, u64x2) vlvgp), + (vec_splats_i64 (i64, i64x2) vlvgp), + (vec_splats_f32 (f32, f32x4) vrepf), + (vec_splats_f64 (f64, f64x2) vrepg) + } + + macro_rules! impl_bool_vec_splats { + ($(($ty:ty, $shortty:tt, $boolty:ty)),*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSplats<$boolty> for $ty { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_splats(self) -> $boolty { + transmute($shortty::splat(self)) + } + } + )* + } + } + + impl_bool_vec_splats! { + (u8, u8x16, vector_bool_char), + (i8, i8x16, vector_bool_char), + (u16, u16x8, vector_bool_short), + (i16, i16x8, vector_bool_short), + (u32, u32x4, vector_bool_int), + (i32, i32x4, vector_bool_int), + (u64, u64x2, vector_bool_long_long), + (i64, i64x2, vector_bool_long_long) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait CountBits { + type Result; + + unsafe fn vec_cntlz(self) -> Self::Result; + unsafe fn vec_cnttz(self) -> Self::Result; + unsafe fn vec_popcnt(self) -> Self::Result; + } + + macro_rules! 
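A scalar model of the two splat flavours defined above: `vec_splat::<IMM>` broadcasts one existing lane of a vector, while `vec_splats` broadcasts a scalar into every lane; helper names are ours:

```rust
// broadcast lane IMM of v into all lanes
fn splat_lane<const IMM: usize>(v: [i32; 4]) -> [i32; 4] {
    [v[IMM]; 4]
}

// broadcast a scalar into all lanes
fn splats(x: i32) -> [i32; 4] {
    [x; 4]
}

fn main() {
    assert_eq!(splat_lane::<2>([10, 20, 30, 40]), [30, 30, 30, 30]);
    assert_eq!(splats(7), [7, 7, 7, 7]);
}
```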
impl_count_bits { + ($ty:tt) => { + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl CountBits for $ty { + type Result = t_u!($ty); + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cntlz(self) -> Self::Result { + transmute(simd_ctlz(self)) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cnttz(self) -> Self::Result { + transmute(simd_cttz(self)) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_popcnt(self) -> Self::Result { + transmute(simd_ctpop(self)) + } + } + }; + } + + impl_count_bits!(vector_signed_char); + impl_count_bits!(vector_unsigned_char); + impl_count_bits!(vector_signed_short); + impl_count_bits!(vector_unsigned_short); + impl_count_bits!(vector_signed_int); + impl_count_bits!(vector_unsigned_int); + impl_count_bits!(vector_signed_long_long); + impl_count_bits!(vector_unsigned_long_long); + + test_impl! { vec_clzb_signed +(a: vector_signed_char) -> vector_unsigned_char [simd_ctlz, vclzb] } + test_impl! { vec_clzh_signed +(a: vector_signed_short) -> vector_unsigned_short [simd_ctlz, vclzh] } + test_impl! { vec_clzf_signed +(a: vector_signed_int) -> vector_unsigned_int [simd_ctlz, vclzf] } + test_impl! { vec_clzg_signed +(a: vector_signed_long_long) -> vector_unsigned_long_long [simd_ctlz, vclzg] } + + test_impl! { vec_clzb_unsigned +(a: vector_unsigned_char) -> vector_unsigned_char [simd_ctlz, vclzb] } + test_impl! { vec_clzh_unsigned +(a: vector_unsigned_short) -> vector_unsigned_short [simd_ctlz, vclzh] } + test_impl! { vec_clzf_unsigned +(a: vector_unsigned_int) -> vector_unsigned_int [simd_ctlz, vclzf] } + test_impl! { vec_clzg_unsigned +(a: vector_unsigned_long_long) -> vector_unsigned_long_long [simd_ctlz, vclzg] } + + test_impl! { vec_ctzb_signed +(a: vector_signed_char) -> vector_unsigned_char [simd_cttz, vctzb] } + test_impl! { vec_ctzh_signed +(a: vector_signed_short) -> vector_unsigned_short [simd_cttz, vctzh] } + test_impl! { vec_ctzf_signed +(a: vector_signed_int) -> vector_unsigned_int [simd_cttz, vctzf] } + test_impl! { vec_ctzg_signed +(a: vector_signed_long_long) -> vector_unsigned_long_long [simd_cttz, vctzg] } + + test_impl! { vec_ctzb_unsigned +(a: vector_unsigned_char) -> vector_unsigned_char [simd_cttz, vctzb] } + test_impl! { vec_ctzh_unsigned +(a: vector_unsigned_short) -> vector_unsigned_short [simd_cttz, vctzh] } + test_impl! { vec_ctzf_unsigned +(a: vector_unsigned_int) -> vector_unsigned_int [simd_cttz, vctzf] } + test_impl! { vec_ctzg_unsigned +(a: vector_unsigned_long_long) -> vector_unsigned_long_long [simd_cttz, vctzg] } + + test_impl! { vec_vpopctb_signed +(a: vector_signed_char) -> vector_signed_char [simd_ctpop, vpopctb] } + test_impl! { vec_vpopcth_signed +(a: vector_signed_short) -> vector_signed_short [simd_ctpop, "vector-enhancements-1" vpopcth] } + test_impl! { vec_vpopctf_signed +(a: vector_signed_int) -> vector_signed_int [simd_ctpop, "vector-enhancements-1" vpopctf] } + test_impl! { vec_vpopctg_signed +(a: vector_signed_long_long) -> vector_signed_long_long [simd_ctpop, "vector-enhancements-1" vpopctg] } + + test_impl! { vec_vpopctb_unsigned +(a: vector_unsigned_char) -> vector_unsigned_char [simd_ctpop, vpopctb] } + test_impl! { vec_vpopcth_unsigned +(a: vector_unsigned_short) -> vector_unsigned_short [simd_ctpop, "vector-enhancements-1" vpopcth] } + test_impl! { vec_vpopctf_unsigned +(a: vector_unsigned_int) -> vector_unsigned_int [simd_ctpop, "vector-enhancements-1" vpopctf] } + test_impl! 
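A scalar model of the three bit-count operations wired up above, shown per 32-bit lane; they map directly onto the corresponding `u32` methods (helper name ours):

```rust
// (leading zeros, trailing zeros, population count) for one lane
fn counts(x: u32) -> (u32, u32, u32) {
    (x.leading_zeros(), x.trailing_zeros(), x.count_ones())
}

fn main() {
    assert_eq!(counts(0b0000_1000), (28, 3, 1));
    assert_eq!(counts(0), (32, 32, 0));
}
```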
{ vec_vpopctg_unsigned +(a: vector_unsigned_long_long) -> vector_unsigned_long_long [simd_ctpop, "vector-enhancements-1" vpopctg] } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorAnd { + type Result; + unsafe fn vec_and(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorAnd vec_and] ~(simd_and) } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorOr { + type Result; + unsafe fn vec_or(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorOr vec_or] ~(simd_or) } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorXor { + type Result; + unsafe fn vec_xor(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorXor vec_xor] ~(simd_xor) } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(all(test, target_feature = "vector-enhancements-1"), assert_instr(vno))] + unsafe fn nor(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char { + let a: u8x16 = transmute(a); + let b: u8x16 = transmute(b); + transmute(simd_xor(simd_or(a, b), u8x16::splat(0xff))) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorNor { + type Result; + unsafe fn vec_nor(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorNor vec_nor]+ 2c (nor) } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(all(test, target_feature = "vector-enhancements-1"), assert_instr(vnn))] + unsafe fn nand(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char { + let a: u8x16 = transmute(a); + let b: u8x16 = transmute(b); + transmute(simd_xor(simd_and(a, b), u8x16::splat(0xff))) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorNand { + type Result; + unsafe fn vec_nand(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorNand vec_nand]+ 2c (nand) } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(all(test, target_feature = "vector-enhancements-1"), assert_instr(vnx))] + unsafe fn eqv(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char { + let a: u8x16 = transmute(a); + let b: u8x16 = transmute(b); + transmute(simd_xor(simd_xor(a, b), u8x16::splat(0xff))) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorEqv { + type Result; + unsafe fn vec_eqv(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorEqv vec_eqv]+ 2c (eqv) } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(all(test, target_feature = "vector-enhancements-1"), assert_instr(vnc))] + unsafe fn andc(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char { + let a = transmute(a); + let b = transmute(b); + transmute(simd_and(simd_xor(u8x16::splat(0xff), b), a)) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorAndc { + type Result; + unsafe fn vec_andc(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorAndc vec_andc]+ 2c (andc) } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(all(test, target_feature = "vector-enhancements-1"), assert_instr(voc))] + unsafe fn orc(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char { + let a = transmute(a); + let b = transmute(b); + transmute(simd_or(simd_xor(u8x16::splat(0xff), b), a)) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorOrc { + type Result; + unsafe fn vec_orc(self, b: Other) -> Self::Result; + } + + impl_vec_trait! 
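// The composite bitwise helpers follow the usual identities, element-wise:
//   nor(a, b) == !(a | b)    nand(a, b) == !(a & b)    eqv(a, b) == !(a ^ b)
//   andc(a, b) == a & !b     orc(a, b)  == a | !b
// With vector-enhancements-1 the instruction checks expect the single
// instructions vno, vnn, vnx, vnc and voc respectively.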
{ [VectorOrc vec_orc]+ 2c (orc) }
+
+    test_impl! { vec_roundc_f32 (a: vector_float) -> vector_float [nearbyint_v4f32, "vector-enhancements-1" vfisb] }
+    test_impl! { vec_roundc_f64 (a: vector_double) -> vector_double [nearbyint_v2f64, vfidb] }
+
+    // FIXME(llvm) llvm trunk already lowers roundeven to vfidb, but rust does not use it yet
+    // use https://godbolt.org/z/cWq95fexe to check, and enable the instruction test when it works
+    test_impl! { vec_round_f32 (a: vector_float) -> vector_float [roundeven_v4f32, _] }
+    test_impl! { vec_round_f64 (a: vector_double) -> vector_double [roundeven_v2f64, _] }
+
+    test_impl! { vec_rint_f32 (a: vector_float) -> vector_float [rint_v4f32, "vector-enhancements-1" vfisb] }
+    test_impl! { vec_rint_f64 (a: vector_double) -> vector_double [rint_v2f64, vfidb] }
+
+    #[unstable(feature = "stdarch_s390x", issue = "135681")]
+    pub trait VectorRoundc {
+        unsafe fn vec_roundc(self) -> Self;
+    }
+
+    #[unstable(feature = "stdarch_s390x", issue = "135681")]
+    pub trait VectorRound {
+        unsafe fn vec_round(self) -> Self;
+    }
+
+    #[unstable(feature = "stdarch_s390x", issue = "135681")]
+    pub trait VectorRint {
+        unsafe fn vec_rint(self) -> Self;
+    }
+
+    impl_vec_trait! { [VectorRoundc vec_roundc] vec_roundc_f32 (vector_float) }
+    impl_vec_trait! { [VectorRoundc vec_roundc] vec_roundc_f64 (vector_double) }
+
+    impl_vec_trait! { [VectorRound vec_round] vec_round_f32 (vector_float) }
+    impl_vec_trait! { [VectorRound vec_round] vec_round_f64 (vector_double) }
+
+    impl_vec_trait! { [VectorRint vec_rint] vec_rint_f32 (vector_float) }
+    impl_vec_trait! { [VectorRint vec_rint] vec_rint_f64 (vector_double) }
+
+    #[unstable(feature = "stdarch_s390x", issue = "135681")]
+    pub trait VectorTrunc {
+        // same as vec_roundz
+        unsafe fn vec_trunc(self) -> Self;
+    }
+
+    #[unstable(feature = "stdarch_s390x", issue = "135681")]
+    pub trait VectorCeil {
+        // same as vec_roundp
+        unsafe fn vec_ceil(self) -> Self;
+    }
+
+    #[unstable(feature = "stdarch_s390x", issue = "135681")]
+    pub trait VectorFloor {
+        // same as vec_roundm
+        unsafe fn vec_floor(self) -> Self;
+    }
+
+    impl_vec_trait! { [VectorTrunc vec_trunc] simd_trunc (vector_float) }
+    impl_vec_trait! { [VectorTrunc vec_trunc] simd_trunc (vector_double) }
+
+    impl_vec_trait! { [VectorCeil vec_ceil] simd_ceil (vector_float) }
+    impl_vec_trait! { [VectorCeil vec_ceil] simd_ceil (vector_double) }
+
+    impl_vec_trait! { [VectorFloor vec_floor] simd_floor (vector_float) }
+    impl_vec_trait! { [VectorFloor vec_floor] simd_floor (vector_double) }
+
+    macro_rules! impl_vec_shift {
+        ([$Trait:ident $m:ident] ($b:ident, $h:ident, $w:ident, $g:ident)) => {
+            impl_vec_trait!{ [$Trait $m]+ $b (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char }
+            impl_vec_trait!{ [$Trait $m]+ $b (vector_signed_char, vector_unsigned_char) -> vector_signed_char }
+            impl_vec_trait!{ [$Trait $m]+ $h (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_short }
+            impl_vec_trait!{ [$Trait $m]+ $h (vector_signed_short, vector_unsigned_short) -> vector_signed_short }
+            impl_vec_trait!{ [$Trait $m]+ $w (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_int }
+            impl_vec_trait!{ [$Trait $m]+ $w (vector_signed_int, vector_unsigned_int) -> vector_signed_int }
+            impl_vec_trait!{ [$Trait $m]+ $g (vector_unsigned_long_long, vector_unsigned_long_long) -> vector_unsigned_long_long }
+            impl_vec_trait!{ [$Trait $m]+ $g (vector_signed_long_long, vector_unsigned_long_long) -> vector_signed_long_long }
+        };
+    }
+
+    macro_rules!
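// Rounding summary for the traits above (all element-wise):
//   vec_roundc uses the current rounding mode without raising inexact (nearbyint),
//   vec_rint uses the current rounding mode and may raise inexact (rint),
//   vec_round rounds to nearest with ties to even (roundeven), and
//   vec_trunc / vec_ceil / vec_floor round toward zero / +inf / -inf.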
impl_shift { + ($fun:ident $intr:ident $ty:ident) => { + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($fun))] + unsafe fn $fun(a: t_t_l!($ty), b: t_t_l!($ty)) -> t_t_l!($ty) { + let a = transmute(a); + // use the remainder of b by the width of a's elements to prevent UB + let b = simd_rem(transmute(b), ::splat($ty::BITS as $ty)); + + transmute($intr(a, b)) + } + }; + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSl { + type Result; + unsafe fn vec_sl(self, b: Other) -> Self::Result; + } + + impl_shift! { veslvb simd_shl u8 } + impl_shift! { veslvh simd_shl u16 } + impl_shift! { veslvf simd_shl u32 } + impl_shift! { veslvg simd_shl u64 } + + impl_vec_shift! { [VectorSl vec_sl] (veslvb, veslvh, veslvf, veslvg) } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSr { + type Result; + unsafe fn vec_sr(self, b: Other) -> Self::Result; + } + + impl_shift! { vesrlvb simd_shr u8 } + impl_shift! { vesrlvh simd_shr u16 } + impl_shift! { vesrlvf simd_shr u32 } + impl_shift! { vesrlvg simd_shr u64 } + + impl_vec_shift! { [VectorSr vec_sr] (vesrlvb, vesrlvh, vesrlvf, vesrlvg) } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSra { + type Result; + unsafe fn vec_sra(self, b: Other) -> Self::Result; + } + + impl_shift! { vesravb simd_shr i8 } + impl_shift! { vesravh simd_shr i16 } + impl_shift! { vesravf simd_shr i32 } + impl_shift! { vesravg simd_shr i64 } + + impl_vec_shift! { [VectorSra vec_sra] (vesravb, vesravh, vesravf, vesravg) } + + macro_rules! impl_vec_shift_byte { + ([$trait:ident $m:ident] ($f:ident)) => { + impl_vec_trait!{ [$trait $m]+ $f (vector_unsigned_char, vector_signed_char) -> vector_unsigned_char } + impl_vec_trait!{ [$trait $m]+ $f (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char } + impl_vec_trait!{ [$trait $m]+ $f (vector_signed_char, vector_signed_char) -> vector_signed_char } + impl_vec_trait!{ [$trait $m]+ $f (vector_signed_char, vector_unsigned_char) -> vector_signed_char } + impl_vec_trait!{ [$trait $m]+ $f (vector_unsigned_short, vector_signed_short) -> vector_unsigned_short } + impl_vec_trait!{ [$trait $m]+ $f (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_short } + impl_vec_trait!{ [$trait $m]+ $f (vector_signed_short, vector_signed_short) -> vector_signed_short } + impl_vec_trait!{ [$trait $m]+ $f (vector_signed_short, vector_unsigned_short) -> vector_signed_short } + impl_vec_trait!{ [$trait $m]+ $f (vector_unsigned_int, vector_signed_int) -> vector_unsigned_int } + impl_vec_trait!{ [$trait $m]+ $f (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_int } + impl_vec_trait!{ [$trait $m]+ $f (vector_signed_int, vector_signed_int) -> vector_signed_int } + impl_vec_trait!{ [$trait $m]+ $f (vector_signed_int, vector_unsigned_int) -> vector_signed_int } + impl_vec_trait!{ [$trait $m]+ $f (vector_unsigned_long_long, vector_signed_long_long) -> vector_unsigned_long_long } + impl_vec_trait!{ [$trait $m]+ $f (vector_unsigned_long_long, vector_unsigned_long_long) -> vector_unsigned_long_long } + impl_vec_trait!{ [$trait $m]+ $f (vector_signed_long_long, vector_signed_long_long) -> vector_signed_long_long } + impl_vec_trait!{ [$trait $m]+ $f (vector_signed_long_long, vector_unsigned_long_long) -> vector_signed_long_long } + impl_vec_trait!{ [$trait $m]+ $f (vector_float, vector_signed_int) -> vector_float } + impl_vec_trait!{ [$trait $m]+ $f (vector_float, vector_unsigned_int) -> vector_float } + 
impl_vec_trait!{ [$trait $m]+ $f (vector_double, vector_signed_long_long) -> vector_double } + impl_vec_trait!{ [$trait $m]+ $f (vector_double, vector_unsigned_long_long) -> vector_double } + }; + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSlb { + type Result; + unsafe fn vec_slb(self, b: Other) -> Self::Result; + } + + impl_vec_shift_byte! { [VectorSlb vec_slb] (vslb) } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSrab { + type Result; + unsafe fn vec_srab(self, b: Other) -> Self::Result; + } + + impl_vec_shift_byte! { [VectorSrab vec_srab] (vsrab) } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSrb { + type Result; + unsafe fn vec_srb(self, b: Other) -> Self::Result; + } + + impl_vec_shift_byte! { [VectorSrb vec_srb] (vsrlb) } + + macro_rules! impl_vec_shift_long { + ([$trait:ident $m:ident] ($f:ident)) => { + impl_vec_trait!{ [$trait $m]+ $f (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char } + impl_vec_trait!{ [$trait $m]+ $f (vector_signed_char, vector_unsigned_char) -> vector_signed_char } + impl_vec_trait!{ [$trait $m]+ $f (vector_unsigned_short, vector_unsigned_char) -> vector_unsigned_short } + impl_vec_trait!{ [$trait $m]+ $f (vector_signed_short, vector_unsigned_char) -> vector_signed_short } + impl_vec_trait!{ [$trait $m]+ $f (vector_unsigned_int, vector_unsigned_char) -> vector_unsigned_int } + impl_vec_trait!{ [$trait $m]+ $f (vector_signed_int, vector_unsigned_char) -> vector_signed_int } + impl_vec_trait!{ [$trait $m]+ $f (vector_unsigned_long_long, vector_unsigned_char) -> vector_unsigned_long_long } + impl_vec_trait!{ [$trait $m]+ $f (vector_signed_long_long, vector_unsigned_char) -> vector_signed_long_long } + }; + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSrl { + type Result; + unsafe fn vec_srl(self, b: Other) -> Self::Result; + } + + impl_vec_shift_long! { [VectorSrl vec_srl] (vsrl) } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSral { + type Result; + unsafe fn vec_sral(self, b: Other) -> Self::Result; + } + + impl_vec_shift_long! { [VectorSral vec_sral] (vsra) } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSll { + type Result; + unsafe fn vec_sll(self, b: Other) -> Self::Result; + } + + impl_vec_shift_long! { [VectorSll vec_sll] (vsl) } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorRl { + type Result; + unsafe fn vec_rl(self, b: Other) -> Self::Result; + } + + macro_rules! impl_rot { + ($fun:ident $intr:ident $ty:ident) => { + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($fun))] + unsafe fn $fun(a: t_t_l!($ty), b: t_t_l!($ty)) -> t_t_l!($ty) { + transmute($intr(transmute(a), transmute(a), transmute(b))) + } + }; + } + + impl_rot! { verllvb fshlb u8 } + impl_rot! { verllvh fshlh u16 } + impl_rot! { verllvf fshlf u32 } + impl_rot! { verllvg fshlg u64 } + + impl_vec_shift! { [VectorRl vec_rl] (verllvb, verllvh, verllvf, verllvg) } + + macro_rules! 
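// All element shifts above (vec_sl, vec_sr, vec_sra) reduce the shift amount
// modulo the element width via `simd_rem` before shifting, because shifting by
// the full element width or more is undefined behaviour for the generic
// `simd_shl`/`simd_shr` intrinsics. For example, shifting a 32-bit lane by 35
// behaves like shifting it by 3.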
test_rot_imm { + ($fun:ident $instr:ident $intr:ident $ty:ident) => { + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($instr))] + unsafe fn $fun(a: t_t_l!($ty), bits: core::ffi::c_ulong) -> t_t_l!($ty) { + // mod by the number of bits in a's element type to prevent UB + let bits = (bits % $ty::BITS as core::ffi::c_ulong) as $ty; + let a = transmute(a); + let b = ::splat(bits); + + transmute($intr(a, a, transmute(b))) + } + }; + } + + test_rot_imm! { verllvb_imm verllb fshlb u8 } + test_rot_imm! { verllvh_imm verllh fshlh u16 } + test_rot_imm! { verllvf_imm verllf fshlf u32 } + test_rot_imm! { verllvg_imm verllg fshlg u64 } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorRli { + unsafe fn vec_rli(self, bits: core::ffi::c_ulong) -> Self; + } + + macro_rules! impl_rot_imm { + ($($ty:ident, $intr:ident),*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorRli for $ty { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_rli(self, bits: core::ffi::c_ulong) -> Self { + transmute($intr(transmute(self), bits)) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorRli for t_u!($ty) { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_rli(self, bits: core::ffi::c_ulong) -> Self { + $intr(self, bits) + } + } + )* + } + } + + impl_rot_imm! { + vector_signed_char, verllvb_imm, + vector_signed_short, verllvh_imm, + vector_signed_int, verllvf_imm, + vector_signed_long_long, verllvg_imm + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorRlMask { + unsafe fn vec_rl_mask(self, other: Other) -> Self; + } + + macro_rules! impl_rl_mask { + ($($ty:ident, $intr:ident, $fun:ident),*) => { + $( + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($intr, IMM8 = 6))] + unsafe fn $fun(a: $ty, b: t_u!($ty)) -> $ty { + // mod by the number of bits in a's element type to prevent UB + $intr(a, a, transmute(b), const { (IMM8 % ::BITS as u8) as i32 }) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorRlMask for $ty { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_rl_mask(self, other: t_u!($ty)) -> Self { + $fun::(self, other) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorRlMask for t_u!($ty) { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_rl_mask(self, other: t_u!($ty)) -> Self { + transmute($fun::(transmute(self), transmute(other))) + } + } + )* + } + } + + impl_rl_mask! { + vector_signed_char, verimb, test_verimb, + vector_signed_short, verimh, test_verimh, + vector_signed_int, verimf, test_verimf, + vector_signed_long_long, verimg, test_verimg + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorReve { + unsafe fn vec_reve(self) -> Self; + } + + #[repr(simd)] + struct ReverseMask([u32; N]); + + impl ReverseMask { + const fn new() -> Self { + let mut index = [0; N]; + let mut i = 0; + while i < N { + index[i] = (N - i - 1) as u32; + i += 1; + } + ReverseMask(index) + } + } + + macro_rules! 
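// `ReverseMask::<N>::new()` builds the shuffle index vector [N-1, N-2, ..., 0],
// e.g. N = 4 gives [3, 2, 1, 0], which `simd_shuffle` uses to reverse the
// element order of a vector.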
impl_reve { + ($($ty:ident, $fun:ident, $instr:ident),*) => { + $( + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($instr))] + unsafe fn $fun(a: $ty) -> $ty { + const N: usize = core::mem::size_of::<$ty>() / core::mem::size_of::(); + simd_shuffle(a, a, const { ShuffleMask::::reverse() }) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorReve for $ty { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_reve(self) -> Self { + $fun(self) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorReve for t_u!($ty) { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_reve(self) -> Self { + transmute($fun(transmute(self))) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorReve for t_b!($ty) { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_reve(self) -> Self { + transmute($fun(transmute(self))) + } + } + )* + } + } + + impl_reve! { + vector_signed_char, reveb, vperm, + vector_signed_short, reveh, vperm, + vector_signed_int, revef, vperm, + vector_signed_long_long, reveg, vpdi + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorReve for vector_float { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_reve(self) -> Self { + transmute(transmute::<_, vector_signed_int>(self).vec_reve()) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorReve for vector_double { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_reve(self) -> Self { + transmute(transmute::<_, vector_signed_long_long>(self).vec_reve()) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorRevb { + unsafe fn vec_revb(self) -> Self; + } + + test_impl! { bswapb (a: vector_signed_char) -> vector_signed_char [simd_bswap, _] } + test_impl! { bswaph (a: vector_signed_short) -> vector_signed_short [simd_bswap, vperm] } + test_impl! { bswapf (a: vector_signed_int) -> vector_signed_int [simd_bswap, vperm] } + test_impl! { bswapg (a: vector_signed_long_long) -> vector_signed_long_long [simd_bswap, vperm] } + + impl_vec_trait! { [VectorRevb vec_revb]+ bswapb (vector_unsigned_char) } + impl_vec_trait! { [VectorRevb vec_revb]+ bswapb (vector_signed_char) } + impl_vec_trait! { [VectorRevb vec_revb]+ bswaph (vector_unsigned_short) } + impl_vec_trait! { [VectorRevb vec_revb]+ bswaph (vector_signed_short) } + impl_vec_trait! { [VectorRevb vec_revb]+ bswapf (vector_unsigned_int) } + impl_vec_trait! { [VectorRevb vec_revb]+ bswapf (vector_signed_int) } + impl_vec_trait! { [VectorRevb vec_revb]+ bswapg (vector_unsigned_long_long) } + impl_vec_trait! 
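// `vec_reve` reverses the order of the elements, while `vec_revb` reverses the
// bytes within each element. For single-byte elements the byte swap is a no-op,
// which is why `bswapb` has no instruction to assert against.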
{ [VectorRevb vec_revb]+ bswapg (vector_signed_long_long) } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorRevb for vector_float { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_revb(self) -> Self { + transmute(transmute::<_, vector_signed_int>(self).vec_revb()) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorRevb for vector_double { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_revb(self) -> Self { + transmute(transmute::<_, vector_signed_long_long>(self).vec_revb()) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorMergel { + unsafe fn vec_mergel(self, other: Self) -> Self; + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorMergeh { + unsafe fn vec_mergeh(self, other: Self) -> Self; + } + + macro_rules! impl_merge { + ($($ty:ident, $mergel:ident, $mergeh:ident),*) => { + $( + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($mergel))] + unsafe fn $mergel(a: $ty, b: $ty) -> $ty { + const N: usize = core::mem::size_of::<$ty>() / core::mem::size_of::(); + simd_shuffle(a, b, const { ShuffleMask::::merge_low() }) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorMergel for $ty { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_mergel(self, other: Self) -> Self { + $mergel(self, other) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorMergel for t_u!($ty) { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_mergel(self, other: Self) -> Self { + transmute($mergel(transmute(self), transmute(other))) + } + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($mergeh))] + unsafe fn $mergeh(a: $ty, b: $ty) -> $ty { + const N: usize = core::mem::size_of::<$ty>() / core::mem::size_of::(); + simd_shuffle(a, b, const { ShuffleMask::::merge_high() }) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorMergeh for $ty { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_mergeh(self, other: Self) -> Self { + $mergeh(self, other) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorMergeh for t_u!($ty) { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_mergeh(self, other: Self) -> Self { + transmute($mergeh(transmute(self), transmute(other))) + } + } + )* + } + } + + impl_merge! { + vector_signed_char, vmrlb, vmrhb, + vector_signed_short, vmrlh, vmrhh, + vector_signed_int, vmrlf, vmrhf, + vector_signed_long_long, vmrlg, vmrhg + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorPerm { + unsafe fn vec_perm(self, other: Self, c: vector_unsigned_char) -> Self; + } + + macro_rules! impl_merge { + ($($ty:ident),*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorPerm for $ty { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_perm(self, other: Self, c: vector_unsigned_char) -> Self { + transmute(vperm(transmute(self), transmute(other), c)) + } + } + )* + } + } + + impl_merge! 
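// `vec_mergeh` / `vec_mergel` interleave the high (leftmost) or low (rightmost)
// halves of the two operands. Illustrative, for 4 x i32:
//   mergeh([a0, a1, a2, a3], [b0, b1, b2, b3]) == [a0, b0, a1, b1]
//   mergel([a0, a1, a2, a3], [b0, b1, b2, b3]) == [a2, b2, a3, b3]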
{ + vector_signed_char, + vector_signed_short, + vector_signed_int, + vector_signed_long_long, + vector_unsigned_char, + vector_unsigned_short, + vector_unsigned_int, + vector_unsigned_long_long, + vector_bool_char, + vector_bool_short, + vector_bool_int, + vector_bool_long_long, + vector_float, + vector_double + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSumU128 { + unsafe fn vec_sum_u128(self, other: Self) -> vector_unsigned_char; + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vsumqf))] + pub unsafe fn vec_vsumqf(a: vector_unsigned_int, b: vector_unsigned_int) -> u128 { + transmute(vsumqf(a, b)) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vsumqg))] + pub unsafe fn vec_vsumqg(a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> u128 { + transmute(vsumqg(a, b)) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSumU128 for vector_unsigned_int { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_sum_u128(self, other: Self) -> vector_unsigned_char { + transmute(vec_vsumqf(self, other)) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSumU128 for vector_unsigned_long_long { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_sum_u128(self, other: Self) -> vector_unsigned_char { + transmute(vec_vsumqg(self, other)) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSum2 { + unsafe fn vec_sum2(self, other: Self) -> vector_unsigned_long_long; + } + + test_impl! { vec_vsumgh (a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_long_long [vsumgh, vsumgh] } + test_impl! { vec_vsumgf (a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_long_long [vsumgf, vsumgf] } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSum2 for vector_unsigned_short { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_sum2(self, other: Self) -> vector_unsigned_long_long { + vec_vsumgh(self, other) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSum2 for vector_unsigned_int { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_sum2(self, other: Self) -> vector_unsigned_long_long { + vec_vsumgf(self, other) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSum4 { + unsafe fn vec_sum4(self, other: Self) -> vector_unsigned_int; + } + + test_impl! { vec_vsumb (a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_int [vsumb, vsumb] } + test_impl! { vec_vsumh (a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_int [vsumh, vsumh] } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSum4 for vector_unsigned_char { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_sum4(self, other: Self) -> vector_unsigned_int { + vec_vsumb(self, other) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSum4 for vector_unsigned_short { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_sum4(self, other: Self) -> vector_unsigned_int { + vec_vsumh(self, other) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSubc { + type Result; + unsafe fn vec_subc(self, b: Other) -> Self::Result; + } + + test_impl! 
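// The sum-across traits map directly onto the hardware sum instructions:
// `vec_sum4` uses vsumb/vsumh (byte/halfword sums into each word),
// `vec_sum2` uses vsumgh/vsumgf (sums into each doubleword), and
// `vec_sum_u128` uses vsumqf/vsumqg (a single 128-bit sum).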
{ vec_vscbib (a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [vscbib, vscbib] } + test_impl! { vec_vscbih (a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [vscbih, vscbih] } + test_impl! { vec_vscbif (a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [vscbif, vscbif] } + test_impl! { vec_vscbig (a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long [vscbig, vscbig] } + + impl_vec_trait! {[VectorSubc vec_subc] vec_vscbib (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char } + impl_vec_trait! {[VectorSubc vec_subc] vec_vscbih (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_short } + impl_vec_trait! {[VectorSubc vec_subc] vec_vscbif (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_int } + impl_vec_trait! {[VectorSubc vec_subc] vec_vscbig (vector_unsigned_long_long, vector_unsigned_long_long) -> vector_unsigned_long_long } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSqrt { + unsafe fn vec_sqrt(self) -> Self; + } + + test_impl! { vec_sqrt_f32 (v: vector_float) -> vector_float [ simd_fsqrt, "vector-enhancements-1" vfsqsb ] } + test_impl! { vec_sqrt_f64 (v: vector_double) -> vector_double [ simd_fsqrt, vfsqdb ] } + + impl_vec_trait! { [VectorSqrt vec_sqrt] vec_sqrt_f32 (vector_float) } + impl_vec_trait! { [VectorSqrt vec_sqrt] vec_sqrt_f64 (vector_double) } + + macro_rules! vfae_wrapper { + ($($name:ident $ty:ident)*) => { + $( + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($name, IMM = 0))] + unsafe fn $name( + a: $ty, + b: $ty, + ) -> $ty { + super::$name(a, b, IMM) + } + )* + } + } + + vfae_wrapper! { + vfaeb vector_signed_char + vfaeh vector_signed_short + vfaef vector_signed_int + + vfaezb vector_signed_char + vfaezh vector_signed_short + vfaezf vector_signed_int + } + + macro_rules! impl_vfae { + ([idx_cc $Trait:ident $m:ident] $imm:ident $b:ident $h:ident $f:ident) => { + impl_vfae! { [idx_cc $Trait $m] $imm + $b vector_signed_char vector_signed_char + $b vector_unsigned_char vector_unsigned_char + $b vector_bool_char vector_unsigned_char + + $h vector_signed_short vector_signed_short + $h vector_unsigned_short vector_unsigned_short + $h vector_bool_short vector_unsigned_short + + $f vector_signed_int vector_signed_int + $f vector_unsigned_int vector_unsigned_int + $f vector_bool_int vector_unsigned_int + } + }; + ([idx_cc $Trait:ident $m:ident] $imm:ident $($fun:ident $ty:ident $r:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl $Trait for $ty { + type Result = $r; + #[inline] + #[target_feature(enable = "vector")] + unsafe fn $m(self, b: Self) -> (Self::Result, i32) { + let PackedTuple { x, y } = $fun::<{ FindImm::$imm as i32 }>(transmute(self), transmute(b)); + (transmute(x), y) + } + } + )* + }; + ([cc $Trait:ident $m:ident] $imm:ident $b:ident $h:ident $f:ident) => { + impl_vfae! 
{ [cc $Trait $m] $imm + $b vector_signed_char + $b vector_unsigned_char + $b vector_bool_char + + $h vector_signed_short + $h vector_unsigned_short + $h vector_bool_short + + $f vector_signed_int + $f vector_unsigned_int + $f vector_bool_int + } + }; + ([cc $Trait:ident $m:ident] $imm:ident $($fun:ident $ty:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl $Trait for $ty { + type Result = t_b!($ty); + #[inline] + #[target_feature(enable = "vector")] + unsafe fn $m(self, b: Self) -> (Self::Result, i32) { + let PackedTuple { x, y } = $fun::<{ FindImm::$imm as i32 }>(transmute(self), transmute(b)); + (transmute(x), y) + } + } + )* + }; + ([idx $Trait:ident $m:ident] $imm:ident $b:ident $h:ident $f:ident) => { + impl_vfae! { [idx $Trait $m] $imm + $b vector_signed_char vector_signed_char + $b vector_unsigned_char vector_unsigned_char + $b vector_bool_char vector_unsigned_char + + $h vector_signed_short vector_signed_short + $h vector_unsigned_short vector_unsigned_short + $h vector_bool_short vector_unsigned_short + + $f vector_signed_int vector_signed_int + $f vector_unsigned_int vector_unsigned_int + $f vector_bool_int vector_unsigned_int + } + }; + ([idx $Trait:ident $m:ident] $imm:ident $($fun:ident $ty:ident $r:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl $Trait for $ty { + type Result = $r; + #[inline] + #[target_feature(enable = "vector")] + unsafe fn $m(self, b: Self) -> Self::Result { + transmute($fun::<{ FindImm::$imm as i32 }>(transmute(self), transmute(b))) + } + } + )* + }; + ([$Trait:ident $m:ident] $imm:ident $b:ident $h:ident $f:ident) => { + impl_vfae! { [$Trait $m] $imm + $b vector_signed_char + $b vector_unsigned_char + $b vector_bool_char + + $h vector_signed_short + $h vector_unsigned_short + $h vector_bool_short + + $f vector_signed_int + $f vector_unsigned_int + $f vector_bool_int + } + }; + ([$Trait:ident $m:ident] $imm:ident $($fun:ident $ty:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl $Trait for $ty { + type Result = t_b!($ty); + #[inline] + #[target_feature(enable = "vector")] + unsafe fn $m(self, b: Self) -> Self::Result { + transmute($fun::<{ FindImm::$imm as i32 }>(transmute(self), transmute(b))) + } + } + )* + }; + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorFindAnyEq { + type Result; + unsafe fn vec_find_any_eq(self, other: Other) -> Self::Result; + } + + impl_vfae! { [VectorFindAnyEq vec_find_any_eq] Eq vfaeb vfaeh vfaef } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorFindAnyNe { + type Result; + unsafe fn vec_find_any_ne(self, other: Other) -> Self::Result; + } + + impl_vfae! { [VectorFindAnyNe vec_find_any_ne] Ne vfaeb vfaeh vfaef } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorFindAnyEqOrZeroIdx { + type Result; + unsafe fn vec_find_any_eq_or_0_idx(self, other: Other) -> Self::Result; + } + + impl_vfae! 
{ [idx VectorFindAnyEqOrZeroIdx vec_find_any_eq_or_0_idx] EqIdx + vfaezb vector_signed_char vector_signed_char + vfaezb vector_unsigned_char vector_unsigned_char + vfaezb vector_bool_char vector_unsigned_char + + vfaezh vector_signed_short vector_signed_short + vfaezh vector_unsigned_short vector_unsigned_short + vfaezh vector_bool_short vector_unsigned_short + + vfaezf vector_signed_int vector_signed_int + vfaezf vector_unsigned_int vector_unsigned_int + vfaezf vector_bool_int vector_unsigned_int + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorFindAnyNeOrZeroIdx { + type Result; + unsafe fn vec_find_any_ne_or_0_idx(self, other: Other) -> Self::Result; + } + + impl_vfae! { [idx VectorFindAnyNeOrZeroIdx vec_find_any_ne_or_0_idx] NeIdx + vfaezb vector_signed_char vector_signed_char + vfaezb vector_unsigned_char vector_unsigned_char + vfaezb vector_bool_char vector_unsigned_char + + vfaezh vector_signed_short vector_signed_short + vfaezh vector_unsigned_short vector_unsigned_short + vfaezh vector_bool_short vector_unsigned_short + + vfaezf vector_signed_int vector_signed_int + vfaezf vector_unsigned_int vector_unsigned_int + vfaezf vector_bool_int vector_unsigned_int + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorFindAnyEqIdx { + type Result; + unsafe fn vec_find_any_eq_idx(self, other: Other) -> Self::Result; + } + + impl_vfae! { [idx VectorFindAnyEqIdx vec_find_any_eq_idx] EqIdx vfaeb vfaeh vfaef } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorFindAnyNeIdx { + type Result; + unsafe fn vec_find_any_ne_idx(self, other: Other) -> Self::Result; + } + + impl_vfae! { [idx VectorFindAnyNeIdx vec_find_any_ne_idx] NeIdx vfaeb vfaeh vfaef } + + macro_rules! vfaes_wrapper { + ($($name:ident $ty:ident)*) => { + $( + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($name, IMM = 0))] + unsafe fn $name( + a: $ty, + b: $ty, + ) -> PackedTuple<$ty, i32> { + super::$name(a, b, IMM) + } + )* + } + } + + vfaes_wrapper! { + vfaebs vector_signed_char + vfaehs vector_signed_short + vfaefs vector_signed_int + + vfaezbs vector_signed_char + vfaezhs vector_signed_short + vfaezfs vector_signed_int + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorFindAnyEqCC { + type Result; + unsafe fn vec_find_any_eq_cc(self, other: Other) -> (Self::Result, i32); + } + + impl_vfae! { [cc VectorFindAnyEqCC vec_find_any_eq_cc] Eq vfaebs vfaehs vfaefs } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorFindAnyNeCC { + type Result; + unsafe fn vec_find_any_ne_cc(self, other: Other) -> (Self::Result, i32); + } + + impl_vfae! { [cc VectorFindAnyNeCC vec_find_any_ne_cc] Ne vfaebs vfaehs vfaefs } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorFindAnyEqIdxCC { + type Result; + unsafe fn vec_find_any_eq_idx_cc(self, other: Other) -> (Self::Result, i32); + } + + impl_vfae! { [idx_cc VectorFindAnyEqIdxCC vec_find_any_eq_idx_cc] EqIdx vfaebs vfaehs vfaefs } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorFindAnyNeIdxCC { + type Result; + unsafe fn vec_find_any_ne_idx_cc(self, other: Other) -> (Self::Result, i32); + } + + impl_vfae! 
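// The vec_find_any_* family wraps the "vector find any element equal"
// instructions (vfaeb/h/f and the zero-search forms vfaezb/h/f). The `FindImm`
// constant selects equal vs. not-equal and whether a boolean mask or an element
// index is produced; the `_cc` variants additionally return the condition code.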
{ [idx_cc VectorFindAnyNeIdxCC vec_find_any_ne_idx_cc] NeIdx vfaebs vfaehs vfaefs } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorFindAnyEqOrZeroIdxCC { + type Result; + unsafe fn vec_find_any_eq_or_0_idx_cc(self, other: Other) -> (Self::Result, i32); + } + + impl_vfae! { [idx_cc VectorFindAnyEqOrZeroIdxCC vec_find_any_eq_or_0_idx_cc] EqIdx vfaezbs vfaezhs vfaezfs } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorFindAnyNeOrZeroIdxCC { + type Result; + unsafe fn vec_find_any_ne_or_0_idx_cc(self, other: Other) -> (Self::Result, i32); + } + + impl_vfae! { [idx_cc VectorFindAnyNeOrZeroIdxCC vec_find_any_ne_or_0_idx_cc] NeIdx vfaezbs vfaezhs vfaezfs } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vl))] + unsafe fn test_vector_load(offset: isize, ptr: *const i32) -> vector_signed_int { + ptr.byte_offset(offset) + .cast::() + .read_unaligned() + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vst))] + unsafe fn test_vector_store(vector: vector_signed_int, offset: isize, ptr: *mut i32) { + ptr.byte_offset(offset) + .cast::() + .write_unaligned(vector) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorLoad: Sized { + type ElementType; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_xl(offset: isize, ptr: *const Self::ElementType) -> Self { + ptr.byte_offset(offset).cast::().read_unaligned() + } + + unsafe fn vec_load_len(ptr: *const Self::ElementType, byte_count: u32) -> Self; + + unsafe fn vec_load_bndry( + ptr: *const Self::ElementType, + ) -> MaybeUninit; + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorStore: Sized { + type ElementType; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_xst(self, offset: isize, ptr: *mut Self::ElementType) { + ptr.byte_offset(offset).cast::().write_unaligned(self) + } + + unsafe fn vec_store_len(self, ptr: *mut Self::ElementType, byte_count: u32); + } + + macro_rules! impl_load_store { + ($($ty:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorLoad for t_t_l!($ty) { + type ElementType = $ty; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_load_len(ptr: *const Self::ElementType, byte_count: u32) -> Self { + transmute(vll( byte_count, ptr.cast(),)) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_load_bndry(ptr: *const Self::ElementType) -> MaybeUninit { + transmute(vlbb(ptr.cast(), const { validate_block_boundary(BLOCK_BOUNDARY) })) + } + + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorStore for t_t_l!($ty) { + type ElementType = $ty; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_store_len(self, ptr: *mut Self::ElementType, byte_count: u32) { + vstl(transmute(self), byte_count, ptr.cast()) + } + } + )* + } + } + + impl_load_store! 
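// `vec_xl`/`vec_xst` are plain unaligned loads and stores. `vec_load_len` and
// `vec_store_len` wrap vll/vstl, which bound how many bytes are actually
// accessed, and `vec_load_bndry` wraps vlbb, which never reads past the
// specified block boundary (hence the MaybeUninit result for lanes that may not
// have been loaded).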
{ i8 u8 i16 u16 i32 u32 i64 u64 f32 f64 } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vll))] + unsafe fn test_vec_load_len(ptr: *const i32, byte_count: u32) -> vector_signed_int { + vector_signed_int::vec_load_len(ptr, byte_count) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr("vlbb"))] + unsafe fn test_vec_load_bndry(ptr: *const i32) -> MaybeUninit { + vector_signed_int::vec_load_bndry::<512>(ptr) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vst))] + unsafe fn test_vec_store_len(vector: vector_signed_int, ptr: *mut i32, byte_count: u32) { + vector.vec_store_len(ptr, byte_count) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorLoadPair: Sized { + type ElementType; + + unsafe fn vec_load_pair(a: Self::ElementType, b: Self::ElementType) -> Self; + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorLoadPair for vector_signed_long_long { + type ElementType = i64; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_load_pair(a: i64, b: i64) -> Self { + vector_signed_long_long([a, b]) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorLoadPair for vector_unsigned_long_long { + type ElementType = u64; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_load_pair(a: u64, b: u64) -> Self { + vector_unsigned_long_long([a, b]) + } + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn pack(a: T, b: T) -> T { + simd_shuffle(a, b, const { ShuffleMask::::pack() }) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vpkh))] + unsafe fn vpkh(a: i16x8, b: i16x8) -> i8x16 { + let a: i8x16 = transmute(a); + let b: i8x16 = transmute(b); + simd_shuffle(a, b, const { ShuffleMask::<16>::pack() }) + } + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vpkf))] + unsafe fn vpkf(a: i32x4, b: i32x4) -> i16x8 { + let a: i16x8 = transmute(a); + let b: i16x8 = transmute(b); + simd_shuffle(a, b, const { ShuffleMask::<8>::pack() }) + } + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vpkg))] + unsafe fn vpkg(a: i64x2, b: i64x2) -> i32x4 { + let a: i32x4 = transmute(a); + let b: i32x4 = transmute(b); + simd_shuffle(a, b, const { ShuffleMask::<4>::pack() }) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorPack { + type Result; + unsafe fn vec_pack(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorPack vec_pack]+ vpkh (vector_signed_short, vector_signed_short) -> vector_signed_char } + impl_vec_trait! { [VectorPack vec_pack]+ vpkh (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_char } + impl_vec_trait! { [VectorPack vec_pack]+ vpkh (vector_bool_short, vector_bool_short) -> vector_bool_char } + impl_vec_trait! { [VectorPack vec_pack]+ vpkf (vector_signed_int, vector_signed_int) -> vector_signed_short } + impl_vec_trait! { [VectorPack vec_pack]+ vpkf (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_short } + impl_vec_trait! { [VectorPack vec_pack]+ vpkf (vector_bool_int, vector_bool_int) -> vector_bool_short } + impl_vec_trait! { [VectorPack vec_pack]+ vpkg (vector_signed_long_long, vector_signed_long_long) -> vector_signed_int } + impl_vec_trait! { [VectorPack vec_pack]+ vpkg (vector_unsigned_long_long, vector_unsigned_long_long) -> vector_unsigned_int } + impl_vec_trait! 
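// `vec_pack` narrows two vectors to the next smaller element type by keeping the
// low-order half of each element, so packing two 8 x i16 vectors yields a single
// 16 x i8 vector of truncated values.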
{ [VectorPack vec_pack]+ vpkg (vector_bool_long_long, vector_bool_long_long) -> vector_bool_int } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorPacks { + type Result; + unsafe fn vec_packs(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorPacks vec_packs] vpksh (vector_signed_short, vector_signed_short) -> vector_signed_char } + impl_vec_trait! { [VectorPacks vec_packs] vpklsh (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_char } + impl_vec_trait! { [VectorPacks vec_packs] vpksf (vector_signed_int, vector_signed_int) -> vector_signed_short } + impl_vec_trait! { [VectorPacks vec_packs] vpklsf (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_short } + impl_vec_trait! { [VectorPacks vec_packs] vpksg (vector_signed_long_long, vector_signed_long_long) -> vector_signed_int } + impl_vec_trait! { [VectorPacks vec_packs] vpklsg (vector_unsigned_long_long, vector_unsigned_long_long) -> vector_unsigned_int } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorPacksu { + type Result; + unsafe fn vec_packsu(self, b: Other) -> Self::Result; + } + + unsafe fn simd_smax(a: T, b: T) -> T { + simd_select::(simd_gt::(a, b), a, b) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vpklsh))] + unsafe fn vpacksuh(a: vector_signed_short, b: vector_signed_short) -> vector_unsigned_char { + vpklsh( + simd_smax(a, vector_signed_short([0; 8])), + simd_smax(b, vector_signed_short([0; 8])), + ) + } + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vpklsf))] + unsafe fn vpacksuf(a: vector_signed_int, b: vector_signed_int) -> vector_unsigned_short { + vpklsf( + simd_smax(a, vector_signed_int([0; 4])), + simd_smax(b, vector_signed_int([0; 4])), + ) + } + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vpklsg))] + unsafe fn vpacksug( + a: vector_signed_long_long, + b: vector_signed_long_long, + ) -> vector_unsigned_int { + vpklsg( + simd_smax(a, vector_signed_long_long([0; 2])), + simd_smax(b, vector_signed_long_long([0; 2])), + ) + } + + impl_vec_trait! { [VectorPacksu vec_packsu] vpacksuh (vector_signed_short, vector_signed_short) -> vector_unsigned_char } + impl_vec_trait! { [VectorPacksu vec_packsu] vpklsh (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_char } + impl_vec_trait! { [VectorPacksu vec_packsu] vpacksuf (vector_signed_int, vector_signed_int) -> vector_unsigned_short } + impl_vec_trait! { [VectorPacksu vec_packsu] vpklsf (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_short } + impl_vec_trait! { [VectorPacksu vec_packsu] vpacksug (vector_signed_long_long, vector_signed_long_long) -> vector_unsigned_int } + impl_vec_trait! { [VectorPacksu vec_packsu] vpklsg (vector_unsigned_long_long, vector_unsigned_long_long) -> vector_unsigned_int } + + macro_rules! 
impl_vector_packs_cc { + ($($intr:ident $ty:ident $outty:ident)*) => { + $( + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($intr))] + unsafe fn $intr( + a: $ty, + b: $ty, + ) -> ($outty, i32) { + let PackedTuple { x, y } = super::$intr(a, b); + (x, y) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorPacksCC for $ty { + type Result = $outty; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_packs_cc(self, b: Self) -> (Self::Result, i32) { + $intr(self, b) + } + } + )* + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorPacksCC { + type Result; + unsafe fn vec_packs_cc(self, b: Self) -> (Self::Result, i32); + } + + impl_vector_packs_cc! { + vpkshs vector_signed_short vector_signed_char + vpklshs vector_unsigned_short vector_unsigned_char + vpksfs vector_signed_int vector_signed_short + vpklsfs vector_unsigned_int vector_unsigned_short + vpksgs vector_signed_long_long vector_signed_int + vpklsgs vector_unsigned_long_long vector_unsigned_int + } + + macro_rules! impl_vector_packsu_cc { + ($($intr:ident $ty:ident $outty:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorPacksuCC for $ty { + type Result = $outty; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_packsu_cc(self, b: Self) -> (Self::Result, i32) { + $intr(self, b) + } + } + )* + } + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorPacksuCC { + type Result; + unsafe fn vec_packsu_cc(self, b: Self) -> (Self::Result, i32); + } + + impl_vector_packsu_cc! { + vpklshs vector_unsigned_short vector_unsigned_char + vpklsfs vector_unsigned_int vector_unsigned_short + vpklsgs vector_unsigned_long_long vector_unsigned_int + } + + #[unstable(feature = "stdarch_powerpc", issue = "111145")] + pub trait VectorMadd { + unsafe fn vec_madd(self, b: Self, c: Self) -> Self; + unsafe fn vec_msub(self, b: Self, c: Self) -> Self; + } + + test_impl! { vfmasb (a: vector_float, b: vector_float, c: vector_float) -> vector_float [simd_fma, "vector-enhancements-1" vfmasb] } + test_impl! { vfmadb (a: vector_double, b: vector_double, c: vector_double) -> vector_double [simd_fma, vfmadb] } + + #[inline] + unsafe fn simd_fms(a: T, b: T, c: T) -> T { + simd_fma(a, b, simd_neg(c)) + } + + test_impl! { vfmssb (a: vector_float, b: vector_float, c: vector_float) -> vector_float [simd_fms, "vector-enhancements-1" vfmssb] } + test_impl! { vfmsdb (a: vector_double, b: vector_double, c: vector_double) -> vector_double [simd_fms, vfmsdb] } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorMadd for vector_float { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_madd(self, b: Self, c: Self) -> Self { + vfmasb(self, b, c) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_msub(self, b: Self, c: Self) -> Self { + vfmssb(self, b, c) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorMadd for vector_double { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_madd(self, b: Self, c: Self) -> Self { + vfmadb(self, b, c) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_msub(self, b: Self, c: Self) -> Self { + vfmsdb(self, b, c) + } + } + + macro_rules! 
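// `vec_packsu` on signed inputs first clamps negative lanes to zero (the
// `simd_smax(x, 0)` calls above) before the unsigned saturating pack, and
// `vec_msub(a, b, c)` is the fused `a * b - c`, built as `simd_fma(a, b, -c)`.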
impl_vec_unpack { + ($mask:ident $instr:ident $src:ident $shuffled:ident $dst:ident $width:literal) => { + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr($instr))] + unsafe fn $instr(a: $src) -> $dst { + simd_as(simd_shuffle::<_, _, $shuffled>( + a, + a, + const { ShuffleMask::<$width>::$mask() }, + )) + } + }; + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorUnpackh { + type Result; + unsafe fn vec_unpackh(self) -> Self::Result; + } + + impl_vec_unpack!(unpack_high vuphb vector_signed_char i8x8 vector_signed_short 8); + impl_vec_unpack!(unpack_high vuphh vector_signed_short i16x4 vector_signed_int 4); + impl_vec_unpack!(unpack_high vuphf vector_signed_int i32x2 vector_signed_long_long 2); + + impl_vec_unpack!(unpack_high vuplhb vector_unsigned_char u8x8 vector_unsigned_short 8); + impl_vec_unpack!(unpack_high vuplhh vector_unsigned_short u16x4 vector_unsigned_int 4); + impl_vec_unpack!(unpack_high vuplhf vector_unsigned_int u32x2 vector_unsigned_long_long 2); + + impl_vec_trait! {[VectorUnpackh vec_unpackh] vuphb (vector_signed_char) -> vector_signed_short} + impl_vec_trait! {[VectorUnpackh vec_unpackh] vuphh (vector_signed_short) -> vector_signed_int} + impl_vec_trait! {[VectorUnpackh vec_unpackh] vuphf (vector_signed_int) -> vector_signed_long_long} + + impl_vec_trait! {[VectorUnpackh vec_unpackh] vuplhb (vector_unsigned_char) -> vector_unsigned_short} + impl_vec_trait! {[VectorUnpackh vec_unpackh] vuplhh (vector_unsigned_short) -> vector_unsigned_int} + impl_vec_trait! {[VectorUnpackh vec_unpackh] vuplhf (vector_unsigned_int) -> vector_unsigned_long_long} + + impl_vec_trait! {[VectorUnpackh vec_unpackh]+ vuplhb (vector_bool_char) -> vector_bool_short} + impl_vec_trait! {[VectorUnpackh vec_unpackh]+ vuplhh (vector_bool_short) -> vector_bool_int} + impl_vec_trait! {[VectorUnpackh vec_unpackh]+ vuplhf (vector_bool_int) -> vector_bool_long_long} + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorUnpackl { + type Result; + unsafe fn vec_unpackl(self) -> Self::Result; + } + + // FIXME(llvm): a shuffle + simd_as does not currently optimize into a single instruction like + // unpachk above. Tracked in https://github.com/llvm/llvm-project/issues/129576. + + impl_vec_trait! {[VectorUnpackl vec_unpackl] vuplb (vector_signed_char) -> vector_signed_short} + impl_vec_trait! {[VectorUnpackl vec_unpackl] vuplhw (vector_signed_short) -> vector_signed_int} + impl_vec_trait! {[VectorUnpackl vec_unpackl] vuplf (vector_signed_int) -> vector_signed_long_long} + + impl_vec_trait! {[VectorUnpackl vec_unpackl] vupllb (vector_unsigned_char) -> vector_unsigned_short} + impl_vec_trait! {[VectorUnpackl vec_unpackl] vupllh (vector_unsigned_short) -> vector_unsigned_int} + impl_vec_trait! {[VectorUnpackl vec_unpackl] vupllf (vector_unsigned_int) -> vector_unsigned_long_long} + + impl_vec_trait! {[VectorUnpackl vec_unpackl]+ vupllb (vector_bool_char) -> vector_bool_short} + impl_vec_trait! {[VectorUnpackl vec_unpackl]+ vupllh (vector_bool_short) -> vector_bool_int} + impl_vec_trait! {[VectorUnpackl vec_unpackl]+ vupllf (vector_bool_int) -> vector_bool_long_long} + + test_impl! { vec_vavgb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [ vavgb, vavgb ] } + test_impl! { vec_vavgh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [ vavgh, vavgh ] } + test_impl! { vec_vavgf(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [ vavgf, vavgf ] } + test_impl! 
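// `vec_unpackh` / `vec_unpackl` widen the high or low half of a vector to the
// next larger element type, sign-extending signed elements (vuph*/vupl*) and
// zero-extending unsigned and bool elements (vuplh*/vupll*).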
{ vec_vavgg(a: vector_signed_long_long, b: vector_signed_long_long) -> vector_signed_long_long [ vavgg, vavgg ] } + + test_impl! { vec_vavglb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [ vavglb, vavglb ] } + test_impl! { vec_vavglh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [ vavglh, vavglh ] } + test_impl! { vec_vavglf(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [ vavglf, vavglf ] } + test_impl! { vec_vavglg(a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long [ vavglg, vavglg ] } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorAvg { + type Result; + unsafe fn vec_avg(self, b: Other) -> Self::Result; + } + + impl_vec_trait! { [VectorAvg vec_avg] 2 (vec_vavglb, vec_vavgb, vec_vavglh, vec_vavgh, vec_vavglf, vec_vavgf, vec_vavglg, vec_vavgg) } + + macro_rules! impl_mul { + ([$Trait:ident $m:ident] $fun:ident ($a:ty, $b:ty) -> $r:ty) => { + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl $Trait<$r> for $a { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn $m(self, b: $b) -> $r { + $fun(transmute(self), transmute(b)) + } + } + }; + ([$Trait:ident $m:ident] $fun:ident ($a:ty, $b:ty, $c:ty) -> $r:ty) => { + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl $Trait for $a { + type Result = $r; + #[inline] + #[target_feature(enable = "vector")] + unsafe fn $m(self, b: $b, c: $c) -> $r { + $fun(self, b, c) + } + } + }; + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorMule { + unsafe fn vec_mule(self, b: Self) -> Result; + } + + // FIXME(llvm) sadly this does not yet work https://github.com/llvm/llvm-project/issues/129705 + // #[target_feature(enable = "vector")] + // #[cfg_attr(test, assert_instr(vmleh))] + // unsafe fn vec_vmleh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_int { + // let even_a: vector_unsigned_int = simd_as(simd_shuffle::<_, _, u16x4>( + // a, + // a, + // const { ShuffleMask([0, 2, 4, 6]) }, + // )); + // + // let even_b: vector_unsigned_int = simd_as(simd_shuffle::<_, _, u16x4>( + // b, + // b, + // const { ShuffleMask([0, 2, 4, 6]) }, + // )); + // + // simd_mul(even_a, even_b) + // } + + test_impl! { vec_vmeb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short [ vmeb, vmeb ] } + test_impl! { vec_vmeh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int[ vmeh, vmeh ] } + test_impl! { vec_vmef(a: vector_signed_int, b: vector_signed_int) -> vector_signed_long_long [ vmef, vmef ] } + + test_impl! { vec_vmleb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_short [ vmleb, vmleb ] } + test_impl! { vec_vmleh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_int[ vmleh, vmleh ] } + test_impl! 
{ vec_vmlef(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_long_long [ vmlef, vmlef ] } + + impl_mul!([VectorMule vec_mule] vec_vmeb (vector_signed_char, vector_signed_char) -> vector_signed_short ); + impl_mul!([VectorMule vec_mule] vec_vmeh (vector_signed_short, vector_signed_short) -> vector_signed_int); + impl_mul!([VectorMule vec_mule] vec_vmef (vector_signed_int, vector_signed_int) -> vector_signed_long_long ); + + impl_mul!([VectorMule vec_mule] vec_vmleb (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_short ); + impl_mul!([VectorMule vec_mule] vec_vmleh (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_int); + impl_mul!([VectorMule vec_mule] vec_vmlef (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_long_long ); + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorMulo { + unsafe fn vec_mulo(self, b: Self) -> Result; + } + + test_impl! { vec_vmob(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short [ vmob, vmob ] } + test_impl! { vec_vmoh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int[ vmoh, vmoh ] } + test_impl! { vec_vmof(a: vector_signed_int, b: vector_signed_int) -> vector_signed_long_long [ vmof, vmof ] } + + test_impl! { vec_vmlob(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_short [ vmlob, vmlob ] } + test_impl! { vec_vmloh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_int[ vmloh, vmloh ] } + test_impl! { vec_vmlof(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_long_long [ vmlof, vmlof ] } + + impl_mul!([VectorMulo vec_mulo] vec_vmob (vector_signed_char, vector_signed_char) -> vector_signed_short ); + impl_mul!([VectorMulo vec_mulo] vec_vmoh (vector_signed_short, vector_signed_short) -> vector_signed_int); + impl_mul!([VectorMulo vec_mulo] vec_vmof (vector_signed_int, vector_signed_int) -> vector_signed_long_long ); + + impl_mul!([VectorMulo vec_mulo] vec_vmlob (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_short ); + impl_mul!([VectorMulo vec_mulo] vec_vmloh (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_int); + impl_mul!([VectorMulo vec_mulo] vec_vmlof (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_long_long ); + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorMulh { + unsafe fn vec_mulh(self, b: Self) -> Result; + } + + test_impl! { vec_vmhb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [ vmhb, vmhb ] } + test_impl! { vec_vmhh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [ vmhh, vmhh ] } + test_impl! { vec_vmhf(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [ vmhf, vmhf ] } + + test_impl! { vec_vmlhb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [ vmlhb, vmlhb ] } + test_impl! { vec_vmlhh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [ vmlhh, vmlhh ] } + test_impl! 
{ vec_vmlhf(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [ vmlhf, vmlhf ] } + + impl_mul!([VectorMulh vec_mulh] vec_vmhb (vector_signed_char, vector_signed_char) -> vector_signed_char); + impl_mul!([VectorMulh vec_mulh] vec_vmhh (vector_signed_short, vector_signed_short) -> vector_signed_short); + impl_mul!([VectorMulh vec_mulh] vec_vmhf (vector_signed_int, vector_signed_int) -> vector_signed_int); + + impl_mul!([VectorMulh vec_mulh] vec_vmlhb (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char); + impl_mul!([VectorMulh vec_mulh] vec_vmlhh (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_short); + impl_mul!([VectorMulh vec_mulh] vec_vmlhf (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_int); + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorMeadd { + type Result; + unsafe fn vec_meadd(self, b: Self, c: Self::Result) -> Self::Result; + } + + test_impl! { vec_vmaeb(a: vector_signed_char, b: vector_signed_char, c: vector_signed_short) -> vector_signed_short [ vmaeb, vmaeb ] } + test_impl! { vec_vmaeh(a: vector_signed_short, b: vector_signed_short, c: vector_signed_int) -> vector_signed_int[ vmaeh, vmaeh ] } + test_impl! { vec_vmaef(a: vector_signed_int, b: vector_signed_int, c: vector_signed_long_long) -> vector_signed_long_long [ vmaef, vmaef ] } + + test_impl! { vec_vmaleb(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_short) -> vector_unsigned_short [ vmaleb, vmaleb ] } + test_impl! { vec_vmaleh(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_int) -> vector_unsigned_int[ vmaleh, vmaleh ] } + test_impl! { vec_vmalef(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_long_long) -> vector_unsigned_long_long [ vmalef, vmalef ] } + + impl_mul!([VectorMeadd vec_meadd] vec_vmaeb (vector_signed_char, vector_signed_char, vector_signed_short) -> vector_signed_short ); + impl_mul!([VectorMeadd vec_meadd] vec_vmaeh (vector_signed_short, vector_signed_short, vector_signed_int) -> vector_signed_int); + impl_mul!([VectorMeadd vec_meadd] vec_vmaef (vector_signed_int, vector_signed_int, vector_signed_long_long) -> vector_signed_long_long ); + + impl_mul!([VectorMeadd vec_meadd] vec_vmaleb (vector_unsigned_char, vector_unsigned_char, vector_unsigned_short) -> vector_unsigned_short ); + impl_mul!([VectorMeadd vec_meadd] vec_vmaleh (vector_unsigned_short, vector_unsigned_short, vector_unsigned_int) -> vector_unsigned_int); + impl_mul!([VectorMeadd vec_meadd] vec_vmalef (vector_unsigned_int, vector_unsigned_int, vector_unsigned_long_long) -> vector_unsigned_long_long ); + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorMoadd { + type Result; + unsafe fn vec_moadd(self, b: Self, c: Self::Result) -> Self::Result; + } + + test_impl! { vec_vmaob(a: vector_signed_char, b: vector_signed_char, c: vector_signed_short) -> vector_signed_short [ vmaob, vmaob ] } + test_impl! { vec_vmaoh(a: vector_signed_short, b: vector_signed_short, c: vector_signed_int) -> vector_signed_int[ vmaoh, vmaoh ] } + test_impl! { vec_vmaof(a: vector_signed_int, b: vector_signed_int, c: vector_signed_long_long) -> vector_signed_long_long [ vmaof, vmaof ] } + + test_impl! { vec_vmalob(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_short) -> vector_unsigned_short [ vmalob, vmalob ] } + test_impl! 
{ vec_vmaloh(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_int) -> vector_unsigned_int[ vmaloh, vmaloh ] } + test_impl! { vec_vmalof(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_long_long) -> vector_unsigned_long_long [ vmalof, vmalof ] } + + impl_mul!([VectorMoadd vec_moadd] vec_vmaob (vector_signed_char, vector_signed_char, vector_signed_short) -> vector_signed_short ); + impl_mul!([VectorMoadd vec_moadd] vec_vmaoh (vector_signed_short, vector_signed_short, vector_signed_int) -> vector_signed_int); + impl_mul!([VectorMoadd vec_moadd] vec_vmaof (vector_signed_int, vector_signed_int, vector_signed_long_long) -> vector_signed_long_long ); + + impl_mul!([VectorMoadd vec_moadd] vec_vmalob (vector_unsigned_char, vector_unsigned_char, vector_unsigned_short) -> vector_unsigned_short ); + impl_mul!([VectorMoadd vec_moadd] vec_vmaloh (vector_unsigned_short, vector_unsigned_short, vector_unsigned_int) -> vector_unsigned_int); + impl_mul!([VectorMoadd vec_moadd] vec_vmalof (vector_unsigned_int, vector_unsigned_int, vector_unsigned_long_long) -> vector_unsigned_long_long ); + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorMhadd { + type Result; + unsafe fn vec_mhadd(self, b: Self, c: Self::Result) -> Self::Result; + } + + test_impl! { vec_vmahb(a: vector_signed_char, b: vector_signed_char, c: vector_signed_char) -> vector_signed_char [ vmahb, vmahb ] } + test_impl! { vec_vmahh(a: vector_signed_short, b: vector_signed_short, c: vector_signed_short) -> vector_signed_short[ vmahh, vmahh ] } + test_impl! { vec_vmahf(a: vector_signed_int, b: vector_signed_int, c: vector_signed_int) -> vector_signed_int [ vmahf, vmahf ] } + + test_impl! { vec_vmalhb(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_char) -> vector_unsigned_char [ vmalhb, vmalhb ] } + test_impl! { vec_vmalhh(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_short) -> vector_unsigned_short[ vmalhh, vmalhh ] } + test_impl! { vec_vmalhf(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_int) -> vector_unsigned_int [ vmalhf, vmalhf ] } + + impl_mul!([VectorMhadd vec_mhadd] vec_vmahb (vector_signed_char, vector_signed_char, vector_signed_char) -> vector_signed_char ); + impl_mul!([VectorMhadd vec_mhadd] vec_vmahh (vector_signed_short, vector_signed_short, vector_signed_short) -> vector_signed_short); + impl_mul!([VectorMhadd vec_mhadd] vec_vmahf (vector_signed_int, vector_signed_int, vector_signed_int) -> vector_signed_int ); + + impl_mul!([VectorMhadd vec_mhadd] vec_vmalhb (vector_unsigned_char, vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char ); + impl_mul!([VectorMhadd vec_mhadd] vec_vmalhh (vector_unsigned_short, vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_short); + impl_mul!([VectorMhadd vec_mhadd] vec_vmalhf (vector_unsigned_int, vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_int ); + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorMladd { + type Result; + unsafe fn vec_mladd(self, b: Self, c: Self::Result) -> Self::Result; + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn simd_mladd(a: T, b: T, c: T) -> T { + simd_add(simd_mul(a, b), c) + } + + test_impl! { vec_vmal_ib(a: vector_signed_char, b: vector_signed_char, c: vector_signed_char) -> vector_signed_char [simd_mladd, vmalb ] } + test_impl! 
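Since `simd_mladd` above is just `simd_add(simd_mul(a, b), c)`, `vec_mladd` is an element-wise low-order multiply-add. A scalar sketch of the same computation on `i16` lanes (`mladd_i16` is a hypothetical name):

fn mladd_i16(a: [i16; 8], b: [i16; 8], c: [i16; 8]) -> [i16; 8] {
    // wrapping a*b + c, keeping only the low 16 bits of each result
    core::array::from_fn(|i| a[i].wrapping_mul(b[i]).wrapping_add(c[i]))
}

fn main() {
    assert_eq!(mladd_i16([2; 8], [300; 8], [1; 8])[0], 601);
}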
{ vec_vmal_ih(a: vector_signed_short, b: vector_signed_short, c: vector_signed_short) -> vector_signed_short[simd_mladd, vmalh ] } + test_impl! { vec_vmal_if(a: vector_signed_int, b: vector_signed_int, c: vector_signed_int) -> vector_signed_int [simd_mladd, vmalf ] } + + test_impl! { vec_vmal_ub(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_char) -> vector_unsigned_char [simd_mladd, vmalb ] } + test_impl! { vec_vmal_uh(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_short) -> vector_unsigned_short[simd_mladd, vmalh ] } + test_impl! { vec_vmal_uf(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_int) -> vector_unsigned_int [simd_mladd, vmalf ] } + + impl_mul!([VectorMladd vec_mladd] vec_vmal_ib (vector_signed_char, vector_signed_char, vector_signed_char) -> vector_signed_char ); + impl_mul!([VectorMladd vec_mladd] vec_vmal_ih (vector_signed_short, vector_signed_short, vector_signed_short) -> vector_signed_short); + impl_mul!([VectorMladd vec_mladd] vec_vmal_if (vector_signed_int, vector_signed_int, vector_signed_int) -> vector_signed_int ); + + impl_mul!([VectorMladd vec_mladd] vec_vmal_ub (vector_unsigned_char, vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char ); + impl_mul!([VectorMladd vec_mladd] vec_vmal_uh (vector_unsigned_short, vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_short); + impl_mul!([VectorMladd vec_mladd] vec_vmal_uf (vector_unsigned_int, vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_int ); + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorGfmsum { + unsafe fn vec_gfmsum(self, b: Self) -> Result; + } + + test_impl! { vec_vgfmb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_short [ vgfmb, vgfmb ] } + test_impl! { vec_vgfmh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_int[ vgfmh, vgfmh] } + test_impl! { vec_vgfmf(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_long_long [ vgfmf, vgfmf ] } + + impl_mul!([VectorGfmsum vec_gfmsum] vec_vgfmb (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_short ); + impl_mul!([VectorGfmsum vec_gfmsum] vec_vgfmh (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_int); + impl_mul!([VectorGfmsum vec_gfmsum] vec_vgfmf (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_long_long ); + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorGfmsumAccum { + type Result; + unsafe fn vec_gfmsum_accum(self, b: Self, c: Self::Result) -> Self::Result; + } + + test_impl! { vec_vgfmab(a: vector_unsigned_char, b: vector_unsigned_char, c: vector_unsigned_short) -> vector_unsigned_short [ vgfmab, vgfmab ] } + test_impl! { vec_vgfmah(a: vector_unsigned_short, b: vector_unsigned_short, c: vector_unsigned_int) -> vector_unsigned_int[ vgfmah, vgfmah] } + test_impl! 
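The Galois-field multiply-sum family works in GF(2): products are carry-less and are combined with XOR rather than addition. A scalar sketch of one plausible reading for byte lanes, where the products of adjacent even/odd lanes are XORed into a double-width result (`clmul_u8` and `gfmsum_u8` are hypothetical helpers; consult the ISA reference for the authoritative definition):

fn clmul_u8(a: u8, b: u8) -> u16 {
    // carry-less multiplication: shift-and-XOR instead of shift-and-add
    let mut acc = 0u16;
    for bit in 0..8 {
        if (b >> bit) & 1 == 1 {
            acc ^= (a as u16) << bit;
        }
    }
    acc
}

fn gfmsum_u8(a: [u8; 16], b: [u8; 16]) -> [u16; 8] {
    core::array::from_fn(|i| clmul_u8(a[2 * i], b[2 * i]) ^ clmul_u8(a[2 * i + 1], b[2 * i + 1]))
}

fn main() {
    // carry-less 0b11 * 0b11 = 0b101; the two identical pair products cancel under XOR
    assert_eq!(gfmsum_u8([3; 16], [3; 16])[0], 0);
}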
{ vec_vgfmaf(a: vector_unsigned_int, b: vector_unsigned_int, c: vector_unsigned_long_long) -> vector_unsigned_long_long [ vgfmaf, vgfmaf ] } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorGfmsumAccum for vector_unsigned_char { + type Result = vector_unsigned_short; + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_gfmsum_accum(self, b: Self, c: Self::Result) -> Self::Result { + vec_vgfmab(self, b, c) + } + } + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorGfmsumAccum for vector_unsigned_short { + type Result = vector_unsigned_int; + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_gfmsum_accum(self, b: Self, c: Self::Result) -> Self::Result { + vec_vgfmah(self, b, c) + } + } + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorGfmsumAccum for vector_unsigned_int { + type Result = vector_unsigned_long_long; + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_gfmsum_accum(self, b: Self, c: Self::Result) -> Self::Result { + vec_vgfmaf(self, b, c) + } + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vgef, D = 3))] + unsafe fn vgef( + a: vector_unsigned_int, + b: vector_unsigned_int, + c: *const u32, + ) -> vector_unsigned_int { + static_assert_uimm_bits!(D, 2); + let offset: u32 = simd_extract(b, D); + let ptr = c.byte_add(offset as usize); + let value = ptr.read(); + simd_insert(a, D, value) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vgeg, D = 1))] + unsafe fn vgeg( + a: vector_unsigned_long_long, + b: vector_unsigned_long_long, + c: *const u64, + ) -> vector_unsigned_long_long { + static_assert_uimm_bits!(D, 1); + let offset: u64 = simd_extract(b, D); + let ptr = c.byte_add(offset as usize); + let value = ptr.read(); + simd_insert(a, D, value) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorGatherElement { + type Element; + type Offset; + unsafe fn vec_gather_element( + self, + b: Self::Offset, + c: *const Self::Element, + ) -> Self; + } + + macro_rules! impl_vec_gather_element { + ($($instr:ident $ty:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorGatherElement for $ty { + type Element = l_t_t!($ty); + type Offset = t_u!($ty); + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_gather_element(self, b: Self::Offset, c: *const Self::Element) -> Self { + transmute($instr::(transmute(self), b, c.cast())) + } + } + )* + } + } + + impl_vec_gather_element! 
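The body of `vgef` above spells out the gather-element semantics directly: lane `D` of the offset vector is a byte offset from the base pointer, and the loaded value replaces lane `D` of the accumulator. A scalar model over a byte buffer (hypothetical `gather_element_u32`):

fn gather_element_u32<const D: usize>(acc: [u32; 4], offsets: [u32; 4], base: &[u8]) -> [u32; 4] {
    let mut out = acc;
    // lane D of `offsets` is a *byte* offset into `base`
    let off = offsets[D] as usize;
    out[D] = u32::from_ne_bytes([base[off], base[off + 1], base[off + 2], base[off + 3]]);
    out
}

fn main() {
    let data: Vec<u8> = (1u32..=4).flat_map(|x| x.to_ne_bytes()).collect();
    // byte offset 8 selects the third u32 in the buffer
    assert_eq!(gather_element_u32::<1>([0; 4], [0, 8, 0, 0], &data), [0, 3, 0, 0]);
}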
{ + vgef vector_signed_int + vgef vector_bool_int + vgef vector_unsigned_int + + vgeg vector_signed_long_long + vgeg vector_bool_long_long + vgeg vector_unsigned_long_long + + vgef vector_float + vgeg vector_double + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vscef, D = 3))] + unsafe fn vscef(a: vector_unsigned_int, b: vector_unsigned_int, c: *mut u32) { + static_assert_uimm_bits!(D, 2); + let value = simd_extract(a, D); + let offset: u32 = simd_extract(b, D); + let ptr = c.byte_add(offset as usize); + ptr.write(value); + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vsceg, D = 1))] + unsafe fn vsceg( + a: vector_unsigned_long_long, + b: vector_unsigned_long_long, + c: *mut u64, + ) { + static_assert_uimm_bits!(D, 1); + let value = simd_extract(a, D); + let offset: u64 = simd_extract(b, D); + let ptr = c.byte_add(offset as usize); + ptr.write(value); + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorScatterElement { + type Element; + type Offset; + unsafe fn vec_scatter_element(self, b: Self::Offset, c: *mut Self::Element); + } + + macro_rules! impl_vec_scatter_element { + ($($instr:ident $ty:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorScatterElement for $ty { + type Element = l_t_t!($ty); + type Offset = t_u!($ty); + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_scatter_element(self, b: Self::Offset, c: *mut Self::Element) { + $instr::(transmute(self), b, c.cast()) + } + } + )* + } + } + + impl_vec_scatter_element! { + vscef vector_signed_int + vscef vector_bool_int + vscef vector_unsigned_int + + vsceg vector_signed_long_long + vsceg vector_bool_long_long + vsceg vector_unsigned_long_long + + vscef vector_float + vsceg vector_double + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSel: Sized { + unsafe fn vec_sel(self, b: Self, c: Mask) -> Self; + } + + macro_rules! impl_vec_sel { + ($($ty:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSel for $ty { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_sel(self, b: Self, c: t_u!($ty)) -> Self { + let b = simd_and(transmute(b), c); + let a = simd_and(transmute(self), simd_xor(c, transmute(vector_signed_char([!0; 16])))); + transmute(simd_or(a, b)) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSel for $ty { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_sel(self, b: Self, c: t_b!($ty)) -> Self { + // defer to the implementation with an unsigned mask + self.vec_sel(b, transmute::<_, t_u!($ty)>(c)) + } + } + )* + } + } + + impl_vec_sel! 
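`VectorSel` above is a plain bitwise select: wherever a mask bit is set the result takes the bit from `b`, otherwise from `a`. A scalar model over bytes (hypothetical `sel_u8`):

fn sel_u8(a: [u8; 16], b: [u8; 16], mask: [u8; 16]) -> [u8; 16] {
    // (a AND NOT mask) OR (b AND mask), bit by bit
    core::array::from_fn(|i| (a[i] & !mask[i]) | (b[i] & mask[i]))
}

fn main() {
    assert_eq!(sel_u8([0x00; 16], [0xFF; 16], [0x0F; 16])[0], 0x0F);
}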
{ + vector_signed_char + vector_signed_short + vector_signed_int + vector_signed_long_long + + vector_unsigned_char + vector_unsigned_short + vector_unsigned_int + vector_unsigned_long_long + + vector_bool_char + vector_bool_short + vector_bool_int + vector_bool_long_long + + vector_float + vector_double + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorFpTestDataClass { + type Result; + unsafe fn vec_fp_test_data_class(self) -> (Self::Result, i32); + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorFpTestDataClass for vector_float { + type Result = vector_bool_int; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_fp_test_data_class(self) -> (Self::Result, i32) { + let PackedTuple { x, y } = vftcisb(self, CLASS); + (x, y) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorFpTestDataClass for vector_double { + type Result = vector_bool_long_long; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_fp_test_data_class(self) -> (Self::Result, i32) { + let PackedTuple { x, y } = vftcidb(self, CLASS); + (x, y) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorCompare { + unsafe fn vec_all_lt(self, other: Self) -> i32; + unsafe fn vec_all_le(self, other: Self) -> i32; + unsafe fn vec_all_gt(self, other: Self) -> i32; + unsafe fn vec_all_ge(self, other: Self) -> i32; + } + + // NOTE: this implementation is currently non-optimal, but it does work for floats even with + // only `vector` enabled. + // + // - https://github.com/llvm/llvm-project/issues/129434 + // - https://github.com/llvm/llvm-project/issues/130424 + macro_rules! impl_vec_compare { + ($($ty:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorCompare for $ty { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_all_lt(self, other: Self) -> i32 { + simd_reduce_all(simd_lt::<_, t_b!($ty)>(self, other)) as i32 + } + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_all_le(self, other: Self) -> i32 { + simd_reduce_all(simd_le::<_, t_b!($ty)>(self, other)) as i32 + } + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_all_gt(self, other: Self) -> i32 { + simd_reduce_all(simd_gt::<_, t_b!($ty)>(self, other)) as i32 + } + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_all_ge(self, other: Self) -> i32 { + simd_reduce_all(simd_ge::<_, t_b!($ty)>(self, other)) as i32 + } + } + )* + } + } + + impl_vec_compare! { + vector_signed_char + vector_unsigned_char + + vector_signed_short + vector_unsigned_short + + vector_signed_int + vector_unsigned_int + vector_float + + vector_signed_long_long + vector_unsigned_long_long + vector_double + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorTestMask { + type Mask; + unsafe fn vec_test_mask(self, other: Self::Mask) -> i32; + } + + macro_rules! impl_vec_test_mask { + ($($instr:ident $ty:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorTestMask for $ty { + type Mask = t_u!($ty); + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_test_mask(self, other: Self::Mask) -> i32 { + vtm(transmute(self), transmute(other)) + } + } + )* + } + } + + impl_vec_test_mask! 
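The `vec_all_*` predicates above reduce an element-wise comparison with "all" and return the boolean as an `i32`. A scalar model of `vec_all_lt` on four `i32` lanes (hypothetical `all_lt_i32`):

fn all_lt_i32(a: [i32; 4], b: [i32; 4]) -> i32 {
    a.iter().zip(b).all(|(x, y)| *x < y) as i32
}

fn main() {
    assert_eq!(all_lt_i32([1, 2, 3, 4], [5, 6, 7, 8]), 1);
    assert_eq!(all_lt_i32([1, 2, 3, 9], [5, 6, 7, 8]), 0);
}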
{ + vector_signed_char + vector_signed_short + vector_signed_int + vector_signed_long_long + + vector_unsigned_char + vector_unsigned_short + vector_unsigned_int + vector_unsigned_long_long + + vector_float + vector_double + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSearchString { + unsafe fn vec_search_string_cc( + self, + b: Self, + c: vector_unsigned_char, + ) -> (vector_unsigned_char, i32); + + unsafe fn vec_search_string_until_zero_cc( + self, + b: Self, + c: vector_unsigned_char, + ) -> (vector_unsigned_char, i32); + } + + macro_rules! impl_vec_search_string{ + ($($intr_s:ident $intr_sz:ident $ty:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSearchString for $ty { + #[inline] + #[target_feature(enable = "vector-enhancements-2")] + unsafe fn vec_search_string_cc(self, b: Self, c: vector_unsigned_char) -> (vector_unsigned_char, i32) { + let PackedTuple { x,y } = $intr_s(transmute(self), transmute(b), c); + (x, y) + } + + #[inline] + #[target_feature(enable = "vector-enhancements-2")] + unsafe fn vec_search_string_until_zero_cc(self, b: Self, c: vector_unsigned_char) -> (vector_unsigned_char, i32) { + let PackedTuple { x,y } = $intr_sz(transmute(self), transmute(b), c); + (x, y) + } + } + + )* + } + } + + impl_vec_search_string! { + vstrsb vstrszb vector_signed_char + vstrsb vstrszb vector_bool_char + vstrsb vstrszb vector_unsigned_char + + vstrsh vstrszh vector_signed_short + vstrsh vstrszh vector_bool_short + vstrsh vstrszh vector_unsigned_short + + vstrsf vstrszf vector_signed_int + vstrsf vstrszf vector_bool_int + vstrsf vstrszf vector_unsigned_int + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vcdgb))] + pub unsafe fn vcdgb(a: vector_signed_long_long) -> vector_double { + simd_as(a) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vcdlgb))] + pub unsafe fn vcdlgb(a: vector_unsigned_long_long) -> vector_double { + simd_as(a) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorDouble { + unsafe fn vec_double(self) -> vector_double; + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorDouble for vector_signed_long_long { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_double(self) -> vector_double { + vcdgb(self) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorDouble for vector_unsigned_long_long { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_double(self) -> vector_double { + vcdlgb(self) + } + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr( + all(test, target_feature = "vector-enhancements-2"), + assert_instr(vcefb) + )] + pub unsafe fn vcefb(a: vector_signed_int) -> vector_float { + simd_as(a) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr( + all(test, target_feature = "vector-enhancements-2"), + assert_instr(vcelfb) + )] + pub unsafe fn vcelfb(a: vector_unsigned_int) -> vector_float { + simd_as(a) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorFloat { + unsafe fn vec_float(self) -> vector_float; + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorFloat for vector_signed_int { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_float(self) -> vector_float { + vcefb(self) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorFloat for 
vector_unsigned_int { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_float(self) -> vector_float { + vcelfb(self) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorExtendSigned64 { + unsafe fn vec_extend_s64(self) -> vector_signed_long_long; + } + + #[inline] + #[target_feature(enable = "vector")] + // FIXME(llvm): https://github.com/llvm/llvm-project/issues/129899 + // #[cfg_attr(test, assert_instr(vsegb))] + pub unsafe fn vsegb(a: vector_signed_char) -> vector_signed_long_long { + simd_as(simd_shuffle::<_, _, i8x2>( + a, + a, + const { u32x2::from_array([7, 15]) }, + )) + } + + #[inline] + #[target_feature(enable = "vector")] + // FIXME(llvm): https://github.com/llvm/llvm-project/issues/129899 + // #[cfg_attr(test, assert_instr(vsegh))] + pub unsafe fn vsegh(a: vector_signed_short) -> vector_signed_long_long { + simd_as(simd_shuffle::<_, _, i16x2>( + a, + a, + const { u32x2::from_array([3, 7]) }, + )) + } + + #[inline] + #[target_feature(enable = "vector")] + // FIXME(llvm): https://github.com/llvm/llvm-project/issues/129899 + // #[cfg_attr(test, assert_instr(vsegf))] + pub unsafe fn vsegf(a: vector_signed_int) -> vector_signed_long_long { + simd_as(simd_shuffle::<_, _, i32x2>( + a, + a, + const { u32x2::from_array([1, 3]) }, + )) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorExtendSigned64 for vector_signed_char { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_extend_s64(self) -> vector_signed_long_long { + vsegb(self) + } + } + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorExtendSigned64 for vector_signed_short { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_extend_s64(self) -> vector_signed_long_long { + vsegh(self) + } + } + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorExtendSigned64 for vector_signed_int { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_extend_s64(self) -> vector_signed_long_long { + vsegf(self) + } + } + + // NOTE: VectorSigned and VectorUnsigned make strong safety assumptions around floats. + // This is what C provides, but even IBM does not clearly document these constraints. + // + // https://doc.rust-lang.org/std/intrinsics/simd/fn.simd_cast.html + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSigned { + type Result; + unsafe fn vec_signed(self) -> Self::Result; + } + + test_impl! { vcgsb (a: vector_float) -> vector_signed_int [simd_cast, "vector-enhancements-2" vcgsb] } + test_impl! { vcgdb (a: vector_double) -> vector_signed_long_long [simd_cast, vcgdb] } + + impl_vec_trait! { [VectorSigned vec_signed] vcgsb (vector_float) -> vector_signed_int } + impl_vec_trait! { [VectorSigned vec_signed] vcgdb (vector_double) -> vector_signed_long_long } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorUnsigned { + type Result; + unsafe fn vec_unsigned(self) -> Self::Result; + } + + test_impl! { vclgsb (a: vector_float) -> vector_unsigned_int [simd_cast, "vector-enhancements-2" vclgsb] } + test_impl! { vclgdb (a: vector_double) -> vector_unsigned_long_long [simd_cast, vclgdb] } + + impl_vec_trait! { [VectorUnsigned vec_unsigned] vclgsb (vector_float) -> vector_unsigned_int } + impl_vec_trait! 
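`vsegb` above selects byte lanes 7 and 15 and sign-extends them to two `i64` results, and `vsegh`/`vsegf` do the analogous thing for halfword and word lanes. The same computation as scalar code (hypothetical `extend_s64_from_i8`):

fn extend_s64_from_i8(a: [i8; 16]) -> [i64; 2] {
    // lanes 7 and 15, sign-extended
    [a[7] as i64, a[15] as i64]
}

fn main() {
    let mut v = [0i8; 16];
    v[7] = -1;
    v[15] = 42;
    assert_eq!(extend_s64_from_i8(v), [-1, 42]);
}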
{ [VectorUnsigned vec_unsigned] vclgdb (vector_double) -> vector_unsigned_long_long } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorCopyUntilZero { + unsafe fn vec_cp_until_zero(self) -> Self; + } + + test_impl! { vec_vistrb (a: vector_unsigned_char) -> vector_unsigned_char [vistrb, vistrb] } + test_impl! { vec_vistrh (a: vector_unsigned_short) -> vector_unsigned_short [vistrh, vistrh] } + test_impl! { vec_vistrf (a: vector_unsigned_int) -> vector_unsigned_int [vistrf, vistrf] } + + impl_vec_trait! { [VectorCopyUntilZero vec_cp_until_zero]+ vec_vistrb (vector_signed_char) } + impl_vec_trait! { [VectorCopyUntilZero vec_cp_until_zero]+ vec_vistrb (vector_bool_char) } + impl_vec_trait! { [VectorCopyUntilZero vec_cp_until_zero]+ vec_vistrb (vector_unsigned_char) } + + impl_vec_trait! { [VectorCopyUntilZero vec_cp_until_zero]+ vec_vistrh (vector_signed_short) } + impl_vec_trait! { [VectorCopyUntilZero vec_cp_until_zero]+ vec_vistrh (vector_bool_short) } + impl_vec_trait! { [VectorCopyUntilZero vec_cp_until_zero]+ vec_vistrh (vector_unsigned_short) } + + impl_vec_trait! { [VectorCopyUntilZero vec_cp_until_zero]+ vec_vistrf (vector_signed_int) } + impl_vec_trait! { [VectorCopyUntilZero vec_cp_until_zero]+ vec_vistrf (vector_bool_int) } + impl_vec_trait! { [VectorCopyUntilZero vec_cp_until_zero]+ vec_vistrf (vector_unsigned_int) } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorCopyUntilZeroCC: Sized { + unsafe fn vec_cp_until_zero_cc(self) -> (Self, i32); + } + + test_impl! { vec_vistrbs (a: vector_unsigned_char) -> PackedTuple [vistrbs, vistrbs] } + test_impl! { vec_vistrhs (a: vector_unsigned_short) -> PackedTuple [vistrhs, vistrhs] } + test_impl! { vec_vistrfs (a: vector_unsigned_int) -> PackedTuple [vistrfs, vistrfs] } + + macro_rules! impl_vec_copy_until_zero_cc { + ($($intr:ident $ty:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorCopyUntilZeroCC for $ty { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cp_until_zero_cc(self) -> (Self, i32) { + let PackedTuple { x,y } = $intr(transmute(self)); + (transmute(x), y) + } + } + + )* + } + } + + impl_vec_copy_until_zero_cc! { + vec_vistrbs vector_signed_char + vec_vistrbs vector_bool_char + vec_vistrbs vector_unsigned_char + + vec_vistrhs vector_signed_short + vec_vistrhs vector_bool_short + vec_vistrhs vector_unsigned_short + + vec_vistrfs vector_signed_int + vec_vistrfs vector_bool_int + vec_vistrfs vector_unsigned_int + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSrdb { + unsafe fn vec_srdb(self, b: Self) -> Self; + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorSld { + unsafe fn vec_sld(self, b: Self) -> Self; + + unsafe fn vec_sldw(self, b: Self) -> Self; + + unsafe fn vec_sldb(self, b: Self) -> Self; + } + + // FIXME(llvm) https://github.com/llvm/llvm-project/issues/129955 + // ideally we could implement this in terms of llvm.fshl.i128 + // #[link_name = "llvm.fshl.i128"] fn fshl_i128(a: u128, b: u128, c: u128) -> u128; + // transmute(fshl_i128(transmute(a), transmute(b), const { C * 8 } )) + + macro_rules! 
impl_vec_sld { + ($($ty:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSld for $ty { + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_sld(self, b: Self) -> Self { + static_assert_uimm_bits!(C, 4); + transmute(vsldb(transmute(self), transmute(b), C)) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_sldw(self, b: Self) -> Self { + static_assert_uimm_bits!(C, 2); + transmute(vsldb(transmute(self), transmute(b), const { 4 * C })) + } + + #[inline] + #[target_feature(enable = "vector-enhancements-2")] + unsafe fn vec_sldb(self, b: Self) -> Self { + static_assert_uimm_bits!(C, 3); + transmute(vsld(transmute(self), transmute(b), C)) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorSrdb for $ty { + #[inline] + #[target_feature(enable = "vector-enhancements-2")] + unsafe fn vec_srdb(self, b: Self) -> Self { + static_assert_uimm_bits!(C, 3); + transmute(vsrd(transmute(self), transmute(b), C)) + } + } + )* + } + } + + impl_vec_sld! { + vector_signed_char + vector_bool_char + vector_unsigned_char + + vector_signed_short + vector_bool_short + vector_unsigned_short + + vector_signed_int + vector_bool_int + vector_unsigned_int + + vector_signed_long_long + vector_bool_long_long + vector_unsigned_long_long + + vector_float + vector_double + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorCompareRange: Sized { + type Result; + + unsafe fn vstrc(self, b: Self, c: Self) -> Self::Result; + unsafe fn vstrcz(self, b: Self, c: Self) -> Self::Result; + unsafe fn vstrcs(self, b: Self, c: Self) -> (Self::Result, i32); + unsafe fn vstrczs(self, b: Self, c: Self) -> (Self::Result, i32); + } + + const fn validate_compare_range_imm(imm: u32) { + if !matches!(imm, 0 | 4 | 8 | 12) { + panic!("IMM needs to be one of 0, 4, 8, 12"); + } + } + + macro_rules! impl_compare_range { + ($($ty:ident $vstrc:ident $vstrcs:ident $vstrcz:ident $vstrczs:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorCompareRange for $ty { + type Result = t_b!($ty); + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vstrc(self, b: Self, c: Self) -> Self::Result { + const { validate_compare_range_imm }; + $vstrc(self, b, c, IMM) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vstrcz(self, b: Self, c: Self) -> Self::Result { + const { validate_compare_range_imm }; + $vstrcz(self, b, c, IMM) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vstrcs(self, b: Self, c: Self) -> (Self::Result, i32) { + const { validate_compare_range_imm }; + let PackedTuple { x, y } = $vstrcs(self, b, c, IMM); + (x,y) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vstrczs(self, b: Self, c: Self) -> (Self::Result, i32) { + const { validate_compare_range_imm }; + let PackedTuple { x, y } = $vstrczs(self, b, c, IMM); + (x,y) + } + } + )* + } + } + + impl_compare_range! 
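As the `llvm.fshl.i128` note above suggests, `vec_sld` behaves like taking a 16-byte window that starts `C` bytes into the concatenation of the two inputs, and `vec_sldw` is the same with `C` counted in 4-byte words. A scalar sketch under that reading (hypothetical `sld`):

fn sld<const C: usize>(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
    // concatenate a and b, then take 16 bytes starting at byte C (C < 16)
    let mut both = [0u8; 32];
    both[..16].copy_from_slice(&a);
    both[16..].copy_from_slice(&b);
    core::array::from_fn(|i| both[C + i])
}

fn main() {
    let a: [u8; 16] = core::array::from_fn(|i| i as u8);        // bytes 0..=15
    let b: [u8; 16] = core::array::from_fn(|i| 16 + i as u8);   // bytes 16..=31
    let expected: [u8; 16] = core::array::from_fn(|i| 4 + i as u8);
    assert_eq!(sld::<4>(a, b), expected);
}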
{ + vector_unsigned_char vstrcb vstrcbs vstrczb vstrczbs + vector_unsigned_short vstrch vstrchs vstrczh vstrczhs + vector_unsigned_int vstrcf vstrcfs vstrczf vstrczfs + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorComparePredicate: Sized { + type Result; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cmpgt(self, other: Self) -> Self::Result { + simd_gt(self, other) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cmpge(self, other: Self) -> Self::Result { + simd_ge(self, other) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cmplt(self, other: Self) -> Self::Result { + simd_lt(self, other) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cmple(self, other: Self) -> Self::Result { + simd_le(self, other) + } + } + + macro_rules! impl_compare_predicate { + ($($ty:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorComparePredicate for $ty { + type Result = t_b!($ty); + } + )* + } + } + + impl_compare_predicate! { + vector_signed_char + vector_unsigned_char + + vector_signed_short + vector_unsigned_short + + vector_signed_int + vector_unsigned_int + vector_float + + vector_signed_long_long + vector_unsigned_long_long + vector_double + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorEquality: Sized { + type Result; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cmpeq(self, other: Self) -> Self::Result { + simd_eq(self, other) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cmpne(self, other: Self) -> Self::Result { + simd_ne(self, other) + } + } + + macro_rules! impl_compare_equality { + ($($ty:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorEquality for $ty { + type Result = t_b!($ty); + } + )* + } + } + + impl_compare_equality! { + vector_bool_char + vector_signed_char + vector_unsigned_char + + vector_bool_short + vector_signed_short + vector_unsigned_short + + vector_bool_int + vector_signed_int + vector_unsigned_int + vector_float + + vector_bool_long_long + vector_signed_long_long + vector_unsigned_long_long + vector_double + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorEqualityIdx: Sized { + type Result; + + unsafe fn vec_cmpeq_idx(self, other: Self) -> Self::Result; + unsafe fn vec_cmpne_idx(self, other: Self) -> Self::Result; + + unsafe fn vec_cmpeq_idx_cc(self, other: Self) -> (Self::Result, i32); + unsafe fn vec_cmpne_idx_cc(self, other: Self) -> (Self::Result, i32); + + unsafe fn vec_cmpeq_or_0_idx(self, other: Self) -> Self::Result; + unsafe fn vec_cmpne_or_0_idx(self, other: Self) -> Self::Result; + + unsafe fn vec_cmpeq_or_0_idx_cc(self, other: Self) -> (Self::Result, i32); + unsafe fn vec_cmpne_or_0_idx_cc(self, other: Self) -> (Self::Result, i32); + } + + macro_rules! 
impl_compare_equality_idx { + ($($ty:ident $ret:ident + $cmpeq:ident $cmpne:ident + $cmpeq_or_0:ident $cmpne_or_0:ident + $cmpeq_cc:ident $cmpne_cc:ident + $cmpeq_or_0_cc:ident $cmpne_or_0_cc:ident + )*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorEqualityIdx for $ty { + type Result = $ret; + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cmpeq_idx(self, other: Self) -> Self::Result { + transmute($cmpeq(transmute(self), transmute(other))) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cmpne_idx(self, other: Self) -> Self::Result { + transmute($cmpne(transmute(self), transmute(other))) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cmpeq_or_0_idx(self, other: Self) -> Self::Result { + transmute($cmpeq_or_0(transmute(self), transmute(other))) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cmpne_or_0_idx(self, other: Self) -> Self::Result { + transmute($cmpne_or_0(transmute(self), transmute(other))) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cmpeq_idx_cc(self, other: Self) -> (Self::Result, i32) { + let PackedTuple { x, y } = $cmpeq_cc(transmute(self), transmute(other)); + (transmute(x), y) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cmpne_idx_cc(self, other: Self) -> (Self::Result, i32) { + let PackedTuple { x, y } = $cmpne_cc(transmute(self), transmute(other)); + (transmute(x),y) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cmpeq_or_0_idx_cc(self, other: Self) -> (Self::Result, i32) { + let PackedTuple { x, y } = $cmpeq_or_0_cc(transmute(self), transmute(other)); + (transmute(x), y) + } + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_cmpne_or_0_idx_cc(self, other: Self) -> (Self::Result, i32) { + let PackedTuple { x, y } = $cmpne_or_0_cc(transmute(self), transmute(other)); + (transmute(x),y) + } + } + )* + } + } + + impl_compare_equality_idx! 
{ + vector_signed_char vector_signed_char vfeeb vfeneb vfeezb vfenezb vfeebs vfenebs vfeezbs vfenezbs + vector_bool_char vector_unsigned_char vfeeb vfeneb vfeezb vfenezb vfeebs vfenebs vfeezbs vfenezbs + vector_unsigned_char vector_unsigned_char vfeeb vfeneb vfeezb vfenezb vfeebs vfenebs vfeezbs vfenezbs + vector_signed_short vector_signed_short vfeeh vfeneh vfeezh vfenezh vfeehs vfenehs vfeezhs vfenezhs + vector_bool_short vector_unsigned_short vfeeh vfeneh vfeezh vfenezh vfeehs vfenehs vfeezhs vfenezhs + vector_unsigned_short vector_unsigned_short vfeeh vfeneh vfeezh vfenezh vfeehs vfenehs vfeezhs vfenezhs + vector_signed_int vector_signed_int vfeef vfenef vfeezf vfenezf vfeefs vfenefs vfeezfs vfenezfs + vector_bool_int vector_unsigned_int vfeef vfenef vfeezf vfenezf vfeefs vfenefs vfeezfs vfenezfs + vector_unsigned_int vector_unsigned_int vfeef vfenef vfeezf vfenezf vfeefs vfenefs vfeezfs vfenezfs + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorExtract { + type ElementType; + + unsafe fn vec_extract(a: Self, b: i32) -> Self::ElementType; + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vlgvb))] + unsafe fn vlgvb(a: vector_unsigned_char, b: i32) -> u8 { + simd_extract_dyn(a, b as u32 % 16) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vlgvh))] + unsafe fn vlgvh(a: vector_unsigned_short, b: i32) -> u16 { + simd_extract_dyn(a, b as u32 % 8) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vlgvf))] + unsafe fn vlgvf(a: vector_unsigned_int, b: i32) -> u32 { + simd_extract_dyn(a, b as u32 % 4) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vlgvg))] + unsafe fn vlgvg(a: vector_unsigned_long_long, b: i32) -> u64 { + simd_extract_dyn(a, b as u32 % 2) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorInsert { + type ElementType; + + unsafe fn vec_insert(a: Self::ElementType, b: Self, c: i32) -> Self; + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorPromote: Sized { + type ElementType; + + unsafe fn vec_promote(a: Self::ElementType, b: i32) -> MaybeUninit; + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vlvgb))] + unsafe fn vlvgb(a: u8, b: vector_unsigned_char, c: i32) -> vector_unsigned_char { + simd_insert_dyn(b, c as u32 % 16, a) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vlvgh))] + unsafe fn vlvgh(a: u16, b: vector_unsigned_short, c: i32) -> vector_unsigned_short { + simd_insert_dyn(b, c as u32 % 8, a) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vlvgf))] + unsafe fn vlvgf(a: u32, b: vector_unsigned_int, c: i32) -> vector_unsigned_int { + simd_insert_dyn(b, c as u32 % 4, a) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vlvgg))] + unsafe fn vlvgg(a: u64, b: vector_unsigned_long_long, c: i32) -> vector_unsigned_long_long { + simd_insert_dyn(b, c as u32 % 2, a) + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + pub trait VectorInsertAndZero { + type ElementType; + + unsafe fn vec_insert_and_zero(a: *const Self::ElementType) -> Self; + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vllezb))] + unsafe fn vllezb(x: *const u8) -> vector_unsigned_char { + vector_unsigned_char([0, 0, 0, 0, 0, 0, 0, *x, 0, 0, 0, 0, 
0, 0, 0, 0]) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vllezh))] + unsafe fn vllezh(x: *const u16) -> vector_unsigned_short { + vector_unsigned_short([0, 0, 0, *x, 0, 0, 0, 0]) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vllezf))] + unsafe fn vllezf(x: *const u32) -> vector_unsigned_int { + vector_unsigned_int([0, *x, 0, 0]) + } + + #[inline] + #[target_feature(enable = "vector")] + #[cfg_attr(test, assert_instr(vllezg))] + unsafe fn vllezg(x: *const u64) -> vector_unsigned_long_long { + vector_unsigned_long_long([*x, 0]) + } + + macro_rules! impl_extract_insert { + ($($ty:ident $extract_intr:ident $insert_intr:ident $insert_and_zero_intr:ident)*) => { + $( + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorExtract for $ty { + type ElementType = l_t_t!($ty); + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_extract(a: Self, b: i32) -> Self::ElementType { + transmute($extract_intr(transmute(a), b)) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorInsert for $ty { + type ElementType = l_t_t!($ty); + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_insert(a: Self::ElementType, b: Self, c: i32) -> Self { + transmute($insert_intr(transmute(a), transmute(b), c)) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorInsertAndZero for $ty { + type ElementType = l_t_t!($ty); + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_insert_and_zero(a: *const Self::ElementType) -> Self { + transmute($insert_and_zero_intr(a.cast())) + } + } + + #[unstable(feature = "stdarch_s390x", issue = "135681")] + impl VectorPromote for $ty { + type ElementType = l_t_t!($ty); + + #[inline] + #[target_feature(enable = "vector")] + unsafe fn vec_promote(a: Self::ElementType, c: i32) -> MaybeUninit { + // Rust does not currently support `MaybeUninit` element types to simd + // vectors. In C/LLVM that is allowed (using poison values). So rust will + // use an extra instruction to zero the memory. + let b = MaybeUninit::<$ty>::zeroed(); + MaybeUninit::new(transmute($insert_intr(transmute(a), transmute(b), c))) + } + } + )* + } + + } + + impl_extract_insert! { + vector_signed_char vlgvb vlvgb vllezb + vector_unsigned_char vlgvb vlvgb vllezb + vector_signed_short vlgvh vlvgh vllezh + vector_unsigned_short vlgvh vlvgh vllezh + vector_signed_int vlgvf vlvgf vllezf + vector_unsigned_int vlgvf vlvgf vllezf + vector_signed_long_long vlgvg vlvgg vllezg + vector_unsigned_long_long vlgvg vlvgg vllezg + vector_float vlgvf vlvgf vllezf + vector_double vlgvg vlvgg vllezg + } +} + +/// Load Count to Block Boundary +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(lcbb, BLOCK_BOUNDARY = 512))] +unsafe fn __lcbb(ptr: *const u8) -> u32 { + lcbb(ptr, const { validate_block_boundary(BLOCK_BOUNDARY) }) +} + +/// Vector Add +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_add, U>(a: T, b: U) -> T::Result { + a.vec_add(b) +} + +/// Vector Subtract +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_sub, U>(a: T, b: U) -> T::Result { + a.vec_sub(b) +} + +/// Vector Multiply +/// +/// ## Purpose +/// Compute the products of corresponding elements of two vectors. 
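For the integer element types this is an element-wise product; a scalar sketch on four `i32` lanes (wrapping on overflow is my assumption here, in line with `simd_mul`; `mul_i32x4` is a hypothetical name):

fn mul_i32x4(a: [i32; 4], b: [i32; 4]) -> [i32; 4] {
    core::array::from_fn(|i| a[i].wrapping_mul(b[i]))
}

fn main() {
    assert_eq!(mul_i32x4([1, 2, 3, 4], [10, 20, 30, 40]), [10, 40, 90, 160]);
}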
+/// +/// ## Result value +/// Each element of r receives the product of the corresponding elements of a and b. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_mul(a: T, b: T) -> T { + a.vec_mul(b) +} + +/// Vector Count Leading Zeros +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cntlz(a: T) -> T::Result { + a.vec_cntlz() +} + +/// Vector Count Trailing Zeros +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cnttz(a: T) -> T::Result { + a.vec_cnttz() +} + +/// Vector Population Count +/// +/// Computes the population count (number of set bits) in each element of the input. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_popcnt(a: T) -> T::Result { + a.vec_popcnt() +} + +/// Vector Maximum +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_max, U>(a: T, b: U) -> T::Result { + a.vec_max(b) +} + +/// Vector Minimum +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_min, U>(a: T, b: U) -> T::Result { + a.vec_min(b) +} + +/// Vector Absolute +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_abs(a: T) -> T { + a.vec_abs() +} + +/// Vector Negative Absolute +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_nabs(a: T) -> T { + a.vec_nabs() +} + +/// Vector Negative Multiply Add +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_nmadd(a: T, b: T, c: T) -> T { + a.vec_nmadd(b, c) +} + +/// Vector Negative Multiply Subtract +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_nmsub(a: T, b: T, c: T) -> T { + a.vec_nmsub(b, c) +} + +/// Vector Square Root +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_sqrt(a: T) -> T { + a.vec_sqrt() +} + +/// Vector Splat +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_splat(a: T) -> T { + a.vec_splat::() +} + +/// Vector Splats +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_splats, U>(a: T) -> U { + a.vec_splats() +} + +/// Vector AND +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_and, U>(a: T, b: U) -> T::Result { + a.vec_and(b) +} + +/// Vector OR +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_or, U>(a: T, b: U) -> T::Result { + a.vec_or(b) +} + +/// Vector XOR +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_xor, U>(a: T, b: U) -> T::Result { + a.vec_xor(b) +} + +/// Vector NOR +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_nor, U>(a: T, b: U) -> T::Result { + a.vec_nor(b) +} 
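These bitwise wrappers have straightforward scalar counterparts; for instance `vec_nor` is the complement of the OR, and the complement-style variants defined next (`vec_nand`, `vec_eqv`, `vec_andc`, `vec_orc`) follow the same pattern. A per-byte sketch under that reading (hypothetical helper names):

fn nor_u8(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
    core::array::from_fn(|i| !(a[i] | b[i]))
}

fn orc_u8(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
    // OR with the complement of the second operand
    core::array::from_fn(|i| a[i] | !b[i])
}

fn main() {
    assert_eq!(nor_u8([0x0F; 16], [0xF0; 16])[0], 0x00);
    assert_eq!(orc_u8([0x0F; 16], [0xF0; 16])[0], 0x0F);
}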
+ +/// Vector NAND +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_nand, U>(a: T, b: U) -> T::Result { + a.vec_nand(b) +} + +/// Vector XNOR +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_eqv, U>(a: T, b: U) -> T::Result { + a.vec_eqv(b) +} + +/// Vector ANDC +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_andc, U>(a: T, b: U) -> T::Result { + a.vec_andc(b) +} + +/// Vector OR with Complement +/// +/// ## Purpose +/// Performs a bitwise OR of the first vector with the bitwise-complemented second vector. +/// +/// ## Result value +/// r is the bitwise OR of a and the bitwise complement of b. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_orc, U>(a: T, b: U) -> T::Result { + a.vec_orc(b) +} + +/// Vector Floor +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_floor(a: T) -> T { + a.vec_floor() +} + +/// Vector Ceil +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_ceil(a: T) -> T { + a.vec_ceil() +} + +/// Vector Truncate +/// +/// Returns a vector containing the truncated values of the corresponding elements of the given vector. +/// Each element of the result contains the value of the corresponding element of a, truncated to an integral value. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_trunc(a: T) -> T { + a.vec_trunc() +} + +/// Vector Round +/// +/// Returns a vector containing the rounded values to the nearest representable floating-point integer, +/// using IEEE round-to-nearest rounding, of the corresponding elements of the given vector +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_round(a: T) -> T { + a.vec_round() +} + +/// Vector Round to Current +/// +/// Returns a vector by using the current rounding mode to round every +/// floating-point element in the given vector to integer. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_roundc(a: T) -> T { + a.vec_roundc() +} + +/// Vector Round toward Negative Infinity +/// +/// Returns a vector containing the largest representable floating-point integral values less +/// than or equal to the values of the corresponding elements of the given vector. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_roundm(a: T) -> T { + // the IBM docs note + // + // > vec_roundm provides the same functionality as vec_floor, except that vec_roundz would not trigger the IEEE-inexact exception. + // + // but in practice `vec_floor` also does not trigger that exception, so both are equivalent + a.vec_floor() +} + +/// Vector Round toward Positive Infinity +/// +/// Returns a vector containing the smallest representable floating-point integral values greater +/// than or equal to the values of the corresponding elements of the given vector. 
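As the note on `vec_roundm` above (and the analogous notes below) spells out, `vec_roundm`, `vec_roundp` and `vec_roundz` reduce to floor, ceil and truncate respectively; only the IEEE-inexact behaviour differs on paper. In scalar terms, per `f64` lane:

fn round_modes(x: f64) -> (f64, f64, f64) {
    // (toward negative infinity, toward positive infinity, toward zero)
    (x.floor(), x.ceil(), x.trunc())
}

fn main() {
    assert_eq!(round_modes(-1.5), (-2.0, -1.0, -1.0));
    assert_eq!(round_modes(2.5), (2.0, 3.0, 2.0));
}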
+#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_roundp(a: T) -> T { + // the IBM docs note + // + // > vec_roundp provides the same functionality as vec_ceil, except that vec_roundz would not trigger the IEEE-inexact exception. + // + // but in practice `vec_ceil` also does not trigger that exception, so both are equivalent + a.vec_ceil() +} + +/// Vector Round toward Zero +/// +/// Returns a vector containing the truncated values of the corresponding elements of the given vector. +/// Each element of the result contains the value of the corresponding element of a, truncated to an integral value. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_roundz(a: T) -> T { + // the IBM docs note + // + // > vec_roundz provides the same functionality as vec_trunc, except that vec_roundz would not trigger the IEEE-inexact exception. + // + // but in practice `vec_trunc` also does not trigger that exception, so both are equivalent + a.vec_trunc() +} + +/// Vector Round to Integer +/// +/// Returns a vector by using the current rounding mode to round every floating-point element in the given vector to integer. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_rint(a: T) -> T { + a.vec_rint() +} + +/// Vector Average +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_avg, U>(a: T, b: U) -> T::Result { + a.vec_avg(b) +} + +/// Vector Shift Left +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_sl, U>(a: T, b: U) -> T::Result { + a.vec_sl(b) +} + +/// Vector Shift Right +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_sr, U>(a: T, b: U) -> T::Result { + a.vec_sr(b) +} + +/// Vector Shift Right Algebraic +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_sra, U>(a: T, b: U) -> T::Result { + a.vec_sra(b) +} + +/// Vector Shift Left by Byte +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_slb, U>(a: T, b: U) -> T::Result { + a.vec_slb(b) +} + +/// Vector Shift Right by Byte +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_srb, U>(a: T, b: U) -> T::Result { + a.vec_srb(b) +} + +/// Vector Shift Right Algebraic by Byte +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_srab, U>(a: T, b: U) -> T::Result { + a.vec_srab(b) +} + +/// Vector Element Rotate Left +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_rl, U>(a: T, b: U) -> T::Result { + a.vec_rl(b) +} + +/// Vector Shift Left +/// +/// Performs a left shift for a vector by a given number of bits. Each element of the result is obtained by shifting the corresponding +/// element of a left by the number of bits specified by the last 3 bits of every byte of b. The bits that are shifted out are replaced by zeros. 
+#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_sll(a: T, b: vector_unsigned_char) -> T +where + T: sealed::VectorSll, +{ + a.vec_sll(b) +} + +/// Vector Shift Right +/// +/// Performs a right shift for a vector by a given number of bits. Each element of the result is obtained by shifting the corresponding +/// element of a right by the number of bits specified by the last 3 bits of every byte of b. The bits that are shifted out are replaced by zeros. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_srl(a: T, b: vector_unsigned_char) -> T +where + T: sealed::VectorSrl, +{ + a.vec_srl(b) +} + +/// Vector Shift Right Arithmetic +/// +/// Performs an algebraic right shift for a vector by a given number of bits. Each element of the result is obtained by shifting the corresponding +/// element of a right by the number of bits specified by the last 3 bits of every byte of b. The bits that are shifted out are replaced by copies of +/// the most significant bit of the element of a. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_sral(a: T, b: vector_unsigned_char) -> T +where + T: sealed::VectorSral, +{ + a.vec_sral(b) +} + +/// Vector Element Rotate Left Immediate +/// +/// Rotates each element of a vector left by a given number of bits. Each element of the result is obtained by rotating the corresponding element +/// of a left by the number of bits specified by b, modulo the number of bits in the element. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_rli(a: T, bits: core::ffi::c_ulong) -> T { + a.vec_rli(bits) +} + +/// Vector Reverse Elements +/// +/// Returns a vector with the elements of the input vector in reversed order. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_reve(a: T) -> T { + a.vec_reve() +} + +/// Vector Byte Reverse +/// +/// Returns a vector where each vector element contains the corresponding byte-reversed vector element of the input vector. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_revb(a: T) -> T { + a.vec_revb() +} + +/// Vector Merge High +/// +/// Merges the most significant ("high") halves of two vectors. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_mergeh(a: T, b: T) -> T { + a.vec_mergeh(b) +} + +/// Vector Merge Low +/// +/// Merges the least significant ("low") halves of two vectors. 
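Concretely, the merge operations interleave one half of each input. Assuming "high" refers to the lowest-indexed (leftmost) elements, as in the big-endian element order used on s390x, a scalar sketch for four-lane vectors looks like this (hypothetical helper names):

fn mergeh_u32(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
    [a[0], b[0], a[1], b[1]]
}

fn mergel_u32(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
    [a[2], b[2], a[3], b[3]]
}

fn main() {
    assert_eq!(mergeh_u32([0, 1, 2, 3], [10, 11, 12, 13]), [0, 10, 1, 11]);
    assert_eq!(mergel_u32([0, 1, 2, 3], [10, 11, 12, 13]), [2, 12, 3, 13]);
}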
+#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_mergel(a: T, b: T) -> T { + a.vec_mergel(b) +} + +/// Vector Pack +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_pack, U>(a: T, b: U) -> T::Result { + a.vec_pack(b) +} + +/// Vector Pack Saturated +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_packs, U>(a: T, b: U) -> T::Result { + a.vec_packs(b) +} + +/// Vector Pack Saturated Condition Code +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_packs_cc(a: T, b: T) -> (T::Result, i32) { + a.vec_packs_cc(b) +} + +/// Vector Pack Saturated Unsigned +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_packsu, U>(a: T, b: U) -> T::Result { + a.vec_packsu(b) +} + +/// Vector Pack Saturated Unsigned Condition Code +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_packsu_cc(a: T, b: T) -> (T::Result, i32) { + a.vec_packsu_cc(b) +} + +/// Vector Unpack High +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_unpackh(a: T) -> ::Result { + a.vec_unpackh() +} + +/// Vector Unpack Low +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_unpackl(a: T) -> ::Result { + a.vec_unpackl() +} + +/// Vector Generate Byte Mask +/// +/// Generates byte masks for elements in the return vector. For each bit in a, if the bit is one, all bit positions +/// in the corresponding byte element of d are set to ones. Otherwise, if the bit is zero, the corresponding byte element is set to zero. 
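In scalar terms the mask generation is simply "one byte per mask bit". A sketch of what the `genmask` helper evaluates to, assuming the most significant mask bit corresponds to byte 0 (the helper itself lives elsewhere in this crate; this is only an illustration):

fn genmask_bytes(mask: u16) -> [u8; 16] {
    let mut out = [0u8; 16];
    let mut i = 0;
    while i < 16 {
        // bit 15 of `mask` controls byte 0, bit 14 controls byte 1, ...
        if (mask >> (15 - i)) & 1 == 1 {
            out[i] = 0xFF;
        }
        i += 1;
    }
    out
}

fn main() {
    let m = genmask_bytes(0x00FF);
    assert_eq!(&m[..8], &[0x00; 8]);
    assert_eq!(&m[8..], &[0xFF; 8]);
}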
+#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vgbm, MASK = 0x00FF))] +pub unsafe fn vec_genmask() -> vector_unsigned_char { + vector_unsigned_char(const { genmask::() }) +} + +/// Vector Generate Mask (Byte) +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vrepib, L = 3, H = 5))] +pub unsafe fn vec_genmasks_8() -> vector_unsigned_char { + vector_unsigned_char(const { [genmasks(u8::BITS, L, H) as u8; 16] }) +} + +/// Vector Generate Mask (Halfword) +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vrepih, L = 3, H = 5))] +pub unsafe fn vec_genmasks_16() -> vector_unsigned_short { + vector_unsigned_short(const { [genmasks(u16::BITS, L, H) as u16; 8] }) +} + +/// Vector Generate Mask (Word) +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vgmf, L = 3, H = 5))] +pub unsafe fn vec_genmasks_32() -> vector_unsigned_int { + vector_unsigned_int(const { [genmasks(u32::BITS, L, H) as u32; 4] }) +} + +/// Vector Generate Mask (Doubleword) +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vgmg, L = 3, H = 5))] +pub unsafe fn vec_genmasks_64() -> vector_unsigned_long_long { + vector_unsigned_long_long(const { [genmasks(u64::BITS, L, H); 2] }) +} + +/// Vector Permute +/// +/// Returns a vector that contains some elements of two vectors, in the order specified by a third vector. +/// Each byte of the result is selected by using the least significant 5 bits of the corresponding byte of c as an index into the concatenated bytes of a and b. +/// Note: The vector generate mask built-in function [`vec_genmask`] could help generate the mask c. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_perm(a: T, b: T, c: vector_unsigned_char) -> T { + a.vec_perm(b, c) +} + +/// Vector Sum Across Quadword +/// +/// Returns a vector containing the results of performing a sum across all the elements in each of the quadword of vector a, +/// and the rightmost word or doubleword element of the b. The result is an unsigned 128-bit integer. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_sum_u128(a: T, b: T) -> vector_unsigned_char { + a.vec_sum_u128(b) +} + +/// Vector Sum Across Doubleword +/// +/// Returns a vector containing the results of performing a sum across all the elements in each of the doubleword of vector a, +/// and the rightmost sub-element of the corresponding doubleword of b. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_sum2(a: T, b: T) -> vector_unsigned_long_long { + a.vec_sum2(b) +} + +/// Vector Sum Across Word +/// +/// Returns a vector containing the results of performing a sum across all the elements in each of the word of vector a, +/// and the rightmost sub-element of the corresponding word of b. 
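Returning briefly to the vec_perm documentation above: its selection rule is easy to state as plain Rust. The following is a hedged reference sketch (not part of the patch; the helper name is hypothetical) that mirrors the behaviour the test_vec_perm_* tests below check.

// Reference model of vec_perm on bytes (illustrative only): each byte of `c` selects,
// via its low 5 bits, one byte out of the 32-byte concatenation of `a` and `b`.
fn perm_ref(a: [u8; 16], b: [u8; 16], c: [u8; 16]) -> [u8; 16] {
    let mut concat = [0u8; 32];
    concat[..16].copy_from_slice(&a);
    concat[16..].copy_from_slice(&b);
    let mut out = [0u8; 16];
    for (dst, &sel) in out.iter_mut().zip(c.iter()) {
        *dst = concat[(sel & 0x1F) as usize];
    }
    out
}
// e.g. with c = [0x00, 0x01, 0x10, 0x11, ...] the result starts [a[0], a[1], b[0], b[1], ...],
// the pattern exercised by test_vec_perm_u8x16 below.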
+#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_sum4(a: T, b: T) -> vector_unsigned_int { + a.vec_sum4(b) +} + +/// Vector Addition unsigned 128-bits +/// +/// Adds unsigned quadword values. +/// +/// This function operates on the vectors as 128-bit unsigned integers. It returns low 128 bits of a + b. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vaq))] +pub unsafe fn vec_add_u128( + a: vector_unsigned_char, + b: vector_unsigned_char, +) -> vector_unsigned_char { + let a: u128 = transmute(a); + let b: u128 = transmute(b); + transmute(a.wrapping_add(b)) +} + +/// Vector Subtract unsigned 128-bits +/// +/// Subtracts unsigned quadword values. +/// +/// This function operates on the vectors as 128-bit unsigned integers. It returns low 128 bits of a - b. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vsq))] +pub unsafe fn vec_sub_u128( + a: vector_unsigned_char, + b: vector_unsigned_char, +) -> vector_unsigned_char { + let a: u128 = transmute(a); + let b: u128 = transmute(b); + + transmute(a.wrapping_sub(b)) +} + +/// Vector Subtract Carryout +/// +/// Returns a vector containing the borrow produced by subtracting each of corresponding elements of b from a. +/// +/// On each resulting element, the value is 0 if a borrow occurred, or 1 if no borrow occurred. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_subc, U>(a: T, b: U) -> T::Result { + a.vec_subc(b) +} + +/// Vector Subtract Carryout unsigned 128-bits +/// +/// Gets the carry bit of the 128-bit subtraction of two quadword values. +/// This function operates on the vectors as 128-bit unsigned integers. It returns a vector containing the borrow produced by subtracting b from a, as unsigned 128-bits integers. +/// If no borrow occurred, the bit 127 of d is 1; otherwise it is set to 0. All other bits of d are 0. 
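Because vec_add_u128 and vec_sub_u128 above simply reinterpret the 16-byte vectors as native u128 values, the arithmetic documented here can be reproduced with scalar code. A minimal sketch (illustration only; it uses the same wrapping/overflowing operations the implementations and the FIXME comment below rely on):

// Scalar model of the 128-bit helpers (illustrative):
fn add_u128(a: u128, b: u128) -> u128 {
    a.wrapping_add(b) // low 128 bits of a + b, as in vec_add_u128
}
fn addc_u128(a: u128, b: u128) -> u128 {
    a.overflowing_add(b).1 as u128 // carry-out: 1 if the add wrapped, else 0 (vec_addc_u128)
}
fn subc_u128(a: u128, b: u128) -> u128 {
    !a.overflowing_sub(b).1 as u128 // borrow indication: 1 if *no* borrow occurred (vec_subc_u128)
}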
+#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vscbiq))] +pub unsafe fn vec_subc_u128( + a: vector_unsigned_char, + b: vector_unsigned_char, +) -> vector_unsigned_char { + // FIXME(llvm) sadly this does not work https://github.com/llvm/llvm-project/issues/129608 + // let a: u128 = transmute(a); + // let b: u128 = transmute(b); + // transmute(!a.overflowing_sub(b).1 as u128) + transmute(vscbiq(transmute(a), transmute(b))) +} + +/// Vector Add Compute Carryout unsigned 128-bits +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vaccq))] +pub unsafe fn vec_addc_u128( + a: vector_unsigned_char, + b: vector_unsigned_char, +) -> vector_unsigned_char { + let a: u128 = transmute(a); + let b: u128 = transmute(b); + transmute(a.overflowing_add(b).1 as u128) +} + +/// Vector Add With Carry unsigned 128-bits +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vacq))] +pub unsafe fn vec_adde_u128( + a: vector_unsigned_char, + b: vector_unsigned_char, + c: vector_unsigned_char, +) -> vector_unsigned_char { + let a: u128 = transmute(a); + let b: u128 = transmute(b); + let c: u128 = transmute(c); + // FIXME(llvm) sadly this does not work + // let (d, _carry) = a.carrying_add(b, c & 1 != 0); + // transmute(d) + transmute(vacq(a, b, c)) +} + +/// Vector Add With Carry Compute Carry unsigned 128-bits +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vacccq))] +pub unsafe fn vec_addec_u128( + a: vector_unsigned_char, + b: vector_unsigned_char, + c: vector_unsigned_char, +) -> vector_unsigned_char { + let a: u128 = transmute(a); + let b: u128 = transmute(b); + let c: u128 = transmute(c); + let (_d, carry) = a.carrying_add(b, c & 1 != 0); + transmute(carry as u128) +} + +/// Vector Subtract with Carryout +/// +/// Subtracts unsigned quadword values with carry bit from a previous operation. +/// +/// This function operates on the vectors as 128-bit unsigned integers. It returns a vector containing the result of subtracting of b from a, +/// and the carryout bit from a previous operation. +/// +/// Note: Only the borrow indication bit (127-bit) of c is used, and the other bits are ignored. +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vsbiq))] +pub unsafe fn vec_sube_u128( + a: vector_unsigned_char, + b: vector_unsigned_char, + c: vector_unsigned_char, +) -> vector_unsigned_char { + transmute(vsbiq(transmute(a), transmute(b), transmute(c))) +} + +/// Vector Subtract with Carryout, Carryout +/// +/// Gets the carry bit of the 128-bit subtraction of two quadword values with carry bit from the previous operation. +/// +/// It returns a vector containing the carryout produced from the result of subtracting of b from a, +/// and the carryout bit from a previous operation. If no borrow occurred, the 127-bit of d is 1, otherwise 0. +/// All other bits of d are 0. +/// +/// Note: Only the borrow indication bit (127-bit) of c is used, and the other bits are ignored. 
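Taken together, vec_adde_u128 and vec_addec_u128 above are the building blocks of multi-limb addition. A scalar sketch of chaining two 128-bit limbs (illustration only; it uses overflowing_add on stable Rust in place of the carrying_add referenced in the FIXME comment above, and the function name is hypothetical):

// Add two 256-bit numbers given as (low, high) u128 limb pairs (illustrative sketch).
fn add_256(a: (u128, u128), b: (u128, u128)) -> (u128, u128) {
    let (lo, carry_lo) = a.0.overflowing_add(b.0); // what vec_add_u128 / vec_addc_u128 compute
    // vec_adde_u128: sum of the high limbs plus the carry-in from the low limbs
    let (hi1, c1) = a.1.overflowing_add(b.1);
    let (hi, c2) = hi1.overflowing_add(carry_lo as u128);
    let _carry_out = (c1 | c2) as u128; // what vec_addec_u128 would report
    (lo, hi)
}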
+#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vsbcbiq))] +pub unsafe fn vec_subec_u128( + a: vector_unsigned_char, + b: vector_unsigned_char, + c: vector_unsigned_char, +) -> vector_unsigned_char { + transmute(vsbcbiq(transmute(a), transmute(b), transmute(c))) +} + +/// Vector Splat Signed Byte +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vrepib, IMM = 42))] +pub unsafe fn vec_splat_s8() -> vector_signed_char { + vector_signed_char([IMM; 16]) +} + +/// Vector Splat Signed Halfword +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vrepih, IMM = 42))] +pub unsafe fn vec_splat_s16() -> vector_signed_short { + vector_signed_short([IMM as i16; 8]) +} + +/// Vector Splat Signed Word +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vrepif, IMM = 42))] +pub unsafe fn vec_splat_s32() -> vector_signed_int { + vector_signed_int([IMM as i32; 4]) +} + +/// Vector Splat Signed Doubleword +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vrepig, IMM = 42))] +pub unsafe fn vec_splat_s64() -> vector_signed_long_long { + vector_signed_long_long([IMM as i64; 2]) +} + +/// Vector Splat Unsigned Byte +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vrepib, IMM = 42))] +pub unsafe fn vec_splat_u8() -> vector_unsigned_char { + vector_unsigned_char([IMM; 16]) +} + +/// Vector Splat Unsigned Halfword +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vrepih, IMM = 42))] +pub unsafe fn vec_splat_u16() -> vector_unsigned_short { + vector_unsigned_short([IMM as u16; 8]) +} + +/// Vector Splat Unsigned Word +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vrepif, IMM = 42))] +pub unsafe fn vec_splat_u32() -> vector_unsigned_int { + vector_unsigned_int([IMM as u32; 4]) +} + +/// Vector Splat Unsigned Doubleword +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vrepig, IMM = 42))] +pub unsafe fn vec_splat_u64() -> vector_unsigned_long_long { + vector_unsigned_long_long([IMM as u64; 2]) +} + +macro_rules! vec_find_any { + ($($Trait:ident $fun:ident $doc:literal)*) => { + $( + #[inline] + #[target_feature(enable = "vector")] + #[unstable(feature = "stdarch_s390x", issue = "135681")] + #[doc = $doc] + pub unsafe fn $fun, U>(a: T, b: U) -> T::Result { + a.$fun(b) + } + )* + } +} + +vec_find_any! 
{ + VectorFindAnyEq vec_find_any_eq "Vector Find Any Element Equal with Condition Code" + VectorFindAnyNe vec_find_any_ne "Vector Find Any Element Not Equal with Condition Code" + VectorFindAnyEqIdx vec_find_any_eq_idx "Vector Find Any Element Equal Index with Condition Code" + VectorFindAnyNeIdx vec_find_any_ne_idx "Vector Find Any Element Not Equal Index with Condition Code" + VectorFindAnyEqOrZeroIdx vec_find_any_eq_or_0_idx "Vector Find Any Element Equal or Zero Index with Condition Code" + VectorFindAnyNeOrZeroIdx vec_find_any_ne_or_0_idx "Vector Find Any Element Not Equal or Zero Index with Condition Code" +} + +macro_rules! vec_find_any_cc { + ($($Trait:ident $fun:ident $doc:literal)*) => { + $( + #[inline] + #[target_feature(enable = "vector")] + #[unstable(feature = "stdarch_s390x", issue = "135681")] + #[doc = $doc] + pub unsafe fn $fun, U>(a: T, b: U) -> (T::Result, i32) { + a.$fun(b) + } + )* + } +} + +vec_find_any_cc! { + VectorFindAnyEqCC vec_find_any_eq_cc "Vector Find Any Element Equal with Condition Code" + VectorFindAnyNeCC vec_find_any_ne_cc "Vector Find Any Element Not Equal with Condition Code" + VectorFindAnyEqIdxCC vec_find_any_eq_idx_cc "Vector Find Any Element Equal Index with Condition Code" + VectorFindAnyNeIdxCC vec_find_any_ne_idx_cc "Vector Find Any Element Not Equal Index with Condition Code" + VectorFindAnyEqOrZeroIdxCC vec_find_any_eq_or_0_idx_cc "Vector Find Any Element Equal or Zero Index with Condition Code" + VectorFindAnyNeOrZeroIdxCC vec_find_any_ne_or_0_idx_cc "Vector Find Any Element Not Equal or Zero Index with Condition Code" +} + +/// Vector Load +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_xl(offset: isize, ptr: *const T::ElementType) -> T { + T::vec_xl(offset, ptr) +} + +/// Vector Load Pair +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_load_pair(a: T::ElementType, b: T::ElementType) -> T { + T::vec_load_pair(a, b) +} + +/// Vector Load to Block Boundary +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_load_bndry( + ptr: *const T::ElementType, +) -> MaybeUninit { + T::vec_load_bndry::(ptr) +} + +/// Vector Store +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_xst(vector: T, offset: isize, ptr: *mut T::ElementType) { + vector.vec_xst(offset, ptr) +} + +/// Vector Load with Length +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_load_len( + ptr: *const T::ElementType, + byte_count: u32, +) -> T { + T::vec_load_len(ptr, byte_count) +} + +/// Vector Store with Length +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_store_len( + vector: T, + ptr: *mut T::ElementType, + byte_count: u32, +) { + vector.vec_store_len(ptr, byte_count) +} + +/// Vector Load Rightmost with Length +#[inline] +#[target_feature(enable = "vector-packed-decimal")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vlrlr))] +pub unsafe fn vec_load_len_r(ptr: *const u8, byte_count: u32) -> vector_unsigned_char { + vlrl(byte_count, ptr) +} + +/// Vector Store Rightmost with Length +#[inline] +#[target_feature(enable = "vector-packed-decimal")] +#[unstable(feature = "stdarch_s390x", 
issue = "135681")] +#[cfg_attr(test, assert_instr(vstrlr))] +pub unsafe fn vec_store_len_r(vector: vector_unsigned_char, ptr: *mut u8, byte_count: u32) { + vstrl(vector, byte_count, ptr) +} + +/// Vector Multiply Add +#[inline] +#[target_feature(enable = "vector-packed-decimal")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_madd(a: T, b: T, c: T) -> T { + a.vec_madd(b, c) +} + +/// Vector Multiply Add +#[inline] +#[target_feature(enable = "vector-packed-decimal")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_msub(a: T, b: T, c: T) -> T { + a.vec_msub(b, c) +} + +/// Vector Multiply and Add Even +#[inline] +#[target_feature(enable = "vector-packed-decimal")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_meadd(a: T, b: T, c: T::Result) -> T::Result { + a.vec_meadd(b, c) +} + +/// Vector Multiply and Add Odd +#[inline] +#[target_feature(enable = "vector-packed-decimal")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_moadd(a: T, b: T, c: T::Result) -> T::Result { + a.vec_moadd(b, c) +} + +/// Vector Multiply and Add High +#[inline] +#[target_feature(enable = "vector-packed-decimal")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_mhadd(a: T, b: T, c: T::Result) -> T::Result { + a.vec_mhadd(b, c) +} + +/// Vector Multiply and Add Low +#[inline] +#[target_feature(enable = "vector-packed-decimal")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_mladd(a: T, b: T, c: T::Result) -> T::Result { + a.vec_mladd(b, c) +} + +/// Vector Checksum +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vcksm))] +pub unsafe fn vec_checksum(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int { + vcksm(a, b) +} + +/// Vector Multiply Even +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_mule, U>(a: T, b: T) -> U { + a.vec_mule(b) +} + +/// Vector Multiply Odd +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_mulo, U>(a: T, b: T) -> U { + a.vec_mulo(b) +} + +/// Vector Multiply High +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_mulh, U>(a: T, b: T) -> U { + a.vec_mulh(b) +} + +/// Vector Galois Field Multiply Sum +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_gfmsum, U>(a: T, b: T) -> U { + a.vec_gfmsum(b) +} + +/// Vector Galois Field Multiply Sum +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_gfmsum_accum( + a: T, + b: T, + c: T::Result, +) -> T::Result { + a.vec_gfmsum_accum(b, c) +} + +/// Vector Galois Field Multiply Sum 128-bits +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vgfmg))] +pub unsafe fn vec_gfmsum_128( + a: vector_unsigned_long_long, + b: vector_unsigned_long_long, +) -> vector_unsigned_char { + transmute(vgfmg(a, b)) +} + +/// Vector Galois Field Multiply Sum and Accumulate 128-bits +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vgfmag))] +pub unsafe 
fn vec_gfmsum_accum_128( + a: vector_unsigned_long_long, + b: vector_unsigned_long_long, + c: vector_unsigned_char, +) -> vector_unsigned_char { + transmute(vgfmag(a, b, transmute(c))) +} + +/// Vector Bit Permute +#[inline] +#[target_feature(enable = "vector-enhancements-1")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr(test, assert_instr(vbperm))] +pub unsafe fn vec_bperm_u128( + a: vector_unsigned_char, + b: vector_unsigned_char, +) -> vector_unsigned_long_long { + vbperm(a, b) +} + +/// Vector Gather Element +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_gather_element( + a: T, + b: T::Offset, + c: *const T::Element, +) -> T { + a.vec_gather_element::(b, c) +} + +/// Vector Select +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_sel, U>(a: T, b: T, c: U) -> T { + a.vec_sel(b, c) +} + +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_ZERO_P: u32 = 1 << 11; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_ZERO_N: u32 = 1 << 10; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_ZERO: u32 = __VEC_CLASS_FP_ZERO_P | __VEC_CLASS_FP_ZERO_N; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_NORMAL_P: u32 = 1 << 9; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_NORMAL_N: u32 = 1 << 8; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_NORMAL: u32 = __VEC_CLASS_FP_NORMAL_P | __VEC_CLASS_FP_NORMAL_N; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_SUBNORMAL_P: u32 = 1 << 7; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_SUBNORMAL_N: u32 = 1 << 6; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_SUBNORMAL: u32 = __VEC_CLASS_FP_SUBNORMAL_P | __VEC_CLASS_FP_SUBNORMAL_N; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_INFINITY_P: u32 = 1 << 5; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_INFINITY_N: u32 = 1 << 4; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_INFINITY: u32 = __VEC_CLASS_FP_INFINITY_P | __VEC_CLASS_FP_INFINITY_N; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_QNAN_P: u32 = 1 << 3; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_QNAN_N: u32 = 1 << 2; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_QNAN: u32 = __VEC_CLASS_FP_QNAN_P | __VEC_CLASS_FP_QNAN_N; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_SNAN_P: u32 = 1 << 1; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_SNAN_N: u32 = 1 << 0; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_SNAN: u32 = __VEC_CLASS_FP_SNAN_P | __VEC_CLASS_FP_SNAN_N; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_NAN: u32 = __VEC_CLASS_FP_QNAN | __VEC_CLASS_FP_SNAN; +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub const __VEC_CLASS_FP_NOT_NORMAL: u32 = + __VEC_CLASS_FP_NAN | __VEC_CLASS_FP_SUBNORMAL | __VEC_CLASS_FP_ZERO | __VEC_CLASS_FP_INFINITY; + +/// Vector Floating-Point Test Data Class +/// 
+/// You can use the `__VEC_CLASS_FP_*` constants as the argument for this operand +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_fp_test_data_class( + a: T, + c: *mut i32, +) -> T::Result { + let (x, y) = a.vec_fp_test_data_class::(); + c.write(y); + x +} + +/// All Elements Not a Number +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_all_nan(a: T) -> i32 { + i32::from(a.vec_fp_test_data_class::<__VEC_CLASS_FP_NAN>().1 == 0) +} + +/// All Elements Numeric +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_all_numeric(a: T) -> i32 { + i32::from(a.vec_fp_test_data_class::<__VEC_CLASS_FP_NAN>().1 == 3) +} + +/// Any Elements Not a Number +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_any_nan(a: T) -> i32 { + i32::from(a.vec_fp_test_data_class::<__VEC_CLASS_FP_NAN>().1 != 3) +} + +/// Any Elements Numeric +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_any_numeric(a: T) -> i32 { + i32::from(a.vec_fp_test_data_class::<__VEC_CLASS_FP_NAN>().1 != 0) +} + +/// Vector Test under Mask +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_test_mask(a: T, b: T::Mask) -> i32 { + // I can't find much information about this, but this might just be a check for whether the + // bitwise and of a and b is non-zero? + a.vec_test_mask(b) +} + +/// Vector Search String +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_search_string_cc( + a: T, + b: T, + c: vector_unsigned_char, +) -> (vector_unsigned_char, i32) { + a.vec_search_string_cc(b, c) +} + +/// Vector Search String Until Zero +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_search_string_until_zero_cc( + a: T, + b: T, + c: vector_unsigned_char, +) -> (vector_unsigned_char, i32) { + a.vec_search_string_until_zero_cc(b, c) +} + +/// Vector Convert from float (even elements) to double +#[inline] +#[target_feature(enable = "vector-enhancements-1")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +// FIXME: this emits `vflls` where `vldeb` is expected +// #[cfg_attr(all(test, target_feature = "vector-enhancements-1"), assert_instr(vldeb))] +pub unsafe fn vec_doublee(a: vector_float) -> vector_double { + let even = simd_shuffle::<_, _, f32x2>(a, a, const { u32x2::from_array([0, 2]) }); + simd_as(even) +} + +/// Vector Convert from double to float (even elements) +#[inline] +#[target_feature(enable = "vector-enhancements-1")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +// FIXME: the C version uses a shuffle mask with poison; we can't do that +// #[cfg_attr(all(test, target_feature = "vector-enhancements-1"), assert_instr(vledb))] +pub unsafe fn vec_floate(a: vector_double) -> vector_float { + let truncated: f32x2 = simd_as(a); + simd_shuffle( + truncated, + truncated, + const { u32x4::from_array([0, 0, 1, 1]) }, + ) +} + +/// Vector Convert from int to float +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_float(a: impl sealed::VectorFloat) -> vector_float 
{ + a.vec_float() +} + +/// Vector Convert from long long to double +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_double(a: impl sealed::VectorDouble) -> vector_double { + a.vec_double() +} + +/// Vector Sign Extend to Doubleword +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_extend_s64(a: impl sealed::VectorExtendSigned64) -> vector_signed_long_long { + a.vec_extend_s64() +} + +/// Vector Convert floating point to signed +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_signed(a: T) -> T::Result { + a.vec_signed() +} + +/// Vector Convert floating point to unsigned +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_unsigned(a: T) -> T::Result { + a.vec_unsigned() +} + +/// Vector Copy Until Zero +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cp_until_zero(a: T) -> T { + a.vec_cp_until_zero() +} + +/// Vector Copy Until Zero +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cp_until_zero_cc(a: T) -> (T, i32) { + a.vec_cp_until_zero_cc() +} + +/// Vector Multiply Sum Logical +#[inline] +#[target_feature(enable = "vector-enhancements-1")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +#[cfg_attr( + all(test, target_feature = "vector-enhancements-1"), + assert_instr(vmslg, D = 4) +)] +pub unsafe fn vec_msum_u128( + a: vector_unsigned_long_long, + b: vector_unsigned_long_long, + c: vector_unsigned_char, +) -> vector_unsigned_char { + const { + if !matches!(D, 0 | 4 | 8 | 12) { + panic!("D needs to be one of 0, 4, 8, 12"); + } + }; + transmute(vmslg(a, b, transmute(c), D)) +} + +/// Vector Shift Left Double by Byte +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_sld(a: T, b: T) -> T { + static_assert_uimm_bits!(C, 4); + a.vec_sld::(b) +} + +/// Vector Shift Left Double by Word +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_sldw(a: T, b: T) -> T { + static_assert_uimm_bits!(C, 2); + a.vec_sldw::(b) +} + +/// Vector Shift Left Double by Bit +#[inline] +#[target_feature(enable = "vector-enhancements-2")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_sldb(a: T, b: T) -> T { + static_assert_uimm_bits!(C, 3); + a.vec_sldb::(b) +} + +/// Vector Shift Right Double by Bit +#[inline] +#[target_feature(enable = "vector-enhancements-2")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_srdb(a: T, b: T) -> T { + static_assert_uimm_bits!(C, 3); + a.vec_srdb::(b) +} + +/// Vector Compare Ranges +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmprg(a: T, b: T, c: T) -> T::Result { + a.vstrc::<{ FindImm::Eq as u32 }>(b, c) +} + +/// Vector Compare Not in Ranges +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpnrg(a: T, b: T, c: T) -> T::Result { + a.vstrc::<{ FindImm::Ne as u32 }>(b, c) +} + +/// Vector Compare Ranges Index +#[inline] +#[target_feature(enable = "vector")] 
+#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmprg_idx(a: T, b: T, c: T) -> T::Result { + a.vstrc::<{ FindImm::EqIdx as u32 }>(b, c) +} + +/// Vector Compare Not in Ranges Index +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpnrg_idx(a: T, b: T, c: T) -> T::Result { + a.vstrc::<{ FindImm::NeIdx as u32 }>(b, c) +} + +/// Vector Compare Ranges with Condition Code +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmprg_cc( + a: T, + b: T, + c: T, + d: *mut i32, +) -> T::Result { + let (x, y) = a.vstrcs::<{ FindImm::Eq as u32 }>(b, c); + d.write(y); + x +} + +/// Vector Compare Not in Ranges with Condition Code +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpnrg_cc( + a: T, + b: T, + c: T, + d: *mut i32, +) -> T::Result { + let (x, y) = a.vstrcs::<{ FindImm::Ne as u32 }>(b, c); + d.write(y); + x +} + +/// Vector Compare Ranges Index with Condition Code +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmprg_idx_cc( + a: T, + b: T, + c: T, + d: *mut i32, +) -> T::Result { + let (x, y) = a.vstrcs::<{ FindImm::EqIdx as u32 }>(b, c); + d.write(y); + x +} + +/// Vector Compare Not in Ranges Index with Condition Code +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpnrg_idx_cc( + a: T, + b: T, + c: T, + d: *mut i32, +) -> T::Result { + let (x, y) = a.vstrcs::<{ FindImm::NeIdx as u32 }>(b, c); + d.write(y); + x +} + +/// Vector Compare Ranges or Zero Index +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmprg_or_0_idx(a: T, b: T, c: T) -> T::Result { + a.vstrcz::<{ FindImm::EqIdx as u32 }>(b, c) +} + +/// Vector Compare Not in Ranges or Zero Index +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpnrg_or_0_idx(a: T, b: T, c: T) -> T::Result { + a.vstrcz::<{ FindImm::NeIdx as u32 }>(b, c) +} + +/// Vector Compare Ranges or Zero Index with Condition Code +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmprg_or_0_idx_cc( + a: T, + b: T, + c: T, + d: *mut i32, +) -> T::Result { + let (x, y) = a.vstrczs::<{ FindImm::EqIdx as u32 }>(b, c); + d.write(y); + x +} + +/// Vector Compare Not in Ranges or Zero Index with Condition Code +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpnrg_or_0_idx_cc( + a: T, + b: T, + c: T, + d: *mut i32, +) -> T::Result { + let (x, y) = a.vstrczs::<{ FindImm::NeIdx as u32 }>(b, c); + d.write(y); + x +} + +/// Vector Compare Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpeq(a: T, b: T) -> T::Result { + a.vec_cmpeq(b) +} + +/// Vector Compare Not Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpne(a: T, b: T) -> T::Result { + a.vec_cmpne(b) +} + +/// Vector Compare Greater Than +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] 
+pub unsafe fn vec_cmpgt(a: T, b: T) -> T::Result { + a.vec_cmpgt(b) +} + +/// Vector Compare Greater Than or Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpge(a: T, b: T) -> T::Result { + a.vec_cmpge(b) +} + +/// Vector Compare Less +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmplt(a: T, b: T) -> T::Result { + a.vec_cmplt(b) +} + +/// Vector Compare Less Than or Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmple(a: T, b: T) -> T::Result { + a.vec_cmple(b) +} + +/// Vector Compare Equal Index +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpeq_idx(a: T, b: T) -> T::Result { + a.vec_cmpeq_idx(b) +} +/// Vector Compare Not Equal Index +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpne_idx(a: T, b: T) -> T::Result { + a.vec_cmpne_idx(b) +} +/// Vector Compare Equal Index with Condition Code +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpeq_idx_cc(a: T, b: T) -> (T::Result, i32) { + a.vec_cmpeq_idx_cc(b) +} +/// Vector Compare Not Equal Index with Condition Code +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpne_idx_cc(a: T, b: T) -> (T::Result, i32) { + a.vec_cmpne_idx_cc(b) +} +/// Vector Compare Equal or Zero Index +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpeq_or_0_idx(a: T, b: T) -> T::Result { + a.vec_cmpeq_or_0_idx(b) +} +/// Vector Compare Not Equal or Zero Index +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpne_or_0_idx(a: T, b: T) -> T::Result { + a.vec_cmpne_or_0_idx(b) +} +/// Vector Compare Equal or Zero Index with Condition Code +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpeq_or_0_idx_cc(a: T, b: T) -> (T::Result, i32) { + a.vec_cmpeq_or_0_idx_cc(b) +} +/// Vector Compare Not Equal or Zero Index with Condition Code +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_cmpne_or_0_idx_cc(a: T, b: T) -> (T::Result, i32) { + a.vec_cmpne_or_0_idx_cc(b) +} + +/// All Elements Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_all_eq(a: T, b: T) -> i32 { + simd_reduce_all(vec_cmpeq(a, b)) as i32 as i32 +} + +/// All Elements Not Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_all_ne(a: T, b: T) -> i32 { + simd_reduce_all(vec_cmpne(a, b)) as i32 +} + +/// Any Element Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_any_eq(a: T, b: T) -> i32 { + simd_reduce_any(vec_cmpeq(a, b)) as i32 +} + +/// Any Element Not Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_any_ne(a: T, b: T) 
-> i32 { + simd_reduce_any(vec_cmpne(a, b)) as i32 +} + +/// All Elements Less Than +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_all_lt(a: T, b: T) -> i32 { + a.vec_all_lt(b) +} + +/// All Elements Less Than or Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_all_le(a: T, b: T) -> i32 { + a.vec_all_le(b) +} + +/// All Elements Greater Than +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_all_gt(a: T, b: T) -> i32 { + a.vec_all_gt(b) +} + +/// All Elements Greater Than or Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_all_ge(a: T, b: T) -> i32 { + a.vec_all_ge(b) +} + +/// All Elements Not Less Than +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_all_nlt(a: T, b: T) -> i32 { + vec_all_ge(a, b) +} + +/// All Elements Not Less Than or Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_all_nle(a: T, b: T) -> i32 { + vec_all_gt(a, b) +} + +/// All Elements Not Greater Than +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_all_ngt(a: T, b: T) -> i32 { + vec_all_le(a, b) +} + +/// All Elements Not Greater Than or Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_all_nge(a: T, b: T) -> i32 { + vec_all_lt(a, b) +} + +/// Any Elements Less Than +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_any_lt(a: T, b: T) -> i32 { + !vec_all_ge(a, b) +} + +/// Any Elements Less Than or Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_any_le(a: T, b: T) -> i32 { + !vec_all_gt(a, b) +} + +/// Any Elements Greater Than +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_any_gt(a: T, b: T) -> i32 { + !vec_all_le(a, b) +} + +/// Any Elements Greater Than or Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_any_ge(a: T, b: T) -> i32 { + !vec_all_lt(a, b) +} + +/// Any Elements Not Less Than +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_any_nlt(a: T, b: T) -> i32 { + vec_any_ge(a, b) +} + +/// Any Elements Not Less Than or Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_any_nle(a: T, b: T) -> i32 { + vec_any_gt(a, b) +} + +/// Any Elements Not Greater Than +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_any_ngt(a: T, b: T) -> i32 { + vec_any_le(a, b) +} + +/// Any Elements Not Greater Than or Equal +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_any_nge(a: T, b: T) -> i32 { + vec_any_lt(a, b) +} + +/// Vector Extract +#[inline] +#[target_feature(enable = 
"vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_extract(a: T, b: i32) -> T::ElementType { + T::vec_extract(a, b) +} + +/// Vector Insert +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_insert(a: T::ElementType, b: T, c: i32) -> T { + T::vec_insert(a, b, c) +} + +/// Vector Insert and Zero +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_insert_and_zero(a: *const T::ElementType) -> T { + T::vec_insert_and_zero(a) +} + +/// Vector Promote +#[inline] +#[target_feature(enable = "vector")] +#[unstable(feature = "stdarch_s390x", issue = "135681")] +pub unsafe fn vec_promote(a: T::ElementType, b: i32) -> MaybeUninit { + T::vec_promote(a, b) +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::mem::transmute; + + use crate::core_arch::simd::*; + use stdarch_test::simd_test; + + #[test] + fn reverse_mask() { + assert_eq!(ShuffleMask::<4>::reverse().0, [3, 2, 1, 0]); + } + + #[test] + fn mergel_mask() { + assert_eq!(ShuffleMask::<4>::merge_low().0, [2, 6, 3, 7]); + } + + #[test] + fn mergeh_mask() { + assert_eq!(ShuffleMask::<4>::merge_high().0, [0, 4, 1, 5]); + } + + #[test] + fn pack_mask() { + assert_eq!(ShuffleMask::<4>::pack().0, [1, 3, 5, 7]); + } + + #[test] + fn test_vec_mask() { + assert_eq!( + genmask::<0x00FF>(), + [ + 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF + ] + ); + } + + #[test] + fn test_genmasks() { + assert_eq!(genmasks(u8::BITS, 3, 5), 28); + assert_eq!(genmasks(u8::BITS, 3, 7), 31); + + // If a or b is greater than 8, the operation is performed as if the value gets modulo by 8. + assert_eq!(genmasks(u8::BITS, 3 + 8, 7 + 8), 31); + // If a is greater than b, the operation is perform as if b equals 7. + assert_eq!(genmasks(u8::BITS, 5, 4), genmasks(u8::BITS, 5, 7)); + + assert_eq!( + genmasks(u16::BITS, 4, 12) as u16, + u16::from_be_bytes([15, -8i8 as u8]) + ); + assert_eq!( + genmasks(u32::BITS, 4, 29) as u32, + u32::from_be_bytes([15, 0xFF, 0xFF, -4i8 as u8]) + ); + } + + macro_rules! test_vec_1 { + { $name: ident, $fn:ident, f32x4, [$($a:expr),+], ~[$($d:expr),+] } => { + #[simd_test(enable = "vector")] + unsafe fn $name() { + let a: vector_float = transmute(f32x4::new($($a),+)); + + let d: vector_float = transmute(f32x4::new($($d),+)); + let r = transmute(vec_cmple(vec_abs(vec_sub($fn(a), d)), vec_splats(f32::EPSILON))); + let e = m32x4::new(true, true, true, true); + assert_eq!(e, r); + } + }; + { $name: ident, $fn:ident, $ty: ident, [$($a:expr),+], [$($d:expr),+] } => { + test_vec_1! { $name, $fn, $ty -> $ty, [$($a),+], [$($d),+] } + }; + { $name: ident, $fn:ident, $ty: ident -> $ty_out: ident, [$($a:expr),+], [$($d:expr),+] } => { + #[simd_test(enable = "vector")] + unsafe fn $name() { + let a: s_t_l!($ty) = transmute($ty::new($($a),+)); + + let d = $ty_out::new($($d),+); + let r : $ty_out = transmute($fn(a)); + assert_eq!(d, r); + } + } + } + + macro_rules! test_vec_2 { + { $name: ident, $fn:ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => { + test_vec_2! { $name, $fn, $ty -> $ty, [$($a),+], [$($b),+], [$($d),+] } + }; + { $name: ident, $fn:ident, $ty: ident -> $ty_out: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => { + test_vec_2! 
{ $name, $fn, $ty, $ty -> $ty, [$($a),+], [$($b),+], [$($d),+] } + }; + { $name: ident, $fn:ident, $ty1: ident, $ty2: ident -> $ty_out: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => { + #[simd_test(enable = "vector")] + unsafe fn $name() { + let a: s_t_l!($ty1) = transmute($ty1::new($($a),+)); + let b: s_t_l!($ty2) = transmute($ty2::new($($b),+)); + + let d = $ty_out::new($($d),+); + let r : $ty_out = transmute($fn(a, b)); + assert_eq!(d, r); + } + }; + { $name: ident, $fn:ident, $ty: ident -> $ty_out: ident, [$($a:expr),+], [$($b:expr),+], $d:expr } => { + #[simd_test(enable = "vector")] + unsafe fn $name() { + let a: s_t_l!($ty) = transmute($ty::new($($a),+)); + let b: s_t_l!($ty) = transmute($ty::new($($b),+)); + + let r : $ty_out = transmute($fn(a, b)); + assert_eq!($d, r); + } + } + } + + #[simd_test(enable = "vector")] + unsafe fn vec_add_i32x4_i32x4() { + let x = i32x4::new(1, 2, 3, 4); + let y = i32x4::new(4, 3, 2, 1); + let x: vector_signed_int = transmute(x); + let y: vector_signed_int = transmute(y); + let z = vec_add(x, y); + assert_eq!(i32x4::splat(5), transmute(z)); + } + + macro_rules! test_vec_sub { + { $name: ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => { + test_vec_2! {$name, vec_sub, $ty, [$($a),+], [$($b),+], [$($d),+] } + } + } + + test_vec_sub! { test_vec_sub_f32x4, f32x4, + [-1.0, 0.0, 1.0, 2.0], + [2.0, 1.0, -1.0, -2.0], + [-3.0, -1.0, 2.0, 4.0] } + + test_vec_sub! { test_vec_sub_f64x2, f64x2, + [-1.0, 0.0], + [2.0, 1.0], + [-3.0, -1.0] } + + test_vec_sub! { test_vec_sub_i64x2, i64x2, + [-1, 0], + [2, 1], + [-3, -1] } + + test_vec_sub! { test_vec_sub_u64x2, u64x2, + [0, 1], + [1, 0], + [u64::MAX, 1] } + + test_vec_sub! { test_vec_sub_i32x4, i32x4, + [-1, 0, 1, 2], + [2, 1, -1, -2], + [-3, -1, 2, 4] } + + test_vec_sub! { test_vec_sub_u32x4, u32x4, + [0, 0, 1, 2], + [2, 1, 0, 0], + [4294967294, 4294967295, 1, 2] } + + test_vec_sub! { test_vec_sub_i16x8, i16x8, + [-1, 0, 1, 2, -1, 0, 1, 2], + [2, 1, -1, -2, 2, 1, -1, -2], + [-3, -1, 2, 4, -3, -1, 2, 4] } + + test_vec_sub! { test_vec_sub_u16x8, u16x8, + [0, 0, 1, 2, 0, 0, 1, 2], + [2, 1, 0, 0, 2, 1, 0, 0], + [65534, 65535, 1, 2, 65534, 65535, 1, 2] } + + test_vec_sub! { test_vec_sub_i8x16, i8x16, + [-1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2], + [2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2], + [-3, -1, 2, 4, -3, -1, 2, 4, -3, -1, 2, 4, -3, -1, 2, 4] } + + test_vec_sub! { test_vec_sub_u8x16, u8x16, + [0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2], + [2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0], + [254, 255, 1, 2, 254, 255, 1, 2, 254, 255, 1, 2, 254, 255, 1, 2] } + + macro_rules! test_vec_mul { + { $name: ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => { + test_vec_2! {$name, vec_mul, $ty, [$($a),+], [$($b),+], [$($d),+] } + } + } + + test_vec_mul! { test_vec_mul_f32x4, f32x4, + [-1.0, 0.0, 1.0, 2.0], + [2.0, 1.0, -1.0, -2.0], + [-2.0, 0.0, -1.0, -4.0] } + + test_vec_mul! { test_vec_mul_f64x2, f64x2, + [-1.0, 0.0], + [2.0, 1.0], + [-2.0, 0.0] } + + test_vec_mul! { test_vec_mul_i64x2, i64x2, + [i64::MAX, -4], + [2, 3], + [i64::MAX.wrapping_mul(2), -12] } + + test_vec_mul! { test_vec_mul_u64x2, u64x2, + [u64::MAX, 4], + [2, 3], + [u64::MAX.wrapping_mul(2), 12] } + + test_vec_mul! { test_vec_mul_i32x4, i32x4, + [-1, 0, 1, 2], + [2, 1, -1, -2], + [-2, 0, -1, -4] } + + test_vec_mul! { test_vec_mul_u32x4, u32x4, + [0, u32::MAX - 1, 1, 2], + [5, 6, 7, 8], + [0, 4294967284, 7, 16] } + + test_vec_mul! 
{ test_vec_mul_i16x8, i16x8, + [-1, 0, 1, 2, -1, 0, 1, 2], + [2, 1, -1, -2, 2, 1, -1, -2], + [-2, 0, -1, -4, -2, 0, -1, -4] } + + test_vec_mul! { test_vec_mul_u16x8, u16x8, + [0, u16::MAX - 1, 1, 2, 3, 4, 5, 6], + [5, 6, 7, 8, 9, 8, 7, 6], + [0, 65524, 7, 16, 27, 32, 35, 36] } + + test_vec_mul! { test_vec_mul_i8x16, i8x16, + [-1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2], + [2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2], + [-2, 0, -1, -4, -2, 0, -1, -4, -2, 0, -1, -4, -2, 0, -1, -4] } + + test_vec_mul! { test_vec_mul_u8x16, u8x16, + [0, u8::MAX - 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 6, 5, 4], + [5, 6, 7, 8, 9, 8, 7, 6, 5, 4, 0, u8::MAX, 1, 2, 3, 4], + [0, 244, 7, 16, 27, 32, 35, 36, 35, 32, 0, 248, 7, 12, 15, 16] } + + macro_rules! test_vec_abs { + { $name: ident, $ty: ident, $a: expr, $d: expr } => { + #[simd_test(enable = "vector")] + unsafe fn $name() { + let a: s_t_l!($ty) = vec_splats($a); + let a: s_t_l!($ty) = vec_abs(a); + let d = $ty::splat($d); + assert_eq!(d, transmute(a)); + } + } + } + + test_vec_abs! { test_vec_abs_i8, i8x16, -42i8, 42i8 } + test_vec_abs! { test_vec_abs_i16, i16x8, -42i16, 42i16 } + test_vec_abs! { test_vec_abs_i32, i32x4, -42i32, 42i32 } + test_vec_abs! { test_vec_abs_i64, i64x2, -42i64, 42i64 } + test_vec_abs! { test_vec_abs_f32, f32x4, -42f32, 42f32 } + test_vec_abs! { test_vec_abs_f64, f64x2, -42f64, 42f64 } + + test_vec_1! { test_vec_nabs, vec_nabs, f32x4, + [core::f32::consts::PI, 1.0, 0.0, -1.0], + [-core::f32::consts::PI, -1.0, 0.0, -1.0] } + + test_vec_2! { test_vec_andc, vec_andc, i32x4, + [0b11001100, 0b11001100, 0b11001100, 0b11001100], + [0b00110011, 0b11110011, 0b00001100, 0b10000000], + [0b11001100, 0b00001100, 0b11000000, 0b01001100] } + + test_vec_2! { test_vec_and, vec_and, i32x4, + [0b11001100, 0b11001100, 0b11001100, 0b11001100], + [0b00110011, 0b11110011, 0b00001100, 0b00000000], + [0b00000000, 0b11000000, 0b00001100, 0b00000000] } + + test_vec_2! { test_vec_nand, vec_nand, i32x4, + [0b11001100, 0b11001100, 0b11001100, 0b11001100], + [0b00110011, 0b11110011, 0b00001100, 0b00000000], + [!0b00000000, !0b11000000, !0b00001100, !0b00000000] } + + test_vec_2! { test_vec_orc, vec_orc, u32x4, + [0b11001100, 0b11001100, 0b11001100, 0b11001100], + [0b00110011, 0b11110011, 0b00001100, 0b00000000], + [0b11001100 | !0b00110011, 0b11001100 | !0b11110011, 0b11001100 | !0b00001100, 0b11001100 | !0b00000000] } + + test_vec_2! { test_vec_or, vec_or, i32x4, + [0b11001100, 0b11001100, 0b11001100, 0b11001100], + [0b00110011, 0b11110011, 0b00001100, 0b00000000], + [0b11111111, 0b11111111, 0b11001100, 0b11001100] } + + test_vec_2! { test_vec_nor, vec_nor, i32x4, + [0b11001100, 0b11001100, 0b11001100, 0b11001100], + [0b00110011, 0b11110011, 0b00001100, 0b00000000], + [!0b11111111, !0b11111111, !0b11001100, !0b11001100] } + + test_vec_2! { test_vec_xor, vec_xor, i32x4, + [0b11001100, 0b11001100, 0b11001100, 0b11001100], + [0b00110011, 0b11110011, 0b00001100, 0b00000000], + [0b11111111, 0b00111111, 0b11000000, 0b11001100] } + + test_vec_2! { test_vec_eqv, vec_eqv, i32x4, + [0b11001100, 0b11001100, 0b11001100, 0b11001100], + [0b00110011, 0b11110011, 0b00001100, 0b00000000], + [!0b11111111, !0b00111111, !0b11000000, !0b11001100] } + + test_vec_1! { test_vec_floor_f32, vec_floor, f32x4, + [1.1, 1.9, -0.5, -0.9], + [1.0, 1.0, -1.0, -1.0] + } + + test_vec_1! { test_vec_floor_f64_1, vec_floor, f64x2, + [1.1, 1.9], + [1.0, 1.0] + } + test_vec_1! { test_vec_floor_f64_2, vec_floor, f64x2, + [-0.5, -0.9], + [-1.0, -1.0] + } + + test_vec_1! 
{ test_vec_ceil_f32, vec_ceil, f32x4, + [0.1, 0.5, 0.6, 0.9], + [1.0, 1.0, 1.0, 1.0] + } + test_vec_1! { test_vec_ceil_f64_1, vec_ceil, f64x2, + [0.1, 0.5], + [1.0, 1.0] + } + test_vec_1! { test_vec_ceil_f64_2, vec_ceil, f64x2, + [0.6, 0.9], + [1.0, 1.0] + } + + test_vec_1! { test_vec_round_f32, vec_round, f32x4, + [0.1, 0.5, 0.6, 0.9], + [0.0, 0.0, 1.0, 1.0] + } + + test_vec_1! { test_vec_round_f32_even_odd, vec_round, f32x4, + [0.5, 1.5, 2.5, 3.5], + [0.0, 2.0, 2.0, 4.0] + } + + test_vec_1! { test_vec_round_f64_1, vec_round, f64x2, + [0.1, 0.5], + [0.0, 0.0] + } + test_vec_1! { test_vec_round_f64_2, vec_round, f64x2, + [0.6, 0.9], + [1.0, 1.0] + } + + test_vec_1! { test_vec_roundc_f32, vec_roundc, f32x4, + [0.1, 0.5, 0.6, 0.9], + [0.0, 0.0, 1.0, 1.0] + } + + test_vec_1! { test_vec_roundc_f32_even_odd, vec_roundc, f32x4, + [0.5, 1.5, 2.5, 3.5], + [0.0, 2.0, 2.0, 4.0] + } + + test_vec_1! { test_vec_roundc_f64_1, vec_roundc, f64x2, + [0.1, 0.5], + [0.0, 0.0] + } + test_vec_1! { test_vec_roundc_f64_2, vec_roundc, f64x2, + [0.6, 0.9], + [1.0, 1.0] + } + + test_vec_1! { test_vec_rint_f32, vec_rint, f32x4, + [0.1, 0.5, 0.6, 0.9], + [0.0, 0.0, 1.0, 1.0] + } + + test_vec_1! { test_vec_rint_f32_even_odd, vec_rint, f32x4, + [0.5, 1.5, 2.5, 3.5], + [0.0, 2.0, 2.0, 4.0] + } + + test_vec_1! { test_vec_rint_f64_1, vec_rint, f64x2, + [0.1, 0.5], + [0.0, 0.0] + } + test_vec_1! { test_vec_rint_f64_2, vec_rint, f64x2, + [0.6, 0.9], + [1.0, 1.0] + } + + test_vec_2! { test_vec_sll, vec_sll, i32x4, u8x16 -> i32x4, + [1, 1, 1, 1], + [0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 8], + [1 << 2, 1 << 3, 1 << 4, 1] } + + test_vec_2! { test_vec_srl, vec_srl, i32x4, u8x16 -> i32x4, + [0b1000, 0b1000, 0b1000, 0b1000], + [0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 16], + [4, 2, 1, 8] } + + test_vec_2! { test_vec_sral_pos, vec_sral, u32x4, u8x16 -> i32x4, + [0b1000, 0b1000, 0b1000, 0b1000], + [0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 16], + [4, 2, 1, 8] } + + test_vec_2! { test_vec_sral_neg, vec_sral, i32x4, u8x16 -> i32x4, + [-8, -8, -8, -8], + [0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 16], + [-4, -2, -1, -8] } + + test_vec_1! { test_vec_reve_f32, vec_reve, f32x4, + [0.1, 0.5, 0.6, 0.9], + [0.9, 0.6, 0.5, 0.1] + } + + test_vec_1! { test_vec_revb_u32, vec_revb, u32x4, + [0xAABBCCDD, 0xEEFF0011, 0x22334455, 0x66778899], + [0xDDCCBBAA, 0x1100FFEE, 0x55443322, 0x99887766] + } + + test_vec_2! { test_vec_mergeh_u32, vec_mergeh, u32x4, + [0xAAAAAAAA, 0xBBBBBBBB, 0xCCCCCCCC, 0xDDDDDDDD], + [0x00000000, 0x11111111, 0x22222222, 0x33333333], + [0xAAAAAAAA, 0x00000000, 0xBBBBBBBB, 0x11111111] + } + + test_vec_2! { test_vec_mergel_u32, vec_mergel, u32x4, + [0xAAAAAAAA, 0xBBBBBBBB, 0xCCCCCCCC, 0xDDDDDDDD], + [0x00000000, 0x11111111, 0x22222222, 0x33333333], + [0xCCCCCCCC, 0x22222222, 0xDDDDDDDD, 0x33333333] + } + + macro_rules! test_vec_perm { + {$name:ident, + $shorttype:ident, $longtype:ident, + [$($a:expr),+], [$($b:expr),+], [$($c:expr),+], [$($d:expr),+]} => { + #[simd_test(enable = "vector")] + unsafe fn $name() { + let a: $longtype = transmute($shorttype::new($($a),+)); + let b: $longtype = transmute($shorttype::new($($b),+)); + let c: vector_unsigned_char = transmute(u8x16::new($($c),+)); + let d = $shorttype::new($($d),+); + + let r: $shorttype = transmute(vec_perm(a, b, c)); + assert_eq!(d, r); + } + } + } + + test_vec_perm! 
{test_vec_perm_u8x16, + u8x16, vector_unsigned_char, + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 107]} + test_vec_perm! {test_vec_perm_i8x16, + i8x16, vector_signed_char, + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 107]} + + test_vec_perm! {test_vec_perm_m8x16, + m8x16, vector_bool_char, + [false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false], + [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [false, false, true, true, false, false, true, true, false, false, true, true, false, false, true, true]} + test_vec_perm! {test_vec_perm_u16x8, + u16x8, vector_unsigned_short, + [0, 1, 2, 3, 4, 5, 6, 7], + [10, 11, 12, 13, 14, 15, 16, 17], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [0, 10, 1, 11, 2, 12, 3, 13]} + test_vec_perm! {test_vec_perm_i16x8, + i16x8, vector_signed_short, + [0, 1, 2, 3, 4, 5, 6, 7], + [10, 11, 12, 13, 14, 15, 16, 17], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [0, 10, 1, 11, 2, 12, 3, 13]} + test_vec_perm! {test_vec_perm_m16x8, + m16x8, vector_bool_short, + [false, false, false, false, false, false, false, false], + [true, true, true, true, true, true, true, true], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [false, true, false, true, false, true, false, true]} + + test_vec_perm! {test_vec_perm_u32x4, + u32x4, vector_unsigned_int, + [0, 1, 2, 3], + [10, 11, 12, 13], + [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], + [0, 10, 1, 11]} + test_vec_perm! {test_vec_perm_i32x4, + i32x4, vector_signed_int, + [0, 1, 2, 3], + [10, 11, 12, 13], + [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], + [0, 10, 1, 11]} + test_vec_perm! {test_vec_perm_m32x4, + m32x4, vector_bool_int, + [false, false, false, false], + [true, true, true, true], + [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], + [false, true, false, true]} + test_vec_perm! {test_vec_perm_f32x4, + f32x4, vector_float, + [0.0, 1.0, 2.0, 3.0], + [1.0, 1.1, 1.2, 1.3], + [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], + [0.0, 1.0, 1.0, 1.1]} + + test_vec_1! { test_vec_sqrt, vec_sqrt, f32x4, + [core::f32::consts::PI, 1.0, 25.0, 2.0], + [core::f32::consts::PI.sqrt(), 1.0, 5.0, core::f32::consts::SQRT_2] } + + test_vec_2! { test_vec_find_any_eq, vec_find_any_eq, i32x4, i32x4 -> u32x4, + [1, -2, 3, -4], + [-5, 3, -7, 8], + [0, 0, 0xFFFFFFFF, 0] + } + + test_vec_2! { test_vec_find_any_ne, vec_find_any_ne, i32x4, i32x4 -> u32x4, + [1, -2, 3, -4], + [-5, 3, -7, 8], + [0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF] + } + + test_vec_2! 
{ test_vec_find_any_eq_idx_1, vec_find_any_eq_idx, i32x4, i32x4 -> u32x4, + [1, 2, 3, 4], + [5, 3, 7, 8], + [0, 8, 0, 0] + } + test_vec_2! { test_vec_find_any_eq_idx_2, vec_find_any_eq_idx, i32x4, i32x4 -> u32x4, + [1, 2, 3, 4], + [5, 6, 7, 8], + [0, 16, 0, 0] + } + + test_vec_2! { test_vec_find_any_ne_idx_1, vec_find_any_ne_idx, i32x4, i32x4 -> u32x4, + [1, 2, 3, 4], + [1, 5, 3, 4], + [0, 4, 0, 0] + } + test_vec_2! { test_vec_find_any_ne_idx_2, vec_find_any_ne_idx, i32x4, i32x4 -> u32x4, + [1, 2, 3, 4], + [1, 2, 3, 4], + [0, 16, 0, 0] + } + + test_vec_2! { test_vec_find_any_eq_or_0_idx_1, vec_find_any_eq_or_0_idx, i32x4, i32x4 -> u32x4, + [1, 2, 0, 4], + [5, 6, 7, 8], + [0, 8, 0, 0] + } + test_vec_2! { test_vec_find_any_ne_or_0_idx_1, vec_find_any_ne_or_0_idx, i32x4, i32x4 -> u32x4, + [1, 2, 0, 4], + [1, 2, 3, 4], + [0, 8, 0, 0] + } + + #[simd_test(enable = "vector")] + fn test_vec_find_any_eq_cc() { + let a = vector_unsigned_int([1, 2, 3, 4]); + let b = vector_unsigned_int([5, 3, 7, 8]); + + let (d, c) = unsafe { vec_find_any_eq_cc(a, b) }; + assert_eq!(c, 1); + assert_eq!(d.as_array(), &[0, 0, -1, 0]); + + let a = vector_unsigned_int([1, 2, 3, 4]); + let b = vector_unsigned_int([5, 6, 7, 8]); + let (d, c) = unsafe { vec_find_any_eq_cc(a, b) }; + assert_eq!(c, 3); + assert_eq!(d.as_array(), &[0, 0, 0, 0]); + } + + #[simd_test(enable = "vector")] + fn test_vec_find_any_ne_cc() { + let a = vector_unsigned_int([1, 2, 3, 4]); + let b = vector_unsigned_int([5, 3, 7, 8]); + + let (d, c) = unsafe { vec_find_any_ne_cc(a, b) }; + assert_eq!(c, 1); + assert_eq!(d.as_array(), &[-1, -1, 0, -1]); + + let a = vector_unsigned_int([1, 2, 3, 4]); + let b = vector_unsigned_int([1, 2, 3, 4]); + let (d, c) = unsafe { vec_find_any_ne_cc(a, b) }; + assert_eq!(c, 3); + assert_eq!(d.as_array(), &[0, 0, 0, 0]); + } + + #[simd_test(enable = "vector")] + fn test_vec_find_any_eq_idx_cc() { + let a = vector_unsigned_int([1, 2, 3, 4]); + let b = vector_unsigned_int([5, 3, 7, 8]); + + let (d, c) = unsafe { vec_find_any_eq_idx_cc(a, b) }; + assert_eq!(c, 1); + assert_eq!(d.as_array(), &[0, 8, 0, 0]); + + let a = vector_unsigned_int([1, 2, 3, 4]); + let b = vector_unsigned_int([5, 6, 7, 8]); + let (d, c) = unsafe { vec_find_any_eq_idx_cc(a, b) }; + assert_eq!(c, 3); + assert_eq!(d.as_array(), &[0, 16, 0, 0]); + } + + #[simd_test(enable = "vector")] + fn test_vec_find_any_ne_idx_cc() { + let a = vector_unsigned_int([5, 2, 3, 4]); + let b = vector_unsigned_int([5, 3, 7, 8]); + + let (d, c) = unsafe { vec_find_any_ne_idx_cc(a, b) }; + assert_eq!(c, 1); + assert_eq!(d.as_array(), &[0, 4, 0, 0]); + + let a = vector_unsigned_int([1, 2, 3, 4]); + let b = vector_unsigned_int([1, 2, 3, 4]); + let (d, c) = unsafe { vec_find_any_ne_idx_cc(a, b) }; + assert_eq!(c, 3); + assert_eq!(d.as_array(), &[0, 16, 0, 0]); + } + + #[simd_test(enable = "vector")] + fn test_vec_find_any_eq_or_0_idx_cc() { + // if no element of a matches any element of b with an equal value, and there is at least one element from a with a value of 0 + let a = vector_unsigned_int([0, 1, 2, 3]); + let b = vector_unsigned_int([4, 5, 6, 7]); + let (d, c) = unsafe { vec_find_any_eq_or_0_idx_cc(a, b) }; + assert_eq!(c, 0); + assert_eq!(d.as_array(), &[0, 0, 0, 0]); + + // if at least one element of a matches any element of b with an equal value, and no elements of a with a value of 0 + let a = vector_unsigned_int([1, 2, 3, 4]); + let b = vector_unsigned_int([5, 2, 3, 4]); + let (d, c) = unsafe { vec_find_any_eq_or_0_idx_cc(a, b) }; + assert_eq!(c, 1); + 
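+ // cc == 1: a match was found and no element of `a` is zero; as the surrounding index tests show, the byte offset of the first matching element (element 1, i.e. 1 * 4 = 4 bytes) is reported in lane 1 of `d`.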
assert_eq!(d.as_array(), &[0, 4, 0, 0]); + + // if at least one element of a matches any element of b with an equal value, and there is at least one element from a has a value of 0 + let a = vector_unsigned_int([1, 2, 3, 0]); + let b = vector_unsigned_int([1, 2, 3, 4]); + let (d, c) = unsafe { vec_find_any_eq_or_0_idx_cc(a, b) }; + assert_eq!(c, 2); + assert_eq!(d.as_array(), &[0, 0, 0, 0]); + + // if no element of a matches any element of b with an equal value, and there is no element from a with a value of 0. + let a = vector_unsigned_int([1, 2, 3, 4]); + let b = vector_unsigned_int([5, 6, 7, 8]); + let (d, c) = unsafe { vec_find_any_eq_or_0_idx_cc(a, b) }; + assert_eq!(c, 3); + assert_eq!(d.as_array(), &[0, 16, 0, 0]); + } + + #[simd_test(enable = "vector")] + fn test_vec_find_any_ne_or_0_idx_cc() { + // if no element of a matches any element of b with a not equal value, and there is at least one element from a with a value of 0. + let a = vector_unsigned_int([0, 1, 2, 3]); + let b = vector_unsigned_int([4, 1, 2, 3]); + let (d, c) = unsafe { vec_find_any_ne_or_0_idx_cc(a, b) }; + assert_eq!(c, 0); + assert_eq!(d.as_array(), &[0, 0, 0, 0]); + + // if at least one element of a matches any element of b with a not equal value, and no elements of a with a value of 0. + let a = vector_unsigned_int([4, 2, 3, 4]); + let b = vector_unsigned_int([4, 5, 6, 7]); + let (d, c) = unsafe { vec_find_any_ne_or_0_idx_cc(a, b) }; + assert_eq!(c, 1); + assert_eq!(d.as_array(), &[0, 4, 0, 0]); + + // if at least one element of a matches any element of b with a not equal value, and there is at least one element from a has a value of 0. + let a = vector_unsigned_int([1, 0, 1, 1]); + let b = vector_unsigned_int([4, 5, 6, 7]); + let (d, c) = unsafe { vec_find_any_ne_or_0_idx_cc(a, b) }; + assert_eq!(c, 2); + assert_eq!(d.as_array(), &[0, 0, 0, 0]); + + // if no element of a matches any element of b with a not equal value, and there is no element from a with a value of 0. 
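+ // expected below: condition code 3 and byte index 16 (one past the last byte), i.e. nothing was found.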
+ let a = vector_unsigned_int([4, 4, 4, 4]); + let b = vector_unsigned_int([4, 5, 6, 7]); + let (d, c) = unsafe { vec_find_any_ne_or_0_idx_cc(a, b) }; + assert_eq!(c, 3); + assert_eq!(d.as_array(), &[0, 16, 0, 0]); + } + + #[simd_test(enable = "vector")] + fn test_vector_load() { + let expected = [0xAAAA_AAAA, 0xBBBB_BBBB, 0xCCCC_CCCC, 0xDDDD_DDDD]; + + let source: [u32; 8] = [ + 0xAAAA_AAAA, + 0xBBBB_BBBB, + 0xCCCC_CCCC, + 0xDDDD_DDDD, + 0, + 0, + 0, + 0, + ]; + assert_eq!( + unsafe { vec_xl::(0, source.as_ptr()) }.as_array(), + &expected + ); + + // offset is in bytes + let source: [u32; 8] = [ + 0x0000_AAAA, + 0xAAAA_BBBB, + 0xBBBB_CCCC, + 0xCCCC_DDDD, + 0xDDDD_0000, + 0, + 0, + 0, + ]; + assert_eq!( + unsafe { vec_xl::(2, source.as_ptr()) }.as_array(), + &expected + ); + } + + #[simd_test(enable = "vector")] + fn test_vector_store() { + let vec = vector_unsigned_int([0xAAAA_AAAA, 0xBBBB_BBBB, 0xCCCC_CCCC, 0xDDDD_DDDD]); + + let mut dest = [0u32; 8]; + unsafe { vec_xst(vec, 0, dest.as_mut_ptr()) }; + assert_eq!( + dest, + [ + 0xAAAA_AAAA, + 0xBBBB_BBBB, + 0xCCCC_CCCC, + 0xDDDD_DDDD, + 0, + 0, + 0, + 0 + ] + ); + + // offset is in bytes + let mut dest = [0u32; 8]; + unsafe { vec_xst(vec, 2, dest.as_mut_ptr()) }; + assert_eq!( + dest, + [ + 0x0000_AAAA, + 0xAAAA_BBBB, + 0xBBBB_CCCC, + 0xCCCC_DDDD, + 0xDDDD_0000, + 0, + 0, + 0, + ] + ); + } + + #[simd_test(enable = "vector")] + fn test_vector_lcbb() { + #[repr(align(64))] + struct Align64(T); + + static ARRAY: Align64<[u8; 128]> = Align64([0; 128]); + + assert_eq!(unsafe { __lcbb::<64>(ARRAY.0[64..].as_ptr()) }, 16); + assert_eq!(unsafe { __lcbb::<64>(ARRAY.0[63..].as_ptr()) }, 1); + assert_eq!(unsafe { __lcbb::<64>(ARRAY.0[56..].as_ptr()) }, 8); + assert_eq!(unsafe { __lcbb::<64>(ARRAY.0[48..].as_ptr()) }, 16); + } + + test_vec_2! { test_vec_pack, vec_pack, i16x8, i16x8 -> i8x16, + [0, 1, -1, 42, 32767, -32768, 30000, -30000], + [32767, -32768, 12345, -12345, 0, 1, -1, 42], + [0, 1, -1, 42, -1, 0, 48, -48, -1, 0, 57, -57, 0, 1, -1, 42] + } + + test_vec_2! { test_vec_packs, vec_packs, i16x8, i16x8 -> i8x16, + [0, 1, -1, 42, 32767, -32768, 30000, -30000], + [32767, -32768, 12345, -12345, 0, 1, -1, 42], + [0, 1, -1, 42, 127, -128, 127, -128, 127, -128, 127, -128, 0, 1, -1, 42] + } + + test_vec_2! { test_vec_packsu_signed, vec_packsu, i16x8, i16x8 -> u8x16, + [0, 1, -1, 42, 32767, -32768, 30000, -30000], + [32767, -32768, 12345, -12345, 0, 1, -1, 42], + [0, 1, 0, 42, 255, 0, 255, 0, 255, 0, 255, 0, 0, 1, 0, 42] + } + + test_vec_2! { test_vec_packsu_unsigned, vec_packsu, u16x8, u16x8 -> u8x16, + [65535, 32768, 1234, 5678, 16, 8, 4, 2], + [30000, 25000, 20000, 15000, 31, 63, 127, 255], + [255, 255, 255, 255, 16, 8, 4, 2, 255, 255, 255, 255, 31, 63, 127, 255] + } + + test_vec_2! { test_vec_rl, vec_rl, u32x4, + [0x12345678, 0x9ABCDEF0, 0x0F0F0F0F, 0x12345678], + [4, 8, 12, 68], + [0x23456781, 0xBCDEF09A, 0xF0F0F0F0, 0x23456781] + } + + test_vec_1! { test_vec_unpackh_i, vec_unpackh, i16x8 -> i32x4, + [0x1234, -2, 0x0F0F, -32768, 0, 0, 0, 0], + [0x1234, -2, 0x0F0F, -32768] + } + + test_vec_1! { test_vec_unpackh_u, vec_unpackh, u16x8 -> u32x4, + [0x1234, 0xFFFF, 0x0F0F, 0x8000, 0, 0, 0, 0], + [0x1234, 0xFFFF, 0x0F0F, 0x8000] + } + + test_vec_1! { test_vec_unpackl_i, vec_unpackl, i16x8 -> i32x4, + [0, 0, 0, 0, 0x1234, -2, 0x0F0F, -32768], + [0x1234, -2, 0x0F0F, -32768] + } + + test_vec_1! { test_vec_unpackl_u, vec_unpackl, u16x8 -> u32x4, + [0, 0, 0, 0, 0x1234, 0xFFFF, 0x0F0F, 0x8000], + [0x1234, 0xFFFF, 0x0F0F, 0x8000] + } + + test_vec_2! 
{ test_vec_avg, vec_avg, u32x4, + [2, 1, u32::MAX, 0], + [4, 2, 2, 0], + [3, (1u32 + 2).div_ceil(2), (u32::MAX as u64 + 2u64).div_ceil(2) as u32, 0] + } + + test_vec_2! { test_vec_checksum, vec_checksum, u32x4, + [1, 2, 3, u32::MAX], + [5, 6, 7, 8], + [0, 12, 0, 0] + } + + test_vec_2! { test_vec_add_u128, vec_add_u128, u8x16, + [0x01, 0x05, 0x0F, 0x1A, 0x2F, 0x3F, 0x50, 0x65, + 0x7A, 0x8F, 0x9A, 0xAD, 0xB0, 0xC3, 0xD5, 0xE8], + [0xF0, 0xEF, 0xC3, 0xB1, 0x92, 0x71, 0x5A, 0x43, + 0x3B, 0x29, 0x13, 0x04, 0xD7, 0xA1, 0x8C, 0x76], + [0xF1, 0xF4, 0xD2, 0xCB, 0xC1, 0xB0, 0xAA, 0xA8, 0xB5, 0xB8, 0xAD, 0xB2, 0x88, 0x65, 0x62, 0x5E] + } + + #[simd_test(enable = "vector")] + fn test_vec_addc_u128() { + unsafe { + let a = u128::MAX; + let b = 1u128; + + let d: u128 = transmute(vec_addc_u128(transmute(a), transmute(b))); + assert!(a.checked_add(b).is_none()); + assert_eq!(d, 1); + + let a = 1u128; + let b = 1u128; + + let d: u128 = transmute(vec_addc_u128(transmute(a), transmute(b))); + assert!(a.checked_add(b).is_some()); + assert_eq!(d, 0); + } + } + + #[simd_test(enable = "vector")] + fn test_vec_subc_u128() { + unsafe { + let a = 0u128; + let b = 1u128; + + let d: u128 = transmute(vec_subc_u128(transmute(a), transmute(b))); + assert!(a.checked_sub(b).is_none()); + assert_eq!(d, 0); + + let a = 1u128; + let b = 1u128; + + let d: u128 = transmute(vec_subc_u128(transmute(a), transmute(b))); + assert!(a.checked_sub(b).is_some()); + assert_eq!(d, 1); + } + } + + test_vec_2! { test_vec_mule_u, vec_mule, u16x8, u16x8 -> u32x4, + [0xFFFF, 0, 2, 0, 2, 0, 1, 0], + [0xFFFF, 0, 4, 0, 0xFFFF, 0, 2, 0], + [0xFFFE_0001, 8, 0x0001_FFFE, 2] + } + + test_vec_2! { test_vec_mule_i, vec_mule, i16x8, i16x8 -> i32x4, + [i16::MIN, 0, -2, 0, 2, 0, 1, 0], + [i16::MIN, 0, 4, 0, i16::MAX, 0, 2, 0], + [0x4000_0000, -8, 0xFFFE, 2] + } + + test_vec_2! { test_vec_mulo_u, vec_mulo, u16x8, u16x8 -> u32x4, + [0, 0xFFFF, 0, 2, 0, 2, 0, 1], + [0, 0xFFFF, 0, 4, 0, 0xFFFF, 0, 2], + [0xFFFE_0001, 8, 0x0001_FFFE, 2] + } + + test_vec_2! { test_vec_mulo_i, vec_mulo, i16x8, i16x8 -> i32x4, + [0, i16::MIN, 0, -2, 0, 2, 0, 1], + [0, i16::MIN, 0, 4, 0, i16::MAX, 0, 2], + [0x4000_0000, -8, 0xFFFE, 2] + } + + test_vec_2! { test_vec_mulh_u, vec_mulh, u32x4, u32x4 -> u32x4, + [u32::MAX, 2, 2, 1], + [u32::MAX, 4, u32::MAX, 2], + [u32::MAX - 1, 0, 1, 0] + } + + test_vec_2! { test_vec_mulh_i, vec_mulh, i32x4, i32x4 -> i32x4, + [i32::MIN, -2, 2, 1], + [i32::MIN, 4, i32::MAX, 2], + [0x4000_0000, -1, 0, 0] + } + + test_vec_2! { test_vec_gfmsum_1, vec_gfmsum, u16x8, u16x8 -> u32x4, + [0x1234, 0x5678, 0x9ABC, 0xDEF0, 0x1357, 0x2468, 0xACE0, 0xBDF0], + [0xFFFF, 0x0001, 0x8000, 0x7FFF, 0xAAAA, 0x5555, 0x1234, 0x5678], + [0xE13A794, 0x68764A50, 0x94AA3E, 0x2C93F300] + } + + test_vec_2! 
{ test_vec_gfmsum_2, vec_gfmsum, u16x8, u16x8 -> u32x4, + [0x0000, 0xFFFF, 0xAAAA, 0x5555, 0x1234, 0x5678, 0x9ABC, 0xDEF0], + [0xFFFF, 0x0000, 0x5555, 0xAAAA, 0x0001, 0x8000, 0x7FFF, 0x1357], + [0, 0, 0x2B3C1234, 0x3781D244] + } + + #[simd_test(enable = "vector")] + fn test_vec_gfmsum_128() { + let a = vector_unsigned_long_long([1, 2]); + let b = vector_unsigned_long_long([3, 4]); + + let d: u128 = unsafe { transmute(vec_gfmsum_128(a, b)) }; + assert_eq!(d, 11); + + let a = vector_unsigned_long_long([0x0101010101010101, 0x0202020202020202]); + let b = vector_unsigned_long_long([0x0404040404040404, 0x0505050505050505]); + + let d: u128 = unsafe { transmute(vec_gfmsum_128(a, b)) }; + assert_eq!(d, 0xE000E000E000E000E000E000E000E); + } + + #[simd_test(enable = "vector-enhancements-1")] + fn test_vec_bperm_u128() { + let a = vector_unsigned_char([65, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]); + let b = vector_unsigned_char([ + 0, 0, 0, 0, 1, 1, 1, 1, 128, 128, 128, 128, 255, 255, 255, 255, + ]); + let d = unsafe { vec_bperm_u128(a, b) }; + assert_eq!(d.as_array(), &[0xF00, 0]); + } + + #[simd_test(enable = "vector")] + fn test_vec_sel() { + let a = vector_signed_int([1, 2, 3, 4]); + let b = vector_signed_int([5, 6, 7, 8]); + + let e = vector_unsigned_int([9, 10, 11, 12]); + let f = vector_unsigned_int([9, 9, 11, 11]); + + let c: vector_bool_int = unsafe { simd_eq(e, f) }; + assert_eq!(c.as_array(), &[!0, 0, !0, 0]); + let d: vector_signed_int = unsafe { vec_sel(a, b, c) }; + assert_eq!(d.as_array(), &[5, 2, 7, 4]); + } + + #[simd_test(enable = "vector")] + fn test_vec_gather_element() { + let a1: [u32; 10] = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]; + let a2: [u32; 10] = [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]; + + let v1 = vector_unsigned_int([1, 2, 3, 4]); + let v2 = vector_unsigned_int([1, 2, 3, 4]); + + let sizeof_int = core::mem::size_of::() as u32; + let v3 = vector_unsigned_int([ + 5 * sizeof_int, + 8 * sizeof_int, + 9 * sizeof_int, + 6 * sizeof_int, + ]); + + unsafe { + let d1 = vec_gather_element::<_, 0>(v1, v3, a1.as_ptr()); + assert_eq!(d1.as_array(), &[15, 2, 3, 4]); + let d2 = vec_gather_element::<_, 0>(v2, v3, a2.as_ptr()); + assert_eq!(d2.as_array(), &[25, 2, 3, 4]); + } + } + + #[simd_test(enable = "vector")] + fn test_vec_fp_test_data_class() { + let mut cc = 42; + + let v1 = vector_double([0.0, f64::NAN]); + let v2 = vector_double([f64::INFINITY, 1.0]); + let v3 = vector_double([1.0, 2.0]); + + unsafe { + let d = vec_fp_test_data_class::<_, __VEC_CLASS_FP_ZERO>(v1, &mut cc); + assert_eq!(cc, 1); + assert_eq!(d.as_array(), &[!0, 0]); + + let d = vec_fp_test_data_class::<_, __VEC_CLASS_FP_NAN>(v1, &mut cc); + assert_eq!(cc, 1); + assert_eq!(d.as_array(), &[0, !0]); + + let d = vec_fp_test_data_class::<_, __VEC_CLASS_FP_INFINITY>(v2, &mut cc); + assert_eq!(cc, 1); + assert_eq!(d.as_array(), &[!0, 0]); + + let d = vec_fp_test_data_class::<_, __VEC_CLASS_FP_INFINITY_N>(v2, &mut cc); + assert_eq!(cc, 3); + assert_eq!(d.as_array(), &[0, 0]); + + let d = vec_fp_test_data_class::<_, __VEC_CLASS_FP_NORMAL>(v2, &mut cc); + assert_eq!(cc, 1); + assert_eq!(d.as_array(), &[0, !0]); + + let d = vec_fp_test_data_class::<_, __VEC_CLASS_FP_NORMAL>(v3, &mut cc); + assert_eq!(cc, 0); + assert_eq!(d.as_array(), &[!0, !0]); + } + } + + #[simd_test(enable = "vector")] + fn test_vec_fp_any_all_nan_numeric() { + unsafe { + assert_eq!( + vec_all_nan(vector_double([f64::NAN, f64::NAN])), + i32::from(true) + ); + assert_eq!( + vec_all_nan(vector_double([f64::NAN, 1.0])), + i32::from(false) + ); + 
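+ // vec_all_nan is only true when every lane is NaN; a single numeric lane is enough to make it return 0.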
assert_eq!(vec_all_nan(vector_double([0.0, 1.0])), i32::from(false)); + + assert_eq!( + vec_any_nan(vector_double([f64::NAN, f64::NAN])), + i32::from(true) + ); + assert_eq!(vec_any_nan(vector_double([f64::NAN, 1.0])), i32::from(true)); + assert_eq!(vec_any_nan(vector_double([0.0, 1.0])), i32::from(false)); + + assert_eq!( + vec_all_numeric(vector_double([f64::NAN, f64::NAN])), + i32::from(false) + ); + assert_eq!( + vec_all_numeric(vector_double([f64::NAN, 1.0])), + i32::from(false) + ); + assert_eq!(vec_all_numeric(vector_double([0.0, 1.0])), i32::from(true)); + + assert_eq!( + vec_any_numeric(vector_double([f64::NAN, f64::NAN])), + i32::from(false) + ); + assert_eq!( + vec_any_numeric(vector_double([f64::NAN, 1.0])), + i32::from(true) + ); + assert_eq!(vec_any_numeric(vector_double([0.0, 1.0])), i32::from(true)); + + // "numeric" means "not NaN". infinities are numeric + assert_eq!( + vec_all_numeric(vector_double([f64::INFINITY, f64::NEG_INFINITY])), + i32::from(true) + ); + assert_eq!( + vec_any_numeric(vector_double([f64::INFINITY, f64::NEG_INFINITY])), + i32::from(true) + ); + } + } + + #[simd_test(enable = "vector")] + fn test_vec_test_mask() { + unsafe { + let v = vector_unsigned_long_long([0xFF00FF00FF00FF00; 2]); + let m = vector_unsigned_long_long([0x0000FF000000FF00; 2]); + assert_eq!(vec_test_mask(v, m), 3); + + let v = vector_unsigned_long_long([u64::MAX; 2]); + let m = vector_unsigned_long_long([0; 2]); + assert_eq!(vec_test_mask(v, m), 0); + + let v = vector_unsigned_long_long([0; 2]); + let m = vector_unsigned_long_long([u64::MAX; 2]); + assert_eq!(vec_test_mask(v, m), 0); + + let v = vector_unsigned_long_long([0xAAAAAAAAAAAAAAAA; 2]); + let m = vector_unsigned_long_long([0xAAAAAAAAAAAAAAAA; 2]); + assert_eq!(vec_test_mask(v, m), 3); + } + } + + #[simd_test(enable = "vector-enhancements-2")] + fn test_vec_search_string_cc() { + unsafe { + let b = vector_unsigned_char(*b"ABCD------------"); + let c = vector_unsigned_char([4; 16]); + + let haystack = vector_unsigned_char(*b"__ABCD__________"); + let (result, d) = vec_search_string_cc(haystack, b, c); + assert_eq!(result.as_array()[7], 2); + assert_eq!(d, 2); + + let haystack = vector_unsigned_char(*b"___ABCD_________"); + let (result, d) = vec_search_string_cc(haystack, b, c); + assert_eq!(result.as_array()[7], 3); + assert_eq!(d, 2); + + let haystack = vector_unsigned_char(*b"________________"); + let (result, d) = vec_search_string_cc(haystack, b, c); + assert_eq!(result.as_array()[7], 16); + assert_eq!(d, 0); + + let haystack = vector_unsigned_char(*b"______\0_________"); + let (result, d) = vec_search_string_cc(haystack, b, c); + assert_eq!(result.as_array()[7], 16); + assert_eq!(d, 0); + + let haystack = vector_unsigned_char(*b"______\0__ABCD___"); + let (result, d) = vec_search_string_cc(haystack, b, c); + assert_eq!(result.as_array()[7], 9); + assert_eq!(d, 2); + } + } + + #[simd_test(enable = "vector-enhancements-2")] + fn test_vec_search_string_until_zero_cc() { + unsafe { + let b = vector_unsigned_char(*b"ABCD\0\0\0\0\0\0\0\0\0\0\0\0"); + let c = vector_unsigned_char([16; 16]); + + let haystack = vector_unsigned_char(*b"__ABCD__________"); + let (result, d) = vec_search_string_until_zero_cc(haystack, b, c); + assert_eq!(result.as_array()[7], 2); + assert_eq!(d, 2); + + let haystack = vector_unsigned_char(*b"___ABCD_________"); + let (result, d) = vec_search_string_until_zero_cc(haystack, b, c); + assert_eq!(result.as_array()[7], 3); + assert_eq!(d, 2); + + let haystack = vector_unsigned_char(*b"________________"); 
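+ // no occurrence of the needle and no zero byte in the haystack: expect position 16 ("not found") and condition code 0.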
+ let (result, d) = vec_search_string_until_zero_cc(haystack, b, c); + assert_eq!(result.as_array()[7], 16); + assert_eq!(d, 0); + + let haystack = vector_unsigned_char(*b"______\0_________"); + let (result, d) = vec_search_string_until_zero_cc(haystack, b, c); + assert_eq!(result.as_array()[7], 16); + assert_eq!(d, 1); + + let haystack = vector_unsigned_char(*b"______\0__ABCD___"); + let (result, d) = vec_search_string_until_zero_cc(haystack, b, c); + assert_eq!(result.as_array()[7], 16); + assert_eq!(d, 1); + } + } + + #[simd_test(enable = "vector")] + fn test_vec_doublee() { + unsafe { + let v = vector_float([1.0, 2.0, 3.0, 4.0]); + assert_eq!(vec_doublee(v).as_array(), &[1.0, 3.0]); + + let v = vector_float([f32::NAN, 2.0, f32::INFINITY, 4.0]); + let d = vec_doublee(v); + assert!(d.as_array()[0].is_nan()); + assert_eq!(d.as_array()[1], f64::INFINITY); + } + } + + #[simd_test(enable = "vector")] + fn test_vec_floate() { + // NOTE: indices 1 and 3 can have an arbitrary value. With the C version + // these are poison values, our version initializes the memory but its + // value still should not be relied upon by application code. + unsafe { + let v = vector_double([1.0, 2.0]); + let d = vec_floate(v); + assert_eq!(d.as_array()[0], 1.0); + assert_eq!(d.as_array()[2], 2.0); + + let v = vector_double([f64::NAN, f64::INFINITY]); + let d = vec_floate(v); + assert!(d.as_array()[0].is_nan()); + assert_eq!(d.as_array()[2], f32::INFINITY); + + let v = vector_double([f64::MIN, f64::MAX]); + let d = vec_floate(v); + assert_eq!(d.as_array()[0], f64::MIN as f32); + assert_eq!(d.as_array()[2], f64::MAX as f32); + } + } + + #[simd_test(enable = "vector")] + fn test_vec_extend_s64() { + unsafe { + let v = vector_signed_char([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); + assert_eq!(vec_extend_s64(v).as_array(), &[7, 15]); + + let v = vector_signed_short([0, 1, 2, 3, 4, 5, 6, 7]); + assert_eq!(vec_extend_s64(v).as_array(), &[3, 7]); + + let v = vector_signed_int([0, 1, 2, 3]); + assert_eq!(vec_extend_s64(v).as_array(), &[1, 3]); + } + } + + #[simd_test(enable = "vector")] + fn test_vec_signed() { + unsafe { + let v = vector_float([1.0, 2.5, -2.5, -0.0]); + assert_eq!(vec_signed(v).as_array(), &[1, 2, -2, 0]); + + let v = vector_double([2.5, -2.5]); + assert_eq!(vec_signed(v).as_array(), &[2, -2]); + } + } + + #[simd_test(enable = "vector")] + fn test_vec_unsigned() { + // NOTE: converting a negative floating point value is UB! 
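+ // The inputs below therefore stay non-negative; like vec_signed above, the conversion truncates towards zero (2.5 -> 2, 3.5 -> 3).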
+ unsafe { + let v = vector_float([1.0, 2.5, 3.5, 0.0]); + assert_eq!(vec_unsigned(v).as_array(), &[1, 2, 3, 0]); + + let v = vector_double([2.5, 3.5]); + assert_eq!(vec_unsigned(v).as_array(), &[2, 3]); + } + } + + #[simd_test(enable = "vector")] + fn test_vec_cp_until_zero() { + unsafe { + let v = vector_signed_int([1, 2, 3, 4]); + let d = vec_cp_until_zero(v); + assert_eq!(d.as_array(), &[1, 2, 3, 4]); + + let v = vector_signed_int([1, 2, 0, 4]); + let d = vec_cp_until_zero(v); + assert_eq!(d.as_array(), &[1, 2, 0, 0]); + } + } + + #[simd_test(enable = "vector")] + fn test_vec_cp_until_zero_cc() { + unsafe { + let v = vector_signed_int([1, 2, 3, 4]); + let (d, cc) = vec_cp_until_zero_cc(v); + assert_eq!(d.as_array(), &[1, 2, 3, 4]); + assert_eq!(cc, 3); + + let v = vector_signed_int([1, 2, 0, 4]); + let (d, cc) = vec_cp_until_zero_cc(v); + assert_eq!(d.as_array(), &[1, 2, 0, 0]); + assert_eq!(cc, 0); + } + } + + #[simd_test(enable = "vector-enhancements-1")] + fn test_vec_msum_u128() { + let a = vector_unsigned_long_long([1, 2]); + let b = vector_unsigned_long_long([3, 4]); + + unsafe { + let c: vector_unsigned_char = transmute(100u128); + + let d: u128 = transmute(vec_msum_u128::<0>(a, b, c)); + assert_eq!(d, (1 * 3) + (2 * 4) + 100); + + let d: u128 = transmute(vec_msum_u128::<4>(a, b, c)); + assert_eq!(d, (1 * 3) + (2 * 4) * 2 + 100); + + let d: u128 = transmute(vec_msum_u128::<8>(a, b, c)); + assert_eq!(d, (1 * 3) * 2 + (2 * 4) + 100); + + let d: u128 = transmute(vec_msum_u128::<12>(a, b, c)); + assert_eq!(d, (1 * 3) * 2 + (2 * 4) * 2 + 100); + } + } + + #[simd_test(enable = "vector")] + fn test_vec_sld() { + let a = vector_unsigned_long_long([0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA]); + let b = vector_unsigned_long_long([0xBBBBBBBBBBBBBBBB, 0xBBBBBBBBBBBBBBBB]); + + unsafe { + let d = vec_sld::<_, 4>(a, b); + assert_eq!(d.as_array(), &[0xAAAAAAAAAAAAAAAA, 0xAAAAAAAABBBBBBBB]); + } + } + + #[simd_test(enable = "vector")] + fn test_vec_sldw() { + let a = vector_unsigned_long_long([0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA]); + let b = vector_unsigned_long_long([0xBBBBBBBBBBBBBBBB, 0xBBBBBBBBBBBBBBBB]); + + unsafe { + let d = vec_sldw::<_, 1>(a, b); + assert_eq!(d.as_array(), &[0xAAAAAAAAAAAAAAAA, 0xAAAAAAAABBBBBBBB]); + } + } + + #[simd_test(enable = "vector-enhancements-2")] + fn test_vec_sldb() { + let a = vector_unsigned_long_long([0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA]); + let b = vector_unsigned_long_long([0xBBBBBBBBBBBBBBBB, 0xBBBBBBBBBBBBBBBB]); + + unsafe { + let d = vec_sldb::<_, 4>(a, b); + assert_eq!(d.as_array(), &[0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAB]); + } + } + + #[simd_test(enable = "vector-enhancements-2")] + fn test_vec_srdb() { + let a = vector_unsigned_long_long([0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA]); + let b = vector_unsigned_long_long([0xBBBBBBBBBBBBBBBB, 0xBBBBBBBBBBBBBBBB]); + + unsafe { + let d = vec_srdb::<_, 4>(a, b); + assert_eq!(d.as_array(), &[0xABBBBBBBBBBBBBBB, 0xBBBBBBBBBBBBBBBB]); + } + } + + const GT: u32 = 0x20000000; + const LT: u32 = 0x40000000; + const EQ: u32 = 0x80000000; + + #[simd_test(enable = "vector")] + fn test_vec_cmprg() { + let a = vector_unsigned_int([11, 22, 33, 44]); + let b = vector_unsigned_int([10, 20, 30, 40]); + + let c = vector_unsigned_int([GT, LT, GT, LT]); + let d = unsafe { vec_cmprg(a, b, c) }; + assert_eq!(d.as_array(), &[!0, 0, !0, 0]); + + let c = vector_unsigned_int([GT, LT, 0, 0]); + let d = unsafe { vec_cmprg(a, b, c) }; + assert_eq!(d.as_array(), &[!0, 0, 0, 0]); + + let a = vector_unsigned_int([11, 22, 33, 
30]); + let b = vector_unsigned_int([10, 20, 30, 30]); + + let c = vector_unsigned_int([GT, LT, EQ, EQ]); + let d = unsafe { vec_cmprg(a, b, c) }; + assert_eq!(d.as_array(), &[!0, 0, 0, !0]); + } + + #[simd_test(enable = "vector")] + fn test_vec_cmpnrg() { + let a = vector_unsigned_int([11, 22, 33, 44]); + let b = vector_unsigned_int([10, 20, 30, 40]); + + let c = vector_unsigned_int([GT, LT, GT, LT]); + let d = unsafe { vec_cmpnrg(a, b, c) }; + assert_eq!(d.as_array(), &[0, !0, 0, !0]); + + let c = vector_unsigned_int([GT, LT, 0, 0]); + let d = unsafe { vec_cmpnrg(a, b, c) }; + assert_eq!(d.as_array(), &[0, !0, !0, !0]); + + let a = vector_unsigned_int([11, 22, 33, 30]); + let b = vector_unsigned_int([10, 20, 30, 30]); + + let c = vector_unsigned_int([GT, LT, EQ, EQ]); + let d = unsafe { vec_cmpnrg(a, b, c) }; + assert_eq!(d.as_array(), &[0, !0, !0, 0]); + } + + #[simd_test(enable = "vector")] + fn test_vec_cmprg_idx() { + let a = vector_unsigned_int([1, 11, 22, 33]); + let b = vector_unsigned_int([10, 20, 30, 40]); + + let c = vector_unsigned_int([GT, LT, GT, LT]); + let d = unsafe { vec_cmprg_idx(a, b, c) }; + assert_eq!(d.as_array(), &[0, 4, 0, 0]); + } + + #[simd_test(enable = "vector")] + fn test_vec_cmpnrg_idx() { + let a = vector_unsigned_int([1, 11, 22, 33]); + let b = vector_unsigned_int([10, 20, 30, 40]); + + let c = vector_unsigned_int([GT, LT, GT, LT]); + let d = unsafe { vec_cmpnrg_idx(a, b, c) }; + assert_eq!(d.as_array(), &[0, 0, 0, 0]); + } + + #[simd_test(enable = "vector")] + fn test_vec_cmprg_or_0_idx() { + let a = vector_unsigned_int([1, 0, 22, 33]); + let b = vector_unsigned_int([10, 20, 30, 40]); + + let c = vector_unsigned_int([GT, LT, GT, LT]); + let d = unsafe { vec_cmprg_or_0_idx(a, b, c) }; + assert_eq!(d.as_array(), &[0, 4, 0, 0]); + } + + #[simd_test(enable = "vector")] + fn test_vec_cmpnrg_or_0_idx() { + let a = vector_unsigned_int([11, 33, 0, 22]); + let b = vector_unsigned_int([10, 20, 30, 40]); + + let c = vector_unsigned_int([GT, LT, GT, LT]); + let d = unsafe { vec_cmpnrg_or_0_idx(a, b, c) }; + assert_eq!(d.as_array(), &[0, 8, 0, 0]); + } + + test_vec_2! { test_vec_cmpgt, vec_cmpgt, f32x4, f32x4 -> i32x4, + [1.0, f32::NAN, f32::NAN, 3.14], + [2.0, f32::NAN, 5.0, 2.0], + [0, 0, 0, !0] + } + + test_vec_2! { test_vec_cmpge, vec_cmpge, f32x4, f32x4 -> i32x4, + [1.0, f32::NAN, f32::NAN, 3.14], + [1.0, f32::NAN, 5.0, 2.0], + [!0, 0, 0, !0] + } + + test_vec_2! { test_vec_cmplt, vec_cmplt, f32x4, f32x4 -> i32x4, + [1.0, f32::NAN, f32::NAN, 2.0], + [2.0, f32::NAN, 5.0, 2.0], + [!0, 0, 0, 0] + } + + test_vec_2! { test_vec_cmple, vec_cmple, f32x4, f32x4 -> i32x4, + [1.0, f32::NAN, f32::NAN, 2.0], + [1.0, f32::NAN, 5.0, 3.14], + [!0, 0, 0, !0] + } + + test_vec_2! { test_vec_cmpeq, vec_cmpeq, f32x4, f32x4 -> i32x4, + [1.0, f32::NAN, f32::NAN, 2.0], + [1.0, f32::NAN, 5.0, 3.14], + [!0, 0, 0, 0] + } + + test_vec_2! 
{ test_vec_cmpne, vec_cmpne, f32x4, f32x4 -> i32x4, + [1.0, f32::NAN, f32::NAN, 2.0], + [1.0, f32::NAN, 5.0, 3.14], + [0, !0, !0, !0] + } + + #[simd_test(enable = "vector")] + fn test_vec_meadd() { + let a = vector_unsigned_short([1, 0, 2, 0, 3, 0, 4, 0]); + let b = vector_unsigned_short([5, 0, 6, 0, 7, 0, 8, 0]); + let c = vector_unsigned_int([2, 2, 2, 2]); + + let d = unsafe { vec_meadd(a, b, c) }; + assert_eq!(d.as_array(), &[7, 14, 23, 34]); + + let a = vector_signed_short([1, 0, 2, 0, 3, 0, 4, 0]); + let b = vector_signed_short([5, 0, 6, 0, 7, 0, 8, 0]); + let c = vector_signed_int([2, -2, 2, -2]); + + let d = unsafe { vec_meadd(a, b, c) }; + assert_eq!(d.as_array(), &[7, 10, 23, 30]); + } + + #[simd_test(enable = "vector")] + fn test_vec_moadd() { + let a = vector_unsigned_short([0, 1, 0, 2, 0, 3, 0, 4]); + let b = vector_unsigned_short([0, 5, 0, 6, 0, 7, 0, 8]); + let c = vector_unsigned_int([2, 2, 2, 2]); + + let d = unsafe { vec_moadd(a, b, c) }; + assert_eq!(d.as_array(), &[7, 14, 23, 34]); + + let a = vector_signed_short([0, 1, 0, 2, 0, 3, 0, 4]); + let b = vector_signed_short([0, 5, 0, 6, 0, 7, 0, 8]); + let c = vector_signed_int([2, -2, 2, -2]); + + let d = unsafe { vec_moadd(a, b, c) }; + assert_eq!(d.as_array(), &[7, 10, 23, 30]); + } + + #[simd_test(enable = "vector")] + fn test_vec_mhadd() { + let a = vector_unsigned_int([1, 2, 3, 4]); + let b = vector_unsigned_int([5, 6, 7, 8]); + let c = vector_unsigned_int([u32::MAX; 4]); + + let d = unsafe { vec_mhadd(a, b, c) }; + assert_eq!(d.as_array(), &[1, 1, 1, 1]); + + let a = vector_signed_int([-1, -2, -3, -4]); + let b = vector_signed_int([5, 6, 7, 8]); + let c = vector_signed_int([i32::MIN; 4]); + + let d = unsafe { vec_mhadd(a, b, c) }; + assert_eq!(d.as_array(), &[-1, -1, -1, -1]); + } + + #[simd_test(enable = "vector")] + fn test_vec_mladd() { + let a = vector_unsigned_int([1, 2, 3, 4]); + let b = vector_unsigned_int([5, 6, 7, 8]); + let c = vector_unsigned_int([2, 2, 2, 2]); + + let d = unsafe { vec_mladd(a, b, c) }; + assert_eq!(d.as_array(), &[7, 14, 23, 34]); + + let a = vector_signed_int([-1, -2, -3, -4]); + let b = vector_signed_int([5, 6, 7, 8]); + let c = vector_signed_int([2, 2, 2, 2]); + + let d = unsafe { vec_mladd(a, b, c) }; + assert_eq!(d.as_array(), &[-3, -10, -19, -30]); + } + + #[simd_test(enable = "vector")] + fn test_vec_extract() { + let v = vector_unsigned_int([1, 2, 3, 4]); + + assert_eq!(unsafe { vec_extract(v, 1) }, 2); + assert_eq!(unsafe { vec_extract(v, 4 + 2) }, 3); + } + + #[simd_test(enable = "vector")] + fn test_vec_insert() { + let mut v = vector_unsigned_int([1, 2, 3, 4]); + + v = unsafe { vec_insert(42, v, 1) }; + assert_eq!(v.as_array(), &[1, 42, 3, 4]); + + v = unsafe { vec_insert(64, v, 6) }; + assert_eq!(v.as_array(), &[1, 42, 64, 4]); + } + + #[simd_test(enable = "vector")] + fn test_vec_promote() { + let v: vector_unsigned_int = unsafe { vec_promote(42, 1).assume_init() }; + assert_eq!(v.as_array(), &[0, 42, 0, 0]); + } + + #[simd_test(enable = "vector")] + fn test_vec_insert_and_zero() { + let v = unsafe { vec_insert_and_zero::(&42u32) }; + assert_eq!(v.as_array(), vector_unsigned_int([0, 42, 0, 0]).as_array()); + } +} diff --git a/library/stdarch/crates/core_arch/src/simd.rs b/library/stdarch/crates/core_arch/src/simd.rs new file mode 100644 index 0000000000000..25834943f009b --- /dev/null +++ b/library/stdarch/crates/core_arch/src/simd.rs @@ -0,0 +1,1021 @@ +//! Internal `#[repr(simd)]` types + +#![allow(non_camel_case_types)] + +macro_rules! 
simd_ty { + ($id:ident [$elem_type:ty ; $len:literal]: $($param_name:ident),*) => { + #[repr(simd)] + #[derive(Copy, Clone)] + pub(crate) struct $id([$elem_type; $len]); + + #[allow(clippy::use_self)] + impl $id { + /// A value of this type where all elements are zeroed out. + pub(crate) const ZERO: Self = unsafe { crate::mem::zeroed() }; + + #[inline(always)] + pub(crate) const fn new($($param_name: $elem_type),*) -> Self { + $id([$($param_name),*]) + } + #[inline(always)] + pub(crate) const fn from_array(elements: [$elem_type; $len]) -> Self { + $id(elements) + } + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) fn splat(value: $elem_type) -> Self { + #[derive(Copy, Clone)] + #[repr(simd)] + struct JustOne([$elem_type; 1]); + let one = JustOne([value]); + // SAFETY: 0 is always in-bounds because we're shuffling + // a simd type with exactly one element. + unsafe { simd_shuffle!(one, one, [0; $len]) } + } + + /// Extract the element at position `index`. + /// `index` is not a constant so this is not efficient! + /// Use for testing only. + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) fn extract(&self, index: usize) -> $elem_type { + self.as_array()[index] + } + + #[inline] + pub(crate) fn as_array(&self) -> &[$elem_type; $len] { + let simd_ptr: *const Self = self; + let array_ptr: *const [$elem_type; $len] = simd_ptr.cast(); + // SAFETY: We can always read the prefix of a simd type as an array. + // There might be more padding afterwards for some widths, but + // that's not a problem for reading less than that. + unsafe { &*array_ptr } + } + } + + impl core::cmp::PartialEq for $id { + #[inline] + fn eq(&self, other: &Self) -> bool { + self.as_array() == other.as_array() + } + } + + impl core::fmt::Debug for $id { + #[inline] + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + debug_simd_finish(f, stringify!($id), self.as_array()) + } + } + } +} + +macro_rules! simd_m_ty { + ($id:ident [$elem_type:ident ; $len:literal]: $($param_name:ident),*) => { + #[repr(simd)] + #[derive(Copy, Clone)] + pub(crate) struct $id([$elem_type; $len]); + + #[allow(clippy::use_self)] + impl $id { + #[inline(always)] + const fn bool_to_internal(x: bool) -> $elem_type { + [0 as $elem_type, !(0 as $elem_type)][x as usize] + } + + #[inline(always)] + pub(crate) const fn new($($param_name: bool),*) -> Self { + $id([$(Self::bool_to_internal($param_name)),*]) + } + + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) fn splat(value: bool) -> Self { + #[derive(Copy, Clone)] + #[repr(simd)] + struct JustOne([$elem_type; 1]); + let one = JustOne([Self::bool_to_internal(value)]); + // SAFETY: 0 is always in-bounds because we're shuffling + // a simd type with exactly one element. + unsafe { simd_shuffle!(one, one, [0; $len]) } + } + + #[inline] + pub(crate) fn as_array(&self) -> &[$elem_type; $len] { + let simd_ptr: *const Self = self; + let array_ptr: *const [$elem_type; $len] = simd_ptr.cast(); + // SAFETY: We can always read the prefix of a simd type as an array. + // There might be more padding afterwards for some widths, but + // that's not a problem for reading less than that. 
+ unsafe { &*array_ptr } + } + } + + impl core::cmp::PartialEq for $id { + #[inline] + fn eq(&self, other: &Self) -> bool { + self.as_array() == other.as_array() + } + } + + impl core::fmt::Debug for $id { + #[inline] + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + debug_simd_finish(f, stringify!($id), self.as_array()) + } + } + } +} + +// 16-bit wide types: + +simd_ty!(u8x2[u8;2]: x0, x1); +simd_ty!(i8x2[i8;2]: x0, x1); + +// 32-bit wide types: + +simd_ty!(u8x4[u8;4]: x0, x1, x2, x3); +simd_ty!(u16x2[u16;2]: x0, x1); + +simd_ty!(i8x4[i8;4]: x0, x1, x2, x3); +simd_ty!(i16x2[i16;2]: x0, x1); + +// 64-bit wide types: + +simd_ty!( + u8x8[u8;8]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7 +); +simd_ty!(u16x4[u16;4]: x0, x1, x2, x3); +simd_ty!(u32x2[u32;2]: x0, x1); +simd_ty!(u64x1[u64;1]: x1); + +simd_ty!( + i8x8[i8;8]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7 +); +simd_ty!(i16x4[i16;4]: x0, x1, x2, x3); +simd_ty!(i32x2[i32;2]: x0, x1); +simd_ty!(i64x1[i64;1]: x1); + +simd_ty!(f32x2[f32;2]: x0, x1); +simd_ty!(f64x1[f64;1]: x1); + +// 128-bit wide types: + +simd_ty!( + u8x16[u8;16]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15 +); +simd_ty!( + u16x8[u16;8]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7 +); +simd_ty!(u32x4[u32;4]: x0, x1, x2, x3); +simd_ty!(u64x2[u64;2]: x0, x1); + +simd_ty!( + i8x16[i8;16]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15 +); +simd_ty!( + i16x8[i16;8]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7 +); +simd_ty!(i32x4[i32;4]: x0, x1, x2, x3); +simd_ty!(i64x2[i64;2]: x0, x1); + +simd_ty!(f16x4[f16;4]: x0, x1, x2, x3); + +simd_ty!( + f16x8[f16;8]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7 +); +simd_ty!(f32x4[f32;4]: x0, x1, x2, x3); +simd_ty!(f64x2[f64;2]: x0, x1); + +simd_m_ty!( + m8x16[i8;16]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15 +); +simd_m_ty!( + m16x8[i16;8]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7 +); +simd_m_ty!(m32x4[i32;4]: x0, x1, x2, x3); +simd_m_ty!(m64x2[i64;2]: x0, x1); + +// 256-bit wide types: + +simd_ty!( + u8x32[u8;32]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15, + x16, + x17, + x18, + x19, + x20, + x21, + x22, + x23, + x24, + x25, + x26, + x27, + x28, + x29, + x30, + x31 +); +simd_ty!( + u16x16[u16;16]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15 +); +simd_ty!( + u32x8[u32;8]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7 +); +simd_ty!(u64x4[u64;4]: x0, x1, x2, x3); + +simd_ty!( + i8x32[i8;32]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15, + x16, + x17, + x18, + x19, + x20, + x21, + x22, + x23, + x24, + x25, + x26, + x27, + x28, + x29, + x30, + x31 +); +simd_ty!( + i16x16[i16;16]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15 +); +simd_ty!( + i32x8[i32;8]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7 +); +simd_ty!(i64x4[i64;4]: x0, x1, x2, x3); + +simd_ty!( + f16x16[f16;16]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15 +); +simd_ty!( + f32x8[f32;8]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7 +); +simd_ty!(f64x4[f64;4]: x0, x1, x2, x3); + +simd_m_ty!( + m8x32[i8;32]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + 
x12, + x13, + x14, + x15, + x16, + x17, + x18, + x19, + x20, + x21, + x22, + x23, + x24, + x25, + x26, + x27, + x28, + x29, + x30, + x31 +); +simd_m_ty!( + m16x16[i16;16]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15 +); +simd_m_ty!( + m32x8[i32;8]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7 +); + +// 512-bit wide types: + +simd_ty!( + i8x64[i8;64]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15, + x16, + x17, + x18, + x19, + x20, + x21, + x22, + x23, + x24, + x25, + x26, + x27, + x28, + x29, + x30, + x31, + x32, + x33, + x34, + x35, + x36, + x37, + x38, + x39, + x40, + x41, + x42, + x43, + x44, + x45, + x46, + x47, + x48, + x49, + x50, + x51, + x52, + x53, + x54, + x55, + x56, + x57, + x58, + x59, + x60, + x61, + x62, + x63 +); + +simd_ty!( + u8x64[u8;64]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15, + x16, + x17, + x18, + x19, + x20, + x21, + x22, + x23, + x24, + x25, + x26, + x27, + x28, + x29, + x30, + x31, + x32, + x33, + x34, + x35, + x36, + x37, + x38, + x39, + x40, + x41, + x42, + x43, + x44, + x45, + x46, + x47, + x48, + x49, + x50, + x51, + x52, + x53, + x54, + x55, + x56, + x57, + x58, + x59, + x60, + x61, + x62, + x63 +); + +simd_ty!( + i16x32[i16;32]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15, + x16, + x17, + x18, + x19, + x20, + x21, + x22, + x23, + x24, + x25, + x26, + x27, + x28, + x29, + x30, + x31 +); + +simd_ty!( + u16x32[u16;32]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15, + x16, + x17, + x18, + x19, + x20, + x21, + x22, + x23, + x24, + x25, + x26, + x27, + x28, + x29, + x30, + x31 +); + +simd_ty!( + i32x16[i32;16]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15 +); + +simd_ty!( + u32x16[u32;16]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15 +); + +simd_ty!( + f16x32[f16;32]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15, + x16, + x17, + x18, + x19, + x20, + x21, + x22, + x23, + x24, + x25, + x26, + x27, + x28, + x29, + x30, + x31 +); +simd_ty!( + f32x16[f32;16]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15 +); + +simd_ty!( + i64x8[i64;8]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7 +); + +simd_ty!( + u64x8[u64;8]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7 +); + +simd_ty!( + f64x8[f64;8]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7 +); + +// 1024-bit wide types: +simd_ty!( + u16x64[u16;64]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15, + x16, + x17, + x18, + x19, + x20, + x21, + x22, + x23, + x24, + x25, + x26, + x27, + x28, + x29, + x30, + x31, + x32, + x33, + x34, + x35, + x36, + x37, + x38, + x39, + x40, + x41, + x42, + x43, + x44, + x45, + x46, + x47, + x48, + x49, + x50, + x51, + x52, + x53, + x54, + x55, + x56, + x57, + x58, + x59, + x60, + x61, + x62, + x63 +); +simd_ty!( + i32x32[i32;32]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15, + x16, + x17, + x18, + x19, + x20, + x21, + x22, + x23, + x24, + x25, + x26, + x27, + x28, + x29, + x30, + x31 +); +simd_ty!( + u32x32[u32;32]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + 
x11, + x12, + x13, + x14, + x15, + x16, + x17, + x18, + x19, + x20, + x21, + x22, + x23, + x24, + x25, + x26, + x27, + x28, + x29, + x30, + x31 +); + +/// Used to continue `Debug`ging SIMD types as `MySimd(1, 2, 3, 4)`, as they +/// were before moving to array-based simd. +#[inline] +pub(crate) fn debug_simd_finish( + formatter: &mut crate::fmt::Formatter<'_>, + type_name: &str, + array: &[T; N], +) -> crate::fmt::Result { + crate::fmt::Formatter::debug_tuple_fields_finish( + formatter, + type_name, + &crate::array::from_fn::<&dyn crate::fmt::Debug, N, _>(|i| &array[i]), + ) +} diff --git a/library/stdarch/crates/core_arch/src/wasm32/atomic.rs b/library/stdarch/crates/core_arch/src/wasm32/atomic.rs new file mode 100644 index 0000000000000..fdc8cfbfdb414 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/wasm32/atomic.rs @@ -0,0 +1,96 @@ +#[cfg(test)] +use stdarch_test::assert_instr; + +unsafe extern "unadjusted" { + #[link_name = "llvm.wasm.memory.atomic.wait32"] + fn llvm_atomic_wait_i32(ptr: *mut i32, exp: i32, timeout: i64) -> i32; + #[link_name = "llvm.wasm.memory.atomic.wait64"] + fn llvm_atomic_wait_i64(ptr: *mut i64, exp: i64, timeout: i64) -> i32; + #[link_name = "llvm.wasm.memory.atomic.notify"] + fn llvm_atomic_notify(ptr: *mut i32, cnt: i32) -> i32; +} + +/// Corresponding intrinsic to wasm's [`memory.atomic.wait32` instruction][instr] +/// +/// This function, when called, will block the current thread if the memory +/// pointed to by `ptr` is equal to `expression` (performing this action +/// atomically). +/// +/// The argument `timeout_ns` is a maximum number of nanoseconds the calling +/// thread will be blocked for, if it blocks. If the timeout is negative then +/// the calling thread will be blocked forever. +/// +/// The calling thread can only be woken up with a call to the `wake` intrinsic +/// once it has been blocked. Changing the memory behind `ptr` will not wake +/// the thread once it's blocked. +/// +/// # Return value +/// +/// * 0 - indicates that the thread blocked and then was woken up +/// * 1 - the loaded value from `ptr` didn't match `expression`, the thread +/// didn't block +/// * 2 - the thread blocked, but the timeout expired. +/// +/// [instr]: https://webassembly.github.io/threads/core/syntax/instructions.html#syntax-instr-atomic-memory +#[inline] +#[cfg_attr(test, assert_instr(memory.atomic.wait32))] +#[target_feature(enable = "atomics")] +#[doc(alias("memory.atomic.wait32"))] +#[unstable(feature = "stdarch_wasm_atomic_wait", issue = "77839")] +pub unsafe fn memory_atomic_wait32(ptr: *mut i32, expression: i32, timeout_ns: i64) -> i32 { + llvm_atomic_wait_i32(ptr, expression, timeout_ns) +} + +/// Corresponding intrinsic to wasm's [`memory.atomic.wait64` instruction][instr] +/// +/// This function, when called, will block the current thread if the memory +/// pointed to by `ptr` is equal to `expression` (performing this action +/// atomically). +/// +/// The argument `timeout_ns` is a maximum number of nanoseconds the calling +/// thread will be blocked for, if it blocks. If the timeout is negative then +/// the calling thread will be blocked forever. +/// +/// The calling thread can only be woken up with a call to the `wake` intrinsic +/// once it has been blocked. Changing the memory behind `ptr` will not wake +/// the thread once it's blocked. 
+/// +/// # Return value +/// +/// * 0 - indicates that the thread blocked and then was woken up +/// * 1 - the loaded value from `ptr` didn't match `expression`, the thread +/// didn't block +/// * 2 - the thread blocked, but the timeout expired. +/// +/// [instr]: https://webassembly.github.io/threads/core/syntax/instructions.html#syntax-instr-atomic-memory +#[inline] +#[cfg_attr(test, assert_instr(memory.atomic.wait64))] +#[target_feature(enable = "atomics")] +#[doc(alias("memory.atomic.wait64"))] +#[unstable(feature = "stdarch_wasm_atomic_wait", issue = "77839")] +pub unsafe fn memory_atomic_wait64(ptr: *mut i64, expression: i64, timeout_ns: i64) -> i32 { + llvm_atomic_wait_i64(ptr, expression, timeout_ns) +} + +/// Corresponding intrinsic to wasm's [`memory.atomic.notify` instruction][instr] +/// +/// This function will notify a number of threads blocked on the address +/// indicated by `ptr`. Threads previously blocked with the `i32_atomic_wait` +/// and `i64_atomic_wait` functions above will be woken up. +/// +/// The `waiters` argument indicates how many waiters should be woken up (a +/// maximum). If the value is zero no waiters are woken up. +/// +/// # Return value +/// +/// Returns the number of waiters which were actually notified. +/// +/// [instr]: https://webassembly.github.io/threads/core/syntax/instructions.html#syntax-instr-atomic-memory +#[inline] +#[cfg_attr(test, assert_instr(memory.atomic.notify))] +#[target_feature(enable = "atomics")] +#[doc(alias("memory.atomic.notify"))] +#[unstable(feature = "stdarch_wasm_atomic_wait", issue = "77839")] +pub unsafe fn memory_atomic_notify(ptr: *mut i32, waiters: u32) -> u32 { + llvm_atomic_notify(ptr, waiters as i32) as u32 +} diff --git a/library/stdarch/crates/core_arch/src/wasm32/memory.rs b/library/stdarch/crates/core_arch/src/wasm32/memory.rs new file mode 100644 index 0000000000000..90e9075e5136b --- /dev/null +++ b/library/stdarch/crates/core_arch/src/wasm32/memory.rs @@ -0,0 +1,58 @@ +#[cfg(test)] +use stdarch_test::assert_instr; + +unsafe extern "unadjusted" { + #[link_name = "llvm.wasm.memory.grow"] + fn llvm_memory_grow(mem: u32, pages: usize) -> usize; + #[link_name = "llvm.wasm.memory.size"] + fn llvm_memory_size(mem: u32) -> usize; +} + +/// Corresponding intrinsic to wasm's [`memory.size` instruction][instr] +/// +/// This function, when called, will return the current memory size in units of +/// pages. The current WebAssembly page size is 65536 bytes (64 KB). +/// +/// The argument `MEM` is the numerical index of which memory to return the +/// size of. Note that currently the WebAssembly specification only supports one +/// memory, so it is required that zero is passed in. The argument is present to +/// be forward-compatible with future WebAssembly revisions. If a nonzero +/// argument is passed to this function it will currently unconditionally abort. +/// +/// [instr]: http://webassembly.github.io/spec/core/exec/instructions.html#exec-memory-size +#[inline] +#[cfg_attr(test, assert_instr("memory.size", MEM = 0))] +#[rustc_legacy_const_generics(0)] +#[stable(feature = "simd_wasm32", since = "1.33.0")] +#[doc(alias("memory.size"))] +pub fn memory_size() -> usize { + static_assert!(MEM == 0); + unsafe { llvm_memory_size(MEM) } +} + +/// Corresponding intrinsic to wasm's [`memory.grow` instruction][instr] +/// +/// This function, when called, will attempt to grow the default linear memory +/// by the specified `delta` of pages. The current WebAssembly page size is +/// 65536 bytes (64 KB). 
If memory is successfully grown then the previous size +/// of memory, in pages, is returned. If memory cannot be grown then +/// `usize::MAX` is returned. +/// +/// The argument `MEM` is the numerical index of which memory to return the +/// size of. Note that currently the WebAssembly specification only supports one +/// memory, so it is required that zero is passed in. The argument is present to +/// be forward-compatible with future WebAssembly revisions. If a nonzero +/// argument is passed to this function it will currently unconditionally abort. +/// +/// [instr]: http://webassembly.github.io/spec/core/exec/instructions.html#exec-memory-grow +#[inline] +#[cfg_attr(test, assert_instr("memory.grow", MEM = 0))] +#[rustc_legacy_const_generics(0)] +#[stable(feature = "simd_wasm32", since = "1.33.0")] +#[doc(alias("memory.grow"))] +pub fn memory_grow(delta: usize) -> usize { + unsafe { + static_assert!(MEM == 0); + llvm_memory_grow(MEM, delta) + } +} diff --git a/library/stdarch/crates/core_arch/src/wasm32/mod.rs b/library/stdarch/crates/core_arch/src/wasm32/mod.rs new file mode 100644 index 0000000000000..2c4361f1639f7 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/wasm32/mod.rs @@ -0,0 +1,197 @@ +//! WASM32 intrinsics + +#[cfg(test)] +use stdarch_test::assert_instr; + +mod atomic; +#[unstable(feature = "stdarch_wasm_atomic_wait", issue = "77839")] +pub use self::atomic::*; + +mod simd128; +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use self::simd128::*; + +mod relaxed_simd; +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub use self::relaxed_simd::*; + +mod memory; +#[stable(feature = "simd_wasm32", since = "1.33.0")] +pub use self::memory::*; + +/// Generates the [`unreachable`] instruction, which causes an unconditional [trap]. +/// +/// This function is safe to call and immediately aborts the execution. +/// +/// [`unreachable`]: https://webassembly.github.io/spec/core/syntax/instructions.html#syntax-instr-control +/// [trap]: https://webassembly.github.io/spec/core/intro/overview.html#trap +#[cfg_attr(test, assert_instr(unreachable))] +#[inline] +#[stable(feature = "unreachable_wasm32", since = "1.37.0")] +pub fn unreachable() -> ! { + crate::intrinsics::abort() +} + +/// Generates the [`f32.ceil`] instruction, returning the smallest integer greater than or equal to `a`. +/// +/// This method is useful when targeting `no_std` and is equivalent to [`std::f32::ceil()`]. +/// +/// [`std::f32::ceil()`]: https://doc.rust-lang.org/std/primitive.f32.html#method.ceil +/// [`f32.ceil`]: https://webassembly.github.io/spec/core/syntax/instructions.html#syntax-instr-numeric +#[cfg_attr(test, assert_instr(f32.ceil))] +#[inline] +#[must_use = "method returns a new number and does not mutate the original value"] +#[unstable(feature = "wasm_numeric_instr", issue = "133908")] +pub fn f32_ceil(a: f32) -> f32 { + unsafe { crate::intrinsics::ceilf32(a) } +} + +/// Generates the [`f32.floor`] instruction, returning the largest integer less than or equal to `a`. +/// +/// This method is useful when targeting `no_std` and is equivalent to [`std::f32::floor()`]. 
+/// +/// [`std::f32::floor()`]: https://doc.rust-lang.org/std/primitive.f32.html#method.floor +/// [`f32.floor`]: https://webassembly.github.io/spec/core/syntax/instructions.html#syntax-instr-numeric +#[cfg_attr(test, assert_instr(f32.floor))] +#[inline] +#[must_use = "method returns a new number and does not mutate the original value"] +#[unstable(feature = "wasm_numeric_instr", issue = "133908")] +pub fn f32_floor(a: f32) -> f32 { + unsafe { crate::intrinsics::floorf32(a) } +} + +/// Generates the [`f32.trunc`] instruction, rounding to the nearest integer towards zero. +/// +/// This method is useful when targeting `no_std` and is equivalent to [`std::f32::trunc()`]. +/// +/// [`std::f32::trunc()`]: https://doc.rust-lang.org/std/primitive.f32.html#method.trunc +/// [`f32.trunc`]: https://webassembly.github.io/spec/core/syntax/instructions.html#syntax-instr-numeric +#[cfg_attr(test, assert_instr(f32.trunc))] +#[inline] +#[must_use = "method returns a new number and does not mutate the original value"] +#[unstable(feature = "wasm_numeric_instr", issue = "133908")] +pub fn f32_trunc(a: f32) -> f32 { + unsafe { crate::intrinsics::truncf32(a) } +} + +/// Generates the [`f32.nearest`] instruction, rounding to the nearest integer. Rounds half-way +/// cases to the number with an even least significant digit. +/// +/// This method is useful when targeting `no_std` and is equivalent to [`std::f32::round_ties_even()`]. +/// +/// [`std::f32::round_ties_even()`]: https://doc.rust-lang.org/std/primitive.f32.html#method.round_ties_even +/// [`f32.nearest`]: https://webassembly.github.io/spec/core/syntax/instructions.html#syntax-instr-numeric +#[cfg_attr(test, assert_instr(f32.nearest))] +#[inline] +#[must_use = "method returns a new number and does not mutate the original value"] +#[unstable(feature = "wasm_numeric_instr", issue = "133908")] +pub fn f32_nearest(a: f32) -> f32 { + crate::intrinsics::round_ties_even_f32(a) +} + +/// Generates the [`f32.sqrt`] instruction, returning the square root of the number `a`. +/// +/// This method is useful when targeting `no_std` and is equivalent to [`std::f32::sqrt()`]. +/// +/// [`std::f32::sqrt()`]: https://doc.rust-lang.org/std/primitive.f32.html#method.sqrt +/// [`f32.sqrt`]: https://webassembly.github.io/spec/core/syntax/instructions.html#syntax-instr-numeric +#[cfg_attr(test, assert_instr(f32.sqrt))] +#[inline] +#[must_use = "method returns a new number and does not mutate the original value"] +#[unstable(feature = "wasm_numeric_instr", issue = "133908")] +pub fn f32_sqrt(a: f32) -> f32 { + unsafe { crate::intrinsics::sqrtf32(a) } +} + +/// Generates the [`f64.ceil`] instruction, returning the smallest integer greater than or equal to `a`. +/// +/// This method is useful when targeting `no_std` and is equivalent to [`std::f64::ceil()`]. +/// +/// [`std::f64::ceil()`]: https://doc.rust-lang.org/std/primitive.f64.html#method.ceil +/// [`f64.ceil`]: https://webassembly.github.io/spec/core/syntax/instructions.html#syntax-instr-numeric +#[cfg_attr(test, assert_instr(f64.ceil))] +#[inline] +#[must_use = "method returns a new number and does not mutate the original value"] +#[unstable(feature = "wasm_numeric_instr", issue = "133908")] +pub fn f64_ceil(a: f64) -> f64 { + unsafe { crate::intrinsics::ceilf64(a) } +} + +/// Generates the [`f64.floor`] instruction, returning the largest integer less than or equal to `a`. +/// +/// This method is useful when targeting `no_std` and is equivalent to [`std::f64::floor()`]. 
+/// +/// [`std::f64::floor()`]: https://doc.rust-lang.org/std/primitive.f64.html#method.floor +/// [`f64.floor`]: https://webassembly.github.io/spec/core/syntax/instructions.html#syntax-instr-numeric +#[cfg_attr(test, assert_instr(f64.floor))] +#[inline] +#[must_use = "method returns a new number and does not mutate the original value"] +#[unstable(feature = "wasm_numeric_instr", issue = "133908")] +pub fn f64_floor(a: f64) -> f64 { + unsafe { crate::intrinsics::floorf64(a) } +} + +/// Generates the [`f64.trunc`] instruction, rounding to the nearest integer towards zero. +/// +/// This method is useful when targeting `no_std` and is equivalent to [`std::f64::trunc()`]. +/// +/// [`std::f64::trunc()`]: https://doc.rust-lang.org/std/primitive.f64.html#method.trunc +/// [`f64.trunc`]: https://webassembly.github.io/spec/core/syntax/instructions.html#syntax-instr-numeric +#[cfg_attr(test, assert_instr(f64.trunc))] +#[inline] +#[must_use = "method returns a new number and does not mutate the original value"] +#[unstable(feature = "wasm_numeric_instr", issue = "133908")] +pub fn f64_trunc(a: f64) -> f64 { + unsafe { crate::intrinsics::truncf64(a) } +} + +/// Generates the [`f64.nearest`] instruction, rounding to the nearest integer. Rounds half-way +/// cases to the number with an even least significant digit. +/// +/// This method is useful when targeting `no_std` and is equivalent to [`std::f64::round_ties_even()`]. +/// +/// [`std::f64::round_ties_even()`]: https://doc.rust-lang.org/std/primitive.f64.html#method.round_ties_even +/// [`f64.nearest`]: https://webassembly.github.io/spec/core/syntax/instructions.html#syntax-instr-numeric +#[cfg_attr(test, assert_instr(f64.nearest))] +#[inline] +#[must_use = "method returns a new number and does not mutate the original value"] +#[unstable(feature = "wasm_numeric_instr", issue = "133908")] +pub fn f64_nearest(a: f64) -> f64 { + crate::intrinsics::round_ties_even_f64(a) +} + +/// Generates the [`f64.sqrt`] instruction, returning the square root of the number `a`. +/// +/// This method is useful when targeting `no_std` and is equivalent to [`std::f64::sqrt()`]. +/// +/// [`std::f64::sqrt()`]: https://doc.rust-lang.org/std/primitive.f64.html#method.sqrt +/// [`f64.sqrt`]: https://webassembly.github.io/spec/core/syntax/instructions.html#syntax-instr-numeric +#[cfg_attr(test, assert_instr(f64.sqrt))] +#[inline] +#[must_use = "method returns a new number and does not mutate the original value"] +#[unstable(feature = "wasm_numeric_instr", issue = "133908")] +pub fn f64_sqrt(a: f64) -> f64 { + unsafe { crate::intrinsics::sqrtf64(a) } +} + +unsafe extern "C-unwind" { + #[link_name = "llvm.wasm.throw"] + fn wasm_throw(tag: i32, ptr: *mut u8) -> !; +} + +/// Generates the [`throw`] instruction from the [exception-handling proposal] for WASM. +/// +/// This function is unlikely to be stabilized until codegen backends have better support. +/// +/// [`throw`]: https://webassembly.github.io/exception-handling/core/syntax/instructions.html#syntax-instr-control +/// [exception-handling proposal]: https://github.com/WebAssembly/exception-handling +// FIXME: wasmtime does not currently support exception-handling, so cannot execute +// a wasm module with the throw instruction in it. once it does, we can +// reenable this attribute. +// #[cfg_attr(test, assert_instr(throw, TAG = 0, ptr = core::ptr::null_mut()))] +#[inline] +#[unstable(feature = "wasm_exception_handling_intrinsics", issue = "122465")] +pub unsafe fn throw<const TAG: i32>(ptr: *mut u8) -> ! 
{ + static_assert!(TAG == 0); // LLVM only supports tag 0 == C++ right now. + wasm_throw(TAG, ptr) +} diff --git a/library/stdarch/crates/core_arch/src/wasm32/relaxed_simd.rs b/library/stdarch/crates/core_arch/src/wasm32/relaxed_simd.rs new file mode 100644 index 0000000000000..a9b7e9c04d112 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/wasm32/relaxed_simd.rs @@ -0,0 +1,509 @@ +use super::v128; +use crate::core_arch::simd; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.wasm.relaxed.swizzle"] + fn llvm_relaxed_swizzle(a: simd::i8x16, b: simd::i8x16) -> simd::i8x16; + #[link_name = "llvm.wasm.relaxed.trunc.signed"] + fn llvm_relaxed_trunc_signed(a: simd::f32x4) -> simd::i32x4; + #[link_name = "llvm.wasm.relaxed.trunc.unsigned"] + fn llvm_relaxed_trunc_unsigned(a: simd::f32x4) -> simd::i32x4; + #[link_name = "llvm.wasm.relaxed.trunc.signed.zero"] + fn llvm_relaxed_trunc_signed_zero(a: simd::f64x2) -> simd::i32x4; + #[link_name = "llvm.wasm.relaxed.trunc.unsigned.zero"] + fn llvm_relaxed_trunc_unsigned_zero(a: simd::f64x2) -> simd::i32x4; + + #[link_name = "llvm.wasm.relaxed.madd.v4f32"] + fn llvm_f32x4_fma(a: simd::f32x4, b: simd::f32x4, c: simd::f32x4) -> simd::f32x4; + #[link_name = "llvm.wasm.relaxed.nmadd.v4f32"] + fn llvm_f32x4_fms(a: simd::f32x4, b: simd::f32x4, c: simd::f32x4) -> simd::f32x4; + #[link_name = "llvm.wasm.relaxed.madd.v2f64"] + fn llvm_f64x2_fma(a: simd::f64x2, b: simd::f64x2, c: simd::f64x2) -> simd::f64x2; + #[link_name = "llvm.wasm.relaxed.nmadd.v2f64"] + fn llvm_f64x2_fms(a: simd::f64x2, b: simd::f64x2, c: simd::f64x2) -> simd::f64x2; + + #[link_name = "llvm.wasm.relaxed.laneselect.v16i8"] + fn llvm_i8x16_laneselect(a: simd::i8x16, b: simd::i8x16, c: simd::i8x16) -> simd::i8x16; + #[link_name = "llvm.wasm.relaxed.laneselect.v8i16"] + fn llvm_i16x8_laneselect(a: simd::i16x8, b: simd::i16x8, c: simd::i16x8) -> simd::i16x8; + #[link_name = "llvm.wasm.relaxed.laneselect.v4i32"] + fn llvm_i32x4_laneselect(a: simd::i32x4, b: simd::i32x4, c: simd::i32x4) -> simd::i32x4; + #[link_name = "llvm.wasm.relaxed.laneselect.v2i64"] + fn llvm_i64x2_laneselect(a: simd::i64x2, b: simd::i64x2, c: simd::i64x2) -> simd::i64x2; + + #[link_name = "llvm.wasm.relaxed.min.v4f32"] + fn llvm_f32x4_relaxed_min(a: simd::f32x4, b: simd::f32x4) -> simd::f32x4; + #[link_name = "llvm.wasm.relaxed.min.v2f64"] + fn llvm_f64x2_relaxed_min(a: simd::f64x2, b: simd::f64x2) -> simd::f64x2; + #[link_name = "llvm.wasm.relaxed.max.v4f32"] + fn llvm_f32x4_relaxed_max(a: simd::f32x4, b: simd::f32x4) -> simd::f32x4; + #[link_name = "llvm.wasm.relaxed.max.v2f64"] + fn llvm_f64x2_relaxed_max(a: simd::f64x2, b: simd::f64x2) -> simd::f64x2; + + #[link_name = "llvm.wasm.relaxed.q15mulr.signed"] + fn llvm_relaxed_q15mulr_signed(a: simd::i16x8, b: simd::i16x8) -> simd::i16x8; + #[link_name = "llvm.wasm.relaxed.dot.i8x16.i7x16.signed"] + fn llvm_i16x8_relaxed_dot_i8x16_i7x16_s(a: simd::i8x16, b: simd::i8x16) -> simd::i16x8; + #[link_name = "llvm.wasm.relaxed.dot.i8x16.i7x16.add.signed"] + fn llvm_i32x4_relaxed_dot_i8x16_i7x16_add_s( + a: simd::i8x16, + b: simd::i8x16, + c: simd::i32x4, + ) -> simd::i32x4; +} + +/// A relaxed version of `i8x16_swizzle(a, s)` which selects lanes from `a` +/// using indices in `s`. +/// +/// Indices in the range `[0,15]` will select the `i`-th element of `a`. +/// If the high bit of any element of `s` is set (meaning 128 or greater) then +/// the corresponding output lane is guaranteed to be zero. 
Otherwise if the +/// element of `s` is within the range `[16,128)` then the output lane is either +/// 0 or `a[s[i] % 16]` depending on the implementation. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.relaxed_swizzle))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("i8x16.relaxed_swizzle"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn i8x16_relaxed_swizzle(a: v128, s: v128) -> v128 { + unsafe { llvm_relaxed_swizzle(a.as_i8x16(), s.as_i8x16()).v128() } +} + +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub use i8x16_relaxed_swizzle as u8x16_relaxed_swizzle; + +/// A relaxed version of `i32x4_trunc_sat_f32x4(a)` converts the `f32` lanes +/// of `a` to signed 32-bit integers. +/// +/// Values which don't fit in 32-bit integers or are NaN may have the same +/// result as `i32x4_trunc_sat_f32x4` or may return `i32::MIN`. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.relaxed_trunc_f32x4_s))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("i32x4.relaxed_trunc_f32x4_s"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn i32x4_relaxed_trunc_f32x4(a: v128) -> v128 { + unsafe { llvm_relaxed_trunc_signed(a.as_f32x4()).v128() } +} + +/// A relaxed version of `u32x4_trunc_sat_f32x4(a)` converts the `f32` lanes +/// of `a` to unsigned 32-bit integers. +/// +/// Values which don't fit in 32-bit unsigned integers or are NaN may have the +/// same result as `u32x4_trunc_sat_f32x4` or may return `u32::MAX`. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.relaxed_trunc_f32x4_u))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("i32x4.relaxed_trunc_f32x4_u"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn u32x4_relaxed_trunc_f32x4(a: v128) -> v128 { + unsafe { llvm_relaxed_trunc_unsigned(a.as_f32x4()).v128() } +} + +/// A relaxed version of `i32x4_trunc_sat_f64x2_zero(a)` converts the `f64` +/// lanes of `a` to signed 32-bit integers and the upper two lanes are zero. +/// +/// Values which don't fit in 32-bit integers or are NaN may have the same +/// result as `i32x4_trunc_sat_f32x4` or may return `i32::MIN`. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.relaxed_trunc_f64x2_s_zero))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("i32x4.relaxed_trunc_f64x2_s_zero"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn i32x4_relaxed_trunc_f64x2_zero(a: v128) -> v128 { + unsafe { llvm_relaxed_trunc_signed_zero(a.as_f64x2()).v128() } +} + +/// A relaxed version of `u32x4_trunc_sat_f64x2_zero(a)` converts the `f64` +/// lanes of `a` to unsigned 32-bit integers and the upper two lanes are zero. +/// +/// Values which don't fit in 32-bit unsigned integers or are NaN may have the +/// same result as `u32x4_trunc_sat_f32x4` or may return `u32::MAX`. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.relaxed_trunc_f64x2_u_zero))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("i32x4.relaxed_trunc_f64x2_u_zero"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn u32x4_relaxed_trunc_f64x2_zero(a: v128) -> v128 { + unsafe { llvm_relaxed_trunc_unsigned_zero(a.as_f64x2()).v128() } +} + +/// Computes `a * b + c` with either one rounding or two roundings. 
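+///
+/// An illustrative sketch (assuming a `wasm32` target with the `relaxed-simd`
+/// target feature enabled); with exactly representable inputs the fused and
+/// unfused computations agree, so the result below is deterministic:
+///
+/// ```rust,ignore
+/// let a = f32x4(2.0, 2.0, 2.0, 2.0);
+/// let b = f32x4(3.0, 3.0, 3.0, 3.0);
+/// let c = f32x4(1.0, 1.0, 1.0, 1.0);
+/// // Each lane is `2.0 * 3.0 + 1.0`, rounded either once or twice.
+/// let r = f32x4_relaxed_madd(a, b, c);
+/// assert_eq!(f32x4_extract_lane::<0>(r), 7.0);
+/// ```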
+#[inline] +#[cfg_attr(test, assert_instr(f32x4.relaxed_madd))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("f32x4.relaxed_madd"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn f32x4_relaxed_madd(a: v128, b: v128, c: v128) -> v128 { + unsafe { llvm_f32x4_fma(a.as_f32x4(), b.as_f32x4(), c.as_f32x4()).v128() } +} + +/// Computes `-a * b + c` with either one rounding or two roundings. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.relaxed_nmadd))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("f32x4.relaxed_nmadd"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn f32x4_relaxed_nmadd(a: v128, b: v128, c: v128) -> v128 { + unsafe { llvm_f32x4_fms(a.as_f32x4(), b.as_f32x4(), c.as_f32x4()).v128() } +} + +/// Computes `a * b + c` with either one rounding or two roundings. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.relaxed_madd))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("f64x2.relaxed_madd"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn f64x2_relaxed_madd(a: v128, b: v128, c: v128) -> v128 { + unsafe { llvm_f64x2_fma(a.as_f64x2(), b.as_f64x2(), c.as_f64x2()).v128() } +} + +/// Computes `-a * b + c` with either one rounding or two roundings. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.relaxed_nmadd))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("f64x2.relaxed_nmadd"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn f64x2_relaxed_nmadd(a: v128, b: v128, c: v128) -> v128 { + unsafe { llvm_f64x2_fms(a.as_f64x2(), b.as_f64x2(), c.as_f64x2()).v128() } +} + +/// A relaxed version of `v128_bitselect` where this either behaves the same as +/// `v128_bitselect` or the high bit of each lane `m` is inspected and the +/// corresponding lane of `a` is chosen if the bit is 1 or the lane of `b` is +/// chosen if it's zero. +/// +/// If the `m` mask's lanes are either all-one or all-zero then this instruction +/// is the same as `v128_bitselect`. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.relaxed_laneselect))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("i8x16.relaxed_laneselect"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn i8x16_relaxed_laneselect(a: v128, b: v128, m: v128) -> v128 { + unsafe { llvm_i8x16_laneselect(a.as_i8x16(), b.as_i8x16(), m.as_i8x16()).v128() } +} + +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub use i8x16_relaxed_laneselect as u8x16_relaxed_laneselect; + +/// A relaxed version of `v128_bitselect` where this either behaves the same as +/// `v128_bitselect` or the high bit of each lane `m` is inspected and the +/// corresponding lane of `a` is chosen if the bit is 1 or the lane of `b` is +/// chosen if it's zero. +/// +/// If the `m` mask's lanes are either all-one or all-zero then this instruction +/// is the same as `v128_bitselect`. 
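+///
+/// A hedged usage sketch (assuming the `relaxed-simd` target feature); when
+/// every mask lane is all-ones or all-zeros the result matches `v128_bitselect`:
+///
+/// ```rust,ignore
+/// let a = i16x8(1, 1, 1, 1, 1, 1, 1, 1);
+/// let b = i16x8(2, 2, 2, 2, 2, 2, 2, 2);
+/// // All-ones lanes select from `a`, all-zeros lanes select from `b`.
+/// let m = i16x8(-1, 0, -1, 0, -1, 0, -1, 0);
+/// let r = i16x8_relaxed_laneselect(a, b, m);
+/// assert_eq!(i16x8_extract_lane::<0>(r), 1);
+/// assert_eq!(i16x8_extract_lane::<1>(r), 2);
+/// ```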
+#[inline] +#[cfg_attr(test, assert_instr(i16x8.relaxed_laneselect))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("i16x8.relaxed_laneselect"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn i16x8_relaxed_laneselect(a: v128, b: v128, m: v128) -> v128 { + unsafe { llvm_i16x8_laneselect(a.as_i16x8(), b.as_i16x8(), m.as_i16x8()).v128() } +} + +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub use i16x8_relaxed_laneselect as u16x8_relaxed_laneselect; + +/// A relaxed version of `v128_bitselect` where this either behaves the same as +/// `v128_bitselect` or the high bit of each lane `m` is inspected and the +/// corresponding lane of `a` is chosen if the bit is 1 or the lane of `b` is +/// chosen if it's zero. +/// +/// If the `m` mask's lanes are either all-one or all-zero then this instruction +/// is the same as `v128_bitselect`. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.relaxed_laneselect))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("i32x4.relaxed_laneselect"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn i32x4_relaxed_laneselect(a: v128, b: v128, m: v128) -> v128 { + unsafe { llvm_i32x4_laneselect(a.as_i32x4(), b.as_i32x4(), m.as_i32x4()).v128() } +} + +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub use i32x4_relaxed_laneselect as u32x4_relaxed_laneselect; + +/// A relaxed version of `v128_bitselect` where this either behaves the same as +/// `v128_bitselect` or the high bit of each lane `m` is inspected and the +/// corresponding lane of `a` is chosen if the bit is 1 or the lane of `b` is +/// chosen if it's zero. +/// +/// If the `m` mask's lanes are either all-one or all-zero then this instruction +/// is the same as `v128_bitselect`. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.relaxed_laneselect))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("i64x2.relaxed_laneselect"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn i64x2_relaxed_laneselect(a: v128, b: v128, m: v128) -> v128 { + unsafe { llvm_i64x2_laneselect(a.as_i64x2(), b.as_i64x2(), m.as_i64x2()).v128() } +} + +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub use i64x2_relaxed_laneselect as u64x2_relaxed_laneselect; + +/// A relaxed version of `f32x4_min` which is either `f32x4_min` or +/// `f32x4_pmin`. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.relaxed_min))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("f32x4.relaxed_min"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn f32x4_relaxed_min(a: v128, b: v128) -> v128 { + unsafe { llvm_f32x4_relaxed_min(a.as_f32x4(), b.as_f32x4()).v128() } +} + +/// A relaxed version of `f32x4_max` which is either `f32x4_max` or +/// `f32x4_pmax`. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.relaxed_max))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("f32x4.relaxed_max"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn f32x4_relaxed_max(a: v128, b: v128) -> v128 { + unsafe { llvm_f32x4_relaxed_max(a.as_f32x4(), b.as_f32x4()).v128() } +} + +/// A relaxed version of `f64x2_min` which is either `f64x2_min` or +/// `f64x2_pmin`. 
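+///
+/// A brief sketch (assuming the `relaxed-simd` target feature); with no NaNs
+/// and no mixed-sign zeros, `f64x2_min` and `f64x2_pmin` agree, so the result
+/// is deterministic:
+///
+/// ```rust,ignore
+/// let r = f64x2_relaxed_min(f64x2(1.0, -4.0), f64x2(2.0, -3.0));
+/// assert_eq!(f64x2_extract_lane::<0>(r), 1.0);
+/// assert_eq!(f64x2_extract_lane::<1>(r), -4.0);
+/// ```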
+#[inline] +#[cfg_attr(test, assert_instr(f64x2.relaxed_min))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("f64x2.relaxed_min"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn f64x2_relaxed_min(a: v128, b: v128) -> v128 { + unsafe { llvm_f64x2_relaxed_min(a.as_f64x2(), b.as_f64x2()).v128() } +} + +/// A relaxed version of `f64x2_max` which is either `f64x2_max` or +/// `f64x2_pmax`. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.relaxed_max))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("f64x2.relaxed_max"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn f64x2_relaxed_max(a: v128, b: v128) -> v128 { + unsafe { llvm_f64x2_relaxed_max(a.as_f64x2(), b.as_f64x2()).v128() } +} + +/// A relaxed version of `i16x8_relaxed_q15mulr` where if both lanes are +/// `i16::MIN` then the result is either `i16::MIN` or `i16::MAX`. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.relaxed_q15mulr_s))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("i16x8.relaxed_q15mulr_s"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn i16x8_relaxed_q15mulr(a: v128, b: v128) -> v128 { + unsafe { llvm_relaxed_q15mulr_signed(a.as_i16x8(), b.as_i16x8()).v128() } +} + +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub use i16x8_relaxed_q15mulr as u16x8_relaxed_q15mulr; + +/// A relaxed dot-product instruction. +/// +/// This instruction will perform pairwise products of the 8-bit values in `a` +/// and `b` and then accumulate adjacent pairs into 16-bit results producing a +/// final `i16x8` vector. The bytes of `a` are always interpreted as signed and +/// the bytes in `b` may be interpreted as signed or unsigned. If the top bit in +/// `b` isn't set then the value is the same regardless of whether it's signed +/// or unsigned. +/// +/// The accumulation into 16-bit values may be saturated on some platforms, and +/// on other platforms it may wrap-around on overflow. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.relaxed_dot_i8x16_i7x16_s))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("i16x8.relaxed_dot_i8x16_i7x16_s"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn i16x8_relaxed_dot_i8x16_i7x16(a: v128, b: v128) -> v128 { + unsafe { llvm_i16x8_relaxed_dot_i8x16_i7x16_s(a.as_i8x16(), b.as_i8x16()).v128() } +} + +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub use i16x8_relaxed_dot_i8x16_i7x16 as u16x8_relaxed_dot_i8x16_i7x16; + +/// Similar to [`i16x8_relaxed_dot_i8x16_i7x16`] except that the intermediate +/// `i16x8` result is fed into `i32x4_extadd_pairwise_i16x8` followed by +/// `i32x4_add` to add the value `c` to the result. 
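+///
+/// A rough usage sketch (assuming the `relaxed-simd` target feature); with
+/// small non-negative inputs the saturating and wrapping behaviours coincide:
+///
+/// ```rust,ignore
+/// let a = i8x16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+/// let b = i8x16(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+/// let c = i32x4(10, 10, 10, 10);
+/// // Each i32 lane sums four byte products (4 * 1 * 2 = 8) and adds the lane of `c`.
+/// let r = i32x4_relaxed_dot_i8x16_i7x16_add(a, b, c);
+/// assert_eq!(i32x4_extract_lane::<0>(r), 18);
+/// ```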
+#[inline] +#[cfg_attr(test, assert_instr(i32x4.relaxed_dot_i8x16_i7x16_add_s))] +#[target_feature(enable = "relaxed-simd")] +#[doc(alias("i32x4.relaxed_dot_i8x16_i7x16_add_s"))] +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub fn i32x4_relaxed_dot_i8x16_i7x16_add(a: v128, b: v128, c: v128) -> v128 { + unsafe { + llvm_i32x4_relaxed_dot_i8x16_i7x16_add_s(a.as_i8x16(), b.as_i8x16(), c.as_i32x4()).v128() + } +} + +#[stable(feature = "stdarch_wasm_relaxed_simd", since = "1.82.0")] +pub use i32x4_relaxed_dot_i8x16_i7x16_add as u32x4_relaxed_dot_i8x16_i7x16_add; + +#[cfg(test)] +mod tests { + use super::super::simd128::*; + use super::*; + use core::ops::{Add, Div, Mul, Neg, Sub}; + + use std::fmt::Debug; + use std::mem::transmute; + use std::num::Wrapping; + use std::prelude::v1::*; + + fn compare_bytes(a: v128, b: &[v128]) { + let a: [u8; 16] = unsafe { transmute(a) }; + if b.iter().any(|b| { + let b: [u8; 16] = unsafe { transmute(*b) }; + a == b + }) { + return; + } + eprintln!("input vector {a:?}"); + eprintln!("did not match any output:"); + for b in b { + eprintln!(" {b:?}"); + } + } + + #[test] + fn test_relaxed_swizzle() { + compare_bytes( + i8x16_relaxed_swizzle( + i8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + i8x16(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1), + ), + &[i8x16(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1)], + ); + compare_bytes( + i8x16_relaxed_swizzle( + i8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + u8x16(0x80, 0xff, 16, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + ), + &[ + i8x16(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + i8x16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + ], + ); + compare_bytes( + u8x16_relaxed_swizzle( + u8x16( + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + ), + u8x16(0x80, 0xff, 16, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + ), + &[ + u8x16( + 128, 128, 128, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + ), + u8x16( + 0, 0, 0, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + ), + ], + ); + } + + #[test] + fn test_relaxed_trunc() { + compare_bytes( + i32x4_relaxed_trunc_f32x4(f32x4(1.0, 2.0, -1., -4.)), + &[i32x4(1, 2, -1, -4)], + ); + compare_bytes( + i32x4_relaxed_trunc_f32x4(f32x4(f32::NEG_INFINITY, f32::NAN, -0.0, f32::INFINITY)), + &[ + i32x4(i32::MIN, 0, 0, i32::MAX), + i32x4(i32::MIN, i32::MIN, 0, i32::MIN), + ], + ); + compare_bytes( + i32x4_relaxed_trunc_f64x2_zero(f64x2(1.0, -3.0)), + &[i32x4(1, -3, 0, 0)], + ); + compare_bytes( + i32x4_relaxed_trunc_f64x2_zero(f64x2(f64::INFINITY, f64::NAN)), + &[i32x4(i32::MAX, 0, 0, 0), i32x4(i32::MIN, i32::MIN, 0, 0)], + ); + + compare_bytes( + u32x4_relaxed_trunc_f32x4(f32x4(1.0, 2.0, 5., 100.)), + &[i32x4(1, 2, 5, 100)], + ); + compare_bytes( + u32x4_relaxed_trunc_f32x4(f32x4(f32::NEG_INFINITY, f32::NAN, -0.0, f32::INFINITY)), + &[ + u32x4(u32::MAX, 0, 0, u32::MAX), + u32x4(u32::MAX, u32::MAX, 0, u32::MAX), + ], + ); + compare_bytes( + u32x4_relaxed_trunc_f64x2_zero(f64x2(1.0, 3.0)), + &[u32x4(1, 3, 0, 0)], + ); + compare_bytes( + u32x4_relaxed_trunc_f64x2_zero(f64x2(f64::INFINITY, f64::NAN)), + &[i32x4(i32::MAX, 0, 0, 0), i32x4(i32::MIN, i32::MIN, 0, 0)], + ); + } + + #[test] + fn test_madd() { + let floats = [ + f32::NAN, + f32::NEG_INFINITY, + f32::INFINITY, + 1.0, + 2.0, + -1.0, + 0.0, + 100.3, + 7.8, + 9.4, + ]; + for &a in floats.iter() { + for &b in floats.iter() { + for &c in floats.iter() { + let f1 = a * b + c; + let f2 = a.mul_add(b, 
c); + compare_bytes( + f32x4_relaxed_madd(f32x4(a, a, a, a), f32x4(b, b, b, b), f32x4(c, c, c, c)), + &[f32x4(f1, f1, f1, f1), f32x4(f2, f2, f2, f2)], + ); + + let f1 = -a * b + c; + let f2 = (-a).mul_add(b, c); + compare_bytes( + f32x4_relaxed_nmadd( + f32x4(a, a, a, a), + f32x4(b, b, b, b), + f32x4(c, c, c, c), + ), + &[f32x4(f1, f1, f1, f1), f32x4(f2, f2, f2, f2)], + ); + + let a = f64::from(a); + let b = f64::from(b); + let c = f64::from(c); + let f1 = a * b + c; + let f2 = a.mul_add(b, c); + compare_bytes( + f64x2_relaxed_madd(f64x2(a, a), f64x2(b, b), f64x2(c, c)), + &[f64x2(f1, f1), f64x2(f2, f2)], + ); + let f1 = -a * b + c; + let f2 = (-a).mul_add(b, c); + compare_bytes( + f64x2_relaxed_nmadd(f64x2(a, a), f64x2(b, b), f64x2(c, c)), + &[f64x2(f1, f1), f64x2(f2, f2)], + ); + } + } + } + } +} diff --git a/library/stdarch/crates/core_arch/src/wasm32/simd128.rs b/library/stdarch/crates/core_arch/src/wasm32/simd128.rs new file mode 100644 index 0000000000000..fc0d7723fa014 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/wasm32/simd128.rs @@ -0,0 +1,6100 @@ +//! This module implements the [WebAssembly `SIMD128` ISA]. +//! +//! [WebAssembly `SIMD128` ISA]: +//! https://github.com/WebAssembly/simd/blob/master/proposals/simd/SIMD.md + +#![allow(non_camel_case_types)] +#![allow(unused_imports)] + +use crate::{core_arch::simd, intrinsics::simd::*, marker::Sized, mem, ptr}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +types! { + #![stable(feature = "wasm_simd", since = "1.54.0")] + + /// WASM-specific 128-bit wide SIMD vector type. + /// + /// This type corresponds to the `v128` type in the [WebAssembly SIMD + /// proposal](https://github.com/webassembly/simd). This type is 128-bits + /// large and the meaning of all the bits is defined within the context of + /// how this value is used. + /// + /// This same type is used simultaneously for all 128-bit-wide SIMD types, + /// for example: + /// + /// * sixteen 8-bit integers (both `i8` and `u8`) + /// * eight 16-bit integers (both `i16` and `u16`) + /// * four 32-bit integers (both `i32` and `u32`) + /// * two 64-bit integers (both `i64` and `u64`) + /// * four 32-bit floats (`f32`) + /// * two 64-bit floats (`f64`) + /// + /// The `v128` type in Rust is intended to be quite analogous to the `v128` + /// type in WebAssembly. Operations on `v128` can only be performed with the + /// functions in this module. + // N.B., internals here are arbitrary. + pub struct v128(4 x i32); +} + +macro_rules! conversions { + ($(($name:ident = $ty:ty))*) => { + impl v128 { + $( + #[inline(always)] + pub(crate) fn $name(self) -> $ty { + unsafe { mem::transmute(self) } + } + )* + } + $( + impl $ty { + #[inline(always)] + pub(crate) const fn v128(self) -> v128 { + unsafe { mem::transmute(self) } + } + } + )* + } +} + +conversions! 
{ + (as_u8x16 = simd::u8x16) + (as_u16x8 = simd::u16x8) + (as_u32x4 = simd::u32x4) + (as_u64x2 = simd::u64x2) + (as_i8x16 = simd::i8x16) + (as_i16x8 = simd::i16x8) + (as_i32x4 = simd::i32x4) + (as_i64x2 = simd::i64x2) + (as_f32x4 = simd::f32x4) + (as_f64x2 = simd::f64x2) +} + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.wasm.swizzle"] + fn llvm_swizzle(a: simd::i8x16, b: simd::i8x16) -> simd::i8x16; + + #[link_name = "llvm.wasm.bitselect.v16i8"] + fn llvm_bitselect(a: simd::i8x16, b: simd::i8x16, c: simd::i8x16) -> simd::i8x16; + #[link_name = "llvm.wasm.anytrue.v16i8"] + fn llvm_any_true_i8x16(x: simd::i8x16) -> i32; + + #[link_name = "llvm.wasm.alltrue.v16i8"] + fn llvm_i8x16_all_true(x: simd::i8x16) -> i32; + #[link_name = "llvm.wasm.bitmask.v16i8"] + fn llvm_bitmask_i8x16(a: simd::i8x16) -> i32; + #[link_name = "llvm.wasm.narrow.signed.v16i8.v8i16"] + fn llvm_narrow_i8x16_s(a: simd::i16x8, b: simd::i16x8) -> simd::i8x16; + #[link_name = "llvm.wasm.narrow.unsigned.v16i8.v8i16"] + fn llvm_narrow_i8x16_u(a: simd::i16x8, b: simd::i16x8) -> simd::i8x16; + #[link_name = "llvm.wasm.avgr.unsigned.v16i8"] + fn llvm_avgr_u_i8x16(a: simd::i8x16, b: simd::i8x16) -> simd::i8x16; + + #[link_name = "llvm.wasm.extadd.pairwise.signed.v8i16"] + fn llvm_i16x8_extadd_pairwise_i8x16_s(x: simd::i8x16) -> simd::i16x8; + #[link_name = "llvm.wasm.extadd.pairwise.unsigned.v8i16"] + fn llvm_i16x8_extadd_pairwise_i8x16_u(x: simd::i8x16) -> simd::i16x8; + #[link_name = "llvm.wasm.q15mulr.sat.signed"] + fn llvm_q15mulr(a: simd::i16x8, b: simd::i16x8) -> simd::i16x8; + #[link_name = "llvm.wasm.alltrue.v8i16"] + fn llvm_i16x8_all_true(x: simd::i16x8) -> i32; + #[link_name = "llvm.wasm.bitmask.v8i16"] + fn llvm_bitmask_i16x8(a: simd::i16x8) -> i32; + #[link_name = "llvm.wasm.narrow.signed.v8i16.v4i32"] + fn llvm_narrow_i16x8_s(a: simd::i32x4, b: simd::i32x4) -> simd::i16x8; + #[link_name = "llvm.wasm.narrow.unsigned.v8i16.v4i32"] + fn llvm_narrow_i16x8_u(a: simd::i32x4, b: simd::i32x4) -> simd::i16x8; + #[link_name = "llvm.wasm.avgr.unsigned.v8i16"] + fn llvm_avgr_u_i16x8(a: simd::i16x8, b: simd::i16x8) -> simd::i16x8; + + #[link_name = "llvm.wasm.extadd.pairwise.signed.v4i32"] + fn llvm_i32x4_extadd_pairwise_i16x8_s(x: simd::i16x8) -> simd::i32x4; + #[link_name = "llvm.wasm.extadd.pairwise.unsigned.v4i32"] + fn llvm_i32x4_extadd_pairwise_i16x8_u(x: simd::i16x8) -> simd::i32x4; + #[link_name = "llvm.wasm.alltrue.v4i32"] + fn llvm_i32x4_all_true(x: simd::i32x4) -> i32; + #[link_name = "llvm.wasm.bitmask.v4i32"] + fn llvm_bitmask_i32x4(a: simd::i32x4) -> i32; + #[link_name = "llvm.wasm.dot"] + fn llvm_i32x4_dot_i16x8_s(a: simd::i16x8, b: simd::i16x8) -> simd::i32x4; + + #[link_name = "llvm.wasm.alltrue.v2i64"] + fn llvm_i64x2_all_true(x: simd::i64x2) -> i32; + #[link_name = "llvm.wasm.bitmask.v2i64"] + fn llvm_bitmask_i64x2(a: simd::i64x2) -> i32; + + #[link_name = "llvm.nearbyint.v4f32"] + fn llvm_f32x4_nearest(x: simd::f32x4) -> simd::f32x4; + #[link_name = "llvm.minimum.v4f32"] + fn llvm_f32x4_min(x: simd::f32x4, y: simd::f32x4) -> simd::f32x4; + #[link_name = "llvm.maximum.v4f32"] + fn llvm_f32x4_max(x: simd::f32x4, y: simd::f32x4) -> simd::f32x4; + + #[link_name = "llvm.nearbyint.v2f64"] + fn llvm_f64x2_nearest(x: simd::f64x2) -> simd::f64x2; + #[link_name = "llvm.minimum.v2f64"] + fn llvm_f64x2_min(x: simd::f64x2, y: simd::f64x2) -> simd::f64x2; + #[link_name = "llvm.maximum.v2f64"] + fn llvm_f64x2_max(x: simd::f64x2, y: simd::f64x2) -> simd::f64x2; +} + +#[repr(packed)] 
+#[derive(Copy)]
+struct Unaligned<T>(T);
+
+impl<T: Copy> Clone for Unaligned<T> {
+    fn clone(&self) -> Unaligned<T> {
+        *self
+    }
+}
+
+/// Loads a `v128` vector from the given heap address.
+///
+/// This intrinsic will emit a load with an alignment of 1. While this is
+/// provided for completeness it is not strictly necessary, you can also load
+/// the pointer directly:
+///
+/// ```rust,ignore
+/// let a: &v128 = ...;
+/// let value = unsafe { v128_load(a) };
+/// // .. is the same as ..
+/// let value = *a;
+/// ```
+///
+/// The alignment of the load can be configured by doing a manual load without
+/// this intrinsic.
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 16 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_load(m: *const v128) -> v128 {
+    (*(m as *const Unaligned<v128>)).0
+}
+
+/// Load eight 8-bit integers and sign extend each one to a 16-bit lane
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 8 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load8x8_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load8x8_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn i16x8_load_extend_i8x8(m: *const i8) -> v128 {
+    let m = *(m as *const Unaligned<simd::i8x8>);
+    simd_cast::<_, simd::i16x8>(m.0).v128()
+}
+
+/// Load eight 8-bit integers and zero extend each one to a 16-bit lane
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 8 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load8x8_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load8x8_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn i16x8_load_extend_u8x8(m: *const u8) -> v128 {
+    let m = *(m as *const Unaligned<simd::u8x8>);
+    simd_cast::<_, simd::u16x8>(m.0).v128()
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_load_extend_u8x8 as u16x8_load_extend_u8x8;
+
+/// Load four 16-bit integers and sign extend each one to a 32-bit lane
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 8 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load16x4_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load16x4_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn i32x4_load_extend_i16x4(m: *const i16) -> v128 {
+    let m = *(m as *const Unaligned<simd::i16x4>);
+    simd_cast::<_, simd::i32x4>(m.0).v128()
+}
+
+/// Load four 16-bit integers and zero extend each one to a 32-bit lane
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 8 bytes from.
Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned load. +#[inline] +#[cfg_attr(test, assert_instr(v128.load16x4_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.load16x4_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn i32x4_load_extend_u16x4(m: *const u16) -> v128 { + let m = *(m as *const Unaligned); + simd_cast::<_, simd::u32x4>(m.0).v128() +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i32x4_load_extend_u16x4 as u32x4_load_extend_u16x4; + +/// Load two 32-bit integers and sign extend each one to a 64-bit lane +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to load 8 bytes from. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned load. +#[inline] +#[cfg_attr(test, assert_instr(v128.load32x2_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.load32x2_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn i64x2_load_extend_i32x2(m: *const i32) -> v128 { + let m = *(m as *const Unaligned); + simd_cast::<_, simd::i64x2>(m.0).v128() +} + +/// Load two 32-bit integers and zero extend each one to a 64-bit lane +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to load 8 bytes from. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned load. +#[inline] +#[cfg_attr(test, assert_instr(v128.load32x2_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.load32x2_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn i64x2_load_extend_u32x2(m: *const u32) -> v128 { + let m = *(m as *const Unaligned); + simd_cast::<_, simd::u64x2>(m.0).v128() +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i64x2_load_extend_u32x2 as u64x2_load_extend_u32x2; + +/// Load a single element and splat to all lanes of a v128 vector. +/// +/// While this intrinsic is provided for completeness it can also be replaced +/// with `u8x16_splat(*m)` and it should generate equivalent code (and also not +/// require `unsafe`). +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to load 1 byte from. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned load. +#[inline] +#[cfg_attr(test, assert_instr(v128.load8_splat))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.load8_splat"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn v128_load8_splat(m: *const u8) -> v128 { + u8x16_splat(*m) +} + +/// Load a single element and splat to all lanes of a v128 vector. +/// +/// While this intrinsic is provided for completeness it can also be replaced +/// with `u16x8_splat(*m)` and it should generate equivalent code (and also not +/// require `unsafe`). +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to load 2 bytes from. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned load. 
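+///
+/// An illustrative sketch (assuming a `wasm32` target with the `simd128`
+/// target feature enabled):
+///
+/// ```rust,ignore
+/// let x: u16 = 0x1234;
+/// // Splats the 16-bit value behind the pointer into all eight lanes.
+/// let v = unsafe { v128_load16_splat(&x) };
+/// assert_eq!(u16x8_extract_lane::<7>(v), 0x1234);
+/// ```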
+#[inline] +#[cfg_attr(test, assert_instr(v128.load16_splat))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.load16_splat"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn v128_load16_splat(m: *const u16) -> v128 { + u16x8_splat(ptr::read_unaligned(m)) +} + +/// Load a single element and splat to all lanes of a v128 vector. +/// +/// While this intrinsic is provided for completeness it can also be replaced +/// with `u32x4_splat(*m)` and it should generate equivalent code (and also not +/// require `unsafe`). +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to load 4 bytes from. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned load. +#[inline] +#[cfg_attr(test, assert_instr(v128.load32_splat))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.load32_splat"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn v128_load32_splat(m: *const u32) -> v128 { + u32x4_splat(ptr::read_unaligned(m)) +} + +/// Load a single element and splat to all lanes of a v128 vector. +/// +/// While this intrinsic is provided for completeness it can also be replaced +/// with `u64x2_splat(*m)` and it should generate equivalent code (and also not +/// require `unsafe`). +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to load 8 bytes from. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned load. +#[inline] +#[cfg_attr(test, assert_instr(v128.load64_splat))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.load64_splat"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn v128_load64_splat(m: *const u64) -> v128 { + u64x2_splat(ptr::read_unaligned(m)) +} + +/// Load a 32-bit element into the low bits of the vector and sets all other +/// bits to zero. +/// +/// This intrinsic is provided for completeness and is equivalent to `u32x4(*m, +/// 0, 0, 0)` (which doesn't require `unsafe`). +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to load 4 bytes from. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned load. +#[inline] +#[cfg_attr(test, assert_instr(v128.load32_zero))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.load32_zero"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn v128_load32_zero(m: *const u32) -> v128 { + u32x4(ptr::read_unaligned(m), 0, 0, 0) +} + +/// Load a 64-bit element into the low bits of the vector and sets all other +/// bits to zero. +/// +/// This intrinsic is provided for completeness and is equivalent to +/// `u64x2_replace_lane::<0>(u64x2(0, 0), *m)` (which doesn't require `unsafe`). +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to load 8 bytes from. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned load. 
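+///
+/// An illustrative sketch (assuming a `wasm32` target with `simd128` enabled):
+///
+/// ```rust,ignore
+/// let x: u64 = 42;
+/// // The low lane is loaded from `&x`; the high lane is zeroed.
+/// let v = unsafe { v128_load64_zero(&x) };
+/// assert_eq!(u64x2_extract_lane::<0>(v), 42);
+/// assert_eq!(u64x2_extract_lane::<1>(v), 0);
+/// ```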
+#[inline] +#[cfg_attr(test, assert_instr(v128.load64_zero))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.load64_zero"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn v128_load64_zero(m: *const u64) -> v128 { + u64x2_replace_lane::<0>(u64x2(0, 0), ptr::read_unaligned(m)) +} + +/// Stores a `v128` vector to the given heap address. +/// +/// This intrinsic will emit a store with an alignment of 1. While this is +/// provided for completeness it is not strictly necessary, you can also store +/// the pointer directly: +/// +/// ```rust,ignore +/// let a: &mut v128 = ...; +/// unsafe { v128_store(a, value) }; +/// // .. is the same as .. +/// *a = value; +/// ``` +/// +/// The alignment of the store can be configured by doing a manual store without +/// this intrinsic. +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to store 16 bytes to. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned store. +#[inline] +#[cfg_attr(test, assert_instr(v128.store))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.store"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn v128_store(m: *mut v128, a: v128) { + *(m as *mut Unaligned) = Unaligned(a); +} + +/// Loads an 8-bit value from `m` and sets lane `L` of `v` to that value. +/// +/// This intrinsic is provided for completeness and is equivalent to +/// `u8x16_replace_lane::(v, *m)` (which doesn't require `unsafe`). +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to load 1 byte from. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned load. +#[inline] +#[cfg_attr(test, assert_instr(v128.load8_lane, L = 0))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.load8_lane"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn v128_load8_lane(v: v128, m: *const u8) -> v128 { + u8x16_replace_lane::(v, *m) +} + +/// Loads a 16-bit value from `m` and sets lane `L` of `v` to that value. +/// +/// This intrinsic is provided for completeness and is equivalent to +/// `u16x8_replace_lane::(v, *m)` (which doesn't require `unsafe`). +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to load 2 bytes from. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned load. +#[inline] +#[cfg_attr(test, assert_instr(v128.load16_lane, L = 0))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.load16_lane"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn v128_load16_lane(v: v128, m: *const u16) -> v128 { + u16x8_replace_lane::(v, ptr::read_unaligned(m)) +} + +/// Loads a 32-bit value from `m` and sets lane `L` of `v` to that value. +/// +/// This intrinsic is provided for completeness and is equivalent to +/// `u32x4_replace_lane::(v, *m)` (which doesn't require `unsafe`). +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to load 4 bytes from. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned load. 
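+///
+/// A usage sketch (assuming a `wasm32` target with `simd128` enabled):
+///
+/// ```rust,ignore
+/// let v = u32x4(1, 2, 3, 4);
+/// let x: u32 = 99;
+/// // Lane 2 is replaced with the value loaded from `&x`; other lanes keep `v`.
+/// let r = unsafe { v128_load32_lane::<2>(v, &x) };
+/// assert_eq!(u32x4_extract_lane::<2>(r), 99);
+/// assert_eq!(u32x4_extract_lane::<0>(r), 1);
+/// ```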
+#[inline] +#[cfg_attr(test, assert_instr(v128.load32_lane, L = 0))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.load32_lane"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn v128_load32_lane(v: v128, m: *const u32) -> v128 { + u32x4_replace_lane::(v, ptr::read_unaligned(m)) +} + +/// Loads a 64-bit value from `m` and sets lane `L` of `v` to that value. +/// +/// This intrinsic is provided for completeness and is equivalent to +/// `u64x2_replace_lane::(v, *m)` (which doesn't require `unsafe`). +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to load 8 bytes from. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned load. +#[inline] +#[cfg_attr(test, assert_instr(v128.load64_lane, L = 0))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.load64_lane"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn v128_load64_lane(v: v128, m: *const u64) -> v128 { + u64x2_replace_lane::(v, ptr::read_unaligned(m)) +} + +/// Stores the 8-bit value from lane `L` of `v` into `m` +/// +/// This intrinsic is provided for completeness and is equivalent to +/// `*m = u8x16_extract_lane::(v)` (which doesn't require `unsafe`). +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to store 1 byte to. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned store. +#[inline] +#[cfg_attr(test, assert_instr(v128.store8_lane, L = 0))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.store8_lane"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn v128_store8_lane(v: v128, m: *mut u8) { + *m = u8x16_extract_lane::(v); +} + +/// Stores the 16-bit value from lane `L` of `v` into `m` +/// +/// This intrinsic is provided for completeness and is equivalent to +/// `*m = u16x8_extract_lane::(v)` (which doesn't require `unsafe`). +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to store 2 bytes to. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned store. +#[inline] +#[cfg_attr(test, assert_instr(v128.store16_lane, L = 0))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.store16_lane"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn v128_store16_lane(v: v128, m: *mut u16) { + ptr::write_unaligned(m, u16x8_extract_lane::(v)) +} + +/// Stores the 32-bit value from lane `L` of `v` into `m` +/// +/// This intrinsic is provided for completeness and is equivalent to +/// `*m = u32x4_extract_lane::(v)` (which doesn't require `unsafe`). +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to store 4 bytes to. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned store. 
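+///
+/// A usage sketch (assuming a `wasm32` target with `simd128` enabled):
+///
+/// ```rust,ignore
+/// let v = u32x4(1, 2, 3, 4);
+/// let mut out: u32 = 0;
+/// // Writes lane 3 of `v` (the value 4) to the memory behind `&mut out`.
+/// unsafe { v128_store32_lane::<3>(v, &mut out) };
+/// assert_eq!(out, 4);
+/// ```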
+#[inline] +#[cfg_attr(test, assert_instr(v128.store32_lane, L = 0))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.store32_lane"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn v128_store32_lane(v: v128, m: *mut u32) { + ptr::write_unaligned(m, u32x4_extract_lane::(v)) +} + +/// Stores the 64-bit value from lane `L` of `v` into `m` +/// +/// This intrinsic is provided for completeness and is equivalent to +/// `*m = u64x2_extract_lane::(v)` (which doesn't require `unsafe`). +/// +/// # Unsafety +/// +/// This intrinsic is unsafe because it takes a raw pointer as an argument, and +/// the pointer must be valid to store 8 bytes to. Note that there is no +/// alignment requirement on this pointer since this intrinsic performs a +/// 1-aligned store. +#[inline] +#[cfg_attr(test, assert_instr(v128.store64_lane, L = 0))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.store64_lane"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub unsafe fn v128_store64_lane(v: v128, m: *mut u64) { + ptr::write_unaligned(m, u64x2_extract_lane::(v)) +} + +/// Materializes a SIMD value from the provided operands. +/// +/// If possible this will generate a `v128.const` instruction, otherwise it may +/// be lowered to a sequence of instructions to materialize the vector value. +#[inline] +#[cfg_attr( + test, + assert_instr( + v128.const, + a0 = 0, + a1 = 1, + a2 = 2, + a3 = 3, + a4 = 4, + a5 = 5, + a6 = 6, + a7 = 7, + a8 = 8, + a9 = 9, + a10 = 10, + a11 = 11, + a12 = 12, + a13 = 13, + a14 = 14, + a15 = 15, + ) +)] +#[doc(alias("v128.const"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")] +#[target_feature(enable = "simd128")] +pub const fn i8x16( + a0: i8, + a1: i8, + a2: i8, + a3: i8, + a4: i8, + a5: i8, + a6: i8, + a7: i8, + a8: i8, + a9: i8, + a10: i8, + a11: i8, + a12: i8, + a13: i8, + a14: i8, + a15: i8, +) -> v128 { + simd::i8x16::new( + a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, + ) + .v128() +} + +/// Materializes a SIMD value from the provided operands. +/// +/// If possible this will generate a `v128.const` instruction, otherwise it may +/// be lowered to a sequence of instructions to materialize the vector value. +#[inline] +#[doc(alias("v128.const"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")] +#[target_feature(enable = "simd128")] +pub const fn u8x16( + a0: u8, + a1: u8, + a2: u8, + a3: u8, + a4: u8, + a5: u8, + a6: u8, + a7: u8, + a8: u8, + a9: u8, + a10: u8, + a11: u8, + a12: u8, + a13: u8, + a14: u8, + a15: u8, +) -> v128 { + simd::u8x16::new( + a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, + ) + .v128() +} + +/// Materializes a SIMD value from the provided operands. +/// +/// If possible this will generate a `v128.const` instruction, otherwise it may +/// be lowered to a sequence of instructions to materialize the vector value. 
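+///
+/// A brief sketch (assuming a `wasm32` target with `simd128` enabled):
+///
+/// ```rust,ignore
+/// // Constant operands should lower to a single `v128.const` instruction.
+/// let v = i16x8(0, 1, 2, 3, 4, 5, 6, 7);
+/// assert_eq!(i16x8_extract_lane::<5>(v), 5);
+/// ```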
+#[inline] +#[cfg_attr( + test, + assert_instr( + v128.const, + a0 = 0, + a1 = 1, + a2 = 2, + a3 = 3, + a4 = 4, + a5 = 5, + a6 = 6, + a7 = 7, + ) +)] +#[doc(alias("v128.const"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")] +#[target_feature(enable = "simd128")] +pub const fn i16x8(a0: i16, a1: i16, a2: i16, a3: i16, a4: i16, a5: i16, a6: i16, a7: i16) -> v128 { + simd::i16x8::new(a0, a1, a2, a3, a4, a5, a6, a7).v128() +} + +/// Materializes a SIMD value from the provided operands. +/// +/// If possible this will generate a `v128.const` instruction, otherwise it may +/// be lowered to a sequence of instructions to materialize the vector value. +#[inline] +#[doc(alias("v128.const"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")] +#[target_feature(enable = "simd128")] +pub const fn u16x8(a0: u16, a1: u16, a2: u16, a3: u16, a4: u16, a5: u16, a6: u16, a7: u16) -> v128 { + simd::u16x8::new(a0, a1, a2, a3, a4, a5, a6, a7).v128() +} + +/// Materializes a SIMD value from the provided operands. +/// +/// If possible this will generate a `v128.const` instruction, otherwise it may +/// be lowered to a sequence of instructions to materialize the vector value. +#[inline] +#[cfg_attr(test, assert_instr(v128.const, a0 = 0, a1 = 1, a2 = 2, a3 = 3))] +#[doc(alias("v128.const"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")] +#[target_feature(enable = "simd128")] +pub const fn i32x4(a0: i32, a1: i32, a2: i32, a3: i32) -> v128 { + simd::i32x4::new(a0, a1, a2, a3).v128() +} + +/// Materializes a SIMD value from the provided operands. +/// +/// If possible this will generate a `v128.const` instruction, otherwise it may +/// be lowered to a sequence of instructions to materialize the vector value. +#[inline] +#[doc(alias("v128.const"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")] +#[target_feature(enable = "simd128")] +pub const fn u32x4(a0: u32, a1: u32, a2: u32, a3: u32) -> v128 { + simd::u32x4::new(a0, a1, a2, a3).v128() +} + +/// Materializes a SIMD value from the provided operands. +/// +/// If possible this will generate a `v128.const` instruction, otherwise it may +/// be lowered to a sequence of instructions to materialize the vector value. +#[inline] +#[cfg_attr(test, assert_instr(v128.const, a0 = 1, a1 = 2))] +#[doc(alias("v128.const"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")] +#[target_feature(enable = "simd128")] +pub const fn i64x2(a0: i64, a1: i64) -> v128 { + simd::i64x2::new(a0, a1).v128() +} + +/// Materializes a SIMD value from the provided operands. +/// +/// If possible this will generate a `v128.const` instruction, otherwise it may +/// be lowered to a sequence of instructions to materialize the vector value. +#[inline] +#[doc(alias("v128.const"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")] +#[target_feature(enable = "simd128")] +pub const fn u64x2(a0: u64, a1: u64) -> v128 { + simd::u64x2::new(a0, a1).v128() +} + +/// Materializes a SIMD value from the provided operands. +/// +/// If possible this will generate a `v128.const` instruction, otherwise it may +/// be lowered to a sequence of instructions to materialize the vector value. 
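+///
+/// A brief sketch (assuming a `wasm32` target with `simd128` enabled):
+///
+/// ```rust,ignore
+/// // Constant operands should lower to a single `v128.const` instruction.
+/// let v = f32x4(0.5, 1.5, 2.5, 3.5);
+/// assert_eq!(f32x4_extract_lane::<2>(v), 2.5);
+/// ```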
+#[inline] +#[cfg_attr(test, assert_instr(v128.const, a0 = 0.0, a1 = 1.0, a2 = 2.0, a3 = 3.0))] +#[doc(alias("v128.const"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +#[rustc_const_stable(feature = "wasm_simd_const", since = "1.56.0")] +#[target_feature(enable = "simd128")] +pub const fn f32x4(a0: f32, a1: f32, a2: f32, a3: f32) -> v128 { + simd::f32x4::new(a0, a1, a2, a3).v128() +} + +/// Materializes a SIMD value from the provided operands. +/// +/// If possible this will generate a `v128.const` instruction, otherwise it may +/// be lowered to a sequence of instructions to materialize the vector value. +#[inline] +#[cfg_attr(test, assert_instr(v128.const, a0 = 0.0, a1 = 1.0))] +#[doc(alias("v128.const"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +#[rustc_const_stable(feature = "wasm_simd_const", since = "1.56.0")] +#[target_feature(enable = "simd128")] +pub const fn f64x2(a0: f64, a1: f64) -> v128 { + simd::f64x2::new(a0, a1).v128() +} + +/// Returns a new vector with lanes selected from the lanes of the two input +/// vectors `$a` and `$b` specified in the 16 immediate operands. +/// +/// The `$a` and `$b` expressions must have type `v128`, and this function +/// generates a wasm instruction that is encoded with 16 bytes providing the +/// indices of the elements to return. The indices `i` in range [0, 15] select +/// the `i`-th element of `a`. The indices in range [16, 31] select the `i - +/// 16`-th element of `b`. +/// +/// Note that this is a macro due to the codegen requirements of all of the +/// index expressions `$i*` must be constant. A compiler error will be +/// generated if any of the expressions are not constant. +/// +/// All indexes `$i*` must have the type `u32`. +#[inline] +#[cfg_attr(test, + assert_instr( + i8x16.shuffle, + I0 = 0, + I1 = 2, + I2 = 4, + I3 = 6, + I4 = 8, + I5 = 10, + I6 = 12, + I7 = 14, + I8 = 16, + I9 = 18, + I10 = 20, + I11 = 22, + I12 = 24, + I13 = 26, + I14 = 28, + I15 = 30, + ) +)] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.shuffle"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i8x16_shuffle< + const I0: usize, + const I1: usize, + const I2: usize, + const I3: usize, + const I4: usize, + const I5: usize, + const I6: usize, + const I7: usize, + const I8: usize, + const I9: usize, + const I10: usize, + const I11: usize, + const I12: usize, + const I13: usize, + const I14: usize, + const I15: usize, +>( + a: v128, + b: v128, +) -> v128 { + static_assert!(I0 < 32); + static_assert!(I1 < 32); + static_assert!(I2 < 32); + static_assert!(I3 < 32); + static_assert!(I4 < 32); + static_assert!(I5 < 32); + static_assert!(I6 < 32); + static_assert!(I7 < 32); + static_assert!(I8 < 32); + static_assert!(I9 < 32); + static_assert!(I10 < 32); + static_assert!(I11 < 32); + static_assert!(I12 < 32); + static_assert!(I13 < 32); + static_assert!(I14 < 32); + static_assert!(I15 < 32); + let shuf: simd::u8x16 = unsafe { + simd_shuffle!( + a.as_u8x16(), + b.as_u8x16(), + [ + I0 as u32, I1 as u32, I2 as u32, I3 as u32, I4 as u32, I5 as u32, I6 as u32, + I7 as u32, I8 as u32, I9 as u32, I10 as u32, I11 as u32, I12 as u32, I13 as u32, + I14 as u32, I15 as u32, + ], + ) + }; + shuf.v128() +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i8x16_shuffle as u8x16_shuffle; + +/// Same as [`i8x16_shuffle`], except operates as if the inputs were eight +/// 16-bit integers, only taking 8 indices to shuffle. +/// +/// Indices in the range [0, 7] select from `a` while [8, 15] select from `b`. 
+/// Note that this will generate the `i8x16.shuffle` instruction, since there +/// is no native `i16x8.shuffle` instruction (there is no need for one since +/// `i8x16.shuffle` suffices). +#[inline] +#[cfg_attr(test, + assert_instr( + i8x16.shuffle, + I0 = 0, + I1 = 2, + I2 = 4, + I3 = 6, + I4 = 8, + I5 = 10, + I6 = 12, + I7 = 14, + ) +)] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.shuffle"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_shuffle< + const I0: usize, + const I1: usize, + const I2: usize, + const I3: usize, + const I4: usize, + const I5: usize, + const I6: usize, + const I7: usize, +>( + a: v128, + b: v128, +) -> v128 { + static_assert!(I0 < 16); + static_assert!(I1 < 16); + static_assert!(I2 < 16); + static_assert!(I3 < 16); + static_assert!(I4 < 16); + static_assert!(I5 < 16); + static_assert!(I6 < 16); + static_assert!(I7 < 16); + let shuf: simd::u16x8 = unsafe { + simd_shuffle!( + a.as_u16x8(), + b.as_u16x8(), + [ + I0 as u32, I1 as u32, I2 as u32, I3 as u32, I4 as u32, I5 as u32, I6 as u32, + I7 as u32, + ], + ) + }; + shuf.v128() +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i16x8_shuffle as u16x8_shuffle; + +/// Same as [`i8x16_shuffle`], except operates as if the inputs were four +/// 32-bit integers, only taking 4 indices to shuffle. +/// +/// Indices in the range [0, 3] select from `a` while [4, 7] select from `b`. +/// Note that this will generate the `i8x16.shuffle` instruction, since there +/// is no native `i32x4.shuffle` instruction (there is no need for one since +/// `i8x16.shuffle` suffices). +#[inline] +#[cfg_attr(test, assert_instr(i8x16.shuffle, I0 = 0, I1 = 2, I2 = 4, I3 = 6))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.shuffle"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_shuffle( + a: v128, + b: v128, +) -> v128 { + static_assert!(I0 < 8); + static_assert!(I1 < 8); + static_assert!(I2 < 8); + static_assert!(I3 < 8); + let shuf: simd::u32x4 = unsafe { + simd_shuffle!( + a.as_u32x4(), + b.as_u32x4(), + [I0 as u32, I1 as u32, I2 as u32, I3 as u32], + ) + }; + shuf.v128() +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i32x4_shuffle as u32x4_shuffle; + +/// Same as [`i8x16_shuffle`], except operates as if the inputs were two +/// 64-bit integers, only taking 2 indices to shuffle. +/// +/// Indices in the range [0, 1] select from `a` while [2, 3] select from `b`. +/// Note that this will generate the `v8x16.shuffle` instruction, since there +/// is no native `i64x2.shuffle` instruction (there is no need for one since +/// `i8x16.shuffle` suffices). +#[inline] +#[cfg_attr(test, assert_instr(i8x16.shuffle, I0 = 0, I1 = 2))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.shuffle"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_shuffle(a: v128, b: v128) -> v128 { + static_assert!(I0 < 4); + static_assert!(I1 < 4); + let shuf: simd::u64x2 = + unsafe { simd_shuffle!(a.as_u64x2(), b.as_u64x2(), [I0 as u32, I1 as u32]) }; + shuf.v128() +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i64x2_shuffle as u64x2_shuffle; + +/// Extracts a lane from a 128-bit vector interpreted as 16 packed i8 numbers. +/// +/// Extracts the scalar value of lane specified in the immediate mode operand +/// `N` from `a`. If `N` is out of bounds then it is a compile time error. 
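+///
+/// An illustrative sketch (assuming a `wasm32` target with `simd128` enabled):
+///
+/// ```rust,ignore
+/// let v = i8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+/// // The lane index is a const generic; out-of-range indices fail to compile.
+/// assert_eq!(i8x16_extract_lane::<3>(v), 3);
+/// ```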
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.extract_lane_s, N = 3))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.extract_lane_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_extract_lane<const N: usize>(a: v128) -> i8 {
+    static_assert!(N < 16);
+    unsafe { simd_extract!(a.as_i8x16(), N as u32) }
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 16 packed u8 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.extract_lane_u, N = 3))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.extract_lane_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_extract_lane<const N: usize>(a: v128) -> u8 {
+    static_assert!(N < 16);
+    unsafe { simd_extract!(a.as_u8x16(), N as u32) }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 16 packed i8 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.replace_lane, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_replace_lane<const N: usize>(a: v128, val: i8) -> v128 {
+    static_assert!(N < 16);
+    unsafe { simd_insert!(a.as_i8x16(), N as u32, val).v128() }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 16 packed u8 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.replace_lane, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_replace_lane<const N: usize>(a: v128, val: u8) -> v128 {
+    static_assert!(N < 16);
+    unsafe { simd_insert!(a.as_u8x16(), N as u32, val).v128() }
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 8 packed i16 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extract_lane_s, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extract_lane_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extract_lane<const N: usize>(a: v128) -> i16 {
+    static_assert!(N < 8);
+    unsafe { simd_extract!(a.as_i16x8(), N as u32) }
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 8 packed u16 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extract_lane_u, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extract_lane_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_extract_lane<const N: usize>(a: v128) -> u16 {
+    static_assert!(N < 8);
+    unsafe { simd_extract!(a.as_u16x8(), N as u32) }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 8 packed i16 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
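+///
+/// A usage sketch (assuming a `wasm32` target with `simd128` enabled):
+///
+/// ```rust,ignore
+/// let v = i16x8(0, 0, 0, 0, 0, 0, 0, 0);
+/// // Returns a new vector: lane 2 becomes 7 and all other lanes are unchanged.
+/// let r = i16x8_replace_lane::<2>(v, 7);
+/// assert_eq!(i16x8_extract_lane::<2>(r), 7);
+/// assert_eq!(i16x8_extract_lane::<0>(r), 0);
+/// ```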
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.replace_lane, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_replace_lane<const N: usize>(a: v128, val: i16) -> v128 {
+    static_assert!(N < 8);
+    unsafe { simd_insert!(a.as_i16x8(), N as u32, val).v128() }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 8 packed u16 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.replace_lane, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_replace_lane<const N: usize>(a: v128, val: u16) -> v128 {
+    static_assert!(N < 8);
+    unsafe { simd_insert!(a.as_u16x8(), N as u32, val).v128() }
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 4 packed i32 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.extract_lane, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.extract_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_extract_lane<const N: usize>(a: v128) -> i32 {
+    static_assert!(N < 4);
+    unsafe { simd_extract!(a.as_i32x4(), N as u32) }
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 4 packed u32 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.extract_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u32x4_extract_lane<const N: usize>(a: v128) -> u32 {
+    i32x4_extract_lane::<N>(a) as u32
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 4 packed i32 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.replace_lane, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_replace_lane<const N: usize>(a: v128, val: i32) -> v128 {
+    static_assert!(N < 4);
+    unsafe { simd_insert!(a.as_i32x4(), N as u32, val).v128() }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 4 packed u32 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u32x4_replace_lane<const N: usize>(a: v128, val: u32) -> v128 {
+    i32x4_replace_lane::<N>(a, val as i32)
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 2 packed i64 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.extract_lane, N = 1))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.extract_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_extract_lane<const N: usize>(a: v128) -> i64 {
+    static_assert!(N < 2);
+    unsafe { simd_extract!(a.as_i64x2(), N as u32) }
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 2 packed u64 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.extract_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u64x2_extract_lane<const N: usize>(a: v128) -> u64 {
+    i64x2_extract_lane::<N>(a) as u64
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 2 packed i64 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.replace_lane, N = 0))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_replace_lane<const N: usize>(a: v128, val: i64) -> v128 {
+    static_assert!(N < 2);
+    unsafe { simd_insert!(a.as_i64x2(), N as u32, val).v128() }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 2 packed u64 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u64x2_replace_lane<const N: usize>(a: v128, val: u64) -> v128 {
+    i64x2_replace_lane::<N>(a, val as i64)
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 4 packed f32 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.extract_lane, N = 1))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.extract_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_extract_lane<const N: usize>(a: v128) -> f32 {
+    static_assert!(N < 4);
+    unsafe { simd_extract!(a.as_f32x4(), N as u32) }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 4 packed f32 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.replace_lane, N = 1))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_replace_lane<const N: usize>(a: v128, val: f32) -> v128 {
+    static_assert!(N < 4);
+    unsafe { simd_insert!(a.as_f32x4(), N as u32, val).v128() }
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 2 packed f64 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.extract_lane, N = 1))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.extract_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_extract_lane<const N: usize>(a: v128) -> f64 {
+    static_assert!(N < 2);
+    unsafe { simd_extract!(a.as_f64x2(), N as u32) }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 2 packed f64 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.replace_lane, N = 1))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_replace_lane<const N: usize>(a: v128, val: f64) -> v128 {
+    static_assert!(N < 2);
+    unsafe { simd_insert!(a.as_f64x2(), N as u32, val).v128() }
+}
+
+/// Returns a new vector with lanes selected from the lanes of the first input
+/// vector `a` specified in the second input vector `s`.
+///
+/// The indices `i` in range [0, 15] select the `i`-th element of `a`. For
+/// indices outside of the range the resulting lane is 0.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.swizzle))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.swizzle"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_swizzle(a: v128, s: v128) -> v128 {
+    unsafe { llvm_swizzle(a.as_i8x16(), s.as_i8x16()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_swizzle as u8x16_swizzle;
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `x` replicated to all 16 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_splat(a: i8) -> v128 {
+    simd::i8x16::splat(a).v128()
+}
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `x` replicated to all 16 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_splat(a: u8) -> v128 {
+    simd::u8x16::splat(a).v128()
+}
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `x` replicated to all 8 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_splat(a: i16) -> v128 {
+    simd::i16x8::splat(a).v128()
+}
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `x` replicated to all 8 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_splat(a: u16) -> v128 {
+    simd::u16x8::splat(a).v128()
+}
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `x` replicated to all 4 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_splat(a: i32) -> v128 {
+    simd::i32x4::splat(a).v128()
+}
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `x` replicated to all 4 lanes.
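+///
+/// A minimal usage sketch (illustrative only, not part of the original patch;
+/// it uses only functions defined in this module):
+///
+/// ```ignore
+/// // Splat a constant into every lane, then patch a single lane.
+/// let all_ones = u32x4_splat(1);
+/// let v = u32x4_replace_lane::<0>(all_ones, 99);
+/// assert_eq!(u32x4_extract_lane::<0>(v), 99);
+/// assert_eq!(u32x4_extract_lane::<1>(v), 1);
+/// ```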
+#[inline] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.splat"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u32x4_splat(a: u32) -> v128 { + i32x4_splat(a as i32) +} + +/// Creates a vector with identical lanes. +/// +/// Construct a vector with `x` replicated to all 2 lanes. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.splat))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.splat"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_splat(a: i64) -> v128 { + simd::i64x2::splat(a).v128() +} + +/// Creates a vector with identical lanes. +/// +/// Construct a vector with `x` replicated to all 2 lanes. +#[inline] +#[target_feature(enable = "simd128")] +#[doc(alias("u64x2.splat"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u64x2_splat(a: u64) -> v128 { + i64x2_splat(a as i64) +} + +/// Creates a vector with identical lanes. +/// +/// Constructs a vector with `x` replicated to all 4 lanes. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.splat))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.splat"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_splat(a: f32) -> v128 { + simd::f32x4::splat(a).v128() +} + +/// Creates a vector with identical lanes. +/// +/// Constructs a vector with `x` replicated to all 2 lanes. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.splat))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.splat"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_splat(a: f64) -> v128 { + simd::f64x2::splat(a).v128() +} + +/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit +/// integers. +/// +/// Returns a new vector where each lane is all ones if the corresponding input elements +/// were equal, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.eq))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.eq"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i8x16_eq(a: v128, b: v128) -> v128 { + unsafe { simd_eq::<_, simd::i8x16>(a.as_i8x16(), b.as_i8x16()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit +/// integers. +/// +/// Returns a new vector where each lane is all ones if the corresponding input elements +/// were not equal, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.ne))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.ne"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i8x16_ne(a: v128, b: v128) -> v128 { + unsafe { simd_ne::<_, simd::i8x16>(a.as_i8x16(), b.as_i8x16()).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i8x16_eq as u8x16_eq; +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i8x16_ne as u8x16_ne; + +/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.lt_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.lt_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i8x16_lt(a: v128, b: v128) -> v128 { + unsafe { simd_lt::<_, simd::i8x16>(a.as_i8x16(), b.as_i8x16()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit +/// unsigned integers. 
+/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.lt_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.lt_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u8x16_lt(a: v128, b: v128) -> v128 { + unsafe { simd_lt::<_, simd::i8x16>(a.as_u8x16(), b.as_u8x16()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.gt_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.gt_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i8x16_gt(a: v128, b: v128) -> v128 { + unsafe { simd_gt::<_, simd::i8x16>(a.as_i8x16(), b.as_i8x16()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit +/// unsigned integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.gt_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.gt_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u8x16_gt(a: v128, b: v128) -> v128 { + unsafe { simd_gt::<_, simd::i8x16>(a.as_u8x16(), b.as_u8x16()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.le_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.le_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i8x16_le(a: v128, b: v128) -> v128 { + unsafe { simd_le::<_, simd::i8x16>(a.as_i8x16(), b.as_i8x16()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit +/// unsigned integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.le_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.le_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u8x16_le(a: v128, b: v128) -> v128 { + unsafe { simd_le::<_, simd::i8x16>(a.as_u8x16(), b.as_u8x16()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.ge_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.ge_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i8x16_ge(a: v128, b: v128) -> v128 { + unsafe { simd_ge::<_, simd::i8x16>(a.as_i8x16(), b.as_i8x16()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit +/// unsigned integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. 
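+///
+/// A minimal usage sketch (illustrative only, not part of the original patch;
+/// it uses only functions defined in this module):
+///
+/// ```ignore
+/// // Comparison results are lane masks (all ones or all zeros), so they can
+/// // feed `u8x16_all_true` or `v128_bitselect` directly.
+/// let a = u8x16_splat(3);
+/// let b = u8x16_splat(2);
+/// assert!(u8x16_all_true(u8x16_ge(a, b)));
+/// ```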
+#[inline] +#[cfg_attr(test, assert_instr(i8x16.ge_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.ge_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u8x16_ge(a: v128, b: v128) -> v128 { + unsafe { simd_ge::<_, simd::i8x16>(a.as_u8x16(), b.as_u8x16()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit +/// integers. +/// +/// Returns a new vector where each lane is all ones if the corresponding input elements +/// were equal, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.eq))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.eq"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_eq(a: v128, b: v128) -> v128 { + unsafe { simd_eq::<_, simd::i16x8>(a.as_i16x8(), b.as_i16x8()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit +/// integers. +/// +/// Returns a new vector where each lane is all ones if the corresponding input elements +/// were not equal, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.ne))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.ne"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_ne(a: v128, b: v128) -> v128 { + unsafe { simd_ne::<_, simd::i16x8>(a.as_i16x8(), b.as_i16x8()).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i16x8_eq as u16x8_eq; +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i16x8_ne as u16x8_ne; + +/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.lt_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.lt_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_lt(a: v128, b: v128) -> v128 { + unsafe { simd_lt::<_, simd::i16x8>(a.as_i16x8(), b.as_i16x8()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit +/// unsigned integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.lt_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.lt_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u16x8_lt(a: v128, b: v128) -> v128 { + unsafe { simd_lt::<_, simd::i16x8>(a.as_u16x8(), b.as_u16x8()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.gt_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.gt_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_gt(a: v128, b: v128) -> v128 { + unsafe { simd_gt::<_, simd::i16x8>(a.as_i16x8(), b.as_i16x8()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit +/// unsigned integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. 
+#[inline] +#[cfg_attr(test, assert_instr(i16x8.gt_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.gt_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u16x8_gt(a: v128, b: v128) -> v128 { + unsafe { simd_gt::<_, simd::i16x8>(a.as_u16x8(), b.as_u16x8()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.le_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.le_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_le(a: v128, b: v128) -> v128 { + unsafe { simd_le::<_, simd::i16x8>(a.as_i16x8(), b.as_i16x8()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit +/// unsigned integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.le_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.le_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u16x8_le(a: v128, b: v128) -> v128 { + unsafe { simd_le::<_, simd::i16x8>(a.as_u16x8(), b.as_u16x8()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.ge_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.ge_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_ge(a: v128, b: v128) -> v128 { + unsafe { simd_ge::<_, simd::i16x8>(a.as_i16x8(), b.as_i16x8()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit +/// unsigned integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.ge_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.ge_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u16x8_ge(a: v128, b: v128) -> v128 { + unsafe { simd_ge::<_, simd::i16x8>(a.as_u16x8(), b.as_u16x8()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// integers. +/// +/// Returns a new vector where each lane is all ones if the corresponding input elements +/// were equal, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.eq))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.eq"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_eq(a: v128, b: v128) -> v128 { + unsafe { simd_eq::<_, simd::i32x4>(a.as_i32x4(), b.as_i32x4()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// integers. +/// +/// Returns a new vector where each lane is all ones if the corresponding input elements +/// were not equal, or all zeros otherwise. 
+#[inline] +#[cfg_attr(test, assert_instr(i32x4.ne))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.ne"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_ne(a: v128, b: v128) -> v128 { + unsafe { simd_ne::<_, simd::i32x4>(a.as_i32x4(), b.as_i32x4()).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i32x4_eq as u32x4_eq; +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i32x4_ne as u32x4_ne; + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.lt_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.lt_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_lt(a: v128, b: v128) -> v128 { + unsafe { simd_lt::<_, simd::i32x4>(a.as_i32x4(), b.as_i32x4()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// unsigned integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.lt_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.lt_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u32x4_lt(a: v128, b: v128) -> v128 { + unsafe { simd_lt::<_, simd::i32x4>(a.as_u32x4(), b.as_u32x4()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.gt_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.gt_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_gt(a: v128, b: v128) -> v128 { + unsafe { simd_gt::<_, simd::i32x4>(a.as_i32x4(), b.as_i32x4()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// unsigned integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.gt_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.gt_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u32x4_gt(a: v128, b: v128) -> v128 { + unsafe { simd_gt::<_, simd::i32x4>(a.as_u32x4(), b.as_u32x4()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.le_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.le_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_le(a: v128, b: v128) -> v128 { + unsafe { simd_le::<_, simd::i32x4>(a.as_i32x4(), b.as_i32x4()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// unsigned integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. 
+#[inline] +#[cfg_attr(test, assert_instr(i32x4.le_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.le_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u32x4_le(a: v128, b: v128) -> v128 { + unsafe { simd_le::<_, simd::i32x4>(a.as_u32x4(), b.as_u32x4()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.ge_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.ge_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_ge(a: v128, b: v128) -> v128 { + unsafe { simd_ge::<_, simd::i32x4>(a.as_i32x4(), b.as_i32x4()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// unsigned integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.ge_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.ge_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u32x4_ge(a: v128, b: v128) -> v128 { + unsafe { simd_ge::<_, simd::i32x4>(a.as_u32x4(), b.as_u32x4()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// integers. +/// +/// Returns a new vector where each lane is all ones if the corresponding input elements +/// were equal, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.eq))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.eq"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_eq(a: v128, b: v128) -> v128 { + unsafe { simd_eq::<_, simd::i64x2>(a.as_i64x2(), b.as_i64x2()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// integers. +/// +/// Returns a new vector where each lane is all ones if the corresponding input elements +/// were not equal, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.ne))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.ne"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_ne(a: v128, b: v128) -> v128 { + unsafe { simd_ne::<_, simd::i64x2>(a.as_i64x2(), b.as_i64x2()).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i64x2_eq as u64x2_eq; +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i64x2_ne as u64x2_ne; + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.lt_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.lt_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_lt(a: v128, b: v128) -> v128 { + unsafe { simd_lt::<_, simd::i64x2>(a.as_i64x2(), b.as_i64x2()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. 
+#[inline] +#[cfg_attr(test, assert_instr(i64x2.gt_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.gt_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_gt(a: v128, b: v128) -> v128 { + unsafe { simd_gt::<_, simd::i64x2>(a.as_i64x2(), b.as_i64x2()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.le_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.le_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_le(a: v128, b: v128) -> v128 { + unsafe { simd_le::<_, simd::i64x2>(a.as_i64x2(), b.as_i64x2()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.ge_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.ge_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_ge(a: v128, b: v128) -> v128 { + unsafe { simd_ge::<_, simd::i64x2>(a.as_i64x2(), b.as_i64x2()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// floating point numbers. +/// +/// Returns a new vector where each lane is all ones if the corresponding input elements +/// were equal, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.eq))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.eq"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_eq(a: v128, b: v128) -> v128 { + unsafe { simd_eq::<_, simd::i32x4>(a.as_f32x4(), b.as_f32x4()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// floating point numbers. +/// +/// Returns a new vector where each lane is all ones if the corresponding input elements +/// were not equal, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.ne))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.ne"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_ne(a: v128, b: v128) -> v128 { + unsafe { simd_ne::<_, simd::i32x4>(a.as_f32x4(), b.as_f32x4()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// floating point numbers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.lt))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.lt"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_lt(a: v128, b: v128) -> v128 { + unsafe { simd_lt::<_, simd::i32x4>(a.as_f32x4(), b.as_f32x4()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// floating point numbers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. 
+#[inline] +#[cfg_attr(test, assert_instr(f32x4.gt))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.gt"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_gt(a: v128, b: v128) -> v128 { + unsafe { simd_gt::<_, simd::i32x4>(a.as_f32x4(), b.as_f32x4()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// floating point numbers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.le))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.le"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_le(a: v128, b: v128) -> v128 { + unsafe { simd_le::<_, simd::i32x4>(a.as_f32x4(), b.as_f32x4()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit +/// floating point numbers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.ge))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.ge"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_ge(a: v128, b: v128) -> v128 { + unsafe { simd_ge::<_, simd::i32x4>(a.as_f32x4(), b.as_f32x4()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// floating point numbers. +/// +/// Returns a new vector where each lane is all ones if the corresponding input elements +/// were equal, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.eq))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.eq"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_eq(a: v128, b: v128) -> v128 { + unsafe { simd_eq::<_, simd::i64x2>(a.as_f64x2(), b.as_f64x2()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// floating point numbers. +/// +/// Returns a new vector where each lane is all ones if the corresponding input elements +/// were not equal, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.ne))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.ne"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_ne(a: v128, b: v128) -> v128 { + unsafe { simd_ne::<_, simd::i64x2>(a.as_f64x2(), b.as_f64x2()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// floating point numbers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.lt))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.lt"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_lt(a: v128, b: v128) -> v128 { + unsafe { simd_lt::<_, simd::i64x2>(a.as_f64x2(), b.as_f64x2()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// floating point numbers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. 
+#[inline] +#[cfg_attr(test, assert_instr(f64x2.gt))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.gt"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_gt(a: v128, b: v128) -> v128 { + unsafe { simd_gt::<_, simd::i64x2>(a.as_f64x2(), b.as_f64x2()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// floating point numbers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is less than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.le))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.le"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_le(a: v128, b: v128) -> v128 { + unsafe { simd_le::<_, simd::i64x2>(a.as_f64x2(), b.as_f64x2()).v128() } +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// floating point numbers. +/// +/// Returns a new vector where each lane is all ones if the lane-wise left +/// element is greater than the right element, or all zeros otherwise. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.ge))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.ge"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_ge(a: v128, b: v128) -> v128 { + unsafe { simd_ge::<_, simd::i64x2>(a.as_f64x2(), b.as_f64x2()).v128() } +} + +/// Flips each bit of the 128-bit input vector. +#[inline] +#[cfg_attr(test, assert_instr(v128.not))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.not"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn v128_not(a: v128) -> v128 { + unsafe { simd_xor(a.as_i64x2(), simd::i64x2::new(!0, !0)).v128() } +} + +/// Performs a bitwise and of the two input 128-bit vectors, returning the +/// resulting vector. +#[inline] +#[cfg_attr(test, assert_instr(v128.and))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.and"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn v128_and(a: v128, b: v128) -> v128 { + unsafe { simd_and(a.as_i64x2(), b.as_i64x2()).v128() } +} + +/// Bitwise AND of bits of `a` and the logical inverse of bits of `b`. +/// +/// This operation is equivalent to `v128.and(a, v128.not(b))` +#[inline] +#[cfg_attr(test, assert_instr(v128.andnot))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.andnot"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn v128_andnot(a: v128, b: v128) -> v128 { + unsafe { + simd_and( + a.as_i64x2(), + simd_xor(b.as_i64x2(), simd::i64x2::new(-1, -1)), + ) + .v128() + } +} + +/// Performs a bitwise or of the two input 128-bit vectors, returning the +/// resulting vector. +#[inline] +#[cfg_attr(test, assert_instr(v128.or))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.or"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn v128_or(a: v128, b: v128) -> v128 { + unsafe { simd_or(a.as_i64x2(), b.as_i64x2()).v128() } +} + +/// Performs a bitwise xor of the two input 128-bit vectors, returning the +/// resulting vector. +#[inline] +#[cfg_attr(test, assert_instr(v128.xor))] +#[target_feature(enable = "simd128")] +#[doc(alias("v128.xor"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn v128_xor(a: v128, b: v128) -> v128 { + unsafe { simd_xor(a.as_i64x2(), b.as_i64x2()).v128() } +} + +/// Use the bitmask in `c` to select bits from `v1` when 1 and `v2` when 0. 
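+///
+/// A minimal usage sketch (illustrative only, not part of the original patch;
+/// it uses only functions defined in this module):
+///
+/// ```ignore
+/// // Take the low nibble of each byte from `a` and the high nibble from `b`.
+/// let a = u8x16_splat(0x12);
+/// let b = u8x16_splat(0x34);
+/// let mixed = v128_bitselect(a, b, u8x16_splat(0x0f));
+/// assert_eq!(u8x16_extract_lane::<0>(mixed), 0x32);
+/// ```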
+#[inline]
+#[cfg_attr(test, assert_instr(v128.bitselect))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.bitselect"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn v128_bitselect(v1: v128, v2: v128, c: v128) -> v128 {
+    unsafe { llvm_bitselect(v1.as_i8x16(), v2.as_i8x16(), c.as_i8x16()).v128() }
+}
+
+/// Returns `true` if any bit in `a` is set, or `false` otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.any_true))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.any_true"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn v128_any_true(a: v128) -> bool {
+    unsafe { llvm_any_true_i8x16(a.as_i8x16()) != 0 }
+}
+
+/// Lane-wise wrapping absolute value.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.abs))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.abs"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_abs(a: v128) -> v128 {
+    unsafe {
+        let a = a.as_i8x16();
+        let zero = simd::i8x16::ZERO;
+        simd_select::<simd::m8x16, simd::i8x16>(simd_lt(a, zero), simd_sub(zero, a), a).v128()
+    }
+}
+
+/// Negates a 128-bit vector interpreted as sixteen 8-bit signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.neg))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.neg"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_neg(a: v128) -> v128 {
+    unsafe { simd_mul(a.as_i8x16(), simd::i8x16::splat(-1)).v128() }
+}
+
+/// Count the number of bits set to one within each lane.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.popcnt))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.popcnt"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_popcnt(v: v128) -> v128 {
+    unsafe { simd_ctpop(v.as_i8x16()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_popcnt as u8x16_popcnt;
+
+/// Returns true if all lanes are non-zero, false otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.all_true))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.all_true"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_all_true(a: v128) -> bool {
+    unsafe { llvm_i8x16_all_true(a.as_i8x16()) != 0 }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_all_true as u8x16_all_true;
+
+/// Extracts the high bit for each lane in `a` and produces a scalar mask with
+/// all bits concatenated.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.bitmask))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.bitmask"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_bitmask(a: v128) -> u16 {
+    unsafe { llvm_bitmask_i8x16(a.as_i8x16()) as u16 }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_bitmask as u8x16_bitmask;
+
+/// Converts two input vectors into a smaller lane vector by narrowing each
+/// lane.
+///
+/// Signed saturation to 0x7f or 0x80 is used and the input lanes are always
+/// interpreted as signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.narrow_i16x8_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.narrow_i16x8_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_narrow_i16x8(a: v128, b: v128) -> v128 {
+    unsafe { llvm_narrow_i8x16_s(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Converts two input vectors into a smaller lane vector by narrowing each
+/// lane.
+///
+/// Signed saturation to 0x00 or 0xff is used and the input lanes are always
+/// interpreted as signed integers.
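+///
+/// A minimal usage sketch (illustrative only, not part of the original patch;
+/// it uses only functions defined in this module):
+///
+/// ```ignore
+/// // Lanes outside the u8 range saturate: 300 becomes 0xff and -5 becomes 0x00.
+/// let narrowed = u8x16_narrow_i16x8(i16x8_splat(300), i16x8_splat(-5));
+/// assert_eq!(u8x16_extract_lane::<0>(narrowed), 0xff);
+/// assert_eq!(u8x16_extract_lane::<8>(narrowed), 0x00);
+/// ```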
+#[inline] +#[cfg_attr(test, assert_instr(i8x16.narrow_i16x8_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.narrow_i16x8_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u8x16_narrow_i16x8(a: v128, b: v128) -> v128 { + unsafe { llvm_narrow_i8x16_u(a.as_i16x8(), b.as_i16x8()).v128() } +} + +/// Shifts each lane to the left by the specified number of bits. +/// +/// Only the low bits of the shift amount are used if the shift amount is +/// greater than the lane width. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.shl))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.shl"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i8x16_shl(a: v128, amt: u32) -> v128 { + // SAFETY: the safety of this intrinsic relies on the fact that the + // shift amount for each lane is less than the number of bits in the input + // lane. In this case the input has 8-bit lanes but the shift amount above + // is `u32`, so a mask is required to discard all the upper bits of `amt` to + // ensure that the safety condition is met. + // + // Note that this is distinct from the behavior of the native WebAssembly + // instruction here where WebAssembly defines this instruction as performing + // a mask as well. This is nonetheless required since this must have defined + // semantics in LLVM, not just WebAssembly. + // + // Finally note that this mask operation is not actually emitted into the + // final binary itself. LLVM understands that the wasm operation implicitly + // masks, so it knows this mask operation is redundant. + // + // Basically the extra mask here is required as a bridge from the documented + // semantics through LLVM back out to WebAssembly. Both ends have the + // documented semantics, and the mask is required by LLVM in the middle. + unsafe { simd_shl(a.as_i8x16(), simd::i8x16::splat((amt & 0x7) as i8)).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i8x16_shl as u8x16_shl; + +/// Shifts each lane to the right by the specified number of bits, sign +/// extending. +/// +/// Only the low bits of the shift amount are used if the shift amount is +/// greater than the lane width. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.shr_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.shr_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i8x16_shr(a: v128, amt: u32) -> v128 { + // SAFETY: see i8x16_shl for more documentation why this is unsafe, + // essentially the shift amount must be valid hence the mask. + unsafe { simd_shr(a.as_i8x16(), simd::i8x16::splat((amt & 0x7) as i8)).v128() } +} + +/// Shifts each lane to the right by the specified number of bits, shifting in +/// zeros. +/// +/// Only the low bits of the shift amount are used if the shift amount is +/// greater than the lane width. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.shr_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i8x16.shr_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u8x16_shr(a: v128, amt: u32) -> v128 { + // SAFETY: see i8x16_shl for more documentation why this is unsafe, + // essentially the shift amount must be valid hence the mask. + unsafe { simd_shr(a.as_u8x16(), simd::u8x16::splat((amt & 0x7) as u8)).v128() } +} + +/// Adds two 128-bit vectors as if they were two packed sixteen 8-bit integers. 
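+///
+/// A minimal usage sketch (illustrative only, not part of the original patch;
+/// it uses only functions defined in this module):
+///
+/// ```ignore
+/// // Plain `add` wraps on overflow; use `i8x16_add_sat` to saturate instead.
+/// let wrapped = i8x16_add(i8x16_splat(i8::MAX), i8x16_splat(1));
+/// assert_eq!(i8x16_extract_lane::<0>(wrapped), i8::MIN);
+/// ```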
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.add))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.add"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_add(a: v128, b: v128) -> v128 {
+    unsafe { simd_add(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_add as u8x16_add;
+
+/// Adds two 128-bit vectors as if they were two packed sixteen 8-bit signed
+/// integers, saturating on overflow to `i8::MAX`.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.add_sat_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.add_sat_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_add_sat(a: v128, b: v128) -> v128 {
+    unsafe { simd_saturating_add(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+/// Adds two 128-bit vectors as if they were two packed sixteen 8-bit unsigned
+/// integers, saturating on overflow to `u8::MAX`.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.add_sat_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.add_sat_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_add_sat(a: v128, b: v128) -> v128 {
+    unsafe { simd_saturating_add(a.as_u8x16(), b.as_u8x16()).v128() }
+}
+
+/// Subtracts two 128-bit vectors as if they were two packed sixteen 8-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.sub))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.sub"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_sub(a: v128, b: v128) -> v128 {
+    unsafe { simd_sub(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_sub as u8x16_sub;
+
+/// Subtracts two 128-bit vectors as if they were two packed sixteen 8-bit
+/// signed integers, saturating on overflow to `i8::MIN`.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.sub_sat_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.sub_sat_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_sub_sat(a: v128, b: v128) -> v128 {
+    unsafe { simd_saturating_sub(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+/// Subtracts two 128-bit vectors as if they were two packed sixteen 8-bit
+/// unsigned integers, saturating on overflow to 0.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.sub_sat_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.sub_sat_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_sub_sat(a: v128, b: v128) -> v128 {
+    unsafe { simd_saturating_sub(a.as_u8x16(), b.as_u8x16()).v128() }
+}
+
+/// Compares lane-wise signed integers, and returns the minimum of
+/// each pair.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.min_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.min_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_min(a: v128, b: v128) -> v128 {
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    unsafe { simd_select::<simd::m8x16, simd::i8x16>(simd_lt(a, b), a, b).v128() }
+}
+
+/// Compares lane-wise unsigned integers, and returns the minimum of
+/// each pair.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.min_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.min_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_min(a: v128, b: v128) -> v128 {
+    let a = a.as_u8x16();
+    let b = b.as_u8x16();
+    unsafe { simd_select::<simd::m8x16, simd::u8x16>(simd_lt(a, b), a, b).v128() }
+}
+
+/// Compares lane-wise signed integers, and returns the maximum of
+/// each pair.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.max_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.max_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_max(a: v128, b: v128) -> v128 {
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    unsafe { simd_select::<simd::m8x16, simd::i8x16>(simd_gt(a, b), a, b).v128() }
+}
+
+/// Compares lane-wise unsigned integers, and returns the maximum of
+/// each pair.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.max_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.max_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_max(a: v128, b: v128) -> v128 {
+    let a = a.as_u8x16();
+    let b = b.as_u8x16();
+    unsafe { simd_select::<simd::m8x16, simd::u8x16>(simd_gt(a, b), a, b).v128() }
+}
+
+/// Lane-wise rounding average.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.avgr_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.avgr_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_avgr(a: v128, b: v128) -> v128 {
+    unsafe { llvm_avgr_u_i8x16(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+/// Integer extended pairwise addition producing extended results
+/// (twice wider results than the inputs).
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extadd_pairwise_i8x16_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extadd_pairwise_i8x16_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extadd_pairwise_i8x16(a: v128) -> v128 {
+    unsafe { llvm_i16x8_extadd_pairwise_i8x16_s(a.as_i8x16()).v128() }
+}
+
+/// Integer extended pairwise addition producing extended results
+/// (twice wider results than the inputs).
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extadd_pairwise_i8x16_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extadd_pairwise_i8x16_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extadd_pairwise_u8x16(a: v128) -> v128 {
+    unsafe { llvm_i16x8_extadd_pairwise_i8x16_u(a.as_i8x16()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_extadd_pairwise_u8x16 as u16x8_extadd_pairwise_u8x16;
+
+/// Lane-wise wrapping absolute value.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.abs))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.abs"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_abs(a: v128) -> v128 {
+    let a = a.as_i16x8();
+    let zero = simd::i16x8::ZERO;
+    unsafe {
+        simd_select::<simd::m16x8, simd::i16x8>(simd_lt(a, zero), simd_sub(zero, a), a).v128()
+    }
+}
+
+/// Negates a 128-bit vector interpreted as eight 16-bit signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.neg))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.neg"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_neg(a: v128) -> v128 {
+    unsafe { simd_mul(a.as_i16x8(), simd::i16x8::splat(-1)).v128() }
+}
+
+/// Lane-wise saturating rounding multiplication in Q15 format.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.q15mulr_sat_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.q15mulr_sat_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_q15mulr_sat(a: v128, b: v128) -> v128 {
+    unsafe { llvm_q15mulr(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Returns true if all lanes are non-zero, false otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.all_true))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.all_true"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_all_true(a: v128) -> bool {
+    unsafe { llvm_i16x8_all_true(a.as_i16x8()) != 0 }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_all_true as u16x8_all_true;
+
+/// Extracts the high bit for each lane in `a` and produces a scalar mask with
+/// all bits concatenated.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.bitmask))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.bitmask"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_bitmask(a: v128) -> u8 {
+    unsafe { llvm_bitmask_i16x8(a.as_i16x8()) as u8 }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_bitmask as u16x8_bitmask;
+
+/// Converts two input vectors into a smaller lane vector by narrowing each
+/// lane.
+///
+/// Signed saturation to 0x7fff or 0x8000 is used and the input lanes are always
+/// interpreted as signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.narrow_i32x4_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.narrow_i32x4_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_narrow_i32x4(a: v128, b: v128) -> v128 {
+    unsafe { llvm_narrow_i16x8_s(a.as_i32x4(), b.as_i32x4()).v128() }
+}
+
+/// Converts two input vectors into a smaller lane vector by narrowing each
+/// lane.
+///
+/// Signed saturation to 0x0000 or 0xffff is used and the input lanes are always
+/// interpreted as signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.narrow_i32x4_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.narrow_i32x4_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_narrow_i32x4(a: v128, b: v128) -> v128 {
+    unsafe { llvm_narrow_i16x8_u(a.as_i32x4(), b.as_i32x4()).v128() }
+}
+
+/// Converts low half of the smaller lane vector to a larger lane
+/// vector, sign extended.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extend_low_i8x16_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extend_low_i8x16_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extend_low_i8x16(a: v128) -> v128 {
+    unsafe {
+        simd_cast::<simd::i8x8, simd::i16x8>(simd_shuffle!(
+            a.as_i8x16(),
+            a.as_i8x16(),
+            [0, 1, 2, 3, 4, 5, 6, 7],
+        ))
+        .v128()
+    }
+}
+
+/// Converts high half of the smaller lane vector to a larger lane
+/// vector, sign extended.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extend_high_i8x16_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extend_high_i8x16_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extend_high_i8x16(a: v128) -> v128 {
+    unsafe {
+        simd_cast::<simd::i8x8, simd::i16x8>(simd_shuffle!(
+            a.as_i8x16(),
+            a.as_i8x16(),
+            [8, 9, 10, 11, 12, 13, 14, 15],
+        ))
+        .v128()
+    }
+}
+
+/// Converts low half of the smaller lane vector to a larger lane
+/// vector, zero extended.
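+///
+/// A minimal usage sketch (illustrative only, not part of the original patch;
+/// it uses only functions defined in this module):
+///
+/// ```ignore
+/// // Zero extension keeps large byte values positive; the sign-extending
+/// // `i16x8_extend_low_i8x16` would turn 200 into -56 instead.
+/// let wide = i16x8_extend_low_u8x16(u8x16_splat(200));
+/// assert_eq!(u16x8_extract_lane::<0>(wide), 200);
+/// ```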
+#[inline] +#[cfg_attr(test, assert_instr(i16x8.extend_low_i8x16_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.extend_low_i8x16_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_extend_low_u8x16(a: v128) -> v128 { + unsafe { + simd_cast::(simd_shuffle!( + a.as_u8x16(), + a.as_u8x16(), + [0, 1, 2, 3, 4, 5, 6, 7], + )) + .v128() + } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i16x8_extend_low_u8x16 as u16x8_extend_low_u8x16; + +/// Converts high half of the smaller lane vector to a larger lane +/// vector, zero extended. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.extend_high_i8x16_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.extend_high_i8x16_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_extend_high_u8x16(a: v128) -> v128 { + unsafe { + simd_cast::(simd_shuffle!( + a.as_u8x16(), + a.as_u8x16(), + [8, 9, 10, 11, 12, 13, 14, 15], + )) + .v128() + } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i16x8_extend_high_u8x16 as u16x8_extend_high_u8x16; + +/// Shifts each lane to the left by the specified number of bits. +/// +/// Only the low bits of the shift amount are used if the shift amount is +/// greater than the lane width. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.shl))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.shl"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_shl(a: v128, amt: u32) -> v128 { + // SAFETY: see i8x16_shl for more documentation why this is unsafe, + // essentially the shift amount must be valid hence the mask. + unsafe { simd_shl(a.as_i16x8(), simd::i16x8::splat((amt & 0xf) as i16)).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i16x8_shl as u16x8_shl; + +/// Shifts each lane to the right by the specified number of bits, sign +/// extending. +/// +/// Only the low bits of the shift amount are used if the shift amount is +/// greater than the lane width. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.shr_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.shr_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_shr(a: v128, amt: u32) -> v128 { + // SAFETY: see i8x16_shl for more documentation why this is unsafe, + // essentially the shift amount must be valid hence the mask. + unsafe { simd_shr(a.as_i16x8(), simd::i16x8::splat((amt & 0xf) as i16)).v128() } +} + +/// Shifts each lane to the right by the specified number of bits, shifting in +/// zeros. +/// +/// Only the low bits of the shift amount are used if the shift amount is +/// greater than the lane width. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.shr_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.shr_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u16x8_shr(a: v128, amt: u32) -> v128 { + // SAFETY: see i8x16_shl for more documentation why this is unsafe, + // essentially the shift amount must be valid hence the mask. + unsafe { simd_shr(a.as_u16x8(), simd::u16x8::splat((amt & 0xf) as u16)).v128() } +} + +/// Adds two 128-bit vectors as if they were two packed eight 16-bit integers. 
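As the SAFETY comments explain, the shift wrappers mask the amount to the lane width (here `amt & 0xf`) so the portable `simd_shl`/`simd_shr` intrinsics never see an out-of-range count; WebAssembly itself defines shift counts modulo the lane width. A scalar illustration (not from the diff):

fn i16_shl_wasm(lane: i16, amt: u32) -> i16 {
    // Same rule as i16x8.shl: the count is taken modulo 16.
    lane << (amt & 0xf)
}

fn main() {
    assert_eq!(i16_shl_wasm(1, 17), 2); // 17 & 0xf == 1
    assert_eq!(i16_shl_wasm(1, 16), 1); // 16 & 0xf == 0
}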
+#[inline] +#[cfg_attr(test, assert_instr(i16x8.add))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.add"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_add(a: v128, b: v128) -> v128 { + unsafe { simd_add(a.as_i16x8(), b.as_i16x8()).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i16x8_add as u16x8_add; + +/// Adds two 128-bit vectors as if they were two packed eight 16-bit signed +/// integers, saturating on overflow to `i16::MAX`. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.add_sat_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.add_sat_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_add_sat(a: v128, b: v128) -> v128 { + unsafe { simd_saturating_add(a.as_i16x8(), b.as_i16x8()).v128() } +} + +/// Adds two 128-bit vectors as if they were two packed eight 16-bit unsigned +/// integers, saturating on overflow to `u16::MAX`. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.add_sat_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.add_sat_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u16x8_add_sat(a: v128, b: v128) -> v128 { + unsafe { simd_saturating_add(a.as_u16x8(), b.as_u16x8()).v128() } +} + +/// Subtracts two 128-bit vectors as if they were two packed eight 16-bit integers. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.sub))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.sub"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_sub(a: v128, b: v128) -> v128 { + unsafe { simd_sub(a.as_i16x8(), b.as_i16x8()).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i16x8_sub as u16x8_sub; + +/// Subtracts two 128-bit vectors as if they were two packed eight 16-bit +/// signed integers, saturating on overflow to `i16::MIN`. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.sub_sat_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.sub_sat_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_sub_sat(a: v128, b: v128) -> v128 { + unsafe { simd_saturating_sub(a.as_i16x8(), b.as_i16x8()).v128() } +} + +/// Subtracts two 128-bit vectors as if they were two packed eight 16-bit +/// unsigned integers, saturating on overflow to 0. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.sub_sat_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.sub_sat_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u16x8_sub_sat(a: v128, b: v128) -> v128 { + unsafe { simd_saturating_sub(a.as_u16x8(), b.as_u16x8()).v128() } +} + +/// Multiplies two 128-bit vectors as if they were two packed eight 16-bit +/// signed integers. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.mul))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.mul"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_mul(a: v128, b: v128) -> v128 { + unsafe { simd_mul(a.as_i16x8(), b.as_i16x8()).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i16x8_mul as u16x8_mul; + +/// Compares lane-wise signed integers, and returns the minimum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.min_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.min_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_min(a: v128, b: v128) -> v128 { + let a = a.as_i16x8(); + let b = b.as_i16x8(); + unsafe { simd_select::(simd_lt(a, b), a, b).v128() } +} + +/// Compares lane-wise unsigned integers, and returns the minimum of +/// each pair. 
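The `_sat` forms above behave exactly like Rust's scalar saturating arithmetic applied per lane, which is what the `simd_saturating_add`/`simd_saturating_sub` intrinsics provide. For instance:

fn main() {
    // i16x8.add_sat_s / i16x8.sub_sat_s clamp at the signed bounds ...
    assert_eq!(i16::MAX.saturating_add(1), i16::MAX);
    assert_eq!(i16::MIN.saturating_sub(1), i16::MIN);
    // ... and the unsigned forms clamp at u16::MAX and 0.
    assert_eq!(u16::MAX.saturating_add(1), u16::MAX);
    assert_eq!(0u16.saturating_sub(1), 0);
}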
+#[inline] +#[cfg_attr(test, assert_instr(i16x8.min_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.min_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u16x8_min(a: v128, b: v128) -> v128 { + let a = a.as_u16x8(); + let b = b.as_u16x8(); + unsafe { simd_select::(simd_lt(a, b), a, b).v128() } +} + +/// Compares lane-wise signed integers, and returns the maximum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.max_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.max_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_max(a: v128, b: v128) -> v128 { + let a = a.as_i16x8(); + let b = b.as_i16x8(); + unsafe { simd_select::(simd_gt(a, b), a, b).v128() } +} + +/// Compares lane-wise unsigned integers, and returns the maximum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.max_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.max_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u16x8_max(a: v128, b: v128) -> v128 { + let a = a.as_u16x8(); + let b = b.as_u16x8(); + unsafe { simd_select::(simd_gt(a, b), a, b).v128() } +} + +/// Lane-wise rounding average. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.avgr_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.avgr_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u16x8_avgr(a: v128, b: v128) -> v128 { + unsafe { llvm_avgr_u_i16x8(a.as_i16x8(), b.as_i16x8()).v128() } +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i16x8_mul(i16x8_extend_low_i8x16(a), i16x8_extend_low_i8x16(b))` +#[inline] +#[cfg_attr(test, assert_instr(i16x8.extmul_low_i8x16_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.extmul_low_i8x16_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_extmul_low_i8x16(a: v128, b: v128) -> v128 { + unsafe { + let lhs = simd_cast::(simd_shuffle!( + a.as_i8x16(), + a.as_i8x16(), + [0, 1, 2, 3, 4, 5, 6, 7], + )); + let rhs = simd_cast::(simd_shuffle!( + b.as_i8x16(), + b.as_i8x16(), + [0, 1, 2, 3, 4, 5, 6, 7], + )); + simd_mul(lhs, rhs).v128() + } +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i16x8_mul(i16x8_extend_high_i8x16(a), i16x8_extend_high_i8x16(b))` +#[inline] +#[cfg_attr(test, assert_instr(i16x8.extmul_high_i8x16_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.extmul_high_i8x16_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_extmul_high_i8x16(a: v128, b: v128) -> v128 { + unsafe { + let lhs = simd_cast::(simd_shuffle!( + a.as_i8x16(), + a.as_i8x16(), + [8, 9, 10, 11, 12, 13, 14, 15], + )); + let rhs = simd_cast::(simd_shuffle!( + b.as_i8x16(), + b.as_i8x16(), + [8, 9, 10, 11, 12, 13, 14, 15], + )); + simd_mul(lhs, rhs).v128() + } +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. 
+/// +/// Equivalent of `i16x8_mul(i16x8_extend_low_u8x16(a), i16x8_extend_low_u8x16(b))` +#[inline] +#[cfg_attr(test, assert_instr(i16x8.extmul_low_i8x16_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.extmul_low_i8x16_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_extmul_low_u8x16(a: v128, b: v128) -> v128 { + unsafe { + let lhs = simd_cast::(simd_shuffle!( + a.as_u8x16(), + a.as_u8x16(), + [0, 1, 2, 3, 4, 5, 6, 7], + )); + let rhs = simd_cast::(simd_shuffle!( + b.as_u8x16(), + b.as_u8x16(), + [0, 1, 2, 3, 4, 5, 6, 7], + )); + simd_mul(lhs, rhs).v128() + } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i16x8_extmul_low_u8x16 as u16x8_extmul_low_u8x16; + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i16x8_mul(i16x8_extend_high_u8x16(a), i16x8_extend_high_u8x16(b))` +#[inline] +#[cfg_attr(test, assert_instr(i16x8.extmul_high_i8x16_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i16x8.extmul_high_i8x16_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i16x8_extmul_high_u8x16(a: v128, b: v128) -> v128 { + unsafe { + let lhs = simd_cast::(simd_shuffle!( + a.as_u8x16(), + a.as_u8x16(), + [8, 9, 10, 11, 12, 13, 14, 15], + )); + let rhs = simd_cast::(simd_shuffle!( + b.as_u8x16(), + b.as_u8x16(), + [8, 9, 10, 11, 12, 13, 14, 15], + )); + simd_mul(lhs, rhs).v128() + } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i16x8_extmul_high_u8x16 as u16x8_extmul_high_u8x16; + +/// Integer extended pairwise addition producing extended results +/// (twice wider results than the inputs). +#[inline] +#[cfg_attr(test, assert_instr(i32x4.extadd_pairwise_i16x8_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.extadd_pairwise_i16x8_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_extadd_pairwise_i16x8(a: v128) -> v128 { + unsafe { llvm_i32x4_extadd_pairwise_i16x8_s(a.as_i16x8()).v128() } +} + +/// Integer extended pairwise addition producing extended results +/// (twice wider results than the inputs). +#[inline] +#[cfg_attr(test, assert_instr(i32x4.extadd_pairwise_i16x8_u))] +#[doc(alias("i32x4.extadd_pairwise_i16x8_u"))] +#[target_feature(enable = "simd128")] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_extadd_pairwise_u16x8(a: v128) -> v128 { + unsafe { llvm_i32x4_extadd_pairwise_i16x8_u(a.as_i16x8()).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i32x4_extadd_pairwise_u16x8 as u32x4_extadd_pairwise_u16x8; + +/// Lane-wise wrapping absolute value. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.abs))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.abs"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_abs(a: v128) -> v128 { + let a = a.as_i32x4(); + let zero = simd::i32x4::ZERO; + unsafe { + simd_select::(simd_lt(a, zero), simd_sub(zero, a), a).v128() + } +} + +/// Negates a 128-bit vectors interpreted as four 32-bit signed integers +#[inline] +#[cfg_attr(test, assert_instr(i32x4.neg))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.neg"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_neg(a: v128) -> v128 { + unsafe { simd_mul(a.as_i32x4(), simd::i32x4::splat(-1)).v128() } +} + +/// Returns true if all lanes are non-zero, false otherwise. 
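Extended pairwise addition widens each lane before summing adjacent pairs, so the sums cannot wrap. A reference model over a plain array, to make the lane pairing of `i32x4.extadd_pairwise_i16x8_s` explicit (a sketch only; the wrapper above goes through an LLVM intrinsic):

fn extadd_pairwise_i16x8_s(lanes: [i16; 8]) -> [i32; 4] {
    let mut out = [0i32; 4];
    for i in 0..4 {
        // Each output lane is the widened sum of two adjacent input lanes.
        out[i] = i32::from(lanes[2 * i]) + i32::from(lanes[2 * i + 1]);
    }
    out
}

fn main() {
    let v = [i16::MAX, i16::MAX, -1, 1, 0, 0, i16::MIN, i16::MIN];
    assert_eq!(extadd_pairwise_i16x8_s(v), [65534, 0, 0, -65536]);
}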
+#[inline] +#[cfg_attr(test, assert_instr(i32x4.all_true))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.all_true"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_all_true(a: v128) -> bool { + unsafe { llvm_i32x4_all_true(a.as_i32x4()) != 0 } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i32x4_all_true as u32x4_all_true; + +/// Extracts the high bit for each lane in `a` and produce a scalar mask with +/// all bits concatenated. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.bitmask))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.bitmask"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_bitmask(a: v128) -> u8 { + unsafe { llvm_bitmask_i32x4(a.as_i32x4()) as u8 } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i32x4_bitmask as u32x4_bitmask; + +/// Converts low half of the smaller lane vector to a larger lane +/// vector, sign extended. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.extend_low_i16x8_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.extend_low_i16x8_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_extend_low_i16x8(a: v128) -> v128 { + unsafe { + simd_cast::(simd_shuffle!( + a.as_i16x8(), + a.as_i16x8(), + [0, 1, 2, 3] + )) + .v128() + } +} + +/// Converts high half of the smaller lane vector to a larger lane +/// vector, sign extended. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.extend_high_i16x8_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.extend_high_i16x8_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_extend_high_i16x8(a: v128) -> v128 { + unsafe { + simd_cast::(simd_shuffle!( + a.as_i16x8(), + a.as_i16x8(), + [4, 5, 6, 7] + )) + .v128() + } +} + +/// Converts low half of the smaller lane vector to a larger lane +/// vector, zero extended. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.extend_low_i16x8_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.extend_low_i16x8_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_extend_low_u16x8(a: v128) -> v128 { + unsafe { + simd_cast::(simd_shuffle!( + a.as_u16x8(), + a.as_u16x8(), + [0, 1, 2, 3] + )) + .v128() + } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i32x4_extend_low_u16x8 as u32x4_extend_low_u16x8; + +/// Converts high half of the smaller lane vector to a larger lane +/// vector, zero extended. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.extend_high_i16x8_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.extend_high_i16x8_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_extend_high_u16x8(a: v128) -> v128 { + unsafe { + simd_cast::(simd_shuffle!( + a.as_u16x8(), + a.as_u16x8(), + [4, 5, 6, 7] + )) + .v128() + } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i32x4_extend_high_u16x8 as u32x4_extend_high_u16x8; + +/// Shifts each lane to the left by the specified number of bits. +/// +/// Only the low bits of the shift amount are used if the shift amount is +/// greater than the lane width. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.shl))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.shl"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_shl(a: v128, amt: u32) -> v128 { + // SAFETY: see i8x16_shl for more documentation why this is unsafe, + // essentially the shift amount must be valid hence the mask. 
+ unsafe { simd_shl(a.as_i32x4(), simd::i32x4::splat((amt & 0x1f) as i32)).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i32x4_shl as u32x4_shl; + +/// Shifts each lane to the right by the specified number of bits, sign +/// extending. +/// +/// Only the low bits of the shift amount are used if the shift amount is +/// greater than the lane width. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.shr_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.shr_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_shr(a: v128, amt: u32) -> v128 { + // SAFETY: see i8x16_shl for more documentation why this is unsafe, + // essentially the shift amount must be valid hence the mask. + unsafe { simd_shr(a.as_i32x4(), simd::i32x4::splat((amt & 0x1f) as i32)).v128() } +} + +/// Shifts each lane to the right by the specified number of bits, shifting in +/// zeros. +/// +/// Only the low bits of the shift amount are used if the shift amount is +/// greater than the lane width. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.shr_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.shr_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u32x4_shr(a: v128, amt: u32) -> v128 { + // SAFETY: see i8x16_shl for more documentation why this is unsafe, + // essentially the shift amount must be valid hence the mask. + unsafe { simd_shr(a.as_u32x4(), simd::u32x4::splat(amt & 0x1f)).v128() } +} + +/// Adds two 128-bit vectors as if they were two packed four 32-bit integers. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.add))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.add"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_add(a: v128, b: v128) -> v128 { + unsafe { simd_add(a.as_i32x4(), b.as_i32x4()).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i32x4_add as u32x4_add; + +/// Subtracts two 128-bit vectors as if they were two packed four 32-bit integers. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.sub))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.sub"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_sub(a: v128, b: v128) -> v128 { + unsafe { simd_sub(a.as_i32x4(), b.as_i32x4()).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i32x4_sub as u32x4_sub; + +/// Multiplies two 128-bit vectors as if they were two packed four 32-bit +/// signed integers. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.mul))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.mul"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_mul(a: v128, b: v128) -> v128 { + unsafe { simd_mul(a.as_i32x4(), b.as_i32x4()).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i32x4_mul as u32x4_mul; + +/// Compares lane-wise signed integers, and returns the minimum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.min_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.min_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_min(a: v128, b: v128) -> v128 { + let a = a.as_i32x4(); + let b = b.as_i32x4(); + unsafe { simd_select::(simd_lt(a, b), a, b).v128() } +} + +/// Compares lane-wise unsigned integers, and returns the minimum of +/// each pair. 
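Unlike the 8- and 16-bit lanes, the 32-bit (and 64-bit) integer lanes have no saturating add/sub in the SIMD proposal; `i32x4.add`, `i32x4.sub` and `i32x4.mul` simply wrap, matching Rust's wrapping scalar operations per lane:

fn main() {
    // Per-lane overflow model for i32x4.add and i32x4.mul: two's complement
    // wrap-around, never saturation.
    assert_eq!(i32::MAX.wrapping_add(1), i32::MIN);
    assert_eq!(i32::MIN.wrapping_mul(2), 0);
}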
+#[inline] +#[cfg_attr(test, assert_instr(i32x4.min_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.min_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u32x4_min(a: v128, b: v128) -> v128 { + let a = a.as_u32x4(); + let b = b.as_u32x4(); + unsafe { simd_select::(simd_lt(a, b), a, b).v128() } +} + +/// Compares lane-wise signed integers, and returns the maximum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.max_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.max_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_max(a: v128, b: v128) -> v128 { + let a = a.as_i32x4(); + let b = b.as_i32x4(); + unsafe { simd_select::(simd_gt(a, b), a, b).v128() } +} + +/// Compares lane-wise unsigned integers, and returns the maximum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.max_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.max_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u32x4_max(a: v128, b: v128) -> v128 { + let a = a.as_u32x4(); + let b = b.as_u32x4(); + unsafe { simd_select::(simd_gt(a, b), a, b).v128() } +} + +/// Lane-wise multiply signed 16-bit integers in the two input vectors and add +/// adjacent pairs of the full 32-bit results. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.dot_i16x8_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.dot_i16x8_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_dot_i16x8(a: v128, b: v128) -> v128 { + unsafe { llvm_i32x4_dot_i16x8_s(a.as_i16x8(), b.as_i16x8()).v128() } +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i32x4_mul(i32x4_extend_low_i16x8_s(a), i32x4_extend_low_i16x8_s(b))` +#[inline] +#[cfg_attr(test, assert_instr(i32x4.extmul_low_i16x8_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.extmul_low_i16x8_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_extmul_low_i16x8(a: v128, b: v128) -> v128 { + unsafe { + let lhs = simd_cast::(simd_shuffle!( + a.as_i16x8(), + a.as_i16x8(), + [0, 1, 2, 3] + )); + let rhs = simd_cast::(simd_shuffle!( + b.as_i16x8(), + b.as_i16x8(), + [0, 1, 2, 3] + )); + simd_mul(lhs, rhs).v128() + } +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i32x4_mul(i32x4_extend_high_i16x8_s(a), i32x4_extend_high_i16x8_s(b))` +#[inline] +#[cfg_attr(test, assert_instr(i32x4.extmul_high_i16x8_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.extmul_high_i16x8_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_extmul_high_i16x8(a: v128, b: v128) -> v128 { + unsafe { + let lhs = simd_cast::(simd_shuffle!( + a.as_i16x8(), + a.as_i16x8(), + [4, 5, 6, 7] + )); + let rhs = simd_cast::(simd_shuffle!( + b.as_i16x8(), + b.as_i16x8(), + [4, 5, 6, 7] + )); + simd_mul(lhs, rhs).v128() + } +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. 
+/// +/// Equivalent of `i32x4_mul(i32x4_extend_low_u16x8(a), i32x4_extend_low_u16x8(b))` +#[inline] +#[cfg_attr(test, assert_instr(i32x4.extmul_low_i16x8_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.extmul_low_i16x8_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_extmul_low_u16x8(a: v128, b: v128) -> v128 { + unsafe { + let lhs = simd_cast::(simd_shuffle!( + a.as_u16x8(), + a.as_u16x8(), + [0, 1, 2, 3] + )); + let rhs = simd_cast::(simd_shuffle!( + b.as_u16x8(), + b.as_u16x8(), + [0, 1, 2, 3] + )); + simd_mul(lhs, rhs).v128() + } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i32x4_extmul_low_u16x8 as u32x4_extmul_low_u16x8; + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i32x4_mul(i32x4_extend_high_u16x8(a), i32x4_extend_high_u16x8(b))` +#[inline] +#[cfg_attr(test, assert_instr(i32x4.extmul_high_i16x8_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.extmul_high_i16x8_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_extmul_high_u16x8(a: v128, b: v128) -> v128 { + unsafe { + let lhs = simd_cast::(simd_shuffle!( + a.as_u16x8(), + a.as_u16x8(), + [4, 5, 6, 7] + )); + let rhs = simd_cast::(simd_shuffle!( + b.as_u16x8(), + b.as_u16x8(), + [4, 5, 6, 7] + )); + simd_mul(lhs, rhs).v128() + } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i32x4_extmul_high_u16x8 as u32x4_extmul_high_u16x8; + +/// Lane-wise wrapping absolute value. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.abs))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.abs"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_abs(a: v128) -> v128 { + let a = a.as_i64x2(); + let zero = simd::i64x2::ZERO; + unsafe { + simd_select::(simd_lt(a, zero), simd_sub(zero, a), a).v128() + } +} + +/// Negates a 128-bit vectors interpreted as two 64-bit signed integers +#[inline] +#[cfg_attr(test, assert_instr(i64x2.neg))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.neg"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_neg(a: v128) -> v128 { + unsafe { simd_mul(a.as_i64x2(), simd::i64x2::splat(-1)).v128() } +} + +/// Returns true if all lanes are non-zero, false otherwise. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.all_true))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.all_true"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_all_true(a: v128) -> bool { + unsafe { llvm_i64x2_all_true(a.as_i64x2()) != 0 } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i64x2_all_true as u64x2_all_true; + +/// Extracts the high bit for each lane in `a` and produce a scalar mask with +/// all bits concatenated. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.bitmask))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.bitmask"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_bitmask(a: v128) -> u8 { + unsafe { llvm_bitmask_i64x2(a.as_i64x2()) as u8 } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i64x2_bitmask as u64x2_bitmask; + +/// Converts low half of the smaller lane vector to a larger lane +/// vector, sign extended. 
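All the `extmul` helpers above follow the same shape: shuffle out one half of the lanes, widen them, then do an ordinary `simd_mul`; because both factors are widened first, the product always fits in the doubled lane width. One unsigned lane as a scalar sketch (illustrative only):

fn extmul_lane_u16(a: u16, b: u16) -> u32 {
    // i32x4.extmul_*_i16x8_u: zero-extend both factors, then multiply.
    u32::from(a) * u32::from(b)
}

fn main() {
    // Even the worst case fits without wrapping.
    assert_eq!(extmul_lane_u16(u16::MAX, u16::MAX), 0xFFFE_0001);
}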
+#[inline] +#[cfg_attr(test, assert_instr(i64x2.extend_low_i32x4_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.extend_low_i32x4_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_extend_low_i32x4(a: v128) -> v128 { + unsafe { + simd_cast::(simd_shuffle!(a.as_i32x4(), a.as_i32x4(), [0, 1])) + .v128() + } +} + +/// Converts high half of the smaller lane vector to a larger lane +/// vector, sign extended. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.extend_high_i32x4_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.extend_high_i32x4_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_extend_high_i32x4(a: v128) -> v128 { + unsafe { + simd_cast::(simd_shuffle!(a.as_i32x4(), a.as_i32x4(), [2, 3])) + .v128() + } +} + +/// Converts low half of the smaller lane vector to a larger lane +/// vector, zero extended. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.extend_low_i32x4_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.extend_low_i32x4_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_extend_low_u32x4(a: v128) -> v128 { + unsafe { + simd_cast::(simd_shuffle!(a.as_u32x4(), a.as_u32x4(), [0, 1])) + .v128() + } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i64x2_extend_low_u32x4 as u64x2_extend_low_u32x4; + +/// Converts high half of the smaller lane vector to a larger lane +/// vector, zero extended. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.extend_high_i32x4_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.extend_high_i32x4_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_extend_high_u32x4(a: v128) -> v128 { + unsafe { + simd_cast::(simd_shuffle!(a.as_u32x4(), a.as_u32x4(), [2, 3])) + .v128() + } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i64x2_extend_high_u32x4 as u64x2_extend_high_u32x4; + +/// Shifts each lane to the left by the specified number of bits. +/// +/// Only the low bits of the shift amount are used if the shift amount is +/// greater than the lane width. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.shl))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.shl"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_shl(a: v128, amt: u32) -> v128 { + // SAFETY: see i8x16_shl for more documentation why this is unsafe, + // essentially the shift amount must be valid hence the mask. + unsafe { simd_shl(a.as_i64x2(), simd::i64x2::splat((amt & 0x3f) as i64)).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i64x2_shl as u64x2_shl; + +/// Shifts each lane to the right by the specified number of bits, sign +/// extending. +/// +/// Only the low bits of the shift amount are used if the shift amount is +/// greater than the lane width. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.shr_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.shr_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_shr(a: v128, amt: u32) -> v128 { + // SAFETY: see i8x16_shl for more documentation why this is unsafe, + // essentially the shift amount must be valid hence the mask. + unsafe { simd_shr(a.as_i64x2(), simd::i64x2::splat((amt & 0x3f) as i64)).v128() } +} + +/// Shifts each lane to the right by the specified number of bits, shifting in +/// zeros. +/// +/// Only the low bits of the shift amount are used if the shift amount is +/// greater than the lane width. 
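The `_s`/`_u` extension pairs above select the same 32-bit lanes and differ only in sign- versus zero-extension, which only matters when a lane is negative:

fn main() {
    let lane: i32 = -1;
    // i64x2.extend_low_i32x4_s sign-extends the lane ...
    assert_eq!(i64::from(lane), -1);
    // ... while i64x2.extend_low_i32x4_u zero-extends the same bit pattern.
    assert_eq!(u64::from(lane as u32), 0xFFFF_FFFF);
}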
+#[inline] +#[cfg_attr(test, assert_instr(i64x2.shr_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.shr_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u64x2_shr(a: v128, amt: u32) -> v128 { + // SAFETY: see i8x16_shl for more documentation why this is unsafe, + // essentially the shift amount must be valid hence the mask. + unsafe { simd_shr(a.as_u64x2(), simd::u64x2::splat((amt & 0x3f) as u64)).v128() } +} + +/// Adds two 128-bit vectors as if they were two packed two 64-bit integers. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.add))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.add"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_add(a: v128, b: v128) -> v128 { + unsafe { simd_add(a.as_i64x2(), b.as_i64x2()).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i64x2_add as u64x2_add; + +/// Subtracts two 128-bit vectors as if they were two packed two 64-bit integers. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.sub))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.sub"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_sub(a: v128, b: v128) -> v128 { + unsafe { simd_sub(a.as_i64x2(), b.as_i64x2()).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i64x2_sub as u64x2_sub; + +/// Multiplies two 128-bit vectors as if they were two packed two 64-bit integers. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.mul))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.mul"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_mul(a: v128, b: v128) -> v128 { + unsafe { simd_mul(a.as_i64x2(), b.as_i64x2()).v128() } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i64x2_mul as u64x2_mul; + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i64x2_mul(i64x2_extend_low_i32x4_s(a), i64x2_extend_low_i32x4_s(b))` +#[inline] +#[cfg_attr(test, assert_instr(i64x2.extmul_low_i32x4_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.extmul_low_i32x4_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_extmul_low_i32x4(a: v128, b: v128) -> v128 { + unsafe { + let lhs = simd_cast::(simd_shuffle!( + a.as_i32x4(), + a.as_i32x4(), + [0, 1] + )); + let rhs = simd_cast::(simd_shuffle!( + b.as_i32x4(), + b.as_i32x4(), + [0, 1] + )); + simd_mul(lhs, rhs).v128() + } +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i64x2_mul(i64x2_extend_high_i32x4_s(a), i64x2_extend_high_i32x4_s(b))` +#[inline] +#[cfg_attr(test, assert_instr(i64x2.extmul_high_i32x4_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.extmul_high_i32x4_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_extmul_high_i32x4(a: v128, b: v128) -> v128 { + unsafe { + let lhs = simd_cast::(simd_shuffle!( + a.as_i32x4(), + a.as_i32x4(), + [2, 3] + )); + let rhs = simd_cast::(simd_shuffle!( + b.as_i32x4(), + b.as_i32x4(), + [2, 3] + )); + simd_mul(lhs, rhs).v128() + } +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. 
+/// +/// Equivalent of `i64x2_mul(i64x2_extend_low_i32x4_u(a), i64x2_extend_low_i32x4_u(b))` +#[inline] +#[cfg_attr(test, assert_instr(i64x2.extmul_low_i32x4_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.extmul_low_i32x4_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_extmul_low_u32x4(a: v128, b: v128) -> v128 { + unsafe { + let lhs = simd_cast::(simd_shuffle!( + a.as_u32x4(), + a.as_u32x4(), + [0, 1] + )); + let rhs = simd_cast::(simd_shuffle!( + b.as_u32x4(), + b.as_u32x4(), + [0, 1] + )); + simd_mul(lhs, rhs).v128() + } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i64x2_extmul_low_u32x4 as u64x2_extmul_low_u32x4; + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i64x2_mul(i64x2_extend_high_i32x4_u(a), i64x2_extend_high_i32x4_u(b))` +#[inline] +#[cfg_attr(test, assert_instr(i64x2.extmul_high_i32x4_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i64x2.extmul_high_i32x4_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i64x2_extmul_high_u32x4(a: v128, b: v128) -> v128 { + unsafe { + let lhs = simd_cast::(simd_shuffle!( + a.as_u32x4(), + a.as_u32x4(), + [2, 3] + )); + let rhs = simd_cast::(simd_shuffle!( + b.as_u32x4(), + b.as_u32x4(), + [2, 3] + )); + simd_mul(lhs, rhs).v128() + } +} + +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub use i64x2_extmul_high_u32x4 as u64x2_extmul_high_u32x4; + +/// Lane-wise rounding to the nearest integral value not smaller than the input. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.ceil))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.ceil"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_ceil(a: v128) -> v128 { + unsafe { simd_ceil(a.as_f32x4()).v128() } +} + +/// Lane-wise rounding to the nearest integral value not greater than the input. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.floor))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.floor"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_floor(a: v128) -> v128 { + unsafe { simd_floor(a.as_f32x4()).v128() } +} + +/// Lane-wise rounding to the nearest integral value with the magnitude not +/// larger than the input. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.trunc))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.trunc"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_trunc(a: v128) -> v128 { + unsafe { simd_trunc(a.as_f32x4()).v128() } +} + +/// Lane-wise rounding to the nearest integral value; if two values are equally +/// near, rounds to the even one. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.nearest))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.nearest"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_nearest(a: v128) -> v128 { + unsafe { llvm_f32x4_nearest(a.as_f32x4()).v128() } +} + +/// Calculates the absolute value of each lane of a 128-bit vector interpreted +/// as four 32-bit floating point numbers. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.abs))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.abs"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_abs(a: v128) -> v128 { + unsafe { simd_fabs(a.as_f32x4()).v128() } +} + +/// Negates each lane of a 128-bit vector interpreted as four 32-bit floating +/// point numbers. 
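`f32x4.nearest` rounds half-way cases to the even integer, unlike Rust's `f32::round`, which rounds ties away from zero. Assuming a toolchain recent enough to have `f32::round_ties_even` (stabilized around Rust 1.77), the difference looks like this:

fn main() {
    assert_eq!(2.5f32.round(), 3.0);           // ties away from zero
    assert_eq!(2.5f32.round_ties_even(), 2.0); // what f32x4.nearest does per lane
    assert_eq!(3.5f32.round_ties_even(), 4.0); // 4 is the even neighbour
}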
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.neg))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.neg"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_neg(a: v128) -> v128 {
+    unsafe { simd_neg(a.as_f32x4()).v128() }
+}
+
+/// Calculates the square root of each lane of a 128-bit vector interpreted as
+/// four 32-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.sqrt))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.sqrt"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_sqrt(a: v128) -> v128 {
+    unsafe { simd_fsqrt(a.as_f32x4()).v128() }
+}
+
+/// Lane-wise addition of two 128-bit vectors interpreted as four 32-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.add))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.add"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_add(a: v128, b: v128) -> v128 {
+    unsafe { simd_add(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Lane-wise subtraction of two 128-bit vectors interpreted as four 32-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.sub))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.sub"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_sub(a: v128, b: v128) -> v128 {
+    unsafe { simd_sub(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Lane-wise multiplication of two 128-bit vectors interpreted as four 32-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.mul))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.mul"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_mul(a: v128, b: v128) -> v128 {
+    unsafe { simd_mul(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Lane-wise division of two 128-bit vectors interpreted as four 32-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.div))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.div"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_div(a: v128, b: v128) -> v128 {
+    unsafe { simd_div(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Calculates the lane-wise minimum of two 128-bit vectors interpreted
+/// as four 32-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.min))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.min"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_min(a: v128, b: v128) -> v128 {
+    unsafe { llvm_f32x4_min(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Calculates the lane-wise maximum of two 128-bit vectors interpreted
+/// as four 32-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.max))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.max"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_max(a: v128, b: v128) -> v128 {
+    unsafe { llvm_f32x4_max(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Lane-wise minimum value, defined as `b < a ? b : a`
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.pmin))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.pmin"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_pmin(a: v128, b: v128) -> v128 {
+    unsafe {
+        simd_select::<simd::m32x4, simd::f32x4>(
+            simd_lt(b.as_f32x4(), a.as_f32x4()),
+            b.as_f32x4(),
+            a.as_f32x4(),
+        )
+        .v128()
+    }
+}
+
+/// Lane-wise maximum value, defined as `a < b ?
b : a` +#[inline] +#[cfg_attr(test, assert_instr(f32x4.pmax))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.pmax"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_pmax(a: v128, b: v128) -> v128 { + unsafe { + simd_select::( + simd_lt(a.as_f32x4(), b.as_f32x4()), + b.as_f32x4(), + a.as_f32x4(), + ) + .v128() + } +} + +/// Lane-wise rounding to the nearest integral value not smaller than the input. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.ceil))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.ceil"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_ceil(a: v128) -> v128 { + unsafe { simd_ceil(a.as_f64x2()).v128() } +} + +/// Lane-wise rounding to the nearest integral value not greater than the input. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.floor))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.floor"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_floor(a: v128) -> v128 { + unsafe { simd_floor(a.as_f64x2()).v128() } +} + +/// Lane-wise rounding to the nearest integral value with the magnitude not +/// larger than the input. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.trunc))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.trunc"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_trunc(a: v128) -> v128 { + unsafe { simd_trunc(a.as_f64x2()).v128() } +} + +/// Lane-wise rounding to the nearest integral value; if two values are equally +/// near, rounds to the even one. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.nearest))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.nearest"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_nearest(a: v128) -> v128 { + unsafe { llvm_f64x2_nearest(a.as_f64x2()).v128() } +} + +/// Calculates the absolute value of each lane of a 128-bit vector interpreted +/// as two 64-bit floating point numbers. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.abs))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.abs"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_abs(a: v128) -> v128 { + unsafe { simd_fabs(a.as_f64x2()).v128() } +} + +/// Negates each lane of a 128-bit vector interpreted as two 64-bit floating +/// point numbers. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.neg))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.neg"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_neg(a: v128) -> v128 { + unsafe { simd_neg(a.as_f64x2()).v128() } +} + +/// Calculates the square root of each lane of a 128-bit vector interpreted as +/// two 64-bit floating point numbers. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.sqrt))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.sqrt"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_sqrt(a: v128) -> v128 { + unsafe { simd_fsqrt(a.as_f64x2()).v128() } +} + +/// Lane-wise add of two 128-bit vectors interpreted as two 64-bit +/// floating point numbers. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.add))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.add"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_add(a: v128, b: v128) -> v128 { + unsafe { simd_add(a.as_f64x2(), b.as_f64x2()).v128() } +} + +/// Lane-wise subtract of two 128-bit vectors interpreted as two 64-bit +/// floating point numbers. 
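The pseudo-min/max operations above are literally the compare-and-select from their doc comments, so which operand holds a NaN decides the result, unlike an IEEE-style NaN-propagating minimum. A scalar model of `pmin` (`b < a ? b : a`), illustrative only:

fn pmin(a: f32, b: f32) -> f32 {
    // Any comparison with NaN is false, so a NaN in `b` leaves `a` selected.
    if b < a { b } else { a }
}

fn main() {
    assert_eq!(pmin(1.0, f32::NAN), 1.0);  // NaN < 1.0 is false -> returns a
    assert!(pmin(f32::NAN, 1.0).is_nan()); // 1.0 < NaN is false -> returns a (NaN)
}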
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.sub))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.sub"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_sub(a: v128, b: v128) -> v128 {
+    unsafe { simd_sub(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Lane-wise multiply of two 128-bit vectors interpreted as two 64-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.mul))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.mul"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_mul(a: v128, b: v128) -> v128 {
+    unsafe { simd_mul(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Lane-wise divide of two 128-bit vectors interpreted as two 64-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.div))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.div"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_div(a: v128, b: v128) -> v128 {
+    unsafe { simd_div(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Calculates the lane-wise minimum of two 128-bit vectors interpreted
+/// as two 64-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.min))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.min"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_min(a: v128, b: v128) -> v128 {
+    unsafe { llvm_f64x2_min(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Calculates the lane-wise maximum of two 128-bit vectors interpreted
+/// as two 64-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.max))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.max"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_max(a: v128, b: v128) -> v128 {
+    unsafe { llvm_f64x2_max(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Lane-wise minimum value, defined as `b < a ? b : a`
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.pmin))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.pmin"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_pmin(a: v128, b: v128) -> v128 {
+    unsafe {
+        simd_select::<simd::m64x2, simd::f64x2>(
+            simd_lt(b.as_f64x2(), a.as_f64x2()),
+            b.as_f64x2(),
+            a.as_f64x2(),
+        )
+        .v128()
+    }
+}
+
+/// Lane-wise maximum value, defined as `a < b ? b : a`
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.pmax))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.pmax"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_pmax(a: v128, b: v128) -> v128 {
+    unsafe {
+        simd_select::<simd::m64x2, simd::f64x2>(
+            simd_lt(a.as_f64x2(), b.as_f64x2()),
+            b.as_f64x2(),
+            a.as_f64x2(),
+        )
+        .v128()
+    }
+}
+
+/// Converts a 128-bit vector interpreted as four 32-bit floating point numbers
+/// into a 128-bit vector of four 32-bit signed integers.
+///
+/// NaN is converted to 0 and if it's out of bounds it becomes the nearest
+/// representable integer.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.trunc_sat_f32x4_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.trunc_sat_f32x4_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_trunc_sat_f32x4(a: v128) -> v128 {
+    unsafe { simd_as::<simd::f32x4, simd::i32x4>(a.as_f32x4()).v128() }
+}
+
+/// Converts a 128-bit vector interpreted as four 32-bit floating point numbers
+/// into a 128-bit vector of four 32-bit unsigned integers.
+///
+/// NaN is converted to 0 and if it's out of bounds it becomes the nearest
+/// representable integer.
+#[inline] +#[cfg_attr(test, assert_instr(i32x4.trunc_sat_f32x4_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.trunc_sat_f32x4_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u32x4_trunc_sat_f32x4(a: v128) -> v128 { + unsafe { simd_as::(a.as_f32x4()).v128() } +} + +/// Converts a 128-bit vector interpreted as four 32-bit signed integers into a +/// 128-bit vector of four 32-bit floating point numbers. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.convert_i32x4_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.convert_i32x4_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_convert_i32x4(a: v128) -> v128 { + unsafe { simd_cast::<_, simd::f32x4>(a.as_i32x4()).v128() } +} + +/// Converts a 128-bit vector interpreted as four 32-bit unsigned integers into a +/// 128-bit vector of four 32-bit floating point numbers. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.convert_i32x4_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.convert_i32x4_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_convert_u32x4(a: v128) -> v128 { + unsafe { simd_cast::<_, simd::f32x4>(a.as_u32x4()).v128() } +} + +/// Saturating conversion of the two double-precision floating point lanes to +/// two lower integer lanes using the IEEE `convertToIntegerTowardZero` +/// function. +/// +/// The two higher lanes of the result are initialized to zero. If any input +/// lane is a NaN, the resulting lane is 0. If the rounded integer value of a +/// lane is outside the range of the destination type, the result is saturated +/// to the nearest representable integer value. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.trunc_sat_f64x2_s_zero))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.trunc_sat_f64x2_s_zero"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn i32x4_trunc_sat_f64x2_zero(a: v128) -> v128 { + let ret: simd::i32x4 = unsafe { + simd_shuffle!( + simd_as::(a.as_f64x2()), + simd::i32x2::ZERO, + [0, 1, 2, 3], + ) + }; + ret.v128() +} + +/// Saturating conversion of the two double-precision floating point lanes to +/// two lower integer lanes using the IEEE `convertToIntegerTowardZero` +/// function. +/// +/// The two higher lanes of the result are initialized to zero. If any input +/// lane is a NaN, the resulting lane is 0. If the rounded integer value of a +/// lane is outside the range of the destination type, the result is saturated +/// to the nearest representable integer value. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.trunc_sat_f64x2_u_zero))] +#[target_feature(enable = "simd128")] +#[doc(alias("i32x4.trunc_sat_f64x2_u_zero"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn u32x4_trunc_sat_f64x2_zero(a: v128) -> v128 { + let ret: simd::u32x4 = unsafe { + simd_shuffle!( + simd_as::(a.as_f64x2()), + simd::u32x2::ZERO, + [0, 1, 2, 3], + ) + }; + ret.v128() +} + +/// Lane-wise conversion from integer to floating point. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.convert_low_i32x4_s))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.convert_low_i32x4_s"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_convert_low_i32x4(a: v128) -> v128 { + unsafe { + simd_cast::(simd_shuffle!(a.as_i32x4(), a.as_i32x4(), [0, 1],)) + .v128() + } +} + +/// Lane-wise conversion from integer to floating point. 
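The saturating float-to-int conversions above lean on the fact that Rust's `as` casts from float to integer already have the required semantics (NaN becomes 0, out-of-range values clamp), which is what `simd_as` applies lane-wise. Scalar equivalents of the rules spelled out in the docs:

fn main() {
    // Same per-lane rules as i32x4.trunc_sat_f32x4_s / _u:
    assert_eq!(f32::NAN as i32, 0);       // NaN -> 0
    assert_eq!(1e10f32 as i32, i32::MAX); // saturate on overflow
    assert_eq!(-1e10f32 as i32, i32::MIN);
    assert_eq!(-1.0f32 as u32, 0);        // unsigned form clamps negatives to 0
}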
+#[inline] +#[cfg_attr(test, assert_instr(f64x2.convert_low_i32x4_u))] +#[target_feature(enable = "simd128")] +#[doc(alias("f64x2.convert_low_i32x4_u"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_convert_low_u32x4(a: v128) -> v128 { + unsafe { + simd_cast::(simd_shuffle!(a.as_u32x4(), a.as_u32x4(), [0, 1],)) + .v128() + } +} + +/// Conversion of the two double-precision floating point lanes to two lower +/// single-precision lanes of the result. The two higher lanes of the result are +/// initialized to zero. If the conversion result is not representable as a +/// single-precision floating point number, it is rounded to the nearest-even +/// representable number. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.demote_f64x2_zero))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.demote_f64x2_zero"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f32x4_demote_f64x2_zero(a: v128) -> v128 { + unsafe { + simd_cast::(simd_shuffle!( + a.as_f64x2(), + simd::f64x2::ZERO, + [0, 1, 2, 3] + )) + .v128() + } +} + +/// Conversion of the two lower single-precision floating point lanes to the two +/// double-precision lanes of the result. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.promote_low_f32x4))] +#[target_feature(enable = "simd128")] +#[doc(alias("f32x4.promote_low_f32x4"))] +#[stable(feature = "wasm_simd", since = "1.54.0")] +pub fn f64x2_promote_low_f32x4(a: v128) -> v128 { + unsafe { + simd_cast::(simd_shuffle!(a.as_f32x4(), a.as_f32x4(), [0, 1])) + .v128() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use core::ops::{Add, Div, Mul, Neg, Sub}; + + use std::fmt::Debug; + use std::mem::transmute; + use std::num::Wrapping; + use std::prelude::v1::*; + + const _C1: v128 = i8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const _C2: v128 = u8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const _C3: v128 = i16x8(0, 1, 2, 3, 4, 5, 6, 7); + const _C4: v128 = u16x8(0, 1, 2, 3, 4, 5, 6, 7); + const _C5: v128 = i32x4(0, 1, 2, 3); + const _C6: v128 = u32x4(0, 1, 2, 3); + const _C7: v128 = i64x2(0, 1); + const _C8: v128 = u64x2(0, 1); + const _C9: v128 = f32x4(0.0, 1.0, 2.0, 3.0); + const _C10: v128 = f64x2(0.0, 1.0); + + fn compare_bytes(a: v128, b: v128) { + let a: [u8; 16] = unsafe { transmute(a) }; + let b: [u8; 16] = unsafe { transmute(b) }; + assert_eq!(a, b); + } + + #[test] + fn test_load() { + unsafe { + let arr: [i32; 4] = [0, 1, 2, 3]; + let vec = v128_load(arr.as_ptr() as *const v128); + compare_bytes(vec, i32x4(0, 1, 2, 3)); + } + } + + #[test] + fn test_load_extend() { + unsafe { + let arr: [i8; 8] = [-3, -2, -1, 0, 1, 2, 3, 4]; + let vec = i16x8_load_extend_i8x8(arr.as_ptr()); + compare_bytes(vec, i16x8(-3, -2, -1, 0, 1, 2, 3, 4)); + let vec = i16x8_load_extend_u8x8(arr.as_ptr() as *const u8); + compare_bytes(vec, i16x8(253, 254, 255, 0, 1, 2, 3, 4)); + + let arr: [i16; 4] = [-1, 0, 1, 2]; + let vec = i32x4_load_extend_i16x4(arr.as_ptr()); + compare_bytes(vec, i32x4(-1, 0, 1, 2)); + let vec = i32x4_load_extend_u16x4(arr.as_ptr() as *const u16); + compare_bytes(vec, i32x4(65535, 0, 1, 2)); + + let arr: [i32; 2] = [-1, 1]; + let vec = i64x2_load_extend_i32x2(arr.as_ptr()); + compare_bytes(vec, i64x2(-1, 1)); + let vec = i64x2_load_extend_u32x2(arr.as_ptr() as *const u32); + compare_bytes(vec, i64x2(u32::max_value().into(), 1)); + } + } + + #[test] + fn test_load_splat() { + unsafe { + compare_bytes(v128_load8_splat(&8), i8x16_splat(8)); + compare_bytes(v128_load16_splat(&9), i16x8_splat(9)); + 
compare_bytes(v128_load32_splat(&10), i32x4_splat(10)); + compare_bytes(v128_load64_splat(&11), i64x2_splat(11)); + } + } + + #[test] + fn test_load_zero() { + unsafe { + compare_bytes(v128_load32_zero(&10), i32x4(10, 0, 0, 0)); + compare_bytes(v128_load64_zero(&11), i64x2(11, 0)); + } + } + + #[test] + fn test_store() { + unsafe { + let mut spot = i8x16_splat(0); + v128_store(&mut spot, i8x16_splat(1)); + compare_bytes(spot, i8x16_splat(1)); + } + } + + #[test] + fn test_load_lane() { + unsafe { + let zero = i8x16_splat(0); + compare_bytes( + v128_load8_lane::<2>(zero, &1), + i8x16_replace_lane::<2>(zero, 1), + ); + + compare_bytes( + v128_load16_lane::<2>(zero, &1), + i16x8_replace_lane::<2>(zero, 1), + ); + + compare_bytes( + v128_load32_lane::<2>(zero, &1), + i32x4_replace_lane::<2>(zero, 1), + ); + + compare_bytes( + v128_load64_lane::<1>(zero, &1), + i64x2_replace_lane::<1>(zero, 1), + ); + } + } + + #[test] + fn test_store_lane() { + unsafe { + let mut spot = 0; + let zero = i8x16_splat(0); + v128_store8_lane::<5>(i8x16_replace_lane::<5>(zero, 7), &mut spot); + assert_eq!(spot, 7); + + let mut spot = 0; + v128_store16_lane::<5>(i16x8_replace_lane::<5>(zero, 7), &mut spot); + assert_eq!(spot, 7); + + let mut spot = 0; + v128_store32_lane::<3>(i32x4_replace_lane::<3>(zero, 7), &mut spot); + assert_eq!(spot, 7); + + let mut spot = 0; + v128_store64_lane::<0>(i64x2_replace_lane::<0>(zero, 7), &mut spot); + assert_eq!(spot, 7); + } + } + + #[test] + fn test_i8x16() { + const A: v128 = super::i8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + compare_bytes(A, A); + + const _: v128 = i16x8(0, 1, 2, 3, 4, 5, 6, 7); + const _: v128 = i32x4(0, 1, 2, 3); + const _: v128 = i64x2(0, 1); + const _: v128 = f32x4(0., 1., 2., 3.); + const _: v128 = f64x2(0., 1.); + + let bytes: [i16; 8] = unsafe { mem::transmute(i16x8(-1, -2, -3, -4, -5, -6, -7, -8)) }; + assert_eq!(bytes, [-1, -2, -3, -4, -5, -6, -7, -8]); + let bytes: [i8; 16] = unsafe { + mem::transmute(i8x16( + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, + )) + }; + assert_eq!( + bytes, + [ + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16 + ] + ); + } + + #[test] + fn test_shuffle() { + let vec_a = i8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let vec_b = i8x16( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + + let vec_r = i8x16_shuffle::<0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30>( + vec_a, vec_b, + ); + let vec_e = i8x16(0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30); + compare_bytes(vec_r, vec_e); + + let vec_a = i16x8(0, 1, 2, 3, 4, 5, 6, 7); + let vec_b = i16x8(8, 9, 10, 11, 12, 13, 14, 15); + let vec_r = i16x8_shuffle::<0, 8, 2, 10, 4, 12, 6, 14>(vec_a, vec_b); + let vec_e = i16x8(0, 8, 2, 10, 4, 12, 6, 14); + compare_bytes(vec_r, vec_e); + + let vec_a = i32x4(0, 1, 2, 3); + let vec_b = i32x4(4, 5, 6, 7); + let vec_r = i32x4_shuffle::<0, 4, 2, 6>(vec_a, vec_b); + let vec_e = i32x4(0, 4, 2, 6); + compare_bytes(vec_r, vec_e); + + let vec_a = i64x2(0, 1); + let vec_b = i64x2(2, 3); + let vec_r = i64x2_shuffle::<0, 2>(vec_a, vec_b); + let vec_e = i64x2(0, 2); + compare_bytes(vec_r, vec_e); + } + + // tests extract and replace lanes + macro_rules! 
test_extract { + ( + name: $test_id:ident, + extract: $extract:ident, + replace: $replace:ident, + elem: $elem:ty, + count: $count:expr, + indices: [$($idx:expr),*], + ) => { + #[test] + fn $test_id() { + unsafe { + let arr: [$elem; $count] = [123 as $elem; $count]; + let vec: v128 = transmute(arr); + $( + assert_eq!($extract::<$idx>(vec), 123 as $elem); + )* + + // create a vector from array and check that the indices contain + // the same values as in the array: + let arr: [$elem; $count] = [$($idx as $elem),*]; + let vec: v128 = transmute(arr); + $( + assert_eq!($extract::<$idx>(vec), $idx as $elem); + + let tmp = $replace::<$idx>(vec, 124 as $elem); + assert_eq!($extract::<$idx>(tmp), 124 as $elem); + )* + } + } + } + } + + test_extract! { + name: test_i8x16_extract_replace, + extract: i8x16_extract_lane, + replace: i8x16_replace_lane, + elem: i8, + count: 16, + indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + } + test_extract! { + name: test_i16x8_extract_replace, + extract: i16x8_extract_lane, + replace: i16x8_replace_lane, + elem: i16, + count: 8, + indices: [0, 1, 2, 3, 4, 5, 6, 7], + } + test_extract! { + name: test_i32x4_extract_replace, + extract: i32x4_extract_lane, + replace: i32x4_replace_lane, + elem: i32, + count: 4, + indices: [0, 1, 2, 3], + } + test_extract! { + name: test_i64x2_extract_replace, + extract: i64x2_extract_lane, + replace: i64x2_replace_lane, + elem: i64, + count: 2, + indices: [0, 1], + } + test_extract! { + name: test_f32x4_extract_replace, + extract: f32x4_extract_lane, + replace: f32x4_replace_lane, + elem: f32, + count: 4, + indices: [0, 1, 2, 3], + } + test_extract! { + name: test_f64x2_extract_replace, + extract: f64x2_extract_lane, + replace: f64x2_replace_lane, + elem: f64, + count: 2, + indices: [0, 1], + } + + #[test] + #[rustfmt::skip] + fn test_swizzle() { + compare_bytes( + i8x16_swizzle( + i32x4(1, 2, 3, 4), + i8x16( + 32, 31, 30, 29, + 0, 1, 2, 3, + 12, 13, 14, 15, + 0, 4, 8, 12), + ), + i32x4(0, 1, 4, 0x04030201), + ); + } + + macro_rules! test_splat { + ($test_id:ident: $val:expr => $($vals:expr),*) => { + #[test] + fn $test_id() { + let a = super::$test_id($val); + let b = u8x16($($vals as u8),*); + compare_bytes(a, b); + } + } + } + + mod splats { + use super::*; + test_splat!(i8x16_splat: 42 => 42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42); + test_splat!(i16x8_splat: 42 => 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0); + test_splat!(i32x4_splat: 42 => 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0); + test_splat!(i64x2_splat: 42 => 42, 0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0); + test_splat!(f32x4_splat: 42. => 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66); + test_splat!(f64x2_splat: 42. 
=> 0, 0, 0, 0, 0, 0, 69, 64, 0, 0, 0, 0, 0, 0, 69, 64); + } + + #[test] + fn test_bitmasks() { + let zero = i8x16_splat(0); + let ones = i8x16_splat(!0); + + assert_eq!(i8x16_bitmask(zero), 0); + assert_eq!(i8x16_bitmask(ones), 0xffff); + assert_eq!(i8x16_bitmask(i8x16_splat(i8::MAX)), 0); + assert_eq!(i8x16_bitmask(i8x16_splat(i8::MIN)), 0xffff); + assert_eq!(i8x16_bitmask(i8x16_replace_lane::<1>(zero, -1)), 0b10); + + assert_eq!(i16x8_bitmask(zero), 0); + assert_eq!(i16x8_bitmask(ones), 0xff); + assert_eq!(i16x8_bitmask(i16x8_splat(i16::MAX)), 0); + assert_eq!(i16x8_bitmask(i16x8_splat(i16::MIN)), 0xff); + assert_eq!(i16x8_bitmask(i16x8_replace_lane::<1>(zero, -1)), 0b10); + + assert_eq!(i32x4_bitmask(zero), 0); + assert_eq!(i32x4_bitmask(ones), 0b1111); + assert_eq!(i32x4_bitmask(i32x4_splat(i32::MAX)), 0); + assert_eq!(i32x4_bitmask(i32x4_splat(i32::MIN)), 0b1111); + assert_eq!(i32x4_bitmask(i32x4_replace_lane::<1>(zero, -1)), 0b10); + + assert_eq!(i64x2_bitmask(zero), 0); + assert_eq!(i64x2_bitmask(ones), 0b11); + assert_eq!(i64x2_bitmask(i64x2_splat(i64::MAX)), 0); + assert_eq!(i64x2_bitmask(i64x2_splat(i64::MIN)), 0b11); + assert_eq!(i64x2_bitmask(i64x2_replace_lane::<1>(zero, -1)), 0b10); + } + + #[test] + fn test_narrow() { + let zero = i8x16_splat(0); + let ones = i8x16_splat(!0); + + compare_bytes(i8x16_narrow_i16x8(zero, zero), zero); + compare_bytes(u8x16_narrow_i16x8(zero, zero), zero); + compare_bytes(i8x16_narrow_i16x8(ones, ones), ones); + compare_bytes(u8x16_narrow_i16x8(ones, ones), zero); + + compare_bytes( + i8x16_narrow_i16x8( + i16x8( + 0, + 1, + 2, + -1, + i8::MIN.into(), + i8::MAX.into(), + u8::MIN.into(), + u8::MAX.into(), + ), + i16x8( + i16::MIN, + i16::MAX, + u16::MIN as i16, + u16::MAX as i16, + 0, + 0, + 0, + 0, + ), + ), + i8x16(0, 1, 2, -1, -128, 127, 0, 127, -128, 127, 0, -1, 0, 0, 0, 0), + ); + + compare_bytes( + u8x16_narrow_i16x8( + i16x8( + 0, + 1, + 2, + -1, + i8::MIN.into(), + i8::MAX.into(), + u8::MIN.into(), + u8::MAX.into(), + ), + i16x8( + i16::MIN, + i16::MAX, + u16::MIN as i16, + u16::MAX as i16, + 0, + 0, + 0, + 0, + ), + ), + i8x16(0, 1, 2, 0, 0, 127, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0), + ); + + compare_bytes(i16x8_narrow_i32x4(zero, zero), zero); + compare_bytes(u16x8_narrow_i32x4(zero, zero), zero); + compare_bytes(i16x8_narrow_i32x4(ones, ones), ones); + compare_bytes(u16x8_narrow_i32x4(ones, ones), zero); + + compare_bytes( + i16x8_narrow_i32x4( + i32x4(0, -1, i16::MIN.into(), i16::MAX.into()), + i32x4(i32::MIN, i32::MAX, u32::MIN as i32, u32::MAX as i32), + ), + i16x8(0, -1, i16::MIN, i16::MAX, i16::MIN, i16::MAX, 0, -1), + ); + + compare_bytes( + u16x8_narrow_i32x4( + i32x4(u16::MAX.into(), -1, i16::MIN.into(), i16::MAX.into()), + i32x4(i32::MIN, i32::MAX, u32::MIN as i32, u32::MAX as i32), + ), + i16x8(-1, 0, 0, i16::MAX, 0, -1, 0, 0), + ); + } + + #[test] + fn test_extend() { + let zero = i8x16_splat(0); + let ones = i8x16_splat(!0); + + compare_bytes(i16x8_extend_low_i8x16(zero), zero); + compare_bytes(i16x8_extend_high_i8x16(zero), zero); + compare_bytes(i16x8_extend_low_u8x16(zero), zero); + compare_bytes(i16x8_extend_high_u8x16(zero), zero); + compare_bytes(i16x8_extend_low_i8x16(ones), ones); + compare_bytes(i16x8_extend_high_i8x16(ones), ones); + let halves = u16x8_splat(u8::MAX.into()); + compare_bytes(i16x8_extend_low_u8x16(ones), halves); + compare_bytes(i16x8_extend_high_u8x16(ones), halves); + + compare_bytes(i32x4_extend_low_i16x8(zero), zero); + compare_bytes(i32x4_extend_high_i16x8(zero), zero); + 
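// Scalar sketch of the widening exercised by test_extend: the low (or high)
// half of the lanes is sign-extended for the i* variants and zero-extended for
// the u* variants, which is why extending an all-ones u8 vector yields 0x00ff
// in every result lane. The helper names are illustrative only.
fn extend_low_i8_ref(a: [i8; 16]) -> [i16; 8] {
    core::array::from_fn(|i| a[i] as i16) // sign-extend the low 8 lanes
}
fn extend_low_u8_ref(a: [u8; 16]) -> [i16; 8] {
    core::array::from_fn(|i| a[i] as i16) // zero-extend the low 8 lanes
}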
compare_bytes(i32x4_extend_low_u16x8(zero), zero); + compare_bytes(i32x4_extend_high_u16x8(zero), zero); + compare_bytes(i32x4_extend_low_i16x8(ones), ones); + compare_bytes(i32x4_extend_high_i16x8(ones), ones); + let halves = u32x4_splat(u16::MAX.into()); + compare_bytes(i32x4_extend_low_u16x8(ones), halves); + compare_bytes(i32x4_extend_high_u16x8(ones), halves); + + compare_bytes(i64x2_extend_low_i32x4(zero), zero); + compare_bytes(i64x2_extend_high_i32x4(zero), zero); + compare_bytes(i64x2_extend_low_u32x4(zero), zero); + compare_bytes(i64x2_extend_high_u32x4(zero), zero); + compare_bytes(i64x2_extend_low_i32x4(ones), ones); + compare_bytes(i64x2_extend_high_i32x4(ones), ones); + let halves = i64x2_splat(u32::MAX.into()); + compare_bytes(u64x2_extend_low_u32x4(ones), halves); + compare_bytes(u64x2_extend_high_u32x4(ones), halves); + } + + #[test] + fn test_dot() { + let zero = i8x16_splat(0); + let ones = i8x16_splat(!0); + let two = i32x4_splat(2); + compare_bytes(i32x4_dot_i16x8(zero, zero), zero); + compare_bytes(i32x4_dot_i16x8(ones, ones), two); + } + + macro_rules! test_binop { + ( + $($name:ident => { + $([$($vec1:tt)*] ($op:ident | $f:ident) [$($vec2:tt)*],)* + })* + ) => ($( + #[test] + fn $name() { + unsafe { + $( + let v1 = [$($vec1)*]; + let v2 = [$($vec2)*]; + let v1_v128: v128 = mem::transmute(v1); + let v2_v128: v128 = mem::transmute(v2); + let v3_v128 = super::$f(v1_v128, v2_v128); + let mut v3 = [$($vec1)*]; + let _ignore = v3; + v3 = mem::transmute(v3_v128); + + for (i, actual) in v3.iter().enumerate() { + let expected = v1[i].$op(v2[i]); + assert_eq!(*actual, expected); + } + )* + } + } + )*) + } + + macro_rules! test_unop { + ( + $($name:ident => { + $(($op:ident | $f:ident) [$($vec1:tt)*],)* + })* + ) => ($( + #[test] + fn $name() { + unsafe { + $( + let v1 = [$($vec1)*]; + let v1_v128: v128 = mem::transmute(v1); + let v2_v128 = super::$f(v1_v128); + let mut v2 = [$($vec1)*]; + let _ignore = v2; + v2 = mem::transmute(v2_v128); + + for (i, actual) in v2.iter().enumerate() { + let expected = v1[i].$op(); + assert_eq!(*actual, expected); + } + )* + } + } + )*) + } + + trait Avgr: Sized { + fn avgr(self, other: Self) -> Self; + } + + macro_rules! impl_avgr { + ($($i:ident)*) => ($(impl Avgr for $i { + fn avgr(self, other: Self) -> Self { + ((self as u64 + other as u64 + 1) / 2) as $i + } + })*) + } + + impl_avgr!(u8 u16); + + test_binop! 
{ + test_i8x16_add => { + [0i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + (wrapping_add | i8x16_add) + [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (wrapping_add | i8x16_add) + [-2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18], + + [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (wrapping_add | i8x16_add) + [127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 9, -24], + } + + test_i8x16_add_sat_s => { + [0i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + (saturating_add | i8x16_add_sat) + [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (saturating_add | i8x16_add_sat) + [-2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18], + + [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (saturating_add | i8x16_add_sat) + [127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 9, -24], + } + + test_i8x16_add_sat_u => { + [0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + (saturating_add | u8x16_add_sat) + [1u8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (saturating_add | u8x16_add_sat) + [255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240], + + [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (saturating_add | u8x16_add_sat) + [127, -44i8 as u8, 43, 126, 4, 2, 9, -3i8 as u8, -59i8 as u8, -43i8 as u8, 39, -69i8 as u8, 79, -3i8 as u8, 9, -24i8 as u8], + } + + test_i8x16_sub => { + [0i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + (wrapping_sub | i8x16_sub) + [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (wrapping_sub | i8x16_sub) + [-2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18], + + [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (wrapping_sub | i8x16_sub) + [-127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 4, 8], + } + + test_i8x16_sub_sat_s => { + [0i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + (saturating_sub | i8x16_sub_sat) + [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (saturating_sub | i8x16_sub_sat) + [-2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18], + + [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (saturating_sub | i8x16_sub_sat) + [-127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 4, 8], + } + + test_i8x16_sub_sat_u => { + [0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + (saturating_sub | u8x16_sub_sat) + [1u8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (saturating_sub | u8x16_sub_sat) + [255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240], + + [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (saturating_sub | u8x16_sub_sat) + [127, -44i8 as u8, 43, 126, 4, 2, 9, -3i8 as u8, -59i8 as u8, -43i8 as u8, 39, -69i8 as u8, 79, -3i8 as u8, 9, -24i8 as u8], + } + + test_i8x16_min_s => { + [0i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + (min | i8x16_min) + [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (min | i8x16_min) + [-2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18], + + [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + 
(min | i8x16_min) + [-127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 4, 8], + } + + test_i8x16_min_u => { + [0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + (min | u8x16_min) + [1u8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (min | u8x16_min) + [255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240], + + [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (min | u8x16_min) + [127, -44i8 as u8, 43, 126, 4, 2, 9, -3i8 as u8, -59i8 as u8, -43i8 as u8, 39, -69i8 as u8, 79, -3i8 as u8, 9, -24i8 as u8], + } + + test_i8x16_max_s => { + [0i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + (max | i8x16_max) + [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (max | i8x16_max) + [-2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18], + + [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (max | i8x16_max) + [-127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 4, 8], + } + + test_i8x16_max_u => { + [0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + (max | u8x16_max) + [1u8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (max | u8x16_max) + [255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240], + + [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (max | u8x16_max) + [127, -44i8 as u8, 43, 126, 4, 2, 9, -3i8 as u8, -59i8 as u8, -43i8 as u8, 39, -69i8 as u8, 79, -3i8 as u8, 9, -24i8 as u8], + } + + test_i8x16_avgr_u => { + [0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + (avgr | u8x16_avgr) + [1u8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (avgr | u8x16_avgr) + [255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240], + + [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + (avgr | u8x16_avgr) + [127, -44i8 as u8, 43, 126, 4, 2, 9, -3i8 as u8, -59i8 as u8, -43i8 as u8, 39, -69i8 as u8, 79, -3i8 as u8, 9, -24i8 as u8], + } + + test_i16x8_add => { + [0i16, 0, 0, 0, 0, 0, 0, 0] + (wrapping_add | i16x8_add) + [1i16, 1, 1, 1, 1, 1, 1, 1], + + [1i16, 2, 3, 4, 5, 6, 7, 8] + (wrapping_add | i16x8_add) + [32767, 8, -2494,-4, 4882, -4, 848, 3830], + } + + test_i16x8_add_sat_s => { + [0i16, 0, 0, 0, 0, 0, 0, 0] + (saturating_add | i16x8_add_sat) + [1i16, 1, 1, 1, 1, 1, 1, 1], + + [1i16, 2, 3, 4, 5, 6, 7, 8] + (saturating_add | i16x8_add_sat) + [32767, 8, -2494,-4, 4882, -4, 848, 3830], + } + + test_i16x8_add_sat_u => { + [0u16, 0, 0, 0, 0, 0, 0, 0] + (saturating_add | u16x8_add_sat) + [1u16, 1, 1, 1, 1, 1, 1, 1], + + [1u16, 2, 3, 4, 5, 6, 7, 8] + (saturating_add | u16x8_add_sat) + [32767, 8, -2494i16 as u16,-4i16 as u16, 4882, -4i16 as u16, 848, 3830], + } + + test_i16x8_sub => { + [0i16, 0, 0, 0, 0, 0, 0, 0] + (wrapping_sub | i16x8_sub) + [1i16, 1, 1, 1, 1, 1, 1, 1], + + [1i16, 2, 3, 4, 5, 6, 7, 8] + (wrapping_sub | i16x8_sub) + [32767, 8, -2494,-4, 4882, -4, 848, 3830], + } + + test_i16x8_sub_sat_s => { + [0i16, 0, 0, 0, 0, 0, 0, 0] + (saturating_sub | i16x8_sub_sat) + [1i16, 1, 1, 1, 1, 1, 1, 1], + + [1i16, 2, 3, 4, 5, 6, 7, 8] + (saturating_sub | i16x8_sub_sat) + [32767, 8, -2494,-4, 4882, -4, 848, 3830], + } + + test_i16x8_sub_sat_u => { + [0u16, 0, 0, 0, 0, 0, 0, 0] + (saturating_sub | u16x8_sub_sat) + [1u16, 1, 1, 1, 1, 1, 1, 1], + + [1u16, 2, 3, 4, 5, 6, 7, 8] + (saturating_sub | 
u16x8_sub_sat) + [32767, 8, -2494i16 as u16,-4i16 as u16, 4882, -4i16 as u16, 848, 3830], + } + + test_i16x8_mul => { + [0i16, 0, 0, 0, 0, 0, 0, 0] + (wrapping_mul | i16x8_mul) + [1i16, 1, 1, 1, 1, 1, 1, 1], + + [1i16, 2, 3, 4, 5, 6, 7, 8] + (wrapping_mul | i16x8_mul) + [32767, 8, -2494,-4, 4882, -4, 848, 3830], + } + + test_i16x8_min_s => { + [0i16, 0, 0, 0, 0, 0, 0, 0] + (min | i16x8_min) + [1i16, 1, 1, 1, 1, 1, 1, 1], + + [1i16, 2, 3, 4, 5, 6, 7, 8] + (min | i16x8_min) + [32767, 8, -2494,-4, 4882, -4, 848, 3830], + } + + test_i16x8_min_u => { + [0u16, 0, 0, 0, 0, 0, 0, 0] + (min | u16x8_min) + [1u16, 1, 1, 1, 1, 1, 1, 1], + + [1u16, 2, 3, 4, 5, 6, 7, 8] + (min | u16x8_min) + [32767, 8, -2494i16 as u16,-4i16 as u16, 4882, -4i16 as u16, 848, 3830], + } + + test_i16x8_max_s => { + [0i16, 0, 0, 0, 0, 0, 0, 0] + (max | i16x8_max) + [1i16, 1, 1, 1, 1, 1, 1, 1], + + [1i16, 2, 3, 4, 5, 6, 7, 8] + (max | i16x8_max) + [32767, 8, -2494,-4, 4882, -4, 848, 3830], + } + + test_i16x8_max_u => { + [0u16, 0, 0, 0, 0, 0, 0, 0] + (max | u16x8_max) + [1u16, 1, 1, 1, 1, 1, 1, 1], + + [1u16, 2, 3, 4, 5, 6, 7, 8] + (max | u16x8_max) + [32767, 8, -2494i16 as u16,-4i16 as u16, 4882, -4i16 as u16, 848, 3830], + } + + test_i16x8_avgr_u => { + [0u16, 0, 0, 0, 0, 0, 0, 0] + (avgr | u16x8_avgr) + [1u16, 1, 1, 1, 1, 1, 1, 1], + + [1u16, 2, 3, 4, 5, 6, 7, 8] + (avgr | u16x8_avgr) + [32767, 8, -2494i16 as u16,-4i16 as u16, 4882, -4i16 as u16, 848, 3830], + } + + test_i32x4_add => { + [0i32, 0, 0, 0] (wrapping_add | i32x4_add) [1, 2, 3, 4], + [1i32, 1283, i32::MAX, i32::MIN] + (wrapping_add | i32x4_add) + [i32::MAX; 4], + } + + test_i32x4_sub => { + [0i32, 0, 0, 0] (wrapping_sub | i32x4_sub) [1, 2, 3, 4], + [1i32, 1283, i32::MAX, i32::MIN] + (wrapping_sub | i32x4_sub) + [i32::MAX; 4], + } + + test_i32x4_mul => { + [0i32, 0, 0, 0] (wrapping_mul | i32x4_mul) [1, 2, 3, 4], + [1i32, 1283, i32::MAX, i32::MIN] + (wrapping_mul | i32x4_mul) + [i32::MAX; 4], + } + + test_i32x4_min_s => { + [0i32, 0, 0, 0] (min | i32x4_min) [1, 2, 3, 4], + [1i32, 1283, i32::MAX, i32::MIN] + (min | i32x4_min) + [i32::MAX; 4], + } + + test_i32x4_min_u => { + [0u32, 0, 0, 0] (min | u32x4_min) [1, 2, 3, 4], + [1u32, 1283, i32::MAX as u32, i32::MIN as u32] + (min | u32x4_min) + [i32::MAX as u32; 4], + } + + test_i32x4_max_s => { + [0i32, 0, 0, 0] (max | i32x4_max) [1, 2, 3, 4], + [1i32, 1283, i32::MAX, i32::MIN] + (max | i32x4_max) + [i32::MAX; 4], + } + + test_i32x4_max_u => { + [0u32, 0, 0, 0] (max | u32x4_max) [1, 2, 3, 4], + [1u32, 1283, i32::MAX as u32, i32::MIN as u32] + (max | u32x4_max) + [i32::MAX as u32; 4], + } + + test_i64x2_add => { + [0i64, 0] (wrapping_add | i64x2_add) [1, 2], + [i64::MIN, i64::MAX] (wrapping_add | i64x2_add) [i64::MAX, i64::MIN], + [i64::MAX; 2] (wrapping_add | i64x2_add) [i64::MAX; 2], + [-4i64, -4] (wrapping_add | i64x2_add) [800, 939], + } + + test_i64x2_sub => { + [0i64, 0] (wrapping_sub | i64x2_sub) [1, 2], + [i64::MIN, i64::MAX] (wrapping_sub | i64x2_sub) [i64::MAX, i64::MIN], + [i64::MAX; 2] (wrapping_sub | i64x2_sub) [i64::MAX; 2], + [-4i64, -4] (wrapping_sub | i64x2_sub) [800, 939], + } + + test_i64x2_mul => { + [0i64, 0] (wrapping_mul | i64x2_mul) [1, 2], + [i64::MIN, i64::MAX] (wrapping_mul | i64x2_mul) [i64::MAX, i64::MIN], + [i64::MAX; 2] (wrapping_mul | i64x2_mul) [i64::MAX; 2], + [-4i64, -4] (wrapping_mul | i64x2_mul) [800, 939], + } + + test_f32x4_add => { + [-1.0f32, 2.0, 3.0, 4.0] (add | f32x4_add) [1., 2., 0., 0.], + [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0] + (add | f32x4_add) + [1., 2., 0., 
0.], + } + + test_f32x4_sub => { + [-1.0f32, 2.0, 3.0, 4.0] (sub | f32x4_sub) [1., 2., 0., 0.], + [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0] + (sub | f32x4_sub) + [1., 2., 0., 0.], + } + + test_f32x4_mul => { + [-1.0f32, 2.0, 3.0, 4.0] (mul | f32x4_mul) [1., 2., 0., 0.], + [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0] + (mul | f32x4_mul) + [1., 2., 1., 0.], + } + + test_f32x4_div => { + [-1.0f32, 2.0, 3.0, 4.0] (div | f32x4_div) [1., 2., 0., 0.], + [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0] + (div | f32x4_div) + [1., 2., 0., 0.], + } + + test_f32x4_min => { + [-1.0f32, 2.0, 3.0, 4.0] (min | f32x4_min) [1., 2., 0., 0.], + [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0] + (min | f32x4_min) + [1., 2., 0., 0.], + } + + test_f32x4_max => { + [-1.0f32, 2.0, 3.0, 4.0] (max | f32x4_max) [1., 2., 0., 0.], + [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0] + (max | f32x4_max) + [1., 2., 0., 0.], + } + + test_f32x4_pmin => { + [-1.0f32, 2.0, 3.0, 4.0] (min | f32x4_pmin) [1., 2., 0., 0.], + [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0] + (min | f32x4_pmin) + [1., 2., 0., 0.], + } + + test_f32x4_pmax => { + [-1.0f32, 2.0, 3.0, 4.0] (max | f32x4_pmax) [1., 2., 0., 0.], + [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0] + (max | f32x4_pmax) + [1., 2., 0., 0.], + } + + test_f64x2_add => { + [-1.0f64, 2.0] (add | f64x2_add) [1., 2.], + [f64::INFINITY, f64::NEG_INFINITY] (add | f64x2_add) [1., 2.], + } + + test_f64x2_sub => { + [-1.0f64, 2.0] (sub | f64x2_sub) [1., 2.], + [f64::INFINITY, f64::NEG_INFINITY] (sub | f64x2_sub) [1., 2.], + } + + test_f64x2_mul => { + [-1.0f64, 2.0] (mul | f64x2_mul) [1., 2.], + [f64::INFINITY, f64::NEG_INFINITY] (mul | f64x2_mul) [1., 2.], + } + + test_f64x2_div => { + [-1.0f64, 2.0] (div | f64x2_div) [1., 2.], + [f64::INFINITY, f64::NEG_INFINITY] (div | f64x2_div) [1., 2.], + } + + test_f64x2_min => { + [-1.0f64, 2.0] (min | f64x2_min) [1., 2.], + [f64::INFINITY, f64::NEG_INFINITY] (min | f64x2_min) [1., 2.], + } + + test_f64x2_max => { + [-1.0f64, 2.0] (max | f64x2_max) [1., 2.], + [f64::INFINITY, f64::NEG_INFINITY] (max | f64x2_max) [1., 2.], + } + + test_f64x2_pmin => { + [-1.0f64, 2.0] (min | f64x2_pmin) [1., 2.], + [f64::INFINITY, f64::NEG_INFINITY] (min | f64x2_pmin) [1., 2.], + } + + test_f64x2_pmax => { + [-1.0f64, 2.0] (max | f64x2_pmax) [1., 2.], + [f64::INFINITY, f64::NEG_INFINITY] (max | f64x2_pmax) [1., 2.], + } + } + + test_unop! 
{ + test_i8x16_abs => { + (wrapping_abs | i8x16_abs) + [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + (wrapping_abs | i8x16_abs) + [-2i8, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18], + + (wrapping_abs | i8x16_abs) + [-127i8, -44, 43, 126, 4, -128, 127, -59, -43, 39, -69, 79, -3, 35, 83, 13], + } + + test_i8x16_neg => { + (wrapping_neg | i8x16_neg) + [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + (wrapping_neg | i8x16_neg) + [-2i8, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18], + + (wrapping_neg | i8x16_neg) + [-127i8, -44, 43, 126, 4, -128, 127, -59, -43, 39, -69, 79, -3, 35, 83, 13], + } + + test_i16x8_abs => { + (wrapping_abs | i16x8_abs) [1i16, 1, 1, 1, 1, 1, 1, 1], + (wrapping_abs | i16x8_abs) [2i16, 0x7fff, !0, 4, 42, -5, 33, -4847], + } + + test_i16x8_neg => { + (wrapping_neg | i16x8_neg) [1i16, 1, 1, 1, 1, 1, 1, 1], + (wrapping_neg | i16x8_neg) [2i16, 0x7fff, !0, 4, 42, -5, 33, -4847], + } + + test_i32x4_abs => { + (wrapping_abs | i32x4_abs) [1i32, 2, 3, 4], + (wrapping_abs | i32x4_abs) [i32::MIN, i32::MAX, 0, 4], + } + + test_i32x4_neg => { + (wrapping_neg | i32x4_neg) [1i32, 2, 3, 4], + (wrapping_neg | i32x4_neg) [i32::MIN, i32::MAX, 0, 4], + } + + test_i64x2_abs => { + (wrapping_abs | i64x2_abs) [1i64, 2], + (wrapping_abs | i64x2_abs) [i64::MIN, i64::MAX], + } + + test_i64x2_neg => { + (wrapping_neg | i64x2_neg) [1i64, 2], + (wrapping_neg | i64x2_neg) [i64::MIN, i64::MAX], + } + + test_f32x4_ceil => { + (ceil | f32x4_ceil) [1.0f32, 2., 2.5, 3.3], + (ceil | f32x4_ceil) [0.0, -0.3, f32::INFINITY, -0.0], + } + + test_f32x4_floor => { + (floor | f32x4_floor) [1.0f32, 2., 2.5, 3.3], + (floor | f32x4_floor) [0.0, -0.3, f32::INFINITY, -0.0], + } + + test_f32x4_trunc => { + (trunc | f32x4_trunc) [1.0f32, 2., 2.5, 3.3], + (trunc | f32x4_trunc) [0.0, -0.3, f32::INFINITY, -0.0], + } + + test_f32x4_nearest => { + (round | f32x4_nearest) [1.0f32, 2., 2.6, 3.3], + (round | f32x4_nearest) [0.0, -0.3, f32::INFINITY, -0.0], + } + + test_f32x4_abs => { + (abs | f32x4_abs) [1.0f32, 2., 2.6, 3.3], + (abs | f32x4_abs) [0.0, -0.3, f32::INFINITY, -0.0], + } + + test_f32x4_neg => { + (neg | f32x4_neg) [1.0f32, 2., 2.6, 3.3], + (neg | f32x4_neg) [0.0, -0.3, f32::INFINITY, -0.0], + } + + test_f32x4_sqrt => { + (sqrt | f32x4_sqrt) [1.0f32, 2., 2.6, 3.3], + (sqrt | f32x4_sqrt) [0.0, 0.3, f32::INFINITY, 0.1], + } + + test_f64x2_ceil => { + (ceil | f64x2_ceil) [1.0f64, 2.3], + (ceil | f64x2_ceil) [f64::INFINITY, -0.1], + } + + test_f64x2_floor => { + (floor | f64x2_floor) [1.0f64, 2.3], + (floor | f64x2_floor) [f64::INFINITY, -0.1], + } + + test_f64x2_trunc => { + (trunc | f64x2_trunc) [1.0f64, 2.3], + (trunc | f64x2_trunc) [f64::INFINITY, -0.1], + } + + test_f64x2_nearest => { + (round | f64x2_nearest) [1.0f64, 2.3], + (round | f64x2_nearest) [f64::INFINITY, -0.1], + } + + test_f64x2_abs => { + (abs | f64x2_abs) [1.0f64, 2.3], + (abs | f64x2_abs) [f64::INFINITY, -0.1], + } + + test_f64x2_neg => { + (neg | f64x2_neg) [1.0f64, 2.3], + (neg | f64x2_neg) [f64::INFINITY, -0.1], + } + + test_f64x2_sqrt => { + (sqrt | f64x2_sqrt) [1.0f64, 2.3], + (sqrt | f64x2_sqrt) [f64::INFINITY, 0.1], + } + } + + macro_rules! floating_point { + (f32) => { + true + }; + (f64) => { + true + }; + ($id:ident) => { + false + }; + } + + trait IsNan: Sized { + fn is_nan(self) -> bool { + false + } + } + impl IsNan for i8 {} + impl IsNan for i16 {} + impl IsNan for i32 {} + impl IsNan for i64 {} + + macro_rules! 
test_bop { + ($id:ident[$ety:ident; $ecount:expr] | + $binary_op:ident [$op_test_id:ident] : + ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => { + test_bop!( + $id[$ety; $ecount] => $ety | $binary_op [ $op_test_id ]: + ([$($in_a),*], [$($in_b),*]) => [$($out),*] + ); + + }; + ($id:ident[$ety:ident; $ecount:expr] => $oty:ident | + $binary_op:ident [$op_test_id:ident] : + ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => { + #[test] + fn $op_test_id() { + unsafe { + let a_input: [$ety; $ecount] = [$($in_a),*]; + let b_input: [$ety; $ecount] = [$($in_b),*]; + let output: [$oty; $ecount] = [$($out),*]; + + let a_vec_in: v128 = transmute(a_input); + let b_vec_in: v128 = transmute(b_input); + let vec_res: v128 = $binary_op(a_vec_in, b_vec_in); + + let res: [$oty; $ecount] = transmute(vec_res); + + if !floating_point!($ety) { + assert_eq!(res, output); + } else { + for i in 0..$ecount { + let r = res[i]; + let o = output[i]; + assert_eq!(r.is_nan(), o.is_nan()); + if !r.is_nan() { + assert_eq!(r, o); + } + } + } + } + } + } + } + + macro_rules! test_bops { + ($id:ident[$ety:ident; $ecount:expr] | + $binary_op:ident [$op_test_id:ident]: + ([$($in_a:expr),*], $in_b:expr) => [$($out:expr),*]) => { + #[test] + fn $op_test_id() { + unsafe { + let a_input: [$ety; $ecount] = [$($in_a),*]; + let output: [$ety; $ecount] = [$($out),*]; + + let a_vec_in: v128 = transmute(a_input); + let vec_res: v128 = $binary_op(a_vec_in, $in_b); + + let res: [$ety; $ecount] = transmute(vec_res); + assert_eq!(res, output); + } + } + } + } + + macro_rules! test_uop { + ($id:ident[$ety:ident; $ecount:expr] | + $unary_op:ident [$op_test_id:ident]: [$($in_a:expr),*] => [$($out:expr),*]) => { + #[test] + fn $op_test_id() { + unsafe { + let a_input: [$ety; $ecount] = [$($in_a),*]; + let output: [$ety; $ecount] = [$($out),*]; + + let a_vec_in: v128 = transmute(a_input); + let vec_res: v128 = $unary_op(a_vec_in); + + let res: [$ety; $ecount] = transmute(vec_res); + assert_eq!(res, output); + } + } + } + } + + test_bops!(i8x16[i8; 16] | i8x16_shl[i8x16_shl_test]: + ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) => + [0, -2, 4, 6, 8, 10, 12, -2, 2, 2, 2, 2, 2, 2, 2, 2]); + test_bops!(i16x8[i16; 8] | i16x8_shl[i16x8_shl_test]: + ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) => + [0, -2, 4, 6, 8, 10, 12, -2]); + test_bops!(i32x4[i32; 4] | i32x4_shl[i32x4_shl_test]: + ([0, -1, 2, 3], 1) => [0, -2, 4, 6]); + test_bops!(i64x2[i64; 2] | i64x2_shl[i64x2_shl_test]: + ([0, -1], 1) => [0, -2]); + + test_bops!(i8x16[i8; 16] | i8x16_shr[i8x16_shr_s_test]: + ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) => + [0, -1, 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]); + test_bops!(i16x8[i16; 8] | i16x8_shr[i16x8_shr_s_test]: + ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) => + [0, -1, 1, 1, 2, 2, 3, i16::MAX / 2]); + test_bops!(i32x4[i32; 4] | i32x4_shr[i32x4_shr_s_test]: + ([0, -1, 2, 3], 1) => [0, -1, 1, 1]); + test_bops!(i64x2[i64; 2] | i64x2_shr[i64x2_shr_s_test]: + ([0, -1], 1) => [0, -1]); + + test_bops!(i8x16[i8; 16] | u8x16_shr[i8x16_uhr_u_test]: + ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) => + [0, i8::MAX, 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]); + test_bops!(i16x8[i16; 8] | u16x8_shr[i16x8_uhr_u_test]: + ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) => + [0, i16::MAX, 1, 1, 2, 2, 3, i16::MAX / 2]); + test_bops!(i32x4[i32; 4] | u32x4_shr[i32x4_uhr_u_test]: + ([0, -1, 2, 3], 1) => [0, i32::MAX, 1, 1]); + test_bops!(i64x2[i64; 2] | u64x2_shr[i64x2_uhr_u_test]: + ([0, -1], 1) => [0, 
i64::MAX]); + + #[test] + fn v128_bitwise_logical_ops() { + unsafe { + let a: [u32; 4] = [u32::MAX, 0, u32::MAX, 0]; + let b: [u32; 4] = [u32::MAX; 4]; + let c: [u32; 4] = [0; 4]; + + let vec_a: v128 = transmute(a); + let vec_b: v128 = transmute(b); + let vec_c: v128 = transmute(c); + + let r: v128 = v128_and(vec_a, vec_a); + compare_bytes(r, vec_a); + let r: v128 = v128_and(vec_a, vec_b); + compare_bytes(r, vec_a); + let r: v128 = v128_andnot(vec_a, vec_b); + compare_bytes(r, vec_c); + let r: v128 = v128_andnot(vec_a, vec_a); + compare_bytes(r, vec_c); + let r: v128 = v128_andnot(vec_a, vec_c); + compare_bytes(r, vec_a); + let r: v128 = v128_or(vec_a, vec_b); + compare_bytes(r, vec_b); + let r: v128 = v128_not(vec_b); + compare_bytes(r, vec_c); + let r: v128 = v128_xor(vec_a, vec_c); + compare_bytes(r, vec_a); + + let r: v128 = v128_bitselect(vec_b, vec_c, vec_b); + compare_bytes(r, vec_b); + let r: v128 = v128_bitselect(vec_b, vec_c, vec_c); + compare_bytes(r, vec_c); + let r: v128 = v128_bitselect(vec_b, vec_c, vec_a); + compare_bytes(r, vec_a); + } + } + + macro_rules! test_bool_red { + ([$test_id:ident, $any:ident, $all:ident] | [$($true:expr),*] | [$($false:expr),*] | [$($alt:expr),*]) => { + #[test] + fn $test_id() { + unsafe { + let vec_a: v128 = transmute([$($true),*]); // true + let vec_b: v128 = transmute([$($false),*]); // false + let vec_c: v128 = transmute([$($alt),*]); // alternating + + // TODO + // assert_eq!($any(vec_a), true); + // assert_eq!($any(vec_b), false); + // assert_eq!($any(vec_c), true); + + assert_eq!($all(vec_a), true); + assert_eq!($all(vec_b), false); + assert_eq!($all(vec_c), false); + } + } + } + } + + test_bool_red!( + [i8x16_boolean_reductions, v128_any_true, i8x16_all_true] + | [1_i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + | [0_i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + | [1_i8, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0] + ); + test_bool_red!( + [i16x8_boolean_reductions, v128_any_true, i16x8_all_true] + | [1_i16, 1, 1, 1, 1, 1, 1, 1] + | [0_i16, 0, 0, 0, 0, 0, 0, 0] + | [1_i16, 0, 1, 0, 1, 0, 1, 0] + ); + test_bool_red!( + [i32x4_boolean_reductions, v128_any_true, i32x4_all_true] + | [1_i32, 1, 1, 1] + | [0_i32, 0, 0, 0] + | [1_i32, 0, 1, 0] + ); + test_bool_red!( + [i64x2_boolean_reductions, v128_any_true, i64x2_all_true] + | [1_i64, 1] + | [0_i64, 0] + | [1_i64, 0] + ); + + test_bop!(i8x16[i8; 16] | i8x16_eq[i8x16_eq_test]: + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i16x8[i16; 8] | i16x8_eq[i16x8_eq_test]: + ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i32x4[i32; 4] | i32x4_eq[i32x4_eq_test]: + ([0, 1, 2, 3], [0, 2, 2, 4]) => [-1, 0, -1, 0]); + test_bop!(i64x2[i64; 2] | i64x2_eq[i64x2_eq_test]: + ([0, 1], [0, 2]) => [-1, 0]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_eq[f32x4_eq_test]: + ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_eq[f64x2_eq_test]: ([0., 1.], [0., 2.]) => [-1, 0]); + + test_bop!(i8x16[i8; 16] | i8x16_ne[i8x16_ne_test]: + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => + [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i16x8[i16; 8] | i16x8_ne[i16x8_ne_test]: + ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => + [0, -1, 0, -1 ,0, -1, 0, 0]); + 
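// The expected vectors in these comparison tests use -1 for "true": each lane
// of a SIMD comparison result is either all ones or all zeros, and an all-ones
// lane reinterpreted as a signed integer is -1. A per-lane scalar sketch
// (helper name illustrative):
fn cmp_eq_lane_ref(x: i32, y: i32) -> i32 {
    if x == y { -1 } else { 0 }
}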
test_bop!(i32x4[i32; 4] | i32x4_ne[i32x4_ne_test]: + ([0, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]); + test_bop!(i64x2[i64; 2] | i64x2_ne[i64x2_ne_test]: + ([0, 1], [0, 2]) => [0, -1]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_ne[f32x4_ne_test]: + ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [0, -1, 0, -1]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_ne[f64x2_ne_test]: ([0., 1.], [0., 2.]) => [0, -1]); + + test_bop!(i8x16[i8; 16] | i8x16_lt[i8x16_lt_s_test]: + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -12, 13, 14, 15], + [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => + [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1, -1, -1, 0, 0]); + test_bop!(i8x16[i8; 16] | u8x16_lt[i8x16_lt_u_test]: + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -12, 13, 14, 15], + [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => + [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i16x8[i16; 8] | i16x8_lt[i16x8_lt_s_test]: + ([0, 1, 2, 3, 4, 5, 6, -7], [0, 2, 2, 4, 4, 6, 6, 7]) => + [0, -1, 0, -1 ,0, -1, 0, -1]); + test_bop!(i16x8[i16; 8] | u16x8_lt[i16x8_lt_u_test]: + ([0, 1, 2, 3, 4, 5, 6, -7], [0, 2, 2, 4, 4, 6, 6, 7]) => + [0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i32x4[i32; 4] | i32x4_lt[i32x4_lt_s_test]: + ([-1, 1, 2, 3], [0, 2, 2, 4]) => [-1, -1, 0, -1]); + test_bop!(i32x4[i32; 4] | u32x4_lt[i32x4_lt_u_test]: + ([-1, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]); + test_bop!(i64x2[i64; 2] | i64x2_lt[i64x2_lt_s_test]: + ([-1, 3], [0, 2]) => [-1, 0]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_lt[f32x4_lt_test]: + ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [0, -1, 0, -1]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_lt[f64x2_lt_test]: ([0., 1.], [0., 2.]) => [0, -1]); + + test_bop!(i8x16[i8; 16] | i8x16_gt[i8x16_gt_s_test]: + ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, -15], + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) => + [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i8x16[i8; 16] | u8x16_gt[i8x16_gt_u_test]: + ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, -15], + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) => + [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, -1]); + test_bop!(i16x8[i16; 8] | i16x8_gt[i16x8_gt_s_test]: + ([0, 2, 2, 4, 4, 6, 6, -7], [0, 1, 2, 3, 4, 5, 6, 7]) => + [0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i16x8[i16; 8] | u16x8_gt[i16x8_gt_u_test]: + ([0, 2, 2, 4, 4, 6, 6, -7], [0, 1, 2, 3, 4, 5, 6, 7]) => + [0, -1, 0, -1 ,0, -1, 0, -1]); + test_bop!(i32x4[i32; 4] | i32x4_gt[i32x4_gt_s_test]: + ([0, 2, 2, -4], [0, 1, 2, 3]) => [0, -1, 0, 0]); + test_bop!(i32x4[i32; 4] | u32x4_gt[i32x4_gt_u_test]: + ([0, 2, 2, -4], [0, 1, 2, 3]) => [0, -1, 0, -1]); + test_bop!(i64x2[i64; 2] | i64x2_gt[i64x2_gt_s_test]: + ([-1, 2], [0, 1]) => [0, -1]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_gt[f32x4_gt_test]: + ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [0, -1, 0, -1]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_gt[f64x2_gt_test]: ([0., 2.], [0., 1.]) => [0, -1]); + + test_bop!(i8x16[i8; 16] | i8x16_ge[i8x16_ge_s_test]: + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -15], + [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, 0]); + test_bop!(i8x16[i8; 16] | u8x16_ge[i8x16_ge_u_test]: + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -15], + [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i16x8[i16; 8] | i16x8_ge[i16x8_ge_s_test]: + ([0, 1, 2, 3, 4, 5, 6, -7], [0, 2, 2, 4, 4, 6, 6, 
7]) => + [-1, 0, -1, 0 ,-1, 0, -1, 0]); + test_bop!(i16x8[i16; 8] | u16x8_ge[i16x8_ge_u_test]: + ([0, 1, 2, 3, 4, 5, 6, -7], [0, 2, 2, 4, 4, 6, 6, 7]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i32x4[i32; 4] | i32x4_ge[i32x4_ge_s_test]: + ([0, 1, 2, -3], [0, 2, 2, 4]) => [-1, 0, -1, 0]); + test_bop!(i32x4[i32; 4] | u32x4_ge[i32x4_ge_u_test]: + ([0, 1, 2, -3], [0, 2, 2, 4]) => [-1, 0, -1, -1]); + test_bop!(i64x2[i64; 2] | i64x2_ge[i64x2_ge_s_test]: + ([0, 1], [-1, 2]) => [-1, 0]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_ge[f32x4_ge_test]: + ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_ge[f64x2_ge_test]: ([0., 1.], [0., 2.]) => [-1, 0]); + + test_bop!(i8x16[i8; 16] | i8x16_le[i8x16_le_s_test]: + ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, -15], + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) => + [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i8x16[i8; 16] | u8x16_le[i8x16_le_u_test]: + ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, -15], + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) => + [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, 0]); + test_bop!(i16x8[i16; 8] | i16x8_le[i16x8_le_s_test]: + ([0, 2, 2, 4, 4, 6, 6, -7], [0, 1, 2, 3, 4, 5, 6, 7]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i16x8[i16; 8] | u16x8_le[i16x8_le_u_test]: + ([0, 2, 2, 4, 4, 6, 6, -7], [0, 1, 2, 3, 4, 5, 6, 7]) => + [-1, 0, -1, 0 ,-1, 0, -1, 0]); + test_bop!(i32x4[i32; 4] | i32x4_le[i32x4_le_s_test]: + ([0, 2, 2, -4], [0, 1, 2, 3]) => [-1, 0, -1, -1]); + test_bop!(i32x4[i32; 4] | u32x4_le[i32x4_le_u_test]: + ([0, 2, 2, -4], [0, 1, 2, 3]) => [-1, 0, -1, 0]); + test_bop!(i64x2[i64; 2] | i64x2_le[i64x2_le_s_test]: + ([0, 2], [0, 1]) => [-1, 0]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_le[f32x4_le_test]: + ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [-1, 0, -1, -0]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_le[f64x2_le_test]: ([0., 2.], [0., 1.]) => [-1, 0]); + + test_uop!(f32x4[f32; 4] | f32x4_neg[f32x4_neg_test]: [0., 1., 2., 3.] => [ 0., -1., -2., -3.]); + test_uop!(f32x4[f32; 4] | f32x4_abs[f32x4_abs_test]: [0., -1., 2., -3.] => [ 0., 1., 2., 3.]); + test_bop!(f32x4[f32; 4] | f32x4_min[f32x4_min_test]: + ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., -3., -4., 8.]); + test_bop!(f32x4[f32; 4] | f32x4_min[f32x4_min_test_nan]: + ([0., -1., 7., 8.], [1., -3., -4., f32::NAN]) + => [0., -3., -4., f32::NAN]); + test_bop!(f32x4[f32; 4] | f32x4_max[f32x4_max_test]: + ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -1., 7., 10.]); + test_bop!(f32x4[f32; 4] | f32x4_max[f32x4_max_test_nan]: + ([0., -1., 7., 8.], [1., -3., -4., f32::NAN]) + => [1., -1., 7., f32::NAN]); + test_bop!(f32x4[f32; 4] | f32x4_add[f32x4_add_test]: + ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -4., 3., 18.]); + test_bop!(f32x4[f32; 4] | f32x4_sub[f32x4_sub_test]: + ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [-1., 2., 11., -2.]); + test_bop!(f32x4[f32; 4] | f32x4_mul[f32x4_mul_test]: + ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., 3., -28., 80.]); + test_bop!(f32x4[f32; 4] | f32x4_div[f32x4_div_test]: + ([0., -8., 70., 8.], [1., 4., 10., 2.]) => [0., -2., 7., 4.]); + + test_uop!(f64x2[f64; 2] | f64x2_neg[f64x2_neg_test]: [0., 1.] => [ 0., -1.]); + test_uop!(f64x2[f64; 2] | f64x2_abs[f64x2_abs_test]: [0., -1.] 
=> [ 0., 1.]); + test_bop!(f64x2[f64; 2] | f64x2_min[f64x2_min_test]: + ([0., -1.], [1., -3.]) => [0., -3.]); + test_bop!(f64x2[f64; 2] | f64x2_min[f64x2_min_test_nan]: + ([7., 8.], [-4., f64::NAN]) + => [ -4., f64::NAN]); + test_bop!(f64x2[f64; 2] | f64x2_max[f64x2_max_test]: + ([0., -1.], [1., -3.]) => [1., -1.]); + test_bop!(f64x2[f64; 2] | f64x2_max[f64x2_max_test_nan]: + ([7., 8.], [ -4., f64::NAN]) + => [7., f64::NAN]); + test_bop!(f64x2[f64; 2] | f64x2_add[f64x2_add_test]: + ([0., -1.], [1., -3.]) => [1., -4.]); + test_bop!(f64x2[f64; 2] | f64x2_sub[f64x2_sub_test]: + ([0., -1.], [1., -3.]) => [-1., 2.]); + test_bop!(f64x2[f64; 2] | f64x2_mul[f64x2_mul_test]: + ([0., -1.], [1., -3.]) => [0., 3.]); + test_bop!(f64x2[f64; 2] | f64x2_div[f64x2_div_test]: + ([0., -8.], [1., 4.]) => [0., -2.]); + + macro_rules! test_conv { + ($test_id:ident | $conv_id:ident | $to_ty:ident | $from:expr, $to:expr) => { + #[test] + fn $test_id() { + unsafe { + let from: v128 = transmute($from); + let to: v128 = transmute($to); + + let r: v128 = $conv_id(from); + + compare_bytes(r, to); + } + } + }; + } + + test_conv!( + f32x4_convert_s_i32x4 | f32x4_convert_i32x4 | f32x4 | [1_i32, 2, 3, 4], + [1_f32, 2., 3., 4.] + ); + test_conv!( + f32x4_convert_u_i32x4 | f32x4_convert_u32x4 | f32x4 | [u32::MAX, 2, 3, 4], + [u32::MAX as f32, 2., 3., 4.] + ); + + #[test] + fn test_conversions() { + compare_bytes( + i32x4_trunc_sat_f32x4(f32x4(1., f32::NEG_INFINITY, f32::INFINITY, f32::NAN)), + i32x4(1, i32::MIN, i32::MAX, 0), + ); + compare_bytes( + u32x4_trunc_sat_f32x4(f32x4(1., f32::NEG_INFINITY, f32::INFINITY, f32::NAN)), + u32x4(1, 0, u32::MAX, 0), + ); + compare_bytes(f64x2_convert_low_i32x4(i32x4(1, 2, 3, 4)), f64x2(1., 2.)); + compare_bytes( + f64x2_convert_low_i32x4(i32x4(i32::MIN, i32::MAX, 3, 4)), + f64x2(f64::from(i32::MIN), f64::from(i32::MAX)), + ); + compare_bytes(f64x2_convert_low_u32x4(u32x4(1, 2, 3, 4)), f64x2(1., 2.)); + compare_bytes( + f64x2_convert_low_u32x4(u32x4(u32::MIN, u32::MAX, 3, 4)), + f64x2(f64::from(u32::MIN), f64::from(u32::MAX)), + ); + + compare_bytes( + i32x4_trunc_sat_f64x2_zero(f64x2(1., f64::NEG_INFINITY)), + i32x4(1, i32::MIN, 0, 0), + ); + compare_bytes( + i32x4_trunc_sat_f64x2_zero(f64x2(f64::NAN, f64::INFINITY)), + i32x4(0, i32::MAX, 0, 0), + ); + compare_bytes( + u32x4_trunc_sat_f64x2_zero(f64x2(1., f64::NEG_INFINITY)), + u32x4(1, 0, 0, 0), + ); + compare_bytes( + u32x4_trunc_sat_f64x2_zero(f64x2(f64::NAN, f64::INFINITY)), + u32x4(0, u32::MAX, 0, 0), + ); + } + + #[test] + fn test_popcnt() { + unsafe { + for i in 0..=255 { + compare_bytes( + i8x16_popcnt(u8x16_splat(i)), + u8x16_splat(i.count_ones() as u8), + ) + } + + let vectors = [ + [0u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [ + 100, 200, 50, 0, 10, 7, 38, 185, 192, 3, 34, 85, 93, 7, 31, 99, + ], + ]; + + for vector in vectors.iter() { + compare_bytes( + i8x16_popcnt(transmute(*vector)), + i8x16( + vector[0].count_ones() as i8, + vector[1].count_ones() as i8, + vector[2].count_ones() as i8, + vector[3].count_ones() as i8, + vector[4].count_ones() as i8, + vector[5].count_ones() as i8, + vector[6].count_ones() as i8, + vector[7].count_ones() as i8, + vector[8].count_ones() as i8, + vector[9].count_ones() as i8, + vector[10].count_ones() as i8, + vector[11].count_ones() as i8, + vector[12].count_ones() as i8, + vector[13].count_ones() as i8, + vector[14].count_ones() as i8, + vector[15].count_ones() as i8, + ), + ) + } + } + } + + #[test] + fn test_promote_demote() { + let tests = [ + [1., 2.], + [f64::NAN, 
f64::INFINITY], + [100., 201.], + [0., -0.], + [f64::NEG_INFINITY, 0.], + ]; + + for [a, b] in tests { + compare_bytes( + f32x4_demote_f64x2_zero(f64x2(a, b)), + f32x4(a as f32, b as f32, 0., 0.), + ); + compare_bytes( + f64x2_promote_low_f32x4(f32x4(a as f32, b as f32, 0., 0.)), + f64x2(a, b), + ); + } + } + + #[test] + fn test_extmul() { + macro_rules! test { + ($( + $ctor:ident { + from: $from:ident, + to: $to:ident, + low: $low:ident, + high: $high:ident, + } => { + $(([$($a:tt)*] * [$($b:tt)*]))* + } + )*) => ($( + $(unsafe { + let a: [$from; 16 / mem::size_of::<$from>()] = [$($a)*]; + let b: [$from; 16 / mem::size_of::<$from>()] = [$($b)*]; + let low = mem::transmute::<_, [$to; 16 / mem::size_of::<$to>()]>($low($ctor($($a)*), $ctor($($b)*))); + let high = mem::transmute::<_, [$to; 16 / mem::size_of::<$to>()]>($high($ctor($($a)*), $ctor($($b)*))); + + let half = a.len() / 2; + for i in 0..half { + assert_eq!( + (a[i] as $to).wrapping_mul((b[i] as $to)), + low[i], + "expected {} * {}", a[i] as $to, b[i] as $to, + ); + assert_eq!( + (a[half + i] as $to).wrapping_mul((b[half + i] as $to)), + high[i], + "expected {} * {}", a[half + i] as $to, b[half + i] as $to, + ); + } + })* + )*) + } + test! { + i8x16 { + from: i8, + to: i16, + low: i16x8_extmul_low_i8x16, + high: i16x8_extmul_high_i8x16, + } => { + ( + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + * + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ) + ( + [-1, -2, 3, 100, 124, -38, 33, 87, 92, 108, 22, 8, -43, -128, 22, 0] + * + [-5, -2, 6, 10, 45, -4, 4, -2, 0, 88, 92, -102, -98, 83, 73, 54] + ) + } + u8x16 { + from: u8, + to: u16, + low: u16x8_extmul_low_u8x16, + high: u16x8_extmul_high_u8x16, + } => { + ( + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + * + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ) + ( + [1, 2, 3, 100, 124, 38, 33, 87, 92, 198, 22, 8, 43, 128, 22, 0] + * + [5, 200, 6, 10, 45, 248, 4, 2, 0, 2, 92, 102, 234, 83, 73, 54] + ) + } + i16x8 { + from: i16, + to: i32, + low: i32x4_extmul_low_i16x8, + high: i32x4_extmul_high_i16x8, + } => { + ( + [0, 0, 0, 0, 0, 0, 0, 0] + * + [0, 0, 0, 0, 0, 0, 0, 0] + ) + ( + [-1, 0, i16::MAX, 19931, -2259, 64, 200, 87] + * + [1, 1, i16::MIN, 29391, 105, 2, 100, -2] + ) + } + u16x8 { + from: u16, + to: u32, + low: u32x4_extmul_low_u16x8, + high: u32x4_extmul_high_u16x8, + } => { + ( + [0, 0, 0, 0, 0, 0, 0, 0] + * + [0, 0, 0, 0, 0, 0, 0, 0] + ) + ( + [1, 0, u16::MAX, 19931, 2259, 64, 200, 87] + * + [1, 1, 3, 29391, 105, 2, 100, 2] + ) + } + i32x4 { + from: i32, + to: i64, + low: i64x2_extmul_low_i32x4, + high: i64x2_extmul_high_i32x4, + } => { + ( + [0, 0, 0, 0] + * + [0, 0, 0, 0] + ) + ( + [-1, 0, i32::MAX, 19931] + * + [1, 1, i32::MIN, 29391] + ) + ( + [i32::MAX, 3003183, 3 << 20, 0xffffff] + * + [i32::MAX, i32::MIN, -40042, 300] + ) + } + u32x4 { + from: u32, + to: u64, + low: u64x2_extmul_low_u32x4, + high: u64x2_extmul_high_u32x4, + } => { + ( + [0, 0, 0, 0] + * + [0, 0, 0, 0] + ) + ( + [1, 0, u32::MAX, 19931] + * + [1, 1, 3, 29391] + ) + ( + [u32::MAX, 3003183, 3 << 20, 0xffffff] + * + [u32::MAX, 3000, 40042, 300] + ) + } + } + } + + #[test] + fn test_q15mulr_sat_s() { + fn test(a: [i16; 8], b: [i16; 8]) { + let a_v = i16x8(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]); + let b_v = i16x8(b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]); + let result = i16x8_q15mulr_sat(a_v, b_v); + let result = unsafe { mem::transmute::(result) }; + + for (i, (a, b)) in a.iter().zip(&b).enumerate() { + assert_eq!( + result[i], + (((*a as i32) * (*b as i32) + 0x4000) >> 15) as 
i16 + ); + } + } + + test([0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]); + test([1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]); + test( + [-1, 100, 2003, -29494, 12, 128, 994, 1], + [-4049, 8494, -10483, 0, 5, 2222, 883, -9], + ); + } + + #[test] + fn test_extadd() { + macro_rules! test { + ($( + $func:ident { + from: $from:ident, + to: $to:ident, + } => { + $([$($a:tt)*])* + } + )*) => ($( + $(unsafe { + let a: [$from; 16 / mem::size_of::<$from>()] = [$($a)*]; + let a_v = mem::transmute::<_, v128>(a); + let r = mem::transmute::()]>($func(a_v)); + + let half = a.len() / 2; + for i in 0..half { + assert_eq!( + (a[2 * i] as $to).wrapping_add((a[2 * i + 1] as $to)), + r[i], + "failed {} + {} != {}", + a[2 * i] as $to, + a[2 * i + 1] as $to, + r[i], + ); + } + })* + )*) + } + test! { + i16x8_extadd_pairwise_i8x16 { + from: i8, + to: i16, + } => { + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + [-1, -2, 3, 100, 124, -38, 33, 87, 92, 108, 22, 8, -43, -128, 22, 0] + [-5, -2, 6, 10, 45, -4, 4, -2, 0, 88, 92, -102, -98, 83, 73, 54] + } + i16x8_extadd_pairwise_u8x16 { + from: u8, + to: i16, + } => { + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + [1, 2, 3, 100, 124, 38, 33, 87, 92, 198, 22, 8, 43, 128, 22, 0] + [5, 200, 6, 10, 45, 248, 4, 2, 0, 2, 92, 102, 234, 83, 73, 54] + } + i32x4_extadd_pairwise_i16x8 { + from: i16, + to: i32, + } => { + [0, 0, 0, 0, 0, 0, 0, 0] + [-1, 0, i16::MAX, 19931, -2259, 64, 200, 87] + [1, 1, i16::MIN, 29391, 105, 2, 100, -2] + } + i32x4_extadd_pairwise_u16x8 { + from: u16, + to: i32, + } => { + [0, 0, 0, 0, 0, 0, 0, 0] + [1, 0, u16::MAX, 19931, 2259, 64, 200, 87] + [1, 1, 3, 29391, 105, 2, 100, 2] + } + } + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/abm.rs b/library/stdarch/crates/core_arch/src/x86/abm.rs new file mode 100644 index 0000000000000..e6d5517600439 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/abm.rs @@ -0,0 +1,62 @@ +//! Advanced Bit Manipulation (ABM) instructions +//! +//! The POPCNT and LZCNT have their own CPUID bits to indicate support. +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +//! System Instructions][amd64_ref]. +//! +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions +//! available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wikipedia_bmi]: +//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Counts the leading most significant zero bits. +/// +/// When the operand is zero, it returns its size in bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_lzcnt_u32) +#[inline] +#[target_feature(enable = "lzcnt")] +#[cfg_attr(test, assert_instr(lzcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _lzcnt_u32(x: u32) -> u32 { + x.leading_zeros() +} + +/// Counts the bits that are set. 
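// A quick sanity check of the values used in the ABM tests further down:
// 0b0101_1010 has its highest set bit at position 6, so a 32-bit leading-zero
// count is 32 - 7 = 25, and four bits are set in total. The intrinsics above
// lower to LZCNT/POPCNT but are modelled here with the portable standard
// library methods they wrap; the function name is illustrative.
fn abm_reference_values() {
    let x: u32 = 0b0101_1010;
    assert_eq!(x.leading_zeros(), 25);
    assert_eq!(x.count_ones(), 4);
}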
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_popcnt32) +#[inline] +#[target_feature(enable = "popcnt")] +#[cfg_attr(test, assert_instr(popcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _popcnt32(x: i32) -> i32 { + x.count_ones() as i32 +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "lzcnt")] + unsafe fn test_lzcnt_u32() { + assert_eq!(_lzcnt_u32(0b0101_1010), 25); + } + + #[simd_test(enable = "popcnt")] + unsafe fn test_popcnt32() { + assert_eq!(_popcnt32(0b0101_1010), 4); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/adx.rs b/library/stdarch/crates/core_arch/src/x86/adx.rs new file mode 100644 index 0000000000000..5ba766461653b --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/adx.rs @@ -0,0 +1,164 @@ +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.x86.addcarry.32"] + fn llvm_addcarry_u32(a: u8, b: u32, c: u32) -> (u8, u32); + #[link_name = "llvm.x86.addcarryx.u32"] + fn llvm_addcarryx_u32(a: u8, b: u32, c: u32, d: *mut u32) -> u8; + #[link_name = "llvm.x86.subborrow.32"] + fn llvm_subborrow_u32(a: u8, b: u32, c: u32) -> (u8, u32); +} + +/// Adds unsigned 32-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in` +/// (carry or overflow flag), and store the unsigned 32-bit result in `out`, and the carry-out +/// is returned (carry or overflow flag). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_addcarry_u32) +#[inline] +#[cfg_attr(test, assert_instr(adc))] +#[stable(feature = "simd_x86_adx", since = "1.33.0")] +pub unsafe fn _addcarry_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 { + let (a, b) = llvm_addcarry_u32(c_in, a, b); + *out = b; + a +} + +/// Adds unsigned 32-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in` +/// (carry or overflow flag), and store the unsigned 32-bit result in `out`, and +/// the carry-out is returned (carry or overflow flag). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_addcarryx_u32) +#[inline] +#[target_feature(enable = "adx")] +#[cfg_attr(test, assert_instr(adc))] +#[stable(feature = "simd_x86_adx", since = "1.33.0")] +pub unsafe fn _addcarryx_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 { + llvm_addcarryx_u32(c_in, a, b, out as *mut _) +} + +/// Adds unsigned 32-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in` +/// (carry or overflow flag), and store the unsigned 32-bit result in `out`, and +/// the carry-out is returned (carry or overflow flag). 
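// Sketch of the limb-by-limb carry chain that _addcarry_u32 above is intended
// for, written with portable overflowing_add so the data flow is visible; the
// intrinsic performs the same per-limb step but keeps the carry in the CPU
// flags. The function name and the fixed four-limb width are illustrative
// assumptions, not part of the API.
fn add_u128_as_u32_limbs(a: [u32; 4], b: [u32; 4]) -> ([u32; 4], u8) {
    let mut out = [0u32; 4];
    let mut carry = 0u8;
    for i in 0..4 {
        let (s1, c1) = a[i].overflowing_add(b[i]);
        let (s2, c2) = s1.overflowing_add(carry as u32);
        out[i] = s2;
        // At most one of the two additions can overflow, so OR-ing suffices.
        carry = (c1 | c2) as u8;
    }
    (out, carry)
}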
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_subborrow_u32) +#[inline] +#[cfg_attr(test, assert_instr(sbb))] +#[stable(feature = "simd_x86_adx", since = "1.33.0")] +pub unsafe fn _subborrow_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 { + let (a, b) = llvm_subborrow_u32(c_in, a, b); + *out = b; + a +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[test] + fn test_addcarry_u32() { + unsafe { + let a = u32::MAX; + let mut out = 0; + + let r = _addcarry_u32(0, a, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 0); + + let r = _addcarry_u32(0, a, 0, &mut out); + assert_eq!(r, 0); + assert_eq!(out, a); + + let r = _addcarry_u32(1, a, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 1); + + let r = _addcarry_u32(1, a, 0, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 0); + + let r = _addcarry_u32(0, 3, 4, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 7); + + let r = _addcarry_u32(1, 3, 4, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 8); + } + } + + #[simd_test(enable = "adx")] + unsafe fn test_addcarryx_u32() { + let a = u32::MAX; + let mut out = 0; + + let r = _addcarryx_u32(0, a, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 0); + + let r = _addcarryx_u32(0, a, 0, &mut out); + assert_eq!(r, 0); + assert_eq!(out, a); + + let r = _addcarryx_u32(1, a, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 1); + + let r = _addcarryx_u32(1, a, 0, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 0); + + let r = _addcarryx_u32(0, 3, 4, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 7); + + let r = _addcarryx_u32(1, 3, 4, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 8); + } + + #[simd_test(enable = "adx")] + unsafe fn test_addcarryx_u32_2() { + unsafe fn add_1_2_3() -> u32 { + let mut out = 0; + _addcarryx_u32(1, 2, 3, &mut out); + out + } + assert_eq!(6, add_1_2_3()); + } + + #[test] + fn test_subborrow_u32() { + unsafe { + let a = u32::MAX; + let mut out = 0; + + let r = _subborrow_u32(0, 0, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, a); + + let r = _subborrow_u32(0, 0, 0, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 0); + + let r = _subborrow_u32(1, 0, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, a - 1); + + let r = _subborrow_u32(1, 0, 0, &mut out); + assert_eq!(r, 1); + assert_eq!(out, a); + + let r = _subborrow_u32(0, 7, 3, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 4); + + let r = _subborrow_u32(1, 7, 3, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 3); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/aes.rs b/library/stdarch/crates/core_arch/src/x86/aes.rs new file mode 100644 index 0000000000000..7db743b2ccd31 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/aes.rs @@ -0,0 +1,171 @@ +//! AES New Instructions (AES-NI) +//! +//! The intrinsics here correspond to those in the `wmmintrin.h` C header. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! 
[intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf + +use crate::core_arch::x86::__m128i; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.aesni.aesdec"] + fn aesdec(a: __m128i, round_key: __m128i) -> __m128i; + #[link_name = "llvm.x86.aesni.aesdeclast"] + fn aesdeclast(a: __m128i, round_key: __m128i) -> __m128i; + #[link_name = "llvm.x86.aesni.aesenc"] + fn aesenc(a: __m128i, round_key: __m128i) -> __m128i; + #[link_name = "llvm.x86.aesni.aesenclast"] + fn aesenclast(a: __m128i, round_key: __m128i) -> __m128i; + #[link_name = "llvm.x86.aesni.aesimc"] + fn aesimc(a: __m128i) -> __m128i; + #[link_name = "llvm.x86.aesni.aeskeygenassist"] + fn aeskeygenassist(a: __m128i, imm8: u8) -> __m128i; +} + +/// Performs one round of an AES decryption flow on data (state) in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128) +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(test, assert_instr(aesdec))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_aesdec_si128(a: __m128i, round_key: __m128i) -> __m128i { + unsafe { aesdec(a, round_key) } +} + +/// Performs the last round of an AES decryption flow on data (state) in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128) +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(test, assert_instr(aesdeclast))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_aesdeclast_si128(a: __m128i, round_key: __m128i) -> __m128i { + unsafe { aesdeclast(a, round_key) } +} + +/// Performs one round of an AES encryption flow on data (state) in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc_si128) +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(test, assert_instr(aesenc))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_aesenc_si128(a: __m128i, round_key: __m128i) -> __m128i { + unsafe { aesenc(a, round_key) } +} + +/// Performs the last round of an AES encryption flow on data (state) in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128) +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(test, assert_instr(aesenclast))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_aesenclast_si128(a: __m128i, round_key: __m128i) -> __m128i { + unsafe { aesenclast(a, round_key) } +} + +/// Performs the `InvMixColumns` transformation on `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128) +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(test, assert_instr(aesimc))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_aesimc_si128(a: __m128i) -> __m128i { + unsafe { aesimc(a) } +} + +/// Assist in expanding the AES cipher key. +/// +/// Assist in expanding the AES cipher key by computing steps towards +/// generating a round key for encryption cipher using data from `a` and an +/// 8-bit round constant `IMM8`. 
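// Sketch of how the per-round intrinsics above compose into a full AES-128
// block encryption: one initial key XOR, nine _mm_aesenc_si128 rounds, then a
// final _mm_aesenclast_si128. The eleven expanded round keys are assumed to be
// supplied by the caller (key expansion via _mm_aeskeygenassist_si128 is not
// shown), and the function name is illustrative only.
use core::arch::x86_64::{__m128i, _mm_aesenc_si128, _mm_aesenclast_si128, _mm_xor_si128};

#[target_feature(enable = "aes,sse2")]
unsafe fn aes128_encrypt_block(block: __m128i, round_keys: &[__m128i; 11]) -> __m128i {
    let mut state = _mm_xor_si128(block, round_keys[0]);
    for rk in &round_keys[1..10] {
        state = _mm_aesenc_si128(state, *rk);
    }
    _mm_aesenclast_si128(state, round_keys[10])
}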
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128) +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(test, assert_instr(aeskeygenassist, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_aeskeygenassist_si128(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { aeskeygenassist(a, IMM8 as u8) } +} + +#[cfg(test)] +mod tests { + // The constants in the tests below are just bit patterns. They should not + // be interpreted as integers; signedness does not make sense for them, but + // __m128i happens to be defined in terms of signed integers. + #![allow(overflowing_literals)] + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "aes")] + unsafe fn test_mm_aesdec_si128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx. + let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); + let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee); + let e = _mm_set_epi64x(0x044e4f5176fec48f, 0xb57ecfa381da39ee); + let r = _mm_aesdec_si128(a, k); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "aes")] + unsafe fn test_mm_aesdeclast_si128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714178.aspx. + let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); + let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee); + let e = _mm_set_epi64x(0x36cad57d9072bf9e, 0xf210dd981fa4a493); + let r = _mm_aesdeclast_si128(a, k); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "aes")] + unsafe fn test_mm_aesenc_si128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc664810.aspx. + let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); + let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee); + let e = _mm_set_epi64x(0x16ab0e57dfc442ed, 0x28e4ee1884504333); + let r = _mm_aesenc_si128(a, k); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "aes")] + unsafe fn test_mm_aesenclast_si128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714136.aspx. + let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); + let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee); + let e = _mm_set_epi64x(0xb6dd7df25d7ab320, 0x4b04f98cf4c860f8); + let r = _mm_aesenclast_si128(a, k); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "aes")] + unsafe fn test_mm_aesimc_si128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714195.aspx. + let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); + let e = _mm_set_epi64x(0xc66c82284ee40aa0, 0x6633441122770055); + let r = _mm_aesimc_si128(a); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "aes")] + unsafe fn test_mm_aeskeygenassist_si128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714138.aspx. + let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); + let e = _mm_set_epi64x(0x857c266b7c266e85, 0xeac4eea9c4eeacea); + let r = _mm_aeskeygenassist_si128::<5>(a); + assert_eq_m128i(r, e); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs new file mode 100644 index 0000000000000..df1cb63be30f0 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avx.rs @@ -0,0 +1,5022 @@ +//! Advanced Vector Extensions (AVX) +//! +//! The references are: +//! +//! 
- [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. - [AMD64 Architecture +//! Programmer's Manual, Volume 3: General-Purpose and System +//! Instructions][amd64_ref]. +//! +//! [Wikipedia][wiki] provides a quick overview of the instructions available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions + +use crate::{ + core_arch::{simd::*, x86::*}, + intrinsics::simd::*, + mem, ptr, +}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Adds packed double-precision (64-bit) floating-point elements +/// in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vaddpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { simd_add(a, b) } +} + +/// Adds packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vaddps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 { + unsafe { simd_add(a, b) } +} + +/// Computes the bitwise AND of a packed double-precision (64-bit) +/// floating-point elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd) +#[inline] +#[target_feature(enable = "avx")] +// See https://github.com/rust-lang/stdarch/issues/71 +#[cfg_attr(test, assert_instr(vandp))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_and(a, b)) + } +} + +/// Computes the bitwise AND of packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 { + unsafe { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_and(a, b)) + } +} + +/// Computes the bitwise OR packed double-precision (64-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd) +#[inline] +#[target_feature(enable = "avx")] +// See . +#[cfg_attr(test, assert_instr(vorp))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_or(a, b)) + } +} + +/// Computes the bitwise OR packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. 
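// Usage sketch for the packed arithmetic intrinsics above (illustrative helper;
// assumes the caller has verified AVX support, e.g. via
// `is_x86_feature_detected!("avx")`): adds two slices eight lanes at a time
// using the unaligned load/store intrinsics defined later in this file.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn add_f32_slices(a: &[f32], b: &[f32], out: &mut [f32]) {
    use core::arch::x86_64::{_mm256_add_ps, _mm256_loadu_ps, _mm256_storeu_ps};
    assert!(a.len() == b.len() && a.len() == out.len());
    let mut i = 0;
    // A real routine would also handle the tail that does not fill a full vector.
    while i + 8 <= a.len() {
        unsafe {
            let va = _mm256_loadu_ps(a.as_ptr().add(i));
            let vb = _mm256_loadu_ps(b.as_ptr().add(i));
            _mm256_storeu_ps(out.as_mut_ptr().add(i), _mm256_add_ps(va, vb));
        }
        i += 8;
    }
}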
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 { + unsafe { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_or(a, b)) + } +} + +/// Shuffles double-precision (64-bit) floating-point elements within 128-bit +/// lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_shuffle_pd(a: __m256d, b: __m256d) -> __m256d { + static_assert_uimm_bits!(MASK, 8); + unsafe { + simd_shuffle!( + a, + b, + [ + MASK as u32 & 0b1, + ((MASK as u32 >> 1) & 0b1) + 4, + ((MASK as u32 >> 2) & 0b1) + 2, + ((MASK as u32 >> 3) & 0b1) + 6, + ], + ) + } +} + +/// Shuffles single-precision (32-bit) floating-point elements in `a` within +/// 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vshufps, MASK = 3))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_shuffle_ps(a: __m256, b: __m256) -> __m256 { + static_assert_uimm_bits!(MASK, 8); + unsafe { + simd_shuffle!( + a, + b, + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + ((MASK as u32 >> 4) & 0b11) + 8, + ((MASK as u32 >> 6) & 0b11) + 8, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 12, + ((MASK as u32 >> 6) & 0b11) + 12, + ], + ) + } +} + +/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point +/// elements in `a`, and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vandnp))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b)) + } +} + +/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point +/// elements in `a` +/// and then AND with `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { + unsafe { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b)) + } +} + +/// Compares packed double-precision (64-bit) floating-point elements +/// in `a` and `b`, and returns packed maximum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmaxpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { vmaxpd(a, b) } +} + +/// Compares packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and returns packed maximum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmaxps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 { + unsafe { vmaxps(a, b) } +} + +/// Compares packed double-precision (64-bit) floating-point elements +/// in `a` and `b`, and returns packed minimum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vminpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { vminpd(a, b) } +} + +/// Compares packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and returns packed minimum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vminps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 { + unsafe { vminps(a, b) } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements +/// in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmulpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { simd_mul(a, b) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmulps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { + unsafe { simd_mul(a, b) } +} + +/// Alternatively adds and subtracts packed double-precision (64-bit) +/// floating-point elements in `a` to/from packed elements in `b`. 
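// Usage sketch for the bitwise and min/max intrinsics above (illustrative
// helper, AVX assumed): `andnot` with a sign-bit mask clears the sign bit,
// the usual absolute-value trick, and min/max then clamp to a range.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn abs_then_clamp_01(v: core::arch::x86_64::__m256) -> core::arch::x86_64::__m256 {
    use core::arch::x86_64::{_mm256_andnot_ps, _mm256_max_ps, _mm256_min_ps, _mm256_set1_ps};
    // `-0.0` has only the sign bit set; NOT-ing it keeps every other bit.
    let abs = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v);
    // Clamp the result to [0.0, 1.0].
    _mm256_min_ps(_mm256_max_ps(abs, _mm256_set1_ps(0.0)), _mm256_set1_ps(1.0))
}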
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vaddsubpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [4, 1, 6, 3]) + } +} + +/// Alternatively adds and subtracts packed single-precision (32-bit) +/// floating-point elements in `a` to/from packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vaddsubps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { + unsafe { + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) + } +} + +/// Subtracts packed double-precision (64-bit) floating-point elements in `b` +/// from packed elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vsubpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { simd_sub(a, b) } +} + +/// Subtracts packed single-precision (32-bit) floating-point elements in `b` +/// from packed elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vsubps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { + unsafe { simd_sub(a, b) } +} + +/// Computes the division of each of the 8 packed 32-bit floating-point elements +/// in `a` by the corresponding packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vdivps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { + unsafe { simd_div(a, b) } +} + +/// Computes the division of each of the 4 packed 64-bit floating-point elements +/// in `a` by the corresponding packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vdivpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { simd_div(a, b) } +} + +/// Rounds packed double-precision (64-bit) floating point elements in `a` +/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows: +/// +/// - `0x00`: Round to the nearest whole number. +/// - `0x01`: Round down, toward negative infinity. +/// - `0x02`: Round up, toward positive infinity. +/// - `0x03`: Truncate the values. +/// +/// For a complete list of options, check [the LLVM docs][llvm_docs]. 
+/// +/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_round_pd(a: __m256d) -> __m256d { + static_assert_uimm_bits!(ROUNDING, 4); + unsafe { roundpd256(a, ROUNDING) } +} + +/// Rounds packed double-precision (64-bit) floating point elements in `a` +/// toward positive infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vroundpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_ceil_pd(a: __m256d) -> __m256d { + unsafe { simd_ceil(a) } +} + +/// Rounds packed double-precision (64-bit) floating point elements in `a` +/// toward negative infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vroundpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_floor_pd(a: __m256d) -> __m256d { + unsafe { simd_floor(a) } +} + +/// Rounds packed single-precision (32-bit) floating point elements in `a` +/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows: +/// +/// - `0x00`: Round to the nearest whole number. +/// - `0x01`: Round down, toward negative infinity. +/// - `0x02`: Round up, toward positive infinity. +/// - `0x03`: Truncate the values. +/// +/// For a complete list of options, check [the LLVM docs][llvm_docs]. +/// +/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_round_ps(a: __m256) -> __m256 { + static_assert_uimm_bits!(ROUNDING, 4); + unsafe { roundps256(a, ROUNDING) } +} + +/// Rounds packed single-precision (32-bit) floating point elements in `a` +/// toward positive infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vroundps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_ceil_ps(a: __m256) -> __m256 { + unsafe { simd_ceil(a) } +} + +/// Rounds packed single-precision (32-bit) floating point elements in `a` +/// toward negative infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vroundps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_floor_ps(a: __m256) -> __m256 { + unsafe { simd_floor(a) } +} + +/// Returns the square root of packed single-precision (32-bit) floating point +/// elements in `a`. 
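// Usage sketch for the `ROUNDING` immediate documented above (illustrative
// helper, AVX assumed): the same vector rounded to nearest (0x00) and
// truncated toward zero (0x03).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn nearest_and_truncated(
    v: core::arch::x86_64::__m256,
) -> (core::arch::x86_64::__m256, core::arch::x86_64::__m256) {
    use core::arch::x86_64::_mm256_round_ps;
    (_mm256_round_ps::<0x00>(v), _mm256_round_ps::<0x03>(v))
}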
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vsqrtps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_sqrt_ps(a: __m256) -> __m256 { + unsafe { simd_fsqrt(a) } +} + +/// Returns the square root of packed double-precision (64-bit) floating point +/// elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vsqrtpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d { + unsafe { simd_fsqrt(a) } +} + +/// Blends packed double-precision (64-bit) floating-point elements from +/// `a` and `b` using control mask `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd) +#[inline] +#[target_feature(enable = "avx")] +// Note: LLVM7 prefers single-precision blend instructions when +// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194 +// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))] +#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_blend_pd(a: __m256d, b: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM4, 4); + unsafe { + simd_shuffle!( + a, + b, + [ + ((IMM4 as u32 >> 0) & 1) * 4 + 0, + ((IMM4 as u32 >> 1) & 1) * 4 + 1, + ((IMM4 as u32 >> 2) & 1) * 4 + 2, + ((IMM4 as u32 >> 3) & 1) * 4 + 3, + ], + ) + } +} + +/// Blends packed single-precision (32-bit) floating-point elements from +/// `a` and `b` using control mask `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_blend_ps(a: __m256, b: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + simd_shuffle!( + a, + b, + [ + ((IMM8 as u32 >> 0) & 1) * 8 + 0, + ((IMM8 as u32 >> 1) & 1) * 8 + 1, + ((IMM8 as u32 >> 2) & 1) * 8 + 2, + ((IMM8 as u32 >> 3) & 1) * 8 + 3, + ((IMM8 as u32 >> 4) & 1) * 8 + 4, + ((IMM8 as u32 >> 5) & 1) * 8 + 5, + ((IMM8 as u32 >> 6) & 1) * 8 + 6, + ((IMM8 as u32 >> 7) & 1) * 8 + 7, + ], + ) + } +} + +/// Blends packed double-precision (64-bit) floating-point elements from +/// `a` and `b` using `c` as a mask. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vblendvpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { + let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO); + transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4())) + } +} + +/// Blends packed single-precision (32-bit) floating-point elements from +/// `a` and `b` using `c` as a mask. 
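// Usage sketch for the compile-time blend mask above (illustrative helper,
// AVX assumed): bit `i` of the immediate picks element `i` from `b` when set
// and from `a` when clear.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn take_odd_lanes_from_b(
    a: core::arch::x86_64::__m256,
    b: core::arch::x86_64::__m256,
) -> core::arch::x86_64::__m256 {
    use core::arch::x86_64::_mm256_blend_ps;
    // 0b1010_1010: odd-indexed elements come from `b`, even-indexed from `a`.
    _mm256_blend_ps::<0b1010_1010>(a, b)
}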
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vblendvps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { + let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO); + transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8())) + } +} + +/// Conditionally multiplies the packed single-precision (32-bit) floating-point +/// elements in `a` and `b` using the high 4 bits in `imm8`, +/// sum the four products, and conditionally return the sum +/// using the low 4 bits of `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_dp_ps(a: __m256, b: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { vdpps(a, b, IMM8 as i8) } +} + +/// Horizontal addition of adjacent pairs in the two packed vectors +/// of 4 64-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in even locations, +/// while sums of elements from `b` are returned in odd locations. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vhaddpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { vhaddpd(a, b) } +} + +/// Horizontal addition of adjacent pairs in the two packed vectors +/// of 8 32-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in locations of +/// indices 0, 1, 4, 5; while sums of elements from `b` are locations +/// 2, 3, 6, 7. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vhaddps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { + unsafe { vhaddps(a, b) } +} + +/// Horizontal subtraction of adjacent pairs in the two packed vectors +/// of 4 64-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in even locations, +/// while sums of elements from `b` are returned in odd locations. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vhsubpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { vhsubpd(a, b) } +} + +/// Horizontal subtraction of adjacent pairs in the two packed vectors +/// of 8 32-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in locations of +/// indices 0, 1, 4, 5; while sums of elements from `b` are locations +/// 2, 3, 6, 7. 
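// Usage sketch for the `_mm256_dp_ps` control byte above (illustrative helper,
// AVX assumed): with 0xF1 the high nibble includes all four products of each
// 128-bit lane in the sum, and the low nibble writes that sum only to element
// 0 of the lane, zeroing the others.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn per_lane_dot(
    a: core::arch::x86_64::__m256,
    b: core::arch::x86_64::__m256,
) -> core::arch::x86_64::__m256 {
    use core::arch::x86_64::_mm256_dp_ps;
    _mm256_dp_ps::<0xF1>(a, b)
}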
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vhsubps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 { + unsafe { vhsubps(a, b) } +} + +/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vxorp))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_xor(a, b)) + } +} + +/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 { + unsafe { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_xor(a, b)) + } +} + +/// Equal (ordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_EQ_OQ: i32 = 0x00; +/// Less-than (ordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_LT_OS: i32 = 0x01; +/// Less-than-or-equal (ordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_LE_OS: i32 = 0x02; +/// Unordered (non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_UNORD_Q: i32 = 0x03; +/// Not-equal (unordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_NEQ_UQ: i32 = 0x04; +/// Not-less-than (unordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_NLT_US: i32 = 0x05; +/// Not-less-than-or-equal (unordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_NLE_US: i32 = 0x06; +/// Ordered (non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_ORD_Q: i32 = 0x07; +/// Equal (unordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_EQ_UQ: i32 = 0x08; +/// Not-greater-than-or-equal (unordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_NGE_US: i32 = 0x09; +/// Not-greater-than (unordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_NGT_US: i32 = 0x0a; +/// False (ordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_FALSE_OQ: i32 = 0x0b; +/// Not-equal (ordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_NEQ_OQ: i32 = 0x0c; +/// Greater-than-or-equal (ordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_GE_OS: i32 = 0x0d; +/// Greater-than (ordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_GT_OS: i32 = 0x0e; +/// True (unordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_TRUE_UQ: i32 = 0x0f; +/// Equal (ordered, signaling) +#[stable(feature = "simd_x86", since = 
"1.27.0")] +pub const _CMP_EQ_OS: i32 = 0x10; +/// Less-than (ordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_LT_OQ: i32 = 0x11; +/// Less-than-or-equal (ordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_LE_OQ: i32 = 0x12; +/// Unordered (signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_UNORD_S: i32 = 0x13; +/// Not-equal (unordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_NEQ_US: i32 = 0x14; +/// Not-less-than (unordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_NLT_UQ: i32 = 0x15; +/// Not-less-than-or-equal (unordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_NLE_UQ: i32 = 0x16; +/// Ordered (signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_ORD_S: i32 = 0x17; +/// Equal (unordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_EQ_US: i32 = 0x18; +/// Not-greater-than-or-equal (unordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_NGE_UQ: i32 = 0x19; +/// Not-greater-than (unordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_NGT_UQ: i32 = 0x1a; +/// False (ordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_FALSE_OS: i32 = 0x1b; +/// Not-equal (ordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_NEQ_OS: i32 = 0x1c; +/// Greater-than-or-equal (ordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_GE_OQ: i32 = 0x1d; +/// Greater-than (ordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_GT_OQ: i32 = 0x1e; +/// True (unordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _CMP_TRUE_US: i32 = 0x1f; + +/// Compares packed double-precision (64-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmp_pd(a: __m128d, b: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM5, 5); + unsafe { vcmppd(a, b, const { IMM5 as i8 }) } +} + +/// Compares packed double-precision (64-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cmp_pd(a: __m256d, b: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM5, 5); + unsafe { vcmppd256(a, b, IMM5 as u8) } +} + +/// Compares packed single-precision (32-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmp_ps(a: __m128, b: __m128) -> __m128 { + static_assert_uimm_bits!(IMM5, 5); + unsafe { vcmpps(a, b, const { IMM5 as i8 }) } +} + +/// Compares packed single-precision (32-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cmp_ps(a: __m256, b: __m256) -> __m256 { + static_assert_uimm_bits!(IMM5, 5); + unsafe { vcmpps256(a, b, const { IMM5 as u8 }) } +} + +/// Compares the lower double-precision (64-bit) floating-point element in +/// `a` and `b` based on the comparison operand specified by `IMM5`, +/// store the result in the lower element of returned vector, +/// and copies the upper element from `a` to the upper element of returned +/// vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmp_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM5, 5); + unsafe { vcmpsd(a, b, IMM5 as i8) } +} + +/// Compares the lower single-precision (32-bit) floating-point element in +/// `a` and `b` based on the comparison operand specified by `IMM5`, +/// store the result in the lower element of returned vector, +/// and copies the upper 3 packed elements from `a` to the upper elements of +/// returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmp_ss(a: __m128, b: __m128) -> __m128 { + static_assert_uimm_bits!(IMM5, 5); + unsafe { vcmpss(a, b, IMM5 as i8) } +} + +/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit) +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vcvtdq2pd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d { + unsafe { simd_cast(a.as_i32x4()) } +} + +/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) +/// floating-point elements. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vcvtdq2ps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 { + unsafe { simd_cast(a.as_i32x8()) } +} + +/// Converts packed double-precision (64-bit) floating-point elements in `a` +/// to packed single-precision (32-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { + unsafe { simd_cast(a) } +} + +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i { + unsafe { transmute(vcvtps2dq(a)) } +} + +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vcvtps2pd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtps_pd(a: __m128) -> __m256d { + unsafe { simd_cast(a) } +} + +/// Returns the first element of the input vector of `[4 x double]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsd_f64) +#[inline] +#[target_feature(enable = "avx")] +//#[cfg_attr(test, assert_instr(movsd))] FIXME +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 { + unsafe { simd_extract!(a, 0) } +} + +/// Converts packed double-precision (64-bit) floating-point elements in `a` +/// to packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2dq(a)) } +} + +/// Converts packed double-precision (64-bit) floating-point elements in `a` +/// to packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i { + unsafe { transmute(vcvtpd2dq(a)) } +} + +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers with truncation. 
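// Usage sketch for the conversion intrinsics above (illustrative helper, AVX
// assumed): widen packed i32 to f32, halve, and convert back with truncation.
// Exact only while the values fit in f32's 24-bit mantissa.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn halve_i32_lanes(v: core::arch::x86_64::__m256i) -> core::arch::x86_64::__m256i {
    use core::arch::x86_64::{_mm256_cvtepi32_ps, _mm256_cvttps_epi32, _mm256_mul_ps, _mm256_set1_ps};
    let halved = _mm256_mul_ps(_mm256_cvtepi32_ps(v), _mm256_set1_ps(0.5));
    // `cvtt` truncates toward zero, matching Rust's integer division by 2.
    _mm256_cvttps_epi32(halved)
}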
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i { + unsafe { transmute(vcvttps2dq(a)) } +} + +/// Extracts 128 bits (composed of 4 packed single-precision (32-bit) +/// floating-point elements) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_extractf128_ps(a: __m256) -> __m128 { + static_assert_uimm_bits!(IMM1, 1); + unsafe { + simd_shuffle!( + a, + _mm256_undefined_ps(), + [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize], + ) + } +} + +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) +/// floating-point elements) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_extractf128_pd(a: __m256d) -> __m128d { + static_assert_uimm_bits!(IMM1, 1); + unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize]) } +} + +/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_extractf128_si256(a: __m256i) -> __m128i { + static_assert_uimm_bits!(IMM1, 1); + unsafe { + let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],); + transmute(dst) + } +} + +/// Extracts a 32-bit integer from `a`, selected with `INDEX`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi32) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_extract_epi32(a: __m256i) -> i32 { + static_assert_uimm_bits!(INDEX, 3); + unsafe { simd_extract!(a.as_i32x8(), INDEX as u32) } +} + +/// Returns the first element of the input vector of `[8 x i32]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsi256_si32) +#[inline] +#[target_feature(enable = "avx")] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { + unsafe { simd_extract!(a.as_i32x8(), 0) } +} + +/// Zeroes the contents of all XMM or YMM registers. 
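// Usage sketch for the 128-bit extract intrinsics above (illustrative helper,
// AVX assumed): the first step of a horizontal sum folds the upper half of a
// 256-bit vector onto the lower half.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn fold_halves(v: core::arch::x86_64::__m256) -> core::arch::x86_64::__m128 {
    use core::arch::x86_64::{_mm_add_ps, _mm256_castps256_ps128, _mm256_extractf128_ps};
    let hi = _mm256_extractf128_ps::<1>(v);
    let lo = _mm256_castps256_ps128(v);
    _mm_add_ps(lo, hi)
}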
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vzeroall))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_zeroall() { + unsafe { vzeroall() } +} + +/// Zeroes the upper 128 bits of all YMM registers; +/// the lower 128-bits of the registers are unmodified. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vzeroupper))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_zeroupper() { + unsafe { vzeroupper() } +} + +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vpermilps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 { + unsafe { vpermilps256(a, b.as_i32x8()) } +} + +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vpermilps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { + unsafe { vpermilps(a, b.as_i32x4()) } +} + +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_permute_ps(a: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + simd_shuffle!( + a, + _mm256_undefined_ps(), + [ + (IMM8 as u32 >> 0) & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ((IMM8 as u32 >> 0) & 0b11) + 4, + ((IMM8 as u32 >> 2) & 0b11) + 4, + ((IMM8 as u32 >> 4) & 0b11) + 4, + ((IMM8 as u32 >> 6) & 0b11) + 4, + ], + ) + } +} + +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_permute_ps(a: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + simd_shuffle!( + a, + _mm_undefined_ps(), + [ + (IMM8 as u32 >> 0) & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], + ) + } +} + +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// within 256-bit lanes using the control in `b`. 
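// Usage sketch for the in-lane permute control above (illustrative helper,
// AVX assumed): the immediate 0x1B (0b00_01_10_11) reverses the four elements
// inside each 128-bit lane.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn reverse_within_lanes(v: core::arch::x86_64::__m256) -> core::arch::x86_64::__m256 {
    use core::arch::x86_64::_mm256_permute_ps;
    _mm256_permute_ps::<0x1B>(v)
}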
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vpermilpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d { + unsafe { vpermilpd256(a, b.as_i64x4()) } +} + +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vpermilpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { + unsafe { vpermilpd(a, b.as_i64x2()) } +} + +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_permute_pd(a: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM4, 4); + unsafe { + simd_shuffle!( + a, + _mm256_undefined_pd(), + [ + ((IMM4 as u32 >> 0) & 1), + ((IMM4 as u32 >> 1) & 1), + ((IMM4 as u32 >> 2) & 1) + 2, + ((IMM4 as u32 >> 3) & 1) + 2, + ], + ) + } +} + +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_permute_pd(a: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM2, 2); + unsafe { + simd_shuffle!( + a, + _mm_undefined_pd(), + [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1], + ) + } +} + +/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit) +/// floating-point elements) selected by `imm8` from `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_permute2f128_ps(a: __m256, b: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { vperm2f128ps256(a, b, IMM8 as i8) } +} + +/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit) +/// floating-point elements) selected by `imm8` from `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_permute2f128_pd(a: __m256d, b: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM8, 8); + unsafe { vperm2f128pd256(a, b, IMM8 as i8) } +} + +/// Shuffles 128-bits (composed of integer data) selected by `imm8` +/// from `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) } +} + +/// Broadcasts a single-precision (32-bit) floating-point element from memory +/// to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::trivially_copy_pass_by_ref)] +pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 { + _mm256_set1_ps(*f) +} + +/// Broadcasts a single-precision (32-bit) floating-point element from memory +/// to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::trivially_copy_pass_by_ref)] +pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 { + _mm_set1_ps(*f) +} + +/// Broadcasts a double-precision (64-bit) floating-point element from memory +/// to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::trivially_copy_pass_by_ref)] +pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d { + _mm256_set1_pd(*f) +} + +/// Broadcasts 128 bits from memory (composed of 4 packed single-precision +/// (32-bit) floating-point elements) to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vbroadcastf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 { + simd_shuffle!(*a, _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3]) +} + +/// Broadcasts 128 bits from memory (composed of 2 packed double-precision +/// (64-bit) floating-point elements) to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vbroadcastf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { + simd_shuffle!(*a, _mm_setzero_pd(), [0, 1, 0, 1]) +} + +/// Copies `a` to result, then inserts 128 bits (composed of 4 packed +/// single-precision (32-bit) floating-point elements) from `b` into result +/// at the location specified by `imm8`. 
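// Usage sketch for the broadcast intrinsics above (illustrative helper, AVX
// assumed): splat a scalar from memory into all eight lanes and use it as a
// uniform scale factor.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn scale_by(v: core::arch::x86_64::__m256, factor: &f32) -> core::arch::x86_64::__m256 {
    use core::arch::x86_64::{_mm256_broadcast_ss, _mm256_mul_ps};
    _mm256_mul_ps(v, unsafe { _mm256_broadcast_ss(factor) })
}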
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_insertf128_ps(a: __m256, b: __m128) -> __m256 { + static_assert_uimm_bits!(IMM1, 1); + unsafe { + simd_shuffle!( + a, + _mm256_castps128_ps256(b), + [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize], + ) + } +} + +/// Copies `a` to result, then inserts 128 bits (composed of 2 packed +/// double-precision (64-bit) floating-point elements) from `b` into result +/// at the location specified by `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_insertf128_pd(a: __m256d, b: __m128d) -> __m256d { + static_assert_uimm_bits!(IMM1, 1); + unsafe { + simd_shuffle!( + a, + _mm256_castpd128_pd256(b), + [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], + ) + } +} + +/// Copies `a` to result, then inserts 128 bits from `b` into result +/// at the location specified by `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_insertf128_si256(a: __m256i, b: __m128i) -> __m256i { + static_assert_uimm_bits!(IMM1, 1); + unsafe { + let dst: i64x4 = simd_shuffle!( + a.as_i64x4(), + _mm256_castsi128_si256(b).as_i64x4(), + [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], + ); + transmute(dst) + } +} + +/// Copies `a` to result, and inserts the 8-bit integer `i` into result +/// at the location specified by `index`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i { + static_assert_uimm_bits!(INDEX, 5); + unsafe { transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i)) } +} + +/// Copies `a` to result, and inserts the 16-bit integer `i` into result +/// at the location specified by `index`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i { + static_assert_uimm_bits!(INDEX, 4); + unsafe { transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i)) } +} + +/// Copies `a` to result, and inserts the 32-bit integer `i` into result +/// at the location specified by `index`. 
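// Usage sketch for the 128-bit insert intrinsics above (illustrative helper,
// AVX assumed): build a 256-bit vector by widening the low half and inserting
// the high half at position 1.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn join_halves(
    lo: core::arch::x86_64::__m128,
    hi: core::arch::x86_64::__m128,
) -> core::arch::x86_64::__m256 {
    use core::arch::x86_64::{_mm256_castps128_ps256, _mm256_insertf128_ps};
    _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(lo), hi)
}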
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
+    static_assert_uimm_bits!(INDEX, 3);
+    unsafe { transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i)) }
+}
+
+/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
+/// floating-point elements) from memory into result.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
+    assert_instr(vmovap)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
+    *(mem_addr as *const __m256d)
+}
+
+/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
+/// floating-point elements) from `a` into memory.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
+    assert_instr(vmovap)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
+    *(mem_addr as *mut __m256d) = a;
+}
+
+/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
+/// floating-point elements) from memory into result.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
+    assert_instr(vmovaps)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
+    *(mem_addr as *const __m256)
+}
+
+/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
+/// floating-point elements) from `a` into memory.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
+    assert_instr(vmovaps)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
+    *(mem_addr as *mut __m256) = a;
+}
+
+/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
+/// floating-point elements) from memory into result.
+/// `mem_addr` does not need to be aligned on any particular boundary.
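+///
+/// A minimal usage sketch (illustrative only, not from Intel's documentation);
+/// it assumes an x86_64 target with AVX available:
+///
+/// ```ignore
+/// # use std::arch::x86_64::*;
+/// # #[target_feature(enable = "avx")]
+/// # unsafe fn demo() {
+/// let data = [1.0_f64, 2.0, 3.0, 4.0];
+/// // The pointer may be arbitrarily aligned.
+/// let v = _mm256_loadu_pd(data.as_ptr());
+/// # }
+/// ```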
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmovup))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d { + let mut dst = _mm256_undefined_pd(); + ptr::copy_nonoverlapping( + mem_addr as *const u8, + ptr::addr_of_mut!(dst) as *mut u8, + mem::size_of::<__m256d>(), + ); + dst +} + +/// Stores 256-bits (composed of 4 packed double-precision (64-bit) +/// floating-point elements) from `a` into memory. +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmovup))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) { + mem_addr.cast::<__m256d>().write_unaligned(a); +} + +/// Loads 256-bits (composed of 8 packed single-precision (32-bit) +/// floating-point elements) from memory into result. +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 { + let mut dst = _mm256_undefined_ps(); + ptr::copy_nonoverlapping( + mem_addr as *const u8, + ptr::addr_of_mut!(dst) as *mut u8, + mem::size_of::<__m256>(), + ); + dst +} + +/// Stores 256-bits (composed of 8 packed single-precision (32-bit) +/// floating-point elements) from `a` into memory. +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) { + mem_addr.cast::<__m256>().write_unaligned(a); +} + +/// Loads 256-bits of integer data from memory into result. +/// `mem_addr` must be aligned on a 32-byte boundary or a +/// general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] // FIXME vmovdqa expected +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i { + *mem_addr +} + +/// Stores 256-bits of integer data from `a` into memory. +/// `mem_addr` must be aligned on a 32-byte boundary or a +/// general-protection exception may be generated. 
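+///
+/// A minimal usage sketch (illustrative only, not from Intel's documentation);
+/// it assumes an x86_64 target with AVX available:
+///
+/// ```ignore
+/// # use std::arch::x86_64::*;
+/// # #[target_feature(enable = "avx")]
+/// # unsafe fn demo() {
+/// // A wrapper type guarantees the 32-byte alignment the store requires.
+/// #[repr(align(32))]
+/// struct Aligned([i32; 8]);
+/// let mut buf = Aligned([0; 8]);
+/// _mm256_store_si256(buf.0.as_mut_ptr() as *mut __m256i, _mm256_set1_epi32(7));
+/// # }
+/// ```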
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] // FIXME vmovdqa expected +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) { + *mem_addr = a; +} + +/// Loads 256-bits of integer data from memory into result. +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i { + let mut dst = _mm256_undefined_si256(); + ptr::copy_nonoverlapping( + mem_addr as *const u8, + ptr::addr_of_mut!(dst) as *mut u8, + mem::size_of::<__m256i>(), + ); + dst +} + +/// Stores 256-bits of integer data from `a` into memory. +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) { + mem_addr.write_unaligned(a); +} + +/// Loads packed double-precision (64-bit) floating-point elements from memory +/// into result using `mask` (elements are zeroed out when the high bit of the +/// corresponding element is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmaskmovpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d { + maskloadpd256(mem_addr as *const i8, mask.as_i64x4()) +} + +/// Stores packed double-precision (64-bit) floating-point elements from `a` +/// into memory using `mask`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmaskmovpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) { + maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a); +} + +/// Loads packed double-precision (64-bit) floating-point elements from memory +/// into result using `mask` (elements are zeroed out when the high bit of the +/// corresponding element is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmaskmovpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d { + maskloadpd(mem_addr as *const i8, mask.as_i64x2()) +} + +/// Stores packed double-precision (64-bit) floating-point elements from `a` +/// into memory using `mask`. 
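+///
+/// A minimal usage sketch (illustrative only, not from Intel's documentation);
+/// it assumes an x86_64 target with AVX available:
+///
+/// ```ignore
+/// # use std::arch::x86_64::*;
+/// # #[target_feature(enable = "avx")]
+/// # unsafe fn demo() {
+/// let a = _mm_setr_pd(1.0, 2.0);
+/// // Only lanes whose mask element has its most significant bit set are stored;
+/// // here that is the low lane only.
+/// let mask = _mm_set_epi64x(0, -1);
+/// let mut out = [0.0_f64; 2];
+/// _mm_maskstore_pd(out.as_mut_ptr(), mask, a);
+/// // out == [1.0, 0.0]
+/// # }
+/// ```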
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmaskmovpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) { + maskstorepd(mem_addr as *mut i8, mask.as_i64x2(), a); +} + +/// Loads packed single-precision (32-bit) floating-point elements from memory +/// into result using `mask` (elements are zeroed out when the high bit of the +/// corresponding element is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmaskmovps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 { + maskloadps256(mem_addr as *const i8, mask.as_i32x8()) +} + +/// Stores packed single-precision (32-bit) floating-point elements from `a` +/// into memory using `mask`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmaskmovps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) { + maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a); +} + +/// Loads packed single-precision (32-bit) floating-point elements from memory +/// into result using `mask` (elements are zeroed out when the high bit of the +/// corresponding element is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmaskmovps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 { + maskloadps(mem_addr as *const i8, mask.as_i32x4()) +} + +/// Stores packed single-precision (32-bit) floating-point elements from `a` +/// into memory using `mask`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmaskmovps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) { + maskstoreps(mem_addr as *mut i8, mask.as_i32x4(), a); +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements +/// from `a`, and returns the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmovshdup))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_movehdup_ps(a: __m256) -> __m256 { + unsafe { simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements +/// from `a`, and returns the results. 
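+///
+/// A minimal usage sketch (illustrative only, not from Intel's documentation);
+/// it assumes an x86_64 target with AVX available:
+///
+/// ```ignore
+/// # use std::arch::x86_64::*;
+/// # #[target_feature(enable = "avx")]
+/// # unsafe fn demo() {
+/// let a = _mm256_setr_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+/// let r = _mm256_moveldup_ps(a);
+/// // r == [0., 0., 2., 2., 4., 4., 6., 6.]
+/// # }
+/// ```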
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmovsldup))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_moveldup_ps(a: __m256) -> __m256 { + unsafe { simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) } +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements +/// from `a`, and returns the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmovddup))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_movedup_pd(a: __m256d) -> __m256d { + unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) } +} + +/// Loads 256-bits of integer data from unaligned memory into result. +/// This intrinsic may perform better than `_mm256_loadu_si256` when the +/// data crosses a cache line boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vlddqu))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { + transmute(vlddqu(mem_addr as *const i8)) +} + +/// Moves integer data from a 256-bit integer vector to a 32-byte +/// aligned memory location. To minimize caching, the data is flagged as +/// non-temporal (unlikely to be used again soon) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256) +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmovntdq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { + crate::arch::asm!( + vps!("vmovntdq", ",{a}"), + p = in(reg) mem_addr, + a = in(ymm_reg) a, + options(nostack, preserves_flags), + ); +} + +/// Moves double-precision values from a 256-bit vector of `[4 x double]` +/// to a 32-byte aligned memory location. To minimize caching, the data is +/// flagged as non-temporal (unlikely to be used again soon). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd) +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. 
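+///
+/// A minimal usage sketch (illustrative only, not from Intel's documentation);
+/// it assumes an x86_64 target with AVX available and a 32-byte aligned buffer:
+///
+/// ```ignore
+/// # use std::arch::x86_64::*;
+/// # #[target_feature(enable = "avx")]
+/// # unsafe fn demo() {
+/// #[repr(align(32))]
+/// struct Aligned([f64; 4]);
+/// let mut out = Aligned([0.0; 4]);
+/// _mm256_stream_pd(out.0.as_mut_ptr(), _mm256_set1_pd(1.0));
+/// // Pair the non-temporal store with a store fence before `out` is accessed again.
+/// _mm_sfence();
+/// # }
+/// ```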
+#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmovntpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) { + crate::arch::asm!( + vps!("vmovntpd", ",{a}"), + p = in(reg) mem_addr, + a = in(ymm_reg) a, + options(nostack, preserves_flags), + ); +} + +/// Moves single-precision floating point values from a 256-bit vector +/// of `[8 x float]` to a 32-byte aligned memory location. To minimize +/// caching, the data is flagged as non-temporal (unlikely to be used again +/// soon). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps) +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vmovntps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) { + crate::arch::asm!( + vps!("vmovntps", ",{a}"), + p = in(reg) mem_addr, + a = in(ymm_reg) a, + options(nostack, preserves_flags), + ); +} + +/// Computes the approximate reciprocal of packed single-precision (32-bit) +/// floating-point elements in `a`, and returns the results. The maximum +/// relative error for this approximation is less than 1.5*2^-12. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vrcpps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_rcp_ps(a: __m256) -> __m256 { + unsafe { vrcpps(a) } +} + +/// Computes the approximate reciprocal square root of packed single-precision +/// (32-bit) floating-point elements in `a`, and returns the results. +/// The maximum relative error for this approximation is less than 1.5*2^-12. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vrsqrtps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 { + unsafe { vrsqrtps(a) } +} + +/// Unpacks and interleave double-precision (64-bit) floating-point elements +/// from the high half of each 128-bit lane in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) } +} + +/// Unpacks and interleave single-precision (32-bit) floating-point elements +/// from the high half of each 128-bit lane in `a` and `b`. 
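+///
+/// A minimal usage sketch (illustrative only, not from Intel's documentation);
+/// it assumes an x86_64 target with AVX available:
+///
+/// ```ignore
+/// # use std::arch::x86_64::*;
+/// # #[target_feature(enable = "avx")]
+/// # unsafe fn demo() {
+/// let a = _mm256_setr_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+/// let b = _mm256_setr_ps(10., 11., 12., 13., 14., 15., 16., 17.);
+/// let r = _mm256_unpackhi_ps(a, b);
+/// // Interleaving happens within each 128-bit lane:
+/// // r == [2., 12., 3., 13., 6., 16., 7., 17.]
+/// # }
+/// ```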
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vunpckhps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 { + unsafe { simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) } +} + +/// Unpacks and interleave double-precision (64-bit) floating-point elements +/// from the low half of each 128-bit lane in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) } +} + +/// Unpacks and interleave single-precision (32-bit) floating-point elements +/// from the low half of each 128-bit lane in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vunpcklps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { + unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) } +} + +/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and +/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { + unsafe { ptestz256(a.as_i64x4(), b.as_i64x4()) } +} + +/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and +/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +/// the result is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { + unsafe { ptestc256(a.as_i64x4(), b.as_i64x4()) } +} + +/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and +/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and +/// `CF` values are zero, otherwise return 0. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { + unsafe { ptestnzc256(a.as_i64x4(), b.as_i64x4()) } +} + +/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vtestpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { + unsafe { vtestzpd256(a, b) } +} + +/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vtestpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { + unsafe { vtestcpd256(a, b) } +} + +/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vtestpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { + unsafe { vtestnzcpd256(a, b) } +} + +/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. 
Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vtestpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { + unsafe { vtestzpd(a, b) } +} + +/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vtestpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { + unsafe { vtestcpd(a, b) } +} + +/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vtestpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { + unsafe { vtestnzcpd(a, b) } +} + +/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vtestps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { + unsafe { vtestzps256(a, b) } +} + +/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vtestps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { + unsafe { vtestcps256(a, b) } +} + +/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vtestps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { + unsafe { vtestnzcps256(a, b) } +} + +/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vtestps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { + unsafe { vtestzps(a, b) } +} + +/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. 
Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return the `CF` value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
+    unsafe { vtestcps(a, b) }
+}
+
+/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
+/// are zero, otherwise return 0.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
+    unsafe { vtestnzcps(a, b) }
+}
+
+/// Sets each bit of the returned mask based on the most significant bit of the
+/// corresponding packed double-precision (64-bit) floating-point element in
+/// `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovmskpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_movemask_pd(a: __m256d) -> i32 {
+    // Propagate the highest bit to the rest, because simd_bitmask
+    // requires all-1 or all-0.
+    unsafe {
+        let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
+        simd_bitmask::<i64x4, u8>(mask).into()
+    }
+}
+
+/// Sets each bit of the returned mask based on the most significant bit of the
+/// corresponding packed single-precision (32-bit) floating-point element in
+/// `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovmskps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_movemask_ps(a: __m256) -> i32 {
+    // Propagate the highest bit to the rest, because simd_bitmask
+    // requires all-1 or all-0.
+    unsafe {
+        let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
+        simd_bitmask::<i32x8, u8>(mask).into()
+    }
+}
+
+/// Returns vector of type __m256d with all elements set to zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vxorp))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_setzero_pd() -> __m256d {
+    const { unsafe { mem::zeroed() } }
+}
+
+/// Returns vector of type __m256 with all elements set to zero.
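+///
+/// A minimal usage sketch (illustrative only, not from Intel's documentation);
+/// it assumes an x86_64 target with AVX available:
+///
+/// ```ignore
+/// # use std::arch::x86_64::*;
+/// # #[target_feature(enable = "avx")]
+/// # unsafe fn demo() {
+/// let z = _mm256_setzero_ps();
+/// // No element has its sign bit set, so the movemask is 0.
+/// assert_eq!(_mm256_movemask_ps(z), 0);
+/// # }
+/// ```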
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_setzero_ps() -> __m256 { + const { unsafe { mem::zeroed() } } +} + +/// Returns vector of type __m256i with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vxor))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_setzero_si256() -> __m256i { + const { unsafe { mem::zeroed() } } +} + +/// Sets packed double-precision (64-bit) floating-point elements in returned +/// vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[cfg_attr(test, assert_instr(vinsertf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { + _mm256_setr_pd(d, c, b, a) +} + +/// Sets packed single-precision (32-bit) floating-point elements in returned +/// vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 { + _mm256_setr_ps(h, g, f, e, d, c, b, a) +} + +/// Sets packed 8-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_set_epi8( + e00: i8, + e01: i8, + e02: i8, + e03: i8, + e04: i8, + e05: i8, + e06: i8, + e07: i8, + e08: i8, + e09: i8, + e10: i8, + e11: i8, + e12: i8, + e13: i8, + e14: i8, + e15: i8, + e16: i8, + e17: i8, + e18: i8, + e19: i8, + e20: i8, + e21: i8, + e22: i8, + e23: i8, + e24: i8, + e25: i8, + e26: i8, + e27: i8, + e28: i8, + e29: i8, + e30: i8, + e31: i8, +) -> __m256i { + #[rustfmt::skip] + _mm256_setr_epi8( + e31, e30, e29, e28, e27, e26, e25, e24, + e23, e22, e21, e20, e19, e18, e17, e16, + e15, e14, e13, e12, e11, e10, e09, e08, + e07, e06, e05, e04, e03, e02, e01, e00, + ) +} + +/// Sets packed 16-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. 
+#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_set_epi16( + e00: i16, + e01: i16, + e02: i16, + e03: i16, + e04: i16, + e05: i16, + e06: i16, + e07: i16, + e08: i16, + e09: i16, + e10: i16, + e11: i16, + e12: i16, + e13: i16, + e14: i16, + e15: i16, +) -> __m256i { + #[rustfmt::skip] + _mm256_setr_epi16( + e15, e14, e13, e12, + e11, e10, e09, e08, + e07, e06, e05, e04, + e03, e02, e01, e00, + ) +} + +/// Sets packed 32-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_set_epi32( + e0: i32, + e1: i32, + e2: i32, + e3: i32, + e4: i32, + e5: i32, + e6: i32, + e7: i32, +) -> __m256i { + _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) +} + +/// Sets packed 64-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { + _mm256_setr_epi64x(d, c, b, a) +} + +/// Sets packed double-precision (64-bit) floating-point elements in returned +/// vector with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { + __m256d([a, b, c, d]) +} + +/// Sets packed single-precision (32-bit) floating-point elements in returned +/// vector with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 { + __m256([a, b, c, d, e, f, g, h]) +} + +/// Sets packed 8-bit integers in returned vector with the supplied values in +/// reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. 
+#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_setr_epi8( + e00: i8, + e01: i8, + e02: i8, + e03: i8, + e04: i8, + e05: i8, + e06: i8, + e07: i8, + e08: i8, + e09: i8, + e10: i8, + e11: i8, + e12: i8, + e13: i8, + e14: i8, + e15: i8, + e16: i8, + e17: i8, + e18: i8, + e19: i8, + e20: i8, + e21: i8, + e22: i8, + e23: i8, + e24: i8, + e25: i8, + e26: i8, + e27: i8, + e28: i8, + e29: i8, + e30: i8, + e31: i8, +) -> __m256i { + unsafe { + #[rustfmt::skip] + transmute(i8x32::new( + e00, e01, e02, e03, e04, e05, e06, e07, + e08, e09, e10, e11, e12, e13, e14, e15, + e16, e17, e18, e19, e20, e21, e22, e23, + e24, e25, e26, e27, e28, e29, e30, e31, + )) + } +} + +/// Sets packed 16-bit integers in returned vector with the supplied values in +/// reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_setr_epi16( + e00: i16, + e01: i16, + e02: i16, + e03: i16, + e04: i16, + e05: i16, + e06: i16, + e07: i16, + e08: i16, + e09: i16, + e10: i16, + e11: i16, + e12: i16, + e13: i16, + e14: i16, + e15: i16, +) -> __m256i { + unsafe { + #[rustfmt::skip] + transmute(i16x16::new( + e00, e01, e02, e03, + e04, e05, e06, e07, + e08, e09, e10, e11, + e12, e13, e14, e15, + )) + } +} + +/// Sets packed 32-bit integers in returned vector with the supplied values in +/// reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_setr_epi32( + e0: i32, + e1: i32, + e2: i32, + e3: i32, + e4: i32, + e5: i32, + e6: i32, + e7: i32, +) -> __m256i { + unsafe { transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) } +} + +/// Sets packed 64-bit integers in returned vector with the supplied values in +/// reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { + unsafe { transmute(i64x4::new(a, b, c, d)) } +} + +/// Broadcasts double-precision (64-bit) floating-point value `a` to all +/// elements of returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_set1_pd(a: f64) -> __m256d { + _mm256_setr_pd(a, a, a, a) +} + +/// Broadcasts single-precision (32-bit) floating-point value `a` to all +/// elements of returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_set1_ps(a: f32) -> __m256 { + _mm256_setr_ps(a, a, a, a, a, a, a, a) +} + +/// Broadcasts 8-bit integer `a` to all elements of returned vector. 
+/// This intrinsic may generate the `vpbroadcastb`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_set1_epi8(a: i8) -> __m256i { + #[rustfmt::skip] + _mm256_setr_epi8( + a, a, a, a, a, a, a, a, + a, a, a, a, a, a, a, a, + a, a, a, a, a, a, a, a, + a, a, a, a, a, a, a, a, + ) +} + +/// Broadcasts 16-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastw`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16) +#[inline] +#[target_feature(enable = "avx")] +//#[cfg_attr(test, assert_instr(vpshufb))] +#[cfg_attr(test, assert_instr(vinsertf128))] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_set1_epi16(a: i16) -> __m256i { + _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +/// Broadcasts 32-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastd`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_set1_epi32(a: i32) -> __m256i { + _mm256_setr_epi32(a, a, a, a, a, a, a, a) +} + +/// Broadcasts 64-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastq`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))] +#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_set1_epi64x(a: i64) -> __m256i { + _mm256_setr_epi64x(a, a, a, a) +} + +/// Cast vector of type __m256d to type __m256. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_castpd_ps(a: __m256d) -> __m256 { + unsafe { transmute(a) } +} + +/// Cast vector of type __m256 to type __m256d. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_castps_pd(a: __m256) -> __m256d { + unsafe { transmute(a) } +} + +/// Casts vector of type __m256 to type __m256i. 
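+///
+/// A minimal usage sketch (illustrative only, not from Intel's documentation);
+/// it assumes an x86_64 target with AVX available:
+///
+/// ```ignore
+/// # use std::arch::x86_64::*;
+/// # #[target_feature(enable = "avx")]
+/// # unsafe fn demo() {
+/// let a = _mm256_set1_ps(1.0);
+/// // The cast reinterprets the bits; no conversion instruction is emitted.
+/// let bits = _mm256_castps_si256(a);
+/// let _back = _mm256_castsi256_ps(bits);
+/// # }
+/// ```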
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_castps_si256(a: __m256) -> __m256i { + unsafe { transmute(a) } +} + +/// Casts vector of type __m256i to type __m256. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 { + unsafe { transmute(a) } +} + +/// Casts vector of type __m256d to type __m256i. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_castpd_si256(a: __m256d) -> __m256i { + unsafe { transmute(a) } +} + +/// Casts vector of type __m256i to type __m256d. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_castsi256_pd(a: __m256i) -> __m256d { + unsafe { transmute(a) } +} + +/// Casts vector of type __m256 to type __m128. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_castps256_ps128(a: __m256) -> __m128 { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } +} + +/// Casts vector of type __m256d to type __m128d. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_castpd256_pd128(a: __m256d) -> __m128d { + unsafe { simd_shuffle!(a, a, [0, 1]) } +} + +/// Casts vector of type __m256i to type __m128i. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. 
+#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i { + unsafe { + let a = a.as_i64x4(); + let dst: i64x2 = simd_shuffle!(a, a, [0, 1]); + transmute(dst) + } +} + +/// Casts vector of type __m128 to type __m256; +/// the upper 128 bits of the result are undefined. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_castps128_ps256(a: __m128) -> __m256 { + unsafe { simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) } +} + +/// Casts vector of type __m128d to type __m256d; +/// the upper 128 bits of the result are undefined. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_castpd128_pd256(a: __m128d) -> __m256d { + unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2]) } +} + +/// Casts vector of type __m128i to type __m256i; +/// the upper 128 bits of the result are undefined. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i { + unsafe { + let a = a.as_i64x2(); + let undefined = i64x2::ZERO; + let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]); + transmute(dst) + } +} + +/// Constructs a 256-bit floating-point vector of `[8 x float]` from a +/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain +/// the value of the source vector. The upper 128 bits are set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 { + unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) } +} + +/// Constructs a 256-bit integer vector from a 128-bit integer vector. +/// The lower 128 bits contain the value of the source vector. The upper +/// 128 bits are set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. 
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
+    unsafe {
+        let b = i64x2::ZERO;
+        let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
+        transmute(dst)
+    }
+}
+
+/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
+/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
+/// contain the value of the source vector. The upper 128 bits are set
+/// to zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
+    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
+}
+
+/// Returns vector of type `__m256` with indeterminate elements.
+/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_undefined_ps() -> __m256 {
+    const { unsafe { mem::zeroed() } }
+}
+
+/// Returns vector of type `__m256d` with indeterminate elements.
+/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_undefined_pd() -> __m256d {
+    const { unsafe { mem::zeroed() } }
+}
+
+/// Returns vector of type __m256i with indeterminate elements.
+/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_undefined_si256() -> __m256i {
+    const { unsafe { mem::zeroed() } }
+}
+
+/// Sets packed __m256 returned vector with the supplied values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
+    unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) }
+}
+
+/// Sets packed __m256d returned vector with the supplied values.
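+///
+/// A minimal usage sketch (illustrative only, not from Intel's documentation);
+/// it assumes an x86_64 target with AVX available:
+///
+/// ```ignore
+/// # use std::arch::x86_64::*;
+/// # #[target_feature(enable = "avx")]
+/// # unsafe fn demo() {
+/// let lo = _mm_setr_pd(0.0, 1.0);
+/// let hi = _mm_setr_pd(2.0, 3.0);
+/// // `lo` fills the lower 128 bits, `hi` the upper 128 bits.
+/// let v = _mm256_set_m128d(hi, lo);
+/// // v == [0.0, 1.0, 2.0, 3.0]
+/// # }
+/// ```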
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vinsertf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d { + unsafe { + let hi: __m128 = transmute(hi); + let lo: __m128 = transmute(lo); + transmute(_mm256_set_m128(hi, lo)) + } +} + +/// Sets packed __m256i returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vinsertf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { + unsafe { + let hi: __m128 = transmute(hi); + let lo: __m128 = transmute(lo); + transmute(_mm256_set_m128(hi, lo)) + } +} + +/// Sets packed __m256 returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vinsertf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 { + _mm256_set_m128(hi, lo) +} + +/// Sets packed __m256d returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vinsertf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d { + _mm256_set_m128d(hi, lo) +} + +/// Sets packed __m256i returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i) +#[inline] +#[target_feature(enable = "avx")] +#[cfg_attr(test, assert_instr(vinsertf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { + _mm256_set_m128i(hi, lo) +} + +/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit) +/// floating-point elements) from memory, and combine them into a 256-bit +/// value. +/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 { + let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr)); + _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr)) +} + +/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit) +/// floating-point elements) from memory, and combine them into a 256-bit +/// value. +/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. 
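+// Usage sketch (for illustration only): gather two unrelated 128-bit loads
+// into one 256-bit vector; `loaddr` fills the low half, `hiaddr` the high half:
+//     let lo = [1.0f64, 2.0];
+//     let hi = [3.0f64, 4.0];
+//     let v = unsafe { _mm256_loadu2_m128d(hi.as_ptr(), lo.as_ptr()) };
+//     // v == _mm256_setr_pd(1.0, 2.0, 3.0, 4.0)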
+#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d { + let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr)); + _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr)) +} + +/// Loads two 128-bit values (composed of integer data) from memory, and combine +/// them into a 256-bit value. +/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i { + let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr)); + _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr)) +} + +/// Stores the high and low 128-bit halves (each composed of 4 packed +/// single-precision (32-bit) floating-point elements) from `a` into memory two +/// different 128-bit locations. +/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) { + let lo = _mm256_castps256_ps128(a); + _mm_storeu_ps(loaddr, lo); + let hi = _mm256_extractf128_ps::<1>(a); + _mm_storeu_ps(hiaddr, hi); +} + +/// Stores the high and low 128-bit halves (each composed of 2 packed +/// double-precision (64-bit) floating-point elements) from `a` into memory two +/// different 128-bit locations. +/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) { + let lo = _mm256_castpd256_pd128(a); + _mm_storeu_pd(loaddr, lo); + let hi = _mm256_extractf128_pd::<1>(a); + _mm_storeu_pd(hiaddr, hi); +} + +/// Stores the high and low 128-bit halves (each composed of integer data) from +/// `a` into memory two different 128-bit locations. +/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i) +#[inline] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) { + let lo = _mm256_castsi256_si128(a); + _mm_storeu_si128(loaddr, lo); + let hi = _mm256_extractf128_si256::<1>(a); + _mm_storeu_si128(hiaddr, hi); +} + +/// Returns the first element of the input vector of `[8 x float]`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32) +#[inline] +#[target_feature(enable = "avx")] +//#[cfg_attr(test, assert_instr(movss))] FIXME +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtss_f32(a: __m256) -> f32 { + unsafe { simd_extract!(a, 0) } +} + +// LLVM intrinsics used in the above functions +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx.round.pd.256"] + fn roundpd256(a: __m256d, b: i32) -> __m256d; + #[link_name = "llvm.x86.avx.round.ps.256"] + fn roundps256(a: __m256, b: i32) -> __m256; + #[link_name = "llvm.x86.avx.dp.ps.256"] + fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256; + #[link_name = "llvm.x86.avx.hadd.pd.256"] + fn vhaddpd(a: __m256d, b: __m256d) -> __m256d; + #[link_name = "llvm.x86.avx.hadd.ps.256"] + fn vhaddps(a: __m256, b: __m256) -> __m256; + #[link_name = "llvm.x86.avx.hsub.pd.256"] + fn vhsubpd(a: __m256d, b: __m256d) -> __m256d; + #[link_name = "llvm.x86.avx.hsub.ps.256"] + fn vhsubps(a: __m256, b: __m256) -> __m256; + #[link_name = "llvm.x86.sse2.cmp.pd"] + fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d; + #[link_name = "llvm.x86.avx.cmp.pd.256"] + fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d; + #[link_name = "llvm.x86.sse.cmp.ps"] + fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128; + #[link_name = "llvm.x86.avx.cmp.ps.256"] + fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256; + #[link_name = "llvm.x86.sse2.cmp.sd"] + fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d; + #[link_name = "llvm.x86.sse.cmp.ss"] + fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128; + #[link_name = "llvm.x86.avx.cvt.ps2dq.256"] + fn vcvtps2dq(a: __m256) -> i32x8; + #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"] + fn vcvttpd2dq(a: __m256d) -> i32x4; + #[link_name = "llvm.x86.avx.cvt.pd2dq.256"] + fn vcvtpd2dq(a: __m256d) -> i32x4; + #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"] + fn vcvttps2dq(a: __m256) -> i32x8; + #[link_name = "llvm.x86.avx.vzeroall"] + fn vzeroall(); + #[link_name = "llvm.x86.avx.vzeroupper"] + fn vzeroupper(); + #[link_name = "llvm.x86.avx.vpermilvar.ps.256"] + fn vpermilps256(a: __m256, b: i32x8) -> __m256; + #[link_name = "llvm.x86.avx.vpermilvar.ps"] + fn vpermilps(a: __m128, b: i32x4) -> __m128; + #[link_name = "llvm.x86.avx.vpermilvar.pd.256"] + fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d; + #[link_name = "llvm.x86.avx.vpermilvar.pd"] + fn vpermilpd(a: __m128d, b: i64x2) -> __m128d; + #[link_name = "llvm.x86.avx.vperm2f128.ps.256"] + fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256; + #[link_name = "llvm.x86.avx.vperm2f128.pd.256"] + fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d; + #[link_name = "llvm.x86.avx.vperm2f128.si.256"] + fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8; + #[link_name = "llvm.x86.avx.maskload.pd.256"] + fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d; + #[link_name = "llvm.x86.avx.maskstore.pd.256"] + fn maskstorepd256(mem_addr: *mut i8, mask: i64x4, a: __m256d); + #[link_name = "llvm.x86.avx.maskload.pd"] + fn maskloadpd(mem_addr: *const i8, mask: i64x2) -> __m128d; + #[link_name = "llvm.x86.avx.maskstore.pd"] + fn maskstorepd(mem_addr: *mut i8, mask: i64x2, a: __m128d); + #[link_name = "llvm.x86.avx.maskload.ps.256"] + fn maskloadps256(mem_addr: *const i8, mask: i32x8) -> __m256; + #[link_name = "llvm.x86.avx.maskstore.ps.256"] + fn maskstoreps256(mem_addr: *mut i8, mask: i32x8, a: __m256); + #[link_name 
= "llvm.x86.avx.maskload.ps"] + fn maskloadps(mem_addr: *const i8, mask: i32x4) -> __m128; + #[link_name = "llvm.x86.avx.maskstore.ps"] + fn maskstoreps(mem_addr: *mut i8, mask: i32x4, a: __m128); + #[link_name = "llvm.x86.avx.ldu.dq.256"] + fn vlddqu(mem_addr: *const i8) -> i8x32; + #[link_name = "llvm.x86.avx.rcp.ps.256"] + fn vrcpps(a: __m256) -> __m256; + #[link_name = "llvm.x86.avx.rsqrt.ps.256"] + fn vrsqrtps(a: __m256) -> __m256; + #[link_name = "llvm.x86.avx.ptestz.256"] + fn ptestz256(a: i64x4, b: i64x4) -> i32; + #[link_name = "llvm.x86.avx.ptestc.256"] + fn ptestc256(a: i64x4, b: i64x4) -> i32; + #[link_name = "llvm.x86.avx.ptestnzc.256"] + fn ptestnzc256(a: i64x4, b: i64x4) -> i32; + #[link_name = "llvm.x86.avx.vtestz.pd.256"] + fn vtestzpd256(a: __m256d, b: __m256d) -> i32; + #[link_name = "llvm.x86.avx.vtestc.pd.256"] + fn vtestcpd256(a: __m256d, b: __m256d) -> i32; + #[link_name = "llvm.x86.avx.vtestnzc.pd.256"] + fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32; + #[link_name = "llvm.x86.avx.vtestz.pd"] + fn vtestzpd(a: __m128d, b: __m128d) -> i32; + #[link_name = "llvm.x86.avx.vtestc.pd"] + fn vtestcpd(a: __m128d, b: __m128d) -> i32; + #[link_name = "llvm.x86.avx.vtestnzc.pd"] + fn vtestnzcpd(a: __m128d, b: __m128d) -> i32; + #[link_name = "llvm.x86.avx.vtestz.ps.256"] + fn vtestzps256(a: __m256, b: __m256) -> i32; + #[link_name = "llvm.x86.avx.vtestc.ps.256"] + fn vtestcps256(a: __m256, b: __m256) -> i32; + #[link_name = "llvm.x86.avx.vtestnzc.ps.256"] + fn vtestnzcps256(a: __m256, b: __m256) -> i32; + #[link_name = "llvm.x86.avx.vtestz.ps"] + fn vtestzps(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.avx.vtestc.ps"] + fn vtestcps(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.avx.vtestnzc.ps"] + fn vtestnzcps(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.avx.min.ps.256"] + fn vminps(a: __m256, b: __m256) -> __m256; + #[link_name = "llvm.x86.avx.max.ps.256"] + fn vmaxps(a: __m256, b: __m256) -> __m256; + #[link_name = "llvm.x86.avx.min.pd.256"] + fn vminpd(a: __m256d, b: __m256d) -> __m256d; + #[link_name = "llvm.x86.avx.max.pd.256"] + fn vmaxpd(a: __m256d, b: __m256d) -> __m256d; +} + +#[cfg(test)] +mod tests { + use crate::hint::black_box; + use crate::ptr; + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_add_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_add_pd(a, b); + let e = _mm256_setr_pd(6., 8., 10., 12.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_add_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm256_add_ps(a, b); + let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_and_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set1_pd(0.6); + let r = _mm256_and_pd(a, b); + let e = _mm256_set1_pd(0.5); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_and_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set1_ps(0.6); + let r = _mm256_and_ps(a, b); + let e = _mm256_set1_ps(0.5); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_or_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set1_pd(0.6); + let r = _mm256_or_pd(a, b); + let e = _mm256_set1_pd(1.2); + assert_eq_m256d(r, e); + } + + #[simd_test(enable 
= "avx")] + unsafe fn test_mm256_or_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set1_ps(0.6); + let r = _mm256_or_ps(a, b); + let e = _mm256_set1_ps(1.2); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_shuffle_pd() { + let a = _mm256_setr_pd(1., 4., 5., 8.); + let b = _mm256_setr_pd(2., 3., 6., 7.); + let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b); + let e = _mm256_setr_pd(4., 3., 8., 7.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_shuffle_ps() { + let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b); + let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_andnot_pd() { + let a = _mm256_set1_pd(0.); + let b = _mm256_set1_pd(0.6); + let r = _mm256_andnot_pd(a, b); + assert_eq_m256d(r, b); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_andnot_ps() { + let a = _mm256_set1_ps(0.); + let b = _mm256_set1_ps(0.6); + let r = _mm256_andnot_ps(a, b); + assert_eq_m256(r, b); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_max_pd() { + let a = _mm256_setr_pd(1., 4., 5., 8.); + let b = _mm256_setr_pd(2., 3., 6., 7.); + let r = _mm256_max_pd(a, b); + let e = _mm256_setr_pd(2., 4., 6., 8.); + assert_eq_m256d(r, e); + // > If the values being compared are both 0.0s (of either sign), the + // > value in the second operand (source operand) is returned. + let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0)); + let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0)); + let wu: [u64; 4] = transmute(w); + let xu: [u64; 4] = transmute(x); + assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]); + assert_eq!(xu, [0u64; 4]); + // > If only one value is a NaN (SNaN or QNaN) for this instruction, the + // > second operand (source operand), either a NaN or a valid + // > floating-point value, is written to the result. + let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0)); + let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN)); + let yf: [f64; 4] = transmute(y); + let zf: [f64; 4] = transmute(z); + assert_eq!(yf, [0.0; 4]); + assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_max_ps() { + let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_max_ps(a, b); + let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.); + assert_eq_m256(r, e); + // > If the values being compared are both 0.0s (of either sign), the + // > value in the second operand (source operand) is returned. + let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0)); + let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0)); + let wu: [u32; 8] = transmute(w); + let xu: [u32; 8] = transmute(x); + assert_eq!(wu, [0x8000_0000u32; 8]); + assert_eq!(xu, [0u32; 8]); + // > If only one value is a NaN (SNaN or QNaN) for this instruction, the + // > second operand (source operand), either a NaN or a valid + // > floating-point value, is written to the result. 
+ let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0)); + let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN)); + let yf: [f32; 8] = transmute(y); + let zf: [f32; 8] = transmute(z); + assert_eq!(yf, [0.0; 8]); + assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_min_pd() { + let a = _mm256_setr_pd(1., 4., 5., 8.); + let b = _mm256_setr_pd(2., 3., 6., 7.); + let r = _mm256_min_pd(a, b); + let e = _mm256_setr_pd(1., 3., 5., 7.); + assert_eq_m256d(r, e); + // > If the values being compared are both 0.0s (of either sign), the + // > value in the second operand (source operand) is returned. + let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0)); + let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0)); + let wu: [u64; 4] = transmute(w); + let xu: [u64; 4] = transmute(x); + assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]); + assert_eq!(xu, [0u64; 4]); + // > If only one value is a NaN (SNaN or QNaN) for this instruction, the + // > second operand (source operand), either a NaN or a valid + // > floating-point value, is written to the result. + let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0)); + let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN)); + let yf: [f64; 4] = transmute(y); + let zf: [f64; 4] = transmute(z); + assert_eq!(yf, [0.0; 4]); + assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_min_ps() { + let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_min_ps(a, b); + let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.); + assert_eq_m256(r, e); + // > If the values being compared are both 0.0s (of either sign), the + // > value in the second operand (source operand) is returned. + let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0)); + let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0)); + let wu: [u32; 8] = transmute(w); + let xu: [u32; 8] = transmute(x); + assert_eq!(wu, [0x8000_0000u32; 8]); + assert_eq!(xu, [0u32; 8]); + // > If only one value is a NaN (SNaN or QNaN) for this instruction, the + // > second operand (source operand), either a NaN or a valid + // > floating-point value, is written to the result. 
+ let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0)); + let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN)); + let yf: [f32; 8] = transmute(y); + let zf: [f32; 8] = transmute(z); + assert_eq!(yf, [0.0; 8]); + assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_mul_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_mul_pd(a, b); + let e = _mm256_setr_pd(5., 12., 21., 32.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_mul_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm256_mul_ps(a, b); + let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_addsub_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_addsub_pd(a, b); + let e = _mm256_setr_pd(-4., 8., -4., 12.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_addsub_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.); + let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.); + let r = _mm256_addsub_ps(a, b); + let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_sub_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_sub_pd(a, b); + let e = _mm256_setr_pd(-4., -4., -4., -4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_sub_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.); + let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.); + let r = _mm256_sub_ps(a, b); + let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_round_pd() { + let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2); + let result_closest = _mm256_round_pd::<0b0000>(a); + let result_down = _mm256_round_pd::<0b0001>(a); + let result_up = _mm256_round_pd::<0b0010>(a); + let expected_closest = _mm256_setr_pd(2., 2., 4., -1.); + let expected_down = _mm256_setr_pd(1., 2., 3., -2.); + let expected_up = _mm256_setr_pd(2., 3., 4., -1.); + assert_eq_m256d(result_closest, expected_closest); + assert_eq_m256d(result_down, expected_down); + assert_eq_m256d(result_up, expected_up); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_floor_pd() { + let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2); + let result_down = _mm256_floor_pd(a); + let expected_down = _mm256_setr_pd(1., 2., 3., -2.); + assert_eq_m256d(result_down, expected_down); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_ceil_pd() { + let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2); + let result_up = _mm256_ceil_pd(a); + let expected_up = _mm256_setr_pd(2., 3., 4., -1.); + assert_eq_m256d(result_up, expected_up); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_round_ps() { + let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2); + let result_closest = _mm256_round_ps::<0b0000>(a); + let result_down = _mm256_round_ps::<0b0001>(a); + let result_up = _mm256_round_ps::<0b0010>(a); + let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.); + let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.); 
+ let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.); + assert_eq_m256(result_closest, expected_closest); + assert_eq_m256(result_down, expected_down); + assert_eq_m256(result_up, expected_up); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_floor_ps() { + let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2); + let result_down = _mm256_floor_ps(a); + let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.); + assert_eq_m256(result_down, expected_down); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_ceil_ps() { + let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2); + let result_up = _mm256_ceil_ps(a); + let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.); + assert_eq_m256(result_up, expected_up); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_sqrt_pd() { + let a = _mm256_setr_pd(4., 9., 16., 25.); + let r = _mm256_sqrt_pd(a); + let e = _mm256_setr_pd(2., 3., 4., 5.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_sqrt_ps() { + let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + let r = _mm256_sqrt_ps(a); + let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_div_ps() { + let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + let r = _mm256_div_ps(a, b); + let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_div_pd() { + let a = _mm256_setr_pd(4., 9., 16., 25.); + let b = _mm256_setr_pd(4., 3., 2., 5.); + let r = _mm256_div_pd(a, b); + let e = _mm256_setr_pd(1., 3., 8., 5.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_blend_pd() { + let a = _mm256_setr_pd(4., 9., 16., 25.); + let b = _mm256_setr_pd(4., 3., 2., 5.); + let r = _mm256_blend_pd::<0x0>(a, b); + assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.)); + let r = _mm256_blend_pd::<0x3>(a, b); + assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.)); + let r = _mm256_blend_pd::<0xF>(a, b); + assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_blend_ps() { + let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_blend_ps::<0x0>(a, b); + assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.)); + let r = _mm256_blend_ps::<0x3>(a, b); + assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.)); + let r = _mm256_blend_ps::<0xF>(a, b); + assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_blendv_pd() { + let a = _mm256_setr_pd(4., 9., 16., 25.); + let b = _mm256_setr_pd(4., 3., 2., 5.); + let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64); + let r = _mm256_blendv_pd(a, b, c); + let e = _mm256_setr_pd(4., 9., 2., 5.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_blendv_ps() { + let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + #[rustfmt::skip] + let c = _mm256_setr_ps( + 0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32, + ); + let r = _mm256_blendv_ps(a, b, c); + let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.); + assert_eq_m256(r, 
e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_dp_ps() { + let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + let r = _mm256_dp_ps::<0xFF>(a, b); + let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_hadd_pd() { + let a = _mm256_setr_pd(4., 9., 16., 25.); + let b = _mm256_setr_pd(4., 3., 2., 5.); + let r = _mm256_hadd_pd(a, b); + let e = _mm256_setr_pd(13., 7., 41., 7.); + assert_eq_m256d(r, e); + + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_hadd_pd(a, b); + let e = _mm256_setr_pd(3., 11., 7., 15.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_hadd_ps() { + let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + let r = _mm256_hadd_ps(a, b); + let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.); + assert_eq_m256(r, e); + + let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.); + let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.); + let r = _mm256_hadd_ps(a, b); + let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_hsub_pd() { + let a = _mm256_setr_pd(4., 9., 16., 25.); + let b = _mm256_setr_pd(4., 3., 2., 5.); + let r = _mm256_hsub_pd(a, b); + let e = _mm256_setr_pd(-5., 1., -9., -3.); + assert_eq_m256d(r, e); + + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_hsub_pd(a, b); + let e = _mm256_setr_pd(-1., -1., -1., -1.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_hsub_ps() { + let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + let r = _mm256_hsub_ps(a, b); + let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.); + assert_eq_m256(r, e); + + let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.); + let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.); + let r = _mm256_hsub_ps(a, b); + let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_xor_pd() { + let a = _mm256_setr_pd(4., 9., 16., 25.); + let b = _mm256_set1_pd(0.); + let r = _mm256_xor_pd(a, b); + assert_eq_m256d(r, a); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_xor_ps() { + let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + let b = _mm256_set1_ps(0.); + let r = _mm256_xor_ps(a, b); + assert_eq_m256(r, a); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_cmp_pd() { + let a = _mm_setr_pd(4., 9.); + let b = _mm_setr_pd(4., 3.); + let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b); + assert!(get_m128d(r, 0).is_nan()); + assert!(get_m128d(r, 1).is_nan()); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_cmp_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b); + let e = _mm256_set1_pd(0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_cmp_ps() { + let a = _mm_setr_ps(4., 3., 2., 5.); + let b = _mm_setr_ps(4., 9., 16., 25.); + let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b); + assert!(get_m128(r, 0).is_nan()); + assert_eq!(get_m128(r, 1), 0.); + assert_eq!(get_m128(r, 2), 0.); + 
assert_eq!(get_m128(r, 3), 0.); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_cmp_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.); + let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.); + let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b); + let e = _mm256_set1_ps(0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_cmp_sd() { + let a = _mm_setr_pd(4., 9.); + let b = _mm_setr_pd(4., 3.); + let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b); + assert!(get_m128d(r, 0).is_nan()); + assert_eq!(get_m128d(r, 1), 9.); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_cmp_ss() { + let a = _mm_setr_ps(4., 3., 2., 5.); + let b = _mm_setr_ps(4., 9., 16., 25.); + let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b); + assert!(get_m128(r, 0).is_nan()); + assert_eq!(get_m128(r, 1), 3.); + assert_eq!(get_m128(r, 2), 2.); + assert_eq!(get_m128(r, 3), 5.); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_cvtepi32_pd() { + let a = _mm_setr_epi32(4, 9, 16, 25); + let r = _mm256_cvtepi32_pd(a); + let e = _mm256_setr_pd(4., 9., 16., 25.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_cvtepi32_ps() { + let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25); + let r = _mm256_cvtepi32_ps(a); + let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_cvtpd_ps() { + let a = _mm256_setr_pd(4., 9., 16., 25.); + let r = _mm256_cvtpd_ps(a); + let e = _mm_setr_ps(4., 9., 16., 25.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_cvtps_epi32() { + let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + let r = _mm256_cvtps_epi32(a); + let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_cvtps_pd() { + let a = _mm_setr_ps(4., 9., 16., 25.); + let r = _mm256_cvtps_pd(a); + let e = _mm256_setr_pd(4., 9., 16., 25.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_cvtsd_f64() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let r = _mm256_cvtsd_f64(a); + assert_eq!(r, 1.); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_cvttpd_epi32() { + let a = _mm256_setr_pd(4., 9., 16., 25.); + let r = _mm256_cvttpd_epi32(a); + let e = _mm_setr_epi32(4, 9, 16, 25); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_cvtpd_epi32() { + let a = _mm256_setr_pd(4., 9., 16., 25.); + let r = _mm256_cvtpd_epi32(a); + let e = _mm_setr_epi32(4, 9, 16, 25); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_cvttps_epi32() { + let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + let r = _mm256_cvttps_epi32(a); + let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_extractf128_ps() { + let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + let r = _mm256_extractf128_ps::<0>(a); + let e = _mm_setr_ps(4., 3., 2., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_extractf128_pd() { + let a = _mm256_setr_pd(4., 3., 2., 5.); + let r = _mm256_extractf128_pd::<0>(a); + let e = _mm_setr_pd(4., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_extractf128_si256() { + let a = _mm256_setr_epi64x(4, 3, 2, 5); + let r = _mm256_extractf128_si256::<0>(a); + let e = 
_mm_setr_epi64x(4, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_extract_epi32() { + let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7); + let r1 = _mm256_extract_epi32::<0>(a); + let r2 = _mm256_extract_epi32::<3>(a); + assert_eq!(r1, -1); + assert_eq!(r2, 3); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_cvtsi256_si32() { + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_cvtsi256_si32(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx")] + #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri + unsafe fn test_mm256_zeroall() { + _mm256_zeroall(); + } + + #[simd_test(enable = "avx")] + #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri + unsafe fn test_mm256_zeroupper() { + _mm256_zeroupper(); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_permutevar_ps() { + let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_permutevar_ps(a, b); + let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_permutevar_ps() { + let a = _mm_setr_ps(4., 3., 2., 5.); + let b = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm_permutevar_ps(a, b); + let e = _mm_setr_ps(3., 2., 5., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_permute_ps() { + let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + let r = _mm256_permute_ps::<0x1b>(a); + let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_permute_ps() { + let a = _mm_setr_ps(4., 3., 2., 5.); + let r = _mm_permute_ps::<0x1b>(a); + let e = _mm_setr_ps(5., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_permutevar_pd() { + let a = _mm256_setr_pd(4., 3., 2., 5.); + let b = _mm256_setr_epi64x(1, 2, 3, 4); + let r = _mm256_permutevar_pd(a, b); + let e = _mm256_setr_pd(4., 3., 5., 2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_permutevar_pd() { + let a = _mm_setr_pd(4., 3.); + let b = _mm_setr_epi64x(3, 0); + let r = _mm_permutevar_pd(a, b); + let e = _mm_setr_pd(3., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_permute_pd() { + let a = _mm256_setr_pd(4., 3., 2., 5.); + let r = _mm256_permute_pd::<5>(a); + let e = _mm256_setr_pd(3., 4., 5., 2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_permute_pd() { + let a = _mm_setr_pd(4., 3.); + let r = _mm_permute_pd::<1>(a); + let e = _mm_setr_pd(3., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_permute2f128_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.); + let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.); + let r = _mm256_permute2f128_ps::<0x13>(a, b); + let e = _mm256_setr_ps(5., 6., 7., 8., 1., 2., 3., 4.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_permute2f128_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_permute2f128_pd::<0x31>(a, b); + let e = _mm256_setr_pd(3., 4., 7., 8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_permute2f128_si256() { + let a = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4); + let b = _mm256_setr_epi32(5, 6, 
7, 8, 5, 6, 7, 8); + let r = _mm256_permute2f128_si256::<0x20>(a, b); + let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_broadcast_ss() { + let r = _mm256_broadcast_ss(&3.); + let e = _mm256_set1_ps(3.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_broadcast_ss() { + let r = _mm_broadcast_ss(&3.); + let e = _mm_set1_ps(3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_broadcast_sd() { + let r = _mm256_broadcast_sd(&3.); + let e = _mm256_set1_pd(3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_broadcast_ps() { + let a = _mm_setr_ps(4., 3., 2., 5.); + let r = _mm256_broadcast_ps(&a); + let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_broadcast_pd() { + let a = _mm_setr_pd(4., 3.); + let r = _mm256_broadcast_pd(&a); + let e = _mm256_setr_pd(4., 3., 4., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_insertf128_ps() { + let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + let b = _mm_setr_ps(4., 9., 16., 25.); + let r = _mm256_insertf128_ps::<0>(a, b); + let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_insertf128_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm_setr_pd(5., 6.); + let r = _mm256_insertf128_pd::<0>(a, b); + let e = _mm256_setr_pd(5., 6., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_insertf128_si256() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let b = _mm_setr_epi64x(5, 6); + let r = _mm256_insertf128_si256::<0>(a, b); + let e = _mm256_setr_epi64x(5, 6, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_insert_epi8() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm256_insert_epi8::<31>(a, 0); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 0, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_insert_epi16() { + #[rustfmt::skip] + let a = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + let r = _mm256_insert_epi16::<15>(a, 0); + #[rustfmt::skip] + let e = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 0, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_insert_epi32() { + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_insert_epi32::<7>(a, 0); + let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_load_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let p = ptr::addr_of!(a) as *const f64; + let r = _mm256_load_pd(p); + let e = _mm256_setr_pd(1., 2., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_store_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let mut r = _mm256_undefined_pd(); + _mm256_store_pd(ptr::addr_of_mut!(r) as *mut f64, a); + assert_eq_m256d(r, a); + } + + #[simd_test(enable = 
"avx")] + unsafe fn test_mm256_load_ps() { + let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + let p = ptr::addr_of!(a) as *const f32; + let r = _mm256_load_ps(p); + let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_store_ps() { + let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + let mut r = _mm256_undefined_ps(); + _mm256_store_ps(ptr::addr_of_mut!(r) as *mut f32, a); + assert_eq_m256(r, a); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_loadu_pd() { + let a = &[1.0f64, 2., 3., 4.]; + let p = a.as_ptr(); + let r = _mm256_loadu_pd(black_box(p)); + let e = _mm256_setr_pd(1., 2., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_storeu_pd() { + let a = _mm256_set1_pd(9.); + let mut r = _mm256_undefined_pd(); + _mm256_storeu_pd(ptr::addr_of_mut!(r) as *mut f64, a); + assert_eq_m256d(r, a); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_loadu_ps() { + let a = &[4., 3., 2., 5., 8., 9., 64., 50.]; + let p = a.as_ptr(); + let r = _mm256_loadu_ps(black_box(p)); + let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_storeu_ps() { + let a = _mm256_set1_ps(9.); + let mut r = _mm256_undefined_ps(); + _mm256_storeu_ps(ptr::addr_of_mut!(r) as *mut f32, a); + assert_eq_m256(r, a); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_load_si256() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let p = ptr::addr_of!(a); + let r = _mm256_load_si256(p); + let e = _mm256_setr_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_store_si256() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let mut r = _mm256_undefined_si256(); + _mm256_store_si256(ptr::addr_of_mut!(r), a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_loadu_si256() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let p = ptr::addr_of!(a); + let r = _mm256_loadu_si256(black_box(p)); + let e = _mm256_setr_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_storeu_si256() { + let a = _mm256_set1_epi8(9); + let mut r = _mm256_undefined_si256(); + _mm256_storeu_si256(ptr::addr_of_mut!(r), a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_maskload_pd() { + let a = &[1.0f64, 2., 3., 4.]; + let p = a.as_ptr(); + let mask = _mm256_setr_epi64x(0, !0, 0, !0); + let r = _mm256_maskload_pd(black_box(p), mask); + let e = _mm256_setr_pd(0., 2., 0., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_maskstore_pd() { + let mut r = _mm256_set1_pd(0.); + let mask = _mm256_setr_epi64x(0, !0, 0, !0); + let a = _mm256_setr_pd(1., 2., 3., 4.); + _mm256_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a); + let e = _mm256_setr_pd(0., 2., 0., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_maskload_pd() { + let a = &[1.0f64, 2.]; + let p = a.as_ptr(); + let mask = _mm_setr_epi64x(0, !0); + let r = _mm_maskload_pd(black_box(p), mask); + let e = _mm_setr_pd(0., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_maskstore_pd() { + let mut r = _mm_set1_pd(0.); + let mask = _mm_setr_epi64x(0, !0); + let a = _mm_setr_pd(1., 2.); + _mm_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a); + let e = 
_mm_setr_pd(0., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_maskload_ps() { + let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.]; + let p = a.as_ptr(); + let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0); + let r = _mm256_maskload_ps(black_box(p), mask); + let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_maskstore_ps() { + let mut r = _mm256_set1_ps(0.); + let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0); + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + _mm256_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a); + let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_maskload_ps() { + let a = &[1.0f32, 2., 3., 4.]; + let p = a.as_ptr(); + let mask = _mm_setr_epi32(0, !0, 0, !0); + let r = _mm_maskload_ps(black_box(p), mask); + let e = _mm_setr_ps(0., 2., 0., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_maskstore_ps() { + let mut r = _mm_set1_ps(0.); + let mask = _mm_setr_epi32(0, !0, 0, !0); + let a = _mm_setr_ps(1., 2., 3., 4.); + _mm_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a); + let e = _mm_setr_ps(0., 2., 0., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_movehdup_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_movehdup_ps(a); + let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_moveldup_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_moveldup_ps(a); + let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_movedup_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let r = _mm256_movedup_pd(a); + let e = _mm256_setr_pd(1., 1., 3., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_lddqu_si256() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let p = ptr::addr_of!(a); + let r = _mm256_lddqu_si256(black_box(p)); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri + unsafe fn test_mm256_stream_si256() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let mut r = _mm256_undefined_si256(); + _mm256_stream_si256(ptr::addr_of_mut!(r), a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx")] + #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri + unsafe fn test_mm256_stream_pd() { + #[repr(align(32))] + struct Memory { + pub data: [f64; 4], + } + let a = _mm256_set1_pd(7.0); + let mut mem = Memory { data: [-1.0; 4] }; + + _mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a); + for i in 0..4 { + assert_eq!(mem.data[i], get_m256d(a, i)); + } + } + + #[simd_test(enable = "avx")] + #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri + unsafe fn test_mm256_stream_ps() { + #[repr(align(32))] + struct Memory { + pub data: [f32; 8], + } 
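+ // Non-temporal stores such as `_mm256_stream_ps` (VMOVNTPS) require a
+ // 32-byte aligned destination; the `#[repr(align(32))]` wrapper above
+ // guarantees that alignment.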
+ let a = _mm256_set1_ps(7.0); + let mut mem = Memory { data: [-1.0; 8] }; + + _mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a); + for i in 0..8 { + assert_eq!(mem.data[i], get_m256(a, i)); + } + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_rcp_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_rcp_ps(a); + #[rustfmt::skip] + let e = _mm256_setr_ps( + 0.99975586, 0.49987793, 0.33325195, 0.24993896, + 0.19995117, 0.16662598, 0.14282227, 0.12496948, + ); + let rel_err = 0.00048828125; + for i in 0..8 { + assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err); + } + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_rsqrt_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_rsqrt_ps(a); + #[rustfmt::skip] + let e = _mm256_setr_ps( + 0.99975586, 0.7069092, 0.5772705, 0.49987793, + 0.44714355, 0.40820313, 0.3779297, 0.3534546, + ); + let rel_err = 0.00048828125; + for i in 0..8 { + assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err); + } + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_unpackhi_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_unpackhi_pd(a, b); + let e = _mm256_setr_pd(2., 6., 4., 8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_unpackhi_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm256_unpackhi_ps(a, b); + let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_unpacklo_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_unpacklo_pd(a, b); + let e = _mm256_setr_pd(1., 5., 3., 7.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_unpacklo_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm256_unpacklo_ps(a, b); + let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_testz_si256() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let b = _mm256_setr_epi64x(5, 6, 7, 8); + let r = _mm256_testz_si256(a, b); + assert_eq!(r, 0); + let b = _mm256_set1_epi64x(0); + let r = _mm256_testz_si256(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_testc_si256() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let b = _mm256_setr_epi64x(5, 6, 7, 8); + let r = _mm256_testc_si256(a, b); + assert_eq!(r, 0); + let b = _mm256_set1_epi64x(0); + let r = _mm256_testc_si256(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_testnzc_si256() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let b = _mm256_setr_epi64x(5, 6, 7, 8); + let r = _mm256_testnzc_si256(a, b); + assert_eq!(r, 1); + let a = _mm256_setr_epi64x(0, 0, 0, 0); + let b = _mm256_setr_epi64x(0, 0, 0, 0); + let r = _mm256_testnzc_si256(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_testz_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_testz_pd(a, b); + assert_eq!(r, 1); + let a = _mm256_set1_pd(-1.); + let r = _mm256_testz_pd(a, a); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_testc_pd() 
{ + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_testc_pd(a, b); + assert_eq!(r, 1); + let a = _mm256_set1_pd(1.); + let b = _mm256_set1_pd(-1.); + let r = _mm256_testc_pd(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_testnzc_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_testnzc_pd(a, b); + assert_eq!(r, 0); + let a = _mm256_setr_pd(1., -1., -1., -1.); + let b = _mm256_setr_pd(-1., -1., 1., 1.); + let r = _mm256_testnzc_pd(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_testz_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 6.); + let r = _mm_testz_pd(a, b); + assert_eq!(r, 1); + let a = _mm_set1_pd(-1.); + let r = _mm_testz_pd(a, a); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_testc_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 6.); + let r = _mm_testc_pd(a, b); + assert_eq!(r, 1); + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(-1.); + let r = _mm_testc_pd(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_testnzc_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 6.); + let r = _mm_testnzc_pd(a, b); + assert_eq!(r, 0); + let a = _mm_setr_pd(1., -1.); + let b = _mm_setr_pd(-1., -1.); + let r = _mm_testnzc_pd(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_testz_ps() { + let a = _mm256_set1_ps(1.); + let r = _mm256_testz_ps(a, a); + assert_eq!(r, 1); + let a = _mm256_set1_ps(-1.); + let r = _mm256_testz_ps(a, a); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_testc_ps() { + let a = _mm256_set1_ps(1.); + let r = _mm256_testc_ps(a, a); + assert_eq!(r, 1); + let b = _mm256_set1_ps(-1.); + let r = _mm256_testc_ps(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_testnzc_ps() { + let a = _mm256_set1_ps(1.); + let r = _mm256_testnzc_ps(a, a); + assert_eq!(r, 0); + let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.); + let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.); + let r = _mm256_testnzc_ps(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_testz_ps() { + let a = _mm_set1_ps(1.); + let r = _mm_testz_ps(a, a); + assert_eq!(r, 1); + let a = _mm_set1_ps(-1.); + let r = _mm_testz_ps(a, a); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_testc_ps() { + let a = _mm_set1_ps(1.); + let r = _mm_testc_ps(a, a); + assert_eq!(r, 1); + let b = _mm_set1_ps(-1.); + let r = _mm_testc_ps(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm_testnzc_ps() { + let a = _mm_set1_ps(1.); + let r = _mm_testnzc_ps(a, a); + assert_eq!(r, 0); + let a = _mm_setr_ps(1., -1., -1., -1.); + let b = _mm_setr_ps(-1., -1., 1., 1.); + let r = _mm_testnzc_ps(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_movemask_pd() { + let a = _mm256_setr_pd(1., -2., 3., -4.); + let r = _mm256_movemask_pd(a); + assert_eq!(r, 0xA); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_movemask_ps() { + let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.); + let r = _mm256_movemask_ps(a); + assert_eq!(r, 0xAA); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_setzero_pd() { + let r = _mm256_setzero_pd(); + assert_eq_m256d(r, _mm256_set1_pd(0.)); + } + + 
#[simd_test(enable = "avx")] + unsafe fn test_mm256_setzero_ps() { + let r = _mm256_setzero_ps(); + assert_eq_m256(r, _mm256_set1_ps(0.)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_setzero_si256() { + let r = _mm256_setzero_si256(); + assert_eq_m256i(r, _mm256_set1_epi8(0)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_set_pd() { + let r = _mm256_set_pd(1., 2., 3., 4.); + assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_set_ps() { + let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_set_epi8() { + #[rustfmt::skip] + let r = _mm256_set_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 32, 31, 30, 29, 28, 27, 26, 25, + 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, + 8, 7, 6, 5, 4, 3, 2, 1 + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_set_epi16() { + #[rustfmt::skip] + let r = _mm256_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + ); + #[rustfmt::skip] + let e = _mm256_setr_epi16( + 16, 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_set_epi32() { + let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_set_epi64x() { + let r = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_setr_pd() { + let r = _mm256_setr_pd(1., 2., 3., 4.); + assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_setr_ps() { + let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_setr_epi8() { + #[rustfmt::skip] + let r = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32 + ); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_setr_epi16() { + #[rustfmt::skip] + let r = _mm256_setr_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + ); + #[rustfmt::skip] + let e = _mm256_setr_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_setr_epi32() { + let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_setr_epi64x() { + let r = _mm256_setr_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_set1_pd() { + let r = _mm256_set1_pd(1.); + assert_eq_m256d(r, _mm256_set1_pd(1.)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_set1_ps() { + let r = _mm256_set1_ps(1.); 
+ assert_eq_m256(r, _mm256_set1_ps(1.)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_set1_epi8() { + let r = _mm256_set1_epi8(1); + assert_eq_m256i(r, _mm256_set1_epi8(1)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_set1_epi16() { + let r = _mm256_set1_epi16(1); + assert_eq_m256i(r, _mm256_set1_epi16(1)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_set1_epi32() { + let r = _mm256_set1_epi32(1); + assert_eq_m256i(r, _mm256_set1_epi32(1)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_set1_epi64x() { + let r = _mm256_set1_epi64x(1); + assert_eq_m256i(r, _mm256_set1_epi64x(1)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_castpd_ps() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let r = _mm256_castpd_ps(a); + let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_castps_pd() { + let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25); + let r = _mm256_castps_pd(a); + let e = _mm256_setr_pd(1., 2., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_castps_si256() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_castps_si256(a); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 0, 0, -128, 63, 0, 0, 0, 64, + 0, 0, 64, 64, 0, 0, -128, 64, + 0, 0, -96, 64, 0, 0, -64, 64, + 0, 0, -32, 64, 0, 0, 0, 65, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_castsi256_ps() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 0, 0, -128, 63, 0, 0, 0, 64, + 0, 0, 64, 64, 0, 0, -128, 64, + 0, 0, -96, 64, 0, 0, -64, 64, + 0, 0, -32, 64, 0, 0, 0, 65, + ); + let r = _mm256_castsi256_ps(a); + let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_castpd_si256() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let r = _mm256_castpd_si256(a); + assert_eq_m256d(transmute(r), a); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_castsi256_pd() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let r = _mm256_castsi256_pd(a); + assert_eq_m256d(r, transmute(a)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_castps256_ps128() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_castps256_ps128(a); + assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_castpd256_pd128() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let r = _mm256_castpd256_pd128(a); + assert_eq_m128d(r, _mm_setr_pd(1., 2.)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_castsi256_si128() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let r = _mm256_castsi256_si128(a); + assert_eq_m128i(r, _mm_setr_epi64x(1, 2)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_castps128_ps256() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let r = _mm256_castps128_ps256(a); + assert_eq_m128(_mm256_castps256_ps128(r), a); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_castpd128_pd256() { + let a = _mm_setr_pd(1., 2.); + let r = _mm256_castpd128_pd256(a); + assert_eq_m128d(_mm256_castpd256_pd128(r), a); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_castsi128_si256() { + let a = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm256_castsi128_si256(a); + assert_eq_m128i(_mm256_castsi256_si128(r), a); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_zextps128_ps256() { 
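+ // `_mm256_zextps128_ps256` copies `a` into the low 128 bits of the result and zeroes the upper 128 bits, which is exactly what the expected vector below spells out.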
+ let a = _mm_setr_ps(1., 2., 3., 4.); + let r = _mm256_zextps128_ps256(a); + let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_zextsi128_si256() { + let a = _mm_setr_epi64x(1, 2); + let r = _mm256_zextsi128_si256(a); + let e = _mm256_setr_epi64x(1, 2, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_zextpd128_pd256() { + let a = _mm_setr_pd(1., 2.); + let r = _mm256_zextpd128_pd256(a); + let e = _mm256_setr_pd(1., 2., 0., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_set_m128() { + let hi = _mm_setr_ps(5., 6., 7., 8.); + let lo = _mm_setr_ps(1., 2., 3., 4.); + let r = _mm256_set_m128(hi, lo); + let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_set_m128d() { + let hi = _mm_setr_pd(3., 4.); + let lo = _mm_setr_pd(1., 2.); + let r = _mm256_set_m128d(hi, lo); + let e = _mm256_setr_pd(1., 2., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_set_m128i() { + #[rustfmt::skip] + let hi = _mm_setr_epi8( + 17, 18, 19, 20, + 21, 22, 23, 24, + 25, 26, 27, 28, + 29, 30, 31, 32, + ); + #[rustfmt::skip] + let lo = _mm_setr_epi8( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + let r = _mm256_set_m128i(hi, lo); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_setr_m128() { + let lo = _mm_setr_ps(1., 2., 3., 4.); + let hi = _mm_setr_ps(5., 6., 7., 8.); + let r = _mm256_setr_m128(lo, hi); + let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_setr_m128d() { + let lo = _mm_setr_pd(1., 2.); + let hi = _mm_setr_pd(3., 4.); + let r = _mm256_setr_m128d(lo, hi); + let e = _mm256_setr_pd(1., 2., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_setr_m128i() { + #[rustfmt::skip] + let lo = _mm_setr_epi8( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + #[rustfmt::skip] + let hi = _mm_setr_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm256_setr_m128i(lo, hi); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_loadu2_m128() { + let hi = &[5., 6., 7., 8.]; + let hiaddr = hi.as_ptr(); + let lo = &[1., 2., 3., 4.]; + let loaddr = lo.as_ptr(); + let r = _mm256_loadu2_m128(hiaddr, loaddr); + let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_loadu2_m128d() { + let hi = &[3., 4.]; + let hiaddr = hi.as_ptr(); + let lo = &[1., 2.]; + let loaddr = lo.as_ptr(); + let r = _mm256_loadu2_m128d(hiaddr, loaddr); + let e = _mm256_setr_pd(1., 2., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_loadu2_m128i() { + #[rustfmt::skip] + let hi = _mm_setr_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + #[rustfmt::skip] + let lo = 
_mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + ); + let r = _mm256_loadu2_m128i(ptr::addr_of!(hi) as *const _, ptr::addr_of!(lo) as *const _); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_storeu2_m128() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let mut hi = _mm_undefined_ps(); + let mut lo = _mm_undefined_ps(); + _mm256_storeu2_m128( + ptr::addr_of_mut!(hi) as *mut f32, + ptr::addr_of_mut!(lo) as *mut f32, + a, + ); + assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.)); + assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_storeu2_m128d() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let mut hi = _mm_undefined_pd(); + let mut lo = _mm_undefined_pd(); + _mm256_storeu2_m128d( + ptr::addr_of_mut!(hi) as *mut f64, + ptr::addr_of_mut!(lo) as *mut f64, + a, + ); + assert_eq_m128d(hi, _mm_setr_pd(3., 4.)); + assert_eq_m128d(lo, _mm_setr_pd(1., 2.)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_storeu2_m128i() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let mut hi = _mm_undefined_si128(); + let mut lo = _mm_undefined_si128(); + _mm256_storeu2_m128i(ptr::addr_of_mut!(hi), ptr::addr_of_mut!(lo), a); + #[rustfmt::skip] + let e_hi = _mm_setr_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32 + ); + #[rustfmt::skip] + let e_lo = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16 + ); + + assert_eq_m128i(hi, e_hi); + assert_eq_m128i(lo, e_lo); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_cvtss_f32() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_cvtss_f32(a); + assert_eq!(r, 1.); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs new file mode 100644 index 0000000000000..21f20f9c759e0 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs @@ -0,0 +1,5897 @@ +//! Advanced Vector Extensions 2 (AVX) +//! +//! AVX2 expands most AVX commands to 256-bit wide vector registers and +//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate). +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +//! System Instructions][amd64_ref]. +//! +//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick +//! overview of the instructions available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions +//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate + +use core::hint::unreachable_unchecked; + +use crate::core_arch::{simd::*, x86::*}; +use crate::intrinsics::simd::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Computes the absolute values of packed 32-bit integers in `a`. 
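+///
+/// A minimal usage sketch (illustrative only, not part of the upstream
+/// documentation); it assumes `std` is available and that AVX2 is detected on
+/// the running CPU:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx2") {
+///         // SAFETY: AVX2 support was just detected at runtime.
+///         unsafe {
+///             let a = _mm256_setr_epi32(-1, 2, -3, 4, -5, 6, -7, 8);
+///             let r = _mm256_abs_epi32(a);
+///             let mut out = [0i32; 8];
+///             _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
+///             assert_eq!(out, [1, 2, 3, 4, 5, 6, 7, 8]);
+///         }
+///     }
+/// }
+/// ```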
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpabsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_abs_epi32(a: __m256i) -> __m256i { + unsafe { + let a = a.as_i32x8(); + let r = simd_select::(simd_lt(a, i32x8::ZERO), simd_neg(a), a); + transmute(r) + } +} + +/// Computes the absolute values of packed 16-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpabsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_abs_epi16(a: __m256i) -> __m256i { + unsafe { + let a = a.as_i16x16(); + let r = simd_select::(simd_lt(a, i16x16::ZERO), simd_neg(a), a); + transmute(r) + } +} + +/// Computes the absolute values of packed 8-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpabsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_abs_epi8(a: __m256i) -> __m256i { + unsafe { + let a = a.as_i8x32(); + let r = simd_select::(simd_lt(a, i8x32::ZERO), simd_neg(a), a); + transmute(r) + } +} + +/// Adds packed 64-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpaddq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_add(a.as_i64x4(), b.as_i64x4())) } +} + +/// Adds packed 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpaddd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_add(a.as_i32x8(), b.as_i32x8())) } +} + +/// Adds packed 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpaddw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_add(a.as_i16x16(), b.as_i16x16())) } +} + +/// Adds packed 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpaddb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_add(a.as_i8x32(), b.as_i8x32())) } +} + +/// Adds packed 8-bit integers in `a` and `b` using saturation. 
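+///
+/// A short sketch of the saturating behaviour (illustrative only, not part of
+/// the upstream documentation); it assumes `std` and a CPU with AVX2:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx2") {
+///         // SAFETY: AVX2 support was just detected at runtime.
+///         unsafe {
+///             // 120 + 20 exceeds i8::MAX, so every lane saturates to 127
+///             // instead of wrapping around.
+///             let r = _mm256_adds_epi8(_mm256_set1_epi8(120), _mm256_set1_epi8(20));
+///             let mut out = [0i8; 32];
+///             _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
+///             assert!(out.iter().all(|&x| x == i8::MAX));
+///         }
+///     }
+/// }
+/// ```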
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpaddsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) } +} + +/// Adds packed 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpaddsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) } +} + +/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpaddusb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) } +} + +/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpaddusw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) } +} + +/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary +/// result, shifts the result right by `n` bytes, and returns the low 16 bytes. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_alignr_epi8(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + // If palignr is shifting the pair of vectors more than the size of two + // lanes, emit zero. + if IMM8 >= 32 { + return _mm256_setzero_si256(); + } + // If palignr is shifting the pair of input vectors more than one lane, + // but less than two lanes, convert to shifting in zeroes. 
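+ // e.g. IMM8 == 20 is equivalent to shifting the pair (zero, a) right by 4 bytes, so after this swap the IMM8 % 16 shuffles below cover it unchanged.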
+ let (a, b) = if IMM8 > 16 { + (_mm256_setzero_si256(), a) + } else { + (a, b) + }; + + let a = a.as_i8x32(); + let b = b.as_i8x32(); + + if IMM8 == 16 { + return transmute(a); + } + + let r: i8x32 = match IMM8 % 16 { + 0 => simd_shuffle!( + b, + a, + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ], + ), + 1 => simd_shuffle!( + b, + a, + [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, + ], + ), + 2 => simd_shuffle!( + b, + a, + [ + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, + ], + ), + 3 => simd_shuffle!( + b, + a, + [ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, + ], + ), + 4 => simd_shuffle!( + b, + a, + [ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, + ], + ), + 5 => simd_shuffle!( + b, + a, + [ + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, + ], + ), + 6 => simd_shuffle!( + b, + a, + [ + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, + ], + ), + 7 => simd_shuffle!( + b, + a, + [ + 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, + ], + ), + 8 => simd_shuffle!( + b, + a, + [ + 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, + 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, + ], + ), + 9 => simd_shuffle!( + b, + a, + [ + 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, + 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, + ], + ), + 10 => simd_shuffle!( + b, + a, + [ + 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, + 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, + ], + ), + 11 => simd_shuffle!( + b, + a, + [ + 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, + 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + ], + ), + 12 => simd_shuffle!( + b, + a, + [ + 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + ], + ), + 13 => simd_shuffle!( + b, + a, + [ + 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + ], + ), + 14 => simd_shuffle!( + b, + a, + [ + 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + ], + ), + 15 => simd_shuffle!( + b, + a, + [ + 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, + ], + ), + _ => unreachable_unchecked(), + }; + transmute(r) + } +} + +/// Computes the bitwise AND of 256 bits (representing integer data) +/// in `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_and(a.as_i64x4(), b.as_i64x4())) } +} + +/// Computes the bitwise NOT of 256 bits (representing integer data) +/// in `a` and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let all_ones = _mm256_set1_epi8(-1); + transmute(simd_and( + simd_xor(a.as_i64x4(), all_ones.as_i64x4()), + b.as_i64x4(), + )) + } +} + +/// Averages packed unsigned 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpavgw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let a = simd_cast::<_, u32x16>(a.as_u16x16()); + let b = simd_cast::<_, u32x16>(b.as_u16x16()); + let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1)); + transmute(simd_cast::<_, u16x16>(r)) + } +} + +/// Averages packed unsigned 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpavgb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let a = simd_cast::<_, u16x32>(a.as_u8x32()); + let b = simd_cast::<_, u16x32>(b.as_u8x32()); + let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1)); + transmute(simd_cast::<_, u8x32>(r)) + } +} + +/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_blend_epi32(a: __m128i, b: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM4, 4); + unsafe { + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let r: i32x4 = simd_shuffle!( + a, + b, + [ + [0, 4, 0, 4][IMM4 as usize & 0b11], + [1, 1, 5, 5][IMM4 as usize & 0b11], + [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11], + [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11], + ], + ); + transmute(r) + } +} + +/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`. 
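+///
+/// Bit `i` of `IMM8` selects result element `i` from `b` when the bit is set
+/// and from `a` when it is clear; e.g. `IMM8 == 0b0000_0101` takes elements 0
+/// and 2 from `b` and the remaining six from `a`.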
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_blend_epi32(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let r: i32x8 = simd_shuffle!( + a, + b, + [ + [0, 8, 0, 8][IMM8 as usize & 0b11], + [1, 1, 9, 9][IMM8 as usize & 0b11], + [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11], + [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11], + [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11], + [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11], + [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11], + [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11], + ], + ); + transmute(r) + } +} + +/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + let a = a.as_i16x16(); + let b = b.as_i16x16(); + + let r: i16x16 = simd_shuffle!( + a, + b, + [ + [0, 16, 0, 16][IMM8 as usize & 0b11], + [1, 1, 17, 17][IMM8 as usize & 0b11], + [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11], + [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11], + [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11], + [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11], + [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11], + [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11], + [8, 24, 8, 24][IMM8 as usize & 0b11], + [9, 9, 25, 25][IMM8 as usize & 0b11], + [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11], + [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11], + [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11], + [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11], + [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11], + [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11], + ], + ); + transmute(r) + } +} + +/// Blends packed 8-bit integers from `a` and `b` using `mask`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpblendvb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { + unsafe { + let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO); + transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32())) + } +} + +/// Broadcasts the low packed 8-bit integer from `a` to all elements of +/// the 128-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { + unsafe { + let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]); + transmute::(ret) + } +} + +/// Broadcasts the low packed 8-bit integer from `a` to all elements of +/// the 256-bit returned value. 
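+///
+/// Only the lowest byte of `a` is read; e.g. broadcasting any vector whose
+/// byte 0 is `5` yields thirty-two lanes of `5`.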
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { + unsafe { + let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]); + transmute::(ret) + } +} + +// N.B., `simd_shuffle4` with integer data types for `a` and `b` is +// often compiled to `vbroadcastss`. +/// Broadcasts the low packed 32-bit integer from `a` to all elements of +/// the 128-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { + unsafe { + let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]); + transmute::(ret) + } +} + +// N.B., `simd_shuffle4`` with integer data types for `a` and `b` is +// often compiled to `vbroadcastss`. +/// Broadcasts the low packed 32-bit integer from `a` to all elements of +/// the 256-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { + unsafe { + let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]); + transmute::(ret) + } +} + +/// Broadcasts the low packed 64-bit integer from `a` to all elements of +/// the 128-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64) +#[inline] +#[target_feature(enable = "avx2")] +// Emits `vmovddup` instead of `vpbroadcastq` +// See https://github.com/rust-lang/stdarch/issues/791 +#[cfg_attr(test, assert_instr(vmovddup))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { + unsafe { + let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]); + transmute::(ret) + } +} + +/// Broadcasts the low packed 64-bit integer from `a` to all elements of +/// the 256-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { + unsafe { + let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]); + transmute::(ret) + } +} + +/// Broadcasts the low double-precision (64-bit) floating-point element +/// from `a` to all elements of the 128-bit returned value. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vmovddup))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d { + unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 2]) } +} + +/// Broadcasts the low double-precision (64-bit) floating-point element +/// from `a` to all elements of the 256-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d { + unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4]) } +} + +/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in +/// the 256-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256) +#[inline] +#[target_feature(enable = "avx2")] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i { + unsafe { + let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]); + transmute::(ret) + } +} + +// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or +// `vbroadcastf128`. +/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in +/// the 256-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256) +#[inline] +#[target_feature(enable = "avx2")] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { + unsafe { + let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]); + transmute::(ret) + } +} + +/// Broadcasts the low single-precision (32-bit) floating-point element +/// from `a` to all elements of the 128-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_broadcastss_ps(a: __m128) -> __m128 { + unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 4]) } +} + +/// Broadcasts the low single-precision (32-bit) floating-point element +/// from `a` to all elements of the 256-bit returned value. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 { + unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 8]) } +} + +/// Broadcasts the low packed 16-bit integer from a to all elements of +/// the 128-bit returned value +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { + unsafe { + let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]); + transmute::(ret) + } +} + +/// Broadcasts the low packed 16-bit integer from a to all elements of +/// the 256-bit returned value +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { + unsafe { + let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]); + transmute::(ret) + } +} + +/// Compares packed 64-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpcmpeqq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute::(simd_eq(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compares packed 32-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpcmpeqd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute::(simd_eq(a.as_i32x8(), b.as_i32x8())) } +} + +/// Compares packed 16-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpcmpeqw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute::(simd_eq(a.as_i16x16(), b.as_i16x16())) } +} + +/// Compares packed 8-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpcmpeqb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute::(simd_eq(a.as_i8x32(), b.as_i8x32())) } +} + +/// Compares packed 64-bit integers in `a` and `b` for greater-than. 
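+///
+/// Each 64-bit result element is all ones (`-1`) where the signed comparison
+/// `a > b` holds and all zeros otherwise, so the result can be used directly
+/// as a blend or select mask.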
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpcmpgtq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute::(simd_gt(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compares packed 32-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpcmpgtd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute::(simd_gt(a.as_i32x8(), b.as_i32x8())) } +} + +/// Compares packed 16-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpcmpgtw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute::(simd_gt(a.as_i16x16(), b.as_i16x16())) } +} + +/// Compares packed 8-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpcmpgtb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute::(simd_gt(a.as_i8x32(), b.as_i8x32())) } +} + +/// Sign-extend 16-bit integers to 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmovsxwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { + unsafe { transmute::(simd_cast(a.as_i16x8())) } +} + +/// Sign-extend 16-bit integers to 64-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmovsxwq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { + unsafe { + let a = a.as_i16x8(); + let v64: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); + transmute::(simd_cast(v64)) + } +} + +/// Sign-extend 32-bit integers to 64-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmovsxdq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { + unsafe { transmute::(simd_cast(a.as_i32x4())) } +} + +/// Sign-extend 8-bit integers to 16-bit integers. 
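+///
+/// All sixteen input bytes are widened with their sign bit replicated, so an
+/// input byte of `-1` becomes the 16-bit value `-1` rather than `255`.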
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmovsxbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { + unsafe { transmute::(simd_cast(a.as_i8x16())) } +} + +/// Sign-extend 8-bit integers to 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmovsxbd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { + unsafe { + let a = a.as_i8x16(); + let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute::(simd_cast(v64)) + } +} + +/// Sign-extend 8-bit integers to 64-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmovsxbq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { + unsafe { + let a = a.as_i8x16(); + let v32: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); + transmute::(simd_cast(v32)) + } +} + +/// Zeroes extend packed unsigned 16-bit integers in `a` to packed 32-bit +/// integers, and stores the results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmovzxwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { + unsafe { transmute::(simd_cast(a.as_u16x8())) } +} + +/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit +/// integers. The upper four elements of `a` are unused. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmovzxwq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { + unsafe { + let a = a.as_u16x8(); + let v64: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); + transmute::(simd_cast(v64)) + } +} + +/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmovzxdq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { + unsafe { transmute::(simd_cast(a.as_u32x4())) } +} + +/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmovzxbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { + unsafe { transmute::(simd_cast(a.as_u8x16())) } +} + +/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit +/// integers. 
The upper eight elements of `a` are unused. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmovzxbd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { + unsafe { + let a = a.as_u8x16(); + let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute::(simd_cast(v64)) + } +} + +/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit +/// integers. The upper twelve elements of `a` are unused. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmovzxbq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { + unsafe { + let a = a.as_u8x16(); + let v32: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); + transmute::(simd_cast(v32)) + } +} + +/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_extracti128_si256(a: __m256i) -> __m128i { + static_assert_uimm_bits!(IMM1, 1); + unsafe { + let a = a.as_i64x4(); + let b = i64x4::ZERO; + let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]); + transmute(dst) + } +} + +/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vphaddw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) } +} + +/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vphaddd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(phaddd(a.as_i32x8(), b.as_i32x8())) } +} + +/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vphaddsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) } +} + +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`. 
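+///
+/// The pairing stays within each 128-bit lane: the low four 16-bit results of
+/// a lane are `a[2*i] - a[2*i+1]` taken from that lane of `a`, and the high
+/// four are the corresponding differences taken from that lane of `b`.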
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vphsubw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(phsubw(a.as_i16x16(), b.as_i16x16())) } +} + +/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vphsubd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(phsubd(a.as_i32x8(), b.as_i32x8())) } +} + +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vphsubsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) } +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_i32gather_epi32( + slice: *const i32, + offsets: __m128i, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + let zero = i32x4::ZERO; + let neg_one = _mm_set1_epi32(-1).as_i32x4(); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; + let r = pgatherdd(zero, slice, offsets, neg_one, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_mask_i32gather_epi32( + src: __m128i, + slice: *const i32, + offsets: __m128i, + mask: __m128i, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + let src = src.as_i32x4(); + let mask = mask.as_i32x4(); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; + let r = pgatherdd(src, slice, offsets, mask, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. 
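+///
+/// A gather sketch (illustrative only, not part of the upstream
+/// documentation); each element `i` of the result is loaded from `slice` at a
+/// byte offset of `offsets[i] * SCALE`. It assumes `std` and a CPU with AVX2:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx2") {
+///         // SAFETY: AVX2 was detected and every gathered element is in bounds.
+///         unsafe {
+///             let data: [i32; 16] = core::array::from_fn(|i| 10 * i as i32);
+///             // SCALE = 4 (size of i32), so offset k reads data[k].
+///             let offsets = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+///             let r = _mm256_i32gather_epi32::<4>(data.as_ptr(), offsets);
+///             let mut out = [0i32; 8];
+///             _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
+///             assert_eq!(out, [0, 20, 40, 60, 80, 100, 120, 140]);
+///         }
+///     }
+/// }
+/// ```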
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_i32gather_epi32( + slice: *const i32, + offsets: __m256i, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + let zero = i32x8::ZERO; + let neg_one = _mm256_set1_epi32(-1).as_i32x8(); + let offsets = offsets.as_i32x8(); + let slice = slice as *const i8; + let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_mask_i32gather_epi32( + src: __m256i, + slice: *const i32, + offsets: __m256i, + mask: __m256i, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + let src = src.as_i32x8(); + let mask = mask.as_i32x8(); + let offsets = offsets.as_i32x8(); + let slice = slice as *const i8; + let r = vpgatherdd(src, slice, offsets, mask, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_ps) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_i32gather_ps(slice: *const f32, offsets: __m128i) -> __m128 { + static_assert_imm8_scale!(SCALE); + let zero = _mm_setzero_ps(); + let neg_one = _mm_set1_ps(-1.0); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; + pgatherdps(zero, slice, offsets, neg_one, SCALE as i8) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_ps) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_mask_i32gather_ps( + src: __m128, + slice: *const f32, + offsets: __m128i, + mask: __m128, +) -> __m128 { + static_assert_imm8_scale!(SCALE); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; + pgatherdps(src, slice, offsets, mask, SCALE as i8) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_ps) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_i32gather_ps(slice: *const f32, offsets: __m256i) -> __m256 { + static_assert_imm8_scale!(SCALE); + let zero = _mm256_setzero_ps(); + let neg_one = _mm256_set1_ps(-1.0); + let offsets = offsets.as_i32x8(); + let slice = slice as *const i8; + vpgatherdps(zero, slice, offsets, neg_one, SCALE as i8) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_ps) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_mask_i32gather_ps( + src: __m256, + slice: *const f32, + offsets: __m256i, + mask: __m256, +) -> __m256 { + static_assert_imm8_scale!(SCALE); + let offsets = offsets.as_i32x8(); + let slice = slice as *const i8; + vpgatherdps(src, slice, offsets, mask, SCALE as i8) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_i32gather_epi64( + slice: *const i64, + offsets: __m128i, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + let zero = i64x2::ZERO; + let neg_one = _mm_set1_epi64x(-1).as_i64x2(); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; + let r = pgatherdq(zero, slice, offsets, neg_one, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_mask_i32gather_epi64( + src: __m128i, + slice: *const i64, + offsets: __m128i, + mask: __m128i, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x2(); + let mask = mask.as_i64x2(); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; + let r = pgatherdq(src, slice, offsets, mask, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_i32gather_epi64( + slice: *const i64, + offsets: __m128i, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + let zero = i64x4::ZERO; + let neg_one = _mm256_set1_epi64x(-1).as_i64x4(); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; + let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_mask_i32gather_epi64( + src: __m256i, + slice: *const i64, + offsets: __m128i, + mask: __m256i, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x4(); + let mask = mask.as_i64x4(); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; + let r = vpgatherdq(src, slice, offsets, mask, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_pd) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_i32gather_pd(slice: *const f64, offsets: __m128i) -> __m128d { + static_assert_imm8_scale!(SCALE); + let zero = _mm_setzero_pd(); + let neg_one = _mm_set1_pd(-1.0); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; + pgatherdpd(zero, slice, offsets, neg_one, SCALE as i8) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_pd) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_mask_i32gather_pd( + src: __m128d, + slice: *const f64, + offsets: __m128i, + mask: __m128d, +) -> __m128d { + static_assert_imm8_scale!(SCALE); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; + pgatherdpd(src, slice, offsets, mask, SCALE as i8) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_pd) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_i32gather_pd( + slice: *const f64, + offsets: __m128i, +) -> __m256d { + static_assert_imm8_scale!(SCALE); + let zero = _mm256_setzero_pd(); + let neg_one = _mm256_set1_pd(-1.0); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; + vpgatherdpd(zero, slice, offsets, neg_one, SCALE as i8) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_pd) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_mask_i32gather_pd( + src: __m256d, + slice: *const f64, + offsets: __m128i, + mask: __m256d, +) -> __m256d { + static_assert_imm8_scale!(SCALE); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; + vpgatherdpd(src, slice, offsets, mask, SCALE as i8) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_i64gather_epi32( + slice: *const i32, + offsets: __m128i, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + let zero = i32x4::ZERO; + let neg_one = _mm_set1_epi64x(-1).as_i32x4(); + let offsets = offsets.as_i64x2(); + let slice = slice as *const i8; + let r = pgatherqd(zero, slice, offsets, neg_one, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_mask_i64gather_epi32( + src: __m128i, + slice: *const i32, + offsets: __m128i, + mask: __m128i, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + let src = src.as_i32x4(); + let mask = mask.as_i32x4(); + let offsets = offsets.as_i64x2(); + let slice = slice as *const i8; + let r = pgatherqd(src, slice, offsets, mask, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. 
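For the masked variants, only lanes whose mask element has its most significant bit set are fetched from memory; the remaining lanes are passed through from `src`. A hypothetical sketch using `_mm256_mask_i32gather_pd` under the same AVX2 assumption (helper name ours):

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx2")]
unsafe fn masked_gather(data: &[f64; 8]) -> [f64; 4] {
    let src = _mm256_set1_pd(-1.0); // fallback for masked-off lanes
    let idx = _mm_setr_epi32(0, 3, 5, 7);
    // The sign bit of each f64 lane acts as the mask: lanes 0 and 2 load,
    // lanes 1 and 3 keep the value from `src`. SCALE = 8 for f64 elements.
    let mask = _mm256_setr_pd(-1.0, 0.0, -1.0, 0.0);
    let v = _mm256_mask_i32gather_pd::<8>(src, data.as_ptr(), idx, mask);
    let mut out = [0.0f64; 4];
    _mm256_storeu_pd(out.as_mut_ptr(), v);
    out
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        let data = [10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0];
        assert_eq!(unsafe { masked_gather(&data) }, [10.0, -1.0, 15.0, -1.0]);
    }
}
```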
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_i64gather_epi32( + slice: *const i32, + offsets: __m256i, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + let zero = i32x4::ZERO; + let neg_one = _mm_set1_epi64x(-1).as_i32x4(); + let offsets = offsets.as_i64x4(); + let slice = slice as *const i8; + let r = vpgatherqd(zero, slice, offsets, neg_one, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_mask_i64gather_epi32( + src: __m128i, + slice: *const i32, + offsets: __m256i, + mask: __m128i, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + let src = src.as_i32x4(); + let mask = mask.as_i32x4(); + let offsets = offsets.as_i64x4(); + let slice = slice as *const i8; + let r = vpgatherqd(src, slice, offsets, mask, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_ps) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_i64gather_ps(slice: *const f32, offsets: __m128i) -> __m128 { + static_assert_imm8_scale!(SCALE); + let zero = _mm_setzero_ps(); + let neg_one = _mm_set1_ps(-1.0); + let offsets = offsets.as_i64x2(); + let slice = slice as *const i8; + pgatherqps(zero, slice, offsets, neg_one, SCALE as i8) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_ps) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_mask_i64gather_ps( + src: __m128, + slice: *const f32, + offsets: __m128i, + mask: __m128, +) -> __m128 { + static_assert_imm8_scale!(SCALE); + let offsets = offsets.as_i64x2(); + let slice = slice as *const i8; + pgatherqps(src, slice, offsets, mask, SCALE as i8) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_ps) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_i64gather_ps(slice: *const f32, offsets: __m256i) -> __m128 { + static_assert_imm8_scale!(SCALE); + let zero = _mm_setzero_ps(); + let neg_one = _mm_set1_ps(-1.0); + let offsets = offsets.as_i64x4(); + let slice = slice as *const i8; + vpgatherqps(zero, slice, offsets, neg_one, SCALE as i8) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_ps) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_mask_i64gather_ps( + src: __m128, + slice: *const f32, + offsets: __m256i, + mask: __m128, +) -> __m128 { + static_assert_imm8_scale!(SCALE); + let offsets = offsets.as_i64x4(); + let slice = slice as *const i8; + vpgatherqps(src, slice, offsets, mask, SCALE as i8) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_i64gather_epi64( + slice: *const i64, + offsets: __m128i, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + let zero = i64x2::ZERO; + let neg_one = _mm_set1_epi64x(-1).as_i64x2(); + let slice = slice as *const i8; + let offsets = offsets.as_i64x2(); + let r = pgatherqq(zero, slice, offsets, neg_one, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_mask_i64gather_epi64( + src: __m128i, + slice: *const i64, + offsets: __m128i, + mask: __m128i, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x2(); + let mask = mask.as_i64x2(); + let offsets = offsets.as_i64x2(); + let slice = slice as *const i8; + let r = pgatherqq(src, slice, offsets, mask, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_i64gather_epi64( + slice: *const i64, + offsets: __m256i, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + let zero = i64x4::ZERO; + let neg_one = _mm256_set1_epi64x(-1).as_i64x4(); + let slice = slice as *const i8; + let offsets = offsets.as_i64x4(); + let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_mask_i64gather_epi64( + src: __m256i, + slice: *const i64, + offsets: __m256i, + mask: __m256i, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x4(); + let mask = mask.as_i64x4(); + let offsets = offsets.as_i64x4(); + let slice = slice as *const i8; + let r = vpgatherqq(src, slice, offsets, mask, SCALE as i8); + transmute(r) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_i64gather_pd(slice: *const f64, offsets: __m128i) -> __m128d { + static_assert_imm8_scale!(SCALE); + let zero = _mm_setzero_pd(); + let neg_one = _mm_set1_pd(-1.0); + let slice = slice as *const i8; + let offsets = offsets.as_i64x2(); + pgatherqpd(zero, slice, offsets, neg_one, SCALE as i8) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_mask_i64gather_pd( + src: __m128d, + slice: *const f64, + offsets: __m128i, + mask: __m128d, +) -> __m128d { + static_assert_imm8_scale!(SCALE); + let slice = slice as *const i8; + let offsets = offsets.as_i64x2(); + pgatherqpd(src, slice, offsets, mask, SCALE as i8) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_i64gather_pd( + slice: *const f64, + offsets: __m256i, +) -> __m256d { + static_assert_imm8_scale!(SCALE); + let zero = _mm256_setzero_pd(); + let neg_one = _mm256_set1_pd(-1.0); + let slice = slice as *const i8; + let offsets = offsets.as_i64x4(); + vpgatherqpd(zero, slice, offsets, neg_one, SCALE as i8) +} + +/// Returns values from `slice` at offsets determined by `offsets * scale`, +/// where +/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in +/// that position instead. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_mask_i64gather_pd( + src: __m256d, + slice: *const f64, + offsets: __m256i, + mask: __m256d, +) -> __m256d { + static_assert_imm8_scale!(SCALE); + let slice = slice as *const i8; + let offsets = offsets.as_i64x4(); + vpgatherqpd(src, slice, offsets, mask, SCALE as i8) +} + +/// Copies `a` to `dst`, then insert 128 bits (of integer data) from `b` at the +/// location specified by `IMM1`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_inserti128_si256(a: __m256i, b: __m128i) -> __m256i { + static_assert_uimm_bits!(IMM1, 1); + unsafe { + let a = a.as_i64x4(); + let b = _mm256_castsi128_si256(b).as_i64x4(); + let dst: i64x4 = simd_shuffle!(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]); + transmute(dst) + } +} + +/// Multiplies packed signed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Horizontally add adjacent pairs +/// of intermediate 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmaddwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) } +} + +/// Vertically multiplies each unsigned 8-bit integer from `a` with the +/// corresponding signed 8-bit integer from `b`, producing intermediate +/// signed 16-bit integers. 
Horizontally add adjacent pairs of intermediate +/// signed 16-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmaddubsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) } +} + +/// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask` +/// (elements are zeroed out when the highest bit is not set in the +/// corresponding element). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmaskmovd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i { + transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4())) +} + +/// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask` +/// (elements are zeroed out when the highest bit is not set in the +/// corresponding element). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmaskmovd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i { + transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8())) +} + +/// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask` +/// (elements are zeroed out when the highest bit is not set in the +/// corresponding element). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmaskmovq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i { + transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2())) +} + +/// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask` +/// (elements are zeroed out when the highest bit is not set in the +/// corresponding element). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmaskmovq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i { + transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4())) +} + +/// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr` +/// using `mask` (elements are not stored when the highest bit is not set +/// in the corresponding element). 
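A typical use of these masked loads, together with the masked stores documented just below, is handling a buffer tail that is shorter than a full vector. A rough sketch with our own helper name, assuming AVX2 is available; per Intel's description, the masked-off lanes are not accessed:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx2")]
unsafe fn double_first_five(src: &[i32; 5], dst: &mut [i32; 5]) {
    // -1 has the top bit set, so the first five lanes are active; the last
    // three lanes are neither loaded nor stored.
    let mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, 0, 0, 0);
    let v = _mm256_maskload_epi32(src.as_ptr(), mask);
    let doubled = _mm256_add_epi32(v, v);
    _mm256_maskstore_epi32(dst.as_mut_ptr(), mask, doubled);
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        let src = [1, 2, 3, 4, 5];
        let mut dst = [0i32; 5];
        unsafe { double_first_five(&src, &mut dst) };
        assert_eq!(dst, [2, 4, 6, 8, 10]);
    }
}
```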
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
+    maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
+}
+
+/// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr`
+/// using `mask` (elements are not stored when the highest bit is not set
+/// in the corresponding element).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
+    maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
+}
+
+/// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
+/// using `mask` (elements are not stored when the highest bit is not set
+/// in the corresponding element).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
+    maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
+}
+
+/// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
+/// using `mask` (elements are not stored when the highest bit is not set
+/// in the corresponding element).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
+    maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
+}
+
+/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
+/// maximum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i16x16();
+        let b = b.as_i16x16();
+        transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
+/// maximum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i32x8();
+        let b = b.as_i32x8();
+        transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
+/// maximum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i8x32();
+        let b = b.as_i8x32();
+        transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
+/// the packed maximum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u16x16();
+        let b = b.as_u16x16();
+        transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
+/// the packed maximum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u32x8();
+        let b = b.as_u32x8();
+        transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
+/// the packed maximum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u8x32();
+        let b = b.as_u8x32();
+        transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
+/// minimum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i16x16();
+        let b = b.as_i16x16();
+        transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
+/// minimum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i32x8();
+        let b = b.as_i32x8();
+        transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
+/// minimum values.
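There is no packed-integer clamp instruction, so the usual idiom combines the max and min intrinsics above. A small sketch clamping signed 16-bit lanes to `0..=255` (helper name ours, AVX2 assumed to be detected at runtime):

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx2")]
unsafe fn clamp_to_u8_range(x: __m256i) -> __m256i {
    // max with 0 raises negative lanes, min with 255 lowers large lanes.
    let lo = _mm256_set1_epi16(0);
    let hi = _mm256_set1_epi16(255);
    _mm256_min_epi16(_mm256_max_epi16(x, lo), hi)
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        unsafe {
            let c = clamp_to_u8_range(_mm256_set1_epi16(-42));
            let mut out = [0i16; 16];
            _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, c);
            assert!(out.iter().all(|&v| v == 0));
        }
    }
}
```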
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i8x32();
+        let b = b.as_i8x32();
+        transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
+/// the packed minimum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u16x16();
+        let b = b.as_u16x16();
+        transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
+/// the packed minimum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u32x8();
+        let b = b.as_u32x8();
+        transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
+/// the packed minimum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u8x32();
+        let b = b.as_u8x32();
+        transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Creates mask from the most significant bit of each 8-bit element in `a`,
+/// return the result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovmskb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_movemask_epi8(a: __m256i) -> i32 {
+    unsafe {
+        let z = i8x32::ZERO;
+        let m: i8x32 = simd_lt(a.as_i8x32(), z);
+        simd_bitmask::<_, u32>(m) as i32
+    }
+}
+
+/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
+/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
+/// results in dst. Eight SADs are performed for each 128-bit lane using one
+/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
+/// selected from `b` starting at the offset specified in `imm8`. Eight
+/// quadruplets are formed from sequential 8-bit integers selected from `a`
+/// starting at the offset specified in `imm8`.
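`_mm256_movemask_epi8` compresses the per-byte comparison results into one 32-bit mask, which is the core of the classic vectorized byte search. A sketch under the same AVX2 assumption (function name ours):

```rust
use std::arch::x86_64::*;

/// Returns the index of the first occurrence of `needle` in a 32-byte block.
#[target_feature(enable = "avx2")]
unsafe fn find_byte(block: &[u8; 32], needle: u8) -> Option<usize> {
    let data = _mm256_loadu_si256(block.as_ptr() as *const __m256i);
    let eq = _mm256_cmpeq_epi8(data, _mm256_set1_epi8(needle as i8));
    // One bit per byte lane; bit i is set when block[i] == needle.
    let bits = _mm256_movemask_epi8(eq) as u32;
    if bits == 0 { None } else { Some(bits.trailing_zeros() as usize) }
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        let mut block = [0u8; 32];
        block[19] = b'x';
        assert_eq!(unsafe { find_byte(&block, b'x') }, Some(19));
    }
}
```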
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe { transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8 as i8)) }
+}
+
+/// Multiplies the low 32-bit integers from each packed 64-bit element in
+/// `a` and `b`
+///
+/// Returns the 64-bit results.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmuldq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
+        let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
+        transmute(simd_mul(a, b))
+    }
+}
+
+/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
+/// element in `a` and `b`
+///
+/// Returns the unsigned 64-bit results.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmuludq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u64x4();
+        let b = b.as_u64x4();
+        let mask = u64x4::splat(u32::MAX.into());
+        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
+    }
+}
+
+/// Multiplies the packed 16-bit integers in `a` and `b`, producing
+/// intermediate 32-bit integers and returning the high 16 bits of the
+/// intermediate integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmulhw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = simd_cast::<_, i32x16>(a.as_i16x16());
+        let b = simd_cast::<_, i32x16>(b.as_i16x16());
+        let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
+        transmute(simd_cast::<i32x16, i16x16>(r))
+    }
+}
+
+/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
+/// intermediate 32-bit integers and returning the high 16 bits of the
+/// intermediate integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = simd_cast::<_, u32x16>(a.as_u16x16());
+        let b = simd_cast::<_, u32x16>(b.as_u16x16());
+        let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
+        transmute(simd_cast::<u32x16, u16x16>(r))
+    }
+}
+
+/// Multiplies the packed 16-bit integers in `a` and `b`, producing
+/// intermediate 32-bit integers, and returns the low 16 bits of the
+/// intermediate integers
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) }
+}
+
+/// Multiplies the packed 32-bit integers in `a` and `b`, producing
+/// intermediate 64-bit integers, and returns the low 32 bits of the
+/// intermediate integers
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmulld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) }
+}
+
+/// Multiplies packed 16-bit integers in `a` and `b`, producing
+/// intermediate signed 32-bit integers. Truncate each intermediate
+/// integer to the 18 most significant bits, round by adding 1, and
+/// return bits `[16:1]`.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpmulhrsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) } +} + +/// Computes the bitwise OR of 256 bits (representing integer data) in `a` +/// and `b` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) } +} + +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using signed saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpacksswb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) } +} + +/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using signed saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpackssdw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) } +} + +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using unsigned saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpackuswb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) } +} + +/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using unsigned saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpackusdw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) } +} + +/// Permutes packed 32-bit integers from `a` according to the content of `b`. +/// +/// The last 3 bits of each integer of `b` are used as addresses into the 8 +/// integers of `a`. 
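`_mm256_mulhrs_epi16` above behaves like a rounding Q15 fixed-point multiply, roughly `(a * b + 0x4000) >> 15` per lane. A short sketch multiplying 0.5 by 0.5 in Q15 (AVX2 assumed to be detected):

```rust
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx2") {
        return;
    }
    unsafe {
        // 0.5 in Q15 is 16384; 0.5 * 0.5 should come out as 0.25, i.e. 8192.
        let half = _mm256_set1_epi16(16384);
        let p = _mm256_mulhrs_epi16(half, half);
        let mut out = [0i16; 16];
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, p);
        assert!(out.iter().all(|&v| v == 8192));
    }
}
```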
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpermps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(permd(a.as_u32x8(), b.as_u32x8())) } +} + +/// Permutes 64-bit integers from `a` using control mask `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + let zero = i64x4::ZERO; + let r: i64x4 = simd_shuffle!( + a.as_i64x4(), + zero, + [ + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], + ); + transmute(r) + } +} + +/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) } +} + +/// Shuffles 64-bit floating-point elements in `a` across lanes using the +/// control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_permute4x64_pd(a: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + simd_shuffle!( + a, + _mm256_undefined_pd(), + [ + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], + ) + } +} + +/// Shuffles eight 32-bit floating-point elements in `a` across lanes using +/// the corresponding 32-bit integer index in `idx`. 
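Unlike the in-lane shuffles further down, `_mm256_permutevar8x32_epi32` moves data across the two 128-bit halves: each destination lane names its source lane in the index vector. A small sketch reversing the eight lanes (AVX2 assumed):

```rust
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx2") {
        return;
    }
    unsafe {
        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        // Destination lane i receives source lane idx[i] (only the low 3 bits count).
        let idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
        let r = _mm256_permutevar8x32_epi32(a, idx);
        let mut out = [0i32; 8];
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
        assert_eq!(out, [7, 6, 5, 4, 3, 2, 1, 0]);
    }
}
```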
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpermps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { + unsafe { permps(a, idx.as_i32x8()) } +} + +/// Computes the absolute differences of packed unsigned 8-bit integers in `a` +/// and `b`, then horizontally sum each consecutive 8 differences to +/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit +/// integers in the low 16 bits of the 64-bit return value +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsadbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(psadbw(a.as_u8x32(), b.as_u8x32())) } +} + +/// Shuffles bytes from `a` according to the content of `b`. +/// +/// For each of the 128-bit low and high halves of the vectors, the last +/// 4 bits of each byte of `b` are used as addresses into the respective +/// low or high 16 bytes of `a`. That is, the halves are shuffled separately. +/// +/// In addition, if the highest significant bit of a byte of `b` is set, the +/// respective destination byte is set to 0. +/// +/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically +/// equivalent to: +/// +/// ``` +/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] { +/// let mut r = [0; 32]; +/// for i in 0..16 { +/// // if the most significant bit of b is set, +/// // then the destination byte is set to 0. +/// if b[i] & 0x80 == 0u8 { +/// r[i] = a[(b[i] % 16) as usize]; +/// } +/// if b[i + 16] & 0x80 == 0u8 { +/// r[i + 16] = a[(b[i + 16] % 16 + 16) as usize]; +/// } +/// } +/// r +/// } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpshufb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(pshufb(a.as_u8x32(), b.as_u8x32())) } +} + +/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in +/// `imm8`. 
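The per-half behaviour of `_mm256_shuffle_epi8` described above is easy to trip over: each 128-bit half only indexes into its own 16 bytes. A quick sketch that broadcasts byte 0 of each half (AVX2 assumed):

```rust
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx2") {
        return;
    }
    unsafe {
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31, 32,
        );
        // An all-zero selector picks byte 0 of the low half for the low 16 bytes
        // and byte 0 of the high half for the high 16 bytes.
        let r = _mm256_shuffle_epi8(a, _mm256_setzero_si256());
        let mut out = [0i8; 32];
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
        assert!(out[..16].iter().all(|&b| b == 1));
        assert!(out[16..].iter().all(|&b| b == 17));
    }
}
```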
+/// +/// ```rust +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("avx2") { +/// # #[target_feature(enable = "avx2")] +/// # unsafe fn worker() { +/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); +/// +/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01); +/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11); +/// +/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4); +/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5); +/// +/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0); +/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0); +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vshufps, MASK = 9))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_shuffle_epi32(a: __m256i) -> __m256i { + static_assert_uimm_bits!(MASK, 8); + unsafe { + let r: i32x8 = simd_shuffle!( + a.as_i32x8(), + a.as_i32x8(), + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + (MASK as u32 >> 4) & 0b11, + (MASK as u32 >> 6) & 0b11, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 4, + ((MASK as u32 >> 6) & 0b11) + 4, + ], + ); + transmute(r) + } +} + +/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using +/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied +/// to the output. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_shufflehi_epi16(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + let a = a.as_i16x16(); + let r: i16x16 = simd_shuffle!( + a, + a, + [ + 0, + 1, + 2, + 3, + 4 + (IMM8 as u32 & 0b11), + 4 + ((IMM8 as u32 >> 2) & 0b11), + 4 + ((IMM8 as u32 >> 4) & 0b11), + 4 + ((IMM8 as u32 >> 6) & 0b11), + 8, + 9, + 10, + 11, + 12 + (IMM8 as u32 & 0b11), + 12 + ((IMM8 as u32 >> 2) & 0b11), + 12 + ((IMM8 as u32 >> 4) & 0b11), + 12 + ((IMM8 as u32 >> 6) & 0b11), + ], + ); + transmute(r) + } +} + +/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using +/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied +/// to the output. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_shufflelo_epi16(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + let a = a.as_i16x16(); + let r: i16x16 = simd_shuffle!( + a, + a, + [ + 0 + (IMM8 as u32 & 0b11), + 0 + ((IMM8 as u32 >> 2) & 0b11), + 0 + ((IMM8 as u32 >> 4) & 0b11), + 0 + ((IMM8 as u32 >> 6) & 0b11), + 4, + 5, + 6, + 7, + 8 + (IMM8 as u32 & 0b11), + 8 + ((IMM8 as u32 >> 2) & 0b11), + 8 + ((IMM8 as u32 >> 4) & 0b11), + 8 + ((IMM8 as u32 >> 6) & 0b11), + 12, + 13, + 14, + 15, + ], + ); + transmute(r) + } +} + +/// Negates packed 16-bit integers in `a` when the corresponding signed +/// 16-bit integer in `b` is negative, and returns the results. +/// Results are zeroed out when the corresponding element in `b` is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsignw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(psignw(a.as_i16x16(), b.as_i16x16())) } +} + +/// Negates packed 32-bit integers in `a` when the corresponding signed +/// 32-bit integer in `b` is negative, and returns the results. +/// Results are zeroed out when the corresponding element in `b` is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsignd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(psignd(a.as_i32x8(), b.as_i32x8())) } +} + +/// Negates packed 8-bit integers in `a` when the corresponding signed +/// 8-bit integer in `b` is negative, and returns the results. +/// Results are zeroed out when the corresponding element in `b` is zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsignb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(psignb(a.as_i8x32(), b.as_i8x32())) } +} + +/// Shifts packed 16-bit integers in `a` left by `count` while +/// shifting in zeros, and returns the result +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsllw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(psllw(a.as_i16x16(), count.as_i16x8())) } +} + +/// Shifts packed 32-bit integers in `a` left by `count` while +/// shifting in zeros, and returns the result +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpslld))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(pslld(a.as_i32x8(), count.as_i32x4())) } +} + +/// Shifts packed 64-bit integers in `a` left by `count` while +/// shifting in zeros, and returns the result +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsllq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(psllq(a.as_i64x4(), count.as_i64x2())) } +} + +/// Shifts packed 16-bit integers in `a` left by `IMM8` while +/// shifting in zeros, return the results; +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_slli_epi16(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + } + } +} + +/// Shifts packed 32-bit integers in `a` left by `IMM8` while +/// shifting in zeros, return the results; +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_slli_epi32(a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + } + } +} + +/// Shifts packed 64-bit integers in `a` left by `IMM8` while +/// shifting in zeros, return the results; +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, 
assert_instr(vpsllq, IMM8 = 7))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_slli_epi64(a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + } + } +} + +/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_slli_si256(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + _mm256_bslli_epi128::(a) +} + +/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + const fn mask(shift: i32, i: u32) -> u32 { + let shift = shift as u32 & 0xff; + if shift > 15 || i % 16 < shift { + 0 + } else { + 32 + (i - shift) + } + } + unsafe { + let a = a.as_i8x32(); + let r: i8x32 = simd_shuffle!( + i8x32::ZERO, + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + mask(IMM8, 16), + mask(IMM8, 17), + mask(IMM8, 18), + mask(IMM8, 19), + mask(IMM8, 20), + mask(IMM8, 21), + mask(IMM8, 22), + mask(IMM8, 23), + mask(IMM8, 24), + mask(IMM8, 25), + mask(IMM8, 26), + mask(IMM8, 27), + mask(IMM8, 28), + mask(IMM8, 29), + mask(IMM8, 30), + mask(IMM8, 31), + ], + ); + transmute(r) + } +} + +/// Shifts packed 32-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsllvd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(psllvd(a.as_i32x4(), count.as_i32x4())) } +} + +/// Shifts packed 32-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsllvd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) } +} + +/// Shifts packed 64-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. 
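The `sllv` intrinsics shift every lane by its own count, which makes them handy for building per-lane bit masks. A short sketch computing `1 << n` for eight different `n` (AVX2 assumed to be detected):

```rust
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx2") {
        return;
    }
    unsafe {
        // Compute 1 << n for eight different n at once.
        let ones = _mm256_set1_epi32(1);
        let counts = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let powers = _mm256_sllv_epi32(ones, counts);
        let mut out = [0i32; 8];
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, powers);
        assert_eq!(out, [1, 2, 4, 8, 16, 32, 64, 128]);
    }
}
```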
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllvq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
+    unsafe { transmute(psllvq(a.as_i64x2(), count.as_i64x2())) }
+}
+
+/// Shifts packed 64-bit integers in `a` left by the amount
+/// specified by the corresponding element in `count` while
+/// shifting in zeros, and returns the result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllvq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
+    unsafe { transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) }
+}
+
+/// Shifts packed 16-bit integers in `a` right by `count` while
+/// shifting in sign bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
+    unsafe { transmute(psraw(a.as_i16x16(), count.as_i16x8())) }
+}
+
+/// Shifts packed 32-bit integers in `a` right by `count` while
+/// shifting in sign bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrad))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
+    unsafe { transmute(psrad(a.as_i32x8(), count.as_i32x4())) }
+}
+
+/// Shifts packed 16-bit integers in `a` right by `IMM8` while
+/// shifting in sign bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe { transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) }
+}
+
+/// Shifts packed 32-bit integers in `a` right by `IMM8` while
+/// shifting in sign bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe { transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) }
+}
+
+/// Shifts packed 32-bit integers in `a` right by the amount specified by the
+/// corresponding element in `count` while shifting in sign bits.
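The arithmetic right shifts (`srai`/`srav`) differ from the logical ones only on negative lanes, where they replicate the sign bit. A small sketch using `_mm256_srai_epi32` (AVX2 assumed to be detected):

```rust
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx2") {
        return;
    }
    unsafe {
        let a = _mm256_setr_epi32(-64, -8, -1, 0, 1, 8, 64, 512);
        // Shifting right by 3 with sign extension divides by 8, rounding toward
        // negative infinity.
        let r = _mm256_srai_epi32::<3>(a);
        let mut out = [0i32; 8];
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
        assert_eq!(out, [-8, -1, -1, 0, 0, 1, 8, 64]);
    }
}
```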
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsravd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(psravd(a.as_i32x4(), count.as_i32x4())) } +} + +/// Shifts packed 32-bit integers in `a` right by the amount specified by the +/// corresponding element in `count` while shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsravd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(psravd256(a.as_i32x8(), count.as_i32x8())) } +} + +/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + _mm256_bsrli_epi128::<IMM8>(a) +} + +/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + let a = a.as_i8x32(); + let zero = i8x32::ZERO; + let r: i8x32 = match IMM8 % 16 { + 0 => simd_shuffle!( + a, + zero, + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ], + ), + 1 => simd_shuffle!( + a, + zero, + [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ], + ), + 2 => simd_shuffle!( + a, + zero, + [ + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 32, + ], + ), + 3 => simd_shuffle!( + a, + zero, + [ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 32, 32, + ], + ), + 4 => simd_shuffle!( + a, + zero, + [ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 32, 32, 32, + ], + ), + 5 => simd_shuffle!( + a, + zero, + [ + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 32, 32, 32, 32, + ], + ), + 6 => simd_shuffle!( + a, + zero, + [ + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, + ], + ), + 7 => simd_shuffle!( + a, + zero, + [ + 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 8 => simd_shuffle!( + a, + zero, + [ + 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32,
24, 25, 26, 27, + 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 9 => simd_shuffle!( + a, + zero, + [ + 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 25, 26, 27, 28, + 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 10 => simd_shuffle!( + a, + zero, + [ + 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 26, 27, 28, 29, + 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 11 => simd_shuffle!( + a, + zero, + [ + 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 27, 28, 29, 30, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 12 => simd_shuffle!( + a, + zero, + [ + 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 28, 29, 30, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 13 => simd_shuffle!( + a, + zero, + [ + 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 29, 30, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 14 => simd_shuffle!( + a, + zero, + [ + 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 30, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 15 => simd_shuffle!( + a, + zero, + [ + 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + _ => zero, + }; + transmute(r) + } +} + +/// Shifts packed 16-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsrlw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(psrlw(a.as_i16x16(), count.as_i16x8())) } +} + +/// Shifts packed 32-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsrld))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(psrld(a.as_i32x8(), count.as_i32x4())) } +} + +/// Shifts packed 64-bit integers in `a` right by `count` while shifting in +/// zeros. 
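+/// +/// A small usage sketch, following the doc-example style used elsewhere in this module (illustrative only): the low 64 bits of `count` hold a single shift amount that is applied to every lane. +/// +/// ```rust +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("avx2") { +/// # #[target_feature(enable = "avx2")] +/// # unsafe fn worker() { +/// let a = _mm256_set1_epi64x(0xFF); +/// let count = _mm_setr_epi64x(4, 0); +/// +/// let r = _mm256_srl_epi64(a, count); +/// +/// // 0xFF >> 4 == 0xF in every 64-bit lane +/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_set1_epi64x(0xF))), !0); +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ```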
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsrlq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(psrlq(a.as_i64x4(), count.as_i64x2())) } +} + +/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in +/// zeros +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + } + } +} + +/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in +/// zeros +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + } + } +} + +/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in +/// zeros +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + } + } +} + +/// Shifts packed 32-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) } +} + +/// Shifts packed 32-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) } +} + +/// Shifts packed 64-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsrlvq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) } +} + +/// Shifts packed 64-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsrlvq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) } +} + +/// Load 256-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr +/// must be aligned on a 32-byte boundary or a general-protection exception may be generated. To +/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_load_si256) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vmovntdqa))] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i { + let dst: __m256i; + crate::arch::asm!( + vpl!("vmovntdqa {a}"), + a = out(ymm_reg) dst, + p = in(reg) mem_addr, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsubw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) } +} + +/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsubd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) } +} + +/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsubq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) } +} + +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsubb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn 
_mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) } +} + +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in +/// `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsubsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) } +} + +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in +/// `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsubsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) } +} + +/// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsubusw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) } +} + +/// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsubusb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) } +} + +/// Unpacks and interleave 8-bit integers from the high half of each +/// 128-bit lane in `a` and `b`. 
+/// +/// ```rust +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("avx2") { +/// # #[target_feature(enable = "avx2")] +/// # unsafe fn worker() { +/// let a = _mm256_setr_epi8( +/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, +/// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +/// ); +/// let b = _mm256_setr_epi8( +/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, +/// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, +/// -30, -31, +/// ); +/// +/// let c = _mm256_unpackhi_epi8(a, b); +/// +/// let expected = _mm256_setr_epi8( +/// 8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15, +/// 24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31, +/// -31, +/// ); +/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); +/// +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpunpckhbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [ + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47, + 24, 56, 25, 57, 26, 58, 27, 59, + 28, 60, 29, 61, 30, 62, 31, 63, + ]); + transmute(r) + } +} + +/// Unpacks and interleave 8-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// +/// ```rust +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("avx2") { +/// # #[target_feature(enable = "avx2")] +/// # unsafe fn worker() { +/// let a = _mm256_setr_epi8( +/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, +/// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +/// ); +/// let b = _mm256_setr_epi8( +/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, +/// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, +/// -30, -31, +/// ); +/// +/// let c = _mm256_unpacklo_epi8(a, b); +/// +/// let expected = _mm256_setr_epi8( +/// 0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17, +/// -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23, +/// ); +/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); +/// +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 16, 48, 17, 49, 18, 50, 19, 51, + 20, 52, 21, 53, 22, 54, 23, 55, + ]); + transmute(r) + } +} + +/// Unpacks and interleave 16-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. 
+/// +/// ```rust +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("avx2") { +/// # #[target_feature(enable = "avx2")] +/// # unsafe fn worker() { +/// let a = _mm256_setr_epi16( +/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +/// ); +/// let b = _mm256_setr_epi16( +/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, +/// ); +/// +/// let c = _mm256_unpackhi_epi16(a, b); +/// +/// let expected = _mm256_setr_epi16( +/// 4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15, +/// ); +/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); +/// +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpunpckhwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r: i16x16 = simd_shuffle!( + a.as_i16x16(), + b.as_i16x16(), + [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], + ); + transmute(r) + } +} + +/// Unpacks and interleave 16-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// +/// ```rust +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("avx2") { +/// # #[target_feature(enable = "avx2")] +/// # unsafe fn worker() { +/// +/// let a = _mm256_setr_epi16( +/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +/// ); +/// let b = _mm256_setr_epi16( +/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, +/// ); +/// +/// let c = _mm256_unpacklo_epi16(a, b); +/// +/// let expected = _mm256_setr_epi16( +/// 0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11, +/// ); +/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); +/// +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpunpcklwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r: i16x16 = simd_shuffle!( + a.as_i16x16(), + b.as_i16x16(), + [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], + ); + transmute(r) + } +} + +/// Unpacks and interleave 32-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. 
+/// +/// ```rust +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("avx2") { +/// # #[target_feature(enable = "avx2")] +/// # unsafe fn worker() { +/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); +/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7); +/// +/// let c = _mm256_unpackhi_epi32(a, b); +/// +/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7); +/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); +/// +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vunpckhps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]); + transmute(r) + } +} + +/// Unpacks and interleave 32-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// +/// ```rust +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("avx2") { +/// # #[target_feature(enable = "avx2")] +/// # unsafe fn worker() { +/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); +/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7); +/// +/// let c = _mm256_unpacklo_epi32(a, b); +/// +/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5); +/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); +/// +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vunpcklps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]); + transmute(r) + } +} + +/// Unpacks and interleave 64-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. 
+/// +/// ```rust +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("avx2") { +/// # #[target_feature(enable = "avx2")] +/// # unsafe fn worker() { +/// let a = _mm256_setr_epi64x(0, 1, 2, 3); +/// let b = _mm256_setr_epi64x(0, -1, -2, -3); +/// +/// let c = _mm256_unpackhi_epi64(a, b); +/// +/// let expected = _mm256_setr_epi64x(1, -1, 3, -3); +/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); +/// +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]); + transmute(r) + } +} + +/// Unpacks and interleave 64-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// +/// ```rust +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("avx2") { +/// # #[target_feature(enable = "avx2")] +/// # unsafe fn worker() { +/// let a = _mm256_setr_epi64x(0, 1, 2, 3); +/// let b = _mm256_setr_epi64x(0, -1, -2, -3); +/// +/// let c = _mm256_unpacklo_epi64(a, b); +/// +/// let expected = _mm256_setr_epi64x(0, 0, 2, -2); +/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); +/// +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]); + transmute(r) + } +} + +/// Computes the bitwise XOR of 256 bits (representing integer data) +/// in `a` and `b` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) } +} + +/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit +/// integer containing the zero-extended integer data. +/// +/// See [LLVM commit D20468](https://reviews.llvm.org/D20468). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8) +#[inline] +#[target_feature(enable = "avx2")] +// This intrinsic has no corresponding instruction. +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 { + static_assert_uimm_bits!(INDEX, 5); + unsafe { simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32 } +} + +/// Extracts a 16-bit integer from `a`, selected with `INDEX`.
Returns a 32-bit +/// integer containing the zero-extended integer data. +/// +/// See [LLVM commit D20468](https://reviews.llvm.org/D20468). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16) +#[inline] +#[target_feature(enable = "avx2")] +// This intrinsic has no corresponding instruction. +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 { + static_assert_uimm_bits!(INDEX, 4); + unsafe { simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32 } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx2.phadd.w"] + fn phaddw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.phadd.d"] + fn phaddd(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.phadd.sw"] + fn phaddsw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.phsub.w"] + fn phsubw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.phsub.d"] + fn phsubd(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.phsub.sw"] + fn phsubsw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.pmadd.wd"] + fn pmaddwd(a: i16x16, b: i16x16) -> i32x8; + #[link_name = "llvm.x86.avx2.pmadd.ub.sw"] + fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16; + #[link_name = "llvm.x86.avx2.maskload.d"] + fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.maskload.d.256"] + fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.maskload.q"] + fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2; + #[link_name = "llvm.x86.avx2.maskload.q.256"] + fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4; + #[link_name = "llvm.x86.avx2.maskstore.d"] + fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4); + #[link_name = "llvm.x86.avx2.maskstore.d.256"] + fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8); + #[link_name = "llvm.x86.avx2.maskstore.q"] + fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2); + #[link_name = "llvm.x86.avx2.maskstore.q.256"] + fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4); + #[link_name = "llvm.x86.avx2.mpsadbw"] + fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16; + #[link_name = "llvm.x86.avx2.pmul.hr.sw"] + fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.packsswb"] + fn packsswb(a: i16x16, b: i16x16) -> i8x32; + #[link_name = "llvm.x86.avx2.packssdw"] + fn packssdw(a: i32x8, b: i32x8) -> i16x16; + #[link_name = "llvm.x86.avx2.packuswb"] + fn packuswb(a: i16x16, b: i16x16) -> u8x32; + #[link_name = "llvm.x86.avx2.packusdw"] + fn packusdw(a: i32x8, b: i32x8) -> u16x16; + #[link_name = "llvm.x86.avx2.psad.bw"] + fn psadbw(a: u8x32, b: u8x32) -> u64x4; + #[link_name = "llvm.x86.avx2.psign.b"] + fn psignb(a: i8x32, b: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx2.psign.w"] + fn psignw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.psign.d"] + fn psignd(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.psll.w"] + fn psllw(a: i16x16, count: i16x8) -> i16x16; + #[link_name = "llvm.x86.avx2.psll.d"] + fn pslld(a: i32x8, count: i32x4) -> i32x8; + #[link_name = "llvm.x86.avx2.psll.q"] + fn psllq(a: i64x4, count: i64x2) -> i64x4; + #[link_name = "llvm.x86.avx2.psllv.d"] + fn psllvd(a: i32x4, count: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.psllv.d.256"] + fn psllvd256(a: i32x8, count: i32x8) -> i32x8; + #[link_name =
"llvm.x86.avx2.psllv.q"] + fn psllvq(a: i64x2, count: i64x2) -> i64x2; + #[link_name = "llvm.x86.avx2.psllv.q.256"] + fn psllvq256(a: i64x4, count: i64x4) -> i64x4; + #[link_name = "llvm.x86.avx2.psra.w"] + fn psraw(a: i16x16, count: i16x8) -> i16x16; + #[link_name = "llvm.x86.avx2.psra.d"] + fn psrad(a: i32x8, count: i32x4) -> i32x8; + #[link_name = "llvm.x86.avx2.psrav.d"] + fn psravd(a: i32x4, count: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.psrav.d.256"] + fn psravd256(a: i32x8, count: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.psrl.w"] + fn psrlw(a: i16x16, count: i16x8) -> i16x16; + #[link_name = "llvm.x86.avx2.psrl.d"] + fn psrld(a: i32x8, count: i32x4) -> i32x8; + #[link_name = "llvm.x86.avx2.psrl.q"] + fn psrlq(a: i64x4, count: i64x2) -> i64x4; + #[link_name = "llvm.x86.avx2.psrlv.d"] + fn psrlvd(a: i32x4, count: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.psrlv.d.256"] + fn psrlvd256(a: i32x8, count: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.psrlv.q"] + fn psrlvq(a: i64x2, count: i64x2) -> i64x2; + #[link_name = "llvm.x86.avx2.psrlv.q.256"] + fn psrlvq256(a: i64x4, count: i64x4) -> i64x4; + #[link_name = "llvm.x86.avx2.pshuf.b"] + fn pshufb(a: u8x32, b: u8x32) -> u8x32; + #[link_name = "llvm.x86.avx2.permd"] + fn permd(a: u32x8, b: u32x8) -> u32x8; + #[link_name = "llvm.x86.avx2.permps"] + fn permps(a: __m256, b: i32x8) -> __m256; + #[link_name = "llvm.x86.avx2.vperm2i128"] + fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4; + #[link_name = "llvm.x86.avx2.gather.d.d"] + fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4; + #[link_name = "llvm.x86.avx2.gather.d.d.256"] + fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8; + #[link_name = "llvm.x86.avx2.gather.d.q"] + fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2; + #[link_name = "llvm.x86.avx2.gather.d.q.256"] + fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4; + #[link_name = "llvm.x86.avx2.gather.q.d"] + fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4; + #[link_name = "llvm.x86.avx2.gather.q.d.256"] + fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4; + #[link_name = "llvm.x86.avx2.gather.q.q"] + fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2; + #[link_name = "llvm.x86.avx2.gather.q.q.256"] + fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4; + #[link_name = "llvm.x86.avx2.gather.d.pd"] + fn pgatherdpd( + src: __m128d, + slice: *const i8, + offsets: i32x4, + mask: __m128d, + scale: i8, + ) -> __m128d; + #[link_name = "llvm.x86.avx2.gather.d.pd.256"] + fn vpgatherdpd( + src: __m256d, + slice: *const i8, + offsets: i32x4, + mask: __m256d, + scale: i8, + ) -> __m256d; + #[link_name = "llvm.x86.avx2.gather.q.pd"] + fn pgatherqpd( + src: __m128d, + slice: *const i8, + offsets: i64x2, + mask: __m128d, + scale: i8, + ) -> __m128d; + #[link_name = "llvm.x86.avx2.gather.q.pd.256"] + fn vpgatherqpd( + src: __m256d, + slice: *const i8, + offsets: i64x4, + mask: __m256d, + scale: i8, + ) -> __m256d; + #[link_name = "llvm.x86.avx2.gather.d.ps"] + fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8) + -> __m128; + #[link_name = "llvm.x86.avx2.gather.d.ps.256"] + fn vpgatherdps( + src: __m256, + slice: *const i8, + offsets: i32x8, + mask: 
__m256, + scale: i8, + ) -> __m256; + #[link_name = "llvm.x86.avx2.gather.q.ps"] + fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8) + -> __m128; + #[link_name = "llvm.x86.avx2.gather.q.ps.256"] + fn vpgatherqps( + src: __m128, + slice: *const i8, + offsets: i64x4, + mask: __m128, + scale: i8, + ) -> __m128; +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_abs_epi32() { + #[rustfmt::skip] + let a = _mm256_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let r = _mm256_abs_epi32(a); + #[rustfmt::skip] + let e = _mm256_setr_epi32( + 0, 1, 1, i32::MAX, + i32::MAX.wrapping_add(1), 100, 100, 32, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_abs_epi16() { + #[rustfmt::skip] + let a = _mm256_setr_epi16( + 0, 1, -1, 2, -2, 3, -3, 4, + -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32, + ); + let r = _mm256_abs_epi16(a); + #[rustfmt::skip] + let e = _mm256_setr_epi16( + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_abs_epi8() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 0, 1, -1, 2, -2, 3, -3, 4, + -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32, + 0, 1, -1, 2, -2, 3, -3, 4, + -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32, + ); + let r = _mm256_abs_epi8(a); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32, + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_add_epi64() { + let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000); + let b = _mm256_setr_epi64x(-1, 0, 1, 2); + let r = _mm256_add_epi64(a, b); + let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_add_epi32() { + let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6); + let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_add_epi32(a, b); + let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_add_epi16() { + #[rustfmt::skip] + let a = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + #[rustfmt::skip] + let b = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + let r = _mm256_add_epi16(a, b); + #[rustfmt::skip] + let e = _mm256_setr_epi16( + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_add_epi8() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + #[rustfmt::skip] + let b = _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let r = _mm256_add_epi8(a, b); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_adds_epi8() 
{ + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + #[rustfmt::skip] + let b = _mm256_setr_epi8( + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + ); + let r = _mm256_adds_epi8(a, b); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, + 80, 82, 84, 86, 88, 90, 92, 94, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_adds_epi8_saturate_positive() { + let a = _mm256_set1_epi8(0x7F); + let b = _mm256_set1_epi8(1); + let r = _mm256_adds_epi8(a, b); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_adds_epi8_saturate_negative() { + let a = _mm256_set1_epi8(-0x80); + let b = _mm256_set1_epi8(-1); + let r = _mm256_adds_epi8(a, b); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_adds_epi16() { + #[rustfmt::skip] + let a = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + #[rustfmt::skip] + let b = _mm256_setr_epi16( + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + ); + let r = _mm256_adds_epi16(a, b); + #[rustfmt::skip] + let e = _mm256_setr_epi16( + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + ); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_adds_epi16_saturate_positive() { + let a = _mm256_set1_epi16(0x7FFF); + let b = _mm256_set1_epi16(1); + let r = _mm256_adds_epi16(a, b); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_adds_epi16_saturate_negative() { + let a = _mm256_set1_epi16(-0x8000); + let b = _mm256_set1_epi16(-1); + let r = _mm256_adds_epi16(a, b); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_adds_epu8() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + #[rustfmt::skip] + let b = _mm256_setr_epi8( + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + ); + let r = _mm256_adds_epu8(a, b); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, + 80, 82, 84, 86, 88, 90, 92, 94, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_adds_epu8_saturate() { + let a = _mm256_set1_epi8(!0); + let b = _mm256_set1_epi8(1); + let r = _mm256_adds_epu8(a, b); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_adds_epu16() { + #[rustfmt::skip] + let a = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + #[rustfmt::skip] + let b = _mm256_setr_epi16( + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + ); + let r = _mm256_adds_epu16(a, b); + #[rustfmt::skip] + let e = _mm256_setr_epi16( + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + ); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_adds_epu16_saturate() { + let a = _mm256_set1_epi16(!0); + let b = _mm256_set1_epi16(1); + let r = 
_mm256_adds_epu16(a, b); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_and_si256() { + let a = _mm256_set1_epi8(5); + let b = _mm256_set1_epi8(3); + let got = _mm256_and_si256(a, b); + assert_eq_m256i(got, _mm256_set1_epi8(1)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_andnot_si256() { + let a = _mm256_set1_epi8(5); + let b = _mm256_set1_epi8(3); + let got = _mm256_andnot_si256(a, b); + assert_eq_m256i(got, _mm256_set1_epi8(2)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_avg_epu8() { + let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9)); + let r = _mm256_avg_epu8(a, b); + assert_eq_m256i(r, _mm256_set1_epi8(6)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_avg_epu16() { + let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9)); + let r = _mm256_avg_epu16(a, b); + assert_eq_m256i(r, _mm256_set1_epi16(6)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_blend_epi32() { + let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9)); + let e = _mm_setr_epi32(9, 3, 3, 3); + let r = _mm_blend_epi32::<0x01>(a, b); + assert_eq_m128i(r, e); + + let r = _mm_blend_epi32::<0x0E>(b, a); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_blend_epi32() { + let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9)); + let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3); + let r = _mm256_blend_epi32::<0x01>(a, b); + assert_eq_m256i(r, e); + + let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9); + let r = _mm256_blend_epi32::<0x82>(a, b); + assert_eq_m256i(r, e); + + let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3); + let r = _mm256_blend_epi32::<0x7C>(a, b); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_blend_epi16() { + let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9)); + let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3); + let r = _mm256_blend_epi16::<0x01>(a, b); + assert_eq_m256i(r, e); + + let r = _mm256_blend_epi16::<0xFE>(b, a); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_blendv_epi8() { + let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2)); + let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1); + let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2); + let r = _mm256_blendv_epi8(a, b, mask); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_broadcastb_epi8() { + let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a); + let res = _mm_broadcastb_epi8(a); + assert_eq_m128i(res, _mm_set1_epi8(0x2a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_broadcastb_epi8() { + let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a); + let res = _mm256_broadcastb_epi8(a); + assert_eq_m256i(res, _mm256_set1_epi8(0x2a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_broadcastd_epi32() { + let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0); + let res = _mm_broadcastd_epi32(a); + assert_eq_m128i(res, _mm_set1_epi32(0x2a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_broadcastd_epi32() { + let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0); + let res = _mm256_broadcastd_epi32(a); + assert_eq_m256i(res, _mm256_set1_epi32(0x2a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_broadcastq_epi64() { + let a = _mm_setr_epi64x(0x1ffffffff, 0); + let res = _mm_broadcastq_epi64(a); + assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff)); + } + + #[simd_test(enable = "avx2")] + unsafe fn 
test_mm256_broadcastq_epi64() { + let a = _mm_setr_epi64x(0x1ffffffff, 0); + let res = _mm256_broadcastq_epi64(a); + assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_broadcastsd_pd() { + let a = _mm_setr_pd(6.88, 3.44); + let res = _mm_broadcastsd_pd(a); + assert_eq_m128d(res, _mm_set1_pd(6.88)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_broadcastsd_pd() { + let a = _mm_setr_pd(6.88, 3.44); + let res = _mm256_broadcastsd_pd(a); + assert_eq_m256d(res, _mm256_set1_pd(6.88f64)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_broadcastsi128_si256() { + let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210); + let res = _mm_broadcastsi128_si256(a); + let retval = _mm256_setr_epi64x( + 0x0987654321012334, + 0x5678909876543210, + 0x0987654321012334, + 0x5678909876543210, + ); + assert_eq_m256i(res, retval); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_broadcastsi128_si256() { + let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210); + let res = _mm256_broadcastsi128_si256(a); + let retval = _mm256_setr_epi64x( + 0x0987654321012334, + 0x5678909876543210, + 0x0987654321012334, + 0x5678909876543210, + ); + assert_eq_m256i(res, retval); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_broadcastss_ps() { + let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0); + let res = _mm_broadcastss_ps(a); + assert_eq_m128(res, _mm_set1_ps(6.88)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_broadcastss_ps() { + let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0); + let res = _mm256_broadcastss_ps(a); + assert_eq_m256(res, _mm256_set1_ps(6.88)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_broadcastw_epi16() { + let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b); + let res = _mm_broadcastw_epi16(a); + assert_eq_m128i(res, _mm_set1_epi16(0x22b)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_broadcastw_epi16() { + let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b); + let res = _mm256_broadcastw_epi16(a); + assert_eq_m256i(res, _mm256_set1_epi16(0x22b)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cmpeq_epi8() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + #[rustfmt::skip] + let b = _mm256_setr_epi8( + 31, 30, 2, 28, 27, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0, + ); + let r = _mm256_cmpeq_epi8(a, b); + assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cmpeq_epi16() { + #[rustfmt::skip] + let a = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + #[rustfmt::skip] + let b = _mm256_setr_epi16( + 15, 14, 2, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0, + ); + let r = _mm256_cmpeq_epi16(a, b); + assert_eq_m256i(r, _mm256_insert_epi16::<2>(_mm256_set1_epi16(0), !0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cmpeq_epi32() { + let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0); + let r = _mm256_cmpeq_epi32(a, b); + let e = _mm256_set1_epi32(0); + let e = _mm256_insert_epi32::<2>(e, !0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cmpeq_epi64() { + let a = _mm256_setr_epi64x(0, 1, 2, 3); + let b = _mm256_setr_epi64x(3, 
2, 2, 0); + let r = _mm256_cmpeq_epi64(a, b); + assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cmpgt_epi8() { + let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5); + let b = _mm256_set1_epi8(0); + let r = _mm256_cmpgt_epi8(a, b); + assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cmpgt_epi16() { + let a = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 5); + let b = _mm256_set1_epi16(0); + let r = _mm256_cmpgt_epi16(a, b); + assert_eq_m256i(r, _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), !0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cmpgt_epi32() { + let a = _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), 5); + let b = _mm256_set1_epi32(0); + let r = _mm256_cmpgt_epi32(a, b); + assert_eq_m256i(r, _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), !0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cmpgt_epi64() { + let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5); + let b = _mm256_set1_epi64x(0); + let r = _mm256_cmpgt_epi64(a, b); + assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cvtepi8_epi16() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 0, 0, -1, 1, -2, 2, -3, 3, + -4, 4, -5, 5, -6, 6, -7, 7, + ); + #[rustfmt::skip] + let r = _mm256_setr_epi16( + 0, 0, -1, 1, -2, 2, -3, 3, + -4, 4, -5, 5, -6, 6, -7, 7, + ); + assert_eq_m256i(r, _mm256_cvtepi8_epi16(a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cvtepi8_epi32() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 0, 0, -1, 1, -2, 2, -3, 3, + -4, 4, -5, 5, -6, 6, -7, 7, + ); + let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3); + assert_eq_m256i(r, _mm256_cvtepi8_epi32(a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cvtepi8_epi64() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 0, 0, -1, 1, -2, 2, -3, 3, + -4, 4, -5, 5, -6, 6, -7, 7, + ); + let r = _mm256_setr_epi64x(0, 0, -1, 1); + assert_eq_m256i(r, _mm256_cvtepi8_epi64(a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cvtepi16_epi32() { + let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3); + let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3); + assert_eq_m256i(r, _mm256_cvtepi16_epi32(a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cvtepi16_epi64() { + let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3); + let r = _mm256_setr_epi64x(0, 0, -1, 1); + assert_eq_m256i(r, _mm256_cvtepi16_epi64(a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cvtepi32_epi64() { + let a = _mm_setr_epi32(0, 0, -1, 1); + let r = _mm256_setr_epi64x(0, 0, -1, 1); + assert_eq_m256i(r, _mm256_cvtepi32_epi64(a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cvtepu16_epi32() { + let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m256i(r, _mm256_cvtepu16_epi32(a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cvtepu16_epi64() { + let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_setr_epi64x(0, 1, 2, 3); + assert_eq_m256i(r, _mm256_cvtepu16_epi64(a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cvtepu32_epi64() { + let a = _mm_setr_epi32(0, 1, 2, 3); + let r = _mm256_setr_epi64x(0, 1, 2, 3); + assert_eq_m256i(r, _mm256_cvtepu32_epi64(a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn 
test_mm256_cvtepu8_epi16() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + #[rustfmt::skip] + let r = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + assert_eq_m256i(r, _mm256_cvtepu8_epi16(a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cvtepu8_epi32() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m256i(r, _mm256_cvtepu8_epi32(a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_cvtepu8_epi64() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + let r = _mm256_setr_epi64x(0, 1, 2, 3); + assert_eq_m256i(r, _mm256_cvtepu8_epi64(a)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_extracti128_si256() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let r = _mm256_extracti128_si256::<1>(a); + let e = _mm_setr_epi64x(3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_hadd_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_hadd_epi16(a, b); + let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_hadd_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_hadd_epi32(a, b); + let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_hadds_epi16() { + let a = _mm256_set1_epi16(2); + let a = _mm256_insert_epi16::<0>(a, 0x7fff); + let a = _mm256_insert_epi16::<1>(a, 1); + let b = _mm256_set1_epi16(4); + let r = _mm256_hadds_epi16(a, b); + #[rustfmt::skip] + let e = _mm256_setr_epi16( + 0x7FFF, 4, 4, 4, 8, 8, 8, 8, + 4, 4, 4, 4, 8, 8, 8, 8, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_hsub_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_hsub_epi16(a, b); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_hsub_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_hsub_epi32(a, b); + let e = _mm256_set1_epi32(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_hsubs_epi16() { + let a = _mm256_set1_epi16(2); + let a = _mm256_insert_epi16::<0>(a, 0x7fff); + let a = _mm256_insert_epi16::<1>(a, -1); + let b = _mm256_set1_epi16(4); + let r = _mm256_hsubs_epi16(a, b); + let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_madd_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_madd_epi16(a, b); + let e = _mm256_set1_epi32(16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_inserti128_si256() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let b = _mm_setr_epi64x(7, 8); + let r = _mm256_inserti128_si256::<1>(a, b); + let e = _mm256_setr_epi64x(1, 2, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_maddubs_epi16() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_maddubs_epi16(a, b); + let e = _mm256_set1_epi16(16); + 
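// vpmaddubsw multiplies unsigned bytes of `a` by signed bytes of `b` and adds adjacent pairs, so every 16-bit lane is 2 * 4 + 2 * 4 = 16 +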
assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_maskload_epi32() { + let nums = [1, 2, 3, 4]; + let a = &nums as *const i32; + let mask = _mm_setr_epi32(-1, 0, 0, -1); + let r = _mm_maskload_epi32(a, mask); + let e = _mm_setr_epi32(1, 0, 0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_maskload_epi32() { + let nums = [1, 2, 3, 4, 5, 6, 7, 8]; + let a = &nums as *const i32; + let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0); + let r = _mm256_maskload_epi32(a, mask); + let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_maskload_epi64() { + let nums = [1_i64, 2_i64]; + let a = &nums as *const i64; + let mask = _mm_setr_epi64x(0, -1); + let r = _mm_maskload_epi64(a, mask); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_maskload_epi64() { + let nums = [1_i64, 2_i64, 3_i64, 4_i64]; + let a = &nums as *const i64; + let mask = _mm256_setr_epi64x(0, -1, -1, 0); + let r = _mm256_maskload_epi64(a, mask); + let e = _mm256_setr_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_maskstore_epi32() { + let a = _mm_setr_epi32(1, 2, 3, 4); + let mut arr = [-1, -1, -1, -1]; + let mask = _mm_setr_epi32(-1, 0, 0, -1); + _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a); + let e = [1, -1, -1, 4]; + assert_eq!(arr, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_maskstore_epi32() { + let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8); + let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1]; + let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0); + _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a); + let e = [1, -1, -1, 42, -1, 6, 7, -1]; + assert_eq!(arr, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_maskstore_epi64() { + let a = _mm_setr_epi64x(1_i64, 2_i64); + let mut arr = [-1_i64, -1_i64]; + let mask = _mm_setr_epi64x(0, -1); + _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a); + let e = [-1, 2]; + assert_eq!(arr, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_maskstore_epi64() { + let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64); + let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64]; + let mask = _mm256_setr_epi64x(0, -1, -1, 0); + _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a); + let e = [-1, 2, 3, -1]; + assert_eq!(arr, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_max_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_max_epi16(a, b); + assert_eq_m256i(r, b); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_max_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_max_epi32(a, b); + assert_eq_m256i(r, b); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_max_epi8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_max_epi8(a, b); + assert_eq_m256i(r, b); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_max_epu16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_max_epu16(a, b); + assert_eq_m256i(r, b); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_max_epu32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_max_epu32(a, b); + assert_eq_m256i(r, b); + } + + #[simd_test(enable = "avx2")] + unsafe fn 
test_mm256_max_epu8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_max_epu8(a, b); + assert_eq_m256i(r, b); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_min_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_min_epi16(a, b); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_min_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_min_epi32(a, b); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_min_epi8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_min_epi8(a, b); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_min_epu16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_min_epu16(a, b); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_min_epu32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_min_epu32(a, b); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_min_epu8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_min_epu8(a, b); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_movemask_epi8() { + let a = _mm256_set1_epi8(-1); + let r = _mm256_movemask_epi8(a); + let e = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mpsadbw_epu8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_mpsadbw_epu8::<0>(a, b); + let e = _mm256_set1_epi16(8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mul_epi32() { + let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2); + let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_mul_epi32(a, b); + let e = _mm256_setr_epi64x(0, 0, 10, 14); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mul_epu32() { + let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2); + let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_mul_epu32(a, b); + let e = _mm256_setr_epi64x(0, 0, 10, 14); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mulhi_epi16() { + let a = _mm256_set1_epi16(6535); + let b = _mm256_set1_epi16(6535); + let r = _mm256_mulhi_epi16(a, b); + let e = _mm256_set1_epi16(651); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mulhi_epu16() { + let a = _mm256_set1_epi16(6535); + let b = _mm256_set1_epi16(6535); + let r = _mm256_mulhi_epu16(a, b); + let e = _mm256_set1_epi16(651); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mullo_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_mullo_epi16(a, b); + let e = _mm256_set1_epi16(8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mullo_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_mullo_epi32(a, b); + let e = _mm256_set1_epi32(8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mulhrs_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_mullo_epi16(a, b); + let e = _mm256_set1_epi16(8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn 
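// Assumed scalar models of the high-half multiplies tested above:
// `_mm256_mulhi_epi16` keeps the upper 16 bits of the signed 32-bit product, so
// 6535 * 6535 = 42_706_225 and 42_706_225 >> 16 == 651, the expected lane value;
// `_mm256_mulhrs_epi16` instead scales the product down by 2^14, adds 1 and
// halves, i.e. a rounded fixed-point multiply.
fn mulhi_epi16(a: i16, b: i16) -> i16 {
    ((a as i32 * b as i32) >> 16) as i16
}

fn mulhrs_epi16(a: i16, b: i16) -> i16 {
    ((((a as i32 * b as i32) >> 14) + 1) >> 1) as i16
}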
test_mm256_or_si256() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(0); + let r = _mm256_or_si256(a, b); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_packs_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_packs_epi16(a, b); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 2, 2, 2, 2, 2, 2, 2, 2, + 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, + 4, 4, 4, 4, 4, 4, 4, 4, + ); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_packs_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_packs_epi32(a, b); + let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_packus_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_packus_epi16(a, b); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 2, 2, 2, 2, 2, 2, 2, 2, + 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, + 4, 4, 4, 4, 4, 4, 4, 4, + ); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_packus_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_packus_epi32(a, b); + let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_sad_epu8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_sad_epu8(a, b); + let e = _mm256_set1_epi64x(16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_shufflehi_epi16() { + #[rustfmt::skip] + let a = _mm256_setr_epi16( + 0, 1, 2, 3, 11, 22, 33, 44, + 4, 5, 6, 7, 55, 66, 77, 88, + ); + #[rustfmt::skip] + let e = _mm256_setr_epi16( + 0, 1, 2, 3, 44, 22, 22, 11, + 4, 5, 6, 7, 88, 66, 66, 55, + ); + let r = _mm256_shufflehi_epi16::<0b00_01_01_11>(a); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_shufflelo_epi16() { + #[rustfmt::skip] + let a = _mm256_setr_epi16( + 11, 22, 33, 44, 0, 1, 2, 3, + 55, 66, 77, 88, 4, 5, 6, 7, + ); + #[rustfmt::skip] + let e = _mm256_setr_epi16( + 44, 22, 22, 11, 0, 1, 2, 3, + 88, 66, 66, 55, 4, 5, 6, 7, + ); + let r = _mm256_shufflelo_epi16::<0b00_01_01_11>(a); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_sign_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(-1); + let r = _mm256_sign_epi16(a, b); + let e = _mm256_set1_epi16(-2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_sign_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(-1); + let r = _mm256_sign_epi32(a, b); + let e = _mm256_set1_epi32(-2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_sign_epi8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(-1); + let r = _mm256_sign_epi8(a, b); + let e = _mm256_set1_epi8(-2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_sll_epi16() { + let a = _mm256_set1_epi16(0xFF); + let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4); + let r = _mm256_sll_epi16(a, b); + assert_eq_m256i(r, _mm256_set1_epi16(0xFF0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_sll_epi32() { + let a = _mm256_set1_epi32(0xFFFF); + let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4); + let r = 
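// The pack tests above rely on VPACKSSWB/VPACKUSWB operating per 128-bit lane:
// each result lane is [saturate(a.lane), saturate(b.lane)], which is why the
// set1(2)/set1(4) inputs come out as eight 2s, eight 4s, eight 2s, eight 4s.
// A plain-Rust sketch (assumed) of the per-element saturation:
fn packs_i16_to_i8(x: i16) -> i8 {
    x.clamp(i8::MIN as i16, i8::MAX as i16) as i8 // signed saturation
}

fn packus_i16_to_u8(x: i16) -> u8 {
    x.clamp(0, u8::MAX as i16) as u8 // unsigned saturation, negatives become 0
}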
_mm256_sll_epi32(a, b); + assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_sll_epi64() { + let a = _mm256_set1_epi64x(0xFFFFFFFF); + let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4); + let r = _mm256_sll_epi64(a, b); + assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_slli_epi16() { + assert_eq_m256i( + _mm256_slli_epi16::<4>(_mm256_set1_epi16(0xFF)), + _mm256_set1_epi16(0xFF0), + ); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_slli_epi32() { + assert_eq_m256i( + _mm256_slli_epi32::<4>(_mm256_set1_epi32(0xFFFF)), + _mm256_set1_epi32(0xFFFF0), + ); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_slli_epi64() { + assert_eq_m256i( + _mm256_slli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)), + _mm256_set1_epi64x(0xFFFFFFFF0), + ); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_slli_si256() { + let a = _mm256_set1_epi64x(0xFFFFFFFF); + let r = _mm256_slli_si256::<3>(a); + assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_sllv_epi32() { + let a = _mm_set1_epi32(2); + let b = _mm_set1_epi32(1); + let r = _mm_sllv_epi32(a, b); + let e = _mm_set1_epi32(4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_sllv_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(1); + let r = _mm256_sllv_epi32(a, b); + let e = _mm256_set1_epi32(4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_sllv_epi64() { + let a = _mm_set1_epi64x(2); + let b = _mm_set1_epi64x(1); + let r = _mm_sllv_epi64(a, b); + let e = _mm_set1_epi64x(4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_sllv_epi64() { + let a = _mm256_set1_epi64x(2); + let b = _mm256_set1_epi64x(1); + let r = _mm256_sllv_epi64(a, b); + let e = _mm256_set1_epi64x(4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_sra_epi16() { + let a = _mm256_set1_epi16(-1); + let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + let r = _mm256_sra_epi16(a, b); + assert_eq_m256i(r, _mm256_set1_epi16(-1)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_sra_epi32() { + let a = _mm256_set1_epi32(-1); + let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 1); + let r = _mm256_sra_epi32(a, b); + assert_eq_m256i(r, _mm256_set1_epi32(-1)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_srai_epi16() { + assert_eq_m256i( + _mm256_srai_epi16::<1>(_mm256_set1_epi16(-1)), + _mm256_set1_epi16(-1), + ); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_srai_epi32() { + assert_eq_m256i( + _mm256_srai_epi32::<1>(_mm256_set1_epi32(-1)), + _mm256_set1_epi32(-1), + ); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_srav_epi32() { + let a = _mm_set1_epi32(4); + let count = _mm_set1_epi32(1); + let r = _mm_srav_epi32(a, count); + let e = _mm_set1_epi32(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_srav_epi32() { + let a = _mm256_set1_epi32(4); + let count = _mm256_set1_epi32(1); + let r = _mm256_srav_epi32(a, count); + let e = _mm256_set1_epi32(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_srli_si256() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + 
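// Assumed scalar equivalents of the shifts above: the `slli`/`srli` family is
// logical (zero-filling), while `srai`/`srav` are arithmetic and replicate the
// sign bit - which is why shifting set1(-1) right arithmetically still yields -1.
fn srai_one(x: i32, count: u32) -> i32 {
    x >> count // Rust's `>>` on a signed integer is an arithmetic shift
}

fn srli_one(x: i32, count: u32) -> i32 {
    ((x as u32) >> count) as i32 // logical shift: vacated high bits are zero
}
// e.g. srai_one(-1, 1) == -1 while srli_one(-1, 1) == 0x7FFF_FFFF.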
); + let r = _mm256_srli_si256::<3>(a); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 0, 0, 0, + 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 0, 0, 0, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_srl_epi16() { + let a = _mm256_set1_epi16(0xFF); + let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4); + let r = _mm256_srl_epi16(a, b); + assert_eq_m256i(r, _mm256_set1_epi16(0xF)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_srl_epi32() { + let a = _mm256_set1_epi32(0xFFFF); + let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4); + let r = _mm256_srl_epi32(a, b); + assert_eq_m256i(r, _mm256_set1_epi32(0xFFF)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_srl_epi64() { + let a = _mm256_set1_epi64x(0xFFFFFFFF); + let b = _mm_setr_epi64x(4, 0); + let r = _mm256_srl_epi64(a, b); + assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_srli_epi16() { + assert_eq_m256i( + _mm256_srli_epi16::<4>(_mm256_set1_epi16(0xFF)), + _mm256_set1_epi16(0xF), + ); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_srli_epi32() { + assert_eq_m256i( + _mm256_srli_epi32::<4>(_mm256_set1_epi32(0xFFFF)), + _mm256_set1_epi32(0xFFF), + ); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_srli_epi64() { + assert_eq_m256i( + _mm256_srli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)), + _mm256_set1_epi64x(0xFFFFFFF), + ); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_srlv_epi32() { + let a = _mm_set1_epi32(2); + let count = _mm_set1_epi32(1); + let r = _mm_srlv_epi32(a, count); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_srlv_epi32() { + let a = _mm256_set1_epi32(2); + let count = _mm256_set1_epi32(1); + let r = _mm256_srlv_epi32(a, count); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_srlv_epi64() { + let a = _mm_set1_epi64x(2); + let count = _mm_set1_epi64x(1); + let r = _mm_srlv_epi64(a, count); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_srlv_epi64() { + let a = _mm256_set1_epi64x(2); + let count = _mm256_set1_epi64x(1); + let r = _mm256_srlv_epi64(a, count); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_stream_load_si256() { + let a = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_stream_load_si256(core::ptr::addr_of!(a) as *const _); + assert_eq_m256i(a, r); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_sub_epi16() { + let a = _mm256_set1_epi16(4); + let b = _mm256_set1_epi16(2); + let r = _mm256_sub_epi16(a, b); + assert_eq_m256i(r, b); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_sub_epi32() { + let a = _mm256_set1_epi32(4); + let b = _mm256_set1_epi32(2); + let r = _mm256_sub_epi32(a, b); + assert_eq_m256i(r, b); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_sub_epi64() { + let a = _mm256_set1_epi64x(4); + let b = _mm256_set1_epi64x(2); + let r = _mm256_sub_epi64(a, b); + assert_eq_m256i(r, b); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_sub_epi8() { + let a = _mm256_set1_epi8(4); + let b = _mm256_set1_epi8(2); + let r = _mm256_sub_epi8(a, b); + assert_eq_m256i(r, b); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_subs_epi16() { + let a = 
_mm256_set1_epi16(4); + let b = _mm256_set1_epi16(2); + let r = _mm256_subs_epi16(a, b); + assert_eq_m256i(r, b); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_subs_epi8() { + let a = _mm256_set1_epi8(4); + let b = _mm256_set1_epi8(2); + let r = _mm256_subs_epi8(a, b); + assert_eq_m256i(r, b); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_subs_epu16() { + let a = _mm256_set1_epi16(4); + let b = _mm256_set1_epi16(2); + let r = _mm256_subs_epu16(a, b); + assert_eq_m256i(r, b); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_subs_epu8() { + let a = _mm256_set1_epi8(4); + let b = _mm256_set1_epi8(2); + let r = _mm256_subs_epu8(a, b); + assert_eq_m256i(r, b); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_xor_si256() { + let a = _mm256_set1_epi8(5); + let b = _mm256_set1_epi8(3); + let r = _mm256_xor_si256(a, b); + assert_eq_m256i(r, _mm256_set1_epi8(6)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_alignr_epi8() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + #[rustfmt::skip] + let b = _mm256_setr_epi8( + -1, -2, -3, -4, -5, -6, -7, -8, + -9, -10, -11, -12, -13, -14, -15, -16, + -17, -18, -19, -20, -21, -22, -23, -24, + -25, -26, -27, -28, -29, -30, -31, -32, + ); + let r = _mm256_alignr_epi8::<33>(a, b); + assert_eq_m256i(r, _mm256_set1_epi8(0)); + + let r = _mm256_alignr_epi8::<17>(a, b); + #[rustfmt::skip] + let expected = _mm256_setr_epi8( + 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 0, + 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 0, + ); + assert_eq_m256i(r, expected); + + let r = _mm256_alignr_epi8::<4>(a, b); + #[rustfmt::skip] + let expected = _mm256_setr_epi8( + -5, -6, -7, -8, -9, -10, -11, -12, + -13, -14, -15, -16, 1, 2, 3, 4, + -21, -22, -23, -24, -25, -26, -27, -28, + -29, -30, -31, -32, 17, 18, 19, 20, + ); + assert_eq_m256i(r, expected); + + let r = _mm256_alignr_epi8::<15>(a, b); + #[rustfmt::skip] + let expected = _mm256_setr_epi8( + -16, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + -32, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + assert_eq_m256i(r, expected); + + let r = _mm256_alignr_epi8::<0>(a, b); + assert_eq_m256i(r, b); + + let r = _mm256_alignr_epi8::<16>(a, b); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + #[rustfmt::skip] + let b = _mm256_setr_epi8( + 4, 128u8 as i8, 4, 3, 24, 12, 6, 19, + 12, 5, 5, 10, 4, 1, 8, 0, + 4, 128u8 as i8, 4, 3, 24, 12, 6, 19, + 12, 5, 5, 10, 4, 1, 8, 0, + ); + #[rustfmt::skip] + let expected = _mm256_setr_epi8( + 5, 0, 5, 4, 9, 13, 7, 4, + 13, 6, 6, 11, 5, 2, 9, 1, + 21, 0, 21, 20, 25, 29, 23, 20, + 29, 22, 22, 27, 21, 18, 25, 17, + ); + let r = _mm256_shuffle_epi8(a, b); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_permutevar8x32_epi32() { + let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800); + let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4); + let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500); + let r = _mm256_permutevar8x32_epi32(a, b); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_permute4x64_epi64() 
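// A plain-Rust model (assumed) of the byte shuffle verified above:
// `_mm256_shuffle_epi8` operates within each 128-bit lane, an index byte with
// its high bit set yields 0, and otherwise only the low four bits select a byte
// from the same lane - hence the `128u8 as i8` entries mapping to 0 and the
// upper half of the expected vector being offset by 16.
fn shuffle_epi8_lane(src: [u8; 16], idx: [u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..16 {
        if idx[i] & 0x80 == 0 {
            out[i] = src[(idx[i] & 0x0F) as usize];
        }
    }
    out
}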
{ + let a = _mm256_setr_epi64x(100, 200, 300, 400); + let expected = _mm256_setr_epi64x(400, 100, 200, 100); + let r = _mm256_permute4x64_epi64::<0b00010011>(a); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_permute2x128_si256() { + let a = _mm256_setr_epi64x(100, 200, 500, 600); + let b = _mm256_setr_epi64x(300, 400, 700, 800); + let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b); + let e = _mm256_setr_epi64x(700, 800, 500, 600); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_permute4x64_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let r = _mm256_permute4x64_pd::<0b00_01_00_11>(a); + let e = _mm256_setr_pd(4., 1., 2., 1.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_permutevar8x32_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4); + let r = _mm256_permutevar8x32_ps(a, b); + let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_i32gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)); + assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_mask_i32gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm_mask_i32gather_epi32::<4>( + _mm_set1_epi32(256), + arr.as_ptr(), + _mm_setr_epi32(0, 16, 64, 96), + _mm_setr_epi32(-1, -1, -1, 0), + ); + assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_i32gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = + _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4)); + assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mask_i32gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm256_mask_i32gather_epi32::<4>( + _mm256_set1_epi32(256), + arr.as_ptr(), + _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0), + _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0), + ); + assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_i32gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_mask_i32gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm_mask_i32gather_ps::<4>( + _mm_set1_ps(256.0), + arr.as_ptr(), + _mm_setr_epi32(0, 16, 64, 96), + _mm_setr_ps(-1.0, -1.0, -1.0, 0.0), + ); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_i32gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = + _mm256_i32gather_ps::<4>(arr.as_ptr(), 
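// Assumed scalar model of the gathers tested above: the const SCALE generic is a
// byte multiplier, so SCALE = 4 over i32 data (or SCALE = 8 over i64/f64 data)
// makes each index select `arr[idx]`, matching the "word-addressing" comments in
// the tests. Negative (sign-extended) indices are omitted from this sketch.
unsafe fn gather_one<const SCALE: usize>(base: *const u8, index: i32) -> i32 {
    core::ptr::read_unaligned(base.add(index as usize * SCALE) as *const i32)
}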
_mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4)); + assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mask_i32gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm256_mask_i32gather_ps::<4>( + _mm256_set1_ps(256.0), + arr.as_ptr(), + _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0), + _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0), + ); + assert_eq_m256( + r, + _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0), + ); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_i32gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0)); + assert_eq_m128i(r, _mm_setr_epi64x(0, 16)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_mask_i32gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm_mask_i32gather_epi64::<8>( + _mm_set1_epi64x(256), + arr.as_ptr(), + _mm_setr_epi32(16, 16, 16, 16), + _mm_setr_epi64x(-1, 0), + ); + assert_eq_m128i(r, _mm_setr_epi64x(16, 256)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_i32gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)); + assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mask_i32gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm256_mask_i32gather_epi64::<8>( + _mm256_set1_epi64x(256), + arr.as_ptr(), + _mm_setr_epi32(0, 16, 64, 96), + _mm256_setr_epi64x(-1, -1, -1, 0), + ); + assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_i32gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0)); + assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_mask_i32gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm_mask_i32gather_pd::<8>( + _mm_set1_pd(256.0), + arr.as_ptr(), + _mm_setr_epi32(16, 16, 16, 16), + _mm_setr_pd(-1.0, 0.0), + ); + assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_i32gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mask_i32gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm256_mask_i32gather_pd::<8>( + _mm256_set1_pd(256.0), + arr.as_ptr(), + _mm_setr_epi32(0, 16, 64, 96), + _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0), + ); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn 
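// The masked gathers above all follow one per-element rule, sketched here in
// plain Rust: when the sign bit of a mask lane is set the element is gathered
// from memory, otherwise it is taken from `src` - which is why the lanes masked
// with 0 come back as the 256 / 256.0 fill values in the expected results.
fn masked_gather_lane<T: Copy>(gathered: T, src: T, mask_lane: i64) -> T {
    if mask_lane < 0 { gathered } else { src }
}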
test_mm_i64gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16)); + assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_mask_i64gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm_mask_i64gather_epi32::<4>( + _mm_set1_epi32(256), + arr.as_ptr(), + _mm_setr_epi64x(0, 16), + _mm_setr_epi32(-1, 0, -1, 0), + ); + assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_i64gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)); + assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mask_i64gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm256_mask_i64gather_epi32::<4>( + _mm_set1_epi32(256), + arr.as_ptr(), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm_setr_epi32(-1, -1, -1, 0), + ); + assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_i64gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16)); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_mask_i64gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm_mask_i64gather_ps::<4>( + _mm_set1_ps(256.0), + arr.as_ptr(), + _mm_setr_epi64x(0, 16), + _mm_setr_ps(-1.0, 0.0, -1.0, 0.0), + ); + assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_i64gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mask_i64gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm256_mask_i64gather_ps::<4>( + _mm_set1_ps(256.0), + arr.as_ptr(), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm_setr_ps(-1.0, -1.0, -1.0, 0.0), + ); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_i64gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16)); + assert_eq_m128i(r, _mm_setr_epi64x(0, 16)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_mask_i64gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm_mask_i64gather_epi64::<8>( + _mm_set1_epi64x(256), + arr.as_ptr(), + _mm_setr_epi64x(16, 16), + _mm_setr_epi64x(-1, 0), + ); + assert_eq_m128i(r, _mm_setr_epi64x(16, 256)); + } + + #[simd_test(enable = "avx2")] + unsafe fn 
test_mm256_i64gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)); + assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mask_i64gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm256_mask_i64gather_epi64::<8>( + _mm256_set1_epi64x(256), + arr.as_ptr(), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm256_setr_epi64x(-1, -1, -1, 0), + ); + assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_i64gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16)); + assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm_mask_i64gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm_mask_i64gather_pd::<8>( + _mm_set1_pd(256.0), + arr.as_ptr(), + _mm_setr_epi64x(16, 16), + _mm_setr_pd(-1.0, 0.0), + ); + assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_i64gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0)); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_mask_i64gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm256_mask_i64gather_pd::<8>( + _mm256_set1_pd(256.0), + arr.as_ptr(), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0), + ); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0)); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_extract_epi8() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + -1, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 + ); + let r1 = _mm256_extract_epi8::<0>(a); + let r2 = _mm256_extract_epi8::<3>(a); + assert_eq!(r1, 0xFF); + assert_eq!(r2, 3); + } + + #[simd_test(enable = "avx2")] + unsafe fn test_mm256_extract_epi16() { + #[rustfmt::skip] + let a = _mm256_setr_epi16( + -1, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + let r1 = _mm256_extract_epi16::<0>(a); + let r2 = _mm256_extract_epi16::<3>(a); + assert_eq!(r1, 0xFFFF); + assert_eq!(r2, 3); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs b/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs new file mode 100644 index 0000000000000..85afd91fba7b1 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs @@ -0,0 +1,1977 @@ +//! [AVX512BF16 intrinsics]. +//! +//! 
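// Background sketch for the new module: a BF16 value is just the upper 16 bits
// of an IEEE 754 f32 (1 sign, 8 exponent, 7 mantissa bits), so widening back to
// f32 is a plain 16-bit left shift of the raw bits - the same trick the
// `_mm*_cvtpbh_ps` helpers further down implement with `slli_epi32::<16>`.
fn bf16_bits_to_f32(bits: u16) -> f32 {
    f32::from_bits((bits as u32) << 16)
}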
[AVX512BF16 intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769&avx512techs=AVX512_BF16 + +use crate::arch::asm; +use crate::core_arch::{simd::*, x86::*}; +use crate::intrinsics::simd::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.128"] + fn cvtne2ps2bf16(a: f32x4, b: f32x4) -> i16x8; + #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.256"] + fn cvtne2ps2bf16_256(a: f32x8, b: f32x8) -> i16x16; + #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.512"] + fn cvtne2ps2bf16_512(a: f32x16, b: f32x16) -> i16x32; + #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.256"] + fn cvtneps2bf16_256(a: f32x8) -> i16x8; + #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.512"] + fn cvtneps2bf16_512(a: f32x16) -> i16x16; + #[link_name = "llvm.x86.avx512bf16.dpbf16ps.128"] + fn dpbf16ps(a: f32x4, b: i16x8, c: i16x8) -> f32x4; + #[link_name = "llvm.x86.avx512bf16.dpbf16ps.256"] + fn dpbf16ps_256(a: f32x8, b: i16x16, c: i16x16) -> f32x8; + #[link_name = "llvm.x86.avx512bf16.dpbf16ps.512"] + fn dpbf16ps_512(a: f32x16, b: i16x32, c: i16x32) -> f32x16; +} + +/// Convert packed single-precision (32-bit) floating-point elements in two 128-bit vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a +/// 128-bit wide vector. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh { + unsafe { transmute(cvtne2ps2bf16(a.as_f32x4(), b.as_f32x4())) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using writemask k (elements are copied from src when the +/// corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_mask_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __m128) -> __m128bh { + unsafe { + let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x8())) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_maskz_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh { + unsafe { + let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, cvt, u16x8::ZERO)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a +/// 256-bit wide vector. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh { + unsafe { transmute(cvtne2ps2bf16_256(a.as_f32x8(), b.as_f32x8())) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b +/// to packed BF16 (16-bit) floating-point elements and store the results in single vector +/// dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_mask_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm256_mask_cvtne2ps_pbh(src: __m256bh, k: __mmask16, a: __m256, b: __m256) -> __m256bh { + unsafe { + let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x16())) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b +/// to packed BF16 (16-bit) floating-point elements, and store the results in single vector +/// dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh { + unsafe { + let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, cvt, u16x16::ZERO)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a +/// 512-bit wide vector. 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh { + unsafe { transmute(cvtne2ps2bf16_512(a.as_f32x16(), b.as_f32x16())) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using writemask k (elements are copied from src when the +/// corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_mask_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm512_mask_cvtne2ps_pbh(src: __m512bh, k: __mmask32, a: __m512, b: __m512) -> __m512bh { + unsafe { + let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x32())) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh { + unsafe { + let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, cvt, u16x32::ZERO)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh { + unsafe { transmute(cvtneps2bf16_256(a.as_f32x8())) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). 
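// The mask/maskz pairs in this module share one per-element pattern, sketched
// here in plain Rust for a 16-element vector: bit i of `k` selects between the
// freshly computed value and either `src` (writemask) or zero (zeromask).
fn apply_writemask(k: u16, computed: [u16; 16], src: [u16; 16]) -> [u16; 16] {
    core::array::from_fn(|i| if ((k >> i) & 1) == 1 { computed[i] } else { src[i] })
}

fn apply_zeromask(k: u16, computed: [u16; 16]) -> [u16; 16] {
    core::array::from_fn(|i| if ((k >> i) & 1) == 1 { computed[i] } else { 0 })
}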
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh { + unsafe { + let cvt = _mm256_cvtneps_pbh(a).as_u16x8(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x8())) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh { + unsafe { + let cvt = _mm256_cvtneps_pbh(a).as_u16x8(); + transmute(simd_select_bitmask(k, cvt, u16x8::ZERO)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh { + unsafe { transmute(cvtneps2bf16_512(a.as_f32x16())) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh { + unsafe { + let cvt = _mm512_cvtneps_pbh(a).as_u16x16(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x16())) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh { + unsafe { + let cvt = _mm512_cvtneps_pbh(a).as_u16x16(); + transmute(simd_select_bitmask(k, cvt, u16x16::ZERO)) + } +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 { + unsafe { transmute(dpbf16ps(src.as_f32x4(), a.as_i16x8(), b.as_i16x8())) } +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_mask_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 { + unsafe { + let rst = _mm_dpbf16_ps(src, a, b).as_f32x4(); + transmute(simd_select_bitmask(k, rst, src.as_f32x4())) + } +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_maskz_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 { + unsafe { + let rst = _mm_dpbf16_ps(src, a, b).as_f32x4(); + let zero = _mm_set1_ps(0.0_f32).as_f32x4(); + transmute(simd_select_bitmask(k, rst, zero)) + } +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst. 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 { + unsafe { transmute(dpbf16ps_256(src.as_f32x8(), a.as_i16x16(), b.as_i16x16())) } +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 { + unsafe { + let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8(); + transmute(simd_select_bitmask(k, rst, src.as_f32x8())) + } +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 { + unsafe { + let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8(); + transmute(simd_select_bitmask(k, rst, f32x8::ZERO)) + } +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst.Compute dot-product of BF16 (16-bit) +/// floating-point pairs in a and b, accumulating the intermediate single-precision (32-bit) +/// floating-point elements with elements in src, and store the results in dst. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 { + unsafe { transmute(dpbf16ps_512(src.as_f32x16(), a.as_i16x32(), b.as_i16x32())) } +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). 
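// A hedged scalar model of what each 32-bit lane of the `dpbf16` family
// computes: the two BF16 values sharing a lane are widened to f32 and their
// products are accumulated onto the corresponding f32 element of `src` (the
// exact FMA ordering and rounding of VDPBF16PS is not modelled here).
fn dpbf16_lane(src: f32, a: [u16; 2], b: [u16; 2]) -> f32 {
    let widen = |x: u16| f32::from_bits((x as u32) << 16); // BF16 -> f32
    src + widen(a[0]) * widen(b[0]) + widen(a[1]) * widen(b[1])
}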
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 { + unsafe { + let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16(); + transmute(simd_select_bitmask(k, rst, src.as_f32x16())) + } +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub fn _mm512_maskz_dpbf16_ps(k: __mmask16, src: __m512, a: __m512bh, b: __m512bh) -> __m512 { + unsafe { + let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16(); + transmute(simd_select_bitmask(k, rst, f32x16::ZERO)) + } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 { + unsafe { _mm512_castsi512_ps(_mm512_slli_epi32::<16>(_mm512_cvtepi16_epi32(transmute(a)))) } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> __m512 { + unsafe { + let cvt = _mm512_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x16(), src.as_f32x16())) + } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out +/// when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 { + unsafe { + let cvt = _mm512_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x16(), f32x16::ZERO)) + } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 { + unsafe { _mm256_castsi256_ps(_mm256_slli_epi32::<16>(_mm256_cvtepi16_epi32(transmute(a)))) } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __m256 { + unsafe { + let cvt = _mm256_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x8(), src.as_f32x8())) + } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out +/// when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 { + unsafe { + let cvt = _mm256_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x8(), f32x8::ZERO)) + } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point +/// elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 { + unsafe { _mm_castsi128_ps(_mm_slli_epi32::<16>(_mm_cvtepi16_epi32(transmute(a)))) } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point +/// elements, and store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m128 { + unsafe { + let cvt = _mm_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x4(), src.as_f32x4())) + } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point +/// elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 { + unsafe { + let cvt = _mm_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x4(), f32x4::ZERO)) + } +} + +/// Converts a single BF16 (16-bit) floating-point element in a to a single-precision (32-bit) floating-point +/// element, and store the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsbh_ss) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] +pub fn _mm_cvtsbh_ss(a: bf16) -> f32 { + f32::from_bits((a.to_bits() as u32) << 16) +} + +/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtneps_pbh(a: __m128) -> __m128bh { + unsafe { + let mut dst: __m128bh; + asm!( + "vcvtneps2bf16 {dst}, {src}", + dst = lateout(xmm_reg) dst, + src = in(xmm_reg) a, + options(pure, nomem, nostack, preserves_flags) + ); + dst + } +} + +/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m128bh { + unsafe { + let mut dst = src; + asm!( + "vcvtneps2bf16 {dst}{{{k}}},{src}", + dst = inlateout(xmm_reg) dst, + src = in(xmm_reg) a, + k = in(kreg) k, + options(pure, nomem, nostack, preserves_flags) + ); + dst + } +} + +/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out +/// when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh { + unsafe { + let mut dst: __m128bh; + asm!( + "vcvtneps2bf16 {dst}{{{k}}}{{z}},{src}", + dst = lateout(xmm_reg) dst, + src = in(xmm_reg) a, + k = in(kreg) k, + options(pure, nomem, nostack, preserves_flags) + ); + dst + } +} + +/// Converts a single-precision (32-bit) floating-point element in a to a BF16 (16-bit) floating-point +/// element, and store the result in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] +pub fn _mm_cvtness_sbh(a: f32) -> bf16 { + unsafe { + let value: u16 = simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0); + bf16::from_bits(value) + } +} + +#[cfg(test)] +mod tests { + use crate::core_arch::simd::u16x4; + use crate::{ + core_arch::x86::*, + mem::{transmute, transmute_copy}, + }; + use stdarch_test::simd_test; + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_cvtne2ps_pbh() { + let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; + let a: __m128 = transmute(a_array); + let b: __m128 = transmute(b_array); + let c: __m128bh = _mm_cvtne2ps_pbh(a, b); + let result: [u16; 8] = transmute(c.as_u16x8()); + #[rustfmt::skip] + let expected_result: [u16; 8] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_mask_cvtne2ps_pbh() { + let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; + #[rustfmt::skip] + let src_array: [u16; 8] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; + let src: __m128bh = transmute(src_array); + let a: __m128 = transmute(a_array); + let b: __m128 = transmute(b_array); + let k: __mmask8 = 0b1111_1111; + let c: __m128bh = _mm_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 8] = transmute(c.as_u16x8()); + #[rustfmt::skip] + let expected_result: [u16; 8] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; + assert_eq!(result, expected_result); + let k = 0b0000_0000; + let c = _mm_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 8] = transmute(c.as_u16x8()); + let expected_result = src_array; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_maskz_cvtne2ps_pbh() { + let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; + let a: __m128 = transmute(a_array); + let b: __m128 = transmute(b_array); + let k: __mmask8 = 0b1111_1111; + let c: __m128bh = _mm_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 8] = transmute(c.as_u16x8()); + #[rustfmt::skip] + let expected_result: [u16; 8] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; + assert_eq!(result, expected_result); + let k = 0b0011_1100; + let c = _mm_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 8] = transmute(c.as_u16x8()); + #[rustfmt::skip] + let expected_result: [u16; 8] = [ + 0, + 0, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0, + 0, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = 
"avx512bf16,avx512vl")] + unsafe fn test_mm256_cvtne2ps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let b_array = [ + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; + let a: __m256 = transmute(a_array); + let b: __m256 = transmute(b_array); + let c: __m256bh = _mm256_cvtne2ps_pbh(a, b); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_mask_cvtne2ps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let b_array = [ + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; + let src_array: [u16; 16] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; + let src: __m256bh = transmute(src_array); + let a: __m256 = transmute(a_array); + let b: __m256 = transmute(b_array); + let k: __mmask16 = 0xffff; + let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0; + let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 16] = transmute(c.as_u16x16()); + let expected_result = src_array; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_maskz_cvtne2ps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let b_array = [ + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; + let a: __m256 = transmute(a_array); + let b: __m256 = transmute(b_array); + let k: __mmask16 = 0xffff; + let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 
0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0b0110_1100_0011_0110; + let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0, + 0, + 0, + 0, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_cvtne2ps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let b_array = [ + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; + let a: __m512 = transmute(a_array); + let b: __m512 = transmute(b_array); + let c: __m512bh = _mm512_cvtne2ps_pbh(a, b); + let result: [u16; 32] = transmute(c.as_u16x32()); + #[rustfmt::skip] + let expected_result: [u16; 32] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_mask_cvtne2ps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let b_array = [ + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; + let src_array: [u16; 32] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 
0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; + let src: __m512bh = transmute(src_array); + let a: __m512 = transmute(a_array); + let b: __m512 = transmute(b_array); + let k: __mmask32 = 0xffffffff; + let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 32] = transmute(c.as_u16x32()); + #[rustfmt::skip] + let expected_result: [u16; 32] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask32 = 0; + let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 32] = transmute(c.as_u16x32()); + let expected_result = src_array; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_maskz_cvtne2ps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let b_array = [ + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; + let a: __m512 = transmute(a_array); + let b: __m512 = transmute(b_array); + let k: __mmask32 = 0xffffffff; + let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 32] = transmute(c.as_u16x32()); + #[rustfmt::skip] + let expected_result: [u16; 32] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask32 = 0b1100_1010_1001_0110_1010_0011_0101_0110; + let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 32] = transmute(c.as_u16x32()); 
+ #[rustfmt::skip] + let expected_result: [u16; 32] = [ + 0, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0, + 0b1_10000011_0000100, + 0, + 0b1_10001000_1111010, + 0, + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0, + 0, + 0, + 0b1_10000110_1111111, + 0, + 0b1_10001000_0010000, + 0, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0, + 0b0_10000011_0000100, + 0, + 0, + 0b0_10001000_0010000, + 0, + 0b0_10000010_0101000, + 0, + 0b0_10000100_1001001, + 0, + 0, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_cvtneps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let a: __m256 = transmute(a_array); + let c: __m128bh = _mm256_cvtneps_pbh(a); + let result: [u16; 8] = transmute(c.as_u16x8()); + #[rustfmt::skip] + let expected_result: [u16; 8] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_mask_cvtneps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let src_array: [u16; 8] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + ]; + let src: __m128bh = transmute(src_array); + let a: __m256 = transmute(a_array); + let k: __mmask8 = 0xff; + let b = _mm256_mask_cvtneps_pbh(src, k, a); + let result: [u16; 8] = transmute(b.as_u16x8()); + #[rustfmt::skip] + let expected_result: [u16; 8] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0x0; + let b: __m128bh = _mm256_mask_cvtneps_pbh(src, k, a); + let result: [u16; 8] = transmute(b.as_u16x8()); + let expected_result: [u16; 8] = src_array; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_maskz_cvtneps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let a: __m256 = transmute(a_array); + let k: __mmask8 = 0xff; + let b = _mm256_maskz_cvtneps_pbh(k, a); + let result: [u16; 8] = transmute(b.as_u16x8()); + #[rustfmt::skip] + let expected_result: [u16; 8] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0x6; + let b: __m128bh = _mm256_maskz_cvtneps_pbh(k, a); + let result: [u16; 8] = transmute(b.as_u16x8()); + let expected_result: [u16; 8] = + [0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, 0, 0, 0, 0]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_cvtneps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 
50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let a: __m512 = transmute(a_array); + let c: __m256bh = _mm512_cvtneps_pbh(a); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_mask_cvtneps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let src_array: [u16; 16] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + ]; + let src: __m256bh = transmute(src_array); + let a: __m512 = transmute(a_array); + let k: __mmask16 = 0xffff; + let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0; + let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a); + let result: [u16; 16] = transmute(c.as_u16x16()); + let expected_result = src_array; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_maskz_cvtneps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let a: __m512 = transmute(a_array); + let k: __mmask16 = 0xffff; + let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0x653a; + let c: __m256bh = 
_mm512_maskz_cvtneps_pbh(k, a); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0, + 0b0_10000010_0101000, + 0, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0, + 0, + 0b0_10000110_0110010, + 0, + 0b0_10000000_1110000, + 0, + 0, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_dpbf16_ps() { + let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + let a1: __m128 = transmute(a_array); + let b1: __m128 = transmute(b_array); + let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); + let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); + let c: __m128 = _mm_dpbf16_ps(src, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_mask_dpbf16_ps() { + let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + let a1: __m128 = transmute(a_array); + let b1: __m128 = transmute(b_array); + let k: __mmask8 = 0xf3; + let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); + let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); + let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0xff; + let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0; + let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_maskz_dpbf16_ps() { + let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + let a1: __m128 = transmute(a_array); + let b1: __m128 = transmute(b_array); + let k: __mmask8 = 0xf3; + let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); + let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); + let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 0.0, 0.0]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0xff; + let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0; + let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [0.0, 0.0, 0.0, 0.0]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_dpbf16_ps() { + #[rustfmt::skip] + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 
50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; + let a1: __m256 = transmute(a_array); + let b1: __m256 = transmute(b_array); + #[rustfmt::skip] + let src: __m256 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); + let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); + let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); + let c: __m256 = _mm256_dpbf16_ps(src, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + #[rustfmt::skip] + let expected_result: [f32; 8] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_mask_dpbf16_ps() { + #[rustfmt::skip] + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; + let a1: __m256 = transmute(a_array); + let b1: __m256 = transmute(b_array); + let k: __mmask8 = 0x33; + #[rustfmt::skip] + let src: __m256 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); + let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); + let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); + let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + #[rustfmt::skip] + let expected_result: [f32; 8] = [ + -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, + ]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0xff; + let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + #[rustfmt::skip] + let expected_result: [f32; 8] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0; + let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + #[rustfmt::skip] + let expected_result: [f32; 8] = [ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_maskz_dpbf16_ps() { + #[rustfmt::skip] + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; + let a1: __m256 = transmute(a_array); + let b1: __m256 = transmute(b_array); + let k: __mmask8 = 0x33; + #[rustfmt::skip] + let src: __m256 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); + let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); + let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); + let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + #[rustfmt::skip] + let expected_result: [f32; 8] = [ + -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, + ]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0xff; + let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + #[rustfmt::skip] + let expected_result: [f32; 8] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; + assert_eq!(result, 
expected_result); + let k: __mmask8 = 0; + let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + let expected_result: [f32; 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_dpbf16_ps() { + #[rustfmt::skip] + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; + let a1: __m512 = transmute(a_array); + let b1: __m512 = transmute(b_array); + let src: __m512 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, + 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); + let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); + let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); + let c: __m512 = _mm512_dpbf16_ps(src, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + #[rustfmt::skip] + let expected_result: [f32; 16] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_mask_dpbf16_ps() { + #[rustfmt::skip] + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; + let a1: __m512 = transmute(a_array); + let b1: __m512 = transmute(b_array); + let k: __mmask16 = 0x3333; + #[rustfmt::skip] + let src: __m512 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, + 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); + let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); + let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); + let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + #[rustfmt::skip] + let expected_result: [f32; 16] = [ + -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, + -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0xffff; + let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + #[rustfmt::skip] + let expected_result: [f32; 16] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0; + let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + #[rustfmt::skip] + let expected_result: [f32; 16] = [ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, + 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]; + assert_eq!(result, expected_result); + } + 
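+    // Worked example for the dpbf16 expected values used in these tests: per the
+    // dot-product description above, each f32 lane i of the result is
+    // src[i] + a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1]. Here a is built from
+    // (8.5, 10.5, 3.75, 50.25, ...) via cvtne2ps_pbh (all exactly representable in
+    // BF16) and b is -1.0 in every lane, so lane 0 is 1.0 + (8.5 + 10.5) * -1.0 = -18.0
+    // and lane 1 is 2.0 + (3.75 + 50.25) * -1.0 = -52.0, which is where the repeating
+    // -18.0, -52.0, -16.0, -50.0 pattern in the expected results comes from.
+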
+ #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_maskz_dpbf16_ps() { + #[rustfmt::skip] + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; + let a1: __m512 = transmute(a_array); + let b1: __m512 = transmute(b_array); + let k: __mmask16 = 0x3333; + #[rustfmt::skip] + let src: __m512 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, + 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); + let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); + let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); + let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + #[rustfmt::skip] + let expected_result: [f32; 16] = [ + -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, + 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0xffff; + let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + #[rustfmt::skip] + let expected_result: [f32; 16] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0; + let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + #[rustfmt::skip] + let expected_result: [f32; 16] = [ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + ]; + assert_eq!(result, expected_result); + } + + const BF16_ONE: u16 = 0b0_01111111_0000000; + const BF16_TWO: u16 = 0b0_10000000_0000000; + const BF16_THREE: u16 = 0b0_10000000_1000000; + const BF16_FOUR: u16 = 0b0_10000001_0000000; + const BF16_FIVE: u16 = 0b0_10000001_0100000; + const BF16_SIX: u16 = 0b0_10000001_1000000; + const BF16_SEVEN: u16 = 0b0_10000001_1100000; + const BF16_EIGHT: u16 = 0b0_10000010_0000000; + + #[simd_test(enable = "avx512bf16")] + unsafe fn test_mm512_cvtpbh_ps() { + let a = __m256bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let r = _mm512_cvtpbh_ps(a); + let e = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512bf16")] + unsafe fn test_mm512_mask_cvtpbh_ps() { + let a = __m256bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let src = _mm512_setr_ps( + 9., 10., 11., 12., 13., 14., 15., 16., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let k = 0b1010_1010_1010_1010; + let r = _mm512_mask_cvtpbh_ps(src, k, a); + let e = _mm512_setr_ps( + 9., 2., 11., 4., 13., 6., 15., 8., 9., 2., 11., 4., 13., 6., 15., 8., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512bf16")] + unsafe fn test_mm512_maskz_cvtpbh_ps() { + let a = __m256bh([ + BF16_ONE, BF16_TWO, BF16_THREE, 
BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let k = 0b1010_1010_1010_1010; + let r = _mm512_maskz_cvtpbh_ps(k, a); + let e = _mm512_setr_ps( + 0., 2., 0., 4., 0., 6., 0., 8., 0., 2., 0., 4., 0., 6., 0., 8., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_cvtpbh_ps() { + let a = __m128bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let r = _mm256_cvtpbh_ps(a); + let e = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_mask_cvtpbh_ps() { + let a = __m128bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let src = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let k = 0b1010_1010; + let r = _mm256_mask_cvtpbh_ps(src, k, a); + let e = _mm256_setr_ps(9., 2., 11., 4., 13., 6., 15., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_maskz_cvtpbh_ps() { + let a = __m128bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let k = 0b1010_1010; + let r = _mm256_maskz_cvtpbh_ps(k, a); + let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_cvtpbh_ps() { + let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]); + let r = _mm_cvtpbh_ps(a); + let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_mask_cvtpbh_ps() { + let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]); + let src = _mm_setr_ps(9., 10., 11., 12.); + let k = 0b1010; + let r = _mm_mask_cvtpbh_ps(src, k, a); + let e = _mm_setr_ps(9., 2., 11., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_maskz_cvtpbh_ps() { + let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]); + let k = 0b1010; + let r = _mm_maskz_cvtpbh_ps(k, a); + let e = _mm_setr_ps(0., 2., 0., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512bf16")] + unsafe fn test_mm_cvtsbh_ss() { + let r = _mm_cvtsbh_ss(bf16::from_bits(BF16_ONE)); + assert_eq!(r, 1.); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_cvtneps_pbh() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r: u16x4 = transmute_copy(&_mm_cvtneps_pbh(a)); + let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR); + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_mask_cvtneps_pbh() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let src = __m128bh([5, 6, 7, 8, !0, !0, !0, !0]); + let k = 0b1010; + let r: u16x4 = transmute_copy(&_mm_mask_cvtneps_pbh(src, k, a)); + let e = u16x4::new(5, BF16_TWO, 7, BF16_FOUR); + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_maskz_cvtneps_pbh() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let k = 0b1010; + let r: u16x4 = transmute_copy(&_mm_maskz_cvtneps_pbh(k, a)); + let e = u16x4::new(0, BF16_TWO, 0, BF16_FOUR); + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_cvtness_sbh() { + let r = _mm_cvtness_sbh(1.); + assert_eq!(r.to_bits(), 
BF16_ONE); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs new file mode 100644 index 0000000000000..1cbf0faea09f9 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs @@ -0,0 +1,806 @@ +//! Bit-oriented Algorithms (BITALG) +//! +//! The intrinsics here correspond to those in the `immintrin.h` C header. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf + +use crate::core_arch::simd::i8x16; +use crate::core_arch::simd::i8x32; +use crate::core_arch::simd::i8x64; +use crate::core_arch::simd::i16x8; +use crate::core_arch::simd::i16x16; +use crate::core_arch::simd::i16x32; +use crate::core_arch::x86::__m128i; +use crate::core_arch::x86::__m256i; +use crate::core_arch::x86::__m512i; +use crate::core_arch::x86::__mmask8; +use crate::core_arch::x86::__mmask16; +use crate::core_arch::x86::__mmask32; +use crate::core_arch::x86::__mmask64; +use crate::intrinsics::simd::{simd_ctpop, simd_select_bitmask}; +use crate::mem::transmute; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.mask.vpshufbitqmb.512"] + fn bitshuffle_512(data: i8x64, indices: i8x64, mask: __mmask64) -> __mmask64; + #[link_name = "llvm.x86.avx512.mask.vpshufbitqmb.256"] + fn bitshuffle_256(data: i8x32, indices: i8x32, mask: __mmask32) -> __mmask32; + #[link_name = "llvm.x86.avx512.mask.vpshufbitqmb.128"] + fn bitshuffle_128(data: i8x16, indices: i8x16, mask: __mmask16) -> __mmask16; +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm512_popcnt_epi16(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctpop(a.as_i16x32())) } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm512_maskz_popcnt_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x32()), + i16x32::ZERO, + )) + } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm512_mask_popcnt_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x32()), + src.as_i16x32(), + )) + } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm256_popcnt_epi16(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctpop(a.as_i16x16())) } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm256_maskz_popcnt_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x16()), + i16x16::ZERO, + )) + } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm256_mask_popcnt_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x16()), + src.as_i16x16(), + )) + } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm_popcnt_epi16(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctpop(a.as_i16x8())) } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm_maskz_popcnt_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x8()), + i16x8::ZERO, + )) + } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm_mask_popcnt_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x8()), + src.as_i16x8(), + )) + } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm512_popcnt_epi8(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctpop(a.as_i8x64())) } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm512_maskz_popcnt_epi8(k: __mmask64, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x64()), + i8x64::ZERO, + )) + } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm512_mask_popcnt_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x64()), + src.as_i8x64(), + )) + } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm256_popcnt_epi8(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctpop(a.as_i8x32())) } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm256_maskz_popcnt_epi8(k: __mmask32, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x32()), + i8x32::ZERO, + )) + } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm256_mask_popcnt_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x32()), + src.as_i8x32(), + )) + } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm_popcnt_epi8(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctpop(a.as_i8x16())) } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm_maskz_popcnt_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x16()), + i8x16::ZERO, + )) + } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x16()), + src.as_i8x16(), + )) + } +} + +/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. +/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. +/// It then selects these bits and packs them into the output. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_bitshuffle_epi64_mask) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufbitqmb))] +pub fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 { + unsafe { bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0) } +} + +/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. +/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. +/// It then selects these bits and packs them into the output. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_bitshuffle_epi64_mask) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufbitqmb))] +pub fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m512i) -> __mmask64 { + unsafe { bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k) } +} + +/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. +/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. +/// It then selects these bits and packs them into the output. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bitshuffle_epi64_mask) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufbitqmb))] +pub fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 { + unsafe { bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0) } +} + +/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. +/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. +/// It then selects these bits and packs them into the output. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_bitshuffle_epi64_mask)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufbitqmb))]
+pub fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m256i) -> __mmask32 {
+    unsafe { bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k) }
+}
+
+/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
+/// Then groups 8 8-bit values from `c` as indices into the bits of the corresponding 64-bit integer.
+/// It then selects these bits and packs them into the output.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bitshuffle_epi64_mask)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufbitqmb))]
+pub fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 {
+    unsafe { bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0) }
+}
+
+/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
+/// Then groups 8 8-bit values from `c` as indices into the bits of the corresponding 64-bit integer.
+/// It then selects these bits and packs them into the output.
+///
+/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
+/// Otherwise the computation result is written into the result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_bitshuffle_epi64_mask)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufbitqmb))]
+pub fn _mm_mask_bitshuffle_epi64_mask(k: __mmask16, b: __m128i, c: __m128i) -> __mmask16 {
+    unsafe { bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k) }
+}
+
+#[cfg(test)]
+mod tests {
+    // Some of the constants in the tests below are just bit patterns. They should not
+    // be interpreted as integers; signedness does not make sense for them, but
+    // __mXXXi happens to be defined in terms of signed integers.
+ #![allow(overflowing_literals)] + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_popcnt_epi16() { + let test_data = _mm512_set_epi16( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, + 0x3F_FF, 0x7F_FF, 0xFF_FF, -1, -100, 255, 256, 2, 4, 8, 16, 32, 64, 128, 256, 512, + 1024, 2048, + ); + let actual_result = _mm512_popcnt_epi16(test_data); + let reference_result = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 12, 8, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, + ); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_maskz_popcnt_epi16() { + let test_data = _mm512_set_epi16( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, + 0x3F_FF, 0x7F_FF, 0xFF_FF, -1, -100, 255, 256, 2, 4, 8, 16, 32, 64, 128, 256, 512, + 1024, 2048, + ); + let mask = 0xFF_FF_00_00; + let actual_result = _mm512_maskz_popcnt_epi16(mask, test_data); + let reference_result = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, + ); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_mask_popcnt_epi16() { + let test_data = _mm512_set_epi16( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, + 0x3F_FF, 0x7F_FF, 0xFF_FF, -1, -100, 255, 256, 2, 4, 8, 16, 32, 64, 128, 256, 512, + 1024, 2048, + ); + let mask = 0xFF_FF_00_00; + let actual_result = _mm512_mask_popcnt_epi16(test_data, mask, test_data); + let reference_result = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0xFF_FF, -1, -100, 255, 256, 2, + 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, + ); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_popcnt_epi16() { + let test_data = _mm256_set_epi16( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, + 0x3F_FF, 0x7F_FF, + ); + let actual_result = _mm256_popcnt_epi16(test_data); + let reference_result = + _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_maskz_popcnt_epi16() { + let test_data = _mm256_set_epi16( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, + 0x3F_FF, 0x7F_FF, + ); + let mask = 0xFF_00; + let actual_result = _mm256_maskz_popcnt_epi16(mask, test_data); + let reference_result = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_mask_popcnt_epi16() { + let test_data = _mm256_set_epi16( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, + 0x3F_FF, 0x7F_FF, + ); + let mask = 0xFF_00; + let actual_result = _mm256_mask_popcnt_epi16(test_data, mask, test_data); + let reference_result = _mm256_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, 0x3F_FF, 0x7F_FF, + ); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_popcnt_epi16() { + let 
test_data = _mm_set_epi16(0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F); + let actual_result = _mm_popcnt_epi16(test_data); + let reference_result = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_maskz_popcnt_epi16() { + let test_data = _mm_set_epi16(0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F); + let mask = 0xF0; + let actual_result = _mm_maskz_popcnt_epi16(mask, test_data); + let reference_result = _mm_set_epi16(0, 1, 2, 3, 0, 0, 0, 0); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_mask_popcnt_epi16() { + let test_data = _mm_set_epi16(0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F); + let mask = 0xF0; + let actual_result = _mm_mask_popcnt_epi16(test_data, mask, test_data); + let reference_result = _mm_set_epi16(0, 1, 2, 3, 0xF, 0x1F, 0x3F, 0x7F); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_popcnt_epi8() { + let test_data = _mm512_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100, + 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, 183, 154, 84, 56, 227, 189, + 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, 251, 73, 121, 143, 145, 85, 91, 137, 90, + 225, 21, 249, 211, 155, 228, 70, + ); + let actual_result = _mm512_popcnt_epi8(test_data); + let reference_result = _mm512_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5, + 2, 4, 4, 6, 4, 3, 3, 5, 6, 3, 3, 5, 6, 4, 4, 4, 3, 3, 6, 7, 3, 5, 5, 3, 4, 5, 3, 4, 4, + 3, 6, 5, 5, 4, 3, + ); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_maskz_popcnt_epi8() { + let test_data = _mm512_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100, + 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, 183, 154, 84, 56, 227, 189, + 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, 251, 73, 121, 143, 145, 85, 91, 137, 90, + 225, 21, 249, 211, 155, 228, 70, + ); + let mask = 0xFF_FF_FF_FF_00_00_00_00; + let actual_result = _mm512_maskz_popcnt_epi8(mask, test_data); + let reference_result = _mm512_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5, + 2, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_mask_popcnt_epi8() { + let test_data = _mm512_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100, + 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, 183, 154, 84, 56, 227, 189, + 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, 251, 73, 121, 143, 145, 85, 91, 137, 90, + 225, 21, 249, 211, 155, 228, 70, + ); + let mask = 0xFF_FF_FF_FF_00_00_00_00; + let actual_result = _mm512_mask_popcnt_epi8(test_data, mask, test_data); + let reference_result = _mm512_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5, + 2, 4, 4, 183, 154, 84, 56, 227, 189, 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, + 251, 73, 121, 143, 145, 85, 91, 137, 90, 225, 21, 249, 211, 155, 228, 70, + ); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = 
"avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_popcnt_epi8() { + let test_data = _mm256_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100, + 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, + ); + let actual_result = _mm256_popcnt_epi8(test_data); + let reference_result = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5, + 2, 4, 4, + ); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_maskz_popcnt_epi8() { + let test_data = _mm256_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 251, 73, 121, 143, + 145, 85, 91, 137, 90, 225, 21, 249, 211, 155, 228, 70, + ); + let mask = 0xFF_FF_00_00; + let actual_result = _mm256_maskz_popcnt_epi8(mask, test_data); + let reference_result = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, + ); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_mask_popcnt_epi8() { + let test_data = _mm256_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 251, 73, 121, 143, + 145, 85, 91, 137, 90, 225, 21, 249, 211, 155, 228, 70, + ); + let mask = 0xFF_FF_00_00; + let actual_result = _mm256_mask_popcnt_epi8(test_data, mask, test_data); + let reference_result = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 251, 73, 121, 143, 145, 85, 91, 137, + 90, 225, 21, 249, 211, 155, 228, 70, + ); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_popcnt_epi8() { + let test_data = _mm_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, + ); + let actual_result = _mm_popcnt_epi8(test_data); + let reference_result = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_maskz_popcnt_epi8() { + let test_data = _mm_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 90, 225, 21, 249, 211, 155, 228, 70, + ); + let mask = 0xFF_00; + let actual_result = _mm_maskz_popcnt_epi8(mask, test_data); + let reference_result = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_mask_popcnt_epi8() { + let test_data = _mm_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 90, 225, 21, 249, 211, 155, 228, 70, + ); + let mask = 0xFF_00; + let actual_result = _mm_mask_popcnt_epi8(test_data, mask, test_data); + let reference_result = + _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 90, 225, 21, 249, 211, 155, 228, 70); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_bitshuffle_epi64_mask() { + let test_indices = _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0, + 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, + 58, 57, 56, 32, 32, 16, 16, 0, 0, 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, + ); + let test_data = _mm512_setr_epi64( + 0xFF_FF_FF_FF_00_00_00_00, + 0xFF_00_FF_00_FF_00_FF_00, + 0xFF_00_00_00_00_00_00_00, + 
0xAC_00_00_00_00_00_00_00, + 0xFF_FF_FF_FF_00_00_00_00, + 0xFF_00_FF_00_FF_00_FF_00, + 0xFF_00_00_00_00_00_00_00, + 0xAC_00_00_00_00_00_00_00, + ); + let actual_result = _mm512_bitshuffle_epi64_mask(test_data, test_indices); + let reference_result = 0xF0 << 0 + | 0x03 << 8 + | 0xFF << 16 + | 0xAC << 24 + | 0xF0 << 32 + | 0x03 << 40 + | 0xFF << 48 + | 0xAC << 56; + + assert_eq!(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_mask_bitshuffle_epi64_mask() { + let test_indices = _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0, + 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, + 58, 57, 56, 32, 32, 16, 16, 0, 0, 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, + ); + let test_data = _mm512_setr_epi64( + 0xFF_FF_FF_FF_00_00_00_00, + 0xFF_00_FF_00_FF_00_FF_00, + 0xFF_00_00_00_00_00_00_00, + 0xAC_00_00_00_00_00_00_00, + 0xFF_FF_FF_FF_00_00_00_00, + 0xFF_00_FF_00_FF_00_FF_00, + 0xFF_00_00_00_00_00_00_00, + 0xAC_00_00_00_00_00_00_00, + ); + let mask = 0xFF_FF_FF_FF_00_00_00_00; + let actual_result = _mm512_mask_bitshuffle_epi64_mask(mask, test_data, test_indices); + let reference_result = 0x00 << 0 + | 0x00 << 8 + | 0x00 << 16 + | 0x00 << 24 + | 0xF0 << 32 + | 0x03 << 40 + | 0xFF << 48 + | 0xAC << 56; + + assert_eq!(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_bitshuffle_epi64_mask() { + let test_indices = _mm256_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0, + 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, + ); + let test_data = _mm256_setr_epi64x( + 0xFF_FF_FF_FF_00_00_00_00, + 0xFF_00_FF_00_FF_00_FF_00, + 0xFF_00_00_00_00_00_00_00, + 0xAC_00_00_00_00_00_00_00, + ); + let actual_result = _mm256_bitshuffle_epi64_mask(test_data, test_indices); + let reference_result = 0xF0 << 0 | 0x03 << 8 | 0xFF << 16 | 0xAC << 24; + + assert_eq!(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_mask_bitshuffle_epi64_mask() { + let test_indices = _mm256_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0, + 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, + ); + let test_data = _mm256_setr_epi64x( + 0xFF_FF_FF_FF_00_00_00_00, + 0xFF_00_FF_00_FF_00_FF_00, + 0xFF_00_00_00_00_00_00_00, + 0xAC_00_00_00_00_00_00_00, + ); + let mask = 0xFF_FF_00_00; + let actual_result = _mm256_mask_bitshuffle_epi64_mask(mask, test_data, test_indices); + let reference_result = 0x00 << 0 | 0x00 << 8 | 0xFF << 16 | 0xAC << 24; + + assert_eq!(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_bitshuffle_epi64_mask() { + let test_indices = _mm_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, + ); + let test_data = _mm_setr_epi64x(0xFF_00_00_00_00_00_00_00, 0xAC_00_00_00_00_00_00_00); + let actual_result = _mm_bitshuffle_epi64_mask(test_data, test_indices); + let reference_result = 0xFF << 0 | 0xAC << 8; + + assert_eq!(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_mask_bitshuffle_epi64_mask() { + let test_indices = _mm_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, + ); + let test_data = _mm_setr_epi64x(0xFF_00_00_00_00_00_00_00, 0xAC_00_00_00_00_00_00_00); + let mask = 0xFF_00; + let 
actual_result = _mm_mask_bitshuffle_epi64_mask(mask, test_data, test_indices); + let reference_result = 0x00 << 0 | 0xAC << 8; + + assert_eq!(actual_result, reference_result); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs new file mode 100644 index 0000000000000..b60b0b70795f6 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs @@ -0,0 +1,21343 @@ +use crate::{ + core_arch::{simd::*, x86::*}, + intrinsics::simd::*, + ptr, +}; + +use core::hint::unreachable_unchecked; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi16&expand=30) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsw))] +pub fn _mm512_abs_epi16(a: __m512i) -> __m512i { + unsafe { + let a = a.as_i16x32(); + let cmp: i16x32 = simd_gt(a, i16x32::ZERO); + transmute(simd_select(cmp, a, simd_neg(a))) + } +} + +/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_epi16&expand=31) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsw))] +pub fn _mm512_mask_abs_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, abs, src.as_i16x32())) + } +} + +/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_abs_epi16&expand=32) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsw))] +pub fn _mm512_maskz_abs_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, abs, i16x32::ZERO)) + } +} + +/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_abs_epi16&expand=28) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsw))] +pub fn _mm256_mask_abs_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, abs, src.as_i16x16())) + } +} + +/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_abs_epi16&expand=29) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsw))] +pub fn _mm256_maskz_abs_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, abs, i16x16::ZERO)) + } +} + +/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_abs_epi16&expand=25) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsw))] +pub fn _mm_mask_abs_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, abs, src.as_i16x8())) + } +} + +/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_abs_epi16&expand=26) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsw))] +pub fn _mm_maskz_abs_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, abs, i16x8::ZERO)) + } +} + +/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi8&expand=57) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsb))] +pub fn _mm512_abs_epi8(a: __m512i) -> __m512i { + unsafe { + let a = a.as_i8x64(); + let cmp: i8x64 = simd_gt(a, i8x64::ZERO); + transmute(simd_select(cmp, a, simd_neg(a))) + } +} + +/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_epi8&expand=58) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsb))] +pub fn _mm512_mask_abs_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, abs, src.as_i8x64())) + } +} + +/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_abs_epi8&expand=59) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsb))] +pub fn _mm512_maskz_abs_epi8(k: __mmask64, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, abs, i8x64::ZERO)) + } +} + +/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_abs_epi8&expand=55) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsb))] +pub fn _mm256_mask_abs_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, abs, src.as_i8x32())) + } +} + +/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_abs_epi8&expand=56) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsb))] +pub fn _mm256_maskz_abs_epi8(k: __mmask32, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, abs, i8x32::ZERO)) + } +} + +/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_abs_epi8&expand=52) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsb))] +pub fn _mm_mask_abs_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, abs, src.as_i8x16())) + } +} + +/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_abs_epi8&expand=53) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsb))] +pub fn _mm_maskz_abs_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, abs, i8x16::ZERO)) + } +} + +/// Add packed 16-bit integers in a and b, and store the results in dst. 
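+///
+/// A minimal illustrative sketch (not taken from Intel's documentation); it assumes the
+/// running CPU supports the `avx512bw` feature:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let a = _mm512_set1_epi16(i16::MAX);
+/// let b = _mm512_set1_epi16(1);
+/// // The addition wraps: every 16-bit lane ends up holding i16::MIN (-32768).
+/// let r = _mm512_add_epi16(a, b);
+/// ```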
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_epi16&expand=91) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddw))] +pub fn _mm512_add_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_add(a.as_i16x32(), b.as_i16x32())) } +} + +/// Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_epi16&expand=92) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddw))] +pub fn _mm512_mask_add_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, add, src.as_i16x32())) + } +} + +/// Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_epi16&expand=93) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddw))] +pub fn _mm512_maskz_add_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, add, i16x32::ZERO)) + } +} + +/// Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_epi16&expand=89) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddw))] +pub fn _mm256_mask_add_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, add, src.as_i16x16())) + } +} + +/// Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_epi16&expand=90) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddw))] +pub fn _mm256_maskz_add_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, add, i16x16::ZERO)) + } +} + +/// Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
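+///
+/// A minimal illustrative sketch (not taken from Intel's documentation); it assumes the
+/// running CPU supports the `avx512bw` and `avx512vl` features:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let src = _mm_set1_epi16(-1);
+/// let a = _mm_set1_epi16(10);
+/// let b = _mm_set1_epi16(20);
+/// // Bit i of the mask controls element i: only element 0 receives 10 + 20,
+/// // the remaining seven elements are copied from `src`.
+/// let r = _mm_mask_add_epi16(src, 0b0000_0001, a, b);
+/// let expected = _mm_set_epi16(-1, -1, -1, -1, -1, -1, -1, 30);
+/// ```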
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_epi16&expand=86) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddw))] +pub fn _mm_mask_add_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, add, src.as_i16x8())) + } +} + +/// Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_epi16&expand=87) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddw))] +pub fn _mm_maskz_add_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, add, i16x8::ZERO)) + } +} + +/// Add packed 8-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_epi8&expand=118) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddb))] +pub fn _mm512_add_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_add(a.as_i8x64(), b.as_i8x64())) } +} + +/// Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_epi8&expand=119) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddb))] +pub fn _mm512_mask_add_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, add, src.as_i8x64())) + } +} + +/// Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_epi8&expand=120) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddb))] +pub fn _mm512_maskz_add_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, add, i8x64::ZERO)) + } +} + +/// Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_epi8&expand=116) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddb))] +pub fn _mm256_mask_add_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, add, src.as_i8x32())) + } +} + +/// Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_epi8&expand=117) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddb))] +pub fn _mm256_maskz_add_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, add, i8x32::ZERO)) + } +} + +/// Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_epi8&expand=113) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddb))] +pub fn _mm_mask_add_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, add, src.as_i8x16())) + } +} + +/// Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_epi8&expand=114) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddb))] +pub fn _mm_maskz_add_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, add, i8x16::ZERO)) + } +} + +/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_adds_epu16&expand=197) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusw))] +pub fn _mm512_adds_epu16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_add(a.as_u16x32(), b.as_u16x32())) } +} + +/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
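+///
+/// A minimal illustrative sketch (not taken from Intel's documentation); it assumes the
+/// running CPU supports the `avx512bw` feature:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let src = _mm512_set1_epi16(0);
+/// let a = _mm512_set1_epi16(-1); // bit pattern 0xFFFF, i.e. 65535 as an unsigned element
+/// let b = _mm512_set1_epi16(1);
+/// // 65535 + 1 saturates to 65535 instead of wrapping to 0; with an all-ones
+/// // writemask every element receives the saturated sum.
+/// let r = _mm512_mask_adds_epu16(src, !0, a, b);
+/// ```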
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_adds_epu16&expand=198) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusw))] +pub fn _mm512_mask_adds_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, add, src.as_u16x32())) + } +} + +/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_adds_epu16&expand=199) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusw))] +pub fn _mm512_maskz_adds_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, add, u16x32::ZERO)) + } +} + +/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_adds_epu16&expand=195) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusw))] +pub fn _mm256_mask_adds_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, add, src.as_u16x16())) + } +} + +/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_adds_epu16&expand=196) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusw))] +pub fn _mm256_maskz_adds_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, add, u16x16::ZERO)) + } +} + +/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_adds_epu16&expand=192) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusw))] +pub fn _mm_mask_adds_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, add, src.as_u16x8())) + } +} + +/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_adds_epu16&expand=193) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusw))] +pub fn _mm_maskz_adds_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, add, u16x8::ZERO)) + } +} + +/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_adds_epu8&expand=206) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusb))] +pub fn _mm512_adds_epu8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_add(a.as_u8x64(), b.as_u8x64())) } +} + +/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_adds_epu8&expand=207) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusb))] +pub fn _mm512_mask_adds_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, add, src.as_u8x64())) + } +} + +/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_adds_epu8&expand=208) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusb))] +pub fn _mm512_maskz_adds_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, add, u8x64::ZERO)) + } +} + +/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_adds_epu8&expand=204) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusb))] +pub fn _mm256_mask_adds_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, add, src.as_u8x32())) + } +} + +/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_adds_epu8&expand=205) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusb))] +pub fn _mm256_maskz_adds_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, add, u8x32::ZERO)) + } +} + +/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_adds_epu8&expand=201) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusb))] +pub fn _mm_mask_adds_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, add, src.as_u8x16())) + } +} + +/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_adds_epu8&expand=202) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusb))] +pub fn _mm_maskz_adds_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, add, u8x16::ZERO)) + } +} + +/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_adds_epi16&expand=179) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsw))] +pub fn _mm512_adds_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_add(a.as_i16x32(), b.as_i16x32())) } +} + +/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_adds_epi16&expand=180) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsw))] +pub fn _mm512_mask_adds_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, add, src.as_i16x32())) + } +} + +/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_adds_epi16&expand=181) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsw))] +pub fn _mm512_maskz_adds_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, add, i16x32::ZERO)) + } +} + +/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_adds_epi16&expand=177) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsw))] +pub fn _mm256_mask_adds_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, add, src.as_i16x16())) + } +} + +/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_adds_epi16&expand=178) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsw))] +pub fn _mm256_maskz_adds_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, add, i16x16::ZERO)) + } +} + +/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_adds_epi16&expand=174) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsw))] +pub fn _mm_mask_adds_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, add, src.as_i16x8())) + } +} + +/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_adds_epi16&expand=175) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsw))] +pub fn _mm_maskz_adds_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, add, i16x8::ZERO)) + } +} + +/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst. 
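+///
+/// A minimal illustrative sketch (not taken from Intel's documentation); it assumes the
+/// running CPU supports the `avx512bw` feature:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let a = _mm512_set1_epi8(i8::MAX);
+/// let b = _mm512_set1_epi8(10);
+/// // 127 + 10 saturates to 127 rather than wrapping to -119.
+/// let r = _mm512_adds_epi8(a, b);
+/// ```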
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_adds_epi8&expand=188) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsb))] +pub fn _mm512_adds_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_add(a.as_i8x64(), b.as_i8x64())) } +} + +/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_adds_epi8&expand=189) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsb))] +pub fn _mm512_mask_adds_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, add, src.as_i8x64())) + } +} + +/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_adds_epi8&expand=190) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsb))] +pub fn _mm512_maskz_adds_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, add, i8x64::ZERO)) + } +} + +/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_adds_epi8&expand=186) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsb))] +pub fn _mm256_mask_adds_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, add, src.as_i8x32())) + } +} + +/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_adds_epi8&expand=187) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsb))] +pub fn _mm256_maskz_adds_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, add, i8x32::ZERO)) + } +} + +/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_adds_epi8&expand=183) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsb))] +pub fn _mm_mask_adds_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, add, src.as_i8x16())) + } +} + +/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_adds_epi8&expand=184) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsb))] +pub fn _mm_maskz_adds_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, add, i8x16::ZERO)) + } +} + +/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_epi16&expand=5685) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubw))] +pub fn _mm512_sub_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_sub(a.as_i16x32(), b.as_i16x32())) } +} + +/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_epi16&expand=5683) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubw))] +pub fn _mm512_mask_sub_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, sub, src.as_i16x32())) + } +} + +/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_epi16&expand=5684) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubw))] +pub fn _mm512_maskz_sub_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, sub, i16x32::ZERO)) + } +} + +/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
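+///
+/// A minimal illustrative sketch (not taken from Intel's documentation); it assumes the
+/// running CPU supports the `avx512bw` and `avx512vl` features:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let src = _mm256_set1_epi16(100);
+/// let a = _mm256_set1_epi16(5);
+/// let b = _mm256_set1_epi16(8);
+/// // The low 8 elements become 5 - 8 = -3; the high 8 elements keep the value
+/// // copied from `src` (100).
+/// let r = _mm256_mask_sub_epi16(src, 0x00FF, a, b);
+/// ```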
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_epi16&expand=5680) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubw))] +pub fn _mm256_mask_sub_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, sub, src.as_i16x16())) + } +} + +/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_epi16&expand=5681) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubw))] +pub fn _mm256_maskz_sub_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, sub, i16x16::ZERO)) + } +} + +/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_epi16&expand=5677) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubw))] +pub fn _mm_mask_sub_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, sub, src.as_i16x8())) + } +} + +/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_epi16&expand=5678) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubw))] +pub fn _mm_maskz_sub_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, sub, i16x8::ZERO)) + } +} + +/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_epi8&expand=5712) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubb))] +pub fn _mm512_sub_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_sub(a.as_i8x64(), b.as_i8x64())) } +} + +/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_epi8&expand=5710) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubb))] +pub fn _mm512_mask_sub_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, sub, src.as_i8x64())) + } +} + +/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_epi8&expand=5711) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubb))] +pub fn _mm512_maskz_sub_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, sub, i8x64::ZERO)) + } +} + +/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_epi8&expand=5707) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubb))] +pub fn _mm256_mask_sub_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, sub, src.as_i8x32())) + } +} + +/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_epi8&expand=5708) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubb))] +pub fn _mm256_maskz_sub_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, sub, i8x32::ZERO)) + } +} + +/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_epi8&expand=5704) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubb))] +pub fn _mm_mask_sub_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, sub, src.as_i8x16())) + } +} + +/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_epi8&expand=5705) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubb))] +pub fn _mm_maskz_sub_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, sub, i8x16::ZERO)) + } +} + +/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_subs_epu16&expand=5793) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusw))] +pub fn _mm512_subs_epu16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_sub(a.as_u16x32(), b.as_u16x32())) } +} + +/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_subs_epu16&expand=5791) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusw))] +pub fn _mm512_mask_subs_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, sub, src.as_u16x32())) + } +} + +/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_subs_epu16&expand=5792) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusw))] +pub fn _mm512_maskz_subs_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, sub, u16x32::ZERO)) + } +} + +/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
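`_mm512_subs_epu16` lowers to `simd_saturating_sub` on unsigned lanes, so per lane it behaves like `u16::saturating_sub`: results that would underflow clamp to 0 instead of wrapping. A one-lane sketch (illustrative only, not part of the patch):

```rust
fn main() {
    // Per-lane semantics of vpsubusw: unsigned subtract with saturation at 0.
    let a: u16 = 10;
    let b: u16 = 25;
    assert_eq!(a.saturating_sub(b), 0);       // would underflow, clamps to 0
    assert_eq!(a.wrapping_sub(b), 65521);     // what a plain wrapping subtract gives
    assert_eq!(25u16.saturating_sub(10), 15); // in-range values are unchanged
    println!("unsigned saturating subtract ok");
}
```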
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_subs_epu16&expand=5788) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusw))] +pub fn _mm256_mask_subs_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, sub, src.as_u16x16())) + } +} + +/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_subs_epu16&expand=5789) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusw))] +pub fn _mm256_maskz_subs_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, sub, u16x16::ZERO)) + } +} + +/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_subs_epu16&expand=5785) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusw))] +pub fn _mm_mask_subs_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, sub, src.as_u16x8())) + } +} + +/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_subs_epu16&expand=5786) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusw))] +pub fn _mm_maskz_subs_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, sub, u16x8::ZERO)) + } +} + +/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_subs_epu8&expand=5802) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusb))] +pub fn _mm512_subs_epu8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_sub(a.as_u8x64(), b.as_u8x64())) } +} + +/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_subs_epu8&expand=5800) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusb))] +pub fn _mm512_mask_subs_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, sub, src.as_u8x64())) + } +} + +/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_subs_epu8&expand=5801) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusb))] +pub fn _mm512_maskz_subs_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, sub, u8x64::ZERO)) + } +} + +/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_subs_epu8&expand=5797) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusb))] +pub fn _mm256_mask_subs_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, sub, src.as_u8x32())) + } +} + +/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_subs_epu8&expand=5798) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusb))] +pub fn _mm256_maskz_subs_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, sub, u8x32::ZERO)) + } +} + +/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_subs_epu8&expand=5794) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusb))] +pub fn _mm_mask_subs_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, sub, src.as_u8x16())) + } +} + +/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_subs_epu8&expand=5795) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusb))] +pub fn _mm_maskz_subs_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, sub, u8x16::ZERO)) + } +} + +/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_subs_epi16&expand=5775) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsw))] +pub fn _mm512_subs_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_sub(a.as_i16x32(), b.as_i16x32())) } +} + +/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_subs_epi16&expand=5773) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsw))] +pub fn _mm512_mask_subs_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, sub, src.as_i16x32())) + } +} + +/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
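The signed forms (`vpsubsw`/`vpsubsb`) saturate to the signed range rather than to zero; per lane this matches `i16::saturating_sub`. A one-lane sketch (illustrative only):

```rust
fn main() {
    // Per-lane semantics of vpsubsw: signed subtract clamped to [-32768, 32767].
    assert_eq!((-30000i16).saturating_sub(10000), i16::MIN);
    assert_eq!(30000i16.saturating_sub(-10000), i16::MAX);
    assert_eq!(100i16.saturating_sub(40), 60); // no clamping when in range
    println!("signed saturating subtract ok");
}
```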
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_subs_epi16&expand=5774) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsw))] +pub fn _mm512_maskz_subs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, sub, i16x32::ZERO)) + } +} + +/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_subs_epi16&expand=5770) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsw))] +pub fn _mm256_mask_subs_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, sub, src.as_i16x16())) + } +} + +/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_subs_epi16&expand=5771) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsw))] +pub fn _mm256_maskz_subs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, sub, i16x16::ZERO)) + } +} + +/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_subs_epi16&expand=5767) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsw))] +pub fn _mm_mask_subs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, sub, src.as_i16x8())) + } +} + +/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_subs_epi16&expand=5768) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsw))] +pub fn _mm_maskz_subs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, sub, i16x8::ZERO)) + } +} + +/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_subs_epi8&expand=5784) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsb))] +pub fn _mm512_subs_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_sub(a.as_i8x64(), b.as_i8x64())) } +} + +/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_subs_epi8&expand=5782) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsb))] +pub fn _mm512_mask_subs_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, sub, src.as_i8x64())) + } +} + +/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_subs_epi8&expand=5783) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsb))] +pub fn _mm512_maskz_subs_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, sub, i8x64::ZERO)) + } +} + +/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_subs_epi8&expand=5779) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsb))] +pub fn _mm256_mask_subs_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, sub, src.as_i8x32())) + } +} + +/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_subs_epi8&expand=5780)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+pub fn _mm256_maskz_subs_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let sub = _mm256_subs_epi8(a, b).as_i8x32();
+        transmute(simd_select_bitmask(k, sub, i8x32::ZERO))
+    }
+}
+
+/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_subs_epi8&expand=5776)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+pub fn _mm_mask_subs_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let sub = _mm_subs_epi8(a, b).as_i8x16();
+        transmute(simd_select_bitmask(k, sub, src.as_i8x16()))
+    }
+}
+
+/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_subs_epi8&expand=5777)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+pub fn _mm_maskz_subs_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let sub = _mm_subs_epi8(a, b).as_i8x16();
+        transmute(simd_select_bitmask(k, sub, i8x16::ZERO))
+    }
+}
+
+/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mulhi_epu16&expand=3973)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+pub fn _mm512_mulhi_epu16(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = simd_cast::<_, u32x32>(a.as_u16x32());
+        let b = simd_cast::<_, u32x32>(b.as_u16x32());
+        let r = simd_shr(simd_mul(a, b), u32x32::splat(16));
+        transmute(simd_cast::<_, u16x32>(r))
+    }
+}
+
+/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
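`_mm512_mulhi_epu16` is expressed above as a widening multiply followed by a right shift: each pair of `u16` lanes is multiplied as `u32` and only the high half of the 32-bit product is kept. A scalar sketch of one lane (the helper name is illustrative):

```rust
/// High 16 bits of the 32-bit product of two unsigned 16-bit lanes (vpmulhuw).
fn mulhi_epu16_lane(a: u16, b: u16) -> u16 {
    ((a as u32 * b as u32) >> 16) as u16
}

fn main() {
    assert_eq!(mulhi_epu16_lane(0xFFFF, 0xFFFF), 0xFFFE); // product 0xFFFE0001, high half 0xFFFE
    assert_eq!(mulhi_epu16_lane(0x0002, 0x0003), 0);      // small products have no high bits
    println!("mulhi_epu16 model ok");
}
```

The signed `_mm512_mulhi_epi16` further down follows the same shape, the only difference being that the lanes are sign-extended to `i32` before the multiply and shift.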
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mulhi_epu16&expand=3971) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhuw))] +pub fn _mm512_mask_mulhi_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhi_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, mul, src.as_u16x32())) + } +} + +/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mulhi_epu16&expand=3972) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhuw))] +pub fn _mm512_maskz_mulhi_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhi_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, mul, u16x32::ZERO)) + } +} + +/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mulhi_epu16&expand=3968) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhuw))] +pub fn _mm256_mask_mulhi_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhi_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, mul, src.as_u16x16())) + } +} + +/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mulhi_epu16&expand=3969) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhuw))] +pub fn _mm256_maskz_mulhi_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhi_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, mul, u16x16::ZERO)) + } +} + +/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mulhi_epu16&expand=3965)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+pub fn _mm_mask_mulhi_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let mul = _mm_mulhi_epu16(a, b).as_u16x8();
+        transmute(simd_select_bitmask(k, mul, src.as_u16x8()))
+    }
+}
+
+/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mulhi_epu16&expand=3966)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+pub fn _mm_maskz_mulhi_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let mul = _mm_mulhi_epu16(a, b).as_u16x8();
+        transmute(simd_select_bitmask(k, mul, u16x8::ZERO))
+    }
+}
+
+/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mulhi_epi16&expand=3962)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmulhw))]
+pub fn _mm512_mulhi_epi16(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = simd_cast::<_, i32x32>(a.as_i16x32());
+        let b = simd_cast::<_, i32x32>(b.as_i16x32());
+        let r = simd_shr(simd_mul(a, b), i32x32::splat(16));
+        transmute(simd_cast::<_, i16x32>(r))
+    }
+}
+
+/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mulhi_epi16&expand=3960)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmulhw))]
+pub fn _mm512_mask_mulhi_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let mul = _mm512_mulhi_epi16(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, mul, src.as_i16x32()))
+    }
+}
+
+/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mulhi_epi16&expand=3961) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhw))] +pub fn _mm512_maskz_mulhi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhi_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, i16x32::ZERO)) + } +} + +/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mulhi_epi16&expand=3957) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhw))] +pub fn _mm256_mask_mulhi_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhi_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, src.as_i16x16())) + } +} + +/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mulhi_epi16&expand=3958) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhw))] +pub fn _mm256_maskz_mulhi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhi_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, i16x16::ZERO)) + } +} + +/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mulhi_epi16&expand=3954) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhw))] +pub fn _mm_mask_mulhi_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mulhi_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, mul, src.as_i16x8())) + } +} + +/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mulhi_epi16&expand=3955) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhw))] +pub fn _mm_maskz_mulhi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mulhi_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, mul, i16x8::ZERO)) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mulhrs_epi16&expand=3986) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhrsw))] +pub fn _mm512_mulhrs_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpmulhrsw(a.as_i16x32(), b.as_i16x32())) } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mulhrs_epi16&expand=3984) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhrsw))] +pub fn _mm512_mask_mulhrs_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhrs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, src.as_i16x32())) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mulhrs_epi16&expand=3985) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhrsw))] +pub fn _mm512_maskz_mulhrs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhrs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, i16x32::ZERO)) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
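The `vpmulhrsw` description ("truncate to the 18 most significant bits, add 1, keep bits [16:1]") amounts to a rounding fixed-point multiply. A per-lane sketch of what the intrinsic computes, assuming nothing beyond the formula in the doc comment (helper name illustrative):

```rust
/// Per-lane model of vpmulhrsw: signed 16x16 multiply, round, keep bits [16:1].
fn mulhrs_epi16_lane(a: i16, b: i16) -> i16 {
    let product = a as i32 * b as i32;  // 32-bit intermediate
    (((product >> 14) + 1) >> 1) as i16 // drop 14 bits, round by adding 1, drop 1 more
}

fn main() {
    // 0x4000 is 0.5 in Q15 fixed point, so 0.5 * 0.5 rounds to 0.25 (0x2000).
    assert_eq!(mulhrs_epi16_lane(0x4000, 0x4000), 0x2000);
    assert_eq!(mulhrs_epi16_lane(3, 5), 0); // tiny products round down to zero
    println!("mulhrs model ok");
}
```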
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mulhrs_epi16&expand=3981) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhrsw))] +pub fn _mm256_mask_mulhrs_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhrs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, src.as_i16x16())) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mulhrs_epi16&expand=3982) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhrsw))] +pub fn _mm256_maskz_mulhrs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhrs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, i16x16::ZERO)) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mulhrs_epi16&expand=3978) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhrsw))] +pub fn _mm_mask_mulhrs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mulhrs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, mul, src.as_i16x8())) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mulhrs_epi16&expand=3979) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhrsw))] +pub fn _mm_maskz_mulhrs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mulhrs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, mul, i16x8::ZERO)) + } +} + +/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mullo_epi16&expand=3996) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmullw))] +pub fn _mm512_mullo_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_mul(a.as_i16x32(), b.as_i16x32())) } +} + +/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mullo_epi16&expand=3994) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmullw))] +pub fn _mm512_mask_mullo_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mullo_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, src.as_i16x32())) + } +} + +/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mullo_epi16&expand=3995) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmullw))] +pub fn _mm512_maskz_mullo_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mullo_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, i16x32::ZERO)) + } +} + +/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mullo_epi16&expand=3991) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmullw))] +pub fn _mm256_mask_mullo_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mullo_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, src.as_i16x16())) + } +} + +/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
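`_mm512_mullo_epi16` keeps only the low 16 bits of each 32-bit product, which is exactly wrapping multiplication on the lane type; the low half is the same for signed and unsigned inputs. A one-lane sketch (illustrative only):

```rust
fn main() {
    // Per-lane semantics of vpmullw: low 16 bits of the product (wrapping multiply).
    let a: i16 = 300;
    let b: i16 = 300;
    let full = a as i32 * b as i32;             // 90000 = 0x15F90
    assert_eq!(a.wrapping_mul(b), full as i16); // low half: 0x5F90
    assert_eq!(a.wrapping_mul(b), 24464);
    println!("mullo model ok");
}
```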
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mullo_epi16&expand=3992)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+pub fn _mm256_maskz_mullo_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let mul = _mm256_mullo_epi16(a, b).as_i16x16();
+        transmute(simd_select_bitmask(k, mul, i16x16::ZERO))
+    }
+}
+
+/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mullo_epi16&expand=3988)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+pub fn _mm_mask_mullo_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let mul = _mm_mullo_epi16(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, mul, src.as_i16x8()))
+    }
+}
+
+/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mullo_epi16&expand=3989)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+pub fn _mm_maskz_mullo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let mul = _mm_mullo_epi16(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, mul, i16x8::ZERO))
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epu16&expand=3609)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
+pub fn _mm512_max_epu16(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u16x32();
+        let b = b.as_u16x32();
+        transmute(simd_select::<i16x32, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epu16&expand=3607)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
+pub fn _mm512_mask_max_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu16(a, b).as_u16x32();
+        transmute(simd_select_bitmask(k, max, src.as_u16x32()))
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
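The `max` family above is a lane-wise compare followed by a select: `simd_gt` produces a per-lane mask and `simd_select` picks `a` where the mask is true, otherwise `b`. A scalar sketch of that pattern for unsigned 16-bit lanes (the array size and helper name are illustrative):

```rust
/// Compare-and-select model of _mm512_max_epu16 over a small lane array.
fn max_epu16(a: [u16; 4], b: [u16; 4]) -> [u16; 4] {
    let mut dst = [0u16; 4];
    for lane in 0..4 {
        let gt = a[lane] > b[lane];                     // simd_gt: per-lane mask
        dst[lane] = if gt { a[lane] } else { b[lane] }; // simd_select
    }
    dst
}

fn main() {
    assert_eq!(max_epu16([1, 50_000, 3, 0], [2, 10, 3, 9]), [2, 50_000, 3, 9]);
    println!("max model ok");
}
```

The epu8/epi16/epi8 variants that follow only change the lane type; their masked forms then pass the result through the same `simd_select_bitmask` writemask/zeromask step.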
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epu16&expand=3608) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxuw))] +pub fn _mm512_maskz_max_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let max = _mm512_max_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, max, u16x32::ZERO)) + } +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epu16&expand=3604) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxuw))] +pub fn _mm256_mask_max_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, max, src.as_u16x16())) + } +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epu16&expand=3605) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxuw))] +pub fn _mm256_maskz_max_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, max, u16x16::ZERO)) + } +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epu16&expand=3601) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxuw))] +pub fn _mm_mask_max_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, max, src.as_u16x8())) + } +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epu16&expand=3602) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxuw))] +pub fn _mm_maskz_max_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, max, u16x8::ZERO)) + } +} + +/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epu8&expand=3636)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub fn _mm512_max_epu8(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u8x64();
+        let b = b.as_u8x64();
+        transmute(simd_select::<i8x64, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epu8&expand=3634)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub fn _mm512_mask_max_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu8(a, b).as_u8x64();
+        transmute(simd_select_bitmask(k, max, src.as_u8x64()))
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epu8&expand=3635)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub fn _mm512_maskz_max_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu8(a, b).as_u8x64();
+        transmute(simd_select_bitmask(k, max, u8x64::ZERO))
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epu8&expand=3631)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub fn _mm256_mask_max_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu8(a, b).as_u8x32();
+        transmute(simd_select_bitmask(k, max, src.as_u8x32()))
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epu8&expand=3632)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub fn _mm256_maskz_max_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu8(a, b).as_u8x32();
+        transmute(simd_select_bitmask(k, max, u8x32::ZERO))
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epu8&expand=3628)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub fn _mm_mask_max_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu8(a, b).as_u8x16();
+        transmute(simd_select_bitmask(k, max, src.as_u8x16()))
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epu8&expand=3629)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub fn _mm_maskz_max_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu8(a, b).as_u8x16();
+        transmute(simd_select_bitmask(k, max, u8x16::ZERO))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epi16&expand=3573)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub fn _mm512_max_epi16(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i16x32();
+        let b = b.as_i16x32();
+        transmute(simd_select::<i16x32, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epi16&expand=3571)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub fn _mm512_mask_max_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epi16(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, max, src.as_i16x32()))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epi16&expand=3572)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub fn _mm512_maskz_max_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epi16(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, max, i16x32::ZERO))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epi16&expand=3568) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsw))] +pub fn _mm256_mask_max_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, max, src.as_i16x16())) + } +} + +/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epi16&expand=3569) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsw))] +pub fn _mm256_maskz_max_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, max, i16x16::ZERO)) + } +} + +/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epi16&expand=3565) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsw))] +pub fn _mm_mask_max_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, max, src.as_i16x8())) + } +} + +/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epi16&expand=3566) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsw))] +pub fn _mm_maskz_max_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, max, i16x8::ZERO)) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epi8&expand=3600) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsb))] +pub fn _mm512_max_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i8x64(); + let b = b.as_i8x64(); + transmute(simd_select::(simd_gt(a, b), a, b)) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epi8&expand=3598) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsb))] +pub fn _mm512_mask_max_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let max = _mm512_max_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, max, src.as_i8x64())) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epi8&expand=3599) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsb))] +pub fn _mm512_maskz_max_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let max = _mm512_max_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, max, i8x64::ZERO)) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epi8&expand=3595) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsb))] +pub fn _mm256_mask_max_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, max, src.as_i8x32())) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epi8&expand=3596) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsb))] +pub fn _mm256_maskz_max_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, max, i8x32::ZERO)) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epi8&expand=3592) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsb))] +pub fn _mm_mask_max_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, max, src.as_i8x16())) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epi8&expand=3593) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsb))] +pub fn _mm_maskz_max_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, max, i8x16::ZERO)) + } +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epu16&expand=3723) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuw))] +pub fn _mm512_min_epu16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_u16x32(); + let b = b.as_u16x32(); + transmute(simd_select::(simd_lt(a, b), a, b)) + } +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epu16&expand=3721) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuw))] +pub fn _mm512_mask_min_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, min, src.as_u16x32())) + } +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epu16&expand=3722) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuw))] +pub fn _mm512_maskz_min_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, min, u16x32::ZERO)) + } +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epu16&expand=3718) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuw))] +pub fn _mm256_mask_min_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, min, src.as_u16x16())) + } +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epu16&expand=3719) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuw))] +pub fn _mm256_maskz_min_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, min, u16x16::ZERO)) + } +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epu16&expand=3715) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuw))] +pub fn _mm_mask_min_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, min, src.as_u16x8())) + } +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epu16&expand=3716) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuw))] +pub fn _mm_maskz_min_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, min, u16x8::ZERO)) + } +} + +/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epu8&expand=3750) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminub))] +pub fn _mm512_min_epu8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_u8x64(); + let b = b.as_u8x64(); + transmute(simd_select::(simd_lt(a, b), a, b)) + } +} + +/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epu8&expand=3748) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminub))] +pub fn _mm512_mask_min_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, min, src.as_u8x64())) + } +} + +/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
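+///
+/// A minimal usage sketch (illustrative values only, not from the vendored sources;
+/// assumes `avx512bw` is available):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi8(3);
+/// let b = _mm512_set1_epi8(7);
+/// // With a zeromask, lanes whose bit in `k` is clear become 0 instead of being
+/// // copied from a `src` operand; the remaining lanes hold min(a, b) = 3.
+/// let r = _mm512_maskz_min_epu8(0x0000_0000_FFFF_FFFF, a, b);
+/// ```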
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epu8&expand=3749) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminub))] +pub fn _mm512_maskz_min_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, min, u8x64::ZERO)) + } +} + +/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epu8&expand=3745) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminub))] +pub fn _mm256_mask_min_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, min, src.as_u8x32())) + } +} + +/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epu8&expand=3746) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminub))] +pub fn _mm256_maskz_min_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, min, u8x32::ZERO)) + } +} + +/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epu8&expand=3742) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminub))] +pub fn _mm_mask_min_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, min, src.as_u8x16())) + } +} + +/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epu8&expand=3743) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminub))] +pub fn _mm_maskz_min_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, min, u8x16::ZERO)) + } +} + +/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epi16&expand=3687) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsw))] +pub fn _mm512_min_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i16x32(); + let b = b.as_i16x32(); + transmute(simd_select::(simd_lt(a, b), a, b)) + } +} + +/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epi16&expand=3685) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsw))] +pub fn _mm512_mask_min_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, min, src.as_i16x32())) + } +} + +/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epi16&expand=3686) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsw))] +pub fn _mm512_maskz_min_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, min, i16x32::ZERO)) + } +} + +/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epi16&expand=3682) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsw))] +pub fn _mm256_mask_min_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, min, src.as_i16x16())) + } +} + +/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epi16&expand=3683) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsw))] +pub fn _mm256_maskz_min_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, min, i16x16::ZERO)) + } +} + +/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epi16&expand=3679) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsw))] +pub fn _mm_mask_min_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, min, src.as_i16x8())) + } +} + +/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epi16&expand=3680) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsw))] +pub fn _mm_maskz_min_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, min, i16x8::ZERO)) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epi8&expand=3714) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsb))] +pub fn _mm512_min_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i8x64(); + let b = b.as_i8x64(); + transmute(simd_select::(simd_lt(a, b), a, b)) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epi8&expand=3712) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsb))] +pub fn _mm512_mask_min_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, min, src.as_i8x64())) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epi8&expand=3713) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsb))] +pub fn _mm512_maskz_min_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, min, i8x64::ZERO)) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epi8&expand=3709)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub fn _mm256_mask_min_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epi8(a, b).as_i8x32();
+        transmute(simd_select_bitmask(k, min, src.as_i8x32()))
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epi8&expand=3710)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub fn _mm256_maskz_min_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epi8(a, b).as_i8x32();
+        transmute(simd_select_bitmask(k, min, i8x32::ZERO))
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epi8&expand=3706)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub fn _mm_mask_min_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epi8(a, b).as_i8x16();
+        transmute(simd_select_bitmask(k, min, src.as_i8x16()))
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epi8&expand=3707)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub fn _mm_maskz_min_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epi8(a, b).as_i8x16();
+        transmute(simd_select_bitmask(k, min, i8x16::ZERO))
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epu16_mask&expand=1050)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmplt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<u16x32, _>(simd_lt(a.as_u16x32(), b.as_u16x32())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
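+///
+/// A minimal usage sketch (illustrative values only, not from the vendored sources;
+/// assumes `avx512bw` is available):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(1);
+/// let b = _mm512_set1_epi16(2);
+/// // Every lane satisfies a < b, so the unmasked compare would set all 32 bits;
+/// // the zeromask `k1` then clears every bit that is not set in `k1`.
+/// let k = _mm512_mask_cmplt_epu16_mask(0x0000_FFFF, a, b);
+/// assert_eq!(k, 0x0000_FFFF);
+/// ```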
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epu16_mask&expand=1051) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmplt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { + _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epu16_mask&expand=1050) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmplt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_lt(a.as_u16x16(), b.as_u16x16())) } +} + +/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epu16_mask&expand=1049) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmplt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { + _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epu16_mask&expand=1018) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmplt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_lt(a.as_u16x8(), b.as_u16x8())) } +} + +/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epu16_mask&expand=1019) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmplt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epu8_mask&expand=1068)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmplt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<u8x64, _>(simd_lt(a.as_u8x64(), b.as_u8x64())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epu8_mask&expand=1069)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmplt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epu8_mask&expand=1066)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmplt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<u8x32, _>(simd_lt(a.as_u8x32(), b.as_u8x32())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epu8_mask&expand=1067)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmplt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epu8_mask&expand=1064)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmplt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<u8x16, _>(simd_lt(a.as_u8x16(), b.as_u8x16())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
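+///
+/// A minimal usage sketch (illustrative values only, not from the vendored sources;
+/// assumes `avx512bw` and `avx512vl` are available):
+///
+/// ```ignore
+/// let a = _mm_set1_epi8(1);
+/// let b = _mm_set1_epi8(2);
+/// // All 16 lanes compare less-than, but only the bits kept by `k1` survive.
+/// let lt = _mm_mask_cmplt_epu8_mask(0x00FF, a, b);
+/// // The resulting mask can then drive other masked operations in this module.
+/// let r = _mm_mask_max_epu8(a, lt, a, b);
+/// ```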
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epu8_mask&expand=1065) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmplt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { + _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epi16_mask&expand=1022) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_cmplt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_lt(a.as_i16x32(), b.as_i16x32())) } +} + +/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epi16_mask&expand=1023) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmplt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { + _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epi16_mask&expand=1020) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmplt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_lt(a.as_i16x16(), b.as_i16x16())) } +} + +/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epi16_mask&expand=1021) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmplt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { + _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16_mask&expand=1018) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmplt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_lt(a.as_i16x8(), b.as_i16x8())) } +} + +/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epi16_mask&expand=1019) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmplt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epi8_mask&expand=1044) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_cmplt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_lt(a.as_i8x64(), b.as_i8x64())) } +} + +/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epi8_mask&expand=1045) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmplt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { + _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epi8_mask&expand=1042) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmplt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_lt(a.as_i8x32(), b.as_i8x32())) } +} + +/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epi8_mask&expand=1043)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmplt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8_mask&expand=1040)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmplt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epi8_mask&expand=1041)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmplt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epu16_mask&expand=927)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpgt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<u16x32, _>(simd_gt(a.as_u16x32(), b.as_u16x32())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epu16_mask&expand=928)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpgt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.
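+///
+/// A minimal usage sketch (illustrative values only, not from the vendored sources;
+/// assumes `avx512bw` and `avx512vl` are available) highlighting that the comparison
+/// is unsigned:
+///
+/// ```ignore
+/// let a = _mm256_set1_epi16(-1); // every lane is 0xFFFF, i.e. 65535 as an unsigned value
+/// let b = _mm256_set1_epi16(1);
+/// // Interpreted as unsigned 16-bit integers, every lane of `a` is greater than `b`,
+/// // so all 16 mask bits are set (a signed compare would instead return 0).
+/// let k = _mm256_cmpgt_epu16_mask(a, b);
+/// assert_eq!(k, 0xFFFF);
+/// ```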
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epu16_mask&expand=925) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmpgt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_gt(a.as_u16x16(), b.as_u16x16())) } +} + +/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epu16_mask&expand=926) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmpgt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { + _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epu16_mask&expand=923) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmpgt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_gt(a.as_u16x8(), b.as_u16x8())) } +} + +/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epu16_mask&expand=924) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmpgt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epu8_mask&expand=945) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_cmpgt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_gt(a.as_u8x64(), b.as_u8x64())) } +} + +/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epu8_mask&expand=946) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmpgt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { + _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epu8_mask&expand=943) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmpgt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_gt(a.as_u8x32(), b.as_u8x32())) } +} + +/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epu8_mask&expand=944) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmpgt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { + _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epu8_mask&expand=941) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmpgt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_gt(a.as_u8x16(), b.as_u8x16())) } +} + +/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epu8_mask&expand=942) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmpgt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { + _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epi16_mask&expand=897) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_cmpgt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_gt(a.as_i16x32(), b.as_i16x32())) } +} + +/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epi16_mask&expand=898) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmpgt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { + _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16_mask&expand=895) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmpgt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_gt(a.as_i16x16(), b.as_i16x16())) } +} + +/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epi16_mask&expand=896) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmpgt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { + _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16_mask&expand=893) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmpgt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_gt(a.as_i16x8(), b.as_i16x8())) } +} + +/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epi16_mask&expand=894) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmpgt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epi8_mask&expand=921) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_cmpgt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_gt(a.as_i8x64(), b.as_i8x64())) } +} + +/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epi8_mask&expand=922) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmpgt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { + _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8_mask&expand=919) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmpgt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_gt(a.as_i8x32(), b.as_i8x32())) } +} + +/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epi8_mask&expand=920) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmpgt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { + _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8_mask&expand=917) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmpgt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_gt(a.as_i8x16(), b.as_i8x16())) } +} + +/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epi8_mask&expand=918) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmpgt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { + _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epu16_mask&expand=989) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_cmple_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_le(a.as_u16x32(), b.as_u16x32())) } +} + +/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epu16_mask&expand=990) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmple_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { + _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epu16_mask&expand=987) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmple_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_le(a.as_u16x16(), b.as_u16x16())) } +} + +/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epu16_mask&expand=988)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmple_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+    _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epu16_mask&expand=985)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmple_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<u16x8, _>(simd_le(a.as_u16x8(), b.as_u16x8())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epu16_mask&expand=986)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmple_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epu8_mask&expand=1007)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmple_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<u8x64, _>(simd_le(a.as_u8x64(), b.as_u8x64())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epu8_mask&expand=1008)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmple_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epu8_mask&expand=1005)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmple_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<u8x32, _>(simd_le(a.as_u8x32(), b.as_u8x32())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epu8_mask&expand=1006)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmple_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epu8_mask&expand=1003)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmple_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<u8x16, _>(simd_le(a.as_u8x16(), b.as_u8x16())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epu8_mask&expand=1004)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmple_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epi16_mask&expand=965)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmple_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<i16x32, _>(simd_le(a.as_i16x32(), b.as_i16x32())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epi16_mask&expand=966)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmple_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epi16_mask&expand=963)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmple_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe { simd_bitmask::<i16x16, _>(simd_le(a.as_i16x16(), b.as_i16x16())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epi16_mask&expand=964)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmple_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+    _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epi16_mask&expand=961)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmple_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<i16x8, _>(simd_le(a.as_i16x8(), b.as_i16x8())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epi16_mask&expand=962)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmple_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epi8_mask&expand=983)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmple_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<i8x64, _>(simd_le(a.as_i8x64(), b.as_i8x64())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epi8_mask&expand=984)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmple_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epi8_mask&expand=981)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmple_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<i8x32, _>(simd_le(a.as_i8x32(), b.as_i8x32())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epi8_mask&expand=982)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmple_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epi8_mask&expand=979)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmple_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<i8x16, _>(simd_le(a.as_i8x16(), b.as_i8x16())) }
+}
+
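A hedged aside on why the `epu*` and `epi*` families are kept separate: the same bit pattern orders differently under unsigned and signed interpretation. The sketch below is ours (the helper name `le_signedness_demo` is not in the patch) and assumes AVX-512BW has been detected at runtime.

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// 0x80 is 128 as an unsigned byte but -128 as a signed byte, so the two
/// less-than-or-equal masks below come out opposite.
///
/// # Safety
/// Requires AVX-512BW (check with `is_x86_feature_detected!("avx512bw")`).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn le_signedness_demo() -> (__mmask64, __mmask64) {
    let a = _mm512_set1_epi8(0x80u8 as i8);
    let b = _mm512_set1_epi8(1);
    // Unsigned: 128 <= 1 is false in every lane, so the mask is all zeros.
    // Signed:  -128 <= 1 is true in every lane, so the mask is all ones.
    (_mm512_cmple_epu8_mask(a, b), _mm512_cmple_epi8_mask(a, b))
}
```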
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epi8_mask&expand=980)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmple_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epu16_mask&expand=867)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpge_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<u16x32, _>(simd_ge(a.as_u16x32(), b.as_u16x32())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epu16_mask&expand=868)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpge_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epu16_mask&expand=865)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpge_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe { simd_bitmask::<u16x16, _>(simd_ge(a.as_u16x16(), b.as_u16x16())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epu16_mask&expand=866)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpge_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+    _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epu16_mask&expand=863)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpge_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<u16x8, _>(simd_ge(a.as_u16x8(), b.as_u16x8())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epu16_mask&expand=864)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpge_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epu8_mask&expand=885)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpge_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<u8x64, _>(simd_ge(a.as_u8x64(), b.as_u8x64())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epu8_mask&expand=886)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpge_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epu8_mask&expand=883)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpge_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<u8x32, _>(simd_ge(a.as_u8x32(), b.as_u8x32())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epu8_mask&expand=884)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpge_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epu8_mask&expand=881)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpge_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<u8x16, _>(simd_ge(a.as_u8x16(), b.as_u8x16())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epu8_mask&expand=882)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpge_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epi16_mask&expand=843)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpge_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<i16x32, _>(simd_ge(a.as_i16x32(), b.as_i16x32())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epi16_mask&expand=844)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpge_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epi16_mask&expand=841)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpge_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe { simd_bitmask::<i16x16, _>(simd_ge(a.as_i16x16(), b.as_i16x16())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epi16_mask&expand=842)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpge_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+    _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epi16_mask&expand=839)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpge_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<i16x8, _>(simd_ge(a.as_i16x8(), b.as_i16x8())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epi16_mask&expand=840)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpge_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epi8_mask&expand=861)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpge_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<i8x64, _>(simd_ge(a.as_i8x64(), b.as_i8x64())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epi8_mask&expand=862)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpge_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epi8_mask&expand=859)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpge_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<i8x32, _>(simd_ge(a.as_i8x32(), b.as_i8x32())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epi8_mask&expand=860)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpge_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epi8_mask&expand=857)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpge_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<i8x16, _>(simd_ge(a.as_i8x16(), b.as_i8x16())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epi8_mask&expand=858)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpge_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
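A sketch of the zeromask contract shared by every `_mask_` variant above: it behaves as the plain compare ANDed with `k1`. The helper name `zeromask_demo` and the assertion are ours, assuming AVX-512BW support has been verified at runtime.

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Demonstrates that the zeromask form never reports a lane whose bit in
/// `k1` is clear.
///
/// # Safety
/// Requires AVX-512BW (check with `is_x86_feature_detected!("avx512bw")`).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn zeromask_demo(k1: __mmask32, a: __m512i, b: __m512i) {
    let full = _mm512_cmpge_epi16_mask(a, b);
    let masked = _mm512_mask_cmpge_epi16_mask(k1, a, b);
    // Both masks are plain integers, so the relation is a simple bitwise AND.
    assert_eq!(masked, full & k1);
}
```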
+/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epu16_mask&expand=801)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpeq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<u16x32, _>(simd_eq(a.as_u16x32(), b.as_u16x32())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epu16_mask&expand=802)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpeq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epu16_mask&expand=799)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpeq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe { simd_bitmask::<u16x16, _>(simd_eq(a.as_u16x16(), b.as_u16x16())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epu16_mask&expand=800)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpeq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+    _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epu16_mask&expand=797)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpeq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<u16x8, _>(simd_eq(a.as_u16x8(), b.as_u16x8())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epu16_mask&expand=798)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpeq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epu8_mask&expand=819)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpeq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<u8x64, _>(simd_eq(a.as_u8x64(), b.as_u8x64())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epu8_mask&expand=820)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpeq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epu8_mask&expand=817)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpeq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<u8x32, _>(simd_eq(a.as_u8x32(), b.as_u8x32())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epu8_mask&expand=818)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpeq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epu8_mask&expand=815)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpeq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<u8x16, _>(simd_eq(a.as_u8x16(), b.as_u8x16())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epu8_mask&expand=816)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpeq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epi16_mask&expand=771)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpeq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<i16x32, _>(simd_eq(a.as_i16x32(), b.as_i16x32())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epi16_mask&expand=772)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpeq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16_mask&expand=769)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpeq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe { simd_bitmask::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epi16_mask&expand=770)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpeq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+    _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16_mask&expand=767)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpeq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epi16_mask&expand=768)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpeq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epi8_mask&expand=795)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpeq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<i8x64, _>(simd_eq(a.as_i8x64(), b.as_i8x64())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epi8_mask&expand=796)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpeq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b)
+}
+
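A common consumer of the equality bitmask is locating the first mismatch between two vectors. The sketch below is illustrative only (the helper name `first_diff` is ours) and assumes AVX-512BW has been detected before the call.

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Returns the index of the first 8-bit lane where `a` and `b` differ, if any.
///
/// # Safety
/// Requires AVX-512BW (check with `is_x86_feature_detected!("avx512bw")`).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn first_diff(a: __m512i, b: __m512i) -> Option<u32> {
    // Invert the equality mask so set bits mark mismatching lanes.
    let ne = !_mm512_cmpeq_epi8_mask(a, b);
    if ne == 0 { None } else { Some(ne.trailing_zeros()) }
}
```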
+/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8_mask&expand=793)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpeq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epi8_mask&expand=794)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpeq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8_mask&expand=791)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpeq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epi8_mask&expand=792)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpeq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epu16_mask&expand=1106)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpneq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<u16x32, _>(simd_ne(a.as_u16x32(), b.as_u16x32())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epu16_mask&expand=1107)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpneq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epu16_mask&expand=1104)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpneq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe { simd_bitmask::<u16x16, _>(simd_ne(a.as_u16x16(), b.as_u16x16())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epu16_mask&expand=1105)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpneq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+    _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epu16_mask&expand=1102)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpneq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<u16x8, _>(simd_ne(a.as_u16x8(), b.as_u16x8())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epu16_mask&expand=1103)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpneq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epu8_mask&expand=1124)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpneq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<u8x64, _>(simd_ne(a.as_u8x64(), b.as_u8x64())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epu8_mask&expand=1125)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpneq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epu8_mask&expand=1122)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpneq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<u8x32, _>(simd_ne(a.as_u8x32(), b.as_u8x32())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epu8_mask&expand=1123)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpneq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epu8_mask&expand=1120)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpneq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<u8x16, _>(simd_ne(a.as_u8x16(), b.as_u8x16())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epu8_mask&expand=1121)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpneq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epi16_mask&expand=1082)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpneq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<i16x32, _>(simd_ne(a.as_i16x32(), b.as_i16x32())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epi16_mask&expand=1083)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpneq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epi16_mask&expand=1080)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpneq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe { simd_bitmask::<i16x16, _>(simd_ne(a.as_i16x16(), b.as_i16x16())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epi16_mask&expand=1081)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpneq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+    _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epi16_mask&expand=1078)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpneq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<i16x8, _>(simd_ne(a.as_i16x8(), b.as_i16x8())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epi16_mask&expand=1079)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpneq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epi8_mask&expand=1100)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpneq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<i8x64, _>(simd_ne(a.as_i8x64(), b.as_i8x64())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epi8_mask&expand=1101)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpneq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epi8_mask&expand=1098)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpneq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<i8x32, _>(simd_ne(a.as_i8x32(), b.as_i8x32())) }
+}
+
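A brief usage note (ours, not part of the patch): because the not-equal intrinsics return a scalar bitmask, "do these vectors differ at all?" is a single comparison against zero. The helper below additionally needs AVX-512VL since it uses the 256-bit form.

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Returns true if any of the 32 byte lanes of `a` and `b` differ.
///
/// # Safety
/// Requires AVX-512BW and AVX-512VL (check with `is_x86_feature_detected!`).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512bw,avx512vl")]
unsafe fn any_byte_differs(a: __m256i, b: __m256i) -> bool {
    // A non-zero __mmask32 means at least one lane compared not-equal.
    _mm256_cmpneq_epi8_mask(a, b) != 0
}
```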
+/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epi8_mask&expand=1099)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpneq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epi8_mask&expand=1096)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpneq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<i8x16, _>(simd_ne(a.as_i8x16(), b.as_i8x16())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epi8_mask&expand=1097)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpneq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by `IMM8`, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epu16_mask&expand=715)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm512_cmp_epu16_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_u16x32();
+        let b = b.as_u16x32();
+        let r = match IMM8 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i16x32::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i16x32::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epu16_mask&expand=716)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm512_mask_cmp_epu16_mask<const IMM8: i32>(
+    k1: __mmask32,
+    a: __m512i,
+    b: __m512i,
+) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_u16x32();
+        let b = b.as_u16x32();
+        let k1 = simd_select_bitmask(k1, i16x32::splat(-1), i16x32::ZERO);
+        let r = match IMM8 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i16x32::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epu16_mask&expand=713)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm256_cmp_epu16_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_u16x16();
+        let b = b.as_u16x16();
+        let r = match IMM8 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i16x16::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i16x16::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epu16_mask&expand=714)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm256_mask_cmp_epu16_mask<const IMM8: i32>(
+    k1: __mmask16,
+    a: __m256i,
+    b: __m256i,
+) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_u16x16();
+        let b = b.as_u16x16();
+        let k1 = simd_select_bitmask(k1, i16x16::splat(-1), i16x16::ZERO);
+        let r = match IMM8 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i16x16::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
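+///
+/// A minimal usage sketch (illustrative values; it assumes the required CPU features
+/// are detected at runtime, and uses the `_MM_CMPINT_LT` predicate constant from the
+/// AVX-512F module):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     if is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512vl") {
+///         use std::arch::x86_64::*;
+///         // SAFETY: the required CPU features were detected above.
+///         unsafe {
+///             let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+///             let b = _mm_set1_epi16(4);
+///             // Unsigned less-than: only lanes 0..=3 hold values below 4.
+///             let k = _mm_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b);
+///             assert_eq!(k, 0b0000_1111);
+///         }
+///     }
+/// }
+/// ```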
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epu16_mask&expand=711)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm_cmp_epu16_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_u16x8();
+        let b = b.as_u16x8();
+        let r = match IMM8 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i16x8::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i16x8::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epu16_mask&expand=712)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm_mask_cmp_epu16_mask<const IMM8: i32>(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_u16x8();
+        let b = b.as_u16x8();
+        let k1 = simd_select_bitmask(k1, i16x8::splat(-1), i16x8::ZERO);
+        let r = match IMM8 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i16x8::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epu8_mask&expand=733)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm512_cmp_epu8_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_u8x64();
+        let b = b.as_u8x64();
+        let r = match IMM8 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i8x64::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i8x64::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epu8_mask&expand=734)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm512_mask_cmp_epu8_mask<const IMM8: i32>(
+    k1: __mmask64,
+    a: __m512i,
+    b: __m512i,
+) -> __mmask64 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_u8x64();
+        let b = b.as_u8x64();
+        let k1 = simd_select_bitmask(k1, i8x64::splat(-1), i8x64::ZERO);
+        let r = match IMM8 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i8x64::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epu8_mask&expand=731)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm256_cmp_epu8_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_u8x32();
+        let b = b.as_u8x32();
+        let r = match IMM8 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i8x32::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i8x32::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epu8_mask&expand=732)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm256_mask_cmp_epu8_mask<const IMM8: i32>(
+    k1: __mmask32,
+    a: __m256i,
+    b: __m256i,
+) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_u8x32();
+        let b = b.as_u8x32();
+        let k1 = simd_select_bitmask(k1, i8x32::splat(-1), i8x32::ZERO);
+        let r = match IMM8 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i8x32::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epu8_mask&expand=729)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm_cmp_epu8_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_u8x16();
+        let b = b.as_u8x16();
+        let r = match IMM8 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i8x16::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i8x16::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epu8_mask&expand=730)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm_mask_cmp_epu8_mask<const IMM8: i32>(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_u8x16();
+        let b = b.as_u8x16();
+        let k1 = simd_select_bitmask(k1, i8x16::splat(-1), i8x16::ZERO);
+        let r = match IMM8 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i8x16::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epi16_mask&expand=691)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm512_cmp_epi16_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_i16x32();
+        let b = b.as_i16x32();
+        let r = match IMM8 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i16x32::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i16x32::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
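+///
+/// A minimal usage sketch (illustrative values; it assumes AVX-512BW support is
+/// detected at runtime, and uses the `_MM_CMPINT_LT` predicate constant from the
+/// AVX-512F module):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     #[target_feature(enable = "avx512bw")]
+///     fn demo() {
+///         use std::arch::x86_64::*;
+///         let a = _mm512_set1_epi16(-1);
+///         let b = _mm512_set1_epi16(0);
+///         // Signed less-than holds in every lane; the zeromask keeps only bit 0.
+///         let k = _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(0b1, a, b);
+///         assert_eq!(k, 0b1);
+///     }
+///     if is_x86_feature_detected!("avx512bw") {
+///         // SAFETY: the required CPU feature was detected above.
+///         unsafe { demo() }
+///     }
+/// }
+/// ```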
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epi16_mask&expand=692)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm512_mask_cmp_epi16_mask<const IMM8: i32>(
+    k1: __mmask32,
+    a: __m512i,
+    b: __m512i,
+) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_i16x32();
+        let b = b.as_i16x32();
+        let k1 = simd_select_bitmask(k1, i16x32::splat(-1), i16x32::ZERO);
+        let r = match IMM8 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i16x32::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epi16_mask&expand=689)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm256_cmp_epi16_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_i16x16();
+        let b = b.as_i16x16();
+        let r = match IMM8 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i16x16::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i16x16::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epi16_mask&expand=690)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm256_mask_cmp_epi16_mask<const IMM8: i32>(
+    k1: __mmask16,
+    a: __m256i,
+    b: __m256i,
+) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_i16x16();
+        let b = b.as_i16x16();
+        let k1 = simd_select_bitmask(k1, i16x16::splat(-1), i16x16::ZERO);
+        let r = match IMM8 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i16x16::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epi16_mask&expand=687)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm_cmp_epi16_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_i16x8();
+        let b = b.as_i16x8();
+        let r = match IMM8 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i16x8::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i16x8::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epi16_mask&expand=688)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm_mask_cmp_epi16_mask<const IMM8: i32>(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_i16x8();
+        let b = b.as_i16x8();
+        let k1 = simd_select_bitmask(k1, i16x8::splat(-1), i16x8::ZERO);
+        let r = match IMM8 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i16x8::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epi8_mask&expand=709)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm512_cmp_epi8_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_i8x64();
+        let b = b.as_i8x64();
+        let r = match IMM8 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i8x64::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i8x64::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epi8_mask&expand=710)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm512_mask_cmp_epi8_mask<const IMM8: i32>(
+    k1: __mmask64,
+    a: __m512i,
+    b: __m512i,
+) -> __mmask64 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_i8x64();
+        let b = b.as_i8x64();
+        let k1 = simd_select_bitmask(k1, i8x64::splat(-1), i8x64::ZERO);
+        let r = match IMM8 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i8x64::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epi8_mask&expand=707)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm256_cmp_epi8_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_i8x32();
+        let b = b.as_i8x32();
+        let r = match IMM8 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i8x32::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i8x32::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epi8_mask&expand=708)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm256_mask_cmp_epi8_mask<const IMM8: i32>(
+    k1: __mmask32,
+    a: __m256i,
+    b: __m256i,
+) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_i8x32();
+        let b = b.as_i8x32();
+        let k1 = simd_select_bitmask(k1, i8x32::splat(-1), i8x32::ZERO);
+        let r = match IMM8 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i8x32::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epi8_mask&expand=705)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm_cmp_epi8_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_i8x16();
+        let b = b.as_i8x16();
+        let r = match IMM8 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i8x16::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i8x16::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epi8_mask&expand=706)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm_mask_cmp_epi8_mask<const IMM8: i32>(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_i8x16();
+        let b = b.as_i8x16();
+        let k1 = simd_select_bitmask(k1, i8x16::splat(-1), i8x16::ZERO);
+        let r = match IMM8 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i8x16::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Reduce the packed 16-bit integers in a by addition. Returns the sum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_add_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_reduce_add_epi16(a: __m256i) -> i16 {
+    unsafe { simd_reduce_add_unordered(a.as_i16x16()) }
+}
+
+/// Reduce the packed 16-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_add_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_mask_reduce_add_epi16(k: __mmask16, a: __m256i) -> i16 {
+    unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::ZERO)) }
+}
+
+/// Reduce the packed 16-bit integers in a by addition. Returns the sum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_add_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_reduce_add_epi16(a: __m128i) -> i16 {
+    unsafe { simd_reduce_add_unordered(a.as_i16x8()) }
+}
+
+/// Reduce the packed 16-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
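+///
+/// A minimal usage sketch (illustrative values; it assumes the required CPU features
+/// are detected at runtime before the intrinsic is called):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     if is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512vl") {
+///         use std::arch::x86_64::*;
+///         // SAFETY: the required CPU features were detected above.
+///         unsafe {
+///             let a = _mm_set1_epi16(3);
+///             // Only the four lanes selected by the mask contribute to the sum.
+///             let sum = _mm_mask_reduce_add_epi16(0b0000_1111, a);
+///             assert_eq!(sum, 12);
+///         }
+///     }
+/// }
+/// ```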
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_add_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_add_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::ZERO)) } +} + +/// Reduce the packed 8-bit integers in a by addition. Returns the sum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_add_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_add_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_add_unordered(a.as_i8x32()) } +} + +/// Reduce the packed 8-bit integers in a by addition using mask k. Returns the sum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_add_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_add_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::ZERO)) } +} + +/// Reduce the packed 8-bit integers in a by addition. Returns the sum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_add_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_add_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_add_unordered(a.as_i8x16()) } +} + +/// Reduce the packed 8-bit integers in a by addition using mask k. Returns the sum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_add_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_add_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::ZERO)) } +} + +/// Reduce the packed 16-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_and_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_and_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_and(a.as_i16x16()) } +} + +/// Reduce the packed 16-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_and_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_and_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i16x16(), + _mm256_set1_epi64x(-1).as_i16x16(), + )) + } +} + +/// Reduce the packed 16-bit integers in a by bitwise AND. 
Returns the bitwise AND of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_and_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_and_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_and(a.as_i16x8()) } +} + +/// Reduce the packed 16-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_and_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_and_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i16x8(), + _mm_set1_epi64x(-1).as_i16x8(), + )) + } +} + +/// Reduce the packed 8-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_and_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_and_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_and(a.as_i8x32()) } +} + +/// Reduce the packed 8-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_and_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_and_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i8x32(), + _mm256_set1_epi64x(-1).as_i8x32(), + )) + } +} + +/// Reduce the packed 8-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_and_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_and_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_and(a.as_i8x16()) } +} + +/// Reduce the packed 8-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_and_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_and_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i8x16(), + _mm_set1_epi64x(-1).as_i8x16(), + )) + } +} + +/// Reduce the packed 16-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_max_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_max(a.as_i16x16()) } +} + +/// Reduce the packed 16-bit integers in a by maximum using mask k. 
Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_max_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(-32768))) } +} + +/// Reduce the packed 16-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_max_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_max(a.as_i16x8()) } +} + +/// Reduce the packed 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_max_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(-32768))) } +} + +/// Reduce the packed 8-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_max_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_max(a.as_i8x32()) } +} + +/// Reduce the packed 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_max_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(-128))) } +} + +/// Reduce the packed 8-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_max_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_max(a.as_i8x16()) } +} + +/// Reduce the packed 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_max_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(-128))) } +} + +/// Reduce the packed unsigned 16-bit integers in a by maximum. Returns the maximum of all elements in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_max_epu16(a: __m256i) -> u16 { + unsafe { simd_reduce_max(a.as_u16x16()) } +} + +/// Reduce the packed unsigned 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_max_epu16(k: __mmask16, a: __m256i) -> u16 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u16x16(), u16x16::ZERO)) } +} + +/// Reduce the packed unsigned 16-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_max_epu16(a: __m128i) -> u16 { + unsafe { simd_reduce_max(a.as_u16x8()) } +} + +/// Reduce the packed unsigned 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_max_epu16(k: __mmask8, a: __m128i) -> u16 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u16x8(), u16x8::ZERO)) } +} + +/// Reduce the packed unsigned 8-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_max_epu8(a: __m256i) -> u8 { + unsafe { simd_reduce_max(a.as_u8x32()) } +} + +/// Reduce the packed unsigned 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_max_epu8(k: __mmask32, a: __m256i) -> u8 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u8x32(), u8x32::ZERO)) } +} + +/// Reduce the packed unsigned 8-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_max_epu8(a: __m128i) -> u8 { + unsafe { simd_reduce_max(a.as_u8x16()) } +} + +/// Reduce the packed unsigned 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. 
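+///
+/// A minimal usage sketch (illustrative values; it assumes the required CPU features
+/// are detected at runtime before the intrinsic is called):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     if is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512vl") {
+///         use std::arch::x86_64::*;
+///         // SAFETY: the required CPU features were detected above.
+///         unsafe {
+///             let a = _mm_setr_epi8(9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+///             // Lane 0 (value 9) is masked off; inactive lanes are treated as 0.
+///             let max = _mm_mask_reduce_max_epu8(0b1111_1111_1111_1110, a);
+///             assert_eq!(max, 1);
+///         }
+///     }
+/// }
+/// ```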
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_max_epu8(k: __mmask16, a: __m128i) -> u8 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u8x16(), u8x16::ZERO)) } +} + +/// Reduce the packed 16-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_min_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_min(a.as_i16x16()) } +} + +/// Reduce the packed 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_min_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(0x7fff))) } +} + +/// Reduce the packed 16-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_min_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_min(a.as_i16x8()) } +} + +/// Reduce the packed 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_min_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(0x7fff))) } +} + +/// Reduce the packed 8-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_min_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_min(a.as_i8x32()) } +} + +/// Reduce the packed 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_min_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(0x7f))) } +} + +/// Reduce the packed 8-bit integers in a by minimum. Returns the minimum of all elements in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_min_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_min(a.as_i8x16()) } +} + +/// Reduce the packed 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_min_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(0x7f))) } +} + +/// Reduce the packed unsigned 16-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_min_epu16(a: __m256i) -> u16 { + unsafe { simd_reduce_min(a.as_u16x16()) } +} + +/// Reduce the packed unsigned 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_min_epu16(k: __mmask16, a: __m256i) -> u16 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u16x16(), u16x16::splat(0xffff))) } +} + +/// Reduce the packed unsigned 16-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_min_epu16(a: __m128i) -> u16 { + unsafe { simd_reduce_min(a.as_u16x8()) } +} + +/// Reduce the packed unsigned 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_min_epu16(k: __mmask8, a: __m128i) -> u16 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u16x8(), u16x8::splat(0xffff))) } +} + +/// Reduce the packed unsigned 8-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_min_epu8(a: __m256i) -> u8 { + unsafe { simd_reduce_min(a.as_u8x32()) } +} + +/// Reduce the packed unsigned 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_min_epu8(k: __mmask32, a: __m256i) -> u8 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u8x32(), u8x32::splat(0xff))) } +} + +/// Reduce the packed unsigned 8-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_min_epu8(a: __m128i) -> u8 { + unsafe { simd_reduce_min(a.as_u8x16()) } +} + +/// Reduce the packed unsigned 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_min_epu8(k: __mmask16, a: __m128i) -> u8 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u8x16(), u8x16::splat(0xff))) } +} + +/// Reduce the packed 16-bit integers in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_mul_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_mul_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_mul_unordered(a.as_i16x16()) } +} + +/// Reduce the packed 16-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_mul_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_mul_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(1))) } +} + +/// Reduce the packed 16-bit integers in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_mul_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_mul_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_mul_unordered(a.as_i16x8()) } +} + +/// Reduce the packed 16-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_mul_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_mul_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(1))) } +} + +/// Reduce the packed 8-bit integers in a by multiplication. 
Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_mul_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_mul_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_mul_unordered(a.as_i8x32()) } +} + +/// Reduce the packed 8-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_mul_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_mul_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(1))) } +} + +/// Reduce the packed 8-bit integers in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_mul_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_mul_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_mul_unordered(a.as_i8x16()) } +} + +/// Reduce the packed 8-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_mul_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_mul_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(1))) } +} + +/// Reduce the packed 16-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_or_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_or_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_or(a.as_i16x16()) } +} + +/// Reduce the packed 16-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_or_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_or_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i16x16(), i16x16::ZERO)) } +} + +/// Reduce the packed 16-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_or_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_or_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_or(a.as_i16x8()) } +} + +/// Reduce the packed 16-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_or_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_or_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i16x8(), i16x8::ZERO)) } +} + +/// Reduce the packed 8-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_or_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_or_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_or(a.as_i8x32()) } +} + +/// Reduce the packed 8-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_or_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_or_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i8x32(), i8x32::ZERO)) } +} + +/// Reduce the packed 8-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_or_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_or_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_or(a.as_i8x16()) } +} + +/// Reduce the packed 8-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_or_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_or_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i8x16(), i8x16::ZERO)) } +} + +/// Load 512-bits (composed of 32 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi16&expand=3368) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm512_loadu_epi16(mem_addr: *const i16) -> __m512i { + ptr::read_unaligned(mem_addr as *const __m512i) +} + +/// Load 256-bits (composed of 16 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. 
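+///
+/// A minimal usage sketch (illustrative values; it assumes AVX-512BW and AVX-512VL
+/// support is detected at runtime before the intrinsic is called):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     #[target_feature(enable = "avx512bw,avx512vl")]
+///     fn demo() {
+///         use std::arch::x86_64::*;
+///         let data: [i16; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+///         // SAFETY: `data` provides 16 readable 16-bit elements; no alignment is required.
+///         let v = unsafe { _mm256_loadu_epi16(data.as_ptr()) };
+///         assert_eq!(_mm256_reduce_add_epi16(v), 120);
+///     }
+///     if is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512vl") {
+///         // SAFETY: the required CPU features were detected above.
+///         unsafe { demo() }
+///     }
+/// }
+/// ```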
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi16&expand=3365) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm256_loadu_epi16(mem_addr: *const i16) -> __m256i { + ptr::read_unaligned(mem_addr as *const __m256i) +} + +/// Load 128-bits (composed of 8 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi16&expand=3362) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm_loadu_epi16(mem_addr: *const i16) -> __m128i { + ptr::read_unaligned(mem_addr as *const __m128i) +} + +/// Load 512-bits (composed of 64 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi8&expand=3395) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm512_loadu_epi8(mem_addr: *const i8) -> __m512i { + ptr::read_unaligned(mem_addr as *const __m512i) +} + +/// Load 256-bits (composed of 32 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi8&expand=3392) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm256_loadu_epi8(mem_addr: *const i8) -> __m256i { + ptr::read_unaligned(mem_addr as *const __m256i) +} + +/// Load 128-bits (composed of 16 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi8&expand=3389) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm_loadu_epi8(mem_addr: *const i8) -> __m128i { + ptr::read_unaligned(mem_addr as *const __m128i) +} + +/// Store 512-bits (composed of 32 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi16&expand=5622) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm512_storeu_epi16(mem_addr: *mut i16, a: __m512i) { + ptr::write_unaligned(mem_addr as *mut __m512i, a); +} + +/// Store 256-bits (composed of 16 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi16&expand=5620) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm256_storeu_epi16(mem_addr: *mut i16, a: __m256i) { + ptr::write_unaligned(mem_addr as *mut __m256i, a); +} + +/// Store 128-bits (composed of 8 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi16&expand=5618) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm_storeu_epi16(mem_addr: *mut i16, a: __m128i) { + ptr::write_unaligned(mem_addr as *mut __m128i, a); +} + +/// Store 512-bits (composed of 64 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi8&expand=5640) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm512_storeu_epi8(mem_addr: *mut i8, a: __m512i) { + ptr::write_unaligned(mem_addr as *mut __m512i, a); +} + +/// Store 256-bits (composed of 32 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi8&expand=5638) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm256_storeu_epi8(mem_addr: *mut i8, a: __m256i) { + ptr::write_unaligned(mem_addr as *mut __m256i, a); +} + +/// Store 128-bits (composed of 16 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi8&expand=5636) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm_storeu_epi8(mem_addr: *mut i8, a: __m128i) { + ptr::write_unaligned(mem_addr as *mut __m128i, a); +} + +/// Load packed 16-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
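+///
+/// Illustrative usage of the writemask semantics (not part of the original patch;
+/// assumes `avx512bw` is available at runtime):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let data = [5i16; 32];
+/// unsafe {
+///     let src = _mm512_set1_epi16(-1);
+///     // Even lanes are loaded from memory; odd lanes keep the value from `src`.
+///     let v = _mm512_mask_loadu_epi16(src, 0b01010101_01010101_01010101_01010101, data.as_ptr());
+/// }
+/// ```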
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512bw")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *const i16) -> __m512i { + transmute(loaddqu16_512(mem_addr, src.as_i16x32(), k)) +} + +/// Load packed 16-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512bw")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i { + _mm512_mask_loadu_epi16(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load packed 8-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *const i8) -> __m512i { + transmute(loaddqu8_512(mem_addr, src.as_i8x64(), k)) +} + +/// Load packed 8-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i { + _mm512_mask_loadu_epi8(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load packed 16-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *const i16) -> __m256i { + transmute(loaddqu16_256(mem_addr, src.as_i16x16(), k)) +} + +/// Load packed 16-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
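+///
+/// Illustrative usage of the zeromask semantics (not part of the original patch;
+/// assumes `avx512bw` and `avx512vl` are available at runtime):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let data = [5i16; 16];
+/// unsafe {
+///     // Only the low eight lanes are loaded; the upper eight lanes are zeroed.
+///     let v = _mm256_maskz_loadu_epi16(0x00FF, data.as_ptr());
+/// }
+/// ```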
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i { + _mm256_mask_loadu_epi16(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load packed 8-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *const i8) -> __m256i { + transmute(loaddqu8_256(mem_addr, src.as_i8x32(), k)) +} + +/// Load packed 8-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i { + _mm256_mask_loadu_epi8(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load packed 16-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i16) -> __m128i { + transmute(loaddqu16_128(mem_addr, src.as_i16x8(), k)) +} + +/// Load packed 16-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i { + _mm_mask_loadu_epi16(_mm_setzero_si128(), k, mem_addr) +} + +/// Load packed 8-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
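+///
+/// Illustrative usage (not part of the original patch; assumes `avx512bw` and
+/// `avx512vl` are available at runtime):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let bytes = [1i8; 16];
+/// unsafe {
+///     let src = _mm_set1_epi8(-1);
+///     // Bytes whose mask bit is set come from memory; the rest come from `src`.
+///     let v = _mm_mask_loadu_epi8(src, 0x0F0F, bytes.as_ptr());
+/// }
+/// ```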
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i8) -> __m128i { + transmute(loaddqu8_128(mem_addr, src.as_i8x16(), k)) +} + +/// Load packed 8-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i { + _mm_mask_loadu_epi8(_mm_setzero_si128(), k, mem_addr) +} + +/// Store packed 16-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi16) +#[inline] +#[target_feature(enable = "avx512bw")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: __m512i) { + storedqu16_512(mem_addr, a.as_i16x32(), mask) +} + +/// Store packed 8-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m512i) { + storedqu8_512(mem_addr, a.as_i8x64(), mask) +} + +/// Store packed 16-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: __m256i) { + storedqu16_256(mem_addr, a.as_i16x16(), mask) +} + +/// Store packed 8-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m256i) { + storedqu8_256(mem_addr, a.as_i8x32(), mask) +} + +/// Store packed 16-bit integers from a into memory using writemask k. 
+/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m128i) { + storedqu16_128(mem_addr, a.as_i16x8(), mask) +} + +/// Store packed 8-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128i) { + storedqu8_128(mem_addr, a.as_i8x16(), mask) +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_madd_epi16&expand=3511) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddwd))] +pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_madd_epi16&expand=3512) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddwd))] +pub fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let madd = _mm512_madd_epi16(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, madd, src.as_i32x16())) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_madd_epi16&expand=3513) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddwd))] +pub fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let madd = _mm512_madd_epi16(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, madd, i32x16::ZERO)) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. 
Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_madd_epi16&expand=3509) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddwd))] +pub fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let madd = _mm256_madd_epi16(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, madd, src.as_i32x8())) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_madd_epi16&expand=3510) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddwd))] +pub fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let madd = _mm256_madd_epi16(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, madd, i32x8::ZERO)) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_madd_epi16&expand=3506) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddwd))] +pub fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let madd = _mm_madd_epi16(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, madd, src.as_i32x4())) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_madd_epi16&expand=3507) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddwd))] +pub fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let madd = _mm_madd_epi16(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, madd, i32x4::ZERO)) + } +} + +/// Vertically multiply each unsigned 8-bit integer from a with the corresponding signed 8-bit integer from b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst. 
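+///
+/// Scalar model of a single output lane, for illustration only (the helper below is
+/// not part of the patch): unsigned bytes from `a` are multiplied by the matching
+/// signed bytes from `b`, and each adjacent pair of products is added with signed
+/// 16-bit saturation.
+///
+/// ```ignore
+/// fn maddubs_lane(a: [u8; 2], b: [i8; 2]) -> i16 {
+///     let t = a[0] as i32 * b[0] as i32 + a[1] as i32 * b[1] as i32;
+///     t.clamp(i16::MIN as i32, i16::MAX as i32) as i16
+/// }
+/// assert_eq!(maddubs_lane([255, 255], [127, 127]), i16::MAX); // saturates
+/// ```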
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maddubs_epi16&expand=3539) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddubsw))] +pub fn _mm512_maddubs_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpmaddubsw(a.as_i8x64(), b.as_i8x64())) } +} + +/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_maddubs_epi16&expand=3540) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddubsw))] +pub fn _mm512_mask_maddubs_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let madd = _mm512_maddubs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, madd, src.as_i16x32())) + } +} + +/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_maddubs_epi16&expand=3541) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddubsw))] +pub fn _mm512_maskz_maddubs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let madd = _mm512_maddubs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, madd, i16x32::ZERO)) + } +} + +/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_maddubs_epi16&expand=3537) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddubsw))] +pub fn _mm256_mask_maddubs_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let madd = _mm256_maddubs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, madd, src.as_i16x16())) + } +} + +/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_maddubs_epi16&expand=3538) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddubsw))] +pub fn _mm256_maskz_maddubs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let madd = _mm256_maddubs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, madd, i16x16::ZERO)) + } +} + +/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_maddubs_epi16&expand=3534) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddubsw))] +pub fn _mm_mask_maddubs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let madd = _mm_maddubs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, madd, src.as_i16x8())) + } +} + +/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_maddubs_epi16&expand=3535) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddubsw))] +pub fn _mm_maskz_maddubs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let madd = _mm_maddubs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, madd, i16x8::ZERO)) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_packs_epi32&expand=4091) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackssdw))] +pub fn _mm512_packs_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpackssdw(a.as_i32x16(), b.as_i32x16())) } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
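+///
+/// Illustrative usage (not part of the original patch; assumes `avx512bw` and
+/// `avx512f` are available at runtime):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_set1_epi32(100_000);  // saturates to i16::MAX when packed
+///     let b = _mm512_set1_epi32(-100_000); // saturates to i16::MIN when packed
+///     let src = _mm512_set1_epi16(0);
+///     // Lanes selected by the mask take the packed result; the rest are copied from `src`.
+///     let r = _mm512_mask_packs_epi32(src, 0xFFFF_FFFF, a, b);
+/// }
+/// ```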
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_packs_epi32&expand=4089) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackssdw))] +pub fn _mm512_mask_packs_epi32(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packs_epi32(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, pack, src.as_i16x32())) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_packs_epi32&expand=4090) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackssdw))] +pub fn _mm512_maskz_packs_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packs_epi32(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, pack, i16x32::ZERO)) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_packs_epi32&expand=4086) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackssdw))] +pub fn _mm256_mask_packs_epi32(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packs_epi32(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, pack, src.as_i16x16())) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_packs_epi32&expand=4087) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackssdw))] +pub fn _mm256_maskz_packs_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packs_epi32(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, pack, i16x16::ZERO)) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_packs_epi32&expand=4083) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackssdw))] +pub fn _mm_mask_packs_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packs_epi32(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, pack, src.as_i16x8())) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_packs_epi32&expand=4084) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackssdw))] +pub fn _mm_maskz_packs_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packs_epi32(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, pack, i16x8::ZERO)) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_packs_epi16&expand=4082) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpacksswb))] +pub fn _mm512_packs_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpacksswb(a.as_i16x32(), b.as_i16x32())) } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_packs_epi16&expand=4080) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpacksswb))] +pub fn _mm512_mask_packs_epi16(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packs_epi16(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, pack, src.as_i8x64())) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_packs_epi16&expand=4081) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpacksswb))] +pub fn _mm512_maskz_packs_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packs_epi16(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, pack, i8x64::ZERO)) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_packs_epi16&expand=4077)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+pub fn _mm256_mask_packs_epi16(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let pack = _mm256_packs_epi16(a, b).as_i8x32();
+        transmute(simd_select_bitmask(k, pack, src.as_i8x32()))
+    }
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_packs_epi16&expand=4078)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+pub fn _mm256_maskz_packs_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let pack = _mm256_packs_epi16(a, b).as_i8x32();
+        transmute(simd_select_bitmask(k, pack, i8x32::ZERO))
+    }
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_packs_epi16&expand=4074)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+pub fn _mm_mask_packs_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let pack = _mm_packs_epi16(a, b).as_i8x16();
+        transmute(simd_select_bitmask(k, pack, src.as_i8x16()))
+    }
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_packs_epi16&expand=4075)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+pub fn _mm_maskz_packs_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let pack = _mm_packs_epi16(a, b).as_i8x16();
+        transmute(simd_select_bitmask(k, pack, i8x16::ZERO))
+    }
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst.
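+///
+/// Scalar model of the unsigned saturation applied to each 32-bit input, for
+/// illustration only (the helper below is not part of the patch):
+///
+/// ```ignore
+/// fn packus_lane(x: i32) -> u16 {
+///     x.clamp(0, u16::MAX as i32) as u16
+/// }
+/// assert_eq!(packus_lane(-5), 0);
+/// assert_eq!(packus_lane(70_000), u16::MAX);
+/// ```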
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_packus_epi32&expand=4130) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackusdw))] +pub fn _mm512_packus_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpackusdw(a.as_i32x16(), b.as_i32x16())) } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_packus_epi32&expand=4128) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackusdw))] +pub fn _mm512_mask_packus_epi32(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packus_epi32(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, pack, src.as_i16x32())) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_packus_epi32&expand=4129) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackusdw))] +pub fn _mm512_maskz_packus_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packus_epi32(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, pack, i16x32::ZERO)) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_packus_epi32&expand=4125) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackusdw))] +pub fn _mm256_mask_packus_epi32(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packus_epi32(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, pack, src.as_i16x16())) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_packus_epi32&expand=4126) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackusdw))] +pub fn _mm256_maskz_packus_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packus_epi32(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, pack, i16x16::ZERO)) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_packus_epi32&expand=4122) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackusdw))] +pub fn _mm_mask_packus_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packus_epi32(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, pack, src.as_i16x8())) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_packus_epi32&expand=4123) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackusdw))] +pub fn _mm_maskz_packus_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packus_epi32(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, pack, i16x8::ZERO)) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_packus_epi16&expand=4121) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackuswb))] +pub fn _mm512_packus_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpackuswb(a.as_i16x32(), b.as_i16x32())) } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_packus_epi16&expand=4119) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackuswb))] +pub fn _mm512_mask_packus_epi16(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packus_epi16(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, pack, src.as_i8x64())) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_packus_epi16&expand=4120) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackuswb))] +pub fn _mm512_maskz_packus_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packus_epi16(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, pack, i8x64::ZERO)) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_packus_epi16&expand=4116) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackuswb))] +pub fn _mm256_mask_packus_epi16(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packus_epi16(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, pack, src.as_i8x32())) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_packus_epi16&expand=4117) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackuswb))] +pub fn _mm256_maskz_packus_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packus_epi16(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, pack, i8x32::ZERO)) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_packus_epi16&expand=4113) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackuswb))] +pub fn _mm_mask_packus_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packus_epi16(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, pack, src.as_i8x16())) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_packus_epi16&expand=4114) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackuswb))] +pub fn _mm_maskz_packus_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packus_epi16(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, pack, i8x16::ZERO)) + } +} + +/// Average packed unsigned 16-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_avg_epu16&expand=388) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgw))] +pub fn _mm512_avg_epu16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = simd_cast::<_, u32x32>(a.as_u16x32()); + let b = simd_cast::<_, u32x32>(b.as_u16x32()); + let r = simd_shr(simd_add(simd_add(a, b), u32x32::splat(1)), u32x32::splat(1)); + transmute(simd_cast::<_, u16x32>(r)) + } +} + +/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_avg_epu16&expand=389) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgw))] +pub fn _mm512_mask_avg_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let avg = _mm512_avg_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, avg, src.as_u16x32())) + } +} + +/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_avg_epu16&expand=390) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgw))] +pub fn _mm512_maskz_avg_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let avg = _mm512_avg_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, avg, u16x32::ZERO)) + } +} + +/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
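+///
+/// Scalar model of the rounding average applied to each unsigned 16-bit lane, for
+/// illustration only (the helper below is not part of the patch):
+///
+/// ```ignore
+/// fn avg_u16(a: u16, b: u16) -> u16 {
+///     ((a as u32 + b as u32 + 1) >> 1) as u16
+/// }
+/// assert_eq!(avg_u16(1, 2), 2);
+/// assert_eq!(avg_u16(u16::MAX, u16::MAX), u16::MAX);
+/// ```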
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_avg_epu16&expand=386) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgw))] +pub fn _mm256_mask_avg_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let avg = _mm256_avg_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, avg, src.as_u16x16())) + } +} + +/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_avg_epu16&expand=387) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgw))] +pub fn _mm256_maskz_avg_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let avg = _mm256_avg_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, avg, u16x16::ZERO)) + } +} + +/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_avg_epu16&expand=383) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgw))] +pub fn _mm_mask_avg_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let avg = _mm_avg_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, avg, src.as_u16x8())) + } +} + +/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_avg_epu16&expand=384) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgw))] +pub fn _mm_maskz_avg_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let avg = _mm_avg_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, avg, u16x8::ZERO)) + } +} + +/// Average packed unsigned 8-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_avg_epu8&expand=397) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgb))] +pub fn _mm512_avg_epu8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = simd_cast::<_, u16x64>(a.as_u8x64()); + let b = simd_cast::<_, u16x64>(b.as_u8x64()); + let r = simd_shr(simd_add(simd_add(a, b), u16x64::splat(1)), u16x64::splat(1)); + transmute(simd_cast::<_, u8x64>(r)) + } +} + +/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_avg_epu8&expand=398) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgb))] +pub fn _mm512_mask_avg_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let avg = _mm512_avg_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, avg, src.as_u8x64())) + } +} + +/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_avg_epu8&expand=399) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgb))] +pub fn _mm512_maskz_avg_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let avg = _mm512_avg_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, avg, u8x64::ZERO)) + } +} + +/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_avg_epu8&expand=395) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgb))] +pub fn _mm256_mask_avg_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let avg = _mm256_avg_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, avg, src.as_u8x32())) + } +} + +/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_avg_epu8&expand=396) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgb))] +pub fn _mm256_maskz_avg_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let avg = _mm256_avg_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, avg, u8x32::ZERO)) + } +} + +/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_avg_epu8&expand=392) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgb))] +pub fn _mm_mask_avg_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let avg = _mm_avg_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, avg, src.as_u8x16())) + } +} + +/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_avg_epu8&expand=393) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgb))] +pub fn _mm_maskz_avg_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let avg = _mm_avg_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, avg, u8x16::ZERO)) + } +} + +/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sll_epi16&expand=5271) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllw))] +pub fn _mm512_sll_epi16(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsllw(a.as_i16x32(), count.as_i16x8())) } +} + +/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sll_epi16&expand=5269) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllw))] +pub fn _mm512_mask_sll_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sll_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } +} + +/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sll_epi16&expand=5270) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllw))] +pub fn _mm512_maskz_sll_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sll_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } +} + +/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sll_epi16&expand=5266) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllw))] +pub fn _mm256_mask_sll_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sll_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } +} + +/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
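+///
+/// Illustrative usage (not part of the original patch; assumes `avx512bw` and
+/// `avx512vl` are available at runtime):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm256_set1_epi16(1);
+///     let count = _mm_set_epi64x(0, 3); // shift amount taken from the low 64 bits
+///     // Selected lanes become 1 << 3 = 8; lanes with a clear mask bit are zeroed.
+///     let r = _mm256_maskz_sll_epi16(0x00FF, a, count);
+/// }
+/// ```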
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sll_epi16&expand=5267)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+pub fn _mm256_maskz_sll_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i {
+    unsafe {
+        let shf = _mm256_sll_epi16(a, count).as_i16x16();
+        transmute(simd_select_bitmask(k, shf, i16x16::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sll_epi16&expand=5263)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+pub fn _mm_mask_sll_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_sll_epi16(a, count).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+    }
+}
+
+/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sll_epi16&expand=5264)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+pub fn _mm_maskz_sll_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_sll_epi16(a, count).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_slli_epi16&expand=5301)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_slli_epi16<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 16 {
+            _mm512_setzero_si512()
+        } else {
+            transmute(simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16)))
+        }
+    }
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
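+///
+/// A minimal illustrative sketch (not a doctest; values are arbitrary and AVX-512BW support
+/// is assumed to have been verified at runtime):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let src = _mm512_set1_epi16(-1);
+///     let a = _mm512_set1_epi16(3);
+///     // Lanes 0..16 are shifted (3 << 4 == 48); lanes 16..32 are copied from src (-1).
+///     let r = _mm512_mask_slli_epi16::<4>(src, 0x0000_FFFF, a);
+/// }
+/// ```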
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_slli_epi16&expand=5299)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_slli_epi16<const IMM8: u32>(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = if IMM8 >= 16 {
+            u16x32::ZERO
+        } else {
+            simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16))
+        };
+        transmute(simd_select_bitmask(k, shf, src.as_u16x32()))
+    }
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_slli_epi16&expand=5300)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_slli_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 16 {
+            _mm512_setzero_si512()
+        } else {
+            let shf = simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16));
+            transmute(simd_select_bitmask(k, shf, u16x32::ZERO))
+        }
+    }
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_slli_epi16&expand=5296)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_slli_epi16<const IMM8: u32>(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = if IMM8 >= 16 {
+            u16x16::ZERO
+        } else {
+            simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16))
+        };
+        transmute(simd_select_bitmask(k, shf, src.as_u16x16()))
+    }
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_slli_epi16&expand=5297)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_slli_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 16 {
+            _mm256_setzero_si256()
+        } else {
+            let shf = simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16));
+            transmute(simd_select_bitmask(k, shf, u16x16::ZERO))
+        }
+    }
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_slli_epi16&expand=5293)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_slli_epi16<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = if IMM8 >= 16 {
+            u16x8::ZERO
+        } else {
+            simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16))
+        };
+        transmute(simd_select_bitmask(k, shf, src.as_u16x8()))
+    }
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_slli_epi16&expand=5294)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_slli_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 16 {
+            _mm_setzero_si128()
+        } else {
+            let shf = simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16));
+            transmute(simd_select_bitmask(k, shf, u16x8::ZERO))
+        }
+    }
+}
+
+/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sllv_epi16&expand=5333)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllvw))]
+pub fn _mm512_sllv_epi16(a: __m512i, count: __m512i) -> __m512i {
+    unsafe { transmute(vpsllvw(a.as_i16x32(), count.as_i16x32())) }
+}
+
+/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sllv_epi16&expand=5331)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllvw))]
+pub fn _mm512_mask_sllv_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m512i) -> __m512i {
+    unsafe {
+        let shf = _mm512_sllv_epi16(a, count).as_i16x32();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+    }
+}
+
+/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
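+///
+/// An illustrative sketch (not a doctest; values are arbitrary and AVX-512BW support
+/// is assumed to have been verified at runtime):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_set1_epi16(1);
+///     let counts = _mm512_set1_epi16(3);
+///     // Lanes 0..16 become 1 << 3 == 8; lanes 16..32 are zeroed by the mask.
+///     // A per-lane count of 16 or more would also produce 0 in that lane.
+///     let r = _mm512_maskz_sllv_epi16(0x0000_FFFF, a, counts);
+/// }
+/// ```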
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sllv_epi16&expand=5332) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvw))] +pub fn _mm512_maskz_sllv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } +} + +/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi16&expand=5330) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvw))] +pub fn _mm256_sllv_epi16(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(vpsllvw256(a.as_i16x16(), count.as_i16x16())) } +} + +/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sllv_epi16&expand=5328) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvw))] +pub fn _mm256_mask_sllv_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } +} + +/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sllv_epi16&expand=5329) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvw))] +pub fn _mm256_maskz_sllv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } +} + +/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi16&expand=5327) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvw))] +pub fn _mm_sllv_epi16(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(vpsllvw128(a.as_i16x8(), count.as_i16x8())) } +} + +/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sllv_epi16&expand=5325) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvw))] +pub fn _mm_mask_sllv_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } +} + +/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sllv_epi16&expand=5326) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvw))] +pub fn _mm_maskz_sllv_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srl_epi16&expand=5483) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlw))] +pub fn _mm512_srl_epi16(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsrlw(a.as_i16x32(), count.as_i16x8())) } +} + +/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srl_epi16&expand=5481) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlw))] +pub fn _mm512_mask_srl_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srl_epi16&expand=5482) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlw))] +pub fn _mm512_maskz_srl_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
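+///
+/// An illustrative sketch (not a doctest; values are arbitrary, and AVX-512BW plus
+/// AVX-512VL support is assumed to have been verified at runtime):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let src = _mm256_set1_epi16(-1);
+///     let a = _mm256_set1_epi16(0x00F0);
+///     // The shift amount is taken from the low 64 bits of `count`.
+///     let count = _mm_set_epi64x(0, 4);
+///     // Lanes 0..8 become 0x00F0 >> 4 == 0x000F; lanes 8..16 are copied from src (-1).
+///     let r = _mm256_mask_srl_epi16(src, 0x00FF, a, count);
+/// }
+/// ```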
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srl_epi16&expand=5478) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlw))] +pub fn _mm256_mask_srl_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srl_epi16&expand=5479) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlw))] +pub fn _mm256_maskz_srl_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srl_epi16&expand=5475) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlw))] +pub fn _mm_mask_srl_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srl_epi16&expand=5476) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlw))] +pub fn _mm_maskz_srl_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst. 
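+///
+/// A minimal illustrative sketch (not a doctest; values are arbitrary and AVX-512BW support
+/// is assumed to have been verified at runtime):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_set1_epi16(128);
+///     // Every lane becomes 128 >> 3 == 16 (logical shift, zero fill).
+///     let r = _mm512_srli_epi16::<3>(a);
+///     // An IMM8 of 16 or more yields an all-zero vector.
+/// }
+/// ```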
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srli_epi16&expand=5513)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_srli_epi16<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 16 {
+            _mm512_setzero_si512()
+        } else {
+            transmute(simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16)))
+        }
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srli_epi16&expand=5511)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_srli_epi16<const IMM8: u32>(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = if IMM8 >= 16 {
+            u16x32::ZERO
+        } else {
+            simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16))
+        };
+        transmute(simd_select_bitmask(k, shf, src.as_u16x32()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srli_epi16&expand=5512)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_srli_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        // IMM8 is taken as u32 here; the Intel documentation appears to be incorrect on this point.
+        if IMM8 >= 16 {
+            _mm512_setzero_si512()
+        } else {
+            let shf = simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16));
+            transmute(simd_select_bitmask(k, shf, u16x32::ZERO))
+        }
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srli_epi16&expand=5508)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_srli_epi16<const IMM8: i32>(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_srli_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shf.as_i16x16(), src.as_i16x16()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srli_epi16&expand=5509)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_srli_epi16<const IMM8: i32>(k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_srli_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shf.as_i16x16(), i16x16::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srli_epi16&expand=5505)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_srli_epi16<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_srli_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shf.as_i16x8(), src.as_i16x8()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srli_epi16&expand=5506)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_srli_epi16<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_srli_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shf.as_i16x8(), i16x8::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srlv_epi16&expand=5545)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlvw))]
+pub fn _mm512_srlv_epi16(a: __m512i, count: __m512i) -> __m512i {
+    unsafe { transmute(vpsrlvw(a.as_i16x32(), count.as_i16x32())) }
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
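+///
+/// A minimal illustrative sketch (not a doctest; values are arbitrary and AVX-512BW support
+/// is assumed to have been verified at runtime):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let src = _mm512_setzero_si512();
+///     let a = _mm512_set1_epi16(64);
+///     let counts = _mm512_set1_epi16(2);
+///     // Lanes 0..16 become 64 >> 2 == 16; lanes 16..32 are copied from src (0).
+///     let r = _mm512_mask_srlv_epi16(src, 0x0000_FFFF, a, counts);
+/// }
+/// ```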
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srlv_epi16&expand=5543) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm512_mask_srlv_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srlv_epi16&expand=5544) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm512_maskz_srlv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi16&expand=5542) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm256_srlv_epi16(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(vpsrlvw256(a.as_i16x16(), count.as_i16x16())) } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srlv_epi16&expand=5540) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm256_mask_srlv_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srlv_epi16&expand=5541) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm256_maskz_srlv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi16&expand=5539) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm_srlv_epi16(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(vpsrlvw128(a.as_i16x8(), count.as_i16x8())) } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srlv_epi16&expand=5537) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm_mask_srlv_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srlv_epi16&expand=5538) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm_maskz_srlv_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sra_epi16&expand=5398) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraw))] +pub fn _mm512_sra_epi16(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsraw(a.as_i16x32(), count.as_i16x8())) } +} + +/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
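+///
+/// An illustrative sketch (not a doctest; values are arbitrary and AVX-512BW support
+/// is assumed to have been verified at runtime):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let src = _mm512_set1_epi16(7);
+///     let a = _mm512_set1_epi16(-32);
+///     // The shift amount is taken from the low 64 bits of `count`.
+///     let count = _mm_set_epi64x(0, 4);
+///     // Arithmetic shift keeps the sign: lanes 0..16 become -32 >> 4 == -2,
+///     // while lanes 16..32 are copied from src (7).
+///     let r = _mm512_mask_sra_epi16(src, 0x0000_FFFF, a, count);
+/// }
+/// ```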
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sra_epi16&expand=5396) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraw))] +pub fn _mm512_mask_sra_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sra_epi16&expand=5397) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraw))] +pub fn _mm512_maskz_sra_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sra_epi16&expand=5393) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraw))] +pub fn _mm256_mask_sra_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sra_epi16&expand=5394) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraw))] +pub fn _mm256_maskz_sra_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sra_epi16&expand=5390)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+pub fn _mm_mask_sra_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_sra_epi16(a, count).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sra_epi16&expand=5391)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+pub fn _mm_maskz_sra_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_sra_epi16(a, count).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srai_epi16&expand=5427)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_srai_epi16<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16)))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srai_epi16&expand=5425)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_srai_epi16<const IMM8: u32>(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16));
+        transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
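+///
+/// A minimal illustrative sketch (not a doctest; values are arbitrary and AVX-512BW support
+/// is assumed to have been verified at runtime):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_set1_epi16(-32);
+///     // Lanes 0..16 become -32 >> 4 == -2 (sign fill); lanes 16..32 are zeroed.
+///     // Unlike the logical shifts, an IMM8 of 16 or more is clamped to 15
+///     // rather than zeroing the result.
+///     let r = _mm512_maskz_srai_epi16::<4>(0x0000_FFFF, a);
+/// }
+/// ```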
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srai_epi16&expand=5426)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_srai_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16));
+        transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srai_epi16&expand=5422)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_srai_epi16<const IMM8: u32>(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16));
+        transmute(simd_select_bitmask(k, r, src.as_i16x16()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srai_epi16&expand=5423)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_srai_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16));
+        transmute(simd_select_bitmask(k, r, i16x16::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srai_epi16&expand=5419)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_srai_epi16<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16));
+        transmute(simd_select_bitmask(k, r, src.as_i16x8()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srai_epi16&expand=5420)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_srai_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16));
+        transmute(simd_select_bitmask(k, r, i16x8::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srav_epi16&expand=5456)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsravw))]
+pub fn _mm512_srav_epi16(a: __m512i, count: __m512i) -> __m512i {
+    unsafe { transmute(vpsravw(a.as_i16x32(), count.as_i16x32())) }
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srav_epi16&expand=5454)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsravw))]
+pub fn _mm512_mask_srav_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m512i) -> __m512i {
+    unsafe {
+        let shf = _mm512_srav_epi16(a, count).as_i16x32();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srav_epi16&expand=5455)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsravw))]
+pub fn _mm512_maskz_srav_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i {
+    unsafe {
+        let shf = _mm512_srav_epi16(a, count).as_i16x32();
+        transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
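+///
+/// An illustrative sketch (not a doctest; values are arbitrary, and AVX-512BW plus
+/// AVX-512VL support is assumed to have been verified at runtime):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm256_set1_epi16(-64);
+///     let counts = _mm256_set1_epi16(3);
+///     // Every lane becomes -64 >> 3 == -8 (sign fill); a per-lane count of 16 or
+///     // more fills the lane entirely with the sign bit.
+///     let r = _mm256_srav_epi16(a, counts);
+/// }
+/// ```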
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi16&expand=5453) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravw))] +pub fn _mm256_srav_epi16(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(vpsravw256(a.as_i16x16(), count.as_i16x16())) } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srav_epi16&expand=5451) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravw))] +pub fn _mm256_mask_srav_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srav_epi16&expand=5452) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravw))] +pub fn _mm256_maskz_srav_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi16&expand=5450) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravw))] +pub fn _mm_srav_epi16(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(vpsravw128(a.as_i16x8(), count.as_i16x8())) } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srav_epi16&expand=5448) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravw))] +pub fn _mm_mask_srav_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srav_epi16&expand=5449) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravw))] +pub fn _mm_maskz_srav_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_epi16&expand=4226) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w +pub fn _mm512_permutex2var_epi16(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpermi2w(a.as_i16x32(), idx.as_i16x32(), b.as_i16x32())) } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_epi16&expand=4223) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2w))] +pub fn _mm512_mask_permutex2var_epi16( + a: __m512i, + k: __mmask32, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32(); + transmute(simd_select_bitmask(k, permute, a.as_i16x32())) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
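+///
+/// A minimal illustrative sketch (not a doctest; values are arbitrary and AVX-512BW support
+/// is assumed to have been verified at runtime):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_set1_epi16(1);
+///     let b = _mm512_set1_epi16(2);
+///     // The low 5 bits of each index pick a lane; bit 5 picks between a (clear) and b (set).
+///     let idx = _mm512_set1_epi16(0b10_0000);
+///     // Lanes 0..16 take lane 0 of b (== 2); lanes 16..32 are zeroed by the mask.
+///     let r = _mm512_maskz_permutex2var_epi16(0x0000_FFFF, a, idx, b);
+/// }
+/// ```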
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_epi16&expand=4225) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w +pub fn _mm512_maskz_permutex2var_epi16( + k: __mmask32, + a: __m512i, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32(); + transmute(simd_select_bitmask(k, permute, i16x32::ZERO)) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_epi16&expand=4224) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2w))] +pub fn _mm512_mask2_permutex2var_epi16( + a: __m512i, + idx: __m512i, + k: __mmask32, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32(); + transmute(simd_select_bitmask(k, permute, idx.as_i16x32())) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_epi16&expand=4222) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w +pub fn _mm256_permutex2var_epi16(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpermi2w256(a.as_i16x16(), idx.as_i16x16(), b.as_i16x16())) } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_epi16&expand=4219) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2w))] +pub fn _mm256_mask_permutex2var_epi16( + a: __m256i, + k: __mmask16, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16(); + transmute(simd_select_bitmask(k, permute, a.as_i16x16())) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_epi16&expand=4221) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w +pub fn _mm256_maskz_permutex2var_epi16( + k: __mmask16, + a: __m256i, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16(); + transmute(simd_select_bitmask(k, permute, i16x16::ZERO)) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_epi16&expand=4220) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2w))] +pub fn _mm256_mask2_permutex2var_epi16( + a: __m256i, + idx: __m256i, + k: __mmask16, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16(); + transmute(simd_select_bitmask(k, permute, idx.as_i16x16())) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_epi16&expand=4218) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w +pub fn _mm_permutex2var_epi16(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpermi2w128(a.as_i16x8(), idx.as_i16x8(), b.as_i16x8())) } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_epi16&expand=4215) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2w))] +pub fn _mm_mask_permutex2var_epi16(a: __m128i, k: __mmask8, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8(); + transmute(simd_select_bitmask(k, permute, a.as_i16x8())) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_epi16&expand=4217) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w +pub fn _mm_maskz_permutex2var_epi16(k: __mmask8, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8(); + transmute(simd_select_bitmask(k, permute, i16x8::ZERO)) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_epi16&expand=4216) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2w))] +pub fn _mm_mask2_permutex2var_epi16(a: __m128i, idx: __m128i, k: __mmask8, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8(); + transmute(simd_select_bitmask(k, permute, idx.as_i16x8())) + } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi16&expand=4295) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm512_permutexvar_epi16(idx: __m512i, a: __m512i) -> __m512i { + unsafe { transmute(vpermw(a.as_i16x32(), idx.as_i16x32())) } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_epi16&expand=4293) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm512_mask_permutexvar_epi16( + src: __m512i, + k: __mmask32, + idx: __m512i, + a: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi16(idx, a).as_i16x32(); + transmute(simd_select_bitmask(k, permute, src.as_i16x32())) + } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_epi16&expand=4294) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm512_maskz_permutexvar_epi16(k: __mmask32, idx: __m512i, a: __m512i) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi16(idx, a).as_i16x32(); + transmute(simd_select_bitmask(k, permute, i16x32::ZERO)) + } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. 
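+///
+/// An illustrative sketch (not a doctest; values are arbitrary, and AVX-512BW plus
+/// AVX-512VL support is assumed to have been verified at runtime):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+///     // Each output lane i takes lane idx[i] of a, so this reverses the lane order.
+///     let idx = _mm256_setr_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+///     let r = _mm256_permutexvar_epi16(idx, a);
+///     // r now holds 15, 14, ..., 1, 0.
+/// }
+/// ```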
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_epi16&expand=4292) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm256_permutexvar_epi16(idx: __m256i, a: __m256i) -> __m256i { + unsafe { transmute(vpermw256(a.as_i16x16(), idx.as_i16x16())) } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_epi16&expand=4290) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm256_mask_permutexvar_epi16( + src: __m256i, + k: __mmask16, + idx: __m256i, + a: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi16(idx, a).as_i16x16(); + transmute(simd_select_bitmask(k, permute, src.as_i16x16())) + } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_epi16&expand=4291) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm256_maskz_permutexvar_epi16(k: __mmask16, idx: __m256i, a: __m256i) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi16(idx, a).as_i16x16(); + transmute(simd_select_bitmask(k, permute, i16x16::ZERO)) + } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutexvar_epi16&expand=4289) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm_permutexvar_epi16(idx: __m128i, a: __m128i) -> __m128i { + unsafe { transmute(vpermw128(a.as_i16x8(), idx.as_i16x8())) } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutexvar_epi16&expand=4287) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm_mask_permutexvar_epi16(src: __m128i, k: __mmask8, idx: __m128i, a: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutexvar_epi16(idx, a).as_i16x8(); + transmute(simd_select_bitmask(k, permute, src.as_i16x8())) + } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
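A similar sketch for the single-source vpermw permute (illustrative helper name and values; guard the call with the same runtime feature check as the earlier sketch): each index word selects one word of a, so a descending index reverses the vector.

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw,avx512vl")]
unsafe fn permutexvar_demo() {
    let a = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
    // Each idx word (only its low 3 bits matter here) picks one word of a.
    let idx = _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    let out: [i16; 8] = unsafe { core::mem::transmute(_mm_permutexvar_epi16(idx, a)) };
    assert_eq!(out, [17, 16, 15, 14, 13, 12, 11, 10]);
}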
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutexvar_epi16&expand=4288) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm_maskz_permutexvar_epi16(k: __mmask8, idx: __m128i, a: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutexvar_epi16(idx, a).as_i16x8(); + transmute(simd_select_bitmask(k, permute, i16x8::ZERO)) + } +} + +/// Blend packed 16-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_epi16&expand=430) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] //should be vpblendmw +pub fn _mm512_mask_blend_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_select_bitmask(k, b.as_i16x32(), a.as_i16x32())) } +} + +/// Blend packed 16-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_epi16&expand=429) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] //should be vpblendmw +pub fn _mm256_mask_blend_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_select_bitmask(k, b.as_i16x16(), a.as_i16x16())) } +} + +/// Blend packed 16-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_epi16&expand=427) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] //should be vpblendmw +pub fn _mm_mask_blend_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_select_bitmask(k, b.as_i16x8(), a.as_i16x8())) } +} + +/// Blend packed 8-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_epi8&expand=441) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] //should be vpblendmb +pub fn _mm512_mask_blend_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_select_bitmask(k, b.as_i8x64(), a.as_i8x64())) } +} + +/// Blend packed 8-bit integers from a and b using control mask k, and store the results in dst. 
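A short sketch of the mask blend (illustrative values; call under the same runtime feature check as the earlier sketch): bit i of the mask picks b for element i when set, a otherwise.

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw,avx512vl")]
unsafe fn blend_demo() {
    let a = _mm_set1_epi16(-1);
    let b = _mm_set1_epi16(7);
    // The low four mask bits select b, the high four keep a.
    let out: [i16; 8] = unsafe { core::mem::transmute(_mm_mask_blend_epi16(0b0000_1111, a, b)) };
    assert_eq!(out, [7, 7, 7, 7, -1, -1, -1, -1]);
}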
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_epi8&expand=440) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] //should be vpblendmb +pub fn _mm256_mask_blend_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_select_bitmask(k, b.as_i8x32(), a.as_i8x32())) } +} + +/// Blend packed 8-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_epi8&expand=439) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] //should be vpblendmb +pub fn _mm_mask_blend_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_select_bitmask(k, b.as_i8x16(), a.as_i8x16())) } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastw_epi16&expand=587) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm512_broadcastw_epi16(a: __m128i) -> __m512i { + unsafe { + let a = _mm512_castsi128_si512(a).as_i16x32(); + let ret: i16x32 = simd_shuffle!( + a, + a, + [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + ], + ); + transmute(ret) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastw_epi16&expand=588) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm512_mask_broadcastw_epi16(src: __m512i, k: __mmask32, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastw_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, broadcast, src.as_i16x32())) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastw_epi16&expand=589) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm512_maskz_broadcastw_epi16(k: __mmask32, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastw_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, broadcast, i16x32::ZERO)) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastw_epi16&expand=585) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm256_mask_broadcastw_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastw_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_i16x16())) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastw_epi16&expand=586) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm256_maskz_broadcastw_epi16(k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastw_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, broadcast, i16x16::ZERO)) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcastw_epi16&expand=582) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm_mask_broadcastw_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastw_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i16x8())) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcastw_epi16&expand=583) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm_maskz_broadcastw_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastw_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, broadcast, i16x8::ZERO)) + } +} + +/// Broadcast the low packed 8-bit integer from a to all elements of dst. 
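A sketch of the masked word broadcast (illustrative values; call under the same runtime feature check as the earlier sketch): the low word of a is broadcast only into the elements whose mask bit is set, and the rest keep src.

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw,avx512vl")]
unsafe fn broadcastw_demo() {
    let src = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
    let a = _mm_set1_epi16(42); // only the low word of a matters
    // Writemask: broadcast into the even elements, keep src in the odd ones.
    let out: [i16; 8] = unsafe { core::mem::transmute(_mm_mask_broadcastw_epi16(src, 0b0101_0101, a)) };
    assert_eq!(out, [42, 11, 42, 13, 42, 15, 42, 17]);
}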
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastb_epi8&expand=536) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +pub fn _mm512_broadcastb_epi8(a: __m128i) -> __m512i { + unsafe { + let a = _mm512_castsi128_si512(a).as_i8x64(); + let ret: i8x64 = simd_shuffle!( + a, + a, + [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + ], + ); + transmute(ret) + } +} + +/// Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastb_epi8&expand=537) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +pub fn _mm512_mask_broadcastb_epi8(src: __m512i, k: __mmask64, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastb_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, broadcast, src.as_i8x64())) + } +} + +/// Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastb_epi8&expand=538) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +pub fn _mm512_maskz_broadcastb_epi8(k: __mmask64, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastb_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, broadcast, i8x64::ZERO)) + } +} + +/// Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastb_epi8&expand=534) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +pub fn _mm256_mask_broadcastb_epi8(src: __m256i, k: __mmask32, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastb_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, broadcast, src.as_i8x32())) + } +} + +/// Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastb_epi8&expand=535) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +pub fn _mm256_maskz_broadcastb_epi8(k: __mmask32, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastb_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, broadcast, i8x32::ZERO)) + } +} + +/// Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcastb_epi8&expand=531) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +pub fn _mm_mask_broadcastb_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastb_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_i8x16())) + } +} + +/// Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcastb_epi8&expand=532) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +pub fn _mm_maskz_broadcastb_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastb_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, broadcast, i8x16::ZERO)) + } +} + +/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_epi16&expand=6012) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhwd))] +pub fn _mm512_unpackhi_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i16x32(); + let b = b.as_i16x32(); + #[rustfmt::skip] + let r: i16x32 = simd_shuffle!( + a, + b, + [ + 4, 32 + 4, 5, 32 + 5, + 6, 32 + 6, 7, 32 + 7, + 12, 32 + 12, 13, 32 + 13, + 14, 32 + 14, 15, 32 + 15, + 20, 32 + 20, 21, 32 + 21, + 22, 32 + 22, 23, 32 + 23, + 28, 32 + 28, 29, 32 + 29, + 30, 32 + 30, 31, 32 + 31, + ], + ); + transmute(r) + } +} + +/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_epi16&expand=6010) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhwd))] +pub fn _mm512_mask_unpackhi_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i16x32())) + } +} + +/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_epi16&expand=6011) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhwd))] +pub fn _mm512_maskz_unpackhi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, unpackhi, i16x32::ZERO)) + } +} + +/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_epi16&expand=6007) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhwd))] +pub fn _mm256_mask_unpackhi_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i16x16())) + } +} + +/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_epi16&expand=6008) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhwd))] +pub fn _mm256_maskz_unpackhi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, unpackhi, i16x16::ZERO)) + } +} + +/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_epi16&expand=6004) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhwd))] +pub fn _mm_mask_unpackhi_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i16x8())) + } +} + +/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_epi16&expand=6005) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhwd))] +pub fn _mm_maskz_unpackhi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, unpackhi, i16x8::ZERO)) + } +} + +/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_epi8&expand=6039) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhbw))] +pub fn _mm512_unpackhi_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i8x64(); + let b = b.as_i8x64(); + #[rustfmt::skip] + let r: i8x64 = simd_shuffle!( + a, + b, + [ + 8, 64 + 8, 9, 64 + 9, + 10, 64 + 10, 11, 64 + 11, + 12, 64 + 12, 13, 64 + 13, + 14, 64 + 14, 15, 64 + 15, + 24, 64 + 24, 25, 64 + 25, + 26, 64 + 26, 27, 64 + 27, + 28, 64 + 28, 29, 64 + 29, + 30, 64 + 30, 31, 64 + 31, + 40, 64 + 40, 41, 64 + 41, + 42, 64 + 42, 43, 64 + 43, + 44, 64 + 44, 45, 64 + 45, + 46, 64 + 46, 47, 64 + 47, + 56, 64 + 56, 57, 64 + 57, + 58, 64 + 58, 59, 64 + 59, + 60, 64 + 60, 61, 64 + 61, + 62, 64 + 62, 63, 64 + 63, + ], + ); + transmute(r) + } +} + +/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_epi8&expand=6037) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhbw))] +pub fn _mm512_mask_unpackhi_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i8x64())) + } +} + +/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
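A sketch contrasting the plain and zero-masked high unpack on the 128-bit form (illustrative values; call under the same runtime feature check as the earlier sketch).

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw,avx512vl")]
unsafe fn unpackhi_demo() {
    let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    let b = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
    // Plain unpackhi interleaves the upper four words of each source: a4,b4,a5,b5,a6,b6,a7,b7.
    let out: [i16; 8] = unsafe { core::mem::transmute(_mm_unpackhi_epi16(a, b)) };
    assert_eq!(out, [4, 14, 5, 15, 6, 16, 7, 17]);
    // The zeromask variant additionally clears every element whose mask bit is not set.
    let out: [i16; 8] = unsafe { core::mem::transmute(_mm_maskz_unpackhi_epi16(0b0000_0011, a, b)) };
    assert_eq!(out, [4, 14, 0, 0, 0, 0, 0, 0]);
}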
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_epi8&expand=6038) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhbw))] +pub fn _mm512_maskz_unpackhi_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, unpackhi, i8x64::ZERO)) + } +} + +/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_epi8&expand=6034) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhbw))] +pub fn _mm256_mask_unpackhi_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i8x32())) + } +} + +/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_epi8&expand=6035) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhbw))] +pub fn _mm256_maskz_unpackhi_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, unpackhi, i8x32::ZERO)) + } +} + +/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_epi8&expand=6031) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhbw))] +pub fn _mm_mask_unpackhi_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i8x16())) + } +} + +/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_epi8&expand=6032) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhbw))] +pub fn _mm_maskz_unpackhi_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, unpackhi, i8x16::ZERO)) + } +} + +/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_epi16&expand=6069) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklwd))] +pub fn _mm512_unpacklo_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i16x32(); + let b = b.as_i16x32(); + #[rustfmt::skip] + let r: i16x32 = simd_shuffle!( + a, + b, + [ + 0, 32+0, 1, 32+1, + 2, 32+2, 3, 32+3, + 8, 32+8, 9, 32+9, + 10, 32+10, 11, 32+11, + 16, 32+16, 17, 32+17, + 18, 32+18, 19, 32+19, + 24, 32+24, 25, 32+25, + 26, 32+26, 27, 32+27 + ], + ); + transmute(r) + } +} + +/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_epi16&expand=6067) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklwd))] +pub fn _mm512_mask_unpacklo_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i16x32())) + } +} + +/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_epi16&expand=6068) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklwd))] +pub fn _mm512_maskz_unpacklo_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, unpacklo, i16x32::ZERO)) + } +} + +/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_epi16&expand=6064) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklwd))] +pub fn _mm256_mask_unpacklo_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i16x16())) + } +} + +/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_epi16&expand=6065) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklwd))] +pub fn _mm256_maskz_unpacklo_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, unpacklo, i16x16::ZERO)) + } +} + +/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_epi16&expand=6061) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklwd))] +pub fn _mm_mask_unpacklo_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i16x8())) + } +} + +/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_epi16&expand=6062) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklwd))] +pub fn _mm_maskz_unpacklo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, unpacklo, i16x8::ZERO)) + } +} + +/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_epi8&expand=6096) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm512_unpacklo_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i8x64(); + let b = b.as_i8x64(); + #[rustfmt::skip] + let r: i8x64 = simd_shuffle!( + a, + b, + [ + 0, 64+0, 1, 64+1, + 2, 64+2, 3, 64+3, + 4, 64+4, 5, 64+5, + 6, 64+6, 7, 64+7, + 16, 64+16, 17, 64+17, + 18, 64+18, 19, 64+19, + 20, 64+20, 21, 64+21, + 22, 64+22, 23, 64+23, + 32, 64+32, 33, 64+33, + 34, 64+34, 35, 64+35, + 36, 64+36, 37, 64+37, + 38, 64+38, 39, 64+39, + 48, 64+48, 49, 64+49, + 50, 64+50, 51, 64+51, + 52, 64+52, 53, 64+53, + 54, 64+54, 55, 64+55, + ], + ); + transmute(r) + } +} + +/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_epi8&expand=6094) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm512_mask_unpacklo_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i8x64())) + } +} + +/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_epi8&expand=6095) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm512_maskz_unpacklo_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, unpacklo, i8x64::ZERO)) + } +} + +/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_epi8&expand=6091) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm256_mask_unpacklo_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i8x32())) + } +} + +/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_epi8&expand=6092) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm256_maskz_unpacklo_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, unpacklo, i8x32::ZERO)) + } +} + +/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_epi8&expand=6088) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm_mask_unpacklo_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i8x16())) + } +} + +/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_epi8&expand=6089) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm_maskz_unpacklo_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, unpacklo, i8x16::ZERO)) + } +} + +/// Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_epi16&expand=3795) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +pub fn _mm512_mask_mov_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i16x32(); + transmute(simd_select_bitmask(k, mov, src.as_i16x32())) + } +} + +/// Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_epi16&expand=3796) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +pub fn _mm512_maskz_mov_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i16x32(); + transmute(simd_select_bitmask(k, mov, i16x32::ZERO)) + } +} + +/// Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_epi16&expand=3793) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +pub fn _mm256_mask_mov_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i16x16(); + transmute(simd_select_bitmask(k, mov, src.as_i16x16())) + } +} + +/// Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_epi16&expand=3794) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +pub fn _mm256_maskz_mov_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i16x16(); + transmute(simd_select_bitmask(k, mov, i16x16::ZERO)) + } +} + +/// Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_epi16&expand=3791) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +pub fn _mm_mask_mov_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i16x8(); + transmute(simd_select_bitmask(k, mov, src.as_i16x8())) + } +} + +/// Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_epi16&expand=3792) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +pub fn _mm_maskz_mov_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i16x8(); + transmute(simd_select_bitmask(k, mov, i16x8::ZERO)) + } +} + +/// Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_epi8&expand=3813) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +pub fn _mm512_mask_mov_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i8x64(); + transmute(simd_select_bitmask(k, mov, src.as_i8x64())) + } +} + +/// Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_epi8&expand=3814) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +pub fn _mm512_maskz_mov_epi8(k: __mmask64, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i8x64(); + transmute(simd_select_bitmask(k, mov, i8x64::ZERO)) + } +} + +/// Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_epi8&expand=3811) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +pub fn _mm256_mask_mov_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i8x32(); + transmute(simd_select_bitmask(k, mov, src.as_i8x32())) + } +} + +/// Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_epi8&expand=3812) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +pub fn _mm256_maskz_mov_epi8(k: __mmask32, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i8x32(); + transmute(simd_select_bitmask(k, mov, i8x32::ZERO)) + } +} + +/// Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_epi8&expand=3809) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +pub fn _mm_mask_mov_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i8x16(); + transmute(simd_select_bitmask(k, mov, src.as_i8x16())) + } +} + +/// Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_epi8&expand=3810) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +pub fn _mm_maskz_mov_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i8x16(); + transmute(simd_select_bitmask(k, mov, i8x16::ZERO)) + } +} + +/// Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
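A sketch of the merge-move and zero-move semantics shared by the mask/maskz pairs above (illustrative values; call under the same runtime feature check as the earlier sketch).

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw,avx512vl")]
unsafe fn mov_demo() {
    let src = _mm_set1_epi16(-1);
    let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
    // Merge-move: keep src where the mask bit is clear.
    let merged: [i16; 8] = unsafe { core::mem::transmute(_mm_mask_mov_epi16(src, 0b1111_0000, a)) };
    assert_eq!(merged, [-1, -1, -1, -1, 5, 6, 7, 8]);
    // Zero-move: zero out where the mask bit is clear.
    let zeroed: [i16; 8] = unsafe { core::mem::transmute(_mm_maskz_mov_epi16(0b1111_0000, a)) };
    assert_eq!(zeroed, [0, 0, 0, 0, 5, 6, 7, 8]);
}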
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_set1_epi16&expand=4942) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm512_mask_set1_epi16(src: __m512i, k: __mmask32, a: i16) -> __m512i { + unsafe { + let r = _mm512_set1_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, r, src.as_i16x32())) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_set1_epi16&expand=4943) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm512_maskz_set1_epi16(k: __mmask32, a: i16) -> __m512i { + unsafe { + let r = _mm512_set1_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, r, i16x32::ZERO)) + } +} + +/// Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_set1_epi16&expand=4939) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm256_mask_set1_epi16(src: __m256i, k: __mmask16, a: i16) -> __m256i { + unsafe { + let r = _mm256_set1_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, r, src.as_i16x16())) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_set1_epi16&expand=4940) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm256_maskz_set1_epi16(k: __mmask16, a: i16) -> __m256i { + unsafe { + let r = _mm256_set1_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, r, i16x16::ZERO)) + } +} + +/// Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_set1_epi16&expand=4936) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm_mask_set1_epi16(src: __m128i, k: __mmask8, a: i16) -> __m128i { + unsafe { + let r = _mm_set1_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, r, src.as_i16x8())) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_set1_epi16&expand=4937) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm_maskz_set1_epi16(k: __mmask8, a: i16) -> __m128i { + unsafe { + let r = _mm_set1_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, r, i16x8::ZERO)) + } +} + +/// Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_set1_epi8&expand=4970) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] +pub fn _mm512_mask_set1_epi8(src: __m512i, k: __mmask64, a: i8) -> __m512i { + unsafe { + let r = _mm512_set1_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, r, src.as_i8x64())) + } +} + +/// Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_set1_epi8&expand=4971) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] +pub fn _mm512_maskz_set1_epi8(k: __mmask64, a: i8) -> __m512i { + unsafe { + let r = _mm512_set1_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, r, i8x64::ZERO)) + } +} + +/// Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_set1_epi8&expand=4967) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] +pub fn _mm256_mask_set1_epi8(src: __m256i, k: __mmask32, a: i8) -> __m256i { + unsafe { + let r = _mm256_set1_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, r, src.as_i8x32())) + } +} + +/// Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_set1_epi8&expand=4968) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] +pub fn _mm256_maskz_set1_epi8(k: __mmask32, a: i8) -> __m256i { + unsafe { + let r = _mm256_set1_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, r, i8x32::ZERO)) + } +} + +/// Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
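A sketch of the masked scalar broadcast (illustrative values; call under the same runtime feature check as the earlier sketch): the scalar lands only in the elements selected by the mask.

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw,avx512vl")]
unsafe fn set1_demo() {
    let src = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
    // Broadcast 99 only into element 0; the remaining elements keep src.
    let out: [i16; 8] = unsafe { core::mem::transmute(_mm_mask_set1_epi16(src, 0b0000_0001, 99)) };
    assert_eq!(out, [99, 2, 3, 4, 5, 6, 7, 8]);
}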
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_set1_epi8&expand=4964) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] +pub fn _mm_mask_set1_epi8(src: __m128i, k: __mmask16, a: i8) -> __m128i { + unsafe { + let r = _mm_set1_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, r, src.as_i8x16())) + } +} + +/// Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_set1_epi8&expand=4965) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] +pub fn _mm_maskz_set1_epi8(k: __mmask16, a: i8) -> __m128i { + unsafe { + let r = _mm_set1_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, r, i8x16::ZERO)) + } +} + +/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shufflelo_epi16&expand=5221) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_shufflelo_epi16<const IMM8: i32>(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i16x32(); + let r: i16x32 = simd_shuffle!( + a, + a, + [ + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + 4, + 5, + 6, + 7, + (IMM8 as u32 & 0b11) + 8, + ((IMM8 as u32 >> 2) & 0b11) + 8, + ((IMM8 as u32 >> 4) & 0b11) + 8, + ((IMM8 as u32 >> 6) & 0b11) + 8, + 12, + 13, + 14, + 15, + (IMM8 as u32 & 0b11) + 16, + ((IMM8 as u32 >> 2) & 0b11) + 16, + ((IMM8 as u32 >> 4) & 0b11) + 16, + ((IMM8 as u32 >> 6) & 0b11) + 16, + 20, + 21, + 22, + 23, + (IMM8 as u32 & 0b11) + 24, + ((IMM8 as u32 >> 2) & 0b11) + 24, + ((IMM8 as u32 >> 4) & 0b11) + 24, + ((IMM8 as u32 >> 6) & 0b11) + 24, + 28, + 29, + 30, + 31, + ], + ); + transmute(r) + } +} + +/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shufflelo_epi16&expand=5219) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_shufflelo_epi16<const IMM8: i32>( + src: __m512i, + k: __mmask32, + a: __m512i, +) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_shufflelo_epi16::<IMM8>(a); + transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32())) + } +} + +/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. 
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shufflelo_epi16&expand=5220)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm512_shufflelo_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, r.as_i16x32(), i16x32::ZERO))
+    }
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shufflelo_epi16&expand=5216)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_shufflelo_epi16<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask16,
+    a: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm256_shufflelo_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16()))
+    }
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shufflelo_epi16&expand=5217)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm256_shufflelo_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x16(), i16x16::ZERO))
+    }
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shufflelo_epi16&expand=5213)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_shufflelo_epi16<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm_shufflelo_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8()))
+    }
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shufflelo_epi16&expand=5214)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm_shufflelo_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x8(), i16x8::ZERO))
+    }
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shufflehi_epi16&expand=5212)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_shufflehi_epi16<const IMM8: i32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i16x32();
+        let r: i16x32 = simd_shuffle!(
+            a,
+            a,
+            [
+                0,
+                1,
+                2,
+                3,
+                (IMM8 as u32 & 0b11) + 4,
+                ((IMM8 as u32 >> 2) & 0b11) + 4,
+                ((IMM8 as u32 >> 4) & 0b11) + 4,
+                ((IMM8 as u32 >> 6) & 0b11) + 4,
+                8,
+                9,
+                10,
+                11,
+                (IMM8 as u32 & 0b11) + 12,
+                ((IMM8 as u32 >> 2) & 0b11) + 12,
+                ((IMM8 as u32 >> 4) & 0b11) + 12,
+                ((IMM8 as u32 >> 6) & 0b11) + 12,
+                16,
+                17,
+                18,
+                19,
+                (IMM8 as u32 & 0b11) + 20,
+                ((IMM8 as u32 >> 2) & 0b11) + 20,
+                ((IMM8 as u32 >> 4) & 0b11) + 20,
+                ((IMM8 as u32 >> 6) & 0b11) + 20,
+                24,
+                25,
+                26,
+                27,
+                (IMM8 as u32 & 0b11) + 28,
+                ((IMM8 as u32 >> 2) & 0b11) + 28,
+                ((IMM8 as u32 >> 4) & 0b11) + 28,
+                ((IMM8 as u32 >> 6) & 0b11) + 28,
+            ],
+        );
+        transmute(r)
+    }
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shufflehi_epi16&expand=5210)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_shufflehi_epi16<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm512_shufflehi_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32()))
+    }
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shufflehi_epi16&expand=5211)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_shufflehi_epi16<const IMM8: i32>(k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm512_shufflehi_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, r.as_i16x32(), i16x32::ZERO))
+    }
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shufflehi_epi16&expand=5207)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_shufflehi_epi16<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask16,
+    a: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm256_shufflehi_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16()))
+    }
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shufflehi_epi16&expand=5208)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_shufflehi_epi16<const IMM8: i32>(k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm256_shufflehi_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x16(), i16x16::ZERO))
+    }
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shufflehi_epi16&expand=5204)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_shufflehi_epi16<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm_shufflehi_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8()))
+    }
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shufflehi_epi16&expand=5205)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_shufflehi_epi16<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm_shufflehi_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x8(), i16x8::ZERO))
+    }
+}
+
+/// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_epi8&expand=5159)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+pub fn _mm512_shuffle_epi8(a: __m512i, b: __m512i) -> __m512i {
+    unsafe { transmute(vpshufb(a.as_i8x64(), b.as_i8x64())) }
+}
+
+/// Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_epi8&expand=5157)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+pub fn _mm512_mask_shuffle_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let shuffle = _mm512_shuffle_epi8(a, b).as_i8x64();
+        transmute(simd_select_bitmask(k, shuffle, src.as_i8x64()))
+    }
+}
+
+/// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_epi8&expand=5158) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufb))] +pub fn _mm512_maskz_shuffle_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let shuffle = _mm512_shuffle_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, shuffle, i8x64::ZERO)) + } +} + +/// Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_epi8&expand=5154) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufb))] +pub fn _mm256_mask_shuffle_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let shuffle = _mm256_shuffle_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, shuffle, src.as_i8x32())) + } +} + +/// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_epi8&expand=5155) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufb))] +pub fn _mm256_maskz_shuffle_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let shuffle = _mm256_shuffle_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, shuffle, i8x32::ZERO)) + } +} + +/// Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shuffle_epi8&expand=5151) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufb))] +pub fn _mm_mask_shuffle_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let shuffle = _mm_shuffle_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, shuffle, src.as_i8x16())) + } +} + +/// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shuffle_epi8&expand=5152) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufb))] +pub fn _mm_maskz_shuffle_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let shuffle = _mm_shuffle_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, shuffle, i8x16::ZERO)) + } +} + +/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_test_epi16_mask&expand=5884) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmw))] +pub fn _mm512_test_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpneq_epi16_mask(and, zero) +} + +/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_test_epi16_mask&expand=5883) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmw))] +pub fn _mm512_mask_test_epi16_mask(k: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpneq_epi16_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_test_epi16_mask&expand=5882) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmw))] +pub fn _mm256_test_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpneq_epi16_mask(and, zero) +} + +/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_test_epi16_mask&expand=5881) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmw))] +pub fn _mm256_mask_test_epi16_mask(k: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpneq_epi16_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_epi16_mask&expand=5880) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmw))] +pub fn _mm_test_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpneq_epi16_mask(and, zero) +} + +/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_test_epi16_mask&expand=5879) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmw))] +pub fn _mm_mask_test_epi16_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpneq_epi16_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_test_epi8_mask&expand=5902) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmb))] +pub fn _mm512_test_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpneq_epi8_mask(and, zero) +} + +/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_test_epi8_mask&expand=5901) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmb))] +pub fn _mm512_mask_test_epi8_mask(k: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpneq_epi8_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_test_epi8_mask&expand=5900) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmb))] +pub fn _mm256_test_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpneq_epi8_mask(and, zero) +} + +/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_test_epi8_mask&expand=5899) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmb))] +pub fn _mm256_mask_test_epi8_mask(k: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpneq_epi8_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_epi8_mask&expand=5898) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmb))] +pub fn _mm_test_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpneq_epi8_mask(and, zero) +} + +/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_test_epi8_mask&expand=5897) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmb))] +pub fn _mm_mask_test_epi8_mask(k: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpneq_epi8_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_testn_epi16_mask&expand=5915) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmw))] +pub fn _mm512_testn_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpeq_epi16_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_testn_epi16_mask&expand=5914) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmw))] +pub fn _mm512_mask_testn_epi16_mask(k: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpeq_epi16_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testn_epi16_mask&expand=5913) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmw))] +pub fn _mm256_testn_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpeq_epi16_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_testn_epi16_mask&expand=5912) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmw))] +pub fn _mm256_mask_testn_epi16_mask(k: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpeq_epi16_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testn_epi16_mask&expand=5911) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmw))] +pub fn _mm_testn_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpeq_epi16_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_testn_epi16_mask&expand=5910) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmw))] +pub fn _mm_mask_testn_epi16_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpeq_epi16_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_testn_epi8_mask&expand=5933) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmb))] +pub fn _mm512_testn_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpeq_epi8_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_testn_epi8_mask&expand=5932) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmb))] +pub fn _mm512_mask_testn_epi8_mask(k: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpeq_epi8_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testn_epi8_mask&expand=5931) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmb))] +pub fn _mm256_testn_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpeq_epi8_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_testn_epi8_mask&expand=5930) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmb))] +pub fn _mm256_mask_testn_epi8_mask(k: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpeq_epi8_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testn_epi8_mask&expand=5929) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmb))] +pub fn _mm_testn_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpeq_epi8_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_testn_epi8_mask&expand=5928) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmb))] +pub fn _mm_mask_testn_epi8_mask(k: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpeq_epi8_mask(k, and, zero) +} + +/// Store 64-bit mask from a into memory. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_store_mask64&expand=5578)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(mov))] //should be kmovq
+pub unsafe fn _store_mask64(mem_addr: *mut __mmask64, a: __mmask64) {
+    ptr::write(mem_addr as *mut __mmask64, a);
+}
+
+/// Store 32-bit mask from a into memory.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_store_mask32&expand=5577)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(mov))] //should be kmovd
+pub unsafe fn _store_mask32(mem_addr: *mut __mmask32, a: __mmask32) {
+    ptr::write(mem_addr as *mut __mmask32, a);
+}
+
+/// Load 64-bit mask from memory into k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_load_mask64&expand=3318)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(mov))] //should be kmovq
+pub unsafe fn _load_mask64(mem_addr: *const __mmask64) -> __mmask64 {
+    ptr::read(mem_addr as *const __mmask64)
+}
+
+/// Load 32-bit mask from memory into k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_load_mask32&expand=3317)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(mov))] //should be kmovd
+pub unsafe fn _load_mask32(mem_addr: *const __mmask32) -> __mmask32 {
+    ptr::read(mem_addr as *const __mmask32)
+}
+
+/// Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum each consecutive 8 differences to produce eight unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sad_epu8&expand=4855)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsadbw))]
+pub fn _mm512_sad_epu8(a: __m512i, b: __m512i) -> __m512i {
+    unsafe { transmute(vpsadbw(a.as_u8x64(), b.as_u8x64())) }
+}
+
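`_mm512_sad_epu8` collapses each aligned group of eight byte-wise absolute differences into a single unsigned 16-bit sum held in the low 16 bits of the corresponding 64-bit element. A small sanity-check sketch under the same assumptions as the earlier examples (`sad_demo` is an illustrative name, with AVX-512BW detected at run time by the caller):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn sad_demo() {
    // |5 - 2| = 3 for every byte; eight bytes per 64-bit element sum to 24.
    let a = _mm512_set1_epi8(5);
    let b = _mm512_set1_epi8(2);
    let r = _mm512_sad_epu8(a, b);
    let expected = _mm512_set1_epi64(24);
    assert_eq!(_mm512_cmpeq_epi64_mask(r, expected), 0xff);
}
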
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dbsad_epu8&expand=2114)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm512_dbsad_epu8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x64();
+        let b = b.as_u8x64();
+        let r = vdbpsadbw(a, b, IMM8);
+        transmute(r)
+    }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dbsad_epu8&expand=2115)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(4)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm512_mask_dbsad_epu8<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x64();
+        let b = b.as_u8x64();
+        let r = vdbpsadbw(a, b, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_u16x32()))
+    }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dbsad_epu8&expand=2116)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm512_maskz_dbsad_epu8<const IMM8: i32>(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x64();
+        let b = b.as_u8x64();
+        let r = vdbpsadbw(a, b, IMM8);
+        transmute(simd_select_bitmask(k, r, u16x32::ZERO))
+    }
+}
+
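As the documentation above spells out, the `IMM8` of the `dbsad` family only selects which quadruplets of `b` (within each 128-bit lane) the four SADs are measured against; `a` is always consumed at fixed offsets. Feeding the same vector for both operands therefore yields all-zero results for any `IMM8`, which makes a cheap check of the calling convention. An illustrative sketch under the same assumptions as the earlier examples:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn dbsad_demo() {
    let a = _mm512_set1_epi8(9);
    // Every byte difference is zero, so every 16-bit SAD is zero regardless of IMM8.
    let r = _mm512_dbsad_epu8::<0b01_00_11_10>(a, a);
    assert_eq!(_mm512_cmpeq_epi16_mask(r, _mm512_setzero_si512()), u32::MAX);
    // The maskz variant additionally zeroes the 16-bit results whose mask bit is
    // clear (a no-op on an all-zero result, but it shows how the mask threads through).
    let _rz = _mm512_maskz_dbsad_epu8::<0b01_00_11_10>(0x0000_ffff, a, a);
}
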
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dbsad_epu8&expand=2111)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm256_dbsad_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x32();
+        let b = b.as_u8x32();
+        let r = vdbpsadbw256(a, b, IMM8);
+        transmute(r)
+    }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dbsad_epu8&expand=2112)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(4)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm256_mask_dbsad_epu8<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask16,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x32();
+        let b = b.as_u8x32();
+        let r = vdbpsadbw256(a, b, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_u16x16()))
+    }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dbsad_epu8&expand=2113)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm256_maskz_dbsad_epu8<const IMM8: i32>(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x32();
+        let b = b.as_u8x32();
+        let r = vdbpsadbw256(a, b, IMM8);
+        transmute(simd_select_bitmask(k, r, u16x16::ZERO))
+    }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dbsad_epu8&expand=2108)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm_dbsad_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x16();
+        let b = b.as_u8x16();
+        let r = vdbpsadbw128(a, b, IMM8);
+        transmute(r)
+    }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dbsad_epu8&expand=2109)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(4)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm_mask_dbsad_epu8<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x16();
+        let b = b.as_u8x16();
+        let r = vdbpsadbw128(a, b, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_u16x8()))
+    }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dbsad_epu8&expand=2110)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm_maskz_dbsad_epu8<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x16();
+        let b = b.as_u8x16();
+        let r = vdbpsadbw128(a, b, IMM8);
+        transmute(simd_select_bitmask(k, r, u16x8::ZERO))
+    }
+}
+
+/// Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movepi16_mask&expand=3873) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovw2m))] +pub fn _mm512_movepi16_mask(a: __m512i) -> __mmask32 { + let filter = _mm512_set1_epi16(1 << 15); + let a = _mm512_and_si512(a, filter); + _mm512_cmpeq_epi16_mask(a, filter) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movepi16_mask&expand=3872) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovw2m))] +pub fn _mm256_movepi16_mask(a: __m256i) -> __mmask16 { + let filter = _mm256_set1_epi16(1 << 15); + let a = _mm256_and_si256(a, filter); + _mm256_cmpeq_epi16_mask(a, filter) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi16_mask&expand=3871) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovw2m))] +pub fn _mm_movepi16_mask(a: __m128i) -> __mmask8 { + let filter = _mm_set1_epi16(1 << 15); + let a = _mm_and_si128(a, filter); + _mm_cmpeq_epi16_mask(a, filter) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movepi8_mask&expand=3883) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovb2m))] +pub fn _mm512_movepi8_mask(a: __m512i) -> __mmask64 { + let filter = _mm512_set1_epi8(1 << 7); + let a = _mm512_and_si512(a, filter); + _mm512_cmpeq_epi8_mask(a, filter) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movepi8_mask&expand=3882) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovmskb))] // should be vpmovb2m but compiled to vpmovmskb in the test shim because that takes less cycles than +// using vpmovb2m plus converting the mask register to a standard register. +pub fn _mm256_movepi8_mask(a: __m256i) -> __mmask32 { + let filter = _mm256_set1_epi8(1 << 7); + let a = _mm256_and_si256(a, filter); + _mm256_cmpeq_epi8_mask(a, filter) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi8_mask&expand=3881) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovmskb))] // should be vpmovb2m but compiled to vpmovmskb in the test shim because that takes less cycles than +// using vpmovb2m plus converting the mask register to a standard register. +pub fn _mm_movepi8_mask(a: __m128i) -> __mmask16 { + let filter = _mm_set1_epi8(1 << 7); + let a = _mm_and_si128(a, filter); + _mm_cmpeq_epi8_mask(a, filter) +} + +/// Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movm_epi16&expand=3886) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovm2w))] +pub fn _mm512_movm_epi16(k: __mmask32) -> __m512i { + unsafe { + let one = _mm512_set1_epi16( + 1 << 15 + | 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + ) + .as_i16x32(); + transmute(simd_select_bitmask(k, one, i16x32::ZERO)) + } +} + +/// Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movm_epi16&expand=3885) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovm2w))] +pub fn _mm256_movm_epi16(k: __mmask16) -> __m256i { + unsafe { + let one = _mm256_set1_epi16( + 1 << 15 + | 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + ) + .as_i16x16(); + transmute(simd_select_bitmask(k, one, i16x16::ZERO)) + } +} + +/// Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movm_epi16&expand=3884) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovm2w))] +pub fn _mm_movm_epi16(k: __mmask8) -> __m128i { + unsafe { + let one = _mm_set1_epi16( + 1 << 15 + | 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + ) + .as_i16x8(); + transmute(simd_select_bitmask(k, one, i16x8::ZERO)) + } +} + +/// Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movm_epi8&expand=3895)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovm2b))]
+pub fn _mm512_movm_epi8(k: __mmask64) -> __m512i {
+    unsafe {
+        let one =
+            _mm512_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0)
+                .as_i8x64();
+        transmute(simd_select_bitmask(k, one, i8x64::ZERO))
+    }
+}
+
+/// Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movm_epi8&expand=3894)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovm2b))]
+pub fn _mm256_movm_epi8(k: __mmask32) -> __m256i {
+    unsafe {
+        let one =
+            _mm256_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0)
+                .as_i8x32();
+        transmute(simd_select_bitmask(k, one, i8x32::ZERO))
+    }
+}
+
+/// Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movm_epi8&expand=3893)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovm2b))]
+pub fn _mm_movm_epi8(k: __mmask16) -> __m128i {
+    unsafe {
+        let one =
+            _mm_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0)
+                .as_i8x16();
+        transmute(simd_select_bitmask(k, one, i8x16::ZERO))
+    }
+}
+
+/// Convert 32-bit mask a into an integer value, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#_cvtmask32_u32)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _cvtmask32_u32(a: __mmask32) -> u32 {
+    a
+}
+
+/// Convert integer value a into a 32-bit mask, and store the result in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu32_mask32)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _cvtu32_mask32(a: u32) -> __mmask32 {
+    a
+}
+
+/// Add 32-bit masks in a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kadd_mask32&expand=3207)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
+    a + b
+}
+
+/// Add 64-bit masks in a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kadd_mask64&expand=3208)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
+    a + b
+}
+
+/// Compute the bitwise AND of 32-bit masks a and b, and store the result in k.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kand_mask32&expand=3213) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { + a & b +} + +/// Compute the bitwise AND of 64-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kand_mask64&expand=3214) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { + a & b +} + +/// Compute the bitwise NOT of 32-bit mask a, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask32&expand=3234) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _knot_mask32(a: __mmask32) -> __mmask32 { + !a +} + +/// Compute the bitwise NOT of 64-bit mask a, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask64&expand=3235) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _knot_mask64(a: __mmask64) -> __mmask64 { + !a +} + +/// Compute the bitwise NOT of 32-bit masks a and then AND with b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask32&expand=3219) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { + _knot_mask32(a) & b +} + +/// Compute the bitwise NOT of 64-bit masks a and then AND with b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask64&expand=3220) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { + _knot_mask64(a) & b +} + +/// Compute the bitwise OR of 32-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask32&expand=3240) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { + a | b +} + +/// Compute the bitwise OR of 64-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask64&expand=3241) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { + a | b +} + +/// Compute the bitwise XOR of 32-bit masks a and b, and store the result in k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask32&expand=3292) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { + a ^ b +} + +/// Compute the bitwise XOR of 64-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask64&expand=3293) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { + a ^ b +} + +/// Compute the bitwise XNOR of 32-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask32&expand=3286) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { + _knot_mask32(a ^ b) +} + +/// Compute the bitwise XNOR of 64-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask64&expand=3287) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kxnor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { + _knot_mask64(a ^ b) +} + +/// Compute the bitwise OR of 32-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask32_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _kortest_mask32_u8(a: __mmask32, b: __mmask32, all_ones: *mut u8) -> u8 { + let tmp = _kor_mask32(a, b); + *all_ones = (tmp == 0xffffffff) as u8; + (tmp == 0) as u8 +} + +/// Compute the bitwise OR of 64-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask64_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _kortest_mask64_u8(a: __mmask64, b: __mmask64, all_ones: *mut u8) -> u8 { + let tmp = _kor_mask64(a, b); + *all_ones = (tmp == 0xffffffff_ffffffff) as u8; + (tmp == 0) as u8 +} + +/// Compute the bitwise OR of 32-bit masks a and b. If the result is all ones, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask32_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kortestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { + (_kor_mask32(a, b) == 0xffffffff) as u8 +} + +/// Compute the bitwise OR of 64-bit masks a and b. If the result is all ones, store 1 in dst, otherwise +/// store 0 in dst. 
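+// _kortest_mask32_u8 folds the OR of two masks down to two flag bytes: the return value
+// reports "all zeros" and the out-pointer reports "all ones". Illustrative sketch,
+// assuming AVX-512BW:
+//
+//     #[simd_test(enable = "avx512bw")]
+//     unsafe fn sketch_kortest_mask32() {
+//         let mut all_ones = 0u8;
+//         let zf = _kortest_mask32_u8(0x0000_ffff, 0xffff_0000, &mut all_ones);
+//         assert_eq!(zf, 0); // the OR is not all zeros
+//         assert_eq!(all_ones, 1); // the OR is all ones
+//     }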
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask64_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kortestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 {
+    (_kor_mask64(a, b) == 0xffffffff_ffffffff) as u8
+}
+
+/// Compute the bitwise OR of 32-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask32_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kortestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 {
+    (_kor_mask32(a, b) == 0) as u8
+}
+
+/// Compute the bitwise OR of 64-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask64_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kortestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 {
+    (_kor_mask64(a, b) == 0) as u8
+}
+
+/// Shift the bits of 32-bit mask a left by count while shifting in zeros, and store the least significant 32 bits of the result in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask32)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftli_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
+    a << COUNT
+}
+
+/// Shift the bits of 64-bit mask a left by count while shifting in zeros, and store the least significant 64 bits of the result in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask64)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftli_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
+    a << COUNT
+}
+
+/// Shift the bits of 32-bit mask a right by count while shifting in zeros, and store the least significant 32 bits of the result in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask32)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftri_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
+    a >> COUNT
+}
+
+/// Shift the bits of 64-bit mask a right by count while shifting in zeros, and store the least significant 64 bits of the result in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask64)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftri_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
+    a >> COUNT
+}
+
+/// Compute the bitwise AND of 32-bit masks a and b, and if the result is all zeros, store 1 in dst,
+/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all
+/// zeros, store 1 in and_not, otherwise store 0 in and_not.
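+// The shift count of _kshiftli_mask32/_kshiftri_mask32 (and the 64-bit variants) is a
+// const generic, exposed to C-style callers via `rustc_legacy_const_generics`, so from
+// Rust it is supplied with turbofish syntax. Illustrative sketch:
+//
+//     #[simd_test(enable = "avx512bw")]
+//     unsafe fn sketch_kshift_mask32() {
+//         assert_eq!(_kshiftli_mask32::<4>(0b1011), 0b1011_0000);
+//         assert_eq!(_kshiftri_mask32::<4>(0b1011_0000), 0b1011);
+//     }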
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask32_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _ktest_mask32_u8(a: __mmask32, b: __mmask32, and_not: *mut u8) -> u8 { + *and_not = (_kandn_mask32(a, b) == 0) as u8; + (_kand_mask32(a, b) == 0) as u8 +} + +/// Compute the bitwise AND of 64-bit masks a and b, and if the result is all zeros, store 1 in dst, +/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all +/// zeros, store 1 in and_not, otherwise store 0 in and_not. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask64_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _ktest_mask64_u8(a: __mmask64, b: __mmask64, and_not: *mut u8) -> u8 { + *and_not = (_kandn_mask64(a, b) == 0) as u8; + (_kand_mask64(a, b) == 0) as u8 +} + +/// Compute the bitwise NOT of 32-bit mask a and then AND with 16-bit mask b, if the result is all +/// zeros, store 1 in dst, otherwise store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask32_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _ktestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { + (_kandn_mask32(a, b) == 0) as u8 +} + +/// Compute the bitwise NOT of 64-bit mask a and then AND with 8-bit mask b, if the result is all +/// zeros, store 1 in dst, otherwise store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask64_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _ktestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { + (_kandn_mask64(a, b) == 0) as u8 +} + +/// Compute the bitwise AND of 32-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask32_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _ktestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { + (_kand_mask32(a, b) == 0) as u8 +} + +/// Compute the bitwise AND of 64-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask64_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _ktestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { + (_kand_mask64(a, b) == 0) as u8 +} + +/// Unpack and interleave 16 bits from masks a and b, and store the 32-bit result in k. 
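+// _ktestz_* answers "is a AND b all zeros?" while _ktestc_* answers "is (NOT a) AND b all
+// zeros?", i.e. "is every bit of b also set in a?". Illustrative sketch, assuming
+// AVX-512BW:
+//
+//     #[simd_test(enable = "avx512bw")]
+//     unsafe fn sketch_ktest_mask32() {
+//         assert_eq!(_ktestz_mask32_u8(0b0101, 0b1010), 1); // the masks are disjoint
+//         assert_eq!(_ktestc_mask32_u8(0b1111, 0b0101), 1); // 0b0101 is contained in 0b1111
+//     }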
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kunpackw) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckwd +pub fn _mm512_kunpackw(a: __mmask32, b: __mmask32) -> __mmask32 { + ((a & 0xffff) << 16) | (b & 0xffff) +} + +/// Unpack and interleave 32 bits from masks a and b, and store the 64-bit result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kunpackd) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckdq +pub fn _mm512_kunpackd(a: __mmask64, b: __mmask64) -> __mmask64 { + ((a & 0xffffffff) << 32) | (b & 0xffffffff) +} + +/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi16_epi8&expand=1407) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovwb))] +pub fn _mm512_cvtepi16_epi8(a: __m512i) -> __m256i { + unsafe { + let a = a.as_i16x32(); + transmute::(simd_cast(a)) + } +} + +/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi16_epi8&expand=1408) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovwb))] +pub fn _mm512_mask_cvtepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { + unsafe { + let convert = _mm512_cvtepi16_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, convert, src.as_i8x32())) + } +} + +/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi16_epi8&expand=1409) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovwb))] +pub fn _mm512_maskz_cvtepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { + unsafe { + let convert = _mm512_cvtepi16_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, convert, i8x32::ZERO)) + } +} + +/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. 
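+// _mm512_kunpackw builds a 32-bit mask from the low 16 bits of each operand, with `a`
+// providing the upper half and `b` the lower half (as the expression above shows).
+// Illustrative sketch, assuming AVX-512BW:
+//
+//     #[simd_test(enable = "avx512bw")]
+//     unsafe fn sketch_kunpackw() {
+//         let k = _mm512_kunpackw(0xffff_aaaa, 0xffff_5555);
+//         assert_eq!(k, 0xaaaa_5555); // the upper halves of both inputs are ignored
+//     }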
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi8&expand=1404) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovwb))] +pub fn _mm256_cvtepi16_epi8(a: __m256i) -> __m128i { + unsafe { + let a = a.as_i16x16(); + transmute::(simd_cast(a)) + } +} + +/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi16_epi8&expand=1405) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovwb))] +pub fn _mm256_mask_cvtepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { + unsafe { + let convert = _mm256_cvtepi16_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, convert, src.as_i8x16())) + } +} + +/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi16_epi8&expand=1406) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovwb))] +pub fn _mm256_maskz_cvtepi16_epi8(k: __mmask16, a: __m256i) -> __m128i { + unsafe { + let convert = _mm256_cvtepi16_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, convert, i8x16::ZERO)) + } +} + +/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi8&expand=1401) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovwb))] +pub fn _mm_cvtepi16_epi8(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i16x8(); + let v256: i16x16 = simd_shuffle!( + a, + i16x8::ZERO, + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8] + ); + transmute::(simd_cast(v256)) + } +} + +/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi16_epi8&expand=1402) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovwb))] +pub fn _mm_mask_cvtepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi16_epi8(a).as_i8x16(); + let k: __mmask16 = 0b11111111_11111111 & k as __mmask16; + transmute(simd_select_bitmask(k, convert, src.as_i8x16())) + } +} + +/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi16_epi8&expand=1403) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovwb))] +pub fn _mm_maskz_cvtepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi16_epi8(a).as_i8x16(); + let k: __mmask16 = 0b11111111_11111111 & k as __mmask16; + transmute(simd_select_bitmask(k, convert, i8x16::ZERO)) + } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi16_epi8&expand=1807) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm512_cvtsepi16_epi8(a: __m512i) -> __m256i { + unsafe { + transmute(vpmovswb( + a.as_i16x32(), + i8x32::ZERO, + 0b11111111_11111111_11111111_11111111, + )) + } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi16_epi8&expand=1808) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm512_mask_cvtsepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { + unsafe { transmute(vpmovswb(a.as_i16x32(), src.as_i8x32(), k)) } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi16_epi8&expand=1809) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm512_maskz_cvtsepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { + unsafe { transmute(vpmovswb(a.as_i16x32(), i8x32::ZERO, k)) } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi16_epi8&expand=1804) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm256_cvtsepi16_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovswb256(a.as_i16x16(), i8x16::ZERO, 0b11111111_11111111)) } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi16_epi8&expand=1805) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm256_mask_cvtsepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { + unsafe { transmute(vpmovswb256(a.as_i16x16(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi16_epi8&expand=1806) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm256_maskz_cvtsepi16_epi8(k: __mmask16, a: __m256i) -> __m128i { + unsafe { transmute(vpmovswb256(a.as_i16x16(), i8x16::ZERO, k)) } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi16_epi8&expand=1801) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm_cvtsepi16_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovswb128(a.as_i16x8(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi16_epi8&expand=1802) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm_mask_cvtsepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovswb128(a.as_i16x8(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi16_epi8&expand=1803) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm_maskz_cvtsepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovswb128(a.as_i16x8(), i8x16::ZERO, k)) } +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. 
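+// Plain narrowing (`vpmovwb`, _mm_cvtepi16_epi8) keeps only the low byte of each word,
+// while the saturating form (`vpmovswb`, _mm_cvtsepi16_epi8) clamps to the i8 range.
+// Illustrative sketch, assuming AVX-512BW/VL:
+//
+//     #[simd_test(enable = "avx512bw,avx512vl")]
+//     unsafe fn sketch_cvtsepi16_epi8() {
+//         let a = _mm_set1_epi16(300); // 300 = 0x012C
+//         // truncation keeps the low byte: 0x2C = 44 (the upper 8 result bytes are zero)
+//         let t = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 44, 44, 44, 44, 44, 44, 44, 44);
+//         assert_eq_m128i(_mm_cvtepi16_epi8(a), t);
+//         // signed saturation clamps 300 to i8::MAX = 127
+//         let s = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 127, 127, 127, 127, 127, 127, 127, 127);
+//         assert_eq_m128i(_mm_cvtsepi16_epi8(a), s);
+//     }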
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi16_epi8&expand=2042) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub fn _mm512_cvtusepi16_epi8(a: __m512i) -> __m256i { + unsafe { + transmute(vpmovuswb( + a.as_u16x32(), + u8x32::ZERO, + 0b11111111_11111111_11111111_11111111, + )) + } +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi16_epi8&expand=2043) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub fn _mm512_mask_cvtusepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { + unsafe { transmute(vpmovuswb(a.as_u16x32(), src.as_u8x32(), k)) } +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi16_epi8&expand=2044) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub fn _mm512_maskz_cvtusepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { + unsafe { transmute(vpmovuswb(a.as_u16x32(), u8x32::ZERO, k)) } +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi16_epi8&expand=2039) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub fn _mm256_cvtusepi16_epi8(a: __m256i) -> __m128i { + unsafe { + transmute(vpmovuswb256( + a.as_u16x16(), + u8x16::ZERO, + 0b11111111_11111111, + )) + } +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi16_epi8&expand=2040) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub fn _mm256_mask_cvtusepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { + unsafe { transmute(vpmovuswb256(a.as_u16x16(), src.as_u8x16(), k)) } +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
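+// The unsigned-saturating narrowing clamps each 16-bit value to 0..=255 (contrast with
+// the signed form above, which clamps to -128..=127). Illustrative sketch, assuming
+// AVX-512BW/VL:
+//
+//     #[simd_test(enable = "avx512bw,avx512vl")]
+//     unsafe fn sketch_cvtusepi16_epi8() {
+//         let r = _mm256_cvtusepi16_epi8(_mm256_set1_epi16(300));
+//         assert_eq_m128i(r, _mm_set1_epi8(-1)); // 300 saturates to 255 (0xFF) in every byte
+//     }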
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi16_epi8&expand=2041)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub fn _mm256_maskz_cvtusepi16_epi8(k: __mmask16, a: __m256i) -> __m128i {
+    unsafe { transmute(vpmovuswb256(a.as_u16x16(), u8x16::ZERO, k)) }
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi16_epi8&expand=2036)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub fn _mm_cvtusepi16_epi8(a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovuswb128(a.as_u16x8(), u8x16::ZERO, 0b11111111)) }
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi16_epi8&expand=2037)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub fn _mm_mask_cvtusepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovuswb128(a.as_u16x8(), src.as_u8x16(), k)) }
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi16_epi8&expand=2038)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub fn _mm_maskz_cvtusepi16_epi8(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovuswb128(a.as_u16x8(), u8x16::ZERO, k)) }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi8_epi16&expand=1526)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbw))]
+pub fn _mm512_cvtepi8_epi16(a: __m256i) -> __m512i {
+    unsafe {
+        let a = a.as_i8x32();
+        transmute::<i16x32, _>(simd_cast(a))
+    }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi8_epi16&expand=1527) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxbw))] +pub fn _mm512_mask_cvtepi8_epi16(src: __m512i, k: __mmask32, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi8_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, convert, src.as_i16x32())) + } +} + +/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi8_epi16&expand=1528) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxbw))] +pub fn _mm512_maskz_cvtepi8_epi16(k: __mmask32, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi8_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, convert, i16x32::ZERO)) + } +} + +/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi8_epi16&expand=1524) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxbw))] +pub fn _mm256_mask_cvtepi8_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi8_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, src.as_i16x16())) + } +} + +/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi8_epi16&expand=1525) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxbw))] +pub fn _mm256_maskz_cvtepi8_epi16(k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi8_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, i16x16::ZERO)) + } +} + +/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi8_epi16&expand=1521) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxbw))] +pub fn _mm_mask_cvtepi8_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi8_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, src.as_i16x8())) + } +} + +/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi8_epi16&expand=1522) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxbw))] +pub fn _mm_maskz_cvtepi8_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi8_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) + } +} + +/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu8_epi16&expand=1612) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbw))] +pub fn _mm512_cvtepu8_epi16(a: __m256i) -> __m512i { + unsafe { + let a = a.as_u8x32(); + transmute::(simd_cast(a)) + } +} + +/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu8_epi16&expand=1613) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbw))] +pub fn _mm512_mask_cvtepu8_epi16(src: __m512i, k: __mmask32, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu8_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, convert, src.as_i16x32())) + } +} + +/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu8_epi16&expand=1614) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbw))] +pub fn _mm512_maskz_cvtepu8_epi16(k: __mmask32, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu8_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, convert, i16x32::ZERO)) + } +} + +/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
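+// Sign extension (_mm512_cvtepi8_epi16) and zero extension (_mm512_cvtepu8_epi16) differ
+// only for bytes with the top bit set. Illustrative sketch, assuming AVX-512BW:
+//
+//     #[simd_test(enable = "avx512bw")]
+//     unsafe fn sketch_cvt_epi8_epi16() {
+//         let a = _mm256_set1_epi8(-1); // every byte is 0xFF
+//         assert_eq_m512i(_mm512_cvtepi8_epi16(a), _mm512_set1_epi16(-1)); // sign extend
+//         assert_eq_m512i(_mm512_cvtepu8_epi16(a), _mm512_set1_epi16(255)); // zero extend
+//     }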
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu8_epi16&expand=1610) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbw))] +pub fn _mm256_mask_cvtepu8_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, src.as_i16x16())) + } +} + +/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu8_epi16&expand=1611) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbw))] +pub fn _mm256_maskz_cvtepu8_epi16(k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, i16x16::ZERO)) + } +} + +/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu8_epi16&expand=1607) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbw))] +pub fn _mm_mask_cvtepu8_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu8_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, src.as_i16x8())) + } +} + +/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu8_epi16&expand=1608) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbw))] +pub fn _mm_maskz_cvtepu8_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu8_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) + } +} + +/// Shift 128-bit lanes in a left by imm8 bytes while shifting in zeros, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_bslli_epi128&expand=591) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_bslli_epi128(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + const fn mask(shift: i32, i: u32) -> u32 { + let shift = shift as u32 & 0xff; + if shift > 15 || i % 16 < shift { + 0 + } else { + 64 + (i - shift) + } + } + let a = a.as_i8x64(); + let zero = i8x64::ZERO; + let r: i8x64 = simd_shuffle!( + zero, + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + mask(IMM8, 16), + mask(IMM8, 17), + mask(IMM8, 18), + mask(IMM8, 19), + mask(IMM8, 20), + mask(IMM8, 21), + mask(IMM8, 22), + mask(IMM8, 23), + mask(IMM8, 24), + mask(IMM8, 25), + mask(IMM8, 26), + mask(IMM8, 27), + mask(IMM8, 28), + mask(IMM8, 29), + mask(IMM8, 30), + mask(IMM8, 31), + mask(IMM8, 32), + mask(IMM8, 33), + mask(IMM8, 34), + mask(IMM8, 35), + mask(IMM8, 36), + mask(IMM8, 37), + mask(IMM8, 38), + mask(IMM8, 39), + mask(IMM8, 40), + mask(IMM8, 41), + mask(IMM8, 42), + mask(IMM8, 43), + mask(IMM8, 44), + mask(IMM8, 45), + mask(IMM8, 46), + mask(IMM8, 47), + mask(IMM8, 48), + mask(IMM8, 49), + mask(IMM8, 50), + mask(IMM8, 51), + mask(IMM8, 52), + mask(IMM8, 53), + mask(IMM8, 54), + mask(IMM8, 55), + mask(IMM8, 56), + mask(IMM8, 57), + mask(IMM8, 58), + mask(IMM8, 59), + mask(IMM8, 60), + mask(IMM8, 61), + mask(IMM8, 62), + mask(IMM8, 63), + ], + ); + transmute(r) + } +} + +/// Shift 128-bit lanes in a right by imm8 bytes while shifting in zeros, and store the results in dst. 
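+// _mm512_bslli_epi128 shifts bytes within each 128-bit lane independently; bytes never
+// cross a lane boundary. Illustrative sketch, assuming AVX-512BW (the expected value is
+// spelled with _mm512_set4_epi32, whose last argument is the lowest element of each lane):
+//
+//     #[simd_test(enable = "avx512bw")]
+//     unsafe fn sketch_bslli_epi128() {
+//         let a = _mm512_set1_epi8(1);
+//         let r = _mm512_bslli_epi128::<1>(a);
+//         // the lowest byte of every 16-byte lane becomes zero, the other 15 keep their value
+//         let e = _mm512_set4_epi32(0x01010101, 0x01010101, 0x01010101, 0x01010100);
+//         assert_eq_m512i(r, e);
+//     }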
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_bsrli_epi128&expand=594) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 3))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_bsrli_epi128(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i8x64(); + let zero = i8x64::ZERO; + let r: i8x64 = match IMM8 % 16 { + 0 => { + simd_shuffle!( + a, + zero, + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, + ], + ) + } + 1 => { + simd_shuffle!( + a, + zero, + [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 80, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 112, + ], + ) + } + 2 => { + simd_shuffle!( + a, + zero, + [ + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 80, 81, 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 112, 113, + ], + ) + } + 3 => { + simd_shuffle!( + a, + zero, + [ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 96, 97, 98, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 112, 113, 114, + ], + ) + } + 4 => { + simd_shuffle!( + a, + zero, + [ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 96, 97, 98, 99, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 62, 63, 112, 113, 114, 115, + ], + ) + } + 5 => { + simd_shuffle!( + a, + zero, + [ + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 96, 97, 98, 99, 100, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 62, 63, 112, 113, 114, 115, 116, + ], + ) + } + 6 => { + simd_shuffle!( + a, + zero, + [ + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 96, 97, 98, 99, 100, 101, 54, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 112, 113, 114, 115, 116, 117, + ], + ) + } + 7 => { + simd_shuffle!( + a, + zero, + [ + 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 96, 97, 98, 99, 100, 101, 102, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 112, 113, 114, 115, 116, 117, 118, + ], + ) + } + 8 => { + simd_shuffle!( + a, + zero, + [ + 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 24, 25, 26, + 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 40, 41, 42, 43, 44, 45, + 46, 47, 96, 97, 98, 99, 100, 101, 102, 103, 56, 57, 58, 59, 60, 61, 62, 63, + 112, 113, 114, 115, 116, 117, 118, 119, + ], + ) + } + 9 => { + simd_shuffle!( + a, + zero, + [ + 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 25, 26, 27, + 28, 29, 30, 31, 80, 81, 82, 83, 
84, 85, 86, 87, 88, 41, 42, 43, 44, 45, 46, + 47, 96, 97, 98, 99, 100, 101, 102, 103, 104, 57, 58, 59, 60, 61, 62, 63, + 112, 113, 114, 115, 116, 117, 118, 119, 120, + ], + ) + } + 10 => { + simd_shuffle!( + a, + zero, + [ + 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 26, 27, 28, + 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 42, 43, 44, 45, 46, 47, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 58, 59, 60, 61, 62, 63, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, + ], + ) + } + 11 => { + simd_shuffle!( + a, + zero, + [ + 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 27, 28, 29, + 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 43, 44, 45, 46, 47, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 59, 60, 61, 62, 63, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, + ], + ) + } + 12 => { + simd_shuffle!( + a, + zero, + [ + 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 28, 29, 30, + 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 44, 45, 46, 47, 96, 97, + 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 60, 61, 62, 63, 112, 113, + 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, + ], + ) + } + 13 => { + simd_shuffle!( + a, + zero, + [ + 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 29, 30, 31, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 45, 46, 47, 96, 97, 98, + 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 61, 62, 63, 112, 113, 114, + 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, + ], + ) + } + 14 => { + simd_shuffle!( + a, + zero, + [ + 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 30, 31, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 46, 47, 96, 97, 98, 99, + 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 62, 63, 112, 113, 114, + 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, + ], + ) + } + 15 => { + simd_shuffle!( + a, + zero, + [ + 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 31, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 47, 96, 97, 98, 99, + 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 63, 112, 113, 114, + 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + ], + ) + } + _ => zero, + }; + transmute(r) + } +} + +/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst. +/// Unlike [`_mm_alignr_epi8`], [`_mm256_alignr_epi8`] functions, where the entire input vectors are concatenated to the temporary result, +/// this concatenation happens in 4 steps, where each step builds 32-byte temporary result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi8&expand=263) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_alignr_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + // If palignr is shifting the pair of vectors more than the size of two + // lanes, emit zero. + if IMM8 >= 32 { + return _mm512_setzero_si512(); + } + // If palignr is shifting the pair of input vectors more than one lane, + // but less than two lanes, convert to shifting in zeroes. 
+ let (a, b) = if IMM8 > 16 { + (_mm512_setzero_si512(), a) + } else { + (a, b) + }; + let a = a.as_i8x64(); + let b = b.as_i8x64(); + if IMM8 == 16 { + return transmute(a); + } + let r: i8x64 = match IMM8 % 16 { + 0 => { + simd_shuffle!( + b, + a, + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, + ], + ) + } + 1 => { + simd_shuffle!( + b, + a, + [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 80, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 112, + ], + ) + } + 2 => { + simd_shuffle!( + b, + a, + [ + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 80, 81, 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 112, 113, + ], + ) + } + 3 => { + simd_shuffle!( + b, + a, + [ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 96, 97, 98, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 112, 113, 114, + ], + ) + } + 4 => { + simd_shuffle!( + b, + a, + [ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 96, 97, 98, 99, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 62, 63, 112, 113, 114, 115, + ], + ) + } + 5 => { + simd_shuffle!( + b, + a, + [ + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 96, 97, 98, 99, 100, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 62, 63, 112, 113, 114, 115, 116, + ], + ) + } + 6 => { + simd_shuffle!( + b, + a, + [ + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 96, 97, 98, 99, 100, 101, 54, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 112, 113, 114, 115, 116, 117, + ], + ) + } + 7 => { + simd_shuffle!( + b, + a, + [ + 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 96, 97, 98, 99, 100, 101, 102, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 112, 113, 114, 115, 116, 117, 118, + ], + ) + } + 8 => { + simd_shuffle!( + b, + a, + [ + 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 24, 25, 26, + 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 40, 41, 42, 43, 44, 45, + 46, 47, 96, 97, 98, 99, 100, 101, 102, 103, 56, 57, 58, 59, 60, 61, 62, 63, + 112, 113, 114, 115, 116, 117, 118, 119, + ], + ) + } + 9 => { + simd_shuffle!( + b, + a, + [ + 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 25, 26, 27, + 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 41, 42, 43, 44, 45, 46, + 47, 96, 97, 98, 99, 100, 101, 102, 103, 104, 57, 58, 59, 60, 61, 62, 63, + 112, 113, 114, 115, 116, 117, 118, 119, 120, + ], + ) + } + 10 => { + simd_shuffle!( + b, + a, + [ + 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 26, 27, 28, + 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 
88, 89, 42, 43, 44, 45, 46, 47, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 58, 59, 60, 61, 62, 63, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, + ], + ) + } + 11 => { + simd_shuffle!( + b, + a, + [ + 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 27, 28, 29, + 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 43, 44, 45, 46, 47, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 59, 60, 61, 62, 63, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, + ], + ) + } + 12 => { + simd_shuffle!( + b, + a, + [ + 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 28, 29, 30, + 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 44, 45, 46, 47, 96, 97, + 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 60, 61, 62, 63, 112, 113, + 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, + ], + ) + } + 13 => { + simd_shuffle!( + b, + a, + [ + 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 29, 30, 31, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 45, 46, 47, 96, 97, 98, + 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 61, 62, 63, 112, 113, 114, + 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, + ], + ) + } + 14 => { + simd_shuffle!( + b, + a, + [ + 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 30, 31, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 46, 47, 96, 97, 98, 99, + 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 62, 63, 112, 113, 114, + 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, + ], + ) + } + 15 => { + simd_shuffle!( + b, + a, + [ + 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 31, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 47, 96, 97, 98, 99, + 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 63, 112, 113, 114, + 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + ], + ) + } + _ => unreachable_unchecked(), + }; + transmute(r) + } +} + +/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_alignr_epi8&expand=264) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_alignr_epi8( + src: __m512i, + k: __mmask64, + a: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_alignr_epi8::(a, b); + transmute(simd_select_bitmask(k, r.as_i8x64(), src.as_i8x64())) + } +} + +/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_alignr_epi8&expand=265)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_alignr_epi8<const IMM8: i32>(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ unsafe {
+ static_assert_uimm_bits!(IMM8, 8);
+ let r = _mm512_alignr_epi8::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i8x64(), i8x64::ZERO))
+ }
+}
+
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_alignr_epi8&expand=261)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(4)]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))]
+pub fn _mm256_mask_alignr_epi8<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask32,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ unsafe {
+ static_assert_uimm_bits!(IMM8, 8);
+ let r = _mm256_alignr_epi8::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i8x32(), src.as_i8x32()))
+ }
+}
+
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_alignr_epi8&expand=262)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))]
+pub fn _mm256_maskz_alignr_epi8<const IMM8: i32>(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ unsafe {
+ static_assert_uimm_bits!(IMM8, 8);
+ let r = _mm256_alignr_epi8::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i8x32(), i8x32::ZERO))
+ }
+}
+
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_alignr_epi8&expand=258)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(4)]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))]
+pub fn _mm_mask_alignr_epi8<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask16,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ unsafe {
+ static_assert_uimm_bits!(IMM8, 8);
+ let r = _mm_alignr_epi8::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i8x16(), src.as_i8x16()))
+ }
+}
+
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_alignr_epi8&expand=259)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))]
+pub fn _mm_maskz_alignr_epi8<const IMM8: i32>(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ unsafe {
+ static_assert_uimm_bits!(IMM8, 8);
+ let r = _mm_alignr_epi8::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i8x16(), i8x16::ZERO))
+ }
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi16_storeu_epi8&expand=1812)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm512_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) {
+ vpmovswbmem(mem_addr, a.as_i16x32(), k);
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi16_storeu_epi8&expand=1811)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm256_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) {
+ vpmovswbmem256(mem_addr, a.as_i16x16(), k);
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi16_storeu_epi8&expand=1810)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovswbmem128(mem_addr, a.as_i16x8(), k);
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi16_storeu_epi8&expand=1412)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub unsafe fn _mm512_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) {
+ vpmovwbmem(mem_addr, a.as_i16x32(), k);
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi16_storeu_epi8&expand=1411) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovwb))] +pub unsafe fn _mm256_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) { + vpmovwbmem256(mem_addr, a.as_i16x16(), k); +} + +/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi16_storeu_epi8&expand=1410) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovwb))] +pub unsafe fn _mm_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovwbmem128(mem_addr, a.as_i16x8(), k); +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi16_storeu_epi8&expand=2047) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub unsafe fn _mm512_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) { + vpmovuswbmem(mem_addr, a.as_i16x32(), k); +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi16_storeu_epi8&expand=2046) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub unsafe fn _mm256_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) { + vpmovuswbmem256(mem_addr, a.as_i16x16(), k); +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi16_storeu_epi8&expand=2045) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub unsafe fn _mm_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovuswbmem128(mem_addr, a.as_i16x8(), k); +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.pmul.hr.sw.512"] + fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32; + + #[link_name = "llvm.x86.avx512.pmaddw.d.512"] + fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16; + #[link_name = "llvm.x86.avx512.pmaddubs.w.512"] + fn vpmaddubsw(a: i8x64, b: i8x64) -> i16x32; + + #[link_name = "llvm.x86.avx512.packssdw.512"] + fn vpackssdw(a: i32x16, b: i32x16) -> i16x32; + #[link_name = "llvm.x86.avx512.packsswb.512"] + fn vpacksswb(a: i16x32, b: i16x32) -> i8x64; + #[link_name = "llvm.x86.avx512.packusdw.512"] + fn vpackusdw(a: i32x16, b: i32x16) -> u16x32; + #[link_name = "llvm.x86.avx512.packuswb.512"] + fn vpackuswb(a: i16x32, b: i16x32) -> u8x64; + + #[link_name = "llvm.x86.avx512.psll.w.512"] + fn vpsllw(a: i16x32, count: i16x8) -> i16x32; + + #[link_name = "llvm.x86.avx512.psllv.w.512"] + fn vpsllvw(a: i16x32, b: i16x32) -> i16x32; + #[link_name = "llvm.x86.avx512.psllv.w.256"] + fn vpsllvw256(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx512.psllv.w.128"] + fn vpsllvw128(a: i16x8, b: i16x8) -> i16x8; + + #[link_name = "llvm.x86.avx512.psrl.w.512"] + fn vpsrlw(a: i16x32, count: i16x8) -> i16x32; + + #[link_name = "llvm.x86.avx512.psrlv.w.512"] + fn vpsrlvw(a: i16x32, b: i16x32) -> i16x32; + #[link_name = "llvm.x86.avx512.psrlv.w.256"] + fn vpsrlvw256(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx512.psrlv.w.128"] + fn vpsrlvw128(a: i16x8, b: i16x8) -> i16x8; + + #[link_name = "llvm.x86.avx512.psra.w.512"] + fn vpsraw(a: i16x32, count: i16x8) -> i16x32; + + #[link_name = "llvm.x86.avx512.psrav.w.512"] + fn vpsravw(a: i16x32, count: i16x32) -> i16x32; + #[link_name = "llvm.x86.avx512.psrav.w.256"] + fn vpsravw256(a: i16x16, count: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx512.psrav.w.128"] + fn vpsravw128(a: i16x8, count: i16x8) -> i16x8; + + #[link_name = "llvm.x86.avx512.vpermi2var.hi.512"] + fn vpermi2w(a: i16x32, idx: i16x32, b: i16x32) -> i16x32; + #[link_name = "llvm.x86.avx512.vpermi2var.hi.256"] + fn vpermi2w256(a: i16x16, idx: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx512.vpermi2var.hi.128"] + fn vpermi2w128(a: i16x8, idx: i16x8, b: i16x8) -> i16x8; + + #[link_name = "llvm.x86.avx512.permvar.hi.512"] + fn vpermw(a: i16x32, idx: i16x32) -> i16x32; + #[link_name = "llvm.x86.avx512.permvar.hi.256"] + fn vpermw256(a: i16x16, idx: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx512.permvar.hi.128"] + fn vpermw128(a: i16x8, idx: i16x8) -> i16x8; + + #[link_name = "llvm.x86.avx512.pshuf.b.512"] + fn vpshufb(a: i8x64, b: i8x64) -> i8x64; + + #[link_name = "llvm.x86.avx512.psad.bw.512"] + fn vpsadbw(a: u8x64, b: u8x64) -> u64x8; + + #[link_name = "llvm.x86.avx512.dbpsadbw.512"] + fn vdbpsadbw(a: u8x64, b: u8x64, imm8: i32) -> u16x32; + #[link_name = "llvm.x86.avx512.dbpsadbw.256"] + fn vdbpsadbw256(a: u8x32, b: u8x32, imm8: i32) -> u16x16; + #[link_name = "llvm.x86.avx512.dbpsadbw.128"] + fn vdbpsadbw128(a: u8x16, b: u8x16, imm8: i32) -> u16x8; + + #[link_name = "llvm.x86.avx512.mask.pmovs.wb.512"] + fn 
vpmovswb(a: i16x32, src: i8x32, mask: u32) -> i8x32; + #[link_name = "llvm.x86.avx512.mask.pmovs.wb.256"] + fn vpmovswb256(a: i16x16, src: i8x16, mask: u16) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.wb.128"] + fn vpmovswb128(a: i16x8, src: i8x16, mask: u8) -> i8x16; + + #[link_name = "llvm.x86.avx512.mask.pmovus.wb.512"] + fn vpmovuswb(a: u16x32, src: u8x32, mask: u32) -> u8x32; + #[link_name = "llvm.x86.avx512.mask.pmovus.wb.256"] + fn vpmovuswb256(a: u16x16, src: u8x16, mask: u16) -> u8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.wb.128"] + fn vpmovuswb128(a: u16x8, src: u8x16, mask: u8) -> u8x16; + + #[link_name = "llvm.x86.avx512.mask.pmovs.wb.mem.512"] + fn vpmovswbmem(mem_addr: *mut i8, a: i16x32, mask: u32); + #[link_name = "llvm.x86.avx512.mask.pmovs.wb.mem.256"] + fn vpmovswbmem256(mem_addr: *mut i8, a: i16x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovs.wb.mem.128"] + fn vpmovswbmem128(mem_addr: *mut i8, a: i16x8, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmov.wb.mem.512"] + fn vpmovwbmem(mem_addr: *mut i8, a: i16x32, mask: u32); + #[link_name = "llvm.x86.avx512.mask.pmov.wb.mem.256"] + fn vpmovwbmem256(mem_addr: *mut i8, a: i16x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmov.wb.mem.128"] + fn vpmovwbmem128(mem_addr: *mut i8, a: i16x8, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.512"] + fn vpmovuswbmem(mem_addr: *mut i8, a: i16x32, mask: u32); + #[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.256"] + fn vpmovuswbmem256(mem_addr: *mut i8, a: i16x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.128"] + fn vpmovuswbmem128(mem_addr: *mut i8, a: i16x8, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.loadu.b.128"] + fn loaddqu8_128(mem_addr: *const i8, a: i8x16, mask: u16) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.loadu.w.128"] + fn loaddqu16_128(mem_addr: *const i16, a: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.loadu.b.256"] + fn loaddqu8_256(mem_addr: *const i8, a: i8x32, mask: u32) -> i8x32; + #[link_name = "llvm.x86.avx512.mask.loadu.w.256"] + fn loaddqu16_256(mem_addr: *const i16, a: i16x16, mask: u16) -> i16x16; + #[link_name = "llvm.x86.avx512.mask.loadu.b.512"] + fn loaddqu8_512(mem_addr: *const i8, a: i8x64, mask: u64) -> i8x64; + #[link_name = "llvm.x86.avx512.mask.loadu.w.512"] + fn loaddqu16_512(mem_addr: *const i16, a: i16x32, mask: u32) -> i16x32; + + #[link_name = "llvm.x86.avx512.mask.storeu.b.128"] + fn storedqu8_128(mem_addr: *mut i8, a: i8x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.storeu.w.128"] + fn storedqu16_128(mem_addr: *mut i16, a: i16x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.b.256"] + fn storedqu8_256(mem_addr: *mut i8, a: i8x32, mask: u32); + #[link_name = "llvm.x86.avx512.mask.storeu.w.256"] + fn storedqu16_256(mem_addr: *mut i16, a: i16x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.storeu.b.512"] + fn storedqu8_512(mem_addr: *mut i8, a: i8x64, mask: u64); + #[link_name = "llvm.x86.avx512.mask.storeu.w.512"] + fn storedqu16_512(mem_addr: *mut i16, a: i16x32, mask: u32); + +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + use crate::hint::black_box; + use crate::mem::{self}; + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_abs_epi16() { + let a = _mm512_set1_epi16(-1); + let r = _mm512_abs_epi16(a); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_abs_epi16() { + 
let a = _mm512_set1_epi16(-1); + let r = _mm512_mask_abs_epi16(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_abs_epi16(a, 0b00000000_11111111_00000000_11111111, a); + #[rustfmt::skip] + let e = _mm512_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_abs_epi16() { + let a = _mm512_set1_epi16(-1); + let r = _mm512_maskz_abs_epi16(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_abs_epi16(0b00000000_11111111_00000000_11111111, a); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_abs_epi16() { + let a = _mm256_set1_epi16(-1); + let r = _mm256_mask_abs_epi16(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_abs_epi16(a, 0b00000000_11111111, a); + let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_abs_epi16() { + let a = _mm256_set1_epi16(-1); + let r = _mm256_maskz_abs_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_abs_epi16(0b00000000_11111111, a); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_abs_epi16() { + let a = _mm_set1_epi16(-1); + let r = _mm_mask_abs_epi16(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_abs_epi16(a, 0b00001111, a); + let e = _mm_set_epi16(-1, -1, -1, -1, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_abs_epi16() { + let a = _mm_set1_epi16(-1); + let r = _mm_maskz_abs_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_abs_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_abs_epi8() { + let a = _mm512_set1_epi8(-1); + let r = _mm512_abs_epi8(a); + let e = _mm512_set1_epi8(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_abs_epi8() { + let a = _mm512_set1_epi8(-1); + let r = _mm512_mask_abs_epi8(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_abs_epi8( + a, + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_abs_epi8() { + let a = _mm512_set1_epi8(-1); + let r = _mm512_maskz_abs_epi8(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_abs_epi8( + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + 
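+ // Note: `_mm512_set_epi8` takes its arguments from element 63 down to element 0, so the
+ // elements selected by the low-order mask bits show up as the trailing arguments of each row.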
assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_abs_epi8() { + let a = _mm256_set1_epi8(-1); + let r = _mm256_mask_abs_epi8(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_abs_epi8(a, 0b00000000_11111111_00000000_11111111, a); + #[rustfmt::skip] + let e = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_abs_epi8() { + let a = _mm256_set1_epi8(-1); + let r = _mm256_maskz_abs_epi8(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_abs_epi8(0b00000000_11111111_00000000_11111111, a); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_abs_epi8() { + let a = _mm_set1_epi8(-1); + let r = _mm_mask_abs_epi8(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_abs_epi8(a, 0b00000000_11111111, a); + let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_abs_epi8() { + let a = _mm_set1_epi8(-1); + let r = _mm_maskz_abs_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_abs_epi8(0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_add_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(2); + let r = _mm512_add_epi16(a, b); + let e = _mm512_set1_epi16(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_add_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(2); + let r = _mm512_mask_add_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_add_epi16(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_add_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(2); + let r = _mm512_maskz_add_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_add_epi16(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_add_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(2); + let r = _mm256_mask_add_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_add_epi16(a, 0b00000000_11111111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_add_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(2); + let r = _mm256_maskz_add_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_add_epi16(0b00000000_11111111, a, b); + let e = 
_mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_add_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(2); + let r = _mm_mask_add_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_add_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 3, 3, 3, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_add_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(2); + let r = _mm_maskz_add_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_add_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 3, 3, 3, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_add_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(2); + let r = _mm512_add_epi8(a, b); + let e = _mm512_set1_epi8(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_add_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(2); + let r = _mm512_mask_add_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_add_epi8( + a, + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_add_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(2); + let r = _mm512_maskz_add_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_add_epi8( + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_add_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(2); + let r = _mm256_mask_add_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_add_epi8(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_add_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(2); + let r = _mm256_maskz_add_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_add_epi8(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_add_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(2); + let r = _mm_mask_add_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_add_epi8(a, 0b00000000_11111111, a, b); + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 
3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_add_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(2); + let r = _mm_maskz_add_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_add_epi8(0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_adds_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(u16::MAX as i16); + let r = _mm512_adds_epu16(a, b); + let e = _mm512_set1_epi16(u16::MAX as i16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_adds_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(u16::MAX as i16); + let r = _mm512_mask_adds_epu16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_adds_epu16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_adds_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(u16::MAX as i16); + let r = _mm512_maskz_adds_epu16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_adds_epu16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_adds_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(u16::MAX as i16); + let r = _mm256_mask_adds_epu16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_adds_epu16(a, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_adds_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(u16::MAX as i16); + let r = _mm256_maskz_adds_epu16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_adds_epu16(0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_adds_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(u16::MAX as i16); + let r = _mm_mask_adds_epu16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_adds_epu16(a, 0b00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi16(1, 1, 1, 1, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_adds_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(u16::MAX as i16); + let r = _mm_maskz_adds_epu16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_adds_epu16(0b00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi16(0, 0, 0, 0, u16::MAX 
as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_adds_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(u8::MAX as i8); + let r = _mm512_adds_epu8(a, b); + let e = _mm512_set1_epi8(u8::MAX as i8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_adds_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(u8::MAX as i8); + let r = _mm512_mask_adds_epu8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_adds_epu8( + a, + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_adds_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(u8::MAX as i8); + let r = _mm512_maskz_adds_epu8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_adds_epu8( + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_adds_epu8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(u8::MAX as i8); + let r = _mm256_mask_adds_epu8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_adds_epu8(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_adds_epu8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(u8::MAX as i8); + let r = _mm256_maskz_adds_epu8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_adds_epu8(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_adds_epu8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(u8::MAX as i8); + let r = _mm_mask_adds_epu8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_adds_epu8(a, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_adds_epu8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(u8::MAX as i8); + let r = _mm_maskz_adds_epu8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_adds_epu8(0b00000000_00001111, a, b); + 
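+ // Only mask bits 0..=3 are set, so just elements 0..=3 keep the saturated sum (u8::MAX);
+ // since `_mm_set_epi8` lists element 15 first, they appear as the last four arguments below.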
#[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_adds_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(i16::MAX); + let r = _mm512_adds_epi16(a, b); + let e = _mm512_set1_epi16(i16::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_adds_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(i16::MAX); + let r = _mm512_mask_adds_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_adds_epi16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_adds_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(i16::MAX); + let r = _mm512_maskz_adds_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_adds_epi16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_adds_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(i16::MAX); + let r = _mm256_mask_adds_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_adds_epi16(a, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_adds_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(i16::MAX); + let r = _mm256_maskz_adds_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_adds_epi16(0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_adds_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(i16::MAX); + let r = _mm_mask_adds_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_adds_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_adds_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(i16::MAX); + let r = _mm_maskz_adds_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_adds_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_adds_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(i8::MAX); + let r = _mm512_adds_epi8(a, b); + let e = _mm512_set1_epi8(i8::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_adds_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(i8::MAX); + let r = 
_mm512_mask_adds_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_adds_epi8( + a, + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_adds_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(i8::MAX); + let r = _mm512_maskz_adds_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_adds_epi8( + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_adds_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(i8::MAX); + let r = _mm256_mask_adds_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_adds_epi8(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_adds_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(i8::MAX); + let r = _mm256_maskz_adds_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_adds_epi8(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_adds_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(i8::MAX); + let r = _mm_mask_adds_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_adds_epi8(a, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_adds_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(i8::MAX); + let r = _mm_maskz_adds_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_adds_epi8(0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_sub_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(2); + let r = _mm512_sub_epi16(a, b); + let e = _mm512_set1_epi16(-1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_sub_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(2); + let r = _mm512_mask_sub_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_sub_epi16(a, 0b00000000_11111111_00000000_11111111, a, 
b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_sub_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(2); + let r = _mm512_maskz_sub_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sub_epi16(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_sub_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(2); + let r = _mm256_mask_sub_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_sub_epi16(a, 0b00000000_11111111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_sub_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(2); + let r = _mm256_maskz_sub_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sub_epi16(0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_sub_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(2); + let r = _mm_mask_sub_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_sub_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, -1, -1, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_sub_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(2); + let r = _mm_maskz_sub_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sub_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, -1, -1, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_sub_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(2); + let r = _mm512_sub_epi8(a, b); + let e = _mm512_set1_epi8(-1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_sub_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(2); + let r = _mm512_mask_sub_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_sub_epi8( + a, + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_sub_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(2); + let r = _mm512_maskz_sub_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sub_epi8( + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 
0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_sub_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(2); + let r = _mm256_mask_sub_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_sub_epi8(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_sub_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(2); + let r = _mm256_maskz_sub_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sub_epi8(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_sub_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(2); + let r = _mm_mask_sub_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_sub_epi8(a, 0b00000000_11111111, a, b); + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_sub_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(2); + let r = _mm_maskz_sub_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sub_epi8(0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_subs_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(u16::MAX as i16); + let r = _mm512_subs_epu16(a, b); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_subs_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(u16::MAX as i16); + let r = _mm512_mask_subs_epu16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_subs_epu16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_subs_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(u16::MAX as i16); + let r = _mm512_maskz_subs_epu16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_subs_epu16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_subs_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(u16::MAX as i16); + let r = _mm256_mask_subs_epu16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_subs_epu16(a, 0b00000000_00001111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_subs_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(u16::MAX as i16); + let r = _mm256_maskz_subs_epu16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_subs_epu16(0b00000000_00001111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_subs_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(u16::MAX as i16); + let r = _mm_mask_subs_epu16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_subs_epu16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_subs_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(u16::MAX as i16); + let r = _mm_maskz_subs_epu16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_subs_epu16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_subs_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(u8::MAX as i8); + let r = _mm512_subs_epu8(a, b); + let e = _mm512_set1_epi8(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_subs_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(u8::MAX as i8); + let r = _mm512_mask_subs_epu8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_subs_epu8( + a, + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_subs_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(u8::MAX as i8); + let r = _mm512_maskz_subs_epu8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_subs_epu8( + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_subs_epu8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(u8::MAX as i8); + let r = _mm256_mask_subs_epu8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_subs_epu8(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_subs_epu8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(u8::MAX as i8); + let r = _mm256_maskz_subs_epu8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_subs_epu8(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] 
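+ // `1 - u8::MAX` saturates to 0, so even the masked-in elements are 0 and the expected vector is all zeros.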
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_subs_epu8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(u8::MAX as i8); + let r = _mm_mask_subs_epu8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_subs_epu8(a, 0b00000000_00001111, a, b); + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_subs_epu8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(u8::MAX as i8); + let r = _mm_maskz_subs_epu8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_subs_epu8(0b00000000_00001111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_subs_epi16() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(i16::MAX); + let r = _mm512_subs_epi16(a, b); + let e = _mm512_set1_epi16(i16::MIN); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_subs_epi16() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(i16::MAX); + let r = _mm512_mask_subs_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_subs_epi16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i16::MIN, i16::MIN, i16::MIN, i16::MIN); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_subs_epi16() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(i16::MAX); + let r = _mm512_maskz_subs_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_subs_epi16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MIN, i16::MIN, i16::MIN, i16::MIN); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_subs_epi16() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(i16::MAX); + let r = _mm256_mask_subs_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_subs_epi16(a, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i16::MIN, i16::MIN, i16::MIN, i16::MIN); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_subs_epi16() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(i16::MAX); + let r = _mm256_maskz_subs_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_subs_epi16(0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MIN, i16::MIN, i16::MIN, i16::MIN); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_subs_epi16() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(i16::MAX); + let r = _mm_mask_subs_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_subs_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(-1, -1, -1, -1, i16::MIN, i16::MIN, i16::MIN, i16::MIN); + assert_eq_m128i(r, e); + 
} + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_subs_epi16() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(i16::MAX); + let r = _mm_maskz_subs_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_subs_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, i16::MIN, i16::MIN, i16::MIN, i16::MIN); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_subs_epi8() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(i8::MAX); + let r = _mm512_subs_epi8(a, b); + let e = _mm512_set1_epi8(i8::MIN); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_subs_epi8() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(i8::MAX); + let r = _mm512_mask_subs_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_subs_epi8( + a, + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i8::MIN, i8::MIN, i8::MIN, i8::MIN); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_subs_epi8() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(i8::MAX); + let r = _mm512_maskz_subs_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_subs_epi8( + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MIN, i8::MIN, i8::MIN); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_subs_epi8() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(i8::MAX); + let r = _mm256_mask_subs_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_subs_epi8(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i8::MIN, i8::MIN, i8::MIN, i8::MIN); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_subs_epi8() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(i8::MAX); + let r = _mm256_maskz_subs_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_subs_epi8(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MIN, i8::MIN, i8::MIN); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_subs_epi8() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(i8::MAX); + let r = _mm_mask_subs_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_subs_epi8(a, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i8::MIN, i8::MIN, i8::MIN, i8::MIN); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe 
fn test_mm_maskz_subs_epi8() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(i8::MAX); + let r = _mm_maskz_subs_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_subs_epi8(0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MIN, i8::MIN, i8::MIN); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mulhi_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mulhi_epu16(a, b); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_mulhi_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mask_mulhi_epu16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_mulhi_epu16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_mulhi_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_mulhi_epu16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mulhi_epu16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_mulhi_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_mask_mulhi_epu16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_mulhi_epu16(a, 0b00000000_00001111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_mulhi_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_mulhi_epu16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mulhi_epu16(0b00000000_00001111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_mulhi_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_mask_mulhi_epu16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_mulhi_epu16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_mulhi_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_mulhi_epu16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mulhi_epu16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mulhi_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mulhi_epi16(a, b); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_mulhi_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mask_mulhi_epi16(a, 0, a, b); + 
assert_eq_m512i(r, a); + let r = _mm512_mask_mulhi_epi16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_mulhi_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_mulhi_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mulhi_epi16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_mulhi_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_mask_mulhi_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_mulhi_epi16(a, 0b00000000_00001111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_mulhi_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_mulhi_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mulhi_epi16(0b00000000_00001111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_mulhi_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_mask_mulhi_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_mulhi_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_mulhi_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_mulhi_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mulhi_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mulhrs_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mulhrs_epi16(a, b); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_mulhrs_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mask_mulhrs_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_mulhrs_epi16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_mulhrs_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_mulhrs_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mulhrs_epi16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + 
unsafe fn test_mm256_mask_mulhrs_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_mask_mulhrs_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_mulhrs_epi16(a, 0b00000000_00001111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_mulhrs_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_mulhrs_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mulhrs_epi16(0b00000000_00001111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_mulhrs_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_mask_mulhrs_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_mulhrs_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_mulhrs_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_mulhrs_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mulhrs_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mullo_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mullo_epi16(a, b); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_mullo_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mask_mullo_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_mullo_epi16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_mullo_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_mullo_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mullo_epi16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_mullo_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_mask_mullo_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_mullo_epi16(a, 0b00000000_00001111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_mullo_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_mullo_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mullo_epi16(0b00000000_00001111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + 
unsafe fn test_mm_mask_mullo_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_mask_mullo_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_mullo_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_mullo_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_mullo_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mullo_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_max_epu16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_max_epu16(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_epu16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_max_epu16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_max_epu16(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_epu16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_max_epu16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_max_epu16(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_max_epu16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_max_epu16(a, 0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_max_epu16(0, a, b); + 
assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_max_epu16(0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_max_epu16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_max_epu16(a, 0b00001111, a, b); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_max_epu16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_max_epu16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_max_epu8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_max_epu8(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_epu8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_max_epu8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_max_epu8( + a, + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_epu8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 
2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_max_epu8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_max_epu8( + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_epu8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_max_epu8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_max_epu8(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_epu8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_max_epu8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_max_epu8(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_max_epu8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_max_epu8(a, 0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_max_epu8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_max_epu8(0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_max_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_max_epi16(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(15, 14, 
13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_max_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_max_epi16(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_max_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_max_epi16(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_max_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_max_epi16(a, 0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_max_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_max_epi16(0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_max_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_max_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_max_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_max_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_max_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_max_epi8(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_max_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_max_epi8( + a, + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_max_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_max_epi8( + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_max_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_max_epi8(a, 
0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_max_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_max_epi8(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_max_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_max_epi8(a, 0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_max_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_max_epi8(0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_min_epu16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_min_epu16(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_epu16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_min_epu16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_min_epu16(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_epu16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 
11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_min_epu16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_min_epu16(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_min_epu16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_min_epu16(a, 0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_min_epu16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_min_epu16(0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_min_epu16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epu16(a, 0b00001111, a, b); + let e = _mm_set_epi16(0, 1, 2, 3, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_min_epu16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epu16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_min_epu8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_min_epu8(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_epu8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 
4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_min_epu8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_min_epu8( + a, + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_epu8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_min_epu8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_min_epu8( + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_epu8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_min_epu8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_min_epu8(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_epu8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_min_epu8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_min_epu8(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_min_epu8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epu8(a, 
0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_min_epu8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epu8(0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_min_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_min_epi16(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_min_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_min_epi16(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_min_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_min_epi16(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_min_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_min_epi16(a, 0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_min_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_min_epi16(0b00000000_11111111, a, b); 
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_min_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(0, 1, 2, 3, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_min_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_min_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_min_epi8(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_min_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_min_epi8( + a, + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_min_epi8(0, a, b); + 
assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_min_epi8( + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_min_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_min_epi8(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_min_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_min_epi8(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_min_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epi8(a, 0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_min_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epi8(0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmplt_epu16_mask() { + let a = _mm512_set1_epi16(-2); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmplt_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmplt_epu16_mask() { + let a = _mm512_set1_epi16(-2); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmplt_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmplt_epu16_mask() { + let a = _mm256_set1_epi16(-2); + let b = 
_mm256_set1_epi16(-1); + let m = _mm256_cmplt_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmplt_epu16_mask() { + let a = _mm256_set1_epi16(-2); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmplt_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmplt_epu16_mask() { + let a = _mm_set1_epi16(-2); + let b = _mm_set1_epi16(-1); + let m = _mm_cmplt_epu16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmplt_epu16_mask() { + let a = _mm_set1_epi16(-2); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmplt_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmplt_epu8_mask() { + let a = _mm512_set1_epi8(-2); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmplt_epu8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmplt_epu8_mask() { + let a = _mm512_set1_epi8(-2); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmplt_epu8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmplt_epu8_mask() { + let a = _mm256_set1_epi8(-2); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmplt_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmplt_epu8_mask() { + let a = _mm256_set1_epi8(-2); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmplt_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmplt_epu8_mask() { + let a = _mm_set1_epi8(-2); + let b = _mm_set1_epi8(-1); + let m = _mm_cmplt_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmplt_epu8_mask() { + let a = _mm_set1_epi8(-2); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmplt_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmplt_epi16_mask() { + let a = _mm512_set1_epi16(-2); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmplt_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmplt_epi16_mask() { + let a = _mm512_set1_epi16(-2); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmplt_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmplt_epi16_mask() { + let a = _mm256_set1_epi16(-2); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmplt_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmplt_epi16_mask() { + let a = 
_mm256_set1_epi16(-2); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmplt_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmplt_epi16_mask() { + let a = _mm_set1_epi16(-2); + let b = _mm_set1_epi16(-1); + let m = _mm_cmplt_epi16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmplt_epi16_mask() { + let a = _mm_set1_epi16(-2); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmplt_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmplt_epi8_mask() { + let a = _mm512_set1_epi8(-2); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmplt_epi8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmplt_epi8_mask() { + let a = _mm512_set1_epi8(-2); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmplt_epi8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmplt_epi8_mask() { + let a = _mm256_set1_epi8(-2); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmplt_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmplt_epi8_mask() { + let a = _mm256_set1_epi8(-2); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmplt_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmplt_epi8_mask() { + let a = _mm_set1_epi8(-2); + let b = _mm_set1_epi8(-1); + let m = _mm_cmplt_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmplt_epi8_mask() { + let a = _mm_set1_epi8(-2); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmplt_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpgt_epu16_mask() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(1); + let m = _mm512_cmpgt_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpgt_epu16_mask() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpgt_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpgt_epu16_mask() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(1); + let m = _mm256_cmpgt_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epu16_mask() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpgt_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] 
+ unsafe fn test_mm_cmpgt_epu16_mask() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(1); + let m = _mm_cmpgt_epu16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpgt_epu16_mask() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(1); + let mask = 0b01010101; + let r = _mm_mask_cmpgt_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpgt_epu8_mask() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(1); + let m = _mm512_cmpgt_epu8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpgt_epu8_mask() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpgt_epu8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpgt_epu8_mask() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(1); + let m = _mm256_cmpgt_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epu8_mask() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpgt_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpgt_epu8_mask() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(1); + let m = _mm_cmpgt_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpgt_epu8_mask() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpgt_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpgt_epi16_mask() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmpgt_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpgt_epi16_mask() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpgt_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpgt_epi16_mask() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmpgt_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epi16_mask() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpgt_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpgt_epi16_mask() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(-1); + let m = _mm_cmpgt_epi16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn 
test_mm_mask_cmpgt_epi16_mask() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmpgt_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpgt_epi8_mask() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmpgt_epi8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpgt_epi8_mask() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpgt_epi8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpgt_epi8_mask() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmpgt_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epi8_mask() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpgt_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpgt_epi8_mask() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(-1); + let m = _mm_cmpgt_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpgt_epi8_mask() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpgt_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmple_epu16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmple_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmple_epu16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmple_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmple_epu16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmple_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmple_epu16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmple_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmple_epu16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let m = _mm_cmple_epu16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmple_epu16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmple_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = 
"avx512bw")] + unsafe fn test_mm512_cmple_epu8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmple_epu8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmple_epu8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmple_epu8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmple_epu8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmple_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmple_epu8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmple_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmple_epu8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let m = _mm_cmple_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmple_epu8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmple_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmple_epi16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmple_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmple_epi16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmple_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmple_epi16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmple_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmple_epi16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmple_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmple_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let m = _mm_cmple_epi16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmple_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmple_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmple_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmple_epi8_mask(a, b); + assert_eq!( + m, + 
0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmple_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmple_epi8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmple_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmple_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmple_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmple_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmple_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let m = _mm_cmple_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmple_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmple_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpge_epu16_mask() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let m = _mm512_cmpge_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpge_epu16_mask() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpge_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpge_epu16_mask() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let m = _mm256_cmpge_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epu16_mask() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpge_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpge_epu16_mask() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let m = _mm_cmpge_epu16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpge_epu16_mask() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let mask = 0b01010101; + let r = _mm_mask_cmpge_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpge_epu8_mask() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let m = _mm512_cmpge_epu8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpge_epu8_mask() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let 
mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpge_epu8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpge_epu8_mask() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let m = _mm256_cmpge_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epu8_mask() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpge_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpge_epu8_mask() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let m = _mm_cmpge_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpge_epu8_mask() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpge_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpge_epi16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmpge_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpge_epi16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpge_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpge_epi16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmpge_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epi16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpge_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpge_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let m = _mm_cmpge_epi16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpge_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmpge_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpge_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmpge_epi8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpge_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpge_epi8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + 
); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpge_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmpge_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpge_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpge_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let m = _mm_cmpge_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpge_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpge_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpeq_epu16_mask() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let m = _mm512_cmpeq_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpeq_epu16_mask() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpeq_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpeq_epu16_mask() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let m = _mm256_cmpeq_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epu16_mask() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpeq_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpeq_epu16_mask() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let m = _mm_cmpeq_epu16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epu16_mask() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let mask = 0b01010101; + let r = _mm_mask_cmpeq_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpeq_epu8_mask() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let m = _mm512_cmpeq_epu8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpeq_epu8_mask() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpeq_epu8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpeq_epu8_mask() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let m = _mm256_cmpeq_epu8_mask(a, b); + assert_eq!(m, 
0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epu8_mask() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpeq_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpeq_epu8_mask() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let m = _mm_cmpeq_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epu8_mask() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpeq_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpeq_epi16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmpeq_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpeq_epi16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpeq_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpeq_epi16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmpeq_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epi16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpeq_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpeq_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let m = _mm_cmpeq_epi16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmpeq_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpeq_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmpeq_epi8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpeq_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpeq_epi8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpeq_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmpeq_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let mask = 
0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpeq_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpeq_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let m = _mm_cmpeq_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpeq_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpneq_epu16_mask() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(1); + let m = _mm512_cmpneq_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpneq_epu16_mask() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpneq_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpneq_epu16_mask() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(1); + let m = _mm256_cmpneq_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epu16_mask() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpneq_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpneq_epu16_mask() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(1); + let m = _mm_cmpneq_epu16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpneq_epu16_mask() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(1); + let mask = 0b01010101; + let r = _mm_mask_cmpneq_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpneq_epu8_mask() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(1); + let m = _mm512_cmpneq_epu8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpneq_epu8_mask() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpneq_epu8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpneq_epu8_mask() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(1); + let m = _mm256_cmpneq_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epu8_mask() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpneq_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn 
test_mm_cmpneq_epu8_mask() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(1); + let m = _mm_cmpneq_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpneq_epu8_mask() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpneq_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpneq_epi16_mask() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmpneq_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpneq_epi16_mask() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpneq_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpneq_epi16_mask() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmpneq_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epi16_mask() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpneq_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpneq_epi16_mask() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(-1); + let m = _mm_cmpneq_epi16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpneq_epi16_mask() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmpneq_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpneq_epi8_mask() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmpneq_epi8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpneq_epi8_mask() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpneq_epi8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpneq_epi8_mask() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmpneq_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epi8_mask() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpneq_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpneq_epi8_mask() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(-1); + let m = _mm_cmpneq_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] 
+ unsafe fn test_mm_mask_cmpneq_epi8_mask() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpneq_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmp_epu16_mask() { + let a = _mm512_set1_epi16(0); + let b = _mm512_set1_epi16(1); + let m = _mm512_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmp_epu16_mask() { + let a = _mm512_set1_epi16(0); + let b = _mm512_set1_epi16(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmp_epu16_mask() { + let a = _mm256_set1_epi16(0); + let b = _mm256_set1_epi16(1); + let m = _mm256_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmp_epu16_mask() { + let a = _mm256_set1_epi16(0); + let b = _mm256_set1_epi16(1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmp_epu16_mask() { + let a = _mm_set1_epi16(0); + let b = _mm_set1_epi16(1); + let m = _mm_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmp_epu16_mask() { + let a = _mm_set1_epi16(0); + let b = _mm_set1_epi16(1); + let mask = 0b01010101; + let r = _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmp_epu8_mask() { + let a = _mm512_set1_epi8(0); + let b = _mm512_set1_epi8(1); + let m = _mm512_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmp_epu8_mask() { + let a = _mm512_set1_epi8(0); + let b = _mm512_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmp_epu8_mask() { + let a = _mm256_set1_epi8(0); + let b = _mm256_set1_epi8(1); + let m = _mm256_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmp_epu8_mask() { + let a = _mm256_set1_epi8(0); + let b = _mm256_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmp_epu8_mask() { + let a = _mm_set1_epi8(0); + let b = _mm_set1_epi8(1); + let m = _mm_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmp_epu8_mask() { + let a = _mm_set1_epi8(0); + let b = _mm_set1_epi8(1); + 
let mask = 0b01010101_01010101; + let r = _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmp_epi16_mask() { + let a = _mm512_set1_epi16(0); + let b = _mm512_set1_epi16(1); + let m = _mm512_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmp_epi16_mask() { + let a = _mm512_set1_epi16(0); + let b = _mm512_set1_epi16(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmp_epi16_mask() { + let a = _mm256_set1_epi16(0); + let b = _mm256_set1_epi16(1); + let m = _mm256_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmp_epi16_mask() { + let a = _mm256_set1_epi16(0); + let b = _mm256_set1_epi16(1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmp_epi16_mask() { + let a = _mm_set1_epi16(0); + let b = _mm_set1_epi16(1); + let m = _mm_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmp_epi16_mask() { + let a = _mm_set1_epi16(0); + let b = _mm_set1_epi16(1); + let mask = 0b01010101; + let r = _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmp_epi8_mask() { + let a = _mm512_set1_epi8(0); + let b = _mm512_set1_epi8(1); + let m = _mm512_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmp_epi8_mask() { + let a = _mm512_set1_epi8(0); + let b = _mm512_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmp_epi8_mask() { + let a = _mm256_set1_epi8(0); + let b = _mm256_set1_epi8(1); + let m = _mm256_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmp_epi8_mask() { + let a = _mm256_set1_epi8(0); + let b = _mm256_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmp_epi8_mask() { + let a = _mm_set1_epi8(0); + let b = _mm_set1_epi8(1); + let m = _mm_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmp_epi8_mask() { + let a = _mm_set1_epi8(0); + let b = _mm_set1_epi8(1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, 
b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_add_epi16() { + let a = _mm256_set1_epi16(1); + let e = _mm256_reduce_add_epi16(a); + assert_eq!(16, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_add_epi16() { + let a = _mm256_set1_epi16(1); + let e = _mm256_mask_reduce_add_epi16(0b11111111_00000000, a); + assert_eq!(8, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_add_epi16() { + let a = _mm_set1_epi16(1); + let e = _mm_reduce_add_epi16(a); + assert_eq!(8, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_add_epi16() { + let a = _mm_set1_epi16(1); + let e = _mm_mask_reduce_add_epi16(0b11110000, a); + assert_eq!(4, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_add_epi8() { + let a = _mm256_set1_epi8(1); + let e = _mm256_reduce_add_epi8(a); + assert_eq!(32, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_add_epi8() { + let a = _mm256_set1_epi8(1); + let e = _mm256_mask_reduce_add_epi8(0b11111111_00000000_11111111_00000000, a); + assert_eq!(16, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_add_epi8() { + let a = _mm_set1_epi8(1); + let e = _mm_reduce_add_epi8(a); + assert_eq!(16, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_add_epi8() { + let a = _mm_set1_epi8(1); + let e = _mm_mask_reduce_add_epi8(0b11111111_00000000, a); + assert_eq!(8, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_and_epi16() { + let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm256_reduce_and_epi16(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_and_epi16() { + let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm256_mask_reduce_and_epi16(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_and_epi16() { + let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); + let e = _mm_reduce_and_epi16(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_and_epi16() { + let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); + let e = _mm_mask_reduce_and_epi16(0b11110000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_and_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, + ); + let e = _mm256_reduce_and_epi8(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_and_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, + ); + let e = _mm256_mask_reduce_and_epi8(0b11111111_00000000_11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_and_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm_reduce_and_epi8(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_and_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = 
_mm_mask_reduce_and_epi8(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_mul_epi16() { + let a = _mm256_set_epi16(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + let e = _mm256_reduce_mul_epi16(a); + assert_eq!(256, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_mul_epi16() { + let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm256_mask_reduce_mul_epi16(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_mul_epi16() { + let a = _mm_set_epi16(2, 2, 2, 2, 1, 1, 1, 1); + let e = _mm_reduce_mul_epi16(a); + assert_eq!(16, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_mul_epi16() { + let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); + let e = _mm_mask_reduce_mul_epi16(0b11110000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_mul_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, + ); + let e = _mm256_reduce_mul_epi8(a); + assert_eq!(64, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_mul_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, + ); + let e = _mm256_mask_reduce_mul_epi8(0b11111111_00000000_11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_mul_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2); + let e = _mm_reduce_mul_epi8(a); + assert_eq!(8, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_mul_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2); + let e = _mm_mask_reduce_mul_epi8(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_max_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i16 = _mm256_reduce_max_epi16(a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_max_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i16 = _mm256_mask_reduce_max_epi16(0b11111111_00000000, a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_max_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: i16 = _mm_reduce_max_epi16(a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_max_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: i16 = _mm_mask_reduce_max_epi16(0b11110000, a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_max_epi8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: i8 = _mm256_reduce_max_epi8(a); + assert_eq!(31, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_max_epi8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 
); + let e: i8 = _mm256_mask_reduce_max_epi8(0b1111111111111111_0000000000000000, a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_max_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i8 = _mm_reduce_max_epi8(a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_max_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i8 = _mm_mask_reduce_max_epi8(0b11111111_00000000, a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_max_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u16 = _mm256_reduce_max_epu16(a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_max_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u16 = _mm256_mask_reduce_max_epu16(0b11111111_00000000, a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_max_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: u16 = _mm_reduce_max_epu16(a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_max_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: u16 = _mm_mask_reduce_max_epu16(0b11110000, a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_max_epu8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: u8 = _mm256_reduce_max_epu8(a); + assert_eq!(31, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_max_epu8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: u8 = _mm256_mask_reduce_max_epu8(0b1111111111111111_0000000000000000, a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_max_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u8 = _mm_reduce_max_epu8(a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_max_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u8 = _mm_mask_reduce_max_epu8(0b11111111_00000000, a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_min_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i16 = _mm256_reduce_min_epi16(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_min_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i16 = _mm256_mask_reduce_min_epi16(0b11111111_00000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_min_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: i16 = _mm_reduce_min_epi16(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_min_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: i16 = 
_mm_mask_reduce_min_epi16(0b11110000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_min_epi8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: i8 = _mm256_reduce_min_epi8(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_min_epi8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: i8 = _mm256_mask_reduce_min_epi8(0b1111111111111111_0000000000000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_min_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i8 = _mm_reduce_min_epi8(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_min_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i8 = _mm_mask_reduce_min_epi8(0b11111111_00000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_min_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u16 = _mm256_reduce_min_epu16(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_min_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u16 = _mm256_mask_reduce_min_epu16(0b11111111_00000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_min_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: u16 = _mm_reduce_min_epu16(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_min_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: u16 = _mm_mask_reduce_min_epu16(0b11110000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_min_epu8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: u8 = _mm256_reduce_min_epu8(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_min_epu8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: u8 = _mm256_mask_reduce_min_epu8(0b1111111111111111_0000000000000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_min_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u8 = _mm_reduce_min_epu8(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_min_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u8 = _mm_mask_reduce_min_epu8(0b11111111_00000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_or_epi16() { + let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm256_reduce_or_epi16(a); + assert_eq!(3, e); + } + + #[simd_test(enable = 
"avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_or_epi16() { + let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm256_mask_reduce_or_epi16(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_or_epi16() { + let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); + let e = _mm_reduce_or_epi16(a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_or_epi16() { + let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); + let e = _mm_mask_reduce_or_epi16(0b11110000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_or_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, + ); + let e = _mm256_reduce_or_epi8(a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_or_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, + ); + let e = _mm256_mask_reduce_or_epi8(0b11111111_00000000_11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_or_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm_reduce_or_epi8(a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_or_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm_mask_reduce_or_epi8(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_loadu_epi16() { + #[rustfmt::skip] + let a: [i16; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let r = _mm512_loadu_epi16(&a[0]); + #[rustfmt::skip] + let e = _mm512_set_epi16(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_loadu_epi16() { + let a: [i16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let r = _mm256_loadu_epi16(&a[0]); + let e = _mm256_set_epi16(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_loadu_epi16() { + let a: [i16; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; + let r = _mm_loadu_epi16(&a[0]); + let e = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_loadu_epi8() { + #[rustfmt::skip] + let a: [i8; 64] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let r = _mm512_loadu_epi8(&a[0]); + #[rustfmt::skip] + let e = _mm512_set_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_loadu_epi8() { + #[rustfmt::skip] + 
let a: [i8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let r = _mm256_loadu_epi8(&a[0]); + #[rustfmt::skip] + let e = _mm256_set_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_loadu_epi8() { + let a: [i8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let r = _mm_loadu_epi8(&a[0]); + let e = _mm_set_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_storeu_epi16() { + let a = _mm512_set1_epi16(9); + let mut r = _mm512_undefined_epi32(); + _mm512_storeu_epi16(&mut r as *mut _ as *mut i16, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_storeu_epi16() { + let a = _mm256_set1_epi16(9); + let mut r = _mm256_set1_epi32(0); + _mm256_storeu_epi16(&mut r as *mut _ as *mut i16, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_storeu_epi16() { + let a = _mm_set1_epi16(9); + let mut r = _mm_set1_epi32(0); + _mm_storeu_epi16(&mut r as *mut _ as *mut i16, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_storeu_epi8() { + let a = _mm512_set1_epi8(9); + let mut r = _mm512_undefined_epi32(); + _mm512_storeu_epi8(&mut r as *mut _ as *mut i8, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_storeu_epi8() { + let a = _mm256_set1_epi8(9); + let mut r = _mm256_set1_epi32(0); + _mm256_storeu_epi8(&mut r as *mut _ as *mut i8, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_storeu_epi8() { + let a = _mm_set1_epi8(9); + let mut r = _mm_set1_epi32(0); + _mm_storeu_epi8(&mut r as *mut _ as *mut i8, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_loadu_epi16() { + let src = _mm512_set1_epi16(42); + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm512_mask_loadu_epi16(src, m, black_box(p)); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm512_loadu_epi16(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_maskz_loadu_epi16() { + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm512_maskz_loadu_epi16(m, black_box(p)); + let e = &[ + 0_i16, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, + 26, 0, 28, 0, 30, 0, 32, + ]; + let e = _mm512_loadu_epi16(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_storeu_epi16() { + let mut r = [42_i16; 32]; + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let a = _mm512_loadu_epi16(a.as_ptr()); + 
let m = 0b10101010_11001100_11101000_11001010; + _mm512_mask_storeu_epi16(r.as_mut_ptr(), m, a); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm512_loadu_epi16(e.as_ptr()); + assert_eq_m512i(_mm512_loadu_epi16(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_loadu_epi8() { + let src = _mm512_set1_epi8(42); + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let p = a.as_ptr(); + let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; + let r = _mm512_mask_loadu_epi8(src, m, black_box(p)); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, 42, 42, 42, 42, 42, 42, 42, 42, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 42, 42, 42, 42, 42, 42, 42, 42, + ]; + let e = _mm512_loadu_epi8(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_maskz_loadu_epi8() { + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let p = a.as_ptr(); + let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; + let r = _mm512_maskz_loadu_epi8(m, black_box(p)); + let e = &[ + 0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, + 26, 0, 28, 0, 30, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + let e = _mm512_loadu_epi8(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_storeu_epi8() { + let mut r = [42_i8; 64]; + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let a = _mm512_loadu_epi8(a.as_ptr()); + let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; + _mm512_mask_storeu_epi8(r.as_mut_ptr(), m, a); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, 42, 42, 42, 42, 42, 42, 42, 42, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 42, 42, 42, 42, 42, 42, 42, 42, + ]; + let e = _mm512_loadu_epi8(e.as_ptr()); + assert_eq_m512i(_mm512_loadu_epi8(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi16() { + let src = _mm256_set1_epi16(42); + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm256_mask_loadu_epi16(src, m, black_box(p)); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm256_loadu_epi16(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = 
"avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi16() { + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm256_maskz_loadu_epi16(m, black_box(p)); + let e = &[0_i16, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16]; + let e = _mm256_loadu_epi16(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi16() { + let mut r = [42_i16; 16]; + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let a = _mm256_loadu_epi16(a.as_ptr()); + let m = 0b11101000_11001010; + _mm256_mask_storeu_epi16(r.as_mut_ptr(), m, a); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm256_loadu_epi16(e.as_ptr()); + assert_eq_m256i(_mm256_loadu_epi16(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi8() { + let src = _mm256_set1_epi8(42); + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm256_mask_loadu_epi8(src, m, black_box(p)); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm256_loadu_epi8(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi8() { + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm256_maskz_loadu_epi8(m, black_box(p)); + let e = &[ + 0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, + 26, 0, 28, 0, 30, 0, 32, + ]; + let e = _mm256_loadu_epi8(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi8() { + let mut r = [42_i8; 32]; + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let a = _mm256_loadu_epi8(a.as_ptr()); + let m = 0b10101010_11001100_11101000_11001010; + _mm256_mask_storeu_epi8(r.as_mut_ptr(), m, a); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm256_loadu_epi8(e.as_ptr()); + assert_eq_m256i(_mm256_loadu_epi8(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_loadu_epi16() { + let src = _mm_set1_epi16(42); + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm_mask_loadu_epi16(src, m, black_box(p)); + let e = &[42_i16, 2, 42, 4, 42, 42, 7, 8]; + let e = _mm_loadu_epi16(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi16() { + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm_maskz_loadu_epi16(m, black_box(p)); + let e = &[0_i16, 2, 0, 4, 0, 0, 7, 8]; + let e = _mm_loadu_epi16(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn 
test_mm_mask_storeu_epi16() { + let mut r = [42_i16; 8]; + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let a = _mm_loadu_epi16(a.as_ptr()); + let m = 0b11001010; + _mm_mask_storeu_epi16(r.as_mut_ptr(), m, a); + let e = &[42_i16, 2, 42, 4, 42, 42, 7, 8]; + let e = _mm_loadu_epi16(e.as_ptr()); + assert_eq_m128i(_mm_loadu_epi16(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_loadu_epi8() { + let src = _mm_set1_epi8(42); + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm_mask_loadu_epi8(src, m, black_box(p)); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm_loadu_epi8(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi8() { + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm_maskz_loadu_epi8(m, black_box(p)); + let e = &[0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16]; + let e = _mm_loadu_epi8(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_storeu_epi8() { + let mut r = [42_i8; 16]; + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let a = _mm_loadu_epi8(a.as_ptr()); + let m = 0b11101000_11001010; + _mm_mask_storeu_epi8(r.as_mut_ptr(), m, a); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm_loadu_epi8(e.as_ptr()); + assert_eq_m128i(_mm_loadu_epi8(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_madd_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_madd_epi16(a, b); + let e = _mm512_set1_epi32(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_madd_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mask_madd_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_madd_epi16(a, 0b00000000_00001111, a, b); + let e = _mm512_set_epi32( + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 2, + 2, + 2, + 2, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_madd_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_madd_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_madd_epi16(0b00000000_00001111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_madd_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_mask_madd_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_madd_epi16(a, 0b00001111, a, b); + let e = _mm256_set_epi32( + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 2, + 2, + 2, + 2, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_madd_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_madd_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = 
_mm256_maskz_madd_epi16(0b00001111, a, b); + let e = _mm256_set_epi32(0, 0, 0, 0, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_madd_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_mask_madd_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_madd_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi32(2, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_madd_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_madd_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_madd_epi16(0b00001111, a, b); + let e = _mm_set_epi32(2, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maddubs_epi16() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_maddubs_epi16(a, b); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_maddubs_epi16() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let src = _mm512_set1_epi16(1); + let r = _mm512_mask_maddubs_epi16(src, 0, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_maddubs_epi16(src, 0b00000000_00000000_00000000_00000001, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_maddubs_epi16() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_maskz_maddubs_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_maddubs_epi16(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_maddubs_epi16() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let src = _mm256_set1_epi16(1); + let r = _mm256_mask_maddubs_epi16(src, 0, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_maddubs_epi16(src, 0b00000000_00000001, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_maddubs_epi16() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let r = _mm256_maskz_maddubs_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_maddubs_epi16(0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_maddubs_epi16() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let src = _mm_set1_epi16(1); + let r = _mm_mask_maddubs_epi16(src, 0, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_maddubs_epi16(src, 0b00000001, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_maddubs_epi16() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let r = _mm_maskz_maddubs_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let 
r = _mm_maskz_maddubs_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 2, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_packs_epi32() { + let a = _mm512_set1_epi32(i32::MAX); + let b = _mm512_set1_epi32(1); + let r = _mm512_packs_epi32(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX, + 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_packs_epi32() { + let a = _mm512_set1_epi32(i32::MAX); + let b = _mm512_set1_epi32(1 << 16 | 1); + let r = _mm512_mask_packs_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_packs_epi32(b, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_packs_epi32() { + let a = _mm512_set1_epi32(i32::MAX); + let b = _mm512_set1_epi32(1); + let r = _mm512_maskz_packs_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_packs_epi32(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_packs_epi32() { + let a = _mm256_set1_epi32(i32::MAX); + let b = _mm256_set1_epi32(1 << 16 | 1); + let r = _mm256_mask_packs_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_packs_epi32(b, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_packs_epi32() { + let a = _mm256_set1_epi32(i32::MAX); + let b = _mm256_set1_epi32(1); + let r = _mm256_maskz_packs_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_packs_epi32(0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_packs_epi32() { + let a = _mm_set1_epi32(i32::MAX); + let b = _mm_set1_epi32(1 << 16 | 1); + let r = _mm_mask_packs_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_packs_epi32(b, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_packs_epi32() { + let a = _mm_set1_epi32(i32::MAX); + let b = _mm_set1_epi32(1); + let r = _mm_maskz_packs_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_packs_epi32(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_packs_epi16() { + let a = _mm512_set1_epi16(i16::MAX); + let b = _mm512_set1_epi16(1); + let r = _mm512_packs_epi16(a, b); + 
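// Signed saturation: each i16 is clamped to the i8 range, and every 128-bit lane packs 8 elements of a (i16::MAX -> i8::MAX) in its low half and 8 elements of b in its high half. +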
#[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, + 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, + 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, + 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_packs_epi16() { + let a = _mm512_set1_epi16(i16::MAX); + let b = _mm512_set1_epi16(1 << 8 | 1); + let r = _mm512_mask_packs_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_packs_epi16( + b, + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_packs_epi16() { + let a = _mm512_set1_epi16(i16::MAX); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_packs_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_packs_epi16( + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_packs_epi16() { + let a = _mm256_set1_epi16(i16::MAX); + let b = _mm256_set1_epi16(1 << 8 | 1); + let r = _mm256_mask_packs_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_packs_epi16(b, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_packs_epi16() { + let a = _mm256_set1_epi16(i16::MAX); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_packs_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_packs_epi16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_packs_epi16() { + let a = _mm_set1_epi16(i16::MAX); + let b = _mm_set1_epi16(1 << 8 | 1); + let r = _mm_mask_packs_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_packs_epi16(b, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_packs_epi16() { + let a = _mm_set1_epi16(i16::MAX); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_packs_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = 
_mm_maskz_packs_epi16(0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_packus_epi32() { + let a = _mm512_set1_epi32(-1); + let b = _mm512_set1_epi32(1); + let r = _mm512_packus_epi32(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, + 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_packus_epi32() { + let a = _mm512_set1_epi32(-1); + let b = _mm512_set1_epi32(1 << 16 | 1); + let r = _mm512_mask_packus_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_packus_epi32(b, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_packus_epi32() { + let a = _mm512_set1_epi32(-1); + let b = _mm512_set1_epi32(1); + let r = _mm512_maskz_packus_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_packus_epi32(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_packus_epi32() { + let a = _mm256_set1_epi32(-1); + let b = _mm256_set1_epi32(1 << 16 | 1); + let r = _mm256_mask_packus_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_packus_epi32(b, 0b00000000_00001111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_packus_epi32() { + let a = _mm256_set1_epi32(-1); + let b = _mm256_set1_epi32(1); + let r = _mm256_maskz_packus_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_packus_epi32(0b00000000_00001111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_packus_epi32() { + let a = _mm_set1_epi32(-1); + let b = _mm_set1_epi32(1 << 16 | 1); + let r = _mm_mask_packus_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_packus_epi32(b, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_packus_epi32() { + let a = _mm_set1_epi32(-1); + let b = _mm_set1_epi32(1); + let r = _mm_maskz_packus_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_packus_epi32(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_packus_epi16() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(1); + let r = _mm512_packus_epi16(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + 
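// packus saturates signed inputs to the unsigned range, so the -1 elements of a clamp to 0 in the expected results below. +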
#[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_packus_epi16() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(1 << 8 | 1); + let r = _mm512_mask_packus_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_packus_epi16( + b, + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_packus_epi16() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_packus_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_packus_epi16( + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_packus_epi16() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(1 << 8 | 1); + let r = _mm256_mask_packus_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_packus_epi16(b, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_packus_epi16() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_packus_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_packus_epi16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_packus_epi16() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(1 << 8 | 1); + let r = _mm_mask_packus_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_packus_epi16(b, 0b00000000_00001111, a, b); + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_packus_epi16() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_packus_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_packus_epi16(0b00000000_00001111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_avg_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_avg_epu16(a, b); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_avg_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mask_avg_epu16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_avg_epu16(a, 
0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_avg_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_avg_epu16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_avg_epu16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_avg_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_mask_avg_epu16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_avg_epu16(a, 0b00000000_00001111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_avg_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_avg_epu16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_avg_epu16(0b00000000_00001111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_avg_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_mask_avg_epu16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_avg_epu16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_avg_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_avg_epu16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_avg_epu16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_avg_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_avg_epu8(a, b); + let e = _mm512_set1_epi8(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_avg_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_mask_avg_epu8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_avg_epu8( + a, + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_avg_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_maskz_avg_epu8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_avg_epu8( + 0b00000000_000000000_00000000_00000000_00000000_0000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_avg_epu8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let r = _mm256_mask_avg_epu8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_avg_epu8(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_avg_epu8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let r = _mm256_maskz_avg_epu8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_avg_epu8(0b00000000_0000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_avg_epu8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let r = _mm_mask_avg_epu8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_avg_epu8(a, 0b00000000_00001111, a, b); + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_avg_epu8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let r = _mm_maskz_avg_epu8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_avg_epu8(0b00000000_00001111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_sll_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm512_sll_epi16(a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_sll_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm512_mask_sll_epi16(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_sll_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_sll_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm512_maskz_sll_epi16(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sll_epi16(0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_sll_epi16() { + let a = _mm256_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm256_mask_sll_epi16(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_sll_epi16(a, 0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_sll_epi16() { + let a = _mm256_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm256_maskz_sll_epi16(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sll_epi16(0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + 
assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_sll_epi16() { + let a = _mm_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm_mask_sll_epi16(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_sll_epi16(a, 0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_sll_epi16() { + let a = _mm_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm_maskz_sll_epi16(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sll_epi16(0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_slli_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let r = _mm512_slli_epi16::<1>(a); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_slli_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let r = _mm512_mask_slli_epi16::<1>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_slli_epi16::<1>(a, 0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_slli_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let r = _mm512_maskz_slli_epi16::<1>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_slli_epi16::<1>(0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_slli_epi16() { + let a = _mm256_set1_epi16(1 << 15); + let r = _mm256_mask_slli_epi16::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_slli_epi16::<1>(a, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_slli_epi16() { + let a = _mm256_set1_epi16(1 << 15); + let r = _mm256_maskz_slli_epi16::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_slli_epi16::<1>(0b11111111_11111111, a); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_slli_epi16() { + let a = _mm_set1_epi16(1 << 15); + let r = _mm_mask_slli_epi16::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_slli_epi16::<1>(a, 0b11111111, a); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_slli_epi16() { + let a = _mm_set1_epi16(1 << 15); + let r = _mm_maskz_slli_epi16::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_slli_epi16::<1>(0b11111111, a); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_sllv_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let count = _mm512_set1_epi16(2); + let r = _mm512_sllv_epi16(a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_sllv_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let count = _mm512_set1_epi16(2); + let r = _mm512_mask_sllv_epi16(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_sllv_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + 
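// sllv takes a per-element shift count; shifting 1 << 15 left by 2 overflows the 16-bit lane, so every selected lane becomes 0. +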
#[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_sllv_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let count = _mm512_set1_epi16(2); + let r = _mm512_maskz_sllv_epi16(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sllv_epi16(0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_sllv_epi16() { + let a = _mm256_set1_epi16(1 << 15); + let count = _mm256_set1_epi16(2); + let r = _mm256_sllv_epi16(a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_sllv_epi16() { + let a = _mm256_set1_epi16(1 << 15); + let count = _mm256_set1_epi16(2); + let r = _mm256_mask_sllv_epi16(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_sllv_epi16(a, 0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_sllv_epi16() { + let a = _mm256_set1_epi16(1 << 15); + let count = _mm256_set1_epi16(2); + let r = _mm256_maskz_sllv_epi16(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sllv_epi16(0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_sllv_epi16() { + let a = _mm_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm_sllv_epi16(a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_sllv_epi16() { + let a = _mm_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm_mask_sllv_epi16(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_sllv_epi16(a, 0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_sllv_epi16() { + let a = _mm_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm_maskz_sllv_epi16(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sllv_epi16(0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_srl_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm512_srl_epi16(a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_srl_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm512_mask_srl_epi16(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_srl_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_srl_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm512_maskz_srl_epi16(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srl_epi16(0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_srl_epi16() { + let a = _mm256_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm256_mask_srl_epi16(a, 0, a, count); 
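+ // A zero mask selects no lanes, so the result is just the src operand (a).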
+ assert_eq_m256i(r, a); + let r = _mm256_mask_srl_epi16(a, 0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_srl_epi16() { + let a = _mm256_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm256_maskz_srl_epi16(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srl_epi16(0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_srl_epi16() { + let a = _mm_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm_mask_srl_epi16(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_srl_epi16(a, 0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_srl_epi16() { + let a = _mm_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm_maskz_srl_epi16(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srl_epi16(0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_srli_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let r = _mm512_srli_epi16::<2>(a); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_srli_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let r = _mm512_mask_srli_epi16::<2>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_srli_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_srli_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let r = _mm512_maskz_srli_epi16::<2>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srli_epi16::<2>(0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_srli_epi16() { + let a = _mm256_set1_epi16(1 << 1); + let r = _mm256_mask_srli_epi16::<2>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_srli_epi16::<2>(a, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_srli_epi16() { + let a = _mm256_set1_epi16(1 << 1); + let r = _mm256_maskz_srli_epi16::<2>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srli_epi16::<2>(0b11111111_11111111, a); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_srli_epi16() { + let a = _mm_set1_epi16(1 << 1); + let r = _mm_mask_srli_epi16::<2>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_srli_epi16::<2>(a, 0b11111111, a); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_srli_epi16() { + let a = _mm_set1_epi16(1 << 1); + let r = _mm_maskz_srli_epi16::<2>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srli_epi16::<2>(0b11111111, a); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_srlv_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let count = 
_mm512_set1_epi16(2); + let r = _mm512_srlv_epi16(a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_srlv_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let count = _mm512_set1_epi16(2); + let r = _mm512_mask_srlv_epi16(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_srlv_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_srlv_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let count = _mm512_set1_epi16(2); + let r = _mm512_maskz_srlv_epi16(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srlv_epi16(0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_srlv_epi16() { + let a = _mm256_set1_epi16(1 << 1); + let count = _mm256_set1_epi16(2); + let r = _mm256_srlv_epi16(a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_srlv_epi16() { + let a = _mm256_set1_epi16(1 << 1); + let count = _mm256_set1_epi16(2); + let r = _mm256_mask_srlv_epi16(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_srlv_epi16(a, 0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_srlv_epi16() { + let a = _mm256_set1_epi16(1 << 1); + let count = _mm256_set1_epi16(2); + let r = _mm256_maskz_srlv_epi16(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srlv_epi16(0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_srlv_epi16() { + let a = _mm_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm_srlv_epi16(a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_srlv_epi16() { + let a = _mm_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm_mask_srlv_epi16(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_srlv_epi16(a, 0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_srlv_epi16() { + let a = _mm_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm_maskz_srlv_epi16(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srlv_epi16(0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_sra_epi16() { + let a = _mm512_set1_epi16(8); + let count = _mm_set1_epi16(1); + let r = _mm512_sra_epi16(a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_sra_epi16() { + let a = _mm512_set1_epi16(8); + let count = _mm_set1_epi16(1); + let r = _mm512_mask_sra_epi16(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_sra_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_sra_epi16() { + let a = 
_mm512_set1_epi16(8); + let count = _mm_set1_epi16(1); + let r = _mm512_maskz_sra_epi16(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sra_epi16(0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_sra_epi16() { + let a = _mm256_set1_epi16(8); + let count = _mm_set1_epi16(1); + let r = _mm256_mask_sra_epi16(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_sra_epi16(a, 0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_sra_epi16() { + let a = _mm256_set1_epi16(8); + let count = _mm_set1_epi16(1); + let r = _mm256_maskz_sra_epi16(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sra_epi16(0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_sra_epi16() { + let a = _mm_set1_epi16(8); + let count = _mm_set1_epi16(1); + let r = _mm_mask_sra_epi16(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_sra_epi16(a, 0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_sra_epi16() { + let a = _mm_set1_epi16(8); + let count = _mm_set1_epi16(1); + let r = _mm_maskz_sra_epi16(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sra_epi16(0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_srai_epi16() { + let a = _mm512_set1_epi16(8); + let r = _mm512_srai_epi16::<2>(a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_srai_epi16() { + let a = _mm512_set1_epi16(8); + let r = _mm512_mask_srai_epi16::<2>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_srai_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_srai_epi16() { + let a = _mm512_set1_epi16(8); + let r = _mm512_maskz_srai_epi16::<2>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srai_epi16::<2>(0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_srai_epi16() { + let a = _mm256_set1_epi16(8); + let r = _mm256_mask_srai_epi16::<2>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_srai_epi16::<2>(a, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_srai_epi16() { + let a = _mm256_set1_epi16(8); + let r = _mm256_maskz_srai_epi16::<2>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srai_epi16::<2>(0b11111111_11111111, a); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_srai_epi16() { + let a = _mm_set1_epi16(8); + let r = _mm_mask_srai_epi16::<2>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_srai_epi16::<2>(a, 0b11111111, a); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + 
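// srai shifts arithmetically (sign-preserving), so 8 >> 2 == 2 in each selected lane. +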
#[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_srai_epi16() { + let a = _mm_set1_epi16(8); + let r = _mm_maskz_srai_epi16::<2>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srai_epi16::<2>(0b11111111, a); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_srav_epi16() { + let a = _mm512_set1_epi16(8); + let count = _mm512_set1_epi16(2); + let r = _mm512_srav_epi16(a, count); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_srav_epi16() { + let a = _mm512_set1_epi16(8); + let count = _mm512_set1_epi16(2); + let r = _mm512_mask_srav_epi16(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_srav_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_srav_epi16() { + let a = _mm512_set1_epi16(8); + let count = _mm512_set1_epi16(2); + let r = _mm512_maskz_srav_epi16(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srav_epi16(0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_srav_epi16() { + let a = _mm256_set1_epi16(8); + let count = _mm256_set1_epi16(2); + let r = _mm256_srav_epi16(a, count); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_srav_epi16() { + let a = _mm256_set1_epi16(8); + let count = _mm256_set1_epi16(2); + let r = _mm256_mask_srav_epi16(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_srav_epi16(a, 0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_srav_epi16() { + let a = _mm256_set1_epi16(8); + let count = _mm256_set1_epi16(2); + let r = _mm256_maskz_srav_epi16(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srav_epi16(0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_srav_epi16() { + let a = _mm_set1_epi16(8); + let count = _mm_set1_epi16(2); + let r = _mm_srav_epi16(a, count); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_srav_epi16() { + let a = _mm_set1_epi16(8); + let count = _mm_set1_epi16(2); + let r = _mm_mask_srav_epi16(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_srav_epi16(a, 0b11111111, a, count); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_srav_epi16() { + let a = _mm_set1_epi16(8); + let count = _mm_set1_epi16(2); + let r = _mm_maskz_srav_epi16(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srav_epi16(0b11111111, a, count); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_permutex2var_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 
1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm512_set1_epi16(100); + let r = _mm512_permutex2var_epi16(a, idx, b); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_permutex2var_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm512_set1_epi16(100); + let r = _mm512_mask_permutex2var_epi16(a, 0, idx, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutex2var_epi16(a, 0b11111111_11111111_11111111_11111111, idx, b); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_permutex2var_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm512_set1_epi16(100); + let r = _mm512_maskz_permutex2var_epi16(0, a, idx, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutex2var_epi16(0b11111111_11111111_11111111_11111111, a, idx, b); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask2_permutex2var_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm512_set1_epi16(100); + let r = _mm512_mask2_permutex2var_epi16(a, idx, 0, b); + assert_eq_m512i(r, idx); + let r = _mm512_mask2_permutex2var_epi16(a, idx, 0b11111111_11111111_11111111_11111111, b); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_permutex2var_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4); + let b = _mm256_set1_epi16(100); + let r = _mm256_permutex2var_epi16(a, idx, b); + let e = 
_mm256_set_epi16( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_permutex2var_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4); + let b = _mm256_set1_epi16(100); + let r = _mm256_mask_permutex2var_epi16(a, 0, idx, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_permutex2var_epi16(a, 0b11111111_11111111, idx, b); + let e = _mm256_set_epi16( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_permutex2var_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4); + let b = _mm256_set1_epi16(100); + let r = _mm256_maskz_permutex2var_epi16(0, a, idx, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_permutex2var_epi16(0b11111111_11111111, a, idx, b); + let e = _mm256_set_epi16( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask2_permutex2var_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4); + let b = _mm256_set1_epi16(100); + let r = _mm256_mask2_permutex2var_epi16(a, idx, 0, b); + assert_eq_m256i(r, idx); + let r = _mm256_mask2_permutex2var_epi16(a, idx, 0b11111111_11111111, b); + #[rustfmt::skip] + let e = _mm256_set_epi16( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_permutex2var_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm_set_epi16(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm_set1_epi16(100); + let r = _mm_permutex2var_epi16(a, idx, b); + let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_permutex2var_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm_set_epi16(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm_set1_epi16(100); + let r = _mm_mask_permutex2var_epi16(a, 0, idx, b); + assert_eq_m128i(r, a); + let r = _mm_mask_permutex2var_epi16(a, 0b11111111, idx, b); + let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_permutex2var_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm_set_epi16(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm_set1_epi16(100); + let r = _mm_maskz_permutex2var_epi16(0, a, idx, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_permutex2var_epi16(0b11111111, a, idx, b); + let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask2_permutex2var_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm_set_epi16(1, 1 
<< 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm_set1_epi16(100); + let r = _mm_mask2_permutex2var_epi16(a, idx, 0, b); + assert_eq_m128i(r, idx); + let r = _mm_mask2_permutex2var_epi16(a, idx, 0b11111111, b); + let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_permutexvar_epi16() { + let idx = _mm512_set1_epi16(1); + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_permutexvar_epi16(idx, a); + let e = _mm512_set1_epi16(30); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_permutexvar_epi16() { + let idx = _mm512_set1_epi16(1); + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_mask_permutexvar_epi16(a, 0, idx, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutexvar_epi16(a, 0b11111111_11111111_11111111_11111111, idx, a); + let e = _mm512_set1_epi16(30); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_permutexvar_epi16() { + let idx = _mm512_set1_epi16(1); + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_maskz_permutexvar_epi16(0, idx, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutexvar_epi16(0b11111111_11111111_11111111_11111111, idx, a); + let e = _mm512_set1_epi16(30); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_permutexvar_epi16() { + let idx = _mm256_set1_epi16(1); + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_permutexvar_epi16(idx, a); + let e = _mm256_set1_epi16(14); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_permutexvar_epi16() { + let idx = _mm256_set1_epi16(1); + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_mask_permutexvar_epi16(a, 0, idx, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_permutexvar_epi16(a, 0b11111111_11111111, idx, a); + let e = _mm256_set1_epi16(14); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_permutexvar_epi16() { + let idx = _mm256_set1_epi16(1); + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_permutexvar_epi16(0, idx, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_permutexvar_epi16(0b11111111_11111111, idx, a); + let e = _mm256_set1_epi16(14); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_permutexvar_epi16() { + let idx = _mm_set1_epi16(1); + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_permutexvar_epi16(idx, a); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_permutexvar_epi16() { + let idx = _mm_set1_epi16(1); + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_mask_permutexvar_epi16(a, 0, idx, a); + assert_eq_m128i(r, a); + let r = _mm_mask_permutexvar_epi16(a, 0b11111111, idx, a); + let e = _mm_set1_epi16(6); + 
assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_permutexvar_epi16() { + let idx = _mm_set1_epi16(1); + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_permutexvar_epi16(0, idx, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_permutexvar_epi16(0b11111111, idx, a); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_blend_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(2); + let r = _mm512_mask_blend_epi16(0b11111111_00000000_11111111_00000000, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_blend_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(2); + let r = _mm256_mask_blend_epi16(0b11111111_00000000, a, b); + let e = _mm256_set_epi16(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_blend_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(2); + let r = _mm_mask_blend_epi16(0b11110000, a, b); + let e = _mm_set_epi16(2, 2, 2, 2, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_blend_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(2); + let r = _mm512_mask_blend_epi8( + 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_blend_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(2); + let r = _mm256_mask_blend_epi8(0b11111111_00000000_11111111_00000000, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_blend_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(2); + let r = _mm_mask_blend_epi8(0b11111111_00000000, a, b); + let e = _mm_set_epi8(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_broadcastw_epi16() { + let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_broadcastw_epi16(a); + let e = _mm512_set1_epi16(24); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_broadcastw_epi16() { + let src = _mm512_set1_epi16(1); + let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_mask_broadcastw_epi16(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_broadcastw_epi16(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(24); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_broadcastw_epi16() { + let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_maskz_broadcastw_epi16(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = 
_mm512_maskz_broadcastw_epi16(0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(24); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_broadcastw_epi16() { + let src = _mm256_set1_epi16(1); + let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_mask_broadcastw_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_broadcastw_epi16(src, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(24); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_broadcastw_epi16() { + let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_maskz_broadcastw_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_broadcastw_epi16(0b11111111_11111111, a); + let e = _mm256_set1_epi16(24); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_broadcastw_epi16() { + let src = _mm_set1_epi16(1); + let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm_mask_broadcastw_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_broadcastw_epi16(src, 0b11111111, a); + let e = _mm_set1_epi16(24); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_broadcastw_epi16() { + let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm_maskz_broadcastw_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_broadcastw_epi16(0b11111111, a); + let e = _mm_set1_epi16(24); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_broadcastb_epi8() { + let a = _mm_set_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_broadcastb_epi8(a); + let e = _mm512_set1_epi8(32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_broadcastb_epi8() { + let src = _mm512_set1_epi8(1); + let a = _mm_set_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_mask_broadcastb_epi8(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_broadcastb_epi8( + src, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + ); + let e = _mm512_set1_epi8(32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_broadcastb_epi8() { + let a = _mm_set_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_broadcastb_epi8(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_broadcastb_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + ); + let e = _mm512_set1_epi8(32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_broadcastb_epi8() { + let src = _mm256_set1_epi8(1); + let a = _mm_set_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm256_mask_broadcastb_epi8(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_broadcastb_epi8(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(32); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_broadcastb_epi8() { + let a = _mm_set_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm256_maskz_broadcastb_epi8(0, a); + assert_eq_m256i(r, 
_mm256_setzero_si256()); + let r = _mm256_maskz_broadcastb_epi8(0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(32); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_broadcastb_epi8() { + let src = _mm_set1_epi8(1); + let a = _mm_set_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm_mask_broadcastb_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_broadcastb_epi8(src, 0b11111111_11111111, a); + let e = _mm_set1_epi8(32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_broadcastb_epi8() { + let a = _mm_set_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm_maskz_broadcastb_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_broadcastb_epi8(0b11111111_11111111, a); + let e = _mm_set1_epi8(32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_unpackhi_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + let r = _mm512_unpackhi_epi16(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12, + 49, 17, 50, 18, 51, 19, 52, 20, 57, 25, 58, 26, 59, 27, 60, 28); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_unpackhi_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + let r = _mm512_mask_unpackhi_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpackhi_epi16(a, 0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12, + 49, 17, 50, 18, 51, 19, 52, 20, 57, 25, 58, 26, 59, 27, 60, 28); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_unpackhi_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + let r = _mm512_maskz_unpackhi_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpackhi_epi16(0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12, + 49, 17, 50, 18, 51, 19, 52, 20, 57, 25, 58, 26, 59, 27, 60, 28); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_unpackhi_epi16() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi16( + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + ); + let r = 
_mm256_mask_unpackhi_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpackhi_epi16(a, 0b11111111_11111111, a, b); + let e = _mm256_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_unpackhi_epi16() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi16( + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + ); + let r = _mm256_maskz_unpackhi_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpackhi_epi16(0b11111111_11111111, a, b); + let e = _mm256_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_unpackhi_epi16() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40); + let r = _mm_mask_unpackhi_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpackhi_epi16(a, 0b11111111, a, b); + let e = _mm_set_epi16(33, 1, 34, 2, 35, 3, 36, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_unpackhi_epi16() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40); + let r = _mm_maskz_unpackhi_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpackhi_epi16(0b11111111, a, b); + let e = _mm_set_epi16(33, 1, 34, 2, 35, 3, 36, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_unpackhi_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + #[rustfmt::skip] + let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); + let r = _mm512_unpackhi_epi8(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, + 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, + 97, 33, 98, 34, 99, 35, 100, 36, 101, 37, 102, 38, 103, 39, 104, 40, + 113, 49, 114, 50, 115, 51, 116, 52, 117, 53, 118, 54, 119, 55, 120, 56); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_unpackhi_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + #[rustfmt::skip] + let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); + let r = _mm512_mask_unpackhi_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpackhi_epi8( + 
a, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, + 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, + 97, 33, 98, 34, 99, 35, 100, 36, 101, 37, 102, 38, 103, 39, 104, 40, + 113, 49, 114, 50, 115, 51, 116, 52, 117, 53, 118, 54, 119, 55, 120, 56); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_unpackhi_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + #[rustfmt::skip] + let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); + let r = _mm512_maskz_unpackhi_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpackhi_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, + 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, + 97, 33, 98, 34, 99, 35, 100, 36, 101, 37, 102, 38, 103, 39, 104, 40, + 113, 49, 114, 50, 115, 51, 116, 52, 117, 53, 118, 54, 119, 55, 120, 56); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_unpackhi_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96); + let r = _mm256_mask_unpackhi_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpackhi_epi8(a, 0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, + 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_unpackhi_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96); + let r = _mm256_maskz_unpackhi_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpackhi_epi8(0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, + 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_unpackhi_epi8() { + let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_set_epi8( + 65, 66, 67, 68, 69, 70, 71, 72, 73, 
74, 75, 76, 77, 78, 79, 80, + ); + let r = _mm_mask_unpackhi_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpackhi_epi8(a, 0b11111111_11111111, a, b); + let e = _mm_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_unpackhi_epi8() { + let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_set_epi8( + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + ); + let r = _mm_maskz_unpackhi_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpackhi_epi8(0b11111111_11111111, a, b); + let e = _mm_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_unpacklo_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + let r = _mm512_unpacklo_epi16(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16, + 53, 21, 54, 22, 55, 23, 56, 24, 61, 29, 62, 30, 63, 31, 64, 32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_unpacklo_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + let r = _mm512_mask_unpacklo_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpacklo_epi16(a, 0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16, + 53, 21, 54, 22, 55, 23, 56, 24, 61, 29, 62, 30, 63, 31, 64, 32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_unpacklo_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + let r = _mm512_maskz_unpacklo_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpacklo_epi16(0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16, + 53, 21, 54, 22, 55, 23, 56, 24, 61, 29, 62, 30, 63, 31, 64, 32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_unpacklo_epi16() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi16( + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + ); + let r = _mm256_mask_unpacklo_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpacklo_epi16(a, 0b11111111_11111111, a, b); + let e = _mm256_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 
46, 14, 47, 15, 48, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_unpacklo_epi16() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi16( + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + ); + let r = _mm256_maskz_unpacklo_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpacklo_epi16(0b11111111_11111111, a, b); + let e = _mm256_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_unpacklo_epi16() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40); + let r = _mm_mask_unpacklo_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpacklo_epi16(a, 0b11111111, a, b); + let e = _mm_set_epi16(37, 5, 38, 6, 39, 7, 40, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_unpacklo_epi16() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40); + let r = _mm_maskz_unpacklo_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpacklo_epi16(0b11111111, a, b); + let e = _mm_set_epi16(37, 5, 38, 6, 39, 7, 40, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_unpacklo_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + #[rustfmt::skip] + let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); + let r = _mm512_unpacklo_epi8(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, + 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32, + 105, 41, 106, 42, 107, 43, 108, 44, 109, 45, 110, 46, 111, 47, 112, 48, + 121, 57, 122, 58, 123, 59, 124, 60, 125, 61, 126, 62, 127, 63, 0, 64); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_unpacklo_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + #[rustfmt::skip] + let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); + let r = _mm512_mask_unpacklo_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpacklo_epi8( + a, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 
14, 79, 15, 80, 16, + 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32, + 105, 41, 106, 42, 107, 43, 108, 44, 109, 45, 110, 46, 111, 47, 112, 48, + 121, 57, 122, 58, 123, 59, 124, 60, 125, 61, 126, 62, 127, 63, 0, 64); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_unpacklo_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + #[rustfmt::skip] + let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); + let r = _mm512_maskz_unpacklo_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpacklo_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, + 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32, + 105, 41, 106, 42, 107, 43, 108, 44, 109, 45, 110, 46, 111, 47, 112, 48, + 121, 57, 122, 58, 123, 59, 124, 60, 125, 61, 126, 62, 127, 63, 0, 64); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_unpacklo_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96); + let r = _mm256_mask_unpacklo_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpacklo_epi8(a, 0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, + 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_unpacklo_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96); + let r = _mm256_maskz_unpacklo_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpacklo_epi8(0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, + 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_unpacklo_epi8() { + let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_set_epi8( + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + ); + let r = _mm_mask_unpacklo_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpacklo_epi8(a, 
0b11111111_11111111, a, b); + let e = _mm_set_epi8( + 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_unpacklo_epi8() { + let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_set_epi8( + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + ); + let r = _mm_maskz_unpacklo_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpacklo_epi8(0b11111111_11111111, a, b); + let e = _mm_set_epi8( + 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_mov_epi16() { + let src = _mm512_set1_epi16(1); + let a = _mm512_set1_epi16(2); + let r = _mm512_mask_mov_epi16(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_mov_epi16(src, 0b11111111_11111111_11111111_11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_mov_epi16() { + let a = _mm512_set1_epi16(2); + let r = _mm512_maskz_mov_epi16(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mov_epi16(0b11111111_11111111_11111111_11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_mov_epi16() { + let src = _mm256_set1_epi16(1); + let a = _mm256_set1_epi16(2); + let r = _mm256_mask_mov_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_mov_epi16(src, 0b11111111_11111111, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_mov_epi16() { + let a = _mm256_set1_epi16(2); + let r = _mm256_maskz_mov_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mov_epi16(0b11111111_11111111, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_mov_epi16() { + let src = _mm_set1_epi16(1); + let a = _mm_set1_epi16(2); + let r = _mm_mask_mov_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_mov_epi16(src, 0b11111111, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_mov_epi16() { + let a = _mm_set1_epi16(2); + let r = _mm_maskz_mov_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mov_epi16(0b11111111, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_mov_epi8() { + let src = _mm512_set1_epi8(1); + let a = _mm512_set1_epi8(2); + let r = _mm512_mask_mov_epi8(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_mov_epi8( + src, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + ); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_mov_epi8() { + let a = _mm512_set1_epi8(2); + let r = _mm512_maskz_mov_epi8(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mov_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + ); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_mov_epi8() { + let src = _mm256_set1_epi8(1); + let a = _mm256_set1_epi8(2); + let r = _mm256_mask_mov_epi8(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_mov_epi8(src, 0b11111111_11111111_11111111_11111111, a); + assert_eq_m256i(r, a); + } + + 
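+ // The mask_mov / maskz_mov tests in this group all exercise the documented mask
+ // semantics: with a mask of 0 the destination is left untouched (`src` for the
+ // mask_ variants, all zeros for the maskz_ variants), while an all-ones mask
+ // copies every element of `a` through unchanged.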
#[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_mov_epi8() { + let a = _mm256_set1_epi8(2); + let r = _mm256_maskz_mov_epi8(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mov_epi8(0b11111111_11111111_11111111_11111111, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_mov_epi8() { + let src = _mm_set1_epi8(1); + let a = _mm_set1_epi8(2); + let r = _mm_mask_mov_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_mov_epi8(src, 0b11111111_11111111, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_mov_epi8() { + let a = _mm_set1_epi8(2); + let r = _mm_maskz_mov_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mov_epi8(0b11111111_11111111, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_set1_epi16() { + let src = _mm512_set1_epi16(2); + let a: i16 = 11; + let r = _mm512_mask_set1_epi16(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_set1_epi16(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_set1_epi16() { + let a: i16 = 11; + let r = _mm512_maskz_set1_epi16(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_set1_epi16(0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_set1_epi16() { + let src = _mm256_set1_epi16(2); + let a: i16 = 11; + let r = _mm256_mask_set1_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_set1_epi16(src, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_set1_epi16() { + let a: i16 = 11; + let r = _mm256_maskz_set1_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_set1_epi16(0b11111111_11111111, a); + let e = _mm256_set1_epi16(11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_set1_epi16() { + let src = _mm_set1_epi16(2); + let a: i16 = 11; + let r = _mm_mask_set1_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_set1_epi16(src, 0b11111111, a); + let e = _mm_set1_epi16(11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_set1_epi16() { + let a: i16 = 11; + let r = _mm_maskz_set1_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_set1_epi16(0b11111111, a); + let e = _mm_set1_epi16(11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_set1_epi8() { + let src = _mm512_set1_epi8(2); + let a: i8 = 11; + let r = _mm512_mask_set1_epi8(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_set1_epi8( + src, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + ); + let e = _mm512_set1_epi8(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_set1_epi8() { + let a: i8 = 11; + let r = _mm512_maskz_set1_epi8(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_set1_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + ); + let e = _mm512_set1_epi8(11); + 
assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_set1_epi8() { + let src = _mm256_set1_epi8(2); + let a: i8 = 11; + let r = _mm256_mask_set1_epi8(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_set1_epi8(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_set1_epi8() { + let a: i8 = 11; + let r = _mm256_maskz_set1_epi8(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_set1_epi8(0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_set1_epi8() { + let src = _mm_set1_epi8(2); + let a: i8 = 11; + let r = _mm_mask_set1_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_set1_epi8(src, 0b11111111_11111111, a); + let e = _mm_set1_epi8(11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_set1_epi8() { + let a: i8 = 11; + let r = _mm_maskz_set1_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_set1_epi8(0b11111111_11111111, a); + let e = _mm_set1_epi8(11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_shufflelo_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12, + 16, 17, 18, 19, 23, 22, 22, 20, 24, 25, 26, 27, 31, 30, 30, 28, + ); + let r = _mm512_shufflelo_epi16::<0b00_01_01_11>(a); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_shufflelo_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + let r = _mm512_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_shufflelo_epi16::<0b00_01_01_11>( + a, + 0b11111111_11111111_11111111_11111111, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12, + 16, 17, 18, 19, 23, 22, 22, 20, 24, 25, 26, 27, 31, 30, 30, 28, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_shufflelo_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + let r = _mm512_maskz_shufflelo_epi16::<0b00_01_01_11>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = + _mm512_maskz_shufflelo_epi16::<0b00_01_01_11>(0b11111111_11111111_11111111_11111111, a); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12, + 16, 17, 18, 19, 23, 22, 22, 20, 24, 25, 26, 27, 31, 30, 30, 28, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_shufflelo_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0b11111111_11111111, a); + let e = 
_mm256_set_epi16(0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_shufflelo_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_shufflelo_epi16::<0b00_01_01_11>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shufflelo_epi16::<0b00_01_01_11>(0b11111111_11111111, a); + let e = _mm256_set_epi16(0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_shufflelo_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 7, 6, 6, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_shufflelo_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_shufflelo_epi16::<0b00_01_01_11>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shufflelo_epi16::<0b00_01_01_11>(0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 7, 6, 6, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_shufflehi_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15, + 19, 18, 18, 16, 20, 21, 22, 23, 27, 26, 26, 24, 28, 29, 30, 31, + ); + let r = _mm512_shufflehi_epi16::<0b00_01_01_11>(a); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_shufflehi_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + let r = _mm512_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_shufflehi_epi16::<0b00_01_01_11>( + a, + 0b11111111_11111111_11111111_11111111, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15, + 19, 18, 18, 16, 20, 21, 22, 23, 27, 26, 26, 24, 28, 29, 30, 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_shufflehi_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + let r = _mm512_maskz_shufflehi_epi16::<0b00_01_01_11>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = + _mm512_maskz_shufflehi_epi16::<0b00_01_01_11>(0b11111111_11111111_11111111_11111111, a); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15, + 19, 18, 18, 16, 20, 21, 22, 23, 27, 26, 26, 24, 28, 29, 30, 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_shufflehi_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0b11111111_11111111, a); + 
let e = _mm256_set_epi16(3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_shufflehi_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_shufflehi_epi16::<0b00_01_01_11>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shufflehi_epi16::<0b00_01_01_11>(0b11111111_11111111, a); + let e = _mm256_set_epi16(3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_shufflehi_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0b11111111, a); + let e = _mm_set_epi16(3, 2, 2, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_shufflehi_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_shufflehi_epi16::<0b00_01_01_11>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shufflehi_epi16::<0b00_01_01_11>(0b11111111, a); + let e = _mm_set_epi16(3, 2, 2, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let b = _mm512_set1_epi8(1); + let r = _mm512_shuffle_epi8(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, + 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let b = _mm512_set1_epi8(1); + let r = _mm512_mask_shuffle_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_shuffle_epi8( + a, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, + 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let b = _mm512_set1_epi8(1); + let r = _mm512_maskz_shuffle_epi8(0, a, b); 
+ assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shuffle_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, + 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let b = _mm256_set1_epi8(1); + let r = _mm256_mask_shuffle_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shuffle_epi8(a, 0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let b = _mm256_set1_epi8(1); + let r = _mm256_maskz_shuffle_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shuffle_epi8(0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_shuffle_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set1_epi8(1); + let r = _mm_mask_shuffle_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_shuffle_epi8(a, 0b11111111_11111111, a, b); + let e = _mm_set_epi8( + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set1_epi8(1); + let r = _mm_maskz_shuffle_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shuffle_epi8(0b11111111_11111111, a, b); + let e = _mm_set_epi8( + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_test_epi16_mask() { + let a = _mm512_set1_epi16(1 << 0); + let b = _mm512_set1_epi16(1 << 0 | 1 << 1); + let r = _mm512_test_epi16_mask(a, b); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_test_epi16_mask() { + let a = _mm512_set1_epi16(1 << 0); + let b = _mm512_set1_epi16(1 << 0 | 1 << 1); + let r = _mm512_mask_test_epi16_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_test_epi16_mask(0b11111111_11111111_11111111_11111111, a, b); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_test_epi16_mask() { + 
let a = _mm256_set1_epi16(1 << 0); + let b = _mm256_set1_epi16(1 << 0 | 1 << 1); + let r = _mm256_test_epi16_mask(a, b); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_test_epi16_mask() { + let a = _mm256_set1_epi16(1 << 0); + let b = _mm256_set1_epi16(1 << 0 | 1 << 1); + let r = _mm256_mask_test_epi16_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm256_mask_test_epi16_mask(0b11111111_11111111, a, b); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_test_epi16_mask() { + let a = _mm_set1_epi16(1 << 0); + let b = _mm_set1_epi16(1 << 0 | 1 << 1); + let r = _mm_test_epi16_mask(a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_test_epi16_mask() { + let a = _mm_set1_epi16(1 << 0); + let b = _mm_set1_epi16(1 << 0 | 1 << 1); + let r = _mm_mask_test_epi16_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm_mask_test_epi16_mask(0b11111111, a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_test_epi8_mask() { + let a = _mm512_set1_epi8(1 << 0); + let b = _mm512_set1_epi8(1 << 0 | 1 << 1); + let r = _mm512_test_epi8_mask(a, b); + let e: __mmask64 = + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_test_epi8_mask() { + let a = _mm512_set1_epi8(1 << 0); + let b = _mm512_set1_epi8(1 << 0 | 1 << 1); + let r = _mm512_mask_test_epi8_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_test_epi8_mask( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + let e: __mmask64 = + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_test_epi8_mask() { + let a = _mm256_set1_epi8(1 << 0); + let b = _mm256_set1_epi8(1 << 0 | 1 << 1); + let r = _mm256_test_epi8_mask(a, b); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_test_epi8_mask() { + let a = _mm256_set1_epi8(1 << 0); + let b = _mm256_set1_epi8(1 << 0 | 1 << 1); + let r = _mm256_mask_test_epi8_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm256_mask_test_epi8_mask(0b11111111_11111111_11111111_11111111, a, b); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_test_epi8_mask() { + let a = _mm_set1_epi8(1 << 0); + let b = _mm_set1_epi8(1 << 0 | 1 << 1); + let r = _mm_test_epi8_mask(a, b); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_test_epi8_mask() { + let a = _mm_set1_epi8(1 << 0); + let b = _mm_set1_epi8(1 << 0 | 1 << 1); + let r = _mm_mask_test_epi8_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm_mask_test_epi8_mask(0b11111111_11111111, a, b); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_testn_epi16_mask() { + let a = _mm512_set1_epi16(1 << 0); + let b = _mm512_set1_epi16(1 << 0 | 1 << 1); + let r = _mm512_testn_epi16_mask(a, b); + let e: __mmask32 = 
0b00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_testn_epi16_mask() { + let a = _mm512_set1_epi16(1 << 0); + let b = _mm512_set1_epi16(1 << 0 | 1 << 1); + let r = _mm512_mask_testn_epi16_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_testn_epi16_mask(0b11111111_11111111_11111111_11111111, a, b); + let e: __mmask32 = 0b00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_testn_epi16_mask() { + let a = _mm256_set1_epi16(1 << 0); + let b = _mm256_set1_epi16(1 << 0 | 1 << 1); + let r = _mm256_testn_epi16_mask(a, b); + let e: __mmask16 = 0b00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_testn_epi16_mask() { + let a = _mm256_set1_epi16(1 << 0); + let b = _mm256_set1_epi16(1 << 0 | 1 << 1); + let r = _mm256_mask_testn_epi16_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm256_mask_testn_epi16_mask(0b11111111_11111111, a, b); + let e: __mmask16 = 0b00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_testn_epi16_mask() { + let a = _mm_set1_epi16(1 << 0); + let b = _mm_set1_epi16(1 << 0 | 1 << 1); + let r = _mm_testn_epi16_mask(a, b); + let e: __mmask8 = 0b00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_testn_epi16_mask() { + let a = _mm_set1_epi16(1 << 0); + let b = _mm_set1_epi16(1 << 0 | 1 << 1); + let r = _mm_mask_testn_epi16_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm_mask_testn_epi16_mask(0b11111111, a, b); + let e: __mmask8 = 0b00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_testn_epi8_mask() { + let a = _mm512_set1_epi8(1 << 0); + let b = _mm512_set1_epi8(1 << 0 | 1 << 1); + let r = _mm512_testn_epi8_mask(a, b); + let e: __mmask64 = + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_testn_epi8_mask() { + let a = _mm512_set1_epi8(1 << 0); + let b = _mm512_set1_epi8(1 << 0 | 1 << 1); + let r = _mm512_mask_testn_epi8_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_testn_epi8_mask( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + let e: __mmask64 = + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_testn_epi8_mask() { + let a = _mm256_set1_epi8(1 << 0); + let b = _mm256_set1_epi8(1 << 0 | 1 << 1); + let r = _mm256_testn_epi8_mask(a, b); + let e: __mmask32 = 0b00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_testn_epi8_mask() { + let a = _mm256_set1_epi8(1 << 0); + let b = _mm256_set1_epi8(1 << 0 | 1 << 1); + let r = _mm256_mask_testn_epi8_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm256_mask_testn_epi8_mask(0b11111111_11111111_11111111_11111111, a, b); + let e: __mmask32 = 0b00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_testn_epi8_mask() { + let a = _mm_set1_epi8(1 << 0); + let b = _mm_set1_epi8(1 << 0 | 1 << 1); + let r = _mm_testn_epi8_mask(a, b); + let e: __mmask16 = 0b00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = 
"avx512bw,avx512vl")] + unsafe fn test_mm_mask_testn_epi8_mask() { + let a = _mm_set1_epi8(1 << 0); + let b = _mm_set1_epi8(1 << 0 | 1 << 1); + let r = _mm_mask_testn_epi8_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm_mask_testn_epi8_mask(0b11111111_11111111, a, b); + let e: __mmask16 = 0b00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_store_mask64() { + let a: __mmask64 = + 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000; + let mut r = 0; + _store_mask64(&mut r, a); + assert_eq!(r, a); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_store_mask32() { + let a: __mmask32 = 0b11111111_00000000_11111111_00000000; + let mut r = 0; + _store_mask32(&mut r, a); + assert_eq!(r, a); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_load_mask64() { + let p: __mmask64 = + 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000; + let r = _load_mask64(&p); + let e: __mmask64 = + 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_load_mask32() { + let p: __mmask32 = 0b11111111_00000000_11111111_00000000; + let r = _load_mask32(&p); + let e: __mmask32 = 0b11111111_00000000_11111111_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_sad_epu8() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(4); + let r = _mm512_sad_epu8(a, b); + let e = _mm512_set1_epi64(16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_dbsad_epu8() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(4); + let r = _mm512_dbsad_epu8::<0>(a, b); + let e = _mm512_set1_epi16(8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_dbsad_epu8() { + let src = _mm512_set1_epi16(1); + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(4); + let r = _mm512_mask_dbsad_epu8::<0>(src, 0, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_dbsad_epu8::<0>(src, 0b11111111_11111111_11111111_11111111, a, b); + let e = _mm512_set1_epi16(8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_dbsad_epu8() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(4); + let r = _mm512_maskz_dbsad_epu8::<0>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_dbsad_epu8::<0>(0b11111111_11111111_11111111_11111111, a, b); + let e = _mm512_set1_epi16(8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_dbsad_epu8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_dbsad_epu8::<0>(a, b); + let e = _mm256_set1_epi16(8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_dbsad_epu8() { + let src = _mm256_set1_epi16(1); + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_mask_dbsad_epu8::<0>(src, 0, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_dbsad_epu8::<0>(src, 0b11111111_11111111, a, b); + let e = _mm256_set1_epi16(8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_dbsad_epu8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_maskz_dbsad_epu8::<0>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_dbsad_epu8::<0>(0b11111111_11111111, 
a, b); + let e = _mm256_set1_epi16(8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_dbsad_epu8() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(4); + let r = _mm_dbsad_epu8::<0>(a, b); + let e = _mm_set1_epi16(8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_dbsad_epu8() { + let src = _mm_set1_epi16(1); + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(4); + let r = _mm_mask_dbsad_epu8::<0>(src, 0, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_dbsad_epu8::<0>(src, 0b11111111, a, b); + let e = _mm_set1_epi16(8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_dbsad_epu8() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(4); + let r = _mm_maskz_dbsad_epu8::<0>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_dbsad_epu8::<0>(0b11111111, a, b); + let e = _mm_set1_epi16(8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_movepi16_mask() { + let a = _mm512_set1_epi16(1 << 15); + let r = _mm512_movepi16_mask(a); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_movepi16_mask() { + let a = _mm256_set1_epi16(1 << 15); + let r = _mm256_movepi16_mask(a); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_movepi16_mask() { + let a = _mm_set1_epi16(1 << 15); + let r = _mm_movepi16_mask(a); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_movepi8_mask() { + let a = _mm512_set1_epi8(1 << 7); + let r = _mm512_movepi8_mask(a); + let e: __mmask64 = + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_movepi8_mask() { + let a = _mm256_set1_epi8(1 << 7); + let r = _mm256_movepi8_mask(a); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_movepi8_mask() { + let a = _mm_set1_epi8(1 << 7); + let r = _mm_movepi8_mask(a); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_movm_epi16() { + let a: __mmask32 = 0b11111111_11111111_11111111_11111111; + let r = _mm512_movm_epi16(a); + let e = _mm512_set1_epi16( + 1 << 15 + | 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_movm_epi16() { + let a: __mmask16 = 0b11111111_11111111; + let r = _mm256_movm_epi16(a); + let e = _mm256_set1_epi16( + 1 << 15 + | 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_movm_epi16() { + let a: __mmask8 = 0b11111111; + let r = _mm_movm_epi16(a); + let e = _mm_set1_epi16( + 1 << 15 + | 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + 
| 1 << 1 + | 1 << 0, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_movm_epi8() { + let a: __mmask64 = + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; + let r = _mm512_movm_epi8(a); + let e = + _mm512_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_movm_epi8() { + let a: __mmask32 = 0b11111111_11111111_11111111_11111111; + let r = _mm256_movm_epi8(a); + let e = + _mm256_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_movm_epi8() { + let a: __mmask16 = 0b11111111_11111111; + let r = _mm_movm_epi8(a); + let e = + _mm_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_cvtmask32_u32() { + let a: __mmask32 = 0b11001100_00110011_01100110_10011001; + let r = _cvtmask32_u32(a); + let e: u32 = 0b11001100_00110011_01100110_10011001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_cvtu32_mask32() { + let a: u32 = 0b11001100_00110011_01100110_10011001; + let r = _cvtu32_mask32(a); + let e: __mmask32 = 0b11001100_00110011_01100110_10011001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kadd_mask32() { + let a: __mmask32 = 11; + let b: __mmask32 = 22; + let r = _kadd_mask32(a, b); + let e: __mmask32 = 33; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kadd_mask64() { + let a: __mmask64 = 11; + let b: __mmask64 = 22; + let r = _kadd_mask64(a, b); + let e: __mmask64 = 33; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kand_mask32() { + let a: __mmask32 = 0b11001100_00110011_11001100_00110011; + let b: __mmask32 = 0b11001100_00110011_11001100_00110011; + let r = _kand_mask32(a, b); + let e: __mmask32 = 0b11001100_00110011_11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kand_mask64() { + let a: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let b: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let r = _kand_mask64(a, b); + let e: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_knot_mask32() { + let a: __mmask32 = 0b11001100_00110011_11001100_00110011; + let r = _knot_mask32(a); + let e: __mmask32 = 0b00110011_11001100_00110011_11001100; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_knot_mask64() { + let a: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let r = _knot_mask64(a); + let e: __mmask64 = + 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kandn_mask32() { + let a: __mmask32 = 0b11001100_00110011_11001100_00110011; + let b: __mmask32 = 0b11001100_00110011_11001100_00110011; + let r = _kandn_mask32(a, b); + let e: __mmask32 = 0b00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kandn_mask64() { + let a: __mmask64 = + 
0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let b: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let r = _kandn_mask64(a, b); + let e: __mmask64 = + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kor_mask32() { + let a: __mmask32 = 0b00110011_11001100_00110011_11001100; + let b: __mmask32 = 0b11001100_00110011_11001100_00110011; + let r = _kor_mask32(a, b); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kor_mask64() { + let a: __mmask64 = + 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100; + let b: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let r = _kor_mask64(a, b); + let e: __mmask64 = + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kxor_mask32() { + let a: __mmask32 = 0b00110011_11001100_00110011_11001100; + let b: __mmask32 = 0b11001100_00110011_11001100_00110011; + let r = _kxor_mask32(a, b); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kxor_mask64() { + let a: __mmask64 = + 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100; + let b: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let r = _kxor_mask64(a, b); + let e: __mmask64 = + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kxnor_mask32() { + let a: __mmask32 = 0b00110011_11001100_00110011_11001100; + let b: __mmask32 = 0b11001100_00110011_11001100_00110011; + let r = _kxnor_mask32(a, b); + let e: __mmask32 = 0b00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kxnor_mask64() { + let a: __mmask64 = + 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100; + let b: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let r = _kxnor_mask64(a, b); + let e: __mmask64 = + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortest_mask32_u8() { + let a: __mmask32 = 0b0110100101101001_0110100101101001; + let b: __mmask32 = 0b1011011010110110_1011011010110110; + let mut all_ones: u8 = 0; + let r = _kortest_mask32_u8(a, b, &mut all_ones); + assert_eq!(r, 0); + assert_eq!(all_ones, 1); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortest_mask64_u8() { + let a: __mmask64 = 0b0110100101101001_0110100101101001; + let b: __mmask64 = 0b1011011010110110_1011011010110110; + let mut all_ones: u8 = 0; + let r = _kortest_mask64_u8(a, b, &mut all_ones); + assert_eq!(r, 0); + assert_eq!(all_ones, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortestc_mask32_u8() { + let a: __mmask32 = 0b0110100101101001_0110100101101001; + let b: __mmask32 = 0b1011011010110110_1011011010110110; + let r = _kortestc_mask32_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortestc_mask64_u8() { + let a: __mmask64 = 0b0110100101101001_0110100101101001; + let b: __mmask64 
= 0b1011011010110110_1011011010110110; + let r = _kortestc_mask64_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortestz_mask32_u8() { + let a: __mmask32 = 0b0110100101101001_0110100101101001; + let b: __mmask32 = 0b1011011010110110_1011011010110110; + let r = _kortestz_mask32_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortestz_mask64_u8() { + let a: __mmask64 = 0b0110100101101001_0110100101101001; + let b: __mmask64 = 0b1011011010110110_1011011010110110; + let r = _kortestz_mask64_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kshiftli_mask32() { + let a: __mmask32 = 0b0110100101101001_0110100101101001; + let r = _kshiftli_mask32::<3>(a); + let e: __mmask32 = 0b0100101101001011_0100101101001000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kshiftli_mask64() { + let a: __mmask64 = 0b0110100101101001_0110100101101001; + let r = _kshiftli_mask64::<3>(a); + let e: __mmask64 = 0b0110100101101001011_0100101101001000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kshiftri_mask32() { + let a: __mmask32 = 0b0110100101101001_0110100101101001; + let r = _kshiftri_mask32::<3>(a); + let e: __mmask32 = 0b0000110100101101_0010110100101101; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kshiftri_mask64() { + let a: __mmask64 = 0b0110100101101001011_0100101101001000; + let r = _kshiftri_mask64::<3>(a); + let e: __mmask64 = 0b0110100101101001_0110100101101001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktest_mask32_u8() { + let a: __mmask32 = 0b0110100100111100_0110100100111100; + let b: __mmask32 = 0b1001011011000011_1001011011000011; + let mut and_not: u8 = 0; + let r = _ktest_mask32_u8(a, b, &mut and_not); + assert_eq!(r, 1); + assert_eq!(and_not, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktestc_mask32_u8() { + let a: __mmask32 = 0b0110100100111100_0110100100111100; + let b: __mmask32 = 0b1001011011000011_1001011011000011; + let r = _ktestc_mask32_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktestz_mask32_u8() { + let a: __mmask32 = 0b0110100100111100_0110100100111100; + let b: __mmask32 = 0b1001011011000011_1001011011000011; + let r = _ktestz_mask32_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktest_mask64_u8() { + let a: __mmask64 = 0b0110100100111100_0110100100111100; + let b: __mmask64 = 0b1001011011000011_1001011011000011; + let mut and_not: u8 = 0; + let r = _ktest_mask64_u8(a, b, &mut and_not); + assert_eq!(r, 1); + assert_eq!(and_not, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktestc_mask64_u8() { + let a: __mmask64 = 0b0110100100111100_0110100100111100; + let b: __mmask64 = 0b1001011011000011_1001011011000011; + let r = _ktestc_mask64_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktestz_mask64_u8() { + let a: __mmask64 = 0b0110100100111100_0110100100111100; + let b: __mmask64 = 0b1001011011000011_1001011011000011; + let r = _ktestz_mask64_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_kunpackw() { + let a: u32 = 0x00110011; + let b: u32 = 0x00001011; + let r = _mm512_kunpackw(a, b); + let e: u32 = 0x00111011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_kunpackd() { + let a: u64 
= 0x11001100_00110011; + let b: u64 = 0x00101110_00001011; + let r = _mm512_kunpackd(a, b); + let e: u64 = 0x00110011_00001011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cvtepi16_epi8() { + let a = _mm512_set1_epi16(2); + let r = _mm512_cvtepi16_epi8(a); + let e = _mm256_set1_epi8(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtepi16_epi8() { + let src = _mm256_set1_epi8(1); + let a = _mm512_set1_epi16(2); + let r = _mm512_mask_cvtepi16_epi8(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtepi16_epi8(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_cvtepi16_epi8() { + let a = _mm512_set1_epi16(2); + let r = _mm512_maskz_cvtepi16_epi8(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtepi16_epi8(0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cvtepi16_epi8() { + let a = _mm256_set1_epi16(2); + let r = _mm256_cvtepi16_epi8(a); + let e = _mm_set1_epi8(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtepi16_epi8() { + let src = _mm_set1_epi8(1); + let a = _mm256_set1_epi16(2); + let r = _mm256_mask_cvtepi16_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtepi16_epi8(src, 0b11111111_11111111, a); + let e = _mm_set1_epi8(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi16_epi8() { + let a = _mm256_set1_epi16(2); + let r = _mm256_maskz_cvtepi16_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtepi16_epi8(0b11111111_11111111, a); + let e = _mm_set1_epi8(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cvtepi16_epi8() { + let a = _mm_set1_epi16(2); + let r = _mm_cvtepi16_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtepi16_epi8() { + let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + let a = _mm_set1_epi16(2); + let r = _mm_mask_cvtepi16_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi16_epi8(src, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_cvtepi16_epi8() { + let a = _mm_set1_epi16(2); + let r = _mm_maskz_cvtepi16_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi16_epi8(0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cvtsepi16_epi8() { + let a = _mm512_set1_epi16(i16::MAX); + let r = _mm512_cvtsepi16_epi8(a); + let e = _mm256_set1_epi8(i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtsepi16_epi8() { + let src = _mm256_set1_epi8(1); + let a = _mm512_set1_epi16(i16::MAX); + let r = _mm512_mask_cvtsepi16_epi8(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtsepi16_epi8(src, 0b11111111_11111111_11111111_11111111, a); + let e = 
_mm256_set1_epi8(i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cvtsepi16_epi8() { + let a = _mm256_set1_epi16(i16::MAX); + let r = _mm256_cvtsepi16_epi8(a); + let e = _mm_set1_epi8(i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi16_epi8() { + let src = _mm_set1_epi8(1); + let a = _mm256_set1_epi16(i16::MAX); + let r = _mm256_mask_cvtsepi16_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi16_epi8(src, 0b11111111_11111111, a); + let e = _mm_set1_epi8(i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi16_epi8() { + let a = _mm256_set1_epi16(i16::MAX); + let r = _mm256_maskz_cvtsepi16_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtsepi16_epi8(0b11111111_11111111, a); + let e = _mm_set1_epi8(i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cvtsepi16_epi8() { + let a = _mm_set1_epi16(i16::MAX); + let r = _mm_cvtsepi16_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtsepi16_epi8() { + let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + let a = _mm_set1_epi16(i16::MAX); + let r = _mm_mask_cvtsepi16_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi16_epi8(src, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi16_epi8() { + let a = _mm_set1_epi16(i16::MAX); + let r = _mm_maskz_cvtsepi16_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi16_epi8(0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_cvtsepi16_epi8() { + let a = _mm512_set1_epi16(i16::MAX); + let r = _mm512_maskz_cvtsepi16_epi8(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtsepi16_epi8(0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cvtusepi16_epi8() { + let a = _mm512_set1_epi16(i16::MIN); + let r = _mm512_cvtusepi16_epi8(a); + let e = _mm256_set1_epi8(-1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtusepi16_epi8() { + let src = _mm256_set1_epi8(1); + let a = _mm512_set1_epi16(i16::MIN); + let r = _mm512_mask_cvtusepi16_epi8(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtusepi16_epi8(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(-1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_cvtusepi16_epi8() { + let a = _mm512_set1_epi16(i16::MIN); + let r = _mm512_maskz_cvtusepi16_epi8(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtusepi16_epi8(0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(-1); + 
assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cvtusepi16_epi8() { + let a = _mm256_set1_epi16(i16::MIN); + let r = _mm256_cvtusepi16_epi8(a); + let e = _mm_set1_epi8(-1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi16_epi8() { + let src = _mm_set1_epi8(1); + let a = _mm256_set1_epi16(i16::MIN); + let r = _mm256_mask_cvtusepi16_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtusepi16_epi8(src, 0b11111111_11111111, a); + let e = _mm_set1_epi8(-1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_cvtusepi16_epi8() { + let a = _mm256_set1_epi16(i16::MIN); + let r = _mm256_maskz_cvtusepi16_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtusepi16_epi8(0b11111111_11111111, a); + let e = _mm_set1_epi8(-1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cvtusepi16_epi8() { + let a = _mm_set1_epi16(i16::MIN); + let r = _mm_cvtusepi16_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtusepi16_epi8() { + let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + let a = _mm_set1_epi16(i16::MIN); + let r = _mm_mask_cvtusepi16_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi16_epi8(src, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi16_epi8() { + let a = _mm_set1_epi16(i16::MIN); + let r = _mm_maskz_cvtusepi16_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi16_epi8(0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cvtepi8_epi16() { + let a = _mm256_set1_epi8(2); + let r = _mm512_cvtepi8_epi16(a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtepi8_epi16() { + let src = _mm512_set1_epi16(1); + let a = _mm256_set1_epi8(2); + let r = _mm512_mask_cvtepi8_epi16(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepi8_epi16(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_cvtepi8_epi16() { + let a = _mm256_set1_epi8(2); + let r = _mm512_maskz_cvtepi8_epi16(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepi8_epi16(0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtepi8_epi16() { + let src = _mm256_set1_epi16(1); + let a = _mm_set1_epi8(2); + let r = _mm256_mask_cvtepi8_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepi8_epi16(src, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi8_epi16() { + let a = _mm_set1_epi8(2); + let r = _mm256_maskz_cvtepi8_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = 
_mm256_maskz_cvtepi8_epi16(0b11111111_11111111, a); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtepi8_epi16() { + let src = _mm_set1_epi16(1); + let a = _mm_set1_epi8(2); + let r = _mm_mask_cvtepi8_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi8_epi16(src, 0b11111111, a); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_cvtepi8_epi16() { + let a = _mm_set1_epi8(2); + let r = _mm_maskz_cvtepi8_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi8_epi16(0b11111111, a); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cvtepu8_epi16() { + let a = _mm256_set1_epi8(2); + let r = _mm512_cvtepu8_epi16(a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtepu8_epi16() { + let src = _mm512_set1_epi16(1); + let a = _mm256_set1_epi8(2); + let r = _mm512_mask_cvtepu8_epi16(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepu8_epi16(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_cvtepu8_epi16() { + let a = _mm256_set1_epi8(2); + let r = _mm512_maskz_cvtepu8_epi16(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepu8_epi16(0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtepu8_epi16() { + let src = _mm256_set1_epi16(1); + let a = _mm_set1_epi8(2); + let r = _mm256_mask_cvtepu8_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu8_epi16(src, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu8_epi16() { + let a = _mm_set1_epi8(2); + let r = _mm256_maskz_cvtepu8_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu8_epi16(0b11111111_11111111, a); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtepu8_epi16() { + let src = _mm_set1_epi16(1); + let a = _mm_set1_epi8(2); + let r = _mm_mask_cvtepu8_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu8_epi16(src, 0b11111111, a); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_cvtepu8_epi16() { + let a = _mm_set1_epi8(2); + let r = _mm_maskz_cvtepu8_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepu8_epi16(0b11111111, a); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_bslli_epi128() { + #[rustfmt::skip] + let a = _mm512_set_epi8( + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + ); + let r = _mm512_bslli_epi128::<9>(a); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_bsrli_epi128() { + #[rustfmt::skip] + let a = _mm512_set_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ); + let r = _mm512_bsrli_epi128::<3>(a); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 0, 0, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 0, 0, 0, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 0, 0, 0, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_alignr_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8( + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + ); + let b = _mm512_set1_epi8(1); + let r = _mm512_alignr_epi8::<14>(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_alignr_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8( + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + ); + let b = _mm512_set1_epi8(1); + let r = _mm512_mask_alignr_epi8::<14>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_alignr_epi8::<14>( + a, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_alignr_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8( + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + ); + let b = _mm512_set1_epi8(1); + let r = _mm512_maskz_alignr_epi8::<14>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_alignr_epi8::<14>( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_alignr_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8( + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + ); + let b = _mm256_set1_epi8(1); + let r = 
_mm256_mask_alignr_epi8::<14>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_alignr_epi8::<14>(a, 0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_alignr_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8( + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + ); + let b = _mm256_set1_epi8(1); + let r = _mm256_maskz_alignr_epi8::<14>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_alignr_epi8::<14>(0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_alignr_epi8() { + let a = _mm_set_epi8(1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0); + let b = _mm_set1_epi8(1); + let r = _mm_mask_alignr_epi8::<14>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_alignr_epi8::<14>(a, 0b11111111_11111111, a, b); + let e = _mm_set_epi8(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_alignr_epi8() { + let a = _mm_set_epi8(1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0); + let b = _mm_set1_epi8(1); + let r = _mm_maskz_alignr_epi8::<14>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_alignr_epi8::<14>(0b11111111_11111111, a, b); + let e = _mm_set_epi8(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtsepi16_storeu_epi8() { + let a = _mm512_set1_epi16(i16::MAX); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtsepi16_storeu_epi8( + &mut r as *mut _ as *mut i8, + 0b11111111_11111111_11111111_11111111, + a, + ); + let e = _mm256_set1_epi8(i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi16_storeu_epi8() { + let a = _mm256_set1_epi16(i16::MAX); + let mut r = _mm_undefined_si128(); + _mm256_mask_cvtsepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); + let e = _mm_set1_epi8(i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtsepi16_storeu_epi8() { + let a = _mm_set1_epi16(i16::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtsepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtepi16_storeu_epi8() { + let a = _mm512_set1_epi16(8); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtepi16_storeu_epi8( + &mut r as *mut _ as *mut i8, + 0b11111111_11111111_11111111_11111111, + a, + ); + let e = _mm256_set1_epi8(8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtepi16_storeu_epi8() { + let a = _mm256_set1_epi16(8); + let mut r = _mm_undefined_si128(); + _mm256_mask_cvtepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 
0b11111111_11111111, a); + let e = _mm_set1_epi8(8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtepi16_storeu_epi8() { + let a = _mm_set1_epi16(8); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtusepi16_storeu_epi8() { + let a = _mm512_set1_epi16(i16::MAX); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtusepi16_storeu_epi8( + &mut r as *mut _ as *mut i8, + 0b11111111_11111111_11111111_11111111, + a, + ); + let e = _mm256_set1_epi8(u8::MAX as i8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi16_storeu_epi8() { + let a = _mm256_set1_epi16(i16::MAX); + let mut r = _mm_undefined_si128(); + _mm256_mask_cvtusepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); + let e = _mm_set1_epi8(u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtusepi16_storeu_epi8() { + let a = _mm_set1_epi16(i16::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtusepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + ); + assert_eq_m128i(r, e); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/avx512cd.rs b/library/stdarch/crates/core_arch/src/x86/avx512cd.rs new file mode 100644 index 0000000000000..78735fcc90f5e --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avx512cd.rs @@ -0,0 +1,1232 @@ +use crate::core_arch::{simd::*, x86::*}; +use crate::intrinsics::simd::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Broadcast the low 16-bits from input mask k to all 32-bit elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastmw_epi32&expand=553) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d +pub fn _mm512_broadcastmw_epi32(k: __mmask16) -> __m512i { + _mm512_set1_epi32(k as i32) +} + +/// Broadcast the low 16-bits from input mask k to all 32-bit elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastmw_epi32&expand=552) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d +pub fn _mm256_broadcastmw_epi32(k: __mmask16) -> __m256i { + _mm256_set1_epi32(k as i32) +} + +/// Broadcast the low 16-bits from input mask k to all 32-bit elements of dst. 
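// A minimal usage sketch of the mask-broadcast intrinsics defined above; the helper name,
// the example mask value 0xABCD, and the lane extraction via `transmute` are illustrative
// assumptions added here, not part of the vendored sources. It only shows that the
// zero-extended 16-bit mask lands in every 32-bit lane, i.e. the result matches a plain `set1`.
#[target_feature(enable = "avx512cd,avx512f")]
fn broadcastmw_epi32_sketch() {
    let k: __mmask16 = 0xABCD;
    let r = _mm512_broadcastmw_epi32(k);
    // Every 32-bit lane now holds the zero-extended mask, 0x0000ABCD.
    let lanes: [i32; 16] = unsafe { core::mem::transmute(r) };
    assert!(lanes.iter().all(|&lane| lane == 0xABCD));
}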
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastmw_epi32&expand=551) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d +pub fn _mm_broadcastmw_epi32(k: __mmask16) -> __m128i { + _mm_set1_epi32(k as i32) +} + +/// Broadcast the low 8-bits from input mask k to all 64-bit elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastmb_epi64&expand=550) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q +pub fn _mm512_broadcastmb_epi64(k: __mmask8) -> __m512i { + _mm512_set1_epi64(k as i64) +} + +/// Broadcast the low 8-bits from input mask k to all 64-bit elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastmb_epi64&expand=549) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q +pub fn _mm256_broadcastmb_epi64(k: __mmask8) -> __m256i { + _mm256_set1_epi64x(k as i64) +} + +/// Broadcast the low 8-bits from input mask k to all 64-bit elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastmb_epi64&expand=548) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q +pub fn _mm_broadcastmb_epi64(k: __mmask8) -> __m128i { + _mm_set1_epi64x(k as i64) +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_conflict_epi32&expand=1248) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm512_conflict_epi32(a: __m512i) -> __m512i { + unsafe { transmute(vpconflictd(a.as_i32x16())) } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. 
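// A worked example of the conflict bit-vectors described above, written as a hypothetical
// sketch (the helper name is made up, and the 128-bit `_mm_conflict_epi32` variant defined
// further down in this file is used purely for brevity).
#[target_feature(enable = "avx512cd,avx512vl,sse2")]
fn conflict_epi32_sketch() {
    // Four identical elements: lane i compares equal to every lower-indexed lane, so the
    // per-lane bit vectors (low lane first) are 0b000, 0b001, 0b011, 0b111.
    let a = _mm_set1_epi32(7);
    let r = _mm_conflict_epi32(a);
    let lanes: [i32; 4] = unsafe { core::mem::transmute(r) };
    assert_eq!(lanes, [0b000, 0b001, 0b011, 0b111]);
}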
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_conflict_epi32&expand=1249) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm512_mask_conflict_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let conflict = _mm512_conflict_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, conflict, src.as_i32x16())) + } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_conflict_epi32&expand=1250) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm512_maskz_conflict_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let conflict = _mm512_conflict_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, conflict, i32x16::ZERO)) + } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_conflict_epi32&expand=1245) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm256_conflict_epi32(a: __m256i) -> __m256i { + unsafe { transmute(vpconflictd256(a.as_i32x8())) } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_conflict_epi32&expand=1246) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm256_mask_conflict_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let conflict = _mm256_conflict_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, conflict, src.as_i32x8())) + } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. 
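// A hypothetical sketch of the writemask vs. zeromask convention used throughout this
// module: `_mm*_mask_*` keeps `src` lanes where the mask bit is clear, while `_mm*_maskz_*`
// zeroes them. The helper name and the chosen mask/values are illustrative assumptions.
#[target_feature(enable = "avx512cd,avx512vl,avx")]
fn conflict_mask_vs_maskz_sketch() {
    let src = _mm256_set1_epi32(-1);
    let a = _mm256_set1_epi32(7);
    // Only the low four lanes are selected; their conflict results are 0, 1, 3, 7.
    let merged = _mm256_mask_conflict_epi32(src, 0b0000_1111, a);
    let zeroed = _mm256_maskz_conflict_epi32(0b0000_1111, a);
    let m: [i32; 8] = unsafe { core::mem::transmute(merged) };
    let z: [i32; 8] = unsafe { core::mem::transmute(zeroed) };
    assert_eq!(m, [0, 1, 3, 7, -1, -1, -1, -1]);
    assert_eq!(z, [0, 1, 3, 7, 0, 0, 0, 0]);
}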
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_conflict_epi32&expand=1247) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm256_maskz_conflict_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let conflict = _mm256_conflict_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, conflict, i32x8::ZERO)) + } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_conflict_epi32&expand=1242) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm_conflict_epi32(a: __m128i) -> __m128i { + unsafe { transmute(vpconflictd128(a.as_i32x4())) } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_conflict_epi32&expand=1243) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm_mask_conflict_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let conflict = _mm_conflict_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, conflict, src.as_i32x4())) + } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_conflict_epi32&expand=1244) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm_maskz_conflict_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let conflict = _mm_conflict_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, conflict, i32x4::ZERO)) + } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_conflict_epi64&expand=1257) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm512_conflict_epi64(a: __m512i) -> __m512i { + unsafe { transmute(vpconflictq(a.as_i64x8())) } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). 
Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_conflict_epi64&expand=1258) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm512_mask_conflict_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let conflict = _mm512_conflict_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, conflict, src.as_i64x8())) + } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_conflict_epi64&expand=1259) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm512_maskz_conflict_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let conflict = _mm512_conflict_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, conflict, i64x8::ZERO)) + } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_conflict_epi64&expand=1254) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm256_conflict_epi64(a: __m256i) -> __m256i { + unsafe { transmute(vpconflictq256(a.as_i64x4())) } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_conflict_epi64&expand=1255) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm256_mask_conflict_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let conflict = _mm256_conflict_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, conflict, src.as_i64x4())) + } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. 
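// A hypothetical sketch of the 64-bit conflict operation with non-identical input, to show
// that only matches against *lower-indexed* lanes set bits; the helper name and the values
// 5/9 are illustrative assumptions, not part of the vendored sources.
#[target_feature(enable = "avx512cd,avx512vl,avx")]
fn conflict_epi64_sketch() {
    // Lanes from low to high are 5, 9, 5, 9 (`_mm256_set_epi64x` lists them high to low).
    let a = _mm256_set_epi64x(9, 5, 9, 5);
    let r = _mm256_conflict_epi64(a);
    // Lane 2 matches lane 0 (bit 0), lane 3 matches lane 1 (bit 1); lanes 0 and 1 have no
    // lower-indexed duplicate, so their bit vectors stay empty.
    let lanes: [i64; 4] = unsafe { core::mem::transmute(r) };
    assert_eq!(lanes, [0b00, 0b00, 0b01, 0b10]);
}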
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_conflict_epi64&expand=1256) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm256_maskz_conflict_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let conflict = _mm256_conflict_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, conflict, i64x4::ZERO)) + } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_conflict_epi64&expand=1251) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm_conflict_epi64(a: __m128i) -> __m128i { + unsafe { transmute(vpconflictq128(a.as_i64x2())) } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_conflict_epi64&expand=1252) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm_mask_conflict_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let conflict = _mm_conflict_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, conflict, src.as_i64x2())) + } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_conflict_epi64&expand=1253) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm_maskz_conflict_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let conflict = _mm_conflict_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, conflict, i64x2::ZERO)) + } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_lzcnt_epi32&expand=3491) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm512_lzcnt_epi32(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctlz(a.as_i32x16())) } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
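// A minimal sketch of the per-lane leading-zero count described above; the helper name and
// the sample value are illustrative assumptions and the check requires an AVX-512CD-capable
// CPU at runtime.
#[target_feature(enable = "avx512cd,avx512f")]
fn lzcnt_epi32_sketch() {
    // 1 << 8 = 0x0000_0100 has 23 leading zero bits in a 32-bit lane; an all-zero lane counts as 32.
    let a = _mm512_set1_epi32(1 << 8);
    let r = _mm512_lzcnt_epi32(a);
    let lanes: [i32; 16] = unsafe { core::mem::transmute(r) };
    assert!(lanes.iter().all(|&lane| lane == 23));
}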
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_lzcnt_epi32&expand=3492) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm512_mask_lzcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let zerocount = _mm512_lzcnt_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, zerocount, src.as_i32x16())) + } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_lzcnt_epi32&expand=3493) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm512_maskz_lzcnt_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let zerocount = _mm512_lzcnt_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, zerocount, i32x16::ZERO)) + } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lzcnt_epi32&expand=3488) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm256_lzcnt_epi32(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctlz(a.as_i32x8())) } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_lzcnt_epi32&expand=3489) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm256_mask_lzcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let zerocount = _mm256_lzcnt_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, zerocount, src.as_i32x8())) + } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_lzcnt_epi32&expand=3490) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm256_maskz_lzcnt_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let zerocount = _mm256_lzcnt_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, zerocount, i32x8::ZERO)) + } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst. 
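// A minimal usage sketch, not part of the patch: `ilog2_lanes` is a made-up
// helper showing a common use of vplzcntd, computing floor(log2(x)) per lane
// as 31 - lzcnt(x); a lane holding 0 comes out as -1 (its lzcnt is 32).
#[cfg(target_arch = "x86_64")]
mod lzcnt32_example {
    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512f,avx512cd")]
    pub unsafe fn ilog2_lanes(a: __m512i) -> __m512i {
        // 31 - (leading zeros) is the bit index of the highest set bit.
        _mm512_sub_epi32(_mm512_set1_epi32(31), _mm512_lzcnt_epi32(a))
    }
}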
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lzcnt_epi32&expand=3485) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm_lzcnt_epi32(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctlz(a.as_i32x4())) } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_lzcnt_epi32&expand=3486) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm_mask_lzcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let zerocount = _mm_lzcnt_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, zerocount, src.as_i32x4())) + } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_lzcnt_epi32&expand=3487) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm_maskz_lzcnt_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let zerocount = _mm_lzcnt_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, zerocount, i32x4::ZERO)) + } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_lzcnt_epi64&expand=3500) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm512_lzcnt_epi64(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctlz(a.as_i64x8())) } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_lzcnt_epi64&expand=3501) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm512_mask_lzcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let zerocount = _mm512_lzcnt_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, zerocount, src.as_i64x8())) + } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_lzcnt_epi64&expand=3502) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm512_maskz_lzcnt_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let zerocount = _mm512_lzcnt_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, zerocount, i64x8::ZERO)) + } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lzcnt_epi64&expand=3497) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm256_lzcnt_epi64(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctlz(a.as_i64x4())) } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_lzcnt_epi64&expand=3498) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm256_mask_lzcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let zerocount = _mm256_lzcnt_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, zerocount, src.as_i64x4())) + } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_lzcnt_epi64&expand=3499) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm256_maskz_lzcnt_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let zerocount = _mm256_lzcnt_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, zerocount, i64x4::ZERO)) + } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lzcnt_epi64&expand=3494) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm_lzcnt_epi64(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctlz(a.as_i64x2())) } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
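// A minimal usage sketch, not part of the patch: the zero-masked 64-bit
// variant. `leading_zeros_even_lanes` and the mask constant are made up for
// illustration.
#[cfg(target_arch = "x86_64")]
mod lzcnt64_example {
    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512f,avx512cd")]
    pub unsafe fn leading_zeros_even_lanes(a: __m512i) -> __m512i {
        // k = 0b0101_0101 keeps lanes 0, 2, 4 and 6; the other lanes are
        // forced to zero by the zeromask form.
        _mm512_maskz_lzcnt_epi64(0b0101_0101, a)
    }
}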
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_lzcnt_epi64&expand=3495) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm_mask_lzcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let zerocount = _mm_lzcnt_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, zerocount, src.as_i64x2())) + } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_lzcnt_epi64&expand=3496) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm_maskz_lzcnt_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let zerocount = _mm_lzcnt_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, zerocount, i64x2::ZERO)) + } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.conflict.d.512"] + fn vpconflictd(a: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.conflict.d.256"] + fn vpconflictd256(a: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.conflict.d.128"] + fn vpconflictd128(a: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.conflict.q.512"] + fn vpconflictq(a: i64x8) -> i64x8; + #[link_name = "llvm.x86.avx512.conflict.q.256"] + fn vpconflictq256(a: i64x4) -> i64x4; + #[link_name = "llvm.x86.avx512.conflict.q.128"] + fn vpconflictq128(a: i64x2) -> i64x2; +} + +#[cfg(test)] +mod tests { + + use crate::core_arch::x86::*; + use stdarch_test::simd_test; + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_broadcastmw_epi32() { + let a: __mmask16 = 2; + let r = _mm512_broadcastmw_epi32(a); + let e = _mm512_set1_epi32(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_broadcastmw_epi32() { + let a: __mmask16 = 2; + let r = _mm256_broadcastmw_epi32(a); + let e = _mm256_set1_epi32(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_broadcastmw_epi32() { + let a: __mmask16 = 2; + let r = _mm_broadcastmw_epi32(a); + let e = _mm_set1_epi32(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_broadcastmb_epi64() { + let a: __mmask8 = 2; + let r = _mm512_broadcastmb_epi64(a); + let e = _mm512_set1_epi64(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_broadcastmb_epi64() { + let a: __mmask8 = 2; + let r = _mm256_broadcastmb_epi64(a); + let e = _mm256_set1_epi64x(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_broadcastmb_epi64() { + let a: __mmask8 = 2; + let r = _mm_broadcastmb_epi64(a); + let e = _mm_set1_epi64x(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_conflict_epi32() { + let a = _mm512_set1_epi32(1); + let r = _mm512_conflict_epi32(a); + let e = _mm512_set_epi32( + 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 
10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 9 | 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_mask_conflict_epi32() { + let a = _mm512_set1_epi32(1); + let r = _mm512_mask_conflict_epi32(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_conflict_epi32(a, 0b11111111_11111111, a); + let e = _mm512_set_epi32( + 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 9 | 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_maskz_conflict_epi32() { + let a = _mm512_set1_epi32(1); + let r = _mm512_maskz_conflict_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_conflict_epi32(0b11111111_11111111, a); + let e = _mm512_set_epi32( + 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 
+ | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 9 | 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_conflict_epi32() { + let a = _mm256_set1_epi32(1); + let r = _mm256_conflict_epi32(a); + let e = _mm256_set_epi32( + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_mask_conflict_epi32() { + let a = _mm256_set1_epi32(1); + let r = _mm256_mask_conflict_epi32(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_conflict_epi32(a, 0b11111111, a); + let e = _mm256_set_epi32( + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_maskz_conflict_epi32() { + let a = _mm256_set1_epi32(1); + let r = _mm256_maskz_conflict_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_conflict_epi32(0b11111111, a); + let e = _mm256_set_epi32( + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_conflict_epi32() { + let a = _mm_set1_epi32(1); + let r = _mm_conflict_epi32(a); + let e = _mm_set_epi32(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_mask_conflict_epi32() { + let a = _mm_set1_epi32(1); + let r = _mm_mask_conflict_epi32(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_conflict_epi32(a, 0b00001111, a); + let e = _mm_set_epi32(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_maskz_conflict_epi32() { + let a = _mm_set1_epi32(1); + let r = _mm_maskz_conflict_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_conflict_epi32(0b00001111, a); + let e = _mm_set_epi32(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_conflict_epi64() { + let a = _mm512_set1_epi64(1); + let r = _mm512_conflict_epi64(a); + let e = _mm512_set_epi64( + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 
<< 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_mask_conflict_epi64() { + let a = _mm512_set1_epi64(1); + let r = _mm512_mask_conflict_epi64(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_conflict_epi64(a, 0b11111111, a); + let e = _mm512_set_epi64( + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_maskz_conflict_epi64() { + let a = _mm512_set1_epi64(1); + let r = _mm512_maskz_conflict_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_conflict_epi64(0b11111111, a); + let e = _mm512_set_epi64( + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_conflict_epi64() { + let a = _mm256_set1_epi64x(1); + let r = _mm256_conflict_epi64(a); + let e = _mm256_set_epi64x(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_mask_conflict_epi64() { + let a = _mm256_set1_epi64x(1); + let r = _mm256_mask_conflict_epi64(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_conflict_epi64(a, 0b00001111, a); + let e = _mm256_set_epi64x(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_maskz_conflict_epi64() { + let a = _mm256_set1_epi64x(1); + let r = _mm256_maskz_conflict_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_conflict_epi64(0b00001111, a); + let e = _mm256_set_epi64x(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_conflict_epi64() { + let a = _mm_set1_epi64x(1); + let r = _mm_conflict_epi64(a); + let e = _mm_set_epi64x(1 << 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_mask_conflict_epi64() { + let a = _mm_set1_epi64x(1); + let r = _mm_mask_conflict_epi64(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_conflict_epi64(a, 0b00000011, a); + let e = _mm_set_epi64x(1 << 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_maskz_conflict_epi64() { + let a = _mm_set1_epi64x(1); + let r = _mm_maskz_conflict_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_conflict_epi64(0b00000011, a); + let e = _mm_set_epi64x(1 << 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_lzcnt_epi32() { + let a = _mm512_set1_epi32(1); + let r = _mm512_lzcnt_epi32(a); + let e = _mm512_set1_epi32(31); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_mask_lzcnt_epi32() { + let a = _mm512_set1_epi32(1); + let r = 
_mm512_mask_lzcnt_epi32(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_lzcnt_epi32(a, 0b11111111_11111111, a); + let e = _mm512_set1_epi32(31); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_maskz_lzcnt_epi32() { + let a = _mm512_set1_epi32(2); + let r = _mm512_maskz_lzcnt_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_lzcnt_epi32(0b11111111_11111111, a); + let e = _mm512_set1_epi32(30); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_lzcnt_epi32() { + let a = _mm256_set1_epi32(1); + let r = _mm256_lzcnt_epi32(a); + let e = _mm256_set1_epi32(31); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_mask_lzcnt_epi32() { + let a = _mm256_set1_epi32(1); + let r = _mm256_mask_lzcnt_epi32(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_lzcnt_epi32(a, 0b11111111, a); + let e = _mm256_set1_epi32(31); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_maskz_lzcnt_epi32() { + let a = _mm256_set1_epi32(1); + let r = _mm256_maskz_lzcnt_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_lzcnt_epi32(0b11111111, a); + let e = _mm256_set1_epi32(31); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_lzcnt_epi32() { + let a = _mm_set1_epi32(1); + let r = _mm_lzcnt_epi32(a); + let e = _mm_set1_epi32(31); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_mask_lzcnt_epi32() { + let a = _mm_set1_epi32(1); + let r = _mm_mask_lzcnt_epi32(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_lzcnt_epi32(a, 0b00001111, a); + let e = _mm_set1_epi32(31); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_maskz_lzcnt_epi32() { + let a = _mm_set1_epi32(1); + let r = _mm_maskz_lzcnt_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_lzcnt_epi32(0b00001111, a); + let e = _mm_set1_epi32(31); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_lzcnt_epi64() { + let a = _mm512_set1_epi64(1); + let r = _mm512_lzcnt_epi64(a); + let e = _mm512_set1_epi64(63); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_mask_lzcnt_epi64() { + let a = _mm512_set1_epi64(1); + let r = _mm512_mask_lzcnt_epi64(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_lzcnt_epi64(a, 0b11111111, a); + let e = _mm512_set1_epi64(63); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_maskz_lzcnt_epi64() { + let a = _mm512_set1_epi64(2); + let r = _mm512_maskz_lzcnt_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_lzcnt_epi64(0b11111111, a); + let e = _mm512_set1_epi64(62); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_lzcnt_epi64() { + let a = _mm256_set1_epi64x(1); + let r = _mm256_lzcnt_epi64(a); + let e = _mm256_set1_epi64x(63); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_mask_lzcnt_epi64() { + let a = _mm256_set1_epi64x(1); + let r = _mm256_mask_lzcnt_epi64(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_lzcnt_epi64(a, 0b00001111, a); + let e = _mm256_set1_epi64x(63); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn 
test_mm256_maskz_lzcnt_epi64() { + let a = _mm256_set1_epi64x(1); + let r = _mm256_maskz_lzcnt_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_lzcnt_epi64(0b00001111, a); + let e = _mm256_set1_epi64x(63); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_lzcnt_epi64() { + let a = _mm_set1_epi64x(1); + let r = _mm_lzcnt_epi64(a); + let e = _mm_set1_epi64x(63); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_mask_lzcnt_epi64() { + let a = _mm_set1_epi64x(1); + let r = _mm_mask_lzcnt_epi64(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_lzcnt_epi64(a, 0b00001111, a); + let e = _mm_set1_epi64x(63); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_maskz_lzcnt_epi64() { + let a = _mm_set1_epi64x(1); + let r = _mm_maskz_lzcnt_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_lzcnt_epi64(0b00001111, a); + let e = _mm_set1_epi64x(63); + assert_eq_m128i(r, e); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/avx512dq.rs b/library/stdarch/crates/core_arch/src/x86/avx512dq.rs new file mode 100644 index 0000000000000..c90ec894f2174 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avx512dq.rs @@ -0,0 +1,10955 @@ +use crate::{ + core_arch::{simd::*, x86::*}, + intrinsics::simd::*, + mem::transmute, +}; + +// And // + +/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_and_pd&ig_expand=288) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_and_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let and = _mm_and_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, and, src.as_f64x2())) + } +} + +/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_and_pd&ig_expand=289) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_and_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let and = _mm_and_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, and, f64x2::ZERO)) + } +} + +/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
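// A minimal usage sketch, not part of the patch: `abs_selected_lanes` is a
// made-up helper built on `_mm_mask_and_pd` above; it assumes AVX-512DQ/VL
// support is verified by the caller.
#[cfg(target_arch = "x86_64")]
mod and_pd_example {
    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512dq,avx512vl")]
    pub unsafe fn abs_selected_lanes(a: __m128d, k: __mmask8) -> __m128d {
        // ANDing with 0x7FFF_FFFF_FFFF_FFFF clears the IEEE 754 sign bit, so
        // selected lanes become |a|; unselected lanes are copied from `a`
        // via the writemask.
        let no_sign = _mm_castsi128_pd(_mm_set1_epi64x(0x7FFF_FFFF_FFFF_FFFF));
        _mm_mask_and_pd(a, k, a, no_sign)
    }
}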
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_and_pd&ig_expand=291) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_and_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let and = _mm256_and_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, and, src.as_f64x4())) + } +} + +/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_and_pd&ig_expand=292) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_and_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let and = _mm256_and_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, and, f64x4::ZERO)) + } +} + +/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_and_pd&ig_expand=293) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandp))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_and_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_and(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b))) } +} + +/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_and_pd&ig_expand=294) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_and_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let and = _mm512_and_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, and, src.as_f64x8())) + } +} + +/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_and_pd&ig_expand=295) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_and_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let and = _mm512_and_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, and, f64x8::ZERO)) + } +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_and_ps&ig_expand=297) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_and_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let and = _mm_and_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, and, src.as_f32x4())) + } +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_and_ps&ig_expand=298) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_and_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let and = _mm_and_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, and, f32x4::ZERO)) + } +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_and_ps&ig_expand=300) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_and_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let and = _mm256_and_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, and, src.as_f32x8())) + } +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_and_ps&ig_expand=301) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_and_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let and = _mm256_and_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, and, f32x8::ZERO)) + } +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_and_ps&ig_expand=303) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_and_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(simd_and( + transmute::<_, u32x16>(a), + transmute::<_, u32x16>(b), + )) + } +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_and_ps&ig_expand=304) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_and_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let and = _mm512_and_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, and, src.as_f32x16())) + } +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_and_ps&ig_expand=305) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_and_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let and = _mm512_and_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, and, f32x16::ZERO)) + } +} + +// Andnot + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_andnot_pd&ig_expand=326) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_andnot_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let andnot = _mm_andnot_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, andnot, src.as_f64x2())) + } +} + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_andnot_pd&ig_expand=327) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_andnot_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let andnot = _mm_andnot_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, andnot, f64x2::ZERO)) + } +} + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). 
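// A minimal usage sketch, not part of the patch: the 512-bit ps forms take a
// 16-bit mask, one bit per f32 lane. `and_low_half` is a made-up helper that
// keeps the AND of the low eight lanes and zeroes the upper eight.
#[cfg(target_arch = "x86_64")]
mod and_ps_example {
    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512dq")]
    pub unsafe fn and_low_half(a: __m512, b: __m512) -> __m512 {
        // k = 0x00FF selects lanes 0..=7; lanes 8..=15 come out as 0.0.
        _mm512_maskz_and_ps(0x00FF, a, b)
    }
}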
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_andnot_pd&ig_expand=329) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_andnot_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let andnot = _mm256_andnot_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, andnot, src.as_f64x4())) + } +} + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_andnot_pd&ig_expand=330) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_andnot_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let andnot = _mm256_andnot_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, andnot, f64x4::ZERO)) + } +} + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_andnot_pd&ig_expand=331) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandnp))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_andnot_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { _mm512_and_pd(_mm512_xor_pd(a, transmute(_mm512_set1_epi64(-1))), b) } +} + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_andnot_pd&ig_expand=332) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandnpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_andnot_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let andnot = _mm512_andnot_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, andnot, src.as_f64x8())) + } +} + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_andnot_pd&ig_expand=333) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandnpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_andnot_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let andnot = _mm512_andnot_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, andnot, f64x8::ZERO)) + } +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_andnot_ps&ig_expand=335) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_andnot_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let andnot = _mm_andnot_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, andnot, src.as_f32x4())) + } +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_andnot_ps&ig_expand=336) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_andnot_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let andnot = _mm_andnot_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, andnot, f32x4::ZERO)) + } +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_andnot_ps&ig_expand=338) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_andnot_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let andnot = _mm256_andnot_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, andnot, src.as_f32x8())) + } +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). 
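// A minimal usage sketch, not part of the patch: andnot computes (NOT a) AND
// b, so passing the sign-bit pattern as `a` strips the sign from `b`.
// `abs_pd_zeroing` is a made-up helper using the zeromask form shown above.
#[cfg(target_arch = "x86_64")]
mod andnot_example {
    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512dq,avx512vl")]
    pub unsafe fn abs_pd_zeroing(k: __mmask8, b: __m128d) -> __m128d {
        let sign_bit = _mm_castsi128_pd(_mm_set1_epi64x(i64::MIN));
        // (!sign_bit) & b clears each lane's sign bit; lanes whose mask bit
        // is clear are zeroed.
        _mm_maskz_andnot_pd(k, sign_bit, b)
    }
}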
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_andnot_ps&ig_expand=339) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_andnot_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let andnot = _mm256_andnot_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, andnot, f32x8::ZERO)) + } +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_andnot_ps&ig_expand=340) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_andnot_ps(a: __m512, b: __m512) -> __m512 { + unsafe { _mm512_and_ps(_mm512_xor_ps(a, transmute(_mm512_set1_epi32(-1))), b) } +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_andnot_ps&ig_expand=341) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_andnot_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let andnot = _mm512_andnot_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, andnot, src.as_f32x16())) + } +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_andnot_ps&ig_expand=342) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_andnot_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let andnot = _mm512_andnot_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, andnot, f32x16::ZERO)) + } +} + +// Or + +/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_or_pd&ig_expand=4824) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_or_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let or = _mm_or_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, or, src.as_f64x2())) + } +} + +/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_or_pd&ig_expand=4825) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_or_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let or = _mm_or_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, or, f64x2::ZERO)) + } +} + +/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_or_pd&ig_expand=4827) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_or_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let or = _mm256_or_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, or, src.as_f64x4())) + } +} + +/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_or_pd&ig_expand=4828) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_or_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let or = _mm256_or_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, or, f64x4::ZERO)) + } +} + +/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_or_pd&ig_expand=4829) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vorp))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_or_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_or(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b))) } +} + +/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_or_pd&ig_expand=4830) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_or_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let or = _mm512_or_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, or, src.as_f64x8())) + } +} + +/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_or_pd&ig_expand=4831) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_or_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let or = _mm512_or_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, or, f64x8::ZERO)) + } +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_or_ps&ig_expand=4833) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_or_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let or = _mm_or_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, or, src.as_f32x4())) + } +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_or_ps&ig_expand=4834) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_or_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let or = _mm_or_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, or, f32x4::ZERO)) + } +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
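// A minimal usage sketch, not part of the patch: OR-ing in the sign-bit
// pattern forces selected lanes negative (-|a|), a building block for
// copysign-style code. `force_negative` is a made-up helper name.
#[cfg(target_arch = "x86_64")]
mod or_pd_example {
    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512dq,avx512vl")]
    pub unsafe fn force_negative(a: __m128d, k: __mmask8) -> __m128d {
        let sign_bit = _mm_castsi128_pd(_mm_set1_epi64x(i64::MIN));
        // Selected lanes get their sign bit set; the rest are copied from
        // `a` via the writemask.
        _mm_mask_or_pd(a, k, a, sign_bit)
    }
}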
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_or_ps&ig_expand=4836) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_or_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let or = _mm256_or_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, or, src.as_f32x8())) + } +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_or_ps&ig_expand=4837) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_or_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let or = _mm256_or_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, or, f32x8::ZERO)) + } +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_or_ps&ig_expand=4838) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_or_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(simd_or( + transmute::<_, u32x16>(a), + transmute::<_, u32x16>(b), + )) + } +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_or_ps&ig_expand=4839) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_or_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let or = _mm512_or_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, or, src.as_f32x16())) + } +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_or_ps&ig_expand=4840) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_or_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let or = _mm512_or_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, or, f32x16::ZERO)) + } +} + +// Xor + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_xor_pd&ig_expand=7094) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_xor_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let xor = _mm_xor_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, xor, src.as_f64x2())) + } +} + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_xor_pd&ig_expand=7095) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_xor_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let xor = _mm_xor_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, xor, f64x2::ZERO)) + } +} + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_xor_pd&ig_expand=7097) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_xor_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let xor = _mm256_xor_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, xor, src.as_f64x4())) + } +} + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_xor_pd&ig_expand=7098) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_xor_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let xor = _mm256_xor_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, xor, f64x4::ZERO)) + } +} + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_xor_pd&ig_expand=7102) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vxorp))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_xor_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_xor(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b))) } +} + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_xor_pd&ig_expand=7100) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vxorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_xor_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let xor = _mm512_xor_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, xor, src.as_f64x8())) + } +} + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_xor_pd&ig_expand=7101) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vxorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_xor_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let xor = _mm512_xor_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, xor, f64x8::ZERO)) + } +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_xor_ps&ig_expand=7103) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_xor_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let xor = _mm_xor_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, xor, src.as_f32x4())) + } +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_xor_ps&ig_expand=7104) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_xor_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let xor = _mm_xor_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, xor, f32x4::ZERO)) + } +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
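+///
+/// A minimal usage sketch (illustrative only; `flip_odd_signs` is a hypothetical helper), assuming
+/// the caller has verified `avx512dq` and `avx512vl`:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq,avx512vl")]
+/// fn flip_odd_signs(a: __m256) -> __m256 {
+///     // XOR with -0.0 flips the sign bit; the writemask 0b1010_1010 limits that to the odd
+///     // lanes, while the even lanes are copied from `a` (passed here as src).
+///     _mm256_mask_xor_ps(a, 0b1010_1010, a, _mm256_set1_ps(-0.0))
+/// }
+/// ```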
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_xor_ps&ig_expand=7106) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_xor_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let xor = _mm256_xor_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, xor, src.as_f32x8())) + } +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_xor_ps&ig_expand=7107) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_xor_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let xor = _mm256_xor_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, xor, f32x8::ZERO)) + } +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_xor_ps&ig_expand=7111) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_xor_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(simd_xor( + transmute::<_, u32x16>(a), + transmute::<_, u32x16>(b), + )) + } +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_xor_ps&ig_expand=7109) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_xor_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let xor = _mm512_xor_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, xor, src.as_f32x16())) + } +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_xor_ps&ig_expand=7110) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_xor_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let xor = _mm512_xor_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, xor, f32x16::ZERO)) + } +} + +// Broadcast + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst. 
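+///
+/// A minimal usage sketch (illustrative values, not from Intel's documentation), assuming
+/// `avx512dq` and `avx512vl` are available:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq,avx512vl")]
+/// fn broadcast_pair(a: __m128) -> __m256 {
+///     // For a = [1.0, 2.0, 3.0, 4.0] the result is [1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0]:
+///     // only the two lowest lanes of `a` are repeated across dst.
+///     _mm256_broadcast_f32x2(a)
+/// }
+/// ```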
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_f32x2&ig_expand=509) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_broadcast_f32x2(a: __m128) -> __m256 { + unsafe { + let b: f32x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_f32x2&ig_expand=510) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcastf32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_broadcast_f32x2(src: __m256, k: __mmask8, a: __m128) -> __m256 { + unsafe { + let b = _mm256_broadcast_f32x2(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, src.as_f32x8())) + } +} + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_f32x2&ig_expand=511) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcastf32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_broadcast_f32x2(k: __mmask8, a: __m128) -> __m256 { + unsafe { + let b = _mm256_broadcast_f32x2(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, f32x8::ZERO)) + } +} + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f32x2&ig_expand=512) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_f32x2(a: __m128) -> __m512 { + unsafe { + let b: f32x16 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f32x2&ig_expand=513) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vbroadcastf32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_f32x2(src: __m512, k: __mmask16, a: __m128) -> __m512 { + unsafe { + let b = _mm512_broadcast_f32x2(a).as_f32x16(); + transmute(simd_select_bitmask(k, b, src.as_f32x16())) + } +} + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f32x2&ig_expand=514) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vbroadcastf32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_f32x2(k: __mmask16, a: __m128) -> __m512 { + unsafe { + let b = _mm512_broadcast_f32x2(a).as_f32x16(); + transmute(simd_select_bitmask(k, b, f32x16::ZERO)) + } +} + +/// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f32x8&ig_expand=521) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_f32x8(a: __m256) -> __m512 { + unsafe { + let b: f32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]); + transmute(b) + } +} + +/// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f32x8&ig_expand=522) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_f32x8(src: __m512, k: __mmask16, a: __m256) -> __m512 { + unsafe { + let b = _mm512_broadcast_f32x8(a).as_f32x16(); + transmute(simd_select_bitmask(k, b, src.as_f32x16())) + } +} + +/// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f32x8&ig_expand=523) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_f32x8(k: __mmask16, a: __m256) -> __m512 { + unsafe { + let b = _mm512_broadcast_f32x8(a).as_f32x16(); + transmute(simd_select_bitmask(k, b, f32x16::ZERO)) + } +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_f64x2&ig_expand=524) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_broadcast_f64x2(a: __m128d) -> __m256d { + unsafe { + let b: f64x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_f64x2&ig_expand=525) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_broadcast_f64x2(src: __m256d, k: __mmask8, a: __m128d) -> __m256d { + unsafe { + let b = _mm256_broadcast_f64x2(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, src.as_f64x4())) + } +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_f64x2&ig_expand=526) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m256d { + unsafe { + let b = _mm256_broadcast_f64x2(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, f64x4::ZERO)) + } +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f64x2&ig_expand=527) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_f64x2(a: __m128d) -> __m512d { + unsafe { + let b: f64x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f64x2&ig_expand=528) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_f64x2(src: __m512d, k: __mmask8, a: __m128d) -> __m512d { + unsafe { + let b = _mm512_broadcast_f64x2(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, src.as_f64x8())) + } +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f64x2&ig_expand=529) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m512d { + unsafe { + let b = _mm512_broadcast_f64x2(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, f64x8::ZERO)) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst. 
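+///
+/// A minimal usage sketch (illustrative values only; `repeat_low_pair` is a hypothetical helper),
+/// assuming `avx512dq` and `avx512vl` are available:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq,avx512vl")]
+/// fn repeat_low_pair(a: __m128i) -> __m128i {
+///     // For a = [10, 20, 30, 40] (32-bit lanes) the result is [10, 20, 10, 20].
+///     _mm_broadcast_i32x2(a)
+/// }
+/// ```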
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_i32x2&ig_expand=533) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_broadcast_i32x2(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i32x4(); + let b: i32x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcast_i32x2&ig_expand=534) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_broadcast_i32x2(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let b = _mm_broadcast_i32x2(a).as_i32x4(); + transmute(simd_select_bitmask(k, b, src.as_i32x4())) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcast_i32x2&ig_expand=535) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let b = _mm_broadcast_i32x2(a).as_i32x4(); + transmute(simd_select_bitmask(k, b, i32x4::ZERO)) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_i32x2&ig_expand=536) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_broadcast_i32x2(a: __m128i) -> __m256i { + unsafe { + let a = a.as_i32x4(); + let b: i32x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_i32x2&ig_expand=537) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_broadcast_i32x2(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let b = _mm256_broadcast_i32x2(a).as_i32x8(); + transmute(simd_select_bitmask(k, b, src.as_i32x8())) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_i32x2&ig_expand=538) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let b = _mm256_broadcast_i32x2(a).as_i32x8(); + transmute(simd_select_bitmask(k, b, i32x8::ZERO)) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i32x2&ig_expand=539) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_i32x2(a: __m128i) -> __m512i { + unsafe { + let a = a.as_i32x4(); + let b: i32x16 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i32x2&ig_expand=540) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_i32x2(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i32x2(a).as_i32x16(); + transmute(simd_select_bitmask(k, b, src.as_i32x16())) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i32x2&ig_expand=541) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_i32x2(k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i32x2(a).as_i32x16(); + transmute(simd_select_bitmask(k, b, i32x16::ZERO)) + } +} + +/// Broadcasts the 8 packed 32-bit integers from a to all elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i32x8&ig_expand=548) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_i32x8(a: __m256i) -> __m512i { + unsafe { + let a = a.as_i32x8(); + let b: i32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]); + transmute(b) + } +} + +/// Broadcasts the 8 packed 32-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i32x8&ig_expand=549) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_i32x8(src: __m512i, k: __mmask16, a: __m256i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i32x8(a).as_i32x16(); + transmute(simd_select_bitmask(k, b, src.as_i32x16())) + } +} + +/// Broadcasts the 8 packed 32-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i32x8&ig_expand=550) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_i32x8(k: __mmask16, a: __m256i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i32x8(a).as_i32x16(); + transmute(simd_select_bitmask(k, b, i32x16::ZERO)) + } +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_i64x2&ig_expand=551) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_broadcast_i64x2(a: __m128i) -> __m256i { + unsafe { + let a = a.as_i64x2(); + let b: i64x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_i64x2&ig_expand=552) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_broadcast_i64x2(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let b = _mm256_broadcast_i64x2(a).as_i64x4(); + transmute(simd_select_bitmask(k, b, src.as_i64x4())) + } +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_i64x2&ig_expand=553) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let b = _mm256_broadcast_i64x2(a).as_i64x4(); + transmute(simd_select_bitmask(k, b, i64x4::ZERO)) + } +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst. 
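+///
+/// A minimal usage sketch (illustrative values only; `tile_i64_pair` is a hypothetical helper),
+/// assuming `avx512dq` is available:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq")]
+/// fn tile_i64_pair(a: __m128i) -> __m512i {
+///     // For a = [1, 2] (64-bit lanes) the result is [1, 2, 1, 2, 1, 2, 1, 2].
+///     _mm512_broadcast_i64x2(a)
+/// }
+/// ```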
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i64x2&ig_expand=554)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_broadcast_i64x2(a: __m128i) -> __m512i {
+    unsafe {
+        let a = a.as_i64x2();
+        let b: i64x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]);
+        transmute(b)
+    }
+}
+
+/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using writemask k
+/// (elements are copied from src if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i64x2&ig_expand=555)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_broadcast_i64x2(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+    unsafe {
+        let b = _mm512_broadcast_i64x2(a).as_i64x8();
+        transmute(simd_select_bitmask(k, b, src.as_i64x8()))
+    }
+}
+
+/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using zeromask k
+/// (elements are zeroed out if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i64x2&ig_expand=556)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m512i {
+    unsafe {
+        let b = _mm512_broadcast_i64x2(a).as_i64x8();
+        transmute(simd_select_bitmask(k, b, i64x8::ZERO))
+    }
+}
+
+// Extract
+
+/// Extracts 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a,
+/// selected with IMM8, and stores the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extractf32x8_ps&ig_expand=2946)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_extractf32x8_ps<const IMM8: i32>(a: __m512) -> __m256 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        match IMM8 & 1 {
+            0 => simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]),
+            _ => simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]),
+        }
+    }
+}
+
+/// Extracts 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a,
+/// selected with IMM8, and stores the result in dst using writemask k (elements are copied from src
+/// if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extractf32x8_ps&ig_expand=2947)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vextractf32x8, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_extractf32x8_ps<const IMM8: i32>(src: __m256, k: __mmask8, a: __m512) -> __m256 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let b = _mm512_extractf32x8_ps::<IMM8>(a);
+        transmute(simd_select_bitmask(k, b.as_f32x8(), src.as_f32x8()))
+    }
+}
+
+/// Extracts 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a,
+/// selected with IMM8, and stores the result in dst using zeromask k (elements are zeroed out if the
+/// corresponding bit is not set).
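+///
+/// A minimal usage sketch (illustrative only; `upper_half_masked` is a hypothetical helper),
+/// assuming `avx512dq` is available:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq")]
+/// fn upper_half_masked(a: __m512, k: __mmask8) -> __m256 {
+///     // IMM8 = 1 selects the upper 256 bits of `a`; lanes whose bit in `k` is clear are zeroed.
+///     _mm512_maskz_extractf32x8_ps::<1>(k, a)
+/// }
+/// ```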
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extractf32x8_ps&ig_expand=2948)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vextractf32x8, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_extractf32x8_ps<const IMM8: i32>(k: __mmask8, a: __m512) -> __m256 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let b = _mm512_extractf32x8_ps::<IMM8>(a);
+        transmute(simd_select_bitmask(k, b.as_f32x8(), f32x8::ZERO))
+    }
+}
+
+/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a,
+/// selected with IMM8, and stores the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf64x2_pd&ig_expand=2949)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_extractf64x2_pd<const IMM8: i32>(a: __m256d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        match IMM8 & 1 {
+            0 => simd_shuffle!(a, a, [0, 1]),
+            _ => simd_shuffle!(a, a, [2, 3]),
+        }
+    }
+}
+
+/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a,
+/// selected with IMM8, and stores the result in dst using writemask k (elements are copied from src
+/// if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_extractf64x2_pd&ig_expand=2950)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_mask_extractf64x2_pd<const IMM8: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m256d,
+) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let b = _mm256_extractf64x2_pd::<IMM8>(a);
+        transmute(simd_select_bitmask(k, b.as_f64x2(), src.as_f64x2()))
+    }
+}
+
+/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a,
+/// selected with IMM8, and stores the result in dst using zeromask k (elements are zeroed out if the
+/// corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_extractf64x2_pd&ig_expand=2951)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_maskz_extractf64x2_pd<const IMM8: i32>(k: __mmask8, a: __m256d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let b = _mm256_extractf64x2_pd::<IMM8>(a);
+        transmute(simd_select_bitmask(k, b.as_f64x2(), f64x2::ZERO))
+    }
+}
+
+/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a,
+/// selected with IMM8, and stores the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extractf64x2_pd&ig_expand=2952)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_extractf64x2_pd<const IMM8: i32>(a: __m512d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        match IMM8 & 3 {
+            0 => simd_shuffle!(a, a, [0, 1]),
+            1 => simd_shuffle!(a, a, [2, 3]),
+            2 => simd_shuffle!(a, a, [4, 5]),
+            _ => simd_shuffle!(a, a, [6, 7]),
+        }
+    }
+}
+
+/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a,
+/// selected with IMM8, and stores the result in dst using writemask k (elements are copied from src
+/// if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extractf64x2_pd&ig_expand=2953)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 3))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_extractf64x2_pd<const IMM8: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m512d,
+) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let b = _mm512_extractf64x2_pd::<IMM8>(a).as_f64x2();
+        transmute(simd_select_bitmask(k, b, src.as_f64x2()))
+    }
+}
+
+/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a,
+/// selected with IMM8, and stores the result in dst using zeromask k (elements are zeroed out if the
+/// corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extractf64x2_pd&ig_expand=2954)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 3))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_extractf64x2_pd<const IMM8: i32>(k: __mmask8, a: __m512d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let b = _mm512_extractf64x2_pd::<IMM8>(a).as_f64x2();
+        transmute(simd_select_bitmask(k, b, f64x2::ZERO))
+    }
+}
+
+/// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores
+/// the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extracti32x8_epi32&ig_expand=2965)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_extracti32x8_epi32<const IMM8: i32>(a: __m512i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let a = a.as_i32x16();
+        let b: i32x8 = match IMM8 & 1 {
+            0 => simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]),
+            _ => simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]),
+        };
+        transmute(b)
+    }
+}
+
+/// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores
+/// the result in dst using writemask k (elements are copied from src if the corresponding bit is not set).
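+///
+/// A minimal usage sketch (illustrative only; `low_half_merged` is a hypothetical helper),
+/// assuming `avx512dq` is available:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq")]
+/// fn low_half_merged(src: __m256i, k: __mmask8, a: __m512i) -> __m256i {
+///     // IMM8 = 0 selects the lower 256 bits of `a`; lanes whose bit in `k` is clear keep `src`.
+///     _mm512_mask_extracti32x8_epi32::<0>(src, k, a)
+/// }
+/// ```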
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extracti32x8_epi32&ig_expand=2966)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vextracti32x8, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_extracti32x8_epi32<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m512i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let b = _mm512_extracti32x8_epi32::<IMM8>(a).as_i32x8();
+        transmute(simd_select_bitmask(k, b, src.as_i32x8()))
+    }
+}
+
+/// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores
+/// the result in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extracti32x8_epi32&ig_expand=2967)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vextracti32x8, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_extracti32x8_epi32<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let b = _mm512_extracti32x8_epi32::<IMM8>(a).as_i32x8();
+        transmute(simd_select_bitmask(k, b, i32x8::ZERO))
+    }
+}
+
+/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores
+/// the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti64x2_epi64&ig_expand=2968)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_extracti64x2_epi64<const IMM8: i32>(a: __m256i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let a = a.as_i64x4();
+        match IMM8 & 1 {
+            0 => simd_shuffle!(a, a, [0, 1]),
+            _ => simd_shuffle!(a, a, [2, 3]),
+        }
+    }
+}
+
+/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores
+/// the result in dst using writemask k (elements are copied from src if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_extracti64x2_epi64&ig_expand=2969)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_mask_extracti64x2_epi64<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m256i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let b = _mm256_extracti64x2_epi64::<IMM8>(a).as_i64x2();
+        transmute(simd_select_bitmask(k, b, src.as_i64x2()))
+    }
+}
+
+/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores
+/// the result in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_extracti64x2_epi64&ig_expand=2970)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_maskz_extracti64x2_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let b = _mm256_extracti64x2_epi64::<IMM8>(a).as_i64x2();
+        transmute(simd_select_bitmask(k, b, i64x2::ZERO))
+    }
+}
+
+/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores
+/// the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extracti64x2_epi64&ig_expand=2971)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_extracti64x2_epi64<const IMM8: i32>(a: __m512i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let a = a.as_i64x8();
+        match IMM8 & 3 {
+            0 => simd_shuffle!(a, a, [0, 1]),
+            1 => simd_shuffle!(a, a, [2, 3]),
+            2 => simd_shuffle!(a, a, [4, 5]),
+            _ => simd_shuffle!(a, a, [6, 7]),
+        }
+    }
+}
+
+/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores
+/// the result in dst using writemask k (elements are copied from src if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extracti64x2_epi64&ig_expand=2972)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 3))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_extracti64x2_epi64<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m512i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let b = _mm512_extracti64x2_epi64::<IMM8>(a).as_i64x2();
+        transmute(simd_select_bitmask(k, b, src.as_i64x2()))
+    }
+}
+
+/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores
+/// the result in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extracti64x2_epi64&ig_expand=2973)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 3))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_extracti64x2_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let b = _mm512_extracti64x2_epi64::<IMM8>(a).as_i64x2();
+        transmute(simd_select_bitmask(k, b, i64x2::ZERO))
+    }
+}
+
+// Insert
+
+/// Copy a to dst, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point
+/// elements) from b into dst at the location specified by IMM8.
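+///
+/// A minimal usage sketch (illustrative only; `replace_upper_half` is a hypothetical helper),
+/// assuming `avx512dq` is available:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq")]
+/// fn replace_upper_half(a: __m512, b: __m256) -> __m512 {
+///     // IMM8 = 1 overwrites the upper 256 bits of `a` with `b`; the lower half is kept.
+///     _mm512_insertf32x8::<1>(a, b)
+/// }
+/// ```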
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_insertf32x8&ig_expand=3850)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_insertf32x8<const IMM8: i32>(a: __m512, b: __m256) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let b = _mm512_castps256_ps512(b);
+        match IMM8 & 1 {
+            0 => {
+                simd_shuffle!(
+                    a,
+                    b,
+                    [16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15]
+                )
+            }
+            _ => {
+                simd_shuffle!(
+                    a,
+                    b,
+                    [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]
+                )
+            }
+        }
+    }
+}
+
+/// Copy a to tmp, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point
+/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using writemask k
+/// (elements are copied from src if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_insertf32x8&ig_expand=3851)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vinsertf32x8, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_insertf32x8<const IMM8: i32>(
+    src: __m512,
+    k: __mmask16,
+    a: __m512,
+    b: __m256,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let c = _mm512_insertf32x8::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, c.as_f32x16(), src.as_f32x16()))
+    }
+}
+
+/// Copy a to tmp, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point
+/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using zeromask k
+/// (elements are zeroed out if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_insertf32x8&ig_expand=3852)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vinsertf32x8, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_insertf32x8<const IMM8: i32>(k: __mmask16, a: __m512, b: __m256) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let c = _mm512_insertf32x8::<IMM8>(a, b).as_f32x16();
+        transmute(simd_select_bitmask(k, c, f32x16::ZERO))
+    }
+}
+
+/// Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point
+/// elements) from b into dst at the location specified by IMM8.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf64x2&ig_expand=3853)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_insertf64x2<const IMM8: i32>(a: __m256d, b: __m128d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let b = _mm256_castpd128_pd256(b);
+        match IMM8 & 1 {
+            0 => simd_shuffle!(a, b, [4, 5, 2, 3]),
+            _ => simd_shuffle!(a, b, [0, 1, 4, 5]),
+        }
+    }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point
+/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using writemask k
+/// (elements are copied from src if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_insertf64x2&ig_expand=3854)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_mask_insertf64x2<const IMM8: i32>(
+    src: __m256d,
+    k: __mmask8,
+    a: __m256d,
+    b: __m128d,
+) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let c = _mm256_insertf64x2::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, c.as_f64x4(), src.as_f64x4()))
+    }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point
+/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using zeromask k
+/// (elements are zeroed out if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_insertf64x2&ig_expand=3855)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_maskz_insertf64x2<const IMM8: i32>(k: __mmask8, a: __m256d, b: __m128d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let c = _mm256_insertf64x2::<IMM8>(a, b).as_f64x4();
+        transmute(simd_select_bitmask(k, c, f64x4::ZERO))
+    }
+}
+
+/// Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point
+/// elements) from b into dst at the location specified by IMM8.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_insertf64x2&ig_expand=3856)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_insertf64x2<const IMM8: i32>(a: __m512d, b: __m128d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let b = _mm512_castpd128_pd512(b);
+        match IMM8 & 3 {
+            0 => simd_shuffle!(a, b, [8, 9, 2, 3, 4, 5, 6, 7]),
+            1 => simd_shuffle!(a, b, [0, 1, 8, 9, 4, 5, 6, 7]),
+            2 => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 6, 7]),
+            _ => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8, 9]),
+        }
+    }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point
+/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using writemask k
+/// (elements are copied from src if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_insertf64x2&ig_expand=3857)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 3))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_insertf64x2<const IMM8: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m128d,
+) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let c = _mm512_insertf64x2::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, c.as_f64x8(), src.as_f64x8()))
+    }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point
+/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using zeromask k
+/// (elements are zeroed out if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_insertf64x2&ig_expand=3858)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 3))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_insertf64x2<const IMM8: i32>(k: __mmask8, a: __m512d, b: __m128d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let c = _mm512_insertf64x2::<IMM8>(a, b).as_f64x8();
+        transmute(simd_select_bitmask(k, c, f64x8::ZERO))
+    }
+}
+
+/// Copy a to dst, then insert 256 bits (composed of 8 packed 32-bit integers) from b into dst at the
+/// location specified by IMM8.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_inserti32x8&ig_expand=3869)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_inserti32x8<const IMM8: i32>(a: __m512i, b: __m256i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let a = a.as_i32x16();
+        let b = _mm512_castsi256_si512(b).as_i32x16();
+        let r: i32x16 = match IMM8 & 1 {
+            0 => {
+                simd_shuffle!(
+                    a,
+                    b,
+                    [16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15]
+                )
+            }
+            _ => {
+                simd_shuffle!(
+                    a,
+                    b,
+                    [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]
+                )
+            }
+        };
+        transmute(r)
+    }
+}
+
+/// Copy a to tmp, then insert 256 bits (composed of 8 packed 32-bit integers) from b into tmp at the
+/// location specified by IMM8, and copy tmp to dst using writemask k (elements are copied from src if
+/// the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_inserti32x8&ig_expand=3870)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vinserti32x8, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_inserti32x8<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+    b: __m256i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let c = _mm512_inserti32x8::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, c.as_i32x16(), src.as_i32x16()))
+    }
+}
+
+/// Copy a to tmp, then insert 256 bits (composed of 8 packed 32-bit integers) from b into tmp at the
+/// location specified by IMM8, and copy tmp to dst using zeromask k (elements are zeroed out if the
+/// corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_inserti32x8&ig_expand=3871)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vinserti32x8, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_inserti32x8<const IMM8: i32>(k: __mmask16, a: __m512i, b: __m256i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let c = _mm512_inserti32x8::<IMM8>(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, c, i32x16::ZERO))
+    }
+}
+
+/// Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the
+/// location specified by IMM8.
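+///
+/// A minimal usage sketch (illustrative only; `replace_low_128` is a hypothetical helper),
+/// assuming `avx512dq` and `avx512vl` are available:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq,avx512vl")]
+/// fn replace_low_128(a: __m256i, b: __m128i) -> __m256i {
+///     // IMM8 = 0 overwrites the lower 128 bits of `a` with `b`; the upper 128 bits are kept.
+///     _mm256_inserti64x2::<0>(a, b)
+/// }
+/// ```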
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti64x2&ig_expand=3872)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_inserti64x2<const IMM8: i32>(a: __m256i, b: __m128i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let a = a.as_i64x4();
+        let b = _mm256_castsi128_si256(b).as_i64x4();
+        match IMM8 & 1 {
+            0 => simd_shuffle!(a, b, [4, 5, 2, 3]),
+            _ => simd_shuffle!(a, b, [0, 1, 4, 5]),
+        }
+    }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the
+/// location specified by IMM8, and copy tmp to dst using writemask k (elements are copied from src if
+/// the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_inserti64x2&ig_expand=3873)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_mask_inserti64x2<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m128i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let c = _mm256_inserti64x2::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, c.as_i64x4(), src.as_i64x4()))
+    }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the
+/// location specified by IMM8, and copy tmp to dst using zeromask k (elements are zeroed out if the
+/// corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_inserti64x2&ig_expand=3874)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_maskz_inserti64x2<const IMM8: i32>(k: __mmask8, a: __m256i, b: __m128i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let c = _mm256_inserti64x2::<IMM8>(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, c, i64x4::ZERO))
+    }
+}
+
+/// Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the
+/// location specified by IMM8.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_inserti64x2&ig_expand=3875)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_inserti64x2<const IMM8: i32>(a: __m512i, b: __m128i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let a = a.as_i64x8();
+        let b = _mm512_castsi128_si512(b).as_i64x8();
+        match IMM8 & 3 {
+            0 => simd_shuffle!(a, b, [8, 9, 2, 3, 4, 5, 6, 7]),
+            1 => simd_shuffle!(a, b, [0, 1, 8, 9, 4, 5, 6, 7]),
+            2 => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 6, 7]),
+            _ => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8, 9]),
+        }
+    }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the
+/// location specified by IMM8, and copy tmp to dst using writemask k (elements are copied from src if
+/// the corresponding bit is not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_inserti64x2&ig_expand=3876) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_inserti64x2( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m128i, +) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let c = _mm512_inserti64x2::(a, b); + transmute(simd_select_bitmask(k, c.as_i64x8(), src.as_i64x8())) + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the +/// location specified by IMM8, and copy tmp to dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_inserti64x2&ig_expand=3877) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_inserti64x2(k: __mmask8, a: __m512i, b: __m128i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let c = _mm512_inserti64x2::(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, c, i64x8::ZERO)) + } +} + +// Convert + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepi64_pd&ig_expand=1437) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2pd, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvt_roundepi64_pd(a: __m512i) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtqq2pd_512(a.as_i64x8(), ROUNDING)) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). 
Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepi64_pd&ig_expand=1438) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2pd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvt_roundepi64_pd( + src: __m512d, + k: __mmask8, + a: __m512i, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepi64_pd::(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, src.as_f64x8())) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepi64_pd&ig_expand=1439) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2pd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvt_roundepi64_pd(k: __mmask8, a: __m512i) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepi64_pd::(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, f64x8::ZERO)) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi64_pd&ig_expand=1705) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtepi64_pd(a: __m128i) -> __m128d { + unsafe { transmute(vcvtqq2pd_128(a.as_i64x2(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_pd&ig_expand=1706) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtepi64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let b = _mm_cvtepi64_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, src.as_f64x2())) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi64_pd&ig_expand=1707) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtepi64_pd(k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let b = _mm_cvtepi64_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, f64x2::ZERO)) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi64_pd&ig_expand=1708) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtepi64_pd(a: __m256i) -> __m256d { + unsafe { transmute(vcvtqq2pd_256(a.as_i64x4(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_pd&ig_expand=1709) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtepi64_pd(src: __m256d, k: __mmask8, a: __m256i) -> __m256d { + unsafe { + let b = _mm256_cvtepi64_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, src.as_f64x4())) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi64_pd&ig_expand=1710) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtepi64_pd(k: __mmask8, a: __m256i) -> __m256d { + unsafe { + let b = _mm256_cvtepi64_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, f64x4::ZERO)) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. 
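// Illustrative sketch (editorial, not taken from the patch above): how the IMM8
// const generic on the lane-insert intrinsics picks the 128-bit lane that gets
// replaced. Assumes a CPU with AVX-512DQ support and a toolchain in which these
// intrinsics are stable (1.89+); the helper name `demo_insert_lane` is ours.
#[cfg(target_arch = "x86_64")]
fn demo_insert_lane() {
    use core::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512dq") {
        return;
    }
    unsafe {
        let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_set_epi64x(-1, -2); // 128-bit lane holding [-2, -1]
        // IMM8 = 2 selects the third 128-bit lane, i.e. elements 4 and 5.
        let r = _mm512_inserti64x2::<2>(a, b);
        let out: [i64; 8] = core::mem::transmute(r);
        assert_eq!(out, [0, 1, 2, 3, -2, -1, 6, 7]);
    }
}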
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_pd&ig_expand=1711) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtepi64_pd(a: __m512i) -> __m512d { + unsafe { transmute(vcvtqq2pd_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_pd&ig_expand=1712) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvtepi64_pd(src: __m512d, k: __mmask8, a: __m512i) -> __m512d { + unsafe { + let b = _mm512_cvtepi64_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, src.as_f64x8())) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi64_pd&ig_expand=1713) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvtepi64_pd(k: __mmask8, a: __m512i) -> __m512d { + unsafe { + let b = _mm512_cvtepi64_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, f64x8::ZERO)) + } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepi64_ps&ig_expand=1443) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvt_roundepi64_ps<const ROUNDING: i32>(a: __m512i) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtqq2ps_512(a.as_i64x8(), ROUNDING)) + } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set).
Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepi64_ps&ig_expand=1444) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvt_roundepi64_ps<const ROUNDING: i32>( + src: __m256, + k: __mmask8, + a: __m512i, +) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepi64_ps::<ROUNDING>(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, src.as_f32x8())) + } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepi64_ps&ig_expand=1445) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvt_roundepi64_ps<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepi64_ps::<ROUNDING>(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, f32x8::ZERO)) + } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi64_ps&ig_expand=1723) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtepi64_ps(a: __m128i) -> __m128 { + _mm_mask_cvtepi64_ps(_mm_undefined_ps(), 0xff, a) +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_ps&ig_expand=1724) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtepi64_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { + unsafe { transmute(vcvtqq2ps_128(a.as_i64x2(), src.as_f32x4(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi64_ps&ig_expand=1725) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtepi64_ps(k: __mmask8, a: __m128i) -> __m128 { + _mm_mask_cvtepi64_ps(_mm_setzero_ps(), k, a) +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi64_ps&ig_expand=1726) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtepi64_ps(a: __m256i) -> __m128 { + unsafe { transmute(vcvtqq2ps_256(a.as_i64x4(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_ps&ig_expand=1727) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtepi64_ps(src: __m128, k: __mmask8, a: __m256i) -> __m128 { + unsafe { + let b = _mm256_cvtepi64_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, b, src.as_f32x4())) + } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi64_ps&ig_expand=1728) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtepi64_ps(k: __mmask8, a: __m256i) -> __m128 { + unsafe { + let b = _mm256_cvtepi64_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, b, f32x4::ZERO)) + } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst. 
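// Illustrative sketch (editorial, not taken from the patch above): the ROUNDING
// const generic of the cvt_round* intrinsics takes one of the _MM_FROUND_*
// combinations listed in the docs. Assumes AVX-512DQ at runtime and a 1.89+
// toolchain; `demo_cvt_round` is our own helper name.
#[cfg(target_arch = "x86_64")]
fn demo_cvt_round() {
    use core::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512dq") {
        return;
    }
    unsafe {
        let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
        // Round to nearest and suppress exceptions (the same encoding that the
        // assert_instr attributes above pin down as ROUNDING = 8).
        const RC: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let r = _mm512_cvt_roundepi64_ps::<RC>(a);
        let out: [f32; 8] = core::mem::transmute(r);
        assert_eq!(out, [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]);
    }
}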
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_ps&ig_expand=1729) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtepi64_ps(a: __m512i) -> __m256 { + unsafe { transmute(vcvtqq2ps_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_ps&ig_expand=1730) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvtepi64_ps(src: __m256, k: __mmask8, a: __m512i) -> __m256 { + unsafe { + let b = _mm512_cvtepi64_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, src.as_f32x8())) + } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi64_ps&ig_expand=1731) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvtepi64_ps(k: __mmask8, a: __m512i) -> __m256 { + unsafe { + let b = _mm512_cvtepi64_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, f32x8::ZERO)) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepu64_pd&ig_expand=1455) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvt_roundepu64_pd<const ROUNDING: i32>(a: __m512i) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtuqq2pd_512(a.as_u64x8(), ROUNDING)) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set).
Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepu64_pd&ig_expand=1456) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvt_roundepu64_pd<const ROUNDING: i32>( + src: __m512d, + k: __mmask8, + a: __m512i, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepu64_pd::<ROUNDING>(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, src.as_f64x8())) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepu64_pd&ig_expand=1457) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvt_roundepu64_pd<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepu64_pd::<ROUNDING>(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, f64x8::ZERO)) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu64_pd&ig_expand=1827) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtepu64_pd(a: __m128i) -> __m128d { + unsafe { transmute(vcvtuqq2pd_128(a.as_u64x2(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu64_pd&ig_expand=1828) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtepu64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let b = _mm_cvtepu64_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, src.as_f64x2())) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu64_pd&ig_expand=1829) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtepu64_pd(k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let b = _mm_cvtepu64_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, f64x2::ZERO)) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu64_pd&ig_expand=1830) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtepu64_pd(a: __m256i) -> __m256d { + unsafe { transmute(vcvtuqq2pd_256(a.as_u64x4(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu64_pd&ig_expand=1831) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtepu64_pd(src: __m256d, k: __mmask8, a: __m256i) -> __m256d { + unsafe { + let b = _mm256_cvtepu64_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, src.as_f64x4())) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu64_pd&ig_expand=1832) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtepu64_pd(k: __mmask8, a: __m256i) -> __m256d { + unsafe { + let b = _mm256_cvtepu64_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, f64x4::ZERO)) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. 
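// Illustrative sketch (editorial, not taken from the patch above): the writemask
// and zeromask conventions shared by the masked conversions, using the 256-bit
// epu64->pd forms defined just above. Assumes AVX-512DQ + AVX-512VL at runtime
// and a 1.89+ toolchain; `demo_masked_cvt` is our own helper name.
#[cfg(target_arch = "x86_64")]
fn demo_masked_cvt() {
    use core::arch::x86_64::*;
    if !(is_x86_feature_detected!("avx512dq") && is_x86_feature_detected!("avx512vl")) {
        return;
    }
    unsafe {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let src = _mm256_set1_pd(99.0);
        // Mask bits 0 and 2 are set, so only elements 0 and 2 are converted.
        let merged: [f64; 4] = core::mem::transmute(_mm256_mask_cvtepu64_pd(src, 0b0101, a));
        let zeroed: [f64; 4] = core::mem::transmute(_mm256_maskz_cvtepu64_pd(0b0101, a));
        assert_eq!(merged, [1.0, 99.0, 3.0, 99.0]); // unselected lanes keep src
        assert_eq!(zeroed, [1.0, 0.0, 3.0, 0.0]); // unselected lanes are zeroed
    }
}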
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu64_pd&ig_expand=1833) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtepu64_pd(a: __m512i) -> __m512d { + unsafe { transmute(vcvtuqq2pd_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu64_pd&ig_expand=1834) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvtepu64_pd(src: __m512d, k: __mmask8, a: __m512i) -> __m512d { + unsafe { + let b = _mm512_cvtepu64_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, src.as_f64x8())) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu64_pd&ig_expand=1835) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvtepu64_pd(k: __mmask8, a: __m512i) -> __m512d { + unsafe { + let b = _mm512_cvtepu64_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, f64x8::ZERO)) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepu64_ps&ig_expand=1461) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvt_roundepu64_ps<const ROUNDING: i32>(a: __m512i) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtuqq2ps_512(a.as_u64x8(), ROUNDING)) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set).
Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepu64_ps&ig_expand=1462) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvt_roundepu64_ps<const ROUNDING: i32>( + src: __m256, + k: __mmask8, + a: __m512i, +) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepu64_ps::<ROUNDING>(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, src.as_f32x8())) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepu64_ps&ig_expand=1463) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvt_roundepu64_ps<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepu64_ps::<ROUNDING>(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, f32x8::ZERO)) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu64_ps&ig_expand=1845) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtepu64_ps(a: __m128i) -> __m128 { + _mm_mask_cvtepu64_ps(_mm_undefined_ps(), 0xff, a) +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu64_ps&ig_expand=1846) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtepu64_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { + unsafe { transmute(vcvtuqq2ps_128(a.as_u64x2(), src.as_f32x4(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu64_ps&ig_expand=1847) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtepu64_ps(k: __mmask8, a: __m128i) -> __m128 { + _mm_mask_cvtepu64_ps(_mm_setzero_ps(), k, a) +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu64_ps&ig_expand=1848) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtepu64_ps(a: __m256i) -> __m128 { + unsafe { transmute(vcvtuqq2ps_256(a.as_u64x4(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu64_ps&ig_expand=1849) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtepu64_ps(src: __m128, k: __mmask8, a: __m256i) -> __m128 { + unsafe { + let b = _mm256_cvtepu64_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, b, src.as_f32x4())) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu64_ps&ig_expand=1850) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtepu64_ps(k: __mmask8, a: __m256i) -> __m128 { + unsafe { + let b = _mm256_cvtepu64_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, b, f32x4::ZERO)) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst. 
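// Illustrative sketch (editorial, not taken from the patch above): the epi64 and
// epu64 conversions differ only in how a lane's top bit is interpreted. Assumes
// AVX-512DQ + AVX-512VL at runtime and a 1.89+ toolchain; `demo_signedness` is
// our own helper name.
#[cfg(target_arch = "x86_64")]
fn demo_signedness() {
    use core::arch::x86_64::*;
    if !(is_x86_feature_detected!("avx512dq") && is_x86_feature_detected!("avx512vl")) {
        return;
    }
    unsafe {
        let a = _mm_set_epi64x(-1, 1); // lanes hold the bit patterns [1, u64::MAX]
        let signed: [f64; 2] = core::mem::transmute(_mm_cvtepi64_pd(a));
        let unsigned: [f64; 2] = core::mem::transmute(_mm_cvtepu64_pd(a));
        assert_eq!(signed, [1.0, -1.0]);
        assert_eq!(unsigned, [1.0, u64::MAX as f64]); // about 1.8447e19
    }
}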
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu64_ps&ig_expand=1851) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtepu64_ps(a: __m512i) -> __m256 { + unsafe { transmute(vcvtuqq2ps_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu64_ps&ig_expand=1852) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvtepu64_ps(src: __m256, k: __mmask8, a: __m512i) -> __m256 { + unsafe { + let b = _mm512_cvtepu64_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, src.as_f32x8())) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu64_ps&ig_expand=1853) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvtepu64_ps(k: __mmask8, a: __m512i) -> __m256 { + unsafe { + let b = _mm512_cvtepu64_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, f32x8::ZERO)) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_epi64&ig_expand=1472) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtpd2qq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvt_roundpd_epi64<const ROUNDING: i32>(a: __m512d) -> __m512i { + static_assert_rounding!(ROUNDING); + _mm512_mask_cvt_roundpd_epi64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set).
Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_epi64&ig_expand=1473) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtpd2qq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvt_roundpd_epi64<const ROUNDING: i32>( + src: __m512i, + k: __mmask8, + a: __m512d, +) -> __m512i { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtpd2qq_512(a.as_f64x8(), src.as_i64x8(), k, ROUNDING)) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundpd_epi64&ig_expand=1474) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtpd2qq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvt_roundpd_epi64<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m512i { + static_assert_rounding!(ROUNDING); + _mm512_mask_cvt_roundpd_epi64::<ROUNDING>(_mm512_setzero_si512(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi64&ig_expand=1941) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtpd_epi64(a: __m128d) -> __m128i { + _mm_mask_cvtpd_epi64(_mm_undefined_si128(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtpd_epi64&ig_expand=1942) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtpd_epi64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvtpd2qq_128(a.as_f64x2(), src.as_i64x2(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtpd_epi64&ig_expand=1943) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtpd_epi64(k: __mmask8, a: __m128d) -> __m128i { + _mm_mask_cvtpd_epi64(_mm_setzero_si128(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi64&ig_expand=1944) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtpd_epi64(a: __m256d) -> __m256i { + _mm256_mask_cvtpd_epi64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtpd_epi64&ig_expand=1945) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtpd_epi64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { + unsafe { transmute(vcvtpd2qq_256(a.as_f64x4(), src.as_i64x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtpd_epi64&ig_expand=1946) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtpd_epi64(k: __mmask8, a: __m256d) -> __m256i { + _mm256_mask_cvtpd_epi64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst. 
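// Illustrative sketch (editorial, not taken from the patch above): the ROUNDING
// parameter decides how non-integral doubles are mapped to i64 by the
// cvt_roundpd_epi64 intrinsics. Assumes AVX-512DQ at runtime and a 1.89+
// toolchain; `demo_round_modes` is our own helper name.
#[cfg(target_arch = "x86_64")]
fn demo_round_modes() {
    use core::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512dq") {
        return;
    }
    unsafe {
        let a = _mm512_set1_pd(2.5);
        const DOWN: i32 = _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
        const UP: i32 = _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
        let lo: [i64; 8] = core::mem::transmute(_mm512_cvt_roundpd_epi64::<DOWN>(a));
        let hi: [i64; 8] = core::mem::transmute(_mm512_cvt_roundpd_epi64::<UP>(a));
        assert_eq!(lo, [2i64; 8]); // 2.5 rounded toward negative infinity
        assert_eq!(hi, [3i64; 8]); // 2.5 rounded toward positive infinity
    }
}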
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_epi64&ig_expand=1947) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtpd_epi64(a: __m512d) -> __m512i { + _mm512_mask_cvtpd_epi64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_epi64&ig_expand=1948) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvtpd_epi64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i { + unsafe { + transmute(vcvtpd2qq_512( + a.as_f64x8(), + src.as_i64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtpd_epi64&ig_expand=1949) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvtpd_epi64(k: __mmask8, a: __m512d) -> __m512i { + _mm512_mask_cvtpd_epi64(_mm512_setzero_si512(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epi64&ig_expand=1514) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtps2qq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvt_roundps_epi64<const ROUNDING: i32>(a: __m256) -> __m512i { + static_assert_rounding!(ROUNDING); + _mm512_mask_cvt_roundps_epi64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set).
Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epi64&ig_expand=1515) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtps2qq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvt_roundps_epi64<const ROUNDING: i32>( + src: __m512i, + k: __mmask8, + a: __m256, +) -> __m512i { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtps2qq_512(a.as_f32x8(), src.as_i64x8(), k, ROUNDING)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epi64&ig_expand=1516) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtps2qq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvt_roundps_epi64<const ROUNDING: i32>(k: __mmask8, a: __m256) -> __m512i { + static_assert_rounding!(ROUNDING); + _mm512_mask_cvt_roundps_epi64::<ROUNDING>(_mm512_setzero_si512(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi64&ig_expand=2075) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtps_epi64(a: __m128) -> __m128i { + _mm_mask_cvtps_epi64(_mm_undefined_si128(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_epi64&ig_expand=2076) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtps_epi64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvtps2qq_128(a.as_f32x4(), src.as_i64x2(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_epi64&ig_expand=2077) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtps_epi64(k: __mmask8, a: __m128) -> __m128i { + _mm_mask_cvtps_epi64(_mm_setzero_si128(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi64&ig_expand=2078) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtps_epi64(a: __m128) -> __m256i { + _mm256_mask_cvtps_epi64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_epi64&ig_expand=2079) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtps_epi64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { + unsafe { transmute(vcvtps2qq_256(a.as_f32x4(), src.as_i64x4(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_epi64&ig_expand=2080) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtps_epi64(k: __mmask8, a: __m128) -> __m256i { + _mm256_mask_cvtps_epi64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst. 
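// Illustrative sketch (editorial, not taken from the patch above): the ps->epi64
// conversions widen, so four f32 lanes in a __m128 become four i64 lanes in a
// __m256i; without an explicit ROUNDING they follow the current MXCSR mode
// (round-to-nearest-even unless the program has changed it). Assumes AVX-512DQ +
// AVX-512VL at runtime and a 1.89+ toolchain; `demo_ps_to_epi64` is our name.
#[cfg(target_arch = "x86_64")]
fn demo_ps_to_epi64() {
    use core::arch::x86_64::*;
    if !(is_x86_feature_detected!("avx512dq") && is_x86_feature_detected!("avx512vl")) {
        return;
    }
    unsafe {
        let a = _mm_setr_ps(1.25, -2.75, 3.5, 4.0);
        let out: [i64; 4] = core::mem::transmute(_mm256_cvtps_epi64(a));
        assert_eq!(out, [1, -3, 4, 4]); // round-to-nearest-even takes 3.5 to 4
    }
}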
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_epi64&ig_expand=2081) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtps_epi64(a: __m256) -> __m512i { + _mm512_mask_cvtps_epi64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_epi64&ig_expand=2082) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvtps_epi64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { + unsafe { + transmute(vcvtps2qq_512( + a.as_f32x8(), + src.as_i64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_epi64&ig_expand=2083) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvtps_epi64(k: __mmask8, a: __m256) -> __m512i { + _mm512_mask_cvtps_epi64(_mm512_setzero_si512(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_epu64&ig_expand=1478) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtpd2uqq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvt_roundpd_epu64(a: __m512d) -> __m512i { + static_assert_rounding!(ROUNDING); + _mm512_mask_cvt_roundpd_epu64::(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). 
Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_epu64&ig_expand=1479)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtpd2uqq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvt_roundpd_epu64<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512d,
+) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtpd2uqq_512(a.as_f64x8(), src.as_u64x8(), k, ROUNDING))
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+/// Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundpd_epu64&ig_expand=1480)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtpd2uqq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvt_roundpd_epu64<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundpd_epu64::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epu64&ig_expand=1959)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_cvtpd_epu64(a: __m128d) -> __m128i {
+    _mm_mask_cvtpd_epu64(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set).
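// Illustrative sketch (not part of the patch above): selecting a rounding mode at the
// call site through the ROUNDING const parameter described in the docs, using the
// zero-masked form with an all-ones mask so it behaves like the unmasked conversion.
// The demo function name and inputs are invented; it assumes AVX512DQ hardware,
// Rust 1.89+, and prior `is_x86_feature_detected!("avx512dq")` by the caller.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512dq")]
unsafe fn rounding_mode_demo() -> ([u64; 8], [u64; 8]) {
    use core::arch::x86_64::*;
    use core::mem::transmute;

    const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
    const UP: i32 = _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;

    let a = _mm512_set1_pd(2.5);

    // Round to nearest, ties to even: 2.5 -> 2 in every lane.
    let nearest: [u64; 8] = transmute(_mm512_maskz_cvt_roundpd_epu64::<NEAREST>(0xff, a));
    // Round toward +infinity: 2.5 -> 3 in every lane.
    let up: [u64; 8] = transmute(_mm512_maskz_cvt_roundpd_epu64::<UP>(0xff, a));

    (nearest, up) // ([2; 8], [3; 8])
}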
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtpd_epu64&ig_expand=1960) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtpd_epu64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvtpd2uqq_128(a.as_f64x2(), src.as_u64x2(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtpd_epu64&ig_expand=1961) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtpd_epu64(k: __mmask8, a: __m128d) -> __m128i { + _mm_mask_cvtpd_epu64(_mm_setzero_si128(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epu64&ig_expand=1962) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtpd_epu64(a: __m256d) -> __m256i { + _mm256_mask_cvtpd_epu64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtpd_epu64&ig_expand=1963) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtpd_epu64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { + unsafe { transmute(vcvtpd2uqq_256(a.as_f64x4(), src.as_u64x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtpd_epu64&ig_expand=1964) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtpd_epu64(k: __mmask8, a: __m256d) -> __m256i { + _mm256_mask_cvtpd_epu64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_epu64&ig_expand=1965) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtpd_epu64(a: __m512d) -> __m512i { + _mm512_mask_cvtpd_epu64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_epu64&ig_expand=1966) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvtpd_epu64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i { + unsafe { + transmute(vcvtpd2uqq_512( + a.as_f64x8(), + src.as_u64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtpd_epu64&ig_expand=1967) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvtpd_epu64(k: __mmask8, a: __m512d) -> __m512i { + _mm512_mask_cvtpd_epu64(_mm512_setzero_si512(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epu64&ig_expand=1520) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtps2uqq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvt_roundps_epu64(a: __m256) -> __m512i { + static_assert_rounding!(ROUNDING); + _mm512_mask_cvt_roundps_epu64::(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). 
Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epu64&ig_expand=1521) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtps2uqq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvt_roundps_epu64( + src: __m512i, + k: __mmask8, + a: __m256, +) -> __m512i { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtps2uqq_512(a.as_f32x8(), src.as_u64x8(), k, ROUNDING)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epu64&ig_expand=1522) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtps2uqq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvt_roundps_epu64(k: __mmask8, a: __m256) -> __m512i { + static_assert_rounding!(ROUNDING); + _mm512_mask_cvt_roundps_epu64::(_mm512_setzero_si512(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epu64&ig_expand=2093) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtps_epu64(a: __m128) -> __m128i { + _mm_mask_cvtps_epu64(_mm_undefined_si128(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_epu64&ig_expand=2094) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtps_epu64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvtps2uqq_128(a.as_f32x4(), src.as_u64x2(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_epu64&ig_expand=2095) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtps_epu64(k: __mmask8, a: __m128) -> __m128i { + _mm_mask_cvtps_epu64(_mm_setzero_si128(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epu64&ig_expand=2096) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtps_epu64(a: __m128) -> __m256i { + _mm256_mask_cvtps_epu64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_epu64&ig_expand=2097) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtps_epu64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { + unsafe { transmute(vcvtps2uqq_256(a.as_f32x4(), src.as_u64x4(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_epu64&ig_expand=2098) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtps_epu64(k: __mmask8, a: __m128) -> __m256i { + _mm256_mask_cvtps_epu64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_epu64&ig_expand=2099) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtps_epu64(a: __m256) -> __m512i { + _mm512_mask_cvtps_epu64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_epu64&ig_expand=2100) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvtps_epu64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { + unsafe { + transmute(vcvtps2uqq_512( + a.as_f32x8(), + src.as_u64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_epu64&ig_expand=2101) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvtps_epu64(k: __mmask8, a: __m256) -> __m512i { + _mm512_mask_cvtps_epu64(_mm512_setzero_si512(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC +/// to the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundpd_epi64&ig_expand=2264) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttpd2qq, SAE = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtt_roundpd_epi64(a: __m512d) -> __m512i { + static_assert_sae!(SAE); + _mm512_mask_cvtt_roundpd_epi64::(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundpd_epi64&ig_expand=2265) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttpd2qq, SAE = 8))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvtt_roundpd_epi64( + src: __m512i, + k: __mmask8, + a: __m512d, +) -> __m512i { + unsafe { + static_assert_sae!(SAE); + transmute(vcvttpd2qq_512(a.as_f64x8(), src.as_i64x8(), k, SAE)) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundpd_epi64&ig_expand=2266) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttpd2qq, SAE = 8))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvtt_roundpd_epi64(k: __mmask8, a: __m512d) -> __m512i { + static_assert_sae!(SAE); + _mm512_mask_cvtt_roundpd_epi64::(_mm512_setzero_si512(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi64&ig_expand=2329) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvttpd_epi64(a: __m128d) -> __m128i { + _mm_mask_cvttpd_epi64(_mm_undefined_si128(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttpd_epi64&ig_expand=2330) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvttpd_epi64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2qq_128(a.as_f64x2(), src.as_i64x2(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttpd_epi64&ig_expand=2331) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvttpd_epi64(k: __mmask8, a: __m128d) -> __m128i { + _mm_mask_cvttpd_epi64(_mm_setzero_si128(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi64&ig_expand=2332) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvttpd_epi64(a: __m256d) -> __m256i { + _mm256_mask_cvttpd_epi64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttpd_epi64&ig_expand=2333) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvttpd_epi64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { + unsafe { transmute(vcvttpd2qq_256(a.as_f64x4(), src.as_i64x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttpd_epi64&ig_expand=2334) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvttpd_epi64(k: __mmask8, a: __m256d) -> __m256i { + _mm256_mask_cvttpd_epi64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttpd_epi64&ig_expand=2335) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvttpd_epi64(a: __m512d) -> __m512i { + _mm512_mask_cvttpd_epi64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). 
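// Illustrative sketch (not part of the patch above): the `cvtt*` intrinsics in this block
// always truncate toward zero, whereas their non-truncating `cvt*` counterparts (such as
// `_mm_cvtpd_epi64`, defined elsewhere in this family) round according to `MXCSR.RC`,
// which is round-to-nearest-even by default. The demo name and inputs are invented; it
// assumes AVX512DQ+AVX512VL hardware, Rust 1.89+, and prior feature detection.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512dq,avx512vl")]
unsafe fn cvt_vs_cvtt_demo() -> ([i64; 2], [i64; 2]) {
    use core::arch::x86_64::*;
    use core::mem::transmute;

    let a = _mm_set_pd(-2.7, 2.7); // lanes (low to high): 2.7, -2.7

    // Rounds under the current MXCSR mode (default nearest-even) -> [3, -3]
    let rounded: [i64; 2] = transmute(_mm_cvtpd_epi64(a));
    // Truncates toward zero -> [2, -2]
    let truncated: [i64; 2] = transmute(_mm_cvttpd_epi64(a));

    (rounded, truncated)
}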
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttpd_epi64&ig_expand=2336) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvttpd_epi64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i { + unsafe { + transmute(vcvttpd2qq_512( + a.as_f64x8(), + src.as_i64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttpd_epi64&ig_expand=2337) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvttpd_epi64(k: __mmask8, a: __m512d) -> __m512i { + _mm512_mask_cvttpd_epi64(_mm512_setzero_si512(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC +/// to the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epi64&ig_expand=2294) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2qq, SAE = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtt_roundps_epi64(a: __m256) -> __m512i { + static_assert_sae!(SAE); + _mm512_mask_cvtt_roundps_epi64::(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundps_epi64&ig_expand=2295) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2qq, SAE = 8))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvtt_roundps_epi64( + src: __m512i, + k: __mmask8, + a: __m256, +) -> __m512i { + unsafe { + static_assert_sae!(SAE); + transmute(vcvttps2qq_512(a.as_f32x8(), src.as_i64x8(), k, SAE)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundps_epi64&ig_expand=2296) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2qq, SAE = 8))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvtt_roundps_epi64(k: __mmask8, a: __m256) -> __m512i { + static_assert_sae!(SAE); + _mm512_mask_cvtt_roundps_epi64::(_mm512_setzero_si512(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi64&ig_expand=2420) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvttps_epi64(a: __m128) -> __m128i { + _mm_mask_cvttps_epi64(_mm_undefined_si128(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttps_epi64&ig_expand=2421) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvttps_epi64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvttps2qq_128(a.as_f32x4(), src.as_i64x2(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttps_epi64&ig_expand=2422) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvttps_epi64(k: __mmask8, a: __m128) -> __m128i { + _mm_mask_cvttps_epi64(_mm_setzero_si128(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi64&ig_expand=2423) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvttps_epi64(a: __m128) -> __m256i { + _mm256_mask_cvttps_epi64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttps_epi64&ig_expand=2424) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvttps_epi64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { + unsafe { transmute(vcvttps2qq_256(a.as_f32x4(), src.as_i64x4(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttps_epi64&ig_expand=2425) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvttps_epi64(k: __mmask8, a: __m128) -> __m256i { + _mm256_mask_cvttps_epi64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttps_epi64&ig_expand=2426) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvttps_epi64(a: __m256) -> __m512i { + _mm512_mask_cvttps_epi64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttps_epi64&ig_expand=2427) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvttps_epi64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { + unsafe { + transmute(vcvttps2qq_512( + a.as_f32x8(), + src.as_i64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttps_epi64&ig_expand=2428) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvttps_epi64(k: __mmask8, a: __m256) -> __m512i { + _mm512_mask_cvttps_epi64(_mm512_setzero_si512(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC +/// to the sae parameter. 
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundpd_epu64&ig_expand=1965)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvtt_roundpd_epu64<const SAE: i32>(a: __m512d) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundpd_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst using writemask k (elements are copied from src if the
+/// corresponding bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundpd_epu64&ig_expand=1966)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvtt_roundpd_epu64<const SAE: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512d,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttpd2uqq_512(a.as_f64x8(), src.as_u64x8(), k, SAE))
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding
+/// bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundpd_epu64&ig_expand=1967)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvtt_roundpd_epu64<const SAE: i32>(k: __mmask8, a: __m512d) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundpd_epu64::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epu64&ig_expand=2347)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttpd2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_cvttpd_epu64(a: __m128d) -> __m128i {
+    _mm_mask_cvttpd_epu64(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst using writemask k (elements are copied from src if the corresponding
+/// bit is not set).
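// Illustrative sketch (not part of the patch above): for the `*_cvtt_round*` intrinsics the
// SAE const parameter does not change the result (truncation is fixed); it only decides
// whether floating-point exception flags are raised. Passing `_MM_FROUND_NO_EXC` suppresses
// them, `_MM_FROUND_CUR_DIRECTION` leaves them observable in MXCSR. The demo name and input
// are invented; it assumes AVX512DQ hardware, Rust 1.89+, and prior feature detection.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512dq")]
unsafe fn sae_demo() -> [u64; 8] {
    use core::arch::x86_64::*;
    use core::mem::transmute;

    let a = _mm512_set1_pd(7.9);
    // Truncation with exceptions suppressed: 7.9 -> 7 in every lane.
    transmute(_mm512_cvtt_roundpd_epu64::<_MM_FROUND_NO_EXC>(a))
}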
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttpd_epu64&ig_expand=2348) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvttpd_epu64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2uqq_128(a.as_f64x2(), src.as_u64x2(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttpd_epu64&ig_expand=2349) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvttpd_epu64(k: __mmask8, a: __m128d) -> __m128i { + _mm_mask_cvttpd_epu64(_mm_setzero_si128(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epu64&ig_expand=2350) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvttpd_epu64(a: __m256d) -> __m256i { + _mm256_mask_cvttpd_epu64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttpd_epu64&ig_expand=2351) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvttpd_epu64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { + unsafe { transmute(vcvttpd2uqq_256(a.as_f64x4(), src.as_u64x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the results in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttpd_epu64&ig_expand=2352) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvttpd_epu64(k: __mmask8, a: __m256d) -> __m256i { + _mm256_mask_cvttpd_epu64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst. 
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttpd_epu64&ig_expand=2353)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvttpd_epu64(a: __m512d) -> __m512i {
+    _mm512_mask_cvttpd_epu64(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst using writemask k (elements are copied from src if the corresponding
+/// bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttpd_epu64&ig_expand=2354)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvttpd_epu64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i {
+    unsafe {
+        transmute(vcvttpd2uqq_512(
+            a.as_f64x8(),
+            src.as_u64x8(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding
+/// bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttpd_epu64&ig_expand=2355)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvttpd_epu64(k: __mmask8, a: __m512d) -> __m512i {
+    _mm512_mask_cvttpd_epu64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
+/// to the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epu64&ig_expand=2300)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttps2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvtt_roundps_epu64<const SAE: i32>(a: __m256) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundps_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst using writemask k (elements are copied from src if the
+/// corresponding bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter.
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundps_epu64&ig_expand=2301) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2uqq, SAE = 8))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvtt_roundps_epu64( + src: __m512i, + k: __mmask8, + a: __m256, +) -> __m512i { + unsafe { + static_assert_sae!(SAE); + transmute(vcvttps2uqq_512(a.as_f32x8(), src.as_u64x8(), k, SAE)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundps_epu64&ig_expand=2302) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2uqq, SAE = 8))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvtt_roundps_epu64(k: __mmask8, a: __m256) -> __m512i { + static_assert_sae!(SAE); + _mm512_mask_cvtt_roundps_epu64::(_mm512_setzero_si512(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epu64&ig_expand=2438) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvttps_epu64(a: __m128) -> __m128i { + _mm_mask_cvttps_epu64(_mm_undefined_si128(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttps_epu64&ig_expand=2439) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvttps_epu64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvttps2uqq_128(a.as_f32x4(), src.as_u64x2(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttps_epu64&ig_expand=2440) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvttps_epu64(k: __mmask8, a: __m128) -> __m128i { + _mm_mask_cvttps_epu64(_mm_setzero_si128(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epu64&ig_expand=2441) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvttps_epu64(a: __m128) -> __m256i { + _mm256_mask_cvttps_epu64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttps_epu64&ig_expand=2442) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvttps_epu64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { + unsafe { transmute(vcvttps2uqq_256(a.as_f32x4(), src.as_u64x4(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttps_epu64&ig_expand=2443) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvttps_epu64(k: __mmask8, a: __m128) -> __m256i { + _mm256_mask_cvttps_epu64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttps_epu64&ig_expand=2444) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvttps_epu64(a: __m256) -> __m512i { + _mm512_mask_cvttps_epu64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttps_epu64&ig_expand=2445) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvttps_epu64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { + unsafe { + transmute(vcvttps2uqq_512( + a.as_f32x8(), + src.as_u64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttps_epu64&ig_expand=2446) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvttps_epu64(k: __mmask8, a: __m256) -> __m512i { + _mm512_mask_cvttps_epu64(_mm512_setzero_si512(), k, a) +} + +// Multiply-Low + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst`. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi64&ig_expand=4778) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mullo_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_mul(a.as_i64x2(), b.as_i64x2())) } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst` using writemask `k` (elements are copied from +/// `src` if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mullo_epi64&ig_expand=4776) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_mullo_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let b = _mm_mullo_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, b, src.as_i64x2())) + } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst` using zeromask `k` (elements are zeroed out if +/// the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mullo_epi64&ig_expand=4777) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_mullo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let b = _mm_mullo_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, b, i64x2::ZERO)) + } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst`. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi64&ig_expand=4781) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mullo_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_mul(a.as_i64x4(), b.as_i64x4())) } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst` using writemask `k` (elements are copied from +/// `src` if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mullo_epi64&ig_expand=4779) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_mullo_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let b = _mm256_mullo_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, b, src.as_i64x4())) + } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst` using zeromask `k` (elements are zeroed out if +/// the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mullo_epi64&ig_expand=4780) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_mullo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let b = _mm256_mullo_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, b, i64x4::ZERO)) + } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst`. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mullo_epi64&ig_expand=4784) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mullo_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_mul(a.as_i64x8(), b.as_i64x8())) } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst` using writemask `k` (elements are copied from +/// `src` if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mullo_epi64&ig_expand=4782) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_mullo_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let b = _mm512_mullo_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, b, src.as_i64x8())) + } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst` using zeromask `k` (elements are zeroed out if +/// the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mullo_epi64&ig_expand=4783) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_mullo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let b = _mm512_mullo_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, b, i64x8::ZERO)) + } +} + +// Mask Registers + +/// Convert 8-bit mask a to a 32-bit integer value and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtmask8_u32&ig_expand=1891) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _cvtmask8_u32(a: __mmask8) -> u32 { + a as u32 +} + +/// Convert 32-bit integer value a to an 8-bit mask and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu32_mask8&ig_expand=2467) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _cvtu32_mask8(a: u32) -> __mmask8 { + a as __mmask8 +} + +/// Add 16-bit masks a and b, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kadd_mask16&ig_expand=3903) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kadd_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { + a + b +} + +/// Add 8-bit masks a and b, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kadd_mask8&ig_expand=3906) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kadd_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { + a + b +} + +/// Bitwise AND of 8-bit masks a and b, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kand_mask8&ig_expand=3911) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kand_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { + a & b +} + +/// Bitwise AND NOT of 8-bit masks a and b, and store the result in dst. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask8&ig_expand=3916) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kandn_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { + _knot_mask8(a) & b +} + +/// Bitwise NOT of 8-bit mask a, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask8&ig_expand=3922) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _knot_mask8(a: __mmask8) -> __mmask8 { + a ^ 0b11111111 +} + +/// Bitwise OR of 8-bit masks a and b, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask8&ig_expand=3927) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { + a | b +} + +/// Bitwise XNOR of 8-bit masks a and b, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask8&ig_expand=3969) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kxnor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { + _knot_mask8(_kxor_mask8(a, b)) +} + +/// Bitwise XOR of 8-bit masks a and b, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask8&ig_expand=3974) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kxor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { + a ^ b +} + +/// Compute the bitwise OR of 8-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask8_u8&ig_expand=3931) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _kortest_mask8_u8(a: __mmask8, b: __mmask8, all_ones: *mut u8) -> u8 { + let tmp = _kor_mask8(a, b); + *all_ones = (tmp == 0xff) as u8; + (tmp == 0) as u8 +} + +/// Compute the bitwise OR of 8-bit masks a and b. If the result is all ones, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask8_u8&ig_expand=3936) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kortestc_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { + (_kor_mask8(a, b) == 0xff) as u8 +} + +/// Compute the bitwise OR of 8-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. 
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask8_u8&ig_expand=3941)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kortestz_mask8_u8(a: __mmask8, b: __mmask8) -> u8 {
+    (_kor_mask8(a, b) == 0) as u8
+}
+
+/// Shift 8-bit mask a left by count bits while shifting in zeros, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask8&ig_expand=3945)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftli_mask8<const COUNT: u32>(a: __mmask8) -> __mmask8 {
+    a << COUNT
+}
+
+/// Shift 8-bit mask a right by count bits while shifting in zeros, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask8&ig_expand=3949)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftri_mask8<const COUNT: u32>(a: __mmask8) -> __mmask8 {
+    a >> COUNT
+}
+
+/// Compute the bitwise AND of 16-bit masks a and b, and if the result is all zeros, store 1 in dst,
+/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all
+/// zeros, store 1 in and_not, otherwise store 0 in and_not.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask16_u8&ig_expand=3950)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _ktest_mask16_u8(a: __mmask16, b: __mmask16, and_not: *mut u8) -> u8 {
+    *and_not = (_kandn_mask16(a, b) == 0) as u8;
+    (_kand_mask16(a, b) == 0) as u8
+}
+
+/// Compute the bitwise AND of 8-bit masks a and b, and if the result is all zeros, store 1 in dst,
+/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all
+/// zeros, store 1 in and_not, otherwise store 0 in and_not.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask8_u8&ig_expand=3953)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _ktest_mask8_u8(a: __mmask8, b: __mmask8, and_not: *mut u8) -> u8 {
+    *and_not = (_kandn_mask8(a, b) == 0) as u8;
+    (_kand_mask8(a, b) == 0) as u8
+}
+
+/// Compute the bitwise NOT of 16-bit mask a and then AND with 16-bit mask b, if the result is all
+/// zeros, store 1 in dst, otherwise store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask16_u8&ig_expand=3954)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _ktestc_mask16_u8(a: __mmask16, b: __mmask16) -> u8 {
+    (_kandn_mask16(a, b) == 0) as u8
+}
+
+/// Compute the bitwise NOT of 8-bit mask a and then AND with 8-bit mask b, if the result is all
+/// zeros, store 1 in dst, otherwise store 0 in dst.
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask8_u8&ig_expand=3957) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _ktestc_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { + (_kandn_mask8(a, b) == 0) as u8 +} + +/// Compute the bitwise AND of 16-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask16_u8&ig_expand=3958) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _ktestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { + (_kand_mask16(a, b) == 0) as u8 +} + +/// Compute the bitwise AND of 8-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask8_u8&ig_expand=3961) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _ktestz_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { + (_kand_mask8(a, b) == 0) as u8 +} + +/// Load 8-bit mask from memory +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_load_mask8&ig_expand=3999) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _load_mask8(mem_addr: *const __mmask8) -> __mmask8 { + *mem_addr +} + +/// Store 8-bit mask to memory +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_store_mask8&ig_expand=6468) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _store_mask8(mem_addr: *mut __mmask8, a: __mmask8) { + *mem_addr = a; +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 32-bit +/// integer in a. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi32_mask&ig_expand=4612) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_movepi32_mask(a: __m128i) -> __mmask8 { + let zero = _mm_setzero_si128(); + _mm_cmplt_epi32_mask(a, zero) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 32-bit +/// integer in a. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movepi32_mask&ig_expand=4613) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_movepi32_mask(a: __m256i) -> __mmask8 { + let zero = _mm256_setzero_si256(); + _mm256_cmplt_epi32_mask(a, zero) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 32-bit +/// integer in a. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movepi32_mask&ig_expand=4614) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_movepi32_mask(a: __m512i) -> __mmask16 { + let zero = _mm512_setzero_si512(); + _mm512_cmplt_epi32_mask(a, zero) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 64-bit +/// integer in a. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_mask&ig_expand=4615) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_movepi64_mask(a: __m128i) -> __mmask8 { + let zero = _mm_setzero_si128(); + _mm_cmplt_epi64_mask(a, zero) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 64-bit +/// integer in a. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movepi64_mask&ig_expand=4616) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_movepi64_mask(a: __m256i) -> __mmask8 { + let zero = _mm256_setzero_si256(); + _mm256_cmplt_epi64_mask(a, zero) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 64-bit +/// integer in a. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movepi64_mask&ig_expand=4617) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_movepi64_mask(a: __m512i) -> __mmask8 { + let zero = _mm512_setzero_si512(); + _mm512_cmplt_epi64_mask(a, zero) +} + +/// Set each packed 32-bit integer in dst to all ones or all zeros based on the value of the corresponding +/// bit in k. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movm_epi32&ig_expand=4625) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovm2d))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_movm_epi32(k: __mmask8) -> __m128i { + let ones = _mm_set1_epi32(-1); + _mm_maskz_mov_epi32(k, ones) +} + +/// Set each packed 32-bit integer in dst to all ones or all zeros based on the value of the corresponding +/// bit in k. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movm_epi32&ig_expand=4626) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovm2d))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_movm_epi32(k: __mmask8) -> __m256i { + let ones = _mm256_set1_epi32(-1); + _mm256_maskz_mov_epi32(k, ones) +} + +/// Set each packed 32-bit integer in dst to all ones or all zeros based on the value of the corresponding +/// bit in k. 
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movm_epi32&ig_expand=4627)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vpmovm2d))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_movm_epi32(k: __mmask16) -> __m512i {
+    let ones = _mm512_set1_epi32(-1);
+    _mm512_maskz_mov_epi32(k, ones)
+}
+
+/// Set each packed 64-bit integer in dst to all ones or all zeros based on the value of the corresponding
+/// bit in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movm_epi64&ig_expand=4628)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovm2q))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_movm_epi64(k: __mmask8) -> __m128i {
+    let ones = _mm_set1_epi64x(-1);
+    _mm_maskz_mov_epi64(k, ones)
+}
+
+/// Set each packed 64-bit integer in dst to all ones or all zeros based on the value of the corresponding
+/// bit in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movm_epi64&ig_expand=4629)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovm2q))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_movm_epi64(k: __mmask8) -> __m256i {
+    let ones = _mm256_set1_epi64x(-1);
+    _mm256_maskz_mov_epi64(k, ones)
+}
+
+/// Set each packed 64-bit integer in dst to all ones or all zeros based on the value of the corresponding
+/// bit in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movm_epi64&ig_expand=4630)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vpmovm2q))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_movm_epi64(k: __mmask8) -> __m512i {
+    let ones = _mm512_set1_epi64(-1);
+    _mm512_maskz_mov_epi64(k, ones)
+}
+
+// Range
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_range_round_pd&ig_expand=5210)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_range_round_pd<const IMM8: i32, const SAE: i32>(a: __m512d, b: __m512d) -> __m512d {
+    static_assert_uimm_bits!(IMM8, 4);
+    static_assert_sae!(SAE);
+    _mm512_mask_range_round_pd::<IMM8, SAE>(_mm512_setzero_pd(), 0xff, a, b)
+}
+
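+// NOTE (editorial sketch, not part of the original patch): a minimal usage
+// example for the `vrange*` IMM8 encoding documented above. Bits 1:0 pick the
+// operation (00 min, 01 max, 10 absolute min, 11 absolute max) and bits 3:2
+// pick the sign source (00 from `a`, 01 from the compare result, 10 cleared,
+// 11 set). The module, function and constant names below are illustrative
+// only, and the test assumes a `std` test harness plus an AVX-512DQ CPU.
+#[cfg(test)]
+mod range_imm8_usage_sketch {
+    use core::arch::x86_64::*;
+
+    // IMM8 = 0b0101: per-lane maximum (bits 1:0 = 01), sign taken from the
+    // compare result (bits 3:2 = 01), i.e. an ordinary signed maximum.
+    const MAX_SIGN_FROM_CMP: i32 = 0b0101;
+
+    /// # Safety
+    /// The caller must ensure AVX-512F and AVX-512DQ are available.
+    #[target_feature(enable = "avx512f,avx512dq")]
+    unsafe fn max_lane0(x: f64, y: f64) -> f64 {
+        let a = _mm512_set1_pd(x);
+        let b = _mm512_set1_pd(y);
+        // `_MM_FROUND_CUR_DIRECTION` keeps the current MXCSR behaviour.
+        let r = _mm512_range_round_pd::<MAX_SIGN_FROM_CMP, _MM_FROUND_CUR_DIRECTION>(a, b);
+        _mm_cvtsd_f64(_mm512_castpd512_pd128(r))
+    }
+
+    #[test]
+    fn signed_max_per_lane() {
+        if !std::arch::is_x86_feature_detected!("avx512dq") {
+            return; // AVX-512DQ implies AVX-512F; skip on older CPUs
+        }
+        // SAFETY: the runtime check above guarantees the required features.
+        assert_eq!(unsafe { max_lane0(-3.0, 2.0) }, 2.0);
+    }
+}
+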
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
+/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_range_round_pd&ig_expand=5208)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_range_round_pd<const IMM8: i32, const SAE: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 4);
+        static_assert_sae!(SAE);
+        transmute(vrangepd_512(
+            a.as_f64x8(),
+            b.as_f64x8(),
+            IMM8,
+            src.as_f64x8(),
+            k,
+            SAE,
+        ))
+    }
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
+/// zeromask k (elements are zeroed out if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_range_round_pd&ig_expand=5209)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_range_round_pd<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    static_assert_uimm_bits!(IMM8, 4);
+    static_assert_sae!(SAE);
+    _mm512_mask_range_round_pd::<IMM8, SAE>(_mm512_setzero_pd(), k, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_range_pd&ig_expand=5192)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_range_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 4);
+    _mm_mask_range_pd::<IMM8>(_mm_setzero_pd(), 0xff, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
+/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_pd&ig_expand=5190)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_range_pd<const IMM8: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 4);
+        transmute(vrangepd_128(
+            a.as_f64x2(),
+            b.as_f64x2(),
+            IMM8,
+            src.as_f64x2(),
+            k,
+        ))
+    }
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
+/// zeromask k (elements are zeroed out if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_pd&ig_expand=5191)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_maskz_range_pd<const IMM8: i32>(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 4);
+    _mm_mask_range_pd::<IMM8>(_mm_setzero_pd(), k, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_range_pd&ig_expand=5195)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_range_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
+    static_assert_uimm_bits!(IMM8, 4);
+    _mm256_mask_range_pd::<IMM8>(_mm256_setzero_pd(), 0xff, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
+/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_range_pd&ig_expand=5193)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_mask_range_pd<const IMM8: i32>(
+    src: __m256d,
+    k: __mmask8,
+    a: __m256d,
+    b: __m256d,
+) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 4);
+        transmute(vrangepd_256(
+            a.as_f64x4(),
+            b.as_f64x4(),
+            IMM8,
+            src.as_f64x4(),
+            k,
+        ))
+    }
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
+/// zeromask k (elements are zeroed out if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_range_pd&ig_expand=5194)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_maskz_range_pd<const IMM8: i32>(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+    static_assert_uimm_bits!(IMM8, 4);
+    _mm256_mask_range_pd::<IMM8>(_mm256_setzero_pd(), k, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_range_pd&ig_expand=5198)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_range_pd<const IMM8: i32>(a: __m512d, b: __m512d) -> __m512d {
+    static_assert_uimm_bits!(IMM8, 4);
+    _mm512_mask_range_pd::<IMM8>(_mm512_setzero_pd(), 0xff, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
+/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_range_pd&ig_expand=5196)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_range_pd<const IMM8: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 4);
+        transmute(vrangepd_512(
+            a.as_f64x8(),
+            b.as_f64x8(),
+            IMM8,
+            src.as_f64x8(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
+/// zeromask k (elements are zeroed out if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_range_pd&ig_expand=5197)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_range_pd<const IMM8: i32>(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    static_assert_uimm_bits!(IMM8, 4);
+    _mm512_mask_range_pd::<IMM8>(_mm512_setzero_pd(), k, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_range_round_ps&ig_expand=5213)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_range_round_ps<const IMM8: i32, const SAE: i32>(a: __m512, b: __m512) -> __m512 {
+    static_assert_uimm_bits!(IMM8, 4);
+    static_assert_sae!(SAE);
+    _mm512_mask_range_round_ps::<IMM8, SAE>(_mm512_setzero_ps(), 0xffff, a, b)
+}
+
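+// NOTE (editorial sketch, not part of the original patch): the `*_range_round_*`
+// intrinsics never round their result; the extra const parameter is only the
+// SAE flag. `_MM_FROUND_CUR_DIRECTION` keeps normal MXCSR exception reporting,
+// while `_MM_FROUND_NO_EXC` suppresses exception flags, which is the reason to
+// reach for the `round` variants at all. Names below are illustrative; the
+// test assumes a `std` harness and an AVX-512DQ CPU.
+#[cfg(test)]
+mod range_round_sae_usage_sketch {
+    use core::arch::x86_64::*;
+
+    // IMM8 = 0b0100: per-lane minimum, sign taken from the compare result.
+    const MIN_SIGN_FROM_CMP: i32 = 0b0100;
+
+    /// # Safety
+    /// The caller must ensure AVX-512F and AVX-512DQ are available.
+    #[target_feature(enable = "avx512f,avx512dq")]
+    unsafe fn quiet_min_lane0(x: f32, y: f32) -> f32 {
+        let a = _mm512_set1_ps(x);
+        let b = _mm512_set1_ps(y);
+        // SAE = _MM_FROUND_NO_EXC: no FP exception flags are raised even for
+        // NaN or denormal inputs.
+        let r = _mm512_range_round_ps::<MIN_SIGN_FROM_CMP, _MM_FROUND_NO_EXC>(a, b);
+        _mm_cvtss_f32(_mm512_castps512_ps128(r))
+    }
+
+    #[test]
+    fn min_with_exceptions_suppressed() {
+        if !std::arch::is_x86_feature_detected!("avx512dq") {
+            return; // skip on CPUs without AVX-512DQ
+        }
+        // SAFETY: the runtime check above guarantees the required features.
+        assert_eq!(unsafe { quiet_min_lane0(4.0, -1.5) }, -1.5);
+    }
+}
+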
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using
+/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_range_round_ps&ig_expand=5211)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_range_round_ps<const IMM8: i32, const SAE: i32>(
+    src: __m512,
+    k: __mmask16,
+    a: __m512,
+    b: __m512,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 4);
+        static_assert_sae!(SAE);
+        transmute(vrangeps_512(
+            a.as_f32x16(),
+            b.as_f32x16(),
+            IMM8,
+            src.as_f32x16(),
+            k,
+            SAE,
+        ))
+    }
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using
+/// zeromask k (elements are zeroed out if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_range_round_ps&ig_expand=5212)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_range_round_ps<const IMM8: i32, const SAE: i32>(
+    k: __mmask16,
+    a: __m512,
+    b: __m512,
+) -> __m512 {
+    static_assert_uimm_bits!(IMM8, 4);
+    static_assert_sae!(SAE);
+    _mm512_mask_range_round_ps::<IMM8, SAE>(_mm512_setzero_ps(), k, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_range_ps&ig_expand=5201)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_range_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 4);
+    _mm_mask_range_ps::<IMM8>(_mm_setzero_ps(), 0xff, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using
+/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_ps&ig_expand=5199)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_range_ps<const IMM8: i32>(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 4);
+        transmute(vrangeps_128(
+            a.as_f32x4(),
+            b.as_f32x4(),
+            IMM8,
+            src.as_f32x4(),
+            k,
+        ))
+    }
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using
+/// zeromask k (elements are zeroed out if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_ps&ig_expand=5200)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_maskz_range_ps<const IMM8: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 4);
+    _mm_mask_range_ps::<IMM8>(_mm_setzero_ps(), k, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_range_ps&ig_expand=5204)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_range_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
+    static_assert_uimm_bits!(IMM8, 4);
+    _mm256_mask_range_ps::<IMM8>(_mm256_setzero_ps(), 0xff, a, b)
+}
+
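+// NOTE (editorial sketch, not part of the original patch): how the `mask` and
+// `maskz` range variants above differ. With a writemask, lanes whose mask bit
+// is clear are copied from `src`; with a zeromask they are zeroed. Names are
+// illustrative; the test assumes a `std` harness and AVX-512DQ + AVX-512VL.
+#[cfg(test)]
+mod range_mask_vs_maskz_usage_sketch {
+    use core::arch::x86_64::*;
+
+    // IMM8 = 0b0101: per-lane maximum, sign taken from the compare result.
+    const MAX_SIGN_FROM_CMP: i32 = 0b0101;
+
+    /// # Safety
+    /// The caller must ensure AVX-512DQ and AVX-512VL are available.
+    #[target_feature(enable = "avx512dq,avx512vl")]
+    unsafe fn demo() {
+        let src = _mm_set1_ps(9.0);
+        let a = _mm_set1_ps(1.0);
+        let b = _mm_set1_ps(2.0);
+        // Only lane 0 is selected by the mask 0b0001.
+        let masked = _mm_mask_range_ps::<MAX_SIGN_FROM_CMP>(src, 0b0001, a, b);
+        let zeroed = _mm_maskz_range_ps::<MAX_SIGN_FROM_CMP>(0b0001, a, b);
+        // Lane 0 holds max(1.0, 2.0) = 2.0; the other lanes come from `src`
+        // in the writemask case and are 0.0 in the zeromask case.
+        let expect_masked = _mm_set_ps(9.0, 9.0, 9.0, 2.0);
+        let expect_zeroed = _mm_set_ps(0.0, 0.0, 0.0, 2.0);
+        assert_eq!(_mm_movemask_ps(_mm_cmpeq_ps(masked, expect_masked)), 0b1111);
+        assert_eq!(_mm_movemask_ps(_mm_cmpeq_ps(zeroed, expect_zeroed)), 0b1111);
+    }
+
+    #[test]
+    fn mask_vs_maskz() {
+        if !std::arch::is_x86_feature_detected!("avx512dq")
+            || !std::arch::is_x86_feature_detected!("avx512vl")
+        {
+            return; // skip on CPUs without AVX-512DQ/VL
+        }
+        // SAFETY: the runtime checks above guarantee the required features.
+        unsafe { demo() }
+    }
+}
+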
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using
+/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_range_ps&ig_expand=5202)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_mask_range_ps<const IMM8: i32>(
+    src: __m256,
+    k: __mmask8,
+    a: __m256,
+    b: __m256,
+) -> __m256 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 4);
+        transmute(vrangeps_256(
+            a.as_f32x8(),
+            b.as_f32x8(),
+            IMM8,
+            src.as_f32x8(),
+            k,
+        ))
+    }
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using
+/// zeromask k (elements are zeroed out if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_range_ps&ig_expand=5203)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_maskz_range_ps<const IMM8: i32>(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+    static_assert_uimm_bits!(IMM8, 4);
+    _mm256_mask_range_ps::<IMM8>(_mm256_setzero_ps(), k, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_range_ps&ig_expand=5207)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_range_ps<const IMM8: i32>(a: __m512, b: __m512) -> __m512 {
+    static_assert_uimm_bits!(IMM8, 4);
+    _mm512_mask_range_ps::<IMM8>(_mm512_setzero_ps(), 0xffff, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using
+/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_range_ps&ig_expand=5205)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_range_ps<const IMM8: i32>(
+    src: __m512,
+    k: __mmask16,
+    a: __m512,
+    b: __m512,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 4);
+        transmute(vrangeps_512(
+            a.as_f32x16(),
+            b.as_f32x16(),
+            IMM8,
+            src.as_f32x16(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using
+/// zeromask k (elements are zeroed out if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_range_ps&ig_expand=5206)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_range_ps<const IMM8: i32>(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+    static_assert_uimm_bits!(IMM8, 4);
+    _mm512_mask_range_ps::<IMM8>(_mm512_setzero_ps(), k, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower
+/// double-precision (64-bit) floating-point element in a and b, store the result in the lower element
+/// of dst, and copy the upper element from a to the upper element of dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_range_round_sd&ig_expand=5216)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_range_round_sd<const IMM8: i32, const SAE: i32>(a: __m128d, b: __m128d) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 4);
+    static_assert_sae!(SAE);
+    _mm_mask_range_round_sd::<IMM8, SAE>(_mm_setzero_pd(), 0xff, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower
+/// double-precision (64-bit) floating-point element in a and b, store the result in the lower element
+/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the
+/// upper element from a to the upper element of dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_round_sd&ig_expand=5214)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_range_round_sd<const IMM8: i32, const SAE: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 4);
+        static_assert_sae!(SAE);
+        transmute(vrangesd(
+            a.as_f64x2(),
+            b.as_f64x2(),
+            src.as_f64x2(),
+            k,
+            IMM8,
+            SAE,
+        ))
+    }
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower
+/// double-precision (64-bit) floating-point element in a and b, store the result in the lower element
+/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper
+/// element from a to the upper element of dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_round_sd&ig_expand=5215)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_maskz_range_round_sd<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 4);
+    static_assert_sae!(SAE);
+    _mm_mask_range_round_sd::<IMM8, SAE>(_mm_setzero_pd(), k, a, b)
+}
+
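+// NOTE (editorial sketch, not part of the original patch): the scalar `sd`
+// range intrinsics above only operate on lane 0; the upper lane of the result
+// always comes from `a`, never from `b`. Names are illustrative; the test
+// assumes a `std` harness and an AVX-512DQ CPU.
+#[cfg(test)]
+mod range_sd_lane_usage_sketch {
+    use core::arch::x86_64::*;
+
+    // IMM8 = 0b0101: maximum, sign taken from the compare result.
+    const MAX_SIGN_FROM_CMP: i32 = 0b0101;
+
+    /// # Safety
+    /// The caller must ensure AVX-512F and AVX-512DQ are available.
+    #[target_feature(enable = "avx512f,avx512dq")]
+    unsafe fn demo() -> (f64, f64) {
+        let a = _mm_set_pd(100.0, -3.0); // upper = 100.0, lower = -3.0
+        let b = _mm_set_pd(-777.0, 2.0); // upper is ignored, lower = 2.0
+        let r = _mm_range_round_sd::<MAX_SIGN_FROM_CMP, _MM_FROUND_CUR_DIRECTION>(a, b);
+        let lower = _mm_cvtsd_f64(r);
+        let upper = _mm_cvtsd_f64(_mm_unpackhi_pd(r, r));
+        (lower, upper)
+    }
+
+    #[test]
+    fn lower_lane_only() {
+        if !std::arch::is_x86_feature_detected!("avx512dq") {
+            return; // skip on CPUs without AVX-512DQ
+        }
+        // SAFETY: the runtime check above guarantees the required features.
+        let (lower, upper) = unsafe { demo() };
+        assert_eq!(lower, 2.0); // max(-3.0, 2.0) in lane 0
+        assert_eq!(upper, 100.0); // upper lane copied from `a`
+    }
+}
+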
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower
+/// double-precision (64-bit) floating-point element in a and b, store the result in the lower element
+/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the
+/// upper element from a to the upper element of dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_sd&ig_expand=5220)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_range_sd<const IMM8: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 4);
+        transmute(vrangesd(
+            a.as_f64x2(),
+            b.as_f64x2(),
+            src.as_f64x2(),
+            k,
+            IMM8,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower
+/// double-precision (64-bit) floating-point element in a and b, store the result in the lower element
+/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper
+/// element from a to the upper element of dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_sd&ig_expand=5221)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_maskz_range_sd<const IMM8: i32>(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 4);
+    _mm_mask_range_sd::<IMM8>(_mm_setzero_pd(), k, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower
+/// single-precision (32-bit) floating-point element in a and b, store the result in the lower element
+/// of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_range_round_ss&ig_expand=5219)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangess, IMM8 = 5, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_range_round_ss<const IMM8: i32, const SAE: i32>(a: __m128, b: __m128) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 4);
+    static_assert_sae!(SAE);
+    _mm_mask_range_round_ss::<IMM8, SAE>(_mm_setzero_ps(), 0xff, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower
+/// single-precision (32-bit) floating-point element in a and b, store the result in the lower element
+/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the
+/// upper 3 packed elements from a to the upper elements of dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_round_ss&ig_expand=5217)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangess, IMM8 = 5, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_range_round_ss<const IMM8: i32, const SAE: i32>(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 4);
+        static_assert_sae!(SAE);
+        transmute(vrangess(
+            a.as_f32x4(),
+            b.as_f32x4(),
+            src.as_f32x4(),
+            k,
+            IMM8,
+            SAE,
+        ))
+    }
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower
+/// single-precision (32-bit) floating-point element in a and b, store the result in the lower element
+/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper
+/// 3 packed elements from a to the upper elements of dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_round_ss&ig_expand=5218)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangess, IMM8 = 5, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_maskz_range_round_ss<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 4);
+    static_assert_sae!(SAE);
+    _mm_mask_range_round_ss::<IMM8, SAE>(_mm_setzero_ps(), k, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower
+/// single-precision (32-bit) floating-point element in a and b, store the result in the lower element
+/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the
+/// upper 3 packed elements from a to the upper elements of dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_ss&ig_expand=5222) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangess, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_range_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangess( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + IMM8, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower +/// single-precision (32-bit) floating-point element in a and b, store the result in the lower element +/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper +/// 3 packed elements from a to the upper elements of dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_ss&ig_expand=5223) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangess, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_range_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 4); + _mm_mask_range_ss::(_mm_setzero_ps(), k, a, b) +} + +// Reduce + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_round_pd&ig_expand=5438) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(1, 2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_round_pd(a: __m512d) -> __m512d { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm512_mask_reduce_round_pd::(_mm512_undefined_pd(), 0xff, a) +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are +/// copied from src to dst if the corresponding mask bit is not set). 
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_round_pd&ig_expand=5436) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + transmute(vreducepd_512(a.as_f64x8(), IMM8, src.as_f64x8(), k, SAE)) + } +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are +/// zeroed out if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_round_pd&ig_expand=5437) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_reduce_round_pd( + k: __mmask8, + a: __m512d, +) -> __m512d { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm512_mask_reduce_round_pd::(_mm512_setzero_pd(), k, a) +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst. 
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_pd&ig_expand=5411) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_pd(a: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_pd::(_mm_undefined_pd(), 0xff, a) +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are +/// copied from src to dst if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_pd&ig_expand=5409) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreducepd_128(a.as_f64x2(), IMM8, src.as_f64x2(), k)) + } +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are +/// zeroed out if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_pd&ig_expand=5410) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_reduce_pd(k: __mmask8, a: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_pd::(_mm_setzero_pd(), k, a) +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst. 
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_pd&ig_expand=5414) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_pd(a: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_reduce_pd::(_mm256_undefined_pd(), 0xff, a) +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are +/// copied from src to dst if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_pd&ig_expand=5412) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreducepd_256(a.as_f64x4(), IMM8, src.as_f64x4(), k)) + } +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are +/// zeroed out if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_reduce_pd&ig_expand=5413) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_reduce_pd(k: __mmask8, a: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_reduce_pd::(_mm256_setzero_pd(), k, a) +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst. 
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_pd&ig_expand=5417) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_pd(a: __m512d) -> __m512d { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_reduce_pd::(_mm512_undefined_pd(), 0xff, a) +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are +/// copied from src to dst if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_pd&ig_expand=5415) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreducepd_512( + a.as_f64x8(), + IMM8, + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are +/// zeroed out if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_pd&ig_expand=5416) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_reduce_pd(k: __mmask8, a: __m512d) -> __m512d { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_reduce_pd::(_mm512_setzero_pd(), k, a) +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst. 
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_round_ps&ig_expand=5444) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(1, 2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_round_ps(a: __m512) -> __m512 { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm512_mask_reduce_round_ps::(_mm512_undefined_ps(), 0xffff, a) +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are +/// copied from src to dst if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_round_ps&ig_expand=5442) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_round_ps( + src: __m512, + k: __mmask16, + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + transmute(vreduceps_512(a.as_f32x16(), IMM8, src.as_f32x16(), k, SAE)) + } +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are +/// zeroed out if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
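+///
+/// A small illustrative sketch (helper name and constants are examples only) showing how
+/// the fraction-bit count, rounding mode, and exception suppression are encoded:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq")]
+/// fn fractional_parts(k: __mmask16, a: __m512) -> __m512 {
+///     // IMM8[7:4] = 0 fraction bits and IMM8[1:0] = truncate, so each selected lane
+///     // keeps only its (signed) fractional part; lanes with a clear mask bit are zeroed.
+///     const IMM8: i32 = (0 << 4) | _MM_FROUND_TO_ZERO;
+///     _mm512_maskz_reduce_round_ps::<IMM8, _MM_FROUND_NO_EXC>(k, a)
+/// }
+/// ```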
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_round_ps&ig_expand=5443) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_reduce_round_ps( + k: __mmask16, + a: __m512, +) -> __m512 { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm512_mask_reduce_round_ps::(_mm512_setzero_ps(), k, a) +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_ps&ig_expand=5429) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_ps(a: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_ps::(_mm_undefined_ps(), 0xff, a) +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are +/// copied from src to dst if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_ps&ig_expand=5427) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreduceps_128(a.as_f32x4(), IMM8, src.as_f32x4(), k)) + } +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are +/// zeroed out if the corresponding mask bit is not set). 
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_ps&ig_expand=5428) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_reduce_ps(k: __mmask8, a: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_ps::(_mm_setzero_ps(), k, a) +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_ps&ig_expand=5432) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_ps(a: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_reduce_ps::(_mm256_undefined_ps(), 0xff, a) +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are +/// copied from src to dst if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_ps&ig_expand=5430) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreduceps_256(a.as_f32x8(), IMM8, src.as_f32x8(), k)) + } +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are +/// zeroed out if the corresponding mask bit is not set). 
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_reduce_ps&ig_expand=5431) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_reduce_ps(k: __mmask8, a: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_reduce_ps::(_mm256_setzero_ps(), k, a) +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_ps&ig_expand=5435) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_ps(a: __m512) -> __m512 { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_reduce_ps::(_mm512_undefined_ps(), 0xffff, a) +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are +/// copied from src to dst if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_ps&ig_expand=5433) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreduceps_512( + a.as_f32x16(), + IMM8, + src.as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are +/// zeroed out if the corresponding mask bit is not set). 
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_ps&ig_expand=5434) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_reduce_ps(k: __mmask16, a: __m512) -> __m512 { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_reduce_ps::(_mm512_setzero_ps(), k, a) +} + +/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b +/// by the number of bits specified by imm8, store the result in the lower element of dst, and copy +/// the upper element from a to the upper element of dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_round_sd&ig_expand=5447) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm_mask_reduce_round_sd::(_mm_undefined_pd(), 0xff, a, b) +} + +/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b +/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask +/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a +/// to the upper element of dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
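+///
+/// A brief illustrative sketch (helper name and immediate are examples only):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq")]
+/// fn reduce_low_or_keep(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+///     // 4 fraction bits, rounding toward negative infinity, exceptions suppressed via SAE.
+///     // If bit 0 of `k` is clear, lane 0 is taken from `src`; lane 1 always comes from `a`.
+///     const IMM8: i32 = (4 << 4) | _MM_FROUND_TO_NEG_INF;
+///     _mm_mask_reduce_round_sd::<IMM8, _MM_FROUND_NO_EXC>(src, k, a, b)
+/// }
+/// ```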
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_round_sd&ig_expand=5445) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + transmute(vreducesd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + IMM8, + SAE, + )) + } +} + +/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b +/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask +/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a +/// to the upper element of dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_round_sd&ig_expand=5446) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_reduce_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm_mask_reduce_round_sd::(_mm_setzero_pd(), k, a, b) +} + +/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b +/// by the number of bits specified by imm8, store the result in the lower element of dst using, and +/// copy the upper element from a. +/// to the upper element of dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_sd&ig_expand=5456) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_sd::(_mm_undefined_pd(), 0xff, a, b) +} + +/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b +/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask +/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a +/// to the upper element of dst. 
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_sd&ig_expand=5454) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreducesd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + IMM8, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b +/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask +/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a +/// to the upper element of dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_sd&ig_expand=5455) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_reduce_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_sd::(_mm_setzero_pd(), k, a, b) +} + +/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b +/// by the number of bits specified by imm8, store the result in the lower element of dst, and copy +/// the upper element from a. +/// to the upper element of dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
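+///
+/// A short illustrative sketch (helper name chosen for this example only):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq")]
+/// fn frac_low(a: __m128, b: __m128) -> __m128 {
+///     // With 0 fraction bits and truncation, lane 0 becomes b[0] minus b[0] rounded
+///     // toward zero, i.e. its fractional part; lanes 1..3 are copied from `a`.
+///     _mm_reduce_round_ss::<_MM_FROUND_TO_ZERO, _MM_FROUND_NO_EXC>(a, b)
+/// }
+/// ```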
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_round_ss&ig_expand=5453) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm_mask_reduce_round_ss::(_mm_undefined_ps(), 0xff, a, b) +} + +/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b +/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask +/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a. +/// to the upper element of dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_round_ss&ig_expand=5451) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + transmute(vreducess( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + IMM8, + SAE, + )) + } +} + +/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b +/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask +/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a. +/// to the upper element of dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
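+///
+/// A minimal illustrative sketch of the zeroing behaviour (values are examples only):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq")]
+/// fn reduce_or_zero_lane0(a: __m128, b: __m128) -> __m128 {
+///     // 2 fraction bits, round to nearest; the mask is 0, so bit 0 is clear and
+///     // lane 0 of the result is forced to zero while lanes 1..3 still come from `a`.
+///     const IMM8: i32 = (2 << 4) | _MM_FROUND_TO_NEAREST_INT;
+///     _mm_maskz_reduce_round_ss::<IMM8, _MM_FROUND_NO_EXC>(0, a, b)
+/// }
+/// ```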
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_round_ss&ig_expand=5452) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_reduce_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm_mask_reduce_round_ss::(_mm_setzero_ps(), k, a, b) +} + +/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b +/// by the number of bits specified by imm8, store the result in the lower element of dst, and copy +/// the upper element from a. +/// to the upper element of dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_ss&ig_expand=5462) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_ss(a: __m128, b: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_ss::(_mm_undefined_ps(), 0xff, a, b) +} + +/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b +/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask +/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a. +/// to the upper element of dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_ss&ig_expand=5460) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreducess( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + IMM8, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b +/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask +/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a. +/// to the upper element of dst. 
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_ss&ig_expand=5461) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_reduce_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_ss::(_mm_setzero_ps(), k, a, b) +} + +// FP-Class + +/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k. +/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_pd_mask&ig_expand=3493) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_fpclass_pd_mask(a: __m128d) -> __mmask8 { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_fpclass_pd_mask::(0xff, a) +} + +/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the +/// corresponding mask bit is not set). +/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_pd_mask&ig_expand=3494) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_fpclass_pd_mask(k1: __mmask8, a: __m128d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vfpclasspd_128(a.as_f64x2(), IMM8, k1)) + } +} + +/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k. 
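+/// The category set is built by OR-ing together the flag bits listed below; a small
+/// illustrative sketch (the flag choice and helper name are examples only) that marks
+/// every NaN lane, whether quiet or signaling:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq,avx512vl")]
+/// fn nan_lanes(a: __m256d) -> __mmask8 {
+///     // 0x01 = QNaN, 0x80 = SNaN; a set bit in the returned mask marks a NaN lane.
+///     _mm256_fpclass_pd_mask::<{ 0x01 | 0x80 }>(a)
+/// }
+/// ```
+///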
+/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fpclass_pd_mask&ig_expand=3495) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_fpclass_pd_mask(a: __m256d) -> __mmask8 { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_fpclass_pd_mask::(0xff, a) +} + +/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the +/// corresponding mask bit is not set). +/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fpclass_pd_mask&ig_expand=3496) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_fpclass_pd_mask(k1: __mmask8, a: __m256d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vfpclasspd_256(a.as_f64x4(), IMM8, k1)) + } +} + +/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k. +/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fpclass_pd_mask&ig_expand=3497) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_fpclass_pd_mask(a: __m512d) -> __mmask8 { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_fpclass_pd_mask::(0xff, a) +} + +/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the +/// corresponding mask bit is not set). 
+/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fpclass_pd_mask&ig_expand=3498) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_fpclass_pd_mask(k1: __mmask8, a: __m512d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vfpclasspd_512(a.as_f64x8(), IMM8, k1)) + } +} + +/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k. +/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_ps_mask&ig_expand=3505) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_fpclass_ps_mask(a: __m128) -> __mmask8 { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_fpclass_ps_mask::(0xff, a) +} + +/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the +/// corresponding mask bit is not set). +/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_ps_mask&ig_expand=3506) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_fpclass_ps_mask(k1: __mmask8, a: __m128) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vfpclassps_128(a.as_f32x4(), IMM8, k1)) + } +} + +/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k. 
+/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fpclass_ps_mask&ig_expand=3507) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_fpclass_ps_mask(a: __m256) -> __mmask8 { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_fpclass_ps_mask::(0xff, a) +} + +/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the +/// corresponding mask bit is not set). +/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fpclass_ps_mask&ig_expand=3508) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_fpclass_ps_mask(k1: __mmask8, a: __m256) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vfpclassps_256(a.as_f32x8(), IMM8, k1)) + } +} + +/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k. +/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fpclass_ps_mask&ig_expand=3509) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_fpclass_ps_mask(a: __m512) -> __mmask16 { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_fpclass_ps_mask::(0xffff, a) +} + +/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the +/// corresponding mask bit is not set). 
+/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fpclass_ps_mask&ig_expand=3510) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_fpclass_ps_mask(k1: __mmask16, a: __m512) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vfpclassps_512(a.as_f32x16(), IMM8, k1)) + } +} + +/// Test the lower double-precision (64-bit) floating-point element in a for special categories specified +/// by imm8, and store the results in mask vector k. +/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_sd_mask&ig_expand=3511) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vfpclasssd, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_fpclass_sd_mask(a: __m128d) -> __mmask8 { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_fpclass_sd_mask::(0xff, a) +} + +/// Test the lower double-precision (64-bit) floating-point element in a for special categories specified +/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the +/// corresponding mask bit is not set). +/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_sd_mask&ig_expand=3512) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vfpclasssd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_fpclass_sd_mask(k1: __mmask8, a: __m128d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vfpclasssd(a.as_f64x2(), IMM8, k1) + } +} + +/// Test the lower single-precision (32-bit) floating-point element in a for special categories specified +/// by imm8, and store the results in mask vector k. 
+/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_ss_mask&ig_expand=3515) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vfpclassss, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_fpclass_ss_mask<const IMM8: i32>(a: __m128) -> __mmask8 { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_fpclass_ss_mask::<IMM8>(0xff, a) +} + +/// Test the lower single-precision (32-bit) floating-point element in a for special categories specified +/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the +/// corresponding mask bit is not set). +/// imm can be a combination of: +/// +/// - 0x01 // QNaN +/// - 0x02 // Positive Zero +/// - 0x04 // Negative Zero +/// - 0x08 // Positive Infinity +/// - 0x10 // Negative Infinity +/// - 0x20 // Denormal +/// - 0x40 // Negative +/// - 0x80 // SNaN +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_ss_mask&ig_expand=3516) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vfpclassss, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_fpclass_ss_mask<const IMM8: i32>(k1: __mmask8, a: __m128) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vfpclassss(a.as_f32x4(), IMM8, k1) + } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.sitofp.round.v2f64.v2i64"] + fn vcvtqq2pd_128(a: i64x2, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.sitofp.round.v4f64.v4i64"] + fn vcvtqq2pd_256(a: i64x4, rounding: i32) -> f64x4; + #[link_name = "llvm.x86.avx512.sitofp.round.v8f64.v8i64"] + fn vcvtqq2pd_512(a: i64x8, rounding: i32) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.cvtqq2ps.128"] + fn vcvtqq2ps_128(a: i64x2, src: f32x4, k: __mmask8) -> f32x4; + #[link_name = "llvm.x86.avx512.sitofp.round.v4f32.v4i64"] + fn vcvtqq2ps_256(a: i64x4, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.sitofp.round.v8f32.v8i64"] + fn vcvtqq2ps_512(a: i64x8, rounding: i32) -> f32x8; + + #[link_name = "llvm.x86.avx512.uitofp.round.v2f64.v2i64"] + fn vcvtuqq2pd_128(a: u64x2, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.uitofp.round.v4f64.v4i64"] + fn vcvtuqq2pd_256(a: u64x4, rounding: i32) -> f64x4; + #[link_name = "llvm.x86.avx512.uitofp.round.v8f64.v8i64"] + fn vcvtuqq2pd_512(a: u64x8, rounding: i32) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.cvtuqq2ps.128"] + fn vcvtuqq2ps_128(a: u64x2, src: f32x4, k: __mmask8) -> f32x4; + #[link_name = "llvm.x86.avx512.uitofp.round.v4f32.v4i64"] + fn vcvtuqq2ps_256(a: u64x4, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.uitofp.round.v8f32.v8i64"] + fn vcvtuqq2ps_512(a: u64x8, rounding: i32) -> f32x8; + + #[link_name = "llvm.x86.avx512.mask.cvtpd2qq.128"] + fn vcvtpd2qq_128(a: f64x2, src: i64x2, k: __mmask8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.cvtpd2qq.256"] + fn vcvtpd2qq_256(a: f64x4, src: i64x4, k: __mmask8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.cvtpd2qq.512"] + fn vcvtpd2qq_512(a: f64x8, src: i64x8, k: __mmask8, rounding: i32) -> 
i64x8; + + #[link_name = "llvm.x86.avx512.mask.cvtps2qq.128"] + fn vcvtps2qq_128(a: f32x4, src: i64x2, k: __mmask8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.cvtps2qq.256"] + fn vcvtps2qq_256(a: f32x4, src: i64x4, k: __mmask8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.cvtps2qq.512"] + fn vcvtps2qq_512(a: f32x8, src: i64x8, k: __mmask8, rounding: i32) -> i64x8; + + #[link_name = "llvm.x86.avx512.mask.cvtpd2uqq.128"] + fn vcvtpd2uqq_128(a: f64x2, src: u64x2, k: __mmask8) -> u64x2; + #[link_name = "llvm.x86.avx512.mask.cvtpd2uqq.256"] + fn vcvtpd2uqq_256(a: f64x4, src: u64x4, k: __mmask8) -> u64x4; + #[link_name = "llvm.x86.avx512.mask.cvtpd2uqq.512"] + fn vcvtpd2uqq_512(a: f64x8, src: u64x8, k: __mmask8, rounding: i32) -> u64x8; + + #[link_name = "llvm.x86.avx512.mask.cvtps2uqq.128"] + fn vcvtps2uqq_128(a: f32x4, src: u64x2, k: __mmask8) -> u64x2; + #[link_name = "llvm.x86.avx512.mask.cvtps2uqq.256"] + fn vcvtps2uqq_256(a: f32x4, src: u64x4, k: __mmask8) -> u64x4; + #[link_name = "llvm.x86.avx512.mask.cvtps2uqq.512"] + fn vcvtps2uqq_512(a: f32x8, src: u64x8, k: __mmask8, rounding: i32) -> u64x8; + + #[link_name = "llvm.x86.avx512.mask.cvttpd2qq.128"] + fn vcvttpd2qq_128(a: f64x2, src: i64x2, k: __mmask8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.cvttpd2qq.256"] + fn vcvttpd2qq_256(a: f64x4, src: i64x4, k: __mmask8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.cvttpd2qq.512"] + fn vcvttpd2qq_512(a: f64x8, src: i64x8, k: __mmask8, sae: i32) -> i64x8; + + #[link_name = "llvm.x86.avx512.mask.cvttps2qq.128"] + fn vcvttps2qq_128(a: f32x4, src: i64x2, k: __mmask8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.cvttps2qq.256"] + fn vcvttps2qq_256(a: f32x4, src: i64x4, k: __mmask8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.cvttps2qq.512"] + fn vcvttps2qq_512(a: f32x8, src: i64x8, k: __mmask8, sae: i32) -> i64x8; + + #[link_name = "llvm.x86.avx512.mask.cvttpd2uqq.128"] + fn vcvttpd2uqq_128(a: f64x2, src: u64x2, k: __mmask8) -> u64x2; + #[link_name = "llvm.x86.avx512.mask.cvttpd2uqq.256"] + fn vcvttpd2uqq_256(a: f64x4, src: u64x4, k: __mmask8) -> u64x4; + #[link_name = "llvm.x86.avx512.mask.cvttpd2uqq.512"] + fn vcvttpd2uqq_512(a: f64x8, src: u64x8, k: __mmask8, sae: i32) -> u64x8; + + #[link_name = "llvm.x86.avx512.mask.cvttps2uqq.128"] + fn vcvttps2uqq_128(a: f32x4, src: u64x2, k: __mmask8) -> u64x2; + #[link_name = "llvm.x86.avx512.mask.cvttps2uqq.256"] + fn vcvttps2uqq_256(a: f32x4, src: u64x4, k: __mmask8) -> u64x4; + #[link_name = "llvm.x86.avx512.mask.cvttps2uqq.512"] + fn vcvttps2uqq_512(a: f32x8, src: u64x8, k: __mmask8, sae: i32) -> u64x8; + + #[link_name = "llvm.x86.avx512.mask.range.pd.128"] + fn vrangepd_128(a: f64x2, b: f64x2, imm8: i32, src: f64x2, k: __mmask8) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.range.pd.256"] + fn vrangepd_256(a: f64x4, b: f64x4, imm8: i32, src: f64x4, k: __mmask8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.range.pd.512"] + fn vrangepd_512(a: f64x8, b: f64x8, imm8: i32, src: f64x8, k: __mmask8, sae: i32) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.range.ps.128"] + fn vrangeps_128(a: f32x4, b: f32x4, imm8: i32, src: f32x4, k: __mmask8) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.range.ps.256"] + fn vrangeps_256(a: f32x8, b: f32x8, imm8: i32, src: f32x8, k: __mmask8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.range.ps.512"] + fn vrangeps_512(a: f32x16, b: f32x16, imm8: i32, src: f32x16, k: __mmask16, sae: i32) + -> f32x16; + + #[link_name = "llvm.x86.avx512.mask.range.sd"] + fn vrangesd(a: f64x2, b: f64x2, src: 
f64x2, k: __mmask8, imm8: i32, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.range.ss"] + fn vrangess(a: f32x4, b: f32x4, src: f32x4, k: __mmask8, imm8: i32, sae: i32) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.reduce.pd.128"] + fn vreducepd_128(a: f64x2, imm8: i32, src: f64x2, k: __mmask8) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.reduce.pd.256"] + fn vreducepd_256(a: f64x4, imm8: i32, src: f64x4, k: __mmask8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.reduce.pd.512"] + fn vreducepd_512(a: f64x8, imm8: i32, src: f64x8, k: __mmask8, sae: i32) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.reduce.ps.128"] + fn vreduceps_128(a: f32x4, imm8: i32, src: f32x4, k: __mmask8) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.reduce.ps.256"] + fn vreduceps_256(a: f32x8, imm8: i32, src: f32x8, k: __mmask8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.reduce.ps.512"] + fn vreduceps_512(a: f32x16, imm8: i32, src: f32x16, k: __mmask16, sae: i32) -> f32x16; + + #[link_name = "llvm.x86.avx512.mask.reduce.sd"] + fn vreducesd(a: f64x2, b: f64x2, src: f64x2, k: __mmask8, imm8: i32, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.reduce.ss"] + fn vreducess(a: f32x4, b: f32x4, src: f32x4, k: __mmask8, imm8: i32, sae: i32) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.fpclass.pd.128"] + fn vfpclasspd_128(a: f64x2, imm8: i32, k: __mmask8) -> __mmask8; + #[link_name = "llvm.x86.avx512.mask.fpclass.pd.256"] + fn vfpclasspd_256(a: f64x4, imm8: i32, k: __mmask8) -> __mmask8; + #[link_name = "llvm.x86.avx512.mask.fpclass.pd.512"] + fn vfpclasspd_512(a: f64x8, imm8: i32, k: __mmask8) -> __mmask8; + + #[link_name = "llvm.x86.avx512.mask.fpclass.ps.128"] + fn vfpclassps_128(a: f32x4, imm8: i32, k: __mmask8) -> __mmask8; + #[link_name = "llvm.x86.avx512.mask.fpclass.ps.256"] + fn vfpclassps_256(a: f32x8, imm8: i32, k: __mmask8) -> __mmask8; + #[link_name = "llvm.x86.avx512.mask.fpclass.ps.512"] + fn vfpclassps_512(a: f32x16, imm8: i32, k: __mmask16) -> __mmask16; + + #[link_name = "llvm.x86.avx512.mask.fpclass.sd"] + fn vfpclasssd(a: f64x2, imm8: i32, k: __mmask8) -> __mmask8; + #[link_name = "llvm.x86.avx512.mask.fpclass.ss"] + fn vfpclassss(a: f32x4, imm8: i32, k: __mmask8) -> __mmask8; +} + +#[cfg(test)] +mod tests { + use super::*; + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + use crate::mem::transmute; + + const OPRND1_64: f64 = unsafe { transmute(0x3333333333333333_u64) }; + const OPRND2_64: f64 = unsafe { transmute(0x5555555555555555_u64) }; + + const AND_64: f64 = unsafe { transmute(0x1111111111111111_u64) }; + const ANDN_64: f64 = unsafe { transmute(0x4444444444444444_u64) }; + const OR_64: f64 = unsafe { transmute(0x7777777777777777_u64) }; + const XOR_64: f64 = unsafe { transmute(0x6666666666666666_u64) }; + + const OPRND1_32: f32 = unsafe { transmute(0x33333333_u32) }; + const OPRND2_32: f32 = unsafe { transmute(0x55555555_u32) }; + + const AND_32: f32 = unsafe { transmute(0x11111111_u32) }; + const ANDN_32: f32 = unsafe { transmute(0x44444444_u32) }; + const OR_32: f32 = unsafe { transmute(0x77777777_u32) }; + const XOR_32: f32 = unsafe { transmute(0x66666666_u32) }; + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_and_pd() { + let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let src = _mm_set_pd(1., 2.); + let r = _mm_mask_and_pd(src, 0b01, a, b); + let e = _mm_set_pd(1., AND_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_and_pd() { + 
let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let r = _mm_maskz_and_pd(0b01, a, b); + let e = _mm_set_pd(0.0, AND_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_and_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let src = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_mask_and_pd(src, 0b0101, a, b); + let e = _mm256_set_pd(1., AND_64, 3., AND_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_and_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let r = _mm256_maskz_and_pd(0b0101, a, b); + let e = _mm256_set_pd(0.0, AND_64, 0.0, AND_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_and_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_and_pd(a, b); + let e = _mm512_set1_pd(AND_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_and_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let src = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_mask_and_pd(src, 0b01010101, a, b); + let e = _mm512_set_pd(1., AND_64, 3., AND_64, 5., AND_64, 7., AND_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_and_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_maskz_and_pd(0b01010101, a, b); + let e = _mm512_set_pd(0.0, AND_64, 0.0, AND_64, 0.0, AND_64, 0.0, AND_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_and_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let src = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_mask_and_ps(src, 0b0101, a, b); + let e = _mm_set_ps(1., AND_32, 3., AND_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_and_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let r = _mm_maskz_and_ps(0b0101, a, b); + let e = _mm_set_ps(0.0, AND_32, 0.0, AND_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_and_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let src = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_mask_and_ps(src, 0b01010101, a, b); + let e = _mm256_set_ps(1., AND_32, 3., AND_32, 5., AND_32, 7., AND_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_and_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let r = _mm256_maskz_and_ps(0b01010101, a, b); + let e = _mm256_set_ps(0.0, AND_32, 0.0, AND_32, 0.0, AND_32, 0.0, AND_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_and_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_and_ps(a, b); + let e = _mm512_set1_ps(AND_32); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_and_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let src = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_mask_and_ps(src, 0b0101010101010101, a, b); + let e = _mm512_set_ps( + 1., 
AND_32, 3., AND_32, 5., AND_32, 7., AND_32, 9., AND_32, 11., AND_32, 13., AND_32, + 15., AND_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_and_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_maskz_and_ps(0b0101010101010101, a, b); + let e = _mm512_set_ps( + 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., + AND_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_andnot_pd() { + let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let src = _mm_set_pd(1., 2.); + let r = _mm_mask_andnot_pd(src, 0b01, a, b); + let e = _mm_set_pd(1., ANDN_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_andnot_pd() { + let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let r = _mm_maskz_andnot_pd(0b01, a, b); + let e = _mm_set_pd(0.0, ANDN_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_andnot_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let src = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_mask_andnot_pd(src, 0b0101, a, b); + let e = _mm256_set_pd(1., ANDN_64, 3., ANDN_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_andnot_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let r = _mm256_maskz_andnot_pd(0b0101, a, b); + let e = _mm256_set_pd(0.0, ANDN_64, 0.0, ANDN_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_andnot_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_andnot_pd(a, b); + let e = _mm512_set1_pd(ANDN_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_andnot_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let src = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_mask_andnot_pd(src, 0b01010101, a, b); + let e = _mm512_set_pd(1., ANDN_64, 3., ANDN_64, 5., ANDN_64, 7., ANDN_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_andnot_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_maskz_andnot_pd(0b01010101, a, b); + let e = _mm512_set_pd(0.0, ANDN_64, 0.0, ANDN_64, 0.0, ANDN_64, 0.0, ANDN_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_andnot_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let src = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_mask_andnot_ps(src, 0b0101, a, b); + let e = _mm_set_ps(1., ANDN_32, 3., ANDN_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_andnot_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let r = _mm_maskz_andnot_ps(0b0101, a, b); + let e = _mm_set_ps(0.0, ANDN_32, 0.0, ANDN_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_andnot_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let src = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_mask_andnot_ps(src, 0b01010101, a, b); + let e = _mm256_set_ps(1., ANDN_32, 3., ANDN_32, 
5., ANDN_32, 7., ANDN_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_andnot_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let r = _mm256_maskz_andnot_ps(0b01010101, a, b); + let e = _mm256_set_ps(0.0, ANDN_32, 0.0, ANDN_32, 0.0, ANDN_32, 0.0, ANDN_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_andnot_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_andnot_ps(a, b); + let e = _mm512_set1_ps(ANDN_32); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_andnot_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let src = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_mask_andnot_ps(src, 0b0101010101010101, a, b); + let e = _mm512_set_ps( + 1., ANDN_32, 3., ANDN_32, 5., ANDN_32, 7., ANDN_32, 9., ANDN_32, 11., ANDN_32, 13., + ANDN_32, 15., ANDN_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_andnot_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_maskz_andnot_ps(0b0101010101010101, a, b); + let e = _mm512_set_ps( + 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., + ANDN_32, 0., ANDN_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_or_pd() { + let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let src = _mm_set_pd(1., 2.); + let r = _mm_mask_or_pd(src, 0b01, a, b); + let e = _mm_set_pd(1., OR_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_or_pd() { + let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let r = _mm_maskz_or_pd(0b01, a, b); + let e = _mm_set_pd(0.0, OR_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_or_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let src = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_mask_or_pd(src, 0b0101, a, b); + let e = _mm256_set_pd(1., OR_64, 3., OR_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_or_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let r = _mm256_maskz_or_pd(0b0101, a, b); + let e = _mm256_set_pd(0.0, OR_64, 0.0, OR_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_or_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_or_pd(a, b); + let e = _mm512_set1_pd(OR_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_or_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let src = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_mask_or_pd(src, 0b01010101, a, b); + let e = _mm512_set_pd(1., OR_64, 3., OR_64, 5., OR_64, 7., OR_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_or_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_maskz_or_pd(0b01010101, a, b); + let e = _mm512_set_pd(0.0, OR_64, 0.0, OR_64, 0.0, OR_64, 0.0, OR_64); + assert_eq_m512d(r, e); + } + + 
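(The expected values in these and/andnot/or/xor tests follow directly from the alternating bit patterns chosen for OPRND1/OPRND2. The following standalone sanity check of that arithmetic is a sketch for illustration only and is not part of the patch:)

// Not part of the diff: the nibble arithmetic behind AND_64/ANDN_64/OR_64/XOR_64
// (0x3 = 0b0011, 0x5 = 0b0101; the same relations hold for the 32-bit constants).
fn main() {
    let a: u64 = 0x3333333333333333; // OPRND1_64 bit pattern
    let b: u64 = 0x5555555555555555; // OPRND2_64 bit pattern
    assert_eq!(a & b, 0x1111111111111111); // AND_64
    assert_eq!(!a & b, 0x4444444444444444); // ANDN_64: andnot computes (NOT a) AND b
    assert_eq!(a | b, 0x7777777777777777); // OR_64
    assert_eq!(a ^ b, 0x6666666666666666); // XOR_64
}
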
#[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_or_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let src = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_mask_or_ps(src, 0b0101, a, b); + let e = _mm_set_ps(1., OR_32, 3., OR_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_or_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let r = _mm_maskz_or_ps(0b0101, a, b); + let e = _mm_set_ps(0.0, OR_32, 0.0, OR_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_or_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let src = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_mask_or_ps(src, 0b01010101, a, b); + let e = _mm256_set_ps(1., OR_32, 3., OR_32, 5., OR_32, 7., OR_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_or_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let r = _mm256_maskz_or_ps(0b01010101, a, b); + let e = _mm256_set_ps(0.0, OR_32, 0.0, OR_32, 0.0, OR_32, 0.0, OR_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_or_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_or_ps(a, b); + let e = _mm512_set1_ps(OR_32); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_or_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let src = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_mask_or_ps(src, 0b0101010101010101, a, b); + let e = _mm512_set_ps( + 1., OR_32, 3., OR_32, 5., OR_32, 7., OR_32, 9., OR_32, 11., OR_32, 13., OR_32, 15., + OR_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_or_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_maskz_or_ps(0b0101010101010101, a, b); + let e = _mm512_set_ps( + 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_xor_pd() { + let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let src = _mm_set_pd(1., 2.); + let r = _mm_mask_xor_pd(src, 0b01, a, b); + let e = _mm_set_pd(1., XOR_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_xor_pd() { + let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let r = _mm_maskz_xor_pd(0b01, a, b); + let e = _mm_set_pd(0.0, XOR_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_xor_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let src = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_mask_xor_pd(src, 0b0101, a, b); + let e = _mm256_set_pd(1., XOR_64, 3., XOR_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_xor_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let r = _mm256_maskz_xor_pd(0b0101, a, b); + let e = _mm256_set_pd(0.0, XOR_64, 0.0, XOR_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_xor_pd() { + let a = 
_mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_xor_pd(a, b); + let e = _mm512_set1_pd(XOR_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_xor_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let src = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_mask_xor_pd(src, 0b01010101, a, b); + let e = _mm512_set_pd(1., XOR_64, 3., XOR_64, 5., XOR_64, 7., XOR_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_xor_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_maskz_xor_pd(0b01010101, a, b); + let e = _mm512_set_pd(0.0, XOR_64, 0.0, XOR_64, 0.0, XOR_64, 0.0, XOR_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_xor_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let src = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_mask_xor_ps(src, 0b0101, a, b); + let e = _mm_set_ps(1., XOR_32, 3., XOR_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_xor_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let r = _mm_maskz_xor_ps(0b0101, a, b); + let e = _mm_set_ps(0.0, XOR_32, 0.0, XOR_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_xor_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let src = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_mask_xor_ps(src, 0b01010101, a, b); + let e = _mm256_set_ps(1., XOR_32, 3., XOR_32, 5., XOR_32, 7., XOR_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_xor_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let r = _mm256_maskz_xor_ps(0b01010101, a, b); + let e = _mm256_set_ps(0.0, XOR_32, 0.0, XOR_32, 0.0, XOR_32, 0.0, XOR_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_xor_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_xor_ps(a, b); + let e = _mm512_set1_ps(XOR_32); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_xor_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let src = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_mask_xor_ps(src, 0b0101010101010101, a, b); + let e = _mm512_set_ps( + 1., XOR_32, 3., XOR_32, 5., XOR_32, 7., XOR_32, 9., XOR_32, 11., XOR_32, 13., XOR_32, + 15., XOR_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_xor_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_maskz_xor_ps(0b0101010101010101, a, b); + let e = _mm512_set_ps( + 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., + XOR_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_broadcast_f32x2() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_broadcast_f32x2(a); + let e = _mm256_set_ps(3., 4., 3., 4., 3., 4., 3., 4.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_broadcast_f32x2() { + let a = _mm_set_ps(1., 
2., 3., 4.); + let b = _mm256_set_ps(5., 6., 7., 8., 9., 10., 11., 12.); + let r = _mm256_mask_broadcast_f32x2(b, 0b01101001, a); + let e = _mm256_set_ps(5., 4., 3., 8., 3., 10., 11., 4.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_broadcast_f32x2() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_maskz_broadcast_f32x2(0b01101001, a); + let e = _mm256_set_ps(0., 4., 3., 0., 3., 0., 0., 4.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_broadcast_f32x2() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm512_broadcast_f32x2(a); + let e = _mm512_set_ps( + 3., 4., 3., 4., 3., 4., 3., 4., 3., 4., 3., 4., 3., 4., 3., 4., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_broadcast_f32x2() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm512_set_ps( + 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., + ); + let r = _mm512_mask_broadcast_f32x2(b, 0b0110100100111100, a); + let e = _mm512_set_ps( + 5., 4., 3., 8., 3., 10., 11., 4., 13., 14., 3., 4., 3., 4., 19., 20., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_broadcast_f32x2() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm512_maskz_broadcast_f32x2(0b0110100100111100, a); + let e = _mm512_set_ps( + 0., 4., 3., 0., 3., 0., 0., 4., 0., 0., 3., 4., 3., 4., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_broadcast_f32x8() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_broadcast_f32x8(a); + let e = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 1., 2., 3., 4., 5., 6., 7., 8., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_broadcast_f32x8() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_ps( + 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., + ); + let r = _mm512_mask_broadcast_f32x8(b, 0b0110100100111100, a); + let e = _mm512_set_ps( + 9., 2., 3., 12., 5., 14., 15., 8., 17., 18., 3., 4., 5., 6., 23., 24., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_broadcast_f32x8() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_broadcast_f32x8(0b0110100100111100, a); + let e = _mm512_set_ps( + 0., 2., 3., 0., 5., 0., 0., 8., 0., 0., 3., 4., 5., 6., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_broadcast_f64x2() { + let a = _mm_set_pd(1., 2.); + let r = _mm256_broadcast_f64x2(a); + let e = _mm256_set_pd(1., 2., 1., 2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_broadcast_f64x2() { + let a = _mm_set_pd(1., 2.); + let b = _mm256_set_pd(3., 4., 5., 6.); + let r = _mm256_mask_broadcast_f64x2(b, 0b0110, a); + let e = _mm256_set_pd(3., 2., 1., 6.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_broadcast_f64x2() { + let a = _mm_set_pd(1., 2.); + let r = _mm256_maskz_broadcast_f64x2(0b0110, a); + let e = _mm256_set_pd(0., 2., 1., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_broadcast_f64x2() { + let a = _mm_set_pd(1., 2.); + let r = _mm512_broadcast_f64x2(a); + let e = _mm512_set_pd(1., 2., 1., 2., 1., 2., 1., 
2.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_broadcast_f64x2() { + let a = _mm_set_pd(1., 2.); + let b = _mm512_set_pd(3., 4., 5., 6., 7., 8., 9., 10.); + let r = _mm512_mask_broadcast_f64x2(b, 0b01101001, a); + let e = _mm512_set_pd(3., 2., 1., 6., 1., 8., 9., 2.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_broadcast_f64x2() { + let a = _mm_set_pd(1., 2.); + let r = _mm512_maskz_broadcast_f64x2(0b01101001, a); + let e = _mm512_set_pd(0., 2., 1., 0., 1., 0., 0., 2.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_broadcast_i32x2(a); + let e = _mm_set_epi32(3, 4, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(5, 6, 7, 8); + let r = _mm_mask_broadcast_i32x2(b, 0b0110, a); + let e = _mm_set_epi32(5, 4, 3, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_maskz_broadcast_i32x2(0b0110, a); + let e = _mm_set_epi32(0, 4, 3, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm256_broadcast_i32x2(a); + let e = _mm256_set_epi32(3, 4, 3, 4, 3, 4, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm256_set_epi32(5, 6, 7, 8, 9, 10, 11, 12); + let r = _mm256_mask_broadcast_i32x2(b, 0b01101001, a); + let e = _mm256_set_epi32(5, 4, 3, 8, 3, 10, 11, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm256_maskz_broadcast_i32x2(0b01101001, a); + let e = _mm256_set_epi32(0, 4, 3, 0, 3, 0, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm512_broadcast_i32x2(a); + let e = _mm512_set_epi32(3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm512_set_epi32(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20); + let r = _mm512_mask_broadcast_i32x2(b, 0b0110100100111100, a); + let e = _mm512_set_epi32(5, 4, 3, 8, 3, 10, 11, 4, 13, 14, 3, 4, 3, 4, 19, 20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm512_maskz_broadcast_i32x2(0b0110100100111100, a); + let e = _mm512_set_epi32(0, 4, 3, 0, 3, 0, 0, 4, 0, 0, 3, 4, 3, 4, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_broadcast_i32x8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_broadcast_i32x8(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_broadcast_i32x8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + 
let b = _mm512_set_epi32( + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + ); + let r = _mm512_mask_broadcast_i32x8(b, 0b0110100100111100, a); + let e = _mm512_set_epi32(9, 2, 3, 12, 5, 14, 15, 8, 17, 18, 3, 4, 5, 6, 23, 24); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_broadcast_i32x8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_broadcast_i32x8(0b0110100100111100, a); + let e = _mm512_set_epi32(0, 2, 3, 0, 5, 0, 0, 8, 0, 0, 3, 4, 5, 6, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_broadcast_i64x2() { + let a = _mm_set_epi64x(1, 2); + let r = _mm256_broadcast_i64x2(a); + let e = _mm256_set_epi64x(1, 2, 1, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_broadcast_i64x2() { + let a = _mm_set_epi64x(1, 2); + let b = _mm256_set_epi64x(3, 4, 5, 6); + let r = _mm256_mask_broadcast_i64x2(b, 0b0110, a); + let e = _mm256_set_epi64x(3, 2, 1, 6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_broadcast_i64x2() { + let a = _mm_set_epi64x(1, 2); + let r = _mm256_maskz_broadcast_i64x2(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_broadcast_i64x2() { + let a = _mm_set_epi64x(1, 2); + let r = _mm512_broadcast_i64x2(a); + let e = _mm512_set_epi64(1, 2, 1, 2, 1, 2, 1, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_broadcast_i64x2() { + let a = _mm_set_epi64x(1, 2); + let b = _mm512_set_epi64(3, 4, 5, 6, 7, 8, 9, 10); + let r = _mm512_mask_broadcast_i64x2(b, 0b01101001, a); + let e = _mm512_set_epi64(3, 2, 1, 6, 1, 8, 9, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_broadcast_i64x2() { + let a = _mm_set_epi64x(1, 2); + let r = _mm512_maskz_broadcast_i64x2(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 1, 0, 1, 0, 0, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_extractf32x8_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_extractf32x8_ps::<1>(a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_extractf32x8_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_mask_extractf32x8_ps::<1>(b, 0b01101001, a); + let e = _mm256_set_ps(17., 2., 3., 20., 5., 22., 23., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_extractf32x8_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_maskz_extractf32x8_ps::<1>(0b01101001, a); + let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_extractf64x2_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_extractf64x2_pd::<1>(a); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_extractf64x2_pd() { + let a = _mm256_set_pd(1., 2., 
3., 4.); + let b = _mm_set_pd(5., 6.); + let r = _mm256_mask_extractf64x2_pd::<1>(b, 0b01, a); + let e = _mm_set_pd(5., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_extractf64x2_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_maskz_extractf64x2_pd::<1>(0b01, a); + let e = _mm_set_pd(0., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_extractf64x2_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_extractf64x2_pd::<2>(a); + let e = _mm_set_pd(3., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_extractf64x2_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm_set_pd(9., 10.); + let r = _mm512_mask_extractf64x2_pd::<2>(b, 0b01, a); + let e = _mm_set_pd(9., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_extractf64x2_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_extractf64x2_pd::<2>(0b01, a); + let e = _mm_set_pd(0., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_extracti32x8_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_extracti32x8_epi32::<1>(a); + let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_extracti32x8_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_mask_extracti32x8_epi32::<1>(b, 0b01101001, a); + let e = _mm256_set_epi32(17, 2, 3, 20, 5, 22, 23, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_extracti32x8_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_extracti32x8_epi32::<1>(0b01101001, a); + let e = _mm256_set_epi32(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_extracti64x2_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_extracti64x2_epi64::<1>(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_extracti64x2_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm_set_epi64x(5, 6); + let r = _mm256_mask_extracti64x2_epi64::<1>(b, 0b01, a); + let e = _mm_set_epi64x(5, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_extracti64x2_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_maskz_extracti64x2_epi64::<1>(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_extracti64x2_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_extracti64x2_epi64::<2>(a); + let e = _mm_set_epi64x(3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_extracti64x2_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi64x(9, 10); + let r = _mm512_mask_extracti64x2_epi64::<2>(b, 0b01, a); + let e = _mm_set_epi64x(9, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe 
fn test_mm512_maskz_extracti64x2_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_extracti64x2_epi64::<2>(0b01, a); + let e = _mm_set_epi64x(0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_insertf32x8() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_insertf32x8::<1>(a, b); + let e = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 9., 10., 11., 12., 13., 14., 15., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_insertf32x8() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let src = _mm512_set_ps( + 25., 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., + ); + let r = _mm512_mask_insertf32x8::<1>(src, 0b0110100100111100, a, b); + let e = _mm512_set_ps( + 25., 18., 19., 28., 21., 30., 31., 24., 33., 34., 11., 12., 13., 14., 39., 40., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_insertf32x8() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_maskz_insertf32x8::<1>(0b0110100100111100, a, b); + let e = _mm512_set_ps( + 0., 18., 19., 0., 21., 0., 0., 24., 0., 0., 11., 12., 13., 14., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_insertf64x2() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm_set_pd(5., 6.); + let r = _mm256_insertf64x2::<1>(a, b); + let e = _mm256_set_pd(5., 6., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_insertf64x2() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm_set_pd(5., 6.); + let src = _mm256_set_pd(7., 8., 9., 10.); + let r = _mm256_mask_insertf64x2::<1>(src, 0b0110, a, b); + let e = _mm256_set_pd(7., 6., 3., 10.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_insertf64x2() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm_set_pd(5., 6.); + let r = _mm256_maskz_insertf64x2::<1>(0b0110, a, b); + let e = _mm256_set_pd(0., 6., 3., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_insertf64x2() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm_set_pd(9., 10.); + let r = _mm512_insertf64x2::<2>(a, b); + let e = _mm512_set_pd(1., 2., 9., 10., 5., 6., 7., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_insertf64x2() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm_set_pd(9., 10.); + let src = _mm512_set_pd(11., 12., 13., 14., 15., 16., 17., 18.); + let r = _mm512_mask_insertf64x2::<2>(src, 0b01101001, a, b); + let e = _mm512_set_pd(11., 2., 9., 14., 5., 16., 17., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_insertf64x2() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm_set_pd(9., 10.); + let r = _mm512_maskz_insertf64x2::<2>(0b01101001, a, b); + let e = _mm512_set_pd(0., 2., 9., 0., 5., 0., 0., 8.); + assert_eq_m512d(r, e); + } 
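(The fpclass doc comments earlier in this patch list the category bits that IMM8 can OR together. The sketch below is a minimal usage example and not part of the diff; it assumes a 1.89+ toolchain and runtime AVX512F/AVX512DQ support, and the input values and the 0x01 | 0x80 "any NaN" combination are illustrative only:)

// Not part of the diff: flag QNaN/SNaN lanes with _mm512_fpclass_ps_mask.
fn main() {
    #[cfg(target_arch = "x86_64")]
    if std::is_x86_feature_detected!("avx512f") && std::is_x86_feature_detected!("avx512dq") {
        use std::arch::x86_64::*;
        let k: __mmask16 = unsafe {
            let v = _mm512_set_ps(
                f32::NAN, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., f32::NAN,
            );
            // 0x01 = QNaN, 0x80 = SNaN, per the bit list in the doc comments above
            _mm512_fpclass_ps_mask::<{ 0x01 | 0x80 }>(v)
        };
        assert_eq!(k.count_ones(), 2); // the two NaN lanes are flagged
    }
}
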
+ + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_inserti32x8() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_inserti32x8::<1>(a, b); + let e = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 9, 10, 11, 12, 13, 14, 15, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_inserti32x8() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let src = _mm512_set_epi32( + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + ); + let r = _mm512_mask_inserti32x8::<1>(src, 0b0110100100111100, a, b); + let e = _mm512_set_epi32( + 25, 18, 19, 28, 21, 30, 31, 24, 33, 34, 11, 12, 13, 14, 39, 40, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_inserti32x8() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_maskz_inserti32x8::<1>(0b0110100100111100, a, b); + let e = _mm512_set_epi32(0, 18, 19, 0, 21, 0, 0, 24, 0, 0, 11, 12, 13, 14, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_inserti64x2() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm_set_epi64x(5, 6); + let r = _mm256_inserti64x2::<1>(a, b); + let e = _mm256_set_epi64x(5, 6, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_inserti64x2() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm_set_epi64x(5, 6); + let src = _mm256_set_epi64x(7, 8, 9, 10); + let r = _mm256_mask_inserti64x2::<1>(src, 0b0110, a, b); + let e = _mm256_set_epi64x(7, 6, 3, 10); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_inserti64x2() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm_set_epi64x(5, 6); + let r = _mm256_maskz_inserti64x2::<1>(0b0110, a, b); + let e = _mm256_set_epi64x(0, 6, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_inserti64x2() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi64x(9, 10); + let r = _mm512_inserti64x2::<2>(a, b); + let e = _mm512_set_epi64(1, 2, 9, 10, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_inserti64x2() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi64x(9, 10); + let src = _mm512_set_epi64(11, 12, 13, 14, 15, 16, 17, 18); + let r = _mm512_mask_inserti64x2::<2>(src, 0b01101001, a, b); + let e = _mm512_set_epi64(11, 2, 9, 14, 5, 16, 17, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_inserti64x2() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi64x(9, 10); + let r = _mm512_maskz_inserti64x2::<2>(0b01101001, a, b); + let e = _mm512_set_epi64(0, 2, 9, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundepi64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvt_roundepi64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn 
test_mm512_mask_cvt_roundepi64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_cvt_roundepi64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm512_set_pd(9., 2., 3., 12., 5., 14., 15., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundepi64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvt_roundepi64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm512_set_pd(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtepi64_pd() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_cvtepi64_pd(a); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtepi64_pd() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_cvtepi64_pd(b, 0b01, a); + let e = _mm_set_pd(3., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtepi64_pd() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_maskz_cvtepi64_pd(0b01, a); + let e = _mm_set_pd(0., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtepi64_pd() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_cvtepi64_pd(a); + let e = _mm256_set_pd(1., 2., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtepi64_pd() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_pd(5., 6., 7., 8.); + let r = _mm256_mask_cvtepi64_pd(b, 0b0110, a); + let e = _mm256_set_pd(5., 2., 3., 8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi64_pd() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_maskz_cvtepi64_pd(0b0110, a); + let e = _mm256_set_pd(0., 2., 3., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtepi64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvtepi64_pd(a); + let e = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtepi64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_cvtepi64_pd(b, 0b01101001, a); + let e = _mm512_set_pd(9., 2., 3., 12., 5., 14., 15., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtepi64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvtepi64_pd(0b01101001, a); + let e = _mm512_set_pd(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundepi64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvt_roundepi64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvt_roundepi64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let r = 
_mm512_mask_cvt_roundepi64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm256_set_ps(9., 2., 3., 12., 5., 14., 15., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundepi64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvt_roundepi64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtepi64_ps() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_cvtepi64_ps(a); + let e = _mm_set_ps(0., 0., 1., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtepi64_ps() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_ps(3., 4., 5., 6.); + let r = _mm_mask_cvtepi64_ps(b, 0b01, a); + let e = _mm_set_ps(0., 0., 5., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtepi64_ps() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_maskz_cvtepi64_ps(0b01, a); + let e = _mm_set_ps(0., 0., 0., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtepi64_ps() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_cvtepi64_ps(a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtepi64_ps() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm_set_ps(5., 6., 7., 8.); + let r = _mm256_mask_cvtepi64_ps(b, 0b0110, a); + let e = _mm_set_ps(5., 2., 3., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi64_ps() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_maskz_cvtepi64_ps(0b0110, a); + let e = _mm_set_ps(0., 2., 3., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtepi64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvtepi64_ps(a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtepi64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_cvtepi64_ps(b, 0b01101001, a); + let e = _mm256_set_ps(9., 2., 3., 12., 5., 14., 15., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtepi64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvtepi64_ps(0b01101001, a); + let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundepu64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvt_roundepu64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvt_roundepu64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_cvt_roundepu64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm512_set_pd(9., 2., 3., 12., 5., 14., 15., 
8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundepu64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvt_roundepu64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm512_set_pd(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtepu64_pd() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_cvtepu64_pd(a); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtepu64_pd() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_cvtepu64_pd(b, 0b01, a); + let e = _mm_set_pd(3., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtepu64_pd() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_maskz_cvtepu64_pd(0b01, a); + let e = _mm_set_pd(0., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtepu64_pd() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_cvtepu64_pd(a); + let e = _mm256_set_pd(1., 2., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtepu64_pd() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_pd(5., 6., 7., 8.); + let r = _mm256_mask_cvtepu64_pd(b, 0b0110, a); + let e = _mm256_set_pd(5., 2., 3., 8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu64_pd() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_maskz_cvtepu64_pd(0b0110, a); + let e = _mm256_set_pd(0., 2., 3., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtepu64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvtepu64_pd(a); + let e = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtepu64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_cvtepu64_pd(b, 0b01101001, a); + let e = _mm512_set_pd(9., 2., 3., 12., 5., 14., 15., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtepu64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvtepu64_pd(0b01101001, a); + let e = _mm512_set_pd(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundepu64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvt_roundepu64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvt_roundepu64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_cvt_roundepu64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm256_set_ps(9., 2., 3., 12., 5., 14., 15., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundepu64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + 
let r = _mm512_maskz_cvt_roundepu64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtepu64_ps() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_cvtepu64_ps(a); + let e = _mm_set_ps(0., 0., 1., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtepu64_ps() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_ps(3., 4., 5., 6.); + let r = _mm_mask_cvtepu64_ps(b, 0b01, a); + let e = _mm_set_ps(0., 0., 5., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtepu64_ps() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_maskz_cvtepu64_ps(0b01, a); + let e = _mm_set_ps(0., 0., 0., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtepu64_ps() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_cvtepu64_ps(a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtepu64_ps() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm_set_ps(5., 6., 7., 8.); + let r = _mm256_mask_cvtepu64_ps(b, 0b0110, a); + let e = _mm_set_ps(5., 2., 3., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu64_ps() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_maskz_cvtepu64_ps(0b0110, a); + let e = _mm_set_ps(0., 2., 3., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtepu64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvtepu64_ps(a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtepu64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_cvtepu64_ps(b, 0b01101001, a); + let e = _mm256_set_ps(9., 2., 3., 12., 5., 14., 15., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtepu64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvtepu64_ps(0b01101001, a); + let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvt_roundpd_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvt_roundpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvt_roundpd_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvt_roundpd_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 
8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtpd_epi64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_cvtpd_epi64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtpd_epi64() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_epi64x(3, 4); + let r = _mm_mask_cvtpd_epi64(b, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtpd_epi64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_maskz_cvtpd_epi64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtpd_epi64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_cvtpd_epi64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtpd_epi64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvtpd_epi64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtpd_epi64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_maskz_cvtpd_epi64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtpd_epi64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtpd_epi64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtpd_epi64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvt_roundps_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvt_roundps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvt_roundps_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvt_roundps_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = 
_mm_cvtps_epi64(a); + let e = _mm_set_epi64x(3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_epi64x(5, 6); + let r = _mm_mask_cvtps_epi64(b, 0b01, a); + let e = _mm_set_epi64x(5, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_maskz_cvtps_epi64(0b01, a); + let e = _mm_set_epi64x(0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_cvtps_epi64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvtps_epi64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_maskz_cvtps_epi64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtps_epi64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtps_epi64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtps_epi64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvt_roundpd_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvt_roundpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvt_roundpd_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvt_roundpd_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtpd_epu64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_cvtpd_epu64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtpd_epu64() { + let 
a = _mm_set_pd(1., 2.); + let b = _mm_set_epi64x(3, 4); + let r = _mm_mask_cvtpd_epu64(b, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtpd_epu64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_maskz_cvtpd_epu64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtpd_epu64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_cvtpd_epu64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtpd_epu64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvtpd_epu64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtpd_epu64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_maskz_cvtpd_epu64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtpd_epu64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtpd_epu64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtpd_epu64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvt_roundps_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvt_roundps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvt_roundps_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvt_roundps_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_cvtps_epu64(a); + let e = _mm_set_epi64x(3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_epi64x(5, 6); + let r = _mm_mask_cvtps_epu64(b, 0b01, a); + let e = _mm_set_epi64x(5, 4); + assert_eq_m128i(r, e); + } + + 
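+ // Note: _mm_set_ps lists lanes from high to low, so the two low f32 lanes of `a` hold 4.0 and 3.0; the 128-bit float-to-64-bit conversions read only those two lanes, which is why the expected vectors are built from 3 and 4.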
#[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_maskz_cvtps_epu64(0b01, a); + let e = _mm_set_epi64x(0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_cvtps_epu64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvtps_epu64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_maskz_cvtps_epu64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtps_epu64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtps_epu64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtps_epu64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtt_roundpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtt_roundpd_epi64::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtt_roundpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtt_roundpd_epi64::<_MM_FROUND_NO_EXC>(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtt_roundpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtt_roundpd_epi64::<_MM_FROUND_NO_EXC>(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvttpd_epi64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_cvttpd_epi64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvttpd_epi64() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_epi64x(3, 4); + let r = _mm_mask_cvttpd_epi64(b, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvttpd_epi64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_maskz_cvttpd_epi64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn 
test_mm256_cvttpd_epi64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_cvttpd_epi64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvttpd_epi64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvttpd_epi64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvttpd_epi64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_maskz_cvttpd_epi64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvttpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvttpd_epi64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvttpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvttpd_epi64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvttpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvttpd_epi64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtt_roundps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtt_roundps_epi64::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtt_roundps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtt_roundps_epi64::<_MM_FROUND_NO_EXC>(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtt_roundps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtt_roundps_epi64::<_MM_FROUND_NO_EXC>(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvttps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_cvttps_epi64(a); + let e = _mm_set_epi64x(3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvttps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_epi64x(5, 6); + let r = _mm_mask_cvttps_epi64(b, 0b01, a); + let e = _mm_set_epi64x(5, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvttps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_maskz_cvttps_epi64(0b01, a); + let e = _mm_set_epi64x(0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvttps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_cvttps_epi64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn 
test_mm256_mask_cvttps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvttps_epi64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvttps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_maskz_cvttps_epi64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvttps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvttps_epi64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvttps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvttps_epi64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvttps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvttps_epi64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtt_roundpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtt_roundpd_epu64::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtt_roundpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtt_roundpd_epu64::<_MM_FROUND_NO_EXC>(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtt_roundpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtt_roundpd_epu64::<_MM_FROUND_NO_EXC>(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvttpd_epu64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_cvttpd_epu64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvttpd_epu64() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_epi64x(3, 4); + let r = _mm_mask_cvttpd_epu64(b, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvttpd_epu64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_maskz_cvttpd_epu64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvttpd_epu64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_cvttpd_epu64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvttpd_epu64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvttpd_epu64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = 
"avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvttpd_epu64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_maskz_cvttpd_epu64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvttpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvttpd_epu64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvttpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvttpd_epu64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvttpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvttpd_epu64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtt_roundps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtt_roundps_epu64::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtt_roundps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtt_roundps_epu64::<_MM_FROUND_NO_EXC>(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtt_roundps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtt_roundps_epu64::<_MM_FROUND_NO_EXC>(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvttps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_cvttps_epu64(a); + let e = _mm_set_epi64x(3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvttps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_epi64x(5, 6); + let r = _mm_mask_cvttps_epu64(b, 0b01, a); + let e = _mm_set_epi64x(5, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvttps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_maskz_cvttps_epu64(0b01, a); + let e = _mm_set_epi64x(0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvttps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_cvttps_epu64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvttps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvttps_epu64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvttps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_maskz_cvttps_epu64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = 
"avx512dq")] + unsafe fn test_mm512_cvttps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvttps_epu64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvttps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvttps_epu64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvttps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvttps_epu64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mullo_epi64() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_epi64x(3, 4); + let r = _mm_mullo_epi64(a, b); + let e = _mm_set_epi64x(3, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_mullo_epi64() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_epi64x(3, 4); + let c = _mm_set_epi64x(5, 6); + let r = _mm_mask_mullo_epi64(c, 0b01, a, b); + let e = _mm_set_epi64x(5, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_mullo_epi64() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_epi64x(3, 4); + let r = _mm_maskz_mullo_epi64(0b01, a, b); + let e = _mm_set_epi64x(0, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mullo_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mullo_epi64(a, b); + let e = _mm256_set_epi64x(5, 12, 21, 32); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_mullo_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let c = _mm256_set_epi64x(9, 10, 11, 12); + let r = _mm256_mask_mullo_epi64(c, 0b0110, a, b); + let e = _mm256_set_epi64x(9, 12, 21, 12); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_mullo_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_maskz_mullo_epi64(0b0110, a, b); + let e = _mm256_set_epi64x(0, 12, 21, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mullo_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mullo_epi64(a, b); + let e = _mm512_set_epi64(9, 20, 33, 48, 65, 84, 105, 128); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_mullo_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let c = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_mask_mullo_epi64(c, 0b01101001, a, b); + let e = _mm512_set_epi64(17, 20, 33, 20, 65, 22, 23, 128); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_mullo_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_mullo_epi64(0b01101001, a, b); + let e = _mm512_set_epi64(0, 20, 33, 0, 65, 0, 0, 
128); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_cvtmask8_u32() { + let a: __mmask8 = 0b01101001; + let r = _cvtmask8_u32(a); + let e: u32 = 0b01101001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_cvtu32_mask8() { + let a: u32 = 0b01101001; + let r = _cvtu32_mask8(a); + let e: __mmask8 = 0b01101001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kadd_mask16() { + let a: __mmask16 = 27549; + let b: __mmask16 = 23434; + let r = _kadd_mask16(a, b); + let e: __mmask16 = 50983; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kadd_mask8() { + let a: __mmask8 = 98; + let b: __mmask8 = 117; + let r = _kadd_mask8(a, b); + let e: __mmask8 = 215; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kand_mask8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110011; + let r = _kand_mask8(a, b); + let e: __mmask8 = 0b00100001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kandn_mask8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110011; + let r = _kandn_mask8(a, b); + let e: __mmask8 = 0b10010010; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_knot_mask8() { + let a: __mmask8 = 0b01101001; + let r = _knot_mask8(a); + let e: __mmask8 = 0b10010110; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kor_mask8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110011; + let r = _kor_mask8(a, b); + let e: __mmask8 = 0b11111011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kxnor_mask8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110011; + let r = _kxnor_mask8(a, b); + let e: __mmask8 = 0b00100101; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kxor_mask8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110011; + let r = _kxor_mask8(a, b); + let e: __mmask8 = 0b11011010; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortest_mask8_u8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110110; + let mut all_ones: u8 = 0; + let r = _kortest_mask8_u8(a, b, &mut all_ones); + assert_eq!(r, 0); + assert_eq!(all_ones, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortestc_mask8_u8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110110; + let r = _kortestc_mask8_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortestz_mask8_u8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110110; + let r = _kortestz_mask8_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kshiftli_mask8() { + let a: __mmask8 = 0b01101001; + let r = _kshiftli_mask8::<3>(a); + let e: __mmask8 = 0b01001000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kshiftri_mask8() { + let a: __mmask8 = 0b01101001; + let r = _kshiftri_mask8::<3>(a); + let e: __mmask8 = 0b00001101; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_ktest_mask8_u8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10010110; + let mut and_not: u8 = 0; + let r = _ktest_mask8_u8(a, b, &mut and_not); + assert_eq!(r, 1); + assert_eq!(and_not, 0); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_ktestc_mask8_u8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10010110; 
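+ // b is the bitwise complement of a, so a & b == 0 while b itself is non-zero: the zero-flag results (_ktest_mask8_u8 return value, _ktestz_mask8_u8) are 1, while this carry-flag based _ktestc_mask8_u8 returns 0.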
+ let r = _ktestc_mask8_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_ktestz_mask8_u8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10010110; + let r = _ktestz_mask8_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_ktest_mask16_u8() { + let a: __mmask16 = 0b0110100100111100; + let b: __mmask16 = 0b1001011011000011; + let mut and_not: u8 = 0; + let r = _ktest_mask16_u8(a, b, &mut and_not); + assert_eq!(r, 1); + assert_eq!(and_not, 0); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_ktestc_mask16_u8() { + let a: __mmask16 = 0b0110100100111100; + let b: __mmask16 = 0b1001011011000011; + let r = _ktestc_mask16_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_ktestz_mask16_u8() { + let a: __mmask16 = 0b0110100100111100; + let b: __mmask16 = 0b1001011011000011; + let r = _ktestz_mask16_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_load_mask8() { + let a: __mmask8 = 0b01101001; + let r = _load_mask8(&a); + let e: __mmask8 = 0b01101001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_store_mask8() { + let a: __mmask8 = 0b01101001; + let mut r = 0; + _store_mask8(&mut r, a); + let e: __mmask8 = 0b01101001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_movepi32_mask() { + let a = _mm_set_epi32(0, -2, -3, 4); + let r = _mm_movepi32_mask(a); + let e = 0b0110; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_movepi32_mask() { + let a = _mm256_set_epi32(0, -2, -3, 4, -5, 6, 7, -8); + let r = _mm256_movepi32_mask(a); + let e = 0b01101001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_movepi32_mask() { + let a = _mm512_set_epi32( + 0, -2, -3, 4, -5, 6, 7, -8, 9, 10, -11, -12, -13, -14, 15, 16, + ); + let r = _mm512_movepi32_mask(a); + let e = 0b0110100100111100; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_movepi64_mask() { + let a = _mm_set_epi64x(0, -2); + let r = _mm_movepi64_mask(a); + let e = 0b01; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_movepi64_mask() { + let a = _mm256_set_epi64x(0, -2, -3, 4); + let r = _mm256_movepi64_mask(a); + let e = 0b0110; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_movepi64_mask() { + let a = _mm512_set_epi64(0, -2, -3, 4, -5, 6, 7, -8); + let r = _mm512_movepi64_mask(a); + let e = 0b01101001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_movm_epi32() { + let a = 0b0110; + let r = _mm_movm_epi32(a); + let e = _mm_set_epi32(0, -1, -1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_movm_epi32() { + let a = 0b01101001; + let r = _mm256_movm_epi32(a); + let e = _mm256_set_epi32(0, -1, -1, 0, -1, 0, 0, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_movm_epi32() { + let a = 0b0110100100111100; + let r = _mm512_movm_epi32(a); + let e = _mm512_set_epi32(0, -1, -1, 0, -1, 0, 0, -1, 0, 0, -1, -1, -1, -1, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_movm_epi64() { + let a = 0b01; + let r = _mm_movm_epi64(a); + let e = _mm_set_epi64x(0, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = 
"avx512dq,avx512vl")] + unsafe fn test_mm256_movm_epi64() { + let a = 0b0110; + let r = _mm256_movm_epi64(a); + let e = _mm256_set_epi64x(0, -1, -1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_movm_epi64() { + let a = 0b01101001; + let r = _mm512_movm_epi64(a); + let e = _mm512_set_epi64(0, -1, -1, 0, -1, 0, 0, -1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_range_round_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); + let r = _mm512_range_round_pd::<0b0101, _MM_FROUND_NO_EXC>(a, b); + let e = _mm512_set_pd(2., 2., 4., 4., 6., 6., 8., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_range_round_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); + let c = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_range_round_pd::<0b0101, _MM_FROUND_NO_EXC>(c, 0b01101001, a, b); + let e = _mm512_set_pd(9., 2., 4., 12., 6., 14., 15., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_range_round_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); + let r = _mm512_maskz_range_round_pd::<0b0101, _MM_FROUND_NO_EXC>(0b01101001, a, b); + let e = _mm512_set_pd(0., 2., 4., 0., 6., 0., 0., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_range_pd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(2., 1.); + let r = _mm_range_pd::<0b0101>(a, b); + let e = _mm_set_pd(2., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_range_pd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(2., 1.); + let c = _mm_set_pd(3., 4.); + let r = _mm_mask_range_pd::<0b0101>(c, 0b01, a, b); + let e = _mm_set_pd(3., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_range_pd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(2., 1.); + let r = _mm_maskz_range_pd::<0b0101>(0b01, a, b); + let e = _mm_set_pd(0., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_range_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_pd(2., 1., 4., 3.); + let r = _mm256_range_pd::<0b0101>(a, b); + let e = _mm256_set_pd(2., 2., 4., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_range_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_pd(2., 1., 4., 3.); + let c = _mm256_set_pd(5., 6., 7., 8.); + let r = _mm256_mask_range_pd::<0b0101>(c, 0b0110, a, b); + let e = _mm256_set_pd(5., 2., 4., 8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_range_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_pd(2., 1., 4., 3.); + let r = _mm256_maskz_range_pd::<0b0101>(0b0110, a, b); + let e = _mm256_set_pd(0., 2., 4., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_range_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); + let r = _mm512_range_pd::<0b0101>(a, b); + let e = _mm512_set_pd(2., 2., 4., 4., 6., 6., 8., 8.); + 
assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_range_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); + let c = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_range_pd::<0b0101>(c, 0b01101001, a, b); + let e = _mm512_set_pd(9., 2., 4., 12., 6., 14., 15., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_range_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); + let r = _mm512_maskz_range_pd::<0b0101>(0b01101001, a, b); + let e = _mm512_set_pd(0., 2., 4., 0., 6., 0., 0., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_range_round_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., + ); + let r = _mm512_range_round_ps::<0b0101, _MM_FROUND_NO_EXC>(a, b); + let e = _mm512_set_ps( + 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_range_round_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., + ); + let c = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = + _mm512_mask_range_round_ps::<0b0101, _MM_FROUND_NO_EXC>(c, 0b0110100100111100, a, b); + let e = _mm512_set_ps( + 17., 2., 4., 20., 6., 22., 23., 8., 25., 26., 12., 12., 14., 14., 31., 32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_range_round_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., + ); + let r = _mm512_maskz_range_round_ps::<0b0101, _MM_FROUND_NO_EXC>(0b0110100100111100, a, b); + let e = _mm512_set_ps( + 0., 2., 4., 0., 6., 0., 0., 8., 0., 0., 12., 12., 14., 14., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_range_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(2., 1., 4., 3.); + let r = _mm_range_ps::<0b0101>(a, b); + let e = _mm_set_ps(2., 2., 4., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_range_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(2., 1., 4., 3.); + let c = _mm_set_ps(5., 6., 7., 8.); + let r = _mm_mask_range_ps::<0b0101>(c, 0b0110, a, b); + let e = _mm_set_ps(5., 2., 4., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_range_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(2., 1., 4., 3.); + let r = _mm_maskz_range_ps::<0b0101>(0b0110, a, b); + let e = _mm_set_ps(0., 2., 4., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_range_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(2., 1., 4., 3., 6., 5., 8., 7.); + let r = _mm256_range_ps::<0b0101>(a, b); + let e = _mm256_set_ps(2., 2., 4., 
4., 6., 6., 8., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_range_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(2., 1., 4., 3., 6., 5., 8., 7.); + let c = _mm256_set_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm256_mask_range_ps::<0b0101>(c, 0b01101001, a, b); + let e = _mm256_set_ps(9., 2., 4., 12., 6., 14., 15., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_range_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(2., 1., 4., 3., 6., 5., 8., 7.); + let r = _mm256_maskz_range_ps::<0b0101>(0b01101001, a, b); + let e = _mm256_set_ps(0., 2., 4., 0., 6., 0., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_range_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., + ); + let r = _mm512_range_ps::<0b0101>(a, b); + let e = _mm512_set_ps( + 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_range_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., + ); + let c = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_mask_range_ps::<0b0101>(c, 0b0110100100111100, a, b); + let e = _mm512_set_ps( + 17., 2., 4., 20., 6., 22., 23., 8., 25., 26., 12., 12., 14., 14., 31., 32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_range_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., + ); + let r = _mm512_maskz_range_ps::<0b0101>(0b0110100100111100, a, b); + let e = _mm512_set_ps( + 0., 2., 4., 0., 6., 0., 0., 8., 0., 0., 12., 12., 14., 14., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_range_round_sd() { + let a = _mm_set_sd(1.); + let b = _mm_set_sd(2.); + let r = _mm_range_round_sd::<0b0101, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_set_sd(2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_range_round_sd() { + let a = _mm_set_sd(1.); + let b = _mm_set_sd(2.); + let c = _mm_set_sd(3.); + let r = _mm_mask_range_round_sd::<0b0101, _MM_FROUND_NO_EXC>(c, 0b0, a, b); + let e = _mm_set_sd(3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_range_round_sd() { + let a = _mm_set_sd(1.); + let b = _mm_set_sd(2.); + let r = _mm_maskz_range_round_sd::<0b0101, _MM_FROUND_NO_EXC>(0b0, a, b); + let e = _mm_set_sd(0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_range_sd() { + let a = _mm_set_sd(1.); + let b = _mm_set_sd(2.); + let c = _mm_set_sd(3.); + let r = _mm_mask_range_sd::<0b0101>(c, 0b0, a, b); + let e = _mm_set_sd(3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_range_sd() { + let a = _mm_set_sd(1.); + let b = 
_mm_set_sd(2.); + let r = _mm_maskz_range_sd::<0b0101>(0b0, a, b); + let e = _mm_set_sd(0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_range_round_ss() { + let a = _mm_set_ss(1.); + let b = _mm_set_ss(2.); + let r = _mm_range_round_ss::<0b0101, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_set_ss(2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_range_round_ss() { + let a = _mm_set_ss(1.); + let b = _mm_set_ss(2.); + let c = _mm_set_ss(3.); + let r = _mm_mask_range_round_ss::<0b0101, _MM_FROUND_NO_EXC>(c, 0b0, a, b); + let e = _mm_set_ss(3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_range_round_ss() { + let a = _mm_set_ss(1.); + let b = _mm_set_ss(2.); + let r = _mm_maskz_range_round_ss::<0b0101, _MM_FROUND_NO_EXC>(0b0, a, b); + let e = _mm_set_ss(0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_range_ss() { + let a = _mm_set_ss(1.); + let b = _mm_set_ss(2.); + let c = _mm_set_ss(3.); + let r = _mm_mask_range_ss::<0b0101>(c, 0b0, a, b); + let e = _mm_set_ss(3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_range_ss() { + let a = _mm_set_ss(1.); + let b = _mm_set_ss(2.); + let r = _mm_maskz_range_ss::<0b0101>(0b0, a, b); + let e = _mm_set_ss(0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_reduce_round_pd() { + let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let r = _mm512_reduce_round_pd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a); + let e = _mm512_set_pd(0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_reduce_round_pd() { + let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let src = _mm512_set_pd(3., 4., 5., 6., 7., 8., 9., 10.); + let r = _mm512_mask_reduce_round_pd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + src, 0b01101001, a, + ); + let e = _mm512_set_pd(3., 0., 0.25, 6., 0.25, 8., 9., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_reduce_round_pd() { + let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let r = _mm512_maskz_reduce_round_pd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + 0b01101001, a, + ); + let e = _mm512_set_pd(0., 0., 0.25, 0., 0.25, 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_reduce_pd() { + let a = _mm_set_pd(0.25, 0.50); + let r = _mm_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm_set_pd(0.25, 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_reduce_pd() { + let a = _mm_set_pd(0.25, 0.50); + let src = _mm_set_pd(3., 4.); + let r = _mm_mask_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01, a); + let e = _mm_set_pd(3., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_reduce_pd() { + let a = _mm_set_pd(0.25, 0.50); + let r = _mm_maskz_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01, a); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_reduce_pd() { + let a = _mm256_set_pd(0.25, 0.50, 0.75, 1.0); + let r = _mm256_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm256_set_pd(0.25, 0., 0.25, 0.); + 
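+ // The reduce imm8 packs the fraction-bit count in its upper nibble (16 => M = 1) and a rounding mode in its low bits; with _MM_FROUND_TO_ZERO each lane becomes a - trunc(a * 2^M) / 2^M, i.e. the remainder left after rounding toward zero to a multiple of 0.5.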
assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_reduce_pd() { + let a = _mm256_set_pd(0.25, 0.50, 0.75, 1.0); + let src = _mm256_set_pd(3., 4., 5., 6.); + let r = _mm256_mask_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0110, a); + let e = _mm256_set_pd(3., 0., 0.25, 6.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_reduce_pd() { + let a = _mm256_set_pd(0.25, 0.50, 0.75, 1.0); + let r = _mm256_maskz_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0110, a); + let e = _mm256_set_pd(0., 0., 0.25, 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_reduce_pd() { + let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let r = _mm512_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm512_set_pd(0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_reduce_pd() { + let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let src = _mm512_set_pd(3., 4., 5., 6., 7., 8., 9., 10.); + let r = _mm512_mask_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01101001, a); + let e = _mm512_set_pd(3., 0., 0.25, 6., 0.25, 8., 9., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_reduce_pd() { + let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let r = _mm512_maskz_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01101001, a); + let e = _mm512_set_pd(0., 0., 0.25, 0., 0.25, 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_reduce_round_ps() { + let a = _mm512_set_ps( + 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, + 4.0, + ); + let r = _mm512_reduce_round_ps::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a); + let e = _mm512_set_ps( + 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_reduce_round_ps() { + let a = _mm512_set_ps( + 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, + 4.0, + ); + let src = _mm512_set_ps( + 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., + ); + let r = _mm512_mask_reduce_round_ps::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + src, + 0b0110100100111100, + a, + ); + let e = _mm512_set_ps( + 5., 0., 0.25, 8., 0.25, 10., 11., 0., 13., 14., 0.25, 0., 0.25, 0., 19., 20., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_reduce_round_ps() { + let a = _mm512_set_ps( + 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, + 4.0, + ); + let r = _mm512_maskz_reduce_round_ps::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + 0b0110100100111100, + a, + ); + let e = _mm512_set_ps( + 0., 0., 0.25, 0., 0.25, 0., 0., 0., 0., 0., 0.25, 0., 0.25, 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_reduce_ps() { + let a = _mm_set_ps(0.25, 0.50, 0.75, 1.0); + let r = _mm_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm_set_ps(0.25, 0., 0.25, 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_reduce_ps() { + let a = _mm_set_ps(0.25, 0.50, 0.75, 1.0); + let src = _mm_set_ps(2., 
3., 4., 5.); + let r = _mm_mask_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0110, a); + let e = _mm_set_ps(2., 0., 0.25, 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_reduce_ps() { + let a = _mm_set_ps(0.25, 0.50, 0.75, 1.0); + let r = _mm_maskz_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0110, a); + let e = _mm_set_ps(0., 0., 0.25, 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_reduce_ps() { + let a = _mm256_set_ps(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let r = _mm256_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm256_set_ps(0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_reduce_ps() { + let a = _mm256_set_ps(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let src = _mm256_set_ps(3., 4., 5., 6., 7., 8., 9., 10.); + let r = _mm256_mask_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01101001, a); + let e = _mm256_set_ps(3., 0., 0.25, 6., 0.25, 8., 9., 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_reduce_ps() { + let a = _mm256_set_ps(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let r = _mm256_maskz_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01101001, a); + let e = _mm256_set_ps(0., 0., 0.25, 0., 0.25, 0., 0., 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_reduce_ps() { + let a = _mm512_set_ps( + 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, + 4.0, + ); + let r = _mm512_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm512_set_ps( + 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_reduce_ps() { + let a = _mm512_set_ps( + 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, + 4.0, + ); + let src = _mm512_set_ps( + 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., + ); + let r = _mm512_mask_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0110100100111100, a); + let e = _mm512_set_ps( + 5., 0., 0.25, 8., 0.25, 10., 11., 0., 13., 14., 0.25, 0., 0.25, 0., 19., 20., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_reduce_ps() { + let a = _mm512_set_ps( + 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, + 4.0, + ); + let r = _mm512_maskz_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0110100100111100, a); + let e = _mm512_set_ps( + 0., 0., 0.25, 0., 0.25, 0., 0., 0., 0., 0., 0.25, 0., 0.25, 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_reduce_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_sd(0.25); + let r = _mm_reduce_round_sd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_reduce_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_sd(0.25); + let c = _mm_set_pd(3., 4.); + let r = _mm_mask_reduce_round_sd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + c, 0b0, a, b, + ); + let e = _mm_set_pd(1., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_reduce_round_sd() { + let a = 
_mm_set_pd(1., 2.); + let b = _mm_set_sd(0.25); + let r = + _mm_maskz_reduce_round_sd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0b0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_reduce_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_sd(0.25); + let r = _mm_reduce_sd::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_reduce_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_sd(0.25); + let c = _mm_set_pd(3., 4.); + let r = _mm_mask_reduce_sd::<{ 16 | _MM_FROUND_TO_ZERO }>(c, 0b0, a, b); + let e = _mm_set_pd(1., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_reduce_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_sd(0.25); + let r = _mm_maskz_reduce_sd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_reduce_round_ss() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ss(0.25); + let r = _mm_reduce_round_ss::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_set_ps(1., 2., 3., 0.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_reduce_round_ss() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ss(0.25); + let c = _mm_set_ps(5., 6., 7., 8.); + let r = _mm_mask_reduce_round_ss::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + c, 0b0, a, b, + ); + let e = _mm_set_ps(1., 2., 3., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_reduce_round_ss() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ss(0.25); + let r = + _mm_maskz_reduce_round_ss::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0b0, a, b); + let e = _mm_set_ps(1., 2., 3., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_reduce_ss() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ss(0.25); + let r = _mm_reduce_ss::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b); + let e = _mm_set_ps(1., 2., 3., 0.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_reduce_ss() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ss(0.25); + let c = _mm_set_ps(5., 6., 7., 8.); + let r = _mm_mask_reduce_ss::<{ 16 | _MM_FROUND_TO_ZERO }>(c, 0b0, a, b); + let e = _mm_set_ps(1., 2., 3., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_reduce_ss() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ss(0.25); + let r = _mm_maskz_reduce_ss::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0, a, b); + let e = _mm_set_ps(1., 2., 3., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_fpclass_pd_mask() { + let a = _mm_set_pd(1., f64::INFINITY); + let r = _mm_fpclass_pd_mask::<0x18>(a); + let e = 0b01; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_fpclass_pd_mask() { + let a = _mm_set_pd(1., f64::INFINITY); + let r = _mm_mask_fpclass_pd_mask::<0x18>(0b10, a); + let e = 0b00; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_fpclass_pd_mask() { + let a = _mm256_set_pd(1., f64::INFINITY, f64::NEG_INFINITY, 0.0); + let r = _mm256_fpclass_pd_mask::<0x18>(a); + let e = 0b0110; + assert_eq!(r, e); + 
} + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_fpclass_pd_mask() { + let a = _mm256_set_pd(1., f64::INFINITY, f64::NEG_INFINITY, 0.0); + let r = _mm256_mask_fpclass_pd_mask::<0x18>(0b1010, a); + let e = 0b0010; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_fpclass_pd_mask() { + let a = _mm512_set_pd( + 1., + f64::INFINITY, + f64::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f64::NAN, + 1.0e-308, + ); + let r = _mm512_fpclass_pd_mask::<0x18>(a); + let e = 0b01100000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_fpclass_pd_mask() { + let a = _mm512_set_pd( + 1., + f64::INFINITY, + f64::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f64::NAN, + 1.0e-308, + ); + let r = _mm512_mask_fpclass_pd_mask::<0x18>(0b10101010, a); + let e = 0b00100000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_fpclass_ps_mask() { + let a = _mm_set_ps(1., f32::INFINITY, f32::NEG_INFINITY, 0.0); + let r = _mm_fpclass_ps_mask::<0x18>(a); + let e = 0b0110; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_fpclass_ps_mask() { + let a = _mm_set_ps(1., f32::INFINITY, f32::NEG_INFINITY, 0.0); + let r = _mm_mask_fpclass_ps_mask::<0x18>(0b1010, a); + let e = 0b0010; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_fpclass_ps_mask() { + let a = _mm256_set_ps( + 1., + f32::INFINITY, + f32::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f32::NAN, + 1.0e-38, + ); + let r = _mm256_fpclass_ps_mask::<0x18>(a); + let e = 0b01100000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_fpclass_ps_mask() { + let a = _mm256_set_ps( + 1., + f32::INFINITY, + f32::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f32::NAN, + 1.0e-38, + ); + let r = _mm256_mask_fpclass_ps_mask::<0x18>(0b10101010, a); + let e = 0b00100000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_fpclass_ps_mask() { + let a = _mm512_set_ps( + 1., + f32::INFINITY, + f32::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f32::NAN, + 1.0e-38, + -1., + f32::NEG_INFINITY, + f32::INFINITY, + -0.0, + 0.0, + 2.0, + f32::NAN, + -1.0e-38, + ); + let r = _mm512_fpclass_ps_mask::<0x18>(a); + let e = 0b0110000001100000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_fpclass_ps_mask() { + let a = _mm512_set_ps( + 1., + f32::INFINITY, + f32::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f32::NAN, + 1.0e-38, + -1., + f32::NEG_INFINITY, + f32::INFINITY, + -0.0, + 0.0, + 2.0, + f32::NAN, + -1.0e-38, + ); + let r = _mm512_mask_fpclass_ps_mask::<0x18>(0b1010101010101010, a); + let e = 0b0010000000100000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_fpclass_sd_mask() { + let a = _mm_set_pd(1., f64::INFINITY); + let r = _mm_fpclass_sd_mask::<0x18>(a); + let e = 0b1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_fpclass_sd_mask() { + let a = _mm_set_sd(f64::INFINITY); + let r = _mm_mask_fpclass_sd_mask::<0x18>(0b0, a); + let e = 0b0; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_fpclass_ss_mask() { + let a = _mm_set_ss(f32::INFINITY); + let r = _mm_fpclass_ss_mask::<0x18>(a); + let e = 0b1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_fpclass_ss_mask() { + let a = _mm_set_ss(f32::INFINITY); + let r = 
_mm_mask_fpclass_ss_mask::<0x18>(0b0, a); + let e = 0b0; + assert_eq!(r, e); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs new file mode 100644 index 0000000000000..dd224616764d6 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs @@ -0,0 +1,60683 @@ +use crate::{ + arch::asm, + core_arch::{simd::*, x86::*}, + intrinsics::simd::*, + intrinsics::{fmaf32, fmaf64}, + mem, ptr, +}; + +use core::hint::unreachable_unchecked; +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Computes the absolute values of packed 32-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi32&expand=39) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub fn _mm512_abs_epi32(a: __m512i) -> __m512i { + unsafe { + let a = a.as_i32x16(); + let r = simd_select::(simd_lt(a, i32x16::ZERO), simd_neg(a), a); + transmute(r) + } +} + +/// Computes the absolute value of packed 32-bit integers in `a`, and store the +/// unsigned results in `dst` using writemask `k` (elements are copied from +/// `src` when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_epi32&expand=40) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub fn _mm512_mask_abs_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, abs, src.as_i32x16())) + } +} + +/// Computes the absolute value of packed 32-bit integers in `a`, and store the +/// unsigned results in `dst` using zeromask `k` (elements are zeroed out when +/// the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_abs_epi32&expand=41) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, abs, i32x16::ZERO)) + } +} + +/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_abs_epi32&expand=37) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub fn _mm256_mask_abs_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, abs, src.as_i32x8())) + } +} + +/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
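// A minimal usage sketch of the _mm512_*abs_epi32 family above (not from the upstream
// stdarch file); mask values and expected lanes are illustrative, and it assumes the
// caller only reaches it on an AVX-512F CPU (e.g. after is_x86_feature_detected!("avx512f")).
#[target_feature(enable = "avx512f")]
unsafe fn sketch_mask_abs_epi32() {
    let a = _mm512_set1_epi32(-7);
    let src = _mm512_set1_epi32(100);
    let all = _mm512_abs_epi32(a); // every lane -> 7
    let merged = _mm512_mask_abs_epi32(src, 0x00FF, a); // lanes 0..8 -> 7, lanes 8..16 keep 100
    let zeroed = _mm512_maskz_abs_epi32(0x00FF, a); // lanes 0..8 -> 7, lanes 8..16 -> 0
    let _ = (all, merged, zeroed);
}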
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_abs_epi32&expand=38) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub fn _mm256_maskz_abs_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, abs, i32x8::ZERO)) + } +} + +/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_abs_epi32&expand=34) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub fn _mm_mask_abs_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, abs, src.as_i32x4())) + } +} + +/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_abs_epi32&expand=35) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub fn _mm_maskz_abs_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, abs, i32x4::ZERO)) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi64&expand=48) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm512_abs_epi64(a: __m512i) -> __m512i { + unsafe { + let a = a.as_i64x8(); + let r = simd_select::(simd_lt(a, i64x8::ZERO), simd_neg(a), a); + transmute(r) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_epi64&expand=49) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm512_mask_abs_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, abs, src.as_i64x8())) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
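// Usage sketch (not from the upstream file): the 128-bit variant above additionally needs
// AVX-512VL, while the 64-bit-element absolute value works on full __m512i vectors with
// plain AVX-512F. Constants are illustrative.
#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn sketch_abs_epi32_vl_and_abs_epi64() {
    let small = _mm_set1_epi32(-3);
    let low_two = _mm_maskz_abs_epi32(0b0011, small); // lanes 0..2 -> 3, lanes 2..4 -> 0
    let wide = _mm512_set1_epi64(i64::MIN + 1);
    let abs_wide = _mm512_abs_epi64(wide); // every lane -> i64::MAX
    let _ = (low_two, abs_wide);
}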
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_abs_epi64&expand=50) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm512_maskz_abs_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, abs, i64x8::ZERO)) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi64&expand=45) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm256_abs_epi64(a: __m256i) -> __m256i { + unsafe { + let a = a.as_i64x4(); + let r = simd_select::(simd_lt(a, i64x4::ZERO), simd_neg(a), a); + transmute(r) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_abs_epi64&expand=46) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm256_mask_abs_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, abs, src.as_i64x4())) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_abs_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm256_maskz_abs_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, abs, i64x4::ZERO)) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm_abs_epi64(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i64x2(); + let r = simd_select::(simd_lt(a, i64x2::ZERO), simd_neg(a), a); + transmute(r) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
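// Usage sketch (not from the upstream file): the 256- and 128-bit vpabsq forms above
// require AVX-512F plus AVX-512VL; AVX2 on its own has no 64-bit integer absolute value.
#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn sketch_abs_epi64_vl() {
    let a256 = _mm256_set1_epi64x(-42);
    let a128 = _mm_set1_epi64x(-42);
    let merged = _mm256_mask_abs_epi64(a256, 0b0001, a256); // lane 0 -> 42, lanes 1..4 stay -42
    let zeroed = _mm_maskz_abs_epi64(0b01, a128); // lane 0 -> 42, lane 1 -> 0
    let _ = (merged, zeroed);
}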
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_abs_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm_mask_abs_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, abs, src.as_i64x2())) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_abs_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm_maskz_abs_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, abs, i64x2::ZERO)) + } +} + +/// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_ps&expand=65) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm512_abs_ps(v2: __m512) -> __m512 { + unsafe { simd_fabs(v2) } +} + +/// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_ps&expand=66) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm512_mask_abs_ps(src: __m512, k: __mmask16, v2: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, simd_fabs(v2), src) } +} + +/// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_pd&expand=60) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm512_abs_pd(v2: __m512d) -> __m512d { + unsafe { simd_fabs(v2) } +} + +/// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
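// Usage sketch (not from the upstream file) for the floating-point absolute values above:
// as the vpandd/vpandq assert_instr hints, they only clear the sign bit, so -0.0 becomes
// +0.0 and a NaN keeps its payload. Values are illustrative.
#[target_feature(enable = "avx512f")]
unsafe fn sketch_abs_ps_pd() {
    let v = _mm512_set1_ps(-0.0);
    let pos = _mm512_abs_ps(v); // every lane -> +0.0
    let src = _mm512_set1_pd(9.0);
    let d = _mm512_set1_pd(-2.5);
    let merged = _mm512_mask_abs_pd(src, 0b0000_1111, d); // lanes 0..4 -> 2.5, lanes 4..8 keep 9.0
    let _ = (pos, merged);
}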
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_pd&expand=61) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm512_mask_abs_pd(src: __m512d, k: __mmask8, v2: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, simd_fabs(v2), src) } +} + +/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_epi32&expand=3801) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub fn _mm512_mask_mov_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i32x16(); + transmute(simd_select_bitmask(k, mov, src.as_i32x16())) + } +} + +/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_epi32&expand=3802) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub fn _mm512_maskz_mov_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i32x16(); + transmute(simd_select_bitmask(k, mov, i32x16::ZERO)) + } +} + +/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_epi32&expand=3799) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub fn _mm256_mask_mov_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i32x8(); + transmute(simd_select_bitmask(k, mov, src.as_i32x8())) + } +} + +/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_epi32&expand=3800) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub fn _mm256_maskz_mov_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i32x8(); + transmute(simd_select_bitmask(k, mov, i32x8::ZERO)) + } +} + +/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
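// Usage sketch (not from the upstream file): the masked "mov" intrinsics above are
// per-lane blends; a writemask merges two vectors, a zeromask selects-or-zeroes.
#[target_feature(enable = "avx512f")]
unsafe fn sketch_mask_mov_epi32() {
    let a = _mm512_set1_epi32(1);
    let src = _mm512_set1_epi32(2);
    let blended = _mm512_mask_mov_epi32(src, 0b1010_1010_1010_1010, a); // odd lanes 1, even lanes 2
    let sparse = _mm512_maskz_mov_epi32(0x000F, a); // lanes 0..4 -> 1, lanes 4..16 -> 0
    let _ = (blended, sparse);
}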
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_epi32&expand=3797) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub fn _mm_mask_mov_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i32x4(); + transmute(simd_select_bitmask(k, mov, src.as_i32x4())) + } +} + +/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_epi32&expand=3798) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub fn _mm_maskz_mov_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i32x4(); + transmute(simd_select_bitmask(k, mov, i32x4::ZERO)) + } +} + +/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_epi64&expand=3807) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub fn _mm512_mask_mov_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i64x8(); + transmute(simd_select_bitmask(k, mov, src.as_i64x8())) + } +} + +/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_epi64&expand=3808) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub fn _mm512_maskz_mov_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i64x8(); + transmute(simd_select_bitmask(k, mov, i64x8::ZERO)) + } +} + +/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_epi64&expand=3805) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub fn _mm256_mask_mov_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i64x4(); + transmute(simd_select_bitmask(k, mov, src.as_i64x4())) + } +} + +/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
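// Usage sketch (not from the upstream file): with a runtime mask, the 64-bit masked move
// above acts as a select(mask, a, src) over the eight i64 lanes.
#[target_feature(enable = "avx512f")]
unsafe fn sketch_mask_mov_epi64(mask: __mmask8) {
    let ones = _mm512_set1_epi64(1);
    let zeros = _mm512_set1_epi64(0);
    let selected = _mm512_mask_mov_epi64(zeros, mask, ones); // 1 where the mask bit is set, else 0
    let _ = selected;
}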
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_epi64&expand=3806) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub fn _mm256_maskz_mov_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i64x4(); + transmute(simd_select_bitmask(k, mov, i64x4::ZERO)) + } +} + +/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_epi64&expand=3803) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub fn _mm_mask_mov_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i64x2(); + transmute(simd_select_bitmask(k, mov, src.as_i64x2())) + } +} + +/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_epi64&expand=3804) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub fn _mm_maskz_mov_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i64x2(); + transmute(simd_select_bitmask(k, mov, i64x2::ZERO)) + } +} + +/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_ps&expand=3825) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub fn _mm512_mask_mov_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov = a.as_f32x16(); + transmute(simd_select_bitmask(k, mov, src.as_f32x16())) + } +} + +/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_ps&expand=3826) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub fn _mm512_maskz_mov_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov = a.as_f32x16(); + transmute(simd_select_bitmask(k, mov, f32x16::ZERO)) + } +} + +/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
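// Usage sketch (not from the upstream file) for the single-precision masked moves above:
// the same blend idea as the integer forms, one __mmask16 bit per 32-bit lane.
#[target_feature(enable = "avx512f")]
unsafe fn sketch_mask_mov_ps() {
    let a = _mm512_set1_ps(1.5);
    let src = _mm512_set1_ps(-1.0);
    let merged = _mm512_mask_mov_ps(src, 0xFF00, a); // lanes 8..16 -> 1.5, lanes 0..8 keep -1.0
    let zeroed = _mm512_maskz_mov_ps(0xFF00, a); // lanes 8..16 -> 1.5, lanes 0..8 -> 0.0
    let _ = (merged, zeroed);
}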
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_ps&expand=3823) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub fn _mm256_mask_mov_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = a.as_f32x8(); + transmute(simd_select_bitmask(k, mov, src.as_f32x8())) + } +} + +/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_ps&expand=3824) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub fn _mm256_maskz_mov_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = a.as_f32x8(); + transmute(simd_select_bitmask(k, mov, f32x8::ZERO)) + } +} + +/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_ps&expand=3821) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub fn _mm_mask_mov_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = a.as_f32x4(); + transmute(simd_select_bitmask(k, mov, src.as_f32x4())) + } +} + +/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_ps&expand=3822) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub fn _mm_maskz_mov_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = a.as_f32x4(); + transmute(simd_select_bitmask(k, mov, f32x4::ZERO)) + } +} + +/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_pd&expand=3819) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub fn _mm512_mask_mov_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { + let mov = a.as_f64x8(); + transmute(simd_select_bitmask(k, mov, src.as_f64x8())) + } +} + +/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
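// Usage sketch (not from the upstream file): at the 128-bit width only the low four bits
// of the __mmask8 select __m128 lanes; the upper bits are ignored.
#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn sketch_mask_mov_ps_vl() {
    let a = _mm_set1_ps(3.0);
    let src = _mm_set1_ps(7.0);
    let merged = _mm_mask_mov_ps(src, 0b0101, a); // lanes 0 and 2 -> 3.0, lanes 1 and 3 keep 7.0
    let zeroed = _mm_maskz_mov_ps(0b0101, a); // lanes 0 and 2 -> 3.0, lanes 1 and 3 -> 0.0
    let _ = (merged, zeroed);
}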
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_pd&expand=3820) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub fn _mm512_maskz_mov_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + let mov = a.as_f64x8(); + transmute(simd_select_bitmask(k, mov, f64x8::ZERO)) + } +} + +/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_pd&expand=3817) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub fn _mm256_mask_mov_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { + let mov = a.as_f64x4(); + transmute(simd_select_bitmask(k, mov, src.as_f64x4())) + } +} + +/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_pd&expand=3818) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub fn _mm256_maskz_mov_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { + let mov = a.as_f64x4(); + transmute(simd_select_bitmask(k, mov, f64x4::ZERO)) + } +} + +/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_pd&expand=3815) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub fn _mm_mask_mov_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { + let mov = a.as_f64x2(); + transmute(simd_select_bitmask(k, mov, src.as_f64x2())) + } +} + +/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_pd&expand=3816) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub fn _mm_maskz_mov_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { + let mov = a.as_f64x2(); + transmute(simd_select_bitmask(k, mov, f64x2::ZERO)) + } +} + +/// Add packed 32-bit integers in a and b, and store the results in dst. 
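// Usage sketch (not from the upstream file) of the masked 512-bit integer adds above:
// the add wraps in two's complement, and masked-off lanes never see the addition.
#[target_feature(enable = "avx512f")]
unsafe fn sketch_mask_add_epi32() {
    let a = _mm512_set1_epi32(i32::MAX);
    let b = _mm512_set1_epi32(1);
    let wrapped = _mm512_add_epi32(a, b); // every lane wraps to i32::MIN
    let partial = _mm512_mask_add_epi32(a, 0x0001, a, b); // lane 0 wraps, lanes 1..16 keep i32::MAX
    let _ = (wrapped, partial);
}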
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_epi32&expand=100) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddd))] +pub fn _mm512_add_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_add(a.as_i32x16(), b.as_i32x16())) } +} + +/// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_epi32&expand=101) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddd))] +pub fn _mm512_mask_add_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, add, src.as_i32x16())) + } +} + +/// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_epi32&expand=102) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddd))] +pub fn _mm512_maskz_add_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, add, i32x16::ZERO)) + } +} + +/// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_epi32&expand=98) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddd))] +pub fn _mm256_mask_add_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, add, src.as_i32x8())) + } +} + +/// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_epi32&expand=99) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddd))] +pub fn _mm256_maskz_add_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, add, i32x8::ZERO)) + } +} + +/// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
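// Usage sketch (not from the upstream file): the 256-bit forms above behave like the
// 512-bit ones, with one __mmask8 bit per lane of the __m256i.
#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn sketch_mask_add_epi32_vl() {
    let a = _mm256_set1_epi32(20);
    let b = _mm256_set1_epi32(22);
    let src = _mm256_set1_epi32(-1);
    let merged = _mm256_mask_add_epi32(src, 0b0101_0101, a, b); // even lanes -> 42, odd lanes keep -1
    let _ = merged;
}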
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_epi32&expand=95) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddd))] +pub fn _mm_mask_add_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, add, src.as_i32x4())) + } +} + +/// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_epi32&expand=96) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddd))] +pub fn _mm_maskz_add_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, add, i32x4::ZERO)) + } +} + +/// Add packed 64-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_epi64&expand=109) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddq))] +pub fn _mm512_add_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_add(a.as_i64x8(), b.as_i64x8())) } +} + +/// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_epi64&expand=110) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddq))] +pub fn _mm512_mask_add_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, add, src.as_i64x8())) + } +} + +/// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_epi64&expand=111) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddq))] +pub fn _mm512_maskz_add_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, add, i64x8::ZERO)) + } +} + +/// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
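// Usage sketch (not from the upstream file): a zeromask on the 64-bit add above keeps
// only a prefix of the eight lane sums and forces the rest to zero.
#[target_feature(enable = "avx512f")]
unsafe fn sketch_maskz_add_epi64() {
    let a = _mm512_set1_epi64(10);
    let b = _mm512_set1_epi64(32);
    let prefix = _mm512_maskz_add_epi64(0b0000_0111, a, b); // lanes 0..3 -> 42, lanes 3..8 -> 0
    let _ = prefix;
}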
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_epi64&expand=107) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddq))] +pub fn _mm256_mask_add_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, add, src.as_i64x4())) + } +} + +/// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_epi64&expand=108) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddq))] +pub fn _mm256_maskz_add_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, add, i64x4::ZERO)) + } +} + +/// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_epi64&expand=104) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddq))] +pub fn _mm_mask_add_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, add, src.as_i64x2())) + } +} + +/// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_epi64&expand=105) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddq))] +pub fn _mm_maskz_add_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, add, i64x2::ZERO)) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_ps&expand=139) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps))] +pub fn _mm512_add_ps(a: __m512, b: __m512) -> __m512 { + unsafe { transmute(simd_add(a.as_f32x16(), b.as_f32x16())) } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
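// Usage sketch (not from the upstream file): _mm512_add_ps is a plain IEEE-754 packed add
// over 16 f32 lanes; the masked forms that follow merge or zero lanes exactly like the
// integer versions.
#[target_feature(enable = "avx512f")]
unsafe fn sketch_add_ps() {
    let a = _mm512_set1_ps(1.25);
    let b = _mm512_set1_ps(2.5);
    let sum = _mm512_add_ps(a, b); // every lane -> 3.75
    let _ = sum;
}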
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_ps&expand=140) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps))] +pub fn _mm512_mask_add_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let add = _mm512_add_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, add, src.as_f32x16())) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_ps&expand=141) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps))] +pub fn _mm512_maskz_add_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let add = _mm512_add_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, add, f32x16::ZERO)) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_ps&expand=137) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps))] +pub fn _mm256_mask_add_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let add = _mm256_add_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, add, src.as_f32x8())) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_ps&expand=138) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps))] +pub fn _mm256_maskz_add_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let add = _mm256_add_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, add, f32x8::ZERO)) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_ps&expand=134) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps))] +pub fn _mm_mask_add_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let add = _mm_add_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, add, src.as_f32x4())) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
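// Usage sketch (not from the upstream file): the same merge pattern at 128 bits; only the
// low four bits of the __mmask8 are consulted for __m128.
#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn sketch_mask_add_ps_vl() {
    let a = _mm_set1_ps(1.0);
    let b = _mm_set1_ps(2.0);
    let merged = _mm_mask_add_ps(a, 0b0011, a, b); // lanes 0..2 -> 3.0, lanes 2..4 keep 1.0
    let _ = merged;
}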
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_ps&expand=135) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps))] +pub fn _mm_maskz_add_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let add = _mm_add_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, add, f32x4::ZERO)) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_pd&expand=127) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd))] +pub fn _mm512_add_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_add(a.as_f64x8(), b.as_f64x8())) } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_pd&expand=128) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd))] +pub fn _mm512_mask_add_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let add = _mm512_add_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, add, src.as_f64x8())) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_pd&expand=129) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd))] +pub fn _mm512_maskz_add_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let add = _mm512_add_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, add, f64x8::ZERO)) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_pd&expand=125) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd))] +pub fn _mm256_mask_add_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let add = _mm256_add_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, add, src.as_f64x4())) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
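// Usage sketch (not from the upstream file) for the double-precision adds above: __m512d
// has eight f64 lanes, so the writemask is an __mmask8.
#[target_feature(enable = "avx512f")]
unsafe fn sketch_mask_add_pd() {
    let a = _mm512_set1_pd(0.5);
    let b = _mm512_set1_pd(0.25);
    let src = _mm512_set1_pd(-1.0);
    let merged = _mm512_mask_add_pd(src, 0b1111_0000, a, b); // lanes 4..8 -> 0.75, lanes 0..4 keep -1.0
    let _ = merged;
}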
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_pd&expand=126) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd))] +pub fn _mm256_maskz_add_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let add = _mm256_add_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, add, f64x4::ZERO)) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_pd&expand=122) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd))] +pub fn _mm_mask_add_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let add = _mm_add_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, add, src.as_f64x2())) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_pd&expand=123) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd))] +pub fn _mm_maskz_add_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let add = _mm_add_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, add, f64x2::ZERO)) + } +} + +/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_epi32&expand=5694) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubd))] +pub fn _mm512_sub_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_sub(a.as_i32x16(), b.as_i32x16())) } +} + +/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_epi32&expand=5692) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubd))] +pub fn _mm512_mask_sub_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, sub, src.as_i32x16())) + } +} + +/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
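// Usage sketch (not from the upstream file) for the 32-bit integer subtraction above; note
// the operand order, dst = a - b, which the masked form keeps for the selected lanes.
#[target_feature(enable = "avx512f")]
unsafe fn sketch_mask_sub_epi32() {
    let a = _mm512_set1_epi32(10);
    let b = _mm512_set1_epi32(3);
    let diff = _mm512_sub_epi32(a, b); // every lane -> 7
    let merged = _mm512_mask_sub_epi32(a, 0x00FF, a, b); // lanes 0..8 -> 7, lanes 8..16 keep 10
    let _ = (diff, merged);
}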
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_epi32&expand=5693) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubd))] +pub fn _mm512_maskz_sub_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, sub, i32x16::ZERO)) + } +} + +/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_epi32&expand=5689) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubd))] +pub fn _mm256_mask_sub_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, sub, src.as_i32x8())) + } +} + +/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_epi32&expand=5690) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubd))] +pub fn _mm256_maskz_sub_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, sub, i32x8::ZERO)) + } +} + +/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_epi32&expand=5686) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubd))] +pub fn _mm_mask_sub_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, sub, src.as_i32x4())) + } +} + +/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_epi32&expand=5687) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubd))] +pub fn _mm_maskz_sub_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, sub, i32x4::ZERO)) + } +} + +/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst. 
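// Usage sketch (not from the upstream file): zeromasked subtraction at 128 bits; lanes
// whose mask bit is clear come out as zero rather than being copied from anywhere.
#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn sketch_maskz_sub_epi32_vl() {
    let a = _mm_set1_epi32(9);
    let b = _mm_set1_epi32(4);
    let r = _mm_maskz_sub_epi32(0b1000, a, b); // lane 3 -> 5, lanes 0..3 -> 0
    let _ = r;
}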
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_epi64&expand=5703) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubq))] +pub fn _mm512_sub_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_sub(a.as_i64x8(), b.as_i64x8())) } +} + +/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_epi64&expand=5701) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubq))] +pub fn _mm512_mask_sub_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, sub, src.as_i64x8())) + } +} + +/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_epi64&expand=5702) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubq))] +pub fn _mm512_maskz_sub_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, sub, i64x8::ZERO)) + } +} + +/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_epi64&expand=5698) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubq))] +pub fn _mm256_mask_sub_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, sub, src.as_i64x4())) + } +} + +/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_epi64&expand=5699) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubq))] +pub fn _mm256_maskz_sub_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, sub, i64x4::ZERO)) + } +} + +/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
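+///
+/// A minimal usage sketch (illustrative values only; AVX-512F and AVX-512VL
+/// support are detected at runtime, and other long-stable `std::arch` helpers
+/// are used for setup and checking):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///         // SAFETY: the required CPU features were verified at runtime above.
+///         unsafe {
+///             let src = _mm_set1_epi64x(-1);
+///             let a = _mm_set1_epi64x(100);
+///             let b = _mm_set1_epi64x(1);
+///             // Mask bit 0 is set: lane 0 receives 100 - 1, lane 1 is copied from `src`.
+///             let r = _mm_mask_sub_epi64(src, 0b01, a, b);
+///             assert_eq!(_mm_cmpeq_epi64_mask(r, _mm_set_epi64x(-1, 99)), 0b11);
+///         }
+///     }
+/// }
+/// ```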
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_epi64&expand=5695) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubq))] +pub fn _mm_mask_sub_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, sub, src.as_i64x2())) + } +} + +/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_epi64&expand=5696) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubq))] +pub fn _mm_maskz_sub_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, sub, i64x2::ZERO)) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_ps&expand=5733) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps))] +pub fn _mm512_sub_ps(a: __m512, b: __m512) -> __m512 { + unsafe { transmute(simd_sub(a.as_f32x16(), b.as_f32x16())) } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_ps&expand=5731) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps))] +pub fn _mm512_mask_sub_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let sub = _mm512_sub_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, sub, src.as_f32x16())) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_ps&expand=5732) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps))] +pub fn _mm512_maskz_sub_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let sub = _mm512_sub_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, sub, f32x16::ZERO)) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_ps&expand=5728) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps))] +pub fn _mm256_mask_sub_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let sub = _mm256_sub_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, sub, src.as_f32x8())) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_ps&expand=5729) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps))] +pub fn _mm256_maskz_sub_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let sub = _mm256_sub_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, sub, f32x8::ZERO)) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_ps&expand=5725) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps))] +pub fn _mm_mask_sub_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let sub = _mm_sub_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, sub, src.as_f32x4())) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_ps&expand=5726) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps))] +pub fn _mm_maskz_sub_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let sub = _mm_sub_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, sub, f32x4::ZERO)) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_pd&expand=5721) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd))] +pub fn _mm512_sub_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_sub(a.as_f64x8(), b.as_f64x8())) } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_pd&expand=5719) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd))] +pub fn _mm512_mask_sub_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let sub = _mm512_sub_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, sub, src.as_f64x8())) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_pd&expand=5720) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd))] +pub fn _mm512_maskz_sub_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let sub = _mm512_sub_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, sub, f64x8::ZERO)) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_pd&expand=5716) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd))] +pub fn _mm256_mask_sub_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let sub = _mm256_sub_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, sub, src.as_f64x4())) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_pd&expand=5717) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd))] +pub fn _mm256_maskz_sub_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let sub = _mm256_sub_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, sub, f64x4::ZERO)) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_pd&expand=5713) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd))] +pub fn _mm_mask_sub_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let sub = _mm_sub_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, sub, src.as_f64x2())) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_pd&expand=5714) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd))] +pub fn _mm_maskz_sub_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let sub = _mm_sub_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, sub, f64x2::ZERO)) + } +} + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst. 
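+///
+/// A minimal usage sketch (illustrative values only; AVX-512F support is
+/// detected at runtime, and other long-stable `std::arch` helpers are used
+/// for setup and checking):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         // SAFETY: AVX-512F support was verified at runtime above.
+///         unsafe {
+///             // Only the low 32 bits of each 64-bit lane take part: those low
+///             // halves are -2, so every 64-bit product is (-2) * (-2) = 4.
+///             let a = _mm512_set1_epi32(-2);
+///             let r = _mm512_mul_epi32(a, a);
+///             assert_eq!(_mm512_reduce_add_epi64(r), 8 * 4);
+///         }
+///     }
+/// }
+/// ```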
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_epi32&expand=3907) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuldq))] +pub fn _mm512_mul_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = simd_cast::<_, i64x8>(simd_cast::<_, i32x8>(a.as_i64x8())); + let b = simd_cast::<_, i64x8>(simd_cast::<_, i32x8>(b.as_i64x8())); + transmute(simd_mul(a, b)) + } +} + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_epi32&expand=3905) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuldq))] +pub fn _mm512_mask_mul_epi32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mul_epi32(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, mul, src.as_i64x8())) + } +} + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_epi32&expand=3906) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuldq))] +pub fn _mm512_maskz_mul_epi32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mul_epi32(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, mul, i64x8::ZERO)) + } +} + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mul_epi32&expand=3902) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuldq))] +pub fn _mm256_mask_mul_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mul_epi32(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, mul, src.as_i64x4())) + } +} + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mul_epi32&expand=3903) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuldq))] +pub fn _mm256_maskz_mul_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mul_epi32(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, mul, i64x4::ZERO)) + } +} + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mul_epi32&expand=3899) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuldq))] +pub fn _mm_mask_mul_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mul_epi32(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, mul, src.as_i64x2())) + } +} + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mul_epi32&expand=3900) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuldq))] +pub fn _mm_maskz_mul_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mul_epi32(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, mul, i64x2::ZERO)) + } +} + +/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mullo_epi32&expand=4005) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulld))] +pub fn _mm512_mullo_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_mul(a.as_i32x16(), b.as_i32x16())) } +} + +/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
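+///
+/// A minimal usage sketch (illustrative values only; AVX-512F support is
+/// detected at runtime, and other long-stable `std::arch` helpers are used
+/// for setup and checking):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         // SAFETY: AVX-512F support was verified at runtime above.
+///         unsafe {
+///             let src = _mm512_set1_epi32(7);
+///             let a = _mm512_set1_epi32(100_000);
+///             // 100_000 * 100_000 does not fit in 32 bits; only the low 32 bits
+///             // of each 64-bit product are kept in the selected (high eight) lanes.
+///             let r = _mm512_mask_mullo_epi32(src, 0b1111_1111_0000_0000, a, a);
+///             let lo = (100_000i64 * 100_000) as i32;
+///             assert_eq!(_mm512_cmpeq_epi32_mask(r, src), 0x00FF);
+///             assert_eq!(_mm512_cmpeq_epi32_mask(r, _mm512_set1_epi32(lo)), 0xFF00);
+///         }
+///     }
+/// }
+/// ```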
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mullo_epi32&expand=4003) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulld))] +pub fn _mm512_mask_mullo_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mullo_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, mul, src.as_i32x16())) + } +} + +/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mullo_epi32&expand=4004) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulld))] +pub fn _mm512_maskz_mullo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mullo_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, mul, i32x16::ZERO)) + } +} + +/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mullo_epi32&expand=4000) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulld))] +pub fn _mm256_mask_mullo_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mullo_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, mul, src.as_i32x8())) + } +} + +/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mullo_epi32&expand=4001) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulld))] +pub fn _mm256_maskz_mullo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mullo_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, mul, i32x8::ZERO)) + } +} + +/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mullo_epi32&expand=3997) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulld))] +pub fn _mm_mask_mullo_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mullo_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, mul, src.as_i32x4())) + } +} + +/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mullo_epi32&expand=3998) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulld))] +pub fn _mm_maskz_mullo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mullo_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, mul, i32x4::ZERO)) + } +} + +/// Multiplies elements in packed 64-bit integer vectors a and b together, storing the lower 64 bits of the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mullox_epi64&expand=4017) +/// +/// This intrinsic generates a sequence of instructions, which may perform worse than a native instruction. Consider the performance impact of this intrinsic. +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mullox_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_mul(a.as_i64x8(), b.as_i64x8())) } +} + +/// Multiplies elements in packed 64-bit integer vectors a and b together, storing the lower 64 bits of the result in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mullox_epi64&expand=4016) +/// +/// This intrinsic generates a sequence of instructions, which may perform worse than a native instruction. Consider the performance impact of this intrinsic. +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_mullox_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mullox_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, mul, src.as_i64x8())) + } +} + +/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst. 
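+///
+/// A minimal usage sketch (illustrative values only; AVX-512F support is
+/// detected at runtime, and other long-stable `std::arch` helpers are used
+/// for setup and checking):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         // SAFETY: AVX-512F support was verified at runtime above.
+///         unsafe {
+///             let a = _mm512_set1_epi32(-1); // low 32 bits of every 64-bit lane: 0xFFFF_FFFF
+///             let b = _mm512_set1_epi64(1);
+///             // The low halves are zero-extended, so each product is 0xFFFF_FFFF * 1.
+///             let r = _mm512_mul_epu32(a, b);
+///             assert_eq!(_mm512_reduce_add_epi64(r), 8 * 0xFFFF_FFFFi64);
+///             // The signed variant sign-extends the same low halves instead.
+///             assert_eq!(_mm512_reduce_add_epi64(_mm512_mul_epi32(a, b)), -8);
+///         }
+///     }
+/// }
+/// ```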
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_epu32&expand=3916) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuludq))] +pub fn _mm512_mul_epu32(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_u64x8(); + let b = b.as_u64x8(); + let mask = u64x8::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) + } +} + +/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_epu32&expand=3914) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuludq))] +pub fn _mm512_mask_mul_epu32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mul_epu32(a, b).as_u64x8(); + transmute(simd_select_bitmask(k, mul, src.as_u64x8())) + } +} + +/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_epu32&expand=3915) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuludq))] +pub fn _mm512_maskz_mul_epu32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mul_epu32(a, b).as_u64x8(); + transmute(simd_select_bitmask(k, mul, u64x8::ZERO)) + } +} + +/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mul_epu32&expand=3911) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuludq))] +pub fn _mm256_mask_mul_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mul_epu32(a, b).as_u64x4(); + transmute(simd_select_bitmask(k, mul, src.as_u64x4())) + } +} + +/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mul_epu32&expand=3912) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuludq))] +pub fn _mm256_maskz_mul_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mul_epu32(a, b).as_u64x4(); + transmute(simd_select_bitmask(k, mul, u64x4::ZERO)) + } +} + +/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mul_epu32&expand=3908) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuludq))] +pub fn _mm_mask_mul_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mul_epu32(a, b).as_u64x2(); + transmute(simd_select_bitmask(k, mul, src.as_u64x2())) + } +} + +/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mul_epu32&expand=3909) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuludq))] +pub fn _mm_maskz_mul_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mul_epu32(a, b).as_u64x2(); + transmute(simd_select_bitmask(k, mul, u64x2::ZERO)) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_ps&expand=3934) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps))] +pub fn _mm512_mul_ps(a: __m512, b: __m512) -> __m512 { + unsafe { transmute(simd_mul(a.as_f32x16(), b.as_f32x16())) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_ps&expand=3932) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps))] +pub fn _mm512_mask_mul_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let mul = _mm512_mul_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, mul, src.as_f32x16())) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
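+///
+/// A minimal usage sketch (illustrative values only; AVX-512F support is
+/// detected at runtime, and other long-stable `std::arch` helpers are used
+/// for setup and checking):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         // SAFETY: AVX-512F support was verified at runtime above.
+///         unsafe {
+///             let a = _mm512_set1_ps(1.5);
+///             let b = _mm512_set1_ps(4.0);
+///             // Every other lane is selected; the unselected lanes are zeroed.
+///             let r = _mm512_maskz_mul_ps(0b0101_0101_0101_0101, a, b);
+///             assert_eq!(_mm512_reduce_add_ps(r), 8.0 * 6.0);
+///         }
+///     }
+/// }
+/// ```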
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_ps&expand=3933) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps))] +pub fn _mm512_maskz_mul_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let mul = _mm512_mul_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, mul, f32x16::ZERO)) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mul_ps&expand=3929) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps))] +pub fn _mm256_mask_mul_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let mul = _mm256_mul_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, mul, src.as_f32x8())) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mul_ps&expand=3930) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps))] +pub fn _mm256_maskz_mul_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let mul = _mm256_mul_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, mul, f32x8::ZERO)) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mul_ps&expand=3926) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps))] +pub fn _mm_mask_mul_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mul = _mm_mul_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, mul, src.as_f32x4())) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mul_ps&expand=3927) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps))] +pub fn _mm_maskz_mul_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mul = _mm_mul_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, mul, f32x4::ZERO)) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_pd&expand=3925) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd))] +pub fn _mm512_mul_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_mul(a.as_f64x8(), b.as_f64x8())) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_pd&expand=3923) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd))] +pub fn _mm512_mask_mul_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let mul = _mm512_mul_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, mul, src.as_f64x8())) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_pd&expand=3924) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd))] +pub fn _mm512_maskz_mul_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let mul = _mm512_mul_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, mul, f64x8::ZERO)) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mul_pd&expand=3920) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd))] +pub fn _mm256_mask_mul_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let mul = _mm256_mul_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, mul, src.as_f64x4())) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mul_pd&expand=3921) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd))] +pub fn _mm256_maskz_mul_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let mul = _mm256_mul_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, mul, f64x4::ZERO)) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mul_pd&expand=3917) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd))] +pub fn _mm_mask_mul_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mul = _mm_mul_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, mul, src.as_f64x2())) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mul_pd&expand=3918) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd))] +pub fn _mm_maskz_mul_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mul = _mm_mul_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, mul, f64x2::ZERO)) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_div_ps&expand=2162) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps))] +pub fn _mm512_div_ps(a: __m512, b: __m512) -> __m512 { + unsafe { transmute(simd_div(a.as_f32x16(), b.as_f32x16())) } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_div_ps&expand=2163) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps))] +pub fn _mm512_mask_div_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let div = _mm512_div_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, div, src.as_f32x16())) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_div_ps&expand=2164) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps))] +pub fn _mm512_maskz_div_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let div = _mm512_div_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, div, f32x16::ZERO)) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_div_ps&expand=2160) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps))] +pub fn _mm256_mask_div_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let div = _mm256_div_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, div, src.as_f32x8())) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_div_ps&expand=2161) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps))] +pub fn _mm256_maskz_div_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let div = _mm256_div_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, div, f32x8::ZERO)) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_div_ps&expand=2157) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps))] +pub fn _mm_mask_div_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let div = _mm_div_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, div, src.as_f32x4())) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_div_ps&expand=2158) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps))] +pub fn _mm_maskz_div_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let div = _mm_div_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, div, f32x4::ZERO)) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_div_pd&expand=2153) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivpd))] +pub fn _mm512_div_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_div(a.as_f64x8(), b.as_f64x8())) } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
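+///
+/// A minimal usage sketch (illustrative values only; AVX-512F support is
+/// detected at runtime, and other long-stable `std::arch` helpers are used
+/// for setup and checking):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         // SAFETY: AVX-512F support was verified at runtime above.
+///         unsafe {
+///             let src = _mm512_set1_pd(10.0);
+///             let a = _mm512_set1_pd(1.0);
+///             let b = _mm512_set1_pd(4.0);
+///             // Lanes 0..=3 receive 1.0 / 4.0; lanes 4..=7 are copied from `src`.
+///             let r = _mm512_mask_div_pd(src, 0b0000_1111, a, b);
+///             assert_eq!(_mm512_reduce_add_pd(r), 4.0 * 0.25 + 4.0 * 10.0);
+///         }
+///     }
+/// }
+/// ```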
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_div_pd&expand=2154) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivpd))] +pub fn _mm512_mask_div_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let div = _mm512_div_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, div, src.as_f64x8())) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_div_pd&expand=2155) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivpd))] +pub fn _mm512_maskz_div_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let div = _mm512_div_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, div, f64x8::ZERO)) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_div_pd&expand=2151) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivpd))] +pub fn _mm256_mask_div_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let div = _mm256_div_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, div, src.as_f64x4())) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_div_pd&expand=2152) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivpd))] +pub fn _mm256_maskz_div_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let div = _mm256_div_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, div, f64x4::ZERO)) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_div_pd&expand=2148)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vdivpd))]
+pub fn _mm_mask_div_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        let div = _mm_div_pd(a, b).as_f64x2();
+        transmute(simd_select_bitmask(k, div, src.as_f64x2()))
+    }
+}
+
+/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_div_pd&expand=2149)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vdivpd))]
+pub fn _mm_maskz_div_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        let div = _mm_div_pd(a, b).as_f64x2();
+        transmute(simd_select_bitmask(k, div, f64x2::ZERO))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epi32&expand=3582)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub fn _mm512_max_epi32(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        transmute(simd_select::<i32x16, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epi32&expand=3580)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub fn _mm512_mask_max_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epi32(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, max, src.as_i32x16()))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epi32&expand=3581)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub fn _mm512_maskz_max_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epi32(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, max, i32x16::ZERO))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
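+///
+/// A minimal usage sketch (illustrative values only; AVX-512F and AVX-512VL
+/// support are detected at runtime, and other long-stable `std::arch` helpers
+/// are used for setup and checking):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///         // SAFETY: the required CPU features were verified at runtime above.
+///         unsafe {
+///             let src = _mm256_set1_epi32(0);
+///             let a = _mm256_set1_epi32(-5);
+///             let b = _mm256_set1_epi32(3);
+///             // Lanes 0..=3 receive max(-5, 3) = 3; lanes 4..=7 are copied from `src`.
+///             let r = _mm256_mask_max_epi32(src, 0b0000_1111, a, b);
+///             assert_eq!(_mm256_cmpeq_epi32_mask(r, _mm256_set1_epi32(3)), 0b0000_1111);
+///         }
+///     }
+/// }
+/// ```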
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epi32&expand=3577)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub fn _mm256_mask_max_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epi32(a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, max, src.as_i32x8()))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epi32&expand=3578)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub fn _mm256_maskz_max_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epi32(a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, max, i32x8::ZERO))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epi32&expand=3574)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub fn _mm_mask_max_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epi32(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, max, src.as_i32x4()))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epi32&expand=3575)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub fn _mm_maskz_max_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epi32(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, max, i32x4::ZERO))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epi64&expand=3591)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm512_max_epi64(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i64x8();
+        let b = b.as_i64x8();
+        transmute(simd_select::<i64x8, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epi64&expand=3589)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm512_mask_max_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epi64(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, max, src.as_i64x8()))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epi64&expand=3590)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm512_maskz_max_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epi64(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, max, i64x8::ZERO))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi64&expand=3588)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm256_max_epi64(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i64x4();
+        let b = b.as_i64x4();
+        transmute(simd_select::<i64x4, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epi64&expand=3586)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm256_mask_max_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epi64(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, max, src.as_i64x4()))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epi64&expand=3587)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm256_maskz_max_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epi64(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, max, i64x4::ZERO))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
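+///
+/// A minimal usage sketch (illustrative values only; AVX-512F and AVX-512VL
+/// support are detected at runtime, and other long-stable `std::arch` helpers
+/// are used for setup and checking):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///         // SAFETY: the required CPU features were verified at runtime above.
+///         unsafe {
+///             let a = _mm_set_epi64x(-1, 7);
+///             let b = _mm_set_epi64x(2, -9);
+///             // The comparison is signed: max(-1, 2) = 2, max(7, -9) = 7.
+///             let r = _mm_max_epi64(a, b);
+///             assert_eq!(_mm_cmpeq_epi64_mask(r, _mm_set_epi64x(2, 7)), 0b11);
+///         }
+///     }
+/// }
+/// ```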
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi64&expand=3585)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm_max_epi64(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_i64x2();
+        let b = b.as_i64x2();
+        transmute(simd_select::<i64x2, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epi64&expand=3583)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm_mask_max_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epi64(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, max, src.as_i64x2()))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epi64&expand=3584)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm_maskz_max_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epi64(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, max, i64x2::ZERO))
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_ps&expand=3655)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vmaxps))]
+pub fn _mm512_max_ps(a: __m512, b: __m512) -> __m512 {
+    unsafe {
+        transmute(vmaxps(
+            a.as_f32x16(),
+            b.as_f32x16(),
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_ps&expand=3653)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vmaxps))]
+pub fn _mm512_mask_max_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+    unsafe {
+        let max = _mm512_max_ps(a, b).as_f32x16();
+        transmute(simd_select_bitmask(k, max, src.as_f32x16()))
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
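Editorial aside, not part of the patch: every masked intrinsic above follows the same shape, compute the full-width result, then blend it lane by lane with `src` (writemask) or with zero (zeromask) under the bitmask `k` via `simd_select_bitmask`. A minimal scalar sketch of that blend for an 8-lane 32-bit vector, with an illustrative function name, is:

    // Lane i of the result comes from `full` when bit i of `k` is set,
    // otherwise from `fallback` (`src` for the mask_ forms, zero for maskz_).
    fn select_bitmask_i32x8(k: u8, full: [i32; 8], fallback: [i32; 8]) -> [i32; 8] {
        let mut out = [0i32; 8];
        for i in 0..8 {
            out[i] = if (k >> i) & 1 == 1 { full[i] } else { fallback[i] };
        }
        out
    }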
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_ps&expand=3654) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps))] +pub fn _mm512_maskz_max_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let max = _mm512_max_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, max, f32x16::ZERO)) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_ps&expand=3650) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps))] +pub fn _mm256_mask_max_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let max = _mm256_max_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, max, src.as_f32x8())) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_ps&expand=3651) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps))] +pub fn _mm256_maskz_max_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let max = _mm256_max_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, max, f32x8::ZERO)) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_ps&expand=3647) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps))] +pub fn _mm_mask_max_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let max = _mm_max_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, max, src.as_f32x4())) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_ps&expand=3648) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps))] +pub fn _mm_maskz_max_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let max = _mm_max_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, max, f32x4::ZERO)) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_pd&expand=3645) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd))] +pub fn _mm512_max_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(vmaxpd(a.as_f64x8(), b.as_f64x8(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_pd&expand=3643) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd))] +pub fn _mm512_mask_max_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let max = _mm512_max_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, max, src.as_f64x8())) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_pd&expand=3644) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd))] +pub fn _mm512_maskz_max_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let max = _mm512_max_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, max, f64x8::ZERO)) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_pd&expand=3640) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd))] +pub fn _mm256_mask_max_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let max = _mm256_max_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, max, src.as_f64x4())) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_pd&expand=3641) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd))] +pub fn _mm256_maskz_max_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let max = _mm256_max_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, max, f64x4::ZERO)) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
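Usage sketch, illustrative only and not part of the patch: with the masked floating-point maximum, lanes whose mask bit is clear keep the value from `src` in the writemask form and become 0.0 in the zeromask form. The function name and mask value below are hypothetical.

    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx512f")]
    unsafe fn masked_max_ps_demo(src: __m512, a: __m512, b: __m512) -> (__m512, __m512) {
        let k: __mmask16 = 0x00ff; // keep lanes 0..8, mask off lanes 8..16
        let merged = _mm512_mask_max_ps(src, k, a, b); // masked-off lanes copied from `src`
        let zeroed = _mm512_maskz_max_ps(k, a, b); // masked-off lanes become 0.0
        (merged, zeroed)
    }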
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_pd&expand=3637)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vmaxpd))]
+pub fn _mm_mask_max_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        let max = _mm_max_pd(a, b).as_f64x2();
+        transmute(simd_select_bitmask(k, max, src.as_f64x2()))
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_pd&expand=3638)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vmaxpd))]
+pub fn _mm_maskz_max_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        let max = _mm_max_pd(a, b).as_f64x2();
+        transmute(simd_select_bitmask(k, max, f64x2::ZERO))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epu32&expand=3618)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub fn _mm512_max_epu32(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u32x16();
+        let b = b.as_u32x16();
+        transmute(simd_select::<i32x16, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epu32&expand=3616)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub fn _mm512_mask_max_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu32(a, b).as_u32x16();
+        transmute(simd_select_bitmask(k, max, src.as_u32x16()))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epu32&expand=3617)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub fn _mm512_maskz_max_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu32(a, b).as_u32x16();
+        transmute(simd_select_bitmask(k, max, u32x16::ZERO))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epu32&expand=3613)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub fn _mm256_mask_max_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu32(a, b).as_u32x8();
+        transmute(simd_select_bitmask(k, max, src.as_u32x8()))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epu32&expand=3614)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub fn _mm256_maskz_max_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu32(a, b).as_u32x8();
+        transmute(simd_select_bitmask(k, max, u32x8::ZERO))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epu32&expand=3610)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub fn _mm_mask_max_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu32(a, b).as_u32x4();
+        transmute(simd_select_bitmask(k, max, src.as_u32x4()))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epu32&expand=3611)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub fn _mm_maskz_max_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu32(a, b).as_u32x4();
+        transmute(simd_select_bitmask(k, max, u32x4::ZERO))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epu64&expand=3627)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm512_max_epu64(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u64x8();
+        let b = b.as_u64x8();
+        transmute(simd_select::<i64x8, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epu64&expand=3625)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm512_mask_max_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu64(a, b).as_u64x8();
+        transmute(simd_select_bitmask(k, max, src.as_u64x8()))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epu64&expand=3626)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm512_maskz_max_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu64(a, b).as_u64x8();
+        transmute(simd_select_bitmask(k, max, u64x8::ZERO))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu64&expand=3624)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm256_max_epu64(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u64x4();
+        let b = b.as_u64x4();
+        transmute(simd_select::<i64x4, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epu64&expand=3622)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm256_mask_max_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu64(a, b).as_u64x4();
+        transmute(simd_select_bitmask(k, max, src.as_u64x4()))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epu64&expand=3623)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm256_maskz_max_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu64(a, b).as_u64x4();
+        transmute(simd_select_bitmask(k, max, u64x4::ZERO))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu64&expand=3621)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm_max_epu64(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_u64x2();
+        let b = b.as_u64x2();
+        transmute(simd_select::<i64x2, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epu64&expand=3619)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm_mask_max_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu64(a, b).as_u64x2();
+        transmute(simd_select_bitmask(k, max, src.as_u64x2()))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epu64&expand=3620)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm_maskz_max_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu64(a, b).as_u64x2();
+        transmute(simd_select_bitmask(k, max, u64x2::ZERO))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epi32&expand=3696)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsd))]
+pub fn _mm512_min_epi32(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        transmute(simd_select::<i32x16, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epi32&expand=3694)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsd))]
+pub fn _mm512_mask_min_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epi32(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, min, src.as_i32x16()))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
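Editorial aside, not part of the patch: the unmasked 64-bit maxima above have no SSE/AVX2 instruction, so they are expressed with the generic `simd_gt` + `simd_select` intrinsics, which the backend lowers to `vpmaxsq`/`vpmaxuq`. A scalar model of that construction (illustrative name) is:

    // Per lane: simd_gt yields an all-ones/all-zeros mask and simd_select then
    // picks the lane from `a` or `b`; the net effect is an element-wise max.
    fn max_epu64_scalar(a: [u64; 2], b: [u64; 2]) -> [u64; 2] {
        let mut out = [0u64; 2];
        for i in 0..2 {
            out[i] = if a[i] > b[i] { a[i] } else { b[i] };
        }
        out
    }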
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epi32&expand=3695) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsd))] +pub fn _mm512_maskz_min_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, min, i32x16::ZERO)) + } +} + +/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epi32&expand=3691) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsd))] +pub fn _mm256_mask_min_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, min, src.as_i32x8())) + } +} + +/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epi32&expand=3692) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsd))] +pub fn _mm256_maskz_min_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, min, i32x8::ZERO)) + } +} + +/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epi32&expand=3688) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsd))] +pub fn _mm_mask_min_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, min, src.as_i32x4())) + } +} + +/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epi32&expand=3689) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsd))] +pub fn _mm_maskz_min_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, min, i32x4::ZERO)) + } +} + +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst. 
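A common use of the paired min/max forms is clamping each lane into a range. The sketch below is illustrative only and not part of the patch; the function name is hypothetical and it assumes an x86_64 target with AVX-512F.

    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx512f")]
    unsafe fn clamp_epi32(v: __m512i, lo: i32, hi: i32) -> __m512i {
        // max with the lower bound, then min with the upper bound
        let lower = _mm512_max_epi32(v, _mm512_set1_epi32(lo));
        _mm512_min_epi32(lower, _mm512_set1_epi32(hi))
    }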
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epi64&expand=3705)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsq))]
+pub fn _mm512_min_epi64(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i64x8();
+        let b = b.as_i64x8();
+        transmute(simd_select::<i64x8, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epi64&expand=3703)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsq))]
+pub fn _mm512_mask_min_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epi64(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, min, src.as_i64x8()))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epi64&expand=3704)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsq))]
+pub fn _mm512_maskz_min_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epi64(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, min, i64x8::ZERO))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi64&expand=3702)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsq))]
+pub fn _mm256_min_epi64(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i64x4();
+        let b = b.as_i64x4();
+        transmute(simd_select::<i64x4, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epi64&expand=3700)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsq))]
+pub fn _mm256_mask_min_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epi64(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, min, src.as_i64x4()))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epi64&expand=3701)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsq))]
+pub fn _mm256_maskz_min_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epi64(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, min, i64x4::ZERO))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsq))]
+pub fn _mm_min_epi64(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_i64x2();
+        let b = b.as_i64x2();
+        transmute(simd_select::<i64x2, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsq))]
+pub fn _mm_mask_min_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epi64(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, min, src.as_i64x2()))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsq))]
+pub fn _mm_maskz_min_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epi64(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, min, i64x2::ZERO))
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_ps&expand=3769)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vminps))]
+pub fn _mm512_min_ps(a: __m512, b: __m512) -> __m512 {
+    unsafe {
+        transmute(vminps(
+            a.as_f32x16(),
+            b.as_f32x16(),
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_ps&expand=3767) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps))] +pub fn _mm512_mask_min_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let min = _mm512_min_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, min, src.as_f32x16())) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_ps&expand=3768) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps))] +pub fn _mm512_maskz_min_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let min = _mm512_min_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, min, f32x16::ZERO)) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_ps&expand=3764) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps))] +pub fn _mm256_mask_min_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let min = _mm256_min_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, min, src.as_f32x8())) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_ps&expand=3765) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps))] +pub fn _mm256_maskz_min_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let min = _mm256_min_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, min, f32x8::ZERO)) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_ps&expand=3761) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps))] +pub fn _mm_mask_min_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let min = _mm_min_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, min, src.as_f32x4())) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_ps&expand=3762) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps))] +pub fn _mm_maskz_min_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let min = _mm_min_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, min, f32x4::ZERO)) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_pd&expand=3759) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminpd))] +pub fn _mm512_min_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(vminpd(a.as_f64x8(), b.as_f64x8(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_pd&expand=3757) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminpd))] +pub fn _mm512_mask_min_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let min = _mm512_min_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, min, src.as_f64x8())) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_pd&expand=3758) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminpd))] +pub fn _mm512_maskz_min_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let min = _mm512_min_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, min, f64x8::ZERO)) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_pd&expand=3754)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vminpd))]
+pub fn _mm256_mask_min_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+    unsafe {
+        let min = _mm256_min_pd(a, b).as_f64x4();
+        transmute(simd_select_bitmask(k, min, src.as_f64x4()))
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_pd&expand=3755)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vminpd))]
+pub fn _mm256_maskz_min_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+    unsafe {
+        let min = _mm256_min_pd(a, b).as_f64x4();
+        transmute(simd_select_bitmask(k, min, f64x4::ZERO))
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_pd&expand=3751)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vminpd))]
+pub fn _mm_mask_min_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        let min = _mm_min_pd(a, b).as_f64x2();
+        transmute(simd_select_bitmask(k, min, src.as_f64x2()))
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_pd&expand=3752)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vminpd))]
+pub fn _mm_maskz_min_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        let min = _mm_min_pd(a, b).as_f64x2();
+        transmute(simd_select_bitmask(k, min, f64x2::ZERO))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epu32&expand=3732)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminud))]
+pub fn _mm512_min_epu32(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u32x16();
+        let b = b.as_u32x16();
+        transmute(simd_select::<i32x16, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epu32&expand=3730) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminud))] +pub fn _mm512_mask_min_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epu32(a, b).as_u32x16(); + transmute(simd_select_bitmask(k, min, src.as_u32x16())) + } +} + +/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epu32&expand=3731) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminud))] +pub fn _mm512_maskz_min_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epu32(a, b).as_u32x16(); + transmute(simd_select_bitmask(k, min, u32x16::ZERO)) + } +} + +/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epu32&expand=3727) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminud))] +pub fn _mm256_mask_min_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu32(a, b).as_u32x8(); + transmute(simd_select_bitmask(k, min, src.as_u32x8())) + } +} + +/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epu32&expand=3728) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminud))] +pub fn _mm256_maskz_min_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu32(a, b).as_u32x8(); + transmute(simd_select_bitmask(k, min, u32x8::ZERO)) + } +} + +/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epu32&expand=3724) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminud))] +pub fn _mm_mask_min_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu32(a, b).as_u32x4(); + transmute(simd_select_bitmask(k, min, src.as_u32x4())) + } +} + +/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epu32&expand=3725)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminud))]
+pub fn _mm_maskz_min_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epu32(a, b).as_u32x4();
+        transmute(simd_select_bitmask(k, min, u32x4::ZERO))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epu64&expand=3741)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub fn _mm512_min_epu64(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u64x8();
+        let b = b.as_u64x8();
+        transmute(simd_select::<i64x8, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epu64&expand=3739)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub fn _mm512_mask_min_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epu64(a, b).as_u64x8();
+        transmute(simd_select_bitmask(k, min, src.as_u64x8()))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epu64&expand=3740)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub fn _mm512_maskz_min_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epu64(a, b).as_u64x8();
+        transmute(simd_select_bitmask(k, min, u64x8::ZERO))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu64&expand=3738)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub fn _mm256_min_epu64(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u64x4();
+        let b = b.as_u64x4();
+        transmute(simd_select::<i64x4, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epu64&expand=3736)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub fn _mm256_mask_min_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epu64(a, b).as_u64x4();
+        transmute(simd_select_bitmask(k, min, src.as_u64x4()))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epu64&expand=3737)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub fn _mm256_maskz_min_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epu64(a, b).as_u64x4();
+        transmute(simd_select_bitmask(k, min, u64x4::ZERO))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu64&expand=3735)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub fn _mm_min_epu64(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_u64x2();
+        let b = b.as_u64x2();
+        transmute(simd_select::<i64x2, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epu64&expand=3733)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub fn _mm_mask_min_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epu64(a, b).as_u64x2();
+        transmute(simd_select_bitmask(k, min, src.as_u64x2()))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epu64&expand=3734)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub fn _mm_maskz_min_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epu64(a, b).as_u64x2();
+        transmute(simd_select_bitmask(k, min, u64x2::ZERO))
+    }
+}
+
+/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sqrt_ps&expand=5371) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps))] +pub fn _mm512_sqrt_ps(a: __m512) -> __m512 { + unsafe { simd_fsqrt(a) } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sqrt_ps&expand=5369) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps))] +pub fn _mm512_mask_sqrt_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sqrt_ps&expand=5370) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps))] +pub fn _mm512_maskz_sqrt_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm512_setzero_ps()) } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sqrt_ps&expand=5366) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps))] +pub fn _mm256_mask_sqrt_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sqrt_ps&expand=5367) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps))] +pub fn _mm256_maskz_sqrt_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm256_setzero_ps()) } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sqrt_ps&expand=5363) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps))] +pub fn _mm_mask_sqrt_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sqrt_ps&expand=5364) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps))] +pub fn _mm_maskz_sqrt_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm_setzero_ps()) } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sqrt_pd&expand=5362) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd))] +pub fn _mm512_sqrt_pd(a: __m512d) -> __m512d { + unsafe { simd_fsqrt(a) } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sqrt_pd&expand=5360) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd))] +pub fn _mm512_mask_sqrt_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sqrt_pd&expand=5361) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd))] +pub fn _mm512_maskz_sqrt_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm512_setzero_pd()) } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
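Usage sketch, illustrative only and not part of the patch: because the writemask form copies masked-off lanes from `src`, passing the input as both `src` and `a` leaves those lanes untouched. The function name and mask are hypothetical, and an x86_64 target with AVX-512F is assumed.

    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx512f")]
    unsafe fn sqrt_even_lanes(a: __m512) -> __m512 {
        let k: __mmask16 = 0b0101_0101_0101_0101; // bits set for lanes 0, 2, 4, ...
        _mm512_mask_sqrt_ps(a, k, a) // src = a, so the odd lanes pass through unchanged
    }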
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sqrt_pd&expand=5357) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd))] +pub fn _mm256_mask_sqrt_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sqrt_pd&expand=5358) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd))] +pub fn _mm256_maskz_sqrt_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm256_setzero_pd()) } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sqrt_pd&expand=5354) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd))] +pub fn _mm_mask_sqrt_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sqrt_pd&expand=5355) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd))] +pub fn _mm_maskz_sqrt_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm_setzero_pd()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmadd_ps&expand=2557) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm512_fmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_fma(a, b, c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
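// Illustrative usage sketch for the masked square-root intrinsics above (not a
// definitive pattern): `_mask` variants merge unselected lanes from `src`, while
// `_maskz` variants zero them. Assumes an x86_64 target, a toolchain where these
// intrinsics are stable as declared, and that `_mm512_set1_ps` is likewise a safe
// intrinsic; the function name is invented for the example.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
fn masked_sqrt_sketch() {
    use core::arch::x86_64::*;
    let a = _mm512_set1_ps(4.0);
    let src = _mm512_set1_ps(-1.0);
    let k: __mmask16 = 0b0000_0000_1111_0000; // select lanes 4..=7
    let merged = _mm512_mask_sqrt_ps(src, k, a); // lanes 4..=7 = 2.0, others keep -1.0
    let zeroed = _mm512_maskz_sqrt_ps(k, a); // lanes 4..=7 = 2.0, others = 0.0
    let _ = (merged, zeroed);
}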
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmadd_ps&expand=2558) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm512_mask_fmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmadd_ps&expand=2560) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm512_maskz_fmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), _mm512_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmadd_ps&expand=2559) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm512_mask3_fmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmadd_ps&expand=2554) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm256_mask_fmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmadd_ps&expand=2556) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm256_maskz_fmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), _mm256_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmadd_ps&expand=2555) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm256_mask3_fmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmadd_ps&expand=2550) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm_mask_fmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmadd_ps&expand=2552) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm_maskz_fmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), _mm_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmadd_ps&expand=2551) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm_mask3_fmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmadd_pd&expand=2545) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm512_fmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_fma(a, b, c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmadd_pd&expand=2546) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm512_mask_fmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmadd_pd&expand=2548) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm512_maskz_fmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), _mm512_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
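// Reference model (plain Rust, illustrative only) of the per-lane selection that
// `simd_select_bitmask` performs in the masked intrinsics above: bit i of the mask
// picks the computed lane, otherwise the fallback lane (`src` for `_mask`, zero
// for `_maskz`, `c` for `_mask3`).
fn select_bitmask_model(k: u16, computed: [f32; 16], fallback: [f32; 16]) -> [f32; 16] {
    let mut out = [0.0f32; 16];
    for i in 0..16 {
        out[i] = if (k >> i) & 1 == 1 { computed[i] } else { fallback[i] };
    }
    out
}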
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmadd_pd&expand=2547) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm512_mask3_fmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmadd_pd&expand=2542) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm256_mask_fmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmadd_pd&expand=2544) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm256_maskz_fmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), _mm256_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmadd_pd&expand=2543) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm256_mask3_fmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmadd_pd&expand=2538) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm_mask_fmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmadd_pd&expand=2540) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm_maskz_fmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), _mm_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmadd_pd&expand=2539) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm_mask3_fmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsub_ps&expand=2643) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
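// Usage sketch (illustrative, not prescriptive) contrasting the masking flavours
// of the fused multiply-add intrinsics above. Assumes avx512f at runtime and that
// `_mm512_set1_ps` is also a safe intrinsic on this toolchain; the function name
// is invented for the example.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
fn fmadd_flavours_sketch() {
    use core::arch::x86_64::*;
    let (a, b, c) = (_mm512_set1_ps(2.0), _mm512_set1_ps(3.0), _mm512_set1_ps(1.0));
    let k: __mmask16 = 0x00ff; // lanes 0..=7 selected
    let full = _mm512_fmadd_ps(a, b, c); // every lane: 2.0 * 3.0 + 1.0 = 7.0
    let keep_a = _mm512_mask_fmadd_ps(a, k, b, c); // lanes 8..=15 keep a (2.0)
    let zeroed = _mm512_maskz_fmadd_ps(k, a, b, c); // lanes 8..=15 become 0.0
    let keep_c = _mm512_mask3_fmadd_ps(a, b, c, k); // lanes 8..=15 keep c (1.0)
    let _ = (full, keep_a, zeroed, keep_c);
}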
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsub_ps&expand=2644) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsub_ps&expand=2646) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), _mm512_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsub_ps&expand=2645) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm512_mask3_fmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmsub_ps&expand=2640) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm256_mask_fmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmsub_ps&expand=2642) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm256_maskz_fmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), _mm256_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmsub_ps&expand=2641) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm256_mask3_fmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmsub_ps&expand=2636) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm_mask_fmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmsub_ps&expand=2638) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm_maskz_fmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), _mm_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmsub_ps&expand=2637) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm_mask3_fmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsub_pd&expand=2631) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsub_pd&expand=2632) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsub_pd&expand=2634) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), _mm512_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsub_pd&expand=2633) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang fmadd, gcc fmsub +pub fn _mm512_mask3_fmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmsub_pd&expand=2628) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm256_mask_fmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmsub_pd&expand=2630) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm256_maskz_fmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), _mm256_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmsub_pd&expand=2629) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm256_mask3_fmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmsub_pd&expand=2624) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang fmadd, gcc fmsub +pub fn _mm_mask_fmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmsub_pd&expand=2626) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm_maskz_fmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), _mm_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmsub_pd&expand=2625) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm_mask3_fmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmaddsub_ps&expand=2611) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm512_fmaddsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!( + add, + sub, + [16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 15] + ) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
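// The fmsub implementations above rely on the identity a*b - c == fma(a, b, -c):
// only the sign of `c` changes relative to fmadd, and the operation remains a
// single fused multiply-add. A one-lane reference in plain Rust (illustrative):
fn fmsub_lane(a: f32, b: f32, c: f32) -> f32 {
    // `mul_add` is a fused operation when the target provides FMA.
    a.mul_add(b, -c)
}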
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmaddsub_ps&expand=2612) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm512_mask_fmaddsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmaddsub_ps&expand=2614) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm512_maskz_fmaddsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), _mm512_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmaddsub_ps&expand=2613) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm512_mask3_fmaddsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmaddsub_ps&expand=2608) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm256_mask_fmaddsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmaddsub_ps&expand=2610) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm256_maskz_fmaddsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), _mm256_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmaddsub_ps&expand=2609) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm256_mask3_fmaddsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmaddsub_ps&expand=2604) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm_mask_fmaddsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmaddsub_ps&expand=2606) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm_maskz_fmaddsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), _mm_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmaddsub_ps&expand=2605) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm_mask3_fmaddsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmaddsub_pd&expand=2599) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm512_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmaddsub_pd&expand=2600) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm512_mask_fmaddsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmaddsub_pd&expand=2602) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm512_maskz_fmaddsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), _mm512_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmaddsub_pd&expand=2613) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm512_mask3_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmaddsub_pd&expand=2596) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm256_mask_fmaddsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmaddsub_pd&expand=2598) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm256_maskz_fmaddsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), _mm256_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmaddsub_pd&expand=2597) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm256_mask3_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmaddsub_pd&expand=2592) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm_mask_fmaddsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmaddsub_pd&expand=2594) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm_maskz_fmaddsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), _mm_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmaddsub_pd&expand=2593) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm_mask3_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsubadd_ps&expand=2691) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!( + add, + sub, + [0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31] + ) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
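// Reference model (plain Rust, illustrative only) of the lane interleaving that
// the fmaddsub shuffles above produce: even-indexed lanes take a*b - c and
// odd-indexed lanes take a*b + c (fmsubadd, defined just above, is the mirror
// image).
fn fmaddsub_model(a: [f64; 8], b: [f64; 8], c: [f64; 8]) -> [f64; 8] {
    let mut out = [0.0f64; 8];
    for i in 0..8 {
        let c_term = if i % 2 == 0 { -c[i] } else { c[i] };
        out[i] = a[i].mul_add(b[i], c_term);
    }
    out
}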
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsubadd_ps&expand=2692) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsubadd_ps&expand=2694) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), _mm512_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsubadd_ps&expand=2693) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm512_mask3_fmsubadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmsubadd_ps&expand=2688) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm256_mask_fmsubadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmsubadd_ps&expand=2690) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm256_maskz_fmsubadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), _mm256_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmsubadd_ps&expand=2689) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm256_mask3_fmsubadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmsubadd_ps&expand=2684) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm_mask_fmsubadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmsubadd_ps&expand=2686) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm_maskz_fmsubadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), _mm_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmsubadd_ps&expand=2685)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
+pub fn _mm_mask3_fmsubadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
+    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), c) }
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsubadd_pd&expand=2679)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
+    unsafe {
+        let add = simd_fma(a, b, c);
+        let sub = simd_fma(a, b, simd_neg(c));
+        simd_shuffle!(add, sub, [0, 9, 2, 11, 4, 13, 6, 15])
+    }
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsubadd_pd&expand=2680)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
+    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), a) }
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsubadd_pd&expand=2682)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
+    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), _mm512_setzero_pd()) }
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
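+///
+/// A minimal usage sketch showing how unselected lanes keep the value of `c` (illustrative
+/// only; assumes `std` on x86_64 and runtime detection of `avx512f`):
+///
+/// ```ignore
+/// use std::arch::is_x86_feature_detected;
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") {
+///     let mut out = [0.0f64; 8];
+///     unsafe {
+///         // Sketch: values are chosen so every lane result is exactly representable.
+///         let (a, b, c) = (_mm512_set1_pd(2.0), _mm512_set1_pd(3.0), _mm512_set1_pd(1.0));
+///         // Lanes 0..4: even lanes give 2.0 * 3.0 + 1.0 = 7.0, odd lanes 2.0 * 3.0 - 1.0 = 5.0.
+///         // Lanes 4..8 keep the value of `c` (1.0) because their mask bits are clear.
+///         let r = _mm512_mask3_fmsubadd_pd(a, b, c, 0b0000_1111);
+///         _mm512_storeu_pd(out.as_mut_ptr(), r);
+///     }
+///     assert_eq!(out, [7.0, 5.0, 7.0, 5.0, 1.0, 1.0, 1.0, 1.0]);
+/// }
+/// ```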
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsubadd_pd&expand=2681)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub fn _mm512_mask3_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
+    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), c) }
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmsubadd_pd&expand=2676)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub fn _mm256_mask_fmsubadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d {
+    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), a) }
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmsubadd_pd&expand=2678)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub fn _mm256_maskz_fmsubadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), _mm256_setzero_pd()) }
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmsubadd_pd&expand=2677)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub fn _mm256_mask3_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d {
+    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), c) }
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmsubadd_pd&expand=2672)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub fn _mm_mask_fmsubadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+    unsafe { simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), a) }
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmsubadd_pd&expand=2674)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub fn _mm_maskz_fmsubadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    unsafe { simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), _mm_setzero_pd()) }
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmsubadd_pd&expand=2673)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub fn _mm_mask3_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+    unsafe { simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), c) }
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmadd_ps&expand=2723)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+pub fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
+    unsafe { simd_fma(simd_neg(a), b, c) }
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
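+///
+/// A minimal usage sketch showing the negated multiply-add and the writemask (illustrative
+/// only; assumes `std` on x86_64 and runtime detection of `avx512f`):
+///
+/// ```ignore
+/// use std::arch::is_x86_feature_detected;
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") {
+///     let mut out = [0.0f32; 16];
+///     unsafe {
+///         // Sketch: values are chosen so every lane result is exactly representable.
+///         let (a, b, c) = (_mm512_set1_ps(2.0), _mm512_set1_ps(3.0), _mm512_set1_ps(10.0));
+///         // Lanes with a set mask bit compute -(2.0 * 3.0) + 10.0 = 4.0; the rest keep `a` (2.0).
+///         let r = _mm512_mask_fnmadd_ps(a, 0b00000000_11111111, b, c);
+///         _mm512_storeu_ps(out.as_mut_ptr(), r);
+///     }
+///     for (i, x) in out.iter().enumerate() {
+///         assert_eq!(*x, if i < 8 { 4.0 } else { 2.0 });
+///     }
+/// }
+/// ```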
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmadd_ps&expand=2724) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmadd_ps&expand=2726) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), _mm512_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmadd_ps&expand=2725) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm512_mask3_fnmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fnmadd_ps&expand=2720) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm256_mask_fnmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fnmadd_ps&expand=2722) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm256_maskz_fnmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), _mm256_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fnmadd_ps&expand=2721) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm256_mask3_fnmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fnmadd_ps&expand=2716) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm_mask_fnmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fnmadd_ps&expand=2718) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm_maskz_fnmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), _mm_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fnmadd_ps&expand=2717) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm_mask3_fnmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmadd_pd&expand=2711) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_fma(simd_neg(a), b, c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmadd_pd&expand=2712) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmadd_pd&expand=2714) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), _mm512_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
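+///
+/// A minimal usage sketch showing how unselected lanes keep the value of `c` (illustrative
+/// only; assumes `std` on x86_64 and runtime detection of `avx512f`):
+///
+/// ```ignore
+/// use std::arch::is_x86_feature_detected;
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") {
+///     let mut out = [0.0f64; 8];
+///     unsafe {
+///         // Sketch: values are chosen so every lane result is exactly representable.
+///         let (a, b, c) = (_mm512_set1_pd(2.0), _mm512_set1_pd(3.0), _mm512_set1_pd(10.0));
+///         // The low four lanes compute -(2.0 * 3.0) + 10.0 = 4.0; the high four keep `c` (10.0).
+///         let r = _mm512_mask3_fnmadd_pd(a, b, c, 0b0000_1111);
+///         _mm512_storeu_pd(out.as_mut_ptr(), r);
+///     }
+///     assert_eq!(out, [4.0, 4.0, 4.0, 4.0, 10.0, 10.0, 10.0, 10.0]);
+/// }
+/// ```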
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmadd_pd&expand=2713) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm512_mask3_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fnmadd_pd&expand=2708) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm256_mask_fnmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fnmadd_pd&expand=2710) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm256_maskz_fnmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), _mm256_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fnmadd_pd&expand=2709) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm256_mask3_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fnmadd_pd&expand=2704) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm_mask_fnmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fnmadd_pd&expand=2706) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm_maskz_fnmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), _mm_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fnmadd_pd&expand=2705) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm_mask3_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmsub_ps&expand=2771) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
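+///
+/// A minimal usage sketch showing the negated multiply-subtract and the writemask
+/// (illustrative only; assumes `std` on x86_64 and runtime detection of `avx512f`):
+///
+/// ```ignore
+/// use std::arch::is_x86_feature_detected;
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") {
+///     let mut out = [0.0f32; 16];
+///     unsafe {
+///         // Sketch: values are chosen so every lane result is exactly representable.
+///         let (a, b, c) = (_mm512_set1_ps(2.0), _mm512_set1_ps(3.0), _mm512_set1_ps(1.0));
+///         // Lanes with a set mask bit compute -(2.0 * 3.0) - 1.0 = -7.0; the rest keep `a` (2.0).
+///         let r = _mm512_mask_fnmsub_ps(a, 0b00000000_11111111, b, c);
+///         _mm512_storeu_ps(out.as_mut_ptr(), r);
+///     }
+///     for (i, x) in out.iter().enumerate() {
+///         assert_eq!(*x, if i < 8 { -7.0 } else { 2.0 });
+///     }
+/// }
+/// ```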
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmsub_ps&expand=2772) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmsub_ps&expand=2774) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), _mm512_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmsub_ps&expand=2773) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm512_mask3_fnmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fnmsub_ps&expand=2768) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm256_mask_fnmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fnmsub_ps&expand=2770) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm256_maskz_fnmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), _mm256_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fnmsub_ps&expand=2769) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm256_mask3_fnmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fnmsub_ps&expand=2764) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm_mask_fnmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fnmsub_ps&expand=2766) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm_maskz_fnmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), _mm_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fnmsub_ps&expand=2765) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm_mask3_fnmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmsub_pd&expand=2759) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmsub_pd&expand=2760) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmsub_pd&expand=2762) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), _mm512_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmsub_pd&expand=2761) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm512_mask3_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fnmsub_pd&expand=2756) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm256_mask_fnmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fnmsub_pd&expand=2758) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm256_maskz_fnmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), _mm256_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fnmsub_pd&expand=2757) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm256_mask3_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fnmsub_pd&expand=2752) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm_mask_fnmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fnmsub_pd&expand=2754) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm_maskz_fnmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), _mm_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fnmsub_pd&expand=2753) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm_mask3_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), c) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rcp14_ps&expand=4502) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm512_rcp14_ps(a: __m512) -> __m512 { + unsafe { transmute(vrcp14ps(a.as_f32x16(), f32x16::ZERO, 0b11111111_11111111)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
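+///
+/// A minimal usage sketch showing the documented error bound and the writemask
+/// (illustrative only; assumes `std` on x86_64 and runtime detection of `avx512f`):
+///
+/// ```ignore
+/// use std::arch::is_x86_feature_detected;
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") {
+///     let mut out = [0.0f32; 16];
+///     unsafe {
+///         // Sketch: a power-of-two input keeps the check simple.
+///         let src = _mm512_set1_ps(9.0);
+///         let a = _mm512_set1_ps(4.0);
+///         // Lanes 0..8 hold an approximation of 1.0 / 4.0; lanes 8..16 are copied from `src`.
+///         let r = _mm512_mask_rcp14_ps(src, 0b00000000_11111111, a);
+///         _mm512_storeu_ps(out.as_mut_ptr(), r);
+///     }
+///     for (i, x) in out.iter().enumerate() {
+///         if i < 8 {
+///             assert!((*x - 0.25).abs() <= 0.25 * 2f32.powi(-14));
+///         } else {
+///             assert_eq!(*x, 9.0);
+///         }
+///     }
+/// }
+/// ```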
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rcp14_ps&expand=4500) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm512_mask_rcp14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vrcp14ps(a.as_f32x16(), src.as_f32x16(), k)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rcp14_ps&expand=4501) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm512_maskz_rcp14_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vrcp14ps(a.as_f32x16(), f32x16::ZERO, k)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp14_ps&expand=4499) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm256_rcp14_ps(a: __m256) -> __m256 { + unsafe { transmute(vrcp14ps256(a.as_f32x8(), f32x8::ZERO, 0b11111111)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rcp14_ps&expand=4497) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm256_mask_rcp14_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vrcp14ps256(a.as_f32x8(), src.as_f32x8(), k)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rcp14_ps&expand=4498) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm256_maskz_rcp14_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vrcp14ps256(a.as_f32x8(), f32x8::ZERO, k)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. 
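+///
+/// A minimal usage sketch showing the documented error bound (illustrative only; assumes
+/// `std` on x86_64 and runtime detection of the required features):
+///
+/// ```ignore
+/// use std::arch::is_x86_feature_detected;
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///     let mut out = [0.0f32; 4];
+///     unsafe {
+///         // Sketch: lanes are (1.0, 2.0, 4.0, 8.0) from lowest to highest.
+///         let a = _mm_set_ps(8.0, 4.0, 2.0, 1.0);
+///         let r = _mm_rcp14_ps(a);
+///         _mm_storeu_ps(out.as_mut_ptr(), r);
+///     }
+///     // Each lane approximates the reciprocal to within a relative error of 2^-14.
+///     for (x, expected) in out.iter().zip([1.0f32, 0.5, 0.25, 0.125]) {
+///         assert!((*x - expected).abs() <= expected * 2f32.powi(-14));
+///     }
+/// }
+/// ```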
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp14_ps&expand=4496) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm_rcp14_ps(a: __m128) -> __m128 { + unsafe { transmute(vrcp14ps128(a.as_f32x4(), f32x4::ZERO, 0b00001111)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rcp14_ps&expand=4494) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm_mask_rcp14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vrcp14ps128(a.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rcp14_ps&expand=4495) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm_maskz_rcp14_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vrcp14ps128(a.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rcp14_pd&expand=4493) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm512_rcp14_pd(a: __m512d) -> __m512d { + unsafe { transmute(vrcp14pd(a.as_f64x8(), f64x8::ZERO, 0b11111111)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rcp14_pd&expand=4491) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm512_mask_rcp14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vrcp14pd(a.as_f64x8(), src.as_f64x8(), k)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rcp14_pd&expand=4492) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm512_maskz_rcp14_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vrcp14pd(a.as_f64x8(), f64x8::ZERO, k)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp14_pd&expand=4490) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm256_rcp14_pd(a: __m256d) -> __m256d { + unsafe { transmute(vrcp14pd256(a.as_f64x4(), f64x4::ZERO, 0b00001111)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rcp14_pd&expand=4488) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm256_mask_rcp14_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vrcp14pd256(a.as_f64x4(), src.as_f64x4(), k)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rcp14_pd&expand=4489) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm256_maskz_rcp14_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vrcp14pd256(a.as_f64x4(), f64x4::ZERO, k)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp14_pd&expand=4487) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm_rcp14_pd(a: __m128d) -> __m128d { + unsafe { transmute(vrcp14pd128(a.as_f64x2(), f64x2::ZERO, 0b00000011)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rcp14_pd&expand=4485) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm_mask_rcp14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vrcp14pd128(a.as_f64x2(), src.as_f64x2(), k)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rcp14_pd&expand=4486) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm_maskz_rcp14_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vrcp14pd128(a.as_f64x2(), f64x2::ZERO, k)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rsqrt14_ps&expand=4819) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm512_rsqrt14_ps(a: __m512) -> __m512 { + unsafe { transmute(vrsqrt14ps(a.as_f32x16(), f32x16::ZERO, 0b11111111_11111111)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rsqrt14_ps&expand=4817) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm512_mask_rsqrt14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vrsqrt14ps(a.as_f32x16(), src.as_f32x16(), k)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
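+///
+/// A minimal usage sketch showing the approximation and the zeromask (illustrative only;
+/// assumes `std` on x86_64 and runtime detection of `avx512f`):
+///
+/// ```ignore
+/// use std::arch::is_x86_feature_detected;
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") {
+///     let mut out = [0.0f32; 16];
+///     unsafe {
+///         // Sketch: a power-of-two input keeps the check simple.
+///         let a = _mm512_set1_ps(4.0);
+///         // Lanes 0..8 approximate 1.0 / sqrt(4.0) = 0.5; lanes 8..16 are zeroed.
+///         let r = _mm512_maskz_rsqrt14_ps(0b00000000_11111111, a);
+///         _mm512_storeu_ps(out.as_mut_ptr(), r);
+///     }
+///     for (i, x) in out.iter().enumerate() {
+///         if i < 8 {
+///             assert!((*x - 0.5).abs() <= 0.5 * 2f32.powi(-14));
+///         } else {
+///             assert_eq!(*x, 0.0);
+///         }
+///     }
+/// }
+/// ```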
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rsqrt14_ps&expand=4818) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm512_maskz_rsqrt14_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vrsqrt14ps(a.as_f32x16(), f32x16::ZERO, k)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt14_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm256_rsqrt14_ps(a: __m256) -> __m256 { + unsafe { transmute(vrsqrt14ps256(a.as_f32x8(), f32x8::ZERO, 0b11111111)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rsqrt14_ps&expand=4815) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm256_mask_rsqrt14_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vrsqrt14ps256(a.as_f32x8(), src.as_f32x8(), k)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rsqrt14_ps&expand=4816) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm256_maskz_rsqrt14_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vrsqrt14ps256(a.as_f32x8(), f32x8::ZERO, k)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt14_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm_rsqrt14_ps(a: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ps128(a.as_f32x4(), f32x4::ZERO, 0b00001111)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rsqrt14_ps&expand=4813) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm_mask_rsqrt14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ps128(a.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rsqrt14_ps&expand=4814) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm_maskz_rsqrt14_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ps128(a.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rsqrt14_pd&expand=4812) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm512_rsqrt14_pd(a: __m512d) -> __m512d { + unsafe { transmute(vrsqrt14pd(a.as_f64x8(), f64x8::ZERO, 0b11111111)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rsqrt14_pd&expand=4810) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm512_mask_rsqrt14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vrsqrt14pd(a.as_f64x8(), src.as_f64x8(), k)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
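+///
+/// A minimal usage sketch (illustrative values, not part of Intel's documentation),
+/// assuming AVX-512F support is detected at runtime:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         // SAFETY: AVX-512F support was verified just above.
+///         unsafe {
+///             // 1/sqrt(0.25) = 2.0; only lanes selected by the zeromask are kept.
+///             let a = _mm512_set1_pd(0.25);
+///             let r = _mm512_maskz_rsqrt14_pd(0b0000_1111, a);
+///             let mut out = [0.0f64; 8];
+///             _mm512_storeu_pd(out.as_mut_ptr(), r);
+///             assert!((out[0] - 2.0).abs() < 1e-3);
+///             assert_eq!(out[7], 0.0);
+///         }
+///     }
+/// }
+/// ```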
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rsqrt14_pd&expand=4811) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm512_maskz_rsqrt14_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vrsqrt14pd(a.as_f64x8(), f64x8::ZERO, k)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt14_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm256_rsqrt14_pd(a: __m256d) -> __m256d { + unsafe { transmute(vrsqrt14pd256(a.as_f64x4(), f64x4::ZERO, 0b00001111)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rsqrt14_pd&expand=4808) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm256_mask_rsqrt14_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vrsqrt14pd256(a.as_f64x4(), src.as_f64x4(), k)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rsqrt14_pd&expand=4809) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm256_maskz_rsqrt14_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vrsqrt14pd256(a.as_f64x4(), f64x4::ZERO, k)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt14_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm_rsqrt14_pd(a: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14pd128(a.as_f64x2(), f64x2::ZERO, 0b00000011)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rsqrt14_pd&expand=4806) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm_mask_rsqrt14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14pd128(a.as_f64x2(), src.as_f64x2(), k)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rsqrt14_pd&expand=4807) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm_maskz_rsqrt14_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14pd128(a.as_f64x2(), f64x2::ZERO, k)) } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getexp_ps&expand=2844) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm512_getexp_ps(a: __m512) -> __m512 { + unsafe { + transmute(vgetexpps( + a.as_f32x16(), + f32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getexp_ps&expand=2845) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm512_mask_getexp_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + transmute(vgetexpps( + a.as_f32x16(), + src.as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. 
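+///
+/// A minimal usage sketch (illustrative values, not part of Intel's documentation),
+/// assuming AVX-512F support is detected at runtime:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         // SAFETY: AVX-512F support was verified just above.
+///         unsafe {
+///             // getexp(10.0) = floor(log2(10.0)) = 3.0
+///             let a = _mm512_set1_ps(10.0);
+///             let r = _mm512_maskz_getexp_ps(0b00000000_11111111, a);
+///             let mut out = [0.0f32; 16];
+///             _mm512_storeu_ps(out.as_mut_ptr(), r);
+///             assert_eq!(out[0], 3.0);
+///             assert_eq!(out[15], 0.0); // masked-off lane is zeroed
+///         }
+///     }
+/// }
+/// ```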
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getexp_ps&expand=2846) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm512_maskz_getexp_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + transmute(vgetexpps( + a.as_f32x16(), + f32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_getexp_ps&expand=2841) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm256_getexp_ps(a: __m256) -> __m256 { + unsafe { transmute(vgetexpps256(a.as_f32x8(), f32x8::ZERO, 0b11111111)) } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_getexp_ps&expand=2842) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm256_mask_getexp_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vgetexpps256(a.as_f32x8(), src.as_f32x8(), k)) } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_getexp_ps&expand=2843) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm256_maskz_getexp_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vgetexpps256(a.as_f32x8(), f32x8::ZERO, k)) } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getexp_ps&expand=2838) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm_getexp_ps(a: __m128) -> __m128 { + unsafe { transmute(vgetexpps128(a.as_f32x4(), f32x4::ZERO, 0b00001111)) } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_getexp_ps&expand=2839) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm_mask_getexp_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vgetexpps128(a.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_getexp_ps&expand=2840) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm_maskz_getexp_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vgetexpps128(a.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getexp_pd&expand=2835) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm512_getexp_pd(a: __m512d) -> __m512d { + unsafe { + transmute(vgetexppd( + a.as_f64x8(), + f64x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. 
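+///
+/// A minimal usage sketch (illustrative values, not part of Intel's documentation),
+/// assuming AVX-512F support is detected at runtime:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         // SAFETY: AVX-512F support was verified just above.
+///         unsafe {
+///             let src = _mm512_set1_pd(-1.0);
+///             let a = _mm512_set1_pd(32.0); // getexp(32.0) = 5.0
+///             let r = _mm512_mask_getexp_pd(src, 0b0000_1111, a);
+///             let mut out = [0.0f64; 8];
+///             _mm512_storeu_pd(out.as_mut_ptr(), r);
+///             assert_eq!(out[0], 5.0);  // mask bit set: computed
+///             assert_eq!(out[7], -1.0); // mask bit clear: copied from src
+///         }
+///     }
+/// }
+/// ```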
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getexp_pd&expand=2836) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm512_mask_getexp_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { + transmute(vgetexppd( + a.as_f64x8(), + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getexp_pd&expand=2837) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm512_maskz_getexp_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + transmute(vgetexppd( + a.as_f64x8(), + f64x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_getexp_pd&expand=2832) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm256_getexp_pd(a: __m256d) -> __m256d { + unsafe { transmute(vgetexppd256(a.as_f64x4(), f64x4::ZERO, 0b00001111)) } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_getexp_pd&expand=2833) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm256_mask_getexp_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vgetexppd256(a.as_f64x4(), src.as_f64x4(), k)) } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_getexp_pd&expand=2834) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm256_maskz_getexp_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vgetexppd256(a.as_f64x4(), f64x4::ZERO, k)) } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getexp_pd&expand=2829) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm_getexp_pd(a: __m128d) -> __m128d { + unsafe { transmute(vgetexppd128(a.as_f64x2(), f64x2::ZERO, 0b00000011)) } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_getexp_pd&expand=2830) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm_mask_getexp_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vgetexppd128(a.as_f64x2(), src.as_f64x2(), k)) } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. 
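+///
+/// A minimal usage sketch (illustrative values, not part of Intel's documentation),
+/// assuming AVX-512F and AVX-512VL support are detected at runtime:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///         // SAFETY: the required features were verified just above.
+///         unsafe {
+///             // _mm_set_pd places the last argument in lane 0.
+///             let a = _mm_set_pd(8.0, 0.5);
+///             let r = _mm_maskz_getexp_pd(0b01, a);
+///             let mut out = [0.0f64; 2];
+///             _mm_storeu_pd(out.as_mut_ptr(), r);
+///             assert_eq!(out[0], -1.0); // getexp(0.5) = floor(log2(0.5))
+///             assert_eq!(out[1], 0.0);  // lane 1 masked off, zeroed
+///         }
+///     }
+/// }
+/// ```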
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_getexp_pd&expand=2831)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgetexppd))]
+pub fn _mm_maskz_getexp_pd(k: __mmask8, a: __m128d) -> __m128d {
+    unsafe { transmute(vgetexppd128(a.as_f64x2(), f64x2::ZERO, k)) }
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_roundscale_ps&expand=4784)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_roundscale_ps<const IMM8: i32>(a: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x16();
+        let r = vrndscaleps(
+            a,
+            IMM8,
+            f32x16::ZERO,
+            0b11111111_11111111,
+            _MM_FROUND_CUR_DIRECTION,
+        );
+        transmute(r)
+    }
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_roundscale_ps&expand=4782)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_roundscale_ps<const IMM8: i32>(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x16();
+        let src = src.as_f32x16();
+        let r = vrndscaleps(a, IMM8, src, k, _MM_FROUND_CUR_DIRECTION);
+        transmute(r)
+    }
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_roundscale_ps&expand=4783)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_roundscale_ps<const IMM8: i32>(k: __mmask16, a: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x16();
+        let r = vrndscaleps(a, IMM8, f32x16::ZERO, k, _MM_FROUND_CUR_DIRECTION);
+        transmute(r)
+    }
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_roundscale_ps&expand=4781)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 250))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm256_roundscale_ps<const IMM8: i32>(a: __m256) -> __m256 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x8();
+        let r = vrndscaleps256(a, IMM8, f32x8::ZERO, 0b11111111);
+        transmute(r)
+    }
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_roundscale_ps&expand=4779)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_roundscale_ps<const IMM8: i32>(src: __m256, k: __mmask8, a: __m256) -> __m256 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x8();
+        let src = src.as_f32x8();
+        let r = vrndscaleps256(a, IMM8, src, k);
+        transmute(r)
+    }
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_roundscale_ps&expand=4780)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_roundscale_ps<const IMM8: i32>(k: __mmask8, a: __m256) -> __m256 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x8();
+        let r = vrndscaleps256(a, IMM8, f32x8::ZERO, k);
+        transmute(r)
+    }
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_roundscale_ps&expand=4778)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 250))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm_roundscale_ps<const IMM8: i32>(a: __m128) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x4();
+        let r = vrndscaleps128(a, IMM8, f32x4::ZERO, 0b00001111);
+        transmute(r)
+    }
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_roundscale_ps&expand=4776)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_roundscale_ps<const IMM8: i32>(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x4();
+        let src = src.as_f32x4();
+        let r = vrndscaleps128(a, IMM8, src, k);
+        transmute(r)
+    }
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_roundscale_ps&expand=4777)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_roundscale_ps<const IMM8: i32>(k: __mmask8, a: __m128) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x4();
+        let r = vrndscaleps128(a, IMM8, f32x4::ZERO, k);
+        transmute(r)
+    }
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_roundscale_pd&expand=4775)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_roundscale_pd<const IMM8: i32>(a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f64x8();
+        let r = vrndscalepd(a, IMM8, f64x8::ZERO, 0b11111111, _MM_FROUND_CUR_DIRECTION);
+        transmute(r)
+    }
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_roundscale_pd&expand=4773)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_roundscale_pd<const IMM8: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f64x8();
+        let src = src.as_f64x8();
+        let r = vrndscalepd(a, IMM8, src, k, _MM_FROUND_CUR_DIRECTION);
+        transmute(r)
+    }
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_roundscale_pd&expand=4774)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_roundscale_pd<const IMM8: i32>(k: __mmask8, a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f64x8();
+        let r = vrndscalepd(a, IMM8, f64x8::ZERO, k, _MM_FROUND_CUR_DIRECTION);
+        transmute(r)
+    }
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_roundscale_pd&expand=4772)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm256_roundscale_pd<const IMM8: i32>(a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f64x4();
+        let r = vrndscalepd256(a, IMM8, f64x4::ZERO, 0b00001111);
+        transmute(r)
+    }
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_roundscale_pd&expand=4770)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_roundscale_pd<const IMM8: i32>(
+    src: __m256d,
+    k: __mmask8,
+    a: __m256d,
+) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f64x4();
+        let src = src.as_f64x4();
+        let r = vrndscalepd256(a, IMM8, src, k);
+        transmute(r)
+    }
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_roundscale_pd&expand=4771)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_roundscale_pd<const IMM8: i32>(k: __mmask8, a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f64x4();
+        let r = vrndscalepd256(a, IMM8, f64x4::ZERO, k);
+        transmute(r)
+    }
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_roundscale_pd&expand=4769)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm_roundscale_pd<const IMM8: i32>(a: __m128d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f64x2();
+        let r = vrndscalepd128(a, IMM8, f64x2::ZERO, 0b00000011);
+        transmute(r)
+    }
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_roundscale_pd&expand=4767)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_roundscale_pd<const IMM8: i32>(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f64x2();
+        let src = src.as_f64x2();
+        let r = vrndscalepd128(a, IMM8, src, k);
+        transmute(r)
+    }
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_roundscale_pd&expand=4768)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_roundscale_pd<const IMM8: i32>(k: __mmask8, a: __m128d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f64x2();
+        let r =
vrndscalepd128(a, IMM8, f64x2::ZERO, k); + transmute(r) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_scalef_ps&expand=4883) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm512_scalef_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(vscalefps( + a.as_f32x16(), + b.as_f32x16(), + f32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_scalef_ps&expand=4881) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm512_mask_scalef_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(vscalefps( + a.as_f32x16(), + b.as_f32x16(), + src.as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_scalef_ps&expand=4882) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm512_maskz_scalef_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(vscalefps( + a.as_f32x16(), + b.as_f32x16(), + f32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_scalef_ps&expand=4880) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm256_scalef_ps(a: __m256, b: __m256) -> __m256 { + unsafe { + transmute(vscalefps256( + a.as_f32x8(), + b.as_f32x8(), + f32x8::ZERO, + 0b11111111, + )) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
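+///
+/// A minimal usage sketch (illustrative values, not part of Intel's documentation):
+/// per Intel's pseudocode, `scalef` computes `a * 2^floor(b)` per lane. Assuming
+/// AVX-512F and AVX-512VL support are detected at runtime:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///         // SAFETY: the required features were verified just above.
+///         unsafe {
+///             let src = _mm256_set1_ps(-1.0);
+///             let a = _mm256_set1_ps(3.0);
+///             let b = _mm256_set1_ps(2.0);
+///             let r = _mm256_mask_scalef_ps(src, 0b0000_1111, a, b);
+///             let mut out = [0.0f32; 8];
+///             _mm256_storeu_ps(out.as_mut_ptr(), r);
+///             assert_eq!(out[0], 12.0); // 3.0 * 2^2
+///             assert_eq!(out[7], -1.0); // mask bit clear: copied from src
+///         }
+///     }
+/// }
+/// ```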
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_scalef_ps&expand=4878) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm256_mask_scalef_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { transmute(vscalefps256(a.as_f32x8(), b.as_f32x8(), src.as_f32x8(), k)) } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_scalef_ps&expand=4879) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm256_maskz_scalef_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { transmute(vscalefps256(a.as_f32x8(), b.as_f32x8(), f32x8::ZERO, k)) } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_scalef_ps&expand=4877) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm_scalef_ps(a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vscalefps128( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + 0b00001111, + )) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_scalef_ps&expand=4875) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm_mask_scalef_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vscalefps128(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_scalef_ps&expand=4876) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm_maskz_scalef_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vscalefps128(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_scalef_pd&expand=4874) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub fn _mm512_scalef_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + transmute(vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + f64x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_scalef_pd&expand=4872) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub fn _mm512_mask_scalef_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + transmute(vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_scalef_pd&expand=4873) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub fn _mm512_maskz_scalef_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + transmute(vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + f64x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_scalef_pd&expand=4871) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub fn _mm256_scalef_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { + transmute(vscalefpd256( + a.as_f64x4(), + b.as_f64x4(), + f64x4::ZERO, + 0b00001111, + )) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_scalef_pd&expand=4869) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub fn _mm256_mask_scalef_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { transmute(vscalefpd256(a.as_f64x4(), b.as_f64x4(), src.as_f64x4(), k)) } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
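+///
+/// A minimal usage sketch (illustrative values, not part of Intel's documentation),
+/// assuming AVX-512F and AVX-512VL support are detected at runtime:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///         // SAFETY: the required features were verified just above.
+///         unsafe {
+///             let a = _mm256_set1_pd(1.5);
+///             let b = _mm256_set1_pd(3.0);
+///             // Per Intel's pseudocode this computes a * 2^floor(b) per lane.
+///             let r = _mm256_maskz_scalef_pd(0b0101, a, b);
+///             let mut out = [0.0f64; 4];
+///             _mm256_storeu_pd(out.as_mut_ptr(), r);
+///             assert_eq!(out[0], 12.0); // 1.5 * 2^3
+///             assert_eq!(out[1], 0.0);  // masked-off lane is zeroed
+///         }
+///     }
+/// }
+/// ```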
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_scalef_pd&expand=4870)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub fn _mm256_maskz_scalef_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+    unsafe { transmute(vscalefpd256(a.as_f64x4(), b.as_f64x4(), f64x4::ZERO, k)) }
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_scalef_pd&expand=4868)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub fn _mm_scalef_pd(a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        transmute(vscalefpd128(
+            a.as_f64x2(),
+            b.as_f64x2(),
+            f64x2::ZERO,
+            0b00000011,
+        ))
+    }
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_scalef_pd&expand=4866)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub fn _mm_mask_scalef_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe { transmute(vscalefpd128(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) }
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_scalef_pd&expand=4867)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub fn _mm_maskz_scalef_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe { transmute(vscalefpd128(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, k)) }
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fixupimm_ps&expand=2499)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_fixupimm_ps<const IMM8: i32>(a: __m512, b: __m512, c: __m512i) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let c = c.as_i32x16();
+        let r = vfixupimmps(a, b, c, IMM8, 0b11111111_11111111, _MM_FROUND_CUR_DIRECTION);
+        transmute(r)
+    }
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fixupimm_ps&expand=2500)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_fixupimm_ps<const IMM8: i32>(
+    a: __m512,
+    k: __mmask16,
+    b: __m512,
+    c: __m512i,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let c = c.as_i32x16();
+        let r = vfixupimmps(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION);
+        transmute(r)
+    }
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fixupimm_ps&expand=2501)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_maskz_fixupimm_ps<const IMM8: i32>(
+    k: __mmask16,
+    a: __m512,
+    b: __m512,
+    c: __m512i,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let c = c.as_i32x16();
+        let r = vfixupimmpsz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION);
+        transmute(r)
+    }
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fixupimm_ps&expand=2496)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_fixupimm_ps<const IMM8: i32>(a: __m256, b: __m256, c: __m256i) -> __m256 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x8();
+        let b = b.as_f32x8();
+        let c = c.as_i32x8();
+        let r = vfixupimmps256(a, b, c, IMM8, 0b11111111);
+        transmute(r)
+    }
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
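+///
+/// A minimal usage sketch (illustrative values derived from the token table in
+/// Intel's pseudocode, not taken from the guide), assuming AVX-512F and AVX-512VL
+/// support are detected at runtime:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///         // SAFETY: the required features were verified just above.
+///         unsafe {
+///             let a = _mm256_set1_ps(7.0);
+///             // Lane 0 of `b` is zero; the other lanes hold ordinary values.
+///             let b = _mm256_set_ps(2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0);
+///             // Token table: map the ZERO class to +INF and let every other
+///             // class fall through to the corresponding element of `a`.
+///             let c = _mm256_set1_epi32(0x0000_0500);
+///             let r = _mm256_mask_fixupimm_ps::<0>(a, 0b1111_1111, b, c);
+///             let mut out = [0.0f32; 8];
+///             _mm256_storeu_ps(out.as_mut_ptr(), r);
+///             assert!(out[0].is_infinite() && out[0] > 0.0);
+///             assert_eq!(out[1], 7.0);
+///         }
+///     }
+/// }
+/// ```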
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fixupimm_ps&expand=2497) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_fixupimm_ps( + a: __m256, + k: __mmask8, + b: __m256, + c: __m256i, +) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let c = c.as_i32x8(); + let r = vfixupimmps256(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fixupimm_ps&expand=2498) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_maskz_fixupimm_ps( + k: __mmask8, + a: __m256, + b: __m256, + c: __m256i, +) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let c = c.as_i32x8(); + let r = vfixupimmpsz256(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fixupimm_ps&expand=2493) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fixupimm_ps(a: __m128, b: __m128, c: __m128i) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmps128(a, b, c, IMM8, 0b00001111); + transmute(r) + } +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fixupimm_ps&expand=2494) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fixupimm_ps( + a: __m128, + k: __mmask8, + b: __m128, + c: __m128i, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmps128(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fixupimm_ps&expand=2495) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fixupimm_ps( + k: __mmask8, + a: __m128, + b: __m128, + c: __m128i, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmpsz128(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fixupimm_pd&expand=2490) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fixupimm_pd(a: __m512d, b: __m512d, c: __m512i) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let c = c.as_i64x8(); + let r = vfixupimmpd(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fixupimm_pd&expand=2491) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fixupimm_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512i, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let c = c.as_i64x8(); + let r = vfixupimmpd(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fixupimm_pd&expand=2492) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fixupimm_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512i, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let c = c.as_i64x8(); + let r = vfixupimmpdz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fixupimm_pd&expand=2487) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_fixupimm_pd(a: __m256d, b: __m256d, c: __m256i) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let c = c.as_i64x4(); + let r = vfixupimmpd256(a, b, c, IMM8, 0b00001111); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fixupimm_pd&expand=2488) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_fixupimm_pd( + a: __m256d, + k: __mmask8, + b: __m256d, + c: __m256i, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let c = c.as_i64x4(); + let r = vfixupimmpd256(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fixupimm_pd&expand=2489) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_maskz_fixupimm_pd( + k: __mmask8, + a: __m256d, + b: __m256d, + c: __m256i, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let c = c.as_i64x4(); + let r = vfixupimmpdz256(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. 
imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fixupimm_pd&expand=2484) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fixupimm_pd(a: __m128d, b: __m128d, c: __m128i) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmpd128(a, b, c, IMM8, 0b00000011); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fixupimm_pd&expand=2485) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fixupimm_pd( + a: __m128d, + k: __mmask8, + b: __m128d, + c: __m128i, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmpd128(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fixupimm_pd&expand=2486) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fixupimm_pd( + k: __mmask8, + a: __m128d, + b: __m128d, + c: __m128i, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmpdz128(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
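All of the `_mask_`/`_maskz_` variants in this file share one selection rule: the full-width result is computed, then each lane whose mask bit is clear is taken from `src` (writemask) or zeroed (zeromask). That is exactly what the `simd_select_bitmask(k, r, ...)` calls in the ternary-logic and rounding intrinsics below express, while the `vscalef*`/`vfixupimm*`/`vgetmant*` helpers receive the mask directly. A scalar sketch of the rule, with hypothetical helper names:

```rust
/// AVX-512 style writemask: keep `result[i]` where bit i of `k` is set, else take `src[i]`.
fn mask_select(k: u8, result: &[f64; 8], src: &[f64; 8]) -> [f64; 8] {
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { result[i] } else { src[i] })
}

/// The `maskz` flavour is the same selection with a zero fallback.
fn maskz_select(k: u8, result: &[f64; 8]) -> [f64; 8] {
    mask_select(k, result, &[0.0; 8])
}

fn main() {
    let (r, src) = ([1.0; 8], [9.0; 8]);
    assert_eq!(mask_select(0b0000_0101, &r, &src)[..3], [1.0, 9.0, 1.0]);
    assert_eq!(maskz_select(0b0000_0101, &r)[..3], [1.0, 0.0, 1.0]);
}
```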
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_ternarylogic_epi32&expand=5867)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_ternarylogic_epi32<const IMM8: i32>(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        let c = c.as_i32x16();
+        let r = vpternlogd(a, b, c, IMM8);
+        transmute(r)
+    }
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_ternarylogic_epi32&expand=5865)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_ternarylogic_epi32<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let src = src.as_i32x16();
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        let r = vpternlogd(src, a, b, IMM8);
+        transmute(simd_select_bitmask(k, r, src))
+    }
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_ternarylogic_epi32&expand=5866)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_maskz_ternarylogic_epi32<const IMM8: i32>(
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        let c = c.as_i32x16();
+        let r = vpternlogd(a, b, c, IMM8);
+        transmute(simd_select_bitmask(k, r, i32x16::ZERO))
+    }
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ternarylogic_epi32&expand=5864) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_ternarylogic_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let c = c.as_i32x8(); + let r = vpternlogd256(a, b, c, IMM8); + transmute(r) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_ternarylogic_epi32&expand=5862) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_ternarylogic_epi32( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let src = src.as_i32x8(); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let r = vpternlogd256(src, a, b, IMM8); + transmute(simd_select_bitmask(k, r, src)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_ternarylogic_epi32&expand=5863) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_maskz_ternarylogic_epi32( + k: __mmask8, + a: __m256i, + b: __m256i, + c: __m256i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let c = c.as_i32x8(); + let r = vpternlogd256(a, b, c, IMM8); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ternarylogic_epi32&expand=5861) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_ternarylogic_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let c = c.as_i32x4(); + let r = vpternlogd128(a, b, c, IMM8); + transmute(r) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_ternarylogic_epi32&expand=5859) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_ternarylogic_epi32( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let src = src.as_i32x4(); + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let r = vpternlogd128(src, a, b, IMM8); + transmute(simd_select_bitmask(k, r, src)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_ternarylogic_epi32&expand=5860) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_ternarylogic_epi32( + k: __mmask8, + a: __m128i, + b: __m128i, + c: __m128i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let c = c.as_i32x4(); + let r = vpternlogd128(a, b, c, IMM8); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
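The ternary-logic intrinsics above all follow the rule spelled out in their doc comments: `IMM8` is an 8-entry truth table, and for every bit position the three source bits form a 3-bit index into it. A scalar sketch on a single `u64` word (the `ternary_logic_u64` helper name is mine; the index order assumes `a` supplies the most significant bit, as in Intel's pseudocode, although for symmetric tables such as XOR or majority the order does not matter):

```rust
/// Bit i of the result is bit ((a_i << 2) | (b_i << 1) | c_i) of the truth table `imm8`.
fn ternary_logic_u64(imm8: u8, a: u64, b: u64, c: u64) -> u64 {
    let mut out = 0u64;
    for i in 0..64 {
        let idx = (((a >> i) & 1) << 2) | (((b >> i) & 1) << 1) | ((c >> i) & 1);
        out |= ((imm8 as u64 >> idx) & 1) << i;
    }
    out
}

fn main() {
    // Truth table 0x96 is three-way XOR; 0xE8 is the bitwise majority function.
    let (a, b, c) = (0b1100u64, 0b1010, 0b1001);
    assert_eq!(ternary_logic_u64(0x96, a, b, c), a ^ b ^ c);
    assert_eq!(ternary_logic_u64(0xE8, a, b, c), (a & b) | (a & c) | (b & c));
}
```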
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_ternarylogic_epi64&expand=5876)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_ternarylogic_epi64<const IMM8: i32>(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x8();
+        let b = b.as_i64x8();
+        let c = c.as_i64x8();
+        let r = vpternlogq(a, b, c, IMM8);
+        transmute(r)
+    }
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_ternarylogic_epi64&expand=5874)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_ternarylogic_epi64<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let src = src.as_i64x8();
+        let a = a.as_i64x8();
+        let b = b.as_i64x8();
+        let r = vpternlogq(src, a, b, IMM8);
+        transmute(simd_select_bitmask(k, r, src))
+    }
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_ternarylogic_epi64&expand=5875)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_maskz_ternarylogic_epi64<const IMM8: i32>(
+    k: __mmask8,
+    a: __m512i,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x8();
+        let b = b.as_i64x8();
+        let c = c.as_i64x8();
+        let r = vpternlogq(a, b, c, IMM8);
+        transmute(simd_select_bitmask(k, r, i64x8::ZERO))
+    }
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ternarylogic_epi64&expand=5873) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_ternarylogic_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let b = b.as_i64x4(); + let c = c.as_i64x4(); + let r = vpternlogq256(a, b, c, IMM8); + transmute(r) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_ternarylogic_epi64&expand=5871) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_ternarylogic_epi64( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let src = src.as_i64x4(); + let a = a.as_i64x4(); + let b = b.as_i64x4(); + let r = vpternlogq256(src, a, b, IMM8); + transmute(simd_select_bitmask(k, r, src)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_ternarylogic_epi64&expand=5872) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_maskz_ternarylogic_epi64( + k: __mmask8, + a: __m256i, + b: __m256i, + c: __m256i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let b = b.as_i64x4(); + let c = c.as_i64x4(); + let r = vpternlogq256(a, b, c, IMM8); + transmute(simd_select_bitmask(k, r, i64x4::ZERO)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ternarylogic_epi64&expand=5870) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_ternarylogic_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let b = b.as_i64x2(); + let c = c.as_i64x2(); + let r = vpternlogq128(a, b, c, IMM8); + transmute(r) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_ternarylogic_epi64&expand=5868) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_ternarylogic_epi64( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let src = src.as_i64x2(); + let a = a.as_i64x2(); + let b = b.as_i64x2(); + let r = vpternlogq128(src, a, b, IMM8); + transmute(simd_select_bitmask(k, r, src)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_ternarylogic_epi64&expand=5869) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_ternarylogic_epi64( + k: __mmask8, + a: __m128i, + b: __m128i, + c: __m128i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let b = b.as_i64x2(); + let c = c.as_i64x2(); + let r = vpternlogq128(a, b, c, IMM8); + transmute(simd_select_bitmask(k, r, i64x2::ZERO)) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
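When these intrinsics are called from Rust, the truth table travels as a const generic (the `#[rustc_legacy_const_generics]` attribute additionally accepts the old positional-argument call form). A sketch of the call shape; the `xor3_epi64` wrapper name is made up for illustration, and a real caller must first verify AVX-512F support, for example with `is_x86_feature_detected!("avx512f")`:

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Three-way XOR of 512-bit vectors in a single VPTERNLOGQ (truth table 0x96).
///
/// # Safety
/// The caller must ensure the CPU supports AVX-512F.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn xor3_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
    _mm512_ternarylogic_epi64::<0x96>(a, b, c)
}
```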
+/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getmant_ps&expand=2880) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +pub fn _mm512_getmant_ps( + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x16(); + let zero = f32x16::ZERO; + let r = vgetmantps( + a, + SIGN << 2 | NORM, + zero, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getmant_ps&expand=2881) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm512_mask_getmant_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m512, + k: __mmask16, + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x16(); + let src = src.as_f32x16(); + let r = vgetmantps(a, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getmant_ps&expand=2882) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm512_maskz_getmant_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask16, + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x16(); + let r = vgetmantps( + a, + SIGN << 2 | NORM, + f32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_getmant_ps&expand=2877) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +pub fn _mm256_getmant_ps( + a: __m256, +) -> __m256 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x8(); + let r = vgetmantps256(a, SIGN << 2 | NORM, f32x8::ZERO, 0b11111111); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_getmant_ps&expand=2878) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm256_mask_getmant_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m256, + k: __mmask8, + a: __m256, +) -> __m256 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x8(); + let src = src.as_f32x8(); + let r = vgetmantps256(a, SIGN << 2 | NORM, src, k); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_getmant_ps&expand=2879) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm256_maskz_getmant_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m256, +) -> __m256 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x8(); + let r = vgetmantps256(a, SIGN << 2 | NORM, f32x8::ZERO, k); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
+/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getmant_ps&expand=2874) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +pub fn _mm_getmant_ps( + a: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let r = vgetmantps128(a, SIGN << 2 | NORM, f32x4::ZERO, 0b00001111); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_getmant_ps&expand=2875) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_mask_getmant_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m128, + k: __mmask8, + a: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let src = src.as_f32x4(); + let r = vgetmantps128(a, SIGN << 2 | NORM, src, k); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_getmant_ps&expand=2876) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm_maskz_getmant_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let r = vgetmantps128(a, SIGN << 2 | NORM, f32x4::ZERO, k); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getmant_pd&expand=2871) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +pub fn _mm512_getmant_pd( + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x8(); + let zero = f64x8::ZERO; + let r = vgetmantpd( + a, + SIGN << 2 | NORM, + zero, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
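`getmant` leaves the mantissa bits of each element alone and only rewrites the exponent (and, depending on the SIGN setting, the sign) so that the magnitude lands in the interval selected by NORM. A scalar sketch of the `_MM_MANT_NORM_1_2` / `_MM_MANT_SIGN_src` combination (hypothetical `getmant_1_2` helper; valid for normal, non-zero, finite inputs, while the instruction additionally defines NaN, zero, infinity and denormal results):

```rust
/// Keep the sign and mantissa bits, force the biased exponent to 1023,
/// so |result| lands in [1, 2) as _MM_MANT_NORM_1_2 requests.
fn getmant_1_2(x: f64) -> f64 {
    let sign_and_mantissa = x.to_bits() & 0x800F_FFFF_FFFF_FFFF;
    f64::from_bits(sign_and_mantissa | (1023u64 << 52))
}

fn main() {
    assert_eq!(getmant_1_2(24.0), 1.5);    // 24 = 1.5 * 2^4
    assert_eq!(getmant_1_2(-0.375), -1.5); // -0.375 = -1.5 * 2^-2
}
```

The NORM/SIGN pair is what the two const generics on the intrinsics above encode; they are packed as `SIGN << 2 | NORM` when forwarded to the underlying `vgetmant*` call.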
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getmant_pd&expand=2872) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm512_mask_getmant_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m512d, + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x8(); + let src = src.as_f64x8(); + let r = vgetmantpd(a, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getmant_pd&expand=2873) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm512_maskz_getmant_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x8(); + let r = vgetmantpd( + a, + SIGN << 2 | NORM, + f64x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_getmant_pd&expand=2868) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +pub fn _mm256_getmant_pd( + a: __m256d, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x4(); + let r = vgetmantpd256(a, SIGN << 2 | NORM, f64x4::ZERO, 0b00001111); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_getmant_pd&expand=2869) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm256_mask_getmant_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m256d, + k: __mmask8, + a: __m256d, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x4(); + let src = src.as_f64x4(); + let r = vgetmantpd256(a, SIGN << 2 | NORM, src, k); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_getmant_pd&expand=2870) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm256_maskz_getmant_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m256d, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x4(); + let r = vgetmantpd256(a, SIGN << 2 | NORM, f64x4::ZERO, k); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getmant_pd&expand=2865) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +pub fn _mm_getmant_pd( + a: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let r = vgetmantpd128(a, SIGN << 2 | NORM, f64x2::ZERO, 0b00000011); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_getmant_pd&expand=2866) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_mask_getmant_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m128d, + k: __mmask8, + a: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let src = src.as_f64x2(); + let r = vgetmantpd128(a, SIGN << 2 | NORM, src, k); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_getmant_pd&expand=2867) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm_maskz_getmant_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let r = vgetmantpd128(a, SIGN << 2 | NORM, f64x2::ZERO, k); + transmute(r) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | 
[`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_round_ps&expand=145) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_add_round_ps<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vaddps(a, b, ROUNDING); + transmute(r) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_round_ps&expand=146) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_add_round_ps<const ROUNDING: i32>( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vaddps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_round_ps&expand=147) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_add_round_ps<const ROUNDING: i32>( + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vaddps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Add packed double-precision (64-bit)
floating-point elements in a and b, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_round_pd&expand=142) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_add_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vaddpd(a, b, ROUNDING); + transmute(r) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_round_pd&expand=143) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_add_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vaddpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_round_pd&expand=144) +#[inline] +#[target_feature(enable = "avx512f")] 
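+// A minimal usage sketch (illustrative only): the rounding mode of the `*_round`
+// intrinsics above is a const generic, so it is supplied with turbofish syntax.
+// The wrapper name `add_pd_toward_zero` is hypothetical, and the sketch assumes
+// AVX-512F support has already been verified (e.g. with `is_x86_feature_detected!`).
+//
+//     use core::arch::x86_64::*;
+//
+//     #[target_feature(enable = "avx512f")]
+//     unsafe fn add_pd_toward_zero(a: __m512d, b: __m512d) -> __m512d {
+//         _mm512_add_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b)
+//     }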
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_add_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vaddpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_round_ps&expand=5739) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_sub_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vsubps(a, b, ROUNDING); + transmute(r) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_round_ps&expand=5737) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_sub_round_ps( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vsubps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one 
of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_round_ps&expand=5738) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_sub_round_ps( + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vsubps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_round_pd&expand=5736) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_sub_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vsubpd(a, b, ROUNDING); + transmute(r) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_round_pd&expand=5734) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))] 
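+// A minimal sketch of the writemask behaviour shared by the `_mask_*_round`
+// intrinsics: lanes whose mask bit is clear keep the corresponding value from `src`.
+// The helper name `sub_low_lanes` is hypothetical and assumes AVX-512F support was
+// checked at runtime.
+//
+//     #[target_feature(enable = "avx512f")]
+//     unsafe fn sub_low_lanes(src: __m512d, a: __m512d, b: __m512d) -> __m512d {
+//         // Lanes 0..4 receive a - b (current MXCSR rounding); lanes 4..8 keep src.
+//         _mm512_mask_sub_round_pd::<_MM_FROUND_CUR_DIRECTION>(src, 0b0000_1111, a, b)
+//     }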
+#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_sub_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vsubpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_round_pd&expand=5735) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_sub_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vsubpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_round_ps&expand=3940) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_mul_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmulps(a, b, ROUNDING); + transmute(r) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] 
| [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_round_ps&expand=3938) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_mul_round_ps( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmulps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_round_ps&expand=3939) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_mul_round_ps( + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmulps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_round_pd&expand=3937) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_mul_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmulpd(a, b, ROUNDING); + transmute(r) + } +} + +/// Multiply packed double-precision 
(64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_round_pd&expand=3935) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_mul_round_pd<const ROUNDING: i32>( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmulpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_round_pd&expand=3939) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_mul_round_pd<const ROUNDING: i32>( + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmulpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_div_round_ps&expand=2168) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_div_round_ps<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vdivps(a, b, ROUNDING); + transmute(r) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_div_round_ps&expand=2169) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_div_round_ps<const ROUNDING: i32>( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vdivps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_div_round_ps&expand=2170) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_div_round_ps<const ROUNDING: i32>( + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vdivps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.\ +/// +/// Rounding is done according to
the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_div_round_pd&expand=2165) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_div_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vdivpd(a, b, ROUNDING); + transmute(r) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_div_round_pd&expand=2166) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_div_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vdivpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_div_round_pd&expand=2167) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] 
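+// A minimal sketch of the zeromask behaviour shared by the `_maskz_*_round`
+// intrinsics: lanes whose mask bit is clear are set to zero instead of being taken
+// from a `src` operand. The helper name `div_low_lanes` is hypothetical and assumes
+// AVX-512F support was detected at runtime.
+//
+//     #[target_feature(enable = "avx512f")]
+//     unsafe fn div_low_lanes(a: __m512d, b: __m512d) -> __m512d {
+//         // Lanes 0..4 receive a / b rounded to nearest; lanes 4..8 become 0.0.
+//         _mm512_maskz_div_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0b0000_1111, a, b)
+//     }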
+#[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_div_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vdivpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sqrt_round_ps&expand=5377) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_sqrt_round_ps(a: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vsqrtps(a, ROUNDING); + transmute(r) + } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sqrt_round_ps&expand=5375) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_sqrt_round_ps( + src: __m512, + k: __mmask16, + a: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vsqrtps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and 
suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sqrt_round_ps&expand=5376) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_sqrt_round_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vsqrtps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sqrt_round_pd&expand=5374) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_sqrt_round_pd(a: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vsqrtpd(a, ROUNDING); + transmute(r) + } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sqrt_round_pd&expand=5372) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_sqrt_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vsqrtpd(a, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out 
when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sqrt_round_pd&expand=5373) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_sqrt_round_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vsqrtpd(a, ROUNDING); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmadd_round_ps&expand=2565) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmadd_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132psround(a, b, c, ROUNDING) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmadd_round_ps&expand=2566) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] 
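+// A minimal sketch for the fused multiply-add rounding intrinsics: `a * b + c` is
+// computed with a single rounding step, selected via the const generic. The helper
+// name `fma_round_up` is hypothetical and assumes AVX-512F support was verified.
+//
+//     #[target_feature(enable = "avx512f")]
+//     unsafe fn fma_round_up(a: __m512, b: __m512, c: __m512) -> __m512 {
+//         _mm512_fmadd_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a, b, c)
+//     }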
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmadd_round_ps<const ROUNDING: i32>( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), a) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmadd_round_ps&expand=2568) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmadd_round_ps<const ROUNDING: i32>( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), _mm512_setzero_ps()) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmadd_round_ps&expand=2567) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmadd_round_ps<const ROUNDING: i32>( + a: __m512, + b: __m512, + c: __m512, + k: __mmask16, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), c) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] |
[`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmadd_round_pd&expand=2561) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmadd_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132pdround(a, b, c, ROUNDING) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmadd_round_pd&expand=2562) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmadd_round_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), a) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmadd_round_pd&expand=2564) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 
8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmadd_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), _mm512_setzero_pd()) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmadd_round_pd&expand=2563) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmadd_round_pd( + a: __m512d, + b: __m512d, + c: __m512d, + k: __mmask8, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), c) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsub_round_ps&expand=2651) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmsub_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132psround(a, b, simd_neg(c), ROUNDING) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to 
nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsub_round_ps&expand=2652) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmsub_round_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsub_round_ps&expand=2654) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmsub_round_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsub_round_ps&expand=2653) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmsub_round_ps( + a: __m512, + b: __m512, + c: __m512, + k: __mmask16, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsub_round_pd&expand=2647) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmsub_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132pdround(a, b, simd_neg(c), ROUNDING) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsub_round_pd&expand=2648) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang generates fmadd, gcc generates fmsub +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmsub_round_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsub_round_pd&expand=2650) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmsub_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsub_round_pd&expand=2649) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang generates fmadd, gcc generates fmsub +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmsub_round_pd( + a: __m512d, + b: __m512d, + c: __m512d, + k: __mmask8, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmaddsub_round_ps&expand=2619) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmaddsub_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddsubpsround(a, b, c, ROUNDING) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmaddsub_round_ps&expand=2620) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmaddsub_round_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), a) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * 
[`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmaddsub_round_ps&expand=2622) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmaddsub_round_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), _mm512_setzero_ps()) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmaddsub_round_ps&expand=2621) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmaddsub_round_ps( + a: __m512, + b: __m512, + c: __m512, + k: __mmask16, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), c) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmaddsub_round_pd&expand=2615) +#[inline] +#[target_feature(enable = 
"avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmaddsub_round_pd( + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddsubpdround(a, b, c, ROUNDING) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmaddsub_round_pd&expand=2616) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmaddsub_round_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), a) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmaddsub_round_pd&expand=2618) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmaddsub_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), _mm512_setzero_pd()) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the 
intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmaddsub_round_pd&expand=2617) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmaddsub_round_pd( + a: __m512d, + b: __m512d, + c: __m512d, + k: __mmask8, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), c) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsubadd_round_ps&expand=2699) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmsubadd_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddsubpsround(a, b, simd_neg(c), ROUNDING) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsubadd_round_ps&expand=2700) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmsubadd_round_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsubadd_round_ps&expand=2702) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmsubadd_round_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsubadd_round_ps&expand=2701) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmsubadd_round_ps( + a: __m512, + b: __m512, + c: __m512, + k: __mmask16, +) -> __m512 { + unsafe { + 
static_assert_rounding!(ROUNDING); + let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsubadd_round_pd&expand=2695) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmsubadd_round_pd( + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddsubpdround(a, b, simd_neg(c), ROUNDING) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsubadd_round_pd&expand=2696) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmsubadd_round_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down 
and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsubadd_round_pd&expand=2698) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmsubadd_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsubadd_round_pd&expand=2697) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmsubadd_round_pd( + a: __m512d, + b: __m512d, + c: __m512d, + k: __mmask8, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmadd_round_ps&expand=2731) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or 
vfnmadd213ps or vfnmadd231ps +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fnmadd_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132psround(simd_neg(a), b, c, ROUNDING) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmadd_round_ps&expand=2732) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fnmadd_round_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmadd_round_ps&expand=2734) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fnmadd_round_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can 
be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmadd_round_ps&expand=2733) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fnmadd_round_ps( + a: __m512, + b: __m512, + c: __m512, + k: __mmask16, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmadd_round_pd&expand=2711) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fnmadd_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132pdround(simd_neg(a), b, c, ROUNDING) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmadd_round_pd&expand=2728) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, 
assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fnmadd_round_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmadd_round_pd&expand=2730) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fnmadd_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmadd_round_pd&expand=2729) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fnmadd_round_pd( + a: __m512d, + b: __m512d, + c: __m512d, + k: __mmask8, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\ +/// +/// Rounding is 
done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmsub_round_ps&expand=2779) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fnmsub_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmsub_round_ps&expand=2780) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fnmsub_round_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmsub_round_ps&expand=2782) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fnmsub_round_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmsub_round_ps&expand=2781) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fnmsub_round_ps( + a: __m512, + b: __m512, + c: __m512, + k: __mmask16, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmsub_round_pd&expand=2775) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fnmsub_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING) + } +} + +/// Multiply packed double-precision (64-bit) floating-point 
elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmsub_round_pd&expand=2776) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fnmsub_round_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmsub_round_pd&expand=2778) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fnmsub_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | 
[`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmsub_round_pd&expand=2777) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fnmsub_round_pd( + a: __m512d, + b: __m512d, + c: __m512d, + k: __mmask8, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_round_ps&expand=3662) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_max_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmaxps(a, b, SAE); + transmute(r) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_round_ps&expand=3660) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_max_round_ps( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmaxps(a, b, SAE); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
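// --- Editorial aside (illustrative sketch, not part of the vendored stdarch
// diff): how the `*_fmsub_round_ps` intrinsics above are typically called.
// The rounding mode is the const parameter (`ROUNDING` in the declarations,
// exposed through `rustc_legacy_const_generics`), supplied here via turbofish.
// Assumes Rust 1.89+ with the `avx512f` target feature; the helper names
// `fmsub_to_nearest` / `fmsub_truncate_zeroing` are made up for the example.
#[cfg(target_arch = "x86_64")]
mod fmsub_round_sketch {
    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    pub fn fmsub_to_nearest(a: __m512, b: __m512, c: __m512) -> __m512 {
        // (a * b) - c, rounded to nearest with floating-point exceptions suppressed.
        _mm512_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c)
    }

    #[target_feature(enable = "avx512f")]
    pub fn fmsub_truncate_zeroing(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
        // Zeromask form: lanes whose bit in `k` is clear are zeroed; the
        // `_mm512_mask_*` form would keep `a`, and `_mm512_mask3_*` would keep `c`.
        _mm512_maskz_fmsub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(k, a, b, c)
    }
}
// ---------------------------------------------------------------------------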
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_round_ps&expand=3661) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_max_round_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmaxps(a, b, SAE); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_round_pd&expand=3659) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_max_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmaxpd(a, b, SAE); + transmute(r) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_round_pd&expand=3657) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_max_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmaxpd(a, b, SAE); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_round_pd&expand=3658) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_max_round_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmaxpd(a, b, SAE); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
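// --- Editorial aside (illustrative sketch, not part of the vendored stdarch
// diff): the `*_max_round_*` / `*_min_round_*` intrinsics above take only an
// exception-suppression constant (`SAE`), not a rounding mode, since max/min
// never round. Assumes Rust 1.89+ with `avx512f`; helper names are made up.
#[cfg(target_arch = "x86_64")]
mod max_round_sketch {
    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    pub fn max_masked(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
        // Writemask form: lanes whose bit in `k` is clear are copied from `src`.
        _mm512_mask_max_round_pd::<_MM_FROUND_NO_EXC>(src, k, a, b)
    }

    #[target_feature(enable = "avx512f")]
    pub fn max_zeroing(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
        // Zeromask form: lanes whose bit in `k` is clear become 0.0; the
        // `_mm512_min_round_*` family follows exactly the same pattern.
        _mm512_maskz_max_round_pd::<_MM_FROUND_NO_EXC>(k, a, b)
    }
}
// ---------------------------------------------------------------------------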
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_round_ps&expand=3776)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vminps, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_min_round_ps<const SAE: i32>(a: __m512, b: __m512) -> __m512 {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let r = vminps(a, b, SAE);
+        transmute(r)
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_round_ps&expand=3774)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vminps, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_min_round_ps<const SAE: i32>(
+    src: __m512,
+    k: __mmask16,
+    a: __m512,
+    b: __m512,
+) -> __m512 {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let r = vminps(a, b, SAE);
+        transmute(simd_select_bitmask(k, r, src.as_f32x16()))
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_round_ps&expand=3775)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vminps, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_min_round_ps<const SAE: i32>(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let r = vminps(a, b, SAE);
+        transmute(simd_select_bitmask(k, r, f32x16::ZERO))
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_round_pd&expand=3773)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vminpd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_min_round_pd<const SAE: i32>(a: __m512d, b: __m512d) -> __m512d {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let r = vminpd(a, b, SAE);
+        transmute(r)
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_round_pd&expand=3771)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vminpd, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_min_round_pd<const SAE: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let r = vminpd(a, b, SAE);
+        transmute(simd_select_bitmask(k, r, src.as_f64x8()))
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_round_pd&expand=3772)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vminpd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_min_round_pd<const SAE: i32>(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let r = vminpd(a, b, SAE);
+        transmute(simd_select_bitmask(k, r, f64x8::ZERO))
+    }
+}
+
+/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getexp_round_ps&expand=2850)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_getexp_round_ps<const SAE: i32>(a: __m512) -> __m512 {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x16();
+        let r = vgetexpps(a, f32x16::ZERO, 0b11111111_11111111, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getexp_round_ps&expand=2851)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_getexp_round_ps<const SAE: i32>(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x16();
+        let src = src.as_f32x16();
+        let r = vgetexpps(a, src, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getexp_round_ps&expand=2852)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_getexp_round_ps<const SAE: i32>(k: __mmask16, a: __m512) -> __m512 {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x16();
+        let r = vgetexpps(a, f32x16::ZERO, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getexp_round_pd&expand=2847)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_getexp_round_pd<const SAE: i32>(a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f64x8();
+        let r = vgetexppd(a, f64x8::ZERO, 0b11111111, SAE);
+        transmute(r)
+    }
+}
+
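+// A minimal usage sketch: `example_getexp_round_ps` is a hypothetical helper, not an
+// intrinsic. `_mm512_getexp_round_ps` computes floor(log2(|x|)) per lane, so an input of
+// 8.0 yields 3.0 in every lane; the SAE const generic suppresses exceptions.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+fn example_getexp_round_ps() -> __m512 {
+    let a = _mm512_set1_ps(8.0);
+    _mm512_getexp_round_ps::<_MM_FROUND_NO_EXC>(a)
+}
+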
+/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getexp_round_pd&expand=2848)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_getexp_round_pd<const SAE: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+) -> __m512d {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f64x8();
+        let src = src.as_f64x8();
+        let r = vgetexppd(a, src, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getexp_round_pd&expand=2849)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_getexp_round_pd<const SAE: i32>(k: __mmask8, a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f64x8();
+        let r = vgetexppd(a, f64x8::ZERO, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_roundscale_round_ps&expand=4790)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(1, 2)]
+pub fn _mm512_roundscale_round_ps<const IMM8: i32, const SAE: i32>(a: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f32x16();
+        let r = vrndscaleps(a, IMM8, f32x16::ZERO, 0b11111111_11111111, SAE);
+        transmute(r)
+    }
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_roundscale_round_ps&expand=4788)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+pub fn _mm512_mask_roundscale_round_ps<const IMM8: i32, const SAE: i32>(
+    src: __m512,
+    k: __mmask16,
+    a: __m512,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f32x16();
+        let src = src.as_f32x16();
+        let r = vrndscaleps(a, IMM8, src, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_roundscale_round_ps&expand=4789)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+pub fn _mm512_maskz_roundscale_round_ps<const IMM8: i32, const SAE: i32>(
+    k: __mmask16,
+    a: __m512,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f32x16();
+        let r = vrndscaleps(a, IMM8, f32x16::ZERO, k, SAE);
+        transmute(r)
+    }
+}
+
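+// A minimal usage sketch: `example_roundscale_round_ps` is a hypothetical helper, not an
+// intrinsic. IMM8 = 0x10 keeps one fraction bit (bits 7:4 = 1) and selects round-to-nearest
+// (bits 2:0 = 0), so every lane is rounded to the nearest multiple of 0.5; the SAE const
+// generic suppresses exceptions.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+fn example_roundscale_round_ps(a: __m512) -> __m512 {
+    _mm512_roundscale_round_ps::<0x10, _MM_FROUND_NO_EXC>(a)
+}
+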
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_roundscale_round_pd&expand=4787)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(1, 2)]
+pub fn _mm512_roundscale_round_pd<const IMM8: i32, const SAE: i32>(a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f64x8();
+        let r = vrndscalepd(a, IMM8, f64x8::ZERO, 0b11111111, SAE);
+        transmute(r)
+    }
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_roundscale_round_pd&expand=4785)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+pub fn _mm512_mask_roundscale_round_pd<const IMM8: i32, const SAE: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f64x8();
+        let src = src.as_f64x8();
+        let r = vrndscalepd(a, IMM8, src, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_roundscale_round_pd&expand=4786)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+pub fn _mm512_maskz_roundscale_round_pd<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m512d,
+) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f64x8();
+        let r = vrndscalepd(a, IMM8, f64x8::ZERO, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_scalef_round_ps&expand=4889)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_scalef_round_ps<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let r = vscalefps(a, b, f32x16::ZERO, 0b11111111_11111111, ROUNDING);
+        transmute(r)
+    }
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_scalef_round_ps&expand=4887)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_scalef_round_ps<const ROUNDING: i32>(
+    src: __m512,
+    k: __mmask16,
+    a: __m512,
+    b: __m512,
+) -> __m512 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let src = src.as_f32x16();
+        let r = vscalefps(a, b, src, k, ROUNDING);
+        transmute(r)
+    }
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_scalef_round_ps&expand=4888)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_scalef_round_ps<const ROUNDING: i32>(
+    k: __mmask16,
+    a: __m512,
+    b: __m512,
+) -> __m512 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let r = vscalefps(a, b, f32x16::ZERO, k, ROUNDING);
+        transmute(r)
+    }
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_scalef_round_pd&expand=4886)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_scalef_round_pd<const ROUNDING: i32>(a: __m512d, b: __m512d) -> __m512d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let r = vscalefpd(a, b, f64x8::ZERO, 0b11111111, ROUNDING);
+        transmute(r)
+    }
+}
+
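+// A minimal usage sketch: `example_scalef_round_pd` is a hypothetical helper, not an
+// intrinsic. `_mm512_scalef_round_pd` computes a * 2^floor(b) per lane, so scaling 3.0 by
+// 2.0 yields 12.0; the const generic combines a rounding mode with exception suppression.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+fn example_scalef_round_pd() -> __m512d {
+    let a = _mm512_set1_pd(3.0);
+    let b = _mm512_set1_pd(2.0);
+    _mm512_scalef_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b)
+}
+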
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_scalef_round_pd&expand=4884)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_scalef_round_pd<const ROUNDING: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let src = src.as_f64x8();
+        let r = vscalefpd(a, b, src, k, ROUNDING);
+        transmute(r)
+    }
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_scalef_round_pd&expand=4885)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_scalef_round_pd<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let r = vscalefpd(a, b, f64x8::ZERO, k, ROUNDING);
+        transmute(r)
+    }
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fixupimm_round_ps&expand=2505)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+pub fn _mm512_fixupimm_round_ps<const IMM8: i32, const SAE: i32>(
+    a: __m512,
+    b: __m512,
+    c: __m512i,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let c = c.as_i32x16();
+        let r = vfixupimmps(a, b, c, IMM8, 0b11111111_11111111, SAE);
+        transmute(r)
+    }
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fixupimm_round_ps&expand=2506)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub fn _mm512_mask_fixupimm_round_ps<const IMM8: i32, const SAE: i32>(
+    a: __m512,
+    k: __mmask16,
+    b: __m512,
+    c: __m512i,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let c = c.as_i32x16();
+        let r = vfixupimmps(a, b, c, IMM8, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fixupimm_round_ps&expand=2507)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub fn _mm512_maskz_fixupimm_round_ps<const IMM8: i32, const SAE: i32>(
+    k: __mmask16,
+    a: __m512,
+    b: __m512,
+    c: __m512i,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let c = c.as_i32x16();
+        let r = vfixupimmpsz(a, b, c, IMM8, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fixupimm_round_pd&expand=2502)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+pub fn _mm512_fixupimm_round_pd<const IMM8: i32, const SAE: i32>(
+    a: __m512d,
+    b: __m512d,
+    c: __m512i,
+) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let c = c.as_i64x8();
+        let r = vfixupimmpd(a, b, c, IMM8, 0b11111111, SAE);
+        transmute(r)
+    }
+}
+
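+// A minimal call sketch: `example_fixupimm_round_pd` is a hypothetical helper, not an
+// intrinsic, and only demonstrates how the IMM8 and SAE const generics are supplied.
+// With an all-zero token table in `c` it is intended as a no-op fixup (token 0 keeps the
+// destination element), not as an illustration of the full fixup-table semantics.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+fn example_fixupimm_round_pd(a: __m512d, b: __m512d) -> __m512d {
+    let c = _mm512_setzero_si512();
+    _mm512_fixupimm_round_pd::<0, _MM_FROUND_NO_EXC>(a, b, c)
+}
+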
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fixupimm_round_pd&expand=2503)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub fn _mm512_mask_fixupimm_round_pd<const IMM8: i32, const SAE: i32>(
+    a: __m512d,
+    k: __mmask8,
+    b: __m512d,
+    c: __m512i,
+) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let c = c.as_i64x8();
+        let r = vfixupimmpd(a, b, c, IMM8, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fixupimm_round_pd&expand=2504)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub fn _mm512_maskz_fixupimm_round_pd<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+    c: __m512i,
+) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let c = c.as_i64x8();
+        let r = vfixupimmpdz(a, b, c, IMM8, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getmant_round_ps&expand=2886) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(1, 2, 3)] +pub fn _mm512_getmant_round_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let r = vgetmantps(a, SIGN << 2 | NORM, f32x16::ZERO, 0b11111111_11111111, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getmant_round_ps&expand=2887) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(3, 4, 5)] +pub fn _mm512_mask_getmant_round_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + src: __m512, + k: __mmask16, + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let src = src.as_f32x16(); + let r = vgetmantps(a, SIGN << 2 | NORM, src, k, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getmant_round_ps&expand=2888) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(2, 3, 4)] +pub fn _mm512_maskz_getmant_round_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + k: __mmask16, + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let r = vgetmantps(a, SIGN << 2 | NORM, f32x16::ZERO, k, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getmant_round_pd&expand=2883) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(1, 2, 3)] +pub fn _mm512_getmant_round_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let r = vgetmantpd(a, SIGN << 2 | NORM, f64x8::ZERO, 0b11111111, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getmant_round_pd&expand=2884) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(3, 4, 5)] +pub fn _mm512_mask_getmant_round_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + src: __m512d, + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let src = src.as_f64x8(); + let r = vgetmantpd(a, SIGN << 2 | NORM, src, k, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getmant_round_pd&expand=2885) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(2, 3, 4)] +pub fn _mm512_maskz_getmant_round_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let r = vgetmantpd(a, SIGN << 2 | NORM, f64x8::ZERO, k, SAE); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_epi32&expand=1737) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub fn _mm512_cvtps_epi32(a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2dq( + a.as_f32x16(), + i32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_epi32&expand=1738) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub fn _mm512_mask_cvtps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2dq( + a.as_f32x16(), + src.as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_epi32&expand=1739) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub fn _mm512_maskz_cvtps_epi32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2dq( + a.as_f32x16(), + i32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_epi32&expand=1735) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub fn _mm256_mask_cvtps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { + unsafe { + let convert = _mm256_cvtps_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x8(), src.as_i32x8())) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_epi32&expand=1736) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub fn _mm256_maskz_cvtps_epi32(k: __mmask8, a: __m256) -> __m256i { + unsafe { + let convert = _mm256_cvtps_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x8(), i32x8::ZERO)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_epi32&expand=1732) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub fn _mm_mask_cvtps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { + let convert = _mm_cvtps_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_epi32&expand=1733) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub fn _mm_maskz_cvtps_epi32(k: __mmask8, a: __m128) -> __m128i { + unsafe { + let convert = _mm_cvtps_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), i32x4::ZERO)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_epu32&expand=1755) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm512_cvtps_epu32(a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2udq( + a.as_f32x16(), + u32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_epu32&expand=1756) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm512_mask_cvtps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2udq( + a.as_f32x16(), + src.as_u32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_epu32&expand=1343) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm512_maskz_cvtps_epu32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2udq( + a.as_f32x16(), + u32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epu32&expand=1752) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm256_cvtps_epu32(a: __m256) -> __m256i { + unsafe { transmute(vcvtps2udq256(a.as_f32x8(), u32x8::ZERO, 0b11111111)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_epu32&expand=1753) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm256_mask_cvtps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { + unsafe { transmute(vcvtps2udq256(a.as_f32x8(), src.as_u32x8(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_epu32&expand=1754) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm256_maskz_cvtps_epu32(k: __mmask8, a: __m256) -> __m256i { + unsafe { transmute(vcvtps2udq256(a.as_f32x8(), u32x8::ZERO, k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epu32&expand=1749) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm_cvtps_epu32(a: __m128) -> __m128i { + unsafe { transmute(vcvtps2udq128(a.as_f32x4(), u32x4::ZERO, 0b11111111)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_epu32&expand=1750) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm_mask_cvtps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvtps2udq128(a.as_f32x4(), src.as_u32x4(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_epu32&expand=1751) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm_maskz_cvtps_epu32(k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvtps2udq128(a.as_f32x4(), u32x4::ZERO, k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_pd&expand=1769) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd))] +pub fn _mm512_cvtps_pd(a: __m256) -> __m512d { + unsafe { + transmute(vcvtps2pd( + a.as_f32x8(), + f64x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_pd&expand=1770) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd))] +pub fn _mm512_mask_cvtps_pd(src: __m512d, k: __mmask8, a: __m256) -> __m512d { + unsafe { + transmute(vcvtps2pd( + a.as_f32x8(), + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_pd&expand=1771) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd))] +pub fn _mm512_maskz_cvtps_pd(k: __mmask8, a: __m256) -> __m512d { + unsafe { + transmute(vcvtps2pd( + a.as_f32x8(), + f64x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpslo_pd&expand=1784) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd))] +pub fn _mm512_cvtpslo_pd(v2: __m512) -> __m512d { + unsafe { + transmute(vcvtps2pd( + _mm512_castps512_ps256(v2).as_f32x8(), + f64x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpslo_pd&expand=1785) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd))] +pub fn _mm512_mask_cvtpslo_pd(src: __m512d, k: __mmask8, v2: __m512) -> __m512d { + unsafe { + transmute(vcvtps2pd( + _mm512_castps512_ps256(v2).as_f32x8(), + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_ps&expand=1712) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm512_cvtpd_ps(a: __m512d) -> __m256 { + unsafe { + transmute(vcvtpd2ps( + a.as_f64x8(), + f32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_ps&expand=1713) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm512_mask_cvtpd_ps(src: __m256, k: __mmask8, a: __m512d) -> __m256 { + unsafe { + transmute(vcvtpd2ps( + a.as_f64x8(), + src.as_f32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtpd_ps&expand=1714) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm512_maskz_cvtpd_ps(k: __mmask8, a: __m512d) -> __m256 { + unsafe { + transmute(vcvtpd2ps( + a.as_f64x8(), + f32x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtpd_ps&expand=1710) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm256_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m256d) -> __m128 { + unsafe { + let convert = _mm256_cvtpd_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtpd_ps&expand=1711) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm256_maskz_cvtpd_ps(k: __mmask8, a: __m256d) -> __m128 { + unsafe { + let convert = _mm256_cvtpd_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), f32x4::ZERO)) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtpd_ps&expand=1707) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m128d) -> __m128 { + unsafe { + let convert = _mm_cvtpd_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
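+///
+/// A brief usage sketch (an editorial addition, not from the Intel guide); the feature
+/// check and the chosen mask and input values are illustrative assumptions:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///     // SAFETY: the required target features were detected at runtime above.
+///     let r = unsafe { _mm_maskz_cvtpd_ps(0b01, _mm_set1_pd(2.5)) };
+///     // Lane 0 holds 2.5f32; the remaining lanes of the result are zero.
+/// }
+/// ```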
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtpd_ps&expand=1708) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm_maskz_cvtpd_ps(k: __mmask8, a: __m128d) -> __m128 { + unsafe { + let convert = _mm_cvtpd_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), f32x4::ZERO)) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_epi32&expand=1675) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub fn _mm512_cvtpd_epi32(a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2dq( + a.as_f64x8(), + i32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_epi32&expand=1676) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub fn _mm512_mask_cvtpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2dq( + a.as_f64x8(), + src.as_i32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtpd_epi32&expand=1677) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub fn _mm512_maskz_cvtpd_epi32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2dq( + a.as_f64x8(), + i32x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtpd_epi32&expand=1673) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub fn _mm256_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + unsafe { + let convert = _mm256_cvtpd_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
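+///
+/// A brief usage sketch (an editorial addition, not from the Intel guide); the feature
+/// check and the chosen mask and input values are illustrative assumptions:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///     // SAFETY: the required target features were detected at runtime above.
+///     let r = unsafe { _mm256_maskz_cvtpd_epi32(0b0011, _mm256_set1_pd(7.0)) };
+///     // Lanes 0 and 1 hold 7; lanes 2 and 3 are zeroed by the mask.
+/// }
+/// ```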
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtpd_epi32&expand=1674) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub fn _mm256_maskz_cvtpd_epi32(k: __mmask8, a: __m256d) -> __m128i { + unsafe { + let convert = _mm256_cvtpd_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), i32x4::ZERO)) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtpd_epi32&expand=1670) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub fn _mm_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { + let convert = _mm_cvtpd_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtpd_epi32&expand=1671) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub fn _mm_maskz_cvtpd_epi32(k: __mmask8, a: __m128d) -> __m128i { + unsafe { + let convert = _mm_cvtpd_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), i32x4::ZERO)) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_epu32&expand=1693) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm512_cvtpd_epu32(a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2udq( + a.as_f64x8(), + u32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_epu32&expand=1694) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm512_mask_cvtpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2udq( + a.as_f64x8(), + src.as_u32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtpd_epu32&expand=1695) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm512_maskz_cvtpd_epu32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2udq( + a.as_f64x8(), + u32x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epu32&expand=1690) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm256_cvtpd_epu32(a: __m256d) -> __m128i { + unsafe { transmute(vcvtpd2udq256(a.as_f64x4(), u32x4::ZERO, 0b11111111)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtpd_epu32&expand=1691) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm256_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvtpd2udq256(a.as_f64x4(), src.as_u32x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtpd_epu32&expand=1692) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm256_maskz_cvtpd_epu32(k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvtpd2udq256(a.as_f64x4(), u32x4::ZERO, k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epu32&expand=1687) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm_cvtpd_epu32(a: __m128d) -> __m128i { + unsafe { transmute(vcvtpd2udq128(a.as_f64x2(), u32x4::ZERO, 0b11111111)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtpd_epu32&expand=1688) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvtpd2udq128(a.as_f64x2(), src.as_u32x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtpd_epu32&expand=1689) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm_maskz_cvtpd_epu32(k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvtpd2udq128(a.as_f64x2(), u32x4::ZERO, k)) } +} + +/// Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst. The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_pslo&expand=1715) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm512_cvtpd_pslo(v2: __m512d) -> __m512 { + unsafe { + let r: f32x8 = vcvtpd2ps( + v2.as_f64x8(), + f32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + ); + simd_shuffle!( + r, + f32x8::ZERO, + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], + ) + } +} + +/// Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_pslo&expand=1716) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm512_mask_cvtpd_pslo(src: __m512, k: __mmask8, v2: __m512d) -> __m512 { + unsafe { + let r: f32x8 = vcvtpd2ps( + v2.as_f64x8(), + _mm512_castps512_ps256(src).as_f32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + ); + simd_shuffle!( + r, + f32x8::ZERO, + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], + ) + } +} + +/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst. 
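+///
+/// A minimal usage sketch (an editorial addition, not from the Intel guide); the feature
+/// check and the input value are illustrative assumptions:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") {
+///     // SAFETY: `avx512f` was detected at runtime above.
+///     let r = unsafe { _mm512_cvtepi8_epi32(_mm_set1_epi8(-3)) };
+///     // Each of the 16 bytes is sign extended, so every i32 lane of `r` is -3.
+/// }
+/// ```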
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi8_epi32&expand=1535)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub fn _mm512_cvtepi8_epi32(a: __m128i) -> __m512i {
+    unsafe {
+        let a = a.as_i8x16();
+        transmute::<i32x16, _>(simd_cast(a))
+    }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi8_epi32&expand=1536)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub fn _mm512_mask_cvtepi8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepi8_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x16()))
+    }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi8_epi32&expand=1537)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub fn _mm512_maskz_cvtepi8_epi32(k: __mmask16, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepi8_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, convert, i32x16::ZERO))
+    }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi8_epi32&expand=1533)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub fn _mm256_mask_cvtepi8_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepi8_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x8()))
+    }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi8_epi32&expand=1534)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub fn _mm256_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepi8_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, i32x8::ZERO))
+    }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi8_epi32&expand=1530)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub fn _mm_mask_cvtepi8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi8_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
+    }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi8_epi32&expand=1531)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub fn _mm_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi8_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, i32x4::ZERO))
+    }
+}
+
+/// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi8_epi64&expand=1544)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub fn _mm512_cvtepi8_epi64(a: __m128i) -> __m512i {
+    unsafe {
+        let a = a.as_i8x16();
+        let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+        transmute::<i64x8, _>(simd_cast(v64))
+    }
+}
+
+/// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi8_epi64&expand=1545)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub fn _mm512_mask_cvtepi8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepi8_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+    }
+}
+
+/// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi8_epi64&expand=1546)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub fn _mm512_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepi8_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, i64x8::ZERO))
+    }
+}
+
+/// Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi8_epi64&expand=1542)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub fn _mm256_mask_cvtepi8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepi8_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x4()))
+    }
+}
+
+/// Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi8_epi64&expand=1543)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub fn _mm256_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepi8_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, convert, i64x4::ZERO))
+    }
+}
+
+/// Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi8_epi64&expand=1539)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub fn _mm_mask_cvtepi8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi8_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x2()))
+    }
+}
+
+/// Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi8_epi64&expand=1540)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub fn _mm_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi8_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, i64x2::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu8_epi32&expand=1621)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+pub fn _mm512_cvtepu8_epi32(a: __m128i) -> __m512i {
+    unsafe {
+        let a = a.as_u8x16();
+        transmute::<i32x16, _>(simd_cast(a))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
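+///
+/// A brief usage sketch of the writemask behaviour (an editorial addition, not from the
+/// Intel guide); the feature check, mask, and input values are illustrative assumptions:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") {
+///     // SAFETY: `avx512f` was detected at runtime above.
+///     let r = unsafe {
+///         _mm512_mask_cvtepu8_epi32(_mm512_set1_epi32(-1), 0x00FF, _mm_set1_epi8(7))
+///     };
+///     // The low eight i32 lanes take the zero-extended value 7; the high eight
+///     // lanes keep -1 from the `src` argument because their mask bits are clear.
+/// }
+/// ```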
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu8_epi32&expand=1622) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbd))] +pub fn _mm512_mask_cvtepu8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu8_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, convert, src.as_i32x16())) + } +} + +/// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu8_epi32&expand=1623) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbd))] +pub fn _mm512_maskz_cvtepu8_epi32(k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu8_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, convert, i32x16::ZERO)) + } +} + +/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu8_epi32&expand=1619) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbd))] +pub fn _mm256_mask_cvtepu8_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, src.as_i32x8())) + } +} + +/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm256_maskz_cvtepu8_epi32&expand=1620) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbd))] +pub fn _mm256_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) + } +} + +/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu8_epi32&expand=1616)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+pub fn _mm_mask_cvtepu8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu8_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm_maskz_cvtepu8_epi32&expand=1617)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+pub fn _mm_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu8_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, i32x4::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu8_epi64&expand=1630)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+pub fn _mm512_cvtepu8_epi64(a: __m128i) -> __m512i {
+    unsafe {
+        let a = a.as_u8x16();
+        let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+        transmute::<i64x8, _>(simd_cast(v64))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu8_epi64&expand=1631)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+pub fn _mm512_mask_cvtepu8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu8_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu8_epi64&expand=1632) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbq))] +pub fn _mm512_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu8_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) + } +} + +/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu8_epi64&expand=1628) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbq))] +pub fn _mm256_mask_cvtepu8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) + } +} + +/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu8_epi64&expand=1629) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbq))] +pub fn _mm256_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) + } +} + +/// Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu8_epi64&expand=1625) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbq))] +pub fn _mm_mask_cvtepu8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu8_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, src.as_i64x2())) + } +} + +/// Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
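+///
+/// A brief usage sketch (an editorial addition, not from the Intel guide); the feature
+/// check, mask, and input value are illustrative assumptions:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///     // SAFETY: the required target features were detected at runtime above.
+///     let r = unsafe { _mm_maskz_cvtepu8_epi64(0b01, _mm_set1_epi8(9)) };
+///     // Lane 0 holds the zero-extended value 9; lane 1 is zeroed by the mask.
+/// }
+/// ```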
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu8_epi64&expand=1626)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+pub fn _mm_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu8_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, i64x2::ZERO))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi16_epi32&expand=1389)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub fn _mm512_cvtepi16_epi32(a: __m256i) -> __m512i {
+    unsafe {
+        let a = a.as_i16x16();
+        transmute::<i32x16, _>(simd_cast(a))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi16_epi32&expand=1390)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub fn _mm512_mask_cvtepi16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepi16_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x16()))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi16_epi32&expand=1391)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub fn _mm512_maskz_cvtepi16_epi32(k: __mmask16, a: __m256i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepi16_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, convert, i32x16::ZERO))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi16_epi32&expand=1387)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub fn _mm256_mask_cvtepi16_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepi16_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x8()))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi16_epi32&expand=1388)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub fn _mm256_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepi16_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, i32x8::ZERO))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi16_epi32&expand=1384)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub fn _mm_mask_cvtepi16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi16_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi16_epi32&expand=1385)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub fn _mm_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi16_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, i32x4::ZERO))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi16_epi64&expand=1398)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwq))]
+pub fn _mm512_cvtepi16_epi64(a: __m128i) -> __m512i {
+    unsafe {
+        let a = a.as_i16x8();
+        transmute::<i64x8, _>(simd_cast(a))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi16_epi64&expand=1399)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwq))]
+pub fn _mm512_mask_cvtepi16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepi16_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
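+///
+/// A brief usage sketch (an editorial addition, not from the Intel guide); the feature
+/// check, mask, and input value are illustrative assumptions:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") {
+///     // SAFETY: `avx512f` was detected at runtime above.
+///     let r = unsafe { _mm512_maskz_cvtepi16_epi64(0b0000_1111, _mm_set1_epi16(-5)) };
+///     // Lanes 0..=3 hold the sign-extended value -5; lanes 4..=7 are zeroed.
+/// }
+/// ```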
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi16_epi64&expand=1400) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxwq))] +pub fn _mm512_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi16_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) + } +} + +/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi16_epi64&expand=1396) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxwq))] +pub fn _mm256_mask_cvtepi16_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi16_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) + } +} + +/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi16_epi64&expand=1397) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxwq))] +pub fn _mm256_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi16_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) + } +} + +/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi16_epi64&expand=1393) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxwq))] +pub fn _mm_mask_cvtepi16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi16_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, src.as_i64x2())) + } +} + +/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi16_epi64&expand=1394) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxwq))] +pub fn _mm_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi16_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) + } +} + +/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu16_epi32&expand=1553)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub fn _mm512_cvtepu16_epi32(a: __m256i) -> __m512i {
+    unsafe {
+        let a = a.as_u16x16();
+        transmute::<i32x16, _>(simd_cast(a))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu16_epi32&expand=1554)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub fn _mm512_mask_cvtepu16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu16_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x16()))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu16_epi32&expand=1555)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub fn _mm512_maskz_cvtepu16_epi32(k: __mmask16, a: __m256i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu16_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, convert, i32x16::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu16_epi32&expand=1551)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub fn _mm256_mask_cvtepu16_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepu16_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x8()))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu16_epi32&expand=1552)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub fn _mm256_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepu16_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, i32x8::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
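+///
+/// A brief usage sketch of the writemask behaviour (an editorial addition, not from the
+/// Intel guide); the feature check, mask, and input values are illustrative assumptions:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///     // SAFETY: the required target features were detected at runtime above.
+///     let r = unsafe {
+///         _mm_mask_cvtepu16_epi32(_mm_set1_epi32(-1), 0b0011, _mm_set1_epi16(3))
+///     };
+///     // Lanes 0 and 1 take the zero-extended value 3; lanes 2 and 3 keep -1 from `src`.
+/// }
+/// ```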
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu16_epi32&expand=1548)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub fn _mm_mask_cvtepu16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu16_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu16_epi32&expand=1549)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub fn _mm_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu16_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, i32x4::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu16_epi64&expand=1562)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub fn _mm512_cvtepu16_epi64(a: __m128i) -> __m512i {
+    unsafe {
+        let a = a.as_u16x8();
+        transmute::<i64x8, _>(simd_cast(a))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu16_epi64&expand=1563)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub fn _mm512_mask_cvtepu16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu16_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu16_epi64&expand=1564)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub fn _mm512_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu16_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, i64x8::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu16_epi64&expand=1560)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub fn _mm256_mask_cvtepu16_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepu16_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x4()))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu16_epi64&expand=1561)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub fn _mm256_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepu16_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, convert, i64x4::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu16_epi64&expand=1557)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub fn _mm_mask_cvtepu16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu16_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x2()))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu16_epi64&expand=1558)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub fn _mm_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu16_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, i64x2::ZERO))
+    }
+}
+
+/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32_epi64&expand=1428)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+pub fn _mm512_cvtepi32_epi64(a: __m256i) -> __m512i {
+    unsafe {
+        let a = a.as_i32x8();
+        transmute::<i64x8, _>(simd_cast(a))
+    }
+}
+
+/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
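+///
+/// A brief usage sketch of the writemask behaviour (an editorial addition, not from the
+/// Intel guide); the feature check, mask, and input values are illustrative assumptions:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") {
+///     // SAFETY: `avx512f` was detected at runtime above.
+///     let r = unsafe {
+///         _mm512_mask_cvtepi32_epi64(_mm512_set1_epi64(0), 0b1111_0000, _mm256_set1_epi32(-7))
+///     };
+///     // Lanes 4..=7 hold the sign-extended value -7; lanes 0..=3 keep 0 from `src`.
+/// }
+/// ```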
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_epi64&expand=1429) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxdq))] +pub fn _mm512_mask_cvtepi32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi32_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, src.as_i64x8())) + } +} + +/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi32_epi64&expand=1430) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxdq))] +pub fn _mm512_maskz_cvtepi32_epi64(k: __mmask8, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi32_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) + } +} + +/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_epi64&expand=1426) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxdq))] +pub fn _mm256_mask_cvtepi32_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi32_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) + } +} + +/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi32_epi64&expand=1427) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxdq))] +pub fn _mm256_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi32_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) + } +} + +/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
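// Editor's aside, not part of the upstream diff: a minimal sketch of how the writemask
// and zeromask conversion variants above behave, assuming an AVX-512F CPU detected at
// runtime with std's `is_x86_feature_detected!`. The helper name `demo_masked_widen`
// is hypothetical.
#[cfg(target_arch = "x86_64")]
fn demo_masked_widen() {
    if !is_x86_feature_detected!("avx512f") {
        return;
    }
    // SAFETY: the runtime check above guarantees AVX-512F support.
    unsafe { inner() }

    #[target_feature(enable = "avx512f,avx")]
    unsafe fn inner() {
        use core::arch::x86_64::*;
        let a = _mm256_set1_epi32(-1); // eight 32-bit lanes, all -1
        let src = _mm512_set1_epi64(7);
        // Writemask: lanes whose mask bit is 0 keep the value from `src`.
        let merged = _mm512_mask_cvtepi32_epi64(src, 0b0000_1111, a);
        // Zeromask: lanes whose mask bit is 0 are zeroed instead.
        let zeroed = _mm512_maskz_cvtepi32_epi64(0b0000_1111, a);
        let mut out = [0i64; 8];
        _mm512_storeu_si512(out.as_mut_ptr() as *mut _, merged);
        assert_eq!(out, [-1, -1, -1, -1, 7, 7, 7, 7]);
        _mm512_storeu_si512(out.as_mut_ptr() as *mut _, zeroed);
        assert_eq!(out, [-1, -1, -1, -1, 0, 0, 0, 0]);
    }
}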
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_epi64&expand=1423)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+pub fn _mm_mask_cvtepi32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi32_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x2()))
+    }
+}
+
+/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi32_epi64&expand=1424)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+pub fn _mm_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi32_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, i64x2::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu32_epi64&expand=1571)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub fn _mm512_cvtepu32_epi64(a: __m256i) -> __m512i {
+    unsafe {
+        let a = a.as_u32x8();
+        transmute::<i64x8, _>(simd_cast(a))
+    }
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu32_epi64&expand=1572)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub fn _mm512_mask_cvtepu32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu32_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+    }
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu32_epi64&expand=1573)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub fn _mm512_maskz_cvtepu32_epi64(k: __mmask8, a: __m256i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu32_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, i64x8::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu32_epi64&expand=1569)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub fn _mm256_mask_cvtepu32_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepu32_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x4()))
+    }
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu32_epi64&expand=1570)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub fn _mm256_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepu32_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, convert, i64x4::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu32_epi64&expand=1566)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub fn _mm_mask_cvtepu32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu32_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x2()))
+    }
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu32_epi64&expand=1567)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub fn _mm_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu32_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, i64x2::ZERO))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32_ps&expand=1455)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+pub fn _mm512_cvtepi32_ps(a: __m512i) -> __m512 {
+    unsafe {
+        let a = a.as_i32x16();
+        transmute::<f32x16, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_ps&expand=1456) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2ps))] +pub fn _mm512_mask_cvtepi32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 { + unsafe { + let convert = _mm512_cvtepi32_ps(a).as_f32x16(); + transmute(simd_select_bitmask(k, convert, src.as_f32x16())) + } +} + +/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi32_ps&expand=1457) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2ps))] +pub fn _mm512_maskz_cvtepi32_ps(k: __mmask16, a: __m512i) -> __m512 { + unsafe { + let convert = _mm512_cvtepi32_ps(a).as_f32x16(); + transmute(simd_select_bitmask(k, convert, f32x16::ZERO)) + } +} + +/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_ps&expand=1453) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2ps))] +pub fn _mm256_mask_cvtepi32_ps(src: __m256, k: __mmask8, a: __m256i) -> __m256 { + unsafe { + let convert = _mm256_cvtepi32_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, convert, src.as_f32x8())) + } +} + +/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi32_ps&expand=1454) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2ps))] +pub fn _mm256_maskz_cvtepi32_ps(k: __mmask8, a: __m256i) -> __m256 { + unsafe { + let convert = _mm256_cvtepi32_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, convert, f32x8::ZERO)) + } +} + +/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_ps&expand=1450)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+pub fn _mm_mask_cvtepi32_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 {
+    unsafe {
+        let convert = _mm_cvtepi32_ps(a).as_f32x4();
+        transmute(simd_select_bitmask(k, convert, src.as_f32x4()))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi32_ps&expand=1451)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+pub fn _mm_maskz_cvtepi32_ps(k: __mmask8, a: __m128i) -> __m128 {
+    unsafe {
+        let convert = _mm_cvtepi32_ps(a).as_f32x4();
+        transmute(simd_select_bitmask(k, convert, f32x4::ZERO))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32_pd&expand=1446)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm512_cvtepi32_pd(a: __m256i) -> __m512d {
+    unsafe {
+        let a = a.as_i32x8();
+        transmute::<f64x8, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_pd&expand=1447)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm512_mask_cvtepi32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d {
+    unsafe {
+        let convert = _mm512_cvtepi32_pd(a).as_f64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi32_pd&expand=1448)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm512_maskz_cvtepi32_pd(k: __mmask8, a: __m256i) -> __m512d {
+    unsafe {
+        let convert = _mm512_cvtepi32_pd(a).as_f64x8();
+        transmute(simd_select_bitmask(k, convert, f64x8::ZERO))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_pd&expand=1444)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm256_mask_cvtepi32_pd(src: __m256d, k: __mmask8, a: __m128i) -> __m256d {
+    unsafe {
+        let convert = _mm256_cvtepi32_pd(a).as_f64x4();
+        transmute(simd_select_bitmask(k, convert, src.as_f64x4()))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi32_pd&expand=1445)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm256_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m256d {
+    unsafe {
+        let convert = _mm256_cvtepi32_pd(a).as_f64x4();
+        transmute(simd_select_bitmask(k, convert, f64x4::ZERO))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_pd&expand=1441)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm_mask_cvtepi32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d {
+    unsafe {
+        let convert = _mm_cvtepi32_pd(a).as_f64x2();
+        transmute(simd_select_bitmask(k, convert, src.as_f64x2()))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi32_pd&expand=1442)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m128d {
+    unsafe {
+        let convert = _mm_cvtepi32_pd(a).as_f64x2();
+        transmute(simd_select_bitmask(k, convert, f64x2::ZERO))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu32_ps&expand=1583)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2ps))]
+pub fn _mm512_cvtepu32_ps(a: __m512i) -> __m512 {
+    unsafe {
+        let a = a.as_u32x16();
+        transmute::<f32x16, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu32_ps&expand=1584)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2ps))]
+pub fn _mm512_mask_cvtepu32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 {
+    unsafe {
+        let convert = _mm512_cvtepu32_ps(a).as_f32x16();
+        transmute(simd_select_bitmask(k, convert, src.as_f32x16()))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu32_ps&expand=1585)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2ps))]
+pub fn _mm512_maskz_cvtepu32_ps(k: __mmask16, a: __m512i) -> __m512 {
+    unsafe {
+        let convert = _mm512_cvtepu32_ps(a).as_f32x16();
+        transmute(simd_select_bitmask(k, convert, f32x16::ZERO))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu32_pd&expand=1580)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm512_cvtepu32_pd(a: __m256i) -> __m512d {
+    unsafe {
+        let a = a.as_u32x8();
+        transmute::<f64x8, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu32_pd&expand=1581)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm512_mask_cvtepu32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d {
+    unsafe {
+        let convert = _mm512_cvtepu32_pd(a).as_f64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu32_pd&expand=1582)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm512_maskz_cvtepu32_pd(k: __mmask8, a: __m256i) -> __m512d {
+    unsafe {
+        let convert = _mm512_cvtepu32_pd(a).as_f64x8();
+        transmute(simd_select_bitmask(k, convert, f64x8::ZERO))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
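// Editor's aside, not part of the upstream diff: a sketch of why both vcvtdq2ps and
// vcvtudq2ps exist — the same bit pattern converts differently depending on whether the
// lanes are treated as signed or unsigned. Assumes an AVX-512F target; the helper name
// `demo_signed_vs_unsigned` is hypothetical.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn demo_signed_vs_unsigned() {
    use core::arch::x86_64::*;
    let a = _mm512_set1_epi32(-1); // every lane is 0xFFFF_FFFF
    let mut out = [0f32; 16];

    // Signed view: -1 converts to -1.0.
    _mm512_storeu_ps(out.as_mut_ptr(), _mm512_cvtepi32_ps(a));
    assert!(out.iter().all(|&x| x == -1.0));

    // Unsigned view: the same bits are u32::MAX, which rounds to 4294967296.0.
    _mm512_storeu_ps(out.as_mut_ptr(), _mm512_cvtepu32_ps(a));
    assert!(out.iter().all(|&x| x == u32::MAX as f32));
}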
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_pd&expand=1577)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm256_cvtepu32_pd(a: __m128i) -> __m256d {
+    unsafe {
+        let a = a.as_u32x4();
+        transmute::<f64x4, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu32_pd&expand=1578)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm256_mask_cvtepu32_pd(src: __m256d, k: __mmask8, a: __m128i) -> __m256d {
+    unsafe {
+        let convert = _mm256_cvtepu32_pd(a).as_f64x4();
+        transmute(simd_select_bitmask(k, convert, src.as_f64x4()))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu32_pd&expand=1579)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm256_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m256d {
+    unsafe {
+        let convert = _mm256_cvtepu32_pd(a).as_f64x4();
+        transmute(simd_select_bitmask(k, convert, f64x4::ZERO))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_pd&expand=1574)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm_cvtepu32_pd(a: __m128i) -> __m128d {
+    unsafe {
+        let a = a.as_u32x4();
+        let u64: u32x2 = simd_shuffle!(a, a, [0, 1]);
+        transmute::<f64x2, _>(simd_cast(u64))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu32_pd&expand=1575)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm_mask_cvtepu32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d {
+    unsafe {
+        let convert = _mm_cvtepu32_pd(a).as_f64x2();
+        transmute(simd_select_bitmask(k, convert, src.as_f64x2()))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu32_pd&expand=1576)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m128d {
+    unsafe {
+        let convert = _mm_cvtepu32_pd(a).as_f64x2();
+        transmute(simd_select_bitmask(k, convert, f64x2::ZERO))
+    }
+}
+
+/// Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32lo_pd&expand=1464)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm512_cvtepi32lo_pd(v2: __m512i) -> __m512d {
+    unsafe {
+        let v2 = v2.as_i32x16();
+        let v256: i32x8 = simd_shuffle!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
+        transmute::<f64x8, _>(simd_cast(v256))
+    }
+}
+
+/// Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32lo_pd&expand=1465)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm512_mask_cvtepi32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d {
+    unsafe {
+        let convert = _mm512_cvtepi32lo_pd(v2).as_f64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
+    }
+}
+
+/// Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu32lo_pd&expand=1586)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm512_cvtepu32lo_pd(v2: __m512i) -> __m512d {
+    unsafe {
+        let v2 = v2.as_u32x16();
+        let v256: u32x8 = simd_shuffle!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
+        transmute::<f64x8, _>(simd_cast(v256))
+    }
+}
+
+/// Performs element-by-element conversion of the lower half of 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu32lo_pd&expand=1587)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm512_mask_cvtepu32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d {
+    unsafe {
+        let convert = _mm512_cvtepu32lo_pd(v2).as_f64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
+    }
+}
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32_epi16&expand=1419)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub fn _mm512_cvtepi32_epi16(a: __m512i) -> __m256i {
+    unsafe {
+        let a = a.as_i32x16();
+        transmute::<i16x16, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_epi16&expand=1420)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub fn _mm512_mask_cvtepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i {
+    unsafe {
+        let convert = _mm512_cvtepi32_epi16(a).as_i16x16();
+        transmute(simd_select_bitmask(k, convert, src.as_i16x16()))
+    }
+}
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi32_epi16&expand=1421)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub fn _mm512_maskz_cvtepi32_epi16(k: __mmask16, a: __m512i) -> __m256i {
+    unsafe {
+        let convert = _mm512_cvtepi32_epi16(a).as_i16x16();
+        transmute(simd_select_bitmask(k, convert, i16x16::ZERO))
+    }
+}
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi16&expand=1416)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub fn _mm256_cvtepi32_epi16(a: __m256i) -> __m128i {
+    unsafe {
+        let a = a.as_i32x8();
+        transmute::<i16x8, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
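// Editor's aside, not part of the upstream diff: a sketch showing that the *lo_pd
// conversions above consume only the lower eight 32-bit lanes of the 512-bit source.
// Assumes an AVX-512F target; the helper name `demo_lower_half` is hypothetical.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn demo_lower_half() {
    use core::arch::x86_64::*;
    let src: [i32; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 100, 100, 100, 100, 100, 100, 100, 100];
    let v2 = _mm512_loadu_si512(src.as_ptr() as *const _);
    // Only src[0..8] participate; the upper half of v2 is ignored.
    let pd = _mm512_cvtepi32lo_pd(v2);
    let mut out = [0f64; 8];
    _mm512_storeu_pd(out.as_mut_ptr(), pd);
    assert_eq!(out, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
}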
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_epi16&expand=1417) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub fn _mm256_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { + let convert = _mm256_cvtepi32_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, src.as_i16x8())) + } +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi32_epi16&expand=1418) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub fn _mm256_maskz_cvtepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { + let convert = _mm256_cvtepi32_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) + } +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi16&expand=1413) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub fn _mm_cvtepi32_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovdw128(a.as_i32x4(), i16x8::ZERO, 0b11111111)) } +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_epi16&expand=1414) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub fn _mm_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovdw128(a.as_i32x4(), src.as_i16x8(), k)) } +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi32_epi16&expand=1415) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub fn _mm_maskz_cvtepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovdw128(a.as_i32x4(), i16x8::ZERO, k)) } +} + +/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32_epi8&expand=1437)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm512_cvtepi32_epi8(a: __m512i) -> __m128i {
+    unsafe {
+        let a = a.as_i32x16();
+        transmute::<i8x16, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_epi8&expand=1438)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm512_mask_cvtepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i {
+    unsafe {
+        let convert = _mm512_cvtepi32_epi8(a).as_i8x16();
+        transmute(simd_select_bitmask(k, convert, src.as_i8x16()))
+    }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi32_epi8&expand=1439)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm512_maskz_cvtepi32_epi8(k: __mmask16, a: __m512i) -> __m128i {
+    unsafe {
+        let convert = _mm512_cvtepi32_epi8(a).as_i8x16();
+        transmute(simd_select_bitmask(k, convert, i8x16::ZERO))
+    }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi8&expand=1434)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm256_cvtepi32_epi8(a: __m256i) -> __m128i {
+    unsafe { transmute(vpmovdb256(a.as_i32x8(), i8x16::ZERO, 0b11111111)) }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_epi8&expand=1435)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm256_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+    unsafe { transmute(vpmovdb256(a.as_i32x8(), src.as_i8x16(), k)) }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi32_epi8&expand=1436)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm256_maskz_cvtepi32_epi8(k: __mmask8, a: __m256i) -> __m128i {
+    unsafe { transmute(vpmovdb256(a.as_i32x8(), i8x16::ZERO, k)) }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi8&expand=1431)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm_cvtepi32_epi8(a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovdb128(a.as_i32x4(), i8x16::ZERO, 0b11111111)) }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_epi8&expand=1432)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovdb128(a.as_i32x4(), src.as_i8x16(), k)) }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi32_epi8&expand=1433)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm_maskz_cvtepi32_epi8(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovdb128(a.as_i32x4(), i8x16::ZERO, k)) }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_epi32&expand=1481)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm512_cvtepi64_epi32(a: __m512i) -> __m256i {
+    unsafe {
+        let a = a.as_i64x8();
+        transmute::<i32x8, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_epi32&expand=1482)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm512_mask_cvtepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i {
+    unsafe {
+        let convert = _mm512_cvtepi64_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x8()))
+    }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi64_epi32&expand=1483)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm512_maskz_cvtepi64_epi32(k: __mmask8, a: __m512i) -> __m256i {
+    unsafe {
+        let convert = _mm512_cvtepi64_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, i32x8::ZERO))
+    }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi64_epi32&expand=1478)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm256_cvtepi64_epi32(a: __m256i) -> __m128i {
+    unsafe {
+        let a = a.as_i64x4();
+        transmute::<i32x4, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_epi32&expand=1479)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm256_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+    unsafe {
+        let convert = _mm256_cvtepi64_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
+    }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi64_epi32&expand=1480)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm256_maskz_cvtepi64_epi32(k: __mmask8, a: __m256i) -> __m128i {
+    unsafe {
+        let convert = _mm256_cvtepi64_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, i32x4::ZERO))
+    }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi64_epi32&expand=1475)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm_cvtepi64_epi32(a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovqd128(a.as_i64x2(), i32x4::ZERO, 0b11111111)) }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_epi32&expand=1476)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovqd128(a.as_i64x2(), src.as_i32x4(), k)) }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi64_epi32&expand=1477)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm_maskz_cvtepi64_epi32(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovqd128(a.as_i64x2(), i32x4::ZERO, k)) }
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_epi16&expand=1472)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub fn _mm512_cvtepi64_epi16(a: __m512i) -> __m128i {
+    unsafe {
+        let a = a.as_i64x8();
+        transmute::<i16x8, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_epi16&expand=1473)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub fn _mm512_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
+    unsafe {
+        let convert = _mm512_cvtepi64_epi16(a).as_i16x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i16x8()))
+    }
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi64_epi16&expand=1474) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub fn _mm512_maskz_cvtepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { + unsafe { + let convert = _mm512_cvtepi64_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) + } +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi64_epi16&expand=1469) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub fn _mm256_cvtepi64_epi16(a: __m256i) -> __m128i { + unsafe { transmute(vpmovqw256(a.as_i64x4(), i16x8::ZERO, 0b11111111)) } +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_epi16&expand=1470) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub fn _mm256_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovqw256(a.as_i64x4(), src.as_i16x8(), k)) } +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi64_epi16&expand=1471) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub fn _mm256_maskz_cvtepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovqw256(a.as_i64x4(), i16x8::ZERO, k)) } +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi64_epi16&expand=1466) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub fn _mm_cvtepi64_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovqw128(a.as_i64x2(), i16x8::ZERO, 0b11111111)) } +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_epi16&expand=1467) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub fn _mm_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovqw128(a.as_i64x2(), src.as_i16x8(), k)) } +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi64_epi16&expand=1468) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub fn _mm_maskz_cvtepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovqw128(a.as_i64x2(), i16x8::ZERO, k)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_epi8&expand=1490) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm512_cvtepi64_epi8(a: __m512i) -> __m128i { + unsafe { transmute(vpmovqb(a.as_i64x8(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_epi8&expand=1491) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm512_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovqb(a.as_i64x8(), src.as_i8x16(), k)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi64_epi8&expand=1492) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm512_maskz_cvtepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovqb(a.as_i64x8(), i8x16::ZERO, k)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi64_epi8&expand=1487) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm256_cvtepi64_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovqb256(a.as_i64x4(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_epi8&expand=1488) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm256_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovqb256(a.as_i64x4(), src.as_i8x16(), k)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi64_epi8&expand=1489) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm256_maskz_cvtepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovqb256(a.as_i64x4(), i8x16::ZERO, k)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi64_epi8&expand=1484) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm_cvtepi64_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovqb128(a.as_i64x2(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_epi8&expand=1485) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovqb128(a.as_i64x2(), src.as_i8x16(), k)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi64_epi8&expand=1486)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqb))]
+pub fn _mm_maskz_cvtepi64_epi8(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovqb128(a.as_i64x2(), i8x16::ZERO, k)) }
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi32_epi16&expand=1819)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub fn _mm512_cvtsepi32_epi16(a: __m512i) -> __m256i {
+    unsafe { transmute(vpmovsdw(a.as_i32x16(), i16x16::ZERO, 0b11111111_11111111)) }
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi32_epi16&expand=1820)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub fn _mm512_mask_cvtsepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i {
+    unsafe { transmute(vpmovsdw(a.as_i32x16(), src.as_i16x16(), k)) }
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi32_epi16&expand=1819)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub fn _mm512_maskz_cvtsepi32_epi16(k: __mmask16, a: __m512i) -> __m256i {
+    unsafe { transmute(vpmovsdw(a.as_i32x16(), i16x16::ZERO, k)) }
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi32_epi16&expand=1816)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub fn _mm256_cvtsepi32_epi16(a: __m256i) -> __m128i {
+    unsafe { transmute(vpmovsdw256(a.as_i32x8(), i16x8::ZERO, 0b11111111)) }
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
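+///
+/// A minimal usage sketch, assuming runtime feature detection via
+/// `is_x86_feature_detected!` (std only); it shows the writemask keeping `src`
+/// lanes whose mask bit is clear:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///     // SAFETY: the required target features were detected above.
+///     unsafe {
+///         let src = _mm_set1_epi16(-1);
+///         let a = _mm256_set1_epi32(40_000); // saturates to i16::MAX
+///         // Only lane 0 is selected; every other lane keeps `src`.
+///         let r = _mm256_mask_cvtsepi32_epi16(src, 0b0000_0001, a);
+///         assert_eq!(_mm_extract_epi16::<0>(r), i16::MAX as i32);
+///         assert_eq!(_mm_extract_epi16::<1>(r), 0xFFFF); // untouched `src` lane
+///     }
+/// }
+/// ```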
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi32_epi16&expand=1817)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub fn _mm256_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+    unsafe { transmute(vpmovsdw256(a.as_i32x8(), src.as_i16x8(), k)) }
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi32_epi16&expand=1818)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub fn _mm256_maskz_cvtsepi32_epi16(k: __mmask8, a: __m256i) -> __m128i {
+    unsafe { transmute(vpmovsdw256(a.as_i32x8(), i16x8::ZERO, k)) }
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi32_epi16&expand=1813)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub fn _mm_cvtsepi32_epi16(a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovsdw128(a.as_i32x4(), i16x8::ZERO, 0b11111111)) }
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi32_epi16&expand=1814)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub fn _mm_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovsdw128(a.as_i32x4(), src.as_i16x8(), k)) }
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi32_epi16&expand=1815)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub fn _mm_maskz_cvtsepi32_epi16(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovsdw128(a.as_i32x4(), i16x8::ZERO, k)) }
+}
+
+/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
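+///
+/// A minimal usage sketch, assuming runtime feature detection via
+/// `is_x86_feature_detected!` (std only) and checking only the lowest result lane:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") {
+///     // SAFETY: `avx512f` was detected above.
+///     unsafe {
+///         let a = _mm512_set1_epi32(1_000); // larger than i8::MAX
+///         let r = _mm512_cvtsepi32_epi8(a);
+///         // Signed saturation clamps to 127 instead of wrapping.
+///         assert_eq!(_mm_extract_epi8::<0>(r), i8::MAX as i32);
+///     }
+/// }
+/// ```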
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi32_epi8&expand=1828) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm512_cvtsepi32_epi8(a: __m512i) -> __m128i { + unsafe { transmute(vpmovsdb(a.as_i32x16(), i8x16::ZERO, 0b11111111_11111111)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi32_epi8&expand=1829) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm512_mask_cvtsepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsdb(a.as_i32x16(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi32_epi8&expand=1830) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm512_maskz_cvtsepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsdb(a.as_i32x16(), i8x16::ZERO, k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi32_epi8&expand=1825) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm256_cvtsepi32_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovsdb256(a.as_i32x8(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi32_epi8&expand=1826) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm256_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsdb256(a.as_i32x8(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
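+///
+/// A minimal usage sketch, assuming runtime feature detection via
+/// `is_x86_feature_detected!` (std only); it shows the zeromask clearing lanes
+/// whose mask bit is not set:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///     // SAFETY: the required target features were detected above.
+///     unsafe {
+///         let a = _mm256_set1_epi32(-200); // saturates to i8::MIN
+///         // Only lane 1 is selected; every other lane is zeroed.
+///         let r = _mm256_maskz_cvtsepi32_epi8(0b0000_0010, a);
+///         assert_eq!(_mm_extract_epi8::<0>(r), 0);
+///         assert_eq!(_mm_extract_epi8::<1>(r) as i8, i8::MIN);
+///     }
+/// }
+/// ```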
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi32_epi8&expand=1827) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm256_maskz_cvtsepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsdb256(a.as_i32x8(), i8x16::ZERO, k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi32_epi8&expand=1822) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm_cvtsepi32_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovsdb128(a.as_i32x4(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi32_epi8&expand=1823) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsdb128(a.as_i32x4(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi32_epi8&expand=1824) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm_maskz_cvtsepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsdb128(a.as_i32x4(), i8x16::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi64_epi32&expand=1852) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm512_cvtsepi64_epi32(a: __m512i) -> __m256i { + unsafe { transmute(vpmovsqd(a.as_i64x8(), i32x8::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_epi32&expand=1853) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm512_mask_cvtsepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i { + unsafe { transmute(vpmovsqd(a.as_i64x8(), src.as_i32x8(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi64_epi32&expand=1854) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm512_maskz_cvtsepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { + unsafe { transmute(vpmovsqd(a.as_i64x8(), i32x8::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi64_epi32&expand=1849) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm256_cvtsepi64_epi32(a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqd256(a.as_i64x4(), i32x4::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_epi32&expand=1850) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm256_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqd256(a.as_i64x4(), src.as_i32x4(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi64_epi32&expand=1851) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm256_maskz_cvtsepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqd256(a.as_i64x4(), i32x4::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi64_epi32&expand=1846) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm_cvtsepi64_epi32(a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqd128(a.as_i64x2(), i32x4::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_epi32&expand=1847) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqd128(a.as_i64x2(), src.as_i32x4(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi64_epi32&expand=1848) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm_maskz_cvtsepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqd128(a.as_i64x2(), i32x4::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi64_epi16&expand=1843) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm512_cvtsepi64_epi16(a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqw(a.as_i64x8(), i16x8::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_epi16&expand=1844) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm512_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqw(a.as_i64x8(), src.as_i16x8(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi64_epi16&expand=1845) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm512_maskz_cvtsepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqw(a.as_i64x8(), i16x8::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi64_epi16&expand=1840) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm256_cvtsepi64_epi16(a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqw256(a.as_i64x4(), i16x8::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_epi16&expand=1841) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm256_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqw256(a.as_i64x4(), src.as_i16x8(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi64_epi16&expand=1842) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm256_maskz_cvtsepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqw256(a.as_i64x4(), i16x8::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi64_epi16&expand=1837) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm_cvtsepi64_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqw128(a.as_i64x2(), i16x8::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_epi16&expand=1838) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqw128(a.as_i64x2(), src.as_i16x8(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi64_epi16&expand=1839) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm_maskz_cvtsepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqw128(a.as_i64x2(), i16x8::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi64_epi8&expand=1861) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm512_cvtsepi64_epi8(a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqb(a.as_i64x8(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_epi8&expand=1862) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm512_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqb(a.as_i64x8(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi64_epi8&expand=1863) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm512_maskz_cvtsepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqb(a.as_i64x8(), i8x16::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi64_epi8&expand=1858) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm256_cvtsepi64_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqb256(a.as_i64x4(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_epi8&expand=1859) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm256_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqb256(a.as_i64x4(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi64_epi8&expand=1860) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm256_maskz_cvtsepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqb256(a.as_i64x4(), i8x16::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi64_epi8&expand=1855) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm_cvtsepi64_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqb128(a.as_i64x2(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_epi8&expand=1856) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqb128(a.as_i64x2(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi64_epi8&expand=1857) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm_maskz_cvtsepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqb128(a.as_i64x2(), i8x16::ZERO, k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi32_epi16&expand=2054) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm512_cvtusepi32_epi16(a: __m512i) -> __m256i { + unsafe { transmute(vpmovusdw(a.as_u32x16(), u16x16::ZERO, 0b11111111_11111111)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi32_epi16&expand=2055) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm512_mask_cvtusepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i { + unsafe { transmute(vpmovusdw(a.as_u32x16(), src.as_u16x16(), k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi32_epi16&expand=2056) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm512_maskz_cvtusepi32_epi16(k: __mmask16, a: __m512i) -> __m256i { + unsafe { transmute(vpmovusdw(a.as_u32x16(), u16x16::ZERO, k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi32_epi16&expand=2051) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm256_cvtusepi32_epi16(a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdw256(a.as_u32x8(), u16x8::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi32_epi16&expand=2052) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm256_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdw256(a.as_u32x8(), src.as_u16x8(), k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi32_epi16&expand=2053) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm256_maskz_cvtusepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdw256(a.as_u32x8(), u16x8::ZERO, k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi32_epi16&expand=2048) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm_cvtusepi32_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdw128(a.as_u32x4(), u16x8::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi32_epi16&expand=2049) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdw128(a.as_u32x4(), src.as_u16x8(), k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi32_epi16&expand=2050) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm_maskz_cvtusepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdw128(a.as_u32x4(), u16x8::ZERO, k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. 
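+///
+/// A minimal usage sketch, assuming runtime feature detection via
+/// `is_x86_feature_detected!` (std only) and checking only the lowest result lane:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// if is_x86_feature_detected!("avx512f") {
+///     // SAFETY: `avx512f` was detected above.
+///     unsafe {
+///         let a = _mm512_set1_epi32(1_000); // above u8::MAX
+///         let r = _mm512_cvtusepi32_epi8(a);
+///         // Unsigned saturation clamps to 255 rather than truncating to 232.
+///         assert_eq!(_mm_extract_epi8::<0>(r), 255);
+///     }
+/// }
+/// ```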
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi32_epi8&expand=2063) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm512_cvtusepi32_epi8(a: __m512i) -> __m128i { + unsafe { transmute(vpmovusdb(a.as_u32x16(), u8x16::ZERO, 0b11111111_11111111)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi32_epi8&expand=2064) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm512_mask_cvtusepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusdb(a.as_u32x16(), src.as_u8x16(), k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi32_epi8&expand=2065) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm512_maskz_cvtusepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusdb(a.as_u32x16(), u8x16::ZERO, k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi32_epi8&expand=2060) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm256_cvtusepi32_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdb256(a.as_u32x8(), u8x16::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi32_epi8&expand=2061) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm256_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdb256(a.as_u32x8(), src.as_u8x16(), k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi32_epi8&expand=2062) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm256_maskz_cvtusepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdb256(a.as_u32x8(), u8x16::ZERO, k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi32_epi8&expand=2057) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm_cvtusepi32_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdb128(a.as_u32x4(), u8x16::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi32_epi8&expand=2058) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdb128(a.as_u32x4(), src.as_u8x16(), k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi32_epi8&expand=2059) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm_maskz_cvtusepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdb128(a.as_u32x4(), u8x16::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi64_epi32&expand=2087) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm512_cvtusepi64_epi32(a: __m512i) -> __m256i { + unsafe { transmute(vpmovusqd(a.as_u64x8(), u32x8::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_epi32&expand=2088) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm512_mask_cvtusepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i { + unsafe { transmute(vpmovusqd(a.as_u64x8(), src.as_u32x8(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi64_epi32&expand=2089) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm512_maskz_cvtusepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { + unsafe { transmute(vpmovusqd(a.as_u64x8(), u32x8::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi64_epi32&expand=2084) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm256_cvtusepi64_epi32(a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqd256(a.as_u64x4(), u32x4::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_epi32&expand=2085) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm256_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqd256(a.as_u64x4(), src.as_u32x4(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi64_epi32&expand=2086) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm256_maskz_cvtusepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqd256(a.as_u64x4(), u32x4::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi64_epi32&expand=2081) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm_cvtusepi64_epi32(a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqd128(a.as_u64x2(), u32x4::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_epi32&expand=2082) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqd128(a.as_u64x2(), src.as_u32x4(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi64_epi32&expand=2083) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm_maskz_cvtusepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqd128(a.as_u64x2(), u32x4::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi64_epi16&expand=2078) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm512_cvtusepi64_epi16(a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqw(a.as_u64x8(), u16x8::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_epi16&expand=2079) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm512_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqw(a.as_u64x8(), src.as_u16x8(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi64_epi16&expand=2080) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm512_maskz_cvtusepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqw(a.as_u64x8(), u16x8::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi64_epi16&expand=2075) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm256_cvtusepi64_epi16(a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqw256(a.as_u64x4(), u16x8::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_epi16&expand=2076) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm256_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqw256(a.as_u64x4(), src.as_u16x8(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi64_epi16&expand=2077) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm256_maskz_cvtusepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqw256(a.as_u64x4(), u16x8::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi64_epi16&expand=2072) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm_cvtusepi64_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqw128(a.as_u64x2(), u16x8::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_epi16&expand=2073) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqw128(a.as_u64x2(), src.as_u16x8(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi64_epi16&expand=2074) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm_maskz_cvtusepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqw128(a.as_u64x2(), u16x8::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi64_epi8&expand=2096) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm512_cvtusepi64_epi8(a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqb(a.as_u64x8(), u8x16::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_epi8&expand=2097) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm512_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqb(a.as_u64x8(), src.as_u8x16(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi64_epi8&expand=2098) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm512_maskz_cvtusepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqb(a.as_u64x8(), u8x16::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi64_epi8&expand=2093) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm256_cvtusepi64_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqb256(a.as_u64x4(), u8x16::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_epi8&expand=2094) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm256_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqb256(a.as_u64x4(), src.as_u8x16(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi64_epi8&expand=2095) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm256_maskz_cvtusepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqb256(a.as_u64x4(), u8x16::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi64_epi8&expand=2090) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm_cvtusepi64_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqb128(a.as_u64x2(), u8x16::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_epi8&expand=2091) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqb128(a.as_u64x2(), src.as_u8x16(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi64_epi8&expand=2092) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm_maskz_cvtusepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqb128(a.as_u64x2(), u8x16::ZERO, k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epi32&expand=1335) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundps_epi32(a: __m512) -> __m512i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2dq(a, i32x16::ZERO, 0b11111111_11111111, ROUNDING); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epi32&expand=1336) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundps_epi32( + src: __m512i, + k: __mmask16, + a: __m512, +) -> __m512i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let src = src.as_i32x16(); + let r = vcvtps2dq(a, src, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * 
[`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epi32&expand=1337) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundps_epi32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2dq(a, i32x16::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epu32&expand=1341) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundps_epu32(a: __m512) -> __m512i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2udq(a, u32x16::ZERO, 0b11111111_11111111, ROUNDING); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epu32&expand=1342) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundps_epu32( + src: __m512i, + k: __mmask16, + a: __m512, +) -> __m512i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let src = 
src.as_u32x16(); + let r = vcvtps2udq(a, src, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epu32&expand=1343) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundps_epu32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2udq(a, u32x16::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_pd&expand=1347) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundps_pd(a: __m256) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x8(); + let r = vcvtps2pd(a, f64x8::ZERO, 0b11111111, SAE); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_pd&expand=1336) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundps_pd(src: __m512d, k: __mmask8, a: __m256) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x8(); + let src = src.as_f64x8(); + let r = vcvtps2pd(a, src, k, SAE); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
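For the _mm512_cvt_round* family above, the ROUNDING/SAE argument is a const generic (the #[rustc_legacy_const_generics] attribute maps the original positional parameter onto it), so the mode is fixed at compile time with a turbofish. A sketch, assuming AVX-512F hardware and the stabilized signatures in this patch:

use core::arch::x86_64::*;

// Explicit rounding overrides MXCSR.RC for this one conversion;
// _MM_FROUND_NO_EXC additionally suppresses floating-point exceptions.
#[target_feature(enable = "avx512f")]
fn ps_to_i32_nearest(a: __m512) -> __m512i {
    _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a)
}

#[target_feature(enable = "avx512f")]
fn ps_to_i32_floor(a: __m512) -> __m512i {
    _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a)
}

// f32 -> f64 widening is exact, so only exception suppression (SAE) is
// configurable here; _MM_FROUND_CUR_DIRECTION would leave MXCSR in charge.
#[target_feature(enable = "avx512f")]
fn ps_to_pd_no_exc(a: __m256) -> __m512d {
    _mm512_cvt_roundps_pd::<_MM_FROUND_NO_EXC>(a)
}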
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_pd&expand=1337) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundps_pd(k: __mmask8, a: __m256) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x8(); + let r = vcvtps2pd(a, f64x8::ZERO, k, SAE); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_epi32&expand=1315) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundpd_epi32(a: __m512d) -> __m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2dq(a, i32x8::ZERO, 0b11111111, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_epi32&expand=1316) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundpd_epi32( + src: __m256i, + k: __mmask8, + a: __m512d, +) -> __m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let src = src.as_i32x8(); + let r = vcvtpd2dq(a, src, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * 
[`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_epi32&expand=1317) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundpd_epi32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2dq(a, i32x8::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_epu32&expand=1321) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundpd_epu32(a: __m512d) -> __m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2udq(a, u32x8::ZERO, 0b11111111, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_epu32&expand=1322) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundpd_epu32( + src: __m256i, + k: __mmask8, + a: __m512d, +) -> 
__m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let src = src.as_u32x8(); + let r = vcvtpd2udq(a, src, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_epu32&expand=1323) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundpd_epu32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2udq(a, u32x8::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_ps&expand=1327) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundpd_ps(a: __m512d) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2ps(a, f32x8::ZERO, 0b11111111, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress 
exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_ps&expand=1328) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundpd_ps( + src: __m256, + k: __mmask8, + a: __m512d, +) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let src = src.as_f32x8(); + let r = vcvtpd2ps(a, src, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundpd_ps&expand=1329) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundpd_ps(k: __mmask8, a: __m512d) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2ps(a, f32x8::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepi32_ps&expand=1294) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundepi32_ps(a: __m512i) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_i32x16(); + let r = vcvtdq2ps(a, ROUNDING); + transmute(r) + } +} + +/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is 
not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepi32_ps&expand=1295) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundepi32_ps( + src: __m512, + k: __mmask16, + a: __m512i, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_i32x16(); + let r = vcvtdq2ps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepi32_ps&expand=1296) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundepi32_ps(k: __mmask16, a: __m512i) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_i32x16(); + let r = vcvtdq2ps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepu32_ps&expand=1303) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] 
+#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundepu32_ps(a: __m512i) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_u32x16(); + let r = vcvtudq2ps(a, ROUNDING); + transmute(r) + } +} + +/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepu32_ps&expand=1304) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundepu32_ps( + src: __m512, + k: __mmask16, + a: __m512i, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_u32x16(); + let r = vcvtudq2ps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepu32_ps&expand=1305) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundepu32_ps(k: __mmask16, a: __m512i) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_u32x16(); + let r = vcvtudq2ps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] // round down +/// * [`_MM_FROUND_TO_POS_INF`] // round up +/// * [`_MM_FROUND_TO_ZERO`] // truncate +/// * 
[`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see [`_MM_SET_ROUNDING_MODE`] +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_ph&expand=1354) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundps_ph(a: __m512) -> __m256i { + unsafe { + static_assert_extended_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2ph(a, ROUNDING, i16x16::ZERO, 0b11111111_11111111); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] // round down +/// * [`_MM_FROUND_TO_POS_INF`] // round up +/// * [`_MM_FROUND_TO_ZERO`] // truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see [`_MM_SET_ROUNDING_MODE`] +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_ph&expand=1355) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundps_ph( + src: __m256i, + k: __mmask16, + a: __m512, +) -> __m256i { + unsafe { + static_assert_extended_rounding!(ROUNDING); + let a = a.as_f32x16(); + let src = src.as_i16x16(); + let r = vcvtps2ph(a, ROUNDING, src, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] // round down +/// * [`_MM_FROUND_TO_POS_INF`] // round up +/// * [`_MM_FROUND_TO_ZERO`] // truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see [`_MM_SET_ROUNDING_MODE`] +/// * 
[`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_ph&expand=1356) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundps_ph(k: __mmask16, a: __m512) -> __m256i { + unsafe { + static_assert_extended_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2ph(a, ROUNDING, i16x16::ZERO, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvt_roundps_ph&expand=1352) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_cvt_roundps_ph( + src: __m128i, + k: __mmask8, + a: __m256, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let src = src.as_i16x8(); + let r = vcvtps2ph256(a, IMM8, src, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvt_roundps_ph&expand=1353) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = 
"stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_maskz_cvt_roundps_ph(k: __mmask8, a: __m256) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let r = vcvtps2ph256(a, IMM8, i16x8::ZERO, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvt_roundps_ph&expand=1350) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_mask_cvt_roundps_ph(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let src = src.as_i16x8(); + let r = vcvtps2ph128(a, IMM8, src, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvt_roundps_ph&expand=1351) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_maskz_cvt_roundps_ph(k: __mmask8, a: __m128) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let r = vcvtps2ph128(a, IMM8, i16x8::ZERO, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] // round down +/// * [`_MM_FROUND_TO_POS_INF`] // round up +/// * 
[`_MM_FROUND_TO_ZERO`] // truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see [`_MM_SET_ROUNDING_MODE`] +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_ph&expand=1778) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvtps_ph(a: __m512) -> __m256i { + unsafe { + static_assert_extended_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2ph(a, ROUNDING, i16x16::ZERO, 0b11111111_11111111); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] // round down +/// * [`_MM_FROUND_TO_POS_INF`] // round up +/// * [`_MM_FROUND_TO_ZERO`] // truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see [`_MM_SET_ROUNDING_MODE`] +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_ph&expand=1779) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvtps_ph(src: __m256i, k: __mmask16, a: __m512) -> __m256i { + unsafe { + static_assert_extended_rounding!(ROUNDING); + let a = a.as_f32x16(); + let src = src.as_i16x16(); + let r = vcvtps2ph(a, ROUNDING, src, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] // round down +/// * [`_MM_FROUND_TO_POS_INF`] // round up +/// * [`_MM_FROUND_TO_ZERO`] // truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see 
[`_MM_SET_ROUNDING_MODE`] +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_ph&expand=1780) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvtps_ph(k: __mmask16, a: __m512) -> __m256i { + unsafe { + static_assert_extended_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2ph(a, ROUNDING, i16x16::ZERO, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_ph&expand=1776) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_cvtps_ph(src: __m128i, k: __mmask8, a: __m256) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let src = src.as_i16x8(); + let r = vcvtps2ph256(a, IMM8, src, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_ph&expand=1777) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_maskz_cvtps_ph(k: __mmask8, a: __m256) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let r = vcvtps2ph256(a, IMM8, i16x8::ZERO, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) 
floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_ph&expand=1773) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_mask_cvtps_ph(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let src = src.as_i16x8(); + let r = vcvtps2ph128(a, IMM8, src, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_ph&expand=1774) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_maskz_cvtps_ph(k: __mmask8, a: __m128) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let r = vcvtps2ph128(a, IMM8, i16x8::ZERO, k); + transmute(r) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundph_ps&expand=1332) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundph_ps(a: __m256i) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_i16x16(); + let r = vcvtph2ps(a, f32x16::ZERO, 0b11111111_11111111, SAE); + transmute(r) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
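The cvtps_ph conversions above pack f32 lanes into IEEE half-precision bit patterns held in an integer vector, with the rounding mode again supplied as a const generic. An illustrative sketch, not part of the patch:

use core::arch::x86_64::*;

// 16 f32 lanes -> 16 f16 bit patterns stored in a __m256i.
#[target_feature(enable = "avx512f")]
fn pack_f16(a: __m512) -> __m256i {
    _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a)
}

// VL form with merge masking: f16 slots whose mask bit is clear are taken
// from `src` instead of being recomputed.
#[target_feature(enable = "avx512f,avx512vl")]
fn pack_f16_masked(src: __m128i, k: __mmask8, a: __m256) -> __m128i {
    _mm256_mask_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(src, k, a)
}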
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundph_ps&expand=1333) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundph_ps(src: __m512, k: __mmask16, a: __m256i) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_i16x16(); + let src = src.as_f32x16(); + let r = vcvtph2ps(a, src, k, SAE); + transmute(r) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundph_ps&expand=1334) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundph_ps(k: __mmask16, a: __m256i) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_i16x16(); + let r = vcvtph2ps(a, f32x16::ZERO, k, SAE); + transmute(r) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtph_ps&expand=1723) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub fn _mm512_cvtph_ps(a: __m256i) -> __m512 { + unsafe { + transmute(vcvtph2ps( + a.as_i16x16(), + f32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtph_ps&expand=1724) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub fn _mm512_mask_cvtph_ps(src: __m512, k: __mmask16, a: __m256i) -> __m512 { + unsafe { + transmute(vcvtph2ps( + a.as_i16x16(), + src.as_f32x16(), + k, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
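Going the other way, cvtph_ps widens stored half-precision values back to f32, which also gives a convenient (lossy) round trip. A sketch under the same assumptions as above:

use core::arch::x86_64::*;

// 16 f16 bit patterns in a __m256i -> 16 f32 lanes.
#[target_feature(enable = "avx512f")]
fn unpack_f16(a: __m256i) -> __m512 {
    _mm512_cvtph_ps(a)
}

// f32 -> f16 -> f32; values survive only up to f16 precision and range.
#[target_feature(enable = "avx512f")]
fn through_f16(a: __m512) -> __m512 {
    _mm512_cvtph_ps(_mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a))
}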
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtph_ps&expand=1725) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub fn _mm512_maskz_cvtph_ps(k: __mmask16, a: __m256i) -> __m512 { + unsafe { transmute(vcvtph2ps(a.as_i16x16(), f32x16::ZERO, k, _MM_FROUND_NO_EXC)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtph_ps&expand=1721) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub fn _mm256_mask_cvtph_ps(src: __m256, k: __mmask8, a: __m128i) -> __m256 { + unsafe { + let convert = _mm256_cvtph_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x8(), src.as_f32x8())) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtph_ps&expand=1722) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub fn _mm256_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m256 { + unsafe { + let convert = _mm256_cvtph_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x8(), f32x8::ZERO)) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtph_ps&expand=1718) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub fn _mm_mask_cvtph_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { + unsafe { + let convert = _mm_cvtph_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
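The VL variants of cvtph_ps merge or zero unselected lanes just like the integer conversions earlier. A brief, hypothetical sketch:

use core::arch::x86_64::*;

// Merge-masked 256-bit form: the 0b0000_1111 writemask converts the low four
// lanes; the upper four keep their values from `src`.
#[target_feature(enable = "avx512f,avx512vl")]
fn unpack_f16_merge(src: __m256, a: __m128i) -> __m256 {
    _mm256_mask_cvtph_ps(src, 0b0000_1111, a)
}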
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtph_ps&expand=1719)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtph2ps))]
+pub fn _mm_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m128 {
+    unsafe {
+        let convert = _mm_cvtph_ps(a);
+        transmute(simd_select_bitmask(k, convert.as_f32x4(), f32x4::ZERO))
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epi32&expand=1916)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_cvtt_roundps_epi32<const SAE: i32>(a: __m512) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x16();
+        let r = vcvttps2dq(a, i32x16::ZERO, 0b11111111_11111111, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundps_epi32&expand=1917)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_cvtt_roundps_epi32<const SAE: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x16();
+        let src = src.as_i32x16();
+        let r = vcvttps2dq(a, src, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundps_epi32&expand=1918)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_cvtt_roundps_epi32<const SAE: i32>(k: __mmask16, a: __m512) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x16();
+        let r = vcvttps2dq(a, i32x16::ZERO, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
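A minimal usage sketch of the half-precision and SAE truncating conversions above (not part of the vendored diff; it assumes the caller has already verified AVX-512F support, e.g. with `is_x86_feature_detected!("avx512f")`):

```rust
#[cfg(target_arch = "x86_64")]
mod cvt_demo {
    use std::arch::x86_64::*;

    /// Safety: the caller must ensure the CPU supports AVX-512F.
    #[target_feature(enable = "avx512f")]
    pub unsafe fn half_to_truncated_i32(halves: __m256i) -> __m512i {
        // Zero-masked f16 -> f32 conversion: only the low eight lanes are kept.
        let singles = _mm512_maskz_cvtph_ps(0b0000_0000_1111_1111, halves);
        // Truncating f32 -> i32 conversion with exceptions suppressed (SAE).
        _mm512_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(singles)
    }
}
```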
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epu32&expand=1922)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_cvtt_roundps_epu32<const SAE: i32>(a: __m512) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x16();
+        let r = vcvttps2udq(a, u32x16::ZERO, 0b11111111_11111111, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundps_epu32&expand=1923)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_cvtt_roundps_epu32<const SAE: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x16();
+        let src = src.as_u32x16();
+        let r = vcvttps2udq(a, src, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundps_epu32&expand=1924)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_cvtt_roundps_epu32<const SAE: i32>(k: __mmask16, a: __m512) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x16();
+        let r = vcvttps2udq(a, u32x16::ZERO, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundpd_epi32&expand=1904)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_cvtt_roundpd_epi32<const SAE: i32>(a: __m512d) -> __m256i {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f64x8();
+        let r = vcvttpd2dq(a, i32x8::ZERO, 0b11111111, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundpd_epi32&expand=1905)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_cvtt_roundpd_epi32<const SAE: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m512d,
+) -> __m256i {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f64x8();
+        let src = src.as_i32x8();
+        let r = vcvttpd2dq(a, src, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundpd_epi32&expand=1918)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_cvtt_roundpd_epi32<const SAE: i32>(k: __mmask8, a: __m512d) -> __m256i {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f64x8();
+        let r = vcvttpd2dq(a, i32x8::ZERO, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundpd_epu32&expand=1910)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_cvtt_roundpd_epu32<const SAE: i32>(a: __m512d) -> __m256i {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f64x8();
+        let r = vcvttpd2udq(a, i32x8::ZERO, 0b11111111, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundpd_epu32&expand=1911)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_cvtt_roundpd_epu32<const SAE: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m512d,
+) -> __m256i {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f64x8();
+        let src = src.as_i32x8();
+        let r = vcvttpd2udq(a, src, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttps_epi32&expand=1984) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub fn _mm512_cvttps_epi32(a: __m512) -> __m512i { + unsafe { + transmute(vcvttps2dq( + a.as_f32x16(), + i32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttps_epi32&expand=1985) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub fn _mm512_mask_cvttps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvttps2dq( + a.as_f32x16(), + src.as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttps_epi32&expand=1986) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub fn _mm512_maskz_cvttps_epi32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvttps2dq( + a.as_f32x16(), + i32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttps_epi32&expand=1982) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub fn _mm256_mask_cvttps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { + unsafe { transmute(vcvttps2dq256(a.as_f32x8(), src.as_i32x8(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttps_epi32&expand=1983)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2dq))]
+pub fn _mm256_maskz_cvttps_epi32(k: __mmask8, a: __m256) -> __m256i {
+    unsafe { transmute(vcvttps2dq256(a.as_f32x8(), i32x8::ZERO, k)) }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttps_epi32&expand=1979)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2dq))]
+pub fn _mm_mask_cvttps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+    unsafe { transmute(vcvttps2dq128(a.as_f32x4(), src.as_i32x4(), k)) }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttps_epi32&expand=1980)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2dq))]
+pub fn _mm_maskz_cvttps_epi32(k: __mmask8, a: __m128) -> __m128i {
+    unsafe { transmute(vcvttps2dq128(a.as_f32x4(), i32x4::ZERO, k)) }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttps_epu32&expand=2002)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub fn _mm512_cvttps_epu32(a: __m512) -> __m512i {
+    unsafe {
+        transmute(vcvttps2udq(
+            a.as_f32x16(),
+            u32x16::ZERO,
+            0b11111111_11111111,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttps_epu32&expand=2003)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub fn _mm512_mask_cvttps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
+    unsafe {
+        transmute(vcvttps2udq(
+            a.as_f32x16(),
+            src.as_u32x16(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
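As an illustrative sketch (not part of the diff), the 128-bit zero-masked form above can be driven like this; it assumes the caller has already checked for AVX-512F and AVX-512VL at runtime:

```rust
#[cfg(target_arch = "x86_64")]
mod cvtt_demo {
    use std::arch::x86_64::*;

    /// Safety: the caller must ensure AVX-512F and AVX-512VL are available.
    #[target_feature(enable = "avx512f,avx512vl")]
    pub unsafe fn truncate_low_two(a: __m128) -> [i32; 4] {
        // Only mask bits 0 and 1 are set, so lanes 2 and 3 come back as zero.
        let t = _mm_maskz_cvttps_epi32(0b0011, a);
        let mut out = [0i32; 4];
        // Spill the vector so the per-lane results can be inspected.
        unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, t) };
        out
    }
}
```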
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttps_epu32&expand=2004)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub fn _mm512_maskz_cvttps_epu32(k: __mmask16, a: __m512) -> __m512i {
+    unsafe {
+        transmute(vcvttps2udq(
+            a.as_f32x16(),
+            u32x16::ZERO,
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epu32&expand=1999)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub fn _mm256_cvttps_epu32(a: __m256) -> __m256i {
+    unsafe { transmute(vcvttps2udq256(a.as_f32x8(), u32x8::ZERO, 0b11111111)) }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttps_epu32&expand=2000)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub fn _mm256_mask_cvttps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i {
+    unsafe { transmute(vcvttps2udq256(a.as_f32x8(), src.as_u32x8(), k)) }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttps_epu32&expand=2001)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub fn _mm256_maskz_cvttps_epu32(k: __mmask8, a: __m256) -> __m256i {
+    unsafe { transmute(vcvttps2udq256(a.as_f32x8(), u32x8::ZERO, k)) }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epu32&expand=1996)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub fn _mm_cvttps_epu32(a: __m128) -> __m128i {
+    unsafe { transmute(vcvttps2udq128(a.as_f32x4(), u32x4::ZERO, 0b11111111)) }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttps_epu32&expand=1997)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub fn _mm_mask_cvttps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+    unsafe { transmute(vcvttps2udq128(a.as_f32x4(), src.as_u32x4(), k)) }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttps_epu32&expand=1998)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub fn _mm_maskz_cvttps_epu32(k: __mmask8, a: __m128) -> __m128i {
+    unsafe { transmute(vcvttps2udq128(a.as_f32x4(), u32x4::ZERO, k)) }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundpd_epu32&expand=1912)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_cvtt_roundpd_epu32<const SAE: i32>(k: __mmask8, a: __m512d) -> __m256i {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f64x8();
+        let r = vcvttpd2udq(a, i32x8::ZERO, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttpd_epi32&expand=1947)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq))]
+pub fn _mm512_cvttpd_epi32(a: __m512d) -> __m256i {
+    unsafe {
+        transmute(vcvttpd2dq(
+            a.as_f64x8(),
+            i32x8::ZERO,
+            0b11111111,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttpd_epi32&expand=1948) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub fn _mm512_mask_cvttpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2dq( + a.as_f64x8(), + src.as_i32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttpd_epi32&expand=1949) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub fn _mm512_maskz_cvttpd_epi32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2dq( + a.as_f64x8(), + i32x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttpd_epi32&expand=1945) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub fn _mm256_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2dq256(a.as_f64x4(), src.as_i32x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttpd_epi32&expand=1946) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub fn _mm256_maskz_cvttpd_epi32(k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2dq256(a.as_f64x4(), i32x4::ZERO, k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttpd_epi32&expand=1942) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub fn _mm_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2dq128(a.as_f64x2(), src.as_i32x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttpd_epi32&expand=1943) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub fn _mm_maskz_cvttpd_epi32(k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2dq128(a.as_f64x2(), i32x4::ZERO, k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttpd_epu32&expand=1965) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm512_cvttpd_epu32(a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2udq( + a.as_f64x8(), + i32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttpd_epu32&expand=1966) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm512_mask_cvttpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2udq( + a.as_f64x8(), + src.as_i32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttpd_epu32&expand=1967) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm512_maskz_cvttpd_epu32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2udq( + a.as_f64x8(), + i32x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epu32&expand=1962) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm256_cvttpd_epu32(a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2udq256(a.as_f64x4(), i32x4::ZERO, 0b11111111)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttpd_epu32&expand=1963) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm256_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2udq256(a.as_f64x4(), src.as_i32x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttpd_epu32&expand=1964) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm256_maskz_cvttpd_epu32(k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2udq256(a.as_f64x4(), i32x4::ZERO, k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epu32&expand=1959) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm_cvttpd_epu32(a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2udq128(a.as_f64x2(), i32x4::ZERO, 0b11111111)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttpd_epu32&expand=1960) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2udq128(a.as_f64x2(), src.as_i32x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttpd_epu32&expand=1961) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm_maskz_cvttpd_epu32(k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2udq128(a.as_f64x2(), i32x4::ZERO, k)) } +} + +/// Returns vector of type `__m512d` with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setzero_pd&expand=5018) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxorps))] +pub fn _mm512_setzero_pd() -> __m512d { + // All-0 is a properly initialized __m512d + unsafe { const { mem::zeroed() } } +} + +/// Returns vector of type `__m512` with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setzero_ps&expand=5021) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxorps))] +pub fn _mm512_setzero_ps() -> __m512 { + // All-0 is a properly initialized __m512 + unsafe { const { mem::zeroed() } } +} + +/// Return vector of type `__m512` with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setzero&expand=5014) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxorps))] +pub fn _mm512_setzero() -> __m512 { + // All-0 is a properly initialized __m512 + unsafe { const { mem::zeroed() } } +} + +/// Returns vector of type `__m512i` with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setzero_si512&expand=5024) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxorps))] +pub fn _mm512_setzero_si512() -> __m512i { + // All-0 is a properly initialized __m512i + unsafe { const { mem::zeroed() } } +} + +/// Return vector of type `__m512i` with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setzero_epi32&expand=5015) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxorps))] +pub fn _mm512_setzero_epi32() -> __m512i { + // All-0 is a properly initialized __m512i + unsafe { const { mem::zeroed() } } +} + +/// Sets packed 32-bit integers in `dst` with the supplied values in reverse +/// order. 
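A small sketch (not part of the diff, AVX-512F assumed to have been detected by the caller) showing that the `_mm512_setzero*` helpers above all produce the same all-zero bit pattern, regardless of the nominal element type:

```rust
#[cfg(target_arch = "x86_64")]
mod zero_demo {
    use std::arch::x86_64::*;

    /// Safety: the caller must ensure the CPU supports AVX-512F.
    #[target_feature(enable = "avx512f")]
    pub unsafe fn zeros_agree() -> bool {
        let a = _mm512_setzero_si512();
        let b = _mm512_castps_si512(_mm512_setzero_ps());
        // All 16 lanes compare equal, so the full 16-bit mask is returned.
        _mm512_cmpeq_epi32_mask(a, b) == 0xffff
    }
}
```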
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr_epi32&expand=4991) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_setr_epi32( + e15: i32, + e14: i32, + e13: i32, + e12: i32, + e11: i32, + e10: i32, + e9: i32, + e8: i32, + e7: i32, + e6: i32, + e5: i32, + e4: i32, + e3: i32, + e2: i32, + e1: i32, + e0: i32, +) -> __m512i { + unsafe { + let r = i32x16::new( + e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, + ); + transmute(r) + } +} + +/// Set packed 8-bit integers in dst with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_epi8&expand=4915) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set_epi8( + e63: i8, + e62: i8, + e61: i8, + e60: i8, + e59: i8, + e58: i8, + e57: i8, + e56: i8, + e55: i8, + e54: i8, + e53: i8, + e52: i8, + e51: i8, + e50: i8, + e49: i8, + e48: i8, + e47: i8, + e46: i8, + e45: i8, + e44: i8, + e43: i8, + e42: i8, + e41: i8, + e40: i8, + e39: i8, + e38: i8, + e37: i8, + e36: i8, + e35: i8, + e34: i8, + e33: i8, + e32: i8, + e31: i8, + e30: i8, + e29: i8, + e28: i8, + e27: i8, + e26: i8, + e25: i8, + e24: i8, + e23: i8, + e22: i8, + e21: i8, + e20: i8, + e19: i8, + e18: i8, + e17: i8, + e16: i8, + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8, +) -> __m512i { + unsafe { + let r = i8x64::new( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, + e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, e32, e33, e34, e35, + e36, e37, e38, e39, e40, e41, e42, e43, e44, e45, e46, e47, e48, e49, e50, e51, e52, + e53, e54, e55, e56, e57, e58, e59, e60, e61, e62, e63, + ); + transmute(r) + } +} + +/// Set packed 16-bit integers in dst with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_epi16&expand=4905) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set_epi16( + e31: i16, + e30: i16, + e29: i16, + e28: i16, + e27: i16, + e26: i16, + e25: i16, + e24: i16, + e23: i16, + e22: i16, + e21: i16, + e20: i16, + e19: i16, + e18: i16, + e17: i16, + e16: i16, + e15: i16, + e14: i16, + e13: i16, + e12: i16, + e11: i16, + e10: i16, + e9: i16, + e8: i16, + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16, +) -> __m512i { + unsafe { + let r = i16x32::new( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, + e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, + ); + transmute(r) + } +} + +/// Set packed 32-bit integers in dst with the repeated 4 element sequence. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set4_epi32&expand=4982) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i { + _mm512_set_epi32(d, c, b, a, d, c, b, a, d, c, b, a, d, c, b, a) +} + +/// Set packed single-precision (32-bit) floating-point elements in dst with the repeated 4 element sequence. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set4_ps&expand=4985) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 { + _mm512_set_ps(d, c, b, a, d, c, b, a, d, c, b, a, d, c, b, a) +} + +/// Set packed double-precision (64-bit) floating-point elements in dst with the repeated 4 element sequence. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set4_pd&expand=4984) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d { + _mm512_set_pd(d, c, b, a, d, c, b, a) +} + +/// Set packed 32-bit integers in dst with the repeated 4 element sequence in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr4_epi32&expand=5009) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_setr4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i { + _mm512_set_epi32(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d) +} + +/// Set packed single-precision (32-bit) floating-point elements in dst with the repeated 4 element sequence in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr4_ps&expand=5012) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_setr4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 { + _mm512_set_ps(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d) +} + +/// Set packed double-precision (64-bit) floating-point elements in dst with the repeated 4 element sequence in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr4_pd&expand=5011) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_setr4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d { + _mm512_set_pd(a, b, c, d, a, b, c, d) +} + +/// Set packed 64-bit integers in dst with the supplied values. +/// +/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_epi64&expand=4910) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set_epi64( + e0: i64, + e1: i64, + e2: i64, + e3: i64, + e4: i64, + e5: i64, + e6: i64, + e7: i64, +) -> __m512i { + _mm512_setr_epi64(e7, e6, e5, e4, e3, e2, e1, e0) +} + +/// Set packed 64-bit integers in dst with the supplied values in reverse order. 
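A short sketch (illustrative only, AVX-512F assumed) of the argument ordering: the `_mm512_set_*` constructors take the highest lane first, so they are the reverse of the `_mm512_setr_*` forms, which is exactly how `_mm512_set_epi64` above is implemented:

```rust
#[cfg(target_arch = "x86_64")]
mod order_demo {
    use std::arch::x86_64::*;

    /// Safety: the caller must ensure the CPU supports AVX-512F.
    #[target_feature(enable = "avx512f")]
    pub unsafe fn set_matches_reversed_setr() -> bool {
        // First argument lands in the highest lane...
        let hi_to_lo = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
        // ...while setr fills lanes from the lowest upward.
        let lo_to_hi = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
        _mm512_cmpeq_epi64_mask(hi_to_lo, lo_to_hi) == 0xff
    }
}
```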
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr_epi64&expand=4993)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_setr_epi64(
+    e0: i64,
+    e1: i64,
+    e2: i64,
+    e3: i64,
+    e4: i64,
+    e5: i64,
+    e6: i64,
+    e7: i64,
+) -> __m512i {
+    unsafe {
+        let r = i64x8::new(e0, e1, e2, e3, e4, e5, e6, e7);
+        transmute(r)
+    }
+}
+
+/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32gather_pd&expand=3002)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_i32gather_pd<const SCALE: i32>(
+    offsets: __m256i,
+    slice: *const f64,
+) -> __m512d {
+    static_assert_imm8_scale!(SCALE);
+    let zero = f64x8::ZERO;
+    let neg_one = -1;
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i32x8();
+    let r = vgatherdpd(zero, slice, offsets, neg_one, SCALE);
+    transmute(r)
+}
+
+/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32gather_pd&expand=3003)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i32gather_pd<const SCALE: i32>(
+    src: __m512d,
+    mask: __mmask8,
+    offsets: __m256i,
+    slice: *const f64,
+) -> __m512d {
+    static_assert_imm8_scale!(SCALE);
+    let src = src.as_f64x8();
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i32x8();
+    let r = vgatherdpd(src, slice, offsets, mask as i8, SCALE);
+    transmute(r)
+}
+
+/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64gather_pd&expand=3092)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_i64gather_pd<const SCALE: i32>(
+    offsets: __m512i,
+    slice: *const f64,
+) -> __m512d {
+    static_assert_imm8_scale!(SCALE);
+    let zero = f64x8::ZERO;
+    let neg_one = -1;
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i64x8();
+    let r = vgatherqpd(zero, slice, offsets, neg_one, SCALE);
+    transmute(r)
+}
+
+/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64gather_pd&expand=3093)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i64gather_pd<const SCALE: i32>(
+    src: __m512d,
+    mask: __mmask8,
+    offsets: __m512i,
+    slice: *const f64,
+) -> __m512d {
+    static_assert_imm8_scale!(SCALE);
+    let src = src.as_f64x8();
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i64x8();
+    let r = vgatherqpd(src, slice, offsets, mask as i8, SCALE);
+    transmute(r)
+}
+
+/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64gather_ps&expand=3100)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_i64gather_ps<const SCALE: i32>(offsets: __m512i, slice: *const f32) -> __m256 {
+    static_assert_imm8_scale!(SCALE);
+    let zero = f32x8::ZERO;
+    let neg_one = -1;
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i64x8();
+    let r = vgatherqps(zero, slice, offsets, neg_one, SCALE);
+    transmute(r)
+}
+
+/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64gather_ps&expand=3101)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i64gather_ps<const SCALE: i32>(
+    src: __m256,
+    mask: __mmask8,
+    offsets: __m512i,
+    slice: *const f32,
+) -> __m256 {
+    static_assert_imm8_scale!(SCALE);
+    let src = src.as_f32x8();
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i64x8();
+    let r = vgatherqps(src, slice, offsets, mask as i8, SCALE);
+    transmute(r)
+}
+
+/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32gather_ps&expand=3010)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_i32gather_ps<const SCALE: i32>(offsets: __m512i, slice: *const f32) -> __m512 {
+    static_assert_imm8_scale!(SCALE);
+    let zero = f32x16::ZERO;
+    let neg_one = -1;
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i32x16();
+    let r = vgatherdps(zero, slice, offsets, neg_one, SCALE);
+    transmute(r)
+}
+
+/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32gather_ps&expand=3011)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i32gather_ps<const SCALE: i32>(
+    src: __m512,
+    mask: __mmask16,
+    offsets: __m512i,
+    slice: *const f32,
+) -> __m512 {
+    static_assert_imm8_scale!(SCALE);
+    let src = src.as_f32x16();
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i32x16();
+    let r = vgatherdps(src, slice, offsets, mask as i16, SCALE);
+    transmute(r)
+}
+
+/// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32gather_epi32&expand=2986)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_i32gather_epi32<const SCALE: i32>(
+    offsets: __m512i,
+    slice: *const i32,
+) -> __m512i {
+    static_assert_imm8_scale!(SCALE);
+    let zero = i32x16::ZERO;
+    let neg_one = -1;
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i32x16();
+    let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE);
+    transmute(r)
+}
+
+/// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32gather_epi32&expand=2987)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i32gather_epi32<const SCALE: i32>(
+    src: __m512i,
+    mask: __mmask16,
+    offsets: __m512i,
+    slice: *const i32,
+) -> __m512i {
+    static_assert_imm8_scale!(SCALE);
+    let src = src.as_i32x16();
+    let mask = mask as i16;
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i32x16();
+    let r = vpgatherdd(src, slice, offsets, mask, SCALE);
+    transmute(r)
+}
+
+/// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32gather_epi64&expand=2994)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_i32gather_epi64<const SCALE: i32>(
+    offsets: __m256i,
+    slice: *const i64,
+) -> __m512i {
+    static_assert_imm8_scale!(SCALE);
+    let zero = i64x8::ZERO;
+    let neg_one = -1;
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i32x8();
+    let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE);
+    transmute(r)
+}
+
+/// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32gather_epi64&expand=2995)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i32gather_epi64<const SCALE: i32>(
+    src: __m512i,
+    mask: __mmask8,
+    offsets: __m256i,
+    slice: *const i64,
+) -> __m512i {
+    static_assert_imm8_scale!(SCALE);
+    let src = src.as_i64x8();
+    let mask = mask as i8;
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i32x8();
+    let r = vpgatherdq(src, slice, offsets, mask, SCALE);
+    transmute(r)
+}
+
+/// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64gather_epi64&expand=3084)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_i64gather_epi64<const SCALE: i32>(
+    offsets: __m512i,
+    slice: *const i64,
+) -> __m512i {
+    static_assert_imm8_scale!(SCALE);
+    let zero = i64x8::ZERO;
+    let neg_one = -1;
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i64x8();
+    let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE);
+    transmute(r)
+}
+
+/// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64gather_epi64&expand=3085)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i64gather_epi64<const SCALE: i32>(
+    src: __m512i,
+    mask: __mmask8,
+    offsets: __m512i,
+    slice: *const i64,
+) -> __m512i {
+    static_assert_imm8_scale!(SCALE);
+    let src = src.as_i64x8();
+    let mask = mask as i8;
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i64x8();
+    let r = vpgatherqq(src, slice, offsets, mask, SCALE);
+    transmute(r)
+}
+
+/// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64gather_epi32&expand=3074) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_i64gather_epi32( + offsets: __m512i, + slice: *const i32, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + let zeros = i32x8::ZERO; + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + let r = vpgatherqd(zeros, slice, offsets, neg_one, SCALE); + transmute(r) +} + +/// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64gather_epi32&expand=3075) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i64gather_epi32( + src: __m256i, + mask: __mmask8, + offsets: __m512i, + slice: *const i32, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + let src = src.as_i32x8(); + let mask = mask as i8; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + let r = vpgatherqd(src, slice, offsets, mask, SCALE); + transmute(r) +} + +/// Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32scatter_pd&expand=3044) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_i32scatter_pd( + slice: *mut f64, + offsets: __m256i, + src: __m512d, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_f64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + vscatterdpd(slice, neg_one, offsets, src, SCALE); +} + +/// Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. 
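A sketch of the merge behaviour of the masked gathers (illustrative only; `masked_gather` and the constants are mine, assuming the intrinsics are available): lanes whose mask bit is clear are taken from `src` instead of being loaded from memory.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn masked_gather(data: &[i32; 16]) -> [i32; 8] {
    let idx: [i64; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
    let vindex = _mm512_loadu_epi64(idx.as_ptr());
    let src = _mm256_set1_epi32(-1); // fallback for masked-off lanes
    // Only the even lanes are gathered; the odd lanes keep -1 from `src`.
    let r = _mm512_mask_i64gather_epi32::<4>(src, 0b0101_0101, vindex, data.as_ptr());
    let mut out = [0i32; 8];
    _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
    out
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        let data: [i32; 16] = core::array::from_fn(|i| 10 * i as i32);
        let out = unsafe { masked_gather(&data) };
        assert_eq!(out[2], 20); // gathered
        assert_eq!(out[1], -1); // mask bit clear, copied from src
    }
}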
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32scatter_pd&expand=3045) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i32scatter_pd( + slice: *mut f64, + mask: __mmask8, + offsets: __m256i, + src: __m512d, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_f64x8(); + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + vscatterdpd(slice, mask as i8, offsets, src, SCALE); +} + +/// Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_pd&expand=3122) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_i64scatter_pd( + slice: *mut f64, + offsets: __m512i, + src: __m512d, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_f64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + vscatterqpd(slice, neg_one, offsets, src, SCALE); +} + +/// Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64scatter_pd&expand=3123) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i64scatter_pd( + slice: *mut f64, + mask: __mmask8, + offsets: __m512i, + src: __m512d, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_f64x8(); + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + vscatterqpd(slice, mask as i8, offsets, src, SCALE); +} + +/// Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32scatter_ps&expand=3050) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_i32scatter_ps( + slice: *mut f32, + offsets: __m512i, + src: __m512, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_f32x16(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x16(); + vscatterdps(slice, neg_one, offsets, src, SCALE); +} + +/// Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32scatter_ps&expand=3051) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i32scatter_ps( + slice: *mut f32, + mask: __mmask16, + offsets: __m512i, + src: __m512, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_f32x16(); + let slice = slice as *mut i8; + let offsets = offsets.as_i32x16(); + vscatterdps(slice, mask as i16, offsets, src, SCALE); +} + +/// Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_ps&expand=3128) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_i64scatter_ps( + slice: *mut f32, + offsets: __m512i, + src: __m256, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_f32x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + vscatterqps(slice, neg_one, offsets, src, SCALE); +} + +/// Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. 
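The scatters are the mirror image; a minimal sketch (the helper name and data are mine) writing 16 floats to every fourth slot of a buffer with `SCALE = 4`:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn scatter_strided(out: &mut [f32; 64], values: &[f32; 16]) {
    // Lane i is stored to out[4 * i]; the slots in between are left untouched.
    let idx: [i32; 16] = core::array::from_fn(|i| (4 * i) as i32);
    let vindex = _mm512_loadu_epi32(idx.as_ptr());
    let src = _mm512_loadu_ps(values.as_ptr());
    _mm512_i32scatter_ps::<4>(out.as_mut_ptr(), vindex, src);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        let values: [f32; 16] = core::array::from_fn(|i| i as f32);
        let mut out = [0.0f32; 64];
        unsafe { scatter_strided(&mut out, &values) };
        assert_eq!(out[4], 1.0);
        assert_eq!(out[5], 0.0); // untouched
    }
}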
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64scatter_ps&expand=3129) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i64scatter_ps( + slice: *mut f32, + mask: __mmask8, + offsets: __m512i, + src: __m256, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_f32x8(); + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + vscatterqps(slice, mask as i8, offsets, src, SCALE); +} + +/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32scatter_epi64&expand=3038) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_i32scatter_epi64( + slice: *mut i64, + offsets: __m256i, + src: __m512i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + vpscatterdq(slice, neg_one, offsets, src, SCALE); +} + +/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32scatter_epi64&expand=3039) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i32scatter_epi64( + slice: *mut i64, + mask: __mmask8, + offsets: __m256i, + src: __m512i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x8(); + let mask = mask as i8; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + vpscatterdq(slice, mask, offsets, src, SCALE); +} + +/// Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_epi64&expand=3116) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_i64scatter_epi64( + slice: *mut i64, + offsets: __m512i, + src: __m512i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + vpscatterqq(slice, neg_one, offsets, src, SCALE); +} + +/// Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64scatter_epi64&expand=3117) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i64scatter_epi64( + slice: *mut i64, + mask: __mmask8, + offsets: __m512i, + src: __m512i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x8(); + let mask = mask as i8; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + vpscatterqq(slice, mask, offsets, src, SCALE); +} + +/// Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32scatter_epi32&expand=3032) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_i32scatter_epi32( + slice: *mut i32, + offsets: __m512i, + src: __m512i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i32x16(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x16(); + vpscatterdd(slice, neg_one, offsets, src, SCALE); +} + +/// Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32scatter_epi32&expand=3033) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i32scatter_epi32( + slice: *mut i32, + mask: __mmask16, + offsets: __m512i, + src: __m512i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i32x16(); + let mask = mask as i16; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x16(); + vpscatterdd(slice, mask, offsets, src, SCALE); +} + +/// Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_epi32&expand=3108) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_i64scatter_epi32( + slice: *mut i32, + offsets: __m512i, + src: __m256i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i32x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + vpscatterqd(slice, neg_one, offsets, src, SCALE); +} + +/// Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64scatter_epi32&expand=3109) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i64scatter_epi32( + slice: *mut i32, + mask: __mmask8, + offsets: __m512i, + src: __m256i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i32x8(); + let mask = mask as i8; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + vpscatterqd(slice, mask, offsets, src, SCALE); +} + +/// Loads 8 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer +/// indices stored in the lower half of vindex scaled by scale and stores them in dst. 
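A sketch of the masked scatters (illustrative only, not from the patch): lanes whose mask bit is clear leave the corresponding memory untouched.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn scatter_low_half(out: &mut [i32; 8], values: &[i32; 8]) {
    let idx: [i64; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
    let vindex = _mm512_loadu_epi64(idx.as_ptr());
    let src = _mm256_loadu_si256(values.as_ptr() as *const __m256i);
    // Only the low four mask bits are set, so only out[0..4] is written.
    _mm512_mask_i64scatter_epi32::<4>(out.as_mut_ptr(), 0b0000_1111, vindex, src);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        let mut out = [0i32; 8];
        unsafe { scatter_low_half(&mut out, &[1, 2, 3, 4, 5, 6, 7, 8]) };
        assert_eq!(out, [1, 2, 3, 4, 0, 0, 0, 0]);
    }
}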
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_i32logather_epi64<const SCALE: i32>(
+    vindex: __m512i,
+    base_addr: *const i64,
+) -> __m512i {
+    _mm512_i32gather_epi64::<SCALE>(_mm512_castsi512_si256(vindex), base_addr)
+}
+
+/// Loads 8 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in the lower half of vindex scaled by scale and stores them in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_mask_i32logather_epi64<const SCALE: i32>(
+    src: __m512i,
+    k: __mmask8,
+    vindex: __m512i,
+    base_addr: *const i64,
+) -> __m512i {
+    _mm512_mask_i32gather_epi64::<SCALE>(src, k, _mm512_castsi512_si256(vindex), base_addr)
+}
+
+/// Loads 8 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale and stores them in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_i32logather_pd<const SCALE: i32>(
+    vindex: __m512i,
+    base_addr: *const f64,
+) -> __m512d {
+    _mm512_i32gather_pd::<SCALE>(_mm512_castsi512_si256(vindex), base_addr)
+}
+
+/// Loads 8 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale and stores them in dst
+/// using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_mask_i32logather_pd<const SCALE: i32>(
+    src: __m512d,
+    k: __mmask8,
+    vindex: __m512i,
+    base_addr: *const f64,
+) -> __m512d {
+    _mm512_mask_i32gather_pd::<SCALE>(src, k, _mm512_castsi512_si256(vindex), base_addr)
+}
+
+/// Stores 8 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in the lower half of vindex scaled by scale.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_i32loscatter_epi64<const SCALE: i32>(
+    base_addr: *mut i64,
+    vindex: __m512i,
+    a: __m512i,
+) {
+    _mm512_i32scatter_epi64::<SCALE>(base_addr, _mm512_castsi512_si256(vindex), a)
+}
+
+/// Stores 8 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in the lower half of vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_mask_i32loscatter_epi64<const SCALE: i32>(
+    base_addr: *mut i64,
+    k: __mmask8,
+    vindex: __m512i,
+    a: __m512i,
+) {
+    _mm512_mask_i32scatter_epi64::<SCALE>(base_addr, k, _mm512_castsi512_si256(vindex), a)
+}
+
+/// Stores 8 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_i32loscatter_pd<const SCALE: i32>(
+    base_addr: *mut f64,
+    vindex: __m512i,
+    a: __m512d,
+) {
+    _mm512_i32scatter_pd::<SCALE>(base_addr, _mm512_castsi512_si256(vindex), a)
+}
+
+/// Stores 8 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale using writemask k
+/// (elements whose corresponding mask bit is not set are not written to memory).
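The `_mm512_i32lo*` wrappers above take a full `__m512i` of indices but use only its low eight 32-bit lanes, delegating to the `__m256i`-index forms after `_mm512_castsi512_si256`. A usage sketch (helper name and data are mine, assuming the intrinsics are available):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn logather(data: &[i64; 8]) -> [i64; 8] {
    // The upper eight index lanes are ignored by the "lo" variants.
    let idx: [i32; 16] = core::array::from_fn(|i| (i % 8) as i32);
    let vindex = _mm512_loadu_epi32(idx.as_ptr());
    let r = _mm512_i32logather_epi64::<8>(vindex, data.as_ptr());
    let mut out = [0i64; 8];
    _mm512_storeu_epi64(out.as_mut_ptr(), r);
    out
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        let data: [i64; 8] = core::array::from_fn(|i| 100 + i as i64);
        assert_eq!(unsafe { logather(&data) }[7], 107);
    }
}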
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_i32loscatter_pd( + base_addr: *mut f64, + k: __mmask8, + vindex: __m512i, + a: __m512d, +) { + _mm512_mask_i32scatter_pd::(base_addr, k, _mm512_castsi512_si256(vindex), a) +} + +/// Stores 8 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_i32scatter_epi32( + base_addr: *mut i32, + vindex: __m256i, + a: __m256i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdd_256(base_addr as _, 0xff, vindex.as_i32x8(), a.as_i32x8(), SCALE) +} + +/// Stores 8 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_i32scatter_epi32( + base_addr: *mut i32, + k: __mmask8, + vindex: __m256i, + a: __m256i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdd_256(base_addr as _, k, vindex.as_i32x8(), a.as_i32x8(), SCALE) +} + +/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32scatter_epi64&expand=4099) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_i32scatter_epi64( + slice: *mut i64, + offsets: __m128i, + src: __m256i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x4(); + let slice = slice as *mut i8; + let offsets = offsets.as_i32x4(); + vpscatterdq_256(slice, 0xff, offsets, src, SCALE); +} + +/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). 
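The 256-bit forms require AVX512VL on top of AVX512F; a sketch (names and data are mine):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn scatter256(out: &mut [i32; 16], values: &[i32; 8]) {
    // Lane i goes to out[2 * i].
    let idx: [i32; 8] = core::array::from_fn(|i| (2 * i) as i32);
    let vindex = _mm256_loadu_si256(idx.as_ptr() as *const __m256i);
    let a = _mm256_loadu_si256(values.as_ptr() as *const __m256i);
    _mm256_i32scatter_epi32::<4>(out.as_mut_ptr(), vindex, a);
}

fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        let mut out = [0i32; 16];
        unsafe { scatter256(&mut out, &[1, 2, 3, 4, 5, 6, 7, 8]) };
        assert_eq!(out[2], 2);
        assert_eq!(out[3], 0);
    }
}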
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_i32scatter_epi64( + base_addr: *mut i64, + k: __mmask8, + vindex: __m128i, + a: __m256i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdq_256(base_addr as _, k, vindex.as_i32x4(), a.as_i64x4(), SCALE) +} + +/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_i32scatter_pd( + base_addr: *mut f64, + vindex: __m128i, + a: __m256d, +) { + static_assert_imm8_scale!(SCALE); + vscatterdpd_256(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f64x4(), SCALE) +} + +/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_i32scatter_pd( + base_addr: *mut f64, + k: __mmask8, + vindex: __m128i, + a: __m256d, +) { + static_assert_imm8_scale!(SCALE); + vscatterdpd_256(base_addr as _, k, vindex.as_i32x4(), a.as_f64x4(), SCALE) +} + +/// Stores 8 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_i32scatter_ps( + base_addr: *mut f32, + vindex: __m256i, + a: __m256, +) { + static_assert_imm8_scale!(SCALE); + vscatterdps_256(base_addr as _, 0xff, vindex.as_i32x8(), a.as_f32x8(), SCALE) +} + +/// Stores 8 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_i32scatter_ps( + base_addr: *mut f32, + k: __mmask8, + vindex: __m256i, + a: __m256, +) { + static_assert_imm8_scale!(SCALE); + vscatterdps_256(base_addr as _, k, vindex.as_i32x8(), a.as_f32x8(), SCALE) +} + +/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_i64scatter_epi32( + base_addr: *mut i32, + vindex: __m256i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqd_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_i32x4(), SCALE) +} + +/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_i64scatter_epi32( + base_addr: *mut i32, + k: __mmask8, + vindex: __m256i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqd_256(base_addr as _, k, vindex.as_i64x4(), a.as_i32x4(), SCALE) +} + +/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_i64scatter_epi64( + base_addr: *mut i64, + vindex: __m256i, + a: __m256i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqq_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_i64x4(), SCALE) +} + +/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_i64scatter_epi64( + base_addr: *mut i64, + k: __mmask8, + vindex: __m256i, + a: __m256i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqq_256(base_addr as _, k, vindex.as_i64x4(), a.as_i64x4(), SCALE) +} + +/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_i64scatter_pd( + base_addr: *mut f64, + vindex: __m256i, + a: __m256d, +) { + static_assert_imm8_scale!(SCALE); + vscatterqpd_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_f64x4(), SCALE) +} + +/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_i64scatter_pd( + base_addr: *mut f64, + k: __mmask8, + vindex: __m256i, + a: __m256d, +) { + static_assert_imm8_scale!(SCALE); + vscatterqpd_256(base_addr as _, k, vindex.as_i64x4(), a.as_f64x4(), SCALE) +} + +/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_i64scatter_ps( + base_addr: *mut f32, + vindex: __m256i, + a: __m128, +) { + static_assert_imm8_scale!(SCALE); + vscatterqps_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_f32x4(), SCALE) +} + +/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_i64scatter_ps( + base_addr: *mut f32, + k: __mmask8, + vindex: __m256i, + a: __m128, +) { + static_assert_imm8_scale!(SCALE); + vscatterqps_256(base_addr as _, k, vindex.as_i64x4(), a.as_f32x4(), SCALE) +} + +/// Loads 8 32-bit integer elements from memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i32gather_epi32( + src: __m256i, + k: __mmask8, + vindex: __m256i, + base_addr: *const i32, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherdd_256( + src.as_i32x8(), + base_addr as _, + vindex.as_i32x8(), + k, + SCALE, + )) +} + +/// Loads 4 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i32gather_epi64( + src: __m256i, + k: __mmask8, + vindex: __m128i, + base_addr: *const i64, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherdq_256( + src.as_i64x4(), + base_addr as _, + vindex.as_i32x4(), + k, + SCALE, + )) +} + +/// Loads 4 double-precision (64-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i32gather_pd( + src: __m256d, + k: __mmask8, + vindex: __m128i, + base_addr: *const f64, +) -> __m256d { + static_assert_imm8_scale!(SCALE); + transmute(vgatherdpd_256( + src.as_f64x4(), + base_addr as _, + vindex.as_i32x4(), + k, + SCALE, + )) +} + +/// Loads 8 single-precision (32-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). 
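Unlike the AVX2 gathers, the VL gathers in this patch always take a writemask; passing an all-ones mask gathers every lane. A sketch (illustrative only; the helper name is mine):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn gather256(data: &[i32; 32]) -> [i32; 8] {
    let idx: [i32; 8] = core::array::from_fn(|i| (4 * i) as i32);
    let vindex = _mm256_loadu_si256(idx.as_ptr() as *const __m256i);
    // k = 0xff: all eight lanes are gathered, so `src` is irrelevant here.
    let r = _mm256_mmask_i32gather_epi32::<4>(_mm256_setzero_si256(), 0xff, vindex, data.as_ptr());
    let mut out = [0i32; 8];
    _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
    out
}

fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        let data: [i32; 32] = core::array::from_fn(|i| i as i32);
        assert_eq!(unsafe { gather256(&data) }, [0, 4, 8, 12, 16, 20, 24, 28]);
    }
}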
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i32gather_ps( + src: __m256, + k: __mmask8, + vindex: __m256i, + base_addr: *const f32, +) -> __m256 { + static_assert_imm8_scale!(SCALE); + transmute(vgatherdps_256( + src.as_f32x8(), + base_addr as _, + vindex.as_i32x8(), + k, + SCALE, + )) +} + +/// Loads 4 32-bit integer elements from memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i64gather_epi32( + src: __m128i, + k: __mmask8, + vindex: __m256i, + base_addr: *const i32, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherqd_256( + src.as_i32x4(), + base_addr as _, + vindex.as_i64x4(), + k, + SCALE, + )) +} + +/// Loads 4 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i64gather_epi64( + src: __m256i, + k: __mmask8, + vindex: __m256i, + base_addr: *const i64, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherqq_256( + src.as_i64x4(), + base_addr as _, + vindex.as_i64x4(), + k, + SCALE, + )) +} + +/// Loads 4 double-precision (64-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i64gather_pd( + src: __m256d, + k: __mmask8, + vindex: __m256i, + base_addr: *const f64, +) -> __m256d { + static_assert_imm8_scale!(SCALE); + transmute(vgatherqpd_256( + src.as_f64x4(), + base_addr as _, + vindex.as_i64x4(), + k, + SCALE, + )) +} + +/// Loads 4 single-precision (32-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i64gather_ps( + src: __m128, + k: __mmask8, + vindex: __m256i, + base_addr: *const f32, +) -> __m128 { + static_assert_imm8_scale!(SCALE); + transmute(vgatherqps_256( + src.as_f32x4(), + base_addr as _, + vindex.as_i64x4(), + k, + SCALE, + )) +} + +/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_i32scatter_epi32( + base_addr: *mut i32, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdd_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_i32x4(), SCALE) +} + +/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_i32scatter_epi32( + base_addr: *mut i32, + k: __mmask8, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdd_128(base_addr as _, k, vindex.as_i32x4(), a.as_i32x4(), SCALE) +} + +/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_i32scatter_epi64( + base_addr: *mut i64, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdq_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_i64x2(), SCALE) +} + +/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). 
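The 128-bit scatters behave the same way on four (or two) lanes; a sketch (names and data are mine):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn scatter128(out: &mut [i32; 8], values: &[i32; 4]) {
    // _mm_set_epi32 takes the highest lane first: lane 0 -> 0, lane 1 -> 2, lane 2 -> 4, lane 3 -> 6.
    let vindex = _mm_set_epi32(6, 4, 2, 0);
    let a = _mm_loadu_si128(values.as_ptr() as *const __m128i);
    _mm_i32scatter_epi32::<4>(out.as_mut_ptr(), vindex, a);
}

fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        let mut out = [0i32; 8];
        unsafe { scatter128(&mut out, &[1, 2, 3, 4]) };
        assert_eq!(out, [1, 0, 2, 0, 3, 0, 4, 0]);
    }
}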
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_i32scatter_epi64( + base_addr: *mut i64, + k: __mmask8, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdq_128(base_addr as _, k, vindex.as_i32x4(), a.as_i64x2(), SCALE) +} + +/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_i32scatter_pd( + base_addr: *mut f64, + vindex: __m128i, + a: __m128d, +) { + static_assert_imm8_scale!(SCALE); + vscatterdpd_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f64x2(), SCALE) +} + +/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_i32scatter_pd( + base_addr: *mut f64, + k: __mmask8, + vindex: __m128i, + a: __m128d, +) { + static_assert_imm8_scale!(SCALE); + vscatterdpd_128(base_addr as _, k, vindex.as_i32x4(), a.as_f64x2(), SCALE) +} + +/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_i32scatter_ps(base_addr: *mut f32, vindex: __m128i, a: __m128) { + static_assert_imm8_scale!(SCALE); + vscatterdps_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f32x4(), SCALE) +} + +/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_i32scatter_ps( + base_addr: *mut f32, + k: __mmask8, + vindex: __m128i, + a: __m128, +) { + static_assert_imm8_scale!(SCALE); + vscatterdps_128(base_addr as _, k, vindex.as_i32x4(), a.as_f32x4(), SCALE) +} + +/// Stores 2 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_i64scatter_epi32( + base_addr: *mut i32, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqd_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_i32x4(), SCALE) +} + +/// Stores 2 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_i64scatter_epi32( + base_addr: *mut i32, + k: __mmask8, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqd_128(base_addr as _, k, vindex.as_i64x2(), a.as_i32x4(), SCALE) +} + +/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_i64scatter_epi64( + base_addr: *mut i64, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqq_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_i64x2(), SCALE) +} + +/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_mask_i64scatter_epi64<const SCALE: i32>(
+    base_addr: *mut i64,
+    k: __mmask8,
+    vindex: __m128i,
+    a: __m128i,
+) {
+    static_assert_imm8_scale!(SCALE);
+    vpscatterqq_128(base_addr as _, k, vindex.as_i64x2(), a.as_i64x2(), SCALE)
+}
+
+/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_i64scatter_pd<const SCALE: i32>(
+    base_addr: *mut f64,
+    vindex: __m128i,
+    a: __m128d,
+) {
+    static_assert_imm8_scale!(SCALE);
+    vscatterqpd_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_f64x2(), SCALE)
+}
+
+/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_mask_i64scatter_pd<const SCALE: i32>(
+    base_addr: *mut f64,
+    k: __mmask8,
+    vindex: __m128i,
+    a: __m128d,
+) {
+    static_assert_imm8_scale!(SCALE);
+    vscatterqpd_128(base_addr as _, k, vindex.as_i64x2(), a.as_f64x2(), SCALE)
+}
+
+/// Stores 2 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_i64scatter_ps<const SCALE: i32>(base_addr: *mut f32, vindex: __m128i, a: __m128) {
+    static_assert_imm8_scale!(SCALE);
+    vscatterqps_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_f32x4(), SCALE)
+}
+
+/// Stores 2 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_mask_i64scatter_ps<const SCALE: i32>(
+    base_addr: *mut f32,
+    k: __mmask8,
+    vindex: __m128i,
+    a: __m128,
+) {
+
static_assert_imm8_scale!(SCALE); + vscatterqps_128(base_addr as _, k, vindex.as_i64x2(), a.as_f32x4(), SCALE) +} + +/// Loads 4 32-bit integer elements from memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mmask_i32gather_epi32( + src: __m128i, + k: __mmask8, + vindex: __m128i, + base_addr: *const i32, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherdd_128( + src.as_i32x4(), + base_addr as _, + vindex.as_i32x4(), + k, + SCALE, + )) +} + +/// Loads 2 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mmask_i32gather_epi64( + src: __m128i, + k: __mmask8, + vindex: __m128i, + base_addr: *const i64, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherdq_128( + src.as_i64x2(), + base_addr as _, + vindex.as_i32x4(), + k, + SCALE, + )) +} + +/// Loads 2 double-precision (64-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mmask_i32gather_pd( + src: __m128d, + k: __mmask8, + vindex: __m128i, + base_addr: *const f64, +) -> __m128d { + static_assert_imm8_scale!(SCALE); + transmute(vgatherdpd_128( + src.as_f64x2(), + base_addr as _, + vindex.as_i32x4(), + k, + SCALE, + )) +} + +/// Loads 4 single-precision (32-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_mmask_i32gather_ps<const SCALE: i32>(
+    src: __m128,
+    k: __mmask8,
+    vindex: __m128i,
+    base_addr: *const f32,
+) -> __m128 {
+    static_assert_imm8_scale!(SCALE);
+    transmute(vgatherdps_128(
+        src.as_f32x4(),
+        base_addr as _,
+        vindex.as_i32x4(),
+        k,
+        SCALE,
+    ))
+}
+
+/// Loads 2 32-bit integer elements from memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_mmask_i64gather_epi32<const SCALE: i32>(
+    src: __m128i,
+    k: __mmask8,
+    vindex: __m128i,
+    base_addr: *const i32,
+) -> __m128i {
+    static_assert_imm8_scale!(SCALE);
+    transmute(vpgatherqd_128(
+        src.as_i32x4(),
+        base_addr as _,
+        vindex.as_i64x2(),
+        k,
+        SCALE,
+    ))
+}
+
+/// Loads 2 64-bit integer elements from memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_mmask_i64gather_epi64<const SCALE: i32>(
+    src: __m128i,
+    k: __mmask8,
+    vindex: __m128i,
+    base_addr: *const i64,
+) -> __m128i {
+    static_assert_imm8_scale!(SCALE);
+    transmute(vpgatherqq_128(
+        src.as_i64x2(),
+        base_addr as _,
+        vindex.as_i64x2(),
+        k,
+        SCALE,
+    ))
+}
+
+/// Loads 2 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_mmask_i64gather_pd<const SCALE: i32>(
+    src: __m128d,
+    k: __mmask8,
+    vindex: __m128i,
+    base_addr: *const f64,
+) -> __m128d {
+    static_assert_imm8_scale!(SCALE);
+    transmute(vgatherqpd_128(
+        src.as_f64x2(),
+        base_addr as _,
+        vindex.as_i64x2(),
+        k,
+        SCALE,
+    ))
+}
+
+/// Loads 2 single-precision (32-bit) floating-point elements from memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_mmask_i64gather_ps<const SCALE: i32>(
+    src: __m128,
+    k: __mmask8,
+    vindex: __m128i,
+    base_addr: *const f32,
+) -> __m128 {
+    static_assert_imm8_scale!(SCALE);
+    transmute(vgatherqps_128(
+        src.as_f32x4(),
+        base_addr as _,
+        vindex.as_i64x2(),
+        k,
+        SCALE,
+    ))
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_epi32&expand=1198)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub fn _mm512_mask_compress_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+    unsafe { transmute(vpcompressd(a.as_i32x16(), src.as_i32x16(), k)) }
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_epi32&expand=1199)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub fn _mm512_maskz_compress_epi32(k: __mmask16, a: __m512i) -> __m512i {
+    unsafe { transmute(vpcompressd(a.as_i32x16(), i32x16::ZERO, k)) }
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_epi32&expand=1196)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub fn _mm256_mask_compress_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe { transmute(vpcompressd256(a.as_i32x8(), src.as_i32x8(), k)) }
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_epi32&expand=1197)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub fn _mm256_maskz_compress_epi32(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe { transmute(vpcompressd256(a.as_i32x8(), i32x8::ZERO, k)) }
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
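+///
+/// A minimal usage sketch (illustrative only, not doctested; assumes the running CPU
+/// supports AVX-512F and AVX-512VL):
+///
+/// ```ignore
+/// let src = _mm_set1_epi32(-1);
+/// let a = _mm_setr_epi32(10, 20, 30, 40);
+/// // Mask 0b0101 keeps elements 0 and 2 of `a`, packed into the low lanes;
+/// // the upper lanes are passed through from `src`.
+/// let r = _mm_mask_compress_epi32(src, 0b0101, a);
+/// // r == [10, 30, -1, -1]
+/// ```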
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_epi32&expand=1194) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub fn _mm_mask_compress_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressd128(a.as_i32x4(), src.as_i32x4(), k)) } +} + +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_epi32&expand=1195) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub fn _mm_maskz_compress_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressd128(a.as_i32x4(), i32x4::ZERO, k)) } +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_epi64&expand=1204) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub fn _mm512_mask_compress_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressq(a.as_i64x8(), src.as_i64x8(), k)) } +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_epi64&expand=1205) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub fn _mm512_maskz_compress_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressq(a.as_i64x8(), i64x8::ZERO, k)) } +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_epi64&expand=1202) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub fn _mm256_mask_compress_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressq256(a.as_i64x4(), src.as_i64x4(), k)) } +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_epi64&expand=1203) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub fn _mm256_maskz_compress_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressq256(a.as_i64x4(), i64x4::ZERO, k)) } +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_epi64&expand=1200) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub fn _mm_mask_compress_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressq128(a.as_i64x2(), src.as_i64x2(), k)) } +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_epi64&expand=1201) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub fn _mm_maskz_compress_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressq128(a.as_i64x2(), i64x2::ZERO, k)) } +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_ps&expand=1222) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub fn _mm512_mask_compress_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vcompressps(a.as_f32x16(), src.as_f32x16(), k)) } +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_ps&expand=1223) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub fn _mm512_maskz_compress_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vcompressps(a.as_f32x16(), f32x16::ZERO, k)) } +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_ps&expand=1220) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub fn _mm256_mask_compress_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vcompressps256(a.as_f32x8(), src.as_f32x8(), k)) } +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_ps&expand=1221) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub fn _mm256_maskz_compress_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vcompressps256(a.as_f32x8(), f32x8::ZERO, k)) } +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_ps&expand=1218) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub fn _mm_mask_compress_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vcompressps128(a.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_ps&expand=1219) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub fn _mm_maskz_compress_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vcompressps128(a.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_pd&expand=1216) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub fn _mm512_mask_compress_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vcompresspd(a.as_f64x8(), src.as_f64x8(), k)) } +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_pd&expand=1217) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub fn _mm512_maskz_compress_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vcompresspd(a.as_f64x8(), f64x8::ZERO, k)) } +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_pd&expand=1214) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub fn _mm256_mask_compress_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vcompresspd256(a.as_f64x4(), src.as_f64x4(), k)) } +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_pd&expand=1215) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub fn _mm256_maskz_compress_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vcompresspd256(a.as_f64x4(), f64x4::ZERO, k)) } +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_pd&expand=1212) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub fn _mm_mask_compress_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vcompresspd128(a.as_f64x2(), src.as_f64x2(), k)) } +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_pd&expand=1213) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub fn _mm_maskz_compress_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vcompresspd128(a.as_f64x2(), f64x2::ZERO, k)) } +} + +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
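+///
+/// A minimal usage sketch (illustrative only, not doctested; assumes the running CPU
+/// supports AVX-512F):
+///
+/// ```ignore
+/// let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+/// let mut out = [0i32; 16];
+/// // Write only the even lanes, packed contiguously at the start of `out`.
+/// unsafe { _mm512_mask_compressstoreu_epi32(out.as_mut_ptr(), 0b0101_0101_0101_0101, a) };
+/// // out[..8] == [0, 2, 4, 6, 8, 10, 12, 14]; out[8..] is left untouched.
+/// ```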
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub unsafe fn _mm512_mask_compressstoreu_epi32(base_addr: *mut i32, k: __mmask16, a: __m512i) { + vcompressstored(base_addr as *mut _, a.as_i32x16(), k) +} + +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub unsafe fn _mm256_mask_compressstoreu_epi32(base_addr: *mut i32, k: __mmask8, a: __m256i) { + vcompressstored256(base_addr as *mut _, a.as_i32x8(), k) +} + +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub unsafe fn _mm_mask_compressstoreu_epi32(base_addr: *mut i32, k: __mmask8, a: __m128i) { + vcompressstored128(base_addr as *mut _, a.as_i32x4(), k) +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub unsafe fn _mm512_mask_compressstoreu_epi64(base_addr: *mut i64, k: __mmask8, a: __m512i) { + vcompressstoreq(base_addr as *mut _, a.as_i64x8(), k) +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub unsafe fn _mm256_mask_compressstoreu_epi64(base_addr: *mut i64, k: __mmask8, a: __m256i) { + vcompressstoreq256(base_addr as *mut _, a.as_i64x4(), k) +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub unsafe fn _mm_mask_compressstoreu_epi64(base_addr: *mut i64, k: __mmask8, a: __m128i) { + vcompressstoreq128(base_addr as *mut _, a.as_i64x2(), k) +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub unsafe fn _mm512_mask_compressstoreu_ps(base_addr: *mut f32, k: __mmask16, a: __m512) { + vcompressstoreps(base_addr as *mut _, a.as_f32x16(), k) +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub unsafe fn _mm256_mask_compressstoreu_ps(base_addr: *mut f32, k: __mmask8, a: __m256) { + vcompressstoreps256(base_addr as *mut _, a.as_f32x8(), k) +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub unsafe fn _mm_mask_compressstoreu_ps(base_addr: *mut f32, k: __mmask8, a: __m128) { + vcompressstoreps128(base_addr as *mut _, a.as_f32x4(), k) +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub unsafe fn _mm512_mask_compressstoreu_pd(base_addr: *mut f64, k: __mmask8, a: __m512d) { + vcompressstorepd(base_addr as *mut _, a.as_f64x8(), k) +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub unsafe fn _mm256_mask_compressstoreu_pd(base_addr: *mut f64, k: __mmask8, a: __m256d) { + vcompressstorepd256(base_addr as *mut _, a.as_f64x4(), k) +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub unsafe fn _mm_mask_compressstoreu_pd(base_addr: *mut f64, k: __mmask8, a: __m128d) { + vcompressstorepd128(base_addr as *mut _, a.as_f64x2(), k) +} + +/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_epi32&expand=2316) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandd))] +pub fn _mm512_mask_expand_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandd(a.as_i32x16(), src.as_i32x16(), k)) } +} + +/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_epi32&expand=2317) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandd))] +pub fn _mm512_maskz_expand_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandd(a.as_i32x16(), i32x16::ZERO, k)) } +} + +/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_epi32&expand=2314) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandd))] +pub fn _mm256_mask_expand_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandd256(a.as_i32x8(), src.as_i32x8(), k)) } +} + +/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
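+///
+/// A minimal usage sketch (illustrative only, not doctested; assumes the running CPU
+/// supports AVX-512F and AVX-512VL):
+///
+/// ```ignore
+/// let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+/// // The first two elements of `a` are expanded into lanes 1 and 3; all other
+/// // lanes are zeroed because this is the zero-masked variant.
+/// let r = _mm256_maskz_expand_epi32(0b0000_1010, a);
+/// // r == [0, 1, 0, 2, 0, 0, 0, 0]
+/// ```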
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_epi32&expand=2315) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandd))] +pub fn _mm256_maskz_expand_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandd256(a.as_i32x8(), i32x8::ZERO, k)) } +} + +/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_epi32&expand=2312) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandd))] +pub fn _mm_mask_expand_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandd128(a.as_i32x4(), src.as_i32x4(), k)) } +} + +/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_epi32&expand=2313) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandd))] +pub fn _mm_maskz_expand_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandd128(a.as_i32x4(), i32x4::ZERO, k)) } +} + +/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_epi64&expand=2322) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandq))] +pub fn _mm512_mask_expand_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandq(a.as_i64x8(), src.as_i64x8(), k)) } +} + +/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_epi64&expand=2323) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandq))] +pub fn _mm512_maskz_expand_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandq(a.as_i64x8(), i64x8::ZERO, k)) } +} + +/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_epi64&expand=2320) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandq))] +pub fn _mm256_mask_expand_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandq256(a.as_i64x4(), src.as_i64x4(), k)) } +} + +/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_epi64&expand=2321) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandq))] +pub fn _mm256_maskz_expand_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandq256(a.as_i64x4(), i64x4::ZERO, k)) } +} + +/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_epi64&expand=2318) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandq))] +pub fn _mm_mask_expand_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandq128(a.as_i64x2(), src.as_i64x2(), k)) } +} + +/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_epi64&expand=2319) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandq))] +pub fn _mm_maskz_expand_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandq128(a.as_i64x2(), i64x2::ZERO, k)) } +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_ps&expand=2340) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandps))] +pub fn _mm512_mask_expand_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vexpandps(a.as_f32x16(), src.as_f32x16(), k)) } +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_ps&expand=2341) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandps))] +pub fn _mm512_maskz_expand_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vexpandps(a.as_f32x16(), f32x16::ZERO, k)) } +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_ps&expand=2338) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandps))] +pub fn _mm256_mask_expand_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vexpandps256(a.as_f32x8(), src.as_f32x8(), k)) } +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_ps&expand=2339) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandps))] +pub fn _mm256_maskz_expand_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vexpandps256(a.as_f32x8(), f32x8::ZERO, k)) } +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_ps&expand=2336) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandps))] +pub fn _mm_mask_expand_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vexpandps128(a.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_ps&expand=2337) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandps))] +pub fn _mm_maskz_expand_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vexpandps128(a.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_pd&expand=2334) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandpd))] +pub fn _mm512_mask_expand_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vexpandpd(a.as_f64x8(), src.as_f64x8(), k)) } +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_pd&expand=2335) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandpd))] +pub fn _mm512_maskz_expand_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vexpandpd(a.as_f64x8(), f64x8::ZERO, k)) } +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_pd&expand=2332) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandpd))] +pub fn _mm256_mask_expand_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vexpandpd256(a.as_f64x4(), src.as_f64x4(), k)) } +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_pd&expand=2333) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandpd))] +pub fn _mm256_maskz_expand_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vexpandpd256(a.as_f64x4(), f64x4::ZERO, k)) } +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_pd&expand=2330) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandpd))] +pub fn _mm_mask_expand_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vexpandpd128(a.as_f64x2(), src.as_f64x2(), k)) } +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_pd&expand=2331)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vexpandpd))]
+pub fn _mm_maskz_expand_pd(k: __mmask8, a: __m128d) -> __m128d {
+    unsafe { transmute(vexpandpd128(a.as_f64x2(), f64x2::ZERO, k)) }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rol_epi32&expand=4685)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_rol_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x16();
+        let r = vprold(a, IMM8);
+        transmute(r)
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rol_epi32&expand=4683)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_rol_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x16();
+        let r = vprold(a, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rol_epi32&expand=4684)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_rol_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x16();
+        let r = vprold(a, IMM8);
+        transmute(simd_select_bitmask(k, r, i32x16::ZERO))
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
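+///
+/// A minimal usage sketch (illustrative only, not doctested; assumes the running CPU
+/// supports AVX-512F and AVX-512VL):
+///
+/// ```ignore
+/// let a = _mm256_set1_epi32(1);
+/// // Rotate every 32-bit lane left by 8 bits.
+/// let r = _mm256_rol_epi32::<8>(a);
+/// // Every lane now holds 1 << 8 == 256.
+/// ```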
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rol_epi32&expand=4682)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm256_rol_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x8();
+        let r = vprold256(a, IMM8);
+        transmute(r)
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rol_epi32&expand=4680)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_rol_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x8();
+        let r = vprold256(a, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rol_epi32&expand=4681)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x8();
+        let r = vprold256(a, IMM8);
+        transmute(simd_select_bitmask(k, r, i32x8::ZERO))
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rol_epi32&expand=4679)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm_rol_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x4();
+        let r = vprold128(a, IMM8);
+        transmute(r)
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rol_epi32&expand=4677)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_rol_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x4();
+        let r = vprold128(a, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rol_epi32&expand=4678)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x4();
+        let r = vprold128(a, IMM8);
+        transmute(simd_select_bitmask(k, r, i32x4::ZERO))
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_ror_epi32&expand=4721)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_ror_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x16();
+        let r = vprord(a, IMM8);
+        transmute(r)
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_ror_epi32&expand=4719)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_ror_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x16();
+        let r = vprord(a, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_ror_epi32&expand=4720)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_ror_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x16();
+        let r = vprord(a, IMM8);
+        transmute(simd_select_bitmask(k, r, i32x16::ZERO))
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ror_epi32&expand=4718)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm256_ror_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x8();
+        let r = vprord256(a, IMM8);
+        transmute(r)
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_ror_epi32&expand=4716)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_ror_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x8();
+        let r = vprord256(a, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_ror_epi32&expand=4717)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x8();
+        let r = vprord256(a, IMM8);
+        transmute(simd_select_bitmask(k, r, i32x8::ZERO))
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ror_epi32&expand=4715)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm_ror_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x4();
+        let r = vprord128(a, IMM8);
+        transmute(r)
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_ror_epi32&expand=4713)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_ror_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x4();
+        let r = vprord128(a, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+    }
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_ror_epi32&expand=4714)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x4();
+        let r = vprord128(a, IMM8);
+        transmute(simd_select_bitmask(k, r, i32x4::ZERO))
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rol_epi64&expand=4694)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_rol_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x8();
+        let r = vprolq(a, IMM8);
+        transmute(r)
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
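+///
+/// A minimal usage sketch (illustrative only, not doctested; assumes the running CPU
+/// supports AVX-512F):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi64(3);
+/// let src = _mm512_set1_epi64(-1);
+/// // Rotate only the lanes selected by the mask; the rest are copied from `src`.
+/// let r = _mm512_mask_rol_epi64::<1>(src, 0b0000_1111, a);
+/// // Lanes 0..4 become 6; lanes 4..8 keep -1 from `src`.
+/// ```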
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rol_epi64&expand=4692)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_rol_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x8();
+        let r = vprolq(a, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_i64x8()))
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rol_epi64&expand=4693)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x8();
+        let r = vprolq(a, IMM8);
+        transmute(simd_select_bitmask(k, r, i64x8::ZERO))
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rol_epi64&expand=4691)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm256_rol_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x4();
+        let r = vprolq256(a, IMM8);
+        transmute(r)
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rol_epi64&expand=4689)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_rol_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x4();
+        let r = vprolq256(a, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_i64x4()))
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
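+///
+/// A minimal usage sketch, assuming `avx512f` and `avx512vl` are available at runtime;
+/// lanes whose mask bit is clear come back as zero.
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// fn main() {
+///     if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///         unsafe {
+///             let a = _mm256_set1_epi64x(1);
+///             // Keep only the two lowest lanes and rotate each kept lane left by 4 bits.
+///             let r = _mm256_maskz_rol_epi64::<4>(0b0011, a);
+///             let mut out = [0i64; 4];
+///             _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
+///             assert_eq!(out, [16, 16, 0, 0]);
+///         }
+///     }
+/// }
+/// ```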
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rol_epi64&expand=4690)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x4();
+        let r = vprolq256(a, IMM8);
+        transmute(simd_select_bitmask(k, r, i64x4::ZERO))
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rol_epi64&expand=4688)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm_rol_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x2();
+        let r = vprolq128(a, IMM8);
+        transmute(r)
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rol_epi64&expand=4686)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_rol_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x2();
+        let r = vprolq128(a, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_i64x2()))
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rol_epi64&expand=4687)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x2();
+        let r = vprolq128(a, IMM8);
+        transmute(simd_select_bitmask(k, r, i64x2::ZERO))
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_ror_epi64&expand=4730)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_ror_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x8();
+        let r = vprorq(a, IMM8);
+        transmute(r)
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_ror_epi64&expand=4728)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_ror_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x8();
+        let r = vprorq(a, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_i64x8()))
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_ror_epi64&expand=4729)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x8();
+        let r = vprorq(a, IMM8);
+        transmute(simd_select_bitmask(k, r, i64x8::ZERO))
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ror_epi64&expand=4727)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm256_ror_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x4();
+        let r = vprorq256(a, IMM8);
+        transmute(r)
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
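+///
+/// A minimal usage sketch, assuming `avx512f` and `avx512vl` are available at runtime;
+/// lanes whose mask bit is clear are taken from `src` instead of the rotated result.
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// fn main() {
+///     if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///         unsafe {
+///             let src = _mm256_set1_epi64x(-1);
+///             let a = _mm256_set1_epi64x(16);
+///             // Rotate right by 4 only in lanes 0 and 2; keep `src` in lanes 1 and 3.
+///             let r = _mm256_mask_ror_epi64::<4>(src, 0b0101, a);
+///             let mut out = [0i64; 4];
+///             _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
+///             assert_eq!(out, [1, -1, 1, -1]);
+///         }
+///     }
+/// }
+/// ```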
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_ror_epi64&expand=4725)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_ror_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x4();
+        let r = vprorq256(a, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_i64x4()))
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_ror_epi64&expand=4726)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x4();
+        let r = vprorq256(a, IMM8);
+        transmute(simd_select_bitmask(k, r, i64x4::ZERO))
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ror_epi64&expand=4724)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm_ror_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x2();
+        let r = vprorq128(a, IMM8);
+        transmute(r)
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_ror_epi64&expand=4722)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_ror_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x2();
+        let r = vprorq128(a, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_i64x2()))
+    }
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_ror_epi64&expand=4723)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i64x2();
+        let r = vprorq128(a, IMM8);
+        transmute(simd_select_bitmask(k, r, i64x2::ZERO))
+    }
+}
+
+/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_slli_epi32&expand=5310)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_slli_epi32<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 32 {
+            _mm512_setzero_si512()
+        } else {
+            transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8)))
+        }
+    }
+}
+
+/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_slli_epi32&expand=5308)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_slli_epi32<const IMM8: u32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = if IMM8 >= 32 {
+            u32x16::ZERO
+        } else {
+            simd_shl(a.as_u32x16(), u32x16::splat(IMM8))
+        };
+        transmute(simd_select_bitmask(k, shf, src.as_u32x16()))
+    }
+}
+
+/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_slli_epi32&expand=5309)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_slli_epi32<const IMM8: u32>(k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 32 {
+            _mm512_setzero_si512()
+        } else {
+            let shf = simd_shl(a.as_u32x16(), u32x16::splat(IMM8));
+            transmute(simd_select_bitmask(k, shf, u32x16::ZERO))
+        }
+    }
+}
+
+/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_slli_epi32&expand=5305)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_slli_epi32<const IMM8: u32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = if IMM8 >= 32 {
+            u32x8::ZERO
+        } else {
+            simd_shl(a.as_u32x8(), u32x8::splat(IMM8))
+        };
+        transmute(simd_select_bitmask(k, r, src.as_u32x8()))
+    }
+}
+
+/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_slli_epi32&expand=5306)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_slli_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 32 {
+            _mm256_setzero_si256()
+        } else {
+            let r = simd_shl(a.as_u32x8(), u32x8::splat(IMM8));
+            transmute(simd_select_bitmask(k, r, u32x8::ZERO))
+        }
+    }
+}
+
+/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_slli_epi32&expand=5302)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_slli_epi32<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = if IMM8 >= 32 {
+            u32x4::ZERO
+        } else {
+            simd_shl(a.as_u32x4(), u32x4::splat(IMM8))
+        };
+        transmute(simd_select_bitmask(k, r, src.as_u32x4()))
+    }
+}
+
+/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_slli_epi32&expand=5303)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_slli_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 32 {
+            _mm_setzero_si128()
+        } else {
+            let r = simd_shl(a.as_u32x4(), u32x4::splat(IMM8));
+            transmute(simd_select_bitmask(k, r, u32x4::ZERO))
+        }
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
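+///
+/// A minimal usage sketch, assuming `avx512f` is available at runtime; note that a shift
+/// count of 32 or more yields all-zero lanes rather than being taken modulo 32.
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// fn main() {
+///     if is_x86_feature_detected!("avx512f") {
+///         unsafe {
+///             let a = _mm512_set1_epi32(-1); // every lane is 0xFFFF_FFFF
+///             let r = _mm512_srli_epi32::<24>(a);
+///             // The logical shift fills with zeros, so each of the 16 lanes is now 0xFF.
+///             assert_eq!(_mm512_reduce_add_epi32(r), 16 * 0xFF);
+///         }
+///     }
+/// }
+/// ```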
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srli_epi32&expand=5522)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_srli_epi32<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 32 {
+            _mm512_setzero_si512()
+        } else {
+            transmute(simd_shr(a.as_u32x16(), u32x16::splat(IMM8)))
+        }
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srli_epi32&expand=5520)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_srli_epi32<const IMM8: u32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = if IMM8 >= 32 {
+            u32x16::ZERO
+        } else {
+            simd_shr(a.as_u32x16(), u32x16::splat(IMM8))
+        };
+        transmute(simd_select_bitmask(k, shf, src.as_u32x16()))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srli_epi32&expand=5521)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_srli_epi32<const IMM8: u32>(k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 32 {
+            _mm512_setzero_si512()
+        } else {
+            let shf = simd_shr(a.as_u32x16(), u32x16::splat(IMM8));
+            transmute(simd_select_bitmask(k, shf, u32x16::ZERO))
+        }
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srli_epi32&expand=5517)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_srli_epi32<const IMM8: u32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = if IMM8 >= 32 {
+            u32x8::ZERO
+        } else {
+            simd_shr(a.as_u32x8(), u32x8::splat(IMM8))
+        };
+        transmute(simd_select_bitmask(k, r, src.as_u32x8()))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srli_epi32&expand=5518)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_srli_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 32 {
+            _mm256_setzero_si256()
+        } else {
+            let r = simd_shr(a.as_u32x8(), u32x8::splat(IMM8));
+            transmute(simd_select_bitmask(k, r, u32x8::ZERO))
+        }
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srli_epi32&expand=5514)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_srli_epi32<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = if IMM8 >= 32 {
+            u32x4::ZERO
+        } else {
+            simd_shr(a.as_u32x4(), u32x4::splat(IMM8))
+        };
+        transmute(simd_select_bitmask(k, r, src.as_u32x4()))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srli_epi32&expand=5515)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_srli_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 32 {
+            _mm_setzero_si128()
+        } else {
+            let r = simd_shr(a.as_u32x4(), u32x4::splat(IMM8));
+            transmute(simd_select_bitmask(k, r, u32x4::ZERO))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_slli_epi64&expand=5319)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_slli_epi64<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm512_setzero_si512()
+        } else {
+            transmute(simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64)))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
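+///
+/// A minimal usage sketch, assuming `avx512f` is available at runtime; unselected lanes
+/// keep the value from `src`, and the sums of both halves are checked with a reduction.
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// fn main() {
+///     if is_x86_feature_detected!("avx512f") {
+///         unsafe {
+///             let src = _mm512_set1_epi64(7);
+///             let a = _mm512_set1_epi64(1);
+///             // Shift left by 8 in the low four lanes only; the high four lanes stay 7.
+///             let r = _mm512_mask_slli_epi64::<8>(src, 0b0000_1111, a);
+///             assert_eq!(_mm512_reduce_add_epi64(r), 4 * 256 + 4 * 7);
+///         }
+///     }
+/// }
+/// ```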
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_slli_epi64&expand=5317)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_slli_epi64<const IMM8: u32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = if IMM8 >= 64 {
+            u64x8::ZERO
+        } else {
+            simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64))
+        };
+        transmute(simd_select_bitmask(k, shf, src.as_u64x8()))
+    }
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_slli_epi64&expand=5318)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm512_setzero_si512()
+        } else {
+            let shf = simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64));
+            transmute(simd_select_bitmask(k, shf, u64x8::ZERO))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_slli_epi64&expand=5314)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_slli_epi64<const IMM8: u32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = if IMM8 >= 64 {
+            u64x4::ZERO
+        } else {
+            simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64))
+        };
+        transmute(simd_select_bitmask(k, r, src.as_u64x4()))
+    }
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_slli_epi64&expand=5315)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm256_setzero_si256()
+        } else {
+            let r = simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64));
+            transmute(simd_select_bitmask(k, r, u64x4::ZERO))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_slli_epi64&expand=5311)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_slli_epi64<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = if IMM8 >= 64 {
+            u64x2::ZERO
+        } else {
+            simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64))
+        };
+        transmute(simd_select_bitmask(k, r, src.as_u64x2()))
+    }
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_slli_epi64&expand=5312)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm_setzero_si128()
+        } else {
+            let r = simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64));
+            transmute(simd_select_bitmask(k, r, u64x2::ZERO))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srli_epi64&expand=5531)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_srli_epi64<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm512_setzero_si512()
+        } else {
+            transmute(simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64)))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srli_epi64&expand=5529)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_srli_epi64<const IMM8: u32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = if IMM8 >= 64 {
+            u64x8::ZERO
+        } else {
+            simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64))
+        };
+        transmute(simd_select_bitmask(k, shf, src.as_u64x8()))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srli_epi64&expand=5530)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_srli_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm512_setzero_si512()
+        } else {
+            let shf = simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64));
+            transmute(simd_select_bitmask(k, shf, u64x8::ZERO))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srli_epi64&expand=5526)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_srli_epi64<const IMM8: u32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = if IMM8 >= 64 {
+            u64x4::ZERO
+        } else {
+            simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64))
+        };
+        transmute(simd_select_bitmask(k, r, src.as_u64x4()))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srli_epi64&expand=5527)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_srli_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm256_setzero_si256()
+        } else {
+            let r = simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64));
+            transmute(simd_select_bitmask(k, r, u64x4::ZERO))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srli_epi64&expand=5523)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_srli_epi64<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = if IMM8 >= 64 {
+            u64x2::ZERO
+        } else {
+            simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64))
+        };
+        transmute(simd_select_bitmask(k, r, src.as_u64x2()))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srli_epi64&expand=5524)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_srli_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm_setzero_si128()
+        } else {
+            let r = simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64));
+            transmute(simd_select_bitmask(k, r, u64x2::ZERO))
+        }
+    }
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sll_epi32&expand=5280)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslld))]
+pub fn _mm512_sll_epi32(a: __m512i, count: __m128i) -> __m512i {
+    unsafe { transmute(vpslld(a.as_i32x16(), count.as_i32x4())) }
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sll_epi32&expand=5278)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslld))]
+pub fn _mm512_mask_sll_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m128i) -> __m512i {
+    unsafe {
+        let shf = _mm512_sll_epi32(a, count).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+    }
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sll_epi32&expand=5279)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslld))]
+pub fn _mm512_maskz_sll_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i {
+    unsafe {
+        let shf = _mm512_sll_epi32(a, count).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
+    }
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sll_epi32&expand=5275)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslld))]
+pub fn _mm256_mask_sll_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i {
+    unsafe {
+        let shf = _mm256_sll_epi32(a, count).as_i32x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+    }
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
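+///
+/// A minimal usage sketch, assuming `avx512f` and `avx512vl` are available at runtime;
+/// unlike the `slli` forms, the shift count comes from the low 64 bits of `count`.
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// fn main() {
+///     if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///         unsafe {
+///             let a = _mm256_set1_epi32(3);
+///             let count = _mm_set_epi64x(0, 4); // shift left by 4
+///             // Shift the low four lanes and zero the upper four.
+///             let r = _mm256_maskz_sll_epi32(0b0000_1111, a, count);
+///             let mut out = [0i32; 8];
+///             _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
+///             assert_eq!(out, [48, 48, 48, 48, 0, 0, 0, 0]);
+///         }
+///     }
+/// }
+/// ```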
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sll_epi32&expand=5276) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpslld))] +pub fn _mm256_maskz_sll_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sll_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sll_epi32&expand=5272) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpslld))] +pub fn _mm_mask_sll_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sll_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } +} + +/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sll_epi32&expand=5273) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpslld))] +pub fn _mm_maskz_sll_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sll_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srl_epi32&expand=5492) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld))] +pub fn _mm512_srl_epi32(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsrld(a.as_i32x16(), count.as_i32x4())) } +} + +/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srl_epi32&expand=5490) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld))] +pub fn _mm512_mask_srl_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srl_epi32&expand=5491) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld))] +pub fn _mm512_maskz_srl_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srl_epi32&expand=5487) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld))] +pub fn _mm256_mask_srl_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srl_epi32&expand=5488) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld))] +pub fn _mm256_maskz_srl_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srl_epi32&expand=5484) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld))] +pub fn _mm_mask_srl_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srl_epi32&expand=5485) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld))] +pub fn _mm_maskz_srl_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sll_epi64&expand=5289) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllq))] +pub fn _mm512_sll_epi64(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsllq(a.as_i64x8(), count.as_i64x2())) } +} + +/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sll_epi64&expand=5287) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllq))] +pub fn _mm512_mask_sll_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sll_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } +} + +/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sll_epi64&expand=5288) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllq))] +pub fn _mm512_maskz_sll_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sll_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sll_epi64&expand=5284) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllq))] +pub fn _mm256_mask_sll_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sll_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } +} + +/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sll_epi64&expand=5285) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllq))] +pub fn _mm256_maskz_sll_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sll_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sll_epi64&expand=5281) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllq))] +pub fn _mm_mask_sll_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sll_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } +} + +/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sll_epi64&expand=5282) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllq))] +pub fn _mm_maskz_sll_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sll_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srl_epi64&expand=5501) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlq))] +pub fn _mm512_srl_epi64(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsrlq(a.as_i64x8(), count.as_i64x2())) } +} + +/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srl_epi64&expand=5499) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlq))] +pub fn _mm512_mask_srl_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srl_epi64&expand=5500) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlq))] +pub fn _mm512_maskz_srl_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srl_epi64&expand=5496) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlq))] +pub fn _mm256_mask_srl_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srl_epi64&expand=5497) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlq))] +pub fn _mm256_maskz_srl_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srl_epi64&expand=5493) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlq))] +pub fn _mm_mask_srl_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srl_epi64&expand=5494) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlq))] +pub fn _mm_maskz_srl_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sra_epi32&expand=5407) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrad))] +pub fn _mm512_sra_epi32(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsrad(a.as_i32x16(), count.as_i32x4())) } +} + +/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
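+///
+/// A minimal usage sketch, assuming `avx512f` is available at runtime; the arithmetic
+/// shift replicates the sign bit, unlike the logical `srl` variants above.
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// fn main() {
+///     if is_x86_feature_detected!("avx512f") {
+///         unsafe {
+///             let src = _mm512_set1_epi32(0);
+///             let a = _mm512_set1_epi32(-16);
+///             let count = _mm_set_epi64x(0, 2);
+///             // -16 shifted right arithmetically by 2 is -4; the mask selects all 16 lanes.
+///             let r = _mm512_mask_sra_epi32(src, 0xFFFF, a, count);
+///             assert_eq!(_mm512_reduce_add_epi32(r), 16 * -4);
+///         }
+///     }
+/// }
+/// ```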
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sra_epi32&expand=5405) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrad))] +pub fn _mm512_mask_sra_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sra_epi32&expand=5406) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrad))] +pub fn _mm512_maskz_sra_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sra_epi32&expand=5402) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrad))] +pub fn _mm256_mask_sra_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sra_epi32&expand=5403) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrad))] +pub fn _mm256_maskz_sra_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sra_epi32&expand=5399) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrad))] +pub fn _mm_mask_sra_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sra_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sra_epi32&expand=5400) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrad))] +pub fn _mm_maskz_sra_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sra_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sra_epi64&expand=5416) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm512_sra_epi64(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsraq(a.as_i64x8(), count.as_i64x2())) } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sra_epi64&expand=5414) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm512_mask_sra_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sra_epi64&expand=5415) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm512_maskz_sra_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi64&expand=5413) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm256_sra_epi64(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(vpsraq256(a.as_i64x4(), count.as_i64x2())) } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sra_epi64&expand=5411) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm256_mask_sra_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sra_epi64&expand=5412) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm256_maskz_sra_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi64&expand=5410) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm_sra_epi64(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(vpsraq128(a.as_i64x2(), count.as_i64x2())) } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sra_epi64&expand=5408) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm_mask_sra_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sra_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sra_epi64&expand=5409)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq))]
+pub fn _mm_maskz_sra_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_sra_epi64(a, count).as_i64x2();
+        transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srai_epi32&expand=5436)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_srai_epi32<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32)))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srai_epi32&expand=5434)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_srai_epi32<const IMM8: u32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32));
+        transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srai_epi32&expand=5435)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_srai_epi32<const IMM8: u32>(k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32));
+        transmute(simd_select_bitmask(k, r, i32x16::ZERO))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srai_epi32&expand=5431)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_srai_epi32<const IMM8: u32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        let r = simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31) as i32));
+        transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srai_epi32&expand=5432)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_srai_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        let r = simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31) as i32));
+        transmute(simd_select_bitmask(k, r, i32x8::ZERO))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srai_epi32&expand=5428)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_srai_epi32<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let r = simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31) as i32));
+        transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srai_epi32&expand=5429)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_srai_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let r = simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31) as i32));
+        transmute(simd_select_bitmask(k, r, i32x4::ZERO))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srai_epi64&expand=5445)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_srai_epi64<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64)))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srai_epi64&expand=5443)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_srai_epi64<const IMM8: u32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64));
+        transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srai_epi64&expand=5444)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64));
+        transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi64&expand=5442)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm256_srai_epi64<const IMM8: u32>(a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64)))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srai_epi64&expand=5440)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_srai_epi64<const IMM8: u32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64));
+        transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srai_epi64&expand=5441)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64));
+        transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi64&expand=5439)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm_srai_epi64<const IMM8: u32>(a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64)))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srai_epi64&expand=5437)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_srai_epi64<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64));
+        transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srai_epi64&expand=5438)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64));
+        transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srav_epi32&expand=5465)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+pub fn _mm512_srav_epi32(a: __m512i, count: __m512i) -> __m512i {
+    unsafe { transmute(vpsravd(a.as_i32x16(), count.as_i32x16())) }
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srav_epi32&expand=5463)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+pub fn _mm512_mask_srav_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m512i) -> __m512i {
+    unsafe {
+        let shf = _mm512_srav_epi32(a, count).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srav_epi32&expand=5464)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+pub fn _mm512_maskz_srav_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i {
+    unsafe {
+        let shf = _mm512_srav_epi32(a, count).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srav_epi32&expand=5460) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravd))] +pub fn _mm256_mask_srav_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srav_epi32&expand=5461) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravd))] +pub fn _mm256_maskz_srav_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srav_epi32&expand=5457) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravd))] +pub fn _mm_mask_srav_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srav_epi32&expand=5458) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravd))] +pub fn _mm_maskz_srav_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srav_epi64&expand=5474) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm512_srav_epi64(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsravq(a.as_i64x8(), count.as_i64x8())) } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srav_epi64&expand=5472) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm512_mask_srav_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srav_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srav_epi64&expand=5473) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm512_maskz_srav_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srav_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi64&expand=5471) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm256_srav_epi64(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(vpsravq256(a.as_i64x4(), count.as_i64x4())) } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srav_epi64&expand=5469) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm256_mask_srav_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srav_epi64&expand=5470) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm256_maskz_srav_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi64&expand=5468) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm_srav_epi64(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(vpsravq128(a.as_i64x2(), count.as_i64x2())) } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srav_epi64&expand=5466) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm_mask_srav_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srav_epi64&expand=5467) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm_maskz_srav_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rolv_epi32&expand=4703) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm512_rolv_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vprolvd(a.as_i32x16(), b.as_i32x16())) } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rolv_epi32&expand=4701) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm512_mask_rolv_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let rol = _mm512_rolv_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, rol, src.as_i32x16())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rolv_epi32&expand=4702) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm512_maskz_rolv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let rol = _mm512_rolv_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, rol, i32x16::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rolv_epi32&expand=4700) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm256_rolv_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vprolvd256(a.as_i32x8(), b.as_i32x8())) } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rolv_epi32&expand=4698) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm256_mask_rolv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let rol = _mm256_rolv_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, rol, src.as_i32x8())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rolv_epi32&expand=4699) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm256_maskz_rolv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let rol = _mm256_rolv_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, rol, i32x8::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rolv_epi32&expand=4697) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm_rolv_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vprolvd128(a.as_i32x4(), b.as_i32x4())) } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rolv_epi32&expand=4695) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm_mask_rolv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let rol = _mm_rolv_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, rol, src.as_i32x4())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rolv_epi32&expand=4696) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm_maskz_rolv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let rol = _mm_rolv_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, rol, i32x4::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rorv_epi32&expand=4739) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm512_rorv_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vprorvd(a.as_i32x16(), b.as_i32x16())) } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rorv_epi32&expand=4737) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm512_mask_rorv_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let ror = _mm512_rorv_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, ror, src.as_i32x16())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rorv_epi32&expand=4738) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm512_maskz_rorv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let ror = _mm512_rorv_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, ror, i32x16::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rorv_epi32&expand=4736) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm256_rorv_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vprorvd256(a.as_i32x8(), b.as_i32x8())) } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rorv_epi32&expand=4734) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm256_mask_rorv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let ror = _mm256_rorv_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, ror, src.as_i32x8())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rorv_epi32&expand=4735) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm256_maskz_rorv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let ror = _mm256_rorv_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, ror, i32x8::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rorv_epi32&expand=4733) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm_rorv_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vprorvd128(a.as_i32x4(), b.as_i32x4())) } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rorv_epi32&expand=4731) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm_mask_rorv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let ror = _mm_rorv_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, ror, src.as_i32x4())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rorv_epi32&expand=4732) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm_maskz_rorv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let ror = _mm_rorv_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, ror, i32x4::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rolv_epi64&expand=4712) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm512_rolv_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vprolvq(a.as_i64x8(), b.as_i64x8())) } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rolv_epi64&expand=4710) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm512_mask_rolv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let rol = _mm512_rolv_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, rol, src.as_i64x8())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rolv_epi64&expand=4711) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm512_maskz_rolv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let rol = _mm512_rolv_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, rol, i64x8::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rolv_epi64&expand=4709) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm256_rolv_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vprolvq256(a.as_i64x4(), b.as_i64x4())) } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rolv_epi64&expand=4707) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm256_mask_rolv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let rol = _mm256_rolv_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, rol, src.as_i64x4())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rolv_epi64&expand=4708) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm256_maskz_rolv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let rol = _mm256_rolv_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, rol, i64x4::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rolv_epi64&expand=4706) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm_rolv_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vprolvq128(a.as_i64x2(), b.as_i64x2())) } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rolv_epi64&expand=4704) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm_mask_rolv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let rol = _mm_rolv_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, rol, src.as_i64x2())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rolv_epi64&expand=4705) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm_maskz_rolv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let rol = _mm_rolv_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, rol, i64x2::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rorv_epi64&expand=4748) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm512_rorv_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vprorvq(a.as_i64x8(), b.as_i64x8())) } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rorv_epi64&expand=4746) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm512_mask_rorv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let ror = _mm512_rorv_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, ror, src.as_i64x8())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rorv_epi64&expand=4747) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm512_maskz_rorv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let ror = _mm512_rorv_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, ror, i64x8::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rorv_epi64&expand=4745) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm256_rorv_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vprorvq256(a.as_i64x4(), b.as_i64x4())) } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rorv_epi64&expand=4743) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm256_mask_rorv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let ror = _mm256_rorv_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, ror, src.as_i64x4())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rorv_epi64&expand=4744) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm256_maskz_rorv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let ror = _mm256_rorv_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, ror, i64x4::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rorv_epi64&expand=4742) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm_rorv_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vprorvq128(a.as_i64x2(), b.as_i64x2())) } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rorv_epi64&expand=4740) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm_mask_rorv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let ror = _mm_rorv_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, ror, src.as_i64x2())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rorv_epi64&expand=4741) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm_maskz_rorv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let ror = _mm_rorv_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, ror, i64x2::ZERO)) + } +} + +/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sllv_epi32&expand=5342) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvd))] +pub fn _mm512_sllv_epi32(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsllvd(a.as_i32x16(), count.as_i32x16())) } +} + +/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sllv_epi32&expand=5340) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvd))] +pub fn _mm512_mask_sllv_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } +} + +/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sllv_epi32&expand=5341) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvd))] +pub fn _mm512_maskz_sllv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } +} + +/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sllv_epi32&expand=5337) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvd))] +pub fn _mm256_mask_sllv_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } +} + +/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sllv_epi32&expand=5338) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvd))] +pub fn _mm256_maskz_sllv_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sllv_epi32&expand=5334) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvd))] +pub fn _mm_mask_sllv_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } +} + +/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sllv_epi32&expand=5335) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvd))] +pub fn _mm_maskz_sllv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. 
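// Illustrative sketch (not from the diff above): using a writemask with the
// per-element shift intrinsics defined here. The values and the feature check
// are assumptions for the example only.
#[cfg(target_arch = "x86_64")]
fn masked_shift_example() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        use std::arch::x86_64::*;
        unsafe {
            let a = _mm_set_epi32(8, 4, 2, 1);
            let counts = _mm_set_epi32(3, 2, 1, 0);
            // Lanes whose mask bit is clear are copied unchanged from `src` (here `a`).
            let shifted = _mm_mask_sllv_epi32(a, 0b0101, a, counts);
            let _ = shifted;
        }
    }
}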
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srlv_epi32&expand=5554) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +pub fn _mm512_srlv_epi32(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsrlvd(a.as_i32x16(), count.as_i32x16())) } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srlv_epi32&expand=5552) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +pub fn _mm512_mask_srlv_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srlv_epi32&expand=5553) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +pub fn _mm512_maskz_srlv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srlv_epi32&expand=5549) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +pub fn _mm256_mask_srlv_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srlv_epi32&expand=5550) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +pub fn _mm256_maskz_srlv_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srlv_epi32&expand=5546) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +pub fn _mm_mask_srlv_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srlv_epi32&expand=5547) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +pub fn _mm_maskz_srlv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sllv_epi64&expand=5351) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvq))] +pub fn _mm512_sllv_epi64(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsllvq(a.as_i64x8(), count.as_i64x8())) } +} + +/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sllv_epi64&expand=5349) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvq))] +pub fn _mm512_mask_sllv_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } +} + +/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sllv_epi64&expand=5350) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvq))] +pub fn _mm512_maskz_sllv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sllv_epi64&expand=5346) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvq))] +pub fn _mm256_mask_sllv_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } +} + +/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sllv_epi64&expand=5347) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvq))] +pub fn _mm256_maskz_sllv_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sllv_epi64&expand=5343) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvq))] +pub fn _mm_mask_sllv_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } +} + +/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sllv_epi64&expand=5344) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvq))] +pub fn _mm_maskz_sllv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srlv_epi64&expand=5563) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvq))] +pub fn _mm512_srlv_epi64(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsrlvq(a.as_i64x8(), count.as_i64x8())) } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srlv_epi64&expand=5561) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvq))] +pub fn _mm512_mask_srlv_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srlv_epi64&expand=5562) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvq))] +pub fn _mm512_maskz_srlv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srlv_epi64&expand=5558) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvq))] +pub fn _mm256_mask_srlv_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srlv_epi64&expand=5559) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvq))] +pub fn _mm256_maskz_srlv_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srlv_epi64&expand=5555) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvq))] +pub fn _mm_mask_srlv_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srlv_epi64&expand=5556)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
+pub fn _mm_maskz_srlv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_srlv_epi64(a, count).as_i64x2();
+        transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permute_ps&expand=4170)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_permute_ps<const MASK: i32>(a: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11),
+                ((MASK as u32 >> 6) & 0b11),
+                (MASK as u32 & 0b11) + 4,
+                ((MASK as u32 >> 2) & 0b11) + 4,
+                ((MASK as u32 >> 4) & 0b11) + 4,
+                ((MASK as u32 >> 6) & 0b11) + 4,
+                (MASK as u32 & 0b11) + 8,
+                ((MASK as u32 >> 2) & 0b11) + 8,
+                ((MASK as u32 >> 4) & 0b11) + 8,
+                ((MASK as u32 >> 6) & 0b11) + 8,
+                (MASK as u32 & 0b11) + 12,
+                ((MASK as u32 >> 2) & 0b11) + 12,
+                ((MASK as u32 >> 4) & 0b11) + 12,
+                ((MASK as u32 >> 6) & 0b11) + 12,
+            ],
+        )
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permute_ps&expand=4168)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_permute_ps<const MASK: i32>(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permute_ps&expand=4169)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_permute_ps<const MASK: i32>(k: __mmask16, a: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permute_ps&expand=4165)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_permute_ps<const MASK: i32>(src: __m256, k: __mmask8, a: __m256) -> __m256 {
+    unsafe {
+        let r = _mm256_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permute_ps&expand=4166)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m256) -> __m256 {
+    unsafe {
+        let r = _mm256_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permute_ps&expand=4162)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_permute_ps<const MASK: i32>(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+    unsafe {
+        let r = _mm_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4()))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permute_ps&expand=4163)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m128) -> __m128 {
+    unsafe {
+        let r = _mm_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permute_pd&expand=4161)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01_10_01))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_permute_pd<const MASK: i32>(a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b1,
+                ((MASK as u32 >> 1) & 0b1),
+                ((MASK as u32 >> 2) & 0b1) + 2,
+                ((MASK as u32 >> 3) & 0b1) + 2,
+                ((MASK as u32 >> 4) & 0b1) + 4,
+                ((MASK as u32 >> 5) & 0b1) + 4,
+                ((MASK as u32 >> 6) & 0b1) + 6,
+                ((MASK as u32 >> 7) & 0b1) + 6,
+            ],
+        )
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permute_pd&expand=4159)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01_10_01))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_permute_pd<const MASK: i32>(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permute_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permute_pd&expand=4160)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01_10_01))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permute_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
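// Illustrative sketch (not from the diff above): the `permute` intrinsics take
// their 8-bit control as a const generic, supplied with turbofish syntax. The
// control value 0b00_01_10_11 reverses the four floats inside every 128-bit
// lane; the values and the feature check are assumptions for the example.
#[cfg(target_arch = "x86_64")]
fn permute_example() {
    if is_x86_feature_detected!("avx512f") {
        use std::arch::x86_64::*;
        unsafe {
            let a = _mm512_set_ps(
                15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
            );
            let reversed_lanes = _mm512_permute_ps::<0b00_01_10_11>(a);
            let _ = reversed_lanes;
        }
    }
}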
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permute_pd&expand=4156)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_permute_pd<const MASK: i32>(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 4);
+        let r = _mm256_permute_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permute_pd&expand=4157)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 4);
+        let r = _mm256_permute_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permute_pd&expand=4153)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0b01))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_permute_pd<const IMM2: i32>(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM2, 2);
+        let r = _mm_permute_pd::<IMM2>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2()))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permute_pd&expand=4154)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0b01))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_permute_pd<const IMM2: i32>(k: __mmask8, a: __m128d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM2, 2);
+        let r = _mm_permute_pd::<IMM2>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x2(), f64x2::ZERO))
+    }
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex_epi64&expand=4208)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_permutex_epi64<const MASK: i32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11),
+                ((MASK as u32 >> 6) & 0b11),
+                (MASK as u32 & 0b11) + 4,
+                ((MASK as u32 >> 2) & 0b11) + 4,
+                ((MASK as u32 >> 4) & 0b11) + 4,
+                ((MASK as u32 >> 6) & 0b11) + 4,
+            ],
+        )
+    }
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex_epi64&expand=4206)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_permutex_epi64<const MASK: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permutex_epi64::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
+    }
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex_epi64&expand=4207)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permutex_epi64::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO))
+    }
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex_epi64&expand=4205)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
+#[rustc_legacy_const_generics(1)]
+pub fn _mm256_permutex_epi64<const MASK: i32>(a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11),
+                ((MASK as u32 >> 6) & 0b11),
+            ],
+        )
+    }
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex_epi64&expand=4203)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_permutex_epi64<const MASK: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_permutex_epi64::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4()))
+    }
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex_epi64&expand=4204)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_permutex_epi64::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex_pd&expand=4214)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_permutex_pd<const MASK: i32>(a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11),
+                ((MASK as u32 >> 6) & 0b11),
+                (MASK as u32 & 0b11) + 4,
+                ((MASK as u32 >> 2) & 0b11) + 4,
+                ((MASK as u32 >> 4) & 0b11) + 4,
+                ((MASK as u32 >> 6) & 0b11) + 4,
+            ],
+        )
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex_pd&expand=4212)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_permutex_pd<const MASK: i32>(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
+    unsafe {
+        let r = _mm512_permutex_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex_pd&expand=4213)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m512d) -> __m512d {
+    unsafe {
+        let r = _mm512_permutex_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex_pd&expand=4211)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
+#[rustc_legacy_const_generics(1)]
+pub fn _mm256_permutex_pd<const MASK: i32>(a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11),
+                ((MASK as u32 >> 6) & 0b11),
+            ],
+        )
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex_pd&expand=4209)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_permutex_pd<const MASK: i32>(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_permutex_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex_pd&expand=4210)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_permutex_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO))
+    }
+}
+
+/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_permutexvar_epi32, and it is recommended that you use that intrinsic name.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutevar_epi32&expand=4182) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermd +pub fn _mm512_permutevar_epi32(idx: __m512i, a: __m512i) -> __m512i { + unsafe { transmute(vpermd(a.as_i32x16(), idx.as_i32x16())) } +} + +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_mask_permutexvar_epi32, and it is recommended that you use that intrinsic name. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutevar_epi32&expand=4181) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermd))] +pub fn _mm512_mask_permutevar_epi32( + src: __m512i, + k: __mmask16, + idx: __m512i, + a: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutevar_epi32(idx, a).as_i32x16(); + transmute(simd_select_bitmask(k, permute, src.as_i32x16())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutevar_ps&expand=4200) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilps))] +pub fn _mm512_permutevar_ps(a: __m512, b: __m512i) -> __m512 { + unsafe { transmute(vpermilps(a.as_f32x16(), b.as_i32x16())) } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutevar_ps&expand=4198) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilps))] +pub fn _mm512_mask_permutevar_ps(src: __m512, k: __mmask16, a: __m512, b: __m512i) -> __m512 { + unsafe { + let permute = _mm512_permutevar_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, permute, src.as_f32x16())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutevar_ps&expand=4199) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilps))] +pub fn _mm512_maskz_permutevar_ps(k: __mmask16, a: __m512, b: __m512i) -> __m512 { + unsafe { + let permute = _mm512_permutevar_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, permute, f32x16::ZERO)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm256_mask_permutevar_ps&expand=4195) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilps))] +pub fn _mm256_mask_permutevar_ps(src: __m256, k: __mmask8, a: __m256, b: __m256i) -> __m256 { + unsafe { + let permute = _mm256_permutevar_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, permute, src.as_f32x8())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutevar_ps&expand=4196) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilps))] +pub fn _mm256_maskz_permutevar_ps(k: __mmask8, a: __m256, b: __m256i) -> __m256 { + unsafe { + let permute = _mm256_permutevar_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, permute, f32x8::ZERO)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutevar_ps&expand=4192) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilps))] +pub fn _mm_mask_permutevar_ps(src: __m128, k: __mmask8, a: __m128, b: __m128i) -> __m128 { + unsafe { + let permute = _mm_permutevar_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, permute, src.as_f32x4())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutevar_ps&expand=4193) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilps))] +pub fn _mm_maskz_permutevar_ps(k: __mmask8, a: __m128, b: __m128i) -> __m128 { + unsafe { + let permute = _mm_permutevar_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, permute, f32x4::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutevar_pd&expand=4191) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub fn _mm512_permutevar_pd(a: __m512d, b: __m512i) -> __m512d { + unsafe { transmute(vpermilpd(a.as_f64x8(), b.as_i64x8())) } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutevar_pd&expand=4189) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub fn _mm512_mask_permutevar_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512i) -> __m512d { + unsafe { + let permute = _mm512_permutevar_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, permute, src.as_f64x8())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutevar_pd&expand=4190) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub fn _mm512_maskz_permutevar_pd(k: __mmask8, a: __m512d, b: __m512i) -> __m512d { + unsafe { + let permute = _mm512_permutevar_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, permute, f64x8::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
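// Illustrative sketch (not from the diff above): `permutevar` reads its control
// from a vector, so the selection can vary per element at run time, but it still
// operates within each 128-bit lane. Values and the feature check are assumptions.
#[cfg(target_arch = "x86_64")]
fn permutevar_example() {
    if is_x86_feature_detected!("avx512f") {
        use std::arch::x86_64::*;
        unsafe {
            let a = _mm512_set_ps(
                15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
            );
            // Each 32-bit control selects one of the four floats of its own 128-bit lane;
            // broadcasting 3 replicates the highest-indexed element of every lane.
            let ctrl = _mm512_set1_epi32(3);
            let broadcast_top = _mm512_permutevar_ps(a, ctrl);
            let _ = broadcast_top;
        }
    }
}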
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutevar_pd&expand=4186) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub fn _mm256_mask_permutevar_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256i) -> __m256d { + unsafe { + let permute = _mm256_permutevar_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, permute, src.as_f64x4())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutevar_pd&expand=4187) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub fn _mm256_maskz_permutevar_pd(k: __mmask8, a: __m256d, b: __m256i) -> __m256d { + unsafe { + let permute = _mm256_permutevar_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, permute, f64x4::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutevar_pd&expand=4183) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub fn _mm_mask_permutevar_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128i) -> __m128d { + unsafe { + let permute = _mm_permutevar_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, permute, src.as_f64x2())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutevar_pd&expand=4184) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub fn _mm_maskz_permutevar_pd(k: __mmask8, a: __m128d, b: __m128i) -> __m128d { + unsafe { + let permute = _mm_permutevar_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, permute, f64x2::ZERO)) + } +} + +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi32&expand=4301) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermd +pub fn _mm512_permutexvar_epi32(idx: __m512i, a: __m512i) -> __m512i { + unsafe { transmute(vpermd(a.as_i32x16(), idx.as_i32x16())) } +} + +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_epi32&expand=4299) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermd))] +pub fn _mm512_mask_permutexvar_epi32( + src: __m512i, + k: __mmask16, + idx: __m512i, + a: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16(); + transmute(simd_select_bitmask(k, permute, src.as_i32x16())) + } +} + +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_epi32&expand=4300) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermd))] +pub fn _mm512_maskz_permutexvar_epi32(k: __mmask16, idx: __m512i, a: __m512i) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16(); + transmute(simd_select_bitmask(k, permute, i32x16::ZERO)) + } +} + +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_epi32&expand=4298) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermd +pub fn _mm256_permutexvar_epi32(idx: __m256i, a: __m256i) -> __m256i { + _mm256_permutevar8x32_epi32(a, idx) // llvm use llvm.x86.avx2.permd +} + +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_epi32&expand=4296) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermd))] +pub fn _mm256_mask_permutexvar_epi32( + src: __m256i, + k: __mmask8, + idx: __m256i, + a: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi32(idx, a).as_i32x8(); + transmute(simd_select_bitmask(k, permute, src.as_i32x8())) + } +} + +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_epi32&expand=4297) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermd))] +pub fn _mm256_maskz_permutexvar_epi32(k: __mmask8, idx: __m256i, a: __m256i) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi32(idx, a).as_i32x8(); + transmute(simd_select_bitmask(k, permute, i32x8::ZERO)) + } +} + +/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi64&expand=4307) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermq +pub fn _mm512_permutexvar_epi64(idx: __m512i, a: __m512i) -> __m512i { + unsafe { transmute(vpermq(a.as_i64x8(), idx.as_i64x8())) } +} + +/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_epi64&expand=4305) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermq))] +pub fn _mm512_mask_permutexvar_epi64( + src: __m512i, + k: __mmask8, + idx: __m512i, + a: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8(); + transmute(simd_select_bitmask(k, permute, src.as_i64x8())) + } +} + +/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_epi64&expand=4306) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermq))] +pub fn _mm512_maskz_permutexvar_epi64(k: __mmask8, idx: __m512i, a: __m512i) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8(); + transmute(simd_select_bitmask(k, permute, i64x8::ZERO)) + } +} + +/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_epi64&expand=4304) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermq +pub fn _mm256_permutexvar_epi64(idx: __m256i, a: __m256i) -> __m256i { + unsafe { transmute(vpermq256(a.as_i64x4(), idx.as_i64x4())) } +} + +/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
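// Illustrative sketch (not from the diff above): `permutexvar` indexes across the
// whole 512-bit register, so it can move elements between 128-bit lanes. The index
// pattern below reverses the eight 64-bit elements; values are assumptions for
// the example only.
#[cfg(target_arch = "x86_64")]
fn permutexvar_example() {
    if is_x86_feature_detected!("avx512f") {
        use std::arch::x86_64::*;
        unsafe {
            let a = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
            let idx = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
            let reversed = _mm512_permutexvar_epi64(idx, a);
            let _ = reversed;
        }
    }
}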
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_epi64&expand=4302) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermq))] +pub fn _mm256_mask_permutexvar_epi64( + src: __m256i, + k: __mmask8, + idx: __m256i, + a: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi64(idx, a).as_i64x4(); + transmute(simd_select_bitmask(k, permute, src.as_i64x4())) + } +} + +/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_epi64&expand=4303) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermq))] +pub fn _mm256_maskz_permutexvar_epi64(k: __mmask8, idx: __m256i, a: __m256i) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi64(idx, a).as_i64x4(); + transmute(simd_select_bitmask(k, permute, i64x4::ZERO)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_ps&expand=4200) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermps))] +pub fn _mm512_permutexvar_ps(idx: __m512i, a: __m512) -> __m512 { + unsafe { transmute(vpermps(a.as_f32x16(), idx.as_i32x16())) } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_ps&expand=4326) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermps))] +pub fn _mm512_mask_permutexvar_ps(src: __m512, k: __mmask16, idx: __m512i, a: __m512) -> __m512 { + unsafe { + let permute = _mm512_permutexvar_ps(idx, a).as_f32x16(); + transmute(simd_select_bitmask(k, permute, src.as_f32x16())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_ps&expand=4327) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermps))] +pub fn _mm512_maskz_permutexvar_ps(k: __mmask16, idx: __m512i, a: __m512) -> __m512 { + unsafe { + let permute = _mm512_permutexvar_ps(idx, a).as_f32x16(); + transmute(simd_select_bitmask(k, permute, f32x16::ZERO)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_ps&expand=4325) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermps))] +pub fn _mm256_permutexvar_ps(idx: __m256i, a: __m256) -> __m256 { + _mm256_permutevar8x32_ps(a, idx) //llvm.x86.avx2.permps +} + +/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_ps&expand=4323) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermps))] +pub fn _mm256_mask_permutexvar_ps(src: __m256, k: __mmask8, idx: __m256i, a: __m256) -> __m256 { + unsafe { + let permute = _mm256_permutexvar_ps(idx, a).as_f32x8(); + transmute(simd_select_bitmask(k, permute, src.as_f32x8())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_ps&expand=4324) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermps))] +pub fn _mm256_maskz_permutexvar_ps(k: __mmask8, idx: __m256i, a: __m256) -> __m256 { + unsafe { + let permute = _mm256_permutexvar_ps(idx, a).as_f32x8(); + transmute(simd_select_bitmask(k, permute, f32x8::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_pd&expand=4322) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermpd))] +pub fn _mm512_permutexvar_pd(idx: __m512i, a: __m512d) -> __m512d { + unsafe { transmute(vpermpd(a.as_f64x8(), idx.as_i64x8())) } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
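+//
+// NOTE: illustrative sketch, not part of the original stdarch sources. The masked
+// permutexvar forms combine the permute with a per-lane select: lanes whose mask
+// bit is clear keep the value from `src` (writemask) or become zero (zeromask).
+// Assuming AVX-512F + AVX-512VL:
+//
+//     let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+//     let src = _mm256_set1_ps(-1.0);
+//     let idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+//     // Low four lanes take the reversed values (7.0, 6.0, 5.0, 4.0); the rest stay -1.0.
+//     let r = _mm256_mask_permutexvar_ps(src, 0b0000_1111, idx, a);
+//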
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_pd&expand=4320) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermpd))] +pub fn _mm512_mask_permutexvar_pd(src: __m512d, k: __mmask8, idx: __m512i, a: __m512d) -> __m512d { + unsafe { + let permute = _mm512_permutexvar_pd(idx, a).as_f64x8(); + transmute(simd_select_bitmask(k, permute, src.as_f64x8())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_pd&expand=4321) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermpd))] +pub fn _mm512_maskz_permutexvar_pd(k: __mmask8, idx: __m512i, a: __m512d) -> __m512d { + unsafe { + let permute = _mm512_permutexvar_pd(idx, a).as_f64x8(); + transmute(simd_select_bitmask(k, permute, f64x8::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_pd&expand=4319) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermpd))] +pub fn _mm256_permutexvar_pd(idx: __m256i, a: __m256d) -> __m256d { + unsafe { transmute(vpermpd256(a.as_f64x4(), idx.as_i64x4())) } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_pd&expand=4317) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermpd))] +pub fn _mm256_mask_permutexvar_pd(src: __m256d, k: __mmask8, idx: __m256i, a: __m256d) -> __m256d { + unsafe { + let permute = _mm256_permutexvar_pd(idx, a).as_f64x4(); + transmute(simd_select_bitmask(k, permute, src.as_f64x4())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_pd&expand=4318) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermpd))] +pub fn _mm256_maskz_permutexvar_pd(k: __mmask8, idx: __m256i, a: __m256d) -> __m256d { + unsafe { + let permute = _mm256_permutexvar_pd(idx, a).as_f64x4(); + transmute(simd_select_bitmask(k, permute, f64x4::ZERO)) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_epi32&expand=4238) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d +pub fn _mm512_permutex2var_epi32(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpermi2d(a.as_i32x16(), idx.as_i32x16(), b.as_i32x16())) } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_epi32&expand=4235) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2d))] +pub fn _mm512_mask_permutex2var_epi32( + a: __m512i, + k: __mmask16, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16(); + transmute(simd_select_bitmask(k, permute, a.as_i32x16())) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_epi32&expand=4237) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d +pub fn _mm512_maskz_permutex2var_epi32( + k: __mmask16, + a: __m512i, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16(); + transmute(simd_select_bitmask(k, permute, i32x16::ZERO)) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
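+//
+// NOTE: illustrative sketch, not part of the original stdarch sources. In the
+// permutex2var family, each index selects from the concatenation of `a` and `b`:
+// with 16 lanes per source, bit 4 of the index picks the source vector (0 = a,
+// 1 = b) and the low four bits pick the lane within it. For example, interleaving
+// the low halves of `a` and `b`:
+//
+//     let idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+//     let interleaved = _mm512_permutex2var_epi32(a, idx, b); // a0, b0, a1, b1, ...
+//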
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_epi32&expand=4236) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2d))] +pub fn _mm512_mask2_permutex2var_epi32( + a: __m512i, + idx: __m512i, + k: __mmask16, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16(); + transmute(simd_select_bitmask(k, permute, idx.as_i32x16())) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_epi32&expand=4234) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d +pub fn _mm256_permutex2var_epi32(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpermi2d256(a.as_i32x8(), idx.as_i32x8(), b.as_i32x8())) } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_epi32&expand=4231) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2d))] +pub fn _mm256_mask_permutex2var_epi32( + a: __m256i, + k: __mmask8, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8(); + transmute(simd_select_bitmask(k, permute, a.as_i32x8())) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_epi32&expand=4233) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d +pub fn _mm256_maskz_permutex2var_epi32( + k: __mmask8, + a: __m256i, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8(); + transmute(simd_select_bitmask(k, permute, i32x8::ZERO)) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_epi32&expand=4232) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2d))] +pub fn _mm256_mask2_permutex2var_epi32( + a: __m256i, + idx: __m256i, + k: __mmask8, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8(); + transmute(simd_select_bitmask(k, permute, idx.as_i32x8())) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_epi32&expand=4230) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d +pub fn _mm_permutex2var_epi32(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpermi2d128(a.as_i32x4(), idx.as_i32x4(), b.as_i32x4())) } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_epi32&expand=4227) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2d))] +pub fn _mm_mask_permutex2var_epi32(a: __m128i, k: __mmask8, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4(); + transmute(simd_select_bitmask(k, permute, a.as_i32x4())) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_epi32&expand=4229) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d +pub fn _mm_maskz_permutex2var_epi32(k: __mmask8, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4(); + transmute(simd_select_bitmask(k, permute, i32x4::ZERO)) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_epi32&expand=4228) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2d))] +pub fn _mm_mask2_permutex2var_epi32(a: __m128i, idx: __m128i, k: __mmask8, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4(); + transmute(simd_select_bitmask(k, permute, idx.as_i32x4())) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_epi64&expand=4250) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q +pub fn _mm512_permutex2var_epi64(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpermi2q(a.as_i64x8(), idx.as_i64x8(), b.as_i64x8())) } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_epi64&expand=4247) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2q))] +pub fn _mm512_mask_permutex2var_epi64( + a: __m512i, + k: __mmask8, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8(); + transmute(simd_select_bitmask(k, permute, a.as_i64x8())) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_epi64&expand=4249) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q +pub fn _mm512_maskz_permutex2var_epi64( + k: __mmask8, + a: __m512i, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8(); + transmute(simd_select_bitmask(k, permute, i64x8::ZERO)) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_epi64&expand=4248) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2q))] +pub fn _mm512_mask2_permutex2var_epi64( + a: __m512i, + idx: __m512i, + k: __mmask8, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8(); + transmute(simd_select_bitmask(k, permute, idx.as_i64x8())) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_epi64&expand=4246) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q +pub fn _mm256_permutex2var_epi64(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpermi2q256(a.as_i64x4(), idx.as_i64x4(), b.as_i64x4())) } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_epi64&expand=4243) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2q))] +pub fn _mm256_mask_permutex2var_epi64( + a: __m256i, + k: __mmask8, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4(); + transmute(simd_select_bitmask(k, permute, a.as_i64x4())) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_epi64&expand=4245) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q +pub fn _mm256_maskz_permutex2var_epi64( + k: __mmask8, + a: __m256i, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4(); + transmute(simd_select_bitmask(k, permute, i64x4::ZERO)) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_epi64&expand=4244) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2q))] +pub fn _mm256_mask2_permutex2var_epi64( + a: __m256i, + idx: __m256i, + k: __mmask8, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4(); + transmute(simd_select_bitmask(k, permute, idx.as_i64x4())) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_epi64&expand=4242) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q +pub fn _mm_permutex2var_epi64(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpermi2q128(a.as_i64x2(), idx.as_i64x2(), b.as_i64x2())) } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_epi64&expand=4239) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2q))] +pub fn _mm_mask_permutex2var_epi64(a: __m128i, k: __mmask8, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2(); + transmute(simd_select_bitmask(k, permute, a.as_i64x2())) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_epi64&expand=4241) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q +pub fn _mm_maskz_permutex2var_epi64(k: __mmask8, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2(); + transmute(simd_select_bitmask(k, permute, i64x2::ZERO)) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_epi64&expand=4240) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2q))] +pub fn _mm_mask2_permutex2var_epi64(a: __m128i, idx: __m128i, k: __mmask8, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2(); + transmute(simd_select_bitmask(k, permute, idx.as_i64x2())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_ps&expand=4286) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps +pub fn _mm512_permutex2var_ps(a: __m512, idx: __m512i, b: __m512) -> __m512 { + unsafe { transmute(vpermi2ps(a.as_f32x16(), idx.as_i32x16(), b.as_f32x16())) } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_ps&expand=4283) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2ps))] +pub fn _mm512_mask_permutex2var_ps(a: __m512, k: __mmask16, idx: __m512i, b: __m512) -> __m512 { + unsafe { + let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16(); + transmute(simd_select_bitmask(k, permute, a.as_f32x16())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_ps&expand=4285) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps +pub fn _mm512_maskz_permutex2var_ps(k: __mmask16, a: __m512, idx: __m512i, b: __m512) -> __m512 { + unsafe { + let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16(); + transmute(simd_select_bitmask(k, permute, f32x16::ZERO)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
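+//
+// NOTE: illustrative sketch, not part of the original stdarch sources. The `mask2`
+// variants differ from the plain masked forms only in their fallback: lanes whose
+// mask bit is clear take the bit-cast value of `idx` rather than `a`, mirroring the
+// vpermi2* instruction, which overwrites its index register. For example:
+//
+//     // Bit 0 of the mask is clear, so lane 0 receives idx[0] reinterpreted as f32.
+//     let r = _mm512_mask2_permutex2var_ps(a, idx, 0xFFFE, b);
+//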
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_ps&expand=4284) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but it shows vpermt2ps +pub fn _mm512_mask2_permutex2var_ps(a: __m512, idx: __m512i, k: __mmask16, b: __m512) -> __m512 { + unsafe { + let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16(); + let idx = _mm512_castsi512_ps(idx).as_f32x16(); + transmute(simd_select_bitmask(k, permute, idx)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_ps&expand=4282) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps +pub fn _mm256_permutex2var_ps(a: __m256, idx: __m256i, b: __m256) -> __m256 { + unsafe { transmute(vpermi2ps256(a.as_f32x8(), idx.as_i32x8(), b.as_f32x8())) } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_ps&expand=4279) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2ps))] +pub fn _mm256_mask_permutex2var_ps(a: __m256, k: __mmask8, idx: __m256i, b: __m256) -> __m256 { + unsafe { + let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8(); + transmute(simd_select_bitmask(k, permute, a.as_f32x8())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_ps&expand=4281) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps +pub fn _mm256_maskz_permutex2var_ps(k: __mmask8, a: __m256, idx: __m256i, b: __m256) -> __m256 { + unsafe { + let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8(); + transmute(simd_select_bitmask(k, permute, f32x8::ZERO)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_ps&expand=4280) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but it shows vpermt2ps +pub fn _mm256_mask2_permutex2var_ps(a: __m256, idx: __m256i, k: __mmask8, b: __m256) -> __m256 { + unsafe { + let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8(); + let idx = _mm256_castsi256_ps(idx).as_f32x8(); + transmute(simd_select_bitmask(k, permute, idx)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_ps&expand=4278) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps +pub fn _mm_permutex2var_ps(a: __m128, idx: __m128i, b: __m128) -> __m128 { + unsafe { transmute(vpermi2ps128(a.as_f32x4(), idx.as_i32x4(), b.as_f32x4())) } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_ps&expand=4275) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2ps))] +pub fn _mm_mask_permutex2var_ps(a: __m128, k: __mmask8, idx: __m128i, b: __m128) -> __m128 { + unsafe { + let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4(); + transmute(simd_select_bitmask(k, permute, a.as_f32x4())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_ps&expand=4277) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps +pub fn _mm_maskz_permutex2var_ps(k: __mmask8, a: __m128, idx: __m128i, b: __m128) -> __m128 { + unsafe { + let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4(); + transmute(simd_select_bitmask(k, permute, f32x4::ZERO)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_ps&expand=4276) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but it shows vpermt2ps +pub fn _mm_mask2_permutex2var_ps(a: __m128, idx: __m128i, k: __mmask8, b: __m128) -> __m128 { + unsafe { + let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4(); + let idx = _mm_castsi128_ps(idx).as_f32x4(); + transmute(simd_select_bitmask(k, permute, idx)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_pd&expand=4274) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd +pub fn _mm512_permutex2var_pd(a: __m512d, idx: __m512i, b: __m512d) -> __m512d { + unsafe { transmute(vpermi2pd(a.as_f64x8(), idx.as_i64x8(), b.as_f64x8())) } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_pd&expand=4271) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2pd))] +pub fn _mm512_mask_permutex2var_pd(a: __m512d, k: __mmask8, idx: __m512i, b: __m512d) -> __m512d { + unsafe { + let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8(); + transmute(simd_select_bitmask(k, permute, a.as_f64x8())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_pd&expand=4273) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd +pub fn _mm512_maskz_permutex2var_pd(k: __mmask8, a: __m512d, idx: __m512i, b: __m512d) -> __m512d { + unsafe { + let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8(); + transmute(simd_select_bitmask(k, permute, f64x8::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_pd&expand=4272) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but it shows vpermt2pd +pub fn _mm512_mask2_permutex2var_pd(a: __m512d, idx: __m512i, k: __mmask8, b: __m512d) -> __m512d { + unsafe { + let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8(); + let idx = _mm512_castsi512_pd(idx).as_f64x8(); + transmute(simd_select_bitmask(k, permute, idx)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_pd&expand=4270) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd +pub fn _mm256_permutex2var_pd(a: __m256d, idx: __m256i, b: __m256d) -> __m256d { + unsafe { transmute(vpermi2pd256(a.as_f64x4(), idx.as_i64x4(), b.as_f64x4())) } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_pd&expand=4267) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2pd))] +pub fn _mm256_mask_permutex2var_pd(a: __m256d, k: __mmask8, idx: __m256i, b: __m256d) -> __m256d { + unsafe { + let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4(); + transmute(simd_select_bitmask(k, permute, a.as_f64x4())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_pd&expand=4269) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd +pub fn _mm256_maskz_permutex2var_pd(k: __mmask8, a: __m256d, idx: __m256i, b: __m256d) -> __m256d { + unsafe { + let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4(); + transmute(simd_select_bitmask(k, permute, f64x4::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_pd&expand=4268) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but it shows vpermt2pd +pub fn _mm256_mask2_permutex2var_pd(a: __m256d, idx: __m256i, k: __mmask8, b: __m256d) -> __m256d { + unsafe { + let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4(); + let idx = _mm256_castsi256_pd(idx).as_f64x4(); + transmute(simd_select_bitmask(k, permute, idx)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_pd&expand=4266) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd +pub fn _mm_permutex2var_pd(a: __m128d, idx: __m128i, b: __m128d) -> __m128d { + unsafe { transmute(vpermi2pd128(a.as_f64x2(), idx.as_i64x2(), b.as_f64x2())) } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_pd&expand=4263) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2pd))] +pub fn _mm_mask_permutex2var_pd(a: __m128d, k: __mmask8, idx: __m128i, b: __m128d) -> __m128d { + unsafe { + let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2(); + transmute(simd_select_bitmask(k, permute, a.as_f64x2())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_pd&expand=4265)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd
+pub fn _mm_maskz_permutex2var_pd(k: __mmask8, a: __m128d, idx: __m128i, b: __m128d) -> __m128d {
+    unsafe {
+        let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2();
+        transmute(simd_select_bitmask(k, permute, f64x2::ZERO))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_pd&expand=4264)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but it shows vpermt2pd
+pub fn _mm_mask2_permutex2var_pd(a: __m128d, idx: __m128i, k: __mmask8, b: __m128d) -> __m128d {
+    unsafe {
+        let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2();
+        let idx = _mm_castsi128_pd(idx).as_f64x2();
+        transmute(simd_select_bitmask(k, permute, idx))
+    }
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_epi32&expand=5150)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 9))] //should be vpshufd
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_shuffle_epi32<const MASK: _MM_PERM_ENUM>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r: i32x16 = simd_shuffle!(
+            a.as_i32x16(),
+            a.as_i32x16(),
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                (MASK as u32 >> 4) & 0b11,
+                (MASK as u32 >> 6) & 0b11,
+                (MASK as u32 & 0b11) + 4,
+                ((MASK as u32 >> 2) & 0b11) + 4,
+                ((MASK as u32 >> 4) & 0b11) + 4,
+                ((MASK as u32 >> 6) & 0b11) + 4,
+                (MASK as u32 & 0b11) + 8,
+                ((MASK as u32 >> 2) & 0b11) + 8,
+                ((MASK as u32 >> 4) & 0b11) + 8,
+                ((MASK as u32 >> 6) & 0b11) + 8,
+                (MASK as u32 & 0b11) + 12,
+                ((MASK as u32 >> 2) & 0b11) + 12,
+                ((MASK as u32 >> 4) & 0b11) + 12,
+                ((MASK as u32 >> 6) & 0b11) + 12,
+            ],
+        );
+        transmute(r)
+    }
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
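+//
+// NOTE: illustrative sketch, not part of the original stdarch sources. The shuffle
+// control is a const generic: the same four 2-bit selectors as SSE2's
+// _mm_shuffle_epi32 are applied independently inside each 128-bit lane, and the
+// _MM_PERM_* constants name the selector patterns. For example (control 0x4E):
+//
+//     // Swap the two 64-bit halves of every 128-bit lane.
+//     let r = _mm512_shuffle_epi32::<_MM_PERM_BADC>(a);
+//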
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_epi32&expand=5148)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_shuffle_epi32::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16()))
+    }
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_epi32&expand=5149)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_shuffle_epi32::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO))
+    }
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_epi32&expand=5145)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_shuffle_epi32::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8()))
+    }
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_epi32&expand=5146)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_shuffle_epi32::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO))
+    }
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shuffle_epi32&expand=5142)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm_shuffle_epi32::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4()))
+    }
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shuffle_epi32&expand=5143)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm_shuffle_epi32::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_ps&expand=5203)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shuffle_ps<const MASK: i32>(a: __m512, b: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            b,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11) + 16,
+                ((MASK as u32 >> 6) & 0b11) + 16,
+                (MASK as u32 & 0b11) + 4,
+                ((MASK as u32 >> 2) & 0b11) + 4,
+                ((MASK as u32 >> 4) & 0b11) + 20,
+                ((MASK as u32 >> 6) & 0b11) + 20,
+                (MASK as u32 & 0b11) + 8,
+                ((MASK as u32 >> 2) & 0b11) + 8,
+                ((MASK as u32 >> 4) & 0b11) + 24,
+                ((MASK as u32 >> 6) & 0b11) + 24,
+                (MASK as u32 & 0b11) + 12,
+                ((MASK as u32 >> 2) & 0b11) + 12,
+                ((MASK as u32 >> 4) & 0b11) + 28,
+                ((MASK as u32 >> 6) & 0b11) + 28,
+            ],
+        )
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
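+//
+// NOTE: illustrative sketch, not part of the original stdarch sources. As in the
+// SSE/AVX shuffles, the low two selectors of MASK pick elements from `a` and the
+// high two pick elements from `b`, repeated in every 128-bit lane; _MM_SHUFFLE
+// builds the 8-bit control from the four 2-bit fields:
+//
+//     // Within each 128-bit lane: dst = [a1, a0, b3, b2].
+//     let r = _mm512_shuffle_ps::<{ _MM_SHUFFLE(2, 3, 0, 1) }>(a, b);
+//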
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_ps&expand=5201)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shuffle_ps<const MASK: i32>(
+    src: __m512,
+    k: __mmask16,
+    a: __m512,
+    b: __m512,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_shuffle_ps::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_ps&expand=5202)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shuffle_ps<const MASK: i32>(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_shuffle_ps::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_ps&expand=5198)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shuffle_ps<const MASK: i32>(
+    src: __m256,
+    k: __mmask8,
+    a: __m256,
+    b: __m256,
+) -> __m256 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_shuffle_ps::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_ps&expand=5199)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shuffle_ps<const MASK: i32>(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_shuffle_ps::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shuffle_ps&expand=5195) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufps, MASK = 3))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_shuffle_ps( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm_shuffle_ps::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shuffle_ps&expand=5196) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufps, MASK = 3))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_shuffle_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm_shuffle_ps::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_pd&expand=5192) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_shuffle_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + simd_shuffle!( + a, + b, + [ + MASK as u32 & 0b1, + ((MASK as u32 >> 1) & 0b1) + 8, + ((MASK as u32 >> 2) & 0b1) + 2, + ((MASK as u32 >> 3) & 0b1) + 10, + ((MASK as u32 >> 4) & 0b1) + 4, + ((MASK as u32 >> 5) & 0b1) + 12, + ((MASK as u32 >> 6) & 0b1) + 6, + ((MASK as u32 >> 7) & 0b1) + 14, + ], + ) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_pd&expand=5190) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_shuffle_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_pd::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
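For `_mm512_shuffle_pd` above, the control byte is consumed one bit per result element: within each 128-bit lane the even element comes from `a` and the odd element from `b`, and lane `j` reads bits `2*j` and `2*j + 1` of `MASK`. A hypothetical scalar sketch of one lane:

```rust
/// Scalar model of one 128-bit lane of `vshufpd`: the even result element is
/// picked from `a` and the odd one from `b`, one control bit each.
fn shuffle_pd_lane(a: [f64; 2], b: [f64; 2], bit_even: u8, bit_odd: u8) -> [f64; 2] {
    [a[(bit_even & 1) as usize], b[(bit_odd & 1) as usize]]
}

fn main() {
    // Control bits (1, 0): take a[1] and b[0].
    assert_eq!(shuffle_pd_lane([1.0, 2.0], [3.0, 4.0], 1, 0), [2.0, 3.0]);
}
```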
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_pd&expand=5191) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_shuffle_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_pd::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_pd&expand=5187) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_shuffle_pd( + src: __m256d, + k: __mmask8, + a: __m256d, + b: __m256d, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_pd::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_pd&expand=5188) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_shuffle_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_pd::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shuffle_pd&expand=5184) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufpd, MASK = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_shuffle_pd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm_shuffle_pd::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shuffle_pd&expand=5185) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufpd, MASK = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_shuffle_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm_shuffle_pd::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x2(), f64x2::ZERO)) + } +} + +/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_i32x4&expand=5177) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_01_01_01))] //should be vshufi32x4 +#[rustc_legacy_const_generics(2)] +pub fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_i32x16(); + let b = b.as_i32x16(); + let r: i32x16 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b11) * 4 + 0, + (MASK as u32 & 0b11) * 4 + 1, + (MASK as u32 & 0b11) * 4 + 2, + (MASK as u32 & 0b11) * 4 + 3, + ((MASK as u32 >> 2) & 0b11) * 4 + 0, + ((MASK as u32 >> 2) & 0b11) * 4 + 1, + ((MASK as u32 >> 2) & 0b11) * 4 + 2, + ((MASK as u32 >> 2) & 0b11) * 4 + 3, + ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16, + ], + ); + transmute(r) + } +} + +/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_i32x4&expand=5175) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_shuffle_i32x4( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_i32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) + } +} + +/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
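`_mm512_shuffle_i32x4` moves whole 128-bit lanes rather than individual elements: result lanes 0-1 are lanes of `a` and result lanes 2-3 are lanes of `b`, each chosen by a 2-bit field of `MASK`. A hypothetical scalar model, with one array entry standing in for a 128-bit lane:

```rust
/// One 128-bit lane, modelled as four i32 elements.
type Lane = [i32; 4];

/// Scalar model of `vshufi32x4`: the result's four lanes are whole lanes
/// copied from the sources, selected by 2-bit fields of `mask`.
fn shuffle_i32x4(a: [Lane; 4], b: [Lane; 4], mask: u8) -> [Lane; 4] {
    [
        a[(mask & 0b11) as usize],
        a[((mask >> 2) & 0b11) as usize],
        b[((mask >> 4) & 0b11) as usize],
        b[((mask >> 6) & 0b11) as usize],
    ]
}

fn main() {
    let a = [[0; 4], [1; 4], [2; 4], [3; 4]];
    let b = [[10; 4], [11; 4], [12; 4], [13; 4]];
    // MASK = 0b01_00_11_10: a's lanes 2 and 3, then b's lanes 0 and 1.
    assert_eq!(shuffle_i32x4(a, b, 0b01_00_11_10), [[2; 4], [3; 4], [10; 4], [11; 4]]);
}
```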
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_i32x4&expand=5176) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_shuffle_i32x4( + k: __mmask16, + a: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_i32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO)) + } +} + +/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_i32x4&expand=5174) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm, MASK = 0b11))] //should be vshufi32x4 +#[rustc_legacy_const_generics(2)] +pub fn _mm256_shuffle_i32x4(a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let r: i32x8 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b1) * 4 + 0, + (MASK as u32 & 0b1) * 4 + 1, + (MASK as u32 & 0b1) * 4 + 2, + (MASK as u32 & 0b1) * 4 + 3, + ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8, + ], + ); + transmute(r) + } +} + +/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_i32x4&expand=5172) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_shuffle_i32x4( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_i32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) + } +} + +/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_i32x4&expand=5173) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_shuffle_i32x4(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_i32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO)) + } +} + +/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_i64x2&expand=5183) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_i64x8(); + let b = b.as_i64x8(); + let r: i64x8 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b11) * 2 + 0, + (MASK as u32 & 0b11) * 2 + 1, + ((MASK as u32 >> 2) & 0b11) * 2 + 0, + ((MASK as u32 >> 2) & 0b11) * 2 + 1, + ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8, + ], + ); + transmute(r) + } +} + +/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_i64x2&expand=5181) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_shuffle_i64x2( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_i64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) + } +} + +/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_i64x2&expand=5182) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_shuffle_i64x2(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_i64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO)) + } +} + +/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst. 
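A minimal end-to-end sketch of `_mm512_shuffle_i64x2`, assuming an AVX-512 capable CPU at runtime and a toolchain on which these intrinsics and safe `#[target_feature]` functions are stable (per the `since = "1.89"` attributes above); the `demo_shuffle_i64x2` wrapper is hypothetical:

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
fn demo_shuffle_i64x2() -> [i64; 8] {
    use core::arch::x86_64::*;
    let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
    let b = _mm512_setr_epi64(10, 11, 12, 13, 14, 15, 16, 17);
    // MASK = 0b00_01_10_11: result lanes 0-1 are a's 128-bit lanes 3 and 2,
    // result lanes 2-3 are b's lanes 1 and 0.
    let r = _mm512_shuffle_i64x2::<0b00_01_10_11>(a, b);
    // Reinterpret the 512-bit register as eight i64 values for inspection.
    unsafe { core::mem::transmute(r) }
}

#[cfg(target_arch = "x86_64")]
fn main() {
    if is_x86_feature_detected!("avx512f") {
        assert_eq!(unsafe { demo_shuffle_i64x2() }, [6, 7, 4, 5, 12, 13, 10, 11]);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```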
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_i64x2&expand=5180) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshufi64x2 +#[rustc_legacy_const_generics(2)] +pub fn _mm256_shuffle_i64x2(a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_i64x4(); + let b = b.as_i64x4(); + let r: i64x4 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b1) * 2 + 0, + (MASK as u32 & 0b1) * 2 + 1, + ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4, + ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4, + ], + ); + transmute(r) + } +} + +/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_i64x2&expand=5178) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_shuffle_i64x2( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_i64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) + } +} + +/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_i64x2&expand=5179) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_shuffle_i64x2(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_i64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO)) + } +} + +/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. 
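The 256-bit `avx512vl` forms above have only two 128-bit lanes per source, so the control collapses to one bit per selected lane. A hypothetical scalar sketch, with each array entry standing in for a whole 128-bit lane:

```rust
/// Scalar model of the 256-bit `vshufi32x4`/`vshufi64x2` forms: the low
/// control bit picks the lane taken from `a`, the next bit the lane from `b`.
fn shuffle_x2<L: Copy>(a: [L; 2], b: [L; 2], mask: u8) -> [L; 2] {
    [a[(mask & 1) as usize], b[((mask >> 1) & 1) as usize]]
}

fn main() {
    // Each "lane" here stands for one 128-bit lane, e.g. two i64 values.
    let a = [[0_i64, 1], [2, 3]];
    let b = [[10_i64, 11], [12, 13]];
    // mask = 0b10: lane 0 of a, then lane 1 of b.
    assert_eq!(shuffle_x2(a, b, 0b10), [[0, 1], [12, 13]]);
}
```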
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_f32x4&expand=5165) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b1011))] //should be vshuff32x4, but generate vshuff64x2 +#[rustc_legacy_const_generics(2)] +pub fn _mm512_shuffle_f32x4(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r: f32x16 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b11) * 4 + 0, + (MASK as u32 & 0b11) * 4 + 1, + (MASK as u32 & 0b11) * 4 + 2, + (MASK as u32 & 0b11) * 4 + 3, + ((MASK as u32 >> 2) & 0b11) * 4 + 0, + ((MASK as u32 >> 2) & 0b11) * 4 + 1, + ((MASK as u32 >> 2) & 0b11) * 4 + 2, + ((MASK as u32 >> 2) & 0b11) * 4 + 3, + ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16, + ], + ); + transmute(r) + } +} + +/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_f32x4&expand=5163) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_shuffle_f32x4( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_f32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16())) + } +} + +/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_f32x4&expand=5164) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_f32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO)) + } +} + +/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_f32x4&expand=5162) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff32x4 +#[rustc_legacy_const_generics(2)] +pub fn _mm256_shuffle_f32x4(a: __m256, b: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let r: f32x8 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b1) * 4 + 0, + (MASK as u32 & 0b1) * 4 + 1, + (MASK as u32 & 0b1) * 4 + 2, + (MASK as u32 & 0b1) * 4 + 3, + ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8, + ], + ); + transmute(r) + } +} + +/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_f32x4&expand=5160) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_shuffle_f32x4( + src: __m256, + k: __mmask8, + a: __m256, + b: __m256, +) -> __m256 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_f32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) + } +} + +/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_f32x4&expand=5161) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_shuffle_f32x4(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_f32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO)) + } +} + +/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_f64x2&expand=5171) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r: f64x8 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b11) * 2 + 0, + (MASK as u32 & 0b11) * 2 + 1, + ((MASK as u32 >> 2) & 0b11) * 2 + 0, + ((MASK as u32 >> 2) & 0b11) * 2 + 1, + ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8, + ], + ); + transmute(r) + } +} + +/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_f64x2&expand=5169) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_shuffle_f64x2( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_f64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8())) + } +} + +/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_f64x2&expand=5170) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_shuffle_f64x2(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_f64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO)) + } +} + +/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_f64x2&expand=5168) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff64x2 +#[rustc_legacy_const_generics(2)] +pub fn _mm256_shuffle_f64x2(a: __m256d, b: __m256d) -> __m256d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let r: f64x4 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b1) * 2 + 0, + (MASK as u32 & 0b1) * 2 + 1, + ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4, + ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4, + ], + ); + transmute(r) + } +} + +/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_f64x2&expand=5166) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_shuffle_f64x2( + src: __m256d, + k: __mmask8, + a: __m256d, + b: __m256d, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_f64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) + } +} + +/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_f64x2&expand=5167) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_shuffle_f64x2(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_f64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO)) + } +} + +/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extractf32x4_ps&expand=2442)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vextractf32x4, IMM8 = 3))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_extractf32x4_ps<const IMM8: i32>(a: __m512) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        match IMM8 & 0x3 {
+            0 => simd_shuffle!(a, _mm512_undefined_ps(), [0, 1, 2, 3]),
+            1 => simd_shuffle!(a, _mm512_undefined_ps(), [4, 5, 6, 7]),
+            2 => simd_shuffle!(a, _mm512_undefined_ps(), [8, 9, 10, 11]),
+            _ => simd_shuffle!(a, _mm512_undefined_ps(), [12, 13, 14, 15]),
+        }
+    }
+}
+
+/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extractf32x4_ps&expand=2443)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vextractf32x4, IMM8 = 3))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_extractf32x4_ps<const IMM8: i32>(src: __m128, k: __mmask8, a: __m512) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let r = _mm512_extractf32x4_ps::<IMM8>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4()))
+    }
+}
+
+/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extractf32x4_ps&expand=2444)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vextractf32x4, IMM8 = 3))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_extractf32x4_ps<const IMM8: i32>(k: __mmask8, a: __m512) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let r = _mm512_extractf32x4_ps::<IMM8>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO))
+    }
+}
+
+/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf32x4_ps&expand=2439)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(
+    test,
+    assert_instr(vextract, IMM8 = 1) // should be vextractf32x4
+)]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm256_extractf32x4_ps<const IMM8: i32>(a: __m256) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        match IMM8 & 0x1 {
+            0 => simd_shuffle!(a, _mm256_undefined_ps(), [0, 1, 2, 3]),
+            _ => simd_shuffle!(a, _mm256_undefined_ps(), [4, 5, 6, 7]),
+        }
+    }
+}
+
+/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
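A short usage sketch for `_mm512_extractf32x4_ps`, under the same assumptions as the earlier demo (AVX-512 hardware at runtime, intrinsics stable per the attributes above); the `demo_extract` wrapper is hypothetical:

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
fn demo_extract() -> [f32; 4] {
    use core::arch::x86_64::*;
    let a = _mm512_setr_ps(
        0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
    );
    // IMM8 = 3 selects the highest 128-bit lane, i.e. elements 12..=15.
    let hi: __m128 = _mm512_extractf32x4_ps::<3>(a);
    // Reinterpret the 128-bit register as four f32 values for inspection.
    unsafe { core::mem::transmute(hi) }
}

#[cfg(target_arch = "x86_64")]
fn main() {
    if is_x86_feature_detected!("avx512f") {
        assert_eq!(unsafe { demo_extract() }, [12.0, 13.0, 14.0, 15.0]);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```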
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_extractf32x4_ps&expand=2440) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextractf32x4, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_extractf32x4_ps(src: __m128, k: __mmask8, a: __m256) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm256_extractf32x4_ps::(a); + transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4())) + } +} + +/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_extractf32x4_ps&expand=2441) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextractf32x4, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_maskz_extractf32x4_ps(k: __mmask8, a: __m256) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm256_extractf32x4_ps::(a); + transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO)) + } +} + +/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extracti64x4_epi64&expand=2473) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + test, + assert_instr(vextractf64x4, IMM1 = 1) //should be vextracti64x4 +)] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_extracti64x4_epi64(a: __m512i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + match IMM1 { + 0 => simd_shuffle!(a, _mm512_setzero_si512(), [0, 1, 2, 3]), + _ => simd_shuffle!(a, _mm512_setzero_si512(), [4, 5, 6, 7]), + } + } +} + +/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extracti64x4_epi64&expand=2474) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextracti64x4, IMM1 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_extracti64x4_epi64( + src: __m256i, + k: __mmask8, + a: __m512i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + let r = _mm512_extracti64x4_epi64::(a); + transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) + } +} + +/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
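The 256-bit extracts such as `_mm512_extracti64x4_epi64` take a single control bit: `0` returns the low 256 bits of `a`, `1` the high 256 bits. A hypothetical scalar sketch:

```rust
/// Scalar model of the 256-bit extracts: IMM1 = 0 -> low half, 1 -> high half.
fn extract_256<T: Copy>(a: [T; 8], imm1: usize) -> [T; 4] {
    let base = (imm1 & 1) * 4;
    [a[base], a[base + 1], a[base + 2], a[base + 3]]
}

fn main() {
    let a: [i64; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
    assert_eq!(extract_256(a, 1), [4, 5, 6, 7]);
}
```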
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extracti64x4_epi64&expand=2475) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextracti64x4, IMM1 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_extracti64x4_epi64(k: __mmask8, a: __m512i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + let r = _mm512_extracti64x4_epi64::(a); + transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO)) + } +} + +/// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extractf64x4_pd&expand=2454) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextractf64x4, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_extractf64x4_pd(a: __m512d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + match IMM8 & 0x1 { + 0 => simd_shuffle!(a, _mm512_undefined_pd(), [0, 1, 2, 3]), + _ => simd_shuffle!(a, _mm512_undefined_pd(), [4, 5, 6, 7]), + } + } +} + +/// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extractf64x4_pd&expand=2455) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextractf64x4, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_extractf64x4_pd( + src: __m256d, + k: __mmask8, + a: __m512d, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm512_extractf64x4_pd::(a); + transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) + } +} + +/// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extractf64x4_pd&expand=2456) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextractf64x4, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_extractf64x4_pd(k: __mmask8, a: __m512d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm512_extractf64x4_pd::(a); + transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO)) + } +} + +/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the result in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extracti32x4_epi32&expand=2461) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + test, + assert_instr(vextractf32x4, IMM2 = 3) //should be vextracti32x4 +)] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_extracti32x4_epi32(a: __m512i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM2, 2); + let a = a.as_i32x16(); + let zero = i32x16::ZERO; + let extract: i32x4 = match IMM2 { + 0 => simd_shuffle!(a, zero, [0, 1, 2, 3]), + 1 => simd_shuffle!(a, zero, [4, 5, 6, 7]), + 2 => simd_shuffle!(a, zero, [8, 9, 10, 11]), + _ => simd_shuffle!(a, zero, [12, 13, 14, 15]), + }; + transmute(extract) + } +} + +/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extracti32x4_epi32&expand=2462) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextracti32x4, IMM2 = 3))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_extracti32x4_epi32( + src: __m128i, + k: __mmask8, + a: __m512i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM2, 2); + let r = _mm512_extracti32x4_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) + } +} + +/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extracti32x4_epi32&expand=2463) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextracti32x4, IMM2 = 3))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_extracti32x4_epi32(k: __mmask8, a: __m512i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM2, 2); + let r = _mm512_extracti32x4_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) + } +} + +/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti32x4_epi32&expand=2458) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + test, + assert_instr(vextract, IMM1 = 1) //should be vextracti32x4 +)] +#[rustc_legacy_const_generics(1)] +pub fn _mm256_extracti32x4_epi32(a: __m256i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + let a = a.as_i32x8(); + let zero = i32x8::ZERO; + let extract: i32x4 = match IMM1 { + 0 => simd_shuffle!(a, zero, [0, 1, 2, 3]), + _ => simd_shuffle!(a, zero, [4, 5, 6, 7]), + }; + transmute(extract) + } +} + +/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_extracti32x4_epi32&expand=2459) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextracti32x4, IMM1 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_extracti32x4_epi32( + src: __m128i, + k: __mmask8, + a: __m256i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + let r = _mm256_extracti32x4_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) + } +} + +/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_extracti32x4_epi32&expand=2460) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextracti32x4, IMM1 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_maskz_extracti32x4_epi32(k: __mmask8, a: __m256i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + let r = _mm256_extracti32x4_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) + } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_moveldup_ps&expand=3862) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub fn _mm512_moveldup_ps(a: __m512) -> __m512 { + unsafe { + let r: f32x16 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); + transmute(r) + } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_moveldup_ps&expand=3860) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov: f32x16 = + simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); + transmute(simd_select_bitmask(k, mov, src.as_f32x16())) + } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
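`_mm512_moveldup_ps` duplicates each even-indexed element into the odd slot next to it, which is exactly the `[0, 0, 2, 2, ...]` index pattern in the shuffle above (the `vmovshdup` family further below does the same with the odd-indexed elements). A hypothetical scalar sketch:

```rust
/// Scalar model of `vmovsldup`: [a0, a0, a2, a2, a4, a4, ...].
fn moveldup(a: [f32; 16]) -> [f32; 16] {
    let mut out = [0.0f32; 16];
    for i in 0..16 {
        out[i] = a[i & !1]; // clear the low bit: indices 0, 0, 2, 2, 4, 4, ...
    }
    out
}

fn main() {
    let a: [f32; 16] = core::array::from_fn(|i| i as f32);
    assert_eq!(&moveldup(a)[..4], &[0.0, 0.0, 2.0, 2.0]);
}
```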
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_moveldup_ps&expand=3861) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub fn _mm512_maskz_moveldup_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov: f32x16 = + simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); + transmute(simd_select_bitmask(k, mov, f32x16::ZERO)) + } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_moveldup_ps&expand=3857) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub fn _mm256_mask_moveldup_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = _mm256_moveldup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x8(), src.as_f32x8())) + } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_moveldup_ps&expand=3858) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub fn _mm256_maskz_moveldup_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = _mm256_moveldup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x8(), f32x8::ZERO)) + } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_moveldup_ps&expand=3854) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub fn _mm_mask_moveldup_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = _mm_moveldup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x4(), src.as_f32x4())) + } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_moveldup_ps&expand=3855) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub fn _mm_maskz_moveldup_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = _mm_moveldup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x4(), f32x4::ZERO)) + } +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movehdup_ps&expand=3852) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub fn _mm512_movehdup_ps(a: __m512) -> __m512 { + unsafe { + let r: f32x16 = simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]); + transmute(r) + } +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_movehdup_ps&expand=3850) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov: f32x16 = + simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]); + transmute(simd_select_bitmask(k, mov, src.as_f32x16())) + } +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_movehdup_ps&expand=3851) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub fn _mm512_maskz_movehdup_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov: f32x16 = + simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]); + transmute(simd_select_bitmask(k, mov, f32x16::ZERO)) + } +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_movehdup_ps&expand=3847) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub fn _mm256_mask_movehdup_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = _mm256_movehdup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x8(), src.as_f32x8())) + } +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
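A sketch combining `vmovshdup` with a writemask, under the same assumptions as the earlier demos (AVX-512 hardware, intrinsics stable per the attributes above); `demo_mask_movehdup` is a hypothetical wrapper. Only the elements whose k-bit is set receive the duplicated values; the rest keep `src`:

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
fn demo_mask_movehdup() -> [f32; 16] {
    use core::arch::x86_64::*;
    let src = _mm512_set1_ps(-1.0);
    let a = _mm512_setr_ps(
        0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
    );
    // Only the low four mask bits are set, so elements 0..4 take the
    // duplicated odd values [1, 1, 3, 3]; elements 4..16 keep `src` (-1.0).
    let r = _mm512_mask_movehdup_ps(src, 0b0000_0000_0000_1111, a);
    unsafe { core::mem::transmute(r) }
}

#[cfg(target_arch = "x86_64")]
fn main() {
    if is_x86_feature_detected!("avx512f") {
        let r = unsafe { demo_mask_movehdup() };
        let mut expected = [-1.0f32; 16];
        expected[..4].copy_from_slice(&[1.0, 1.0, 3.0, 3.0]);
        assert_eq!(r, expected);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```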
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_movehdup_ps&expand=3848) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub fn _mm256_maskz_movehdup_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = _mm256_movehdup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x8(), f32x8::ZERO)) + } +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_movehdup_ps&expand=3844) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub fn _mm_mask_movehdup_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = _mm_movehdup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x4(), src.as_f32x4())) + } +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_movehdup_ps&expand=3845) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub fn _mm_maskz_movehdup_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = _mm_movehdup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x4(), f32x4::ZERO)) + } +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movedup_pd&expand=3843) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub fn _mm512_movedup_pd(a: __m512d) -> __m512d { + unsafe { + let r: f64x8 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); + transmute(r) + } +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_movedup_pd&expand=3841) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { + let mov: f64x8 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); + transmute(simd_select_bitmask(k, mov, src.as_f64x8())) + } +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_movedup_pd&expand=3842) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + let mov: f64x8 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); + transmute(simd_select_bitmask(k, mov, f64x8::ZERO)) + } +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_movedup_pd&expand=3838) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub fn _mm256_mask_movedup_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { + let mov = _mm256_movedup_pd(a); + transmute(simd_select_bitmask(k, mov.as_f64x4(), src.as_f64x4())) + } +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_movedup_pd&expand=3839) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub fn _mm256_maskz_movedup_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { + let mov = _mm256_movedup_pd(a); + transmute(simd_select_bitmask(k, mov.as_f64x4(), f64x4::ZERO)) + } +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_movedup_pd&expand=3835) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub fn _mm_mask_movedup_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { + let mov = _mm_movedup_pd(a); + transmute(simd_select_bitmask(k, mov.as_f64x2(), src.as_f64x2())) + } +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_movedup_pd&expand=3836) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub fn _mm_maskz_movedup_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { + let mov = _mm_movedup_pd(a); + transmute(simd_select_bitmask(k, mov.as_f64x2(), f64x2::ZERO)) + } +} + +/// Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_inserti32x4&expand=3174)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))] //should be vinserti32x4
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_inserti32x4<const IMM8: i32>(a: __m512i, b: __m128i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let a = a.as_i32x16();
+        let b = _mm512_castsi128_si512(b).as_i32x16();
+        let ret: i32x16 = match IMM8 & 0b11 {
+            0 => {
+                simd_shuffle!(
+                    a,
+                    b,
+                    [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+                )
+            }
+            1 => {
+                simd_shuffle!(
+                    a,
+                    b,
+                    [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
+                )
+            }
+            2 => {
+                simd_shuffle!(
+                    a,
+                    b,
+                    [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
+                )
+            }
+            _ => {
+                simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19])
+            }
+        };
+        transmute(ret)
+    }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_inserti32x4&expand=3175)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 2))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_inserti32x4<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+    b: __m128i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let r = _mm512_inserti32x4::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16()))
+    }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_inserti32x4&expand=3176)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 2))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_inserti32x4<const IMM8: i32>(k: __mmask16, a: __m512i, b: __m128i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let r = _mm512_inserti32x4::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO))
+    }
+}
+
+/// Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti32x4&expand=3171)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(
+    test,
+    assert_instr(vinsert, IMM8 = 1) //should be vinserti32x4
+)]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_inserti32x4<const IMM8: i32>(a: __m256i, b: __m128i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let a = a.as_i32x8();
+        let b = _mm256_castsi128_si256(b).as_i32x8();
+        let ret: i32x8 = match IMM8 & 0b1 {
+            0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
+            _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
+        };
+        transmute(ret)
+    }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_inserti32x4&expand=3172)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_inserti32x4<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m128i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let r = _mm256_inserti32x4::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8()))
+    }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_inserti32x4&expand=3173)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_inserti32x4<const IMM8: i32>(k: __mmask8, a: __m256i, b: __m128i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let r = _mm256_inserti32x4::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO))
+    }
+}
+
+/// Copy a to dst, then insert 256 bits (composed of 4 packed 64-bit integers) from b into dst at the location specified by imm8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_inserti64x4&expand=3186)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))] //should be vinserti64x4
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_inserti64x4<const IMM8: i32>(a: __m512i, b: __m256i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let b = _mm512_castsi256_si512(b);
+        match IMM8 & 0b1 {
+            0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
+            _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
+        }
+    }
+}
+
+/// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_inserti64x4&expand=3187)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinserti64x4, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_inserti64x4<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512i,
+    b: __m256i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let r = _mm512_inserti64x4::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
+    }
+}
+
+/// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_inserti64x4&expand=3188)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinserti64x4, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_inserti64x4<const IMM8: i32>(k: __mmask8, a: __m512i, b: __m256i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let r = _mm512_inserti64x4::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO))
+    }
+}
+
+/// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_insertf32x4&expand=3155)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_insertf32x4<const IMM8: i32>(a: __m512, b: __m128) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let b = _mm512_castps128_ps512(b);
+        match IMM8 & 0b11 {
+            0 => {
+                simd_shuffle!(
+                    a,
+                    b,
+                    [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+                )
+            }
+            1 => {
+                simd_shuffle!(
+                    a,
+                    b,
+                    [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
+                )
+            }
+            2 => {
+                simd_shuffle!(
+                    a,
+                    b,
+                    [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
+                )
+            }
+            _ => {
+                simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19])
+            }
+        }
+    }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
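+///
+/// A minimal usage sketch (an illustrative addition, not taken from Intel's
+/// documentation). Result lanes whose mask bit is clear keep the value from
+/// `src`:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         // SAFETY: avx512f support was verified at runtime above.
+///         unsafe {
+///             let src = _mm512_set1_ps(-1.0);
+///             let a = _mm512_set1_ps(0.0);
+///             let b = _mm_set1_ps(9.0);
+///             // Insert `b` into the second 128-bit lane, then keep only
+///             // lanes 4..8 of the intermediate result (mask 0x00F0).
+///             let r = _mm512_mask_insertf32x4::<1>(src, 0x00F0, a, b);
+///             let lanes: [f32; 16] = core::mem::transmute(r);
+///             assert_eq!(lanes[4], 9.0);
+///             assert_eq!(lanes[0], -1.0);
+///             assert_eq!(lanes[8], -1.0);
+///         }
+///     }
+/// }
+/// ```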
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_insertf32x4&expand=3156)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_insertf32x4<const IMM8: i32>(
+    src: __m512,
+    k: __mmask16,
+    a: __m512,
+    b: __m128,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let r = _mm512_insertf32x4::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
+    }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_insertf32x4&expand=3157)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_insertf32x4<const IMM8: i32>(k: __mmask16, a: __m512, b: __m128) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 2);
+        let r = _mm512_insertf32x4::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO))
+    }
+}
+
+/// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf32x4&expand=3152)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(
+    test,
+    assert_instr(vinsert, IMM8 = 1) //should be vinsertf32x4
+)]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_insertf32x4<const IMM8: i32>(a: __m256, b: __m128) -> __m256 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let b = _mm256_castps128_ps256(b);
+        match IMM8 & 0b1 {
+            0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
+            _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
+        }
+    }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_insertf32x4&expand=3153)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_insertf32x4<const IMM8: i32>(
+    src: __m256,
+    k: __mmask8,
+    a: __m256,
+    b: __m128,
+) -> __m256 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let r = _mm256_insertf32x4::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
+    }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_insertf32x4&expand=3154)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_insertf32x4<const IMM8: i32>(k: __mmask8, a: __m256, b: __m128) -> __m256 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let r = _mm256_insertf32x4::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO))
+    }
+}
+
+/// Copy a to dst, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_insertf64x4&expand=3167)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_insertf64x4<const IMM8: i32>(a: __m512d, b: __m256d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let b = _mm512_castpd256_pd512(b);
+        match IMM8 & 0b1 {
+            0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
+            _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
+        }
+    }
+}
+
+/// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_insertf64x4&expand=3168)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_insertf64x4<const IMM8: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m256d,
+) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let r = _mm512_insertf64x4::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
+    }
+}
+
+/// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_insertf64x4&expand=3169)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_insertf64x4<const IMM8: i32>(k: __mmask8, a: __m512d, b: __m256d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 1);
+        let r = _mm512_insertf64x4::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO))
+    }
+}
+
+/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
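+///
+/// A minimal usage sketch (an illustrative addition, not taken from Intel's
+/// documentation). Within every 128-bit lane the two high elements of `a`
+/// and `b` are interleaved:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         // SAFETY: avx512f support was verified at runtime above.
+///         unsafe {
+///             let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+///             let b = _mm512_setr_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+///             let r = _mm512_unpackhi_epi32(a, b);
+///             let lanes: [i32; 16] = core::mem::transmute(r);
+///             // First 128-bit lane of the result: a[2], b[2], a[3], b[3].
+///             assert_eq!(&lanes[..4], &[2, 18, 3, 19]);
+///         }
+///     }
+/// }
+/// ```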
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_epi32&expand=6021) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] //should be vpunpckhdq +pub fn _mm512_unpackhi_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i32x16(); + let b = b.as_i32x16(); + #[rustfmt::skip] + let r: i32x16 = simd_shuffle!( + a, b, + [ 2, 18, 3, 19, + 2 + 4, 18 + 4, 3 + 4, 19 + 4, + 2 + 8, 18 + 8, 3 + 8, 19 + 8, + 2 + 12, 18 + 12, 3 + 12, 19 + 12], + ); + transmute(r) + } +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_epi32&expand=6019) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub fn _mm512_mask_unpackhi_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i32x16())) + } +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_epi32&expand=6020) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, unpackhi, i32x16::ZERO)) + } +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_epi32&expand=6016) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub fn _mm256_mask_unpackhi_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i32x8())) + } +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_epi32&expand=6017) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub fn _mm256_maskz_unpackhi_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, unpackhi, i32x8::ZERO)) + } +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_epi32&expand=6013) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub fn _mm_mask_unpackhi_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i32x4())) + } +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_epi32&expand=6014) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub fn _mm_maskz_unpackhi_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, unpackhi, i32x4::ZERO)) + } +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_epi64&expand=6030) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] //should be vpunpckhqdq +pub fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { simd_shuffle!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6]) } +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
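+///
+/// A minimal usage sketch (an illustrative addition, not taken from Intel's
+/// documentation):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         // SAFETY: avx512f support was verified at runtime above.
+///         unsafe {
+///             let src = _mm512_set1_epi64(-1);
+///             let a = _mm512_set1_epi64(1);
+///             let b = _mm512_set1_epi64(2);
+///             // Only the two lowest mask bits are set, so only the first two
+///             // interleaved elements are taken; the rest come from `src`.
+///             let r = _mm512_mask_unpackhi_epi64(src, 0b0000_0011, a, b);
+///             let lanes: [i64; 8] = core::mem::transmute(r);
+///             assert_eq!(lanes, [1, 2, -1, -1, -1, -1, -1, -1]);
+///         }
+///     }
+/// }
+/// ```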
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_epi64&expand=6028) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub fn _mm512_mask_unpackhi_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i64x8())) + } +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_epi64&expand=6029) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, unpackhi, i64x8::ZERO)) + } +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_epi64&expand=6025) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub fn _mm256_mask_unpackhi_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i64x4())) + } +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_epi64&expand=6026) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub fn _mm256_maskz_unpackhi_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, unpackhi, i64x4::ZERO)) + } +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_epi64&expand=6022) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub fn _mm_mask_unpackhi_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i64x2())) + } +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_epi64&expand=6023) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub fn _mm_maskz_unpackhi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, unpackhi, i64x2::ZERO)) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_ps&expand=6060) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub fn _mm512_unpackhi_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + #[rustfmt::skip] + simd_shuffle!( + a, b, + [ 2, 18, 3, 19, + 2 + 4, 18 + 4, 3 + 4, 19 + 4, + 2 + 8, 18 + 8, 3 + 8, 19 + 8, + 2 + 12, 18 + 12, 3 + 12, 19 + 12], + ) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_ps&expand=6058) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub fn _mm512_mask_unpackhi_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f32x16())) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_ps&expand=6059) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, unpackhi, f32x16::ZERO)) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_ps&expand=6055) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub fn _mm256_mask_unpackhi_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let unpackhi = _mm256_unpackhi_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f32x8())) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_ps&expand=6056) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub fn _mm256_maskz_unpackhi_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let unpackhi = _mm256_unpackhi_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, unpackhi, f32x8::ZERO)) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_ps&expand=6052) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub fn _mm_mask_unpackhi_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let unpackhi = _mm_unpackhi_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f32x4())) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_ps&expand=6053) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub fn _mm_maskz_unpackhi_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let unpackhi = _mm_unpackhi_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, unpackhi, f32x4::ZERO)) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_pd&expand=6048) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { simd_shuffle!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6]) } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_pd&expand=6046) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub fn _mm512_mask_unpackhi_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f64x8())) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_pd&expand=6047) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, unpackhi, f64x8::ZERO)) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_pd&expand=6043) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub fn _mm256_mask_unpackhi_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let unpackhi = _mm256_unpackhi_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f64x4())) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_pd&expand=6044) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub fn _mm256_maskz_unpackhi_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let unpackhi = _mm256_unpackhi_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, unpackhi, f64x4::ZERO)) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_pd&expand=6040) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub fn _mm_mask_unpackhi_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let unpackhi = _mm_unpackhi_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f64x2())) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_pd&expand=6041) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub fn _mm_maskz_unpackhi_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let unpackhi = _mm_unpackhi_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, unpackhi, f64x2::ZERO)) + } +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. 
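+///
+/// A minimal usage sketch (an illustrative addition, not taken from Intel's
+/// documentation). Within every 128-bit lane the two low elements of `a` and
+/// `b` are interleaved:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         // SAFETY: avx512f support was verified at runtime above.
+///         unsafe {
+///             let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+///             let b = _mm512_setr_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+///             let r = _mm512_unpacklo_epi32(a, b);
+///             let lanes: [i32; 16] = core::mem::transmute(r);
+///             // First 128-bit lane of the result: a[0], b[0], a[1], b[1].
+///             assert_eq!(&lanes[..4], &[0, 16, 1, 17]);
+///         }
+///     }
+/// }
+/// ```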
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_epi32&expand=6078) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] //should be vpunpckldq +pub fn _mm512_unpacklo_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i32x16(); + let b = b.as_i32x16(); + #[rustfmt::skip] + let r: i32x16 = simd_shuffle!( + a, b, + [ 0, 16, 1, 17, + 0 + 4, 16 + 4, 1 + 4, 17 + 4, + 0 + 8, 16 + 8, 1 + 8, 17 + 8, + 0 + 12, 16 + 12, 1 + 12, 17 + 12], + ); + transmute(r) + } +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_epi32&expand=6076) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub fn _mm512_mask_unpacklo_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i32x16())) + } +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_epi32&expand=6077) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub fn _mm512_maskz_unpacklo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, unpacklo, i32x16::ZERO)) + } +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_epi32&expand=6073) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub fn _mm256_mask_unpacklo_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i32x8())) + } +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_epi32&expand=6074) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub fn _mm256_maskz_unpacklo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, unpacklo, i32x8::ZERO)) + } +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_epi32&expand=6070) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub fn _mm_mask_unpacklo_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i32x4())) + } +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_epi32&expand=6071) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub fn _mm_maskz_unpacklo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, unpacklo, i32x4::ZERO)) + } +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_epi64&expand=6087) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] //should be vpunpcklqdq +pub fn _mm512_unpacklo_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { simd_shuffle!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6]) } +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_epi64&expand=6085) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub fn _mm512_mask_unpacklo_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i64x8())) + } +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_epi64&expand=6086) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub fn _mm512_maskz_unpacklo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, unpacklo, i64x8::ZERO)) + } +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_epi64&expand=6082) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub fn _mm256_mask_unpacklo_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i64x4())) + } +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_epi64&expand=6083) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub fn _mm256_maskz_unpacklo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, unpacklo, i64x4::ZERO)) + } +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_epi64&expand=6079) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub fn _mm_mask_unpacklo_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i64x2())) + } +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_epi64&expand=6080) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub fn _mm_maskz_unpacklo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, unpacklo, i64x2::ZERO)) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_ps&expand=6117) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub fn _mm512_unpacklo_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + #[rustfmt::skip] + simd_shuffle!(a, b, + [ 0, 16, 1, 17, + 0 + 4, 16 + 4, 1 + 4, 17 + 4, + 0 + 8, 16 + 8, 1 + 8, 17 + 8, + 0 + 12, 16 + 12, 1 + 12, 17 + 12], + ) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_ps&expand=6115) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub fn _mm512_mask_unpacklo_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f32x16())) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
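+///
+/// A minimal usage sketch (an illustrative addition, not taken from Intel's
+/// documentation):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         // SAFETY: avx512f support was verified at runtime above.
+///         unsafe {
+///             let a = _mm512_set1_ps(1.0);
+///             let b = _mm512_set1_ps(2.0);
+///             // Zero every result lane except the lowest four.
+///             let r = _mm512_maskz_unpacklo_ps(0b0000_0000_0000_1111, a, b);
+///             let lanes: [f32; 16] = core::mem::transmute(r);
+///             assert_eq!(&lanes[..4], &[1.0, 2.0, 1.0, 2.0]);
+///             assert_eq!(lanes[4], 0.0);
+///         }
+///     }
+/// }
+/// ```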
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_ps&expand=6116) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub fn _mm512_maskz_unpacklo_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, unpacklo, f32x16::ZERO)) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_ps&expand=6112) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub fn _mm256_mask_unpacklo_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let unpacklo = _mm256_unpacklo_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f32x8())) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_ps&expand=6113) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub fn _mm256_maskz_unpacklo_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let unpacklo = _mm256_unpacklo_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, unpacklo, f32x8::ZERO)) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_ps&expand=6109) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub fn _mm_mask_unpacklo_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let unpacklo = _mm_unpacklo_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f32x4())) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_ps&expand=6110) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub fn _mm_maskz_unpacklo_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let unpacklo = _mm_unpacklo_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, unpacklo, f32x4::ZERO)) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_pd&expand=6105) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub fn _mm512_unpacklo_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { simd_shuffle!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6]) } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_pd&expand=6103) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub fn _mm512_mask_unpacklo_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f64x8())) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_pd&expand=6104) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub fn _mm512_maskz_unpacklo_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, unpacklo, f64x8::ZERO)) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
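+///
+/// A minimal usage sketch (an illustrative addition, not taken from Intel's
+/// documentation; note that the 256-bit masked form requires `avx512vl` in
+/// addition to `avx512f`):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+///         // SAFETY: both required features were verified at runtime above.
+///         unsafe {
+///             let src = _mm256_set1_pd(-1.0);
+///             let a = _mm256_set1_pd(1.0);
+///             let b = _mm256_set1_pd(2.0);
+///             // Keep the two low interleaved elements, take the rest from `src`.
+///             let r = _mm256_mask_unpacklo_pd(src, 0b0011, a, b);
+///             let lanes: [f64; 4] = core::mem::transmute(r);
+///             assert_eq!(lanes, [1.0, 2.0, -1.0, -1.0]);
+///         }
+///     }
+/// }
+/// ```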
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_pd&expand=6100) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub fn _mm256_mask_unpacklo_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let unpacklo = _mm256_unpacklo_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f64x4())) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_pd&expand=6101) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub fn _mm256_maskz_unpacklo_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let unpacklo = _mm256_unpacklo_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, unpacklo, f64x4::ZERO)) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_pd&expand=6097) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub fn _mm_mask_unpacklo_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let unpacklo = _mm_unpacklo_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f64x2())) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_pd&expand=6098) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub fn _mm_maskz_unpacklo_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let unpacklo = _mm_unpacklo_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, unpacklo, f64x2::ZERO)) + } +} + +/// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
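+///
+/// A minimal usage sketch (an illustrative addition, not taken from Intel's
+/// documentation). Only the low 128 bits are meaningful afterwards; the upper
+/// lanes are unspecified, so they are not checked here (use
+/// `_mm512_zextps128_ps512` when zeroed upper lanes are required):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         // SAFETY: avx512f support was verified at runtime above.
+///         unsafe {
+///             let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+///             let r = _mm512_castps128_ps512(a);
+///             let lanes: [f32; 16] = core::mem::transmute(r);
+///             assert_eq!(&lanes[..4], &[1.0, 2.0, 3.0, 4.0]);
+///         }
+///     }
+/// }
+/// ```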
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps128_ps512&expand=621) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castps128_ps512(a: __m128) -> __m512 { + unsafe { + simd_shuffle!( + a, + _mm_undefined_ps(), + [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], + ) + } +} + +/// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps256_ps512&expand=623) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castps256_ps512(a: __m256) -> __m512 { + unsafe { + simd_shuffle!( + a, + _mm256_undefined_ps(), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], + ) + } +} + +/// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextps128_ps512&expand=6196) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_zextps128_ps512(a: __m128) -> __m512 { + unsafe { + simd_shuffle!( + a, + _mm_set1_ps(0.), + [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], + ) + } +} + +/// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextps256_ps512&expand=6197) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_zextps256_ps512(a: __m256) -> __m512 { + unsafe { + simd_shuffle!( + a, + _mm256_set1_ps(0.), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], + ) + } +} + +/// Cast vector of type __m512 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps512_ps128&expand=624) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castps512_ps128(a: __m512) -> __m128 { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } +} + +/// Cast vector of type __m512 to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps512_ps256&expand=625) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castps512_ps256(a: __m512) -> __m256 { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) } +} + +/// Cast vector of type __m512 to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
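The cast and zext pairs above differ only in what they promise about the upper lanes: `_mm512_castps128_ps512` leaves them unspecified, while `_mm512_zextps128_ps512` guarantees zeros. A minimal sketch of that difference, assuming an x86_64 target and a toolchain where these AVX-512 intrinsics are stable; the `widen` helper and module name are hypothetical and not part of the vendored source:

```rust
#[cfg(target_arch = "x86_64")]
mod widen_sketch {
    use std::arch::x86_64::*;

    /// Hypothetical helper: widen a __m128 to __m512 in both flavors.
    #[target_feature(enable = "avx512f")]
    pub unsafe fn widen(a: __m128) -> (__m512, __m512) {
        let c = _mm512_castps128_ps512(a); // lanes 4..16 are unspecified
        let z = _mm512_zextps128_ps512(a); // lanes 4..16 are guaranteed 0.0
        (c, z)
    }
}
```

In practice the cast is used when the upper lanes will be overwritten anyway, and the zext when zeroed upper lanes are required.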
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps_pd&expand=616) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castps_pd(a: __m512) -> __m512d { + unsafe { transmute(a) } +} + +/// Cast vector of type __m512 to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps_si512&expand=619) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castps_si512(a: __m512) -> __m512i { + unsafe { transmute(a) } +} + +/// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd128_pd512&expand=609) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castpd128_pd512(a: __m128d) -> __m512d { + unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2, 2, 2, 2, 2]) } +} + +/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd256_pd512&expand=611) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castpd256_pd512(a: __m256d) -> __m512d { + unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [0, 1, 2, 3, 4, 4, 4, 4]) } +} + +/// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextpd128_pd512&expand=6193) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_zextpd128_pd512(a: __m128d) -> __m512d { + unsafe { simd_shuffle!(a, _mm_set1_pd(0.), [0, 1, 2, 2, 2, 2, 2, 2]) } +} + +/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextpd256_pd512&expand=6194) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_zextpd256_pd512(a: __m256d) -> __m512d { + unsafe { simd_shuffle!(a, _mm256_set1_pd(0.), [0, 1, 2, 3, 4, 4, 4, 4]) } +} + +/// Cast vector of type __m512d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd512_pd128&expand=612) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castpd512_pd128(a: __m512d) -> __m128d { + unsafe { simd_shuffle!(a, a, [0, 1]) } +} + +/// Cast vector of type __m512d to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd512_pd256&expand=613) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castpd512_pd256(a: __m512d) -> __m256d { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } +} + +/// Cast vector of type __m512d to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd_ps&expand=604) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castpd_ps(a: __m512d) -> __m512 { + unsafe { transmute(a) } +} + +/// Cast vector of type __m512d to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd_si512&expand=607) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castpd_si512(a: __m512d) -> __m512i { + unsafe { transmute(a) } +} + +/// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi128_si512&expand=629) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castsi128_si512(a: __m128i) -> __m512i { + unsafe { simd_shuffle!(a, _mm_undefined_si128(), [0, 1, 2, 2, 2, 2, 2, 2]) } +} + +/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi256_si512&expand=633) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castsi256_si512(a: __m256i) -> __m512i { + unsafe { simd_shuffle!(a, _mm256_undefined_si256(), [0, 1, 2, 3, 4, 4, 4, 4]) } +} + +/// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextsi128_si512&expand=6199) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_zextsi128_si512(a: __m128i) -> __m512i { + unsafe { simd_shuffle!(a, _mm_setzero_si128(), [0, 1, 2, 2, 2, 2, 2, 2]) } +} + +/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextsi256_si512&expand=6200) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_zextsi256_si512(a: __m256i) -> __m512i { + unsafe { simd_shuffle!(a, _mm256_setzero_si256(), [0, 1, 2, 3, 4, 4, 4, 4]) } +} + +/// Cast vector of type __m512i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi512_si128&expand=636) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castsi512_si128(a: __m512i) -> __m128i { + unsafe { simd_shuffle!(a, a, [0, 1]) } +} + +/// Cast vector of type __m512i to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi512_si256&expand=637) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castsi512_si256(a: __m512i) -> __m256i { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } +} + +/// Cast vector of type __m512i to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi512_ps&expand=635) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castsi512_ps(a: __m512i) -> __m512 { + unsafe { transmute(a) } +} + +/// Cast vector of type __m512i to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi512_pd&expand=634) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castsi512_pd(a: __m512i) -> __m512d { + unsafe { transmute(a) } +} + +/// Copy the lower 32-bit integer in a to dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsi512_si32&expand=1882) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovd))] +pub fn _mm512_cvtsi512_si32(a: __m512i) -> i32 { + unsafe { simd_extract!(a.as_i32x16(), 0) } +} + +/// Copy the lower single-precision (32-bit) floating-point element of a to dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtss_f32) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtss_f32(a: __m512) -> f32 { + unsafe { simd_extract!(a, 0) } +} + +/// Copy the lower double-precision (64-bit) floating-point element of a to dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsd_f64) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtsd_f64(a: __m512d) -> f64 { + unsafe { simd_extract!(a, 0) } +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastd_epi32&expand=545) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastd +pub fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i { + unsafe { + let a = _mm512_castsi128_si512(a).as_i32x16(); + let ret: i32x16 = simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + transmute(ret) + } +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastd_epi32&expand=546) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub fn _mm512_mask_broadcastd_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastd_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) + } +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastd_epi32&expand=547) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub fn _mm512_maskz_broadcastd_epi32(k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastd_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, i32x16::ZERO)) + } +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
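The three `cvt*` accessors above simply read lane 0 of the vector; a short sketch with a hypothetical wrapper, under the same AVX-512 assumptions:

```rust
#[cfg(target_arch = "x86_64")]
mod lane0_sketch {
    use std::arch::x86_64::*;

    /// Hypothetical helper: read the lowest lane of an integer and a float vector.
    #[target_feature(enable = "avx512f")]
    pub unsafe fn first_values(v: __m512i, f: __m512) -> (i32, f32) {
        (_mm512_cvtsi512_si32(v), _mm512_cvtss_f32(f))
    }
}
```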
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastd_epi32&expand=543) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub fn _mm256_mask_broadcastd_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastd_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x8())) + } +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastd_epi32&expand=544) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub fn _mm256_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastd_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, i32x8::ZERO)) + } +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcastd_epi32&expand=540) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub fn _mm_mask_broadcastd_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastd_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x4())) + } +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcastd_epi32&expand=541) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub fn _mm_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastd_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, broadcast, i32x4::ZERO)) + } +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastq_epi64&expand=560) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastq +pub fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i { + unsafe { simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0]) } +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
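The writemask/zeromask pattern used throughout this file is easiest to see on a concrete broadcast. A hedged sketch, with hypothetical module and function names, assuming avx512f is available:

```rust
#[cfg(target_arch = "x86_64")]
mod broadcast_sketch {
    use std::arch::x86_64::*;

    /// Illustrative only: broadcast lane 0 of `a`, merging vs. zeroing unselected lanes.
    #[target_feature(enable = "avx512f")]
    pub unsafe fn check() {
        let a = _mm_set_epi32(0, 0, 0, 7); // low 32-bit element is 7
        let src = _mm512_set1_epi32(-1);
        let k: __mmask16 = 0b11; // select only lanes 0 and 1

        let merged = _mm512_mask_broadcastd_epi32(src, k, a); // lanes 0,1 = 7; rest keep -1
        let zeroed = _mm512_maskz_broadcastd_epi32(k, a); // lanes 0,1 = 7; rest become 0

        assert_eq!(_mm512_cvtsi512_si32(merged), 7);
        assert_eq!(_mm512_cvtsi512_si32(zeroed), 7);
    }
}
```

A caller that is not itself compiled with the `avx512f` target feature would gate the call behind `is_x86_feature_detected!("avx512f")`.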
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastq_epi64&expand=561) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub fn _mm512_mask_broadcastq_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastq_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) + } +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastq_epi64&expand=562) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub fn _mm512_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastq_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, i64x8::ZERO)) + } +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastq_epi64&expand=558) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub fn _mm256_mask_broadcastq_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastq_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x4())) + } +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastq_epi64&expand=559) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub fn _mm256_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastq_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, broadcast, i64x4::ZERO)) + } +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcastq_epi64&expand=555) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub fn _mm_mask_broadcastq_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastq_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x2())) + } +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcastq_epi64&expand=556) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub fn _mm_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastq_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, broadcast, i64x2::ZERO)) + } +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastss_ps&expand=578) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub fn _mm512_broadcastss_ps(a: __m128) -> __m512 { + unsafe { simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) } +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastss_ps&expand=579) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub fn _mm512_mask_broadcastss_ps(src: __m512, k: __mmask16, a: __m128) -> __m512 { + unsafe { + let broadcast = _mm512_broadcastss_ps(a).as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) + } +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastss_ps&expand=580) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub fn _mm512_maskz_broadcastss_ps(k: __mmask16, a: __m128) -> __m512 { + unsafe { + let broadcast = _mm512_broadcastss_ps(a).as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, f32x16::ZERO)) + } +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastss_ps&expand=576) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub fn _mm256_mask_broadcastss_ps(src: __m256, k: __mmask8, a: __m128) -> __m256 { + unsafe { + let broadcast = _mm256_broadcastss_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x8())) + } +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastss_ps&expand=577) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub fn _mm256_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m256 { + unsafe { + let broadcast = _mm256_broadcastss_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, broadcast, f32x8::ZERO)) + } +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcastss_ps&expand=573) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub fn _mm_mask_broadcastss_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + let broadcast = _mm_broadcastss_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x4())) + } +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcastss_ps&expand=574) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub fn _mm_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { + let broadcast = _mm_broadcastss_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, broadcast, f32x4::ZERO)) + } +} + +/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastsd_pd&expand=567) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d { + unsafe { simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0]) } +} + +/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastsd_pd&expand=568) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub fn _mm512_mask_broadcastsd_pd(src: __m512d, k: __mmask8, a: __m128d) -> __m512d { + unsafe { + let broadcast = _mm512_broadcastsd_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) + } +} + +/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastsd_pd&expand=569) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { + unsafe { + let broadcast = _mm512_broadcastsd_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, f64x8::ZERO)) + } +} + +/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastsd_pd&expand=565) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub fn _mm256_mask_broadcastsd_pd(src: __m256d, k: __mmask8, a: __m128d) -> __m256d { + unsafe { + let broadcast = _mm256_broadcastsd_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_f64x4())) + } +} + +/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastsd_pd&expand=566) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub fn _mm256_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m256d { + unsafe { + let broadcast = _mm256_broadcastsd_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, broadcast, f64x4::ZERO)) + } +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i32x4&expand=510) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { + unsafe { + let a = a.as_i32x4(); + let ret: i32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]); + transmute(ret) + } +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i32x4&expand=511) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) + } +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i32x4&expand=512) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, i32x16::ZERO)) + } +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_i32x4&expand=507) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_broadcast_i32x4(a: __m128i) -> __m256i { + unsafe { + let a = a.as_i32x4(); + let ret: i32x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]); + transmute(ret) + } +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_i32x4&expand=508) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_broadcast_i32x4(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcast_i32x4(a).as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x8())) + } +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_i32x4&expand=509) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_broadcast_i32x4(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcast_i32x4(a).as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, i32x8::ZERO)) + } +} + +/// Broadcast the 4 packed 64-bit integers from a to all elements of dst. 
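The `*_broadcast_i32x4` family above repeats one 128-bit block across every 128-bit lane of the destination. A hedged sketch of the resulting lane layout (names hypothetical):

```rust
#[cfg(target_arch = "x86_64")]
mod i32x4_sketch {
    use std::arch::x86_64::*;

    /// Illustrative only: the 512-bit result holds the block [0, 1, 2, 3] four times.
    #[target_feature(enable = "avx512f")]
    pub unsafe fn check() {
        let block = _mm_set_epi32(3, 2, 1, 0); // lanes 0..4 hold 0, 1, 2, 3
        let v = _mm512_broadcast_i32x4(block);
        let expected = _mm512_set_epi32(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0);
        assert_eq!(_mm512_cmpeq_epi32_mask(v, expected), 0xFFFF);
    }
}
```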
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i64x4&expand=522) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } +} + +/// Broadcast the 4 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i64x4&expand=523) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) + } +} + +/// Broadcast the 4 packed 64-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i64x4&expand=524) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, i64x8::ZERO)) + } +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f32x4&expand=483) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]) } +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f32x4&expand=484) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshu +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { + unsafe { + let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) + } +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f32x4&expand=485) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshu +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { + unsafe { + let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, f32x16::ZERO)) + } +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_f32x4&expand=480) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_broadcast_f32x4(a: __m128) -> __m256 { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_f32x4&expand=481) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshu +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_broadcast_f32x4(src: __m256, k: __mmask8, a: __m128) -> __m256 { + unsafe { + let broadcast = _mm256_broadcast_f32x4(a).as_f32x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x8())) + } +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_f32x4&expand=482) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshu +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_broadcast_f32x4(k: __mmask8, a: __m128) -> __m256 { + unsafe { + let broadcast = _mm256_broadcast_f32x4(a).as_f32x8(); + transmute(simd_select_bitmask(k, broadcast, f32x8::ZERO)) + } +} + +/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f64x4&expand=495) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vperm +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } +} + +/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f64x4&expand=496) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vper +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d { + unsafe { + let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) + } +} + +/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f64x4&expand=497) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vper +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { + unsafe { + let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, f64x8::ZERO)) + } +} + +/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_epi32&expand=435) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd +pub fn _mm512_mask_blend_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_select_bitmask(k, b.as_i32x16(), a.as_i32x16())) } +} + +/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_epi32&expand=434) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd +pub fn _mm256_mask_blend_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_select_bitmask(k, b.as_i32x8(), a.as_i32x8())) } +} + +/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_epi32&expand=432) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd +pub fn _mm_mask_blend_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_select_bitmask(k, b.as_i32x4(), a.as_i32x4())) } +} + +/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. 
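The mask blends above are plain per-lane selects: bit i of k picks lane i from b when set and from a when clear. A small sketch under the same assumptions (hypothetical names):

```rust
#[cfg(target_arch = "x86_64")]
mod blend_sketch {
    use std::arch::x86_64::*;

    /// Illustrative only: odd lanes come from `b`, even lanes from `a`.
    #[target_feature(enable = "avx512f")]
    pub unsafe fn check() {
        let a = _mm512_set1_epi32(1);
        let b = _mm512_set1_epi32(2);
        let r = _mm512_mask_blend_epi32(0b1010_1010_1010_1010, a, b);
        let expected = _mm512_set_epi32(2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1);
        assert_eq!(_mm512_cmpeq_epi32_mask(r, expected), 0xFFFF);
    }
}
```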
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_epi64&expand=438) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq +pub fn _mm512_mask_blend_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_select_bitmask(k, b.as_i64x8(), a.as_i64x8())) } +} + +/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_epi64&expand=437) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq +pub fn _mm256_mask_blend_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_select_bitmask(k, b.as_i64x4(), a.as_i64x4())) } +} + +/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_epi64&expand=436) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq +pub fn _mm_mask_blend_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_select_bitmask(k, b.as_i64x2(), a.as_i64x2())) } +} + +/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_ps&expand=451) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps +pub fn _mm512_mask_blend_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { transmute(simd_select_bitmask(k, b.as_f32x16(), a.as_f32x16())) } +} + +/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_ps&expand=450) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps +pub fn _mm256_mask_blend_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { transmute(simd_select_bitmask(k, b.as_f32x8(), a.as_f32x8())) } +} + +/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_ps&expand=448) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps +pub fn _mm_mask_blend_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(simd_select_bitmask(k, b.as_f32x4(), a.as_f32x4())) } +} + +/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_pd&expand=446) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd +pub fn _mm512_mask_blend_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_select_bitmask(k, b.as_f64x8(), a.as_f64x8())) } +} + +/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_pd&expand=445) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd +pub fn _mm256_mask_blend_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { transmute(simd_select_bitmask(k, b.as_f64x4(), a.as_f64x4())) } +} + +/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_pd&expand=443) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd +pub fn _mm_mask_blend_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(simd_select_bitmask(k, b.as_f64x2(), a.as_f64x2())) } +} + +/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst. +/// +///
Only lowest 4 bits are used from the mask (shift at maximum by 60 bytes)!
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi32&expand=245)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_alignr_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        let imm8: i32 = IMM8 % 16;
+        let r: i32x16 = match imm8 {
+            0 => simd_shuffle!(a, b, [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]),
+            1 => simd_shuffle!(a, b, [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0]),
+            2 => simd_shuffle!(a, b, [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1]),
+            3 => simd_shuffle!(a, b, [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2]),
+            4 => simd_shuffle!(a, b, [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3]),
+            5 => simd_shuffle!(a, b, [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4]),
+            6 => simd_shuffle!(a, b, [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5]),
+            7 => simd_shuffle!(a, b, [23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6]),
+            8 => simd_shuffle!(a, b, [24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7]),
+            9 => simd_shuffle!(a, b, [25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8]),
+            10 => simd_shuffle!(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
+            11 => simd_shuffle!(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
+            12 => simd_shuffle!(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
+            13 => simd_shuffle!(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
+            14 => simd_shuffle!(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
+            15 => simd_shuffle!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
+            _ => unreachable_unchecked(),
+        };
+        transmute(r)
+    }
+}
+
+/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_alignr_epi32&expand=246)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_alignr_epi32<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm512_alignr_epi32::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16()))
+    }
+}
+
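The `alignr` shift count is a const generic supplied via turbofish. A short sketch of shifting the a:b concatenation right by one 32-bit element, which rotates b's lanes down and pulls a's lane 0 in at the top (hypothetical names, assuming avx512f):

```rust
#[cfg(target_arch = "x86_64")]
mod alignr_sketch {
    use std::arch::x86_64::*;

    /// Illustrative only: result lanes 0..=14 are b[1..=15]; lane 15 is a[0].
    #[target_feature(enable = "avx512f")]
    pub unsafe fn check() {
        let a = _mm512_set1_epi32(100);
        let b = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        let r = _mm512_alignr_epi32::<1>(a, b);
        let expected = _mm512_set_epi32(100, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
        assert_eq!(_mm512_cmpeq_epi32_mask(r, expected), 0xFFFF);
    }
}
```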
+/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_alignr_epi32&expand=247)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_alignr_epi32<const IMM8: i32>(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm512_alignr_epi32::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO))
+    }
+}
+
+/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst.
+///
+/// Only lowest 3 bits are used from the mask (shift at maximum by 28 bytes)!
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi32&expand=242)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_alignr_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x8();
+        let b = b.as_i32x8();
+        let imm8: i32 = IMM8 % 8;
+        let r: i32x8 = match imm8 {
+            0 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
+            1 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
+            2 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
+            3 => simd_shuffle!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
+            4 => simd_shuffle!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
+            5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
+            6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
+            7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
+            _ => unreachable_unchecked(),
+        };
+        transmute(r)
+    }
+}
+
+/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_alignr_epi32&expand=243)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_alignr_epi32<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm256_alignr_epi32::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8()))
+    }
+}
+
+/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_alignr_epi32&expand=244)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_alignr_epi32<const IMM8: i32>(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm256_alignr_epi32::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO))
+    }
+}
+
+/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst.
+///
Only the lowest 2 bits of imm8 are used (shift by at most 12 bytes).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi32&expand=239) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] //should be valignd +#[rustc_legacy_const_generics(2)] +pub fn _mm_alignr_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let imm8: i32 = IMM8 % 4; + let r: i32x4 = match imm8 { + 0 => simd_shuffle!(a, b, [4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [5, 6, 7, 0]), + 2 => simd_shuffle!(a, b, [6, 7, 0, 1]), + 3 => simd_shuffle!(a, b, [7, 0, 1, 2]), + _ => unreachable_unchecked(), + }; + transmute(r) + } +} + +/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_alignr_epi32&expand=240) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_alignr_epi32( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm_alignr_epi32::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) + } +} + +/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_alignr_epi32&expand=241) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_alignr_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm_alignr_epi32::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) + } +} + +/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst. +/// +///
Only the lowest 3 bits of imm8 are used (shift by at most 56 bytes).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi64&expand=254) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_alignr_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let imm8: i32 = IMM8 % 8; + let r: i64x8 = match imm8 { + 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]), + 2 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]), + 3 => simd_shuffle!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]), + 4 => simd_shuffle!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]), + 5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]), + 6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]), + 7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]), + _ => unreachable_unchecked(), + }; + transmute(r) + } +} + +/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_alignr_epi64&expand=255) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_alignr_epi64( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) + } +} + +/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and stores the low 64 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_alignr_epi64&expand=256) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_alignr_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO)) + } +} + +/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst. +/// +///
Only the lowest 2 bits of imm8 are used (shift by at most 24 bytes).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi64&expand=251) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_alignr_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let imm8: i32 = IMM8 % 4; + let r: i64x4 = match imm8 { + 0 => simd_shuffle!(a, b, [4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [5, 6, 7, 0]), + 2 => simd_shuffle!(a, b, [6, 7, 0, 1]), + 3 => simd_shuffle!(a, b, [7, 0, 1, 2]), + _ => unreachable_unchecked(), + }; + transmute(r) + } +} + +/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_alignr_epi64&expand=252) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_alignr_epi64( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm256_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) + } +} + +/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_alignr_epi64&expand=253) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_alignr_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm256_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO)) + } +} + +/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst. +/// +///
Only the lowest bit of imm8 is used (shift by at most 8 bytes).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi64&expand=248) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] //should be valignq +#[rustc_legacy_const_generics(2)] +pub fn _mm_alignr_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let imm8: i32 = IMM8 % 2; + let r: i64x2 = match imm8 { + 0 => simd_shuffle!(a, b, [2, 3]), + 1 => simd_shuffle!(a, b, [3, 0]), + _ => unreachable_unchecked(), + }; + transmute(r) + } +} + +/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_alignr_epi64&expand=249) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_alignr_epi64( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x2(), src.as_i64x2())) + } +} + +/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_alignr_epi64&expand=250) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_alignr_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x2(), i64x2::ZERO)) + } +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_and_epi32&expand=272) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] //should be vpandd, but generate vpandq +pub fn _mm512_and_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_and(a.as_i32x16(), b.as_i32x16())) } +} + +/// Performs element-by-element bitwise AND between packed 32-bit integer elements of a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
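The valignd/valignq intrinsics above all follow the same pattern: the two sources are concatenated with `b` in the low half, the composite is shifted right by `IMM8` elements, and the low half is kept. A minimal x86_64-only usage sketch, assuming a toolchain where these intrinsics are stable (1.89+ per the attributes) and an AVX-512F capable CPU; the function name and constants are illustrative, and `_mm512_set1_epi32`/`_mm512_cmpeq_epi32_mask` come from the same module but are outside this hunk.

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn alignr_demo() -> bool {
    let a = _mm512_set1_epi32(1); // conceptually the upper half of the concatenation
    let b = _mm512_set1_epi32(2); // conceptually the lower half
    // Concatenate a:b (32 lanes), shift right by 3 lanes, keep the low 16:
    // lanes 0..=12 come from b, lanes 13..=15 come from a.
    let r = _mm512_alignr_epi32::<3>(a, b);
    _mm512_cmpeq_epi32_mask(r, a) == 0b1110_0000_0000_0000
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        // SAFETY: the runtime check above proves AVX-512F is available.
        assert!(unsafe { alignr_demo() });
    }
}
```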
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_and_epi32&expand=273) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm512_mask_and_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let and = _mm512_and_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, and, src.as_i32x16())) + } +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_and_epi32&expand=274) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm512_maskz_and_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let and = _mm512_and_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, and, i32x16::ZERO)) + } +} + +/// Performs element-by-element bitwise AND between packed 32-bit integer elements of a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_and_epi32&expand=270) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm256_mask_and_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let and = simd_and(a.as_i32x8(), b.as_i32x8()); + transmute(simd_select_bitmask(k, and, src.as_i32x8())) + } +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_and_epi32&expand=271) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm256_maskz_and_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let and = simd_and(a.as_i32x8(), b.as_i32x8()); + transmute(simd_select_bitmask(k, and, i32x8::ZERO)) + } +} + +/// Performs element-by-element bitwise AND between packed 32-bit integer elements of a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
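Throughout this file, `_mm512_mask_*` merges unselected lanes from `src` (writemask) while `_mm512_maskz_*` zeroes them (zeromask). A small sketch of that difference, assuming it is called under the same `is_x86_feature_detected!("avx512f")` gate as the previous example; the helper name and constants are illustrative, and `_mm512_setzero_si512`/`_mm512_cmpeq_epi32_mask` are sibling intrinsics not shown in this hunk.

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn masked_and_demo() {
    let src = _mm512_set1_epi32(-1); // fallback values for unselected lanes
    let a = _mm512_set1_epi32(0b1100);
    let b = _mm512_set1_epi32(0b0110);
    let k: __mmask16 = 0x00ff; // select the low 8 lanes

    // Writemask: unselected lanes are copied from `src`.
    let merged = _mm512_mask_and_epi32(src, k, a, b);
    // Zeromask: unselected lanes are zeroed.
    let zeroed = _mm512_maskz_and_epi32(k, a, b);

    // The low 8 lanes hold a & b == 0b0100 in both results...
    assert_eq!(_mm512_cmpeq_epi32_mask(merged, _mm512_set1_epi32(0b0100)), 0x00ff);
    assert_eq!(_mm512_cmpeq_epi32_mask(zeroed, _mm512_set1_epi32(0b0100)), 0x00ff);
    // ...while the high 8 lanes differ: -1 (copied from src) vs. 0.
    assert_eq!(_mm512_cmpeq_epi32_mask(merged, src), 0xff00);
    assert_eq!(_mm512_cmpeq_epi32_mask(zeroed, _mm512_setzero_si512()), 0xff00);
}
```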
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_and_epi32&expand=268) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm_mask_and_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let and = simd_and(a.as_i32x4(), b.as_i32x4()); + transmute(simd_select_bitmask(k, and, src.as_i32x4())) + } +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_and_epi32&expand=269) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm_maskz_and_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let and = simd_and(a.as_i32x4(), b.as_i32x4()); + transmute(simd_select_bitmask(k, and, i32x4::ZERO)) + } +} + +/// Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_and_epi64&expand=279) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm512_and_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_and(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_and_epi64&expand=280) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm512_mask_and_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let and = _mm512_and_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, and, src.as_i64x8())) + } +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_and_epi64&expand=281) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm512_maskz_and_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let and = _mm512_and_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, and, i64x8::ZERO)) + } +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_and_epi64&expand=277) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm256_mask_and_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let and = simd_and(a.as_i64x4(), b.as_i64x4()); + transmute(simd_select_bitmask(k, and, src.as_i64x4())) + } +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_and_epi64&expand=278) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm256_maskz_and_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let and = simd_and(a.as_i64x4(), b.as_i64x4()); + transmute(simd_select_bitmask(k, and, i64x4::ZERO)) + } +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_and_epi64&expand=275) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm_mask_and_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let and = simd_and(a.as_i64x2(), b.as_i64x2()); + transmute(simd_select_bitmask(k, and, src.as_i64x2())) + } +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_and_epi64&expand=276) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm_maskz_and_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let and = simd_and(a.as_i64x2(), b.as_i64x2()); + transmute(simd_select_bitmask(k, and, i64x2::ZERO)) + } +} + +/// Compute the bitwise AND of 512 bits (representing integer data) in a and b, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_and_si512&expand=302) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm512_and_si512(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_and(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst. 
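`_mm512_and_si512` operates on the raw 512 bits, so for the unmasked case it is bit-for-bit identical to `_mm512_and_epi32` and `_mm512_and_epi64`; the element width only matters once a per-element mask is involved. A quick cross-check sketch (illustrative helper, to be called under an `avx512f` runtime gate by the caller; `_mm512_cmpeq_epi32_mask` is assumed from the same module).

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn si512_equivalence() -> bool {
    let a = _mm512_set1_epi32(0x0f0f_0f0f);
    let b = _mm512_set1_epi32(0x0033_00ff);
    // Same bits out of all three variants; only the mnemonic and masking granularity differ.
    let x = _mm512_and_si512(a, b);
    let y = _mm512_and_epi32(a, b);
    let z = _mm512_and_epi64(a, b);
    _mm512_cmpeq_epi32_mask(x, y) == 0xffff && _mm512_cmpeq_epi32_mask(y, z) == 0xffff
}
```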
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_or_epi32&expand=4042) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vporq))] +pub fn _mm512_or_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_or(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_or_epi32&expand=4040) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpord))] +pub fn _mm512_mask_or_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let or = _mm512_or_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, or, src.as_i32x16())) + } +} + +/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_or_epi32&expand=4041) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpord))] +pub fn _mm512_maskz_or_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let or = _mm512_or_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, or, i32x16::ZERO)) + } +} + +/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_epi32&expand=4039) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vor))] //should be vpord +pub fn _mm256_or_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) } +} + +/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_or_epi32&expand=4037) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpord))] +pub fn _mm256_mask_or_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let or = _mm256_or_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, or, src.as_i32x8())) + } +} + +/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_or_epi32&expand=4038) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpord))] +pub fn _mm256_maskz_or_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let or = _mm256_or_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, or, i32x8::ZERO)) + } +} + +/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_epi32&expand=4036) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vor))] //should be vpord +pub fn _mm_or_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_or(a.as_i32x4(), b.as_i32x4())) } +} + +/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_or_epi32&expand=4034) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpord))] +pub fn _mm_mask_or_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let or = _mm_or_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, or, src.as_i32x4())) + } +} + +/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_or_epi32&expand=4035) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpord))] +pub fn _mm_maskz_or_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let or = _mm_or_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, or, i32x4::ZERO)) + } +} + +/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the resut in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_or_epi64&expand=4051) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vporq))] +pub fn _mm512_or_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_or(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
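The 256-bit and 128-bit masked forms are gated on `avx512f,avx512vl`, so callers have to detect both features at runtime. A sketch using `_mm256_maskz_or_epi32` (the helper name and constants are illustrative; `_mm256_set1_epi32` and `_mm256_cmpeq_epi32_mask` come from the same module but are outside this hunk).

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
fn vl_or_demo() -> bool {
    let a = _mm256_set1_epi32(0b0101);
    let b = _mm256_set1_epi32(0b0011);
    // Compute a | b only in the low 4 lanes; zero the rest.
    let r = _mm256_maskz_or_epi32(0b0000_1111, a, b);
    _mm256_cmpeq_epi32_mask(r, _mm256_set1_epi32(0b0111)) == 0b0000_1111
}

fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        // SAFETY: both required features were verified at runtime.
        assert!(unsafe { vl_or_demo() });
    }
}
```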
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_or_epi64&expand=4049) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vporq))] +pub fn _mm512_mask_or_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let or = _mm512_or_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, or, src.as_i64x8())) + } +} + +/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_or_epi64&expand=4050) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vporq))] +pub fn _mm512_maskz_or_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let or = _mm512_or_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, or, i64x8::ZERO)) + } +} + +/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the resut in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_epi64&expand=4048) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vor))] //should be vporq +pub fn _mm256_or_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_or(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_or_epi64&expand=4046) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vporq))] +pub fn _mm256_mask_or_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let or = _mm256_or_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, or, src.as_i64x4())) + } +} + +/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_or_epi64&expand=4047) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vporq))] +pub fn _mm256_maskz_or_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let or = _mm256_or_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, or, i64x4::ZERO)) + } +} + +/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the resut in dst. 
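Writemask OR is a convenient "set a flag bit, but only in the selected lanes" primitive, since unselected lanes pass through from `src` untouched. A minimal sketch (illustrative helper; `_mm512_set1_epi64` is assumed from the same module).

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn set_flag_in_lanes(v: __m512i, lanes: __mmask8, flag: i64) -> __m512i {
    // Selected lanes become v | flag; the rest keep their value because v is also `src`.
    _mm512_mask_or_epi64(v, lanes, v, _mm512_set1_epi64(flag))
}
```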
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_epi64&expand=4045) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vor))] //should be vporq +pub fn _mm_or_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_or(a.as_i64x2(), b.as_i64x2())) } +} + +/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_or_epi64&expand=4043) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vporq))] +pub fn _mm_mask_or_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let or = _mm_or_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, or, src.as_i64x2())) + } +} + +/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_or_epi64&expand=4044) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vporq))] +pub fn _mm_maskz_or_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let or = _mm_or_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, or, i64x2::ZERO)) + } +} + +/// Compute the bitwise OR of 512 bits (representing integer data) in a and b, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_or_si512&expand=4072) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vporq))] +pub fn _mm512_or_si512(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_or(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_xor_epi32&expand=6142) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] //should be vpxord +pub fn _mm512_xor_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_xor(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_xor_epi32&expand=6140) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxord))] +pub fn _mm512_mask_xor_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let xor = _mm512_xor_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, xor, src.as_i32x16())) + } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_xor_epi32&expand=6141) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxord))] +pub fn _mm512_maskz_xor_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let xor = _mm512_xor_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, xor, i32x16::ZERO)) + } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_epi32&expand=6139) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxor))] //should be vpxord +pub fn _mm256_xor_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_xor(a.as_i32x8(), b.as_i32x8())) } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_xor_epi32&expand=6137) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxord))] +pub fn _mm256_mask_xor_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let xor = _mm256_xor_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, xor, src.as_i32x8())) + } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_xor_epi32&expand=6138) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxord))] +pub fn _mm256_maskz_xor_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let xor = _mm256_xor_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, xor, i32x8::ZERO)) + } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst. 
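Masked XOR against an all-ones vector is the usual way to complement only the selected lanes: passing the input as `src` leaves the remaining lanes as they were. A sketch with an illustrative helper name.

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn flip_low_lanes(a: __m512i) -> __m512i {
    // Selected lanes become a ^ -1 (i.e. !a); unselected lanes keep `a`,
    // which is also passed as `src`.
    _mm512_mask_xor_epi32(a, 0x00ff, a, _mm512_set1_epi32(-1))
}
```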
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_epi32&expand=6136) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxor))] //should be vpxord +pub fn _mm_xor_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_xor(a.as_i32x4(), b.as_i32x4())) } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_xor_epi32&expand=6134) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxord))] +pub fn _mm_mask_xor_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let xor = _mm_xor_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, xor, src.as_i32x4())) + } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_xor_epi32&expand=6135) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxord))] +pub fn _mm_maskz_xor_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let xor = _mm_xor_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, xor, i32x4::ZERO)) + } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_xor_epi64&expand=6151) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm512_xor_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_xor(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_xor_epi64&expand=6149) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm512_mask_xor_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let xor = _mm512_xor_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, xor, src.as_i64x8())) + } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_xor_epi64&expand=6150) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm512_maskz_xor_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let xor = _mm512_xor_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, xor, i64x8::ZERO)) + } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_epi64&expand=6148) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxor))] //should be vpxorq +pub fn _mm256_xor_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_xor_epi64&expand=6146) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm256_mask_xor_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let xor = _mm256_xor_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, xor, src.as_i64x4())) + } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_xor_epi64&expand=6147) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm256_maskz_xor_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let xor = _mm256_xor_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, xor, i64x4::ZERO)) + } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_epi64&expand=6145) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxor))] //should be vpxorq +pub fn _mm_xor_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_xor(a.as_i64x2(), b.as_i64x2())) } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_xor_epi64&expand=6143) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm_mask_xor_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let xor = _mm_xor_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, xor, src.as_i64x2())) + } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_xor_epi64&expand=6144) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm_maskz_xor_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let xor = _mm_xor_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, xor, i64x2::ZERO)) + } +} + +/// Compute the bitwise XOR of 512 bits (representing integer data) in a and b, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_xor_si512&expand=6172) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm512_xor_si512(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_xor(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_andnot_epi32&expand=310) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] //should be vpandnd +pub fn _mm512_andnot_epi32(a: __m512i, b: __m512i) -> __m512i { + _mm512_and_epi32(_mm512_xor_epi32(a, _mm512_set1_epi32(u32::MAX as i32)), b) +} + +/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_andnot_epi32&expand=311) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnd))] +pub fn _mm512_mask_andnot_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let andnot = _mm512_andnot_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, andnot, src.as_i32x16())) + } +} + +/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_andnot_epi32&expand=312) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnd))] +pub fn _mm512_maskz_andnot_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let andnot = _mm512_andnot_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, andnot, i32x16::ZERO)) + } +} + +/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_andnot_epi32&expand=308) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnd))] +pub fn _mm256_mask_andnot_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let not = _mm256_xor_epi32(a, _mm256_set1_epi32(u32::MAX as i32)); + let andnot = simd_and(not.as_i32x8(), b.as_i32x8()); + transmute(simd_select_bitmask(k, andnot, src.as_i32x8())) + } +} + +/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_andnot_epi32&expand=309) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnd))] +pub fn _mm256_maskz_andnot_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let not = _mm256_xor_epi32(a, _mm256_set1_epi32(u32::MAX as i32)); + let andnot = simd_and(not.as_i32x8(), b.as_i32x8()); + transmute(simd_select_bitmask(k, andnot, i32x8::ZERO)) + } +} + +/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_andnot_epi32&expand=306) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnd))] +pub fn _mm_mask_andnot_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let not = _mm_xor_epi32(a, _mm_set1_epi32(u32::MAX as i32)); + let andnot = simd_and(not.as_i32x4(), b.as_i32x4()); + transmute(simd_select_bitmask(k, andnot, src.as_i32x4())) + } +} + +/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_andnot_epi32&expand=307) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnd))] +pub fn _mm_maskz_andnot_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let not = _mm_xor_epi32(a, _mm_set1_epi32(u32::MAX as i32)); + let andnot = simd_and(not.as_i32x4(), b.as_i32x4()); + transmute(simd_select_bitmask(k, andnot, i32x4::ZERO)) + } +} + +/// Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in a and then AND with b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_andnot_epi64&expand=317) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] //should be vpandnd +pub fn _mm512_andnot_epi64(a: __m512i, b: __m512i) -> __m512i { + _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b) +} + +/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_andnot_epi64&expand=318) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub fn _mm512_mask_andnot_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, andnot, src.as_i64x8())) + } +} + +/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_andnot_epi64&expand=319) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub fn _mm512_maskz_andnot_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, andnot, i64x8::ZERO)) + } +} + +/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
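Note the operand order of the andnot family: the first operand is complemented, so the result is `(!a) & b`, exactly as the implementations above build it from XOR-with-all-ones followed by AND. A scalar cross-check sketch (illustrative helper; `_mm512_cmpeq_epi32_mask` is assumed from the same module).

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn andnot_matches_scalar(x: i32, y: i32) -> bool {
    let r = _mm512_andnot_epi32(_mm512_set1_epi32(x), _mm512_set1_epi32(y));
    // Every lane must equal the scalar result of (!x) & y.
    _mm512_cmpeq_epi32_mask(r, _mm512_set1_epi32(!x & y)) == 0xffff
}
```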
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_andnot_epi64&expand=315) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub fn _mm256_mask_andnot_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let not = _mm256_xor_epi64(a, _mm256_set1_epi64x(u64::MAX as i64)); + let andnot = simd_and(not.as_i64x4(), b.as_i64x4()); + transmute(simd_select_bitmask(k, andnot, src.as_i64x4())) + } +} + +/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_andnot_epi64&expand=316) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub fn _mm256_maskz_andnot_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let not = _mm256_xor_epi64(a, _mm256_set1_epi64x(u64::MAX as i64)); + let andnot = simd_and(not.as_i64x4(), b.as_i64x4()); + transmute(simd_select_bitmask(k, andnot, i64x4::ZERO)) + } +} + +/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_andnot_epi64&expand=313) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub fn _mm_mask_andnot_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let not = _mm_xor_epi64(a, _mm_set1_epi64x(u64::MAX as i64)); + let andnot = simd_and(not.as_i64x2(), b.as_i64x2()); + transmute(simd_select_bitmask(k, andnot, src.as_i64x2())) + } +} + +/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_andnot_epi64&expand=314) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub fn _mm_maskz_andnot_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let not = _mm_xor_epi64(a, _mm_set1_epi64x(u64::MAX as i64)); + let andnot = simd_and(not.as_i64x2(), b.as_i64x2()); + transmute(simd_select_bitmask(k, andnot, i64x2::ZERO)) + } +} + +/// Compute the bitwise NOT of 512 bits (representing integer data) in a and then AND with b, and store the result in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_andnot_si512&expand=340) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i { + _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b) +} + +/// Convert 16-bit mask a into an integer value, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtmask16_u32) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _cvtmask16_u32(a: __mmask16) -> u32 { + a as u32 +} + +/// Convert 32-bit integer value a to an 16-bit mask and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu32_mask16) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _cvtu32_mask16(a: u32) -> __mmask16 { + a as __mmask16 +} + +/// Compute the bitwise AND of 16-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kand_mask16&expand=3212) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw +pub fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { + a & b +} + +/// Compute the bitwise AND of 16-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kand&expand=3210) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw +pub fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 { + a & b +} + +/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kor_mask16&expand=3239) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw +pub fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { + a | b +} + +/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kor&expand=3237) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw +pub fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 { + a | b +} + +/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k. 
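The 16-bit mask intrinsics are plain integer operations on `__mmask16`; a typical use is combining comparison masks before a masked operation or a popcount. A sketch assuming the sibling comparison intrinsics `_mm512_cmpge_epi32_mask` and `_mm512_cmple_epi32_mask` (not part of this hunk) and an illustrative helper name.

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn lanes_in_range(v: __m512i, lo: i32, hi: i32) -> u32 {
    let ge_lo = _mm512_cmpge_epi32_mask(v, _mm512_set1_epi32(lo));
    let le_hi = _mm512_cmple_epi32_mask(v, _mm512_set1_epi32(hi));
    // Lanes satisfying both bounds, counted as an ordinary integer.
    _cvtmask16_u32(_kand_mask16(ge_lo, le_hi)).count_ones()
}
```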
+/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kxor_mask16&expand=3291)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
+pub fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
+    a ^ b
+}
+
+/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kxor&expand=3289)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
+pub fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 {
+    a ^ b
+}
+
+/// Compute the bitwise NOT of 16-bit mask a, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=knot_mask16&expand=3233)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _knot_mask16(a: __mmask16) -> __mmask16 {
+    a ^ 0b11111111_11111111
+}
+
+/// Compute the bitwise NOT of 16-bit mask a, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_knot&expand=3231)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_knot(a: __mmask16) -> __mmask16 {
+    a ^ 0b11111111_11111111
+}
+
+/// Compute the bitwise NOT of 16-bit mask a and then AND with b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kandn_mask16&expand=3218)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(not))] // generate normal and, not code instead of kandnw
+pub fn _kandn_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
+    _mm512_kand(_mm512_knot(a), b)
+}
+
+/// Compute the bitwise NOT of 16-bit mask a and then AND with b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kandn&expand=3216)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(not))] // generate normal and, not code instead of kandnw
+pub fn _mm512_kandn(a: __mmask16, b: __mmask16) -> __mmask16 {
+    _mm512_kand(_mm512_knot(a), b)
+}
+
+/// Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kxnor_mask16&expand=3285)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(xor))] // generate normal xor, not code instead of kxnorw
+pub fn _kxnor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
+    _mm512_knot(_mm512_kxor(a, b))
+}
+
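+// Illustrative sketch of the derived mask operations above: `_kandn_mask16` behaves as
+// `!a & b` and `_kxnor_mask16` as `!(a ^ b)`, both on 16-bit values. The helper
+// `example_mask16_derived_ops` is hypothetical and assumes an AVX-512F capable target.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+fn example_mask16_derived_ops() {
+    let a: __mmask16 = 0b1100_1100_1100_1100;
+    let b: __mmask16 = 0b1010_1010_1010_1010;
+    assert_eq!(_kandn_mask16(a, b), !a & b);
+    assert_eq!(_kxnor_mask16(a, b), !(a ^ b));
+    // Applying `_knot_mask16` twice round-trips the original mask.
+    assert_eq!(_knot_mask16(_knot_mask16(a)), a);
+}
+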
+/// Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kxnor&expand=3283)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(xor))] // generate normal xor, not code instead of kxnorw
+pub fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 {
+    _mm512_knot(_mm512_kxor(a, b))
+}
+
+/// Compute the bitwise OR of 16-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask16_u8)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _kortest_mask16_u8(a: __mmask16, b: __mmask16, all_ones: *mut u8) -> u8 {
+    let tmp = _kor_mask16(a, b);
+    *all_ones = (tmp == 0xffff) as u8;
+    (tmp == 0) as u8
+}
+
+/// Compute the bitwise OR of 16-bit masks a and b. If the result is all ones, store 1 in dst, otherwise
+/// store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask16_u8)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kortestc_mask16_u8(a: __mmask16, b: __mmask16) -> u8 {
+    (_kor_mask16(a, b) == 0xffff) as u8
+}
+
+/// Compute the bitwise OR of 16-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask16_u8)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kortestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 {
+    (_kor_mask16(a, b) == 0) as u8
+}
+
+/// Shift 16-bit mask a left by count bits while shifting in zeros, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask16)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftli_mask16<const COUNT: u32>(a: __mmask16) -> __mmask16 {
+    a << COUNT
+}
+
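+// Sketch of the shift and OR-test helpers above, assuming an AVX-512F capable target.
+// The shift count is a const generic, so it is supplied with turbofish syntax;
+// `example_mask16_shift_test` itself is a hypothetical helper.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+fn example_mask16_shift_test() {
+    let k: __mmask16 = 0b0000_0000_0000_0001;
+    // Shifting the single set bit left by 4 moves it to bit 4.
+    assert_eq!(_kshiftli_mask16::<4>(k), 0b0000_0000_0001_0000);
+    // `_kortestz_mask16_u8` reports 1 only when the OR of both masks is all zeros.
+    assert_eq!(_kortestz_mask16_u8(0, 0), 1);
+    assert_eq!(_kortestz_mask16_u8(k, 0), 0);
+    // `_kortestc_mask16_u8` reports 1 only when the OR of both masks is all ones.
+    assert_eq!(_kortestc_mask16_u8(0xff00, 0x00ff), 1);
+}
+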
+/// Shift 16-bit mask a right by count bits while shifting in zeros, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask16)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftri_mask16<const COUNT: u32>(a: __mmask16) -> __mmask16 {
+    a >> COUNT
+}
+
+/// Load 16-bit mask from memory
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_load_mask16)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _load_mask16(mem_addr: *const __mmask16) -> __mmask16 {
+    *mem_addr
+}
+
+/// Store 16-bit mask to memory
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_store_mask16)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _store_mask16(mem_addr: *mut __mmask16, a: __mmask16) {
+    *mem_addr = a;
+}
+
+/// Copy 16-bit mask a to k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_kmov&expand=3228)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(mov))] // generate normal mov code instead of kmovw
+pub fn _mm512_kmov(a: __mmask16) -> __mmask16 {
+    a
+}
+
+/// Converts integer mask into bitmask, storing the result in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_int2mask&expand=3189)
+#[inline]
+#[target_feature(enable = "avx512f")] // generate normal mov code instead of kmovw
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_int2mask(mask: i32) -> __mmask16 {
+    mask as u16
+}
+
+/// Converts bit mask k1 into an integer value, storing the result in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2int&expand=3544)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(mov))] // generate normal mov code instead of kmovw
+pub fn _mm512_mask2int(k1: __mmask16) -> i32 {
+    k1 as i32
+}
+
+/// Unpack and interleave 8 bits from masks a and b, and store the 16-bit result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kunpackb&expand=3280)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(mov))] // generate normal code instead of kunpckbw
+pub fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 {
+    ((a & 0xff) << 8) | (b & 0xff)
+}
+
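+// Sketch of the mask load/store and unpack helpers above. `example_mask16_memory` is a
+// hypothetical helper; it assumes an AVX-512F capable target and mirrors the raw-pointer
+// round trip of `_store_mask16`/`_load_mask16`.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+fn example_mask16_memory() {
+    let mut slot: __mmask16 = 0;
+    unsafe {
+        _store_mask16(&mut slot, 0xabcd);
+        assert_eq!(_load_mask16(&slot), 0xabcd);
+    }
+    // `_mm512_kunpackb` keeps the low byte of `b` and places the low byte of `a` above it.
+    assert_eq!(_mm512_kunpackb(0x00aa, 0x00bb), 0xaabb);
+}
+
+/// Performs bitwise OR between k1 and k2, storing the result in dst. CF flag is set if dst consists of all 1's.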
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kortestc&expand=3247) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(cmp))] // generate normal and code instead of kortestw +pub fn _mm512_kortestc(a: __mmask16, b: __mmask16) -> i32 { + let r = (a | b) == 0b11111111_11111111; + r as i32 +} + +/// Performs bitwise OR between k1 and k2, storing the result in dst. ZF flag is set if dst is 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kortestz) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(xor))] // generate normal and code instead of kortestw +pub fn _mm512_kortestz(a: __mmask16, b: __mmask16) -> i32 { + let r = (a | b) == 0; + r as i32 +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_test_epi32_mask&expand=5890) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub fn _mm512_test_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpneq_epi32_mask(and, zero) +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_test_epi32_mask&expand=5889) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub fn _mm512_mask_test_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpneq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_test_epi32_mask&expand=5888) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub fn _mm256_test_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpneq_epi32_mask(and, zero) +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_test_epi32_mask&expand=5887) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub fn _mm256_mask_test_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpneq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_epi32_mask&expand=5886) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub fn _mm_test_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpneq_epi32_mask(and, zero) +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_test_epi32_mask&expand=5885) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub fn _mm_mask_test_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpneq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_test_epi64_mask&expand=5896) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub fn _mm512_test_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + let and = _mm512_and_epi64(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpneq_epi64_mask(and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_test_epi64_mask&expand=5895) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub fn _mm512_mask_test_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + let and = _mm512_and_epi64(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpneq_epi64_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_test_epi64_mask&expand=5894) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub fn _mm256_test_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpneq_epi64_mask(and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_test_epi64_mask&expand=5893) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub fn _mm256_mask_test_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpneq_epi64_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_epi64_mask&expand=5892) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub fn _mm_test_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpneq_epi64_mask(and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_test_epi64_mask&expand=5891) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub fn _mm_mask_test_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpneq_epi64_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_testn_epi32_mask&expand=5921) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub fn _mm512_testn_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpeq_epi32_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_testn_epi32_mask&expand=5920) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub fn _mm512_mask_testn_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpeq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testn_epi32_mask&expand=5919) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub fn _mm256_testn_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpeq_epi32_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_testn_epi32_mask&expand=5918) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub fn _mm256_mask_testn_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpeq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testn_epi32_mask&expand=5917) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub fn _mm_testn_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpeq_epi32_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_testn_epi32_mask&expand=5916) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub fn _mm_mask_testn_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpeq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_testn_epi64_mask&expand=5927) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmq))] +pub fn _mm512_testn_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + let and = _mm512_and_epi64(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpeq_epi64_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_testn_epi64_mask&expand=5926) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmq))] +pub fn _mm512_mask_testn_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + let and = _mm512_and_epi64(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpeq_epi64_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testn_epi64_mask&expand=5925) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmq))] +pub fn _mm256_testn_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpeq_epi64_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_testn_epi64_mask&expand=5924) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmq))] +pub fn _mm256_mask_testn_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpeq_epi64_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testn_epi64_mask&expand=5923) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmq))] +pub fn _mm_testn_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpeq_epi64_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_testn_epi64_mask&expand=5922) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmq))] +pub fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpeq_epi64_mask(k, and, zero) +} + +/// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_ps&expand=5671) +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovntps))] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { + crate::arch::asm!( + vps!("vmovntps", ",{a}"), + p = in(reg) mem_addr, + a = in(zmm_reg) a, + options(nostack, preserves_flags), + ); +} + +/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_pd&expand=5667) +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovntpd))] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { + crate::arch::asm!( + vps!("vmovntpd", ",{a}"), + p = in(reg) mem_addr, + a = in(zmm_reg) a, + options(nostack, preserves_flags), + ); +} + +/// Store 512-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_si512&expand=5675) +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. 
 In
+/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
+/// return.
+///
+/// See [`_mm_sfence`] for details.
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vmovntdq))]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm512_stream_si512(mem_addr: *mut __m512i, a: __m512i) {
+    crate::arch::asm!(
+        vps!("vmovntdq", ",{a}"),
+        p = in(reg) mem_addr,
+        a = in(zmm_reg) a,
+        options(nostack, preserves_flags),
+    );
+}
+
+/// Load 512-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr
+/// must be aligned on a 64-byte boundary or a general-protection exception may be generated. To
+/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon)
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_load_si512)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_stream_load_si512(mem_addr: *const __m512i) -> __m512i {
+    let dst: __m512i;
+    crate::arch::asm!(
+        vpl!("vmovntdqa {a}"),
+        a = out(zmm_reg) dst,
+        p = in(reg) mem_addr,
+        options(pure, readonly, nostack, preserves_flags),
+    );
+    dst
+}
+
+/// Sets packed single-precision (32-bit) floating-point elements in `dst` with the supplied values.
+///
+/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_ps&expand=4931)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_set_ps(
+    e0: f32,
+    e1: f32,
+    e2: f32,
+    e3: f32,
+    e4: f32,
+    e5: f32,
+    e6: f32,
+    e7: f32,
+    e8: f32,
+    e9: f32,
+    e10: f32,
+    e11: f32,
+    e12: f32,
+    e13: f32,
+    e14: f32,
+    e15: f32,
+) -> __m512 {
+    _mm512_setr_ps(
+        e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0,
+    )
+}
+
+/// Sets packed single-precision (32-bit) floating-point elements in `dst` with the supplied values in
+/// reverse order.
+///
+/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr_ps&expand=5008)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_setr_ps(
+    e0: f32,
+    e1: f32,
+    e2: f32,
+    e3: f32,
+    e4: f32,
+    e5: f32,
+    e6: f32,
+    e7: f32,
+    e8: f32,
+    e9: f32,
+    e10: f32,
+    e11: f32,
+    e12: f32,
+    e13: f32,
+    e14: f32,
+    e15: f32,
+) -> __m512 {
+    unsafe {
+        let r = f32x16::new(
+            e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
+        );
+        transmute(r)
+    }
+}
+
+/// Broadcast 64-bit float `a` to all elements of `dst`.
+///
+/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set1_pd&expand=4975)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_set1_pd(a: f64) -> __m512d {
+    unsafe { transmute(f64x8::splat(a)) }
+}
+
+/// Broadcast 32-bit float `a` to all elements of `dst`.
+///
+/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set1_ps&expand=4981)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_set1_ps(a: f32) -> __m512 {
+    unsafe { transmute(f32x16::splat(a)) }
+}
+
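+// Sketch of the element-order difference between the two constructors above:
+// `_mm512_set_ps` takes its arguments from the highest lane down to lane 0, while
+// `_mm512_setr_ps` takes them in lane order. `example_set_ps_order` is a hypothetical
+// helper and assumes an AVX-512F capable target.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+fn example_set_ps_order() {
+    let hi_first = _mm512_set_ps(
+        15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0,
+    );
+    let lo_first = _mm512_setr_ps(
+        0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
+    );
+    // Both spell the same vector: lane i holds the value i as an f32.
+    assert_eq!(_mm512_cmpeq_ps_mask(hi_first, lo_first), 0xffff);
+}
+
+/// Sets packed 32-bit integers in `dst` with the supplied values.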
+/// +/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_epi32&expand=4908) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set_epi32( + e15: i32, + e14: i32, + e13: i32, + e12: i32, + e11: i32, + e10: i32, + e9: i32, + e8: i32, + e7: i32, + e6: i32, + e5: i32, + e4: i32, + e3: i32, + e2: i32, + e1: i32, + e0: i32, +) -> __m512i { + _mm512_setr_epi32( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ) +} + +/// Broadcast 8-bit integer a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set1_epi8&expand=4972) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set1_epi8(a: i8) -> __m512i { + unsafe { transmute(i8x64::splat(a)) } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set1_epi16&expand=4944) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set1_epi16(a: i16) -> __m512i { + unsafe { transmute(i16x32::splat(a)) } +} + +/// Broadcast 32-bit integer `a` to all elements of `dst`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set1_epi32(a: i32) -> __m512i { + unsafe { transmute(i32x16::splat(a)) } +} + +/// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_set1_epi32&expand=4951) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub fn _mm512_mask_set1_epi32(src: __m512i, k: __mmask16, a: i32) -> __m512i { + unsafe { + let r = _mm512_set1_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } +} + +/// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_set1_epi32&expand=4952) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub fn _mm512_maskz_set1_epi32(k: __mmask16, a: i32) -> __m512i { + unsafe { + let r = _mm512_set1_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } +} + +/// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_set1_epi32&expand=4948) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub fn _mm256_mask_set1_epi32(src: __m256i, k: __mmask8, a: i32) -> __m256i { + unsafe { + let r = _mm256_set1_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } +} + +/// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_set1_epi32&expand=4949) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub fn _mm256_maskz_set1_epi32(k: __mmask8, a: i32) -> __m256i { + unsafe { + let r = _mm256_set1_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } +} + +/// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_set1_epi32&expand=4945) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub fn _mm_mask_set1_epi32(src: __m128i, k: __mmask8, a: i32) -> __m128i { + unsafe { + let r = _mm_set1_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } +} + +/// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_set1_epi32&expand=4946) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub fn _mm_maskz_set1_epi32(k: __mmask8, a: i32) -> __m128i { + unsafe { + let r = _mm_set1_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } +} + +/// Broadcast 64-bit integer `a` to all elements of `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set1_epi64&expand=4961) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set1_epi64(a: i64) -> __m512i { + unsafe { transmute(i64x8::splat(a)) } +} + +/// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_set1_epi64&expand=4959) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub fn _mm512_mask_set1_epi64(src: __m512i, k: __mmask8, a: i64) -> __m512i { + unsafe { + let r = _mm512_set1_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, r, src.as_i64x8())) + } +} + +/// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_set1_epi64&expand=4960) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub fn _mm512_maskz_set1_epi64(k: __mmask8, a: i64) -> __m512i { + unsafe { + let r = _mm512_set1_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, r, i64x8::ZERO)) + } +} + +/// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_set1_epi64&expand=4957) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub fn _mm256_mask_set1_epi64(src: __m256i, k: __mmask8, a: i64) -> __m256i { + unsafe { + let r = _mm256_set1_epi64x(a).as_i64x4(); + transmute(simd_select_bitmask(k, r, src.as_i64x4())) + } +} + +/// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_set1_epi64&expand=4958) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub fn _mm256_maskz_set1_epi64(k: __mmask8, a: i64) -> __m256i { + unsafe { + let r = _mm256_set1_epi64x(a).as_i64x4(); + transmute(simd_select_bitmask(k, r, i64x4::ZERO)) + } +} + +/// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_set1_epi64&expand=4954) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub fn _mm_mask_set1_epi64(src: __m128i, k: __mmask8, a: i64) -> __m128i { + unsafe { + let r = _mm_set1_epi64x(a).as_i64x2(); + transmute(simd_select_bitmask(k, r, src.as_i64x2())) + } +} + +/// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_set1_epi64&expand=4955) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub fn _mm_maskz_set1_epi64(k: __mmask8, a: i64) -> __m128i { + unsafe { + let r = _mm_set1_epi64x(a).as_i64x2(); + transmute(simd_select_bitmask(k, r, i64x2::ZERO)) + } +} + +/// Set packed 64-bit integers in dst with the repeated 4 element sequence. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set4_epi64&expand=4983) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i { + _mm512_set_epi64(d, c, b, a, d, c, b, a) +} + +/// Set packed 64-bit integers in dst with the repeated 4 element sequence in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr4_epi64&expand=5010) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_setr4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i { + _mm512_set_epi64(a, b, c, d, a, b, c, d) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_ps_mask&expand=1074) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_cmplt_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask::<_CMP_LT_OS>(a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_ps_mask&expand=1075) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_mask_cmplt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask::<_CMP_LT_OS>(k1, a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpnlt_ps_mask&expand=1154) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_cmpnlt_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask::<_CMP_NLT_US>(a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpnlt_ps_mask&expand=1155) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_mask_cmpnlt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask::<_CMP_NLT_US>(k1, a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_ps_mask&expand=1013) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_cmple_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask::<_CMP_LE_OS>(a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_ps_mask&expand=1014) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_mask_cmple_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask::<_CMP_LE_OS>(k1, a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpnle_ps_mask&expand=1146) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_cmpnle_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask::<_CMP_NLE_US>(a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpnle_ps_mask&expand=1147) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_mask_cmpnle_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask::<_CMP_NLE_US>(k1, a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_ps_mask&expand=828)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub fn _mm512_cmpeq_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+    _mm512_cmp_ps_mask::<_CMP_EQ_OQ>(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_ps_mask&expand=829)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub fn _mm512_mask_cmpeq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+    _mm512_mask_cmp_ps_mask::<_CMP_EQ_OQ>(k1, a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_ps_mask&expand=1130)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub fn _mm512_cmpneq_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+    _mm512_cmp_ps_mask::<_CMP_NEQ_UQ>(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_ps_mask&expand=1131)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub fn _mm512_mask_cmpneq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+    _mm512_mask_cmp_ps_mask::<_CMP_NEQ_UQ>(k1, a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_ps_mask&expand=749)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm512_cmp_ps_mask<const IMM8: i32>(a: __m512, b: __m512) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let r = vcmpps(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION);
+        r.cast_unsigned()
+    }
+}
+
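+// Sketch of driving the general comparison above with one of the `_CMP_*` predicate
+// constants; the predicate is a const generic, so it is supplied with turbofish syntax.
+// `example_cmp_ps_mask` is a hypothetical helper for an AVX-512F capable target.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+fn example_cmp_ps_mask() {
+    let a = _mm512_setr_ps(
+        0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
+    );
+    let b = _mm512_set1_ps(8.0);
+    // Lanes 0..=7 of `a` are below 8.0, so the low 8 mask bits are set.
+    assert_eq!(_mm512_cmp_ps_mask::<_CMP_LT_OS>(a, b), 0x00ff);
+    // The named shorthand expands to exactly this predicate.
+    assert_eq!(_mm512_cmplt_ps_mask(a, b), 0x00ff);
+}
+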
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_ps_mask&expand=750)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm512_mask_cmp_ps_mask<const IMM8: i32>(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let r = vcmpps(a, b, IMM8, k1 as i16, _MM_FROUND_CUR_DIRECTION);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps_mask&expand=747)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm256_cmp_ps_mask<const IMM8: i32>(a: __m256, b: __m256) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let a = a.as_f32x8();
+        let b = b.as_f32x8();
+        let r = vcmpps256(a, b, IMM8, neg_one);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_ps_mask&expand=748)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm256_mask_cmp_ps_mask<const IMM8: i32>(k1: __mmask8, a: __m256, b: __m256) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let a = a.as_f32x8();
+        let b = b.as_f32x8();
+        let r = vcmpps256(a, b, IMM8, k1 as i8);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps_mask&expand=745)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm_cmp_ps_mask<const IMM8: i32>(a: __m128, b: __m128) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let a = a.as_f32x4();
+        let b = b.as_f32x4();
+        let r = vcmpps128(a, b, IMM8, neg_one);
+        r.cast_unsigned()
+    }
+}
+
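+// Sketch of the masked compare flavor: bits that are clear in the incoming mask `k1`
+// never appear in the result, whatever the comparison outcome. `example_mask_cmp_ps_mask`
+// is a hypothetical helper for an AVX-512F/AVX-512VL capable target.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512vl")]
+fn example_mask_cmp_ps_mask() {
+    let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+    let b = _mm256_set1_ps(4.0);
+    // Unmasked: lanes 0..=3 compare less-than, giving 0b0000_1111.
+    assert_eq!(_mm256_cmp_ps_mask::<_CMP_LT_OS>(a, b), 0b0000_1111);
+    // With k1 = 0b0000_0101 only lanes 0 and 2 may report a hit.
+    assert_eq!(_mm256_mask_cmp_ps_mask::<_CMP_LT_OS>(0b0000_0101, a, b), 0b0000_0101);
+}
+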
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_ps_mask&expand=746)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm_mask_cmp_ps_mask<const IMM8: i32>(k1: __mmask8, a: __m128, b: __m128) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let a = a.as_f32x4();
+        let b = b.as_f32x4();
+        let r = vcmpps128(a, b, IMM8, k1 as i8);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_round_ps_mask&expand=753)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(2, 3)]
+pub fn _mm512_cmp_round_ps_mask<const IMM5: i32, const SAE: i32>(
+    a: __m512,
+    b: __m512,
+) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_mantissas_sae!(SAE);
+        let neg_one = -1;
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let r = vcmpps(a, b, IMM5, neg_one, SAE);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_round_ps_mask&expand=754)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(3, 4)]
+pub fn _mm512_mask_cmp_round_ps_mask<const IMM5: i32, const SAE: i32>(
+    m: __mmask16,
+    a: __m512,
+    b: __m512,
+) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let r = vcmpps(a, b, IMM5, m as i16, SAE);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpord_ps_mask&expand=1162)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub fn _mm512_cmpord_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+    _mm512_cmp_ps_mask::<_CMP_ORD_Q>(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpord_ps_mask&expand=1163) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_mask_cmpord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask::<_CMP_ORD_Q>(k1, a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpunord_ps_mask&expand=1170) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_cmpunord_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask::<_CMP_UNORD_Q>(a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpunord_ps_mask&expand=1171) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_mask_cmpunord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask::<_CMP_UNORD_Q>(k1, a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_pd_mask&expand=1071) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_cmplt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask::<_CMP_LT_OS>(a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_pd_mask&expand=1072) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_mask_cmplt_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask::<_CMP_LT_OS>(k1, a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpnlt_pd_mask&expand=1151) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_cmpnlt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask::<_CMP_NLT_US>(a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpnlt_pd_mask&expand=1152) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_mask_cmpnlt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask::<_CMP_NLT_US>(m, a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_pd_mask&expand=1010) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_cmple_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask::<_CMP_LE_OS>(a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_pd_mask&expand=1011) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_mask_cmple_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask::<_CMP_LE_OS>(k1, a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpnle_pd_mask&expand=1143) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_cmpnle_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask::<_CMP_NLE_US>(a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpnle_pd_mask&expand=1144) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_mask_cmpnle_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask::<_CMP_NLE_US>(k1, a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_pd_mask&expand=822) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_cmpeq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask::<_CMP_EQ_OQ>(a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_pd_mask&expand=823) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_mask_cmpeq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask::<_CMP_EQ_OQ>(k1, a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_pd_mask&expand=1127) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_cmpneq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask::<_CMP_NEQ_UQ>(a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_pd_mask&expand=1128) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_mask_cmpneq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask::<_CMP_NEQ_UQ>(k1, a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
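+///
+/// A minimal usage sketch, assuming `avx512f` support has already been detected;
+/// the predicate is selected with the same `_CMP_*` constants used by the other
+/// `cmp` intrinsics in this module:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let k = unsafe {
+///     let a = _mm512_setr_pd(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+///     let b = _mm512_set1_pd(3.0);
+///     _mm512_cmp_pd_mask::<_CMP_GE_OQ>(a, b)
+/// };
+/// assert_eq!(k, 0b1111_1000); // lanes 3..=7 hold values >= 3.0
+/// ```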
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_pd_mask&expand=741)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm512_cmp_pd_mask<const IMM8: i32>(a: __m512d, b: __m512d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let r = vcmppd(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_pd_mask&expand=742)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm512_mask_cmp_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let r = vcmppd(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd_mask&expand=739)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm256_cmp_pd_mask<const IMM8: i32>(a: __m256d, b: __m256d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let a = a.as_f64x4();
+        let b = b.as_f64x4();
+        let r = vcmppd256(a, b, IMM8, neg_one);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_pd_mask&expand=740)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm256_mask_cmp_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m256d, b: __m256d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let a = a.as_f64x4();
+        let b = b.as_f64x4();
+        let r = vcmppd256(a, b, IMM8, k1 as i8);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd_mask&expand=737)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm_cmp_pd_mask<const IMM8: i32>(a: __m128d, b: __m128d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let a = a.as_f64x2();
+        let b = b.as_f64x2();
+        let r = vcmppd128(a, b, IMM8, neg_one);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_pd_mask&expand=738)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm_mask_cmp_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m128d, b: __m128d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let a = a.as_f64x2();
+        let b = b.as_f64x2();
+        let r = vcmppd128(a, b, IMM8, k1 as i8);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_round_pd_mask&expand=751)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(2, 3)]
+pub fn _mm512_cmp_round_pd_mask<const IMM5: i32, const SAE: i32>(
+    a: __m512d,
+    b: __m512d,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_mantissas_sae!(SAE);
+        let neg_one = -1;
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let r = vcmppd(a, b, IMM5, neg_one, SAE);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_round_pd_mask&expand=752)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(3, 4)]
+pub fn _mm512_mask_cmp_round_pd_mask<const IMM5: i32, const SAE: i32>(
+    k1: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let r = vcmppd(a, b, IMM5, k1 as i8, SAE);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpord_pd_mask&expand=1159)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub fn _mm512_cmpord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_cmp_pd_mask::<_CMP_ORD_Q>(a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpord_pd_mask&expand=1160)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub fn _mm512_mask_cmpord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_mask_cmp_pd_mask::<_CMP_ORD_Q>(k1, a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpunord_pd_mask&expand=1167)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub fn _mm512_cmpunord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_cmp_pd_mask::<_CMP_UNORD_Q>(a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpunord_pd_mask&expand=1168)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub fn _mm512_mask_cmpunord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_mask_cmp_pd_mask::<_CMP_UNORD_Q>(k1, a, b)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss_mask&expand=763)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm_cmp_ss_mask<const IMM8: i32>(a: __m128, b: __m128) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let r = vcmpss(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).
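+///
+/// A minimal usage sketch, assuming `avx512f` support has already been detected;
+/// only bit 0 of the returned mask is meaningful for the scalar comparison:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let k = unsafe {
+///     let a = _mm_set_ss(1.0);
+///     let b = _mm_set_ss(2.0);
+///     // with mask bit 0 cleared, the comparison result is forced to 0
+///     _mm_mask_cmp_ss_mask::<_CMP_LT_OQ>(0b0, a, b)
+/// };
+/// assert_eq!(k, 0);
+/// ```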
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_ss_mask&expand=764)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm_mask_cmp_ss_mask<const IMM8: i32>(k1: __mmask8, a: __m128, b: __m128) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let r = vcmpss(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_round_ss_mask&expand=757)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(2, 3)]
+pub fn _mm_cmp_round_ss_mask<const IMM5: i32, const SAE: i32>(a: __m128, b: __m128) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_mantissas_sae!(SAE);
+        let neg_one = -1;
+        let r = vcmpss(a, b, IMM5, neg_one, SAE);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_round_ss_mask&expand=758)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(3, 4)]
+pub fn _mm_mask_cmp_round_ss_mask<const IMM5: i32, const SAE: i32>(
+    k1: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_mantissas_sae!(SAE);
+        let r = vcmpss(a, b, IMM5, k1 as i8, SAE);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd_mask&expand=760)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm_cmp_sd_mask<const IMM8: i32>(a: __m128d, b: __m128d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let r = vcmpsd(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_sd_mask&expand=761)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm_mask_cmp_sd_mask<const IMM8: i32>(k1: __mmask8, a: __m128d, b: __m128d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let r = vcmpsd(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_round_sd_mask&expand=755)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(2, 3)]
+pub fn _mm_cmp_round_sd_mask<const IMM5: i32, const SAE: i32>(a: __m128d, b: __m128d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_mantissas_sae!(SAE);
+        let neg_one = -1;
+        let r = vcmpsd(a, b, IMM5, neg_one, SAE);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_round_sd_mask&expand=756)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(3, 4)]
+pub fn _mm_mask_cmp_round_sd_mask<const IMM5: i32, const SAE: i32>(
+    k1: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_mantissas_sae!(SAE);
+        let r = vcmpsd(a, b, IMM5, k1 as i8, SAE);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epu32_mask&expand=1056)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    unsafe { simd_bitmask::<u32x16, _>(simd_lt(a.as_u32x16(), b.as_u32x16())) }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
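+///
+/// A minimal usage sketch, assuming `avx512f` support has already been detected;
+/// the lanes are interpreted as unsigned, so `-1i32` (that is, `u32::MAX`) is not
+/// less than zero:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let k = unsafe {
+///     let a = _mm512_set1_epi32(-1); // u32::MAX in every lane
+///     let b = _mm512_setzero_si512();
+///     _mm512_mask_cmplt_epu32_mask(0xFFFF, a, b)
+/// };
+/// assert_eq!(k, 0); // no lane compares less-than when treated as unsigned
+/// ```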
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epu32_mask&expand=1057)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub fn _mm512_mask_cmplt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epu32_mask&expand=1054)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub fn _mm256_cmplt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+    unsafe { simd_bitmask::<u32x8, _>(simd_lt(a.as_u32x8(), b.as_u32x8())) }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epu32_mask&expand=1055)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub fn _mm256_mask_cmplt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+    _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epu32_mask&expand=1052)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub fn _mm_cmplt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<u32x4, _>(simd_lt(a.as_u32x4(), b.as_u32x4())) }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epu32_mask&expand=1053)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub fn _mm_mask_cmplt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epu32_mask&expand=933) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_cmpgt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_gt(a.as_u32x16(), b.as_u32x16())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epu32_mask&expand=934) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_mask_cmpgt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epu32_mask&expand=931) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_cmpgt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_gt(a.as_u32x8(), b.as_u32x8())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epu32_mask&expand=932) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epu32_mask&expand=929) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_cmpgt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_gt(a.as_u32x4(), b.as_u32x4())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epu32_mask&expand=930) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epu32_mask&expand=995) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_le(a.as_u32x16(), b.as_u32x16())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epu32_mask&expand=996) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_mask_cmple_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epu32_mask&expand=993) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_cmple_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_le(a.as_u32x8(), b.as_u32x8())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epu32_mask&expand=994) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_mask_cmple_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epu32_mask&expand=991) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_cmple_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_le(a.as_u32x4(), b.as_u32x4())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epu32_mask&expand=992) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_mask_cmple_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epu32_mask&expand=873) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_cmpge_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ge(a.as_u32x16(), b.as_u32x16())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epu32_mask&expand=874) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_mask_cmpge_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epu32_mask&expand=871) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_cmpge_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ge(a.as_u32x8(), b.as_u32x8())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epu32_mask&expand=872) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_mask_cmpge_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epu32_mask&expand=869) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_cmpge_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ge(a.as_u32x4(), b.as_u32x4())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epu32_mask&expand=870) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_mask_cmpge_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epu32_mask&expand=807) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_cmpeq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_eq(a.as_u32x16(), b.as_u32x16())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epu32_mask&expand=808) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_mask_cmpeq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epu32_mask&expand=805) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_cmpeq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_eq(a.as_u32x8(), b.as_u32x8())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epu32_mask&expand=806) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epu32_mask&expand=803) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_cmpeq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_eq(a.as_u32x4(), b.as_u32x4())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epu32_mask&expand=804) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epu32_mask&expand=1112) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_cmpneq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ne(a.as_u32x16(), b.as_u32x16())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epu32_mask&expand=1113) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_mask_cmpneq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epu32_mask&expand=1110) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_cmpneq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ne(a.as_u32x8(), b.as_u32x8())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epu32_mask&expand=1111) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epu32_mask&expand=1108) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_cmpneq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ne(a.as_u32x4(), b.as_u32x4())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epu32_mask&expand=1109) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
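+///
+/// A minimal usage sketch, assuming `avx512f` support has already been detected;
+/// the predicate is chosen with the `_MM_CMPINT_*` constants (here
+/// less-than-or-equal):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let k = unsafe {
+///     let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+///     let b = _mm512_set1_epi32(7);
+///     _mm512_cmp_epu32_mask::<_MM_CMPINT_LE>(a, b)
+/// };
+/// assert_eq!(k, 0x00FF); // lanes 0..=7 are <= 7
+/// ```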
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epu32_mask&expand=721)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm512_cmp_epu32_mask<const IMM3: i32>(a: __m512i, b: __m512i) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u32x16();
+        let b = b.as_u32x16();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i32x16::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i32x16::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epu32_mask&expand=722)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm512_mask_cmp_epu32_mask<const IMM3: i32>(
+    k1: __mmask16,
+    a: __m512i,
+    b: __m512i,
+) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u32x16();
+        let b = b.as_u32x16();
+        let k1 = simd_select_bitmask(k1, i32x16::splat(-1), i32x16::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i32x16::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epu32_mask&expand=719)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm256_cmp_epu32_mask<const IMM3: i32>(a: __m256i, b: __m256i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u32x8();
+        let b = b.as_u32x8();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i32x8::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i32x8::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epu32_mask&expand=720)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm256_mask_cmp_epu32_mask<const IMM3: i32>(
+    k1: __mmask8,
+    a: __m256i,
+    b: __m256i,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u32x8();
+        let b = b.as_u32x8();
+        let k1 = simd_select_bitmask(k1, i32x8::splat(-1), i32x8::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i32x8::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epu32_mask&expand=717)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm_cmp_epu32_mask<const IMM3: i32>(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u32x4();
+        let b = b.as_u32x4();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i32x4::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i32x4::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epu32_mask&expand=718)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm_mask_cmp_epu32_mask<const IMM3: i32>(
+    k1: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u32x4();
+        let b = b.as_u32x4();
+        let k1 = simd_select_bitmask(k1, i32x4::splat(-1), i32x4::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i32x4::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k.
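+///
+/// A minimal usage sketch, assuming `avx512f` support has already been detected;
+/// unlike the `epu32` variants above, the lanes are interpreted as signed, so a
+/// lane holding `-1` is less than zero:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let k = unsafe {
+///     let a = _mm512_set1_epi32(-1);
+///     let b = _mm512_setzero_si512();
+///     _mm512_cmplt_epi32_mask(a, b)
+/// };
+/// assert_eq!(k, 0xFFFF); // every signed lane is -1 < 0
+/// ```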
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epi32_mask&expand=1029)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    unsafe { simd_bitmask::<i32x16, _>(simd_lt(a.as_i32x16(), b.as_i32x16())) }
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epi32_mask&expand=1031)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm512_mask_cmplt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epi32_mask&expand=1027)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm256_cmplt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+    unsafe { simd_bitmask::<i32x8, _>(simd_lt(a.as_i32x8(), b.as_i32x8())) }
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epi32_mask&expand=1028)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm256_mask_cmplt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+    _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32_mask&expand=1025)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm_cmplt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epi32_mask&expand=1026)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm_mask_cmplt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epi32_mask&expand=905)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm512_cmpgt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    unsafe { simd_bitmask::<i32x16, _>(simd_gt(a.as_i32x16(), b.as_i32x16())) }
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epi32_mask&expand=906)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm512_mask_cmpgt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32_mask&expand=903)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm256_cmpgt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+    unsafe { simd_bitmask::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8())) }
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epi32_mask&expand=904)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm256_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+    _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32_mask&expand=901)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm_cmpgt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epi32_mask&expand=902)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epi32_mask&expand=971)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    unsafe { simd_bitmask::<i32x16, _>(simd_le(a.as_i32x16(), b.as_i32x16())) }
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epi32_mask&expand=972)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm512_mask_cmple_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epi32_mask&expand=969)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm256_cmple_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+    unsafe { simd_bitmask::<i32x8, _>(simd_le(a.as_i32x8(), b.as_i32x8())) }
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epi32_mask&expand=970)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm256_mask_cmple_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+    _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epi32_mask&expand=967)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm_cmple_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<i32x4, _>(simd_le(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epi32_mask&expand=968)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm_mask_cmple_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epi32_mask&expand=849)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm512_cmpge_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    unsafe { simd_bitmask::<i32x16, _>(simd_ge(a.as_i32x16(), b.as_i32x16())) }
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epi32_mask&expand=850)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm512_mask_cmpge_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epi32_mask&expand=847)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm256_cmpge_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+    unsafe { simd_bitmask::<i32x8, _>(simd_ge(a.as_i32x8(), b.as_i32x8())) }
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epi32_mask&expand=848)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm256_mask_cmpge_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+    _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epi32_mask&expand=845)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm_cmpge_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<i32x4, _>(simd_ge(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epi32_mask&expand=846)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm_mask_cmpge_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epi32_mask&expand=779)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm512_cmpeq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    unsafe { simd_bitmask::<i32x16, _>(simd_eq(a.as_i32x16(), b.as_i32x16())) }
+}
+
+/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
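+///
+/// Editor's sketch of the zeromask behaviour (illustrative only; the helper and the mask
+/// value are hypothetical): with k1 = 0b11 only lanes 0 and 1 are compared, so only bits
+/// 0 and 1 of the result can ever be set.
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// // Hypothetical helper restricting the equality test to the two lowest lanes.
+/// fn eq_low_two_lanes(a: __m512i, b: __m512i) -> __mmask16 {
+///     assert!(is_x86_feature_detected!("avx512f"));
+///     unsafe { _mm512_mask_cmpeq_epi32_mask(0b11, a, b) }
+/// }
+/// ```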
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epi32_mask&expand=780)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm512_mask_cmpeq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b)
+}
+
+/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32_mask&expand=777)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm256_cmpeq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+    unsafe { simd_bitmask::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8())) }
+}
+
+/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epi32_mask&expand=778)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm256_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+    _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b)
+}
+
+/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32_mask&expand=775)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm_cmpeq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epi32_mask&expand=776)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b)
+}
+
+/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epi32_mask&expand=1088)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm512_cmpneq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    unsafe { simd_bitmask::<i32x16, _>(simd_ne(a.as_i32x16(), b.as_i32x16())) }
+}
+
+/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epi32_mask&expand=1089)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm512_mask_cmpneq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epi32_mask&expand=1086)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm256_cmpneq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+    unsafe { simd_bitmask::<i32x8, _>(simd_ne(a.as_i32x8(), b.as_i32x8())) }
+}
+
+/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epi32_mask&expand=1087)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm256_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+    _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epi32_mask&expand=1084)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm_cmpneq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<i32x4, _>(simd_ne(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epi32_mask&expand=1085)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epi32_mask&expand=697)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm512_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m512i, b: __m512i) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i32x16::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i32x16::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epi32_mask&expand=698)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm512_mask_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(
+    k1: __mmask16,
+    a: __m512i,
+    b: __m512i,
+) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        let k1 = simd_select_bitmask(k1, i32x16::splat(-1), i32x16::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i32x16::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epi32_mask&expand=695)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm256_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m256i, b: __m256i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i32x8();
+        let b = b.as_i32x8();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i32x8::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i32x8::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epi32_mask&expand=696)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm256_mask_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(
+    k1: __mmask8,
+    a: __m256i,
+    b: __m256i,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i32x8();
+        let b = b.as_i32x8();
+        let k1 = simd_select_bitmask(k1, i32x8::splat(-1), i32x8::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i32x8::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epi32_mask&expand=693)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i32x4();
+        let b = b.as_i32x4();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i32x4::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i32x4::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
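+///
+/// Editor's sketch of how the predicate is passed as a const generic (the wrapper is
+/// hypothetical; requires AVX-512F and AVX-512VL at runtime):
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// // Hypothetical wrapper: masked less-than-or-equal comparison of four i32 lanes.
+/// fn masked_le(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+///     assert!(is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl"));
+///     unsafe { _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) }
+/// }
+/// ```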
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epi32_mask&expand=694)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm_mask_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(
+    k1: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i32x4();
+        let b = b.as_i32x4();
+        let k1 = simd_select_bitmask(k1, i32x4::splat(-1), i32x4::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i32x4::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epu64_mask&expand=1062)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub fn _mm512_cmplt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+    unsafe { simd_bitmask::<__m512i, _>(simd_lt(a.as_u64x8(), b.as_u64x8())) }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epu64_mask&expand=1063)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub fn _mm512_mask_cmplt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+    _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epu64_mask&expand=1060)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub fn _mm256_cmplt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+    unsafe { simd_bitmask::<__m256i, _>(simd_lt(a.as_u64x4(), b.as_u64x4())) }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epu64_mask&expand=1061)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub fn _mm256_mask_cmplt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+    _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epu64_mask&expand=1058) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_cmplt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_lt(a.as_u64x2(), b.as_u64x2())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epu64_mask&expand=1059) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_mask_cmplt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epu64_mask&expand=939) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_gt(a.as_u64x8(), b.as_u64x8())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epu64_mask&expand=940) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epu64_mask&expand=937) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_cmpgt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_gt(a.as_u64x4(), b.as_u64x4())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epu64_mask&expand=938) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epu64_mask&expand=935) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_cmpgt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_gt(a.as_u64x2(), b.as_u64x2())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epu64_mask&expand=936) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epu64_mask&expand=1001) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_le(a.as_u64x8(), b.as_u64x8())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epu64_mask&expand=1002) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_mask_cmple_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epu64_mask&expand=999) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_cmple_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_le(a.as_u64x4(), b.as_u64x4())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epu64_mask&expand=1000) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_mask_cmple_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epu64_mask&expand=997) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_cmple_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_le(a.as_u64x2(), b.as_u64x2())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epu64_mask&expand=998) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_mask_cmple_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epu64_mask&expand=879) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_ge(a.as_u64x8(), b.as_u64x8())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epu64_mask&expand=880) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_mask_cmpge_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epu64_mask&expand=877) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_cmpge_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_ge(a.as_u64x4(), b.as_u64x4())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epu64_mask&expand=878) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_mask_cmpge_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epu64_mask&expand=875) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_cmpge_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_ge(a.as_u64x2(), b.as_u64x2())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epu64_mask&expand=876) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_mask_cmpge_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k. 
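+///
+/// Editor's note with an illustrative sketch (the helper is hypothetical): for equality the
+/// unsigned and signed variants agree; signedness only matters for the ordering predicates.
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// // Hypothetical helper returning one bit per equal 64-bit lane.
+/// fn equal_lanes_u64(a: __m512i, b: __m512i) -> __mmask8 {
+///     assert!(is_x86_feature_detected!("avx512f"));
+///     unsafe { _mm512_cmpeq_epu64_mask(a, b) }
+/// }
+/// ```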
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epu64_mask&expand=813) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_eq(a.as_u64x8(), b.as_u64x8())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epu64_mask&expand=814) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epu64_mask&expand=811) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_cmpeq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_eq(a.as_u64x4(), b.as_u64x4())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epu64_mask&expand=812) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epu64_mask&expand=809) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_cmpeq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_eq(a.as_u64x2(), b.as_u64x2())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epu64_mask&expand=810) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epu64_mask&expand=1118) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_ne(a.as_u64x8(), b.as_u64x8())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epu64_mask&expand=1119) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epu64_mask&expand=1116) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_cmpneq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_ne(a.as_u64x4(), b.as_u64x4())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epu64_mask&expand=1117) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epu64_mask&expand=1114)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub fn _mm_cmpneq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<__m128i, _>(simd_ne(a.as_u64x2(), b.as_u64x2())) }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epu64_mask&expand=1115)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub fn _mm_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epu64_mask&expand=727)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm512_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m512i, b: __m512i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u64x8();
+        let b = b.as_u64x8();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i64x8::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i64x8::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epu64_mask&expand=728)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm512_mask_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(
+    k1: __mmask8,
+    a: __m512i,
+    b: __m512i,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u64x8();
+        let b = b.as_u64x8();
+        let k1 = simd_select_bitmask(k1, i64x8::splat(-1), i64x8::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i64x8::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
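+///
+/// Editor's sketch (the wrapper is hypothetical): passing `_MM_CMPINT_NLT` selects the
+/// unsigned greater-than-or-equal predicate, equivalent to `_mm256_cmpge_epu64_mask` above.
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// // Hypothetical wrapper built on the generic comparison intrinsic.
+/// fn ge_mask_u64(a: __m256i, b: __m256i) -> __mmask8 {
+///     assert!(is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl"));
+///     unsafe { _mm256_cmp_epu64_mask::<_MM_CMPINT_NLT>(a, b) }
+/// }
+/// ```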
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epu64_mask&expand=725)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm256_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m256i, b: __m256i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u64x4();
+        let b = b.as_u64x4();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i64x4::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i64x4::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epu64_mask&expand=726)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm256_mask_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(
+    k1: __mmask8,
+    a: __m256i,
+    b: __m256i,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u64x4();
+        let b = b.as_u64x4();
+        let k1 = simd_select_bitmask(k1, i64x4::splat(-1), i64x4::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i64x4::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epu64_mask&expand=723)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u64x2();
+        let b = b.as_u64x2();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i64x2::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i64x2::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epu64_mask&expand=724)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm_mask_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(
+    k1: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u64x2();
+        let b = b.as_u64x2();
+        let k1 = simd_select_bitmask(k1, i64x2::splat(-1), i64x2::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i64x2::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epi64_mask&expand=1037)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+    unsafe { simd_bitmask::<__m512i, _>(simd_lt(a.as_i64x8(), b.as_i64x8())) }
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epi64_mask&expand=1038)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub fn _mm512_mask_cmplt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+    _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epi64_mask&expand=1035)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub fn _mm256_cmplt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+    unsafe { simd_bitmask::<__m256i, _>(simd_lt(a.as_i64x4(), b.as_i64x4())) }
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epi64_mask&expand=1036)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub fn _mm256_mask_cmplt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+    _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi64_mask&expand=1033) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_cmplt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_lt(a.as_i64x2(), b.as_i64x2())) } +} + +/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epi64_mask&expand=1034) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_mask_cmplt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epi64_mask&expand=913) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_gt(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epi64_mask&expand=914) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64_mask&expand=911) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_cmpgt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epi64_mask&expand=912) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi64_mask&expand=909) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_cmpgt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_gt(a.as_i64x2(), b.as_i64x2())) } +} + +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epi64_mask&expand=910) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epi64_mask&expand=977) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_le(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epi64_mask&expand=978) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_mask_cmple_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epi64_mask&expand=975) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_cmple_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_le(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epi64_mask&expand=976) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_mask_cmple_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epi64_mask&expand=973) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_cmple_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_le(a.as_i64x2(), b.as_i64x2())) } +} + +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epi64_mask&expand=974) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_mask_cmple_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epi64_mask&expand=855) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_ge(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epi64_mask&expand=856) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_mask_cmpge_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epi64_mask&expand=853) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_cmpge_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_ge(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epi64_mask&expand=854) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_mask_cmpge_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epi64_mask&expand=851) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_cmpge_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_ge(a.as_i64x2(), b.as_i64x2())) } +} + +/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epi64_mask&expand=852) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_mask_cmpge_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epi64_mask&expand=787) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_eq(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epi64_mask&expand=788) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64_mask&expand=785) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_cmpeq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epi64_mask&expand=786) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64_mask&expand=783) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_cmpeq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_eq(a.as_i64x2(), b.as_i64x2())) } +} + +/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
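The 128-bit and 256-bit forms additionally require AVX-512VL, so a caller has to detect both features; a tentative sketch (`cmpeq128_demo` is hypothetical):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn cmpeq128_demo() -> __mmask8 {
    let a = _mm_set_epi64x(7, 42);
    let b = _mm_set_epi64x(8, 42);
    // Only lane 0 (the low element, 42) matches, so only bit 0 is set; the
    // upper six bits of the __mmask8 stay zero for a 2-lane comparison.
    _mm_cmpeq_epi64_mask(a, b) // == 0b0000_0001
}

fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        // SAFETY: both required features were detected at runtime.
        assert_eq!(unsafe { cmpeq128_demo() }, 0b0000_0001);
    }
}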
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epi64_mask&expand=784) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epi64_mask&expand=1094) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_ne(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epi64_mask&expand=1095) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epi64_mask&expand=1092) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_cmpneq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_ne(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epi64_mask&expand=1093) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epi64_mask&expand=1090)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub fn _mm_cmpneq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<__m128i, _>(simd_ne(a.as_i64x2(), b.as_i64x2())) }
+}
+
+/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epi64_mask&expand=1091)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub fn _mm_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epi64_mask&expand=703)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm512_cmp_epi64_mask<const IMM3: i32>(a: __m512i, b: __m512i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i64x8();
+        let b = b.as_i64x8();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i64x8::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i64x8::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epi64_mask&expand=704)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm512_mask_cmp_epi64_mask<const IMM3: i32>(
+    k1: __mmask8,
+    a: __m512i,
+    b: __m512i,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i64x8();
+        let b = b.as_i64x8();
+        let k1 = simd_select_bitmask(k1, i64x8::splat(-1), i64x8::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i64x8::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
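A sketch of how the `IMM3` predicate is normally supplied through the `_MM_CMPINT_*` constants, which correspond to the match arms above (assumes AVX-512F; `cmp_imm3_demo` is hypothetical):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn cmp_imm3_demo() {
    let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
    let b = _mm512_set1_epi64(4);
    // The IMM3 predicate selects the comparison: 0 = EQ, 1 = LT, 2 = LE,
    // 3 = FALSE, 4 = NE, 5 = NLT (>=), 6 = NLE (>), anything else = TRUE.
    let lt = _mm512_cmp_epi64_mask::<_MM_CMPINT_LT>(a, b);  // 0b0000_1111
    let ge = _mm512_cmp_epi64_mask::<_MM_CMPINT_NLT>(a, b); // 0b1111_0000
    assert_eq!(lt, _mm512_cmplt_epi64_mask(a, b));
    assert_eq!(lt | ge, 0xff);
}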
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epi64_mask&expand=701)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm256_cmp_epi64_mask<const IMM3: i32>(a: __m256i, b: __m256i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i64x4();
+        let b = b.as_i64x4();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i64x4::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i64x4::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epi64_mask&expand=702)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm256_mask_cmp_epi64_mask<const IMM3: i32>(
+    k1: __mmask8,
+    a: __m256i,
+    b: __m256i,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i64x4();
+        let b = b.as_i64x4();
+        let k1 = simd_select_bitmask(k1, i64x4::splat(-1), i64x4::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i64x4::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epi64_mask&expand=699)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm_cmp_epi64_mask<const IMM3: i32>(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i64x2();
+        let b = b.as_i64x2();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i64x2::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i64x2::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epi64_mask&expand=700)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm_mask_cmp_epi64_mask<const IMM3: i32>(
+    k1: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i64x2();
+        let b = b.as_i64x2();
+        let k1 = simd_select_bitmask(k1, i64x2::splat(-1), i64x2::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i64x2::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Reduce the packed 32-bit integers in a by addition. Returns the sum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_add_epi32&expand=4556)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_add_epi32(a: __m512i) -> i32 {
+    unsafe { simd_reduce_add_unordered(a.as_i32x16()) }
+}
+
+/// Reduce the packed 32-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_add_epi32&expand=4555)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 {
+    unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i32x16(), i32x16::ZERO)) }
+}
+
+/// Reduce the packed 64-bit integers in a by addition. Returns the sum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_add_epi64&expand=4558)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_add_epi64(a: __m512i) -> i64 {
+    unsafe { simd_reduce_add_unordered(a.as_i64x8()) }
+}
+
+/// Reduce the packed 64-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_add_epi64&expand=4557)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_add_epi64(k: __mmask8, a: __m512i) -> i64 {
+    unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i64x8(), i64x8::ZERO)) }
+}
+
+/// Reduce the packed single-precision (32-bit) floating-point elements in a by addition. Returns the sum of all elements in a.
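A small sketch of the scalar additive reductions, where the masked form substitutes 0 (the additive identity) for inactive lanes (assumes AVX-512F; `reduce_add_demo` is hypothetical):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn reduce_add_demo() {
    let a = _mm512_set1_epi32(3);
    assert_eq!(_mm512_reduce_add_epi32(a), 48); // 16 lanes * 3
    // With a mask, inactive lanes contribute 0, so only the 8 selected
    // lanes are summed.
    assert_eq!(_mm512_mask_reduce_add_epi32(0b1111_1111_0000_0000, a), 24);
}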
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_add_ps&expand=4562) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_add_ps(a: __m512) -> f32 { + unsafe { + // we have to use `simd_shuffle` here because `_mm512_extractf32x8_ps` is in AVX512DQ + let a = _mm256_add_ps( + simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + ); + let a = _mm_add_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); + let a = _mm_add_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); + simd_extract::<_, f32>(a, 0) + simd_extract::<_, f32>(a, 1) + } +} + +/// Reduce the packed single-precision (32-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_add_ps&expand=4561) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_add_ps(k: __mmask16, a: __m512) -> f32 { + unsafe { _mm512_reduce_add_ps(simd_select_bitmask(k, a, _mm512_setzero_ps())) } +} + +/// Reduce the packed double-precision (64-bit) floating-point elements in a by addition. Returns the sum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_add_pd&expand=4560) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_add_pd(a: __m512d) -> f64 { + unsafe { + let a = _mm256_add_pd( + _mm512_extractf64x4_pd::<0>(a), + _mm512_extractf64x4_pd::<1>(a), + ); + let a = _mm_add_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a)); + simd_extract::<_, f64>(a, 0) + simd_extract::<_, f64>(a, 1) + } +} + +/// Reduce the packed double-precision (64-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_add_pd&expand=4559) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_add_pd(k: __mmask8, a: __m512d) -> f64 { + unsafe { _mm512_reduce_add_pd(simd_select_bitmask(k, a, _mm512_setzero_pd())) } +} + +/// Reduce the packed 32-bit integers in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_mul_epi32&expand=4600) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_mul_epi32(a: __m512i) -> i32 { + unsafe { simd_reduce_mul_unordered(a.as_i32x16()) } +} + +/// Reduce the packed 32-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_mul_epi32&expand=4599) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_mul_epi32(k: __mmask16, a: __m512i) -> i32 { + unsafe { + simd_reduce_mul_unordered(simd_select_bitmask( + k, + a.as_i32x16(), + _mm512_set1_epi32(1).as_i32x16(), + )) + } +} + +/// Reduce the packed 64-bit integers in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_mul_epi64&expand=4602) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_mul_epi64(a: __m512i) -> i64 { + unsafe { simd_reduce_mul_unordered(a.as_i64x8()) } +} + +/// Reduce the packed 64-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_mul_epi64&expand=4601) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_mul_epi64(k: __mmask8, a: __m512i) -> i64 { + unsafe { + simd_reduce_mul_unordered(simd_select_bitmask( + k, + a.as_i64x8(), + _mm512_set1_epi64(1).as_i64x8(), + )) + } +} + +/// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_mul_ps&expand=4606) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_mul_ps(a: __m512) -> f32 { + unsafe { + // we have to use `simd_shuffle` here because `_mm512_extractf32x8_ps` is in AVX512DQ + let a = _mm256_mul_ps( + simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + ); + let a = _mm_mul_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); + let a = _mm_mul_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); + simd_extract::<_, f32>(a, 0) * simd_extract::<_, f32>(a, 1) + } +} + +/// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_mul_ps&expand=4605) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_mul_ps(k: __mmask16, a: __m512) -> f32 { + unsafe { _mm512_reduce_mul_ps(simd_select_bitmask(k, a, _mm512_set1_ps(1.))) } +} + +/// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication. Returns the product of all elements in a. 
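As the implementations above show, the masked multiply reductions substitute 1 for inactive lanes; a hedged sketch (assumes AVX-512F; `reduce_mul_demo` is hypothetical):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn reduce_mul_demo() {
    let a = _mm512_set1_epi64(2);
    assert_eq!(_mm512_reduce_mul_epi64(a), 256); // 2^8
    // The masked form substitutes the multiplicative identity (1) for
    // inactive lanes, so only the three selected lanes are multiplied.
    assert_eq!(_mm512_mask_reduce_mul_epi64(0b0000_0111, a), 8);
}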
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_mul_pd&expand=4604) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_mul_pd(a: __m512d) -> f64 { + unsafe { + let a = _mm256_mul_pd( + _mm512_extractf64x4_pd::<0>(a), + _mm512_extractf64x4_pd::<1>(a), + ); + let a = _mm_mul_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a)); + simd_extract::<_, f64>(a, 0) * simd_extract::<_, f64>(a, 1) + } +} + +/// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_mul_pd&expand=4603) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_mul_pd(k: __mmask8, a: __m512d) -> f64 { + unsafe { _mm512_reduce_mul_pd(simd_select_bitmask(k, a, _mm512_set1_pd(1.))) } +} + +/// Reduce the packed signed 32-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_epi32&expand=4576) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_max_epi32(a: __m512i) -> i32 { + unsafe { simd_reduce_max(a.as_i32x16()) } +} + +/// Reduce the packed signed 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_epi32&expand=4575) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_max_epi32(k: __mmask16, a: __m512i) -> i32 { + unsafe { + simd_reduce_max(simd_select_bitmask( + k, + a.as_i32x16(), + i32x16::splat(i32::MIN), + )) + } +} + +/// Reduce the packed signed 64-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_epi64&expand=4578) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_max_epi64(a: __m512i) -> i64 { + unsafe { simd_reduce_max(a.as_i64x8()) } +} + +/// Reduce the packed signed 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_epi64&expand=4577) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_max_epi64(k: __mmask8, a: __m512i) -> i64 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i64x8(), i64x8::splat(i64::MIN))) } +} + +/// Reduce the packed unsigned 32-bit integers in a by maximum. Returns the maximum of all elements in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_epu32&expand=4580) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_max_epu32(a: __m512i) -> u32 { + unsafe { simd_reduce_max(a.as_u32x16()) } +} + +/// Reduce the packed unsigned 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_epu32&expand=4579) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_max_epu32(k: __mmask16, a: __m512i) -> u32 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u32x16(), u32x16::ZERO)) } +} + +/// Reduce the packed unsigned 64-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_epu64&expand=4582) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_max_epu64(a: __m512i) -> u64 { + unsafe { simd_reduce_max(a.as_u64x8()) } +} + +/// Reduce the packed unsigned 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_epu64&expand=4581) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_max_epu64(k: __mmask8, a: __m512i) -> u64 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u64x8(), u64x8::ZERO)) } +} + +/// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_ps&expand=4586) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_max_ps(a: __m512) -> f32 { + unsafe { + let a = _mm256_max_ps( + simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + ); + let a = _mm_max_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); + let a = _mm_max_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); + _mm_cvtss_f32(_mm_max_ss(a, _mm_movehdup_ps(a))) + } +} + +/// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_ps&expand=4585) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_max_ps(k: __mmask16, a: __m512) -> f32 { + _mm512_reduce_max_ps(_mm512_mask_mov_ps(_mm512_set1_ps(f32::MIN), k, a)) +} + +/// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a. 
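A sketch of the max reductions, whose masked forms fill inactive lanes with the smallest value of the element type (assumes AVX-512F; `reduce_max_demo` is hypothetical):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn reduce_max_demo() {
    let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    assert_eq!(_mm512_reduce_max_epi32(a), 16);
    // Masked signed max fills inactive lanes with i32::MIN (the unsigned
    // variants use 0), so an all-zero mask yields that identity value.
    assert_eq!(_mm512_mask_reduce_max_epi32(0b0000_0000_0000_1111, a), 4);
    assert_eq!(_mm512_mask_reduce_max_epi32(0, a), i32::MIN);
}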
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_pd&expand=4584)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_max_pd(a: __m512d) -> f64 {
+    unsafe {
+        let a = _mm256_max_pd(
+            _mm512_extractf64x4_pd::<0>(a),
+            _mm512_extractf64x4_pd::<1>(a),
+        );
+        let a = _mm_max_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a));
+        _mm_cvtsd_f64(_mm_max_sd(a, simd_shuffle!(a, a, [1, 0])))
+    }
+}
+
+/// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_pd&expand=4583)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_max_pd(k: __mmask8, a: __m512d) -> f64 {
+    _mm512_reduce_max_pd(_mm512_mask_mov_pd(_mm512_set1_pd(f64::MIN), k, a))
+}
+
+/// Reduce the packed signed 32-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_epi32&expand=4588)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_min_epi32(a: __m512i) -> i32 {
+    unsafe { simd_reduce_min(a.as_i32x16()) }
+}
+
+/// Reduce the packed signed 32-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_epi32&expand=4587)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_min_epi32(k: __mmask16, a: __m512i) -> i32 {
+    unsafe {
+        simd_reduce_min(simd_select_bitmask(
+            k,
+            a.as_i32x16(),
+            i32x16::splat(i32::MAX),
+        ))
+    }
+}
+
+/// Reduce the packed signed 64-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_epi64&expand=4590)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_min_epi64(a: __m512i) -> i64 {
+    unsafe { simd_reduce_min(a.as_i64x8()) }
+}
+
+/// Reduce the packed signed 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_epi64&expand=4589)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_min_epi64(k: __mmask8, a: __m512i) -> i64 {
+    unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i64x8(), i64x8::splat(i64::MAX))) }
+}
+
+/// Reduce the packed unsigned 32-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_epu32&expand=4592)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_min_epu32(a: __m512i) -> u32 {
+    unsafe { simd_reduce_min(a.as_u32x16()) }
+}
+
+/// Reduce the packed unsigned 32-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_epu32&expand=4591)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_min_epu32(k: __mmask16, a: __m512i) -> u32 {
+    unsafe {
+        simd_reduce_min(simd_select_bitmask(
+            k,
+            a.as_u32x16(),
+            u32x16::splat(u32::MAX),
+        ))
+    }
+}
+
+/// Reduce the packed unsigned 64-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_epu64&expand=4594)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_min_epu64(a: __m512i) -> u64 {
+    unsafe { simd_reduce_min(a.as_u64x8()) }
+}
+
+/// Reduce the packed unsigned 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_epu64&expand=4589)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_min_epu64(k: __mmask8, a: __m512i) -> u64 {
+    unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u64x8(), u64x8::splat(u64::MAX))) }
+}
+
+/// Reduce the packed single-precision (32-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_ps&expand=4598)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_min_ps(a: __m512) -> f32 {
+    unsafe {
+        let a = _mm256_min_ps(
+            simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]),
+            simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]),
+        );
+        let a = _mm_min_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a));
+        let a = _mm_min_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1]));
+        _mm_cvtss_f32(_mm_min_ss(a, _mm_movehdup_ps(a)))
+    }
+}
+
+/// Reduce the packed single-precision (32-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_ps&expand=4597)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_min_ps(k: __mmask16, a: __m512) -> f32 {
+    _mm512_reduce_min_ps(_mm512_mask_mov_ps(_mm512_set1_ps(f32::MAX), k, a))
+}
+
+/// Reduce the packed double-precision (64-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.
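A sketch of the min reductions, whose masked forms fill inactive lanes with the largest value of the element type (assumes AVX-512F; `reduce_min_demo` is hypothetical):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn reduce_min_demo() {
    let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    assert_eq!(_mm512_reduce_min_epi32(a), 1);
    // Inactive lanes are replaced with the type's maximum value
    // (i32::MAX / u32::MAX / f32::MAX), the identity for `min`.
    assert_eq!(_mm512_mask_reduce_min_epi32(0b1111_0000_0000_0000, a), 13);
}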
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_pd&expand=4596)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_min_pd(a: __m512d) -> f64 {
+    unsafe {
+        let a = _mm256_min_pd(
+            _mm512_extractf64x4_pd::<0>(a),
+            _mm512_extractf64x4_pd::<1>(a),
+        );
+        let a = _mm_min_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a));
+        _mm_cvtsd_f64(_mm_min_sd(a, simd_shuffle!(a, a, [1, 0])))
+    }
+}
+
+/// Reduce the packed double-precision (64-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_pd&expand=4595)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_min_pd(k: __mmask8, a: __m512d) -> f64 {
+    _mm512_reduce_min_pd(_mm512_mask_mov_pd(_mm512_set1_pd(f64::MAX), k, a))
+}
+
+/// Reduce the packed 32-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_and_epi32&expand=4564)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_and_epi32(a: __m512i) -> i32 {
+    unsafe { simd_reduce_and(a.as_i32x16()) }
+}
+
+/// Reduce the packed 32-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_and_epi32&expand=4563)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_and_epi32(k: __mmask16, a: __m512i) -> i32 {
+    unsafe { simd_reduce_and(simd_select_bitmask(k, a.as_i32x16(), i32x16::splat(-1))) }
+}
+
+/// Reduce the packed 64-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_and_epi64&expand=4566)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_and_epi64(a: __m512i) -> i64 {
+    unsafe { simd_reduce_and(a.as_i64x8()) }
+}
+
+/// Reduce the packed 64-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_and_epi64&expand=4557)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_and_epi64(k: __mmask8, a: __m512i) -> i64 {
+    unsafe { simd_reduce_and(simd_select_bitmask(k, a.as_i64x8(), i64x8::splat(-1))) }
+}
+
+/// Reduce the packed 32-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_or_epi32&expand=4608) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_or_epi32(a: __m512i) -> i32 { + unsafe { simd_reduce_or(a.as_i32x16()) } +} + +/// Reduce the packed 32-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_or_epi32&expand=4607) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_or_epi32(k: __mmask16, a: __m512i) -> i32 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i32x16(), i32x16::ZERO)) } +} + +/// Reduce the packed 64-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_or_epi64&expand=4610) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_or_epi64(a: __m512i) -> i64 { + unsafe { simd_reduce_or(a.as_i64x8()) } +} + +/// Reduce the packed 64-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_or_epi64&expand=4609) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_or_epi64(k: __mmask8, a: __m512i) -> i64 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i64x8(), i64x8::ZERO)) } +} + +/// Returns vector of type `__m512d` with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_undefined_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +// This intrinsic has no corresponding instruction. +pub fn _mm512_undefined_pd() -> __m512d { + unsafe { const { mem::zeroed() } } +} + +/// Returns vector of type `__m512` with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_undefined_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +// This intrinsic has no corresponding instruction. +pub fn _mm512_undefined_ps() -> __m512 { + unsafe { const { mem::zeroed() } } +} + +/// Return vector of type __m512i with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. 
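A sketch of the bitwise reductions, where inactive lanes become all-ones for AND and zero for OR, the respective identities (assumes AVX-512F; `reduce_bitwise_demo` is hypothetical):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn reduce_bitwise_demo() {
    let a = _mm512_setr_epi32(1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3);
    assert_eq!(_mm512_reduce_and_epi32(a), 1);
    assert_eq!(_mm512_reduce_or_epi32(a), 3);
    // With a mask, inactive lanes cannot clear bits in the AND or set bits
    // in the OR, so only the selected lane contributes in each case.
    assert_eq!(_mm512_mask_reduce_and_epi32(0b0000_0000_0000_0010, a), 3);
    assert_eq!(_mm512_mask_reduce_or_epi32(0b0000_0000_0000_0001, a), 1);
}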
+/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_undefined_epi32&expand=5995) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +// This intrinsic has no corresponding instruction. +pub fn _mm512_undefined_epi32() -> __m512i { + unsafe { const { mem::zeroed() } } +} + +/// Return vector of type __m512 with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_undefined&expand=5994) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +// This intrinsic has no corresponding instruction. +pub fn _mm512_undefined() -> __m512 { + unsafe { const { mem::zeroed() } } +} + +/// Load 512-bits (composed of 16 packed 32-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi32&expand=3377) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm512_loadu_epi32(mem_addr: *const i32) -> __m512i { + ptr::read_unaligned(mem_addr as *const __m512i) +} + +/// Load 256-bits (composed of 8 packed 32-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi32&expand=3374) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm256_loadu_epi32(mem_addr: *const i32) -> __m256i { + ptr::read_unaligned(mem_addr as *const __m256i) +} + +/// Load 128-bits (composed of 4 packed 32-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi32&expand=3371) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm_loadu_epi32(mem_addr: *const i32) -> __m128i { + ptr::read_unaligned(mem_addr as *const __m128i) +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
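A tentative sketch of the unaligned load, which is `unsafe` because the caller must guarantee the pointer is valid for a full 64-byte read (assumes an x86_64 target with AVX-512F; `loadu_demo` is hypothetical):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn loadu_demo(src: &[i32; 16]) -> i32 {
    // `_mm512_loadu_epi32` has no alignment requirement, but the pointer must
    // still be readable for 16 i32 elements, which the fixed-size array guarantees.
    let v = _mm512_loadu_epi32(src.as_ptr());
    _mm512_reduce_add_epi32(v)
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        let data = [1i32; 16];
        // SAFETY: AVX-512F was detected and `data` is a valid 16-element buffer.
        assert_eq!(unsafe { loadu_demo(&data) }, 16);
    }
}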
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_storeu_epi16&expand=1460) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub unsafe fn _mm512_mask_cvtepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask16, a: __m512i) { + vpmovdwmem(mem_addr.cast(), a.as_i32x16(), k); +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_storeu_epi16&expand=1462) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub unsafe fn _mm256_mask_cvtepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) { + vpmovdwmem256(mem_addr.cast(), a.as_i32x8(), k); +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_storeu_epi16&expand=1461) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub unsafe fn _mm_mask_cvtepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) { + vpmovdwmem128(mem_addr.cast(), a.as_i32x4(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi32_storeu_epi16&expand=1833) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask16, a: __m512i) { + vpmovsdwmem(mem_addr.cast(), a.as_i32x16(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi32_storeu_epi16&expand=1832) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) { + vpmovsdwmem256(mem_addr.cast(), a.as_i32x8(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
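A hedged sketch of a masked truncating store, which writes only the elements whose mask bit is set and leaves the rest of the destination untouched (assumes AVX-512F; `masked_truncating_store_demo` is hypothetical):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn masked_truncating_store_demo() {
    let a = _mm512_set1_epi32(0x0001_0002);
    let mut out = [0i16; 16];
    // Only lanes 0 and 1 are truncated to 16 bits and written; the remaining
    // elements of `out` keep their previous contents.
    _mm512_mask_cvtepi32_storeu_epi16(out.as_mut_ptr(), 0b0000_0000_0000_0011, a);
    assert_eq!(out[0], 0x0002);
    assert_eq!(out[1], 0x0002);
    assert_eq!(out[2], 0);
}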
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi32_storeu_epi16&expand=1831) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub unsafe fn _mm_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) { + vpmovsdwmem128(mem_addr.cast(), a.as_i32x4(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi32_storeu_epi16&expand=2068) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask16, a: __m512i) { + vpmovusdwmem(mem_addr.cast(), a.as_i32x16(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi32_storeu_epi16&expand=2067) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) { + vpmovusdwmem256(mem_addr.cast(), a.as_i32x8(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi32_storeu_epi16&expand=2066) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub unsafe fn _mm_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) { + vpmovusdwmem128(mem_addr.cast(), a.as_i32x4(), k); +} + +/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_storeu_epi8&expand=1463) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdb))] +pub unsafe fn _mm512_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) { + vpmovdbmem(mem_addr, a.as_i32x16(), k); +} + +/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
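The down-converting stores above come in three flavors that differ only in how out-of-range lanes are narrowed: `vpmovdw` truncates, `vpmovsdw` saturates as signed, and `vpmovusdw` saturates as unsigned. A short sketch, editorial and not part of the diff (the `narrow_store` helper, the mask, and the constant are illustrative), using the 512-bit truncating form:

```rust
// Narrow sixteen i32 lanes to i16 and store only the lanes selected by the
// writemask; memory slots whose mask bit is 0 are left untouched.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn narrow_store(dst: &mut [i16; 16]) {
    use std::arch::x86_64::*;
    let a = _mm512_set1_epi32(0x0004_0001); // truncation keeps the low 16 bits (0x0001)
    // Write the even lanes only; the odd lanes of `dst` keep their previous values.
    _mm512_mask_cvtepi32_storeu_epi16(dst.as_mut_ptr(), 0b0101_0101_0101_0101, a);
}
```

With the same input, `_mm512_mask_cvtsepi32_storeu_epi16` would clamp each active lane to `i16::MAX` and `_mm512_mask_cvtusepi32_storeu_epi16` to `u16::MAX` instead of keeping the low 16 bits.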
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_storeu_epi8&expand=1462) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdb))] +pub unsafe fn _mm256_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovdbmem256(mem_addr, a.as_i32x8(), k); +} + +/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_storeu_epi8&expand=1461) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdb))] +pub unsafe fn _mm_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovdbmem128(mem_addr, a.as_i32x4(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi32_storeu_epi8&expand=1836) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) { + vpmovsdbmem(mem_addr, a.as_i32x16(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi32_storeu_epi8&expand=1835) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovsdbmem256(mem_addr, a.as_i32x8(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi32_storeu_epi8&expand=1834) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub unsafe fn _mm_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovsdbmem128(mem_addr, a.as_i32x4(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi32_storeu_epi8&expand=2071) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) { + vpmovusdbmem(mem_addr, a.as_i32x16(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi32_storeu_epi8&expand=2070) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovusdbmem256(mem_addr, a.as_i32x8(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi32_storeu_epi8&expand=2069) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub unsafe fn _mm_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovusdbmem128(mem_addr, a.as_i32x4(), k); +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_storeu_epi16&expand=1513) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub unsafe fn _mm512_mask_cvtepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m512i) { + vpmovqwmem(mem_addr.cast(), a.as_i64x8(), k); +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_storeu_epi16&expand=1512) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub unsafe fn _mm256_mask_cvtepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) { + vpmovqwmem256(mem_addr.cast(), a.as_i64x4(), k); +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_storeu_epi16&expand=1511) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub unsafe fn _mm_mask_cvtepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) { + vpmovqwmem128(mem_addr.cast(), a.as_i64x2(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_storeu_epi16&expand=1866) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m512i) { + vpmovsqwmem(mem_addr.cast(), a.as_i64x8(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_storeu_epi16&expand=1865) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) { + vpmovsqwmem256(mem_addr.cast(), a.as_i64x4(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_storeu_epi16&expand=1864) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub unsafe fn _mm_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) { + vpmovsqwmem128(mem_addr.cast(), a.as_i64x2(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_storeu_epi16&expand=2101) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m512i) { + vpmovusqwmem(mem_addr.cast(), a.as_i64x8(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_storeu_epi16&expand=2100) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) { + vpmovusqwmem256(mem_addr.cast(), a.as_i64x4(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_storeu_epi16&expand=2099) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub unsafe fn _mm_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) { + vpmovusqwmem128(mem_addr.cast(), a.as_i64x2(), k); +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_storeu_epi8&expand=1519) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub unsafe fn _mm512_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) { + vpmovqbmem(mem_addr, a.as_i64x8(), k); +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_storeu_epi8&expand=1518) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub unsafe fn _mm256_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovqbmem256(mem_addr, a.as_i64x4(), k); +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_storeu_epi8&expand=1517) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub unsafe fn _mm_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovqbmem128(mem_addr, a.as_i64x2(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_storeu_epi8&expand=1872) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) { + vpmovsqbmem(mem_addr, a.as_i64x8(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_storeu_epi8&expand=1871) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovsqbmem256(mem_addr, a.as_i64x4(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_storeu_epi8&expand=1870) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub unsafe fn _mm_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovsqbmem128(mem_addr, a.as_i64x2(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_storeu_epi8&expand=2107) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) { + vpmovusqbmem(mem_addr, a.as_i64x8(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_storeu_epi8&expand=2106) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovusqbmem256(mem_addr, a.as_i64x4(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_storeu_epi8&expand=2105)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovusqb))]
+pub unsafe fn _mm_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+    vpmovusqbmem128(mem_addr, a.as_i64x2(), k);
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_storeu_epi32&expand=1516)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm512_mask_cvtepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m512i) {
+    vpmovqdmem(mem_addr.cast(), a.as_i64x8(), k);
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_storeu_epi32&expand=1515)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm256_mask_cvtepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m256i) {
+    vpmovqdmem256(mem_addr.cast(), a.as_i64x4(), k);
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_storeu_epi32&expand=1514)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm_mask_cvtepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m128i) {
+    vpmovqdmem128(mem_addr.cast(), a.as_i64x2(), k);
+}
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_storeu_epi32&expand=1869)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsqd))]
+pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m512i) {
+    vpmovsqdmem(mem_addr.cast(), a.as_i64x8(), k);
+}
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
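The 64-bit-to-32-bit stores follow the same pattern with eight lanes selected by an `__mmask8`. A sketch, editorial and not part of the diff (the `narrow_qwords` helper, mask, and constant are illustrative), using the truncating form:

```rust
// Keep the low 32 bits of each selected i64 lane; unselected slots are untouched.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn narrow_qwords(dst: &mut [i32; 8]) {
    use std::arch::x86_64::*;
    let a = _mm512_set1_epi64(0x1_0000_0002); // truncates to 2 in every active lane
    // 0b0000_1111 selects the four low lanes; the upper half of `dst` is not written.
    _mm512_mask_cvtepi64_storeu_epi32(dst.as_mut_ptr(), 0b0000_1111, a);
}
```

The saturating `cvtsepi64`/`cvtusepi64` forms clamp the value instead of dropping the high 32 bits.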
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_storeu_epi32&expand=1868) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m256i) { + vpmovsqdmem256(mem_addr.cast(), a.as_i64x4(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_storeu_epi32&expand=1867) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub unsafe fn _mm_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m128i) { + vpmovsqdmem128(mem_addr.cast(), a.as_i64x2(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_storeu_epi32&expand=2104) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m512i) { + vpmovusqdmem(mem_addr.cast(), a.as_i64x8(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_storeu_epi32&expand=2103) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m256i) { + vpmovusqdmem256(mem_addr.cast(), a.as_i64x4(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_storeu_epi32&expand=2102) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub unsafe fn _mm_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m128i) { + vpmovusqdmem128(mem_addr.cast(), a.as_i64x2(), k); +} + +/// Store 512-bits (composed of 16 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi32&expand=5628) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm512_storeu_epi32(mem_addr: *mut i32, a: __m512i) { + ptr::write_unaligned(mem_addr as *mut __m512i, a); +} + +/// Store 256-bits (composed of 8 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi32&expand=5626) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm256_storeu_epi32(mem_addr: *mut i32, a: __m256i) { + ptr::write_unaligned(mem_addr as *mut __m256i, a); +} + +/// Store 128-bits (composed of 4 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi32&expand=5624) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm_storeu_epi32(mem_addr: *mut i32, a: __m128i) { + ptr::write_unaligned(mem_addr as *mut __m128i, a); +} + +/// Load 512-bits (composed of 8 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi64&expand=3386) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 +pub unsafe fn _mm512_loadu_epi64(mem_addr: *const i64) -> __m512i { + ptr::read_unaligned(mem_addr as *const __m512i) +} + +/// Load 256-bits (composed of 4 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi64&expand=3383) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 +pub unsafe fn _mm256_loadu_epi64(mem_addr: *const i64) -> __m256i { + ptr::read_unaligned(mem_addr as *const __m256i) +} + +/// Load 128-bits (composed of 2 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi64&expand=3380) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 +pub unsafe fn _mm_loadu_epi64(mem_addr: *const i64) -> __m128i { + ptr::read_unaligned(mem_addr as *const __m128i) +} + +/// Store 512-bits (composed of 8 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi64&expand=5634) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 +pub unsafe fn _mm512_storeu_epi64(mem_addr: *mut i64, a: __m512i) { + ptr::write_unaligned(mem_addr as *mut __m512i, a); +} + +/// Store 256-bits (composed of 4 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi64&expand=5632) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 +pub unsafe fn _mm256_storeu_epi64(mem_addr: *mut i64, a: __m256i) { + ptr::write_unaligned(mem_addr as *mut __m256i, a); +} + +/// Store 128-bits (composed of 2 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi64&expand=5630) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 +pub unsafe fn _mm_storeu_epi64(mem_addr: *mut i64, a: __m128i) { + ptr::write_unaligned(mem_addr as *mut __m128i, a); +} + +/// Load 512-bits of integer data from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_si512&expand=3420) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm512_loadu_si512(mem_addr: *const __m512i) -> __m512i { + ptr::read_unaligned(mem_addr) +} + +/// Store 512-bits of integer data from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_si512&expand=5657) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm512_storeu_si512(mem_addr: *mut __m512i, a: __m512i) { + ptr::write_unaligned(mem_addr, a); +} + +/// Loads 512-bits (composed of 8 packed double-precision (64-bit) +/// floating-point elements) from memory into result. +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] +pub unsafe fn _mm512_loadu_pd(mem_addr: *const f64) -> __m512d { + ptr::read_unaligned(mem_addr as *const __m512d) +} + +/// Stores 512-bits (composed of 8 packed double-precision (64-bit) +/// floating-point elements) from `a` into memory. +/// `mem_addr` does not need to be aligned on any particular boundary. 
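Unlike the element-typed `loadu_epi32`/`loadu_epi64` forms, `_mm512_loadu_si512` and `_mm512_storeu_si512` take vector-typed pointers, so element or byte pointers are cast at the call site; alignment is still not required. A sketch, editorial and not part of the diff (`copy_block` is an illustrative name):

```rust
// Copy 64 bytes through a single unaligned 512-bit load and store.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn copy_block(src: &[u8; 64], dst: &mut [u8; 64]) {
    use std::arch::x86_64::*;
    let v = _mm512_loadu_si512(src.as_ptr() as *const __m512i);
    _mm512_storeu_si512(dst.as_mut_ptr() as *mut __m512i, v);
}
```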
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] +pub unsafe fn _mm512_storeu_pd(mem_addr: *mut f64, a: __m512d) { + ptr::write_unaligned(mem_addr as *mut __m512d, a); +} + +/// Loads 512-bits (composed of 16 packed single-precision (32-bit) +/// floating-point elements) from memory into result. +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] +pub unsafe fn _mm512_loadu_ps(mem_addr: *const f32) -> __m512 { + ptr::read_unaligned(mem_addr as *const __m512) +} + +/// Stores 512-bits (composed of 16 packed single-precision (32-bit) +/// floating-point elements) from `a` into memory. +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] +pub unsafe fn _mm512_storeu_ps(mem_addr: *mut f32, a: __m512) { + ptr::write_unaligned(mem_addr as *mut __m512, a); +} + +/// Load 512-bits of integer data from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_load_si512&expand=3345) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm512_load_si512(mem_addr: *const __m512i) -> __m512i { + ptr::read(mem_addr) +} + +/// Store 512-bits of integer data from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_store_si512&expand=5598) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm512_store_si512(mem_addr: *mut __m512i, a: __m512i) { + ptr::write(mem_addr, a); +} + +/// Load 512-bits (composed of 16 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_load_epi32&expand=3304) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm512_load_epi32(mem_addr: *const i32) -> __m512i { + ptr::read(mem_addr as *const __m512i) +} + +/// Load 256-bits (composed of 8 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_epi32&expand=3301) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm256_load_epi32(mem_addr: *const i32) -> __m256i { + ptr::read(mem_addr as *const __m256i) +} + +/// Load 128-bits (composed of 4 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_epi32&expand=3298) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm_load_epi32(mem_addr: *const i32) -> __m128i { + ptr::read(mem_addr as *const __m128i) +} + +/// Store 512-bits (composed of 16 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_store_epi32&expand=5569) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm512_store_epi32(mem_addr: *mut i32, a: __m512i) { + ptr::write(mem_addr as *mut __m512i, a); +} + +/// Store 256-bits (composed of 8 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_epi32&expand=5567) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm256_store_epi32(mem_addr: *mut i32, a: __m256i) { + ptr::write(mem_addr as *mut __m256i, a); +} + +/// Store 128-bits (composed of 4 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_epi32&expand=5565) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm_store_epi32(mem_addr: *mut i32, a: __m128i) { + ptr::write(mem_addr as *mut __m128i, a); +} + +/// Load 512-bits (composed of 8 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_load_epi64&expand=3313) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 +pub unsafe fn _mm512_load_epi64(mem_addr: *const i64) -> __m512i { + ptr::read(mem_addr as *const __m512i) +} + +/// Load 256-bits (composed of 4 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_epi64&expand=3310) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 +pub unsafe fn _mm256_load_epi64(mem_addr: *const i64) -> __m256i { + ptr::read(mem_addr as *const __m256i) +} + +/// Load 128-bits (composed of 2 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_epi64&expand=3307) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 +pub unsafe fn _mm_load_epi64(mem_addr: *const i64) -> __m128i { + ptr::read(mem_addr as *const __m128i) +} + +/// Store 512-bits (composed of 8 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_store_epi64&expand=5575) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 +pub unsafe fn _mm512_store_epi64(mem_addr: *mut i64, a: __m512i) { + ptr::write(mem_addr as *mut __m512i, a); +} + +/// Store 256-bits (composed of 4 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
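The aligned `load`/`store` forms above require 64-byte alignment (32/16 bytes for the narrower vectors); one way to guarantee that in Rust is an over-aligned wrapper type. A sketch, editorial and not part of the diff (the `Aligned` wrapper and `aligned_roundtrip` are illustrative):

```rust
// A 64-byte-aligned buffer satisfies the contract of _mm512_load_epi32/_mm512_store_epi32.
#[cfg(target_arch = "x86_64")]
#[repr(align(64))]
struct Aligned([i32; 16]);

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn aligned_roundtrip(buf: &mut Aligned) {
    use std::arch::x86_64::*;
    // Field 0 sits at offset 0, so it inherits the struct's 64-byte alignment.
    let v = _mm512_load_epi32(buf.0.as_ptr());
    _mm512_store_epi32(buf.0.as_mut_ptr(), v);
}
```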
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_epi64&expand=5573)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(
+    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
+    assert_instr(vmovaps)
+)] //should be vmovdqa64
+pub unsafe fn _mm256_store_epi64(mem_addr: *mut i64, a: __m256i) {
+    ptr::write(mem_addr as *mut __m256i, a);
+}
+
+/// Store 128-bits (composed of 2 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_epi64&expand=5571)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(
+    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
+    assert_instr(vmovaps)
+)] //should be vmovdqa64
+pub unsafe fn _mm_store_epi64(mem_addr: *mut i64, a: __m128i) {
+    ptr::write(mem_addr as *mut __m128i, a);
+}
+
+/// Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_load_ps&expand=3336)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(
+    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
+    assert_instr(vmovaps)
+)]
+pub unsafe fn _mm512_load_ps(mem_addr: *const f32) -> __m512 {
+    ptr::read(mem_addr as *const __m512)
+}
+
+/// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_store_ps&expand=5592)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(
+    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
+    assert_instr(vmovaps)
+)]
+pub unsafe fn _mm512_store_ps(mem_addr: *mut f32, a: __m512) {
+    ptr::write(mem_addr as *mut __m512, a);
+}
+
+/// Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_load_pd&expand=3326)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(
+    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
+    assert_instr(vmovaps)
+)] //should be vmovapd
+pub unsafe fn _mm512_load_pd(mem_addr: *const f64) -> __m512d {
+    ptr::read(mem_addr as *const __m512d)
+}
+
+/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_store_pd&expand=5585) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovapd +pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) { + ptr::write(mem_addr as *mut __m512d, a); +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { + transmute(loaddqu32_512(mem_addr, src.as_i32x16(), k)) +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_loadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { + _mm512_mask_loadu_epi32(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { + transmute(loaddqu64_512(mem_addr, src.as_i64x8(), k)) +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { + _mm512_mask_loadu_epi64(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
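The masked loads above come in two flavors: the `mask_` form merges inactive lanes from `src`, while the `maskz_` form zeroes them. A sketch, editorial and not part of the diff (`masked_loads` and the mask value are illustrative), contrasting the two on the 512-bit integer load:

```rust
// Load only the low 8 of 16 i32 lanes; the high lanes either keep `src` or become 0.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn masked_loads(mem: &[i32; 16], merged: &mut [i32; 16], zeroed: &mut [i32; 16]) {
    use std::arch::x86_64::*;
    let src = _mm512_set1_epi32(-1);
    // mask_ form: the high 8 lanes stay -1 (copied from `src`).
    _mm512_storeu_epi32(merged.as_mut_ptr(), _mm512_mask_loadu_epi32(src, 0x00ff, mem.as_ptr()));
    // maskz_ form: the high 8 lanes become 0.
    _mm512_storeu_epi32(zeroed.as_mut_ptr(), _mm512_maskz_loadu_epi32(0x00ff, mem.as_ptr()));
}
```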
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_loadu_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { + transmute(loadups_512(mem_addr, src.as_f32x16(), k)) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_loadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { + _mm512_mask_loadu_ps(_mm512_setzero_ps(), k, mem_addr) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { + transmute(loadupd_512(mem_addr, src.as_f64x8(), k)) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { + _mm512_mask_loadu_pd(_mm512_setzero_pd(), k, mem_addr) +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_loadu_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { + transmute(loaddqu32_256(mem_addr, src.as_i32x8(), k)) +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { + _mm256_mask_loadu_epi32(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_loadu_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { + transmute(loaddqu64_256(mem_addr, src.as_i64x4(), k)) +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { + _mm256_mask_loadu_epi64(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_loadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { + transmute(loadups_256(mem_addr, src.as_f32x8(), k)) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { + _mm256_mask_loadu_ps(_mm256_setzero_ps(), k, mem_addr) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_loadu_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { + transmute(loadupd_256(mem_addr, src.as_f64x4(), k)) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { + _mm256_mask_loadu_pd(_mm256_setzero_pd(), k, mem_addr) +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { + transmute(loaddqu32_128(mem_addr, src.as_i32x4(), k)) +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { + _mm_mask_loadu_epi32(_mm_setzero_si128(), k, mem_addr) +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { + transmute(loaddqu64_128(mem_addr, src.as_i64x2(), k)) +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
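+///
+/// A minimal usage sketch (illustrative mask and data, assuming `avx512f`
+/// and `avx512vl` are available):
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// let data: [i64; 2] = [10, 20];
+/// // Lane 0 is loaded from memory; lane 1 is zeroed because its mask bit is clear.
+/// let v = unsafe { _mm_maskz_loadu_epi64(0b01, data.as_ptr()) };
+/// ```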
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { + _mm_mask_loadu_epi64(_mm_setzero_si128(), k, mem_addr) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { + transmute(loadups_128(mem_addr, src.as_f32x4(), k)) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { + _mm_mask_loadu_ps(_mm_setzero_ps(), k, mem_addr) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { + transmute(loadupd_128(mem_addr, src.as_f64x2(), k)) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { + _mm_mask_loadu_pd(_mm_setzero_pd(), k, mem_addr) +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
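+///
+/// A minimal usage sketch, assuming `avx512f` support; the wrapper type is
+/// only there to guarantee the required 64-byte alignment:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// #[repr(align(64))]
+/// struct Aligned([i32; 16]);
+///
+/// let data = Aligned([3; 16]);
+/// let src = unsafe { _mm512_set1_epi32(-1) };
+/// // The low eight lanes are loaded; the high eight lanes keep -1 from `src`.
+/// let v = unsafe { _mm512_mask_load_epi32(src, 0x00FF, data.0.as_ptr()) };
+/// ```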
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { + transmute(loaddqa32_512(mem_addr, src.as_i32x16(), k)) +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { + _mm512_mask_load_epi32(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { + transmute(loaddqa64_512(mem_addr, src.as_i64x8(), k)) +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { + _mm512_mask_load_epi64(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { + transmute(loadaps_512(mem_addr, src.as_f32x16(), k)) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). 
+/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { + _mm512_mask_load_ps(_mm512_setzero_ps(), k, mem_addr) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { + transmute(loadapd_512(mem_addr, src.as_f64x8(), k)) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { + _mm512_mask_load_pd(_mm512_setzero_pd(), k, mem_addr) +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { + transmute(loaddqa32_256(mem_addr, src.as_i32x8(), k)) +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { + _mm256_mask_load_epi32(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). 
+/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { + transmute(loaddqa64_256(mem_addr, src.as_i64x4(), k)) +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { + _mm256_mask_load_epi64(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { + transmute(loadaps_256(mem_addr, src.as_f32x8(), k)) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { + _mm256_mask_load_ps(_mm256_setzero_ps(), k, mem_addr) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
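+///
+/// A minimal usage sketch, assuming `avx512f` and `avx512vl`; the wrapper
+/// type only guarantees the required 32-byte alignment:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// #[repr(align(32))]
+/// struct Aligned([f64; 4]);
+///
+/// let data = Aligned([1.0, 2.0, 3.0, 4.0]);
+/// let src = unsafe { _mm256_setzero_pd() };
+/// // Lanes 0 and 1 come from the aligned buffer; lanes 2 and 3 stay 0.0.
+/// let v = unsafe { _mm256_mask_load_pd(src, 0b0011, data.0.as_ptr()) };
+/// ```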
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { + transmute(loadapd_256(mem_addr, src.as_f64x4(), k)) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { + _mm256_mask_load_pd(_mm256_setzero_pd(), k, mem_addr) +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { + transmute(loaddqa32_128(mem_addr, src.as_i32x4(), k)) +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { + _mm_mask_load_epi32(_mm_setzero_si128(), k, mem_addr) +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { + transmute(loaddqa64_128(mem_addr, src.as_i64x2(), k)) +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { + _mm_mask_load_epi64(_mm_setzero_si128(), k, mem_addr) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { + transmute(loadaps_128(mem_addr, src.as_f32x4(), k)) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { + _mm_mask_load_ps(_mm_setzero_ps(), k, mem_addr) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { + transmute(loadapd_128(mem_addr, src.as_f64x2(), k)) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
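+///
+/// A minimal usage sketch, assuming `avx512f` and `avx512vl`; the array is
+/// given the required 16-byte alignment via a wrapper type:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// #[repr(align(16))]
+/// struct Aligned([f64; 2]);
+///
+/// let data = Aligned([1.5, 2.5]);
+/// // Lane 0 is loaded; lane 1 is zeroed because its mask bit is clear.
+/// let v = unsafe { _mm_maskz_load_pd(0b01, data.0.as_ptr()) };
+/// ```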
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { + _mm_mask_load_pd(_mm_setzero_pd(), k, mem_addr) +} + +/// Load a single-precision (32-bit) floating-point element from memory into the lower element of dst +/// using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper +/// 3 packed elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection +/// exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss) +#[inline] +#[cfg_attr(test, assert_instr(vmovss))] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_load_ss(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128 = src; + asm!( + vpl!("vmovss {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + +/// Load a single-precision (32-bit) floating-point element from memory into the lower element of dst +/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper 3 packed +/// elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection +/// exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss) +#[inline] +#[cfg_attr(test, assert_instr(vmovss))] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_load_ss(k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128; + asm!( + vpl!("vmovss {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + +/// Load a double-precision (64-bit) floating-point element from memory into the lower element of dst +/// using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper +/// element of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection +/// exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd) +#[inline] +#[cfg_attr(test, assert_instr(vmovsd))] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_load_sd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d = src; + asm!( + vpl!("vmovsd {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + +/// Load a double-precision (64-bit) floating-point element from memory into the lower element of dst +/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper element +/// of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection exception +/// may be generated. 
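+///
+/// A minimal usage sketch, assuming `avx512f` support; only mask bit 0 is
+/// consulted and the value shown is illustrative:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// #[repr(align(16))]
+/// struct Aligned(f64);
+///
+/// let x = Aligned(4.25);
+/// // Bit 0 is set, so the lower element becomes 4.25; the upper element is zeroed.
+/// let v = unsafe { _mm_maskz_load_sd(0b1, &x.0) };
+/// ```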
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd) +#[inline] +#[cfg_attr(test, assert_instr(vmovsd))] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_load_sd(k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d; + asm!( + vpl!("vmovsd {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { + storedqu32_512(mem_addr, a.as_i32x16(), mask) +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { + storedqu64_512(mem_addr, a.as_i64x8(), mask) +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { + storeups_512(mem_addr, a.as_f32x16(), mask) +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { + storeupd_512(mem_addr, a.as_f64x8(), mask) +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
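+///
+/// A minimal usage sketch with illustrative values, assuming `avx512f` and
+/// `avx512vl` are available:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// let mut out: [i32; 8] = [0; 8];
+/// let a = unsafe { _mm256_set1_epi32(9) };
+/// // Only the low four lanes are written; out[4..] keeps its previous contents.
+/// unsafe { _mm256_mask_storeu_epi32(out.as_mut_ptr(), 0b0000_1111, a) };
+/// ```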
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { + storedqu32_256(mem_addr, a.as_i32x8(), mask) +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { + storedqu64_256(mem_addr, a.as_i64x4(), mask) +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { + storeups_256(mem_addr, a.as_f32x8(), mask) +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { + storeupd_256(mem_addr, a.as_f64x4(), mask) +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { + storedqu32_128(mem_addr, a.as_i32x4(), mask) +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { + storedqu64_128(mem_addr, a.as_i64x2(), mask) +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. 
+/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { + storeups_128(mem_addr, a.as_f32x4(), mask) +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { + storeupd_128(mem_addr, a.as_f64x2(), mask) +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { + storedqa32_512(mem_addr, a.as_i32x16(), mask) +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { + storedqa64_512(mem_addr, a.as_i64x8(), mask) +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { + storeaps_512(mem_addr, a.as_f32x16(), mask) +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
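+///
+/// A minimal usage sketch, assuming `avx512f`; the wrapper type provides the
+/// required 64-byte alignment for the destination:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// #[repr(align(64))]
+/// struct Aligned([f64; 8]);
+///
+/// let mut out = Aligned([0.0; 8]);
+/// let a = unsafe { _mm512_set1_pd(2.0) };
+/// // Only lanes 0..4 are written; the upper half of `out` is left untouched.
+/// unsafe { _mm512_mask_store_pd(out.0.as_mut_ptr(), 0b0000_1111, a) };
+/// ```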
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { + storeapd_512(mem_addr, a.as_f64x8(), mask) +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { + storedqa32_256(mem_addr, a.as_i32x8(), mask) +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { + storedqa64_256(mem_addr, a.as_i64x4(), mask) +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { + storeaps_256(mem_addr, a.as_f32x8(), mask) +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { + storeapd_256(mem_addr, a.as_f64x4(), mask) +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
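+///
+/// A minimal usage sketch, assuming `avx512f` and `avx512vl`; the wrapper
+/// type provides the required 16-byte alignment:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// #[repr(align(16))]
+/// struct Aligned([i32; 4]);
+///
+/// let mut out = Aligned([0; 4]);
+/// let a = unsafe { _mm_set1_epi32(7) };
+/// // Lanes 0 and 1 are written; lanes 2 and 3 of `out` are left untouched.
+/// unsafe { _mm_mask_store_epi32(out.0.as_mut_ptr(), 0b0011, a) };
+/// ```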
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { + storedqa32_128(mem_addr, a.as_i32x4(), mask) +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { + storedqa64_128(mem_addr, a.as_i64x2(), mask) +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { + storeaps_128(mem_addr, a.as_f32x4(), mask) +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { + storeapd_128(mem_addr, a.as_f64x2(), mask) +} + +/// Store a single-precision (32-bit) floating-point element from a into memory using writemask k. mem_addr +/// must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss) +#[inline] +#[cfg_attr(test, assert_instr(vmovss))] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_store_ss(mem_addr: *mut f32, k: __mmask8, a: __m128) { + asm!( + vps!("vmovss", "{{{k}}}, {a}"), + p = in(reg) mem_addr, + k = in(kreg) k, + a = in(xmm_reg) a, + options(nostack, preserves_flags), + ); +} + +/// Store a double-precision (64-bit) floating-point element from a into memory using writemask k. mem_addr +/// must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
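+///
+/// A minimal usage sketch, assuming `avx512f`; only mask bit 0 is consulted:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// #[repr(align(16))]
+/// struct Aligned(f64);
+///
+/// let mut out = Aligned(0.0);
+/// let a = unsafe { _mm_set1_pd(6.5) };
+/// // Bit 0 is set, so the lower element of `a` is written to `out`.
+/// unsafe { _mm_mask_store_sd(&mut out.0, 0b1, a) };
+/// ```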
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd) +#[inline] +#[cfg_attr(test, assert_instr(vmovsd))] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_store_sd(mem_addr: *mut f64, k: __mmask8, a: __m128d) { + asm!( + vps!("vmovsd", "{{{k}}}, {a}"), + p = in(reg) mem_addr, + k = in(kreg) k, + a = in(xmm_reg) a, + options(nostack, preserves_flags), + ); +} + +/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpexpandd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_expandloadu_epi32( + src: __m512i, + k: __mmask16, + mem_addr: *const i32, +) -> __m512i { + transmute(expandloadd_512(mem_addr, src.as_i32x16(), k)) +} + +/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpexpandd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_expandloadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { + _mm512_mask_expandloadu_epi32(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_expandloadu_epi32( + src: __m256i, + k: __mmask8, + mem_addr: *const i32, +) -> __m256i { + transmute(expandloadd_256(mem_addr, src.as_i32x8(), k)) +} + +/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
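+///
+/// A minimal usage sketch with illustrative values, assuming `avx512f` and
+/// `avx512vl`; note that only as many elements are read from memory as there
+/// are set bits in the mask:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// let data: [i32; 3] = [10, 20, 30];
+/// // Mask 0b0001_0101 has three set bits, so 10, 20 and 30 are expanded into
+/// // lanes 0, 2 and 4; all other lanes are zeroed.
+/// let v = unsafe { _mm256_maskz_expandloadu_epi32(0b0001_0101, data.as_ptr()) };
+/// ```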
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { + _mm256_mask_expandloadu_epi32(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_expandloadu_epi32( + src: __m128i, + k: __mmask8, + mem_addr: *const i32, +) -> __m128i { + transmute(expandloadd_128(mem_addr, src.as_i32x4(), k)) +} + +/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { + _mm_mask_expandloadu_epi32(_mm_setzero_si128(), k, mem_addr) +} + +/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpexpandq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_expandloadu_epi64( + src: __m512i, + k: __mmask8, + mem_addr: *const i64, +) -> __m512i { + transmute(expandloadq_512(mem_addr, src.as_i64x8(), k)) +} + +/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpexpandq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { + _mm512_mask_expandloadu_epi64(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_expandloadu_epi64( + src: __m256i, + k: __mmask8, + mem_addr: *const i64, +) -> __m256i { + transmute(expandloadq_256(mem_addr, src.as_i64x4(), k)) +} + +/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { + _mm256_mask_expandloadu_epi64(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_expandloadu_epi64( + src: __m128i, + k: __mmask8, + mem_addr: *const i64, +) -> __m128i { + transmute(expandloadq_128(mem_addr, src.as_i64x2(), k)) +} + +/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { + _mm_mask_expandloadu_epi64(_mm_setzero_si128(), k, mem_addr) +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vexpandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_expandloadu_ps( + src: __m512, + k: __mmask16, + mem_addr: *const f32, +) -> __m512 { + transmute(expandloadps_512(mem_addr, src.as_f32x16(), k)) +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vexpandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_expandloadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { + _mm512_mask_expandloadu_ps(_mm512_setzero_ps(), k, mem_addr) +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_expandloadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { + transmute(expandloadps_256(mem_addr, src.as_f32x8(), k)) +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { + _mm256_mask_expandloadu_ps(_mm256_setzero_ps(), k, mem_addr) +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_expandloadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { + transmute(expandloadps_128(mem_addr, src.as_f32x4(), k)) +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { + _mm_mask_expandloadu_ps(_mm_setzero_ps(), k, mem_addr) +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vexpandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_expandloadu_pd( + src: __m512d, + k: __mmask8, + mem_addr: *const f64, +) -> __m512d { + transmute(expandloadpd_512(mem_addr, src.as_f64x8(), k)) +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vexpandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { + _mm512_mask_expandloadu_pd(_mm512_setzero_pd(), k, mem_addr) +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_expandloadu_pd( + src: __m256d, + k: __mmask8, + mem_addr: *const f64, +) -> __m256d { + transmute(expandloadpd_256(mem_addr, src.as_f64x4(), k)) +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { + _mm256_mask_expandloadu_pd(_mm256_setzero_pd(), k, mem_addr) +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_expandloadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { + transmute(expandloadpd_128(mem_addr, src.as_f64x2(), k)) +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { + _mm_mask_expandloadu_pd(_mm_setzero_pd(), k, mem_addr) +} + +/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order. 
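+// Illustrative note (not from the upstream source): `_mm512_setr_pd` takes its
+// arguments in memory order (the first argument lands in the lowest lane),
+// while `_mm512_set_pd` below takes them in reverse, mirroring the existing
+// `_mm_set*`/`_mm_setr*` convention. Inside an AVX-512F-enabled function:
+//
+//     let a = _mm512_setr_pd(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+//     let b = _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
+//     // `a` and `b` hold the same values: lane 0 is 0.0 and lane 7 is 7.0.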
+/// +/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr_pd&expand=5002) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_setr_pd( + e0: f64, + e1: f64, + e2: f64, + e3: f64, + e4: f64, + e5: f64, + e6: f64, + e7: f64, +) -> __m512d { + unsafe { + let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); + transmute(r) + } +} + +/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values. +/// +/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_pd&expand=4924) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set_pd( + e0: f64, + e1: f64, + e2: f64, + e3: f64, + e4: f64, + e5: f64, + e6: f64, + e7: f64, +) -> __m512d { + _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) +} + +/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_move_ss&expand=3832) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovss))] +pub fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let extractsrc: f32 = simd_extract!(src, 0); + let mut mov: f32 = extractsrc; + if (k & 0b00000001) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) + } +} + +/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_move_ss&expand=3833) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovss))] +pub fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mut mov: f32 = 0.; + if (k & 0b00000001) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) + } +} + +/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_move_sd&expand=3829) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsd))] +pub fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let extractsrc: f64 = simd_extract!(src, 0); + let mut mov: f64 = extractsrc; + if (k & 0b00000001) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) + } +} + +/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_move_sd&expand=3830) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsd))] +pub fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mut mov: f64 = 0.; + if (k & 0b00000001) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) + } +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_add_ss&expand=159) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddss))] +pub fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let extractsrc: f32 = simd_extract!(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta + extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_add_ss&expand=160) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddss))] +pub fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta + extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
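+// Illustrative sketch (not from the upstream source) of the writemask
+// behaviour shared by these scalar mask/maskz arithmetic intrinsics, assuming
+// an `#[target_feature(enable = "avx512f")]` context:
+//
+//     let src = _mm_set_pd(-1.0, -2.0); // lane 1 = -1.0, lane 0 = -2.0
+//     let a = _mm_set_pd(10.0, 3.0);
+//     let b = _mm_set_pd(20.0, 4.0);
+//     // Mask bit 0 is clear, so lane 0 of the result comes from `src` (-2.0)
+//     // rather than 3.0 + 4.0; lane 1 is always copied from `a` (10.0).
+//     let r = _mm_mask_add_sd(src, 0b0, a, b);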
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_add_sd&expand=155) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddsd))] +pub fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let extractsrc: f64 = simd_extract!(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta + extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_add_sd&expand=156) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddsd))] +pub fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta + extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sub_ss&expand=5750) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubss))] +pub fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let extractsrc: f32 = simd_extract!(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta - extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sub_ss&expand=5751) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubss))] +pub fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta - extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sub_sd&expand=5746) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubsd))] +pub fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let extractsrc: f64 = simd_extract!(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta - extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sub_sd&expand=5747) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubsd))] +pub fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta - extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_mul_ss&expand=3950) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulss))] +pub fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let extractsrc: f32 = simd_extract!(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta * extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_mul_ss&expand=3951) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulss))] +pub fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta * extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_mul_sd&expand=3947) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulsd))] +pub fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let extractsrc: f64 = simd_extract!(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta * extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_mul_sd&expand=3948) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulsd))] +pub fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta * extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_div_ss&expand=2181) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivss))] +pub fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let extractsrc: f32 = simd_extract!(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta / extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_div_ss&expand=2182) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivss))] +pub fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta / extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_div_sd&expand=2178) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivsd))] +pub fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let extractsrc: f64 = simd_extract!(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta / extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_div_sd&expand=2179) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivsd))] +pub fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta / extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_max_ss&expand=3672) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxss))] +pub fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vmaxss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_max_ss&expand=3673) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxss))] +pub fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vmaxss( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
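+// Illustrative sketch (not from the upstream source), assuming an
+// `#[target_feature(enable = "avx512f")]` context:
+//
+//     let src = _mm_set_pd(0.0, -7.0);
+//     let a = _mm_set_pd(8.0, 1.0);
+//     let b = _mm_set_pd(9.0, 2.0);
+//     // Mask bit 0 is set: lane 0 = max(1.0, 2.0) = 2.0, lane 1 = 8.0 (from `a`).
+//     // With mask 0b0 the lower lane would instead be copied from `src` (-7.0).
+//     let r = _mm_mask_max_sd(src, 0b1, a, b);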
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_max_sd&expand=3669) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxsd))] +pub fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vmaxsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_max_sd&expand=3670) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxsd))] +pub fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vmaxsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_min_ss&expand=3786) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminss))] +pub fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vminss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_min_ss&expand=3787) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminss))] +pub fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vminss( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_min_sd&expand=3783) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminsd))] +pub fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vminsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_min_sd&expand=3784) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminsd))] +pub fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vminsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sqrt_ss&expand=5387) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtss))] +pub fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { vsqrtss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sqrt_ss&expand=5388) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtss))] +pub fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { vsqrtss(a, b, _mm_setzero_ps(), k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sqrt_sd&expand=5384) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtsd))] +pub fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { vsqrtsd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sqrt_sd&expand=5385) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtsd))] +pub fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { vsqrtsd(a, b, _mm_setzero_pd(), k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_rsqrt14_ss&expand=4825) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, 0b1)) } +} + +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_rsqrt14_ss&expand=4823) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_rsqrt14_ss&expand=4824) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_rsqrt14_sd&expand=4822) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, 0b1)) } +} + +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_rsqrt14_sd&expand=4820) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) } +} + +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_rsqrt14_sd&expand=4821) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, k)) } +} + +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
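+// Illustrative sketch (not from the upstream source): the `rcp14`/`rsqrt14`
+// intrinsics return hardware approximations, so tests should allow a relative
+// error of up to 2^-14 instead of comparing exactly. Assuming an
+// `#[target_feature(enable = "avx512f")]` context:
+//
+//     let a = _mm_set_ps(0.0, 0.0, 0.0, 1.0);
+//     let b = _mm_set_ps(0.0, 0.0, 0.0, 4.0);
+//     let r = _mm_rcp14_ss(a, b); // lane 0 ~= 0.25, upper lanes copied from `a`
+//     let lane0 = _mm_cvtss_f32(r);
+//     assert!((lane0 - 0.25).abs() <= 0.25 / 16384.0); // 2^-14 relative error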
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_rcp14_ss&expand=4508) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, 0b1)) } +} + +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_rcp14_ss&expand=4506) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_rcp14_ss&expand=4507) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_rcp14_sd&expand=4505) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, 0b1)) } +} + +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_rcp14_sd&expand=4503) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) } +} + +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_rcp14_sd&expand=4504) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, k)) } +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getexp_ss&expand=2862) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpss))] +pub fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + 0b1, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getexp_ss&expand=2863) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpss))] +pub fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. 
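+// Illustrative sketch (not from the upstream source): `getexp` returns the
+// unbiased exponent as a float, i.e. floor(log2(|x|)). Assuming an
+// `#[target_feature(enable = "avx512f")]` context:
+//
+//     let a = _mm_set_ss(0.0);
+//     let b = _mm_set_ss(20.0);
+//     // Mask bit 0 is set, so lane 0 = 4.0 because 2^4 <= 20 < 2^5.
+//     let r = _mm_maskz_getexp_ss(0b1, a, b);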
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getexp_ss&expand=2864) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpss))] +pub fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + k, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getexp_sd&expand=2859) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + 0b1, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getexp_sd&expand=2860) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getexp_sd&expand=2861) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + k, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getmant_ss&expand=2898) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm_getmant_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetmantss( + a, + b, + SIGN << 2 | NORM, + f32x4::ZERO, + 0b1, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
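+// Worked example (not from the upstream source or Intel's docs): with the
+// `_MM_MANT_NORM_1_2` interval and the sign taken from the source, a lower
+// element of -12.0 (= -1.5 * 2^3) is normalized to -1.5; the upper elements
+// are copied from `a` as described above.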
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getmant_ss&expand=2899) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_mask_getmant_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getmant_ss&expand=2900) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_maskz_getmant_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetmantss( + a, + b, + SIGN << 2 | NORM, + f32x4::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getmant_sd&expand=2895) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm_getmant_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetmantsd( + a, + b, + SIGN << 2 | NORM, + f64x2::ZERO, + 0b1, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getmant_sd&expand=2896) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_mask_getmant_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getmant_sd&expand=2897) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_maskz_getmant_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetmantsd( + a, + b, + SIGN << 2 | NORM, + f64x2::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_roundscale_ss&expand=4802) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 255))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_roundscale_ss<const IMM8: i32>(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vrndscaless( + a, + b, + f32x4::ZERO, + 0b11111111, + IMM8, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_roundscale_ss&expand=4800) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_roundscale_ss<const IMM8: i32>( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vrndscaless(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k 
(the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_roundscale_ss&expand=4801) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_roundscale_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vrndscaless(a, b, f32x4::ZERO, k, IMM8, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_roundscale_sd&expand=4799) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 255))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_roundscale_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vrndscalesd( + a, + b, + f64x2::ZERO, + 0b11111111, + IMM8, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_roundscale_sd&expand=4797) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_roundscale_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + 
let src = src.as_f64x2(); + let r = vrndscalesd(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_roundscale_sd&expand=4798) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_roundscale_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vrndscalesd(a, b, f64x2::ZERO, k, IMM8, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_scalef_ss&expand=4901) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefss))] +pub fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + let a = a.as_f32x4(); + let b = b.as_f32x4(); + transmute(vscalefss( + a, + b, + f32x4::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_scalef_ss&expand=4899) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefss))] +pub fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + transmute(vscalefss(a, b, src, k, _MM_FROUND_CUR_DIRECTION)) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
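+///
+/// A minimal usage sketch (illustrative values), assuming AVX512F is enabled; the lower
+/// lane becomes a * 2^floor(b) when mask bit 0 is set and 0.0 otherwise:
+///
+/// ```ignore
+/// let a = _mm_set_ps(7.0, 7.0, 7.0, 3.0); // lower lane = 3.0
+/// let b = _mm_set_ss(4.0);                // lower lane = 4.0
+/// let r = _mm_maskz_scalef_ss(0b1, a, b); // lower lane: 3.0 * 2^4 = 48.0
+/// let z = _mm_maskz_scalef_ss(0b0, a, b); // lower lane: 0.0
+/// // the upper three lanes of both results are copied from `a` (all 7.0).
+/// ```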
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_scalef_ss&expand=4900) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefss))] +pub fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vscalefss( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_scalef_sd&expand=4898) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefsd))] +pub fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_scalef_sd&expand=4896) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefsd))] +pub fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_scalef_sd&expand=4897) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefsd))] +pub fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
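+///
+/// A minimal usage sketch (illustrative values), assuming AVX512F is enabled, showing how
+/// the writemask selects between the fused result and the lower lane of `a`:
+///
+/// ```ignore
+/// let a = _mm_set_ss(2.0);
+/// let b = _mm_set_ss(3.0);
+/// let c = _mm_set_ss(4.0);
+/// let r = _mm_mask_fmadd_ss(a, 0b1, b, c); // lower lane: 2.0 * 3.0 + 4.0 = 10.0
+/// let p = _mm_mask_fmadd_ss(a, 0b0, b, c); // lower lane: 2.0, passed through from `a`
+/// ```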
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmadd_ss&expand=2582) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] +pub fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fmadd: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fmadd = fmaf32(fmadd, extractb, extractc); + } + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmadd_ss&expand=2584) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] +pub fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fmadd = fmaf32(extracta, extractb, extractc); + } + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmadd_ss&expand=2583) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] +pub fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { + let mut fmadd: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + fmadd = fmaf32(extracta, extractb, fmadd); + } + simd_insert!(c, 0, fmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
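+///
+/// A minimal usage sketch (illustrative values), assuming AVX512F is enabled:
+///
+/// ```ignore
+/// let (a, b, c) = (_mm_set_sd(2.0), _mm_set_sd(3.0), _mm_set_sd(4.0));
+/// let r = _mm_mask_fmadd_sd(a, 0b1, b, c); // lower lane: 2.0 * 3.0 + 4.0 = 10.0
+/// let p = _mm_mask_fmadd_sd(a, 0b0, b, c); // lower lane: 2.0, passed through from `a`
+/// ```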
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmadd_sd&expand=2578)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+pub fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+    unsafe {
+        let mut fmadd: f64 = simd_extract!(a, 0);
+        if (k & 0b00000001) != 0 {
+            let extractb: f64 = simd_extract!(b, 0);
+            let extractc: f64 = simd_extract!(c, 0);
+            fmadd = fmaf64(fmadd, extractb, extractc);
+        }
+        simd_insert!(a, 0, fmadd)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmadd_sd&expand=2580)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+pub fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    unsafe {
+        let mut fmadd: f64 = 0.;
+        if (k & 0b00000001) != 0 {
+            let extracta: f64 = simd_extract!(a, 0);
+            let extractb: f64 = simd_extract!(b, 0);
+            let extractc: f64 = simd_extract!(c, 0);
+            fmadd = fmaf64(extracta, extractb, extractc);
+        }
+        simd_insert!(a, 0, fmadd)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmadd_sd&expand=2579)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+pub fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+    unsafe {
+        let mut fmadd: f64 = simd_extract!(c, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta: f64 = simd_extract!(a, 0);
+            let extractb: f64 = simd_extract!(b, 0);
+            fmadd = fmaf64(extracta, extractb, fmadd);
+        }
+        simd_insert!(c, 0, fmadd)
+    }
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
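+///
+/// A minimal usage sketch (illustrative values), assuming AVX512F is enabled:
+///
+/// ```ignore
+/// let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(1.0));
+/// let r = _mm_mask_fmsub_ss(a, 0b1, b, c); // lower lane: 2.0 * 3.0 - 1.0 = 5.0
+/// let p = _mm_mask_fmsub_ss(a, 0b0, b, c); // lower lane: 2.0, passed through from `a`
+/// ```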
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmsub_ss&expand=2668) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] +pub fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fmsub: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = fmaf32(fmsub, extractb, extractc); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmsub_ss&expand=2670) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] +pub fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fmsub: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = fmaf32(extracta, extractb, extractc); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmsub_ss&expand=2669) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] +pub fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { + let mut fmsub: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc = -fmsub; + fmsub = fmaf32(extracta, extractb, extractc); + } + simd_insert!(c, 0, fmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
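+///
+/// A brief sketch of the double-precision form (illustrative values), assuming AVX512F is
+/// enabled:
+///
+/// ```ignore
+/// let (a, b, c) = (_mm_set_sd(2.5), _mm_set_sd(2.0), _mm_set_sd(1.0));
+/// let r = _mm_mask_fmsub_sd(a, 0b1, b, c); // lower lane: 2.5 * 2.0 - 1.0 = 4.0
+/// let p = _mm_mask_fmsub_sd(a, 0b0, b, c); // lower lane: 2.5, passed through from `a`
+/// ```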
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmsub_sd&expand=2664) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] +pub fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let mut fmsub: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = fmaf64(fmsub, extractb, extractc); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmsub_sd&expand=2666) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] +pub fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let mut fmsub: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = fmaf64(extracta, extractb, extractc); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmsub_sd&expand=2665) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] +pub fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { + let mut fmsub: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc = -fmsub; + fmsub = fmaf64(extracta, extractb, extractc); + } + simd_insert!(c, 0, fmsub) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
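+///
+/// A minimal usage sketch (illustrative values), assuming AVX512F is enabled; the product
+/// is negated before the addition:
+///
+/// ```ignore
+/// let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(10.0));
+/// let r = _mm_mask_fnmadd_ss(a, 0b1, b, c); // lower lane: -(2.0 * 3.0) + 10.0 = 4.0
+/// let p = _mm_mask_fnmadd_ss(a, 0b0, b, c); // lower lane: 2.0, passed through from `a`
+/// ```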
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmadd_ss&expand=2748) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] +pub fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fnmadd: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmadd; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fnmadd = fmaf32(extracta, extractb, extractc); + } + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmadd_ss&expand=2750) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] +pub fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fnmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fnmadd = fmaf32(extracta, extractb, extractc); + } + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmadd_ss&expand=2749) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] +pub fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { + let mut fnmadd: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + fnmadd = fmaf32(extracta, extractb, fnmadd); + } + simd_insert!(c, 0, fnmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
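+///
+/// A brief sketch of the double-precision form (illustrative values), assuming AVX512F is
+/// enabled:
+///
+/// ```ignore
+/// let (a, b, c) = (_mm_set_sd(2.0), _mm_set_sd(4.0), _mm_set_sd(9.0));
+/// let r = _mm_mask_fnmadd_sd(a, 0b1, b, c); // lower lane: -(2.0 * 4.0) + 9.0 = 1.0
+/// let p = _mm_mask_fnmadd_sd(a, 0b0, b, c); // lower lane: 2.0, passed through from `a`
+/// ```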
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmadd_sd&expand=2744)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+pub fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+    unsafe {
+        let mut fnmadd: f64 = simd_extract!(a, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta = -fnmadd;
+            let extractb: f64 = simd_extract!(b, 0);
+            let extractc: f64 = simd_extract!(c, 0);
+            fnmadd = fmaf64(extracta, extractb, extractc);
+        }
+        simd_insert!(a, 0, fnmadd)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmadd_sd&expand=2746)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+pub fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    unsafe {
+        let mut fnmadd: f64 = 0.;
+        if (k & 0b00000001) != 0 {
+            let extracta: f64 = simd_extract!(a, 0);
+            let extracta = -extracta;
+            let extractb: f64 = simd_extract!(b, 0);
+            let extractc: f64 = simd_extract!(c, 0);
+            fnmadd = fmaf64(extracta, extractb, extractc);
+        }
+        simd_insert!(a, 0, fnmadd)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmadd_sd&expand=2745)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+pub fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+    unsafe {
+        let mut fnmadd: f64 = simd_extract!(c, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta: f64 = simd_extract!(a, 0);
+            let extracta = -extracta;
+            let extractb: f64 = simd_extract!(b, 0);
+            fnmadd = fmaf64(extracta, extractb, fnmadd);
+        }
+        simd_insert!(c, 0, fnmadd)
+    }
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
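+///
+/// A minimal usage sketch (illustrative values), assuming AVX512F is enabled; both the
+/// product and the addend are negated:
+///
+/// ```ignore
+/// let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(1.0));
+/// let r = _mm_mask_fnmsub_ss(a, 0b1, b, c); // lower lane: -(2.0 * 3.0) - 1.0 = -7.0
+/// let p = _mm_mask_fnmsub_ss(a, 0b0, b, c); // lower lane: 2.0, passed through from `a`
+/// ```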
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmsub_ss&expand=2796)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+pub fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
+    unsafe {
+        let mut fnmsub: f32 = simd_extract!(a, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta = -fnmsub;
+            let extractb: f32 = simd_extract!(b, 0);
+            let extractc: f32 = simd_extract!(c, 0);
+            let extractc = -extractc;
+            fnmsub = fmaf32(extracta, extractb, extractc);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmsub_ss&expand=2798)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+pub fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
+    unsafe {
+        let mut fnmsub: f32 = 0.;
+        if (k & 0b00000001) != 0 {
+            let extracta: f32 = simd_extract!(a, 0);
+            let extracta = -extracta;
+            let extractb: f32 = simd_extract!(b, 0);
+            let extractc: f32 = simd_extract!(c, 0);
+            let extractc = -extractc;
+            fnmsub = fmaf32(extracta, extractb, extractc);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmsub_ss&expand=2797)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+pub fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
+    unsafe {
+        let mut fnmsub: f32 = simd_extract!(c, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta: f32 = simd_extract!(a, 0);
+            let extracta = -extracta;
+            let extractb: f32 = simd_extract!(b, 0);
+            let extractc = -fnmsub;
+            fnmsub = fmaf32(extracta, extractb, extractc);
+        }
+        simd_insert!(c, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
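+///
+/// A brief sketch of the double-precision form (illustrative values), assuming AVX512F is
+/// enabled:
+///
+/// ```ignore
+/// let (a, b, c) = (_mm_set_sd(2.0), _mm_set_sd(3.0), _mm_set_sd(4.0));
+/// let r = _mm_mask_fnmsub_sd(a, 0b1, b, c); // lower lane: -(2.0 * 3.0) - 4.0 = -10.0
+/// let p = _mm_mask_fnmsub_sd(a, 0b0, b, c); // lower lane: 2.0, passed through from `a`
+/// ```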
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmsub_sd&expand=2792) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] +pub fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let mut fnmsub: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmsub; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fnmsub = fmaf64(extracta, extractb, extractc); + } + simd_insert!(a, 0, fnmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmsub_sd&expand=2794) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] +pub fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let mut fnmsub: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fnmsub = fmaf64(extracta, extractb, extractc); + } + simd_insert!(a, 0, fnmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. 
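+///
+/// A minimal usage sketch (illustrative values), assuming AVX512F is enabled; note that
+/// for the mask3 form the unmasked lower lane and the upper lane both come from `c`:
+///
+/// ```ignore
+/// let (a, b, c) = (_mm_set_sd(2.0), _mm_set_sd(3.0), _mm_set_sd(4.0));
+/// let r = _mm_mask3_fnmsub_sd(a, b, c, 0b1); // lower lane: -(2.0 * 3.0) - 4.0 = -10.0
+/// let p = _mm_mask3_fnmsub_sd(a, b, c, 0b0); // lower lane: 4.0, passed through from `c`
+/// ```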
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmsub_sd&expand=2793) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] +pub fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { + let mut fnmsub: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc = -fnmsub; + fnmsub = fmaf64(extracta, extractb, extractc); + } + simd_insert!(c, 0, fnmsub) + } +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_round_ss&expand=151) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_add_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vaddss(a, b, f32x4::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_add_round_ss&expand=152) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_add_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vaddss(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Add the lower single-precision (32-bit) 
floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_round_ss&expand=153) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vaddss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_round_sd&expand=148) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_add_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vaddsd(a, b, f64x2::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_add_round_sd&expand=149) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_add_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vaddsd(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_round_sd&expand=150) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_add_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vaddsd(a, b, f64x2::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_round_ss&expand=5745) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_sub_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vsubss(a, b, f32x4::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) 
floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sub_round_ss&expand=5743) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_sub_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vsubss(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sub_round_ss&expand=5744) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vsubss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions 
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_round_sd&expand=5742) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_sub_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vsubsd(a, b, f64x2::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sub_round_sd&expand=5740) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_sub_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vsubsd(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sub_round_sd&expand=5741) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_sub_round_sd(k: 
__mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vsubsd(a, b, f64x2::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_round_ss&expand=3946) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_mul_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vmulss(a, b, f32x4::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_mul_round_ss&expand=3944) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_mul_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vmulss(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | 
[`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_mul_round_ss&expand=3945) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vmulss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_round_sd&expand=3943) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_mul_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vmulsd(a, b, f64x2::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_mul_round_sd&expand=3941) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_mul_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + 
static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vmulsd(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_mul_round_sd&expand=3942) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_mul_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vmulsd(a, b, f64x2::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_div_round_ss&expand=2174) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_div_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vdivss(a, b, f32x4::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * 
[`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_div_round_ss&expand=2175) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_div_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vdivss(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_div_round_ss&expand=2176) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vdivss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_div_round_sd&expand=2171) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] 
+#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_div_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vdivsd(a, b, f64x2::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_div_round_sd&expand=2172) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_div_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vdivsd(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_div_round_sd&expand=2173) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_div_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vdivsd(a, b, f64x2::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// 
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_max_round_ss&expand=3668) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_max_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vmaxss(a, b, f32x4::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_round_ss&expand=3672) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_max_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vmaxss(a, b, src, k, SAE); + transmute(r) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_max_round_ss&expand=3667) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vmaxss(a, b, f32x4::ZERO, k, SAE); + transmute(r) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_max_round_sd&expand=3665) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_max_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vmaxsd(a, b, f64x2::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_max_round_sd&expand=3663) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_max_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vmaxsd(a, b, src, k, SAE); + transmute(r) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_round_sd&expand=3670) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vmaxsd(a, b, f64x2::ZERO, k, SAE); + transmute(r) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_min_round_ss&expand=3782) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_min_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vminss(a, b, f32x4::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_min_round_ss&expand=3780) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_min_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vminss(a, b, src, k, SAE); + transmute(r) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_min_round_ss&expand=3781) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vminss(a, b, f32x4::ZERO, k, SAE); + transmute(r) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst , and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_min_round_sd&expand=3779) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_min_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vminsd(a, b, f64x2::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_min_round_sd&expand=3777) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_min_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vminsd(a, b, src, k, SAE); + transmute(r) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_min_round_sd&expand=3778) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vminsd(a, b, f64x2::ZERO, k, SAE); + transmute(r) + } +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sqrt_round_ss&expand=5383) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_sqrt_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtss(a, b, _mm_setzero_ps(), 0b1, ROUNDING) + } +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sqrt_round_ss&expand=5381) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_sqrt_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtss(a, b, src, k, ROUNDING) + } +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// 
Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sqrt_round_ss&expand=5382) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtss(a, b, _mm_setzero_ps(), k, ROUNDING) + } +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sqrt_round_sd&expand=5380) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_sqrt_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtsd(a, b, _mm_setzero_pd(), 0b1, ROUNDING) + } +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sqrt_round_sd&expand=5378) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn 
_mm_mask_sqrt_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtsd(a, b, src, k, ROUNDING) + } +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sqrt_round_sd&expand=5379) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_sqrt_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtsd(a, b, _mm_setzero_pd(), k, ROUNDING) + } +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getexp_round_ss&expand=2856) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_getexp_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetexpss(a, b, f32x4::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getexp_round_ss&expand=2857) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_getexp_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vgetexpss(a, b, src, k, SAE); + transmute(r) + } +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getexp_round_ss&expand=2858) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_getexp_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetexpss(a, b, f32x4::ZERO, k, SAE); + transmute(r) + } +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getexp_round_sd&expand=2853) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_getexp_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetexpsd(a, b, f64x2::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getexp_round_sd&expand=2854) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_getexp_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vgetexpsd(a, b, src, k, SAE); + transmute(r) + } +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getexp_round_sd&expand=2855) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_getexp_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetexpsd(a, b, f64x2::ZERO, k, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getmant_round_ss&expand=2892) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(2, 3, 4)] +pub fn _mm_getmant_round_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetmantss(a, b, SIGN << 2 | NORM, f32x4::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getmant_round_ss&expand=2893) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(4, 5, 6)] +pub fn _mm_mask_getmant_round_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getmant_round_ss&expand=2894) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(3, 4, 5)] +pub fn _mm_maskz_getmant_round_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetmantss(a, b, SIGN << 2 | NORM, f32x4::ZERO, k, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getmant_round_sd&expand=2889) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(2, 3, 4)] +pub fn _mm_getmant_round_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, f64x2::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getmant_round_sd&expand=2890) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(4, 5, 6)] +pub fn _mm_mask_getmant_round_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getmant_round_sd&expand=2891) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(3, 4, 5)] +pub fn _mm_maskz_getmant_round_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, f64x2::ZERO, k, SAE); + transmute(r) + } +} + +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_roundscale_round_ss&expand=4796) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm_roundscale_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vrndscaless(a, b, f32x4::ZERO, 0b11111111, IMM8, SAE); + transmute(r) + } +} + +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_roundscale_round_ss&expand=4794) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_mask_roundscale_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vrndscaless(a, b, src, k, IMM8, SAE); + transmute(r) + } +} + +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_roundscale_round_ss&expand=4795) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_maskz_roundscale_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vrndscaless(a, b, f32x4::ZERO, k, IMM8, SAE); + transmute(r) + } +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_roundscale_round_sd&expand=4793) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm_roundscale_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vrndscalesd(a, b, f64x2::ZERO, 0b11111111, IMM8, SAE); + transmute(r) + } +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_roundscale_round_sd&expand=4791) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_mask_roundscale_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vrndscalesd(a, b, src, k, IMM8, SAE); + transmute(r) + } +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_roundscale_round_sd&expand=4792) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_maskz_roundscale_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vrndscalesd(a, b, f64x2::ZERO, k, IMM8, SAE); + transmute(r) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_scalef_round_ss&expand=4895) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_scalef_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vscalefss(a, b, f32x4::ZERO, 0b11111111, ROUNDING); + transmute(r) + } +} + +/// Scale the packed single-precision (32-bit) 
floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_scalef_round_ss&expand=4893) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_scalef_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vscalefss(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_scalef_round_ss&expand=4894) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_scalef_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vscalefss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] 
: truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_scalef_round_sd&expand=4892)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_scalef_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f64x2();
+        let b = b.as_f64x2();
+        let r = vscalefsd(a, b, f64x2::ZERO, 0b11111111, ROUNDING);
+        transmute(r)
+    }
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_scalef_round_sd&expand=4890)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_scalef_round_sd<const ROUNDING: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f64x2();
+        let b = b.as_f64x2();
+        let src = src.as_f64x2();
+        let r = vscalefsd(a, b, src, k, ROUNDING);
+        transmute(r)
+    }
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_scalef_round_sd&expand=4891)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_scalef_round_sd<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f64x2();
+        let b = b.as_f64x2();
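+        // Zero-masking variant (descriptive note): an all-zero source vector is passed to
+        // the backend intrinsic so that, as the doc comment above states, lane 0 of the
+        // result is zeroed out when mask bit 0 of `k` is not set.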
let r = vscalefsd(a, b, f64x2::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fmadd_round_ss&expand=2573) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let r = vfmaddssround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, r) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmadd_round_ss&expand=2574) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fmadd_round_ss( + a: __m128, + k: __mmask8, + b: __m128, + c: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fmadd = vfmaddssround(fmadd, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmadd_round_ss&expand=2576) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fmadd_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + c: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmadd_round_ss&expand=2575) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask3_fmadd_round_ss( + a: __m128, + b: __m128, + c: __m128, + k: __mmask8, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + fmadd = vfmaddssround(extracta, extractb, fmadd, ROUNDING); + } + simd_insert!(c, 0, fmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fmadd_round_sd&expand=2569) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fmadd_round_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let fmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmadd_round_sd&expand=2570) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fmadd_round_sd( + a: __m128d, + k: __mmask8, + b: __m128d, + c: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fmadd = vfmaddsdround(fmadd, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmadd_round_sd&expand=2572) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fmadd_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + c: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmadd_round_sd&expand=2571) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask3_fmadd_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, + k: __mmask8, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + fmadd = vfmaddsdround(extracta, extractb, fmadd, ROUNDING); + } + simd_insert!(c, 0, fmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fmsub_round_ss&expand=2659) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + let fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmsub_round_ss&expand=2660) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fmsub_round_ss( + a: __m128, + k: __mmask8, + b: __m128, + c: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = vfmaddssround(fmsub, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmsub_round_ss&expand=2662) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fmsub_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + c: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmsub_round_ss&expand=2661) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask3_fmsub_round_ss( + a: __m128, + b: __m128, + c: __m128, + k: __mmask8, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc = -fmsub; + fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(c, 0, fmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fmsub_round_sd&expand=2655) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fmsub_round_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + let fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmsub_round_sd&expand=2656) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fmsub_round_sd( + a: __m128d, + k: __mmask8, + b: __m128d, + c: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = vfmaddsdround(fmsub, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmsub_round_sd&expand=2658) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fmsub_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + c: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmsub_round_sd&expand=2657) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask3_fmsub_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, + k: __mmask8, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc = -fmsub; + fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(c, 0, fmsub) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fnmadd_round_ss&expand=2739) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmadd_round_ss&expand=2740) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fnmadd_round_ss( + a: __m128, + k: __mmask8, + b: __m128, + c: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmadd; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmadd_round_ss&expand=2742) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fnmadd_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + c: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmadd_round_ss&expand=2741) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask3_fnmadd_round_ss( + a: __m128, + b: __m128, + c: __m128, + k: __mmask8, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + fnmadd = vfmaddssround(extracta, extractb, fnmadd, ROUNDING); + } + simd_insert!(c, 0, fnmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fnmadd_round_sd&expand=2735) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fnmadd_round_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmadd_round_sd&expand=2736) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fnmadd_round_sd( + a: __m128d, + k: __mmask8, + b: __m128d, + c: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmadd; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmadd_round_sd&expand=2738) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fnmadd_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + c: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmadd_round_sd&expand=2737) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask3_fnmadd_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, + k: __mmask8, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + fnmadd = vfmaddsdround(extracta, extractb, fnmadd, ROUNDING); + } + simd_insert!(c, 0, fnmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// 
Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fnmsub_round_ss&expand=2787)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_fnmsub_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: __m128) -> __m128 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let extracta: f32 = simd_extract!(a, 0);
+        let extracta = -extracta;
+        let extractb: f32 = simd_extract!(b, 0);
+        let extractc: f32 = simd_extract!(c, 0);
+        let extractc = -extractc;
+        let fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING);
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmsub_round_ss&expand=2788)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_fnmsub_round_ss<const ROUNDING: i32>(
+    a: __m128,
+    k: __mmask8,
+    b: __m128,
+    c: __m128,
+) -> __m128 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmsub: f32 = simd_extract!(a, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta = -fnmsub;
+            let extractb: f32 = simd_extract!(b, 0);
+            let extractc: f32 = simd_extract!(c, 0);
+            let extractc = -extractc;
+            fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result.
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmsub_round_ss&expand=2790) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fnmsub_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + c: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmsub: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmsub) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmsub_round_ss&expand=2789) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask3_fnmsub_round_ss( + a: __m128, + b: __m128, + c: __m128, + k: __mmask8, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmsub: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc = -fnmsub; + fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(c, 0, fnmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. 
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fnmsub_round_sd&expand=2783)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_fnmsub_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let extracta: f64 = simd_extract!(a, 0);
+        let extracta = -extracta;
+        let extractb: f64 = simd_extract!(b, 0);
+        let extractc: f64 = simd_extract!(c, 0);
+        let extractc = -extractc;
+        let fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING);
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmsub_round_sd&expand=2784)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_fnmsub_round_sd<const ROUNDING: i32>(
+    a: __m128d,
+    k: __mmask8,
+    b: __m128d,
+    c: __m128d,
+) -> __m128d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmsub: f64 = simd_extract!(a, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta = -fnmsub;
+            let extractb: f64 = simd_extract!(b, 0);
+            let extractc: f64 = simd_extract!(c, 0);
+            let extractc = -extractc;
+            fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result.
Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmsub_round_sd&expand=2786) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fnmsub_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + c: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmsub: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmsub_round_sd&expand=2785) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask3_fnmsub_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, + k: __mmask8, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmsub: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc = -fnmsub; + fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(c, 0, fnmsub) + } +} + +/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fixupimm_ss&expand=2517) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fixupimm_ss(a: __m128, b: __m128, c: __m128i) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmss(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fixupimm_ss&expand=2518) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fixupimm_ss( + a: __m128, + k: __mmask8, + b: __m128, + c: __m128i, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let fixupimm = vfixupimmss(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f32 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fixupimm_ss&expand=2519) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fixupimm_ss( + k: __mmask8, + a: __m128, + b: __m128, + c: __m128i, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let fixupimm = vfixupimmssz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f32 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting. 
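+///
+/// A call-shape sketch, illustrative only and assuming `avx512f` is available at run
+/// time: the fixup table travels in the integer vector `c`, and the flag-reporting
+/// immediate is the const generic (both left at zero here purely for illustration).
+///
+/// ```ignore
+/// // SAFETY: the caller has verified `avx512f` support.
+/// unsafe {
+///     let a = _mm_set_sd(1.0);
+///     let b = _mm_set_sd(0.0);          // the value that gets classified
+///     let table = _mm_set1_epi64x(0);   // lower 64-bit integer carries the fixup table
+///     let _r = _mm_fixupimm_sd::<0>(a, b, table);
+/// }
+/// ```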
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fixupimm_sd&expand=2514) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fixupimm_sd(a: __m128d, b: __m128d, c: __m128i) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let fixupimm = vfixupimmsd(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f64 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fixupimm_sd&expand=2515) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fixupimm_sd( + a: __m128d, + k: __mmask8, + b: __m128d, + c: __m128i, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let fixupimm = vfixupimmsd(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f64 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fixupimm_sd&expand=2516) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fixupimm_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + c: __m128i, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let fixupimm = vfixupimmsdz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f64 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fixupimm_round_ss&expand=2511) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_fixupimm_round_ss( + a: __m128, + b: __m128, + c: __m128i, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmss(a, b, c, IMM8, 0b11111111, SAE); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fixupimm_round_ss&expand=2512) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_mask_fixupimm_round_ss( + a: __m128, + k: __mmask8, + b: __m128, + c: __m128i, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmss(a, b, c, IMM8, k, SAE); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fixupimm_round_ss&expand=2513) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_maskz_fixupimm_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + c: __m128i, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmssz(a, b, c, IMM8, k, SAE); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
imm8 is used to set the required flags reporting.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fixupimm_round_sd&expand=2508) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_fixupimm_round_sd( + a: __m128d, + b: __m128d, + c: __m128i, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmsd(a, b, c, IMM8, 0b11111111, SAE); + let fixupimm: f64 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fixupimm_round_sd&expand=2509) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_mask_fixupimm_round_sd( + a: __m128d, + k: __mmask8, + b: __m128d, + c: __m128i, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmsd(a, b, c, IMM8, k, SAE); + let fixupimm: f64 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fixupimm_round_sd&expand=2510) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_maskz_fixupimm_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + c: __m128i, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmsdz(a, b, c, IMM8, k, SAE); + let fixupimm: f64 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_cvtss_sd&expand=1896) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2sd))] +pub fn _mm_mask_cvtss_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128) -> __m128d { + unsafe { + transmute(vcvtss2sd( + a.as_f64x2(), + b.as_f32x4(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_cvtss_sd&expand=1897) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2sd))] +pub fn _mm_maskz_cvtss_sd(k: __mmask8, a: __m128d, b: __m128) -> __m128d { + unsafe { + transmute(vcvtss2sd( + a.as_f64x2(), + b.as_f32x4(), + f64x2::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
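+///
+/// A minimal sketch of the masking behavior, assuming `avx512f` is available at run
+/// time: with mask bit 0 set the lower lane holds the converted value, otherwise it is
+/// copied from `src`.
+///
+/// ```ignore
+/// // SAFETY: the caller has verified `avx512f` support.
+/// unsafe {
+///     let src = _mm_set1_ps(9.0);
+///     let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
+///     let b = _mm_set_sd(2.5);
+///     let converted = _mm_mask_cvtsd_ss(src, 0b1, a, b);
+///     let passthrough = _mm_mask_cvtsd_ss(src, 0b0, a, b);
+///     assert_eq!(_mm_cvtss_f32(converted), 2.5);
+///     assert_eq!(_mm_cvtss_f32(passthrough), 9.0);
+/// }
+/// ```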
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_cvtsd_ss&expand=1797) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2ss))] +pub fn _mm_mask_cvtsd_ss(src: __m128, k: __mmask8, a: __m128, b: __m128d) -> __m128 { + unsafe { + transmute(vcvtsd2ss( + a.as_f32x4(), + b.as_f64x2(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_cvtsd_ss&expand=1798) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2ss))] +pub fn _mm_maskz_cvtsd_ss(k: __mmask8, a: __m128, b: __m128d) -> __m128 { + unsafe { + transmute(vcvtsd2ss( + a.as_f32x4(), + b.as_f64x2(), + f32x4::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundss_sd&expand=1371) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundss_sd(a: __m128d, b: __m128) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f32x4(); + let r = vcvtss2sd(a, b, f64x2::ZERO, 0b11111111, SAE); + transmute(r) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
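+///
+/// A minimal sketch, assuming `avx512f` support at run time; the widening conversion is
+/// exact, so `_MM_FROUND_NO_EXC` is used here only to show where the `sae` parameter
+/// goes.
+///
+/// ```ignore
+/// // SAFETY: the caller has verified `avx512f` support.
+/// unsafe {
+///     let src = _mm_set1_pd(9.0);
+///     let a = _mm_set_pd(7.0, 1.0);   // upper element of the result comes from `a`
+///     let b = _mm_set_ss(2.5);
+///     let r = _mm_mask_cvt_roundss_sd::<_MM_FROUND_NO_EXC>(src, 0b1, a, b);
+///     assert_eq!(_mm_cvtsd_f64(r), 2.5);
+/// }
+/// ```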
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_cvt_roundss_sd&expand=1372) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_cvt_roundss_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128, +) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f32x4(); + let src = src.as_f64x2(); + let r = vcvtss2sd(a, b, src, k, SAE); + transmute(r) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_cvt_roundss_sd&expand=1373) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_cvt_roundss_sd(k: __mmask8, a: __m128d, b: __m128) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f32x4(); + let r = vcvtss2sd(a, b, f64x2::ZERO, k, SAE); + transmute(r) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundsd_ss&expand=1361) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundsd_ss(a: __m128, b: __m128d) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f64x2(); + let r = vcvtsd2ss(a, b, f32x4::ZERO, 0b11111111, ROUNDING); + transmute(r) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * 
[`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_cvt_roundsd_ss&expand=1362) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_cvt_roundsd_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128d, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f64x2(); + let src = src.as_f32x4(); + let r = vcvtsd2ss(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_cvt_roundsd_ss&expand=1363) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_cvt_roundsd_ss(k: __mmask8, a: __m128, b: __m128d) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f64x2(); + let r = vcvtsd2ss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundss_si32&expand=1374) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundss_si32(a: __m128) -> 
i32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + vcvtss2si(a, ROUNDING) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundss_i32&expand=1369) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundss_i32(a: __m128) -> i32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + vcvtss2si(a, ROUNDING) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundss_u32&expand=1376) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundss_u32(a: __m128) -> u32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + vcvtss2usi(a, ROUNDING) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtss_i32&expand=1893) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2si))] +pub fn _mm_cvtss_i32(a: __m128) -> i32 { + unsafe { vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst. 
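+///
+/// A minimal sketch, assuming `avx512f` support at run time; the conversion honors the
+/// current MXCSR rounding mode, which defaults to round-to-nearest.
+///
+/// ```ignore
+/// // SAFETY: the caller has verified `avx512f` support.
+/// unsafe {
+///     let a = _mm_set_ss(9.7);
+///     assert_eq!(_mm_cvtss_u32(a), 10);
+/// }
+/// ```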
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtss_u32&expand=1901) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2usi))] +pub fn _mm_cvtss_u32(a: __m128) -> u32 { + unsafe { vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundsd_si32&expand=1359) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundsd_si32<const ROUNDING: i32>(a: __m128d) -> i32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + vcvtsd2si(a, ROUNDING) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundsd_i32&expand=1357) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundsd_i32<const ROUNDING: i32>(a: __m128d) -> i32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + vcvtsd2si(a, ROUNDING) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_roundsd_u32&expand=1364) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundsd_u32(a: __m128d) -> u32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + vcvtsd2usi(a, ROUNDING) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtsd_i32&expand=1791) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2si))] +pub fn _mm_cvtsd_i32(a: __m128d) -> i32 { + unsafe { vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtsd_u32&expand=1799) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2usi))] +pub fn _mm_cvtsd_u32(a: __m128d) -> u32 { + unsafe { vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundi32_ss&expand=1312) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundi32_ss(a: __m128, b: i32) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let r = vcvtsi2ss(a, b, ROUNDING); + transmute(r) + } +} + +/// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress 
exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundsi32_ss&expand=1366) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundsi32_ss(a: __m128, b: i32) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let r = vcvtsi2ss(a, b, ROUNDING); + transmute(r) + } +} + +/// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundu32_ss&expand=1378) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundu32_ss(a: __m128, b: u32) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let r = vcvtusi2ss(a, b, ROUNDING); + transmute(r) + } +} + +/// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvti32_ss&expand=1643) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsi2ss))] +pub fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 { + unsafe { + let b = b as f32; + simd_insert!(a, 0, b) + } +} + +/// Convert the signed 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvti32_sd&expand=1642) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsi2sd))] +pub fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d { + unsafe { + let b = b as f64; + simd_insert!(a, 0, b) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
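+///
+/// A minimal sketch, assuming `avx512f` support at run time; the `tt` variants always
+/// truncate toward zero regardless of MXCSR, so `_MM_FROUND_NO_EXC` only suppresses
+/// exception reporting.
+///
+/// ```ignore
+/// // SAFETY: the caller has verified `avx512f` support.
+/// unsafe {
+///     let a = _mm_set_ss(3.7);
+///     assert_eq!(_mm_cvtt_roundss_si32::<_MM_FROUND_NO_EXC>(a), 3);
+/// }
+/// ```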
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtt_roundss_si32&expand=1936) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundss_si32(a: __m128) -> i32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + vcvttss2si(a, SAE) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtt_roundss_i32&expand=1934) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundss_i32(a: __m128) -> i32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + vcvttss2si(a, SAE) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtt_roundss_u32&expand=1938) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttss2usi, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundss_u32(a: __m128) -> u32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + vcvttss2usi(a, SAE) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_i32&expand=2022) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttss2si))] +pub fn _mm_cvttss_i32(a: __m128) -> i32 { + unsafe { vcvttss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_u32&expand=2026) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttss2usi))] +pub fn _mm_cvttss_u32(a: __m128) -> u32 { + unsafe { vcvttss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
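+///
+/// A minimal sketch, assuming `avx512f` support at run time, showing that truncation
+/// moves negative values toward zero rather than toward negative infinity.
+///
+/// ```ignore
+/// // SAFETY: the caller has verified `avx512f` support.
+/// unsafe {
+///     let a = _mm_set_sd(-2.9);
+///     assert_eq!(_mm_cvtt_roundsd_si32::<_MM_FROUND_CUR_DIRECTION>(a), -2);
+/// }
+/// ```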
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_roundsd_si32&expand=1930) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundsd_si32(a: __m128d) -> i32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + vcvttsd2si(a, SAE) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_roundsd_i32&expand=1928) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundsd_i32(a: __m128d) -> i32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + vcvttsd2si(a, SAE) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtt_roundsd_u32&expand=1932) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttsd2usi, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundsd_u32(a: __m128d) -> u32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + vcvttsd2usi(a, SAE) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_i32&expand=2015) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttsd2si))] +pub fn _mm_cvttsd_i32(a: __m128d) -> i32 { + unsafe { vcvttsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_u32&expand=2020) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttsd2usi))] +pub fn _mm_cvttsd_u32(a: __m128d) -> u32 { + unsafe { vcvttsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
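+///
+/// A minimal sketch, assuming `avx512f` support at run time: the integer lands in lane
+/// 0 and the remaining lanes are taken from `a`.
+///
+/// ```ignore
+/// // SAFETY: the caller has verified `avx512f` support.
+/// unsafe {
+///     let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
+///     let r = _mm_cvtu32_ss(a, 7);
+///     assert_eq!(_mm_cvtss_f32(r), 7.0);
+/// }
+/// ```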
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtu32_ss&expand=2032) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtusi2ss))] +pub fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 { + unsafe { + let b = b as f32; + simd_insert!(a, 0, b) + } +} + +/// Convert the unsigned 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtu32_sd&expand=2031) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtusi2sd))] +pub fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d { + unsafe { + let b = b as f64; + simd_insert!(a, 0, b) + } +} + +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comi_round_ss&expand=1175) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp, IMM5 = 5, SAE = 4))] //should be vcomiss +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm_comi_round_ss<const IMM5: i32, const SAE: i32>(a: __m128, b: __m128) -> i32 { + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + vcomiss(a, b, IMM5, SAE) + } +} + +/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
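+///
+/// A minimal sketch, assuming `avx512f` support at run time, using the ordered
+/// less-than predicate from the AVX `_CMP_*` constants.
+///
+/// ```ignore
+/// // SAFETY: the caller has verified `avx512f` support.
+/// unsafe {
+///     let a = _mm_set_sd(1.0);
+///     let b = _mm_set_sd(2.0);
+///     assert_eq!(_mm_comi_round_sd::<_CMP_LT_OQ, _MM_FROUND_NO_EXC>(a, b), 1);
+/// }
+/// ```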
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comi_round_sd&expand=1174) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp, IMM5 = 5, SAE = 4))] //should be vcomisd +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm_comi_round_sd(a: __m128d, b: __m128d) -> i32 { + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + vcomisd(a, b, IMM5, SAE) + } +} + +/// Equal +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00; +/// Less-than +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_LT: _MM_CMPINT_ENUM = 0x01; +/// Less-than-or-equal +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_LE: _MM_CMPINT_ENUM = 0x02; +/// False +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_FALSE: _MM_CMPINT_ENUM = 0x03; +/// Not-equal +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_NE: _MM_CMPINT_ENUM = 0x04; +/// Not less-than +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_NLT: _MM_CMPINT_ENUM = 0x05; +/// Not less-than-or-equal +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_NLE: _MM_CMPINT_ENUM = 0x06; +/// True +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_TRUE: _MM_CMPINT_ENUM = 0x07; + +/// interval [1, 2) +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_MANT_NORM_1_2: _MM_MANTISSA_NORM_ENUM = 0x00; +/// interval [0.5, 2) +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_MANT_NORM_P5_2: _MM_MANTISSA_NORM_ENUM = 0x01; +/// interval [0.5, 1) +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_MANT_NORM_P5_1: _MM_MANTISSA_NORM_ENUM = 0x02; +/// interval [0.75, 1.5) +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_MANT_NORM_P75_1P5: _MM_MANTISSA_NORM_ENUM = 0x03; + +/// sign = sign(SRC) +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_MANT_SIGN_SRC: _MM_MANTISSA_SIGN_ENUM = 0x00; +/// sign = 0 +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_MANT_SIGN_ZERO: _MM_MANTISSA_SIGN_ENUM = 0x01; +/// DEST = NaN if sign(SRC) = 1 +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_MANT_SIGN_NAN: _MM_MANTISSA_SIGN_ENUM = 0x02; + +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AAAA: _MM_PERM_ENUM = 0x00; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AAAB: _MM_PERM_ENUM = 0x01; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AAAC: _MM_PERM_ENUM = 0x02; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AAAD: _MM_PERM_ENUM = 0x03; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AABA: _MM_PERM_ENUM = 0x04; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AABB: _MM_PERM_ENUM = 0x05; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AABC: _MM_PERM_ENUM = 0x06; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AABD: _MM_PERM_ENUM = 0x07; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const 
_MM_PERM_AACA: _MM_PERM_ENUM = 0x08; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AACB: _MM_PERM_ENUM = 0x09; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AACC: _MM_PERM_ENUM = 0x0A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AACD: _MM_PERM_ENUM = 0x0B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AADA: _MM_PERM_ENUM = 0x0C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AADB: _MM_PERM_ENUM = 0x0D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AADC: _MM_PERM_ENUM = 0x0E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AADD: _MM_PERM_ENUM = 0x0F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABAA: _MM_PERM_ENUM = 0x10; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABAB: _MM_PERM_ENUM = 0x11; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABAC: _MM_PERM_ENUM = 0x12; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABAD: _MM_PERM_ENUM = 0x13; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABBA: _MM_PERM_ENUM = 0x14; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABBB: _MM_PERM_ENUM = 0x15; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABBC: _MM_PERM_ENUM = 0x16; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABBD: _MM_PERM_ENUM = 0x17; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABCA: _MM_PERM_ENUM = 0x18; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABCB: _MM_PERM_ENUM = 0x19; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABCC: _MM_PERM_ENUM = 0x1A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABCD: _MM_PERM_ENUM = 0x1B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABDA: _MM_PERM_ENUM = 0x1C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABDB: _MM_PERM_ENUM = 0x1D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABDC: _MM_PERM_ENUM = 0x1E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABDD: _MM_PERM_ENUM = 0x1F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACAA: _MM_PERM_ENUM = 0x20; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACAB: _MM_PERM_ENUM = 0x21; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACAC: _MM_PERM_ENUM = 0x22; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACAD: _MM_PERM_ENUM = 0x23; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACBA: _MM_PERM_ENUM = 0x24; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACBB: _MM_PERM_ENUM = 0x25; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACBC: _MM_PERM_ENUM = 0x26; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACBD: _MM_PERM_ENUM = 0x27; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACCA: _MM_PERM_ENUM = 0x28; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACCB: 
_MM_PERM_ENUM = 0x29; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACCC: _MM_PERM_ENUM = 0x2A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACCD: _MM_PERM_ENUM = 0x2B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACDA: _MM_PERM_ENUM = 0x2C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACDB: _MM_PERM_ENUM = 0x2D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACDC: _MM_PERM_ENUM = 0x2E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACDD: _MM_PERM_ENUM = 0x2F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADAA: _MM_PERM_ENUM = 0x30; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADAB: _MM_PERM_ENUM = 0x31; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADAC: _MM_PERM_ENUM = 0x32; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADAD: _MM_PERM_ENUM = 0x33; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADBA: _MM_PERM_ENUM = 0x34; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADBB: _MM_PERM_ENUM = 0x35; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADBC: _MM_PERM_ENUM = 0x36; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADBD: _MM_PERM_ENUM = 0x37; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADCA: _MM_PERM_ENUM = 0x38; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADCB: _MM_PERM_ENUM = 0x39; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADCC: _MM_PERM_ENUM = 0x3A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADCD: _MM_PERM_ENUM = 0x3B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADDA: _MM_PERM_ENUM = 0x3C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADDB: _MM_PERM_ENUM = 0x3D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADDC: _MM_PERM_ENUM = 0x3E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADDD: _MM_PERM_ENUM = 0x3F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BAAA: _MM_PERM_ENUM = 0x40; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BAAB: _MM_PERM_ENUM = 0x41; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BAAC: _MM_PERM_ENUM = 0x42; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BAAD: _MM_PERM_ENUM = 0x43; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BABA: _MM_PERM_ENUM = 0x44; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BABB: _MM_PERM_ENUM = 0x45; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BABC: _MM_PERM_ENUM = 0x46; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BABD: _MM_PERM_ENUM = 0x47; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BACA: _MM_PERM_ENUM = 0x48; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BACB: _MM_PERM_ENUM = 0x49; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BACC: _MM_PERM_ENUM = 0x4A; 
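Editor's note: the _mm_comi_round_sd wrapper earlier in this hunk takes its comparison predicate (IMM5) and SAE control as const generics, mapped back to Intel's trailing immediate arguments by rustc_legacy_const_generics(2, 3). A minimal, hypothetical usage sketch from the consumer side, assuming the caller is gated on runtime detection of AVX-512F and using the _CMP_* and _MM_FROUND_* constants exported elsewhere by core::arch:

use std::arch::x86_64::*;

// Hypothetical helper: true when the low f64 of `a` compares less-than the
// low f64 of `b`. Only call this after `is_x86_feature_detected!("avx512f")`
// has returned true (or from another "avx512f"-enabled function).
#[target_feature(enable = "avx512f")]
fn scalar_lt(a: __m128d, b: __m128d) -> bool {
    // IMM5 = _CMP_LT_OS (ordered, signaling, 0x01);
    // SAE  = _MM_FROUND_CUR_DIRECTION (0x04), as allowed by
    // static_assert_mantissas_sae! in the wrapper above.
    _mm_comi_round_sd::<_CMP_LT_OS, _MM_FROUND_CUR_DIRECTION>(a, b) != 0
}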
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BACD: _MM_PERM_ENUM = 0x4B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BADA: _MM_PERM_ENUM = 0x4C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BADB: _MM_PERM_ENUM = 0x4D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BADC: _MM_PERM_ENUM = 0x4E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BADD: _MM_PERM_ENUM = 0x4F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBAA: _MM_PERM_ENUM = 0x50; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBAB: _MM_PERM_ENUM = 0x51; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBAC: _MM_PERM_ENUM = 0x52; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBAD: _MM_PERM_ENUM = 0x53; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBBA: _MM_PERM_ENUM = 0x54; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBBB: _MM_PERM_ENUM = 0x55; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBBC: _MM_PERM_ENUM = 0x56; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBBD: _MM_PERM_ENUM = 0x57; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBCA: _MM_PERM_ENUM = 0x58; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBCB: _MM_PERM_ENUM = 0x59; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBCC: _MM_PERM_ENUM = 0x5A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBCD: _MM_PERM_ENUM = 0x5B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBDA: _MM_PERM_ENUM = 0x5C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBDB: _MM_PERM_ENUM = 0x5D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBDC: _MM_PERM_ENUM = 0x5E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBDD: _MM_PERM_ENUM = 0x5F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCAA: _MM_PERM_ENUM = 0x60; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCAB: _MM_PERM_ENUM = 0x61; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCAC: _MM_PERM_ENUM = 0x62; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCAD: _MM_PERM_ENUM = 0x63; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCBA: _MM_PERM_ENUM = 0x64; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCBB: _MM_PERM_ENUM = 0x65; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCBC: _MM_PERM_ENUM = 0x66; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCBD: _MM_PERM_ENUM = 0x67; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCCA: _MM_PERM_ENUM = 0x68; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCCB: _MM_PERM_ENUM = 0x69; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCCC: _MM_PERM_ENUM = 0x6A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCCD: _MM_PERM_ENUM = 0x6B; +#[stable(feature = 
"stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCDA: _MM_PERM_ENUM = 0x6C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCDB: _MM_PERM_ENUM = 0x6D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCDC: _MM_PERM_ENUM = 0x6E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCDD: _MM_PERM_ENUM = 0x6F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDAA: _MM_PERM_ENUM = 0x70; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDAB: _MM_PERM_ENUM = 0x71; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDAC: _MM_PERM_ENUM = 0x72; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDAD: _MM_PERM_ENUM = 0x73; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDBA: _MM_PERM_ENUM = 0x74; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDBB: _MM_PERM_ENUM = 0x75; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDBC: _MM_PERM_ENUM = 0x76; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDBD: _MM_PERM_ENUM = 0x77; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDCA: _MM_PERM_ENUM = 0x78; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDCB: _MM_PERM_ENUM = 0x79; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDCC: _MM_PERM_ENUM = 0x7A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDCD: _MM_PERM_ENUM = 0x7B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDDA: _MM_PERM_ENUM = 0x7C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDDB: _MM_PERM_ENUM = 0x7D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDDC: _MM_PERM_ENUM = 0x7E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDDD: _MM_PERM_ENUM = 0x7F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CAAA: _MM_PERM_ENUM = 0x80; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CAAB: _MM_PERM_ENUM = 0x81; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CAAC: _MM_PERM_ENUM = 0x82; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CAAD: _MM_PERM_ENUM = 0x83; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CABA: _MM_PERM_ENUM = 0x84; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CABB: _MM_PERM_ENUM = 0x85; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CABC: _MM_PERM_ENUM = 0x86; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CABD: _MM_PERM_ENUM = 0x87; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CACA: _MM_PERM_ENUM = 0x88; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CACB: _MM_PERM_ENUM = 0x89; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CACC: _MM_PERM_ENUM = 0x8A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CACD: _MM_PERM_ENUM = 0x8B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CADA: _MM_PERM_ENUM = 0x8C; +#[stable(feature = "stdarch_x86_avx512", 
since = "1.89")] +pub const _MM_PERM_CADB: _MM_PERM_ENUM = 0x8D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CADC: _MM_PERM_ENUM = 0x8E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CADD: _MM_PERM_ENUM = 0x8F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBAA: _MM_PERM_ENUM = 0x90; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBAB: _MM_PERM_ENUM = 0x91; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBAC: _MM_PERM_ENUM = 0x92; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBAD: _MM_PERM_ENUM = 0x93; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBBA: _MM_PERM_ENUM = 0x94; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBBB: _MM_PERM_ENUM = 0x95; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBBC: _MM_PERM_ENUM = 0x96; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBBD: _MM_PERM_ENUM = 0x97; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBCA: _MM_PERM_ENUM = 0x98; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBCB: _MM_PERM_ENUM = 0x99; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBCC: _MM_PERM_ENUM = 0x9A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBCD: _MM_PERM_ENUM = 0x9B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBDA: _MM_PERM_ENUM = 0x9C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBDB: _MM_PERM_ENUM = 0x9D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBDC: _MM_PERM_ENUM = 0x9E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBDD: _MM_PERM_ENUM = 0x9F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCAA: _MM_PERM_ENUM = 0xA0; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCAB: _MM_PERM_ENUM = 0xA1; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCAC: _MM_PERM_ENUM = 0xA2; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCAD: _MM_PERM_ENUM = 0xA3; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCBA: _MM_PERM_ENUM = 0xA4; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCBB: _MM_PERM_ENUM = 0xA5; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCBC: _MM_PERM_ENUM = 0xA6; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCBD: _MM_PERM_ENUM = 0xA7; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCCA: _MM_PERM_ENUM = 0xA8; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCCB: _MM_PERM_ENUM = 0xA9; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCCC: _MM_PERM_ENUM = 0xAA; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCCD: _MM_PERM_ENUM = 0xAB; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCDA: _MM_PERM_ENUM = 0xAC; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCDB: _MM_PERM_ENUM = 0xAD; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub 
const _MM_PERM_CCDC: _MM_PERM_ENUM = 0xAE; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCDD: _MM_PERM_ENUM = 0xAF; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDAA: _MM_PERM_ENUM = 0xB0; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDAB: _MM_PERM_ENUM = 0xB1; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDAC: _MM_PERM_ENUM = 0xB2; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDAD: _MM_PERM_ENUM = 0xB3; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDBA: _MM_PERM_ENUM = 0xB4; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDBB: _MM_PERM_ENUM = 0xB5; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDBC: _MM_PERM_ENUM = 0xB6; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDBD: _MM_PERM_ENUM = 0xB7; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDCA: _MM_PERM_ENUM = 0xB8; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDCB: _MM_PERM_ENUM = 0xB9; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDCC: _MM_PERM_ENUM = 0xBA; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDCD: _MM_PERM_ENUM = 0xBB; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDDA: _MM_PERM_ENUM = 0xBC; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDDB: _MM_PERM_ENUM = 0xBD; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDDC: _MM_PERM_ENUM = 0xBE; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDDD: _MM_PERM_ENUM = 0xBF; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DAAA: _MM_PERM_ENUM = 0xC0; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DAAB: _MM_PERM_ENUM = 0xC1; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DAAC: _MM_PERM_ENUM = 0xC2; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DAAD: _MM_PERM_ENUM = 0xC3; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DABA: _MM_PERM_ENUM = 0xC4; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DABB: _MM_PERM_ENUM = 0xC5; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DABC: _MM_PERM_ENUM = 0xC6; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DABD: _MM_PERM_ENUM = 0xC7; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DACA: _MM_PERM_ENUM = 0xC8; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DACB: _MM_PERM_ENUM = 0xC9; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DACC: _MM_PERM_ENUM = 0xCA; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DACD: _MM_PERM_ENUM = 0xCB; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DADA: _MM_PERM_ENUM = 0xCC; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DADB: _MM_PERM_ENUM = 0xCD; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DADC: _MM_PERM_ENUM = 0xCE; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DADD: 
_MM_PERM_ENUM = 0xCF; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBAA: _MM_PERM_ENUM = 0xD0; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBAB: _MM_PERM_ENUM = 0xD1; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBAC: _MM_PERM_ENUM = 0xD2; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBAD: _MM_PERM_ENUM = 0xD3; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBBA: _MM_PERM_ENUM = 0xD4; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBBB: _MM_PERM_ENUM = 0xD5; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBBC: _MM_PERM_ENUM = 0xD6; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBBD: _MM_PERM_ENUM = 0xD7; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBCA: _MM_PERM_ENUM = 0xD8; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBCB: _MM_PERM_ENUM = 0xD9; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBCC: _MM_PERM_ENUM = 0xDA; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBCD: _MM_PERM_ENUM = 0xDB; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBDA: _MM_PERM_ENUM = 0xDC; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBDB: _MM_PERM_ENUM = 0xDD; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBDC: _MM_PERM_ENUM = 0xDE; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBDD: _MM_PERM_ENUM = 0xDF; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCAA: _MM_PERM_ENUM = 0xE0; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCAB: _MM_PERM_ENUM = 0xE1; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCAC: _MM_PERM_ENUM = 0xE2; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCAD: _MM_PERM_ENUM = 0xE3; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCBA: _MM_PERM_ENUM = 0xE4; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCBB: _MM_PERM_ENUM = 0xE5; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCBC: _MM_PERM_ENUM = 0xE6; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCBD: _MM_PERM_ENUM = 0xE7; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCCA: _MM_PERM_ENUM = 0xE8; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCCB: _MM_PERM_ENUM = 0xE9; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCCC: _MM_PERM_ENUM = 0xEA; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCCD: _MM_PERM_ENUM = 0xEB; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCDA: _MM_PERM_ENUM = 0xEC; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCDB: _MM_PERM_ENUM = 0xED; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCDC: _MM_PERM_ENUM = 0xEE; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCDD: _MM_PERM_ENUM = 0xEF; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDAA: _MM_PERM_ENUM = 0xF0; 
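Editor's note: each _MM_PERM_* value packs four two-bit lane selectors (A = 0 through D = 3, named from the highest destination element down to the lowest), so _MM_PERM_BADC encodes 0b01_00_11_10 = 0x4E. These constants are intended as the immediate of the 32-bit shuffle intrinsics. An illustrative sketch, assuming _mm512_shuffle_epi32 with its usual const-generic mask parameter as defined elsewhere in this module:

use std::arch::x86_64::*;

// Rotates every 128-bit group of `a` by two 32-bit elements:
// [x0, x1, x2, x3] becomes [x2, x3, x0, x1].
// Only reachable when "avx512f" is known to be available at runtime.
#[target_feature(enable = "avx512f")]
fn rotate_groups(a: __m512i) -> __m512i {
    _mm512_shuffle_epi32::<_MM_PERM_BADC>(a)
}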
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDAB: _MM_PERM_ENUM = 0xF1; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDAC: _MM_PERM_ENUM = 0xF2; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDAD: _MM_PERM_ENUM = 0xF3; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDBA: _MM_PERM_ENUM = 0xF4; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDBB: _MM_PERM_ENUM = 0xF5; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDBC: _MM_PERM_ENUM = 0xF6; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDBD: _MM_PERM_ENUM = 0xF7; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDCA: _MM_PERM_ENUM = 0xF8; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDCB: _MM_PERM_ENUM = 0xF9; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDCC: _MM_PERM_ENUM = 0xFA; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDCD: _MM_PERM_ENUM = 0xFB; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDDA: _MM_PERM_ENUM = 0xFC; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDDB: _MM_PERM_ENUM = 0xFD; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDDC: _MM_PERM_ENUM = 0xFE; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDDD: _MM_PERM_ENUM = 0xFF; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.sqrt.ps.512"] + fn vsqrtps(a: f32x16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.sqrt.pd.512"] + fn vsqrtpd(a: f64x8, rounding: i32) -> f64x8; + + #[link_name = "llvm.x86.avx512.vfmadd.ps.512"] + fn vfmadd132psround(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512; + #[link_name = "llvm.x86.avx512.vfmadd.pd.512"] + fn vfmadd132pdround(a: __m512d, b: __m512d, c: __m512d, rounding: i32) -> __m512d; + + #[link_name = "llvm.x86.avx512.vfmaddsub.ps.512"] + fn vfmaddsubpsround(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512; //from clang + #[link_name = "llvm.x86.avx512.vfmaddsub.pd.512"] + fn vfmaddsubpdround(a: __m512d, b: __m512d, c: __m512d, rounding: i32) -> __m512d; //from clang + + #[link_name = "llvm.x86.avx512.add.ps.512"] + fn vaddps(a: f32x16, b: f32x16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.add.pd.512"] + fn vaddpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.sub.ps.512"] + fn vsubps(a: f32x16, b: f32x16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.sub.pd.512"] + fn vsubpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mul.ps.512"] + fn vmulps(a: f32x16, b: f32x16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mul.pd.512"] + fn vmulpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.div.ps.512"] + fn vdivps(a: f32x16, b: f32x16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.div.pd.512"] + fn vdivpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8; + + #[link_name = "llvm.x86.avx512.max.ps.512"] + fn vmaxps(a: f32x16, b: f32x16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.max.pd.512"] + fn vmaxpd(a: f64x8, b: f64x8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.min.ps.512"] + fn vminps(a: f32x16, b: f32x16, sae: i32) -> f32x16; + 
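Editor's note: the unsafe extern "C" block that begins above binds the LLVM intrinsics backing the rounding/SAE variants; the public _mm512_*_round_* functions defined earlier in this file validate the immediate and forward to these bindings. A simplified, hypothetical sketch of that pattern, using the vaddps binding declared above (not the actual wrapper, which additionally checks ROUNDING, e.g. via static_assert_rounding!):

// Sketch only: `as_f32x16()` and `transmute` are the crate-internal helpers
// used throughout this file to move between `__m512` and `f32x16`.
#[target_feature(enable = "avx512f")]
fn add_round_ps_sketch<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512 {
    // Calling the extern LLVM intrinsic is unsafe; the wrapper confines it.
    unsafe { transmute(vaddps(a.as_f32x16(), b.as_f32x16(), ROUNDING)) }
}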
#[link_name = "llvm.x86.avx512.min.pd.512"] + fn vminpd(a: f64x8, b: f64x8, sae: i32) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.getexp.ps.512"] + fn vgetexpps(a: f32x16, src: f32x16, m: u16, sae: i32) -> f32x16; + + #[link_name = "llvm.x86.avx512.mask.getexp.ps.256"] + fn vgetexpps256(a: f32x8, src: f32x8, m: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.getexp.ps.128"] + fn vgetexpps128(a: f32x4, src: f32x4, m: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.getexp.pd.512"] + fn vgetexppd(a: f64x8, src: f64x8, m: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.getexp.pd.256"] + fn vgetexppd256(a: f64x4, src: f64x4, m: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.getexp.pd.128"] + fn vgetexppd128(a: f64x2, src: f64x2, m: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.rndscale.ps.512"] + fn vrndscaleps(a: f32x16, imm8: i32, src: f32x16, mask: u16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.rndscale.ps.256"] + fn vrndscaleps256(a: f32x8, imm8: i32, src: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.rndscale.ps.128"] + fn vrndscaleps128(a: f32x4, imm8: i32, src: f32x4, mask: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.rndscale.pd.512"] + fn vrndscalepd(a: f64x8, imm8: i32, src: f64x8, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.rndscale.pd.256"] + fn vrndscalepd256(a: f64x4, imm8: i32, src: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.rndscale.pd.128"] + fn vrndscalepd128(a: f64x2, imm8: i32, src: f64x2, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.scalef.ps.512"] + fn vscalefps(a: f32x16, b: f32x16, src: f32x16, mask: u16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.scalef.ps.256"] + fn vscalefps256(a: f32x8, b: f32x8, src: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.scalef.ps.128"] + fn vscalefps128(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.scalef.pd.512"] + fn vscalefpd(a: f64x8, b: f64x8, src: f64x8, mask: u8, rounding: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.scalef.pd.256"] + fn vscalefpd256(a: f64x4, b: f64x4, src: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.scalef.pd.128"] + fn vscalefpd128(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.512"] + fn vfixupimmps(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.256"] + fn vfixupimmps256(a: f32x8, b: f32x8, c: i32x8, imm8: i32, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.128"] + fn vfixupimmps128(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.512"] + fn vfixupimmpd(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.256"] + fn vfixupimmpd256(a: f64x4, b: f64x4, c: i64x4, imm8: i32, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.128"] + fn vfixupimmpd128(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.512"] + fn vfixupimmpsz(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.256"] + fn vfixupimmpsz256(a: f32x8, b: f32x8, c: i32x8, imm8: i32, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.128"] + 
fn vfixupimmpsz128(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.512"] + fn vfixupimmpdz(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.256"] + fn vfixupimmpdz256(a: f64x4, b: f64x4, c: i64x4, imm8: i32, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.128"] + fn vfixupimmpdz128(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.pternlog.d.512"] + fn vpternlogd(a: i32x16, b: i32x16, c: i32x16, imm8: i32) -> i32x16; + #[link_name = "llvm.x86.avx512.pternlog.d.256"] + fn vpternlogd256(a: i32x8, b: i32x8, c: i32x8, imm8: i32) -> i32x8; + #[link_name = "llvm.x86.avx512.pternlog.d.128"] + fn vpternlogd128(a: i32x4, b: i32x4, c: i32x4, imm8: i32) -> i32x4; + + #[link_name = "llvm.x86.avx512.pternlog.q.512"] + fn vpternlogq(a: i64x8, b: i64x8, c: i64x8, imm8: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.pternlog.q.256"] + fn vpternlogq256(a: i64x4, b: i64x4, c: i64x4, imm8: i32) -> i64x4; + #[link_name = "llvm.x86.avx512.pternlog.q.128"] + fn vpternlogq128(a: i64x2, b: i64x2, c: i64x2, imm8: i32) -> i64x2; + + #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"] + fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.getmant.ps.256"] + fn vgetmantps256(a: f32x8, mantissas: i32, src: f32x8, m: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.getmant.ps.128"] + fn vgetmantps128(a: f32x4, mantissas: i32, src: f32x4, m: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.getmant.pd.512"] + fn vgetmantpd(a: f64x8, mantissas: i32, src: f64x8, m: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.getmant.pd.256"] + fn vgetmantpd256(a: f64x4, mantissas: i32, src: f64x4, m: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.getmant.pd.128"] + fn vgetmantpd128(a: f64x2, mantissas: i32, src: f64x2, m: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.rcp14.ps.512"] + fn vrcp14ps(a: f32x16, src: f32x16, m: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.rcp14.ps.256"] + fn vrcp14ps256(a: f32x8, src: f32x8, m: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.rcp14.ps.128"] + fn vrcp14ps128(a: f32x4, src: f32x4, m: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.rcp14.pd.512"] + fn vrcp14pd(a: f64x8, src: f64x8, m: u8) -> f64x8; + #[link_name = "llvm.x86.avx512.rcp14.pd.256"] + fn vrcp14pd256(a: f64x4, src: f64x4, m: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.rcp14.pd.128"] + fn vrcp14pd128(a: f64x2, src: f64x2, m: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.rsqrt14.ps.512"] + fn vrsqrt14ps(a: f32x16, src: f32x16, m: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.rsqrt14.ps.256"] + fn vrsqrt14ps256(a: f32x8, src: f32x8, m: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.rsqrt14.ps.128"] + fn vrsqrt14ps128(a: f32x4, src: f32x4, m: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.rsqrt14.pd.512"] + fn vrsqrt14pd(a: f64x8, src: f64x8, m: u8) -> f64x8; + #[link_name = "llvm.x86.avx512.rsqrt14.pd.256"] + fn vrsqrt14pd256(a: f64x4, src: f64x4, m: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.rsqrt14.pd.128"] + fn vrsqrt14pd128(a: f64x2, src: f64x2, m: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.cvtps2dq.512"] + fn vcvtps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16; + + #[link_name = "llvm.x86.avx512.mask.cvtps2udq.512"] + fn vcvtps2udq(a: f32x16, src: u32x16, mask: u16, rounding: i32) 
-> u32x16; + #[link_name = "llvm.x86.avx512.mask.cvtps2udq.256"] + fn vcvtps2udq256(a: f32x8, src: u32x8, mask: u8) -> u32x8; + #[link_name = "llvm.x86.avx512.mask.cvtps2udq.128"] + fn vcvtps2udq128(a: f32x4, src: u32x4, mask: u8) -> u32x4; + + #[link_name = "llvm.x86.avx512.mask.cvtps2pd.512"] + fn vcvtps2pd(a: f32x8, src: f64x8, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.cvtpd2ps.512"] + fn vcvtpd2ps(a: f64x8, src: f32x8, mask: u8, rounding: i32) -> f32x8; + + #[link_name = "llvm.x86.avx512.mask.cvtpd2dq.512"] + fn vcvtpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8; + + #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.512"] + fn vcvtpd2udq(a: f64x8, src: u32x8, mask: u8, rounding: i32) -> u32x8; + #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.256"] + fn vcvtpd2udq256(a: f64x4, src: u32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.128"] + fn vcvtpd2udq128(a: f64x2, src: u32x4, mask: u8) -> u32x4; + + #[link_name = "llvm.x86.avx512.sitofp.round.v16f32.v16i32"] + fn vcvtdq2ps(a: i32x16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.uitofp.round.v16f32.v16i32"] + fn vcvtudq2ps(a: u32x16, rounding: i32) -> f32x16; + + #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.512"] + fn vcvtps2ph(a: f32x16, rounding: i32, src: i16x16, mask: u16) -> i16x16; + #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.256"] + fn vcvtps2ph256(a: f32x8, imm8: i32, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.128"] + fn vcvtps2ph128(a: f32x4, imm8: i32, src: i16x8, mask: u8) -> i16x8; + + #[link_name = "llvm.x86.avx512.mask.vcvtph2ps.512"] + fn vcvtph2ps(a: i16x16, src: f32x16, mask: u16, sae: i32) -> f32x16; + + #[link_name = "llvm.x86.avx512.mask.cvttps2dq.512"] + fn vcvttps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.cvttps2dq.256"] + fn vcvttps2dq256(a: f32x8, src: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.cvttps2dq.128"] + fn vcvttps2dq128(a: f32x4, src: i32x4, mask: u8) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.cvttps2udq.512"] + fn vcvttps2udq(a: f32x16, src: u32x16, mask: u16, rounding: i32) -> u32x16; + #[link_name = "llvm.x86.avx512.mask.cvttps2udq.256"] + fn vcvttps2udq256(a: f32x8, src: u32x8, mask: u8) -> u32x8; + #[link_name = "llvm.x86.avx512.mask.cvttps2udq.128"] + fn vcvttps2udq128(a: f32x4, src: u32x4, mask: u8) -> u32x4; + + #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.512"] + fn vcvttpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.256"] + fn vcvttpd2dq256(a: f64x4, src: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.128"] + fn vcvttpd2dq128(a: f64x2, src: i32x4, mask: u8) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.512"] + fn vcvttpd2udq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> u32x8; + #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.256"] + fn vcvttpd2udq256(a: f64x4, src: i32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.128"] + fn vcvttpd2udq128(a: f64x2, src: i32x4, mask: u8) -> u32x4; + + #[link_name = "llvm.x86.avx512.mask.pmov.dw.128"] + fn vpmovdw128(a: i32x4, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmov.db.256"] + fn vpmovdb256(a: i32x8, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmov.db.128"] + fn vpmovdb128(a: i32x4, src: i8x16, mask: u8) -> i8x16; + + #[link_name = 
"llvm.x86.avx512.mask.pmov.qw.256"] + fn vpmovqw256(a: i64x4, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmov.qw.128"] + fn vpmovqw128(a: i64x2, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmov.qb.256"] + fn vpmovqb256(a: i64x4, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmov.qb.128"] + fn vpmovqb128(a: i64x2, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmov.qd.128"] + fn vpmovqd128(a: i64x2, src: i32x4, mask: u8) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.512"] + fn vpmovdwmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.256"] + fn vpmovdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.128"] + fn vpmovdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.512"] + fn vpmovsdwmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.256"] + fn vpmovsdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.128"] + fn vpmovsdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.512"] + fn vpmovusdwmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.256"] + fn vpmovusdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.128"] + fn vpmovusdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.512"] + fn vpmovdbmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.256"] + fn vpmovdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.128"] + fn vpmovdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.512"] + fn vpmovsdbmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.256"] + fn vpmovsdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.128"] + fn vpmovsdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.512"] + fn vpmovusdbmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.256"] + fn vpmovusdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.128"] + fn vpmovusdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.512"] + fn vpmovqwmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.256"] + fn vpmovqwmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.128"] + fn vpmovqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.512"] + fn vpmovsqwmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.256"] + fn vpmovsqwmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.128"] + fn vpmovsqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.512"] + fn vpmovusqwmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.256"] + fn vpmovusqwmem256(mem_addr: *mut 
i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.128"] + fn vpmovusqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.512"] + fn vpmovqbmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.256"] + fn vpmovqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.128"] + fn vpmovqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.512"] + fn vpmovsqbmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.256"] + fn vpmovsqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.128"] + fn vpmovsqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.512"] + fn vpmovusqbmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.256"] + fn vpmovusqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.128"] + fn vpmovusqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.512"] + fn vpmovqdmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.256"] + fn vpmovqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.128"] + fn vpmovqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.512"] + fn vpmovsqdmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.256"] + fn vpmovsqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.128"] + fn vpmovsqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.512"] + fn vpmovusqdmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.256"] + fn vpmovusqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.128"] + fn vpmovusqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmov.qb.512"] + fn vpmovqb(a: i64x8, src: i8x16, mask: u8) -> i8x16; + + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.512"] + fn vpmovsdw(a: i32x16, src: i16x16, mask: u16) -> i16x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.256"] + fn vpmovsdw256(a: i32x8, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.128"] + fn vpmovsdw128(a: i32x4, src: i16x8, mask: u8) -> i16x8; + + #[link_name = "llvm.x86.avx512.mask.pmovs.db.512"] + fn vpmovsdb(a: i32x16, src: i8x16, mask: u16) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.db.256"] + fn vpmovsdb256(a: i32x8, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.db.128"] + fn vpmovsdb128(a: i32x4, src: i8x16, mask: u8) -> i8x16; + + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.512"] + fn vpmovsqd(a: i64x8, src: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.256"] + fn vpmovsqd256(a: i64x4, src: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.128"] + fn vpmovsqd128(a: i64x2, src: i32x4, mask: u8) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.512"] + fn vpmovsqw(a: i64x8, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.256"] + fn vpmovsqw256(a: i64x4, 
src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.128"] + fn vpmovsqw128(a: i64x2, src: i16x8, mask: u8) -> i16x8; + + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.512"] + fn vpmovsqb(a: i64x8, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.256"] + fn vpmovsqb256(a: i64x4, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.128"] + fn vpmovsqb128(a: i64x2, src: i8x16, mask: u8) -> i8x16; + + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.512"] + fn vpmovusdw(a: u32x16, src: u16x16, mask: u16) -> u16x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.256"] + fn vpmovusdw256(a: u32x8, src: u16x8, mask: u8) -> u16x8; + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.128"] + fn vpmovusdw128(a: u32x4, src: u16x8, mask: u8) -> u16x8; + + #[link_name = "llvm.x86.avx512.mask.pmovus.db.512"] + fn vpmovusdb(a: u32x16, src: u8x16, mask: u16) -> u8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.db.256"] + fn vpmovusdb256(a: u32x8, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.db.128"] + fn vpmovusdb128(a: u32x4, src: u8x16, mask: u8) -> u8x16; + + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.512"] + fn vpmovusqd(a: u64x8, src: u32x8, mask: u8) -> u32x8; + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.256"] + fn vpmovusqd256(a: u64x4, src: u32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.128"] + fn vpmovusqd128(a: u64x2, src: u32x4, mask: u8) -> u32x4; + + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.512"] + fn vpmovusqw(a: u64x8, src: u16x8, mask: u8) -> u16x8; + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.256"] + fn vpmovusqw256(a: u64x4, src: u16x8, mask: u8) -> u16x8; + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.128"] + fn vpmovusqw128(a: u64x2, src: u16x8, mask: u8) -> u16x8; + + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.512"] + fn vpmovusqb(a: u64x8, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.256"] + fn vpmovusqb256(a: u64x4, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.128"] + fn vpmovusqb128(a: u64x2, src: u8x16, mask: u8) -> u8x16; + + #[link_name = "llvm.x86.avx512.gather.dpd.512"] + fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.gather.dps.512"] + fn vgatherdps(src: f32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.gather.qpd.512"] + fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.gather.qps.512"] + fn vgatherqps(src: f32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f32x8; + #[link_name = "llvm.x86.avx512.gather.dpq.512"] + fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.gather.dpi.512"] + fn vpgatherdd(src: i32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> i32x16; + #[link_name = "llvm.x86.avx512.gather.qpq.512"] + fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.gather.qpi.512"] + fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8; + + #[link_name = "llvm.x86.avx512.scatter.dpd.512"] + fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.dps.512"] + fn 
vscatterdps(slice: *mut i8, mask: i16, offsets: i32x16, src: f32x16, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpd.512"] + fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qps.512"] + fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.dpq.512"] + fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32); + + #[link_name = "llvm.x86.avx512.scatter.dpi.512"] + fn vpscatterdd(slice: *mut i8, mask: i16, offsets: i32x16, src: i32x16, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpq.512"] + fn vpscatterqq(slice: *mut i8, mask: i8, offsets: i64x8, src: i64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpi.512"] + fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32); + + #[link_name = "llvm.x86.avx512.scattersiv4.si"] + fn vpscatterdd_128(slice: *mut i8, k: u8, offsets: i32x4, src: i32x4, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv2.di"] + fn vpscatterdq_128(slice: *mut i8, k: u8, offsets: i32x4, src: i64x2, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv2.df"] + fn vscatterdpd_128(slice: *mut i8, k: u8, offsets: i32x4, src: f64x2, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv4.sf"] + fn vscatterdps_128(slice: *mut i8, k: u8, offsets: i32x4, src: f32x4, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv4.si"] + fn vpscatterqd_128(slice: *mut i8, k: u8, offsets: i64x2, src: i32x4, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv2.di"] + fn vpscatterqq_128(slice: *mut i8, k: u8, offsets: i64x2, src: i64x2, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv2.df"] + fn vscatterqpd_128(slice: *mut i8, k: u8, offsets: i64x2, src: f64x2, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv4.sf"] + fn vscatterqps_128(slice: *mut i8, k: u8, offsets: i64x2, src: f32x4, scale: i32); + + #[link_name = "llvm.x86.avx512.scattersiv8.si"] + fn vpscatterdd_256(slice: *mut i8, k: u8, offsets: i32x8, src: i32x8, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv4.di"] + fn vpscatterdq_256(slice: *mut i8, k: u8, offsets: i32x4, src: i64x4, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv4.df"] + fn vscatterdpd_256(slice: *mut i8, k: u8, offsets: i32x4, src: f64x4, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv8.sf"] + fn vscatterdps_256(slice: *mut i8, k: u8, offsets: i32x8, src: f32x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv8.si"] + fn vpscatterqd_256(slice: *mut i8, k: u8, offsets: i64x4, src: i32x4, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv4.di"] + fn vpscatterqq_256(slice: *mut i8, k: u8, offsets: i64x4, src: i64x4, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv4.df"] + fn vscatterqpd_256(slice: *mut i8, k: u8, offsets: i64x4, src: f64x4, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv8.sf"] + fn vscatterqps_256(slice: *mut i8, k: u8, offsets: i64x4, src: f32x4, scale: i32); + + #[link_name = "llvm.x86.avx512.gather3siv4.si"] + fn vpgatherdd_128(src: i32x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i32x4; + #[link_name = "llvm.x86.avx512.gather3siv2.di"] + fn vpgatherdq_128(src: i64x2, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i64x2; + #[link_name = "llvm.x86.avx512.gather3siv2.df"] + fn vgatherdpd_128(src: f64x2, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.gather3siv4.sf"] + fn 
vgatherdps_128(src: f32x4, slice: *const u8, offsets: i32x4, k: u8, scale: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.gather3div4.si"] + fn vpgatherqd_128(src: i32x4, slice: *const u8, offsets: i64x2, k: u8, scale: i32) -> i32x4; + #[link_name = "llvm.x86.avx512.gather3div2.di"] + fn vpgatherqq_128(src: i64x2, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> i64x2; + #[link_name = "llvm.x86.avx512.gather3div2.df"] + fn vgatherqpd_128(src: f64x2, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.gather3div4.sf"] + fn vgatherqps_128(src: f32x4, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> f32x4; + + #[link_name = "llvm.x86.avx512.gather3siv8.si"] + fn vpgatherdd_256(src: i32x8, slice: *const i8, offsets: i32x8, k: u8, scale: i32) -> i32x8; + #[link_name = "llvm.x86.avx512.gather3siv4.di"] + fn vpgatherdq_256(src: i64x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i64x4; + #[link_name = "llvm.x86.avx512.gather3siv4.df"] + fn vgatherdpd_256(src: f64x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> f64x4; + #[link_name = "llvm.x86.avx512.gather3siv8.sf"] + fn vgatherdps_256(src: f32x8, slice: *const i8, offsets: i32x8, k: u8, scale: i32) -> f32x8; + #[link_name = "llvm.x86.avx512.gather3div8.si"] + fn vpgatherqd_256(src: i32x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> i32x4; + #[link_name = "llvm.x86.avx512.gather3div4.di"] + fn vpgatherqq_256(src: i64x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> i64x4; + #[link_name = "llvm.x86.avx512.gather3div4.df"] + fn vgatherqpd_256(src: f64x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> f64x4; + #[link_name = "llvm.x86.avx512.gather3div8.sf"] + fn vgatherqps_256(src: f32x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.cmp.ss"] + fn vcmpss(a: __m128, b: __m128, op: i32, m: i8, sae: i32) -> i8; + #[link_name = "llvm.x86.avx512.mask.cmp.sd"] + fn vcmpsd(a: __m128d, b: __m128d, op: i32, m: i8, sae: i32) -> i8; + + #[link_name = "llvm.x86.avx512.mask.cmp.ps.512"] + fn vcmpps(a: f32x16, b: f32x16, op: i32, m: i16, sae: i32) -> i16; + #[link_name = "llvm.x86.avx512.mask.cmp.ps.256"] + fn vcmpps256(a: f32x8, b: f32x8, op: i32, m: i8) -> i8; + #[link_name = "llvm.x86.avx512.mask.cmp.ps.128"] + fn vcmpps128(a: f32x4, b: f32x4, op: i32, m: i8) -> i8; + + #[link_name = "llvm.x86.avx512.mask.cmp.pd.512"] + fn vcmppd(a: f64x8, b: f64x8, op: i32, m: i8, sae: i32) -> i8; + #[link_name = "llvm.x86.avx512.mask.cmp.pd.256"] + fn vcmppd256(a: f64x4, b: f64x4, op: i32, m: i8) -> i8; + #[link_name = "llvm.x86.avx512.mask.cmp.pd.128"] + fn vcmppd128(a: f64x2, b: f64x2, op: i32, m: i8) -> i8; + + #[link_name = "llvm.x86.avx512.mask.prol.d.512"] + fn vprold(a: i32x16, i8: i32) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.prol.d.256"] + fn vprold256(a: i32x8, i8: i32) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.prol.d.128"] + fn vprold128(a: i32x4, i8: i32) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.pror.d.512"] + fn vprord(a: i32x16, i8: i32) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.pror.d.256"] + fn vprord256(a: i32x8, i8: i32) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.pror.d.128"] + fn vprord128(a: i32x4, i8: i32) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.prol.q.512"] + fn vprolq(a: i64x8, i8: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.prol.q.256"] + fn vprolq256(a: i64x4, i8: i32) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.prol.q.128"] + 
fn vprolq128(a: i64x2, i8: i32) -> i64x2; + + #[link_name = "llvm.x86.avx512.mask.pror.q.512"] + fn vprorq(a: i64x8, i8: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.pror.q.256"] + fn vprorq256(a: i64x4, i8: i32) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.pror.q.128"] + fn vprorq128(a: i64x2, i8: i32) -> i64x2; + + #[link_name = "llvm.x86.avx512.mask.prolv.d.512"] + fn vprolvd(a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.prolv.d.256"] + fn vprolvd256(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.prolv.d.128"] + fn vprolvd128(a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.prorv.d.512"] + fn vprorvd(a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.prorv.d.256"] + fn vprorvd256(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.prorv.d.128"] + fn vprorvd128(a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.prolv.q.512"] + fn vprolvq(a: i64x8, b: i64x8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.prolv.q.256"] + fn vprolvq256(a: i64x4, b: i64x4) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.prolv.q.128"] + fn vprolvq128(a: i64x2, b: i64x2) -> i64x2; + + #[link_name = "llvm.x86.avx512.mask.prorv.q.512"] + fn vprorvq(a: i64x8, b: i64x8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.prorv.q.256"] + fn vprorvq256(a: i64x4, b: i64x4) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.prorv.q.128"] + fn vprorvq128(a: i64x2, b: i64x2) -> i64x2; + + #[link_name = "llvm.x86.avx512.psllv.d.512"] + fn vpsllvd(a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.psrlv.d.512"] + fn vpsrlvd(a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.psllv.q.512"] + fn vpsllvq(a: i64x8, b: i64x8) -> i64x8; + #[link_name = "llvm.x86.avx512.psrlv.q.512"] + fn vpsrlvq(a: i64x8, b: i64x8) -> i64x8; + + #[link_name = "llvm.x86.avx512.psll.d.512"] + fn vpslld(a: i32x16, count: i32x4) -> i32x16; + #[link_name = "llvm.x86.avx512.psrl.d.512"] + fn vpsrld(a: i32x16, count: i32x4) -> i32x16; + #[link_name = "llvm.x86.avx512.psll.q.512"] + fn vpsllq(a: i64x8, count: i64x2) -> i64x8; + #[link_name = "llvm.x86.avx512.psrl.q.512"] + fn vpsrlq(a: i64x8, count: i64x2) -> i64x8; + + #[link_name = "llvm.x86.avx512.psra.d.512"] + fn vpsrad(a: i32x16, count: i32x4) -> i32x16; + + #[link_name = "llvm.x86.avx512.psra.q.512"] + fn vpsraq(a: i64x8, count: i64x2) -> i64x8; + #[link_name = "llvm.x86.avx512.psra.q.256"] + fn vpsraq256(a: i64x4, count: i64x2) -> i64x4; + #[link_name = "llvm.x86.avx512.psra.q.128"] + fn vpsraq128(a: i64x2, count: i64x2) -> i64x2; + + #[link_name = "llvm.x86.avx512.psrav.d.512"] + fn vpsravd(a: i32x16, count: i32x16) -> i32x16; + + #[link_name = "llvm.x86.avx512.psrav.q.512"] + fn vpsravq(a: i64x8, count: i64x8) -> i64x8; + #[link_name = "llvm.x86.avx512.psrav.q.256"] + fn vpsravq256(a: i64x4, count: i64x4) -> i64x4; + #[link_name = "llvm.x86.avx512.psrav.q.128"] + fn vpsravq128(a: i64x2, count: i64x2) -> i64x2; + + #[link_name = "llvm.x86.avx512.vpermilvar.ps.512"] + fn vpermilps(a: f32x16, b: i32x16) -> f32x16; + #[link_name = "llvm.x86.avx512.vpermilvar.pd.512"] + fn vpermilpd(a: f64x8, b: i64x8) -> f64x8; + + #[link_name = "llvm.x86.avx512.permvar.si.512"] + fn vpermd(a: i32x16, idx: i32x16) -> i32x16; + + #[link_name = "llvm.x86.avx512.permvar.di.512"] + fn vpermq(a: i64x8, idx: i64x8) -> i64x8; + #[link_name = "llvm.x86.avx512.permvar.di.256"] + fn vpermq256(a: i64x4, idx: i64x4) -> i64x4; + + #[link_name = 
"llvm.x86.avx512.permvar.sf.512"] + fn vpermps(a: f32x16, idx: i32x16) -> f32x16; + + #[link_name = "llvm.x86.avx512.permvar.df.512"] + fn vpermpd(a: f64x8, idx: i64x8) -> f64x8; + #[link_name = "llvm.x86.avx512.permvar.df.256"] + fn vpermpd256(a: f64x4, idx: i64x4) -> f64x4; + + #[link_name = "llvm.x86.avx512.vpermi2var.d.512"] + fn vpermi2d(a: i32x16, idx: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.vpermi2var.d.256"] + fn vpermi2d256(a: i32x8, idx: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.vpermi2var.d.128"] + fn vpermi2d128(a: i32x4, idx: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.vpermi2var.q.512"] + fn vpermi2q(a: i64x8, idx: i64x8, b: i64x8) -> i64x8; + #[link_name = "llvm.x86.avx512.vpermi2var.q.256"] + fn vpermi2q256(a: i64x4, idx: i64x4, b: i64x4) -> i64x4; + #[link_name = "llvm.x86.avx512.vpermi2var.q.128"] + fn vpermi2q128(a: i64x2, idx: i64x2, b: i64x2) -> i64x2; + + #[link_name = "llvm.x86.avx512.vpermi2var.ps.512"] + fn vpermi2ps(a: f32x16, idx: i32x16, b: f32x16) -> f32x16; + #[link_name = "llvm.x86.avx512.vpermi2var.ps.256"] + fn vpermi2ps256(a: f32x8, idx: i32x8, b: f32x8) -> f32x8; + #[link_name = "llvm.x86.avx512.vpermi2var.ps.128"] + fn vpermi2ps128(a: f32x4, idx: i32x4, b: f32x4) -> f32x4; + + #[link_name = "llvm.x86.avx512.vpermi2var.pd.512"] + fn vpermi2pd(a: f64x8, idx: i64x8, b: f64x8) -> f64x8; + #[link_name = "llvm.x86.avx512.vpermi2var.pd.256"] + fn vpermi2pd256(a: f64x4, idx: i64x4, b: f64x4) -> f64x4; + #[link_name = "llvm.x86.avx512.vpermi2var.pd.128"] + fn vpermi2pd128(a: f64x2, idx: i64x2, b: f64x2) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.compress.d.512"] + fn vpcompressd(a: i32x16, src: i32x16, mask: u16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.compress.d.256"] + fn vpcompressd256(a: i32x8, src: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.compress.d.128"] + fn vpcompressd128(a: i32x4, src: i32x4, mask: u8) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.compress.q.512"] + fn vpcompressq(a: i64x8, src: i64x8, mask: u8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.compress.q.256"] + fn vpcompressq256(a: i64x4, src: i64x4, mask: u8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.compress.q.128"] + fn vpcompressq128(a: i64x2, src: i64x2, mask: u8) -> i64x2; + + #[link_name = "llvm.x86.avx512.mask.compress.ps.512"] + fn vcompressps(a: f32x16, src: f32x16, mask: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.compress.ps.256"] + fn vcompressps256(a: f32x8, src: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.compress.ps.128"] + fn vcompressps128(a: f32x4, src: f32x4, mask: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.compress.pd.512"] + fn vcompresspd(a: f64x8, src: f64x8, mask: u8) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.compress.pd.256"] + fn vcompresspd256(a: f64x4, src: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.compress.pd.128"] + fn vcompresspd128(a: f64x2, src: f64x2, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.compress.store.d.512"] + fn vcompressstored(mem: *mut i8, data: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.compress.store.d.256"] + fn vcompressstored256(mem: *mut i8, data: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.d.128"] + fn vcompressstored128(mem: *mut i8, data: i32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.compress.store.q.512"] + fn vcompressstoreq(mem: *mut i8, data: i64x8, mask: u8); + #[link_name = 
"llvm.x86.avx512.mask.compress.store.q.256"] + fn vcompressstoreq256(mem: *mut i8, data: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.q.128"] + fn vcompressstoreq128(mem: *mut i8, data: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.compress.store.ps.512"] + fn vcompressstoreps(mem: *mut i8, data: f32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.compress.store.ps.256"] + fn vcompressstoreps256(mem: *mut i8, data: f32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.ps.128"] + fn vcompressstoreps128(mem: *mut i8, data: f32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.compress.store.pd.512"] + fn vcompressstorepd(mem: *mut i8, data: f64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.pd.256"] + fn vcompressstorepd256(mem: *mut i8, data: f64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.pd.128"] + fn vcompressstorepd128(mem: *mut i8, data: f64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.expand.d.512"] + fn vpexpandd(a: i32x16, src: i32x16, mask: u16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.expand.d.256"] + fn vpexpandd256(a: i32x8, src: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.expand.d.128"] + fn vpexpandd128(a: i32x4, src: i32x4, mask: u8) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.expand.q.512"] + fn vpexpandq(a: i64x8, src: i64x8, mask: u8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.expand.q.256"] + fn vpexpandq256(a: i64x4, src: i64x4, mask: u8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.expand.q.128"] + fn vpexpandq128(a: i64x2, src: i64x2, mask: u8) -> i64x2; + + #[link_name = "llvm.x86.avx512.mask.expand.ps.512"] + fn vexpandps(a: f32x16, src: f32x16, mask: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.expand.ps.256"] + fn vexpandps256(a: f32x8, src: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.expand.ps.128"] + fn vexpandps128(a: f32x4, src: f32x4, mask: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.expand.pd.512"] + fn vexpandpd(a: f64x8, src: f64x8, mask: u8) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.expand.pd.256"] + fn vexpandpd256(a: f64x4, src: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.expand.pd.128"] + fn vexpandpd128(a: f64x2, src: f64x2, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.add.ss.round"] + fn vaddss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.add.sd.round"] + fn vaddsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.sub.ss.round"] + fn vsubss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.sub.sd.round"] + fn vsubsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.mul.ss.round"] + fn vmulss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.mul.sd.round"] + fn vmulsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.div.ss.round"] + fn vdivss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.div.sd.round"] + fn vdivsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.max.ss.round"] + fn vmaxss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; + #[link_name = 
"llvm.x86.avx512.mask.max.sd.round"] + fn vmaxsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.min.ss.round"] + fn vminss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.min.sd.round"] + fn vminsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.sqrt.ss"] + fn vsqrtss(a: __m128, b: __m128, src: __m128, mask: u8, rounding: i32) -> __m128; + #[link_name = "llvm.x86.avx512.mask.sqrt.sd"] + fn vsqrtsd(a: __m128d, b: __m128d, src: __m128d, mask: u8, rounding: i32) -> __m128d; + #[link_name = "llvm.x86.avx512.mask.getexp.ss"] + fn vgetexpss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.getexp.sd"] + fn vgetexpsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.getmant.ss"] + fn vgetmantss(a: f32x4, b: f32x4, mantissas: i32, src: f32x4, m: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.getmant.sd"] + fn vgetmantsd(a: f64x2, b: f64x2, mantissas: i32, src: f64x2, m: u8, sae: i32) -> f64x2; + + #[link_name = "llvm.x86.avx512.rsqrt14.ss"] + fn vrsqrt14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.rsqrt14.sd"] + fn vrsqrt14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2; + #[link_name = "llvm.x86.avx512.rcp14.ss"] + fn vrcp14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.rcp14.sd"] + fn vrcp14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.rndscale.ss"] + fn vrndscaless(a: f32x4, b: f32x4, src: f32x4, mask: u8, imm8: i32, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.rndscale.sd"] + fn vrndscalesd(a: f64x2, b: f64x2, src: f64x2, mask: u8, imm8: i32, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.scalef.ss"] + fn vscalefss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.scalef.sd"] + fn vscalefsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + + #[link_name = "llvm.x86.avx512.vfmadd.f32"] + fn vfmaddssround(a: f32, b: f32, c: f32, rounding: i32) -> f32; + #[link_name = "llvm.x86.avx512.vfmadd.f64"] + fn vfmaddsdround(a: f64, b: f64, c: f64, rounding: i32) -> f64; + + #[link_name = "llvm.x86.avx512.mask.fixupimm.ss"] + fn vfixupimmss(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.fixupimm.sd"] + fn vfixupimmsd(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.ss"] + fn vfixupimmssz(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.sd"] + fn vfixupimmsdz(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8, sae: i32) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.cvtss2sd.round"] + fn vcvtss2sd(a: f64x2, b: f32x4, src: f64x2, mask: u8, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.cvtsd2ss.round"] + fn vcvtsd2ss(a: f32x4, b: f64x2, src: f32x4, mask: u8, rounding: i32) -> f32x4; + + #[link_name = "llvm.x86.avx512.vcvtss2si32"] + fn vcvtss2si(a: f32x4, rounding: i32) -> i32; + #[link_name = "llvm.x86.avx512.vcvtss2usi32"] + fn vcvtss2usi(a: f32x4, rounding: i32) -> u32; + + #[link_name = "llvm.x86.avx512.vcvtsd2si32"] + fn vcvtsd2si(a: f64x2, rounding: i32) -> i32; + #[link_name = 
"llvm.x86.avx512.vcvtsd2usi32"] + fn vcvtsd2usi(a: f64x2, rounding: i32) -> u32; + + #[link_name = "llvm.x86.avx512.cvtsi2ss32"] + fn vcvtsi2ss(a: f32x4, b: i32, rounding: i32) -> f32x4; + + #[link_name = "llvm.x86.avx512.cvtusi2ss"] + fn vcvtusi2ss(a: f32x4, b: u32, rounding: i32) -> f32x4; + + #[link_name = "llvm.x86.avx512.cvttss2si"] + fn vcvttss2si(a: f32x4, rounding: i32) -> i32; + #[link_name = "llvm.x86.avx512.cvttss2usi"] + fn vcvttss2usi(a: f32x4, rounding: i32) -> u32; + + #[link_name = "llvm.x86.avx512.cvttsd2si"] + fn vcvttsd2si(a: f64x2, rounding: i32) -> i32; + #[link_name = "llvm.x86.avx512.cvttsd2usi"] + fn vcvttsd2usi(a: f64x2, rounding: i32) -> u32; + + #[link_name = "llvm.x86.avx512.vcomi.ss"] + fn vcomiss(a: f32x4, b: f32x4, imm8: i32, sae: i32) -> i32; + #[link_name = "llvm.x86.avx512.vcomi.sd"] + fn vcomisd(a: f64x2, b: f64x2, imm8: i32, sae: i32) -> i32; + + #[link_name = "llvm.x86.avx512.mask.loadu.d.128"] + fn loaddqu32_128(mem_addr: *const i32, a: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.loadu.q.128"] + fn loaddqu64_128(mem_addr: *const i64, a: i64x2, mask: u8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.loadu.ps.128"] + fn loadups_128(mem_addr: *const f32, a: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.loadu.pd.128"] + fn loadupd_128(mem_addr: *const f64, a: f64x2, mask: u8) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.loadu.d.256"] + fn loaddqu32_256(mem_addr: *const i32, a: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.loadu.q.256"] + fn loaddqu64_256(mem_addr: *const i64, a: i64x4, mask: u8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.loadu.ps.256"] + fn loadups_256(mem_addr: *const f32, a: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.loadu.pd.256"] + fn loadupd_256(mem_addr: *const f64, a: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.loadu.d.512"] + fn loaddqu32_512(mem_addr: *const i32, a: i32x16, mask: u16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.loadu.q.512"] + fn loaddqu64_512(mem_addr: *const i64, a: i64x8, mask: u8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.loadu.ps.512"] + fn loadups_512(mem_addr: *const f32, a: f32x16, mask: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.loadu.pd.512"] + fn loadupd_512(mem_addr: *const f64, a: f64x8, mask: u8) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.load.d.128"] + fn loaddqa32_128(mem_addr: *const i32, a: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.load.q.128"] + fn loaddqa64_128(mem_addr: *const i64, a: i64x2, mask: u8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.load.ps.128"] + fn loadaps_128(mem_addr: *const f32, a: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.load.pd.128"] + fn loadapd_128(mem_addr: *const f64, a: f64x2, mask: u8) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.load.d.256"] + fn loaddqa32_256(mem_addr: *const i32, a: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.load.q.256"] + fn loaddqa64_256(mem_addr: *const i64, a: i64x4, mask: u8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.load.ps.256"] + fn loadaps_256(mem_addr: *const f32, a: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.load.pd.256"] + fn loadapd_256(mem_addr: *const f64, a: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.load.d.512"] + fn loaddqa32_512(mem_addr: *const i32, a: i32x16, mask: u16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.load.q.512"] + fn loaddqa64_512(mem_addr: *const 
i64, a: i64x8, mask: u8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.load.ps.512"] + fn loadaps_512(mem_addr: *const f32, a: f32x16, mask: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.load.pd.512"] + fn loadapd_512(mem_addr: *const f64, a: f64x8, mask: u8) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.storeu.d.128"] + fn storedqu32_128(mem_addr: *mut i32, a: i32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.q.128"] + fn storedqu64_128(mem_addr: *mut i64, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.ps.128"] + fn storeups_128(mem_addr: *mut f32, a: f32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.pd.128"] + fn storeupd_128(mem_addr: *mut f64, a: f64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.d.256"] + fn storedqu32_256(mem_addr: *mut i32, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.q.256"] + fn storedqu64_256(mem_addr: *mut i64, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.ps.256"] + fn storeups_256(mem_addr: *mut f32, a: f32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.pd.256"] + fn storeupd_256(mem_addr: *mut f64, a: f64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.d.512"] + fn storedqu32_512(mem_addr: *mut i32, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.storeu.q.512"] + fn storedqu64_512(mem_addr: *mut i64, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.ps.512"] + fn storeups_512(mem_addr: *mut f32, a: f32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.storeu.pd.512"] + fn storeupd_512(mem_addr: *mut f64, a: f64x8, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.store.d.128"] + fn storedqa32_128(mem_addr: *mut i32, a: i32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.q.128"] + fn storedqa64_128(mem_addr: *mut i64, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.ps.128"] + fn storeaps_128(mem_addr: *mut f32, a: f32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.pd.128"] + fn storeapd_128(mem_addr: *mut f64, a: f64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.d.256"] + fn storedqa32_256(mem_addr: *mut i32, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.q.256"] + fn storedqa64_256(mem_addr: *mut i64, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.ps.256"] + fn storeaps_256(mem_addr: *mut f32, a: f32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.pd.256"] + fn storeapd_256(mem_addr: *mut f64, a: f64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.d.512"] + fn storedqa32_512(mem_addr: *mut i32, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.store.q.512"] + fn storedqa64_512(mem_addr: *mut i64, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.ps.512"] + fn storeaps_512(mem_addr: *mut f32, a: f32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.store.pd.512"] + fn storeapd_512(mem_addr: *mut f64, a: f64x8, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.expand.load.d.128"] + fn expandloadd_128(mem_addr: *const i32, a: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.expand.load.q.128"] + fn expandloadq_128(mem_addr: *const i64, a: i64x2, mask: u8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.expand.load.ps.128"] + fn expandloadps_128(mem_addr: *const f32, a: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.expand.load.pd.128"] + fn expandloadpd_128(mem_addr: *const f64, a: f64x2, mask: u8) -> f64x2; + #[link_name = 
"llvm.x86.avx512.mask.expand.load.d.256"] + fn expandloadd_256(mem_addr: *const i32, a: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.expand.load.q.256"] + fn expandloadq_256(mem_addr: *const i64, a: i64x4, mask: u8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.expand.load.ps.256"] + fn expandloadps_256(mem_addr: *const f32, a: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.expand.load.pd.256"] + fn expandloadpd_256(mem_addr: *const f64, a: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.expand.load.d.512"] + fn expandloadd_512(mem_addr: *const i32, a: i32x16, mask: u16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.expand.load.q.512"] + fn expandloadq_512(mem_addr: *const i64, a: i64x8, mask: u8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.expand.load.ps.512"] + fn expandloadps_512(mem_addr: *const f32, a: f32x16, mask: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.expand.load.pd.512"] + fn expandloadpd_512(mem_addr: *const f64, a: f64x8, mask: u8) -> f64x8; + +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + use crate::hint::black_box; + use crate::mem::{self}; + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_abs_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let r = _mm512_abs_epi32(a); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + 0, 1, 1, i32::MAX, + i32::MAX.wrapping_add(1), 100, 100, 32, + 0, 1, 1, i32::MAX, + i32::MAX.wrapping_add(1), 100, 100, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_abs_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let r = _mm512_mask_abs_epi32(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_abs_epi32(a, 0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + 0, 1, 1, i32::MAX, + i32::MAX.wrapping_add(1), 100, 100, 32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_abs_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let r = _mm512_maskz_abs_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_abs_epi32(0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + 0, 1, 1, i32::MAX, + i32::MAX.wrapping_add(1), 100, 100, 32, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_abs_epi32() { + #[rustfmt::skip] + let a = _mm256_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let r = _mm256_mask_abs_epi32(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_abs_epi32(a, 0b00001111, a); + #[rustfmt::skip] + let e = _mm256_setr_epi32( + 0, 1, 1, i32::MAX, + i32::MAX.wrapping_add(1), 100, -100, -32, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_abs_epi32() { + #[rustfmt::skip] + let a = _mm256_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let r = _mm256_maskz_abs_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_abs_epi32(0b00001111, 
a); + #[rustfmt::skip] + let e = _mm256_setr_epi32( + 0, 1, 1, i32::MAX, + 0, 0, 0, 0, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_abs_epi32() { + let a = _mm_setr_epi32(i32::MIN, 100, -100, -32); + let r = _mm_mask_abs_epi32(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_abs_epi32(a, 0b00001111, a); + let e = _mm_setr_epi32(i32::MAX.wrapping_add(1), 100, 100, 32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_abs_epi32() { + let a = _mm_setr_epi32(i32::MIN, 100, -100, -32); + let r = _mm_maskz_abs_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_abs_epi32(0b00001111, a); + let e = _mm_setr_epi32(i32::MAX.wrapping_add(1), 100, 100, 32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_abs_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let r = _mm512_abs_ps(a); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 1., 1., f32::MAX, + f32::MAX, 100., 100., 32., + 0., 1., 1., f32::MAX, + f32::MAX, 100., 100., 32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_abs_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let r = _mm512_mask_abs_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_abs_ps(a, 0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 1., 1., f32::MAX, + f32::MAX, 100., 100., 32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mov_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(2); + let r = _mm512_mask_mov_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_mov_epi32(src, 0b11111111_11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mov_epi32() { + let a = _mm512_set1_epi32(2); + let r = _mm512_maskz_mov_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mov_epi32(0b11111111_11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_mov_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(2); + let r = _mm256_mask_mov_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_mov_epi32(src, 0b11111111, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_mov_epi32() { + let a = _mm256_set1_epi32(2); + let r = _mm256_maskz_mov_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mov_epi32(0b11111111, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_mov_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(2); + let r = _mm_mask_mov_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_mov_epi32(src, 0b00001111, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_mov_epi32() { + let a = _mm_set1_epi32(2); + let r = _mm_maskz_mov_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mov_epi32(0b00001111, a); + assert_eq_m128i(r, 
a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mov_ps() { + let src = _mm512_set1_ps(1.); + let a = _mm512_set1_ps(2.); + let r = _mm512_mask_mov_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_mov_ps(src, 0b11111111_11111111, a); + assert_eq_m512(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mov_ps() { + let a = _mm512_set1_ps(2.); + let r = _mm512_maskz_mov_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_mov_ps(0b11111111_11111111, a); + assert_eq_m512(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_mov_ps() { + let src = _mm256_set1_ps(1.); + let a = _mm256_set1_ps(2.); + let r = _mm256_mask_mov_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_mov_ps(src, 0b11111111, a); + assert_eq_m256(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_mov_ps() { + let a = _mm256_set1_ps(2.); + let r = _mm256_maskz_mov_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_mov_ps(0b11111111, a); + assert_eq_m256(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_mov_ps() { + let src = _mm_set1_ps(1.); + let a = _mm_set1_ps(2.); + let r = _mm_mask_mov_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_mov_ps(src, 0b00001111, a); + assert_eq_m128(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_mov_ps() { + let a = _mm_set1_ps(2.); + let r = _mm_maskz_mov_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_mov_ps(0b00001111, a); + assert_eq_m128(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_add_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(1); + let r = _mm512_add_epi32(a, b); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + 1, 2, 0, i32::MIN, + i32::MIN + 1, 101, -99, -31, + 1, 2, 0, i32::MIN, + i32::MIN + 1, 101, -99, -31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_add_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(1); + let r = _mm512_mask_add_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_add_epi32(a, 0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + 1, 2, 0, i32::MIN, + i32::MIN + 1, 101, -99, -31, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_add_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(1); + let r = _mm512_maskz_add_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_add_epi32(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + 1, 2, 0, i32::MIN, + i32::MIN + 1, 101, -99, -31, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_add_epi32() { + let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); + let b = _mm256_set1_epi32(1); + let r = _mm256_mask_add_epi32(a, 0, a, b); + 
assert_eq_m256i(r, a); + let r = _mm256_mask_add_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(1, 2, 0, i32::MIN, i32::MIN + 1, 101, -99, -31); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_add_epi32() { + let a = _mm256_setr_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); + let b = _mm256_set1_epi32(1); + let r = _mm256_maskz_add_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_add_epi32(0b11111111, a, b); + let e = _mm256_setr_epi32(1, 2, 0, i32::MIN, i32::MIN + 1, 101, -99, -31); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_add_epi32() { + let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN); + let b = _mm_set1_epi32(1); + let r = _mm_mask_add_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_add_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(2, 0, i32::MIN, i32::MIN + 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_add_epi32() { + let a = _mm_setr_epi32(1, -1, i32::MAX, i32::MIN); + let b = _mm_set1_epi32(1); + let r = _mm_maskz_add_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_add_epi32(0b00001111, a, b); + let e = _mm_setr_epi32(2, 0, i32::MIN, i32::MIN + 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_add_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_add_ps(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 1., 2., 0., f32::MAX, + f32::MIN + 1., 101., -99., -31., + 1., 2., 0., f32::MAX, + f32::MIN + 1., 101., -99., -31., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_add_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_mask_add_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_add_ps(a, 0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 1., 2., 0., f32::MAX, + f32::MIN + 1., 101., -99., -31., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_add_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_maskz_add_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_add_ps(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 1., 2., 0., f32::MAX, + f32::MIN + 1., 101., -99., -31., + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_add_ps() { + let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); + let b = _mm256_set1_ps(1.); + let r = _mm256_mask_add_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_add_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(1., 2., 0., f32::MAX, f32::MIN + 1., 101., -99., -31.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_add_ps() { + let a = 
_mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); + let b = _mm256_set1_ps(1.); + let r = _mm256_maskz_add_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_add_ps(0b11111111, a, b); + let e = _mm256_set_ps(1., 2., 0., f32::MAX, f32::MIN + 1., 101., -99., -31.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_add_ps() { + let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); + let b = _mm_set1_ps(1.); + let r = _mm_mask_add_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_add_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(2., 0., f32::MAX, f32::MIN + 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_add_ps() { + let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); + let b = _mm_set1_ps(1.); + let r = _mm_maskz_add_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_add_ps(0b00001111, a, b); + let e = _mm_set_ps(2., 0., f32::MAX, f32::MIN + 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sub_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(1); + let r = _mm512_sub_epi32(a, b); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + -1, 0, -2, i32::MAX - 1, + i32::MAX, 99, -101, -33, + -1, 0, -2, i32::MAX - 1, + i32::MAX, 99, -101, -33, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sub_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(1); + let r = _mm512_mask_sub_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_sub_epi32(a, 0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + -1, 0, -2, i32::MAX - 1, + i32::MAX, 99, -101, -33, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sub_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(1); + let r = _mm512_maskz_sub_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sub_epi32(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + -1, 0, -2, i32::MAX - 1, + i32::MAX, 99, -101, -33, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sub_epi32() { + let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); + let b = _mm256_set1_epi32(1); + let r = _mm256_mask_sub_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_sub_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(-1, 0, -2, i32::MAX - 1, i32::MAX, 99, -101, -33); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sub_epi32() { + let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); + let b = _mm256_set1_epi32(1); + let r = _mm256_maskz_sub_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sub_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(-1, 0, -2, i32::MAX - 1, i32::MAX, 99, -101, -33); + 
assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sub_epi32() { + let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN); + let b = _mm_set1_epi32(1); + let r = _mm_mask_sub_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_sub_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(0, -2, i32::MAX - 1, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sub_epi32() { + let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN); + let b = _mm_set1_epi32(1); + let r = _mm_maskz_sub_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sub_epi32(0b00001111, a, b); + let e = _mm_set_epi32(0, -2, i32::MAX - 1, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sub_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_sub_ps(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -1., 0., -2., f32::MAX - 1., + f32::MIN, 99., -101., -33., + -1., 0., -2., f32::MAX - 1., + f32::MIN, 99., -101., -33., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sub_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_mask_sub_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_sub_ps(a, 0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -1., 0., -2., f32::MAX - 1., + f32::MIN, 99., -101., -33., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sub_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_maskz_sub_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_sub_ps(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -1., 0., -2., f32::MAX - 1., + f32::MIN, 99., -101., -33., + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sub_ps() { + let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); + let b = _mm256_set1_ps(1.); + let r = _mm256_mask_sub_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_sub_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(-1., 0., -2., f32::MAX - 1., f32::MIN, 99., -101., -33.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sub_ps() { + let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); + let b = _mm256_set1_ps(1.); + let r = _mm256_maskz_sub_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_sub_ps(0b11111111, a, b); + let e = _mm256_set_ps(-1., 0., -2., f32::MAX - 1., f32::MIN, 99., -101., -33.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sub_ps() { + let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); + let b = _mm_set1_ps(1.); + let r = _mm_mask_sub_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_sub_ps(a, 
0b00001111, a, b); + let e = _mm_set_ps(0., -2., f32::MAX - 1., f32::MIN); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sub_ps() { + let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); + let b = _mm_set1_ps(1.); + let r = _mm_maskz_sub_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_sub_ps(0b00001111, a, b); + let e = _mm_set_ps(0., -2., f32::MAX - 1., f32::MIN); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mullo_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(2); + let r = _mm512_mullo_epi32(a, b); + let e = _mm512_setr_epi32( + 0, 2, -2, -2, 0, 200, -200, -64, 0, 2, -2, -2, 0, 200, -200, -64, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mullo_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(2); + let r = _mm512_mask_mullo_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_mullo_epi32(a, 0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + 0, 2, -2, -2, + 0, 200, -200, -64, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mullo_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(2); + let r = _mm512_maskz_mullo_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mullo_epi32(0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(0, 2, -2, -2, 0, 200, -200, -64, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_mullo_epi32() { + let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); + let b = _mm256_set1_epi32(2); + let r = _mm256_mask_mullo_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_mullo_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(0, 2, -2, -2, 0, 200, -200, -64); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_mullo_epi32() { + let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); + let b = _mm256_set1_epi32(2); + let r = _mm256_maskz_mullo_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mullo_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(0, 2, -2, -2, 0, 200, -200, -64); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_mullo_epi32() { + let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN); + let b = _mm_set1_epi32(2); + let r = _mm_mask_mullo_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_mullo_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(2, -2, -2, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_mullo_epi32() { + let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN); + let b = _mm_set1_epi32(2); + let r = _mm_maskz_mullo_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mullo_epi32(0b00001111, a, b); + let e = _mm_set_epi32(2, -2, -2, 0); 
+ assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mul_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(2.); + let r = _mm512_mul_ps(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 2., -2., f32::INFINITY, + f32::NEG_INFINITY, 200., -200., -64., + 0., 2., -2., f32::INFINITY, + f32::NEG_INFINITY, 200., -200., + -64., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mul_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(2.); + let r = _mm512_mask_mul_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_mul_ps(a, 0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 2., -2., f32::INFINITY, + f32::NEG_INFINITY, 200., -200., -64., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mul_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(2.); + let r = _mm512_maskz_mul_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_mul_ps(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 2., -2., f32::INFINITY, + f32::NEG_INFINITY, 200., -200., -64., + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_mul_ps() { + let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); + let b = _mm256_set1_ps(2.); + let r = _mm256_mask_mul_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_mul_ps(a, 0b11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_ps( + 0., 2., -2., f32::INFINITY, + f32::NEG_INFINITY, 200., -200., -64., + ); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_mul_ps() { + let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); + let b = _mm256_set1_ps(2.); + let r = _mm256_maskz_mul_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_mul_ps(0b11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_ps( + 0., 2., -2., f32::INFINITY, + f32::NEG_INFINITY, 200., -200., -64., + ); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_mul_ps() { + let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); + let b = _mm_set1_ps(2.); + let r = _mm_mask_mul_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_mul_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(2., -2., f32::INFINITY, f32::NEG_INFINITY); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_mul_ps() { + let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); + let b = _mm_set1_ps(2.); + let r = _mm_maskz_mul_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_mul_ps(0b00001111, a, b); + let e = _mm_set_ps(2., -2., f32::INFINITY, f32::NEG_INFINITY); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_div_ps() { + let a = _mm512_setr_ps( + 0., 1., -1., -2., 100., 100., -100., -32., 0., 1., 
-1., 1000., -131., 100., -100., -32., + ); + let b = _mm512_setr_ps( + 2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2., + ); + let r = _mm512_div_ps(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0.5, -0.5, -1., + 50., f32::INFINITY, -50., -16., + 0., 0.5, -0.5, 500., + f32::NEG_INFINITY, 50., -50., -16., + ); + assert_eq_m512(r, e); // 0/0 = NAN + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_div_ps() { + let a = _mm512_setr_ps( + 0., 1., -1., -2., 100., 100., -100., -32., 0., 1., -1., 1000., -131., 100., -100., -32., + ); + let b = _mm512_setr_ps( + 2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2., + ); + let r = _mm512_mask_div_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_div_ps(a, 0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0.5, -0.5, -1., + 50., f32::INFINITY, -50., -16., + 0., 1., -1., 1000., + -131., 100., -100., -32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_div_ps() { + let a = _mm512_setr_ps( + 0., 1., -1., -2., 100., 100., -100., -32., 0., 1., -1., 1000., -131., 100., -100., -32., + ); + let b = _mm512_setr_ps( + 2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2., + ); + let r = _mm512_maskz_div_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_div_ps(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0.5, -0.5, -1., + 50., f32::INFINITY, -50., -16., + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_div_ps() { + let a = _mm256_set_ps(0., 1., -1., -2., 100., 100., -100., -32.); + let b = _mm256_set_ps(2., 2., 2., 2., 2., 0., 2., 2.); + let r = _mm256_mask_div_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_div_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(0., 0.5, -0.5, -1., 50., f32::INFINITY, -50., -16.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_div_ps() { + let a = _mm256_set_ps(0., 1., -1., -2., 100., 100., -100., -32.); + let b = _mm256_set_ps(2., 2., 2., 2., 2., 0., 2., 2.); + let r = _mm256_maskz_div_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_div_ps(0b11111111, a, b); + let e = _mm256_set_ps(0., 0.5, -0.5, -1., 50., f32::INFINITY, -50., -16.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_div_ps() { + let a = _mm_set_ps(100., 100., -100., -32.); + let b = _mm_set_ps(2., 0., 2., 2.); + let r = _mm_mask_div_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_div_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(50., f32::INFINITY, -50., -16.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_div_ps() { + let a = _mm_set_ps(100., 100., -100., -32.); + let b = _mm_set_ps(2., 0., 2., 2.); + let r = _mm_maskz_div_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_div_ps(0b00001111, a, b); + let e = _mm_set_ps(50., f32::INFINITY, -50., -16.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_max_epi32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_max_epi32(a, b); + let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 
14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_epi32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_max_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_max_epi32(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_epi32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_max_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_max_epi32(0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_max_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_max_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_max_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_max_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_mask_max_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_max_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(3, 2, 2, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_maskz_max_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_max_epi32(0b00001111, a, b); + let e = _mm_set_epi32(3, 2, 2, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_max_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_max_ps(a, b); + let e = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_mask_max_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_max_ps(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + 
#[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_maskz_max_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_max_ps(0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm256_mask_max_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_max_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(7., 6., 5., 4., 4., 5., 6., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm256_maskz_max_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_max_ps(0b11111111, a, b); + let e = _mm256_set_ps(7., 6., 5., 4., 4., 5., 6., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(3., 2., 1., 0.); + let r = _mm_mask_max_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_max_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(3., 2., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(3., 2., 1., 0.); + let r = _mm_maskz_max_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_mask_max_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(3., 2., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_max_epu32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_max_epu32(a, b); + let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_epu32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_max_epu32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_max_epu32(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_epu32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_max_epu32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_max_epu32(0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_epu32() { + let a = 
_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_max_epu32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_max_epu32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_epu32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_max_epu32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_max_epu32(0b11111111, a, b); + let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_epu32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_mask_max_epu32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_max_epu32(a, 0b00001111, a, b); + let e = _mm_set_epi32(3, 2, 2, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_epu32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_maskz_max_epu32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_max_epu32(0b00001111, a, b); + let e = _mm_set_epi32(3, 2, 2, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_min_epi32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_min_epi32(a, b); + let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_epi32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_min_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_min_epi32(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_epi32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_min_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_min_epi32(0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_min_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_min_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_min_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_min_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(0, 1, 2, 
3, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_mask_min_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(0, 1, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_maskz_min_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epi32(0b00001111, a, b); + let e = _mm_set_epi32(0, 1, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_min_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_min_ps(a, b); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 7., 6., 5., 4., 3., 2., 1., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_mask_min_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_min_ps(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_maskz_min_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_min_ps(0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm256_mask_min_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_min_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(0., 1., 2., 3., 3., 2., 1., 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm256_maskz_min_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_min_ps(0b11111111, a, b); + let e = _mm256_set_ps(0., 1., 2., 3., 3., 2., 1., 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(3., 2., 1., 0.); + let r = _mm_mask_min_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_min_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(0., 1., 1., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = 
_mm_set_ps(3., 2., 1., 0.); + let r = _mm_maskz_min_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_min_ps(0b00001111, a, b); + let e = _mm_set_ps(0., 1., 1., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_min_epu32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_min_epu32(a, b); + let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_epu32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_min_epu32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_min_epu32(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_epu32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_min_epu32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_min_epu32(0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_epu32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_min_epu32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_min_epu32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_epu32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_min_epu32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_min_epu32(0b11111111, a, b); + let e = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epu32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_mask_min_epu32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epu32(a, 0b00001111, a, b); + let e = _mm_set_epi32(0, 1, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epu32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_maskz_min_epu32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epu32(0b00001111, a, b); + let e = _mm_set_epi32(0, 1, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sqrt_ps() { + let a = _mm512_setr_ps( + 0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225., + ); + let r = _mm512_sqrt_ps(a); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sqrt_ps() { + let a = _mm512_setr_ps( + 
0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225., + ); + let r = _mm512_mask_sqrt_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_sqrt_ps(a, 0b00000000_11111111, a); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 64., 81., 100., 121., 144., 169., 196., 225., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sqrt_ps() { + let a = _mm512_setr_ps( + 0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225., + ); + let r = _mm512_maskz_sqrt_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_sqrt_ps(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sqrt_ps() { + let a = _mm256_set_ps(0., 1., 4., 9., 16., 25., 36., 49.); + let r = _mm256_mask_sqrt_ps(a, 0, a); + assert_eq_m256(r, a); + let r = _mm256_mask_sqrt_ps(a, 0b11111111, a); + let e = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sqrt_ps() { + let a = _mm256_set_ps(0., 1., 4., 9., 16., 25., 36., 49.); + let r = _mm256_maskz_sqrt_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_sqrt_ps(0b11111111, a); + let e = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sqrt_ps() { + let a = _mm_set_ps(0., 1., 4., 9.); + let r = _mm_mask_sqrt_ps(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_sqrt_ps(a, 0b00001111, a); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sqrt_ps() { + let a = _mm_set_ps(0., 1., 4., 9.); + let r = _mm_maskz_sqrt_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_sqrt_ps(0b00001111, a); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_fmadd_ps(a, b, c); + let e = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fmadd_ps(a, 0, b, c); + assert_eq_m512(r, a); + let r = _mm512_mask_fmadd_ps(a, 0b00000000_11111111, b, c); + let e = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fmadd_ps(0, a, b, c); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fmadd_ps(0b00000000_11111111, a, b, c); + let e = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_mm512_mask3_fmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(2.); + let r = _mm512_mask3_fmadd_ps(a, b, c, 0); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmadd_ps(a, b, c, 0b00000000_11111111); + let e = _mm512_setr_ps( + 2., 3., 4., 5., 6., 7., 8., 9., 2., 2., 2., 2., 2., 2., 2., 2., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fmadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask_fmadd_ps(a, 0, b, c); + assert_eq_m256(r, a); + let r = _mm256_mask_fmadd_ps(a, 0b11111111, b, c); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fmadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_maskz_fmadd_ps(0, a, b, c); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_fmadd_ps(0b11111111, a, b, c); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fmadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask3_fmadd_ps(a, b, c, 0); + assert_eq_m256(r, c); + let r = _mm256_mask3_fmadd_ps(a, b, c, 0b11111111); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fmadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask_fmadd_ps(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fmadd_ps(a, 0b00001111, b, c); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fmadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_maskz_fmadd_ps(0, a, b, c); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_fmadd_ps(0b00001111, a, b, c); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fmadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask3_fmadd_ps(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fmadd_ps(a, b, c, 0b00001111); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmsub_ps() { + let a = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ); + let r = _mm512_fmsub_ps(a, b, c); + let e = _mm512_setr_ps( + -1., 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 
14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fmsub_ps(a, 0, b, c); + assert_eq_m512(r, a); + let r = _mm512_mask_fmsub_ps(a, 0b00000000_11111111, b, c); + let e = _mm512_setr_ps( + -1., 0., 1., 2., 3., 4., 5., 6., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fmsub_ps(0, a, b, c); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fmsub_ps(0b00000000_11111111, a, b, c); + let e = _mm512_setr_ps( + -1., 0., 1., 2., 3., 4., 5., 6., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., + ); + let r = _mm512_mask3_fmsub_ps(a, b, c, 0); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmsub_ps(a, b, c, 0b00000000_11111111); + let e = _mm512_setr_ps( + -1., 0., 1., 2., 3., 4., 5., 6., 2., 2., 2., 2., 2., 2., 2., 2., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fmsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask_fmsub_ps(a, 0, b, c); + assert_eq_m256(r, a); + let r = _mm256_mask_fmsub_ps(a, 0b11111111, b, c); + let e = _mm256_set_ps(-1., 0., 1., 2., 3., 4., 5., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fmsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_maskz_fmsub_ps(0, a, b, c); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_fmsub_ps(0b11111111, a, b, c); + let e = _mm256_set_ps(-1., 0., 1., 2., 3., 4., 5., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fmsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask3_fmsub_ps(a, b, c, 0); + assert_eq_m256(r, c); + let r = _mm256_mask3_fmsub_ps(a, b, c, 0b11111111); + let e = _mm256_set_ps(-1., 0., 1., 2., 3., 4., 5., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fmsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask_fmsub_ps(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fmsub_ps(a, 0b00001111, b, c); + let e = _mm_set_ps(-1., 0., 1., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fmsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_maskz_fmsub_ps(0, a, b, c); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_fmsub_ps(0b00001111, a, b, c); + let e = _mm_set_ps(-1., 0., 1., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fmsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); 
+ let r = _mm_mask3_fmsub_ps(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fmsub_ps(a, b, c, 0b00001111); + let e = _mm_set_ps(-1., 0., 1., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmaddsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_fmaddsub_ps(a, b, c); + let e = _mm512_setr_ps( + -1., 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmaddsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fmaddsub_ps(a, 0, b, c); + assert_eq_m512(r, a); + let r = _mm512_mask_fmaddsub_ps(a, 0b00000000_11111111, b, c); + let e = _mm512_setr_ps( + -1., 2., 1., 4., 3., 6., 5., 8., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmaddsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fmaddsub_ps(0, a, b, c); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fmaddsub_ps(0b00000000_11111111, a, b, c); + let e = _mm512_setr_ps( + -1., 2., 1., 4., 3., 6., 5., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmaddsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., + ); + let r = _mm512_mask3_fmaddsub_ps(a, b, c, 0); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmaddsub_ps(a, b, c, 0b00000000_11111111); + let e = _mm512_setr_ps( + -1., 2., 1., 4., 3., 6., 5., 8., 2., 2., 2., 2., 2., 2., 2., 2., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fmaddsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask_fmaddsub_ps(a, 0, b, c); + assert_eq_m256(r, a); + let r = _mm256_mask_fmaddsub_ps(a, 0b11111111, b, c); + let e = _mm256_set_ps(1., 0., 3., 2., 5., 4., 7., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fmaddsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_maskz_fmaddsub_ps(0, a, b, c); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_fmaddsub_ps(0b11111111, a, b, c); + let e = _mm256_set_ps(1., 0., 3., 2., 5., 4., 7., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fmaddsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask3_fmaddsub_ps(a, b, c, 0); + assert_eq_m256(r, c); + let r = _mm256_mask3_fmaddsub_ps(a, b, c, 0b11111111); + let e = _mm256_set_ps(1., 0., 3., 2., 5., 4., 7., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm_mask_fmaddsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask_fmaddsub_ps(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fmaddsub_ps(a, 0b00001111, b, c); + let e = _mm_set_ps(1., 0., 3., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fmaddsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_maskz_fmaddsub_ps(0, a, b, c); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_fmaddsub_ps(0b00001111, a, b, c); + let e = _mm_set_ps(1., 0., 3., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fmaddsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask3_fmaddsub_ps(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fmaddsub_ps(a, b, c, 0b00001111); + let e = _mm_set_ps(1., 0., 3., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmsubadd_ps() { + let a = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ); + let r = _mm512_fmsubadd_ps(a, b, c); + let e = _mm512_setr_ps( + 1., 0., 3., 2., 5., 4., 7., 6., 9., 8., 11., 10., 13., 12., 15., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmsubadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fmsubadd_ps(a, 0, b, c); + assert_eq_m512(r, a); + let r = _mm512_mask_fmsubadd_ps(a, 0b00000000_11111111, b, c); + let e = _mm512_setr_ps( + 1., 0., 3., 2., 5., 4., 7., 6., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmsubadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fmsubadd_ps(0, a, b, c); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fmsubadd_ps(0b00000000_11111111, a, b, c); + let e = _mm512_setr_ps( + 1., 0., 3., 2., 5., 4., 7., 6., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmsubadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., + ); + let r = _mm512_mask3_fmsubadd_ps(a, b, c, 0); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmsubadd_ps(a, b, c, 0b00000000_11111111); + let e = _mm512_setr_ps( + 1., 0., 3., 2., 5., 4., 7., 6., 2., 2., 2., 2., 2., 2., 2., 2., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fmsubadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask_fmsubadd_ps(a, 0, b, c); + assert_eq_m256(r, a); + let r = _mm256_mask_fmsubadd_ps(a, 0b11111111, b, c); + let e = 
_mm256_set_ps(-1., 2., 1., 4., 3., 6., 5., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fmsubadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_maskz_fmsubadd_ps(0, a, b, c); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_fmsubadd_ps(0b11111111, a, b, c); + let e = _mm256_set_ps(-1., 2., 1., 4., 3., 6., 5., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fmsubadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask3_fmsubadd_ps(a, b, c, 0); + assert_eq_m256(r, c); + let r = _mm256_mask3_fmsubadd_ps(a, b, c, 0b11111111); + let e = _mm256_set_ps(-1., 2., 1., 4., 3., 6., 5., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fmsubadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask_fmsubadd_ps(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fmsubadd_ps(a, 0b00001111, b, c); + let e = _mm_set_ps(-1., 2., 1., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fmsubadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_maskz_fmsubadd_ps(0, a, b, c); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_fmsubadd_ps(0b00001111, a, b, c); + let e = _mm_set_ps(-1., 2., 1., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fmsubadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask3_fmsubadd_ps(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fmsubadd_ps(a, b, c, 0b00001111); + let e = _mm_set_ps(-1., 2., 1., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fnmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_fnmadd_ps(a, b, c); + let e = _mm512_setr_ps( + 1., 0., -1., -2., -3., -4., -5., -6., -7., -8., -9., -10., -11., -12., -13., -14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fnmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fnmadd_ps(a, 0, b, c); + assert_eq_m512(r, a); + let r = _mm512_mask_fnmadd_ps(a, 0b00000000_11111111, b, c); + let e = _mm512_setr_ps( + 1., 0., -1., -2., -3., -4., -5., -6., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fnmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fnmadd_ps(0, a, b, c); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fnmadd_ps(0b00000000_11111111, a, b, c); + let e = _mm512_setr_ps( + 1., 0., -1., -2., -3., -4., -5., -6., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + 
unsafe fn test_mm512_mask3_fnmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., + ); + let r = _mm512_mask3_fnmadd_ps(a, b, c, 0); + assert_eq_m512(r, c); + let r = _mm512_mask3_fnmadd_ps(a, b, c, 0b00000000_11111111); + let e = _mm512_setr_ps( + 1., 0., -1., -2., -3., -4., -5., -6., 2., 2., 2., 2., 2., 2., 2., 2., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fnmadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask_fnmadd_ps(a, 0, b, c); + assert_eq_m256(r, a); + let r = _mm256_mask_fnmadd_ps(a, 0b11111111, b, c); + let e = _mm256_set_ps(1., 0., -1., -2., -3., -4., -5., -6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fnmadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_maskz_fnmadd_ps(0, a, b, c); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_fnmadd_ps(0b11111111, a, b, c); + let e = _mm256_set_ps(1., 0., -1., -2., -3., -4., -5., -6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fnmadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask3_fnmadd_ps(a, b, c, 0); + assert_eq_m256(r, c); + let r = _mm256_mask3_fnmadd_ps(a, b, c, 0b11111111); + let e = _mm256_set_ps(1., 0., -1., -2., -3., -4., -5., -6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fnmadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask_fnmadd_ps(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fnmadd_ps(a, 0b00001111, b, c); + let e = _mm_set_ps(1., 0., -1., -2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fnmadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_maskz_fnmadd_ps(0, a, b, c); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_fnmadd_ps(0b00001111, a, b, c); + let e = _mm_set_ps(1., 0., -1., -2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fnmadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask3_fnmadd_ps(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fnmadd_ps(a, b, c, 0b00001111); + let e = _mm_set_ps(1., 0., -1., -2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fnmsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_fnmsub_ps(a, b, c); + let e = _mm512_setr_ps( + -1., -2., -3., -4., -5., -6., -7., -8., -9., -10., -11., -12., -13., -14., -15., -16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fnmsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + 
); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fnmsub_ps(a, 0, b, c); + assert_eq_m512(r, a); + let r = _mm512_mask_fnmsub_ps(a, 0b00000000_11111111, b, c); + let e = _mm512_setr_ps( + -1., -2., -3., -4., -5., -6., -7., -8., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fnmsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fnmsub_ps(0, a, b, c); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fnmsub_ps(0b00000000_11111111, a, b, c); + let e = _mm512_setr_ps( + -1., -2., -3., -4., -5., -6., -7., -8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fnmsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., + ); + let r = _mm512_mask3_fnmsub_ps(a, b, c, 0); + assert_eq_m512(r, c); + let r = _mm512_mask3_fnmsub_ps(a, b, c, 0b00000000_11111111); + let e = _mm512_setr_ps( + -1., -2., -3., -4., -5., -6., -7., -8., 2., 2., 2., 2., 2., 2., 2., 2., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fnmsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask_fnmsub_ps(a, 0, b, c); + assert_eq_m256(r, a); + let r = _mm256_mask_fnmsub_ps(a, 0b11111111, b, c); + let e = _mm256_set_ps(-1., -2., -3., -4., -5., -6., -7., -8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fnmsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_maskz_fnmsub_ps(0, a, b, c); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_fnmsub_ps(0b11111111, a, b, c); + let e = _mm256_set_ps(-1., -2., -3., -4., -5., -6., -7., -8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fnmsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask3_fnmsub_ps(a, b, c, 0); + assert_eq_m256(r, c); + let r = _mm256_mask3_fnmsub_ps(a, b, c, 0b11111111); + let e = _mm256_set_ps(-1., -2., -3., -4., -5., -6., -7., -8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fnmsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask_fnmsub_ps(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fnmsub_ps(a, 0b00001111, b, c); + let e = _mm_set_ps(-1., -2., -3., -4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fnmsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_maskz_fnmsub_ps(0, a, b, c); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_fnmsub_ps(0b00001111, a, b, c); + let e = _mm_set_ps(-1., -2., -3., -4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fnmsub_ps() { + let a = _mm_set1_ps(1.); + let 
b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask3_fnmsub_ps(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fnmsub_ps(a, b, c, 0b00001111); + let e = _mm_set_ps(-1., -2., -3., -4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_rcp14_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_rcp14_ps(a); + let e = _mm512_set1_ps(0.33333206); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_rcp14_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_mask_rcp14_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_rcp14_ps(a, 0b11111111_00000000, a); + let e = _mm512_setr_ps( + 3., 3., 3., 3., 3., 3., 3., 3., 0.33333206, 0.33333206, 0.33333206, 0.33333206, + 0.33333206, 0.33333206, 0.33333206, 0.33333206, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_rcp14_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_maskz_rcp14_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_rcp14_ps(0b11111111_00000000, a); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 0.33333206, 0.33333206, 0.33333206, 0.33333206, + 0.33333206, 0.33333206, 0.33333206, 0.33333206, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rcp14_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_rcp14_ps(a); + let e = _mm256_set1_ps(0.33333206); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_rcp14_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_mask_rcp14_ps(a, 0, a); + assert_eq_m256(r, a); + let r = _mm256_mask_rcp14_ps(a, 0b11111111, a); + let e = _mm256_set1_ps(0.33333206); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_rcp14_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_maskz_rcp14_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_rcp14_ps(0b11111111, a); + let e = _mm256_set1_ps(0.33333206); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rcp14_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_rcp14_ps(a); + let e = _mm_set1_ps(0.33333206); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_rcp14_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_mask_rcp14_ps(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_rcp14_ps(a, 0b00001111, a); + let e = _mm_set1_ps(0.33333206); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_rcp14_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_maskz_rcp14_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_rcp14_ps(0b00001111, a); + let e = _mm_set1_ps(0.33333206); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_rsqrt14_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_rsqrt14_ps(a); + let e = _mm512_set1_ps(0.5773392); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_rsqrt14_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_mask_rsqrt14_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_rsqrt14_ps(a, 0b11111111_00000000, a); + let e = _mm512_setr_ps( + 3., 3., 3., 3., 3., 3., 3., 3., 0.5773392, 0.5773392, 0.5773392, 0.5773392, 0.5773392, + 0.5773392, 0.5773392, 0.5773392, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_mm512_maskz_rsqrt14_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_maskz_rsqrt14_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_rsqrt14_ps(0b11111111_00000000, a); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 0.5773392, 0.5773392, 0.5773392, 0.5773392, 0.5773392, + 0.5773392, 0.5773392, 0.5773392, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rsqrt14_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_rsqrt14_ps(a); + let e = _mm256_set1_ps(0.5773392); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_rsqrt14_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_mask_rsqrt14_ps(a, 0, a); + assert_eq_m256(r, a); + let r = _mm256_mask_rsqrt14_ps(a, 0b11111111, a); + let e = _mm256_set1_ps(0.5773392); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_rsqrt14_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_maskz_rsqrt14_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_rsqrt14_ps(0b11111111, a); + let e = _mm256_set1_ps(0.5773392); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rsqrt14_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_rsqrt14_ps(a); + let e = _mm_set1_ps(0.5773392); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_rsqrt14_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_mask_rsqrt14_ps(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_rsqrt14_ps(a, 0b00001111, a); + let e = _mm_set1_ps(0.5773392); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_rsqrt14_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_maskz_rsqrt14_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_rsqrt14_ps(0b00001111, a); + let e = _mm_set1_ps(0.5773392); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_getexp_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_getexp_ps(a); + let e = _mm512_set1_ps(1.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_getexp_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_mask_getexp_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_getexp_ps(a, 0b11111111_00000000, a); + let e = _mm512_setr_ps( + 3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_getexp_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_maskz_getexp_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_getexp_ps(0b11111111_00000000, a); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_getexp_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_getexp_ps(a); + let e = _mm256_set1_ps(1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_getexp_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_mask_getexp_ps(a, 0, a); + assert_eq_m256(r, a); + let r = _mm256_mask_getexp_ps(a, 0b11111111, a); + let e = _mm256_set1_ps(1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_getexp_ps() { + let a = 
_mm256_set1_ps(3.); + let r = _mm256_maskz_getexp_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_getexp_ps(0b11111111, a); + let e = _mm256_set1_ps(1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_getexp_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_getexp_ps(a); + let e = _mm_set1_ps(1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_getexp_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_mask_getexp_ps(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_getexp_ps(a, 0b00001111, a); + let e = _mm_set1_ps(1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_getexp_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_maskz_getexp_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_getexp_ps(0b00001111, a); + let e = _mm_set1_ps(1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_roundscale_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_roundscale_ps::<0b00_00_00_00>(a); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_roundscale_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_mask_roundscale_ps::<0b00_00_00_00>(a, 0, a); + let e = _mm512_set1_ps(1.1); + assert_eq_m512(r, e); + let r = _mm512_mask_roundscale_ps::<0b00_00_00_00>(a, 0b11111111_11111111, a); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_roundscale_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_maskz_roundscale_ps::<0b00_00_00_00>(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_roundscale_ps::<0b00_00_00_00>(0b11111111_11111111, a); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_roundscale_ps() { + let a = _mm256_set1_ps(1.1); + let r = _mm256_roundscale_ps::<0b00_00_00_00>(a); + let e = _mm256_set1_ps(1.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_roundscale_ps() { + let a = _mm256_set1_ps(1.1); + let r = _mm256_mask_roundscale_ps::<0b00_00_00_00>(a, 0, a); + let e = _mm256_set1_ps(1.1); + assert_eq_m256(r, e); + let r = _mm256_mask_roundscale_ps::<0b00_00_00_00>(a, 0b11111111, a); + let e = _mm256_set1_ps(1.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_roundscale_ps() { + let a = _mm256_set1_ps(1.1); + let r = _mm256_maskz_roundscale_ps::<0b00_00_00_00>(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_roundscale_ps::<0b00_00_00_00>(0b11111111, a); + let e = _mm256_set1_ps(1.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_roundscale_ps() { + let a = _mm_set1_ps(1.1); + let r = _mm_roundscale_ps::<0b00_00_00_00>(a); + let e = _mm_set1_ps(1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_roundscale_ps() { + let a = _mm_set1_ps(1.1); + let r = _mm_mask_roundscale_ps::<0b00_00_00_00>(a, 0, a); + let e = _mm_set1_ps(1.1); + assert_eq_m128(r, e); + let r = _mm_mask_roundscale_ps::<0b00_00_00_00>(a, 0b00001111, a); + let e = _mm_set1_ps(1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_roundscale_ps() { + let a = 
_mm_set1_ps(1.1); + let r = _mm_maskz_roundscale_ps::<0b00_00_00_00>(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_roundscale_ps::<0b00_00_00_00>(0b00001111, a); + let e = _mm_set1_ps(1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_scalef_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_scalef_ps(a, b); + let e = _mm512_set1_ps(8.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_scalef_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_mask_scalef_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_scalef_ps(a, 0b11111111_00000000, a, b); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_scalef_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_maskz_scalef_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_scalef_ps(0b11111111_00000000, a, b); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_scalef_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set1_ps(3.); + let r = _mm256_scalef_ps(a, b); + let e = _mm256_set1_ps(8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_scalef_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set1_ps(3.); + let r = _mm256_mask_scalef_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_scalef_ps(a, 0b11111111, a, b); + let e = _mm256_set1_ps(8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_scalef_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set1_ps(3.); + let r = _mm256_maskz_scalef_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_scalef_ps(0b11111111, a, b); + let e = _mm256_set1_ps(8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_scalef_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_scalef_ps(a, b); + let e = _mm_set1_ps(8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_scalef_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_scalef_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_scalef_ps(a, 0b00001111, a, b); + let e = _mm_set1_ps(8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_scalef_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_scalef_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_scalef_ps(0b00001111, a, b); + let e = _mm_set1_ps(8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fixupimm_ps() { + let a = _mm512_set1_ps(f32::NAN); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + //let r = _mm512_fixupimm_ps(a, b, c, 5); + let r = _mm512_fixupimm_ps::<5>(a, b, c); + let e = _mm512_set1_ps(0.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fixupimm_ps() { + #[rustfmt::skip] + let a = _mm512_set_ps( + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + f32::NAN, f32::NAN, 
f32::NAN, f32::NAN, + 1., 1., 1., 1., + 1., 1., 1., 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_mask_fixupimm_ps::<5>(a, 0b11111111_00000000, b, c); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fixupimm_ps() { + #[rustfmt::skip] + let a = _mm512_set_ps( + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + 1., 1., 1., 1., + 1., 1., 1., 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_maskz_fixupimm_ps::<5>(0b11111111_00000000, a, b, c); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_fixupimm_ps() { + let a = _mm256_set1_ps(f32::NAN); + let b = _mm256_set1_ps(f32::MAX); + let c = _mm256_set1_epi32(i32::MAX); + let r = _mm256_fixupimm_ps::<5>(a, b, c); + let e = _mm256_set1_ps(0.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fixupimm_ps() { + let a = _mm256_set1_ps(f32::NAN); + let b = _mm256_set1_ps(f32::MAX); + let c = _mm256_set1_epi32(i32::MAX); + let r = _mm256_mask_fixupimm_ps::<5>(a, 0b11111111, b, c); + let e = _mm256_set1_ps(0.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fixupimm_ps() { + let a = _mm256_set1_ps(f32::NAN); + let b = _mm256_set1_ps(f32::MAX); + let c = _mm256_set1_epi32(i32::MAX); + let r = _mm256_maskz_fixupimm_ps::<5>(0b11111111, a, b, c); + let e = _mm256_set1_ps(0.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_fixupimm_ps() { + let a = _mm_set1_ps(f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_fixupimm_ps::<5>(a, b, c); + let e = _mm_set1_ps(0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fixupimm_ps() { + let a = _mm_set1_ps(f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_mask_fixupimm_ps::<5>(a, 0b00001111, b, c); + let e = _mm_set1_ps(0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fixupimm_ps() { + let a = _mm_set1_ps(f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_maskz_fixupimm_ps::<5>(0b00001111, a, b, c); + let e = _mm_set1_ps(0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_ternarylogic_epi32() { + let a = _mm512_set1_epi32(1 << 2); + let b = _mm512_set1_epi32(1 << 1); + let c = _mm512_set1_epi32(1 << 0); + let r = _mm512_ternarylogic_epi32::<8>(a, b, c); + let e = _mm512_set1_epi32(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_ternarylogic_epi32() { + let src = _mm512_set1_epi32(1 << 2); + let a = _mm512_set1_epi32(1 << 1); + let b = _mm512_set1_epi32(1 << 0); + let r = _mm512_mask_ternarylogic_epi32::<8>(src, 0, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_ternarylogic_epi32::<8>(src, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_ternarylogic_epi32() { + let a = _mm512_set1_epi32(1 << 2); + let b = 
_mm512_set1_epi32(1 << 1); + let c = _mm512_set1_epi32(1 << 0); + let r = _mm512_maskz_ternarylogic_epi32::<9>(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_ternarylogic_epi32::<8>(0b11111111_11111111, a, b, c); + let e = _mm512_set1_epi32(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_ternarylogic_epi32() { + let a = _mm256_set1_epi32(1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let c = _mm256_set1_epi32(1 << 0); + let r = _mm256_ternarylogic_epi32::<8>(a, b, c); + let e = _mm256_set1_epi32(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_ternarylogic_epi32() { + let src = _mm256_set1_epi32(1 << 2); + let a = _mm256_set1_epi32(1 << 1); + let b = _mm256_set1_epi32(1 << 0); + let r = _mm256_mask_ternarylogic_epi32::<8>(src, 0, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_ternarylogic_epi32::<8>(src, 0b11111111, a, b); + let e = _mm256_set1_epi32(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_ternarylogic_epi32() { + let a = _mm256_set1_epi32(1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let c = _mm256_set1_epi32(1 << 0); + let r = _mm256_maskz_ternarylogic_epi32::<9>(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_ternarylogic_epi32::<8>(0b11111111, a, b, c); + let e = _mm256_set1_epi32(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_ternarylogic_epi32() { + let a = _mm_set1_epi32(1 << 2); + let b = _mm_set1_epi32(1 << 1); + let c = _mm_set1_epi32(1 << 0); + let r = _mm_ternarylogic_epi32::<8>(a, b, c); + let e = _mm_set1_epi32(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_ternarylogic_epi32() { + let src = _mm_set1_epi32(1 << 2); + let a = _mm_set1_epi32(1 << 1); + let b = _mm_set1_epi32(1 << 0); + let r = _mm_mask_ternarylogic_epi32::<8>(src, 0, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_ternarylogic_epi32::<8>(src, 0b00001111, a, b); + let e = _mm_set1_epi32(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_ternarylogic_epi32() { + let a = _mm_set1_epi32(1 << 2); + let b = _mm_set1_epi32(1 << 1); + let c = _mm_set1_epi32(1 << 0); + let r = _mm_maskz_ternarylogic_epi32::<9>(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_ternarylogic_epi32::<8>(0b00001111, a, b, c); + let e = _mm_set1_epi32(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_getmant_ps() { + let a = _mm512_set1_ps(10.); + let r = _mm512_getmant_ps::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); + let e = _mm512_set1_ps(1.25); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_getmant_ps() { + let a = _mm512_set1_ps(10.); + let r = _mm512_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>( + a, + 0b11111111_00000000, + a, + ); + let e = _mm512_setr_ps( + 10., 10., 10., 10., 10., 10., 10., 10., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_getmant_ps() { + let a = _mm512_set1_ps(10.); + let r = _mm512_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a); + 
assert_eq_m512(r, _mm512_setzero_ps()); + let r = + _mm512_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111_00000000, a); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_getmant_ps() { + let a = _mm256_set1_ps(10.); + let r = _mm256_getmant_ps::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); + let e = _mm256_set1_ps(1.25); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_getmant_ps() { + let a = _mm256_set1_ps(10.); + let r = _mm256_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a); + assert_eq_m256(r, a); + let r = _mm256_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11111111, a); + let e = _mm256_set1_ps(1.25); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_getmant_ps() { + let a = _mm256_set1_ps(10.); + let r = _mm256_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111, a); + let e = _mm256_set1_ps(1.25); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_getmant_ps() { + let a = _mm_set1_ps(10.); + let r = _mm_getmant_ps::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); + let e = _mm_set1_ps(1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_getmant_ps() { + let a = _mm_set1_ps(10.); + let r = _mm_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b00001111, a); + let e = _mm_set1_ps(1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_getmant_ps() { + let a = _mm_set1_ps(10.); + let r = _mm_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b00001111, a); + let e = _mm_set1_ps(1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_add_round_ps() { + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, + ); + let b = _mm512_set1_ps(-1.); + let r = _mm512_add_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -1., 0.5, 1., 2.5, + 3., 4.5, 5., 6.5, + 7., 8.5, 9., 10.5, + 11., 12.5, 13., -0.99999994, + ); + assert_eq_m512(r, e); + let r = _mm512_add_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_setr_ps( + -1., 0.5, 1., 2.5, 3., 4.5, 5., 6.5, 7., 8.5, 9., 10.5, 11., 12.5, 13., -0.9999999, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_add_round_ps() { + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, + ); + let b = _mm512_set1_ps(-1.); + let r = _mm512_mask_add_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_add_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b11111111_00000000, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 1.5, 2., 3.5, + 4., 5.5, 6., 
7.5, + 7., 8.5, 9., 10.5, + 11., 12.5, 13., -0.99999994, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_add_round_ps() { + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, + ); + let b = _mm512_set1_ps(-1.); + let r = _mm512_maskz_add_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_add_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111_00000000, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0., 0., 0., + 0., 0., 0., 0., + 7., 8.5, 9., 10.5, + 11., 12.5, 13., -0.99999994, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sub_round_ps() { + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -1., 0.5, 1., 2.5, + 3., 4.5, 5., 6.5, + 7., 8.5, 9., 10.5, + 11., 12.5, 13., -0.99999994, + ); + assert_eq_m512(r, e); + let r = _mm512_sub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_setr_ps( + -1., 0.5, 1., 2.5, 3., 4.5, 5., 6.5, 7., 8.5, 9., 10.5, 11., 12.5, 13., -0.9999999, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sub_round_ps() { + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_mask_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b11111111_00000000, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 1.5, 2., 3.5, + 4., 5.5, 6., 7.5, + 7., 8.5, 9., 10.5, + 11., 12.5, 13., -0.99999994, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sub_round_ps() { + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, + ); + let b = _mm512_set1_ps(1.); + let r = + _mm512_maskz_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111_00000000, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0., 0., 0., + 0., 0., 0., 0., + 7., 8.5, 9., 10.5, + 11., 12.5, 13., -0.99999994, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mul_round_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, + 4., 5.5, 6., 7.5, + 8., 9.5, 10., 11.5, + 12., 13.5, 14., 0.00000000000000000000007, + ); + let b = _mm512_set1_ps(0.1); + let r = _mm512_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0.15, 0.2, 0.35, + 0.4, 0.55, 0.6, 0.75, + 0.8, 0.95, 1.0, 1.15, + 1.2, 1.35, 1.4, 0.000000000000000000000007000001, + ); + assert_eq_m512(r, e); + let r = _mm512_mul_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0.14999999, 0.2, 0.35, + 0.4, 0.54999995, 0.59999996, 0.75, + 0.8, 0.95, 1.0, 
1.15, + 1.1999999, 1.3499999, 1.4, 0.000000000000000000000007, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mul_round_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, + 4., 5.5, 6., 7.5, + 8., 9.5, 10., 11.5, + 12., 13.5, 14., 0.00000000000000000000007, + ); + let b = _mm512_set1_ps(0.1); + let r = _mm512_mask_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b11111111_00000000, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 1.5, 2., 3.5, + 4., 5.5, 6., 7.5, + 0.8, 0.95, 1.0, 1.15, + 1.2, 1.35, 1.4, 0.000000000000000000000007000001, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mul_round_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, + 4., 5.5, 6., 7.5, + 8., 9.5, 10., 11.5, + 12., 13.5, 14., 0.00000000000000000000007, + ); + let b = _mm512_set1_ps(0.1); + let r = + _mm512_maskz_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111_00000000, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0., 0., 0., + 0., 0., 0., 0., + 0.8, 0.95, 1.0, 1.15, + 1.2, 1.35, 1.4, 0.000000000000000000000007000001, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_div_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ps(0.33333334); + assert_eq_m512(r, e); + let r = _mm512_div_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ps(0.3333333); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_div_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_mask_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b11111111_00000000, + a, + b, + ); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0.33333334, 0.33333334, 0.33333334, 0.33333334, + 0.33333334, 0.33333334, 0.33333334, 0.33333334, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_div_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = + _mm512_maskz_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111_00000000, + a, + b, + ); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 0.33333334, 0.33333334, 0.33333334, 0.33333334, + 0.33333334, 0.33333334, 0.33333334, 0.33333334, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sqrt_round_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set1_ps(1.7320508); + assert_eq_m512(r, e); + let r = _mm512_sqrt_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a); + let e = 
_mm512_set1_ps(1.7320509); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sqrt_round_ps() { + let a = _mm512_set1_ps(3.); + let r = + _mm512_mask_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b11111111_00000000, + a, + ); + let e = _mm512_setr_ps( + 3., 3., 3., 3., 3., 3., 3., 3., 1.7320508, 1.7320508, 1.7320508, 1.7320508, 1.7320508, + 1.7320508, 1.7320508, 1.7320508, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sqrt_round_ps() { + let a = _mm512_set1_ps(3.); + let r = + _mm512_maskz_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111_00000000, + a, + ); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1.7320508, 1.7320508, 1.7320508, 1.7320508, 1.7320508, + 1.7320508, 1.7320508, 1.7320508, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(-0.99999994); + assert_eq_m512(r, e); + let r = _mm512_fmadd_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(-0.9999999); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00000000_11111111, + b, + c, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_maskz_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + #[rustfmt::skip] + let r = _mm512_maskz_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + b, + c, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask3_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00000000_11111111, 
+ ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -1., -1., -1., -1., + -1., -1., -1., -1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = _mm512_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(-0.99999994); + assert_eq_m512(r, e); + let r = _mm512_fmsub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(-0.9999999); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00000000_11111111, + b, + c, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + b, + c, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask3_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00000000_11111111, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + 1., 1., 1., 1., + 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmaddsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = + _mm512_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + ); + assert_eq_m512(r, e); + let r = _mm512_fmaddsub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_setr_ps( + 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., 
-0.9999999, 1., -0.9999999, 1., + -0.9999999, 1., -0.9999999, 1., -0.9999999, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmaddsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00000000_11111111, + b, + c, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmaddsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_maskz_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + b, + c, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmaddsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask3_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00000000_11111111, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + -1., -1., -1., -1., + -1., -1., -1., -1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmsubadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = + _mm512_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + ); + assert_eq_m512(r, e); + let r = _mm512_fmsubadd_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_setr_ps( + -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., + -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmsubadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00000000_11111111, + b, + c, + ); + #[rustfmt::skip] + let e = 
_mm512_setr_ps( + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmsubadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_maskz_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + b, + c, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmsubadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask3_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00000000_11111111, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -1., -1., -1., -1., + -1., -1., -1., -1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fnmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = + _mm512_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(0.99999994); + assert_eq_m512(r, e); + let r = _mm512_fnmadd_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(0.9999999); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fnmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00000000_11111111, + b, + c, + ); + let e = _mm512_setr_ps( + 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, + 0.99999994, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fnmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + b, + c, + ); + let e = _mm512_setr_ps( + 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, + 0.99999994, 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_mask3_fnmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask3_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512(r, c); + let r = _mm512_mask3_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00000000_11111111, + ); + let e = _mm512_setr_ps( + 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, + 0.99999994, 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fnmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = + _mm512_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(0.99999994); + assert_eq_m512(r, e); + let r = _mm512_fnmsub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(0.9999999); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fnmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00000000_11111111, + b, + c, + ); + let e = _mm512_setr_ps( + 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, + 0.99999994, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fnmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_maskz_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + b, + c, + ); + let e = _mm512_setr_ps( + 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, + 0.99999994, 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fnmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask3_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512(r, c); + let r = _mm512_mask3_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00000000_11111111, + ); + let e = _mm512_setr_ps( + 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, + 0.99999994, -1., -1., -1., -1., -1., -1., -1., -1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_max_round_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 
11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_round_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_mask_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_round_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_maskz_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_min_round_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 7., 6., 5., 4., 3., 2., 1., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_round_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_mask_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_round_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_maskz_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_getexp_round_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(a); + let e = _mm512_set1_ps(1.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_getexp_round_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_mask_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111_00000000, a); 
+ let e = _mm512_setr_ps( + 3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_getexp_round_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_maskz_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(0b11111111_00000000, a); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_roundscale_round_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(a); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_roundscale_round_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_mask_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a); + let e = _mm512_set1_ps(1.1); + assert_eq_m512(r, e); + let r = _mm512_mask_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>( + a, + 0b11111111_11111111, + a, + ); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_roundscale_round_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_maskz_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = + _mm512_maskz_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111_11111111, a); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_scalef_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ps(8.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_scalef_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_mask_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b11111111_00000000, + a, + b, + ); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_scalef_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_maskz_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111_00000000, + a, + b, + ); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fixupimm_round_ps() { + let a = _mm512_set1_ps(f32::NAN); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_fixupimm_round_ps::<5, _MM_FROUND_CUR_DIRECTION>(a, b, c); + let e = _mm512_set1_ps(0.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fixupimm_round_ps() { + #[rustfmt::skip] + let a = _mm512_set_ps( + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + f32::NAN, 
f32::NAN, f32::NAN, f32::NAN, + 1., 1., 1., 1., + 1., 1., 1., 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_mask_fixupimm_round_ps::<5, _MM_FROUND_CUR_DIRECTION>( + a, + 0b11111111_00000000, + b, + c, + ); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fixupimm_round_ps() { + #[rustfmt::skip] + let a = _mm512_set_ps( + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + 1., 1., 1., 1., + 1., 1., 1., 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_maskz_fixupimm_round_ps::<5, _MM_FROUND_CUR_DIRECTION>( + 0b11111111_00000000, + a, + b, + c, + ); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_getmant_round_ps() { + let a = _mm512_set1_ps(10.); + let r = _mm512_getmant_round_ps::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a); + let e = _mm512_set1_ps(1.25); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_getmant_round_ps() { + let a = _mm512_set1_ps(10.); + let r = _mm512_mask_getmant_round_ps::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_getmant_round_ps::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a, 0b11111111_00000000, a); + let e = _mm512_setr_ps( + 10., 10., 10., 10., 10., 10., 10., 10., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_getmant_round_ps() { + let a = _mm512_set1_ps(10.); + let r = _mm512_maskz_getmant_round_ps::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_getmant_round_ps::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(0b11111111_00000000, a); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtps_epi32() { + let a = _mm512_setr_ps( + 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvtps_epi32(a); + let e = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtps_epi32() { + let a = _mm512_setr_ps( + 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvtps_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtps_epi32(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtps_epi32() { + let a = _mm512_setr_ps( + 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvtps_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtps_epi32(0b00000000_11111111, a); + let e = 
_mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvtps_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtps_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvtps_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtps_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvtps_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtps_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvtps_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtps_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtps_epu32() { + let a = _mm512_setr_ps( + 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvtps_epu32(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtps_epu32() { + let a = _mm512_setr_ps( + 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvtps_epu32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtps_epu32(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtps_epu32() { + let a = _mm512_setr_ps( + 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvtps_epu32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtps_epu32(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_cvtps_epu32(a); + let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvtps_epu32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtps_epu32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable 
= "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvtps_epu32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtps_epu32(0b11111111, a); + let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_cvtps_epu32(a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvtps_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtps_epu32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvtps_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtps_epu32(0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi8_epi32(a); + let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi32(-1); + let r = _mm512_mask_cvtepi8_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepi8_epi32(src, 0b00000000_11111111, a); + let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi8_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepi8_epi32(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepi8_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepi8_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepi8_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepi8_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepi8_epi32(src, 0, a); + assert_eq_m128i(r, src); + 
let r = _mm_mask_cvtepi8_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepi8_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi8_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu8_epi32(a); + let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi32(-1); + let r = _mm512_mask_cvtepu8_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepu8_epi32(src, 0b00000000_11111111, a); + let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepu8_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepu8_epi32(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepu8_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu8_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepu8_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu8_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepu8_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu8_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepu8_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepu8_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi16_epi32(a); + let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + 
assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi32(-1); + let r = _mm512_mask_cvtepi16_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepi16_epi32(src, 0b00000000_11111111, a); + let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi16_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepi16_epi32(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepi16_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepi16_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtepi16_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepi16_epi32(0b11111111, a); + let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepi16_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi16_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_cvtepi16_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi16_epi32(0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepu16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu16_epi32(a); + let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepu16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi32(-1); + let r = _mm512_mask_cvtepu16_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepu16_epi32(src, 0b00000000_11111111, a); + let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepu16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepu16_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepu16_epi32(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 
14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepu16_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu16_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepu16_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu16_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepu16_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu16_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepu16_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepu16_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi32_ps(a); + let e = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_ps(-1.); + let r = _mm512_mask_cvtepi32_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_cvtepi32_ps(src, 0b00000000_11111111, a); + let e = _mm512_set_ps( + -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvtepi32_ps(0b00000000_11111111, a); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi32_ps() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm256_set1_ps(-1.); + let r = _mm256_mask_cvtepi32_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_cvtepi32_ps(src, 0b11111111, a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi32_ps() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_cvtepi32_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_cvtepi32_ps(0b11111111, a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = 
"avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_ps() { + let a = _mm_set_epi32(1, 2, 3, 4); + let src = _mm_set1_ps(-1.); + let r = _mm_mask_cvtepi32_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_cvtepi32_ps(src, 0b00001111, a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi32_ps() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_maskz_cvtepi32_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_cvtepi32_ps(0b00001111, a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepu32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu32_ps(a); + let e = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepu32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_ps(-1.); + let r = _mm512_mask_cvtepu32_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_cvtepu32_ps(src, 0b00000000_11111111, a); + let e = _mm512_set_ps( + -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepu32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepu32_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvtepu32_ps(0b00000000_11111111, a); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi32_epi16() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi32_epi16(a); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi32_epi16() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi16(-1); + let r = _mm512_mask_cvtepi32_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtepi32_epi16(src, 0b00000000_11111111, a); + let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi32_epi16() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtepi32_epi16(0b00000000_11111111, a); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_cvtepi32_epi16(a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi16(-1); + let r = _mm256_mask_cvtepi32_epi16(src, 0, a); + 
assert_eq_m128i(r, src); + let r = _mm256_mask_cvtepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_cvtepi32_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi32_epi16(src, 0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_maskz_cvtepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi32_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi32_epi8() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi32_epi8(a); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi32_epi8() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi8(-1); + let r = _mm512_mask_cvtepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_cvtepi32_epi8(src, 0b00000000_11111111, a); + let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi32_epi8() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtepi32_epi8(0b00000000_11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtepi32_epi8() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_cvtepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi32_epi8() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtepi32_epi8(src, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi32_epi8() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtepi32_epi8(0, a); + assert_eq_m128i(r, 
_mm_setzero_si128()); + let r = _mm256_maskz_cvtepi32_epi8(0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtepi32_epi8() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_cvtepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_epi8() { + let a = _mm_set_epi32(4, 5, 6, 7); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi32_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi32_epi8() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_maskz_cvtepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi32_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtsepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let r = _mm512_cvtsepi32_epi16(a); + #[rustfmt::skip] + let e = _mm256_set_epi16( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i16::MIN, i16::MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let src = _mm256_set1_epi16(-1); + let r = _mm512_mask_cvtsepi32_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtsepi32_epi16(src, 0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm256_set_epi16( + -1, -1, -1, -1, + -1, -1, -1, -1, + 8, 9, 10, 11, + 12, 13, i16::MIN, i16::MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtsepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let r = _mm512_maskz_cvtsepi32_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtsepi32_epi16(0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm256_set_epi16( + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 11, + 12, 13, i16::MIN, i16::MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtsepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_cvtsepi32_epi16(a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi16(-1); + let r = _mm256_mask_cvtsepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtsepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = 
_mm256_maskz_cvtsepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtsepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_cvtsepi32_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtsepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_maskz_cvtsepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtsepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let r = _mm512_cvtsepi32_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i8::MIN, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let src = _mm_set1_epi8(-1); + let r = _mm512_mask_cvtsepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_cvtsepi32_epi8(src, 0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + -1, -1, -1, -1, + -1, -1, -1, -1, + 8, 9, 10, 11, + 12, 13, i8::MIN, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtsepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let r = _mm512_maskz_cvtsepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtsepi32_epi8(0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 11, + 12, 13, i8::MIN, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtsepi32_epi8() { + let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_cvtsepi32_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi32_epi8() { + let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtsepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi32_epi8(src, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi32_epi8() { + let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_maskz_cvtsepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = 
_mm256_maskz_cvtsepi32_epi8(0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtsepi32_epi8() { + let a = _mm_set_epi32(13, 14, 15, 16); + let r = _mm_cvtsepi32_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi32_epi8() { + let a = _mm_set_epi32(13, 14, 15, 16); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtsepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi32_epi8(src, 0b00001111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi32_epi8() { + let a = _mm_set_epi32(13, 14, 15, 16); + let r = _mm_maskz_cvtsepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi32_epi8(0b00001111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtusepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_cvtusepi32_epi16(a); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let src = _mm256_set1_epi16(-1); + let r = _mm512_mask_cvtusepi32_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtusepi32_epi16(src, 0b00000000_11111111, a); + let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtusepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_maskz_cvtusepi32_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtusepi32_epi16(0b00000000_11111111, a); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtusepi32_epi16() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_cvtusepi32_epi16(a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi32_epi16() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvtusepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtusepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtusepi32_epi16() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_cvtusepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + 
let r = _mm256_maskz_cvtusepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtusepi32_epi16() { + let a = _mm_set_epi32(5, 6, 7, 8); + let r = _mm_cvtusepi32_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi32_epi16() { + let a = _mm_set_epi32(5, 6, 7, 8); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtusepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi32_epi16(src, 0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi32_epi16() { + let a = _mm_set_epi32(5, 6, 7, 8); + let r = _mm_maskz_cvtusepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi32_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtusepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_cvtusepi32_epi8(a); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let src = _mm_set1_epi8(-1); + let r = _mm512_mask_cvtusepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_cvtusepi32_epi8(src, 0b00000000_11111111, a); + let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtusepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_maskz_cvtusepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtusepi32_epi8(0b00000000_11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtusepi32_epi8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); + let r = _mm256_cvtusepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi32_epi8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtusepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtusepi32_epi8(src, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtusepi32_epi8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); + let r = _mm256_maskz_cvtusepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtusepi32_epi8(0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + 
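+ // The unsigned-saturating narrowings exercised above and below
+ // (_mm*_cvtusepi32_epi16 / _mm*_cvtusepi32_epi8, i.e. vpmovusdw / vpmovusdb)
+ // treat each i32 lane as a u32 before clamping, which is why i32::MIN and
+ // i32::MAX both show up as u16::MAX / u8::MAX (written as -1 or
+ // `u8::MAX as i8`) in the expected vectors. A minimal one-lane sketch of that
+ // expectation; the helper name is illustrative only and not part of the
+ // intrinsics API:
+ #[allow(dead_code)]
+ fn cvtusepi32_epi8_lane(x: i32) -> i8 {
+     // clamp the unsigned view of the lane to the u8 range, then read the
+     // result back as a signed byte (255 reads back as -1)
+     (x as u32).min(u8::MAX as u32) as u8 as i8
+ }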
+ #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtusepi32_epi8() { + let a = _mm_set_epi32(5, 6, 7, i32::MAX); + let r = _mm_cvtusepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi32_epi8() { + let a = _mm_set_epi32(5, 6, 7, i32::MAX); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtusepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi32_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi32_epi8() { + let a = _mm_set_epi32(5, 6, 7, i32::MAX); + let r = _mm_maskz_cvtusepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi32_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m512i(r, e); + let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m512i(r, e); + let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_mask_cvt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundepi32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = _mm512_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_ps( + 0., -2., 2., -4., 4., -6., 6., -8., 8., 10., 10., 12., 12., 14., 14., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundepi32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512(r, src); + let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_ps( + 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundepi32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_ps( + 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundepu32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = _mm512_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 4294967300., 2., 4294967300., + 4., 4294967300., 6., 4294967300., + 8., 10., 10., 12., + 12., 14., 14., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundepu32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512(r, src); 
+ let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 4294967300., 2., 4294967300., + 4., 4294967300., 6., 4294967300., + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundepu32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 4294967300., 2., 4294967300., + 4., 4294967300., 6., 4294967300., + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(a); + let e = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundps_ph() { + let a = _mm512_set1_ps(1.); + let src = _mm256_set1_epi16(0); + let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvt_roundps_ph() { + let a = _mm256_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvt_roundps_ph() { + let a = _mm256_set1_ps(1.); + let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvt_roundps_ph() { + let a = _mm_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvt_roundps_ph() { + let a = _mm_set1_ps(1.); + let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); + 
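+ // The expected 64-bit lane value 4323521613979991040 used by the
+ // cvt_roundps_ph / cvtps_ph tests in this block is 0x3C00_3C00_3C00_3C00,
+ // i.e. four IEEE binary16 encodings of 1.0 (0x3C00) packed into one i64 lane.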
assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_cvtps_ph::<_MM_FROUND_NO_EXC>(a); + let e = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtps_ph() { + let a = _mm512_set1_ps(1.); + let src = _mm256_set1_epi16(0); + let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtps_ph() { + let a = _mm256_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtps_ph() { + let a = _mm256_set1_ps(1.); + let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtps_ph() { + let a = _mm_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtps_ph() { + let a = _mm_set1_ps(1.); + let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + let r = _mm512_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set1_ps(1.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m512(r, src); + let r = 
_mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + let r = _mm512_cvtph_ps(a); + let e = _mm512_set1_ps(1.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvtph_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_cvtph_ps(src, 0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + let r = _mm512_maskz_cvtph_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvtph_ps(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let src = _mm256_set1_ps(0.); + let r = _mm256_mask_cvtph_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_cvtph_ps(src, 0b11111111, a); + let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let r = _mm256_maskz_cvtph_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_cvtph_ps(0b11111111, a); + let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let src = _mm_set1_ps(0.); + let r = _mm_mask_cvtph_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_cvtph_ps(src, 0b00001111, a); + let e = _mm_setr_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let r = _mm_maskz_cvtph_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_cvtph_ps(0b00001111, a); + let e = _mm_setr_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtt_roundps_epi32() { + let a = 
_mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvttps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvttps_epi32(a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvttps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvttps_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvttps_epi32(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 
0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvttps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvttps_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvttps_epi32(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvttps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvttps_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvttps_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvttps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvttps_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvttps_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvttps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvttps_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvttps_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvttps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvttps_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvttps_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvttps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvttps_epu32(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvttps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvttps_epu32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvttps_epu32(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvttps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvttps_epu32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvttps_epu32(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvttps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_cvttps_epu32(a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + 
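+ // For the truncating float -> u32 conversions above (vcvttps2udq, exposed as
+ // _mm*_cvttps_epu32 and _mm512_cvtt_roundps_epu32), a lane whose truncated
+ // value cannot be represented as a u32 (negative results, NaN) produces the
+ // integer indefinite value 0xFFFF_FFFF, which the expected vectors spell as
+ // -1 when the lane is compared as an i32. A rough one-lane sketch; the helper
+ // name is illustrative only:
+ #[allow(dead_code)]
+ fn cvttps_epu32_lane(x: f32) -> i32 {
+     let t = x.trunc();
+     if t >= 0.0 && t < 4_294_967_296.0 {
+         t as u32 as i32
+     } else {
+         // out of range or NaN: the hardware returns 0xFFFF_FFFF
+         -1
+     }
+ }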
#[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvttps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvttps_epu32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvttps_epu32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvttps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvttps_epu32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvttps_epu32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvttps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_cvttps_epu32(a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvttps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvttps_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvttps_epu32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvttps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvttps_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvttps_epu32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32gather_ps() { + let arr: [f32; 256] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + let r = _mm512_i32gather_ps::<4>(index, arr.as_ptr()); + #[rustfmt::skip] + assert_eq_m512(r, _mm512_setr_ps(0., 16., 32., 48., 64., 80., 96., 112., + 120., 128., 136., 144., 152., 160., 168., 176.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_ps() { + let arr: [f32; 256] = core::array::from_fn(|i| i as f32); + let src = _mm512_set1_ps(2.); + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + // A multiplier of 4 is word-addressing + let r = _mm512_mask_i32gather_ps::<4>(src, mask, index, arr.as_ptr()); + #[rustfmt::skip] + assert_eq_m512(r, _mm512_setr_ps(2., 16., 2., 48., 2., 80., 2., 112., + 2., 128., 2., 144., 2., 160., 2., 176.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32gather_epi32() { + let arr: [i32; 256] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + let r = _mm512_i32gather_epi32::<4>(index, arr.as_ptr()); + #[rustfmt::skip] + assert_eq_m512i(r, _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_epi32() { + let arr: [i32; 256] = core::array::from_fn(|i| i as i32); + let src = _mm512_set1_epi32(2); + let mask = 0b10101010_10101010; + let index = 
_mm512_setr_epi32( + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, + ); + // A multiplier of 4 is word-addressing + let r = _mm512_mask_i32gather_epi32::<4>(src, mask, index, arr.as_ptr()); + assert_eq_m512i( + r, + _mm512_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112, 2, 144, 2, 176, 2, 208, 2, 240), + ); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32scatter_ps() { + let mut arr = [0f32; 256]; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + // A multiplier of 4 is word-addressing + _mm512_i32scatter_ps::<4>(arr.as_mut_ptr(), index, src); + let mut expected = [0f32; 256]; + for i in 0..16 { + expected[i * 16] = (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32scatter_ps() { + let mut arr = [0f32; 256]; + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + // A multiplier of 4 is word-addressing + _mm512_mask_i32scatter_ps::<4>(arr.as_mut_ptr(), mask, index, src); + let mut expected = [0f32; 256]; + for i in 0..8 { + expected[i * 32 + 16] = 2. * (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32scatter_epi32() { + let mut arr = [0i32; 256]; + #[rustfmt::skip] + + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + // A multiplier of 4 is word-addressing + _mm512_i32scatter_epi32::<4>(arr.as_mut_ptr(), index, src); + let mut expected = [0i32; 256]; + for i in 0..16 { + expected[i * 16] = (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32scatter_epi32() { + let mut arr = [0i32; 256]; + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + // A multiplier of 4 is word-addressing + _mm512_mask_i32scatter_epi32::<4>(arr.as_mut_ptr(), mask, index, src); + let mut expected = [0i32; 256]; + for i in 0..8 { + expected[i * 32 + 16] = 2 * (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmplt_ps_mask(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmplt_ps_mask(mask, a, b); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpnlt_ps_mask() 
{ + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + assert_eq!(_mm512_cmpnlt_ps_mask(a, b), !_mm512_cmplt_ps_mask(a, b)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpnlt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmpnlt_ps_mask(mask, a, b), 0b01111010_01111010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpnle_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmpnle_ps_mask(b, a); + assert_eq!(m, 0b00001101_00001101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpnle_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmpnle_ps_mask(mask, b, a); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + assert_eq!(_mm512_cmple_ps_mask(a, b), 0b00100101_00100101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmple_ps_mask(mask, a, b), 0b00100000_00100000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let m = _mm512_cmpeq_ps_mask(b, a); + assert_eq!(m, 0b11001101_11001101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_ps_mask(mask, b, a); + assert_eq!(r, 0b01001000_01001000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let m = _mm512_cmpneq_ps_mask(b, a); + assert_eq!(m, 
0b00110010_00110010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpneq_ps_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmp_ps_mask::<_CMP_LT_OQ>(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmp_ps_mask() { + let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm256_set1_ps(-1.); + let m = _mm256_cmp_ps_mask::<_CMP_LT_OQ>(a, b); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmp_ps_mask() { + let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm256_set1_ps(-1.); + let mask = 0b01100110; + let r = _mm256_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmp_ps_mask() { + let a = _mm_set_ps(0., 1., -1., 13.); + let b = _mm_set1_ps(1.); + let m = _mm_cmp_ps_mask::<_CMP_LT_OQ>(a, b); + assert_eq!(m, 0b00001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmp_ps_mask() { + let a = _mm_set_ps(0., 1., -1., 13.); + let b = _mm_set1_ps(1.); + let mask = 0b11111111; + let r = _mm_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); + assert_eq!(r, 0b00001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_round_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_round_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(mask, a, b); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, 
f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let m = _mm512_cmpord_ps_mask(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let mask = 0b11000011_11000011; + let m = _mm512_mask_cmpord_ps_mask(mask, a, b); + assert_eq!(m, 0b00000001_00000001); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpunord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let m = _mm512_cmpunord_ps_mask(a, b); + + assert_eq!(m, 0b11111010_11111010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpunord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let mask = 0b00001111_00001111; + let m = _mm512_mask_cmpunord_ps_mask(mask, a, b); + assert_eq!(m, 0b000001010_00001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cmp_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_cmp_ss_mask::<_CMP_GE_OS>(a, b); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cmp_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b1, a, b); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cmp_round_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cmp_round_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cmp_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_cmp_sd_mask::<_CMP_GE_OS>(a, b); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cmp_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b1, a, b); + assert_eq!(m, 
1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cmp_round_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cmp_round_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmplt_epu32_mask(a, b); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmplt_epu32_mask(mask, a, b); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmplt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmplt_epu32_mask(a, b); + assert_eq!(r, 0b10000000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmplt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmplt_epu32_mask(mask, a, b); + assert_eq!(r, 0b10000000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmplt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmplt_epu32_mask(a, b); + assert_eq!(r, 0b00001000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmplt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmplt_epu32_mask(mask, a, b); + assert_eq!(r, 0b00001000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpgt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmpgt_epu32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpgt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpgt_epu32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpgt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmpgt_epu32_mask(a, b); + assert_eq!(r, 0b00111111); + } + + 
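+ // The *_epu32 comparison masks in this block reinterpret each i32 lane as a
+ // u32, so -1 compares as u32::MAX and i32::MIN as 0x8000_0000. Bit i of the
+ // returned mask corresponds to lane i, and lane 0 is the last argument of
+ // _mm512_set_epi32 / _mm256_set_epi32 / _mm_set_epi32. A one-lane model of
+ // the greater-than predicate, with an illustrative helper name:
+ #[allow(dead_code)]
+ fn cmpgt_epu32_lane(a: i32, b: i32) -> bool {
+     (a as u32) > (b as u32)
+ }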
#[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmpgt_epu32_mask(mask, a, b); + assert_eq!(r, 0b00111111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpgt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmpgt_epu32_mask(a, b); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpgt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmpgt_epu32_mask(mask, a, b); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmple_epu32_mask(a, b), + !_mm512_cmpgt_epu32_mask(a, b) + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmple_epu32_mask(mask, a, b), + 0b01111010_01111010 + ); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmple_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmple_epu32_mask(a, b); + assert_eq!(r, 0b11000000) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmple_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmple_epu32_mask(mask, a, b); + assert_eq!(r, 0b11000000) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmple_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmple_epu32_mask(a, b); + assert_eq!(r, 0b00001100) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmple_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmple_epu32_mask(mask, a, b); + assert_eq!(r, 0b00001100) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpge_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmpge_epu32_mask(a, b), + !_mm512_cmplt_epu32_mask(a, b) + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpge_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmpge_epu32_mask(mask, a, b), 0b01100000_0110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm256_cmpge_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmpge_epu32_mask(a, b); + assert_eq!(r, 0b01111111) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmpge_epu32_mask(mask, a, b); + assert_eq!(r, 0b01111111) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpge_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmpge_epu32_mask(a, b); + assert_eq!(r, 0b00000111) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpge_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmpge_epu32_mask(mask, a, b); + assert_eq!(r, 0b00000111) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpeq_epu32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_epu32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpeq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm256_cmpeq_epu32_mask(b, a); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm256_mask_cmpeq_epu32_mask(mask, b, a); + assert_eq!(r, 0b01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpeq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let m = _mm_cmpeq_epu32_mask(b, a); + assert_eq!(m, 0b00001100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpeq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00001100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 
1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpneq_epu32_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epu32_mask(b, a)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpneq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpneq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100); + let r = _mm256_cmpneq_epu32_mask(b, a); + assert_eq!(r, 0b00110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100); + let mask = 0b11111111; + let r = _mm256_mask_cmpneq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpneq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let r = _mm_cmpneq_epu32_mask(b, a); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpneq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpneq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmp_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let m = _mm256_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmp_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b11001111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmp_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, i32::MAX); + let b = _mm_set1_epi32(1); + let m = 
_mm_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00001000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmp_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, i32::MAX); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00001000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmplt_epi32_mask(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmplt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmplt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let r = _mm256_cmplt_epi32_mask(a, b); + assert_eq!(r, 0b00000101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmplt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmplt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmplt_epi32_mask() { + let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100); + let b = _mm_set1_epi32(-1); + let r = _mm_cmplt_epi32_mask(a, b); + assert_eq!(r, 0b00000101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmplt_epi32_mask() { + let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100); + let b = _mm_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm_mask_cmplt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpgt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmpgt_epi32_mask(b, a); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpgt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmpgt_epi32_mask(mask, b, a); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpgt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let r = _mm256_cmpgt_epi32_mask(a, b); + assert_eq!(r, 0b11011010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmpgt_epi32_mask(mask, a, b); + assert_eq!(r, 0b11011010); + } + + 
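+ // For the `_mask_cmp*` variants exercised here, lanes whose bit is clear
+ // in the supplied mask never set the corresponding bit in the result:
+ // the returned mask is the full comparison result ANDed with the input mask.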
#[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpgt_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set1_epi32(-1); + let r = _mm_cmpgt_epi32_mask(a, b); + assert_eq!(r, 0b00001101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpgt_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm_mask_cmpgt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00001101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmple_epi32_mask(a, b), + !_mm512_cmpgt_epi32_mask(a, b) + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmple_epi32_mask(mask, a, b), 0b01100000_0110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmple_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let r = _mm256_cmple_epi32_mask(a, b); + assert_eq!(r, 0b00100101) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmple_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmple_epi32_mask(mask, a, b); + assert_eq!(r, 0b00100101) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmple_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 200); + let b = _mm_set1_epi32(-1); + let r = _mm_cmple_epi32_mask(a, b); + assert_eq!(r, 0b00000010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmple_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 200); + let b = _mm_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm_mask_cmple_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000010) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpge_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmpge_epi32_mask(a, b), + !_mm512_cmplt_epi32_mask(a, b) + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpge_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmpge_epi32_mask(mask, a, b), + 0b01111010_01111010 + ); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpge_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let r = _mm256_cmpge_epi32_mask(a, b); + assert_eq!(r, 0b11111010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, 
i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmpge_epi32_mask(mask, a, b); + assert_eq!(r, 0b11111010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpge_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set1_epi32(-1); + let r = _mm_cmpge_epi32_mask(a, b); + assert_eq!(r, 0b00001111) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpge_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm_mask_cmpge_epi32_mask(mask, a, b); + assert_eq!(r, 0b00001111) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpeq_epi32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_epi32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpeq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm256_cmpeq_epi32_mask(b, a); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm256_mask_cmpeq_epi32_mask(mask, b, a); + assert_eq!(r, 0b01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpeq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); + let m = _mm_cmpeq_epi32_mask(b, a); + assert_eq!(m, 0b00001100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpeq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00001100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpneq_epi32_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epi32_mask(b, a)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100, + 0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 
100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpneq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpneq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm256_cmpneq_epi32_mask(b, a); + assert_eq!(m, !_mm256_cmpeq_epi32_mask(b, a)); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b11111111; + let r = _mm256_mask_cmpneq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00110011) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpneq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); + let r = _mm_cmpneq_epi32_mask(b, a); + assert_eq!(r, 0b00000011) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpneq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpneq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00000011) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmp_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let m = _mm256_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmp_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b01100110; + let r = _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmp_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set1_epi32(1); + let m = _mm_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmp_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_epi8() { + let r = _mm512_set1_epi8(2); + assert_eq_m512i( + r, + _mm512_set_epi8( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + ), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_epi16() { + let r = _mm512_set1_epi16(2); + assert_eq_m512i( + r, + _mm512_set_epi16( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, + ), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_epi32() { + let r = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i( + r, + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_epi32() { + let r = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i( + r, + _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_epi8() { + let r = _mm512_set_epi8( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, + ); + assert_eq_m512i(r, _mm512_set1_epi8(2)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_epi16() { + let r = _mm512_set_epi16( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, + ); + assert_eq_m512i(r, _mm512_set1_epi16(2)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_epi32() { + let r = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, _mm512_set1_epi32(2)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_si512() { + assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_si512()); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_epi32() { + assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_epi32()); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_ps() { + let r = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512( + r, + _mm512_set_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_ps() { + let r = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512( + r, + _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_ps() { + #[rustfmt::skip] + let expected = _mm512_set_ps(2., 2., 2., 2., 2., 2., 2., 2., + 2., 2., 2., 2., 2., 2., 2., 2.); + assert_eq_m512(expected, _mm512_set1_ps(2.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set4_epi32() { + let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1); + assert_eq_m512i(r, _mm512_set4_epi32(4, 3, 2, 1)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set4_ps() { + let r = _mm512_set_ps( + 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., + ); + assert_eq_m512(r, _mm512_set4_ps(4., 3., 2., 1.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr4_epi32() { + let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1); + assert_eq_m512i(r, _mm512_setr4_epi32(1, 2, 3, 4)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr4_ps() { + let r = _mm512_set_ps( + 4., 3., 2., 1., 
4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., + ); + assert_eq_m512(r, _mm512_setr4_ps(1., 2., 3., 4.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_ps() { + assert_eq_m512(_mm512_setzero_ps(), _mm512_set1_ps(0.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero() { + assert_eq_m512(_mm512_setzero(), _mm512_set1_ps(0.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_loadu_pd() { + let a = &[4., 3., 2., 5., 8., 9., 64., 50.]; + let p = a.as_ptr(); + let r = _mm512_loadu_pd(black_box(p)); + let e = _mm512_setr_pd(4., 3., 2., 5., 8., 9., 64., 50.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_storeu_pd() { + let a = _mm512_set1_pd(9.); + let mut r = _mm512_undefined_pd(); + _mm512_storeu_pd(&mut r as *mut _ as *mut f64, a); + assert_eq_m512d(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_loadu_ps() { + let a = &[ + 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ]; + let p = a.as_ptr(); + let r = _mm512_loadu_ps(black_box(p)); + let e = _mm512_setr_ps( + 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_storeu_ps() { + let a = _mm512_set1_ps(9.); + let mut r = _mm512_undefined_ps(); + _mm512_storeu_ps(&mut r as *mut _ as *mut f32, a); + assert_eq_m512(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_epi32() { + let src = _mm512_set1_epi32(42); + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_loadu_epi32(src, m, black_box(p)); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_epi32() { + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_loadu_epi32(m, black_box(p)); + let e = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], // 64 bytes + } + let src = _mm512_set1_epi32(42); + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_load_epi32(src, m, black_box(p)); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], // 64 bytes + } + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_load_epi32(m, black_box(p)); + let e = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_epi32() { + let mut r = [42_i32; 16]; + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let m = 0b11101000_11001010; + _mm512_mask_storeu_epi32(r.as_mut_ptr(), m, a); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 
42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(_mm512_loadu_epi32(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], + } + let mut r = Align { data: [42; 16] }; + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let m = 0b11101000_11001010; + _mm512_mask_store_epi32(r.data.as_mut_ptr(), m, a); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(_mm512_load_epi32(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_epi64() { + let src = _mm512_set1_epi64(42); + let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_loadu_epi64(src, m, black_box(p)); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_epi64() { + let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_loadu_epi64(m, black_box(p)); + let e = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_load_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 8], // 64 bytes + } + let src = _mm512_set1_epi64(42); + let a = Align { + data: [1_i64, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_load_epi64(src, m, black_box(p)); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_load_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 8], // 64 bytes + } + let a = Align { + data: [1_i64, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_load_epi64(m, black_box(p)); + let e = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_epi64() { + let mut r = [42_i64; 8]; + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm512_mask_storeu_epi64(r.as_mut_ptr(), m, a); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(_mm512_loadu_epi64(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 8], + } + let mut r = Align { data: [42; 8] }; + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + let p = r.data.as_mut_ptr(); + _mm512_mask_store_epi64(p, m, a); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(_mm512_load_epi64(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_ps() { + let src = _mm512_set1_ps(42.0); + let a = &[ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_loadu_ps(src, m, black_box(p)); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_ps() { + let a = &[ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, + ]; 
+ let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_loadu_ps(m, black_box(p)); + let e = _mm512_setr_ps( + 0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_load_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], // 64 bytes + } + let src = _mm512_set1_ps(42.0); + let a = Align { + data: [ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, + 15.0, 16.0, + ], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_load_ps(src, m, black_box(p)); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_load_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], // 64 bytes + } + let a = Align { + data: [ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, + 15.0, 16.0, + ], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_load_ps(m, black_box(p)); + let e = _mm512_setr_ps( + 0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_ps() { + let mut r = [42_f32; 16]; + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let m = 0b11101000_11001010; + _mm512_mask_storeu_ps(r.as_mut_ptr(), m, a); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(_mm512_loadu_ps(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], + } + let mut r = Align { data: [42.0; 16] }; + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let m = 0b11101000_11001010; + _mm512_mask_store_ps(r.data.as_mut_ptr(), m, a); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(_mm512_load_ps(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_pd() { + let src = _mm512_set1_pd(42.0); + let a = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_loadu_pd(src, m, black_box(p)); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_pd() { + let a = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_loadu_pd(m, black_box(p)); + let e = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_load_pd() { + #[repr(align(64))] + struct Align { + data: [f64; 8], // 64 bytes + } + let src = _mm512_set1_pd(42.0); + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_load_pd(src, m, black_box(p)); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + 
assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_load_pd() { + #[repr(align(64))] + struct Align { + data: [f64; 8], // 64 bytes + } + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_load_pd(m, black_box(p)); + let e = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_pd() { + let mut r = [42_f64; 8]; + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm512_mask_storeu_pd(r.as_mut_ptr(), m, a); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(_mm512_loadu_pd(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_pd() { + #[repr(align(64))] + struct Align { + data: [f64; 8], + } + let mut r = Align { data: [42.0; 8] }; + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm512_mask_store_pd(r.data.as_mut_ptr(), m, a); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(_mm512_load_pd(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi32() { + let src = _mm256_set1_epi32(42); + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_loadu_epi32(src, m, black_box(p)); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi32() { + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_loadu_epi32(m, black_box(p)); + let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_epi32() { + #[repr(align(32))] + struct Align { + data: [i32; 8], // 32 bytes + } + let src = _mm256_set1_epi32(42); + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_load_epi32(src, m, black_box(p)); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_epi32() { + #[repr(align(32))] + struct Align { + data: [i32; 8], // 32 bytes + } + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_load_epi32(m, black_box(p)); + let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi32() { + let mut r = [42_i32; 8]; + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm256_mask_storeu_epi32(r.as_mut_ptr(), m, a); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(_mm256_loadu_epi32(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_store_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 8], + } + let mut r = Align { data: [42; 8] }; + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm256_mask_store_epi32(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + 
assert_eq_m256i(_mm256_load_epi32(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi64() { + let src = _mm256_set1_epi64x(42); + let a = &[1_i64, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_loadu_epi64(src, m, black_box(p)); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi64() { + let a = &[1_i64, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_loadu_epi64(m, black_box(p)); + let e = _mm256_setr_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_epi64() { + #[repr(align(32))] + struct Align { + data: [i64; 4], // 32 bytes + } + let src = _mm256_set1_epi64x(42); + let a = Align { + data: [1_i64, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_load_epi64(src, m, black_box(p)); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_epi64() { + #[repr(align(32))] + struct Align { + data: [i64; 4], // 32 bytes + } + let a = Align { + data: [1_i64, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_load_epi64(m, black_box(p)); + let e = _mm256_setr_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi64() { + let mut r = [42_i64; 4]; + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let m = 0b1010; + _mm256_mask_storeu_epi64(r.as_mut_ptr(), m, a); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(_mm256_loadu_epi64(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_store_epi64() { + #[repr(align(32))] + struct Align { + data: [i64; 4], + } + let mut r = Align { data: [42; 4] }; + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let m = 0b1010; + _mm256_mask_store_epi64(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(_mm256_load_epi64(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_loadu_ps() { + let src = _mm256_set1_ps(42.0); + let a = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_loadu_ps(src, m, black_box(p)); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_loadu_ps() { + let a = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_loadu_ps(m, black_box(p)); + let e = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_ps() { + #[repr(align(32))] + struct Align { + data: [f32; 8], // 32 bytes + } + let src = _mm256_set1_ps(42.0); + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_load_ps(src, m, black_box(p)); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_ps() { + #[repr(align(32))] + struct Align { + data: [f32; 8], // 32 bytes + } + let a = 
Align { + data: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_load_ps(m, black_box(p)); + let e = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_storeu_ps() { + let mut r = [42_f32; 8]; + let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm256_mask_storeu_ps(r.as_mut_ptr(), m, a); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(_mm256_loadu_ps(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_store_ps() { + #[repr(align(32))] + struct Align { + data: [f32; 8], + } + let mut r = Align { data: [42.0; 8] }; + let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm256_mask_store_ps(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(_mm256_load_ps(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_loadu_pd() { + let src = _mm256_set1_pd(42.0); + let a = &[1.0_f64, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_loadu_pd(src, m, black_box(p)); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_loadu_pd() { + let a = &[1.0_f64, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_loadu_pd(m, black_box(p)); + let e = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_pd() { + #[repr(align(32))] + struct Align { + data: [f64; 4], // 32 bytes + } + let src = _mm256_set1_pd(42.0); + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_load_pd(src, m, black_box(p)); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_pd() { + #[repr(align(32))] + struct Align { + data: [f64; 4], // 32 bytes + } + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_load_pd(m, black_box(p)); + let e = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_storeu_pd() { + let mut r = [42_f64; 4]; + let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm256_mask_storeu_pd(r.as_mut_ptr(), m, a); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(_mm256_loadu_pd(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_store_pd() { + #[repr(align(32))] + struct Align { + data: [f64; 4], + } + let mut r = Align { data: [42.0; 4] }; + let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm256_mask_store_pd(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(_mm256_load_pd(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_epi32() { + let src = _mm_set1_epi32(42); + let a = &[1_i32, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_mask_loadu_epi32(src, m, black_box(p)); + let e = _mm_setr_epi32(42, 2, 42, 4); + 
assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi32() { + let a = &[1_i32, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_loadu_epi32(m, black_box(p)); + let e = _mm_setr_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_epi32() { + #[repr(align(16))] + struct Align { + data: [i32; 4], // 32 bytes + } + let src = _mm_set1_epi32(42); + let a = Align { + data: [1_i32, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_mask_load_epi32(src, m, black_box(p)); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_epi32() { + #[repr(align(16))] + struct Align { + data: [i32; 4], // 16 bytes + } + let a = Align { + data: [1_i32, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_load_epi32(m, black_box(p)); + let e = _mm_setr_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_epi32() { + let mut r = [42_i32; 4]; + let a = _mm_setr_epi32(1, 2, 3, 4); + let m = 0b1010; + _mm_mask_storeu_epi32(r.as_mut_ptr(), m, a); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(_mm_loadu_epi32(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_epi32() { + #[repr(align(16))] + struct Align { + data: [i32; 4], // 16 bytes + } + let mut r = Align { data: [42; 4] }; + let a = _mm_setr_epi32(1, 2, 3, 4); + let m = 0b1010; + _mm_mask_store_epi32(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(_mm_load_epi32(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_epi64() { + let src = _mm_set1_epi64x(42); + let a = &[1_i64, 2]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_mask_loadu_epi64(src, m, black_box(p)); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi64() { + let a = &[1_i64, 2]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_maskz_loadu_epi64(m, black_box(p)); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_epi64() { + #[repr(align(16))] + struct Align { + data: [i64; 2], // 16 bytes + } + let src = _mm_set1_epi64x(42); + let a = Align { data: [1_i64, 2] }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_mask_load_epi64(src, m, black_box(p)); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_epi64() { + #[repr(align(16))] + struct Align { + data: [i64; 2], // 16 bytes + } + let a = Align { data: [1_i64, 2] }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_maskz_load_epi64(m, black_box(p)); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_epi64() { + let mut r = [42_i64; 2]; + let a = _mm_setr_epi64x(1, 2); + let m = 0b10; + _mm_mask_storeu_epi64(r.as_mut_ptr(), m, a); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(_mm_loadu_epi64(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_epi64() { + #[repr(align(16))] + struct Align { + data: [i64; 2], // 16 bytes + } + 
let mut r = Align { data: [42; 2] }; + let a = _mm_setr_epi64x(1, 2); + let m = 0b10; + _mm_mask_store_epi64(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(_mm_load_epi64(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_ps() { + let src = _mm_set1_ps(42.0); + let a = &[1.0_f32, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_mask_loadu_ps(src, m, black_box(p)); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_ps() { + let a = &[1.0_f32, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_loadu_ps(m, black_box(p)); + let e = _mm_setr_ps(0.0, 2.0, 0.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_ps() { + #[repr(align(16))] + struct Align { + data: [f32; 4], // 16 bytes + } + let src = _mm_set1_ps(42.0); + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_mask_load_ps(src, m, black_box(p)); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_ps() { + #[repr(align(16))] + struct Align { + data: [f32; 4], // 16 bytes + } + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_load_ps(m, black_box(p)); + let e = _mm_setr_ps(0.0, 2.0, 0.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_ps() { + let mut r = [42_f32; 4]; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm_mask_storeu_ps(r.as_mut_ptr(), m, a); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(_mm_loadu_ps(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_ps() { + #[repr(align(16))] + struct Align { + data: [f32; 4], // 16 bytes + } + let mut r = Align { data: [42.0; 4] }; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm_mask_store_ps(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(_mm_load_ps(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_pd() { + let src = _mm_set1_pd(42.0); + let a = &[1.0_f64, 2.0]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_mask_loadu_pd(src, m, black_box(p)); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_pd() { + let a = &[1.0_f64, 2.0]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_maskz_loadu_pd(m, black_box(p)); + let e = _mm_setr_pd(0.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_pd() { + #[repr(align(16))] + struct Align { + data: [f64; 2], // 16 bytes + } + let src = _mm_set1_pd(42.0); + let a = Align { + data: [1.0_f64, 2.0], + }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_mask_load_pd(src, m, black_box(p)); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_pd() { + #[repr(align(16))] + struct Align { + data: [f64; 2], // 16 bytes + } + let a = Align { + data: [1.0_f64, 2.0], + }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_maskz_load_pd(m, 
black_box(p)); + let e = _mm_setr_pd(0.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_load_ss() { + #[repr(align(16))] + struct Align { + data: f32, + } + let src = _mm_set_ss(2.0); + let mem = Align { data: 1.0 }; + let r = _mm_mask_load_ss(src, 0b1, &mem.data); + assert_eq_m128(r, _mm_set_ss(1.0)); + let r = _mm_mask_load_ss(src, 0b0, &mem.data); + assert_eq_m128(r, _mm_set_ss(2.0)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_load_ss() { + #[repr(align(16))] + struct Align { + data: f32, + } + let mem = Align { data: 1.0 }; + let r = _mm_maskz_load_ss(0b1, &mem.data); + assert_eq_m128(r, _mm_set_ss(1.0)); + let r = _mm_maskz_load_ss(0b0, &mem.data); + assert_eq_m128(r, _mm_set_ss(0.0)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_load_sd() { + #[repr(align(16))] + struct Align { + data: f64, + } + let src = _mm_set_sd(2.0); + let mem = Align { data: 1.0 }; + let r = _mm_mask_load_sd(src, 0b1, &mem.data); + assert_eq_m128d(r, _mm_set_sd(1.0)); + let r = _mm_mask_load_sd(src, 0b0, &mem.data); + assert_eq_m128d(r, _mm_set_sd(2.0)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_load_sd() { + #[repr(align(16))] + struct Align { + data: f64, + } + let mem = Align { data: 1.0 }; + let r = _mm_maskz_load_sd(0b1, &mem.data); + assert_eq_m128d(r, _mm_set_sd(1.0)); + let r = _mm_maskz_load_sd(0b0, &mem.data); + assert_eq_m128d(r, _mm_set_sd(0.0)); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_pd() { + let mut r = [42_f64; 2]; + let a = _mm_setr_pd(1.0, 2.0); + let m = 0b10; + _mm_mask_storeu_pd(r.as_mut_ptr(), m, a); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(_mm_loadu_pd(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_pd() { + #[repr(align(16))] + struct Align { + data: [f64; 2], // 16 bytes + } + let mut r = Align { data: [42.0; 2] }; + let a = _mm_setr_pd(1.0, 2.0); + let m = 0b10; + _mm_mask_store_pd(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(_mm_load_pd(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_store_ss() { + #[repr(align(16))] + struct Align { + data: f32, + } + let a = _mm_set_ss(2.0); + let mut mem = Align { data: 1.0 }; + _mm_mask_store_ss(&mut mem.data, 0b1, a); + assert_eq!(mem.data, 2.0); + _mm_mask_store_ss(&mut mem.data, 0b0, a); + assert_eq!(mem.data, 2.0); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_store_sd() { + #[repr(align(16))] + struct Align { + data: f64, + } + let a = _mm_set_sd(2.0); + let mut mem = Align { data: 1.0 }; + _mm_mask_store_sd(&mut mem.data, 0b1, a); + assert_eq!(mem.data, 2.0); + _mm_mask_store_sd(&mut mem.data, 0b0, a); + assert_eq!(mem.data, 2.0); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_pd() { + let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + assert_eq_m512d(r, _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_pd() { + let r = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + assert_eq_m512d(r, _mm512_set_pd(7., 6., 5., 4., 3., 2., 1., 0.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_rol_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let r = _mm512_rol_epi32::<1>(a); + let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } 
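+ // `_rol_epi32::<1>` rotates each 32-bit lane left by one bit, so the
+ // `1 << 31` lane wraps around to `1` while the `1` lanes become `2`.
+ // The `_mask_`/`_maskz_` variants below leave lanes with a clear mask bit
+ // equal to the pass-through operand or zero them, respectively.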
+ + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_rol_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let r = _mm512_mask_rol_epi32::<1>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_rol_epi32::<1>(a, 0b11111111_11111111, a); + let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_rol_epi32() { + let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31); + let r = _mm512_maskz_rol_epi32::<1>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_rol_epi32::<1>(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1 << 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rol_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let r = _mm256_rol_epi32::<1>(a); + let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_rol_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let r = _mm256_mask_rol_epi32::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_rol_epi32::<1>(a, 0b11111111, a); + let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_rol_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let r = _mm256_maskz_rol_epi32::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_rol_epi32::<1>(0b11111111, a); + let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rol_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let r = _mm_rol_epi32::<1>(a); + let e = _mm_set_epi32(1 << 0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_rol_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let r = _mm_mask_rol_epi32::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_rol_epi32::<1>(a, 0b00001111, a); + let e = _mm_set_epi32(1 << 0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_rol_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let r = _mm_maskz_rol_epi32::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_rol_epi32::<1>(0b00001111, a); + let e = _mm_set_epi32(1 << 0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_ror_epi32() { + let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let r = _mm512_ror_epi32::<1>(a); + let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_ror_epi32() { + let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let r = _mm512_mask_ror_epi32::<1>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_ror_epi32::<1>(a, 0b11111111_11111111, a); + let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_ror_epi32() { + let a = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 1 << 0); + let r = _mm512_maskz_ror_epi32::<1>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_ror_epi32::<1>(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 << 31); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_ror_epi32() { + let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + let r = _mm256_ror_epi32::<1>(a); + let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_ror_epi32() { + let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + let r = _mm256_mask_ror_epi32::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_ror_epi32::<1>(a, 0b11111111, a); + let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_ror_epi32() { + let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + let r = _mm256_maskz_ror_epi32::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_ror_epi32::<1>(0b11111111, a); + let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_ror_epi32() { + let a = _mm_set_epi32(1 << 0, 2, 2, 2); + let r = _mm_ror_epi32::<1>(a); + let e = _mm_set_epi32(1 << 31, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_ror_epi32() { + let a = _mm_set_epi32(1 << 0, 2, 2, 2); + let r = _mm_mask_ror_epi32::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_ror_epi32::<1>(a, 0b00001111, a); + let e = _mm_set_epi32(1 << 31, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_ror_epi32() { + let a = _mm_set_epi32(1 << 0, 2, 2, 2); + let r = _mm_maskz_ror_epi32::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_ror_epi32::<1>(0b00001111, a); + let e = _mm_set_epi32(1 << 31, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_slli_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let r = _mm512_slli_epi32::<1>(a); + let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_slli_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let r = _mm512_mask_slli_epi32::<1>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_slli_epi32::<1>(a, 0b11111111_11111111, a); + let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_slli_epi32() { + let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31); + let r = _mm512_maskz_slli_epi32::<1>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_slli_epi32::<1>(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_slli_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let r = _mm256_mask_slli_epi32::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = 
_mm256_mask_slli_epi32::<1>(a, 0b11111111, a); + let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_slli_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let r = _mm256_maskz_slli_epi32::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_slli_epi32::<1>(0b11111111, a); + let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_slli_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let r = _mm_mask_slli_epi32::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_slli_epi32::<1>(a, 0b00001111, a); + let e = _mm_set_epi32(0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_slli_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let r = _mm_maskz_slli_epi32::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_slli_epi32::<1>(0b00001111, a); + let e = _mm_set_epi32(0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srli_epi32() { + let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let r = _mm512_srli_epi32::<1>(a); + let e = _mm512_set_epi32(0 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srli_epi32() { + let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let r = _mm512_mask_srli_epi32::<1>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_srli_epi32::<1>(a, 0b11111111_11111111, a); + let e = _mm512_set_epi32(0 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srli_epi32() { + let a = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0); + let r = _mm512_maskz_srli_epi32::<1>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srli_epi32::<1>(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0 << 31); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_srli_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let r = _mm256_mask_srli_epi32::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_srli_epi32::<1>(a, 0b11111111, a); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_srli_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let r = _mm256_maskz_srli_epi32::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srli_epi32::<1>(0b11111111, a); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_srli_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let r = _mm_mask_srli_epi32::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_srli_epi32::<1>(a, 0b00001111, a); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_srli_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let r = _mm_maskz_srli_epi32::<1>(0, a); + assert_eq_m128i(r, 
_mm_setzero_si128()); + let r = _mm_maskz_srli_epi32::<1>(0b00001111, a); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_rolv_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let b = _mm512_set1_epi32(1); + let r = _mm512_rolv_epi32(a, b); + let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_rolv_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let b = _mm512_set1_epi32(1); + let r = _mm512_mask_rolv_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_rolv_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_rolv_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31); + let b = _mm512_set1_epi32(1); + let r = _mm512_maskz_rolv_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_rolv_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1 << 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rolv_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let b = _mm256_set1_epi32(1); + let r = _mm256_rolv_epi32(a, b); + let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_rolv_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let b = _mm256_set1_epi32(1); + let r = _mm256_mask_rolv_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_rolv_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_rolv_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let b = _mm256_set1_epi32(1); + let r = _mm256_maskz_rolv_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_rolv_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rolv_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let b = _mm_set1_epi32(1); + let r = _mm_rolv_epi32(a, b); + let e = _mm_set_epi32(1 << 0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_rolv_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let b = _mm_set1_epi32(1); + let r = _mm_mask_rolv_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_rolv_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(1 << 0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_rolv_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let b = _mm_set1_epi32(1); + let r = _mm_maskz_rolv_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_rolv_epi32(0b00001111, a, b); + let e = _mm_set_epi32(1 << 0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_rorv_epi32() { + let a = _mm512_set_epi32(1 << 0, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let b = _mm512_set1_epi32(1); + let r = _mm512_rorv_epi32(a, b); + let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_rorv_epi32() { + let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let b = _mm512_set1_epi32(1); + let r = _mm512_mask_rorv_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_rorv_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_rorv_epi32() { + let a = _mm512_set_epi32(3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 << 0); + let b = _mm512_set1_epi32(1); + let r = _mm512_maskz_rorv_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_rorv_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 << 31); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rorv_epi32() { + let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + let b = _mm256_set1_epi32(1); + let r = _mm256_rorv_epi32(a, b); + let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_rorv_epi32() { + let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + let b = _mm256_set1_epi32(1); + let r = _mm256_mask_rorv_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_rorv_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_rorv_epi32() { + let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + let b = _mm256_set1_epi32(1); + let r = _mm256_maskz_rorv_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_rorv_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rorv_epi32() { + let a = _mm_set_epi32(1 << 0, 2, 2, 2); + let b = _mm_set1_epi32(1); + let r = _mm_rorv_epi32(a, b); + let e = _mm_set_epi32(1 << 31, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_rorv_epi32() { + let a = _mm_set_epi32(1 << 0, 2, 2, 2); + let b = _mm_set1_epi32(1); + let r = _mm_mask_rorv_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_rorv_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(1 << 31, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_rorv_epi32() { + let a = _mm_set_epi32(1 << 0, 2, 2, 2); + let b = _mm_set1_epi32(1); + let r = _mm_maskz_rorv_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_rorv_epi32(0b00001111, a, b); + let e = _mm_set_epi32(1 << 31, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sllv_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let count = _mm512_set1_epi32(1); + let r = _mm512_sllv_epi32(a, count); + let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_mm512_mask_sllv_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let count = _mm512_set1_epi32(1); + let r = _mm512_mask_sllv_epi32(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_sllv_epi32(a, 0b11111111_11111111, a, count); + let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sllv_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31); + let count = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let r = _mm512_maskz_sllv_epi32(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sllv_epi32(0b00000000_11111111, a, count); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sllv_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let count = _mm256_set1_epi32(1); + let r = _mm256_mask_sllv_epi32(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_sllv_epi32(a, 0b11111111, a, count); + let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sllv_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let count = _mm256_set1_epi32(1); + let r = _mm256_maskz_sllv_epi32(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sllv_epi32(0b11111111, a, count); + let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sllv_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let count = _mm_set1_epi32(1); + let r = _mm_mask_sllv_epi32(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_sllv_epi32(a, 0b00001111, a, count); + let e = _mm_set_epi32(0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sllv_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let count = _mm_set1_epi32(1); + let r = _mm_maskz_sllv_epi32(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sllv_epi32(0b00001111, a, count); + let e = _mm_set_epi32(0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srlv_epi32() { + let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let count = _mm512_set1_epi32(1); + let r = _mm512_srlv_epi32(a, count); + let e = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srlv_epi32() { + let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let count = _mm512_set1_epi32(1); + let r = _mm512_mask_srlv_epi32(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_srlv_epi32(a, 0b11111111_11111111, a, count); + let e = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srlv_epi32() { + let a = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0); + let count = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let r = _mm512_maskz_srlv_epi32(0, a, count); + assert_eq_m512i(r, 
_mm512_setzero_si512()); + let r = _mm512_maskz_srlv_epi32(0b00000000_11111111, a, count); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_srlv_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm256_set1_epi32(1); + let r = _mm256_mask_srlv_epi32(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_srlv_epi32(a, 0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_srlv_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm256_set1_epi32(1); + let r = _mm256_maskz_srlv_epi32(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srlv_epi32(0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_srlv_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set1_epi32(1); + let r = _mm_mask_srlv_epi32(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_srlv_epi32(a, 0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_srlv_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set1_epi32(1); + let r = _mm_maskz_srlv_epi32(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srlv_epi32(0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sll_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 31, 1 << 0, 1 << 1, 1 << 2, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + let count = _mm_set_epi32(0, 0, 0, 2); + let r = _mm512_sll_epi32(a, count); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 0, 1 << 2, 1 << 3, 1 << 4, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sll_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 31, 1 << 0, 1 << 1, 1 << 2, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + let count = _mm_set_epi32(0, 0, 0, 2); + let r = _mm512_mask_sll_epi32(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_sll_epi32(a, 0b11111111_11111111, a, count); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 0, 1 << 2, 1 << 3, 1 << 4, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sll_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 31, 1 << 0, 1 << 1, 1 << 2, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 31, + ); + let count = _mm_set_epi32(2, 0, 0, 2); + let r = _mm512_maskz_sll_epi32(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sll_epi32(0b00000000_11111111, a, count); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sll_epi32() { + let a = _mm256_set_epi32(1 << 13, 0, 0, 0, 0, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm256_mask_sll_epi32(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_sll_epi32(a, 
0b11111111, a, count); + let e = _mm256_set_epi32(1 << 14, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sll_epi32() { + let a = _mm256_set_epi32(1 << 13, 0, 0, 0, 0, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm256_maskz_sll_epi32(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sll_epi32(0b11111111, a, count); + let e = _mm256_set_epi32(1 << 14, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sll_epi32() { + let a = _mm_set_epi32(1 << 13, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm_mask_sll_epi32(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_sll_epi32(a, 0b00001111, a, count); + let e = _mm_set_epi32(1 << 14, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sll_epi32() { + let a = _mm_set_epi32(1 << 13, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm_maskz_sll_epi32(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sll_epi32(0b00001111, a, count); + let e = _mm_set_epi32(1 << 14, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srl_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 31, 1 << 0, 1 << 1, 1 << 2, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + let count = _mm_set_epi32(0, 0, 0, 2); + let r = _mm512_srl_epi32(a, count); + let e = _mm512_set_epi32(1 << 29, 0, 0, 1 << 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srl_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 31, 1 << 0, 1 << 1, 1 << 2, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + let count = _mm_set_epi32(0, 0, 0, 2); + let r = _mm512_mask_srl_epi32(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_srl_epi32(a, 0b11111111_11111111, a, count); + let e = _mm512_set_epi32(1 << 29, 0, 0, 1 << 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srl_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 31, 1 << 0, 1 << 1, 1 << 2, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 31, + ); + let count = _mm_set_epi32(2, 0, 0, 2); + let r = _mm512_maskz_srl_epi32(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srl_epi32(0b00000000_11111111, a, count); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 29); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_srl_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm256_mask_srl_epi32(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_srl_epi32(a, 0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_srl_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm256_maskz_srl_epi32(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srl_epi32(0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + 
#[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_srl_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm_mask_srl_epi32(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_srl_epi32(a, 0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_srl_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm_maskz_srl_epi32(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srl_epi32(0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sra_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); + let count = _mm_set_epi32(1, 0, 0, 2); + let r = _mm512_sra_epi32(a, count); + let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sra_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16); + let count = _mm_set_epi32(0, 0, 0, 2); + let r = _mm512_mask_sra_epi32(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_sra_epi32(a, 0b11111111_11111111, a, count); + let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sra_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14); + let count = _mm_set_epi32(2, 0, 0, 2); + let r = _mm512_maskz_sra_epi32(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sra_epi32(0b00000000_11111111, a, count); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sra_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm256_mask_sra_epi32(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_sra_epi32(a, 0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sra_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm256_maskz_sra_epi32(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sra_epi32(0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sra_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm_mask_sra_epi32(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_sra_epi32(a, 0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sra_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm_maskz_sra_epi32(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sra_epi32(0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + 
assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srav_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); + let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm512_srav_epi32(a, count); + let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srav_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16); + let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); + let r = _mm512_mask_srav_epi32(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_srav_epi32(a, 0b11111111_11111111, a, count); + let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srav_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14); + let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2); + let r = _mm512_maskz_srav_epi32(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srav_epi32(0b00000000_11111111, a, count); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_srav_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm256_set1_epi32(1); + let r = _mm256_mask_srav_epi32(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_srav_epi32(a, 0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_srav_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm256_set1_epi32(1); + let r = _mm256_maskz_srav_epi32(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srav_epi32(0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_srav_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set1_epi32(1); + let r = _mm_mask_srav_epi32(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_srav_epi32(a, 0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_srav_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set1_epi32(1); + let r = _mm_maskz_srav_epi32(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srav_epi32(0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srai_epi32() { + let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, -15); + let r = _mm512_srai_epi32::<2>(a); + let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srai_epi32() { + let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15); + let r = _mm512_mask_srai_epi32::<2>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_srai_epi32::<2>(a, 
0b11111111_11111111, a); + let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srai_epi32() { + let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15); + let r = _mm512_maskz_srai_epi32::<2>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srai_epi32::<2>(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_srai_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let r = _mm256_mask_srai_epi32::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_srai_epi32::<1>(a, 0b11111111, a); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_srai_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let r = _mm256_maskz_srai_epi32::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srai_epi32::<1>(0b11111111, a); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_srai_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let r = _mm_mask_srai_epi32::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_srai_epi32::<1>(a, 0b00001111, a); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_srai_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let r = _mm_maskz_srai_epi32::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srai_epi32::<1>(0b00001111, a); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permute_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_permute_ps::<0b11_11_11_11>(a); + let e = _mm512_setr_ps( + 3., 3., 3., 3., 7., 7., 7., 7., 11., 11., 11., 11., 15., 15., 15., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permute_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_mask_permute_ps::<0b11_11_11_11>(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_permute_ps::<0b11_11_11_11>(a, 0b11111111_11111111, a); + let e = _mm512_setr_ps( + 3., 3., 3., 3., 7., 7., 7., 7., 11., 11., 11., 11., 15., 15., 15., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permute_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_maskz_permute_ps::<0b11_11_11_11>(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_permute_ps::<0b11_11_11_11>(0b11111111_11111111, a); + let e = _mm512_setr_ps( + 3., 3., 3., 3., 7., 7., 7., 7., 11., 11., 11., 11., 15., 15., 15., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permute_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_mask_permute_ps::<0b11_11_11_11>(a, 0, a); + assert_eq_m256(r, a); + let r = 
_mm256_mask_permute_ps::<0b11_11_11_11>(a, 0b11111111, a); + let e = _mm256_set_ps(0., 0., 0., 0., 4., 4., 4., 4.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permute_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_maskz_permute_ps::<0b11_11_11_11>(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_permute_ps::<0b11_11_11_11>(0b11111111, a); + let e = _mm256_set_ps(0., 0., 0., 0., 4., 4., 4., 4.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_permute_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let r = _mm_mask_permute_ps::<0b11_11_11_11>(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_permute_ps::<0b11_11_11_11>(a, 0b00001111, a); + let e = _mm_set_ps(0., 0., 0., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_permute_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let r = _mm_maskz_permute_ps::<0b11_11_11_11>(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_permute_ps::<0b11_11_11_11>(0b00001111, a); + let e = _mm_set_ps(0., 0., 0., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutevar_epi32() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_permutevar_epi32(idx, a); + let e = _mm512_set1_epi32(14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutevar_epi32() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_permutevar_epi32(a, 0, idx, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutevar_epi32(a, 0b11111111_11111111, idx, a); + let e = _mm512_set1_epi32(14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutevar_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_set1_epi32(0b01); + let r = _mm512_permutevar_ps(a, b); + let e = _mm512_set_ps( + 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutevar_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_set1_epi32(0b01); + let r = _mm512_mask_permutevar_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_permutevar_ps(a, 0b11111111_11111111, a, b); + let e = _mm512_set_ps( + 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutevar_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_set1_epi32(0b01); + let r = _mm512_maskz_permutevar_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_permutevar_ps(0b00000000_11111111, a, b); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 10., 10., 10., 10., 14., 14., 14., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutevar_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm256_set1_epi32(0b01); + let r = _mm256_mask_permutevar_ps(a, 
0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_permutevar_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(2., 2., 2., 2., 6., 6., 6., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutevar_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm256_set1_epi32(0b01); + let r = _mm256_maskz_permutevar_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_permutevar_ps(0b11111111, a, b); + let e = _mm256_set_ps(2., 2., 2., 2., 6., 6., 6., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_permutevar_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set1_epi32(0b01); + let r = _mm_mask_permutevar_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_permutevar_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(2., 2., 2., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_permutevar_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set1_epi32(0b01); + let r = _mm_maskz_permutevar_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_permutevar_ps(0b00001111, a, b); + let e = _mm_set_ps(2., 2., 2., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutexvar_epi32() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_permutexvar_epi32(idx, a); + let e = _mm512_set1_epi32(14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutexvar_epi32() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_permutexvar_epi32(a, 0, idx, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutexvar_epi32(a, 0b11111111_11111111, idx, a); + let e = _mm512_set1_epi32(14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutexvar_epi32() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_permutexvar_epi32(0, idx, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutexvar_epi32(0b00000000_11111111, idx, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_permutexvar_epi32() { + let idx = _mm256_set1_epi32(1); + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_permutexvar_epi32(idx, a); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutexvar_epi32() { + let idx = _mm256_set1_epi32(1); + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_mask_permutexvar_epi32(a, 0, idx, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_permutexvar_epi32(a, 0b11111111, idx, a); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutexvar_epi32() { + let idx = _mm256_set1_epi32(1); + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_permutexvar_epi32(0, idx, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_permutexvar_epi32(0b11111111, idx, a); + let e = 
_mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutexvar_ps() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_permutexvar_ps(idx, a); + let e = _mm512_set1_ps(14.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutexvar_ps() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_mask_permutexvar_ps(a, 0, idx, a); + assert_eq_m512(r, a); + let r = _mm512_mask_permutexvar_ps(a, 0b11111111_11111111, idx, a); + let e = _mm512_set1_ps(14.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutexvar_ps() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_maskz_permutexvar_ps(0, idx, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_permutexvar_ps(0b00000000_11111111, idx, a); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 14., 14., 14., 14., 14., 14., 14., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_permutexvar_ps() { + let idx = _mm256_set1_epi32(1); + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_permutexvar_ps(idx, a); + let e = _mm256_set1_ps(6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutexvar_ps() { + let idx = _mm256_set1_epi32(1); + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_mask_permutexvar_ps(a, 0, idx, a); + assert_eq_m256(r, a); + let r = _mm256_mask_permutexvar_ps(a, 0b11111111, idx, a); + let e = _mm256_set1_ps(6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutexvar_ps() { + let idx = _mm256_set1_epi32(1); + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_maskz_permutexvar_ps(0, idx, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_permutexvar_ps(0b11111111, idx, a); + let e = _mm256_set1_ps(6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutex2var_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1, 1 << 4, 2, 1 << 4, + 3, 1 << 4, 4, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_epi32(100); + let r = _mm512_permutex2var_epi32(a, idx, b); + let e = _mm512_set_epi32( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutex2var_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1, 1 << 4, 2, 1 << 4, + 3, 1 << 4, 4, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_epi32(100); + let r = _mm512_mask_permutex2var_epi32(a, 0, idx, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutex2var_epi32(a, 0b11111111_11111111, idx, b); + let e = _mm512_set_epi32( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_mm512_maskz_permutex2var_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1, 1 << 4, 2, 1 << 4, + 3, 1 << 4, 4, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_epi32(100); + let r = _mm512_maskz_permutex2var_epi32(0, a, idx, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutex2var_epi32(0b00000000_11111111, a, idx, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 10, 100, 9, 100, 8, 100, 7, 100); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask2_permutex2var_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1000, 1 << 4, 2000, 1 << 4, + 3000, 1 << 4, 4000, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_epi32(100); + let r = _mm512_mask2_permutex2var_epi32(a, idx, 0, b); + assert_eq_m512i(r, idx); + let r = _mm512_mask2_permutex2var_epi32(a, idx, 0b00000000_11111111, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1000, 1 << 4, 2000, 1 << 4, + 3000, 1 << 4, 4000, 1 << 4, + 10, 100, 9, 100, + 8, 100, 7, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_permutex2var_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_epi32(100); + let r = _mm256_permutex2var_epi32(a, idx, b); + let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutex2var_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_epi32(100); + let r = _mm256_mask_permutex2var_epi32(a, 0, idx, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_permutex2var_epi32(a, 0b11111111, idx, b); + let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutex2var_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_epi32(100); + let r = _mm256_maskz_permutex2var_epi32(0, a, idx, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_permutex2var_epi32(0b11111111, a, idx, b); + let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask2_permutex2var_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_epi32(100); + let r = _mm256_mask2_permutex2var_epi32(a, idx, 0, b); + assert_eq_m256i(r, idx); + let r = _mm256_mask2_permutex2var_epi32(a, idx, 0b11111111, b); + let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_permutex2var_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = _mm_set1_epi32(100); + let r = _mm_permutex2var_epi32(a, idx, b); + let e = _mm_set_epi32(2, 100, 1, 100); + assert_eq_m128i(r, e); + } + + 
#[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_permutex2var_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = _mm_set1_epi32(100); + let r = _mm_mask_permutex2var_epi32(a, 0, idx, b); + assert_eq_m128i(r, a); + let r = _mm_mask_permutex2var_epi32(a, 0b00001111, idx, b); + let e = _mm_set_epi32(2, 100, 1, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_permutex2var_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = _mm_set1_epi32(100); + let r = _mm_maskz_permutex2var_epi32(0, a, idx, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_permutex2var_epi32(0b00001111, a, idx, b); + let e = _mm_set_epi32(2, 100, 1, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask2_permutex2var_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = _mm_set1_epi32(100); + let r = _mm_mask2_permutex2var_epi32(a, idx, 0, b); + assert_eq_m128i(r, idx); + let r = _mm_mask2_permutex2var_epi32(a, idx, 0b00001111, b); + let e = _mm_set_epi32(2, 100, 1, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutex2var_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1, 1 << 4, 2, 1 << 4, + 3, 1 << 4, 4, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_ps(100.); + let r = _mm512_permutex2var_ps(a, idx, b); + let e = _mm512_set_ps( + 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutex2var_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1, 1 << 4, 2, 1 << 4, + 3, 1 << 4, 4, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_ps(100.); + let r = _mm512_mask_permutex2var_ps(a, 0, idx, b); + assert_eq_m512(r, a); + let r = _mm512_mask_permutex2var_ps(a, 0b11111111_11111111, idx, b); + let e = _mm512_set_ps( + 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutex2var_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1, 1 << 4, 2, 1 << 4, + 3, 1 << 4, 4, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_ps(100.); + let r = _mm512_maskz_permutex2var_ps(0, a, idx, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_permutex2var_ps(0b00000000_11111111, a, idx, b); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 10., 100., 9., 100., 8., 100., 7., 100., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask2_permutex2var_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1, 1 << 4, 2, 1 << 4, + 3, 1 << 4, 4, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_ps(100.); + let r = 
_mm512_mask2_permutex2var_ps(a, idx, 0, b); + assert_eq_m512(r, _mm512_castsi512_ps(idx)); + let r = _mm512_mask2_permutex2var_ps(a, idx, 0b11111111_11111111, b); + let e = _mm512_set_ps( + 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_permutex2var_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_ps(100.); + let r = _mm256_permutex2var_ps(a, idx, b); + let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutex2var_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_ps(100.); + let r = _mm256_mask_permutex2var_ps(a, 0, idx, b); + assert_eq_m256(r, a); + let r = _mm256_mask_permutex2var_ps(a, 0b11111111, idx, b); + let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutex2var_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_ps(100.); + let r = _mm256_maskz_permutex2var_ps(0, a, idx, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_permutex2var_ps(0b11111111, a, idx, b); + let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask2_permutex2var_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_ps(100.); + let r = _mm256_mask2_permutex2var_ps(a, idx, 0, b); + assert_eq_m256(r, _mm256_castsi256_ps(idx)); + let r = _mm256_mask2_permutex2var_ps(a, idx, 0b11111111, b); + let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_permutex2var_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = _mm_set1_ps(100.); + let r = _mm_permutex2var_ps(a, idx, b); + let e = _mm_set_ps(2., 100., 1., 100.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_permutex2var_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = _mm_set1_ps(100.); + let r = _mm_mask_permutex2var_ps(a, 0, idx, b); + assert_eq_m128(r, a); + let r = _mm_mask_permutex2var_ps(a, 0b00001111, idx, b); + let e = _mm_set_ps(2., 100., 1., 100.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_permutex2var_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = _mm_set1_ps(100.); + let r = _mm_maskz_permutex2var_ps(0, a, idx, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_permutex2var_ps(0b00001111, a, idx, b); + let e = _mm_set_ps(2., 100., 1., 100.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask2_permutex2var_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = 
_mm_set1_ps(100.); + let r = _mm_mask2_permutex2var_ps(a, idx, 0, b); + assert_eq_m128(r, _mm_castsi128_ps(idx)); + let r = _mm_mask2_permutex2var_ps(a, idx, 0b00001111, b); + let e = _mm_set_ps(2., 100., 1., 100.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_epi32() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let r = _mm512_shuffle_epi32::<_MM_PERM_AADD>(a); + let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_epi32() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let r = _mm512_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b11111111_11111111, a); + let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_epi32() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let r = _mm512_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b00000000_11111111, a); + let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_epi32() { + let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); + let r = _mm256_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b11111111, a); + let e = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_epi32() { + let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); + let r = _mm256_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b11111111, a); + let e = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_shuffle_epi32() { + let a = _mm_set_epi32(1, 4, 5, 8); + let r = _mm_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b00001111, a); + let e = _mm_set_epi32(8, 8, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_shuffle_epi32() { + let a = _mm_set_epi32(1, 4, 5, 8); + let r = _mm_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b00001111, a); + let e = _mm_set_epi32(8, 8, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_ps() { + let a = _mm512_setr_ps( + 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., + ); + let b = _mm512_setr_ps( + 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., + ); + let r = _mm512_shuffle_ps::<0b00_00_11_11>(a, b); + let e = _mm512_setr_ps( + 8., 8., 2., 2., 16., 16., 10., 10., 8., 8., 2., 2., 16., 16., 10., 10., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_ps() { + let a = _mm512_setr_ps( + 
1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., + ); + let b = _mm512_setr_ps( + 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., + ); + let r = _mm512_mask_shuffle_ps::<0b00_00_11_11>(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_shuffle_ps::<0b00_00_11_11>(a, 0b11111111_11111111, a, b); + let e = _mm512_setr_ps( + 8., 8., 2., 2., 16., 16., 10., 10., 8., 8., 2., 2., 16., 16., 10., 10., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_ps() { + let a = _mm512_setr_ps( + 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., + ); + let b = _mm512_setr_ps( + 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., + ); + let r = _mm512_maskz_shuffle_ps::<0b00_00_11_11>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_shuffle_ps::<0b00_00_11_11>(0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 8., 8., 2., 2., 16., 16., 10., 10., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_ps() { + let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_mask_shuffle_ps::<0b11_11_11_11>(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_shuffle_ps::<0b00_00_11_11>(a, 0b11111111, a, b); + let e = _mm256_set_ps(7., 7., 1., 1., 15., 15., 9., 9.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_ps() { + let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_maskz_shuffle_ps::<0b11_11_11_11>(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_shuffle_ps::<0b00_00_11_11>(0b11111111, a, b); + let e = _mm256_set_ps(7., 7., 1., 1., 15., 15., 9., 9.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_shuffle_ps() { + let a = _mm_set_ps(1., 4., 5., 8.); + let b = _mm_set_ps(2., 3., 6., 7.); + let r = _mm_mask_shuffle_ps::<0b11_11_11_11>(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_shuffle_ps::<0b00_00_11_11>(a, 0b00001111, a, b); + let e = _mm_set_ps(7., 7., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_shuffle_ps() { + let a = _mm_set_ps(1., 4., 5., 8.); + let b = _mm_set_ps(2., 3., 6., 7.); + let r = _mm_maskz_shuffle_ps::<0b11_11_11_11>(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_shuffle_ps::<0b00_00_11_11>(0b00001111, a, b); + let e = _mm_set_ps(7., 7., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_i32x4() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm512_shuffle_i32x4::<0b00_00_00_00>(a, b); + let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_i32x4() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm512_mask_shuffle_i32x4::<0b00_00_00_00>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = 
_mm512_mask_shuffle_i32x4::<0b00_00_00_00>(a, 0b11111111_11111111, a, b); + let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_i32x4() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm512_maskz_shuffle_i32x4::<0b00_00_00_00>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shuffle_i32x4::<0b00_00_00_00>(0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_shuffle_i32x4() { + let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm256_shuffle_i32x4::<0b00>(a, b); + let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_i32x4() { + let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm256_mask_shuffle_i32x4::<0b00>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shuffle_i32x4::<0b00>(a, 0b11111111, a, b); + let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_i32x4() { + let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm256_maskz_shuffle_i32x4::<0b00>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shuffle_i32x4::<0b00>(0b11111111, a, b); + let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_f32x4() { + let a = _mm512_setr_ps( + 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., + ); + let b = _mm512_setr_ps( + 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., + ); + let r = _mm512_shuffle_f32x4::<0b00_00_00_00>(a, b); + let e = _mm512_setr_ps( + 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_f32x4() { + let a = _mm512_setr_ps( + 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., + ); + let b = _mm512_setr_ps( + 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., + ); + let r = _mm512_mask_shuffle_f32x4::<0b00_00_00_00>(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_shuffle_f32x4::<0b00_00_00_00>(a, 0b11111111_11111111, a, b); + let e = _mm512_setr_ps( + 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_f32x4() { + let a = _mm512_setr_ps( + 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., + ); + let b = _mm512_setr_ps( + 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., + ); + let r = _mm512_maskz_shuffle_f32x4::<0b00_00_00_00>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_shuffle_f32x4::<0b00_00_00_00>(0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 1., 4., 5., 8., 1., 4., 5., 8., 
0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_shuffle_f32x4() { + let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_shuffle_f32x4::<0b00>(a, b); + let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_f32x4() { + let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_mask_shuffle_f32x4::<0b00>(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_shuffle_f32x4::<0b00>(a, 0b11111111, a, b); + let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_f32x4() { + let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_maskz_shuffle_f32x4::<0b00>(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_shuffle_f32x4::<0b00>(0b11111111, a, b); + let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_extractf32x4_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_extractf32x4_ps::<1>(a); + let e = _mm_setr_ps(5., 6., 7., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_extractf32x4_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let src = _mm_set1_ps(100.); + let r = _mm512_mask_extractf32x4_ps::<1>(src, 0, a); + assert_eq_m128(r, src); + let r = _mm512_mask_extractf32x4_ps::<1>(src, 0b11111111, a); + let e = _mm_setr_ps(5., 6., 7., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_extractf32x4_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_maskz_extractf32x4_ps::<1>(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm512_maskz_extractf32x4_ps::<1>(0b00000001, a); + let e = _mm_setr_ps(5., 0., 0., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_extractf32x4_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_extractf32x4_ps::<1>(a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_extractf32x4_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let src = _mm_set1_ps(100.); + let r = _mm256_mask_extractf32x4_ps::<1>(src, 0, a); + assert_eq_m128(r, src); + let r = _mm256_mask_extractf32x4_ps::<1>(src, 0b00001111, a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_extractf32x4_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_maskz_extractf32x4_ps::<1>(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm256_maskz_extractf32x4_ps::<1>(0b00001111, a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_extracti32x4_epi32() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_extracti32x4_epi32::<1>(a); + let e = _mm_setr_epi32(5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_extracti32x4_epi32() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm_set1_epi32(100); + let r = _mm512_mask_extracti32x4_epi32::<1>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_extracti32x4_epi32::<1>(src, 0b11111111, a); + let e = _mm_setr_epi32(5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm512_maskz_extracti32x4_epi32() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_extracti32x4_epi32::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_extracti32x4_epi32::<1>(0b00000001, a); + let e = _mm_setr_epi32(5, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_extracti32x4_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_extracti32x4_epi32::<1>(a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_extracti32x4_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set1_epi32(100); + let r = _mm256_mask_extracti32x4_epi32::<1>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_extracti32x4_epi32::<1>(src, 0b00001111, a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_extracti32x4_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_extracti32x4_epi32::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_extracti32x4_epi32::<1>(0b00001111, a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_moveldup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_moveldup_ps(a); + let e = _mm512_setr_ps( + 1., 1., 3., 3., 5., 5., 7., 7., 9., 9., 11., 11., 13., 13., 15., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_moveldup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_mask_moveldup_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_moveldup_ps(a, 0b11111111_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 3., 3., 5., 5., 7., 7., 9., 9., 11., 11., 13., 13., 15., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_moveldup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_maskz_moveldup_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_moveldup_ps(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 3., 3., 5., 5., 7., 7., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_moveldup_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_mask_moveldup_ps(a, 0, a); + assert_eq_m256(r, a); 
+ let r = _mm256_mask_moveldup_ps(a, 0b11111111, a); + let e = _mm256_set_ps(2., 2., 4., 4., 6., 6., 8., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_moveldup_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_maskz_moveldup_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_moveldup_ps(0b11111111, a); + let e = _mm256_set_ps(2., 2., 4., 4., 6., 6., 8., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_moveldup_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_mask_moveldup_ps(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_moveldup_ps(a, 0b00001111, a); + let e = _mm_set_ps(2., 2., 4., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_moveldup_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_maskz_moveldup_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_moveldup_ps(0b00001111, a); + let e = _mm_set_ps(2., 2., 4., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_movehdup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_movehdup_ps(a); + let e = _mm512_setr_ps( + 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_movehdup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_mask_movehdup_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_movehdup_ps(a, 0b11111111_11111111, a); + let e = _mm512_setr_ps( + 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_movehdup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_maskz_movehdup_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_movehdup_ps(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 2., 2., 4., 4., 6., 6., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_movehdup_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_mask_movehdup_ps(a, 0, a); + assert_eq_m256(r, a); + let r = _mm256_mask_movehdup_ps(a, 0b11111111, a); + let e = _mm256_set_ps(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_movehdup_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_maskz_movehdup_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_movehdup_ps(0b11111111, a); + let e = _mm256_set_ps(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_movehdup_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_mask_movehdup_ps(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_movehdup_ps(a, 0b00001111, a); + let e = _mm_set_ps(1., 1., 3., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_movehdup_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = 
_mm_maskz_movehdup_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_movehdup_ps(0b00001111, a); + let e = _mm_set_ps(1., 1., 3., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_inserti32x4() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_setr_epi32(17, 18, 19, 20); + let r = _mm512_inserti32x4::<0>(a, b); + let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_inserti32x4() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_setr_epi32(17, 18, 19, 20); + let r = _mm512_mask_inserti32x4::<0>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_inserti32x4::<0>(a, 0b11111111_11111111, a, b); + let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_inserti32x4() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_setr_epi32(17, 18, 19, 20); + let r = _mm512_maskz_inserti32x4::<0>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_inserti32x4::<0>(0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_inserti32x4() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_inserti32x4::<1>(a, b); + let e = _mm256_set_epi32(17, 18, 19, 20, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_inserti32x4() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_mask_inserti32x4::<0>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_inserti32x4::<1>(a, 0b11111111, a, b); + let e = _mm256_set_epi32(17, 18, 19, 20, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_inserti32x4() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_maskz_inserti32x4::<0>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_inserti32x4::<1>(0b11111111, a, b); + let e = _mm256_set_epi32(17, 18, 19, 20, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_insertf32x4() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm_setr_ps(17., 18., 19., 20.); + let r = _mm512_insertf32x4::<0>(a, b); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_insertf32x4() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm_setr_ps(17., 18., 19., 20.); + let r = _mm512_mask_insertf32x4::<0>(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_insertf32x4::<0>(a, 0b11111111_11111111, a, b); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + assert_eq_m512(r, e); + } + + 
#[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_insertf32x4() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm_setr_ps(17., 18., 19., 20.); + let r = _mm512_maskz_insertf32x4::<0>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_insertf32x4::<0>(0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_insertf32x4() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_insertf32x4::<1>(a, b); + let e = _mm256_set_ps(17., 18., 19., 20., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_insertf32x4() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_mask_insertf32x4::<0>(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_insertf32x4::<1>(a, 0b11111111, a, b); + let e = _mm256_set_ps(17., 18., 19., 20., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_insertf32x4() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_maskz_insertf32x4::<0>(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_insertf32x4::<1>(0b11111111, a, b); + let e = _mm256_set_ps(17., 18., 19., 20., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps128_ps512() { + let a = _mm_setr_ps(17., 18., 19., 20.); + let r = _mm512_castps128_ps512(a); + assert_eq_m128(_mm512_castps512_ps128(r), a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps256_ps512() { + let a = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_castps256_ps512(a); + assert_eq_m256(_mm512_castps512_ps256(r), a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_zextps128_ps512() { + let a = _mm_setr_ps(17., 18., 19., 20.); + let r = _mm512_zextps128_ps512(a); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_zextps256_ps512() { + let a = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_zextps256_ps512(a); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps512_ps128() { + let a = _mm512_setr_ps( + 17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., + ); + let r = _mm512_castps512_ps128(a); + let e = _mm_setr_ps(17., 18., 19., 20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps512_ps256() { + let a = _mm512_setr_ps( + 17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1., + ); + let r = _mm512_castps512_ps256(a); + let e = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps_pd() { + let a = _mm512_set1_ps(1.); + let r = _mm512_castps_pd(a); + let e = _mm512_set1_pd(0.007812501848093234); + assert_eq_m512d(r, 
e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps_si512() { + let a = _mm512_set1_ps(1.); + let r = _mm512_castps_si512(a); + let e = _mm512_set1_epi32(1065353216); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcastd_epi32() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm512_broadcastd_epi32(a); + let e = _mm512_set1_epi32(20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcastd_epi32() { + let src = _mm512_set1_epi32(20); + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm512_mask_broadcastd_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_broadcastd_epi32(src, 0b11111111_11111111, a); + let e = _mm512_set1_epi32(20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcastd_epi32() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm512_maskz_broadcastd_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_broadcastd_epi32(0b00000000_11111111, a); + let e = _mm512_setr_epi32(20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_broadcastd_epi32() { + let src = _mm256_set1_epi32(20); + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_mask_broadcastd_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_broadcastd_epi32(src, 0b11111111, a); + let e = _mm256_set1_epi32(20); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_broadcastd_epi32() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_maskz_broadcastd_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_broadcastd_epi32(0b11111111, a); + let e = _mm256_set1_epi32(20); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_broadcastd_epi32() { + let src = _mm_set1_epi32(20); + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_mask_broadcastd_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_broadcastd_epi32(src, 0b00001111, a); + let e = _mm_set1_epi32(20); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_broadcastd_epi32() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_maskz_broadcastd_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_broadcastd_epi32(0b00001111, a); + let e = _mm_set1_epi32(20); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcastss_ps() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm512_broadcastss_ps(a); + let e = _mm512_set1_ps(20.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcastss_ps() { + let src = _mm512_set1_ps(20.); + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm512_mask_broadcastss_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_broadcastss_ps(src, 0b11111111_11111111, a); + let e = _mm512_set1_ps(20.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcastss_ps() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm512_maskz_broadcastss_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_broadcastss_ps(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 20., 20., 20., 20., 20., 20., 20., 20., 0., 0., 0., 
0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_broadcastss_ps() { + let src = _mm256_set1_ps(20.); + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_mask_broadcastss_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_broadcastss_ps(src, 0b11111111, a); + let e = _mm256_set1_ps(20.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_broadcastss_ps() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_maskz_broadcastss_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_broadcastss_ps(0b11111111, a); + let e = _mm256_set1_ps(20.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_broadcastss_ps() { + let src = _mm_set1_ps(20.); + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_mask_broadcastss_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_broadcastss_ps(src, 0b00001111, a); + let e = _mm_set1_ps(20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_broadcastss_ps() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_maskz_broadcastss_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_broadcastss_ps(0b00001111, a); + let e = _mm_set1_ps(20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcast_i32x4() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm512_broadcast_i32x4(a); + let e = _mm512_set_epi32( + 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcast_i32x4() { + let src = _mm512_set1_epi32(20); + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm512_mask_broadcast_i32x4(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_broadcast_i32x4(src, 0b11111111_11111111, a); + let e = _mm512_set_epi32( + 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcast_i32x4() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm512_maskz_broadcast_i32x4(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_broadcast_i32x4(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_broadcast_i32x4() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_broadcast_i32x4(a); + let e = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_broadcast_i32x4() { + let src = _mm256_set1_epi32(20); + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_mask_broadcast_i32x4(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_broadcast_i32x4(src, 0b11111111, a); + let e = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_broadcast_i32x4() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_maskz_broadcast_i32x4(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_broadcast_i32x4(0b11111111, a); + let e = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m256i(r, e); + } 
+ + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcast_f32x4() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm512_broadcast_f32x4(a); + let e = _mm512_set_ps( + 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcast_f32x4() { + let src = _mm512_set1_ps(20.); + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm512_mask_broadcast_f32x4(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_broadcast_f32x4(src, 0b11111111_11111111, a); + let e = _mm512_set_ps( + 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcast_f32x4() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm512_maskz_broadcast_f32x4(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_broadcast_f32x4(0b00000000_11111111, a); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 17., 18., 19., 20., 17., 18., 19., 20., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_broadcast_f32x4() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_broadcast_f32x4(a); + let e = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_broadcast_f32x4() { + let src = _mm256_set1_ps(20.); + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_mask_broadcast_f32x4(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_broadcast_f32x4(src, 0b11111111, a); + let e = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_broadcast_f32x4() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_maskz_broadcast_f32x4(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_broadcast_f32x4(0b11111111, a); + let e = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_blend_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(2); + let r = _mm512_mask_blend_epi32(0b11111111_00000000, a, b); + let e = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_blend_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(2); + let r = _mm256_mask_blend_epi32(0b11111111, a, b); + let e = _mm256_set1_epi32(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_blend_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(2); + let r = _mm_mask_blend_epi32(0b00001111, a, b); + let e = _mm_set1_epi32(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_blend_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(2.); + let r = _mm512_mask_blend_ps(0b11111111_00000000, a, b); + let e = _mm512_set_ps( + 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_blend_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set1_ps(2.); + let r = _mm256_mask_blend_ps(0b11111111, a, b); + let e = 
_mm256_set1_ps(2.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_blend_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let r = _mm_mask_blend_ps(0b00001111, a, b); + let e = _mm_set1_ps(2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpackhi_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_unpackhi_epi32(a, b); + let e = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpackhi_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_mask_unpackhi_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpackhi_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpackhi_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_unpackhi_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpackhi_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 25, 9, 26, 10, 29, 13, 30, 14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpackhi_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_mask_unpackhi_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpackhi_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(17, 1, 18, 2, 21, 5, 22, 6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpackhi_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_maskz_unpackhi_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpackhi_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(17, 1, 18, 2, 21, 5, 22, 6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpackhi_epi32() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_mask_unpackhi_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpackhi_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(17, 1, 18, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpackhi_epi32() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_maskz_unpackhi_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpackhi_epi32(0b00001111, a, b); + let e = _mm_set_epi32(17, 1, 18, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpackhi_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + 
); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_unpackhi_ps(a, b); + let e = _mm512_set_ps( + 17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpackhi_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_mask_unpackhi_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_unpackhi_ps(a, 0b11111111_11111111, a, b); + let e = _mm512_set_ps( + 17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpackhi_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_maskz_unpackhi_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_unpackhi_ps(0b00000000_11111111, a, b); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 25., 9., 26., 10., 29., 13., 30., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpackhi_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm256_mask_unpackhi_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_unpackhi_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(17., 1., 18., 2., 21., 5., 22., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpackhi_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm256_maskz_unpackhi_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_unpackhi_ps(0b11111111, a, b); + let e = _mm256_set_ps(17., 1., 18., 2., 21., 5., 22., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpackhi_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_mask_unpackhi_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_unpackhi_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(17., 1., 18., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpackhi_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_maskz_unpackhi_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_unpackhi_ps(0b00001111, a, b); + let e = _mm_set_ps(17., 1., 18., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpacklo_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_unpacklo_epi32(a, b); + let e = _mm512_set_epi32(19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpacklo_epi32() { + let a = 
_mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_mask_unpacklo_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpacklo_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set_epi32(19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpacklo_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_unpacklo_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpacklo_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 27, 11, 28, 12, 31, 15, 32, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpacklo_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_mask_unpacklo_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpacklo_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(19, 3, 20, 4, 23, 7, 24, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpacklo_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_maskz_unpacklo_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpacklo_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(19, 3, 20, 4, 23, 7, 24, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpacklo_epi32() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_mask_unpacklo_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpacklo_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(19, 3, 20, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpacklo_epi32() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_maskz_unpacklo_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpacklo_epi32(0b00001111, a, b); + let e = _mm_set_epi32(19, 3, 20, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpacklo_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_unpacklo_ps(a, b); + let e = _mm512_set_ps( + 19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 12., 31., 15., 32., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpacklo_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_mask_unpacklo_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_unpacklo_ps(a, 0b11111111_11111111, a, b); + let e = _mm512_set_ps( + 19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 
12., 31., 15., 32., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpacklo_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_maskz_unpacklo_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_unpacklo_ps(0b00000000_11111111, a, b); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 27., 11., 28., 12., 31., 15., 32., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpacklo_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm256_mask_unpacklo_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_unpacklo_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(19., 3., 20., 4., 23., 7., 24., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpacklo_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm256_maskz_unpacklo_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_unpacklo_ps(0b11111111, a, b); + let e = _mm256_set_ps(19., 3., 20., 4., 23., 7., 24., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpacklo_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_mask_unpacklo_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_unpacklo_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(19., 3., 20., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpacklo_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_maskz_unpacklo_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_unpacklo_ps(0b00001111, a, b); + let e = _mm_set_ps(19., 3., 20., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_alignr_epi32() { + let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm512_set_epi32( + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + ); + let r = _mm512_alignr_epi32::<0>(a, b); + assert_eq_m512i(r, b); + let r = _mm512_alignr_epi32::<16>(a, b); + assert_eq_m512i(r, b); + let r = _mm512_alignr_epi32::<1>(a, b); + let e = _mm512_set_epi32( + 1, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_alignr_epi32() { + let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm512_set_epi32( + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + ); + let r = _mm512_mask_alignr_epi32::<1>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_alignr_epi32::<1>(a, 0b11111111_11111111, a, b); + let e = _mm512_set_epi32( + 1, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_alignr_epi32() { + let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm512_set_epi32( + 32, 31, 30, 29, 28, 27, 
26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + ); + let r = _mm512_maskz_alignr_epi32::<1>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_alignr_epi32::<1>(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 25, 24, 23, 22, 21, 20, 19, 18); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_alignr_epi32() { + let a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9); + let r = _mm256_alignr_epi32::<0>(a, b); + assert_eq_m256i(r, b); + let r = _mm256_alignr_epi32::<1>(a, b); + let e = _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_alignr_epi32() { + let a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9); + let r = _mm256_mask_alignr_epi32::<1>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_alignr_epi32::<1>(a, 0b11111111, a, b); + let e = _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_alignr_epi32() { + let a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9); + let r = _mm256_maskz_alignr_epi32::<1>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_alignr_epi32::<1>(0b11111111, a, b); + let e = _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_alignr_epi32() { + let a = _mm_set_epi32(4, 3, 2, 1); + let b = _mm_set_epi32(8, 7, 6, 5); + let r = _mm_alignr_epi32::<0>(a, b); + assert_eq_m128i(r, b); + let r = _mm_alignr_epi32::<1>(a, b); + let e = _mm_set_epi32(1, 8, 7, 6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_alignr_epi32() { + let a = _mm_set_epi32(4, 3, 2, 1); + let b = _mm_set_epi32(8, 7, 6, 5); + let r = _mm_mask_alignr_epi32::<1>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_alignr_epi32::<1>(a, 0b00001111, a, b); + let e = _mm_set_epi32(1, 8, 7, 6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_alignr_epi32() { + let a = _mm_set_epi32(4, 3, 2, 1); + let b = _mm_set_epi32(8, 7, 6, 5); + let r = _mm_maskz_alignr_epi32::<1>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_alignr_epi32::<1>(0b00001111, a, b); + let e = _mm_set_epi32(1, 8, 7, 6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_and_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_and_epi32(a, b); + let e = _mm512_set_epi32(1 << 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_and_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_mask_and_epi32(a, 0, a, b); + assert_eq_m512i(r, 
a); + let r = _mm512_mask_and_epi32(a, 0b01111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_and_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_maskz_and_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_and_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_and_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_mask_and_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_and_epi32(a, 0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_and_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_maskz_and_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_and_epi32(0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_and_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_mask_and_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_and_epi32(a, 0b00001111, a, b); + let e = _mm_set1_epi32(1 << 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_and_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_maskz_and_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_and_epi32(0b00001111, a, b); + let e = _mm_set1_epi32(1 << 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_and_si512() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_and_epi32(a, b); + let e = _mm512_set_epi32(1 << 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_or_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_or_epi32(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_or_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = 
_mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_mask_or_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_or_epi32(a, 0b11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_or_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_maskz_or_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_or_epi32(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_or_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_or_epi32(a, b); + let e = _mm256_set1_epi32(1 << 1 | 1 << 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_or_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_mask_or_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_or_epi32(a, 0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 1 | 1 << 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_or_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_maskz_or_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_or_epi32(0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 1 | 1 << 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_or_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_or_epi32(a, b); + let e = _mm_set1_epi32(1 << 1 | 1 << 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_or_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_mask_or_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_or_epi32(a, 0b00001111, a, b); + let e = _mm_set1_epi32(1 << 1 | 1 << 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_or_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_maskz_or_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_or_epi32(0b00001111, a, b); + let e = _mm_set1_epi32(1 << 1 | 1 << 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_or_si512() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_or_epi32(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 
4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_xor_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_xor_epi32(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_xor_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_mask_xor_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_xor_epi32(a, 0b01111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_xor_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_maskz_xor_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_xor_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_xor_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_xor_epi32(a, b); + let e = _mm256_set1_epi32(1 << 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_xor_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_mask_xor_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_xor_epi32(a, 0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_xor_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_maskz_xor_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_xor_epi32(0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_xor_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_xor_epi32(a, b); + let e = _mm_set1_epi32(1 << 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_xor_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_mask_xor_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_xor_epi32(a, 0b00001111, a, b); + let e = _mm_set1_epi32(1 << 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_xor_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = 
_mm_set1_epi32(1 << 1); + let r = _mm_maskz_xor_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_xor_epi32(0b00001111, a, b); + let e = _mm_set1_epi32(1 << 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_xor_si512() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_xor_si512(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_andnot_epi32() { + let a = _mm512_set1_epi32(0); + let b = _mm512_set1_epi32(1 << 3 | 1 << 4); + let r = _mm512_andnot_epi32(a, b); + let e = _mm512_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_andnot_epi32() { + let a = _mm512_set1_epi32(1 << 1 | 1 << 2); + let b = _mm512_set1_epi32(1 << 3 | 1 << 4); + let r = _mm512_mask_andnot_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_andnot_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_andnot_epi32() { + let a = _mm512_set1_epi32(1 << 1 | 1 << 2); + let b = _mm512_set1_epi32(1 << 3 | 1 << 4); + let r = _mm512_maskz_andnot_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_andnot_epi32(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 0, 0, 0, 0, + 0, 0, 0, 0, + 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_andnot_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 3 | 1 << 4); + let r = _mm256_mask_andnot_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_andnot_epi32(a, 0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_andnot_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 3 | 1 << 4); + let r = _mm256_maskz_andnot_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_andnot_epi32(0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_andnot_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 3 | 1 << 4); + let r = _mm_mask_andnot_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_andnot_epi32(a, 0b00001111, a, b); + let e = _mm_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_andnot_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 3 | 1 << 4); + let r = _mm_maskz_andnot_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_andnot_epi32(0b00001111, a, b); + let e = _mm_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_cvtmask16_u32() { + let a: __mmask16 = 0b11001100_00110011; + let r = _cvtmask16_u32(a); + let e: u32 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_cvtu32_mask16() { + let a: u32 = 0b11001100_00110011; + let r = _cvtu32_mask16(a); + let e: __mmask16 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kand() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b11001100_00110011; + let r = _mm512_kand(a, b); + let e: u16 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_kand_mask16() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b11001100_00110011; + let r = _kand_mask16(a, b); + let e: u16 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kor() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kor(a, b); + let e: u16 = 0b11101110_00111011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_kor_mask16() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _kor_mask16(a, b); + let e: u16 = 0b11101110_00111011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kxor() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kxor(a, b); + let e: u16 = 0b11100010_00111000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_kxor_mask16() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _kxor_mask16(a, b); + let e: u16 = 0b11100010_00111000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_knot() { + let a: u16 = 0b11001100_00110011; + let r = _mm512_knot(a); + let e: u16 = 0b00110011_11001100; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_knot_mask16() { + let a: u16 = 0b11001100_00110011; + let r = _knot_mask16(a); + let e: u16 = 0b00110011_11001100; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kandn() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kandn(a, b); + let e: u16 = 0b00100010_00001000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_kandn_mask16() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _kandn_mask16(a, b); + let e: u16 = 0b00100010_00001000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kxnor() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kxnor(a, b); + let e: u16 = 0b00011101_11000111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_kxnor_mask16() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _kxnor_mask16(a, b); + let e: u16 = 0b00011101_11000111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortest_mask16_u8() { + let a: __mmask16 = 0b0110100101101001; + let b: __mmask16 = 0b1011011010110110; + let mut all_ones: u8 = 0; + let r = _kortest_mask16_u8(a, b, &mut all_ones); + assert_eq!(r, 0); + assert_eq!(all_ones, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortestc_mask16_u8() { + let a: __mmask16 = 0b0110100101101001; + let b: __mmask16 = 0b1011011010110110; + let r = 
_kortestc_mask16_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortestz_mask16_u8() { + let a: __mmask16 = 0b0110100101101001; + let b: __mmask16 = 0b1011011010110110; + let r = _kortestz_mask16_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kshiftli_mask16() { + let a: __mmask16 = 0b1001011011000011; + let r = _kshiftli_mask16::<3>(a); + let e: __mmask16 = 0b1011011000011000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kshiftri_mask16() { + let a: __mmask16 = 0b0110100100111100; + let r = _kshiftri_mask16::<3>(a); + let e: __mmask16 = 0b0000110100100111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_load_mask16() { + let a: __mmask16 = 0b1001011011000011; + let r = _load_mask16(&a); + let e: __mmask16 = 0b1001011011000011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_store_mask16() { + let a: __mmask16 = 0b0110100100111100; + let mut r = 0; + _store_mask16(&mut r, a); + let e: __mmask16 = 0b0110100100111100; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kmov() { + let a: u16 = 0b11001100_00110011; + let r = _mm512_kmov(a); + let e: u16 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_int2mask() { + let a: i32 = 0b11001100_00110011; + let r = _mm512_int2mask(a); + let e: u16 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask2int() { + let k1: __mmask16 = 0b11001100_00110011; + let r = _mm512_mask2int(k1); + let e: i32 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kunpackb() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kunpackb(a, b); + let e: u16 = 0b00110011_00001011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kortestc() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kortestc(a, b); + assert_eq!(r, 0); + let b: u16 = 0b11111111_11111111; + let r = _mm512_kortestc(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kortestz() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kortestz(a, b); + assert_eq!(r, 0); + let r = _mm512_kortestz(0, 0); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_test_epi32_mask() { + let a = _mm512_set1_epi32(1 << 0); + let b = _mm512_set1_epi32(1 << 0 | 1 << 1); + let r = _mm512_test_epi32_mask(a, b); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_test_epi32_mask() { + let a = _mm512_set1_epi32(1 << 0); + let b = _mm512_set1_epi32(1 << 0 | 1 << 1); + let r = _mm512_mask_test_epi32_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_test_epi32_mask(0b11111111_11111111, a, b); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_test_epi32_mask() { + let a = _mm256_set1_epi32(1 << 0); + let b = _mm256_set1_epi32(1 << 0 | 1 << 1); + let r = _mm256_test_epi32_mask(a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_test_epi32_mask() { + let a = _mm256_set1_epi32(1 << 0); + let 
b = _mm256_set1_epi32(1 << 0 | 1 << 1); + let r = _mm256_mask_test_epi32_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm256_mask_test_epi32_mask(0b11111111, a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_test_epi32_mask() { + let a = _mm_set1_epi32(1 << 0); + let b = _mm_set1_epi32(1 << 0 | 1 << 1); + let r = _mm_test_epi32_mask(a, b); + let e: __mmask8 = 0b00001111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_test_epi32_mask() { + let a = _mm_set1_epi32(1 << 0); + let b = _mm_set1_epi32(1 << 0 | 1 << 1); + let r = _mm_mask_test_epi32_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm_mask_test_epi32_mask(0b11111111, a, b); + let e: __mmask8 = 0b00001111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_testn_epi32_mask() { + let a = _mm512_set1_epi32(1 << 0); + let b = _mm512_set1_epi32(1 << 0 | 1 << 1); + let r = _mm512_testn_epi32_mask(a, b); + let e: __mmask16 = 0b00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_testn_epi32_mask() { + let a = _mm512_set1_epi32(1 << 0); + let b = _mm512_set1_epi32(1 << 1); + let r = _mm512_mask_testn_epi32_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_testn_epi32_mask(0b11111111_11111111, a, b); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_testn_epi32_mask() { + let a = _mm256_set1_epi32(1 << 0); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_testn_epi32_mask(a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_testn_epi32_mask() { + let a = _mm256_set1_epi32(1 << 0); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_mask_testn_epi32_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm256_mask_testn_epi32_mask(0b11111111, a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_testn_epi32_mask() { + let a = _mm_set1_epi32(1 << 0); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_testn_epi32_mask(a, b); + let e: __mmask8 = 0b00001111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_testn_epi32_mask() { + let a = _mm_set1_epi32(1 << 0); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_mask_testn_epi32_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm_mask_testn_epi32_mask(0b11111111, a, b); + let e: __mmask8 = 0b00001111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + #[cfg_attr(miri, ignore)] + unsafe fn test_mm512_stream_ps() { + #[repr(align(64))] + struct Memory { + pub data: [f32; 16], // 64 bytes + } + let a = _mm512_set1_ps(7.0); + let mut mem = Memory { data: [-1.0; 16] }; + + _mm512_stream_ps(&mut mem.data[0] as *mut f32, a); + for i in 0..16 { + assert_eq!(mem.data[i], get_m512(a, i)); + } + } + + #[simd_test(enable = "avx512f")] + #[cfg_attr(miri, ignore)] + unsafe fn test_mm512_stream_pd() { + #[repr(align(64))] + struct Memory { + pub data: [f64; 8], + } + let a = _mm512_set1_pd(7.0); + let mut mem = Memory { data: [-1.0; 8] }; + + _mm512_stream_pd(&mut mem.data[0] as *mut f64, a); + for i in 0..8 { + assert_eq!(mem.data[i], get_m512d(a, i)); + } + } + + #[simd_test(enable = "avx512f")] + #[cfg_attr(miri, ignore)] + unsafe fn test_mm512_stream_si512() { + #[repr(align(64))] + struct Memory { + pub data: 
[i64; 8], + } + let a = _mm512_set1_epi32(7); + let mut mem = Memory { data: [-1; 8] }; + + _mm512_stream_si512(mem.data.as_mut_ptr().cast(), a); + for i in 0..8 { + assert_eq!(mem.data[i], get_m512i(a, i)); + } + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_stream_load_si512() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_stream_load_si512(core::ptr::addr_of!(a) as *const _); + assert_eq_m512i(a, r); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_add_epi32() { + let a = _mm512_set1_epi32(1); + let e: i32 = _mm512_reduce_add_epi32(a); + assert_eq!(16, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_add_epi32() { + let a = _mm512_set1_epi32(1); + let e: i32 = _mm512_mask_reduce_add_epi32(0b11111111_00000000, a); + assert_eq!(8, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_add_ps() { + let a = _mm512_set1_ps(1.); + let e: f32 = _mm512_reduce_add_ps(a); + assert_eq!(16., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_add_ps() { + let a = _mm512_set1_ps(1.); + let e: f32 = _mm512_mask_reduce_add_ps(0b11111111_00000000, a); + assert_eq!(8., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_mul_epi32() { + let a = _mm512_set1_epi32(2); + let e: i32 = _mm512_reduce_mul_epi32(a); + assert_eq!(65536, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_mul_epi32() { + let a = _mm512_set1_epi32(2); + let e: i32 = _mm512_mask_reduce_mul_epi32(0b11111111_00000000, a); + assert_eq!(256, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_mul_ps() { + let a = _mm512_set1_ps(2.); + let e: f32 = _mm512_reduce_mul_ps(a); + assert_eq!(65536., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_mul_ps() { + let a = _mm512_set1_ps(2.); + let e: f32 = _mm512_mask_reduce_mul_ps(0b11111111_00000000, a); + assert_eq!(256., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_max_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i32 = _mm512_reduce_max_epi32(a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_max_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i32 = _mm512_mask_reduce_max_epi32(0b11111111_00000000, a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_max_epu32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u32 = _mm512_reduce_max_epu32(a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_max_epu32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u32 = _mm512_mask_reduce_max_epu32(0b11111111_00000000, a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_max_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let e: f32 = _mm512_reduce_max_ps(a); + assert_eq!(15., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_max_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let e: f32 = _mm512_mask_reduce_max_ps(0b11111111_00000000, a); + assert_eq!(7., e); + } + + #[simd_test(enable = "avx512f")] + unsafe 
fn test_mm512_reduce_min_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i32 = _mm512_reduce_min_epi32(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_min_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i32 = _mm512_mask_reduce_min_epi32(0b11111111_00000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_min_epu32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u32 = _mm512_reduce_min_epu32(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_min_epu32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u32 = _mm512_mask_reduce_min_epu32(0b11111111_00000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_min_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let e: f32 = _mm512_reduce_min_ps(a); + assert_eq!(0., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_min_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let e: f32 = _mm512_mask_reduce_min_ps(0b11111111_00000000, a); + assert_eq!(0., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_and_epi32() { + let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e: i32 = _mm512_reduce_and_epi32(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_and_epi32() { + let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e: i32 = _mm512_mask_reduce_and_epi32(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_or_epi32() { + let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e: i32 = _mm512_reduce_or_epi32(a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_or_epi32() { + let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e: i32 = _mm512_mask_reduce_or_epi32(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compress_epi32() { + let src = _mm512_set1_epi32(200); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_compress_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_compress_epi32(src, 0b01010101_01010101, a); + let e = _mm512_set_epi32( + 200, 200, 200, 200, 200, 200, 200, 200, 1, 3, 5, 7, 9, 11, 13, 15, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_compress_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_compress_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_compress_epi32(0b01010101_01010101, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compress_epi32() { + let src = _mm256_set1_epi32(200); + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_mask_compress_epi32(src, 0, a); + 
assert_eq_m256i(r, src); + let r = _mm256_mask_compress_epi32(src, 0b01010101, a); + let e = _mm256_set_epi32(200, 200, 200, 200, 1, 3, 5, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_compress_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_compress_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_compress_epi32(0b01010101, a); + let e = _mm256_set_epi32(0, 0, 0, 0, 1, 3, 5, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compress_epi32() { + let src = _mm_set1_epi32(200); + let a = _mm_set_epi32(0, 1, 2, 3); + let r = _mm_mask_compress_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_compress_epi32(src, 0b00000101, a); + let e = _mm_set_epi32(200, 200, 1, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_compress_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let r = _mm_maskz_compress_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_compress_epi32(0b00000101, a); + let e = _mm_set_epi32(0, 0, 1, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compress_ps() { + let src = _mm512_set1_ps(200.); + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_mask_compress_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_compress_ps(src, 0b01010101_01010101, a); + let e = _mm512_set_ps( + 200., 200., 200., 200., 200., 200., 200., 200., 1., 3., 5., 7., 9., 11., 13., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_compress_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_maskz_compress_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_compress_ps(0b01010101_01010101, a); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1., 3., 5., 7., 9., 11., 13., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compress_ps() { + let src = _mm256_set1_ps(200.); + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_mask_compress_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_compress_ps(src, 0b01010101, a); + let e = _mm256_set_ps(200., 200., 200., 200., 1., 3., 5., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_compress_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_maskz_compress_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_compress_ps(0b01010101, a); + let e = _mm256_set_ps(0., 0., 0., 0., 1., 3., 5., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compress_ps() { + let src = _mm_set1_ps(200.); + let a = _mm_set_ps(0., 1., 2., 3.); + let r = _mm_mask_compress_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_compress_ps(src, 0b00000101, a); + let e = _mm_set_ps(200., 200., 1., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_compress_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let r = _mm_maskz_compress_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_compress_ps(0b00000101, a); + 
let e = _mm_set_ps(0., 0., 1., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compressstoreu_epi32() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let mut r = [0_i32; 16]; + _mm512_mask_compressstoreu_epi32(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i32; 16]); + _mm512_mask_compressstoreu_epi32(r.as_mut_ptr(), 0b1111000011001010, a); + assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_epi32() { + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let mut r = [0_i32; 8]; + _mm256_mask_compressstoreu_epi32(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i32; 8]); + _mm256_mask_compressstoreu_epi32(r.as_mut_ptr(), 0b11001010, a); + assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_epi32() { + let a = _mm_setr_epi32(1, 2, 3, 4); + let mut r = [0_i32; 4]; + _mm_mask_compressstoreu_epi32(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i32; 4]); + _mm_mask_compressstoreu_epi32(r.as_mut_ptr(), 0b1011, a); + assert_eq!(&r, &[1, 2, 4, 0]); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compressstoreu_epi64() { + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let mut r = [0_i64; 8]; + _mm512_mask_compressstoreu_epi64(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i64; 8]); + _mm512_mask_compressstoreu_epi64(r.as_mut_ptr(), 0b11001010, a); + assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_epi64() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let mut r = [0_i64; 4]; + _mm256_mask_compressstoreu_epi64(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i64; 4]); + _mm256_mask_compressstoreu_epi64(r.as_mut_ptr(), 0b1011, a); + assert_eq!(&r, &[1, 2, 4, 0]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_epi64() { + let a = _mm_setr_epi64x(1, 2); + let mut r = [0_i64; 2]; + _mm_mask_compressstoreu_epi64(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i64; 2]); + _mm_mask_compressstoreu_epi64(r.as_mut_ptr(), 0b10, a); + assert_eq!(&r, &[2, 0]); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compressstoreu_ps() { + let a = _mm512_setr_ps( + 1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32, 9_f32, 10_f32, 11_f32, 12_f32, + 13_f32, 14_f32, 15_f32, 16_f32, + ); + let mut r = [0_f32; 16]; + _mm512_mask_compressstoreu_ps(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_f32; 16]); + _mm512_mask_compressstoreu_ps(r.as_mut_ptr(), 0b1111000011001010, a); + assert_eq!( + &r, + &[ + 2_f32, 4_f32, 7_f32, 8_f32, 13_f32, 14_f32, 15_f32, 16_f32, 0_f32, 0_f32, 0_f32, + 0_f32, 0_f32, 0_f32, 0_f32, 0_f32 + ] + ); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_ps() { + let a = _mm256_setr_ps(1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32); + let mut r = [0_f32; 8]; + _mm256_mask_compressstoreu_ps(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_f32; 8]); + _mm256_mask_compressstoreu_ps(r.as_mut_ptr(), 0b11001010, a); + assert_eq!( + &r, + &[2_f32, 4_f32, 7_f32, 8_f32, 0_f32, 0_f32, 0_f32, 0_f32] + ); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_ps() { + let a = _mm_setr_ps(1_f32, 2_f32, 3_f32, 4_f32); + let mut r = [0.; 4]; + _mm_mask_compressstoreu_ps(r.as_mut_ptr(), 0, a); + 
assert_eq!(&r, &[0.; 4]); + _mm_mask_compressstoreu_ps(r.as_mut_ptr(), 0b1011, a); + assert_eq!(&r, &[1_f32, 2_f32, 4_f32, 0_f32]); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compressstoreu_pd() { + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let mut r = [0.; 8]; + _mm512_mask_compressstoreu_pd(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0.; 8]); + _mm512_mask_compressstoreu_pd(r.as_mut_ptr(), 0b11001010, a); + assert_eq!(&r, &[2., 4., 7., 8., 0., 0., 0., 0.]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let mut r = [0.; 4]; + _mm256_mask_compressstoreu_pd(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0.; 4]); + _mm256_mask_compressstoreu_pd(r.as_mut_ptr(), 0b1011, a); + assert_eq!(&r, &[1., 2., 4., 0.]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_pd() { + let a = _mm_setr_pd(1., 2.); + let mut r = [0.; 2]; + _mm_mask_compressstoreu_pd(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0.; 2]); + _mm_mask_compressstoreu_pd(r.as_mut_ptr(), 0b10, a); + assert_eq!(&r, &[2., 0.]); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_expand_epi32() { + let src = _mm512_set1_epi32(200); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_expand_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_expand_epi32(src, 0b01010101_01010101, a); + let e = _mm512_set_epi32( + 200, 8, 200, 9, 200, 10, 200, 11, 200, 12, 200, 13, 200, 14, 200, 15, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_expand_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_expand_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_expand_epi32(0b01010101_01010101, a); + let e = _mm512_set_epi32(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_expand_epi32() { + let src = _mm256_set1_epi32(200); + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_mask_expand_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_expand_epi32(src, 0b01010101, a); + let e = _mm256_set_epi32(200, 4, 200, 5, 200, 6, 200, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_expand_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_expand_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_expand_epi32(0b01010101, a); + let e = _mm256_set_epi32(0, 4, 0, 5, 0, 6, 0, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_expand_epi32() { + let src = _mm_set1_epi32(200); + let a = _mm_set_epi32(0, 1, 2, 3); + let r = _mm_mask_expand_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_expand_epi32(src, 0b00000101, a); + let e = _mm_set_epi32(200, 2, 200, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_expand_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let r = _mm_maskz_expand_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_expand_epi32(0b00000101, a); + let e = _mm_set_epi32(0, 2, 0, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe 
fn test_mm512_mask_expand_ps() { + let src = _mm512_set1_ps(200.); + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_mask_expand_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_expand_ps(src, 0b01010101_01010101, a); + let e = _mm512_set_ps( + 200., 8., 200., 9., 200., 10., 200., 11., 200., 12., 200., 13., 200., 14., 200., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_expand_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_maskz_expand_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_expand_ps(0b01010101_01010101, a); + let e = _mm512_set_ps( + 0., 8., 0., 9., 0., 10., 0., 11., 0., 12., 0., 13., 0., 14., 0., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_expand_ps() { + let src = _mm256_set1_ps(200.); + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_mask_expand_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_expand_ps(src, 0b01010101, a); + let e = _mm256_set_ps(200., 4., 200., 5., 200., 6., 200., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_expand_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_maskz_expand_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_expand_ps(0b01010101, a); + let e = _mm256_set_ps(0., 4., 0., 5., 0., 6., 0., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_expand_ps() { + let src = _mm_set1_ps(200.); + let a = _mm_set_ps(0., 1., 2., 3.); + let r = _mm_mask_expand_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_expand_ps(src, 0b00000101, a); + let e = _mm_set_ps(200., 2., 200., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_expand_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let r = _mm_maskz_expand_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_expand_ps(0b00000101, a); + let e = _mm_set_ps(0., 2., 0., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_loadu_epi32() { + let a = &[4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50]; + let p = a.as_ptr(); + let r = _mm512_loadu_epi32(black_box(p)); + let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_loadu_epi32() { + let a = &[4, 3, 2, 5, 8, 9, 64, 50]; + let p = a.as_ptr(); + let r = _mm256_loadu_epi32(black_box(p)); + let e = _mm256_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_loadu_epi32() { + let a = &[4, 3, 2, 5]; + let p = a.as_ptr(); + let r = _mm_loadu_epi32(black_box(p)); + let e = _mm_setr_epi32(4, 3, 2, 5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi32_storeu_epi16() { + let a = _mm512_set1_epi32(9); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(9); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm256_mask_cvtepi32_storeu_epi16() { + let a = _mm256_set1_epi32(9); + let mut r = _mm_undefined_si128(); + _mm256_mask_cvtepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set1_epi16(9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_storeu_epi16() { + let a = _mm_set1_epi32(9); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 9, 9, 9, 9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi32_storeu_epi16() { + let a = _mm512_set1_epi32(i32::MAX); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtsepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(i16::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi32_storeu_epi16() { + let a = _mm256_set1_epi32(i32::MAX); + let mut r = _mm_undefined_si128(); + _mm256_mask_cvtsepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set1_epi16(i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi32_storeu_epi16() { + let a = _mm_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtsepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi32_storeu_epi16() { + let a = _mm512_set1_epi32(i32::MAX); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtusepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(u16::MAX as i16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi32_storeu_epi16() { + let a = _mm256_set1_epi32(i32::MAX); + let mut r = _mm_undefined_si128(); + _mm256_mask_cvtusepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set1_epi16(u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi32_storeu_epi16() { + let a = _mm_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtusepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set_epi16( + 0, + 0, + 0, + 0, + u16::MAX as i16, + u16::MAX as i16, + u16::MAX as i16, + u16::MAX as i16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi32_storeu_epi8() { + let a = _mm512_set1_epi32(9); + let mut r = _mm_undefined_si128(); + _mm512_mask_cvtepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); + let e = _mm_set1_epi8(9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi32_storeu_epi8() { + let a = _mm256_set1_epi32(9); + let mut r = _mm_set1_epi8(0); + _mm256_mask_cvtepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_storeu_epi8() { + let a = _mm_set1_epi32(9); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = 
_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi32_storeu_epi8() { + let a = _mm512_set1_epi32(i32::MAX); + let mut r = _mm_undefined_si128(); + _mm512_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); + let e = _mm_set1_epi8(i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi32_storeu_epi8() { + let a = _mm256_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm256_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi32_storeu_epi8() { + let a = _mm_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi32_storeu_epi8() { + let a = _mm512_set1_epi32(i32::MAX); + let mut r = _mm_undefined_si128(); + _mm512_mask_cvtusepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); + let e = _mm_set1_epi8(u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi32_storeu_epi8() { + let a = _mm256_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm256_mask_cvtusepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi32_storeu_epi8() { + let a = _mm_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtusepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_storeu_epi32() { + let a = _mm512_set1_epi32(9); + let mut r = _mm512_undefined_epi32(); + _mm512_storeu_epi32(&mut r as *mut _ as *mut i32, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_storeu_epi32() { + let a = _mm256_set1_epi32(9); + let mut r = _mm256_undefined_si256(); + _mm256_storeu_epi32(&mut r as *mut _ as *mut i32, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_storeu_epi32() { + let a = _mm_set1_epi32(9); + let mut r = _mm_undefined_si128(); + _mm_storeu_epi32(&mut r as *mut _ as *mut i32, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_loadu_si512() { + let a = &[4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50]; + let p = a.as_ptr().cast(); + let r = _mm512_loadu_si512(black_box(p)); + let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_mm512_storeu_si512() { + let a = _mm512_set1_epi32(9); + let mut r = _mm512_undefined_epi32(); + _mm512_storeu_si512(&mut r as *mut _, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_load_si512() { + #[repr(align(64))] + struct Align { + data: [i32; 16], // 64 bytes + } + let a = Align { + data: [4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50], + }; + let p = (a.data).as_ptr().cast(); + let r = _mm512_load_si512(black_box(p)); + let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_store_si512() { + let a = _mm512_set1_epi32(9); + let mut r = _mm512_undefined_epi32(); + _mm512_store_si512(&mut r as *mut _, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], // 64 bytes + } + let a = Align { + data: [4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50], + }; + let p = (a.data).as_ptr(); + let r = _mm512_load_epi32(black_box(p)); + let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 8], + } + let a = Align { + data: [4, 3, 2, 5, 8, 9, 64, 50], + }; + let p = (a.data).as_ptr(); + let r = _mm256_load_epi32(black_box(p)); + let e = _mm256_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 4], + } + let a = Align { data: [4, 3, 2, 5] }; + let p = (a.data).as_ptr(); + let r = _mm_load_epi32(black_box(p)); + let e = _mm_setr_epi32(4, 3, 2, 5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_store_epi32() { + let a = _mm512_set1_epi32(9); + let mut r = _mm512_undefined_epi32(); + _mm512_store_epi32(&mut r as *mut _ as *mut i32, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_store_epi32() { + let a = _mm256_set1_epi32(9); + let mut r = _mm256_undefined_si256(); + _mm256_store_epi32(&mut r as *mut _ as *mut i32, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_store_epi32() { + let a = _mm_set1_epi32(9); + let mut r = _mm_undefined_si128(); + _mm_store_epi32(&mut r as *mut _ as *mut i32, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_load_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], // 64 bytes + } + let a = Align { + data: [ + 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ], + }; + let p = (a.data).as_ptr(); + let r = _mm512_load_ps(black_box(p)); + let e = _mm512_setr_ps( + 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_store_ps() { + let a = _mm512_set1_ps(9.); + let mut r = _mm512_undefined_ps(); + _mm512_store_ps(&mut r as *mut _ as *mut f32, a); + assert_eq_m512(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_set1_epi32() { + let src = _mm512_set1_epi32(2); + let a: i32 = 11; + let r = _mm512_mask_set1_epi32(src, 0, a); + assert_eq_m512i(r, 
src); + let r = _mm512_mask_set1_epi32(src, 0b11111111_11111111, a); + let e = _mm512_set1_epi32(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_set1_epi32() { + let a: i32 = 11; + let r = _mm512_maskz_set1_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_set1_epi32(0b11111111_11111111, a); + let e = _mm512_set1_epi32(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_set1_epi32() { + let src = _mm256_set1_epi32(2); + let a: i32 = 11; + let r = _mm256_mask_set1_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_set1_epi32(src, 0b11111111, a); + let e = _mm256_set1_epi32(11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm256_maskz_set1_epi32() { + let a: i32 = 11; + let r = _mm256_maskz_set1_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_set1_epi32(0b11111111, a); + let e = _mm256_set1_epi32(11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_set1_epi32() { + let src = _mm_set1_epi32(2); + let a: i32 = 11; + let r = _mm_mask_set1_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_set1_epi32(src, 0b00001111, a); + let e = _mm_set1_epi32(11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_set1_epi32() { + let a: i32 = 11; + let r = _mm_maskz_set1_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_set1_epi32(0b00001111, a); + let e = _mm_set1_epi32(11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_move_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_move_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_move_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 40.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_move_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_move_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_move_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 40.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_move_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_move_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_move_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_move_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_move_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_move_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_add_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_add_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 
2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_add_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_add_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_add_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_add_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_add_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_add_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_sub_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sub_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_sub_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_sub_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sub_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sub_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sub_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_sub_sd(0b11111111, a, b); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_mul_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_mul_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_mul_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_mul_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = 
_mm_set_pd(3., 4.); + let r = _mm_mask_mul_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_mul_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_mul_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_mul_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_div_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_div_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_div_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_div_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_div_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_div_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_div_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_div_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_max_ss(a, 0, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_max_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_max_ss(0, a, b); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_max_ss(0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_max_sd(a, 0, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_max_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_max_sd(0, a, b); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_max_sd(0b11111111, a, b); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); 
+ let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_min_ss(a, 0, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_min_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_min_ss(0, a, b); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_min_ss(0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_min_sd(a, 0, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_min_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_min_sd(0, a, b); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_min_sd(0b11111111, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_sqrt_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sqrt_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_maskz_sqrt_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_sqrt_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sqrt_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sqrt_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sqrt_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_sqrt_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rsqrt14_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_rsqrt14_ss(a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rsqrt14_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_rsqrt14_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_rsqrt14_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rsqrt14_ss() { + let a = 
_mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_maskz_rsqrt14_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_rsqrt14_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rsqrt14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_rsqrt14_sd(a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rsqrt14_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_rsqrt14_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_rsqrt14_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rsqrt14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_rsqrt14_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_rsqrt14_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rcp14_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_rcp14_ss(a, b); + let e = _mm_set_ps(1., 2., 10., 0.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rcp14_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_rcp14_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_rcp14_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rcp14_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_maskz_rcp14_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_rcp14_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rcp14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_rcp14_sd(a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rcp14_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_rcp14_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_rcp14_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rcp14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_rcp14_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_rcp14_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_getexp_ss(a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_ss() { + let a = 
_mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_getexp_ss(a, 0, a, b); + let e = _mm_set_ps(2., 2., 2., 2.); + assert_eq_m128(r, e); + let r = _mm_mask_getexp_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_getexp_ss(0, a, b); + let e = _mm_set_ps(2., 2., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getexp_ss(0b11111111, a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_getexp_sd(a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_getexp_sd(a, 0, a, b); + let e = _mm_set_pd(2., 2.); + assert_eq_m128d(r, e); + let r = _mm_mask_getexp_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_maskz_getexp_sd(0, a, b); + let e = _mm_set_pd(2., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getexp_sd(0b11111111, a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, b); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_mask_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a, b); + let e = _mm_set_ps(20., 20., 20., 20.); + assert_eq_m128(r, e); + let r = _mm_mask_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11111111, a, b); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_maskz_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a, b); + let e = _mm_set_ps(20., 20., 20., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111, a, b); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, b); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_mask_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a, b); + let e = _mm_set_pd(20., 20.); + assert_eq_m128d(r, e); + let r = _mm_mask_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11111111, a, b); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_maskz_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, 
a, b); + let e = _mm_set_pd(20., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111, a, b); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_roundscale_ss::<0>(a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_mask_roundscale_ss::<0>(a, 0, a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2); + assert_eq_m128(r, e); + let r = _mm_mask_roundscale_ss::<0>(a, 0b11111111, a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_maskz_roundscale_ss::<0>(0, a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0); + assert_eq_m128(r, e); + let r = _mm_maskz_roundscale_ss::<0>(0b11111111, a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_roundscale_sd::<0>(a, b); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_mask_roundscale_sd::<0>(a, 0, a, b); + let e = _mm_set_pd(2.2, 2.2); + assert_eq_m128d(r, e); + let r = _mm_mask_roundscale_sd::<0>(a, 0b11111111, a, b); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_maskz_roundscale_sd::<0>(0, a, b); + let e = _mm_set_pd(2.2, 0.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_roundscale_sd::<0>(0b11111111, a, b); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_scalef_ss(a, b); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_scalef_ss(a, 0, a, b); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + let r = _mm_mask_scalef_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_scalef_ss(0, a, b); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_scalef_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_scalef_sd(a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_scalef_sd(a, 0, a, b); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + let r = 
_mm_mask_scalef_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_maskz_scalef_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_scalef_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fmadd_ss(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fmadd_ss(a, 0b11111111, b, c); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fmadd_ss(0, a, b, c); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fmadd_ss(0b11111111, a, b, c); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fmadd_ss(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fmadd_ss(a, b, c, 0b11111111); + let e = _mm_set_ps(3., 3., 3., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fmadd_sd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fmadd_sd(a, 0b11111111, b, c); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fmadd_sd(0, a, b, c); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fmadd_sd(0b11111111, a, b, c); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fmadd_sd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fmadd_sd(a, b, c, 0b11111111); + let e = _mm_set_pd(3., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmsub_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fmsub_ss(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fmsub_ss(a, 0b11111111, b, c); + let e = _mm_set_ps(1., 1., 1., -1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmsub_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fmsub_ss(0, a, b, c); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fmsub_ss(0b11111111, a, b, c); + let e = _mm_set_ps(1., 1., 1., -1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmsub_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fmsub_ss(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fmsub_ss(a, b, c, 0b11111111); + let e = _mm_set_ps(3., 3., 3., -1.); + 
assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmsub_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fmsub_sd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fmsub_sd(a, 0b11111111, b, c); + let e = _mm_set_pd(1., -1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmsub_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fmsub_sd(0, a, b, c); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fmsub_sd(0b11111111, a, b, c); + let e = _mm_set_pd(1., -1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmsub_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fmsub_sd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fmsub_sd(a, b, c, 0b11111111); + let e = _mm_set_pd(3., -1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fnmadd_ss(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fnmadd_ss(a, 0b11111111, b, c); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fnmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fnmadd_ss(0, a, b, c); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fnmadd_ss(0b11111111, a, b, c); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fnmadd_ss(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fnmadd_ss(a, b, c, 0b11111111); + let e = _mm_set_ps(3., 3., 3., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fnmadd_sd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fnmadd_sd(a, 0b11111111, b, c); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fnmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fnmadd_sd(0, a, b, c); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fnmadd_sd(0b11111111, a, b, c); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fnmadd_sd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fnmadd_sd(a, b, c, 0b11111111); + let e = _mm_set_pd(3., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmsub_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fnmsub_ss(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fnmsub_ss(a, 0b11111111, b, c); + let e = _mm_set_ps(1., 1., 1., -5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm_maskz_fnmsub_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fnmsub_ss(0, a, b, c); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fnmsub_ss(0b11111111, a, b, c); + let e = _mm_set_ps(1., 1., 1., -5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmsub_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fnmsub_ss(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fnmsub_ss(a, b, c, 0b11111111); + let e = _mm_set_ps(3., 3., 3., -5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmsub_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fnmsub_sd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fnmsub_sd(a, 0b11111111, b, c); + let e = _mm_set_pd(1., -5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fnmsub_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fnmsub_sd(0, a, b, c); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fnmsub_sd(0b11111111, a, b, c); + let e = _mm_set_pd(1., -5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmsub_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fnmsub_sd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fnmsub_sd(a, b, c, 0b11111111); + let e = _mm_set_pd(3., -5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_add_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = + _mm_maskz_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_add_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_add_round_sd::<{ _MM_FROUND_TO_ZERO | 
_MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = + _mm_maskz_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sub_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = + _mm_maskz_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sub_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = + _mm_maskz_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mul_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + 
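+    // For the masked `_round_ss` tests that follow, only mask bit 0 matters: a
+    // clear bit keeps the pass-through element from `src`, a set bit takes the
+    // computed result; elements above the lowest are always copied from `a`.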
#[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = + _mm_maskz_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mul_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = + _mm_maskz_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_div_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = + _mm_maskz_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + 
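+    // The `_sd` tests mirror the `_ss` ones on the low f64 lane; with the mask
+    // clear, the `maskz` forms zero that lane instead of falling back to `src`.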
#[simd_test(enable = "avx512f")] + unsafe fn test_mm_div_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = + _mm_maskz_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_max_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_max_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_min_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = 
_mm_set_ps(4., 5., 6., 7.); + let r = _mm_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_min_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sqrt_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_maskz_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = + _mm_maskz_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sqrt_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 
4.); + let r = _mm_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = + _mm_maskz_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_round_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_round_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_ps(2., 2., 2., 2.); + assert_eq_m128(r, e); + let r = _mm_mask_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_round_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_ps(2., 2., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_round_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_round_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_pd(2., 2.); + assert_eq_m128d(r, e); + let r = _mm_mask_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_round_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_maskz_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_pd(2., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_round_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = + _mm_getmant_round_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC, _MM_FROUND_CUR_DIRECTION>( + a, b, + ); + let e = _mm_set_ps(20., 20., 20., 
1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_round_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_mask_getmant_round_ss::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a, 0, a, b); + let e = _mm_set_ps(20., 20., 20., 20.); + assert_eq_m128(r, e); + let r = _mm_mask_getmant_round_ss::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a, 0b11111111, a, b); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_round_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_maskz_getmant_round_ss::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(0, a, b); + let e = _mm_set_ps(20., 20., 20., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getmant_round_ss::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(0b11111111, a, b); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_round_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = + _mm_getmant_round_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC, _MM_FROUND_CUR_DIRECTION>( + a, b, + ); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_round_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_mask_getmant_round_sd::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a, 0, a, b); + let e = _mm_set_pd(20., 20.); + assert_eq_m128d(r, e); + let r = _mm_mask_getmant_round_sd::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a, 0b11111111, a, b); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_round_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_maskz_getmant_round_sd::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(0, a, b); + let e = _mm_set_pd(20., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getmant_round_sd::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(0b11111111, a, b); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_round_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_round_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_mask_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2); + assert_eq_m128(r, e); + let r = _mm_mask_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_round_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_maskz_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0); + assert_eq_m128(r, e); + let r = _mm_maskz_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111, 
a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_round_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_round_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_mask_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_pd(2.2, 2.2); + assert_eq_m128d(r, e); + let r = _mm_mask_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_round_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_maskz_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_pd(2.2, 0.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + let r = _mm_mask_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, a, b, + ); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = + _mm_maskz_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, + ); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, a, b, + ); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = + _mm_maskz_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = 
_mm_maskz_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, + ); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128(r, a); + let r = _mm_mask_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128(r, c); + let r = _mm_mask3_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_ps(3., 3., 3., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128d(r, a); + let r = _mm_mask_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128d(r, c); + let r = 
_mm_mask3_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_pd(3., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_ps(1., 1., 1., -1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128(r, a); + let r = _mm_mask_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_ps(1., 1., 1., -1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., -1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128(r, c); + let r = _mm_mask3_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_ps(3., 3., 3., -1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_pd(1., -1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128d(r, a); + let r = _mm_mask_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_pd(1., -1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_pd(1., -1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128d(r, c); + let r = 
_mm_mask3_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_pd(3., -1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fnmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128(r, a); + let r = _mm_mask_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fnmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128(r, c); + let r = _mm_mask3_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_ps(3., 3., 3., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fnmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128d(r, a); + let r = _mm_mask_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fnmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128d(r, c); + let r 
= _mm_mask3_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_pd(3., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fnmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_ps(1., 1., 1., -5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128(r, a); + let r = _mm_mask_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_ps(1., 1., 1., -5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fnmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., -5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128(r, c); + let r = _mm_mask3_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_ps(3., 3., 3., -5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fnmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_pd(1., -5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128d(r, a); + let r = _mm_mask_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_pd(1., -5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fnmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_pd(1., -5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128d(r, 
c); + let r = _mm_mask3_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_pd(3., -5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fixupimm_ss() { + let a = _mm_set_ps(0., 0., 0., f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_fixupimm_ss::<5>(a, b, c); + let e = _mm_set_ps(0., 0., 0., -0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fixupimm_ss() { + let a = _mm_set_ps(0., 0., 0., f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_mask_fixupimm_ss::<5>(a, 0b11111111, b, c); + let e = _mm_set_ps(0., 0., 0., -0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fixupimm_ss() { + let a = _mm_set_ps(0., 0., 0., f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_maskz_fixupimm_ss::<5>(0b00000000, a, b, c); + let e = _mm_set_ps(0., 0., 0., 0.0); + assert_eq_m128(r, e); + let r = _mm_maskz_fixupimm_ss::<5>(0b11111111, a, b, c); + let e = _mm_set_ps(0., 0., 0., -0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fixupimm_sd() { + let a = _mm_set_pd(0., f64::NAN); + let b = _mm_set1_pd(f64::MAX); + let c = _mm_set1_epi64x(i32::MAX as i64); + let r = _mm_fixupimm_sd::<5>(a, b, c); + let e = _mm_set_pd(0., -0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fixupimm_sd() { + let a = _mm_set_pd(0., f64::NAN); + let b = _mm_set1_pd(f64::MAX); + let c = _mm_set1_epi64x(i32::MAX as i64); + let r = _mm_mask_fixupimm_sd::<5>(a, 0b11111111, b, c); + let e = _mm_set_pd(0., -0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fixupimm_sd() { + let a = _mm_set_pd(0., f64::NAN); + let b = _mm_set1_pd(f64::MAX); + let c = _mm_set1_epi64x(i32::MAX as i64); + let r = _mm_maskz_fixupimm_sd::<5>(0b00000000, a, b, c); + let e = _mm_set_pd(0., 0.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_fixupimm_sd::<5>(0b11111111, a, b, c); + let e = _mm_set_pd(0., -0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fixupimm_round_ss() { + let a = _mm_set_ps(1., 0., 0., f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(a, b, c); + let e = _mm_set_ps(1., 0., 0., -0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fixupimm_round_ss() { + let a = _mm_set_ps(0., 0., 0., f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_mask_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, b, c); + let e = _mm_set_ps(0., 0., 0., -0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fixupimm_round_ss() { + let a = _mm_set_ps(0., 0., 0., f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_maskz_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(0b00000000, a, b, c); + let e = _mm_set_ps(0., 0., 0., 0.0); + assert_eq_m128(r, e); + let r = _mm_maskz_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(0b11111111, a, b, c); + let e = _mm_set_ps(0., 0., 0., -0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fixupimm_round_sd() { + let a = 
_mm_set_pd(0., f64::NAN); + let b = _mm_set1_pd(f64::MAX); + let c = _mm_set1_epi64x(i32::MAX as i64); + let r = _mm_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(a, b, c); + let e = _mm_set_pd(0., -0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fixupimm_round_sd() { + let a = _mm_set_pd(0., f64::NAN); + let b = _mm_set1_pd(f64::MAX); + let c = _mm_set1_epi64x(i32::MAX as i64); + let r = _mm_mask_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, b, c); + let e = _mm_set_pd(0., -0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fixupimm_round_sd() { + let a = _mm_set_pd(0., f64::NAN); + let b = _mm_set1_pd(f64::MAX); + let c = _mm_set1_epi64x(i32::MAX as i64); + let r = _mm_maskz_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(0b00000000, a, b, c); + let e = _mm_set_pd(0., 0.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(0b11111111, a, b, c); + let e = _mm_set_pd(0., -0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cvtss_sd() { + let a = _mm_set_pd(6., -7.5); + let b = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_mask_cvtss_sd(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_cvtss_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(6., -1.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_cvtss_sd() { + let a = _mm_set_pd(6., -7.5); + let b = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_maskz_cvtss_sd(0, a, b); + let e = _mm_set_pd(6., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_cvtss_sd(0b11111111, a, b); + let e = _mm_set_pd(6., -1.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cvtsd_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b = _mm_set_pd(6., -7.5); + let r = _mm_mask_cvtsd_ss(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_cvtsd_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(0., -0.5, 1., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_cvtsd_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b = _mm_set_pd(6., -7.5); + let r = _mm_maskz_cvtsd_ss(0, a, b); + let e = _mm_set_ps(0., -0.5, 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_cvtsd_ss(0b11111111, a, b); + let e = _mm_set_ps(0., -0.5, 1., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundss_sd() { + let a = _mm_set_pd(6., -7.5); + let b = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_pd(6., -1.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cvt_roundss_sd() { + let a = _mm_set_pd(6., -7.5); + let b = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_mask_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_pd(6., -1.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_cvt_roundss_sd() { + let a = _mm_set_pd(6., -7.5); + let b = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_maskz_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_pd(6., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_pd(6., -1.5); + 
assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsd_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b = _mm_set_pd(6., -7.5); + let r = _mm_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(0., -0.5, 1., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cvt_roundsd_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b = _mm_set_pd(6., -7.5); + let r = _mm_mask_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + a, 0b11111111, a, b, + ); + let e = _mm_set_ps(0., -0.5, 1., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_cvt_roundsd_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b = _mm_set_pd(6., -7.5); + let r = _mm_maskz_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_ps(0., -0.5, 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, + ); + let e = _mm_set_ps(0., -0.5, 1., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundss_si32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvt_roundss_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundss_i32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvt_roundss_i32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundss_u32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvt_roundss_u32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtss_i32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtss_i32(a); + let e: i32 = -2; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtss_u32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtss_u32(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsd_si32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvt_roundsd_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsd_i32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvt_roundsd_i32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsd_u32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvt_roundsd_u32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtsd_i32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtsd_i32(a); + let e: i32 = -2; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtsd_u32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtsd_u32(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundi32_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + 
let b: i32 = 9; + let r = _mm_cvt_roundi32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsi32_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: i32 = 9; + let r = _mm_cvt_roundsi32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundu32_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: u32 = 9; + let r = _mm_cvt_roundu32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvti32_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: i32 = 9; + let r = _mm_cvti32_ss(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvti32_sd() { + let a = _mm_set_pd(1., -1.5); + let b: i32 = 9; + let r = _mm_cvti32_sd(a, b); + let e = _mm_set_pd(1., 9.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundss_si32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtt_roundss_si32::<_MM_FROUND_NO_EXC>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundss_i32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtt_roundss_i32::<_MM_FROUND_NO_EXC>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundss_u32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtt_roundss_u32::<_MM_FROUND_NO_EXC>(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttss_i32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvttss_i32(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttss_u32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvttss_u32(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundsd_si32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtt_roundsd_si32::<_MM_FROUND_NO_EXC>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundsd_i32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtt_roundsd_i32::<_MM_FROUND_NO_EXC>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundsd_u32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtt_roundsd_u32::<_MM_FROUND_NO_EXC>(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttsd_i32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvttsd_i32(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttsd_u32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvttsd_u32(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtu32_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: u32 = 9; + let r = _mm_cvtu32_ss(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtu32_sd() { + let a = _mm_set_pd(1., -1.5); 
+ let b: u32 = 9; + let r = _mm_cvtu32_sd(a, b); + let e = _mm_set_pd(1., 9.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_comi_round_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_comi_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, b); + let e: i32 = 0; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_comi_round_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_comi_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, b); + let e: i32 = 0; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtsi512_si32() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_cvtsi512_si32(a); + let e: i32 = 1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtss_f32() { + let a = _mm512_setr_ps( + 312.0134, 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ); + assert_eq!(_mm512_cvtss_f32(a), 312.0134); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtsd_f64() { + let r = _mm512_cvtsd_f64(_mm512_setr_pd(-1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8)); + assert_eq!(r, -1.1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_pd() { + let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.); + let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.); + let r = _mm512_shuffle_pd::<0b11_11_11_11>(a, b); + let e = _mm512_setr_pd(4., 3., 8., 7., 4., 3., 8., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_pd() { + let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.); + let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.); + let r = _mm512_mask_shuffle_pd::<0b11_11_11_11>(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_shuffle_pd::<0b11_11_11_11>(a, 0b11111111, a, b); + let e = _mm512_setr_pd(4., 3., 8., 7., 4., 3., 8., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_pd() { + let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.); + let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.); + let r = _mm512_maskz_shuffle_pd::<0b11_11_11_11>(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_shuffle_pd::<0b11_11_11_11>(0b00001111, a, b); + let e = _mm512_setr_pd(4., 3., 8., 7., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_expandloadu_epi32() { + let src = _mm512_set1_epi32(42); + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_expandloadu_epi32(src, m, black_box(p)); + let e = _mm512_set_epi32(8, 7, 6, 42, 5, 42, 42, 42, 4, 3, 42, 42, 2, 42, 1, 42); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_expandloadu_epi32() { + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_expandloadu_epi32(m, black_box(p)); + let e = _mm512_set_epi32(8, 7, 6, 0, 5, 0, 0, 0, 4, 3, 0, 0, 2, 0, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_expandloadu_epi32() { + let src = _mm256_set1_epi32(42); + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_mask_expandloadu_epi32(src, m, 
black_box(p)); + let e = _mm256_set_epi32(4, 3, 2, 42, 1, 42, 42, 42); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_expandloadu_epi32() { + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_maskz_expandloadu_epi32(m, black_box(p)); + let e = _mm256_set_epi32(4, 3, 2, 0, 1, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_expandloadu_epi32() { + let src = _mm_set1_epi32(42); + let a = &[1_i32, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b11111000; + let r = _mm_mask_expandloadu_epi32(src, m, black_box(p)); + let e = _mm_set_epi32(1, 42, 42, 42); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_expandloadu_epi32() { + let a = &[1_i32, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b11111000; + let r = _mm_maskz_expandloadu_epi32(m, black_box(p)); + let e = _mm_set_epi32(1, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_expandloadu_epi64() { + let src = _mm512_set1_epi64(42); + let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm512_mask_expandloadu_epi64(src, m, black_box(p)); + let e = _mm512_set_epi64(4, 3, 2, 42, 1, 42, 42, 42); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_expandloadu_epi64() { + let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm512_maskz_expandloadu_epi64(m, black_box(p)); + let e = _mm512_set_epi64(4, 3, 2, 0, 1, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_expandloadu_epi64() { + let src = _mm256_set1_epi64x(42); + let a = &[1_i64, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_mask_expandloadu_epi64(src, m, black_box(p)); + let e = _mm256_set_epi64x(1, 42, 42, 42); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_expandloadu_epi64() { + let a = &[1_i64, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_maskz_expandloadu_epi64(m, black_box(p)); + let e = _mm256_set_epi64x(1, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_expandloadu_epi64() { + let src = _mm_set1_epi64x(42); + let a = &[1_i64, 2]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm_mask_expandloadu_epi64(src, m, black_box(p)); + let e = _mm_set_epi64x(42, 42); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_expandloadu_epi64() { + let a = &[1_i64, 2]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm_maskz_expandloadu_epi64(m, black_box(p)); + let e = _mm_set_epi64x(0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_expandloadu_ps() { + let src = _mm512_set1_ps(42.); + let a = &[ + 1.0f32, 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_expandloadu_ps(src, m, black_box(p)); + let e = _mm512_set_ps( + 8., 7., 6., 42., 5., 42., 42., 42., 4., 3., 42., 42., 2., 42., 1., 42., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_expandloadu_ps() { + let a = &[ + 1.0f32, 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 
13., 14., 15., 16., + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_expandloadu_ps(m, black_box(p)); + let e = _mm512_set_ps( + 8., 7., 6., 0., 5., 0., 0., 0., 4., 3., 0., 0., 2., 0., 1., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_expandloadu_ps() { + let src = _mm256_set1_ps(42.); + let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_mask_expandloadu_ps(src, m, black_box(p)); + let e = _mm256_set_ps(4., 3., 2., 42., 1., 42., 42., 42.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_expandloadu_ps() { + let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_maskz_expandloadu_ps(m, black_box(p)); + let e = _mm256_set_ps(4., 3., 2., 0., 1., 0., 0., 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_expandloadu_ps() { + let src = _mm_set1_ps(42.); + let a = &[1.0f32, 2., 3., 4.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm_mask_expandloadu_ps(src, m, black_box(p)); + let e = _mm_set_ps(1., 42., 42., 42.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_expandloadu_ps() { + let a = &[1.0f32, 2., 3., 4.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm_maskz_expandloadu_ps(m, black_box(p)); + let e = _mm_set_ps(1., 0., 0., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_expandloadu_pd() { + let src = _mm512_set1_pd(42.); + let a = &[1.0f64, 2., 3., 4., 5., 6., 7., 8.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm512_mask_expandloadu_pd(src, m, black_box(p)); + let e = _mm512_set_pd(4., 3., 2., 42., 1., 42., 42., 42.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_expandloadu_pd() { + let a = &[1.0f64, 2., 3., 4., 5., 6., 7., 8.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm512_maskz_expandloadu_pd(m, black_box(p)); + let e = _mm512_set_pd(4., 3., 2., 0., 1., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_expandloadu_pd() { + let src = _mm256_set1_pd(42.); + let a = &[1.0f64, 2., 3., 4.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_mask_expandloadu_pd(src, m, black_box(p)); + let e = _mm256_set_pd(1., 42., 42., 42.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_expandloadu_pd() { + let a = &[1.0f64, 2., 3., 4.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_maskz_expandloadu_pd(m, black_box(p)); + let e = _mm256_set_pd(1., 0., 0., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_expandloadu_pd() { + let src = _mm_set1_pd(42.); + let a = &[1.0f64, 2.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm_mask_expandloadu_pd(src, m, black_box(p)); + let e = _mm_set_pd(42., 42.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_expandloadu_pd() { + let a = &[1.0f64, 2.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm_maskz_expandloadu_pd(m, black_box(p)); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/avx512fp16.rs 
b/library/stdarch/crates/core_arch/src/x86/avx512fp16.rs new file mode 100644 index 0000000000000..0a81a0581f97a --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avx512fp16.rs @@ -0,0 +1,27263 @@ +use crate::arch::asm; +use crate::core_arch::{simd::*, x86::*}; +use crate::intrinsics::{fmaf16, simd::*}; +use crate::ptr; + +/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_set_ph( + e7: f16, + e6: f16, + e5: f16, + e4: f16, + e3: f16, + e2: f16, + e1: f16, + e0: f16, +) -> __m128h { + __m128h([e0, e1, e2, e3, e4, e5, e6, e7]) +} + +/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_set_ph( + e15: f16, + e14: f16, + e13: f16, + e12: f16, + e11: f16, + e10: f16, + e9: f16, + e8: f16, + e7: f16, + e6: f16, + e5: f16, + e4: f16, + e3: f16, + e2: f16, + e1: f16, + e0: f16, +) -> __m256h { + __m256h([ + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ]) +} + +/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_set_ph( + e31: f16, + e30: f16, + e29: f16, + e28: f16, + e27: f16, + e26: f16, + e25: f16, + e24: f16, + e23: f16, + e22: f16, + e21: f16, + e20: f16, + e19: f16, + e18: f16, + e17: f16, + e16: f16, + e15: f16, + e14: f16, + e13: f16, + e12: f16, + e11: f16, + e10: f16, + e9: f16, + e8: f16, + e7: f16, + e6: f16, + e5: f16, + e4: f16, + e3: f16, + e2: f16, + e1: f16, + e0: f16, +) -> __m512h { + __m512h([ + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, + e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, + ]) +} + +/// Copy half-precision (16-bit) floating-point elements from a to the lower element of dst and zero +/// the upper 7 elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_set_sh(a: f16) -> __m128h { + __m128h([a, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) +} + +/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_set1_ph(a: f16) -> __m128h { + unsafe { transmute(f16x8::splat(a)) } +} + +/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst. 
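+///
+/// Illustrative sketch, not part of the original source and not compiled as a doctest; it assumes
+/// a nightly toolchain with the unstable `f16` type and AVX512-FP16 support.
+///
+/// ```ignore
+/// // Broadcast 2.5 into all 16 half-precision lanes of a __m256h.
+/// let v = _mm256_set1_ph(2.5);
+/// ```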
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_set1_ph(a: f16) -> __m256h { + unsafe { transmute(f16x16::splat(a)) } +} + +/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_set1_ph(a: f16) -> __m512h { + unsafe { transmute(f16x32::splat(a)) } +} + +/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_setr_ph( + e0: f16, + e1: f16, + e2: f16, + e3: f16, + e4: f16, + e5: f16, + e6: f16, + e7: f16, +) -> __m128h { + __m128h([e0, e1, e2, e3, e4, e5, e6, e7]) +} + +/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_setr_ph( + e0: f16, + e1: f16, + e2: f16, + e3: f16, + e4: f16, + e5: f16, + e6: f16, + e7: f16, + e8: f16, + e9: f16, + e10: f16, + e11: f16, + e12: f16, + e13: f16, + e14: f16, + e15: f16, +) -> __m256h { + __m256h([ + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ]) +} + +/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_setr_ph( + e0: f16, + e1: f16, + e2: f16, + e3: f16, + e4: f16, + e5: f16, + e6: f16, + e7: f16, + e8: f16, + e9: f16, + e10: f16, + e11: f16, + e12: f16, + e13: f16, + e14: f16, + e15: f16, + e16: f16, + e17: f16, + e18: f16, + e19: f16, + e20: f16, + e21: f16, + e22: f16, + e23: f16, + e24: f16, + e25: f16, + e26: f16, + e27: f16, + e28: f16, + e29: f16, + e30: f16, + e31: f16, +) -> __m512h { + __m512h([ + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, + e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, + ]) +} + +/// Return vector of type __m128h with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_setzero_ph() -> __m128h { + unsafe { transmute(f16x8::ZERO) } +} + +/// Return vector of type __m256h with all elements set to zero. 
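+///
+/// Illustrative sketch, not part of the original source and not compiled as a doctest.
+///
+/// ```ignore
+/// // All 16 f16 lanes of the result are +0.0.
+/// let z = _mm256_setzero_ph();
+/// ```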
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_setzero_ph() -> __m256h { + f16x16::ZERO.as_m256h() +} + +/// Return vector of type __m512h with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_setzero_ph() -> __m512h { + f16x32::ZERO.as_m512h() +} + +/// Return vector of type `__m128h` with indetermination elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit). +/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_undefined_ph() -> __m128h { + f16x8::ZERO.as_m128h() +} + +/// Return vector of type `__m256h` with indetermination elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit). +/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_undefined_ph() -> __m256h { + f16x16::ZERO.as_m256h() +} + +/// Return vector of type `__m512h` with indetermination elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit). +/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_undefined_ph() -> __m512h { + f16x32::ZERO.as_m512h() +} + +/// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_castpd_ph(a: __m128d) -> __m128h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m256d` to type `__m256h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. 
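+///
+/// Illustrative sketch, not part of the original source and not compiled as a doctest; the 256
+/// bits are reinterpreted in place, so the four f64 lanes are not converted to f16 values.
+///
+/// ```ignore
+/// // Bit-cast a __m256d to a __m256h; no conversion instructions are emitted.
+/// let h: __m256h = _mm256_castpd_ph(_mm256_setzero_pd());
+/// ```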
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castpd_ph(a: __m256d) -> __m256h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m512d` to type `__m512h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castpd_ph(a: __m512d) -> __m512h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m128h` to type `__m128d`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_pd) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_castph_pd(a: __m128h) -> __m128d { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m256h` to type `__m256d`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_pd) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castph_pd(a: __m256h) -> __m256d { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m512h` to type `__m512d`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_pd) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castph_pd(a: __m512h) -> __m512d { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m128` to type `__m128h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_castps_ph(a: __m128) -> __m128h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m256` to type `__m256h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castps_ph(a: __m256) -> __m256h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m512` to type `__m512h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. 
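+///
+/// Illustrative sketch, not part of the original source and not compiled as a doctest; the bits
+/// are reinterpreted rather than converted, so the cast is free at runtime.
+///
+/// ```ignore
+/// // View sixteen f32 lanes as thirty-two f16 lanes without changing any bits.
+/// let h: __m512h = _mm512_castps_ph(_mm512_setzero_ps());
+/// ```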
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castps_ph(a: __m512) -> __m512h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m128h` to type `__m128`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_ps) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_castph_ps(a: __m128h) -> __m128 { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m256h` to type `__m256`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_ps) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castph_ps(a: __m256h) -> __m256 { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m512h` to type `__m512`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_ps) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castph_ps(a: __m512h) -> __m512 { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m128i` to type `__m128h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_castsi128_ph(a: __m128i) -> __m128h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m256i` to type `__m256h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castsi256_ph(a: __m256i) -> __m256h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m512i` to type `__m512h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castsi512_ph(a: __m512i) -> __m512h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m128h` to type `__m128i`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. 
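+///
+/// Illustrative sketch, not part of the original source and not compiled as a doctest; a typical
+/// use is applying integer bit operations (e.g. sign-bit masks) to half-precision data.
+///
+/// ```ignore
+/// // Reinterpret eight f16 lanes as a 128-bit integer vector.
+/// let bits: __m128i = _mm_castph_si128(_mm_set1_ph(1.0));
+/// ```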
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_si128) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_castph_si128(a: __m128h) -> __m128i { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m256h` to type `__m256i`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_si256) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castph_si256(a: __m256h) -> __m256i { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m512h` to type `__m512i`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_si512) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castph_si512(a: __m512h) -> __m512i { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m256h` to type `__m128h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph256_ph128) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castph256_ph128(a: __m256h) -> __m128h { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) } +} + +/// Cast vector of type `__m512h` to type `__m128h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph128) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castph512_ph128(a: __m512h) -> __m128h { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) } +} + +/// Cast vector of type `__m512h` to type `__m256h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph256) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castph512_ph256(a: __m512h) -> __m256h { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } +} + +/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are undefined. +/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction, +/// but most of the time it does not generate any instructions. 
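+///
+/// Illustrative sketch, not part of the original source and not compiled as a doctest. Because
+/// the upper lanes are undefined here, prefer `_mm256_zextph128_ph256` when zeroed upper lanes
+/// are required.
+///
+/// ```ignore
+/// // Widen a __m128h to a __m256h; only the lower 8 lanes are meaningful afterwards.
+/// let wide = _mm256_castph128_ph256(_mm_set1_ph(1.0));
+/// ```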
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph128_ph256) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castph128_ph256(a: __m128h) -> __m256h { + unsafe { + simd_shuffle!( + a, + _mm_undefined_ph(), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8] + ) + } +} + +/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are undefined. +/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction, +/// but most of the time it does not generate any instructions. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph128_ph512) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castph128_ph512(a: __m128h) -> __m512h { + unsafe { + simd_shuffle!( + a, + _mm_undefined_ph(), + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8 + ] + ) + } +} + +/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are undefined. +/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction, +/// but most of the time it does not generate any instructions. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph256_ph512) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castph256_ph512(a: __m256h) -> __m512h { + unsafe { + simd_shuffle!( + a, + _mm256_undefined_ph(), + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16 + ] + ) + } +} + +/// Cast vector of type `__m256h` to type `__m128h`. The upper 8 elements of the result are zeroed. +/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate +/// any instructions. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextph128_ph256) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_zextph128_ph256(a: __m128h) -> __m256h { + unsafe { + simd_shuffle!( + a, + _mm_setzero_ph(), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8] + ) + } +} + +/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed. +/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate +/// any instructions. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_zextph256_ph512(a: __m256h) -> __m512h { + unsafe { + simd_shuffle!( + a, + _mm256_setzero_ph(), + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16 + ] + ) + } +} + +/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed. 
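+///
+/// Illustrative sketch, not part of the original source and not compiled as a doctest; unlike the
+/// plain cast, the upper 24 lanes are guaranteed to be zero.
+///
+/// ```ignore
+/// // Zero-extend a __m128h into a __m512h: lanes 0..8 hold 1.0, lanes 8..32 hold 0.0.
+/// let wide = _mm512_zextph128_ph512(_mm_set1_ph(1.0));
+/// ```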
+/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
+/// any instructions.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph128_ph512)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_zextph128_ph512(a: __m128h) -> __m512h {
+    unsafe {
+        simd_shuffle!(
+            a,
+            _mm_setzero_ph(),
+            [
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+                8, 8, 8, 8
+            ]
+        )
+    }
+}
+
+macro_rules! cmp_asm { // FIXME: use LLVM intrinsics
+    ($mask_type: ty, $reg: ident, $a: expr, $b: expr) => {{
+        let dst: $mask_type;
+        asm!(
+            "vcmpph {k}, {a}, {b}, {imm8}",
+            k = lateout(kreg) dst,
+            a = in($reg) $a,
+            b = in($reg) $b,
+            imm8 = const IMM5,
+            options(pure, nomem, nostack)
+        );
+        dst
+    }};
+    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr, $b: expr) => {{
+        let dst: $mask_type;
+        asm!(
+            "vcmpph {k} {{ {mask} }}, {a}, {b}, {imm8}",
+            k = lateout(kreg) dst,
+            mask = in(kreg) $mask,
+            a = in($reg) $a,
+            b = in($reg) $b,
+            imm8 = const IMM5,
+            options(pure, nomem, nostack)
+        );
+        dst
+    }};
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cmp_ph_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        cmp_asm!(__mmask8, xmm_reg, a, b)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
+/// zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cmp_ph_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        cmp_asm!(__mmask8, k1, xmm_reg, a, b)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_cmp_ph_mask<const IMM5: i32>(a: __m256h, b: __m256h) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        cmp_asm!(__mmask16, ymm_reg, a, b)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
+/// zeroed out when the corresponding mask bit is not set).
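+///
+/// Illustrative sketch, not part of the original source; the predicate and mask values are
+/// arbitrary choices for demonstration, and the snippet is not compiled as a doctest.
+///
+/// ```ignore
+/// let a = _mm256_set1_ph(1.0);
+/// let b = _mm256_set1_ph(1.0);
+/// // Only the lanes selected by the lower half of the mask are compared; the rest report 0.
+/// let k = _mm256_mask_cmp_ph_mask::<_CMP_EQ_OS>(0b0000_0000_1111_1111, a, b);
+/// assert_eq!(k, 0b0000_0000_1111_1111);
+/// ```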
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_mask_cmp_ph_mask<const IMM5: i32>(
+    k1: __mmask16,
+    a: __m256h,
+    b: __m256h,
+) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        cmp_asm!(__mmask16, k1, ymm_reg, a, b)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cmp_ph_mask<const IMM5: i32>(a: __m512h, b: __m512h) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        cmp_asm!(__mmask32, zmm_reg, a, b)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
+/// zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cmp_ph_mask<const IMM5: i32>(
+    k1: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        cmp_asm!(__mmask32, k1, zmm_reg, a, b)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
+    a: __m512h,
+    b: __m512h,
+) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_sae!(SAE);
+        if SAE == _MM_FROUND_NO_EXC {
+            let dst: __mmask32;
+            asm!(
+                "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}",
+                k = lateout(kreg) dst,
+                a = in(zmm_reg) a,
+                b = in(zmm_reg) b,
+                imm8 = const IMM5,
+                options(pure, nomem, nostack)
+            );
+            dst
+        } else {
+            cmp_asm!(__mmask32, zmm_reg, a, b)
+        }
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
+/// zeroed out when the corresponding mask bit is not set).
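+///
+/// Illustrative sketch, not part of the original source; predicate, mask, and rounding arguments
+/// are arbitrary choices, and the snippet is not compiled as a doctest.
+///
+/// ```ignore
+/// let a = _mm512_set1_ph(2.0);
+/// let b = _mm512_set1_ph(1.0);
+/// // Greater-than compare on the lanes selected by the mask, with exceptions suppressed.
+/// let k = _mm512_mask_cmp_round_ph_mask::<_CMP_GT_OS, _MM_FROUND_NO_EXC>(0xFFFF_0000, a, b);
+/// assert_eq!(k, 0xFFFF_0000);
+/// ```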
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
+    k1: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_sae!(SAE);
+        if SAE == _MM_FROUND_NO_EXC {
+            let dst: __mmask32;
+            asm!(
+                "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}",
+                k = lateout(kreg) dst,
+                k1 = in(kreg) k1,
+                a = in(zmm_reg) a,
+                b = in(zmm_reg) b,
+                imm8 = const IMM5,
+                options(pure, nomem, nostack)
+            );
+            dst
+        } else {
+            cmp_asm!(__mmask32, k1, zmm_reg, a, b)
+        }
+    }
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by
+/// passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sh_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __mmask8 {
+    static_assert_uimm_bits!(IMM5, 5);
+    static_assert_sae!(SAE);
+    _mm_mask_cmp_round_sh_mask::<IMM5, SAE>(0xff, a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the result in mask vector k using zeromask k1. Exceptions can be
+/// suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sh_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(
+    k1: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_sae!(SAE);
+        vcmpsh(a, b, IMM5, k1, SAE)
+    }
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the result in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sh_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cmp_sh_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
+    static_assert_uimm_bits!(IMM5, 5);
+    _mm_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the result in mask vector k using zeromask k1.
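+///
+/// A minimal usage sketch, assuming `avx512fp16` support has already been verified (the
+/// operands are illustrative only):
+///
+/// ```ignore
+/// let a = _mm_set_sh(1.0);
+/// let b = _mm_set_sh(2.0);
+/// // Bit 0 of `k1` gates the comparison; here it is set, and 1.0 == 2.0 is false.
+/// assert_eq!(_mm_mask_cmp_sh_mask::<_CMP_EQ_OQ>(1, a, b), 0);
+/// assert_eq!(_mm_mask_cmp_sh_mask::<_CMP_LT_OS>(1, a, b), 1);
+/// ```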
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sh_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cmp_sh_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
+    static_assert_uimm_bits!(IMM5, 5);
+    _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and return the boolean result (0 or 1).
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_comi_round_sh<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> i32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_sae!(SAE);
+        vcomish(a, b, IMM5, SAE)
+    }
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and return the boolean result (0 or 1).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_comi_sh<const IMM5: i32>(a: __m128h, b: __m128h) -> i32 {
+    static_assert_uimm_bits!(IMM5, 5);
+    _mm_comi_round_sh::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and return
+/// the boolean result (0 or 1).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_comieq_sh(a: __m128h, b: __m128h) -> i32 {
+    _mm_comi_sh::<_CMP_EQ_OS>(a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
+/// and return the boolean result (0 or 1).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_comige_sh(a: __m128h, b: __m128h) -> i32 {
+    _mm_comi_sh::<_CMP_GE_OS>(a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
+/// the boolean result (0 or 1).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_comigt_sh(a: __m128h, b: __m128h) -> i32 {
+    _mm_comi_sh::<_CMP_GT_OS>(a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
+/// return the boolean result (0 or 1).
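+///
+/// A minimal usage sketch, assuming `avx512fp16` support has already been verified (the
+/// operands are illustrative only):
+///
+/// ```ignore
+/// let a = _mm_set_sh(1.0);
+/// let b = _mm_set_sh(2.0);
+/// assert_eq!(_mm_comile_sh(a, b), 1); // 1.0 <= 2.0
+/// assert_eq!(_mm_comile_sh(b, a), 0); // 2.0 <= 1.0 is false
+/// ```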
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_comile_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_LE_OS>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return +/// the boolean result (0 or 1). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_comilt_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_LT_OS>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return +/// the boolean result (0 or 1). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_comineq_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_NEQ_OS>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and +/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_ucomieq_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_EQ_OQ>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal, +/// and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_ucomige_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_GE_OQ>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return +/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_ucomigt_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_GT_OQ>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and +/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. 
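+///
+/// A minimal usage sketch, assuming `avx512fp16` support has already been verified; the
+/// NaN operand is illustrative and only exercises the quiet (non-signaling) behaviour:
+///
+/// ```ignore
+/// let a = _mm_set_sh(f16::NAN);
+/// let b = _mm_set_sh(1.0);
+/// // An ordered comparison involving NaN is false, and a QNaN does not raise an exception here.
+/// assert_eq!(_mm_ucomile_sh(a, b), 0);
+/// ```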
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_ucomile_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_LE_OQ>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return +/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_ucomilt_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_LT_OQ>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return +/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_ucomineq_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_NEQ_OQ>(a, b) +} + +/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into +/// a new vector. The address must be aligned to 16 bytes or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_load_ph(mem_addr: *const f16) -> __m128h { + *mem_addr.cast() +} + +/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into +/// a new vector. The address must be aligned to 32 bytes or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_load_ph(mem_addr: *const f16) -> __m256h { + *mem_addr.cast() +} + +/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into +/// a new vector. The address must be aligned to 64 bytes or a general-protection exception may be generated. 
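+///
+/// A minimal usage sketch, assuming `avx512fp16` support has already been verified; the
+/// wrapper type below exists only to guarantee the 64-byte alignment this intrinsic requires:
+///
+/// ```ignore
+/// #[repr(align(64))]
+/// struct Aligned64([f16; 32]);
+///
+/// let data = Aligned64([1.0; 32]);
+/// // Sound only because `data` is 64-byte aligned and holds 32 valid `f16` values.
+/// let v = unsafe { _mm512_load_ph(data.0.as_ptr()) };
+/// ```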
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_load_ph(mem_addr: *const f16) -> __m512h { + *mem_addr.cast() +} + +/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector, +/// and zero the upper elements +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_load_sh(mem_addr: *const f16) -> __m128h { + _mm_set_sh(*mem_addr) +} + +/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector +/// using writemask k (the element is copied from src when mask bit 0 is not set), and zero the upper elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_mask_load_sh(src: __m128h, k: __mmask8, mem_addr: *const f16) -> __m128h { + let mut dst = src; + asm!( + vpl!("vmovsh {dst}{{{k}}}"), + dst = inout(xmm_reg) dst, + k = in(kreg) k, + p = in(reg) mem_addr, + options(pure, readonly, nostack, preserves_flags) + ); + dst +} + +/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector +/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and zero the upper elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_maskz_load_sh(k: __mmask8, mem_addr: *const f16) -> __m128h { + let mut dst: __m128h; + asm!( + vpl!("vmovsh {dst}{{{k}}}{{z}}"), + dst = out(xmm_reg) dst, + k = in(kreg) k, + p = in(reg) mem_addr, + options(pure, readonly, nostack, preserves_flags) + ); + dst +} + +/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into +/// a new vector. The address does not need to be aligned to any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_loadu_ph(mem_addr: *const f16) -> __m128h { + ptr::read_unaligned(mem_addr.cast()) +} + +/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into +/// a new vector. The address does not need to be aligned to any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_loadu_ph(mem_addr: *const f16) -> __m256h { + ptr::read_unaligned(mem_addr.cast()) +} + +/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into +/// a new vector. 
The address does not need to be aligned to any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_loadu_ph(mem_addr: *const f16) -> __m512h { + ptr::read_unaligned(mem_addr.cast()) +} + +/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst +/// using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper +/// 7 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_move_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let mut mov: f16 = simd_extract!(src, 0); + if (k & 1) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) + } +} + +/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst +/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed +/// elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_move_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let mut mov: f16 = 0.; + if (k & 1) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) + } +} + +/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst, +/// and copy the upper 7 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_move_sh(a: __m128h, b: __m128h) -> __m128h { + unsafe { + let mov: f16 = simd_extract!(b, 0); + simd_insert!(a, 0, mov) + } +} + +/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory. +/// The address must be aligned to 16 bytes or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_store_ph(mem_addr: *mut f16, a: __m128h) { + *mem_addr.cast() = a; +} + +/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory. +/// The address must be aligned to 32 bytes or a general-protection exception may be generated. 
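+///
+/// A minimal usage sketch, assuming `avx512fp16` and `avx512vl` support has already been
+/// verified; the wrapper type exists only to guarantee the required 32-byte alignment:
+///
+/// ```ignore
+/// #[repr(align(32))]
+/// struct Aligned32([f16; 16]);
+///
+/// let mut out = Aligned32([0.0; 16]);
+/// let v = _mm256_set1_ph(3.0);
+/// // Sound only because `out` is 32-byte aligned and large enough for 16 `f16` values.
+/// unsafe { _mm256_store_ph(out.0.as_mut_ptr(), v) };
+/// ```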
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_store_ph(mem_addr: *mut f16, a: __m256h) { + *mem_addr.cast() = a; +} + +/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory. +/// The address must be aligned to 64 bytes or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_store_ph(mem_addr: *mut f16, a: __m512h) { + *mem_addr.cast() = a; +} + +/// Store the lower half-precision (16-bit) floating-point element from a into memory. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_store_sh(mem_addr: *mut f16, a: __m128h) { + *mem_addr = simd_extract!(a, 0); +} + +/// Store the lower half-precision (16-bit) floating-point element from a into memory using writemask k +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_mask_store_sh(mem_addr: *mut f16, k: __mmask8, a: __m128h) { + asm!( + vps!("vmovdqu16", "{{{k}}}, {src}"), + p = in(reg) mem_addr, + k = in(kreg) k, + src = in(xmm_reg) a, + options(nostack, preserves_flags) + ); +} + +/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory. +/// The address does not need to be aligned to any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_storeu_ph(mem_addr: *mut f16, a: __m128h) { + ptr::write_unaligned(mem_addr.cast(), a); +} + +/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory. +/// The address does not need to be aligned to any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_storeu_ph(mem_addr: *mut f16, a: __m256h) { + ptr::write_unaligned(mem_addr.cast(), a); +} + +/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory. +/// The address does not need to be aligned to any particular boundary. 
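+///
+/// A minimal usage sketch, assuming `avx512fp16` support has already been verified; a plain
+/// array suffices here because no particular alignment is required:
+///
+/// ```ignore
+/// let mut out = [0.0f16; 32];
+/// let v = _mm512_set1_ph(1.5);
+/// unsafe { _mm512_storeu_ph(out.as_mut_ptr(), v) };
+/// assert!(out.iter().all(|&x| x == 1.5));
+/// ```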
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_storeu_ph(mem_addr: *mut f16, a: __m512h) { + ptr::write_unaligned(mem_addr.cast(), a); +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_add_ph(a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_add(a, b) } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_add_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_add_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_add_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_add_ph(a, b); + simd_select_bitmask(k, r, _mm_setzero_ph()) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_add_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_add(a, b) } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_add_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_add_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
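+///
+/// A minimal usage sketch, assuming `avx512fp16` and `avx512vl` support has already been
+/// verified (the mask value is illustrative only):
+///
+/// ```ignore
+/// let a = _mm256_set1_ph(1.0);
+/// let b = _mm256_set1_ph(2.0);
+/// // Lanes 0..8 hold 3.0; lanes 8..16 are zeroed because their mask bits are clear.
+/// let sum = _mm256_maskz_add_ph(0x00ff, a, b);
+/// ```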
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_add_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_add_ph(a, b); + simd_select_bitmask(k, r, _mm256_setzero_ph()) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_add_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_add(a, b) } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_add_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_add_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_add_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_add_ph(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. 
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_add_round_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vaddph(a, b, ROUNDING) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_add_round_ph( + src: __m512h, + k: __mmask32, + a: __m512h, + b: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_add_round_ph::(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_add_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
+        simd_select_bitmask(k, r, _mm512_setzero_ph())
+    }
+}
+
+/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_add_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
+/// writemask k (the element is copied from src when mask bit 0 is not set).
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_add_round_sh<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vaddsh(a, b, src, k, ROUNDING)
+    }
+}
+
+/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
+/// zeromask k (the element is zeroed out when mask bit 0 is not set).
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vaddsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
+    _mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
+}
+
+/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
+/// writemask k (the element is copied from src when mask bit 0 is not set).
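+///
+/// A minimal usage sketch, assuming `avx512fp16` support has already been verified (the
+/// operands and mask are illustrative only):
+///
+/// ```ignore
+/// let src = _mm_set_sh(9.0);
+/// let a = _mm_set_sh(1.0);
+/// let b = _mm_set_sh(2.0);
+/// // Mask bit 0 is clear, so the low lane is taken from `src` (9.0) rather than 1.0 + 2.0.
+/// let r = _mm_mask_add_sh(src, 0, a, b);
+/// ```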
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_sub_ph(a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_sub(a, b) } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_sub_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_sub_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_sub_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_sub_ph(a, b); + simd_select_bitmask(k, r, _mm_setzero_ph()) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. 
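+///
+/// A minimal usage sketch, assuming `avx512fp16` and `avx512vl` support has already been
+/// verified (the operands are illustrative only):
+///
+/// ```ignore
+/// let a = _mm256_set1_ph(5.0);
+/// let b = _mm256_set1_ph(3.0);
+/// // Every lane of `diff` holds 5.0 - 3.0 = 2.0.
+/// let diff = _mm256_sub_ph(a, b);
+/// ```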
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_sub_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_sub(a, b) } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_sub_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_sub_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_sub_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_sub_ph(a, b); + simd_select_bitmask(k, r, _mm256_setzero_ph()) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_sub_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_sub(a, b) } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_sub_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_sub_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_sub_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_sub_ph(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_sub_round_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vsubph(a, b, ROUNDING) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_sub_round_ph( + src: __m512h, + k: __mmask32, + a: __m512h, + b: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_sub_round_ph::(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_sub_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
+        simd_select_bitmask(k, r, _mm512_setzero_ph())
+    }
+}
+
+/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_sub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_sub_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
+/// writemask k (the element is copied from src when mask bit 0 is not set).
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_sub_round_sh( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vsubsh(a, b, src, k, ROUNDING) + } +} + +/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set). +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_sub_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_sub_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h { + _mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b) +} + +/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// writemask k (the element is copied from src when mask bit 0 is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mul_ph(a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_mul(a, b) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_mul_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_mul_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_mul_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_mul_ph(a, b); + simd_select_bitmask(k, r, _mm_setzero_ph()) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. 
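+///
+/// A minimal usage sketch, assuming `avx512fp16` and `avx512vl` support has already been
+/// verified (the operands are illustrative only):
+///
+/// ```ignore
+/// let a = _mm256_set1_ph(4.0);
+/// let b = _mm256_set1_ph(0.5);
+/// // Every lane of `prod` holds 4.0 * 0.5 = 2.0.
+/// let prod = _mm256_mul_ph(a, b);
+/// ```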
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mul_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_mul(a, b) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_mul_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_mul_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_mul_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_mul_ph(a, b); + simd_select_bitmask(k, r, _mm256_setzero_ph()) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mul_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_mul(a, b) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_mul_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_mul_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
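+///
+/// A minimal usage sketch, assuming `avx512fp16` support has already been verified (the
+/// mask value is illustrative only):
+///
+/// ```ignore
+/// let a = _mm512_set1_ph(2.0);
+/// let b = _mm512_set1_ph(3.0);
+/// // The low 16 lanes hold 6.0; the high 16 lanes are zeroed by the mask.
+/// let prod = _mm512_maskz_mul_ph(0x0000ffff, a, b);
+/// ```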
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vmulph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_mul_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
+    unsafe {
+        let r = _mm512_mul_ph(a, b);
+        simd_select_bitmask(k, r, _mm512_setzero_ph())
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mul_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vmulph(a, b, ROUNDING)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
+/// writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_mul_round_ph<const ROUNDING: i32>(
+    src: __m512h,
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
+        simd_select_bitmask(k, r, src)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
+/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_mul_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
+        simd_select_bitmask(k, r, _mm512_setzero_ph())
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mul_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
+/// writemask k (the element is copied from src when mask bit 0 is not set).
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_mul_round_sh<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vmulsh(a, b, src, k, ROUNDING)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
+/// zeromask k (the element is zeroed out when mask bit 0 is not set).
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vmulsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
+/// writemask k (the element is copied from src when mask bit 0 is not set).
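+///
+/// A short usage sketch added for clarity (not from Intel's documentation); it assumes a nightly
+/// toolchain with `#![feature(stdarch_x86_avx512_f16)]` and a CPU with AVX512-FP16:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512fp16")]
+/// fn scalar_masked_mul(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+///     // Only mask bit 0 matters: element 0 of the result is `a[0] * b[0]` when the bit is
+///     // set and `src[0]` otherwise; elements 1..=7 are always copied from `a`.
+///     _mm_mask_mul_sh(src, k, a, b)
+/// }
+/// ```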
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_div_ph(a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_div(a, b) } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_div_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_div_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_div_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_div_ph(a, b); + simd_select_bitmask(k, r, _mm_setzero_ph()) + } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_div_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_div(a, b) } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_div_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_div_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_div_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_div_ph(a, b); + simd_select_bitmask(k, r, _mm256_setzero_ph()) + } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_div_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_div(a, b) } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_div_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_div_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vdivph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_div_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
+    unsafe {
+        let r = _mm512_div_ph(a, b);
+        simd_select_bitmask(k, r, _mm512_setzero_ph())
+    }
+}
+
+/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_div_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vdivph(a, b, ROUNDING)
+    }
+}
+
+/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
+/// writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_div_round_ph<const ROUNDING: i32>(
+    src: __m512h,
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
+        simd_select_bitmask(k, r, src)
+    }
+}
+
+/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
+/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_div_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
+        simd_select_bitmask(k, r, _mm512_setzero_ph())
+    }
+}
+
+/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_div_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
+/// writemask k (the element is copied from src when mask bit 0 is not set).
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_div_round_sh<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vdivsh(a, b, src, k, ROUNDING)
+    }
+}
+
+/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
+/// zeromask k (the element is zeroed out when mask bit 0 is not set).
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_div_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vdivsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
+    _mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
+}
+
+/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
+/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
+/// writemask k (the element is copied from src when mask bit 0 is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_pch(_mm_undefined_ph(), 0xff, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { transmute(vfmulcph_128(transmute(a), transmute(b), transmute(src), k)) } +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_pch(_mm_setzero_ph(), k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst. 
Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_mul_pch(_mm256_undefined_ph(), 0xff, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + unsafe { transmute(vfmulcph_256(transmute(a), transmute(b), transmute(src), k)) } +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_mul_pch(_mm256_setzero_ph(), k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_mul_pch(_mm512_undefined_ph(), 0xffff, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
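+///
+/// A short usage sketch added for clarity (not from Intel's documentation); it assumes a nightly
+/// toolchain with `#![feature(stdarch_x86_avx512_f16)]` and a CPU with AVX512-FP16:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512fp16")]
+/// fn masked_complex_mul(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
+///     // Each of the 16 mask bits selects one complex number, i.e. one pair of f16 lanes.
+///     // A selected pair receives the complex product
+///     //   re = a.re * b.re - a.im * b.im
+///     //   im = a.re * b.im + a.im * b.re
+///     // and an unselected pair is copied from `src`.
+///     _mm512_mask_mul_pch(src, k, a, b)
+/// }
+/// ```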
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
+    _mm512_mask_mul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
+/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
+    _mm512_mask_mul_pch(_mm512_setzero_ph(), k, a, b)
+}
+
+/// Multiply the packed complex numbers in a and b, and store the results in dst. Each complex number is
+/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
+/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
+}
+
+/// Multiply the packed complex numbers in a and b, and store the results in dst using writemask k (the element
+/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_mul_round_pch<const ROUNDING: i32>(
+    src: __m512h,
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vfmulcph_512(
+            transmute(a),
+            transmute(b),
+            transmute(src),
+            k,
+            ROUNDING,
+        ))
+    }
+}
+
+/// Multiply the packed complex numbers in a and b, and store the results in dst using zeromask k (the element
+/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_mul_round_pch<const ROUNDING: i32>(
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
+}
+
+/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
+/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
+/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
+/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
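+///
+/// A short usage sketch added for clarity (not from Intel's documentation); it assumes a nightly
+/// toolchain with `#![feature(stdarch_x86_avx512_f16)]` and a CPU with AVX512-FP16:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512fp16")]
+/// fn scalar_complex_mul(a: __m128h, b: __m128h) -> __m128h {
+///     // Only the complex number held in lanes 0..=1 is multiplied; lanes 2..=7 of the
+///     // result are copied from `a` unchanged.
+///     _mm_mul_sch(a, b)
+/// }
+/// ```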
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using +/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed +/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements +/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst, +/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
+/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
+/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_mul_round_sch<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vfmulcsh(
+            transmute(a),
+            transmute(b),
+            transmute(src),
+            k,
+            ROUNDING,
+        ))
+    }
+}
+
+/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
+/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
+/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_mul_round_sch<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
+/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
+/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mul_pch(a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
+/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_mul_pch(src, k, a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
+/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_maskz_mul_pch(k, a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
+/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
+/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h { + _mm256_mul_pch(a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_mul_pch(src, k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_maskz_mul_pch(k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed +/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h { + _mm512_mul_pch(a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_mul_pch(src, k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when corresponding mask bit is not set). 
Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_maskz_mul_pch(k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed +/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmul_round_pch(a: __m512h, b: __m512h) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_mul_round_pch::(a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmul_round_pch( + src: __m512h, + k: __mmask16, + a: __m512h, + b: __m512h, +) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_mask_mul_round_pch::(src, k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when corresponding mask bit is not set). 
Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_fmul_round_pch( + k: __mmask16, + a: __m512h, + b: __m512h, +) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_maskz_mul_round_pch::(k, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h { + _mm_mul_sch(a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_sch(src, k, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_maskz_mul_sch(k, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the results in dst. 
Each complex number is composed +/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fmul_round_sch(a: __m128h, b: __m128h) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mul_round_sch::(a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fmul_round_sch( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_mul_round_sch::(src, k, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fmul_round_sch<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_maskz_mul_round_sch::<ROUNDING>(k, a, b)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
+/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfcmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_cmul_pch(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfcmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    unsafe { transmute(vfcmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
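// For reference, the arithmetic carried out per 32-bit lane: the `fmul`/`mul`
// intrinsics compute the ordinary complex product, while the `cmul` family just
// introduced multiplies by the conjugate of `b`. A scalar sketch with `f32`
// standing in for the fp16 elements (function names are illustrative only):
fn mul_lane(a: (f32, f32), b: (f32, f32)) -> (f32, f32) {
    // (a.0 + i*a.1) * (b.0 + i*b.1)
    (a.0 * b.0 - a.1 * b.1, a.0 * b.1 + a.1 * b.0)
}

fn cmul_lane(a: (f32, f32), b: (f32, f32)) -> (f32, f32) {
    // (a.0 + i*a.1) * (b.0 - i*b.1), i.e. multiply by the conjugate of b
    (a.0 * b.0 + a.1 * b.1, a.1 * b.0 - a.0 * b.1)
}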
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_cmul_pch(_mm_setzero_ph(), k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_cmul_pch(_mm256_undefined_ph(), 0xff, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + unsafe { transmute(vfcmulcph_256(transmute(a), transmute(b), transmute(src), k)) } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_cmul_pch(_mm256_setzero_ph(), k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_cmul_pch(_mm512_undefined_ph(), 0xffff, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_cmul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_cmul_pch(_mm512_setzero_ph(), k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cmul_round_pch<const ROUNDING: i32>(
+    src: __m512h,
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vfcmulcph_512(
+            transmute(a),
+            transmute(b),
+            transmute(src),
+            k,
+            ROUNDING,
+        ))
+    }
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
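// All `_round_` variants take `ROUNDING` as a const generic built by OR-ing the
// `_MM_FROUND_*` flags listed above, and `static_assert_rounding!` is there to
// reject other values at compile time. A minimal sketch of why the
// `assert_instr(..., ROUNDING = 8)` attributes use the value 8, assuming the usual
// x86 flag encodings (`_MM_FROUND_TO_NEAREST_INT` = 0x00, `_MM_FROUND_NO_EXC` = 0x08):
const ROUND_NEAREST_NO_EXC: i32 = 0x00 | 0x08; // _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC
const _: () = assert!(ROUND_NEAREST_NO_EXC == 8); // matches `ROUNDING = 8` in the attributes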
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cmul_round_pch( + k: __mmask16, + a: __m512h, + b: __m512h, +) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_mask_cmul_round_pch::(_mm512_setzero_ph(), k, a, b) +} + +/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, +/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, +/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_cmul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, +/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set). 
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, +/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cmul_round_sch(a: __m128h, b: __m128h) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_cmul_round_sch::(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, +/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cmul_round_sch<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vfcmulcsh(
+            transmute(a),
+            transmute(b),
+            transmute(src),
+            k,
+            ROUNDING,
+        ))
+    }
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cmul_round_sch<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_cmul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
+/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h { + _mm_cmul_pch(a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_cmul_pch(src, k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_maskz_cmul_pch(k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h { + _mm256_cmul_pch(a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_cmul_pch(src, k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_maskz_cmul_pch(k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h { + _mm512_cmul_pch(a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_cmul_pch(src, k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_maskz_cmul_pch(k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fcmul_round_pch(a: __m512h, b: __m512h) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_cmul_round_pch::(a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fcmul_round_pch( + src: __m512h, + k: __mmask16, + a: __m512h, + b: __m512h, +) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_mask_cmul_round_pch::(src, k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). 
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_fcmul_round_pch( + k: __mmask16, + a: __m512h, + b: __m512h, +) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_maskz_cmul_round_pch::(k, a, b) +} + +/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, +/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h { + _mm_cmul_sch(a, b) +} + +/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, +/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_cmul_sch(src, k, a, b) +} + +/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, +/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_maskz_cmul_sch(k, a, b) +} + +/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, +/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fcmul_round_sch(a: __m128h, b: __m128h) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_cmul_round_sch::(a, b) +} + +/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, +/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fcmul_round_sch( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_cmul_round_sch::(src, k, a, b) +} + +/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, +/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set). 
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fcmul_round_sch( + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_maskz_cmul_round_sch::(k, a, b) +} + +/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing +/// the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_abs_ph(v2: __m128h) -> __m128h { + unsafe { transmute(_mm_and_si128(transmute(v2), _mm_set1_epi16(i16::MAX))) } +} + +/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing +/// the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_abs_ph(v2: __m256h) -> __m256h { + unsafe { transmute(_mm256_and_si256(transmute(v2), _mm256_set1_epi16(i16::MAX))) } +} + +/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing +/// the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_abs_ph(v2: __m512h) -> __m512h { + unsafe { transmute(_mm512_and_si512(transmute(v2), _mm512_set1_epi16(i16::MAX))) } +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex +/// number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines +/// the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate +/// `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
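// `_mm_abs_ph` and the wider variants above exploit the IEEE binary16 layout:
// the sign is bit 15 of each element, so AND-ing every 16-bit lane with
// `i16::MAX` (0x7FFF) clears the sign and leaves the magnitude untouched.
// The same trick on a single raw fp16 bit pattern (illustrative helper name):
fn abs_f16_bits(bits: u16) -> u16 {
    bits & 0x7FFF // clear the sign bit; exponent and mantissa are unchanged
}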
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_conj_pch(a: __m128h) -> __m128h { + unsafe { transmute(_mm_xor_si128(transmute(a), _mm_set1_epi32(i32::MIN))) } +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k +/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two +/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number +/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_conj_pch(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { + let r: __m128 = transmute(_mm_conj_pch(a)); + transmute(simd_select_bitmask(k, r, transmute(src))) + } +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k +/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_conj_pch(k: __mmask8, a: __m128h) -> __m128h { + _mm_mask_conj_pch(_mm_setzero_ph(), k, a) +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number +/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_conj_pch(a: __m256h) -> __m256h { + unsafe { transmute(_mm256_xor_si256(transmute(a), _mm256_set1_epi32(i32::MIN))) } +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k +/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two +/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
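// `_mm_conj_pch` above conjugates by XOR-ing each 32-bit complex lane with
// `i32::MIN` (0x8000_0000). Within a lane, `vec.fp16[1]` (the imaginary part)
// occupies the upper 16 bits, so flipping bit 31 negates exactly that element
// and leaves the real part alone. A scalar sketch on one packed lane
// (lower half = real bits, upper half = imaginary bits; name is illustrative):
fn conj_lane_bits(lane: u32) -> u32 {
    lane ^ 0x8000_0000 // flip only the sign bit of the imaginary element
}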
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_conj_pch(src: __m256h, k: __mmask8, a: __m256h) -> __m256h { + unsafe { + let r: __m256 = transmute(_mm256_conj_pch(a)); + transmute(simd_select_bitmask(k, r, transmute(src))) + } +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k +/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_conj_pch(k: __mmask8, a: __m256h) -> __m256h { + _mm256_mask_conj_pch(_mm256_setzero_ph(), k, a) +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number +/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_conj_pch(a: __m512h) -> __m512h { + unsafe { transmute(_mm512_xor_si512(transmute(a), _mm512_set1_epi32(i32::MIN))) } +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k +/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two +/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_conj_pch(src: __m512h, k: __mmask16, a: __m512h) -> __m512h { + unsafe { + let r: __m512 = transmute(_mm512_conj_pch(a)); + transmute(simd_select_bitmask(k, r, transmute(src))) + } +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k +/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_conj_pch(k: __mmask16, a: __m512h) -> __m512h { + _mm512_mask_conj_pch(_mm512_setzero_ph(), k, a) +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + _mm_mask3_fmadd_pch(a, b, c, 0xff) +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using writemask k (the element is copied from a when the corresponding +/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let r: __m128 = transmute(_mm_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does + transmute(simd_select_bitmask(k, r, transmute(a))) + } +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using writemask k (the element is copied from c when the corresponding +/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + transmute(vfmaddcph_mask3_128( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask +/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point +/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
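// The `fmadd_pch` family computes `a * b + c` per complex lane; the masked forms
// differ only in what an unselected lane falls back to: `_mask_` keeps the element
// from `a`, `_mask3_` keeps it from `c`, and `_maskz_` zeroes it. A scalar sketch
// of the accumulation itself, with `f32` standing in for the fp16 elements
// (helper name is illustrative):
fn fmadd_lane(a: (f32, f32), b: (f32, f32), c: (f32, f32)) -> (f32, f32) {
    (
        a.0 * b.0 - a.1 * b.1 + c.0, // real part of a*b, accumulated into c
        a.0 * b.1 + a.1 * b.0 + c.1, // imaginary part of a*b, accumulated into c
    )
}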
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + transmute(vfmaddcph_maskz_128( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + _mm256_mask3_fmadd_pch(a, b, c, 0xff) +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask +/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point +/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h { + unsafe { + let r: __m256 = transmute(_mm256_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does + transmute(simd_select_bitmask(k, r, transmute(a))) + } +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using writemask k (the element is copied from c when the corresponding +/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h { + unsafe { + transmute(vfmaddcph_mask3_256( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask +/// bit is not set). 
Each complex number is composed of two adjacent half-precision (16-bit) floating-point +/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { + transmute(vfmaddcph_maskz_256( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + _mm512_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c) +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask +/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point +/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h { + _mm512_mask_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c) +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using writemask k (the element is copied from c when the corresponding +/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask3_fmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h { + _mm512_mask3_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k) +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask +/// bit is not set). 
Each complex number is composed of two adjacent half-precision (16-bit) floating-point +/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_fmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h { + _mm512_maskz_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c) +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, 0xffff) +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask +/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point +/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmadd_round_pch<const ROUNDING: i32>( + a: __m512h, + k: __mmask16, + b: __m512h, + c: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + let r: __m512 = transmute(_mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what CLang does + transmute(simd_select_bitmask(k, r, transmute(a))) + } +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using writemask k (the element is copied from c when the corresponding +/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask3_fmadd_round_pch<const ROUNDING: i32>( + a: __m512h, + b: __m512h, + c: __m512h, + k: __mmask16, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfmaddcph_mask3_512( + transmute(a), + transmute(b), + transmute(c), + k, + ROUNDING, + )) + } +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask +/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point +/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_fmadd_round_pch<const ROUNDING: i32>( + k: __mmask16, + a: __m512h, + b: __m512h, + c: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfmaddcph_maskz_512( + transmute(a), + transmute(b), + transmute(c), + k, + ROUNDING, + )) + } +} + +/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and +/// store the result in the lower elements of dst, and copy the upper 6 packed elements from a to the +/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + _mm_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c) +} + +/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and +/// store the result in the lower elements of dst using writemask k (elements are copied from a when +/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, +/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + _mm_mask_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c) +} + +/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and +/// store the result in the lower elements of dst using writemask k (elements are copied from c when +/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, +/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
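For the `_sch` scalar complex forms only the lowest complex number participates, and the remaining six fp16 lanes are passed through from `a`. A minimal editorial sketch (not part of the patch), again modelling fp16 lanes as `f32`, with an invented function name:

```rust
// Editorial sketch, not part of the patch: only the lowest (re, im) pair is
// computed; lanes 2..8 of dst are copied from `a`, hence the "upper 6 packed
// elements" wording in the docs above.
fn fmadd_sch_model(a: [f32; 8], b: [f32; 8], c: [f32; 8]) -> [f32; 8] {
    let mut dst = a; // upper six lanes come straight from `a`
    dst[0] = a[0] * b[0] - a[1] * b[1] + c[0]; // real part of a*b + c
    dst[1] = a[0] * b[1] + a[1] * b[0] + c[1]; // imaginary part of a*b + c
    dst
}
```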
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + _mm_mask3_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k) +} + +/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and +/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask +/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each +/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + _mm_maskz_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c) +} + +/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and +/// store the result in the lower elements of dst. Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfmaddcsh_mask( + transmute(a), + transmute(b), + transmute(c), + 0xff, + ROUNDING, + )) + } +} + +/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and +/// store the result in the lower elements of dst using writemask k (elements are copied from a when +/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, +/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fmadd_round_sch<const ROUNDING: i32>( + a: __m128h, + k: __mmask8, + b: __m128h, + c: __m128h, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + let a = transmute(a); + let r = vfmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); // using `0xff` would have been fine here, but this is what CLang does + transmute(_mm_mask_move_ss(a, k, a, r)) + } +} + +/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and +/// store the result in the lower elements of dst using writemask k (elements are copied from c when +/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, +/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmadd_round_sch<const ROUNDING: i32>( + a: __m128h, + b: __m128h, + c: __m128h, + k: __mmask8, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + let c = transmute(c); + let r = vfmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING); + transmute(_mm_move_ss(c, r)) + } +} + +/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and +/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask +/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each +/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmadd_round_sch<const ROUNDING: i32>( + k: __mmask8, + a: __m128h, + b: __m128h, + c: __m128h, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfmaddcsh_maskz( + transmute(a), + transmute(b), + transmute(c), + k, + ROUNDING, + )) + } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed +/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number +/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + _mm_mask3_fcmadd_pch(a, b, c, 0xff) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is +/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fcmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let r: __m128 = transmute(_mm_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does + transmute(simd_select_bitmask(k, r, transmute(a))) + } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is +/// copied from c when the corresponding mask bit is not set).
Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + transmute(vfcmaddcph_mask3_128( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is +/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fcmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + transmute(vfcmaddcph_maskz_128( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed +/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number +/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + _mm256_mask3_fcmadd_pch(a, b, c, 0xff) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is +/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
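The `fcmadd` family differs from `fmadd` only in that the second operand is conjugated before the multiply, so the product is `a * conj(b)`. A one-complex-number editorial sketch (not part of the patch, names invented, `f32` standing in for `f16`):

```rust
// Editorial sketch, not part of the patch: conj(b) = b.re - i*b.im, so
// dst = a * conj(b) + c expands to the two expressions below.
fn cfcmadd(a: (f32, f32), b: (f32, f32), c: (f32, f32)) -> (f32, f32) {
    (
        a.0 * b.0 + a.1 * b.1 + c.0, // real part
        a.1 * b.0 - a.0 * b.1 + c.1, // imaginary part
    )
}
```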
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fcmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h { + unsafe { + let r: __m256 = transmute(_mm256_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does + transmute(simd_select_bitmask(k, r, transmute(a))) + } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is +/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h { + unsafe { + transmute(vfcmaddcph_mask3_256( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is +/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fcmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { + transmute(vfcmaddcph_maskz_256( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed +/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number +/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + _mm512_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is +/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fcmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h { + _mm512_mask_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is +/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask3_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h { + _mm512_mask3_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is +/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_fcmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h { + _mm512_maskz_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed +/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number +/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fcmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, 0xffff) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is +/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fcmadd_round_pch<const ROUNDING: i32>( + a: __m512h, + k: __mmask16, + b: __m512h, + c: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + let r: __m512 = transmute(_mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what CLang does + transmute(simd_select_bitmask(k, r, transmute(a))) + } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c using writemask k (the element is copied from c when the corresponding +/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex +/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask3_fcmadd_round_pch<const ROUNDING: i32>( + a: __m512h, + b: __m512h, + c: __m512h, + k: __mmask16, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfcmaddcph_mask3_512( + transmute(a), + transmute(b), + transmute(c), + k, + ROUNDING, + )) + } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c using zeromask k (the element is zeroed out when the corresponding +/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex +/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_fcmadd_round_pch<const ROUNDING: i32>( + k: __mmask16, + a: __m512h, + b: __m512h, + c: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfcmaddcph_maskz_512( + transmute(a), + transmute(b), + transmute(c), + k, + ROUNDING, + )) + } +} + +/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, +/// accumulate to the lower complex number in c, and store the result in the lower elements of dst, +/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + _mm_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c) +} + +/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, +/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using +/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper +/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fcmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + _mm_mask_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c) +} + +/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, +/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using +/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper +/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + _mm_mask3_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k) +} + +/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, +/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using +/// zeromask k (the element is zeroed out when the corresponding mask bit is not set), and copy the upper +/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fcmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + _mm_maskz_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c) +} + +/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, +/// accumulate to the lower complex number in c, and store the result in the lower elements of dst, +/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fcmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfcmaddcsh_mask( + transmute(a), + transmute(b), + transmute(c), + 0xff, + ROUNDING, + )) + } +} + +/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, +/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using +/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper +/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fcmadd_round_sch<const ROUNDING: i32>( + a: __m128h, + k: __mmask8, + b: __m128h, + c: __m128h, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + let a = transmute(a); + let r = vfcmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); + transmute(_mm_mask_move_ss(a, k, a, r)) + } +} + +/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, +/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using +/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper +/// 6 packed elements from a to the upper elements of dst.
Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fcmadd_round_sch<const ROUNDING: i32>( + a: __m128h, + b: __m128h, + c: __m128h, + k: __mmask8, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + let c = transmute(c); + let r = vfcmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING); + transmute(_mm_move_ss(c, r)) + } +} + +/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, +/// accumulate to the lower complex number in c using zeromask k (the element is zeroed out when the corresponding +/// mask bit is not set), and store the result in the lower elements of dst, and copy the upper 6 packed elements +/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex +/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fcmadd_round_sch<const ROUNDING: i32>( + k: __mmask8, + a: __m128h, + b: __m128h, + c: __m128h, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfcmaddcsh_maskz( + transmute(a), + transmute(b), + transmute(c), + k, + ROUNDING, + )) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_fma(a, b, c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), _mm_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_fma(a, b, c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). 
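Unlike the `pch`/`sch` complex forms above, the plain `fmadd_ph` family is an ordinary per-lane fused multiply-add, and the mask is per fp16 element. A short editorial model (not part of the patch) of the masked selection for the 8-lane `__m128h` case, with `src` standing for `a` (`mask_`), `c` (`mask3_`) or zeros (`maskz_`):

```rust
// Editorial sketch, not part of the patch: per-lane masked FMA, modelled in f32.
fn mask_fmadd_ph_model(k: u8, a: [f32; 8], b: [f32; 8], c: [f32; 8], src: [f32; 8]) -> [f32; 8] {
    core::array::from_fn(|i| {
        if (k >> i) & 1 == 1 {
            a[i].mul_add(b[i], c[i]) // fused multiply-add on the selected lane
        } else {
            src[i] // unselected lanes come from src
        }
    })
}
```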
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), _mm256_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_fma(a, b, c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask3_fmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_fmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), _mm512_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddph_512(a, b, c, ROUNDING) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set).
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmadd_round_ph<const ROUNDING: i32>( + a: __m512h, + k: __mmask32, + b: __m512h, + c: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), a) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask3_fmadd_round_ph<const ROUNDING: i32>( + a: __m512h, + b: __m512h, + c: __m512h, + k: __mmask32, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), c) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fmadd_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(
+            k,
+            _mm512_fmadd_round_ph::<ROUNDING>(a, b, c),
+            _mm512_setzero_ph(),
+        )
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
+/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
+/// 7 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        let extracta: f16 = simd_extract!(a, 0);
+        let extractb: f16 = simd_extract!(b, 0);
+        let extractc: f16 = simd_extract!(c, 0);
+        let r = fmaf16(extracta, extractb, extractc);
+        simd_insert!(a, 0, r)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
+/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
+/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        let mut fmadd: f16 = simd_extract!(a, 0);
+        if k & 1 != 0 {
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fmadd = fmaf16(fmadd, extractb, extractc);
+        }
+        simd_insert!(a, 0, fmadd)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
+/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
+/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
+/// upper elements of dst.
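+///
+/// A minimal usage sketch with hypothetical values (assumes a nightly toolchain with the unstable
+/// `stdarch_x86_avx512_f16` feature enabled and `avx512fp16` support detected at runtime):
+///
+/// ```ignore
+/// let a = _mm_set1_ph(2.0);
+/// let b = _mm_set1_ph(3.0);
+/// let c = _mm_set1_ph(1.0);
+/// // Bit 0 of the mask is set, so lane 0 becomes 2.0 * 3.0 + 1.0 = 7.0;
+/// // lanes 1..=7 are copied from `c`. With mask 0, `c` would be returned unchanged.
+/// let r = _mm_mask3_fmadd_sh(a, b, c, 1);
+/// ```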
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask3_fmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
+    unsafe {
+        let mut fmadd: f16 = simd_extract!(c, 0);
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            fmadd = fmaf16(extracta, extractb, fmadd);
+        }
+        simd_insert!(c, 0, fmadd)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
+/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        let mut fmadd: f16 = 0.0;
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fmadd = fmaf16(extracta, extractb, extractc);
+        }
+        simd_insert!(a, 0, fmadd)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
+/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
+/// 7 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let extracta: f16 = simd_extract!(a, 0);
+        let extractb: f16 = simd_extract!(b, 0);
+        let extractc: f16 = simd_extract!(c, 0);
+        let r = vfmaddsh(extracta, extractb, extractc, ROUNDING);
+        simd_insert!(a, 0, r)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
+/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
+/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmadd_round_sh<const ROUNDING: i32>(
+    a: __m128h,
+    k: __mmask8,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fmadd: f16 = simd_extract!(a, 0);
+        if k & 1 != 0 {
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fmadd = vfmaddsh(fmadd, extractb, extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fmadd)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
+/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
+/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
+/// upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask3_fmadd_round_sh<const ROUNDING: i32>(
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+    k: __mmask8,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fmadd: f16 = simd_extract!(c, 0);
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            fmadd = vfmaddsh(extracta, extractb, fmadd, ROUNDING);
+        }
+        simd_insert!(c, 0, fmadd)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
+/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fmadd_round_sh<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fmadd: f16 = 0.0;
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fmadd = vfmaddsh(extracta, extractb, extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fmadd)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { simd_fma(a, b, simd_neg(c)) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
+/// from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), a) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
+/// from c when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), _mm_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), _mm256_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask3_fmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_fmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), _mm512_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst. 
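+///
+/// A minimal usage sketch with hypothetical values (assumes a nightly toolchain with the unstable
+/// `stdarch_x86_avx512_f16` feature enabled and `avx512fp16` support detected at runtime):
+///
+/// ```ignore
+/// // Each lane computes a * b - c, here 2.0 * 3.0 - 1.0 = 5.0.
+/// let r = _mm512_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+///     _mm512_set1_ph(2.0),
+///     _mm512_set1_ph(3.0),
+///     _mm512_set1_ph(1.0),
+/// );
+/// ```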
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_fmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vfmaddph_512(a, b, simd_neg(c), ROUNDING)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
+/// from a when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_fmsub_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    k: __mmask32,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), a)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
+/// from c when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fmsub_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+    k: __mmask32,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), c)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
+/// out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fmsub_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(
+            k,
+            _mm512_fmsub_round_ph::<ROUNDING>(a, b, c),
+            _mm512_setzero_ph(),
+        )
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
+/// in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
+/// 7 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        let extracta: f16 = simd_extract!(a, 0);
+        let extractb: f16 = simd_extract!(b, 0);
+        let extractc: f16 = simd_extract!(c, 0);
+        let r = fmaf16(extracta, extractb, -extractc);
+        simd_insert!(a, 0, r)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
+/// in c from the intermediate result.
Store the result in the lower element of dst using writemask k (the element +/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the +/// upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fmsub: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fmsub = fmaf16(fmsub, extractb, -extractc); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements +/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element +/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the +/// upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + let mut fmsub: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + fmsub = fmaf16(extracta, extractb, -fmsub); + } + simd_insert!(c, 0, fmsub) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements +/// in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element +/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the +/// upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fmsub: f16 = 0.0; + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fmsub = fmaf16(extracta, extractb, -extractc); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements +/// in c from the intermediate result. Store the result in the lower element of dst, and copy the upper +/// 7 packed elements from a to the upper elements of dst. 
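+///
+/// A minimal usage sketch with hypothetical values (assumes a nightly toolchain with the unstable
+/// `stdarch_x86_avx512_f16` feature enabled and `avx512fp16` support detected at runtime):
+///
+/// ```ignore
+/// let a = _mm_set1_ph(2.0);
+/// let b = _mm_set1_ph(3.0);
+/// let c = _mm_set1_ph(1.0);
+/// // The rounding mode is a const generic, passed here with turbofish syntax;
+/// // lane 0 becomes 2.0 * 3.0 - 1.0 = 5.0 and lanes 1..=7 are copied from `a`.
+/// let r = _mm_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+/// ```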
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let extracta: f16 = simd_extract!(a, 0);
+        let extractb: f16 = simd_extract!(b, 0);
+        let extractc: f16 = simd_extract!(c, 0);
+        let r = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
+        simd_insert!(a, 0, r)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
+/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
+/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmsub_round_sh<const ROUNDING: i32>(
+    a: __m128h,
+    k: __mmask8,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fmsub: f16 = simd_extract!(a, 0);
+        if k & 1 != 0 {
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fmsub = vfmaddsh(fmsub, extractb, -extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fmsub)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
+/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
+/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
+/// upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask3_fmsub_round_sh<const ROUNDING: i32>(
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+    k: __mmask8,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fmsub: f16 = simd_extract!(c, 0);
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            fmsub = vfmaddsh(extracta, extractb, -fmsub, ROUNDING);
+        }
+        simd_insert!(c, 0, fmsub)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
+/// in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fmsub_round_sh<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fmsub: f16 = 0.0;
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fmsub = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fmsub)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
+/// result from packed elements in c, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { simd_fma(simd_neg(a), b, c) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
+/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
+/// from a when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fnmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fnmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), _mm_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_fma(simd_neg(a), b, c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fnmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fnmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), _mm256_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_fma(simd_neg(a), b, c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fnmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask3_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). 
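+///
+/// A minimal usage sketch with hypothetical values (assumes a nightly toolchain with the unstable
+/// `stdarch_x86_avx512_f16` feature enabled and `avx512fp16` support detected at runtime):
+///
+/// ```ignore
+/// let a = _mm512_set1_ph(2.0);
+/// let b = _mm512_set1_ph(3.0);
+/// let c = _mm512_set1_ph(1.0);
+/// // fnmadd computes `-(a * b) + c`, here 1.0 - 6.0 = -5.0, in lanes whose mask bit is set;
+/// // all other lanes are zeroed.
+/// let r = _mm512_maskz_fnmadd_ph(0x0000_ffff, a, b, c);
+/// ```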
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fnmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), _mm512_setzero_ph()) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
+/// result from packed elements in c, and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_fnmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vfmaddph_512(simd_neg(a), b, c, ROUNDING)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
+/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
+/// from a when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_fnmadd_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    k: __mmask32,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), a)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
+/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
+/// from c when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fnmadd_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+    k: __mmask32,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), c)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
+/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
+/// out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fnmadd_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(
+            k,
+            _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c),
+            _mm512_setzero_ph(),
+        )
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + let r = fmaf16(-extracta, extractb, extractc); + simd_insert!(a, 0, r) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate +/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element +/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper +/// elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fnmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fnmadd: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fnmadd = fmaf16(-fnmadd, extractb, extractc); + } + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate +/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element +/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper +/// elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + let mut fnmadd: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + fnmadd = fmaf16(-extracta, extractb, fnmadd); + } + simd_insert!(c, 0, fnmadd) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate +/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element +/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper +/// elements of dst. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fnmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        let mut fnmadd: f16 = 0.0;
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fnmadd = fmaf16(-extracta, extractb, extractc);
+        }
+        simd_insert!(a, 0, fnmadd)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fnmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let extracta: f16 = simd_extract!(a, 0);
+        let extractb: f16 = simd_extract!(b, 0);
+        let extractc: f16 = simd_extract!(c, 0);
+        let r = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
+        simd_insert!(a, 0, r)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
+/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fnmadd_round_sh<const ROUNDING: i32>(
+    a: __m128h,
+    k: __mmask8,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmadd: f16 = simd_extract!(a, 0);
+        if k & 1 != 0 {
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fnmadd = vfmaddsh(-fnmadd, extractb, extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fnmadd)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
+/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask3_fnmadd_round_sh<const ROUNDING: i32>(
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+    k: __mmask8,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmadd: f16 = simd_extract!(c, 0);
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            fnmadd = vfmaddsh(-extracta, extractb, fnmadd, ROUNDING);
+        }
+        simd_insert!(c, 0, fnmadd)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fnmadd_round_sh<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmadd: f16 = 0.0;
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fnmadd = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fnmadd)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the negated intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
+/// copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fnmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), a) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
+/// copied from c when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is +/// zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fnmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), _mm_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is +/// copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fnmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is +/// copied from c when the corresponding mask bit is not set). 
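Illustrative aside, not part of the patch: the mask/maskz variants above all reduce to a per-lane selection in which bit i of the mask chooses between the computed lane and a passthrough lane (a, c, or zero depending on the intrinsic). A minimal scalar sketch of that selection, with f32 standing in for the still-unstable f16 type:

fn select_bitmask_ref(k: u8, computed: &[f32], passthrough: &[f32]) -> Vec<f32> {
    // Bit i of `k` set -> keep the computed lane; clear -> keep the passthrough lane.
    computed
        .iter()
        .zip(passthrough)
        .enumerate()
        .map(|(i, (&x, &p))| if (k >> i) & 1 != 0 { x } else { p })
        .collect()
}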
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is +/// zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fnmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), _mm256_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is +/// copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fnmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is +/// copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask3_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is +/// zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fnmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), _mm512_setzero_ph()) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the negated intermediate result, and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_fnmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vfmaddph_512(simd_neg(a), b, simd_neg(c), ROUNDING)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
+/// copied from a when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_fnmsub_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    k: __mmask32,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), a)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
+/// copied from c when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fnmsub_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+    k: __mmask32,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), c)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
+/// zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fnmsub_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(
+            k,
+            _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c),
+            _mm512_setzero_ph(),
+        )
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
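Illustrative usage sketch, not part of the patch: the ROUNDING argument of the *_round_* intrinsics above is a const generic supplied with turbofish syntax. This assumes a nightly toolchain with the unstable `stdarch_x86_avx512_f16` feature enabled and a target that supports AVX512-FP16.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16")]
unsafe fn fnmsub_truncating(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // The rounding mode is a const generic; pair a direction with
    // _MM_FROUND_NO_EXC to suppress exceptions, as documented above.
    _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c)
}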
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + let r = fmaf16(-extracta, extractb, -extractc); + simd_insert!(a, 0, r) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate +/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element +/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper +/// elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fnmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fnmsub: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fnmsub = fmaf16(-fnmsub, extractb, -extractc); + } + simd_insert!(a, 0, fnmsub) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate +/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element +/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper +/// elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + let mut fnmsub: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + fnmsub = fmaf16(-extracta, extractb, -fnmsub); + } + simd_insert!(c, 0, fnmsub) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate +/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element +/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper +/// elements of dst. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fnmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        let mut fnmsub: f16 = 0.0;
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fnmsub = fmaf16(-extracta, extractb, -extractc);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fnmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let extracta: f16 = simd_extract!(a, 0);
+        let extractb: f16 = simd_extract!(b, 0);
+        let extractc: f16 = simd_extract!(c, 0);
+        let r = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
+        simd_insert!(a, 0, r)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
+/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fnmsub_round_sh<const ROUNDING: i32>(
+    a: __m128h,
+    k: __mmask8,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmsub: f16 = simd_extract!(a, 0);
+        if k & 1 != 0 {
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fnmsub = vfmaddsh(-fnmsub, extractb, -extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
+/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask3_fnmsub_round_sh<const ROUNDING: i32>(
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+    k: __mmask8,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmsub: f16 = simd_extract!(c, 0);
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            fnmsub = vfmaddsh(-extracta, extractb, -fnmsub, ROUNDING);
+        }
+        simd_insert!(c, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fnmsub_round_sh<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmsub: f16 = 0.0;
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fnmsub = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { vfmaddsubph_128(a, b, c) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmaddsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), a) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from c when the corresponding mask bit is not set).
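Illustrative aside, not part of the patch: the scalar `_sh` forms above consult only bit 0 of the mask and carry the upper seven lanes from their source operand unchanged. A minimal scalar model of the lower-lane behaviour, with f32 standing in for f16 and the single rounding of the fused operation ignored:

fn fnmsub_lane(k: u8, a: f32, b: f32, c: f32, passthrough: f32) -> f32 {
    // -(a*b) - c when mask bit 0 is set; otherwise the passthrough value
    // (a for the writemask form, 0.0 for the zeromask form).
    if k & 1 != 0 { -(a * b) - c } else { passthrough }
}

fn main() {
    assert_eq!(fnmsub_lane(0b1, 2.0, 3.0, 1.0, 2.0), -7.0); // computed lane
    assert_eq!(fnmsub_lane(0b0, 2.0, 3.0, 1.0, 0.0), 0.0);  // zeromask passthrough
}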
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k +/// (the element is zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), _mm_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { vfmaddsubph_256(a, b, c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fmaddsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k +/// (the element is zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fmaddsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), _mm256_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + _mm512_fmaddsub_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c) +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmaddsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from c when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), c) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
+/// (the element is zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fmaddsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), _mm512_setzero_ph()) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_fmaddsub_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vfmaddsubph_512(a, b, c, ROUNDING)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from a when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_fmaddsub_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    k: __mmask32,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), a)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from c when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fmaddsub_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+    k: __mmask32,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), c)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
+/// (the element is zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fmaddsub_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(
+            k,
+            _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c),
+            _mm512_setzero_ph(),
+        )
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { vfmaddsubph_128(a, b, simd_neg(c)) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmsubadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), a) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from c when the corresponding mask bit is not set).
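Illustrative reference, not part of the patch, for the lane alternation described above: fmaddsub subtracts c in even lanes and adds it in odd lanes, while fmsubadd (implemented above by negating c) does the opposite. f32 stands in for f16 and the single rounding of the fused operation is ignored.

fn fmaddsub_ref(a: &[f32], b: &[f32], c: &[f32]) -> Vec<f32> {
    // Even lanes: a*b - c; odd lanes: a*b + c.
    (0..a.len())
        .map(|i| if i % 2 == 0 { a[i] * b[i] - c[i] } else { a[i] * b[i] + c[i] })
        .collect()
}

fn fmsubadd_ref(a: &[f32], b: &[f32], c: &[f32]) -> Vec<f32> {
    // Opposite alternation: even lanes add c, odd lanes subtract it.
    (0..a.len())
        .map(|i| if i % 2 == 0 { a[i] * b[i] + c[i] } else { a[i] * b[i] - c[i] })
        .collect()
}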
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k +/// (the element is zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), _mm_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { vfmaddsubph_256(a, b, simd_neg(c)) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fmsubadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k +/// (the element is zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fmsubadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), _mm256_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + _mm512_fmsubadd_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c) +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmsubadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from c when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), c) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
+/// (the element is zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fmsubadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), _mm512_setzero_ph()) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_fmsubadd_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vfmaddsubph_512(a, b, simd_neg(c), ROUNDING)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from a when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_fmsubadd_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    k: __mmask32,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), a)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from c when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fmsubadd_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+    k: __mmask32,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), c)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
+/// (the element is zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fmsubadd_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(
+            k,
+            _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c),
+            _mm512_setzero_ph(),
+        )
+    }
+}
+
+/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`.
+/// The maximum relative error for this approximation is less than `1.5*2^-12`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_rcp_ph(a: __m128h) -> __m128h {
+    _mm_mask_rcp_ph(_mm_undefined_ph(), 0xff, a)
+}
+
+/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
+/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+/// The maximum relative error for this approximation is less than `1.5*2^-12`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_rcp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
+    unsafe { vrcpph_128(a, src, k) }
+}
+
+/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
+/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// The maximum relative error for this approximation is less than `1.5*2^-12`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_rcp_ph(k: __mmask8, a: __m128h) -> __m128h {
+    _mm_mask_rcp_ph(_mm_setzero_ph(), k, a)
+}
+
+/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`.
+/// The maximum relative error for this approximation is less than `1.5*2^-12`.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrcpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_rcp_ph(a: __m256h) -> __m256h { + _mm256_mask_rcp_ph(_mm256_undefined_ph(), 0xffff, a) +} + +/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst` +/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrcpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_rcp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { + unsafe { vrcpph_256(a, src, k) } +} + +/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst` +/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrcpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_rcp_ph(k: __mmask16, a: __m256h) -> __m256h { + _mm256_mask_rcp_ph(_mm256_setzero_ph(), k, a) +} + +/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrcpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_rcp_ph(a: __m512h) -> __m512h { + _mm512_mask_rcp_ph(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst` +/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrcpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_rcp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { + unsafe { vrcpph_512(a, src, k) } +} + +/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst` +/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrcpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h { + _mm512_mask_rcp_ph(_mm512_setzero_ph(), k, a) +} + +/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b, +/// store the result in the lower element of dst, and copy the upper 7 packed elements from a to the +/// upper elements of dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrcpsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b, +/// store the result in the lower element of dst using writemask k (the element is copied from src when +/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrcpsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { vrcpsh(a, b, src, k) } +} + +/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b, +/// store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 +/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrcpsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_rsqrt_ph(a: __m128h) -> __m128h { + _mm_mask_rsqrt_ph(_mm_undefined_ph(), 0xff, a) +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst using writemask k (elements are copied from src when +/// the corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_rsqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { vrsqrtph_128(a, src, k) } +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_rsqrt_ph(k: __mmask8, a: __m128h) -> __m128h { + _mm_mask_rsqrt_ph(_mm_setzero_ph(), k, a) +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_rsqrt_ph(a: __m256h) -> __m256h { + _mm256_mask_rsqrt_ph(_mm256_undefined_ph(), 0xffff, a) +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst using writemask k (elements are copied from src when +/// the corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_rsqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { + unsafe { vrsqrtph_256(a, src, k) } +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). 
+/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_rsqrt_ph(k: __mmask16, a: __m256h) -> __m256h { + _mm256_mask_rsqrt_ph(_mm256_setzero_ph(), k, a) +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_rsqrt_ph(a: __m512h) -> __m512h { + _mm512_mask_rsqrt_ph(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst using writemask k (elements are copied from src when +/// the corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_rsqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { + unsafe { vrsqrtph_512(a, src, k) } +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h { + _mm512_mask_rsqrt_ph(_mm512_setzero_ph(), k, a) +} + +/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point +/// element in b, store the result in the lower element of dst, and copy the upper 7 packed elements from a +/// to the upper elements of dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrsqrtsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point +/// element in b, store the result in the lower element of dst using writemask k (the element is copied from src +/// when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrsqrtsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { vrsqrtsh(a, b, src, k) } +} + +/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point +/// element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when +/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrsqrtsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_sqrt_ph(a: __m128h) -> __m128h { + unsafe { simd_fsqrt(a) } +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_sqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), src) } +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_sqrt_ph(k: __mmask8, a: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), _mm_setzero_ph()) } +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_sqrt_ph(a: __m256h) -> __m256h { + unsafe { simd_fsqrt(a) } +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_sqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), src) } +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_sqrt_ph(k: __mmask16, a: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), _mm256_setzero_ph()) } +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_sqrt_ph(a: __m512h) -> __m512h { + unsafe { simd_fsqrt(a) } +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
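Aside (not part of the patch): a small sketch of the writemask/zeromask behaviour described in the surrounding comments, using the packed square root as the example. Assumes nightly with `feature(stdarch_x86_avx512_f16)`; the function name is invented.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn masked_sqrt_demo(src: __m128h, a: __m128h) -> (__m128h, __m128h) {
    let k: __mmask8 = 0b0000_1111; // only the low four lanes are computed
    let merged = _mm_mask_sqrt_ph(src, k, a); // masked-off lanes keep the values from `src`
    let zeroed = _mm_maskz_sqrt_ph(k, a);     // masked-off lanes are zeroed instead
    (merged, zeroed)
}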
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_sqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), src) }
+}
+
+/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
+/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_sqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), _mm512_setzero_ph()) }
+}
+
+/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
+/// results in dst.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_sqrt_round_ph<const ROUNDING: i32>(a: __m512h) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vsqrtph_512(a, ROUNDING)
+    }
+}
+
+/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
+/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_sqrt_round_ph<const ROUNDING: i32>(
+    src: __m512h,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), src)
+    }
+}
+
+/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
+/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_sqrt_round_ph<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), _mm512_setzero_ph())
+    }
+}
+
+/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst using writemask k (the element is copied from src when mask
+/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
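Aside (not part of the patch): the `*_round_*` variants take the rounding mode as a const generic, so the flags listed above are combined at the call site inside braces. A minimal sketch, assuming nightly with `feature(stdarch_x86_avx512_f16)`; the wrapper name is invented.

use core::arch::x86_64::*;

// Square root of 32 f16 lanes, rounded toward zero with exceptions suppressed.
#[target_feature(enable = "avx512fp16")]
unsafe fn sqrt_toward_zero(a: __m512h) -> __m512h {
    _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a)
}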
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_sqrt_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
+/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_sqrt_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst using writemask k (the element is copied from src when mask
+/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_sqrt_round_sh<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vsqrtsh(a, b, src, k, ROUNDING)
+    }
+}
+
+/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
+/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_sqrt_round_sh<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
+/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
+/// value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_max_ph(a: __m128h, b: __m128h) -> __m128h {
+    unsafe { vmaxph_128(a, b) }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
+/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
+/// NaN or signed-zero values.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_max_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), src) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are +/// NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_max_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), _mm_setzero_ph()) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum +/// value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_max_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { vmaxph_256(a, b) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are +/// NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_max_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), src) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are +/// NaN or signed-zero values. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_max_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), _mm256_setzero_ph()) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum +/// value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_max_ph(a: __m512h, b: __m512h) -> __m512h { + _mm512_max_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b) +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are +/// NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_max_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), src) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are +/// NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_max_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), _mm512_setzero_ph()) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are +/// NaN or signed-zero values. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_max_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
+    unsafe {
+        static_assert_sae!(SAE);
+        vmaxph_512(a, b, SAE)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
+/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
+/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_max_round_ph<const SAE: i32>(
+    src: __m512h,
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_sae!(SAE);
+        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), src)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
+/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
+/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_max_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
+    unsafe {
+        static_assert_sae!(SAE);
+        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), _mm512_setzero_ph())
+    }
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
+/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
+/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value
+/// when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_max_sh(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_max_sh(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
+/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
+/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
+/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_max_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
+/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
+/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
+/// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_max_sh(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
+/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
+/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_max_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_sae!(SAE);
+    _mm_mask_max_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
+/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
+/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
+/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
+/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_max_round_sh<const SAE: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_sae!(SAE);
+        vmaxsh(a, b, src, k, SAE)
+    }
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
+/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
+/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
+/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
+/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_max_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    static_assert_sae!(SAE);
+    _mm_mask_max_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
+/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
+/// when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vminph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_min_ph(a: __m128h, b: __m128h) -> __m128h {
+    unsafe { vminph_128(a, b) }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
+/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
+/// NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vminph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_min_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), src) }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
+/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
+/// NaN or signed-zero values.
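Aside (not part of the patch): for the max/min `*_round_*` variants the const generic is an SAE flag rather than a rounding mode; passing _MM_FROUND_NO_EXC suppresses floating-point exceptions, while _MM_FROUND_CUR_DIRECTION leaves them enabled. A minimal sketch, assuming nightly with `feature(stdarch_x86_avx512_f16)`; the wrapper name is invented.

use core::arch::x86_64::*;

// Lane-wise maximum of 32 f16 values with exception reporting suppressed.
#[target_feature(enable = "avx512fp16")]
unsafe fn max_no_exceptions(a: __m512h, b: __m512h) -> __m512h {
    _mm512_max_round_ph::<_MM_FROUND_NO_EXC>(a, b)
}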
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_min_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), _mm_setzero_ph()) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value +/// when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_min_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { vminph_256(a, b) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are +/// NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_min_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), src) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are +/// NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_min_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), _mm256_setzero_ph()) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value +/// when inputs are NaN or signed-zero values. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vminph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_min_ph(a: __m512h, b: __m512h) -> __m512h {
+    _mm512_min_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
+/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
+/// NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vminph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_min_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), src) }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
+/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
+/// NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vminph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_min_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), _mm512_setzero_ph()) }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
+/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not
+/// follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_min_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
+    unsafe {
+        static_assert_sae!(SAE);
+        vminph_512(a, b, SAE)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
+/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
+/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_min_round_ph<const SAE: i32>(
+    src: __m512h,
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_sae!(SAE);
+        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), src)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
+/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
+/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_min_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
+    unsafe {
+        static_assert_sae!(SAE);
+        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), _mm512_setzero_ph())
+    }
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
+/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
+/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
+/// inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vminsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_min_sh(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_min_sh(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
+/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
+/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
+/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vminsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_min_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
+/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
+/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
+/// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vminsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_min_sh(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
+/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
+/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_min_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_sae!(SAE);
+    _mm_mask_min_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
+/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
+/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
+/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
+/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_min_round_sh<const SAE: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_sae!(SAE);
+        vminsh(a, b, src, k, SAE)
+    }
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
+/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
+/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
+/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
+/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_min_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    static_assert_sae!(SAE);
+    _mm_mask_min_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
+/// This intrinsic essentially calculates `floor(log2(x))` for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_getexp_ph(a: __m128h) -> __m128h {
+    _mm_mask_getexp_ph(_mm_undefined_ph(), 0xff, a)
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
+/// `floor(log2(x))` for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
+    unsafe { vgetexpph_128(a, src, k) }
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
+/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
+/// `floor(log2(x))` for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h {
+    _mm_mask_getexp_ph(_mm_setzero_ph(), k, a)
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
+/// This intrinsic essentially calculates `floor(log2(x))` for each element.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetexpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_getexp_ph(a: __m256h) -> __m256h { + _mm256_mask_getexp_ph(_mm256_undefined_ph(), 0xffff, a) +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates +/// `floor(log2(x))` for each element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetexpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { + unsafe { vgetexpph_256(a, src, k) } +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask +/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates +/// `floor(log2(x))` for each element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetexpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h { + _mm256_mask_getexp_ph(_mm256_setzero_ph(), k, a) +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst. +/// This intrinsic essentially calculates `floor(log2(x))` for each element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_getexp_ph(a: __m512h) -> __m512h { + _mm512_mask_getexp_ph(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates +/// `floor(log2(x))` for each element. 
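+///
+/// The writemask behaviour shared by the `_mask_`/`_maskz_` variants can be sketched in
+/// scalar form as follows (`f32` stand-ins, purely for illustration):
+///
+/// ```
+/// // Lane i is computed only when bit i of the mask is set; otherwise it is taken
+/// // from `src` (writemask) or, for the `_maskz_` variants, zeroed.
+/// fn mask_op(src: &[f32], k: u32, a: &[f32], op: impl Fn(f32) -> f32) -> Vec<f32> {
+///     (0..a.len())
+///         .map(|i| if (k >> i) & 1 == 1 { op(a[i]) } else { src[i] })
+///         .collect()
+/// }
+/// let r = mask_op(&[0.5, 0.5], 0b01, &[8.0, 8.0], |x| x + 1.0);
+/// assert_eq!(r, [9.0, 0.5]);
+/// ```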
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
+    _mm512_mask_getexp_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
+/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
+/// `floor(log2(x))` for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h {
+    _mm512_mask_getexp_ph(_mm512_setzero_ph(), k, a)
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
+/// This intrinsic essentially calculates `floor(log2(x))` for each element. Exceptions can be suppressed
+/// by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_getexp_round_ph<const SAE: i32>(a: __m512h) -> __m512h {
+    static_assert_sae!(SAE);
+    _mm512_mask_getexp_round_ph::<SAE>(_mm512_undefined_ph(), 0xffffffff, a)
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
+/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_getexp_round_ph<const SAE: i32>(
+    src: __m512h,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_sae!(SAE);
+        vgetexpph_512(a, src, k, SAE)
+    }
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
+/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
+/// `floor(log2(x))` for each element.
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_getexp_round_ph(k: __mmask32, a: __m512h) -> __m512h { + static_assert_sae!(SAE); + _mm512_mask_getexp_round_ph::(_mm512_setzero_ph(), k, a) +} + +/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision +/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element +/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially +/// calculates `floor(log2(x))` for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision +/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element +/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7 +/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` +/// for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_getexp_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision +/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element +/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed +/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the +/// lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision +/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element +/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. 
This intrinsic essentially
+/// calculates `floor(log2(x))` for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
+/// in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_getexp_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_sae!(SAE);
+    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
+/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
+/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
+/// for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_getexp_round_sh<const SAE: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_sae!(SAE);
+        vgetexpsh(a, b, src, k, SAE)
+    }
+}
+
+/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
+/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
+/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
+/// lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_getexp_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    static_assert_sae!(SAE);
+    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
+/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
+/// on the interval range defined by norm and the sign depends on sign and the source sign.
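+///
+/// As a rough scalar sketch of the default behaviour (`_MM_MANT_NORM_1_2` together with
+/// `_MM_MANT_SIGN_src`; the available norm and sign settings are described below), using
+/// `f32` purely for illustration:
+///
+/// ```
+/// // The magnitude is rescaled into [1, 2) while the exponent is discarded and the
+/// // original sign is kept.
+/// fn getmant_1_2(x: f32) -> f32 {
+///     let exp = x.abs().log2().floor();
+///     (x.abs() / exp.exp2()).copysign(x)
+/// }
+/// assert_eq!(getmant_1_2(-24.0), -1.5); // -24.0 = -1.5 * 2^4
+/// ```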
+/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_getmant_ph( + a: __m128h, +) -> __m128h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm_mask_getmant_ph::(_mm_undefined_ph(), 0xff, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_getmant_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m128h, + k: __mmask8, + a: __m128h, +) -> __m128h { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + vgetmantph_128(a, (SIGN << 2) | NORM, src, k) + } +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. 
+/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_getmant_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m128h, +) -> __m128h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm_mask_getmant_ph::(_mm_setzero_ph(), k, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends +/// on the interval range defined by norm and the sign depends on sign and the source sign. +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_getmant_ph( + a: __m256h, +) -> __m256h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm256_mask_getmant_ph::(_mm256_undefined_ph(), 0xffff, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. 
+/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_getmant_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m256h, + k: __mmask16, + a: __m256h, +) -> __m256h { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + vgetmantph_256(a, (SIGN << 2) | NORM, src, k) + } +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_getmant_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask16, + a: __m256h, +) -> __m256h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm256_mask_getmant_ph::(_mm256_setzero_ph(), k, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends +/// on the interval range defined by norm and the sign depends on sign and the source sign. 
+/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_getmant_ph( + a: __m512h, +) -> __m512h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm512_mask_getmant_ph::(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_getmant_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m512h, + k: __mmask32, + a: __m512h, +) -> __m512h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm512_mask_getmant_round_ph::(src, k, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. 
+/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_getmant_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask32, + a: __m512h, +) -> __m512h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm512_mask_getmant_ph::(_mm512_setzero_ph(), k, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends +/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can +/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))] +#[rustc_legacy_const_generics(1, 2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_getmant_round_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + a: __m512h, +) -> __m512h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_sae!(SAE); + _mm512_mask_getmant_round_ph::(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. 
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC +/// in the sae parameter +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4, 5)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_getmant_round_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + src: __m512h, + k: __mmask32, + a: __m512h, +) -> __m512h { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_sae!(SAE); + vgetmantph_512(a, (SIGN << 2) | NORM, src, k, SAE) + } +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC +/// in the sae parameter +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_getmant_round_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + k: __mmask32, + a: __m512h, +) -> __m512h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_sae!(SAE); + _mm512_mask_getmant_round_ph::(_mm512_setzero_ph(), k, a) +} + +/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store +/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper +/// elements of dst. 
This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends +/// on the interval range defined by norm and the sign depends on sign and the source sign. +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_getmant_sh( + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm_mask_getmant_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store +/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), +/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates +/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and +/// the source sign. +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(4, 5)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_getmant_sh< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm_mask_getmant_round_sh::(src, k, a, b) +} + +/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store +/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), +/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates +/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and +/// the source sign. 
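+///
+/// The lane pattern shared by these scalar `_sh` intrinsics can be sketched as follows
+/// (`f32` stand-ins, purely for illustration):
+///
+/// ```
+/// // Only lane 0 is computed (from `b`; zeroed here when mask bit 0 is clear), while
+/// // lanes 1..=7 are passed through unchanged from `a`.
+/// fn maskz_sh_op(k: u8, a: [f32; 8], b: [f32; 8], op: impl Fn(f32) -> f32) -> [f32; 8] {
+///     let mut dst = a;
+///     dst[0] = if k & 1 == 1 { op(b[0]) } else { 0.0 };
+///     dst
+/// }
+/// let r = maskz_sh_op(0, [1.0; 8], [5.0; 8], |x| x * 2.0);
+/// assert_eq!(r[0], 0.0);
+/// assert_eq!(r[1..], [1.0; 7]);
+/// ```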
+/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_getmant_sh< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm_mask_getmant_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store +/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper +/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends +/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can +/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_getmant_round_sh< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_sae!(SAE); + _mm_mask_getmant_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store +/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), +/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates +/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and +/// the source sign. 
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5, 6)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_getmant_round_sh< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_sae!(SAE); + vgetmantsh(a, b, (SIGN << 2) | NORM, src, k, SAE) + } +} + +/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store +/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), +/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates +/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and +/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4, 5)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_getmant_round_sh< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_sae!(SAE); + _mm_mask_getmant_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst. 
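+///
+/// As a rough scalar sketch for the round-to-nearest case (the accepted rounding-mode
+/// constants are listed below; `f32` is used purely for illustration, and the count of
+/// fraction bits kept is assumed to come from the upper four bits of imm8, as in Intel's
+/// description of VRNDSCALE):
+///
+/// ```
+/// // Keeping M fraction bits: round x * 2^M to an integer, then scale back down.
+/// fn roundscale(x: f32, m: u32) -> f32 {
+///     let scale = (m as f32).exp2();
+///     (x * scale).round_ties_even() / scale
+/// }
+/// assert_eq!(roundscale(1.3, 2), 1.25); // nearest multiple of 2^-2
+/// ```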
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_roundscale_ph<const IMM8: i32>(a: __m128h) -> __m128h {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_roundscale_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
+/// the corresponding mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_roundscale_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        vrndscaleph_128(a, IMM8, src, k)
+    }
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_roundscale_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_roundscale_ph::<IMM8>(_mm_setzero_ph(), k, a)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst.
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_roundscale_ph(a: __m256h) -> __m256h { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_roundscale_ph::(_mm256_undefined_ph(), 0xffff, a) +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when +/// the corresponding mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_roundscale_ph( + src: __m256h, + k: __mmask16, + a: __m256h, +) -> __m256h { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vrndscaleph_256(a, IMM8, src, k) + } +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_roundscale_ph(k: __mmask16, a: __m256h) -> __m256h { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_roundscale_ph::(_mm256_setzero_ph(), k, a) +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst. 
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_roundscale_ph(a: __m512h) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_roundscale_ph::(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when +/// the corresponding mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_roundscale_ph( + src: __m512h, + k: __mmask32, + a: __m512h, +) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_roundscale_round_ph::(src, k, a) +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_roundscale_ph(k: __mmask32, a: __m512h) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_roundscale_ph::(_mm512_setzero_ph(), k, a) +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst. 
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
+/// in the sae parameter.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(1, 2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_roundscale_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
+/// the corresponding mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
+/// in the sae parameter.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
+    src: __m512h,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_sae!(SAE);
+        vrndscaleph_512(a, IMM8, src, k, SAE)
+    }
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_roundscale_round_ph( + k: __mmask32, + a: __m512h, +) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm512_mask_roundscale_round_ph::(_mm512_setzero_ph(), k, a) +} + +/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits +/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements +/// from a to the upper elements of dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_roundscale_sh(a: __m128h, b: __m128h) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_roundscale_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits +/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied +/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_roundscale_sh( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_roundscale_round_sh::(src, k, a, b) +} + +/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits +/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed +/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_roundscale_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_roundscale_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits +/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements +/// from a to the upper elements of dst. 
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_roundscale_round_sh(a: __m128h, b: __m128h) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm_mask_roundscale_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits +/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied +/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_roundscale_round_sh( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + vrndscalesh(a, b, src, k, IMM8, SAE) + } +} + +/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits +/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed +/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. 
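For the `_round_` variants, `IMM8` and `SAE` are const generics filled in by `#[rustc_legacy_const_generics]`, so Rust callers spell them with a turbofish. A nightly-only sketch under stated assumptions: a crate with `#![feature(stdarch_x86_avx512_f16)]` at its root, an x86_64 target with AVX512-FP16, and the standard `_MM_FROUND_*` constants from `std::arch::x86_64`; the helper name `roundscale_lower` is purely illustrative.

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512fp16")]
unsafe fn roundscale_lower(a: __m128h, b: __m128h) -> __m128h {
    // IMM8 = _MM_FROUND_TO_NEAREST_INT (0) keeps no extra fraction bits and
    // rounds to nearest; SAE = _MM_FROUND_NO_EXC suppresses exceptions.
    _mm_roundscale_round_sh::<_MM_FROUND_TO_NEAREST_INT, _MM_FROUND_NO_EXC>(a, b)
}
```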
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_roundscale_round_sh( + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm_mask_roundscale_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_scalef_ph(_mm_undefined_ph(), 0xff, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { vscalefph_128(a, b, src, k) } +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_scalef_ph(_mm_setzero_ph(), k, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_scalef_ph(_mm256_undefined_ph(), 0xffff, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { vscalefph_256(a, b, src, k) } +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_scalef_ph(_mm256_setzero_ph(), k, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_scalef_ph(_mm512_undefined_ph(), 0xffffffff, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_scalef_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_scalef_ph(_mm512_setzero_ph(), k, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_scalef_round_ph(a: __m512h, b: __m512h) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_mask_scalef_round_ph::(_mm512_undefined_ph(), 0xffffffff, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_scalef_round_ph( + src: __m512h, + k: __mmask32, + a: __m512h, + b: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vscalefph_512(a, b, src, k, ROUNDING) + } +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
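`vscalefph` computes `a * 2^floor(b)` per lane. A minimal scalar sketch of that definition (ignoring the NaN/infinity/denormal special cases the instruction handles):

```rust
/// Scalar model of the scalef operation above: `a * 2^floor(b)`.
fn scalef_scalar(a: f32, b: f32) -> f32 {
    a * 2f32.powi(b.floor() as i32)
}

fn main() {
    assert_eq!(scalef_scalar(3.0, 2.7), 12.0);  // 3 * 2^2
    assert_eq!(scalef_scalar(1.5, -1.0), 0.75); // 1.5 * 2^-1
}
```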
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_scalef_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
+/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
+/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
+/// and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_scalef_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
+/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
+/// and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
+/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_scalef_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_scalef_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
+/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
+/// and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_scalef_round_sh<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vscalefsh(a, b, src, k, ROUNDING)
+    }
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
+/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
+/// and copy the upper 7 packed elements from a to the upper elements of dst.
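The `ROUNDING` immediate accepted by these `_round` variants is either `_MM_FROUND_CUR_DIRECTION` or one of the direction constants OR'ed with `_MM_FROUND_NO_EXC`, which is what `static_assert_rounding!` enforces. A small sketch composing such a value, with the standard SSE `_MM_FROUND_*` values redeclared locally so the snippet stands alone (the real constants live in `std::arch::x86_64`):

```rust
const _MM_FROUND_TO_ZERO: i32 = 0x03;
const _MM_FROUND_NO_EXC: i32 = 0x08;

// "Truncate and suppress exceptions", as it would be passed to a `_round`
// variant through its const generic, e.g. `_mm_scalef_round_sh::<ROUNDING>`.
const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;

fn main() {
    assert_eq!(ROUNDING, 0x0b);
}
```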
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_scalef_round_sh( + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_scalef_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_reduce_ph(a: __m128h) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_ph::(_mm_undefined_ph(), 0xff, a) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_reduce_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vreduceph_128(a, IMM8, src, k) + } +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed +/// out when the corresponding mask bit is not set). 
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_reduce_ph(k: __mmask8, a: __m128h) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_ph::(_mm_setzero_ph(), k, a) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_reduce_ph(a: __m256h) -> __m256h { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_reduce_ph::(_mm256_undefined_ph(), 0xffff, a) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_reduce_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vreduceph_256(a, IMM8, src, k) + } +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed +/// out when the corresponding mask bit is not set). 
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_reduce_ph(k: __mmask16, a: __m256h) -> __m256h { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_reduce_ph::(_mm256_setzero_ph(), k, a) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_reduce_ph(a: __m512h) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_reduce_ph::(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_reduce_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_reduce_round_ph::(src, k, a) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed +/// out when the corresponding mask bit is not set). 
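The "reduced argument" computed by `vreduceph` is what remains of a value after subtracting it rounded to `m` fraction bits. A minimal scalar sketch of that definition (round-to-nearest-even case only, `f32` as a stand-in lane type):

```rust
/// Scalar model of the reduce (reduced-argument) operation above:
/// `x - round(x * 2^m) / 2^m`.
fn reduce_scalar(x: f32, m: u32) -> f32 {
    let scale = (1u32 << m) as f32;
    x - (x * scale).round_ties_even() / scale
}

fn main() {
    // With m = 0 this is the signed distance to the nearest integer.
    assert_eq!(reduce_scalar(1.25, 0), 0.25);
    // With m = 2, 1.3 reduces against the nearest multiple of 0.25.
    assert!((reduce_scalar(1.3, 2) - 0.05).abs() < 1e-6);
}
```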
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_reduce_ph(k: __mmask32, a: __m512h) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_reduce_ph::(_mm512_setzero_ph(), k, a) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(1, 2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_reduce_round_ph(a: __m512h) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm512_mask_reduce_round_ph::(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_reduce_round_ph( + src: __m512h, + k: __mmask32, + a: __m512h, +) -> __m512h { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + vreduceph_512(a, IMM8, src, k, SAE) + } +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed +/// out when the corresponding mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_reduce_round_ph( + k: __mmask32, + a: __m512h, +) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm512_mask_reduce_round_ph::(_mm512_setzero_ph(), k, a) +} + +/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by +/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the +/// upper 7 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_reduce_sh(a: __m128h, b: __m128h) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by +/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k +/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from +/// a to the upper elements of dst. 
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_reduce_sh( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_round_sh::(src, k, a, b) +} + +/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by +/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k +/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a +/// to the upper elements of dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_reduce_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by +/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper +/// 7 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_reduce_round_sh(a: __m128h, b: __m128h) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm_mask_reduce_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by +/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k +/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a +/// to the upper elements of dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_reduce_round_sh( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + vreducesh(a, b, src, k, IMM8, SAE) + } +} + +/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by +/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k +/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a +/// to the upper elements of dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_reduce_round_sh( + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm_mask_reduce_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the +/// sum of all elements in a. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_reduce_add_ph(a: __m128h) -> f16 { + unsafe { + let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); + let a = _mm_add_ph(a, b); + let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); + let a = _mm_add_ph(a, b); + simd_extract::<_, f16>(a, 0) + simd_extract::<_, f16>(a, 1) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the +/// sum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_reduce_add_ph(a: __m256h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + _mm_reduce_add_ph(_mm_add_ph(p, q)) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the +/// sum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_reduce_add_ph(a: __m512h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); + let q = simd_shuffle!( + a, + a, + [ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ] + ); + _mm256_reduce_add_ph(_mm256_add_ph(p, q)) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns +/// the product of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_reduce_mul_ph(a: __m128h) -> f16 { + unsafe { + let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); + let a = _mm_mul_ph(a, b); + let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); + let a = _mm_mul_ph(a, b); + simd_extract::<_, f16>(a, 0) * simd_extract::<_, f16>(a, 1) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns +/// the product of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_reduce_mul_ph(a: __m256h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + _mm_reduce_mul_ph(_mm_mul_ph(p, q)) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns +/// the product of all elements in a. 
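The horizontal reductions above repeatedly fold the upper half of the vector onto the lower half with shuffles before combining the last two lanes. The same halving pattern written over a plain slice, as an illustration only (assumes a power-of-two length, like the 8/16/32-lane vectors here):

```rust
/// Tree reduction by repeated halving: add the upper half onto the lower half
/// until one element remains. Mirrors the shuffle-based pattern above.
fn reduce_add_halving(values: &[f32]) -> f32 {
    let mut v = values.to_vec();
    while v.len() > 1 {
        let half = v.len() / 2;
        for i in 0..half {
            v[i] += v[i + half];
        }
        v.truncate(half);
    }
    v[0]
}

fn main() {
    let x: Vec<f32> = (1..=8).map(|i| i as f32).collect();
    assert_eq!(reduce_add_halving(&x), 36.0); // 1 + 2 + ... + 8
}
```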
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_reduce_mul_ph(a: __m512h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); + let q = simd_shuffle!( + a, + a, + [ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ] + ); + _mm256_reduce_mul_ph(_mm256_mul_ph(p, q)) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the +/// minimum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_reduce_min_ph(a: __m128h) -> f16 { + unsafe { + let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); + let a = _mm_min_ph(a, b); + let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); + let a = _mm_min_ph(a, b); + let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]); + simd_extract!(_mm_min_sh(a, b), 0) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the +/// minimum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_reduce_min_ph(a: __m256h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + _mm_reduce_min_ph(_mm_min_ph(p, q)) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the +/// minimum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_reduce_min_ph(a: __m512h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); + let q = simd_shuffle!( + a, + a, + [ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ] + ); + _mm256_reduce_min_ph(_mm256_min_ph(p, q)) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the +/// maximum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_reduce_max_ph(a: __m128h) -> f16 { + unsafe { + let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); + let a = _mm_max_ph(a, b); + let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); + let a = _mm_max_ph(a, b); + let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]); + simd_extract!(_mm_max_sh(a, b), 0) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the +/// maximum of all elements in a. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_reduce_max_ph(a: __m256h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + _mm_reduce_max_ph(_mm_max_ph(p, q)) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the +/// maximum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_reduce_max_ph(a: __m512h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); + let q = simd_shuffle!( + a, + a, + [ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ] + ); + _mm256_reduce_max_ph(_mm256_max_ph(p, q)) + } +} + +macro_rules! fpclass_asm { // FIXME: use LLVM intrinsics + ($mask_type: ty, $reg: ident, $a: expr) => {{ + let dst: $mask_type; + asm!( + "vfpclassph {k}, {src}, {imm8}", + k = lateout(kreg) dst, + src = in($reg) $a, + imm8 = const IMM8, + options(pure, nomem, nostack) + ); + dst + }}; + ($mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{ + let dst: $mask_type; + asm!( + "vfpclassph {k} {{ {mask} }}, {src}, {imm8}", + k = lateout(kreg) dst, + mask = in(kreg) $mask, + src = in($reg) $a, + imm8 = const IMM8, + options(pure, nomem, nostack) + ); + dst + }}; +} + +/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k. +/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_ph_mask) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fpclass_ph_mask(a: __m128h) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask8, xmm_reg, a) + } +} + +/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). 
+/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_ph_mask) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fpclass_ph_mask(k1: __mmask8, a: __m128h) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask8, k1, xmm_reg, a) + } +} + +/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k. +/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fpclass_ph_mask) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fpclass_ph_mask(a: __m256h) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask16, ymm_reg, a) + } +} + +/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). +/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fpclass_ph_mask) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fpclass_ph_mask(k1: __mmask16, a: __m256h) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask16, k1, ymm_reg, a) + } +} + +/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k. 
+/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fpclass_ph_mask) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fpclass_ph_mask(a: __m512h) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask32, zmm_reg, a) + } +} + +/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). +/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fpclass_ph_mask) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fpclass_ph_mask(k1: __mmask32, a: __m512h) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask32, k1, zmm_reg, a) + } +} + +/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified +/// by imm8, and store the result in mask vector k. +/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_sh_mask) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fpclass_sh_mask(a: __m128h) -> __mmask8 { + _mm_mask_fpclass_sh_mask::(0xff, a) +} + +/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified +/// by imm8, and store the result in mask vector k using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). 
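The `fpclass` immediate is a bitmask of the eight categories listed in the doc comments, so several categories can be tested at once. A rough scalar sketch of that classification on `f32`, illustrative only: plain Rust cannot distinguish QNaN from SNaN, and the hardware's exact definition of the "Negative" category may differ from the approximation used here.

```rust
// Category bits as listed in the doc comments above.
const QNAN: u8 = 0x01;
const POS_ZERO: u8 = 0x02;
const NEG_ZERO: u8 = 0x04;
const POS_INF: u8 = 0x08;
const NEG_INF: u8 = 0x10;
const DENORMAL: u8 = 0x20;
const NEGATIVE: u8 = 0x40;
const SNAN: u8 = 0x80;

/// Scalar sketch of one lane's classification; the intrinsics return one mask
/// bit per lane.
fn matches_categories(x: f32, imm8: u8) -> bool {
    (x.is_nan() && imm8 & (QNAN | SNAN) != 0)
        || (x == 0.0 && x.is_sign_positive() && imm8 & POS_ZERO != 0)
        || (x == 0.0 && x.is_sign_negative() && imm8 & NEG_ZERO != 0)
        || (x == f32::INFINITY && imm8 & POS_INF != 0)
        || (x == f32::NEG_INFINITY && imm8 & NEG_INF != 0)
        || (x.is_subnormal() && imm8 & DENORMAL != 0)
        || (x.is_sign_negative() && imm8 & NEGATIVE != 0) // approximate
}

fn main() {
    // "Any NaN or any infinity": combine four category bits into one immediate.
    let imm8 = QNAN | SNAN | POS_INF | NEG_INF; // 0x99
    assert!(matches_categories(f32::NAN, imm8));
    assert!(!matches_categories(1.5, imm8));
}
```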
+/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_sh_mask) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fpclass_sh_mask(k1: __mmask8, a: __m128h) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vfpclasssh(a, IMM8, k1) + } +} + +/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_blend_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, b, a) } +} + +/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_blend_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, b, a) } +} + +/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_blend_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, b, a) } +} + +/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector +/// and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_permutex2var_ph(a: __m128h, idx: __m128i, b: __m128h) -> __m128h { + _mm_castsi128_ph(_mm_permutex2var_epi16( + _mm_castph_si128(a), + idx, + _mm_castph_si128(b), + )) +} + +/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector +/// and index in idx, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_permutex2var_ph(a: __m256h, idx: __m256i, b: __m256h) -> __m256h { + _mm256_castsi256_ph(_mm256_permutex2var_epi16( + _mm256_castph_si256(a), + idx, + _mm256_castph_si256(b), + )) +} + +/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector +/// and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_permutex2var_ph(a: __m512h, idx: __m512i, b: __m512h) -> __m512h { + _mm512_castsi512_ph(_mm512_permutex2var_epi16( + _mm512_castph_si512(a), + idx, + _mm512_castph_si512(b), + )) +} + +/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_permutexvar_ph(idx: __m128i, a: __m128h) -> __m128h { + _mm_castsi128_ph(_mm_permutexvar_epi16(idx, _mm_castph_si128(a))) +} + +/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_permutexvar_ph(idx: __m256i, a: __m256h) -> __m256h { + _mm256_castsi256_ph(_mm256_permutexvar_epi16(idx, _mm256_castph_si256(a))) +} + +/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_permutexvar_ph(idx: __m512i, a: __m512h) -> __m512h { + _mm512_castsi512_ph(_mm512_permutexvar_epi16(idx, _mm512_castph_si512(a))) +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtepi16_ph(a: __m128i) -> __m128h { + unsafe { vcvtw2ph_128(a.as_i16x8(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). 
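A minimal sketch of the index-driven shuffles above, reversing the eight f16 lanes of a 128-bit vector (nightly toolchain with the unstable `stdarch_x86_avx512_f16` feature; the wrapper name `reverse_lanes` is illustrative):

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512fp16,avx512vl")]
    unsafe fn reverse_lanes(a: __m128h) -> __m128h {
        // Lane i of the result is lane idx[i] of `a`; the index vector below is
        // [7, 6, 5, 4, 3, 2, 1, 0] read from lane 0 upward, so the vector is reversed.
        let idx = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        _mm_permutexvar_ph(idx, a)
    }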
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvtepi16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_cvtepi16_ph(a), src) } +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvtepi16_ph(k: __mmask8, a: __m128i) -> __m128h { + _mm_mask_cvtepi16_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtepi16_ph(a: __m256i) -> __m256h { + unsafe { vcvtw2ph_256(a.as_i16x16(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtepi16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_cvtepi16_ph(a), src) } +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtepi16_ph(k: __mmask16, a: __m256i) -> __m256h { + _mm256_mask_cvtepi16_ph(_mm256_setzero_ph(), k, a) +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. 
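To illustrate the masked conversion forms above, a short sketch that converts only selected i16 lanes to f16 and zeroes the rest (nightly, unstable feature; the wrapper name is illustrative):

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512fp16,avx512vl")]
    unsafe fn even_lanes_to_f16(a: __m128i) -> __m128h {
        // Bit i of the mask controls lane i; 0b0101_0101 converts lanes 0, 2, 4 and 6
        // and zeroes the odd lanes.
        _mm_maskz_cvtepi16_ph(0b0101_0101, a)
    }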
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtw2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtepi16_ph(a: __m512i) -> __m512h {
+    unsafe { vcvtw2ph_512(a.as_i16x32(), _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtw2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtepi16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_cvtepi16_ph(a), src) }
+}
+
+/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtw2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtepi16_ph(k: __mmask32, a: __m512i) -> __m512h {
+    _mm512_mask_cvtepi16_ph(_mm512_setzero_ph(), k, a)
+}
+
+/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi16_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundepi16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtw2ph_512(a.as_i16x32(), ROUNDING)
+    }
+}
+
+/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
+/// mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi16_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundepi16_ph<const ROUNDING: i32>(
+    src: __m512h,
+    k: __mmask32,
+    a: __m512i,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_cvt_roundepi16_ph::<ROUNDING>(a), src)
+    }
+}
+
+/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi16_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundepi16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundepi16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtuw2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtepu16_ph(a: __m128i) -> __m128h {
+    unsafe { vcvtuw2ph_128(a.as_u16x8(), _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
+/// mask bit is not set).
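The `cvt_round*` variants above take the rounding mode as a const generic instead of reading `MXCSR.RC`. A hedged sketch (nightly, unstable feature; the wrapper name is illustrative):

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512fp16")]
    unsafe fn i16_to_f16_truncating(a: __m512i) -> __m512h {
        // A rounding direction must be combined with _MM_FROUND_NO_EXC,
        // or _MM_FROUND_CUR_DIRECTION used on its own.
        _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a)
    }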
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvtepu16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_cvtepu16_ph(a), src) } +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvtepu16_ph(k: __mmask8, a: __m128i) -> __m128h { + _mm_mask_cvtepu16_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtepu16_ph(a: __m256i) -> __m256h { + unsafe { vcvtuw2ph_256(a.as_u16x16(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtepu16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_cvtepu16_ph(a), src) } +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtepu16_ph(k: __mmask16, a: __m256i) -> __m256h { + _mm256_mask_cvtepu16_ph(_mm256_setzero_ph(), k, a) +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtepu16_ph(a: __m512i) -> __m512h { + unsafe { vcvtuw2ph_512(a.as_u16x32(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtepu16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_cvtepu16_ph(a), src) } +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvtepu16_ph(k: __mmask32, a: __m512i) -> __m512h { + _mm512_mask_cvtepu16_ph(_mm512_setzero_ph(), k, a) +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvt_roundepu16_ph(a: __m512i) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtuw2ph_512(a.as_u16x32(), ROUNDING) + } +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). 
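A small sketch contrasting the writemask and zeromask forms of the unsigned conversion above (nightly, unstable feature; the wrapper name is illustrative):

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512fp16")]
    unsafe fn convert_u16_masked(src: __m512h, k: __mmask32, a: __m512i) -> (__m512h, __m512h) {
        // Writemask: lanes not selected by `k` keep their value from `src`.
        let merged = _mm512_mask_cvtepu16_ph(src, k, a);
        // Zeromask: lanes not selected by `k` become zero.
        let zeroed = _mm512_maskz_cvtepu16_ph(k, a);
        (merged, zeroed)
    }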
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvt_roundepu16_ph( + src: __m512h, + k: __mmask32, + a: __m512i, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_cvt_roundepu16_ph::(a), src) + } +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvt_roundepu16_ph(k: __mmask32, a: __m512i) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_mask_cvt_roundepu16_ph::(_mm512_setzero_ph(), k, a) +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtdq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtepi32_ph(a: __m128i) -> __m128h { + _mm_mask_cvtepi32_ph(_mm_setzero_ph(), 0xff, a) +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). The upper 64 bits of dst are zeroed out. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtdq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { vcvtdq2ph_128(a.as_i32x4(), src, k) } +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtdq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvtepi32_ph(k: __mmask8, a: __m128i) -> __m128h { + _mm_mask_cvtepi32_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtdq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtepi32_ph(a: __m256i) -> __m128h { + unsafe { vcvtdq2ph_256(a.as_i32x8(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtdq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm256_cvtepi32_ph(a), src) } +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtdq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtepi32_ph(k: __mmask8, a: __m256i) -> __m128h { + _mm256_mask_cvtepi32_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. 
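Because f16 lanes are half the width of i32 lanes, these conversions narrow the vector: eight i32 values in a `__m256i` become eight f16 values in a `__m128h`, and the 128-bit form fills only the low 64 bits of its result. A sketch (nightly, unstable feature; the wrapper name is illustrative):

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512fp16,avx512vl")]
    unsafe fn narrow_i32x8(a: __m256i) -> __m128h {
        // All eight f16 lanes of the result are used here; with a __m128i input
        // only the low four lanes would be, and the upper 64 bits would be zeroed.
        _mm256_cvtepi32_ph(a)
    }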
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtdq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtepi32_ph(a: __m512i) -> __m256h { + unsafe { vcvtdq2ph_512(a.as_i32x16(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtdq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h { + unsafe { simd_select_bitmask(k, _mm512_cvtepi32_ph(a), src) } +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtdq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h { + _mm512_mask_cvtepi32_ph(f16x16::ZERO.as_m256h(), k, a) +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvt_roundepi32_ph(a: __m512i) -> __m256h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtdq2ph_512(a.as_i32x16(), ROUNDING) + } +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). 
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvt_roundepi32_ph( + src: __m256h, + k: __mmask16, + a: __m512i, +) -> __m256h { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_cvt_roundepi32_ph::(a), src) + } +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvt_roundepi32_ph(k: __mmask16, a: __m512i) -> __m256h { + static_assert_rounding!(ROUNDING); + _mm512_mask_cvt_roundepi32_ph::(f16x16::ZERO.as_m256h(), k, a) +} + +/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the +/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements +/// of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti32_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtsi2sh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvti32_sh(a: __m128h, b: i32) -> __m128h { + unsafe { vcvtsi2sh(a, b, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the +/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements +/// of dst. 
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi32_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvt_roundi32_sh(a: __m128h, b: i32) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtsi2sh(a, b, ROUNDING) + } +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtudq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtepu32_ph(a: __m128i) -> __m128h { + _mm_mask_cvtepu32_ph(_mm_setzero_ph(), 0xff, a) +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtudq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { vcvtudq2ph_128(a.as_u32x4(), src, k) } +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtudq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvtepu32_ph(k: __mmask8, a: __m128i) -> __m128h { + _mm_mask_cvtepu32_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. 
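A sketch of the scalar form above, which converts one i32 into the low f16 lane while passing the upper seven lanes of `a` through unchanged (nightly, unstable feature; the wrapper name is illustrative):

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512fp16")]
    unsafe fn set_low_lane(a: __m128h, x: i32) -> __m128h {
        // Uses the current MXCSR rounding mode; `_mm_cvt_roundi32_sh` takes the
        // mode as a const generic instead.
        _mm_cvti32_sh(a, x)
    }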
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtudq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtepu32_ph(a: __m256i) -> __m128h { + unsafe { vcvtudq2ph_256(a.as_u32x8(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtudq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm256_cvtepu32_ph(a), src) } +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtudq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtepu32_ph(k: __mmask8, a: __m256i) -> __m128h { + _mm256_mask_cvtepu32_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtudq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtepu32_ph(a: __m512i) -> __m256h { + unsafe { vcvtudq2ph_512(a.as_u32x16(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtudq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h { + unsafe { simd_select_bitmask(k, _mm512_cvtepu32_ph(a), src) } +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtudq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h { + _mm512_mask_cvtepu32_ph(f16x16::ZERO.as_m256h(), k, a) +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvt_roundepu32_ph(a: __m512i) -> __m256h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtudq2ph_512(a.as_u32x16(), ROUNDING) + } +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvt_roundepu32_ph( + src: __m256h, + k: __mmask16, + a: __m512i, +) -> __m256h { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_cvt_roundepu32_ph::(a), src) + } +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvt_roundepu32_ph(k: __mmask16, a: __m512i) -> __m256h { + static_assert_rounding!(ROUNDING); + _mm512_mask_cvt_roundepu32_ph::(f16x16::ZERO.as_m256h(), k, a) +} + +/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the +/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements +/// of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtusi2sh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtu32_sh(a: __m128h, b: u32) -> __m128h { + unsafe { vcvtusi2sh(a, b, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the +/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements +/// of dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu32_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvt_roundu32_sh(a: __m128h, b: u32) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtusi2sh(a, b, ROUNDING) + } +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. The upper 96 bits of dst are zeroed out. 
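The unsigned scalar form works the same way; a sketch using an explicit rounding mode via the `cvt_round` variant above (nightly, unstable feature; the wrapper name is illustrative):

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512fp16")]
    unsafe fn set_low_lane_rounded(a: __m128h, x: u32) -> __m128h {
        // Round toward negative infinity and suppress exceptions.
        _mm_cvt_roundu32_sh::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a, x)
    }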
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtepi64_ph(a: __m128i) -> __m128h { + _mm_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a) +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). The upper 96 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { vcvtqq2ph_128(a.as_i64x2(), src, k) } +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// The upper 96 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvtepi64_ph(k: __mmask8, a: __m128i) -> __m128h { + _mm_mask_cvtepi64_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtepi64_ph(a: __m256i) -> __m128h { + _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a) +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { + unsafe { vcvtqq2ph_256(a.as_i64x4(), src, k) } +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// The upper 64 bits of dst are zeroed out. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtepi64_ph(k: __mmask8, a: __m256i) -> __m128h { + _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtepi64_ph(a: __m512i) -> __m128h { + unsafe { vcvtqq2ph_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm512_cvtepi64_ph(a), src) } +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h { + _mm512_mask_cvtepi64_ph(f16x8::ZERO.as_m128h(), k, a) +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. 
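Converting from 64-bit integers narrows even further: eight i64 values in a `__m512i` produce just eight f16 values in a `__m128h`. A sketch (nightly, unstable feature; the wrapper name is illustrative):

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512fp16")]
    unsafe fn narrow_i64x8(a: __m512i) -> __m128h {
        // f16 has only about 11 bits of significand, so large magnitudes are
        // rounded according to MXCSR.RC.
        _mm512_cvtepi64_ph(a)
    }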
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvt_roundepi64_ph(a: __m512i) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtqq2ph_512(a.as_i64x8(), ROUNDING) + } +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvt_roundepi64_ph( + src: __m128h, + k: __mmask8, + a: __m512i, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_cvt_roundepi64_ph::(a), src) + } +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvt_roundepi64_ph(k: __mmask8, a: __m512i) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm512_mask_cvt_roundepi64_ph::(f16x8::ZERO.as_m128h(), k, a) +} + +/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. The upper 96 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtepu64_ph(a: __m128i) -> __m128h { + _mm_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a) +} + +/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). The upper 96 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { vcvtuqq2ph_128(a.as_u64x2(), src, k) } +} + +/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// The upper 96 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvtepu64_ph(k: __mmask8, a: __m128i) -> __m128h { + _mm_mask_cvtepu64_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. The upper 64 bits of dst are zeroed out. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtepu64_ph(a: __m256i) -> __m128h { + _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a) +} + +/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { + unsafe { vcvtuqq2ph_256(a.as_u64x4(), src, k) } +} + +/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtepu64_ph(k: __mmask8, a: __m256i) -> __m128h { + _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu64_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtepu64_ph(a: __m512i) -> __m128h { + unsafe { vcvtuqq2ph_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu64_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm512_cvtepu64_ph(a), src) } +} + +/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
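A sketch of the 256-bit unsigned form above, where only four result lanes are produced and the rest of the destination is zeroed (nightly, unstable feature; the wrapper name is illustrative):

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512fp16,avx512vl")]
    unsafe fn narrow_u64x4(a: __m256i) -> __m128h {
        // Results land in the low four f16 lanes; the upper 64 bits are zero.
        _mm256_cvtepu64_ph(a)
    }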
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu64_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h {
+    _mm512_mask_cvtepu64_ph(f16x8::ZERO.as_m128h(), k, a)
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu64_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundepu64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtuqq2ph_512(a.as_u64x8(), ROUNDING)
+    }
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
+/// mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu64_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundepu64_ph<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m512i,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_cvt_roundepu64_ph::<ROUNDING>(a), src)
+    }
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
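+
+// Editorial sketch (illustrative only, not part of the generated API surface): the
+// `_round` intrinsics above take the rounding mode as a const generic, surfaced via
+// `rustc_legacy_const_generics`, so the mode is written at the call site. Assuming a
+// nightly toolchain with `#![feature(stdarch_x86_avx512_f16)]` and a CPU with
+// `avx512fp16`, a call could look like:
+//
+//     use core::arch::x86_64::*;
+//     let h: __m128h = unsafe {
+//         let a = _mm512_set1_epi64(42);
+//         // round to nearest, suppressing floating-point exceptions
+//         _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a)
+//     };
+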
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu64_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundepu64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundepu64_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxps_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2phx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtxps_ph(a: __m128) -> __m128h {
+    _mm_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
+/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxps_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2phx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m128) -> __m128h {
+    unsafe { vcvtps2phx_128(a, src, k) }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxps_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2phx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtxps_ph(k: __mmask8, a: __m128) -> __m128h {
+    _mm_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxps_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2phx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtxps_ph(a: __m256) -> __m128h { + _mm256_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst +/// when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxps_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2phx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m256) -> __m128h { + unsafe { vcvtps2phx_256(a, src, k) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxps_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2phx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h { + _mm256_mask_cvtxps_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxps_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtps2phx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtxps_ph(a: __m512) -> __m256h { + _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), 0xffff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst +/// when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxps_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtps2phx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h { + unsafe { vcvtps2phx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxps_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtps2phx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h {
+    _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundps_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtx_roundps_ph<const ROUNDING: i32>(a: __m512) -> __m256h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), 0xffff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
+/// when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundps_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtx_roundps_ph<const ROUNDING: i32>(
+    src: __m256h,
+    k: __mmask16,
+    a: __m512,
+) -> __m256h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtps2phx_512(a, src, k, ROUNDING)
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundps_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtx_roundps_ph<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m256h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtss2sh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h {
+    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element
+/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtss2sh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __m128h {
+    unsafe { vcvtss2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtss2sh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h {
+    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
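+
+// Editorial sketch (illustrative only): in the scalar converts above, only lane 0 of
+// the result comes from `b`; lanes 1..=7 are copied from `a`, and the `mask_`/`maskz_`
+// variants decide what happens to lane 0 when mask bit 0 is clear. Assuming a nightly
+// toolchain with `avx512fp16` available:
+//
+//     let r: __m128h = unsafe {
+//         let a = _mm_set1_ph(1.0);   // supplies the upper 7 f16 lanes
+//         let b = _mm_set_ss(2.5);    // f32 value converted into lane 0
+//         _mm_mask_cvtss_sh(_mm_setzero_ph(), 0b0000_0001, a, b)
+//     };
+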
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvt_roundss_sh<const ROUNDING: i32>(a: __m128h, b: __m128) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element
+/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundss_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvt_roundss_sh<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtss2sh(a, b, src, k, ROUNDING)
+    }
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvt_roundss_sh<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst. The upper 96 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtpd_ph(a: __m128d) -> __m128h {
+    _mm_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
+/// when the corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m128d) -> __m128h {
+    unsafe { vcvtpd2ph_128(a, src, k) }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtpd_ph(k: __mmask8, a: __m128d) -> __m128h {
+    _mm_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtpd_ph(a: __m256d) -> __m128h { + _mm256_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst +/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m256d) -> __m128h { + unsafe { vcvtpd2ph_256(a, src, k) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h { + _mm256_mask_cvtpd_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtpd2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtpd_ph(a: __m512d) -> __m128h { + _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst +/// when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtpd2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h { + unsafe { vcvtpd2ph_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtpd2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h {
+    _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundpd_ph<const ROUNDING: i32>(a: __m512d) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
+/// when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundpd_ph<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m512d,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtpd2ph_512(a, src, k, ROUNDING)
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundpd_ph<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsd2sh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h {
+    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element
+/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsd2sh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
+    unsafe { vcvtsd2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsd2sh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
+    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvt_roundsd_sh<const ROUNDING: i32>(a: __m128h, b: __m128d) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element
+/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvt_roundsd_sh<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128d,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtsd2sh(a, b, src, k, ROUNDING)
+    }
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvt_roundsd_sh<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128d,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtph_epi16(a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epi16(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvtph2w_128(a, src.as_i16x8(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtph_epi16(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epi16(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
+/// store the results in dst.
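+
+// Editorial sketch (illustrative only): the `mask_`/`maskz_` pairs above differ only in
+// how lanes with a clear mask bit are filled: `mask_` keeps the lane from `src`, while
+// `maskz_` zeroes it. Assuming a nightly toolchain with `avx512fp16` and `avx512vl`:
+//
+//     unsafe {
+//         let a = _mm_set1_ph(3.0);
+//         let src = _mm_set1_epi16(-1);
+//         let kept = _mm_mask_cvtph_epi16(src, 0b0000_1111, a);    // upper 4 lanes stay -1
+//         let zeroed = _mm_maskz_cvtph_epi16(0b0000_1111, a);      // upper 4 lanes become 0
+//     }
+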
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtph_epi16(a: __m256h) -> __m256i { + _mm256_mask_cvtph_epi16(_mm256_undefined_si256(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { + unsafe { transmute(vcvtph2w_256(a, src.as_i16x16(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtph_epi16(k: __mmask16, a: __m256h) -> __m256i { + _mm256_mask_cvtph_epi16(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and +/// store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtph_epi16(a: __m512h) -> __m512i { + _mm512_mask_cvtph_epi16(_mm512_undefined_epi32(), 0xffffffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { + unsafe { + transmute(vcvtph2w_512( + a, + src.as_i16x32(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtph_epi16(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_cvtph_epi16(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
+/// store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundph_epi16<const ROUNDING: i32>(a: __m512h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundph_epi16<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtph2w_512(a, src.as_i16x32(), k, ROUNDING))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundph_epi16<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
+/// and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtph_epu16(a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epu16(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvtph2uw_128(a, src.as_u16x8(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
+/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtph_epu16(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epu16(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
+/// and store the results in dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtph_epu16(a: __m256h) -> __m256i { + _mm256_mask_cvtph_epu16(_mm256_undefined_si256(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, +/// and store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { + unsafe { transmute(vcvtph2uw_256(a, src.as_u16x16(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtph_epu16(k: __mmask16, a: __m256h) -> __m256i { + _mm256_mask_cvtph_epu16(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtph_epu16(a: __m512h) -> __m512i { + _mm512_mask_cvtph_epu16(_mm512_undefined_epi32(), 0xffffffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, +/// and store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { + unsafe { + transmute(vcvtph2uw_512( + a, + src.as_u16x32(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtph_epu16(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_cvtph_epu16(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
+/// and store the results in dst.
+///
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundph_epu16<const SAE: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvtph2uw_512(a, src.as_u16x32(), k, SAE))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
+/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
+/// truncation, and store the results in dst.
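+
+// Editorial sketch (illustrative only): unlike the `ROUNDING` intrinsics, the `SAE`
+// ("suppress all exceptions") intrinsics above do not select a rounding mode; the const
+// parameter is restricted by `static_assert_sae!` to `_MM_FROUND_CUR_DIRECTION` or
+// `_MM_FROUND_NO_EXC`. Assuming a nightly toolchain with `avx512fp16`:
+//
+//     let r: __m512i = unsafe {
+//         let a = _mm512_set1_ph(7.5);
+//         // convert without raising floating-point exception flags
+//         _mm512_cvt_roundph_epu16::<_MM_FROUND_NO_EXC>(a)
+//     };
+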
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvttph_epi16(a: __m128h) -> __m128i { + _mm_mask_cvttph_epi16(_mm_undefined_si128(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with +/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvttph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvttph2w_128(a, src.as_i16x8(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with +/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvttph_epi16(k: __mmask8, a: __m128h) -> __m128i { + _mm_mask_cvttph_epi16(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with +/// truncation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvttph_epi16(a: __m256h) -> __m256i { + _mm256_mask_cvttph_epi16(_mm256_undefined_si256(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with +/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvttph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { + unsafe { transmute(vcvttph2w_256(a, src.as_i16x16(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with +/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_maskz_cvttph_epi16(k: __mmask16, a: __m256h) -> __m256i {
+    _mm256_mask_cvttph_epi16(_mm256_setzero_si256(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
+/// truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvttph_epi16(a: __m512h) -> __m512i {
+    _mm512_mask_cvttph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
+/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvttph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
+    unsafe {
+        transmute(vcvttph2w_512(
+            a,
+            src.as_i16x32(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
+/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvttph_epi16(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_cvttph_epi16(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
+/// truncation, and store the results in dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtt_roundph_epi16<const SAE: i32>(a: __m512h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
+/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtt_roundph_epi16<const SAE: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttph2w_512(a, src.as_i16x32(), k, SAE))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
+/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtt_roundph_epi16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
+/// truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvttph_epu16(a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epu16(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
+/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvttph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvttph2uw_128(a, src.as_u16x8(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
+/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvttph_epu16(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epu16(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
+/// truncation, and store the results in dst.
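// Editor's note, not part of the diff: the `SAE` const generic restored in the
// `*_cvtt_roundph_*` signatures above is the suppress-all-exceptions flag and is passed
// at the call site via turbofish. Sketch reusing the scaffolding from the earlier note;
// `_mm512_setzero_ph` is assumed to come from the same module.
#[target_feature(enable = "avx512fp16")]
unsafe fn demo_sae() -> __m512i {
    let a = _mm512_setzero_ph();
    // Same truncating conversion as `_mm512_cvttph_epi16`, but with floating-point
    // exceptions suppressed.
    _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a)
}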
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvttph_epu16(a: __m256h) -> __m256i { + _mm256_mask_cvttph_epu16(_mm256_undefined_si256(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with +/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvttph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { + unsafe { transmute(vcvttph2uw_256(a, src.as_u16x16(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with +/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvttph_epu16(k: __mmask16, a: __m256h) -> __m256i { + _mm256_mask_cvttph_epu16(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with +/// truncation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvttph_epu16(a: __m512h) -> __m512i { + _mm512_mask_cvttph_epu16(_mm512_undefined_epi32(), 0xffffffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with +/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvttph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { + unsafe { + transmute(vcvttph2uw_512( + a, + src.as_u16x32(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with +/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvttph_epu16(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_cvttph_epu16(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
+/// truncation, and store the results in dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
+/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtt_roundph_epu16<const SAE: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttph2uw_512(a, src.as_u16x32(), k, SAE))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
+/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
+/// results in dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtph_epi32(a: __m128h) -> __m128i { + _mm_mask_cvtph_epi32(_mm_undefined_si128(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvtph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvtph2dq_128(a, src.as_i32x4(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m128i { + _mm_mask_cvtph_epi32(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtph_epi32(a: __m128h) -> __m256i { + _mm256_mask_cvtph_epi32(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvtph2dq_256(a, src.as_i32x8(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
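// Editor's note, not part of the diff: the 32-bit conversions widen each lane, so the
// 8 f16 lanes of a `__m128h` fill a `__m256i`, and the 16 lanes of a `__m256h` fill a
// `__m512i`. Sketch (same scaffolding as the earlier note):
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn demo_widening() {
    // 8 lanes of 2.5 (raw f16 bit pattern 0x4100).
    let a: __m128h = std::mem::transmute([0x4100u16; 8]);
    let r: [i32; 8] = std::mem::transmute(_mm256_cvtph_epi32(a));
    // With the default MXCSR rounding (round-to-nearest-even), 2.5 converts to 2.
    assert_eq!(r, [2; 8]);
}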
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m256i { + _mm256_mask_cvtph_epi32(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtph_epi32(a: __m256h) -> __m512i { + _mm512_mask_cvtph_epi32(_mm512_undefined_epi32(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { + unsafe { + transmute(vcvtph2dq_512( + a, + src.as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvtph_epi32(k: __mmask16, a: __m256h) -> __m512i { + _mm512_mask_cvtph_epi32(_mm512_setzero_si512(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst. 
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundph_epi32<const ROUNDING: i32>(a: __m256h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
+/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundph_epi32<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m256h,
+) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtph2dq_512(a, src.as_i32x16(), k, ROUNDING))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
+/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundph_epi32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
+/// the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_i32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2si))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtsh_i32(a: __m128h) -> i32 {
+    unsafe { vcvtsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
+/// the result in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_i32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvt_roundsh_i32<const ROUNDING: i32>(a: __m128h) -> i32 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtsh2si32(a, ROUNDING)
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
+/// the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtph_epu32(a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epu32(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
+/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
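// Editor's note, not part of the diff: sketch of the explicit-rounding variant restored
// above. The rounding mode is a const generic; every mode other than
// `_MM_FROUND_CUR_DIRECTION` is OR-ed with `_MM_FROUND_NO_EXC` (same scaffolding as the
// earlier note).
#[target_feature(enable = "avx512fp16")]
unsafe fn demo_rounding() {
    // 16 lanes of 2.5 (raw f16 bit pattern 0x4100).
    let a: __m256h = std::mem::transmute([0x4100u16; 16]);
    let up: [i32; 16] = std::mem::transmute(
        _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a),
    );
    let down: [i32; 16] = std::mem::transmute(
        _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a),
    );
    assert_eq!(up, [3; 16]);
    assert_eq!(down, [2; 16]);
}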
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvtph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvtph2udq_128(a, src.as_u32x4(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m128i { + _mm_mask_cvtph_epu32(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store +/// the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtph_epu32(a: __m128h) -> __m256i { + _mm256_mask_cvtph_epu32(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvtph2udq_256(a, src.as_u32x8(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m256i { + _mm256_mask_cvtph_epu32(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store +/// the results in dst. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtph_epu32(a: __m256h) -> __m512i {
+    _mm512_mask_cvtph_epu32(_mm512_undefined_epi32(), 0xffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
+/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
+    unsafe {
+        transmute(vcvtph2udq_512(
+            a,
+            src.as_u32x16(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
+/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtph_epu32(k: __mmask16, a: __m256h) -> __m512i {
+    _mm512_mask_cvtph_epu32(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
+/// the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundph_epu32<const ROUNDING: i32>(a: __m256h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
+/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundph_epu32<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m256h,
+) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtph2udq_512(a, src.as_u32x16(), k, ROUNDING))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
+/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundph_epu32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
+/// the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_u32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2usi))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtsh_u32(a: __m128h) -> u32 {
+    unsafe { vcvtsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
+/// the result in dst.
+///
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_u32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2usi, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
+    unsafe {
+        static_assert_rounding!(SAE);
+        vcvtsh2usi32(a, SAE)
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2dq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvttph_epi32(a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epi32(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2dq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvttph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvttph2dq_128(a, src.as_i32x4(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2dq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epi32(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2dq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_cvttph_epi32(a: __m128h) -> __m256i {
+    _mm256_mask_cvttph_epi32(_mm256_undefined_si256(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvttph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvttph2dq_256(a, src.as_i32x8(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m256i { + _mm256_mask_cvttph_epi32(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and +/// store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvttph_epi32(a: __m256h) -> __m512i { + _mm512_mask_cvttph_epi32(_mm512_undefined_epi32(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvttph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { + unsafe { + transmute(vcvttph2dq_512( + a, + src.as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvttph_epi32(k: __mmask16, a: __m256h) -> __m512i { + _mm512_mask_cvttph_epi32(_mm512_setzero_si512(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and +/// store the results in dst. +/// +/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtt_roundph_epi32<const SAE: i32>(a: __m256h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtt_roundph_epi32<const SAE: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m256h,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttph2dq_512(a, src.as_i32x16(), k, SAE))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtt_roundph_epi32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
+/// the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_i32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttsh2si))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvttsh_i32(a: __m128h) -> i32 {
+    unsafe { vcvttsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
+/// the result in dst.
+///
+/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
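// Editor's note, not part of the diff: the scalar `sh` conversions read only the lowest
// f16 lane, and the `cvtt` forms always truncate while the `cvt` forms use the current
// MXCSR rounding mode (round-to-nearest-even by default). Sketch with 2.75 in the low
// lane (same scaffolding as the earlier note):
#[target_feature(enable = "avx512fp16")]
unsafe fn demo_scalar() {
    // Low lane holds 2.75 (raw f16 bit pattern 0x4180); the upper lanes are ignored.
    let a: __m128h = std::mem::transmute([0x4180u16, 0, 0, 0, 0, 0, 0, 0]);
    assert_eq!(_mm_cvtsh_i32(a), 3); // rounds to nearest
    assert_eq!(_mm_cvttsh_i32(a), 2); // truncates toward zero
}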
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_i32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtt_roundsh_i32<const SAE: i32>(a: __m128h) -> i32 {
+    unsafe {
+        static_assert_sae!(SAE);
+        vcvttsh2si32(a, SAE)
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvttph_epu32(a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epu32(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvttph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvttph2udq_128(a, src.as_u32x4(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epu32(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_cvttph_epu32(a: __m128h) -> __m256i {
+    _mm256_mask_cvttph_epu32(_mm256_undefined_si256(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvttph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvttph2udq_256(a, src.as_u32x8(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m256i { + _mm256_mask_cvttph_epu32(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and +/// store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvttph_epu32(a: __m256h) -> __m512i { + _mm512_mask_cvttph_epu32(_mm512_undefined_epi32(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvttph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { + unsafe { + transmute(vcvttph2udq_512( + a, + src.as_u32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvttph_epu32(k: __mmask16, a: __m256h) -> __m512i { + _mm512_mask_cvttph_epu32(_mm512_setzero_si512(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and +/// store the results in dst. +/// +/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtt_roundph_epu32<const SAE: i32>(a: __m256h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtt_roundph_epu32<const SAE: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m256h,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttph2udq_512(a, src.as_u32x16(), k, SAE))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtt_roundph_epu32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
+/// the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_u32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttsh2usi))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvttsh_u32(a: __m128h) -> u32 {
+    unsafe { vcvttsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
+/// the result in dst.
+///
+/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_u32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
+    unsafe {
+        static_assert_sae!(SAE);
+        vcvttsh2usi32(a, SAE)
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtph_epi64(a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epi64(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvtph2qq_128(a, src.as_i64x2(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epi64(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_cvtph_epi64(a: __m128h) -> __m256i {
+    _mm256_mask_cvtph_epi64(_mm256_undefined_si256(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvtph2qq_256(a, src.as_i64x4(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m256i { + _mm256_mask_cvtph_epi64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and +/// store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtph_epi64(a: __m128h) -> __m512i { + _mm512_mask_cvtph_epi64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { + unsafe { + transmute(vcvtph2qq_512( + a, + src.as_i64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m512i { + _mm512_mask_cvtph_epi64(_mm512_setzero_si512(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and +/// store the results in dst. 
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundph_epi64<const ROUNDING: i32>(a: __m128h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundph_epi64<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m128h,
+) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtph2qq_512(a, src.as_i64x8(), k, ROUNDING))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundph_epi64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtph_epu64(a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epu64(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvtph2uqq_128(a, src.as_u64x2(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epu64(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
+/// store the results in dst.
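The rounding combinations listed in the doc comments above are plain `i32` constants, so a direction flag is OR-ed with `_MM_FROUND_NO_EXC` inside the const-generic argument. A hedged sketch with an illustrative wrapper name, under the same nightly/hardware assumptions as before:

```rust
#![feature(stdarch_x86_avx512_f16)]

use std::arch::x86_64::*;

/// Convert eight f16 lanes to i64 lanes, rounding toward negative infinity
/// with exceptions suppressed (one of the combinations listed above).
#[target_feature(enable = "avx512fp16")]
unsafe fn cvt_ph_to_i64_round_down(a: __m128h) -> __m512i {
    // Braces are required because the const-generic argument is an expression.
    _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a)
}
```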
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtph_epu64(a: __m128h) -> __m256i { + _mm256_mask_cvtph_epu64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvtph2uqq_256(a, src.as_u64x4(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m256i { + _mm256_mask_cvtph_epu64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and +/// store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtph_epu64(a: __m128h) -> __m512i { + _mm512_mask_cvtph_epu64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { + unsafe { + transmute(vcvtph2uqq_512( + a, + src.as_u64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m512i {
+    _mm512_mask_cvtph_epu64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
+/// store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundph_epu64<const ROUNDING: i32>(a: __m128h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundph_epu64<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m128h,
+) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtph2uqq_512(a, src.as_u64x8(), k, ROUNDING))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundph_epu64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvttph_epi64(a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epi64(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvttph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvttph2qq_128(a, src.as_i64x2(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epi64(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
+/// store the results in dst.
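The writemask/zeromask wording repeated above can be made concrete with the 128-bit truncating conversion just defined. A minimal sketch contrasting the two masked forms; the wrapper name is illustrative and the usual nightly-feature and hardware assumptions apply (plus `avx512vl` for the 128-bit forms):

```rust
#![feature(stdarch_x86_avx512_f16)]

use std::arch::x86_64::*;

/// With mask 0b01 only lane 0 is converted: the `mask` form keeps `src` in
/// lane 1, while the `maskz` form writes zero there instead.
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn masked_truncations(src: __m128i, a: __m128h) -> (__m128i, __m128i) {
    let merged = _mm_mask_cvttph_epi64(src, 0b01, a);
    let zeroed = _mm_maskz_cvttph_epi64(0b01, a);
    (merged, zeroed)
}
```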
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvttph_epi64(a: __m128h) -> __m256i { + _mm256_mask_cvttph_epi64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvttph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvttph2qq_256(a, src.as_i64x4(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m256i { + _mm256_mask_cvttph_epi64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and +/// store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvttph_epi64(a: __m128h) -> __m512i { + _mm512_mask_cvttph_epi64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvttph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { + unsafe { + transmute(vcvttph2qq_512( + a, + src.as_i64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m512i {
+    _mm512_mask_cvttph_epi64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
+/// store the results in dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtt_roundph_epi64<const SAE: i32>(a: __m128h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtt_roundph_epi64<const SAE: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m128h,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttph2qq_512(a, src.as_i64x8(), k, SAE))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtt_roundph_epi64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
+/// store the results in dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvttph_epu64(a: __m128h) -> __m128i { + _mm_mask_cvttph_epu64(_mm_undefined_si128(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvttph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvttph2uqq_128(a, src.as_u64x2(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m128i { + _mm_mask_cvttph_epu64(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and +/// store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvttph_epu64(a: __m128h) -> __m256i { + _mm256_mask_cvttph_epu64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvttph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvttph2uqq_256(a, src.as_u64x4(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m256i {
+    _mm256_mask_cvttph_epu64(_mm256_setzero_si256(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvttph_epu64(a: __m128h) -> __m512i {
+    _mm512_mask_cvttph_epu64(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvttph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
+    unsafe {
+        transmute(vcvttph2uqq_512(
+            a,
+            src.as_u64x8(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m512i {
+    _mm512_mask_cvttph_epu64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
+/// store the results in dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtt_roundph_epu64<const SAE: i32>(a: __m128h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtt_roundph_epu64<const SAE: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m128h,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttph2uqq_512(a, src.as_u64x8(), k, SAE))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtt_roundph_epu64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxph_ps)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2psx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtxph_ps(a: __m128h) -> __m128 {
+    _mm_mask_cvtxph_ps(_mm_setzero_ps(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
+/// dst when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxph_ps)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2psx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtxph_ps(src: __m128, k: __mmask8, a: __m128h) -> __m128 {
+    unsafe { vcvtph2psx_128(a, src, k) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxph_ps)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2psx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m128 {
+    _mm_mask_cvtxph_ps(_mm_setzero_ps(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxph_ps) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2psx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtxph_ps(a: __m128h) -> __m256 { + _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to +/// dst when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxph_ps) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2psx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtxph_ps(src: __m256, k: __mmask8, a: __m128h) -> __m256 { + unsafe { vcvtph2psx_256(a, src, k) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxph_ps) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2psx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m256 { + _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxph_ps) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2psx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtxph_ps(a: __m256h) -> __m512 { + _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to +/// dst when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxph_ps) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2psx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtxph_ps(src: __m512, k: __mmask16, a: __m256h) -> __m512 { + unsafe { vcvtph2psx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). 
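The `cvtxph_ps` family above performs a lossless widening from f16 to f32 lanes, with the usual masked variants. A short hedged sketch of the 512-bit forms (illustrative wrapper name, same nightly/hardware assumptions); with an all-ones mask the masked call matches the unmasked one:

```rust
#![feature(stdarch_x86_avx512_f16)]

use std::arch::x86_64::*;

/// Widen sixteen f16 lanes to f32 lanes, unmasked and with a full mask.
#[target_feature(enable = "avx512fp16")]
unsafe fn widen_f16_to_f32(a: __m256h) -> (__m512, __m512) {
    let plain = _mm512_cvtxph_ps(a);
    // 0xffff selects all 16 lanes, so `src` (here zeros) is never used.
    let masked = _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), 0xffff, a);
    (plain, masked)
}
```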
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxph_ps)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2psx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtxph_ps(k: __mmask16, a: __m256h) -> __m512 {
+    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundph_ps)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtx_roundph_ps<const SAE: i32>(a: __m256h) -> __m512 {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), 0xffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
+/// dst when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundph_ps)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtx_roundph_ps<const SAE: i32>(
+    src: __m512,
+    k: __mmask16,
+    a: __m256h,
+) -> __m512 {
+    unsafe {
+        static_assert_sae!(SAE);
+        vcvtph2psx_512(a, src, k, SAE)
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundph_ps)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtx_roundph_ps<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512 {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), k, a)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed
+/// elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_ss)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2ss))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtsh_ss(a: __m128, b: __m128h) -> __m128 {
+    _mm_mask_cvtsh_ss(a, 0xff, a, b)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element is
+/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
+/// upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_ss)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2ss))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m128 {
+    unsafe { vcvtsh2ss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
+/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
+/// of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_ss)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2ss))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
+    _mm_mask_cvtsh_ss(_mm_set_ss(0.0), k, a, b)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements
+/// from a to the upper elements of dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_ss)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvt_roundsh_ss<const SAE: i32>(a: __m128, b: __m128h) -> __m128 {
+    static_assert_sae!(SAE);
+    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_undefined_ps(), 0xff, a, b)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element is
+/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
+/// upper elements of dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_ss)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvt_roundsh_ss<const SAE: i32>(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128h,
+) -> __m128 {
+    unsafe {
+        static_assert_sae!(SAE);
+        vcvtsh2ss(a, b, src, k, SAE)
+    }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
+/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
+/// of dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_ss)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvt_roundsh_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
+    static_assert_sae!(SAE);
+    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_set_ss(0.0), k, a, b)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_pd)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2pd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtph_pd(a: __m128h) -> __m128d {
+    _mm_mask_cvtph_pd(_mm_setzero_pd(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
+/// dst when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_pd)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2pd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtph_pd(src: __m128d, k: __mmask8, a: __m128h) -> __m128d {
+    unsafe { vcvtph2pd_128(a, src, k) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_pd)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2pd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m128d {
+    _mm_mask_cvtph_pd(_mm_setzero_pd(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
+/// floating-point elements, and store the results in dst.
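For the scalar `cvtsh_ss` family above, only lane 0 is converted while the upper f32 lanes are passed through from `a`; the `_round` variant adds SAE control. A minimal sketch under the same assumptions, with an illustrative wrapper name:

```rust
#![feature(stdarch_x86_avx512_f16)]

use std::arch::x86_64::*;

/// Lane 0 of each result is b[0] widened to f32; lanes 1..=3 come from `a`.
/// The second call requests the same conversion with exceptions suppressed.
#[target_feature(enable = "avx512fp16")]
unsafe fn widen_low_half(a: __m128, b: __m128h) -> (__m128, __m128) {
    let cur = _mm_cvtsh_ss(a, b);
    let sae = _mm_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(a, b);
    (cur, sae)
}
```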
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_pd) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2pd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtph_pd(a: __m128h) -> __m256d { + _mm256_mask_cvtph_pd(_mm256_setzero_pd(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to +/// dst when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_pd) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2pd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtph_pd(src: __m256d, k: __mmask8, a: __m128h) -> __m256d { + unsafe { vcvtph2pd_256(a, src, k) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_pd) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2pd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m256d { + _mm256_mask_cvtph_pd(_mm256_setzero_pd(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_pd) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2pd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtph_pd(a: __m128h) -> __m512d { + _mm512_mask_cvtph_pd(_mm512_setzero_pd(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to +/// dst when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_pd) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2pd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtph_pd(src: __m512d, k: __mmask8, a: __m128h) -> __m512d { + unsafe { vcvtph2pd_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_pd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2pd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m512d {
+    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_pd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundph_pd<const SAE: i32>(a: __m128h) -> __m512d {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
+/// dst when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_pd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundph_pd<const SAE: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m128h,
+) -> __m512d {
+    unsafe {
+        static_assert_sae!(SAE);
+        vcvtph2pd_512(a, src, k, SAE)
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_pd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundph_pd<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512d {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), k, a)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper element
+/// from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_sd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2sd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtsh_sd(a: __m128d, b: __m128h) -> __m128d {
+    _mm_mask_cvtsh_sd(a, 0xff, a, b)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element is
+/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
+/// of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_sd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2sd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
+    unsafe { vcvtsh2sd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
+/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_sd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2sd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
+    _mm_mask_cvtsh_sd(_mm_set_sd(0.0), k, a, b)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper element from a
+/// to the upper element of dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_sd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvt_roundsh_sd<const SAE: i32>(a: __m128d, b: __m128h) -> __m128d {
+    static_assert_sae!(SAE);
+    _mm_mask_cvt_roundsh_sd::<SAE>(a, 0xff, a, b)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element is
+/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
+/// of dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_sd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvt_roundsh_sd<const SAE: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128h,
+) -> __m128d {
+    unsafe {
+        static_assert_sae!(SAE);
+        vcvtsh2sd(a, b, src, k, SAE)
+    }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
+/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_sd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
+    static_assert_sae!(SAE);
+    _mm_mask_cvt_roundsh_sd::<SAE>(_mm_set_sd(0.0), k, a, b)
+}
+
+/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtsh_h(a: __m128h) -> f16 {
+    unsafe { simd_extract!(a, 0) }
+}
+
+/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_cvtsh_h(a: __m256h) -> f16 {
+    unsafe { simd_extract!(a, 0) }
+}
+
+/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtsh_h(a: __m512h) -> f16 {
+    unsafe { simd_extract!(a, 0) }
+}
+
+/// Copy the lower 16-bit integer in a to dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
+    unsafe { simd_extract!(a.as_i16x8(), 0) }
+}
+
+/// Copy 16-bit integer a to the lower elements of dst, and zero the upper elements of dst.
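+///
+/// A small round-trip sketch, assuming the nightly `stdarch_x86_avx512_f16` feature and a caller
+/// compiled with `avx512fp16` enabled (`round_trip` is just an illustrative name):
+///
+/// ```ignore
+/// #![feature(stdarch_x86_avx512_f16)]
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512fp16")]
+/// fn round_trip(x: i16) -> i16 {
+///     // Widen the integer into the low lane of a vector, then extract it again.
+///     _mm_cvtsi128_si16(_mm_cvtsi16_si128(x))
+/// }
+/// ```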
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtsi16_si128(a: i16) -> __m128i { + unsafe { transmute(simd_insert!(i16x8::ZERO, 0, a)) } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"] + fn vcmpsh(a: __m128h, b: __m128h, imm8: i32, mask: __mmask8, sae: i32) -> __mmask8; + #[link_name = "llvm.x86.avx512fp16.vcomi.sh"] + fn vcomish(a: __m128h, b: __m128h, imm8: i32, sae: i32) -> i32; + + #[link_name = "llvm.x86.avx512fp16.add.ph.512"] + fn vaddph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.sub.ph.512"] + fn vsubph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mul.ph.512"] + fn vmulph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.div.ph.512"] + fn vdivph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; + + #[link_name = "llvm.x86.avx512fp16.mask.add.sh.round"] + fn vaddsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.sub.sh.round"] + fn vsubsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.mul.sh.round"] + fn vmulsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.div.sh.round"] + fn vdivsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.128"] + fn vfmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128; + #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.256"] + fn vfmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256; + #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.512"] + fn vfmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512; + #[link_name = "llvm.x86.avx512fp16.mask.vfmul.csh"] + fn vfmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128; + + #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.128"] + fn vfcmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128; + #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.256"] + fn vfcmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256; + #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.512"] + fn vfcmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512; + #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.csh"] + fn vfcmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128; + + #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.128"] + fn vfmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128; + #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.128"] + fn vfmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128; + #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.256"] + fn vfmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256; + #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.256"] + fn vfmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256; + #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.512"] + fn vfmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512; 
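+    // The complex half-precision intrinsics in this group (`vfmulcph`, `vfcmulcph`, `vfmaddcph`,
+    // `vfcmaddcph` and their scalar `csh` forms) are declared over `__m128`/`__m256`/`__m512`:
+    // each complex value is a (real, imaginary) pair of f16s occupying one 32-bit lane, which is
+    // also why the 512-bit forms take `__mmask16` instead of the `__mmask32` used elsewhere.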
+ #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.512"] + fn vfmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512; + #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.csh"] + fn vfmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128; + #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.csh"] + fn vfmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128; + + #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.128"] + fn vfcmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128; + #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.128"] + fn vfcmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128; + #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.256"] + fn vfcmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256; + #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.256"] + fn vfcmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256; + #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.512"] + fn vfcmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) + -> __m512; + #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.512"] + fn vfcmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) + -> __m512; + #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.csh"] + fn vfcmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128; + #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.csh"] + fn vfcmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128; + + #[link_name = "llvm.x86.avx512fp16.vfmadd.ph.512"] + fn vfmaddph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.vfmadd.f16"] + fn vfmaddsh(a: f16, b: f16, c: f16, rounding: i32) -> f16; + + #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.128"] + fn vfmaddsubph_128(a: __m128h, b: __m128h, c: __m128h) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.256"] + fn vfmaddsubph_256(a: __m256h, b: __m256h, c: __m256h) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.512"] + fn vfmaddsubph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h; + + #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.128"] + fn vrcpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.256"] + fn vrcpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.512"] + fn vrcpph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.rcp.sh"] + fn vrcpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.128"] + fn vrsqrtph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.256"] + fn vrsqrtph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.512"] + fn vrsqrtph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.sh"] + fn vrsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.sqrt.ph.512"] + fn vsqrtph_512(a: __m512h, rounding: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.sqrt.sh"] + fn vsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + 
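+    // Common shape of these declarations: the `mask.*` intrinsics thread the merge source and
+    // mask through trailing `src`/`k` parameters, and operations with embedded rounding or
+    // exception suppression take a final `i32` (`rounding`/`sae`) that the public wrappers fill
+    // in from a const generic such as `SAE` or with `_MM_FROUND_CUR_DIRECTION`.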
+ #[link_name = "llvm.x86.avx512fp16.max.ph.128"] + fn vmaxph_128(a: __m128h, b: __m128h) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.max.ph.256"] + fn vmaxph_256(a: __m256h, b: __m256h) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.max.ph.512"] + fn vmaxph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.max.sh.round"] + fn vmaxsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.min.ph.128"] + fn vminph_128(a: __m128h, b: __m128h) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.min.ph.256"] + fn vminph_256(a: __m256h, b: __m256h) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.min.ph.512"] + fn vminph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.min.sh.round"] + fn vminsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.128"] + fn vgetexpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.256"] + fn vgetexpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.512"] + fn vgetexpph_512(a: __m512h, src: __m512h, k: __mmask32, sae: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.getexp.sh"] + fn vgetexpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.128"] + fn vgetmantph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.256"] + fn vgetmantph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.512"] + fn vgetmantph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.getmant.sh"] + fn vgetmantsh( + a: __m128h, + b: __m128h, + imm8: i32, + src: __m128h, + k: __mmask8, + sae: i32, + ) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.128"] + fn vrndscaleph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.256"] + fn vrndscaleph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.512"] + fn vrndscaleph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.rndscale.sh"] + fn vrndscalesh( + a: __m128h, + b: __m128h, + src: __m128h, + k: __mmask8, + imm8: i32, + sae: i32, + ) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.128"] + fn vscalefph_128(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.256"] + fn vscalefph_256(a: __m256h, b: __m256h, src: __m256h, k: __mmask16) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.512"] + fn vscalefph_512(a: __m512h, b: __m512h, src: __m512h, k: __mmask32, rounding: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.scalef.sh"] + fn vscalefsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.128"] + fn vreduceph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.256"] + fn vreduceph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h; + 
#[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.512"] + fn vreduceph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.reduce.sh"] + fn vreducesh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, imm8: i32, sae: i32) + -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.fpclass.sh"] + fn vfpclasssh(a: __m128h, imm8: i32, k: __mmask8) -> __mmask8; + + #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i16"] + fn vcvtw2ph_128(a: i16x8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i16"] + fn vcvtw2ph_256(a: i16x16, rounding: i32) -> __m256h; + #[link_name = "llvm.x86.avx512.sitofp.round.v32f16.v32i16"] + fn vcvtw2ph_512(a: i16x32, rounding: i32) -> __m512h; + #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i16"] + fn vcvtuw2ph_128(a: u16x8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i16"] + fn vcvtuw2ph_256(a: u16x16, rounding: i32) -> __m256h; + #[link_name = "llvm.x86.avx512.uitofp.round.v32f16.v32i16"] + fn vcvtuw2ph_512(a: u16x32, rounding: i32) -> __m512h; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtdq2ph.128"] + fn vcvtdq2ph_128(a: i32x4, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i32"] + fn vcvtdq2ph_256(a: i32x8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i32"] + fn vcvtdq2ph_512(a: i32x16, rounding: i32) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.vcvtsi2sh"] + fn vcvtsi2sh(a: __m128h, b: i32, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtudq2ph.128"] + fn vcvtudq2ph_128(a: u32x4, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i32"] + fn vcvtudq2ph_256(a: u32x8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i32"] + fn vcvtudq2ph_512(a: u32x16, rounding: i32) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.vcvtusi2sh"] + fn vcvtusi2sh(a: __m128h, b: u32, rounding: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.128"] + fn vcvtqq2ph_128(a: i64x2, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.256"] + fn vcvtqq2ph_256(a: i64x4, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i64"] + fn vcvtqq2ph_512(a: i64x8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.128"] + fn vcvtuqq2ph_128(a: u64x2, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.256"] + fn vcvtuqq2ph_256(a: u64x4, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i64"] + fn vcvtuqq2ph_512(a: u64x8, rounding: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.128"] + fn vcvtps2phx_128(a: __m128, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.256"] + fn vcvtps2phx_256(a: __m256, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.512"] + fn vcvtps2phx_512(a: __m512, src: __m256h, k: __mmask16, rounding: i32) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtss2sh.round"] + fn vcvtss2sh(a: __m128h, b: __m128, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.128"] + fn vcvtpd2ph_128(a: __m128d, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = 
"llvm.x86.avx512fp16.mask.vcvtpd2ph.256"] + fn vcvtpd2ph_256(a: __m256d, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.512"] + fn vcvtpd2ph_512(a: __m512d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtsd2sh.round"] + fn vcvtsd2sh(a: __m128h, b: __m128d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.128"] + fn vcvtph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.256"] + fn vcvtph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.512"] + fn vcvtph2w_512(a: __m512h, src: i16x32, k: __mmask32, rounding: i32) -> i16x32; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.128"] + fn vcvtph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.256"] + fn vcvtph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.512"] + fn vcvtph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.128"] + fn vcvttph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.256"] + fn vcvttph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.512"] + fn vcvttph2w_512(a: __m512h, src: i16x32, k: __mmask32, sae: i32) -> i16x32; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.128"] + fn vcvttph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.256"] + fn vcvttph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.512"] + fn vcvttph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.128"] + fn vcvtph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.256"] + fn vcvtph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.512"] + fn vcvtph2dq_512(a: __m256h, src: i32x16, k: __mmask16, rounding: i32) -> i32x16; + #[link_name = "llvm.x86.avx512fp16.vcvtsh2si32"] + fn vcvtsh2si32(a: __m128h, rounding: i32) -> i32; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.128"] + fn vcvtph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.256"] + fn vcvtph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.512"] + fn vcvtph2udq_512(a: __m256h, src: u32x16, k: __mmask16, rounding: i32) -> u32x16; + #[link_name = "llvm.x86.avx512fp16.vcvtsh2usi32"] + fn vcvtsh2usi32(a: __m128h, sae: i32) -> u32; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.128"] + fn vcvttph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.256"] + fn vcvttph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.512"] + fn vcvttph2dq_512(a: __m256h, src: i32x16, k: __mmask16, sae: i32) -> i32x16; + #[link_name = "llvm.x86.avx512fp16.vcvttsh2si32"] + fn vcvttsh2si32(a: __m128h, sae: i32) -> i32; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.128"] + fn vcvttph2udq_128(a: __m128h, src: 
u32x4, k: __mmask8) -> u32x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.256"] + fn vcvttph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.512"] + fn vcvttph2udq_512(a: __m256h, src: u32x16, k: __mmask16, sae: i32) -> u32x16; + #[link_name = "llvm.x86.avx512fp16.vcvttsh2usi32"] + fn vcvttsh2usi32(a: __m128h, sae: i32) -> u32; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.128"] + fn vcvtph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.256"] + fn vcvtph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.512"] + fn vcvtph2qq_512(a: __m128h, src: i64x8, k: __mmask8, rounding: i32) -> i64x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.128"] + fn vcvtph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.256"] + fn vcvtph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.512"] + fn vcvtph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, rounding: i32) -> u64x8; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.128"] + fn vcvttph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.256"] + fn vcvttph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.512"] + fn vcvttph2qq_512(a: __m128h, src: i64x8, k: __mmask8, sae: i32) -> i64x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.128"] + fn vcvttph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.256"] + fn vcvttph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.512"] + fn vcvttph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, sae: i32) -> u64x8; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.128"] + fn vcvtph2psx_128(a: __m128h, src: __m128, k: __mmask8) -> __m128; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.256"] + fn vcvtph2psx_256(a: __m128h, src: __m256, k: __mmask8) -> __m256; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.512"] + fn vcvtph2psx_512(a: __m256h, src: __m512, k: __mmask16, sae: i32) -> __m512; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2ss.round"] + fn vcvtsh2ss(a: __m128, b: __m128h, src: __m128, k: __mmask8, sae: i32) -> __m128; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.128"] + fn vcvtph2pd_128(a: __m128h, src: __m128d, k: __mmask8) -> __m128d; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.256"] + fn vcvtph2pd_256(a: __m128h, src: __m256d, k: __mmask8) -> __m256d; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.512"] + fn vcvtph2pd_512(a: __m128h, src: __m512d, k: __mmask8, sae: i32) -> __m512d; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2sd.round"] + fn vcvtsh2sd(a: __m128d, b: __m128h, src: __m128d, k: __mmask8, sae: i32) -> __m128d; + +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use crate::mem::transmute; + use crate::ptr::{addr_of, addr_of_mut}; + use stdarch_test::simd_test; + + #[target_feature(enable = "avx512fp16")] + unsafe fn _mm_set1_pch(re: f16, im: f16) -> __m128h { + _mm_setr_ph(re, im, re, im, re, im, re, im) + } + + #[target_feature(enable = "avx512fp16")] + unsafe fn _mm256_set1_pch(re: f16, im: f16) -> __m256h { + _mm256_setr_ph( + re, im, re, im, re, im, re, im, re, im, re, im, re, im, 
re, im, + ) + } + + #[target_feature(enable = "avx512fp16")] + unsafe fn _mm512_set1_pch(re: f16, im: f16) -> __m512h { + _mm512_setr_ph( + re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, + re, im, re, im, re, im, re, im, re, im, + ) + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_set_ph() { + let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_set_ph() { + let r = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let e = _mm256_setr_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_set_ph() { + let r = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let e = _mm512_setr_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_set_sh() { + let r = _mm_set_sh(1.0); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_set1_ph() { + let r = _mm_set1_ph(1.0); + let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_set1_ph() { + let r = _mm256_set1_ph(1.0); + let e = _mm256_set_ph( + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_set1_ph() { + let r = _mm512_set1_ph(1.0); + let e = _mm512_set_ph( + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_setr_ph() { + let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_setr_ph() { + let r = _mm256_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let e = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_setr_ph() { + let r = _mm512_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let e = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = 
"avx512fp16,avx512vl")] + unsafe fn test_mm_setzero_ph() { + let r = _mm_setzero_ph(); + let e = _mm_set1_ph(0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_setzero_ph() { + let r = _mm256_setzero_ph(); + let e = _mm256_set1_ph(0.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_setzero_ph() { + let r = _mm512_setzero_ph(); + let e = _mm512_set1_ph(0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_castsi128_ph() { + let a = _mm_set1_epi16(0x3c00); + let r = _mm_castsi128_ph(a); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_castsi256_ph() { + let a = _mm256_set1_epi16(0x3c00); + let r = _mm256_castsi256_ph(a); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castsi512_ph() { + let a = _mm512_set1_epi16(0x3c00); + let r = _mm512_castsi512_ph(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_castph_si128() { + let a = _mm_set1_ph(1.0); + let r = _mm_castph_si128(a); + let e = _mm_set1_epi16(0x3c00); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_castph_si256() { + let a = _mm256_set1_ph(1.0); + let r = _mm256_castph_si256(a); + let e = _mm256_set1_epi16(0x3c00); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castph_si512() { + let a = _mm512_set1_ph(1.0); + let r = _mm512_castph_si512(a); + let e = _mm512_set1_epi16(0x3c00); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_castps_ph() { + let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00)); + let r = _mm_castps_ph(a); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_castps_ph() { + let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00)); + let r = _mm256_castps_ph(a); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castps_ph() { + let a = _mm512_castsi512_ps(_mm512_set1_epi16(0x3c00)); + let r = _mm512_castps_ph(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_castph_ps() { + let a = _mm_castsi128_ph(_mm_set1_epi32(0x3f800000)); + let r = _mm_castph_ps(a); + let e = _mm_set1_ps(1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_castph_ps() { + let a = _mm256_castsi256_ph(_mm256_set1_epi32(0x3f800000)); + let r = _mm256_castph_ps(a); + let e = _mm256_set1_ps(1.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castph_ps() { + let a = _mm512_castsi512_ph(_mm512_set1_epi32(0x3f800000)); + let r = _mm512_castph_ps(a); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_castpd_ph() { + let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00)); + let r = _mm_castpd_ph(a); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_castpd_ph() { + let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00)); + let r = _mm256_castpd_ph(a); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); 
+ } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castpd_ph() { + let a = _mm512_castsi512_pd(_mm512_set1_epi16(0x3c00)); + let r = _mm512_castpd_ph(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_castph_pd() { + let a = _mm_castsi128_ph(_mm_set1_epi64x(0x3ff0000000000000)); + let r = _mm_castph_pd(a); + let e = _mm_set1_pd(1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_castph_pd() { + let a = _mm256_castsi256_ph(_mm256_set1_epi64x(0x3ff0000000000000)); + let r = _mm256_castph_pd(a); + let e = _mm256_set1_pd(1.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castph_pd() { + let a = _mm512_castsi512_ph(_mm512_set1_epi64(0x3ff0000000000000)); + let r = _mm512_castph_pd(a); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_castph256_ph128() { + let a = _mm256_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm256_castph256_ph128(a); + let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm512_castph512_ph128() { + let a = _mm512_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., + 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_castph512_ph128(a); + let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm512_castph512_ph256() { + let a = _mm512_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., + 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_castph512_ph256(a); + let e = _mm256_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_castph128_ph256() { + let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_castph128_ph256(a); + assert_eq_m128h(_mm256_castph256_ph128(r), a); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm512_castph128_ph512() { + let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_castph128_ph512(a); + assert_eq_m128h(_mm512_castph512_ph128(r), a); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm512_castph256_ph512() { + let a = _mm256_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_castph256_ph512(a); + assert_eq_m256h(_mm512_castph512_ph256(r), a); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_zextph128_ph256() { + let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_zextph128_ph256(a); + let e = _mm256_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_zextph128_ph512() { + let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_zextph128_ph512(a); + let e = _mm512_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable 
= "avx512fp16")] + unsafe fn test_mm512_zextph256_ph512() { + let a = _mm256_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_zextph256_ph512(a); + let e = _mm512_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cmp_ph_mask() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0); + let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); + assert_eq!(r, 0b11110000); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cmp_ph_mask() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0); + let r = _mm_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101, a, b); + assert_eq!(r, 0b01010000); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cmp_ph_mask() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, + -16.0, + ); + let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); + assert_eq!(r, 0b1111000011110000); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cmp_ph_mask() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, + -16.0, + ); + let r = _mm256_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b0101010101010101, a, b); + assert_eq!(r, 0b0101000001010000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cmp_ph_mask() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, + -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0, + -29.0, -30.0, -31.0, -32.0, + ); + let r = _mm512_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); + assert_eq!(r, 0b11110000111100001111000011110000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cmp_ph_mask() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, + -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0, + -29.0, -30.0, -31.0, -32.0, + ); + let r = _mm512_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101010101010101010101010101, a, b); + assert_eq!(r, 0b01010000010100000101000001010000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cmp_round_ph_mask() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 1.0, 
2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, + -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0, + -29.0, -30.0, -31.0, -32.0, + ); + let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b); + assert_eq!(r, 0b11110000111100001111000011110000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cmp_round_ph_mask() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, + -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0, + -29.0, -30.0, -31.0, -32.0, + ); + let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + b, + ); + assert_eq!(r, 0b01010000010100000101000001010000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cmp_round_sh_mask() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cmp_round_sh_mask() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_mask_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(0, a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cmp_sh_mask() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_cmp_sh_mask::<_CMP_EQ_OQ>(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cmp_sh_mask() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_mask_cmp_sh_mask::<_CMP_EQ_OQ>(0, a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comi_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_comi_round_sh::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comi_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_comi_sh::<_CMP_EQ_OQ>(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comieq_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_comieq_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comige_sh() { + let a = _mm_set_sh(2.0); + let b = _mm_set_sh(1.0); + let r = _mm_comige_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comigt_sh() { + let a = _mm_set_sh(2.0); + let b = _mm_set_sh(1.0); + let r = _mm_comigt_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comile_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_comile_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comilt_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_comilt_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comineq_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_comineq_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomieq_sh() { + let a 
= _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_ucomieq_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomige_sh() { + let a = _mm_set_sh(2.0); + let b = _mm_set_sh(1.0); + let r = _mm_ucomige_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomigt_sh() { + let a = _mm_set_sh(2.0); + let b = _mm_set_sh(1.0); + let r = _mm_ucomigt_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomile_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_ucomile_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomilt_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_ucomilt_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomineq_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_ucomineq_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_load_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_load_ph(addr_of!(a).cast()); + assert_eq_m128h(a, b); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_load_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_load_ph(addr_of!(a).cast()); + assert_eq_m256h(a, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_load_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_load_ph(addr_of!(a).cast()); + assert_eq_m512h(a, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_load_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_load_sh(addr_of!(a).cast()); + assert_eq_m128h(a, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_load_sh() { + let a = _mm_set_sh(1.0); + let src = _mm_set_sh(2.); + let b = _mm_mask_load_sh(src, 1, addr_of!(a).cast()); + assert_eq_m128h(a, b); + let b = _mm_mask_load_sh(src, 0, addr_of!(a).cast()); + assert_eq_m128h(src, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_load_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_maskz_load_sh(1, addr_of!(a).cast()); + assert_eq_m128h(a, b); + let b = _mm_maskz_load_sh(0, addr_of!(a).cast()); + assert_eq_m128h(_mm_setzero_ph(), b); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_loadu_ph() { + let array = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let r = _mm_loadu_ph(array.as_ptr()); + let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_loadu_ph() { + let array = [ + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ]; + let r = _mm256_loadu_ph(array.as_ptr()); + let e = _mm256_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_loadu_ph() { + let array = [ + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 
29.0, 30.0, + 31.0, 32.0, + ]; + let r = _mm512_loadu_ph(array.as_ptr()); + let e = _mm512_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_move_sh() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_sh(9.0); + let r = _mm_move_sh(a, b); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_move_sh() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_sh(9.0); + let src = _mm_set_sh(10.0); + let r = _mm_mask_move_sh(src, 0, a, b); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 10.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_move_sh() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_sh(9.0); + let r = _mm_maskz_move_sh(0, a, b); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_store_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let mut b = _mm_setzero_ph(); + _mm_store_ph(addr_of_mut!(b).cast(), a); + assert_eq_m128h(a, b); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_store_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let mut b = _mm256_setzero_ph(); + _mm256_store_ph(addr_of_mut!(b).cast(), a); + assert_eq_m256h(a, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_store_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let mut b = _mm512_setzero_ph(); + _mm512_store_ph(addr_of_mut!(b).cast(), a); + assert_eq_m512h(a, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_store_sh() { + let a = _mm_set_sh(1.0); + let mut b = _mm_setzero_ph(); + _mm_store_sh(addr_of_mut!(b).cast(), a); + assert_eq_m128h(a, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_store_sh() { + let a = _mm_set_sh(1.0); + let mut b = _mm_setzero_ph(); + _mm_mask_store_sh(addr_of_mut!(b).cast(), 0, a); + assert_eq_m128h(_mm_setzero_ph(), b); + _mm_mask_store_sh(addr_of_mut!(b).cast(), 1, a); + assert_eq_m128h(a, b); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_storeu_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let mut array = [0.0; 8]; + _mm_storeu_ph(array.as_mut_ptr(), a); + assert_eq_m128h(a, _mm_loadu_ph(array.as_ptr())); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_storeu_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let mut array = [0.0; 16]; + _mm256_storeu_ph(array.as_mut_ptr(), a); + assert_eq_m256h(a, _mm256_loadu_ph(array.as_ptr())); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_storeu_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 
25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let mut array = [0.0; 32]; + _mm512_storeu_ph(array.as_mut_ptr(), a); + assert_eq_m512h(a, _mm512_loadu_ph(array.as_ptr())); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_add_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_add_ph(a, b); + let e = _mm_set1_ph(9.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_add_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_add_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(10., 9., 12., 9., 14., 9., 16., 9.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_add_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_maskz_add_ph(0b01010101, a, b); + let e = _mm_set_ph(0., 9., 0., 9., 0., 9., 0., 9.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_add_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_add_ph(a, b); + let e = _mm256_set1_ph(17.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_add_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let src = _mm256_set_ph( + 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., + ); + let r = _mm256_mask_add_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 18., 17., 20., 17., 22., 17., 24., 17., 26., 17., 28., 17., 30., 17., 32., 17., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_add_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_maskz_add_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_add_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_add_ph(a, b); + let e = _mm512_set1_ph(33.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_add_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 
10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_add_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50., + 33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_add_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_add_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., + 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_add_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ph(33.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_add_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50., + 33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_add_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 
5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., + 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_add_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_add_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_add_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = + _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_add_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_add_sh(a, b); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_add_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_add_sh(src, 0, a, b); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_add_sh(src, 1, a, b); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_add_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_maskz_add_sh(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_add_sh(1, a, b); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_sub_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_sub_ph(a, b); + let e = _mm_set_ph(-7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_sub_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_sub_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(10., -5., 12., -1., 14., 3., 16., 7.); + assert_eq_m128h(r, e); + } + 
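+    // A note on the expected values in the masked arithmetic tests here: `_mm_set_ph` (and the
+    // wider `set` variants) list elements from the highest index down to element 0, while mask
+    // bit `i` selects element `i`. A mask such as 0b01010101 therefore keeps elements 0, 2, 4
+    // and 6 (every other argument counted from the end of the list); the remaining lanes come
+    // from `src` for the `mask_` forms or are zeroed for the `maskz_` forms.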
+ #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_sub_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_maskz_sub_ph(0b01010101, a, b); + let e = _mm_set_ph(0., -5., 0., -1., 0., 3., 0., 7.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_sub_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_sub_ph(a, b); + let e = _mm256_set_ph( + -15.0, -13.0, -11.0, -9.0, -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, + 15.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_sub_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let src = _mm256_set_ph( + 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., + ); + let r = _mm256_mask_sub_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 18., -13., 20., -9., 22., -5., 24., -1., 26., 3., 28., 7., 30., 11., 32., 15., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_sub_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_maskz_sub_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0., -13., 0., -9., 0., -5., 0., -1., 0., 3., 0., 7., 0., 11., 0., 15., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_sub_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_sub_ph(a, b); + let e = _mm512_set_ph( + -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0, + -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, + 23.0, 25.0, 27.0, 29.0, 31.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_sub_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = 
_mm512_mask_sub_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1., + 50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_sub_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_sub_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3., + 0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_sub_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set_ph( + -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0, + -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, + 23.0, 25.0, 27.0, 29.0, 31.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_sub_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1., + 50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_sub_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 
5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3., + 0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_sub_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_sub_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_sub_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = + _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_sub_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_sub_sh(a, b); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_sub_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_sub_sh(src, 0, a, b); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_sub_sh(src, 1, a, b); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_sub_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_maskz_sub_sh(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_sub_sh(1, a, b); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mul_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_mul_ph(a, b); + let e = _mm_set_ph(8.0, 14.0, 18.0, 20.0, 20.0, 18.0, 14.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_mul_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_mul_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(10., 14., 12., 20., 14., 18., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_mul_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_maskz_mul_ph(0b01010101, a, b); + let e = _mm_set_ph(0., 14., 0., 20., 0., 18., 0., 8.); + assert_eq_m128h(r, e); + } + + 
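+ // The _round test variants pass _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, i.e. + // round-to-nearest-even with floating-point exceptions suppressed; the expected values in these + // tests are exactly representable in f16, so the rounding control does not alter the results. +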
#[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mul_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_mul_ph(a, b); + let e = _mm256_set_ph( + 16.0, 30.0, 42.0, 52.0, 60.0, 66.0, 70.0, 72.0, 72.0, 70.0, 66.0, 60.0, 52.0, 42.0, + 30.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_mul_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let src = _mm256_set_ph( + 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., + ); + let r = _mm256_mask_mul_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 18., 30., 20., 52., 22., 66., 24., 72., 26., 70., 28., 60., 30., 42., 32., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_mul_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_maskz_mul_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0., 30., 0., 52., 0., 66., 0., 72., 0., 70., 0., 60., 0., 42., 0., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mul_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_mul_ph(a, b); + let e = _mm512_set_ph( + 32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0, + 266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0, + 182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_mul_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_mul_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272., + 50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = 
"avx512fp16")] + unsafe fn test_mm512_maskz_mul_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_mul_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0., + 270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mul_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set_ph( + 32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0, + 266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0, + 182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_mul_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272., + 50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_mul_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 
0., + 270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mul_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_mul_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_mul_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = + _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mul_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_mul_sh(a, b); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_mul_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_mul_sh(src, 0, a, b); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_mul_sh(src, 1, a, b); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_mul_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_maskz_mul_sh(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_mul_sh(1, a, b); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_div_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let r = _mm_div_ph(a, b); + let e = _mm_set1_ph(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_div_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let src = _mm_set_ph(4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0); + let r = _mm_mask_div_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_div_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let r = _mm_maskz_div_ph(0b01010101, a, b); + let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_div_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let r = _mm256_div_ph(a, b); + let e = _mm256_set1_ph(0.5); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_div_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let src = _mm256_set_ph( + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, + ); + let 
r = _mm256_mask_div_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_div_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let r = _mm256_maskz_div_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_div_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let r = _mm512_div_ph(a, b); + let e = _mm512_set1_ph(0.5); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_div_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let src = _mm512_set_ph( + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, + 33.0, 34.0, 35.0, + ); + let r = _mm512_mask_div_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5, + 20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_div_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let r = _mm512_maskz_div_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_div_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ph(0.5); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_div_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let src = _mm512_set_ph( + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, + 33.0, 34.0, 35.0, + ); + let r = _mm512_mask_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5, + 20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_div_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let r = _mm512_maskz_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_div_round_sh() { + let a = 
_mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_sh(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_div_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_set_sh(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_div_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = + _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_set_sh(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_div_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_div_sh(a, b); + let e = _mm_set_sh(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_div_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_div_sh(src, 0, a, b); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_div_sh(src, 1, a, b); + let e = _mm_set_sh(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_div_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_maskz_div_sh(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_div_sh(1, a, b); + let e = _mm_set_sh(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 1.0); + let r = _mm_mul_pch(a, b); + let e = _mm_set1_pch(-1.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_mul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 1.0); + let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); + let r = _mm_mask_mul_pch(src, 0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_mul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 1.0); + let r = _mm_maskz_mul_pch(0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 1.0); + let r = _mm256_mul_pch(a, b); + let e = _mm256_set1_pch(-1.0, 0.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_mul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 1.0); + let src = _mm256_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + ); + let r = _mm256_mask_mul_pch(src, 0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 
0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_mul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 1.0); + let r = _mm256_maskz_mul_pch(0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_mul_pch(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_mul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_mul_pch(src, 0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_mul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_maskz_mul_pch(0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_mul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_mul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_maskz_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + 
let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_mul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_mul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = + _mm_maskz_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_mul_sch(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_mul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_mul_sch(src, 0, a, b); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_mul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_maskz_mul_sch(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 1.0); + let r = _mm_fmul_pch(a, b); + let e = _mm_set1_pch(-1.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 1.0); + let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); + let r = _mm_mask_fmul_pch(src, 0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 1.0); + let r = _mm_maskz_fmul_pch(0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 1.0); + let r = _mm256_fmul_pch(a, b); + let e = _mm256_set1_pch(-1.0, 0.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 1.0); + let src = 
_mm256_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + ); + let r = _mm256_mask_fmul_pch(src, 0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 1.0); + let r = _mm256_maskz_fmul_pch(0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_fmul_pch(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_fmul_pch(src, 0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_maskz_fmul_pch(0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_maskz_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 
0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = + _mm_maskz_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_fmul_sch(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_fmul_sch(src, 0, a, b); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_maskz_fmul_sch(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, -1.0); + let r = _mm_cmul_pch(a, b); + let e = _mm_set1_pch(-1.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, -1.0); + let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); + let r = _mm_mask_cmul_pch(src, 0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, -1.0); + let r = _mm_maskz_cmul_pch(0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, -1.0); + let r 
= _mm256_cmul_pch(a, b); + let e = _mm256_set1_pch(-1.0, 0.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, -1.0); + let src = _mm256_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + ); + let r = _mm256_mask_cmul_pch(src, 0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, -1.0); + let r = _mm256_maskz_cmul_pch(0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let r = _mm512_cmul_pch(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_cmul_pch(src, 0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let r = _mm512_maskz_cmul_pch(0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let r = _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = 
_mm512_set1_pch(0.0, -1.0); + let r = _mm512_maskz_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = _mm_cmul_sch(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_cmul_sch(src, 0, a, b); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = _mm_maskz_cmul_sch(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = _mm_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = + _mm_maskz_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fcmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, -1.0); + let r = _mm_fcmul_pch(a, b); + let e = _mm_set1_pch(-1.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fcmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, -1.0); + let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); + let r = _mm_mask_fcmul_pch(src, 0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fcmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, -1.0); 
+ let r = _mm_maskz_fcmul_pch(0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fcmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, -1.0); + let r = _mm256_fcmul_pch(a, b); + let e = _mm256_set1_pch(-1.0, 0.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fcmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, -1.0); + let src = _mm256_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + ); + let r = _mm256_mask_fcmul_pch(src, 0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fcmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, -1.0); + let r = _mm256_maskz_fcmul_pch(0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fcmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let r = _mm512_fcmul_pch(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fcmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_fcmul_pch(src, 0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fcmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let r = _mm512_maskz_fcmul_pch(0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fcmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let r = _mm512_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fcmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 
8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fcmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let r = _mm512_maskz_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fcmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = _mm_fcmul_sch(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fcmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_fcmul_sch(src, 0, a, b); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fcmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = _mm_maskz_fcmul_sch(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fcmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = _mm_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fcmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fcmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = + _mm_maskz_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_abs_ph() { + let a = _mm_set_ph(-1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0); + let r = _mm_abs_ph(a); + let e = _mm_set_ph(1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_abs_ph() { + let a = _mm256_set_ph( + -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, 
-10.0, 11.0, -12.0, 13.0, + -14.0, + ); + let r = _mm256_abs_ph(a); + let e = _mm256_set_ph( + 1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_abs_ph() { + let a = _mm512_set_ph( + -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, + -14.0, 15.0, -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0, + 27.0, -28.0, 29.0, -30.0, + ); + let r = _mm512_abs_ph(a); + let e = _mm512_set_ph( + 1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, + 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, + 29.0, 30.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_conj_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let r = _mm_conj_pch(a); + let e = _mm_set1_pch(0.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_conj_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); + let r = _mm_mask_conj_pch(src, 0b0101, a); + let e = _mm_setr_ph(0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_conj_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let r = _mm_maskz_conj_pch(0b0101, a); + let e = _mm_setr_ph(0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_conj_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let r = _mm256_conj_pch(a); + let e = _mm256_set1_pch(0.0, -1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_conj_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let src = _mm256_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + ); + let r = _mm256_mask_conj_pch(src, 0b01010101, a); + let e = _mm256_setr_ph( + 0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_conj_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let r = _mm256_maskz_conj_pch(0b01010101, a); + let e = _mm256_setr_ph( + 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_conj_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_conj_pch(a); + let e = _mm512_set1_pch(0.0, -1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_conj_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_conj_pch(src, 0b0101010101010101, a); + let e = _mm512_setr_ph( + 0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0, + 0.0, -1.0, 20.0, 21.0, 0.0, -1.0, 24.0, 25.0, 0.0, -1.0, 28.0, 29.0, 0.0, -1.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_conj_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + 
let r = _mm512_maskz_conj_pch(0b0101010101010101, a); + let e = _mm512_setr_ph( + 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, + 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_fmadd_pch(a, b, c); + let e = _mm_set1_pch(-2.0, 3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_mask_fmadd_pch(a, 0b0101, b, c); + let e = _mm_setr_ph(-2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_mask3_fmadd_pch(a, b, c, 0b0101); + let e = _mm_setr_ph(-2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_maskz_fmadd_pch(0b0101, a, b, c); + let e = _mm_setr_ph(-2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_fmadd_pch(a, b, c); + let e = _mm256_set1_pch(-2.0, 3.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_mask_fmadd_pch(a, 0b01010101, b, c); + let e = _mm256_setr_ph( + -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_mask3_fmadd_pch(a, b, c, 0b01010101); + let e = _mm256_setr_ph( + -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_maskz_fmadd_pch(0b01010101, a, b, c); + let e = _mm256_setr_ph( + -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_fmadd_pch(a, b, c); + let e = _mm512_set1_pch(-2.0, 3.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = 
_mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask_fmadd_pch(a, 0b0101010101010101, b, c); + let e = _mm512_setr_ph( + -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, + -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask3_fmadd_pch(a, b, c, 0b0101010101010101); + let e = _mm512_setr_ph( + -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, + -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_maskz_fmadd_pch(0b0101010101010101, a, b, c); + let e = _mm512_setr_ph( + -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, + -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = + _mm512_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_pch(-2.0, 3.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b0101010101010101, + b, + c, + ); + let e = _mm512_setr_ph( + -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, + -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask3_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b0101010101010101, + ); + let e = _mm512_setr_ph( + -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, + -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_maskz_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + b, + c, + ); + let e = _mm512_setr_ph( + -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, + -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 
13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_fmadd_sch(a, b, c); + let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask_fmadd_sch(a, 0, b, c); + let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_mask_fmadd_sch(a, 1, b, c); + let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask3_fmadd_sch(a, b, c, 0); + let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + let r = _mm_mask3_fmadd_sch(a, b, c, 1); + let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_maskz_fmadd_sch(0, a, b, c); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_fmadd_sch(1, a, b, c); + let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 1, b, c, + ); + let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | 
_MM_FROUND_NO_EXC }>( + a, b, c, 1, + ); + let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 1, a, b, c, + ); + let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fcmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_fcmadd_pch(a, b, c); + let e = _mm_set1_pch(2.0, 3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fcmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_mask_fcmadd_pch(a, 0b0101, b, c); + let e = _mm_setr_ph(2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fcmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_mask3_fcmadd_pch(a, b, c, 0b0101); + let e = _mm_setr_ph(2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fcmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_maskz_fcmadd_pch(0b0101, a, b, c); + let e = _mm_setr_ph(2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fcmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_fcmadd_pch(a, b, c); + let e = _mm256_set1_pch(2.0, 3.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fcmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_mask_fcmadd_pch(a, 0b01010101, b, c); + let e = _mm256_setr_ph( + 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fcmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_mask3_fcmadd_pch(a, b, c, 0b01010101); + let e = _mm256_setr_ph( + 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fcmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_maskz_fcmadd_pch(0b01010101, a, b, c); + let e = _mm256_setr_ph( + 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 
0.0, 2.0, 3.0, 0.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fcmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_fcmadd_pch(a, b, c); + let e = _mm512_set1_pch(2.0, 3.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fcmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask_fcmadd_pch(a, 0b0101010101010101, b, c); + let e = _mm512_setr_ph( + 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, + 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fcmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask3_fcmadd_pch(a, b, c, 0b0101010101010101); + let e = _mm512_setr_ph( + 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, + 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fcmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_maskz_fcmadd_pch(0b0101010101010101, a, b, c); + let e = _mm512_setr_ph( + 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, + 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fcmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = + _mm512_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_pch(2.0, 3.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fcmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b0101010101010101, + b, + c, + ); + let e = _mm512_setr_ph( + 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, + 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fcmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask3_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b0101010101010101, + ); + let e = _mm512_setr_ph( + 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, + 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fcmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_maskz_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | 
_MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + b, + c, + ); + let e = _mm512_setr_ph( + 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, + 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fcmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_fcmadd_sch(a, b, c); + let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fcmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask_fcmadd_sch(a, 0, b, c); + let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_mask_fcmadd_sch(a, 1, b, c); + let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fcmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask3_fcmadd_sch(a, b, c, 0); + let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + let r = _mm_mask3_fcmadd_sch(a, b, c, 1); + let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fcmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_maskz_fcmadd_sch(0, a, b, c); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_fcmadd_sch(1, a, b, c); + let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fcmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fcmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 1, b, c, + ); + let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn 
test_mm_mask3_fcmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 1, + ); + let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fcmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 1, a, b, c, + ); + let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_fmadd_ph(a, b, c); + let e = _mm_set1_ph(5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask_fmadd_ph(a, 0b01010101, b, c); + let e = _mm_set_ph(1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask3_fmadd_ph(a, b, c, 0b01010101); + let e = _mm_set_ph(3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_maskz_fmadd_ph(0b01010101, a, b, c); + let e = _mm_set_ph(0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_fmadd_ph(a, b, c); + let e = _mm256_set1_ph(5.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask_fmadd_ph(a, 0b0101010101010101, b, c); + let e = _mm256_set_ph( + 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask3_fmadd_ph(a, b, c, 0b0101010101010101); + let e = _mm256_set_ph( + 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, + ); + 
assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_maskz_fmadd_ph(0b0101010101010101, a, b, c); + let e = _mm256_set_ph( + 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fmadd_ph(a, b, c); + let e = _mm512_set1_ph(5.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmadd_ph(a, 0b01010101010101010101010101010101, b, c); + let e = _mm512_set_ph( + 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, + 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmadd_ph(a, b, c, 0b01010101010101010101010101010101); + let e = _mm512_set_ph( + 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, + 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmadd_ph(0b01010101010101010101010101010101, a, b, c); + let e = _mm512_set_ph( + 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, + 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ph(5.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b01010101010101010101010101010101, + b, + c, + ); + let e = _mm512_set_ph( + 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, + 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b01010101010101010101010101010101, + ); + let e = _mm512_set_ph( + 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, + 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + 
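+    // Illustrative note (editorial sketch, not part of the upstream stdarch test
+    // file): the expected vectors in the fused multiply-add tests in this hunk all
+    // follow from the same per-lane arithmetic with a = 1.0, b = 2.0, c = 3.0:
+    //   fmadd:   a * b + c    =  5.0
+    //   fmsub:   a * b - c    = -1.0
+    //   fnmadd: -(a * b) + c  =  1.0
+    //   fnmsub: -(a * b) - c  = -5.0
+    // The masked variants keep `a` (mask), `c` (mask3), or 0.0 (maskz) in lanes whose
+    // mask bit is clear, which is where the interleaved 1.0 / 3.0 / 0.0 values in the
+    // expected results come from. For the complex `_pch` / `fcmadd` tests, each pair
+    // of f16 lanes is one complex number (re, im) and each mask bit covers one pair;
+    // with a = 0+1i, b = 0+2i, c = 0+3i, a*b + c = -2 + 3i and conj(a)*b + c = 2 + 3i,
+    // matching the repeated (-2.0, 3.0) and (2.0, 3.0) pairs above.
+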
#[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + c, + ); + let e = _mm512_set_ph( + 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, + 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fmadd_sh(a, b, c); + let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fmadd_sh(a, 0, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fmadd_sh(a, 1, b, c); + let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fmadd_sh(a, b, c, 0); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fmadd_sh(a, b, c, 1); + let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fmadd_sh(0, a, b, c); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fmadd_sh(1, a, b, c); + let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 1, b, c, + 
); + let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 1, + ); + let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 1, a, b, c, + ); + let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_fmsub_ph(a, b, c); + let e = _mm_set1_ph(-1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask_fmsub_ph(a, 0b01010101, b, c); + let e = _mm_set_ph(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask3_fmsub_ph(a, b, c, 0b01010101); + let e = _mm_set_ph(3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_maskz_fmsub_ph(0b01010101, a, b, c); + let e = _mm_set_ph(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_fmsub_ph(a, b, c); + let e = _mm256_set1_ph(-1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask_fmsub_ph(a, 0b0101010101010101, b, c); + let e = _mm256_set_ph( + 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask3_fmsub_ph(a, b, c, 
0b0101010101010101); + let e = _mm256_set_ph( + 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_maskz_fmsub_ph(0b0101010101010101, a, b, c); + let e = _mm256_set_ph( + 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fmsub_ph(a, b, c); + let e = _mm512_set1_ph(-1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmsub_ph(a, 0b01010101010101010101010101010101, b, c); + let e = _mm512_set_ph( + 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, + 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmsub_ph(a, b, c, 0b01010101010101010101010101010101); + let e = _mm512_set_ph( + 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, + 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmsub_ph(0b01010101010101010101010101010101, a, b, c); + let e = _mm512_set_ph( + 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, + 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ph(-1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b01010101010101010101010101010101, + b, + c, + ); + let e = _mm512_set_ph( + 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, + 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b01010101010101010101010101010101, + ); + let e = 
_mm512_set_ph( + 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, + 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + c, + ); + let e = _mm512_set_ph( + 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, + 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fmsub_sh(a, b, c); + let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fmsub_sh(a, 0, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fmsub_sh(a, 1, b, c); + let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fmsub_sh(a, b, c, 0); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fmsub_sh(a, b, c, 1); + let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fmsub_sh(0, a, b, c); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fmsub_sh(1, a, b, c); + let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fmsub_round_sh::<{ 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 1, b, c, + ); + let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 1, + ); + let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 1, a, b, c, + ); + let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fnmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_fnmadd_ph(a, b, c); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fnmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask_fnmadd_ph(a, 0b01010101, b, c); + let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fnmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask3_fnmadd_ph(a, b, c, 0b01010101); + let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fnmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_maskz_fnmadd_ph(0b01010101, a, b, c); + let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fnmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_fnmadd_ph(a, b, c); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fnmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask_fnmadd_ph(a, 0b0101010101010101, b, c); + let e = _mm256_set_ph( + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + ); + 
assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fnmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask3_fnmadd_ph(a, b, c, 0b0101010101010101); + let e = _mm256_set_ph( + 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fnmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_maskz_fnmadd_ph(0b0101010101010101, a, b, c); + let e = _mm256_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fnmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fnmadd_ph(a, b, c); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fnmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fnmadd_ph(a, 0b01010101010101010101010101010101, b, c); + let e = _mm512_set_ph( + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fnmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fnmadd_ph(a, b, c, 0b01010101010101010101010101010101); + let e = _mm512_set_ph( + 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, + 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fnmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fnmadd_ph(0b01010101010101010101010101010101, a, b, c); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fnmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = + _mm512_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fnmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b01010101010101010101010101010101, + b, + c, + ); + let e = _mm512_set_ph( + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fnmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = 
_mm512_set1_ph(3.0); + let r = _mm512_mask3_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b01010101010101010101010101010101, + ); + let e = _mm512_set_ph( + 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, + 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fnmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + c, + ); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fnmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fnmadd_sh(a, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fnmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fnmadd_sh(a, 0, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fnmadd_sh(a, 1, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fnmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fnmadd_sh(a, b, c, 0); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fnmadd_sh(a, b, c, 1); + let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fnmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fnmadd_sh(0, a, b, c); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fnmadd_sh(1, a, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fnmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fnmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = 
_mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 1, b, c, + ); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fnmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 1, + ); + let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fnmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 1, a, b, c, + ); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fnmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_fnmsub_ph(a, b, c); + let e = _mm_set1_ph(-5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fnmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask_fnmsub_ph(a, 0b01010101, b, c); + let e = _mm_set_ph(1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fnmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask3_fnmsub_ph(a, b, c, 0b01010101); + let e = _mm_set_ph(3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fnmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_maskz_fnmsub_ph(0b01010101, a, b, c); + let e = _mm_set_ph(0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fnmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_fnmsub_ph(a, b, c); + let e = _mm256_set1_ph(-5.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fnmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = 
_mm256_mask_fnmsub_ph(a, 0b0101010101010101, b, c); + let e = _mm256_set_ph( + 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fnmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask3_fnmsub_ph(a, b, c, 0b0101010101010101); + let e = _mm256_set_ph( + 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fnmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_maskz_fnmsub_ph(0b0101010101010101, a, b, c); + let e = _mm256_set_ph( + 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fnmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fnmsub_ph(a, b, c); + let e = _mm512_set1_ph(-5.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fnmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fnmsub_ph(a, 0b01010101010101010101010101010101, b, c); + let e = _mm512_set_ph( + 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, + 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fnmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fnmsub_ph(a, b, c, 0b01010101010101010101010101010101); + let e = _mm512_set_ph( + 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, + 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fnmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fnmsub_ph(0b01010101010101010101010101010101, a, b, c); + let e = _mm512_set_ph( + 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, + 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fnmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = + _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ph(-5.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fnmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b01010101010101010101010101010101, + b, + c, + ); + let e = _mm512_set_ph( + 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, + 1.0, -5.0, 1.0, -5.0, 1.0, 
-5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fnmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b01010101010101010101010101010101, + ); + let e = _mm512_set_ph( + 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, + 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fnmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + c, + ); + let e = _mm512_set_ph( + 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, + 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fnmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fnmsub_sh(a, b, c); + let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fnmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fnmsub_sh(a, 0, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fnmsub_sh(a, 1, b, c); + let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fnmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fnmsub_sh(a, b, c, 0); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fnmsub_sh(a, b, c, 1); + let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fnmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fnmsub_sh(0, a, b, c); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fnmsub_sh(1, a, b, c); + let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fnmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fnmsub_round_sh::<{ 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fnmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 1, b, c, + ); + let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fnmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 1, + ); + let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fnmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 1, a, b, c, + ); + let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fmaddsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_fmaddsub_ph(a, b, c); + let e = _mm_set_ph(5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fmaddsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask_fmaddsub_ph(a, 0b00110011, b, c); + let e = _mm_set_ph(1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fmaddsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask3_fmaddsub_ph(a, b, c, 0b00110011); + let e = _mm_set_ph(3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fmaddsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_maskz_fmaddsub_ph(0b00110011, a, b, c); + let e = _mm_set_ph(0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fmaddsub_ph() { + let a = _mm256_set1_ph(1.0); 
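+ // `fmaddsub` alternates the sign of `c` per lane: even-indexed lanes compute
+ // `a * b - c` and odd-indexed lanes compute `a * b + c`, so with a=1, b=2, c=3
+ // the expected vectors in these tests alternate -1.0 (even lanes) and 5.0 (odd lanes).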
+ let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_fmaddsub_ph(a, b, c); + let e = _mm256_set_ph( + 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fmaddsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask_fmaddsub_ph(a, 0b0011001100110011, b, c); + let e = _mm256_set_ph( + 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fmaddsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask3_fmaddsub_ph(a, b, c, 0b0011001100110011); + let e = _mm256_set_ph( + 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fmaddsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_maskz_fmaddsub_ph(0b0011001100110011, a, b, c); + let e = _mm256_set_ph( + 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmaddsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fmaddsub_ph(a, b, c); + let e = _mm512_set_ph( + 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, + 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmaddsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmaddsub_ph(a, 0b00110011001100110011001100110011, b, c); + let e = _mm512_set_ph( + 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, + 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmaddsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmaddsub_ph(a, b, c, 0b00110011001100110011001100110011); + let e = _mm512_set_ph( + 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, + 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmaddsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmaddsub_ph(0b00110011001100110011001100110011, a, b, c); + let e = _mm512_set_ph( + 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, + 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmaddsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = + 
_mm512_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set_ph( + 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, + 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmaddsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00110011001100110011001100110011, + b, + c, + ); + let e = _mm512_set_ph( + 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, + 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmaddsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00110011001100110011001100110011, + ); + let e = _mm512_set_ph( + 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, + 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmaddsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00110011001100110011001100110011, + a, + b, + c, + ); + let e = _mm512_set_ph( + 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, + 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fmsubadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_fmsubadd_ph(a, b, c); + let e = _mm_set_ph(-1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fmsubadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask_fmsubadd_ph(a, 0b00110011, b, c); + let e = _mm_set_ph(1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fmsubadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask3_fmsubadd_ph(a, b, c, 0b00110011); + let e = _mm_set_ph(3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fmsubadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_maskz_fmsubadd_ph(0b00110011, a, b, c); + let e = _mm_set_ph(0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fmsubadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_fmsubadd_ph(a, b, c); + let e = _mm256_set_ph( + -1.0, 5.0, 
-1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fmsubadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask_fmsubadd_ph(a, 0b0011001100110011, b, c); + let e = _mm256_set_ph( + 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fmsubadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask3_fmsubadd_ph(a, b, c, 0b0011001100110011); + let e = _mm256_set_ph( + 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fmsubadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_maskz_fmsubadd_ph(0b0011001100110011, a, b, c); + let e = _mm256_set_ph( + 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmsubadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fmsubadd_ph(a, b, c); + let e = _mm512_set_ph( + -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, + -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmsubadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmsubadd_ph(a, 0b00110011001100110011001100110011, b, c); + let e = _mm512_set_ph( + 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, + 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmsubadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmsubadd_ph(a, b, c, 0b00110011001100110011001100110011); + let e = _mm512_set_ph( + 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, + 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmsubadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmsubadd_ph(0b00110011001100110011001100110011, a, b, c); + let e = _mm512_set_ph( + 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, + 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmsubadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = + _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set_ph( + -1.0, 5.0, -1.0, 5.0, -1.0, 
5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, + -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmsubadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00110011001100110011001100110011, + b, + c, + ); + let e = _mm512_set_ph( + 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, + 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmsubadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00110011001100110011001100110011, + ); + let e = _mm512_set_ph( + 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, + 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmsubadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00110011001100110011001100110011, + a, + b, + c, + ); + let e = _mm512_set_ph( + 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, + 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_rcp_ph() { + let a = _mm_set1_ph(2.0); + let r = _mm_rcp_ph(a); + let e = _mm_set1_ph(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_rcp_ph() { + let a = _mm_set1_ph(2.0); + let src = _mm_set1_ph(1.0); + let r = _mm_mask_rcp_ph(src, 0b01010101, a); + let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_rcp_ph() { + let a = _mm_set1_ph(2.0); + let r = _mm_maskz_rcp_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_rcp_ph() { + let a = _mm256_set1_ph(2.0); + let r = _mm256_rcp_ph(a); + let e = _mm256_set1_ph(0.5); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_rcp_ph() { + let a = _mm256_set1_ph(2.0); + let src = _mm256_set1_ph(1.0); + let r = _mm256_mask_rcp_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_rcp_ph() { + let a = _mm256_set1_ph(2.0); + let r = _mm256_maskz_rcp_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_rcp_ph() { + let a 
= _mm512_set1_ph(2.0); + let r = _mm512_rcp_ph(a); + let e = _mm512_set1_ph(0.5); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_rcp_ph() { + let a = _mm512_set1_ph(2.0); + let src = _mm512_set1_ph(1.0); + let r = _mm512_mask_rcp_ph(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, + 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_rcp_ph() { + let a = _mm512_set1_ph(2.0); + let r = _mm512_maskz_rcp_ph(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_rcp_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_rcp_sh(a, b); + let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_rcp_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_rcp_sh(src, 0, a, b); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_rcp_sh(src, 1, a, b); + let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_rcp_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_maskz_rcp_sh(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_rcp_sh(1, a, b); + let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_rsqrt_ph() { + let a = _mm_set1_ph(4.0); + let r = _mm_rsqrt_ph(a); + let e = _mm_set1_ph(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_rsqrt_ph() { + let a = _mm_set1_ph(4.0); + let src = _mm_set1_ph(1.0); + let r = _mm_mask_rsqrt_ph(src, 0b01010101, a); + let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_rsqrt_ph() { + let a = _mm_set1_ph(4.0); + let r = _mm_maskz_rsqrt_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_rsqrt_ph() { + let a = _mm256_set1_ph(4.0); + let r = _mm256_rsqrt_ph(a); + let e = _mm256_set1_ph(0.5); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_rsqrt_ph() { + let a = _mm256_set1_ph(4.0); + let src = _mm256_set1_ph(1.0); + let r = _mm256_mask_rsqrt_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 
0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_rsqrt_ph() { + let a = _mm256_set1_ph(4.0); + let r = _mm256_maskz_rsqrt_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_rsqrt_ph() { + let a = _mm512_set1_ph(4.0); + let r = _mm512_rsqrt_ph(a); + let e = _mm512_set1_ph(0.5); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_rsqrt_ph() { + let a = _mm512_set1_ph(4.0); + let src = _mm512_set1_ph(1.0); + let r = _mm512_mask_rsqrt_ph(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, + 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_rsqrt_ph() { + let a = _mm512_set1_ph(4.0); + let r = _mm512_maskz_rsqrt_ph(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_rsqrt_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let r = _mm_rsqrt_sh(a, b); + let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_rsqrt_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_rsqrt_sh(src, 0, a, b); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_rsqrt_sh(src, 1, a, b); + let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_rsqrt_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let r = _mm_maskz_rsqrt_sh(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_rsqrt_sh(1, a, b); + let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_sqrt_ph() { + let a = _mm_set1_ph(4.0); + let r = _mm_sqrt_ph(a); + let e = _mm_set1_ph(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_sqrt_ph() { + let a = _mm_set1_ph(4.0); + let src = _mm_set1_ph(1.0); + let r = _mm_mask_sqrt_ph(src, 0b01010101, a); + let e = _mm_set_ph(1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_sqrt_ph() { + let a = _mm_set1_ph(4.0); + let r = _mm_maskz_sqrt_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0); + 
assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_sqrt_ph() { + let a = _mm256_set1_ph(4.0); + let r = _mm256_sqrt_ph(a); + let e = _mm256_set1_ph(2.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_sqrt_ph() { + let a = _mm256_set1_ph(4.0); + let src = _mm256_set1_ph(1.0); + let r = _mm256_mask_sqrt_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_sqrt_ph() { + let a = _mm256_set1_ph(4.0); + let r = _mm256_maskz_sqrt_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_sqrt_ph() { + let a = _mm512_set1_ph(4.0); + let r = _mm512_sqrt_ph(a); + let e = _mm512_set1_ph(2.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_sqrt_ph() { + let a = _mm512_set1_ph(4.0); + let src = _mm512_set1_ph(1.0); + let r = _mm512_mask_sqrt_ph(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, + 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_sqrt_ph() { + let a = _mm512_set1_ph(4.0); + let r = _mm512_maskz_sqrt_ph(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, + 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_sqrt_round_ph() { + let a = _mm512_set1_ph(4.0); + let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set1_ph(2.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_sqrt_round_ph() { + let a = _mm512_set1_ph(4.0); + let src = _mm512_set1_ph(1.0); + let r = _mm512_mask_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, + 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_sqrt_round_ph() { + let a = _mm512_set1_ph(4.0); + let r = _mm512_maskz_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, + 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_sqrt_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let r = _mm_sqrt_sh(a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + 
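+ // The scalar `_sh` tests follow a common pattern: the intrinsic computes only
+ // element 0 from the low elements of its sources and copies elements 1..=7 from
+ // `a`; the `mask_`/`maskz_` variants instead place `src[0]` or 0.0 in element 0
+ // when the low mask bit is clear, hence each masked test asserts both the 0 and 1
+ // mask cases.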
#[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_sqrt_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_sqrt_sh(src, 0, a, b); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_sqrt_sh(src, 1, a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_sqrt_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let r = _mm_maskz_sqrt_sh(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_sqrt_sh(1, a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_sqrt_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let r = _mm_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_sqrt_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_sqrt_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let r = + _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_max_ph() { + let a = _mm_set1_ph(2.0); + let b = _mm_set1_ph(1.0); + let r = _mm_max_ph(a, b); + let e = _mm_set1_ph(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_max_ph() { + let a = _mm_set1_ph(2.0); + let b = _mm_set1_ph(1.0); + let src = _mm_set1_ph(3.0); + let r = _mm_mask_max_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_max_ph() { + let a = _mm_set1_ph(2.0); + let b = _mm_set1_ph(1.0); + let r = _mm_maskz_max_ph(0b01010101, a, b); + let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 
2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_max_ph() { + let a = _mm256_set1_ph(2.0); + let b = _mm256_set1_ph(1.0); + let r = _mm256_max_ph(a, b); + let e = _mm256_set1_ph(2.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_max_ph() { + let a = _mm256_set1_ph(2.0); + let b = _mm256_set1_ph(1.0); + let src = _mm256_set1_ph(3.0); + let r = _mm256_mask_max_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_max_ph() { + let a = _mm256_set1_ph(2.0); + let b = _mm256_set1_ph(1.0); + let r = _mm256_maskz_max_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_max_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_max_ph(a, b); + let e = _mm512_set1_ph(2.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_max_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let src = _mm512_set1_ph(3.0); + let r = _mm512_mask_max_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, + 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_max_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_maskz_max_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, + 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_max_round_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ph(2.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_max_round_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let src = _mm512_set1_ph(3.0); + let r = _mm512_mask_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, + 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_max_round_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_maskz_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, + 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn 
test_mm_max_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_max_sh(a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_max_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_max_sh(src, 0, a, b); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_max_sh(src, 1, a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_max_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_maskz_max_sh(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_max_sh(1, a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_max_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_max_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_max_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = + _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_min_ph() { + let a = _mm_set1_ph(2.0); + let b = _mm_set1_ph(1.0); + let r = _mm_min_ph(a, b); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_min_ph() { + let a = _mm_set1_ph(2.0); + let b = _mm_set1_ph(1.0); + let src = _mm_set1_ph(3.0); + let r = _mm_mask_min_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 
1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_min_ph() { + let a = _mm_set1_ph(2.0); + let b = _mm_set1_ph(1.0); + let r = _mm_maskz_min_ph(0b01010101, a, b); + let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_min_ph() { + let a = _mm256_set1_ph(2.0); + let b = _mm256_set1_ph(1.0); + let r = _mm256_min_ph(a, b); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_min_ph() { + let a = _mm256_set1_ph(2.0); + let b = _mm256_set1_ph(1.0); + let src = _mm256_set1_ph(3.0); + let r = _mm256_mask_min_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_min_ph() { + let a = _mm256_set1_ph(2.0); + let b = _mm256_set1_ph(1.0); + let r = _mm256_maskz_min_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_min_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_min_ph(a, b); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_min_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let src = _mm512_set1_ph(3.0); + let r = _mm512_mask_min_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, + 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_min_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_maskz_min_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_min_round_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_min_round_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let src = _mm512_set1_ph(3.0); + let r = _mm512_mask_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, + 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_min_round_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_maskz_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + 
); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_min_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_min_sh(a, b); + let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_min_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_min_sh(src, 0, a, b); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_min_sh(src, 1, a, b); + let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_min_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_maskz_min_sh(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_min_sh(1, a, b); + let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_min_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_min_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_min_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = + _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_getexp_ph() { + let a = _mm_set1_ph(3.0); + let r = _mm_getexp_ph(a); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = 
"avx512fp16,avx512vl")] + unsafe fn test_mm_mask_getexp_ph() { + let a = _mm_set1_ph(3.0); + let src = _mm_set1_ph(4.0); + let r = _mm_mask_getexp_ph(src, 0b01010101, a); + let e = _mm_set_ph(4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_getexp_ph() { + let a = _mm_set1_ph(3.0); + let r = _mm_maskz_getexp_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_getexp_ph() { + let a = _mm256_set1_ph(3.0); + let r = _mm256_getexp_ph(a); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_getexp_ph() { + let a = _mm256_set1_ph(3.0); + let src = _mm256_set1_ph(4.0); + let r = _mm256_mask_getexp_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_getexp_ph() { + let a = _mm256_set1_ph(3.0); + let r = _mm256_maskz_getexp_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_getexp_ph() { + let a = _mm512_set1_ph(3.0); + let r = _mm512_getexp_ph(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_getexp_ph() { + let a = _mm512_set1_ph(3.0); + let src = _mm512_set1_ph(4.0); + let r = _mm512_mask_getexp_ph(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, + 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_getexp_ph() { + let a = _mm512_set1_ph(3.0); + let r = _mm512_maskz_getexp_ph(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_getexp_round_ph() { + let a = _mm512_set1_ph(3.0); + let r = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_getexp_round_ph() { + let a = _mm512_set1_ph(3.0); + let src = _mm512_set1_ph(4.0); + let r = _mm512_mask_getexp_round_ph::<_MM_FROUND_NO_EXC>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, + 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_getexp_round_ph() { + let a = _mm512_set1_ph(3.0); + let r = _mm512_maskz_getexp_round_ph::<_MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 
1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_getexp_sh() { + let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_getexp_sh(a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_getexp_sh() { + let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_getexp_sh(src, 0, a, b); + let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_getexp_sh(src, 1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_getexp_sh() { + let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_getexp_sh(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_getexp_sh(1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_getexp_round_sh() { + let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_getexp_round_sh::<_MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_getexp_round_sh() { + let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 0, a, b); + let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_getexp_round_sh() { + let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_getmant_ph() { + let a = _mm_set1_ph(10.0); + let r = _mm_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); + let e = _mm_set1_ph(1.25); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_getmant_ph() { + let a = _mm_set1_ph(10.0); + let src = _mm_set1_ph(20.0); + let r = _mm_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0b01010101, a); + let e = _mm_set_ph(20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn 
test_mm_maskz_getmant_ph() { + let a = _mm_set1_ph(10.0); + let r = _mm_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0b01010101, a); + let e = _mm_set_ph(0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_getmant_ph() { + let a = _mm256_set1_ph(10.0); + let r = _mm256_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); + let e = _mm256_set1_ph(1.25); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_getmant_ph() { + let a = _mm256_set1_ph(10.0); + let src = _mm256_set1_ph(20.0); + let r = _mm256_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>( + src, + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, + 20.0, 1.25, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_getmant_ph() { + let a = _mm256_set1_ph(10.0); + let r = _mm256_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>( + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_getmant_ph() { + let a = _mm512_set1_ph(10.0); + let r = _mm512_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); + let e = _mm512_set1_ph(1.25); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_getmant_ph() { + let a = _mm512_set1_ph(10.0); + let src = _mm512_set1_ph(20.0); + let r = _mm512_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, + 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, + 20.0, 1.25, 20.0, 1.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_getmant_ph() { + let a = _mm512_set1_ph(10.0); + let r = _mm512_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, + 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_getmant_round_ph() { + let a = _mm512_set1_ph(10.0); + let r = + _mm512_getmant_round_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>( + a, + ); + let e = _mm512_set1_ph(1.25); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_getmant_round_ph() { + let a = _mm512_set1_ph(10.0); + let src = _mm512_set1_ph(20.0); + let r = _mm512_mask_getmant_round_ph::< + _MM_MANT_NORM_P75_1P5, + _MM_MANT_SIGN_NAN, + _MM_FROUND_NO_EXC, + >(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, + 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, + 20.0, 1.25, 20.0, 1.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_getmant_round_ph() { + let a = 
_mm512_set1_ph(10.0); + let r = _mm512_maskz_getmant_round_ph::< + _MM_MANT_NORM_P75_1P5, + _MM_MANT_SIGN_NAN, + _MM_FROUND_NO_EXC, + >(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, + 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_getmant_sh() { + let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a, b); + let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_getmant_sh() { + let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0, a, b); + let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 1, a, b); + let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_getmant_sh() { + let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(1, a, b); + let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_getmant_round_sh() { + let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_getmant_round_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>( + a, b, + ); + let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_getmant_round_sh() { + let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_getmant_round_sh::< + _MM_MANT_NORM_P75_1P5, + _MM_MANT_SIGN_NAN, + _MM_FROUND_NO_EXC, + >(src, 0, a, b); + let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_getmant_round_sh::< + _MM_MANT_NORM_P75_1P5, + _MM_MANT_SIGN_NAN, + _MM_FROUND_NO_EXC, + >(src, 1, a, b); + let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_getmant_round_sh() { + let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_getmant_round_sh::< + _MM_MANT_NORM_P75_1P5, + _MM_MANT_SIGN_NAN, + _MM_FROUND_NO_EXC, + >(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = 
_mm_maskz_getmant_round_sh::< + _MM_MANT_NORM_P75_1P5, + _MM_MANT_SIGN_NAN, + _MM_FROUND_NO_EXC, + >(1, a, b); + let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_roundscale_ph() { + let a = _mm_set1_ph(1.1); + let r = _mm_roundscale_ph::<0>(a); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_roundscale_ph() { + let a = _mm_set1_ph(1.1); + let src = _mm_set1_ph(2.0); + let r = _mm_mask_roundscale_ph::<0>(src, 0b01010101, a); + let e = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_roundscale_ph() { + let a = _mm_set1_ph(1.1); + let r = _mm_maskz_roundscale_ph::<0>(0b01010101, a); + let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_roundscale_ph() { + let a = _mm256_set1_ph(1.1); + let r = _mm256_roundscale_ph::<0>(a); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_roundscale_ph() { + let a = _mm256_set1_ph(1.1); + let src = _mm256_set1_ph(2.0); + let r = _mm256_mask_roundscale_ph::<0>(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_roundscale_ph() { + let a = _mm256_set1_ph(1.1); + let r = _mm256_maskz_roundscale_ph::<0>(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_roundscale_ph() { + let a = _mm512_set1_ph(1.1); + let r = _mm512_roundscale_ph::<0>(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_roundscale_ph() { + let a = _mm512_set1_ph(1.1); + let src = _mm512_set1_ph(2.0); + let r = _mm512_mask_roundscale_ph::<0>(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, + 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_roundscale_ph() { + let a = _mm512_set1_ph(1.1); + let r = _mm512_maskz_roundscale_ph::<0>(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_roundscale_round_ph() { + let a = _mm512_set1_ph(1.1); + let r = _mm512_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_roundscale_round_ph() { + let a = _mm512_set1_ph(1.1); + let src = _mm512_set1_ph(2.0); + let r = _mm512_mask_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 2.0, 
1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, + 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_roundscale_round_ph() { + let a = _mm512_set1_ph(1.1); + let r = _mm512_maskz_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_roundscale_sh() { + let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_roundscale_sh::<0>(a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_roundscale_sh() { + let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_roundscale_sh::<0>(src, 0, a, b); + let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_roundscale_sh::<0>(src, 1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_roundscale_sh() { + let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_roundscale_sh::<0>(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_roundscale_sh::<0>(1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_roundscale_round_sh() { + let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_roundscale_round_sh() { + let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 0, a, b); + let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_roundscale_round_sh() { + let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + 
assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_scalef_ph() { + let a = _mm_set1_ph(1.); + let b = _mm_set1_ph(3.); + let r = _mm_scalef_ph(a, b); + let e = _mm_set1_ph(8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_scalef_ph() { + let a = _mm_set1_ph(1.); + let b = _mm_set1_ph(3.); + let src = _mm_set1_ph(2.); + let r = _mm_mask_scalef_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_scalef_ph() { + let a = _mm_set1_ph(1.); + let b = _mm_set1_ph(3.); + let r = _mm_maskz_scalef_ph(0b01010101, a, b); + let e = _mm_set_ph(0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_scalef_ph() { + let a = _mm256_set1_ph(1.); + let b = _mm256_set1_ph(3.); + let r = _mm256_scalef_ph(a, b); + let e = _mm256_set1_ph(8.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_scalef_ph() { + let a = _mm256_set1_ph(1.); + let b = _mm256_set1_ph(3.); + let src = _mm256_set1_ph(2.); + let r = _mm256_mask_scalef_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_scalef_ph() { + let a = _mm256_set1_ph(1.); + let b = _mm256_set1_ph(3.); + let r = _mm256_maskz_scalef_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_scalef_ph() { + let a = _mm512_set1_ph(1.); + let b = _mm512_set1_ph(3.); + let r = _mm512_scalef_ph(a, b); + let e = _mm512_set1_ph(8.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_scalef_ph() { + let a = _mm512_set1_ph(1.); + let b = _mm512_set1_ph(3.); + let src = _mm512_set1_ph(2.); + let r = _mm512_mask_scalef_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, + 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_scalef_ph() { + let a = _mm512_set1_ph(1.); + let b = _mm512_set1_ph(3.); + let r = _mm512_maskz_scalef_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, + 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_scalef_round_ph() { + let a = _mm512_set1_ph(1.); + let b = _mm512_set1_ph(3.); + let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ph(8.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_scalef_round_ph() { + let a = _mm512_set1_ph(1.); + let b = _mm512_set1_ph(3.); + let src = _mm512_set1_ph(2.); + let r = _mm512_mask_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 
src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, + 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_scalef_round_ph() { + let a = _mm512_set1_ph(1.); + let b = _mm512_set1_ph(3.); + let r = _mm512_maskz_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, + 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_scalef_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_scalef_sh(a, b); + let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_scalef_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_scalef_sh(src, 0, a, b); + let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_scalef_sh(src, 1, a, b); + let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_scalef_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_scalef_sh(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_scalef_sh(1, a, b); + let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_scalef_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_scalef_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_scalef_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = + _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 
15., 16.); + assert_eq_m128h(r, e); + let r = + _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_reduce_ph() { + let a = _mm_set1_ph(1.25); + let r = _mm_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm_set1_ph(0.25); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_reduce_ph() { + let a = _mm_set1_ph(1.25); + let src = _mm_set1_ph(2.0); + let r = _mm_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01010101, a); + let e = _mm_set_ph(2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_reduce_ph() { + let a = _mm_set1_ph(1.25); + let r = _mm_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01010101, a); + let e = _mm_set_ph(0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_reduce_ph() { + let a = _mm256_set1_ph(1.25); + let r = _mm256_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm256_set1_ph(0.25); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_reduce_ph() { + let a = _mm256_set1_ph(1.25); + let src = _mm256_set1_ph(2.0); + let r = _mm256_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_reduce_ph() { + let a = _mm256_set1_ph(1.25); + let r = _mm256_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_reduce_ph() { + let a = _mm512_set1_ph(1.25); + let r = _mm512_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm512_set1_ph(0.25); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_reduce_ph() { + let a = _mm512_set1_ph(1.25); + let src = _mm512_set1_ph(2.0); + let r = _mm512_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, + 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_reduce_ph() { + let a = _mm512_set1_ph(1.25); + let r = _mm512_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, + 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_reduce_round_ph() { + let a = _mm512_set1_ph(1.25); + let r = _mm512_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a); + let e = _mm512_set1_ph(0.25); + assert_eq_m512h(r, e); + } + + 
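+    // Descriptive note on the `reduce` tests above and below (per Intel's documented VREDUCE
+    // immediate encoding, which these tests appear to rely on): imm8[7:4] selects the number of
+    // fraction bits to keep and imm8[1:0] selects the rounding mode, so `16 | _MM_FROUND_TO_ZERO`
+    // (0x13) keeps one fraction bit and truncates toward zero. The expected values follow from
+    // reduce(1.25) = 1.25 - trunc(1.25 * 2.0) / 2.0 = 0.25.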
#[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_reduce_round_ph() { + let a = _mm512_set1_ph(1.25); + let src = _mm512_set1_ph(2.0); + let r = _mm512_mask_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, + 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_reduce_round_ph() { + let a = _mm512_set1_ph(1.25); + let r = _mm512_maskz_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, + 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_reduce_sh() { + let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b); + let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_reduce_sh() { + let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0, a, b); + let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 1, a, b); + let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_reduce_sh() { + let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(1, a, b); + let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_reduce_round_sh() { + let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_reduce_round_sh() { + let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + src, 0, a, b, + ); + let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + src, 1, a, b, + ); + let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); + 
assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_reduce_round_sh() { + let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); + let r = + _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = + _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_reduce_add_ph() { + let a = _mm_set1_ph(2.0); + let r = _mm_reduce_add_ph(a); + assert_eq!(r, 16.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_reduce_add_ph() { + let a = _mm256_set1_ph(2.0); + let r = _mm256_reduce_add_ph(a); + assert_eq!(r, 32.0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_reduce_add_ph() { + let a = _mm512_set1_ph(2.0); + let r = _mm512_reduce_add_ph(a); + assert_eq!(r, 64.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_reduce_mul_ph() { + let a = _mm_set1_ph(2.0); + let r = _mm_reduce_mul_ph(a); + assert_eq!(r, 256.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_reduce_mul_ph() { + let a = _mm256_set1_ph(2.0); + let r = _mm256_reduce_mul_ph(a); + assert_eq!(r, 65536.0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_reduce_mul_ph() { + let a = _mm512_set1_ph(2.0); + let r = _mm512_reduce_mul_ph(a); + assert_eq!(r, 16777216.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_reduce_max_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_reduce_max_ph(a); + assert_eq!(r, 8.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_reduce_max_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_reduce_max_ph(a); + assert_eq!(r, 16.0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_reduce_max_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_reduce_max_ph(a); + assert_eq!(r, 32.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_reduce_min_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_reduce_min_ph(a); + assert_eq!(r, 1.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_reduce_min_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_reduce_min_ph(a); + assert_eq!(r, 1.0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_reduce_min_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_reduce_min_ph(a); + assert_eq!(r, 1.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fpclass_ph_mask() { + let a = _mm_set_ph( + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + 
f16::NAN, + 5.9e-8, // Denormal + ); + let r = _mm_fpclass_ph_mask::<0x18>(a); // infinities + assert_eq!(r, 0b01100000); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fpclass_ph_mask() { + let a = _mm_set_ph( + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + ); + let r = _mm_mask_fpclass_ph_mask::<0x18>(0b01010101, a); + assert_eq!(r, 0b01000000); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fpclass_ph_mask() { + let a = _mm256_set_ph( + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + ); + let r = _mm256_fpclass_ph_mask::<0x18>(a); // infinities + assert_eq!(r, 0b0110000001100000); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fpclass_ph_mask() { + let a = _mm256_set_ph( + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + ); + let r = _mm256_mask_fpclass_ph_mask::<0x18>(0b0101010101010101, a); + assert_eq!(r, 0b0100000001000000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fpclass_ph_mask() { + let a = _mm512_set_ph( + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + ); + let r = _mm512_fpclass_ph_mask::<0x18>(a); // infinities + assert_eq!(r, 0b01100000011000000110000001100000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fpclass_ph_mask() { + let a = _mm512_set_ph( + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + ); + let r = _mm512_mask_fpclass_ph_mask::<0x18>(0b01010101010101010101010101010101, a); + assert_eq!(r, 0b01000000010000000100000001000000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fpclass_sh_mask() { + let a = _mm_set_sh(f16::INFINITY); + let r = _mm_fpclass_sh_mask::<0x18>(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fpclass_sh_mask() { + let a = _mm_set_sh(f16::INFINITY); + let r = _mm_mask_fpclass_sh_mask::<0x18>(0, a); + assert_eq!(r, 0); + let r = _mm_mask_fpclass_sh_mask::<0x18>(1, a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_blend_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0); + let r = _mm_mask_blend_ph(0b01010101, a, b); + let e = _mm_set_ph(1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_blend_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 
6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, + -14.0, -15.0, -16.0, + ); + let r = _mm256_mask_blend_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, + -16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_blend_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, + -14.0, -15.0, -16.0, -17.0, -18.0, -19.0, -20.0, -21.0, -22.0, -23.0, -24.0, -25.0, + -26.0, -27.0, -28.0, -29.0, -30.0, -31.0, -32.0, + ); + let r = _mm512_mask_blend_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, + -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0, 27.0, -28.0, + 29.0, -30.0, 31.0, -32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_permutex2var_ph() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let idx = _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14); + let r = _mm_permutex2var_ph(a, idx, b); + let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_permutex2var_ph() { + let a = _mm256_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_setr_ph( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let idx = _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + let r = _mm256_permutex2var_ph(a, idx, b); + let e = _mm256_setr_ph( + 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0, + 31.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_permutex2var_ph() { + let a = _mm512_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_setr_ph( + 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, + 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 61.0, 62.0, 63.0, 64.0, + ); + let idx = _mm512_set_epi16( + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, + 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + ); + let r = _mm512_permutex2var_ph(a, idx, b); + let e = _mm512_setr_ph( + 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0, + 31.0, 33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0, 49.0, 51.0, 53.0, 55.0, 57.0, + 59.0, 61.0, 63.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_permutexvar_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let idx = _mm_set_epi16(0, 2, 4, 6, 1, 3, 5, 7); + let 
r = _mm_permutexvar_ph(idx, a); + let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 2.0, 4.0, 6.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_permutexvar_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let idx = _mm256_set_epi16(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let r = _mm256_permutexvar_ph(idx, a); + let e = _mm256_setr_ph( + 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_permutexvar_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let idx = _mm512_set_epi16( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31, + ); + let r = _mm512_permutexvar_ph(idx, a); + let e = _mm512_setr_ph( + 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0, + 31.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0, + 30.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtepi16_ph() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm_cvtepi16_ph(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtepi16_ph() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtepi16_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtepi16_ph() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm_maskz_cvtepi16_ph(0b01010101, a); + let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtepi16_ph() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_cvtepi16_ph(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtepi16_ph() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm256_mask_cvtepi16_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi16_ph() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_maskz_cvtepi16_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtepi16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_cvtepi16_ph(a); + let e = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtepi16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let src = _mm512_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., + 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., + ); + let r = _mm512_mask_cvtepi16_ph(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18., + 28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtepi16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_cvtepi16_ph(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20., + 0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundepi16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundepi16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let src = _mm512_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., + 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., + ); + let r = _mm512_mask_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18., + 28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundepi16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20., + 0., 22., 0., 
24., 0., 26., 0., 28., 0., 30., 0., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtepu16_ph() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm_cvtepu16_ph(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtepu16_ph() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtepu16_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtepu16_ph() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm_maskz_cvtepu16_ph(0b01010101, a); + let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtepu16_ph() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_cvtepu16_ph(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtepu16_ph() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm256_mask_cvtepu16_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu16_ph() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_maskz_cvtepu16_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtepu16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_cvtepu16_ph(a); + let e = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtepu16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let src = _mm512_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., + 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., + ); + let r = _mm512_mask_cvtepu16_ph(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18., + 28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtepu16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_cvtepu16_ph(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20., + 0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundepu16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundepu16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let src = _mm512_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., + 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., + ); + let r = _mm512_mask_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18., + 28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundepu16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20., + 0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtepi32_ph() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_cvtepi32_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_ph() { + let a = _mm_set_epi32(1, 2, 3, 4); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtepi32_ph(src, 0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtepi32_ph() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_maskz_cvtepi32_ph(0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtepi32_ph() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_cvtepi32_ph(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtepi32_ph() { + let a = 
_mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm256_mask_cvtepi32_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi32_ph() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_cvtepi32_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtepi32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_cvtepi32_ph(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtepi32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm512_mask_cvtepi32_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtepi32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_cvtepi32_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundepi32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundepi32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm512_mask_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundepi32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvti32_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvti32_sh(a, 10); + let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundi32_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let 
r = _mm_cvt_roundi32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10); + let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtepu32_ph() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_cvtepu32_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtepu32_ph() { + let a = _mm_set_epi32(1, 2, 3, 4); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtepu32_ph(src, 0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtepu32_ph() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_maskz_cvtepu32_ph(0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtepu32_ph() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_cvtepu32_ph(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtepu32_ph() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm256_mask_cvtepu32_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu32_ph() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_cvtepu32_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtepu32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_cvtepu32_ph(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtepu32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm512_mask_cvtepu32_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtepu32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_cvtepu32_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundepu32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = 
"avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundepu32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm512_mask_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0, + 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundepu32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtu32_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtu32_sh(a, 10); + let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundu32_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvt_roundu32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10); + let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtepi64_ph() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_cvtepi64_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtepi64_ph() { + let a = _mm_set_epi64x(1, 2); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtepi64_ph(src, 0b01, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtepi64_ph() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_maskz_cvtepi64_ph(0b01, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtepi64_ph() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_cvtepi64_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtepi64_ph() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm256_mask_cvtepi64_ph(src, 0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi64_ph() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_maskz_cvtepi64_ph(0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtepi64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvtepi64_ph(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = 
"avx512fp16")] + unsafe fn test_mm512_mask_cvtepi64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm512_mask_cvtepi64_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtepi64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvtepi64_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundepi64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundepi64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm512_mask_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0b01010101, a, + ); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundepi64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101, a, + ); + let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtepu64_ph() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_cvtepu64_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtepu64_ph() { + let a = _mm_set_epi64x(1, 2); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtepu64_ph(src, 0b01, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtepu64_ph() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_maskz_cvtepu64_ph(0b01, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtepu64_ph() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_cvtepu64_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtepu64_ph() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm256_mask_cvtepu64_ph(src, 0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu64_ph() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_maskz_cvtepu64_ph(0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtepu64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvtepu64_ph(a); + let e = _mm_set_ph(1.0, 2.0, 
3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtepu64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm512_mask_cvtepu64_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtepu64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvtepu64_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundepu64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundepu64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm512_mask_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0b01010101, a, + ); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundepu64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101, a, + ); + let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtxps_ph() { + let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_cvtxps_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtxps_ph() { + let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtxps_ph(src, 0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16., 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtxps_ph() { + let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_maskz_cvtxps_ph(0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtxps_ph() { + let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_cvtxps_ph(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtxps_ph() { + let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm256_mask_cvtxps_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtxps_ph() { + let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_maskz_cvtxps_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = 
"avx512fp16")] + unsafe fn test_mm512_cvtxps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtxps_ph(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtxps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm512_mask_cvtxps_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtxps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtxps_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtx_roundps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtx_roundps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm512_mask_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0, + 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtx_roundps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtss_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_cvtss_sh(a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvtss_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.); + let r = _mm_mask_cvtss_sh(src, 0, a, b); + let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = _mm_mask_cvtss_sh(src, 1, a, b); + let e = _mm_setr_ph(1.0, 
11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvtss_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_maskz_cvtss_sh(0, a, b); + let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = _mm_maskz_cvtss_sh(1, a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundss_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvt_roundss_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.); + let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvt_roundss_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r = + _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = + _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtpd_ph() { + let a = _mm_set_pd(1.0, 2.0); + let r = _mm_cvtpd_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtpd_ph() { + let a = _mm_set_pd(1.0, 2.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtpd_ph(src, 0b01, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtpd_ph() { + let a = _mm_set_pd(1.0, 2.0); + let r = _mm_maskz_cvtpd_ph(0b01, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtpd_ph() { + let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); + let r = _mm256_cvtpd_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtpd_ph() { + let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm256_mask_cvtpd_ph(src, 0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn 
test_mm256_maskz_cvtpd_ph() { + let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); + let r = _mm256_maskz_cvtpd_ph(0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtpd_ph() { + let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvtpd_ph(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtpd_ph() { + let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm512_mask_cvtpd_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtpd_ph() { + let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvtpd_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundpd_ph() { + let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundpd_ph() { + let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm512_mask_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0b01010101, a, + ); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundpd_ph() { + let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101, a, + ); + let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsd_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_pd(1.0, 2.0); + let r = _mm_cvtsd_sh(a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvtsd_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_pd(1.0, 2.0); + let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.); + let r = _mm_mask_cvtsd_sh(src, 0, a, b); + let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = _mm_mask_cvtsd_sh(src, 1, a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvtsd_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_pd(1.0, 2.0); + let r = _mm_maskz_cvtsd_sh(0, a, b); + let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = _mm_maskz_cvtsd_sh(1, a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundsd_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 
14., 15., 16., 17.); + let b = _mm_setr_pd(1.0, 2.0); + let r = _mm_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvt_roundsd_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_pd(1.0, 2.0); + let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.); + let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvt_roundsd_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_pd(1.0, 2.0); + let r = + _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = + _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtph_epi16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtph_epi16(a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtph_epi16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17); + let r = _mm_mask_cvtph_epi16(src, 0b01010101, a); + let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtph_epi16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_maskz_cvtph_epi16(0b01010101, a); + let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtph_epi16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_cvtph_epi16(a); + let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtph_epi16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm256_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm256_mask_cvtph_epi16(src, 0b0101010101010101, a); + let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_epi16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_maskz_cvtph_epi16(0b0101010101010101, a); + let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m256i(r, e); + } + +
#[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvtph_epi16(a); + let e = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + ); + let r = _mm512_mask_cvtph_epi16(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_epi16( + 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, + 24, 34, 26, 36, 28, 38, 30, 40, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvtph_epi16(0b01010101010101010101010101010101, a); + let e = _mm512_set_epi16( + 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, + 0, 28, 0, 30, 0, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + ); + let r = _mm512_mask_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_epi16( + 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, + 24, 34, 26, 36, 28, 38, 30, 40, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, +
a, + ); + let e = _mm512_set_epi16( + 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, + 0, 28, 0, 30, 0, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtph_epu16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtph_epu16(a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtph_epu16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17); + let r = _mm_mask_cvtph_epu16(src, 0b01010101, a); + let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtph_epu16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_maskz_cvtph_epu16(0b01010101, a); + let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtph_epu16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_cvtph_epu16(a); + let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtph_epu16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm256_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm256_mask_cvtph_epu16(src, 0b0101010101010101, a); + let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_epu16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_maskz_cvtph_epu16(0b0101010101010101, a); + let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvtph_epu16(a); + let e = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + ); + let r = _mm512_mask_cvtph_epu16(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_epi16( + 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, + 24, 34, 26, 36, 28, 38, 30, 40, 32, + ); + assert_eq_m512i(r, e); + } + +
#[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvtph_epu16(0b01010101010101010101010101010101, a); + let e = _mm512_set_epi16( + 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, + 0, 28, 0, 30, 0, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + ); + let r = _mm512_mask_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_epi16( + 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, + 24, 34, 26, 36, 28, 38, 30, 40, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_epi16( + 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, + 0, 28, 0, 30, 0, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvttph_epi16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvttph_epi16(a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvttph_epi16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17); + let r = _mm_mask_cvttph_epi16(src, 0b01010101, a); + let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvttph_epi16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_maskz_cvttph_epi16(0b01010101, a); + let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvttph_epi16() { + let a
= _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_cvttph_epi16(a); + let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvttph_epi16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm256_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a); + let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvttph_epi16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a); + let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvttph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvttph_epi16(a); + let e = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvttph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + ); + let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_epi16( + 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, + 24, 34, 26, 36, 28, 38, 30, 40, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvttph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a); + let e = _mm512_set_epi16( + 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, + 0, 28, 0, 30, 0, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtt_roundph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn 
test_mm512_mask_cvtt_roundph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + ); + let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_epi16( + 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, + 24, 34, 26, 36, 28, 38, 30, 40, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtt_roundph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_epi16( + 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, + 0, 28, 0, 30, 0, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvttph_epu16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvttph_epu16(a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvttph_epu16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17); + let r = _mm_mask_cvttph_epu16(src, 0b01010101, a); + let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvttph_epu16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_maskz_cvttph_epu16(0b01010101, a); + let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvttph_epu16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_cvttph_epu16(a); + let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvttph_epu16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm256_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a); + let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvttph_epu16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a); + let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvttph_epu16() { + let a = _mm512_set_ph( + 
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvttph_epu16(a); + let e = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvttph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + ); + let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_epi16( + 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, + 24, 34, 26, 36, 28, 38, 30, 40, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvttph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a); + let e = _mm512_set_epi16( + 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, + 0, 28, 0, 30, 0, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtt_roundph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtt_roundph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + ); + let r = _mm512_mask_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_epi16( + 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, + 24, 34, 26, 36, 28, 38, 30, 40, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtt_roundph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_epi16( + 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 
0, 20, 0, 22, 0, 24, 0, 26, + 0, 28, 0, 30, 0, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtph_epi32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_cvtph_epi32(a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtph_epi32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let src = _mm_set_epi32(10, 11, 12, 13); + let r = _mm_mask_cvtph_epi32(src, 0b0101, a); + let e = _mm_set_epi32(10, 2, 12, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtph_epi32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_maskz_cvtph_epi32(0b0101, a); + let e = _mm_set_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtph_epi32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_cvtph_epi32(a); + let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtph_epi32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17); + let r = _mm256_mask_cvtph_epi32(src, 0b01010101, a); + let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_epi32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_maskz_cvtph_epi32(0b01010101, a); + let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtph_epi32(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvtph_epi32(src, 0b0101010101010101, a); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtph_epi32(0b0101010101010101, a); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 
5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + ); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + ); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsh_i32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtsh_i32(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundsh_i32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtph_epu32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_cvtph_epu32(a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtph_epu32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let src = _mm_set_epi32(10, 11, 12, 13); + let r = _mm_mask_cvtph_epu32(src, 0b0101, a); + let e = _mm_set_epi32(10, 2, 12, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtph_epu32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_maskz_cvtph_epu32(0b0101, a); + let e = _mm_set_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtph_epu32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_cvtph_epu32(a); + let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtph_epu32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17); + let r = _mm256_mask_cvtph_epu32(src, 0b01010101, a); + let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_epu32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_maskz_cvtph_epu32(0b01010101, a); + let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtph_epu32(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 
13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvtph_epu32(src, 0b0101010101010101, a); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtph_epu32(0b0101010101010101, a); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + ); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + ); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsh_u32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtsh_u32(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundsh_u32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvt_roundsh_u32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvttph_epi32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_cvttph_epi32(a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvttph_epi32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let src = _mm_set_epi32(10, 11, 12, 13); + let r = _mm_mask_cvttph_epi32(src, 0b0101, a); + let e = _mm_set_epi32(10, 2, 12, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvttph_epi32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_maskz_cvttph_epi32(0b0101, a); + let e = _mm_set_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvttph_epi32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_cvttph_epi32(a); + let e 
= _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvttph_epi32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17); + let r = _mm256_mask_cvttph_epi32(src, 0b01010101, a); + let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvttph_epi32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_maskz_cvttph_epi32(0b01010101, a); + let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvttph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvttph_epi32(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvttph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvttph_epi32(src, 0b0101010101010101, a); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvttph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvttph_epi32(0b0101010101010101, a); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtt_roundph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtt_roundph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtt_roundph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvttsh_i32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvttsh_i32(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtt_roundsh_i32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 
7.0, 8.0); + let r = _mm_cvtt_roundsh_i32::<_MM_FROUND_NO_EXC>(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvttph_epu32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_cvttph_epu32(a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvttph_epu32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let src = _mm_set_epi32(10, 11, 12, 13); + let r = _mm_mask_cvttph_epu32(src, 0b0101, a); + let e = _mm_set_epi32(10, 2, 12, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvttph_epu32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_maskz_cvttph_epu32(0b0101, a); + let e = _mm_set_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvttph_epu32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_cvttph_epu32(a); + let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvttph_epu32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17); + let r = _mm256_mask_cvttph_epu32(src, 0b01010101, a); + let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvttph_epu32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_maskz_cvttph_epu32(0b01010101, a); + let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvttph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvttph_epu32(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvttph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvttph_epu32(src, 0b0101010101010101, a); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvttph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvttph_epu32(0b0101010101010101, a); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtt_roundph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtt_roundph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 
6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtt_roundph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvttsh_u32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvttsh_u32(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtt_roundsh_u32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtt_roundsh_u32::<_MM_FROUND_NO_EXC>(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_cvtph_epi64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtph_epi64() { + let src = _mm_set_epi64x(3, 4); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_mask_cvtph_epi64(src, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_maskz_cvtph_epi64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_cvtph_epi64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtph_epi64() { + let src = _mm256_set_epi64x(5, 6, 7, 8); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_mask_cvtph_epi64(src, 0b0101, a); + let e = _mm256_set_epi64x(5, 2, 7, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_maskz_cvtph_epi64(0b0101, a); + let e = _mm256_set_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvtph_epi64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtph_epi64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvtph_epi64(src, 0b01010101, a); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 
4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvtph_epi64(0b01010101, a); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundph_epi64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0b01010101, a, + ); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101, a, + ); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_cvtph_epu64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtph_epu64() { + let src = _mm_set_epi64x(3, 4); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_mask_cvtph_epu64(src, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_maskz_cvtph_epu64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_cvtph_epu64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtph_epu64() { + let src = _mm256_set_epi64x(5, 6, 7, 8); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_mask_cvtph_epu64(src, 0b0101, a); + let e = _mm256_set_epi64x(5, 2, 7, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_maskz_cvtph_epu64(0b0101, a); + let e = _mm256_set_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvtph_epu64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtph_epu64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvtph_epu64(src, 0b01010101, a); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn 
test_mm512_maskz_cvtph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvtph_epu64(0b01010101, a); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundph_epu64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0b01010101, a, + ); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101, a, + ); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvttph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_cvttph_epi64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvttph_epi64() { + let src = _mm_set_epi64x(3, 4); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_mask_cvttph_epi64(src, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvttph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_maskz_cvttph_epi64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvttph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_cvttph_epi64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvttph_epi64() { + let src = _mm256_set_epi64x(5, 6, 7, 8); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_mask_cvttph_epi64(src, 0b0101, a); + let e = _mm256_set_epi64x(5, 2, 7, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvttph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_maskz_cvttph_epi64(0b0101, a); + let e = _mm256_set_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvttph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvttph_epi64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvttph_epi64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvttph_epi64(src, 0b01010101, a); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + 
assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvttph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvttph_epi64(0b01010101, a); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtt_roundph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtt_roundph_epi64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtt_roundph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(0b01010101, a); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvttph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_cvttph_epu64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvttph_epu64() { + let src = _mm_set_epi64x(3, 4); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_mask_cvttph_epu64(src, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvttph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_maskz_cvttph_epu64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvttph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_cvttph_epu64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvttph_epu64() { + let src = _mm256_set_epi64x(5, 6, 7, 8); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_mask_cvttph_epu64(src, 0b0101, a); + let e = _mm256_set_epi64x(5, 2, 7, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvttph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_maskz_cvttph_epu64(0b0101, a); + let e = _mm256_set_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvttph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvttph_epu64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvttph_epu64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvttph_epu64(src, 0b01010101, a); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + assert_eq_m512i(r, e); + } + + 
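Note on the masked conversion tests above and below: `_mm512_set_epi64` / `_mm_set_ph` list elements from the highest lane down to lane 0, while mask bit `i` selects lane `i` counted from the lowest element, which is why the expected vectors alternate between `src` values and converted values. A minimal plain-Rust sketch of that writemask behaviour (the `apply_writemask` helper is illustrative only, not part of stdarch) reproduces the expectation of `test_mm512_mask_cvttph_epi64`:

```rust
// Writemask model: for each lane i, take the converted value when mask bit i
// is set, otherwise keep the corresponding element of `src`.
fn apply_writemask(mask: u8, src: &[i64; 8], converted: &[i64; 8]) -> [i64; 8] {
    let mut out = *src;
    for i in 0..8 {
        if (mask >> i) & 1 == 1 {
            out[i] = converted[i];
        }
    }
    out
}

fn main() {
    // Lane 0 is the lowest element; `_mm512_set_epi64(9, 10, ..., 16)` therefore
    // puts 16 in lane 0 and 9 in lane 7, and `_mm_set_ph(1.0, ..., 8.0)` puts
    // 8.0 in lane 0 and 1.0 in lane 7 (truncated to integers below).
    let src = [16i64, 15, 14, 13, 12, 11, 10, 9];
    let converted = [8i64, 7, 6, 5, 4, 3, 2, 1];
    let r = apply_writemask(0b0101_0101, &src, &converted);
    // Read high-to-low this is (9, 2, 11, 4, 13, 6, 15, 8), the expected `e` above.
    assert_eq!(r, [8, 15, 6, 13, 4, 11, 2, 9]);
}
```

The same lane/mask correspondence explains the `maskz` variants, where unselected lanes become zero instead of keeping `src`.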
#[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvttph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvttph_epu64(0b01010101, a); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtt_roundph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtt_roundph_epu64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtt_roundph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(0b01010101, a); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtxph_ps() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_cvtxph_ps(a); + let e = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtxph_ps() { + let src = _mm_set_ps(10.0, 11.0, 12.0, 13.0); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_mask_cvtxph_ps(src, 0b0101, a); + let e = _mm_set_ps(10.0, 2.0, 12.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtxph_ps() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_maskz_cvtxph_ps(0b0101, a); + let e = _mm_set_ps(0.0, 2.0, 0.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtxph_ps() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_cvtxph_ps(a); + let e = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtxph_ps() { + let src = _mm256_set_ps(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_mask_cvtxph_ps(src, 0b01010101, a); + let e = _mm256_set_ps(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtxph_ps() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_maskz_cvtxph_ps(0b01010101, a); + let e = _mm256_set_ps(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtxph_ps() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtxph_ps(a); + let e = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtxph_ps() { + let src = _mm512_set_ps( + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 
16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, + 24.0, 25.0, + ); + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_mask_cvtxph_ps(src, 0b0101010101010101, a); + let e = _mm512_set_ps( + 10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0, + 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtxph_ps() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtxph_ps(0b0101010101010101, a); + let e = _mm512_set_ps( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtx_roundph_ps() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtx_roundph_ps() { + let src = _mm512_set_ps( + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, + 24.0, 25.0, + ); + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_mask_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a); + let e = _mm512_set_ps( + 10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0, + 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtx_roundph_ps() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(0b0101010101010101, a); + let e = _mm512_set_ps( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsh_ss() { + let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_cvtsh_ss(a, b); + let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvtsh_ss() { + let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0); + let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_mask_cvtsh_ss(src, 0, a, b); + let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + let r = _mm_mask_cvtsh_ss(src, 1, a, b); + let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvtsh_ss() { + let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_maskz_cvtsh_ss(0, a, b); + let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + let r = _mm_maskz_cvtsh_ss(1, a, b); + let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundsh_ss() { + let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); + let b = 
_mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvt_roundsh_ss() { + let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0); + let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 0, a, b); + let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 1, a, b); + let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvt_roundsh_ss() { + let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(0, a, b); + let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtph_pd() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_cvtph_pd(a); + let e = _mm_set_pd(1.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtph_pd() { + let src = _mm_set_pd(10.0, 11.0); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_mask_cvtph_pd(src, 0b01, a); + let e = _mm_set_pd(10.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtph_pd() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_maskz_cvtph_pd(0b01, a); + let e = _mm_set_pd(0.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtph_pd() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_cvtph_pd(a); + let e = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtph_pd() { + let src = _mm256_set_pd(10.0, 11.0, 12.0, 13.0); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_mask_cvtph_pd(src, 0b0101, a); + let e = _mm256_set_pd(10.0, 2.0, 12.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_pd() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_maskz_cvtph_pd(0b0101, a); + let e = _mm256_set_pd(0.0, 2.0, 0.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtph_pd() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvtph_pd(a); + let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtph_pd() { + let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvtph_pd(src, 0b01010101, a); + let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtph_pd() { + let a = 
_mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvtph_pd(0b01010101, a); + let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundph_pd() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundph_pd() { + let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(src, 0b01010101, a); + let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundph_pd() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(0b01010101, a); + let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsh_sd() { + let a = _mm_setr_pd(2.0, 20.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_cvtsh_sd(a, b); + let e = _mm_setr_pd(1.0, 20.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvtsh_sd() { + let src = _mm_setr_pd(3.0, 11.0); + let a = _mm_setr_pd(2.0, 20.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_mask_cvtsh_sd(src, 0, a, b); + let e = _mm_setr_pd(3.0, 20.0); + assert_eq_m128d(r, e); + let r = _mm_mask_cvtsh_sd(src, 1, a, b); + let e = _mm_setr_pd(1.0, 20.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvtsh_sd() { + let a = _mm_setr_pd(2.0, 20.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_maskz_cvtsh_sd(0, a, b); + let e = _mm_setr_pd(0.0, 20.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_cvtsh_sd(1, a, b); + let e = _mm_setr_pd(1.0, 20.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundsh_sd() { + let a = _mm_setr_pd(2.0, 20.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_pd(1.0, 20.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvt_roundsh_sd() { + let src = _mm_setr_pd(3.0, 11.0); + let a = _mm_setr_pd(2.0, 20.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 0, a, b); + let e = _mm_setr_pd(3.0, 20.0); + assert_eq_m128d(r, e); + let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 1, a, b); + let e = _mm_setr_pd(1.0, 20.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvt_roundsh_sd() { + let a = _mm_setr_pd(2.0, 20.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(0, a, b); + let e = _mm_setr_pd(0.0, 20.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_pd(1.0, 20.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = 
"avx512fp16")] + unsafe fn test_mm_cvtsh_h() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtsh_h(a); + assert_eq!(r, 1.0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_cvtsh_h() { + let a = _mm256_setr_ph( + 1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_cvtsh_h(a); + assert_eq!(r, 1.0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtsh_h() { + let a = _mm512_setr_ph( + 1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvtsh_h(a); + assert_eq!(r, 1.0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsi128_si16() { + let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm_cvtsi128_si16(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsi16_si128() { + let a = 1; + let r = _mm_cvtsi16_si128(a); + let e = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs b/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs new file mode 100644 index 0000000000000..7c9d07f690952 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs @@ -0,0 +1,693 @@ +use crate::core_arch::x86::*; +use crate::intrinsics::simd::simd_select_bitmask; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { vpmadd52huq_512(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are copied +/// from `k` when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_mask_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm512_mask_madd52hi_epu64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { + unsafe { simd_select_bitmask(k, vpmadd52huq_512(a, b, c), a) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. 
Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_maskz_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm512_maskz_madd52hi_epu64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { simd_select_bitmask(k, vpmadd52huq_512(a, b, c), _mm512_setzero_si512()) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { vpmadd52luq_512(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are copied +/// from `k` when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_mask_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm512_mask_madd52lo_epu64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { + unsafe { simd_select_bitmask(k, vpmadd52luq_512(a, b, c), a) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_maskz_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm512_maskz_madd52lo_epu64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512()) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. 
Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52hi_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { vpmadd52huq_256(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { vpmadd52huq_256(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are copied +/// from `k` when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_mask_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm256_mask_madd52hi_epu64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { simd_select_bitmask(k, vpmadd52huq_256(a, b, c), a) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_maskz_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm256_maskz_madd52hi_epu64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256()) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52lo_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { vpmadd52luq_256(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { vpmadd52luq_256(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are copied +/// from `k` when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_mask_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm256_mask_madd52lo_epu64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { simd_select_bitmask(k, vpmadd52luq_256(a, b, c), a) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_maskz_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm256_maskz_madd52lo_epu64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256()) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52hi_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { vpmadd52huq_128(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { vpmadd52huq_128(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are copied +/// from `k` when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_mask_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm_mask_madd52hi_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { simd_select_bitmask(k, vpmadd52huq_128(a, b, c), a) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_maskz_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128()) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52lo_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { vpmadd52luq_128(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { vpmadd52luq_128(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are copied +/// from `k` when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_mask_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm_mask_madd52lo_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { simd_select_bitmask(k, vpmadd52luq_128(a, b, c), a) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_maskz_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm_maskz_madd52lo_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { simd_select_bitmask(k, vpmadd52luq_128(a, b, c), _mm_setzero_si128()) } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"] + fn vpmadd52luq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i; + #[link_name = "llvm.x86.avx512.vpmadd52h.uq.128"] + fn vpmadd52huq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i; + #[link_name = "llvm.x86.avx512.vpmadd52l.uq.256"] + fn vpmadd52luq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i; + #[link_name = "llvm.x86.avx512.vpmadd52h.uq.256"] + fn vpmadd52huq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i; + #[link_name = "llvm.x86.avx512.vpmadd52l.uq.512"] + fn vpmadd52luq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i; + #[link_name = "llvm.x86.avx512.vpmadd52h.uq.512"] + fn vpmadd52huq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i; +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + const K: __mmask8 = 0b01101101; + + #[simd_test(enable = "avx512ifma")] + unsafe fn test_mm512_madd52hi_epu64() { + let a = _mm512_set1_epi64(10 << 40); + let b = _mm512_set1_epi64((11 << 40) + 4); + let c = _mm512_set1_epi64((12 << 40) + 3); + + let actual = _mm512_madd52hi_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm512_set1_epi64(11030549757952); + + assert_eq_m512i(expected, actual); + } + + #[simd_test(enable = "avx512ifma")] + unsafe fn test_mm512_mask_madd52hi_epu64() { + let a = _mm512_set1_epi64(10 << 40); + let b = _mm512_set1_epi64((11 << 40) + 4); + let c = _mm512_set1_epi64((12 << 40) + 3); + + let actual = _mm512_mask_madd52hi_epu64(a, K, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let mut expected = _mm512_set1_epi64(11030549757952); + expected = _mm512_mask_blend_epi64(K, a, expected); + + assert_eq_m512i(expected, actual); + } + + #[simd_test(enable = "avx512ifma")] + unsafe fn test_mm512_maskz_madd52hi_epu64() { + let a = _mm512_set1_epi64(10 << 40); + let b = _mm512_set1_epi64((11 << 40) + 4); + let c = _mm512_set1_epi64((12 << 40) + 3); + + let actual = _mm512_maskz_madd52hi_epu64(K, a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let mut expected = _mm512_set1_epi64(11030549757952); + expected = _mm512_mask_blend_epi64(K, _mm512_setzero_si512(), expected); + + assert_eq_m512i(expected, actual); + } + + #[simd_test(enable = "avx512ifma")] + unsafe fn test_mm512_madd52lo_epu64() { + let a = _mm512_set1_epi64(10 << 40); + let b = _mm512_set1_epi64((11 << 40) + 4); + let c = _mm512_set1_epi64((12 << 40) + 3); + + let actual = _mm512_madd52lo_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let expected = _mm512_set1_epi64(100055558127628); + + assert_eq_m512i(expected, actual); + } + + #[simd_test(enable = "avx512ifma")] + unsafe fn test_mm512_mask_madd52lo_epu64() { + let a = _mm512_set1_epi64(10 << 40); + let b = _mm512_set1_epi64((11 << 40) + 4); + let c = _mm512_set1_epi64((12 << 40) + 3); + + let actual = _mm512_mask_madd52lo_epu64(a, K, b, c); + + 
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let mut expected = _mm512_set1_epi64(100055558127628); + expected = _mm512_mask_blend_epi64(K, a, expected); + + assert_eq_m512i(expected, actual); + } + + #[simd_test(enable = "avx512ifma")] + unsafe fn test_mm512_maskz_madd52lo_epu64() { + let a = _mm512_set1_epi64(10 << 40); + let b = _mm512_set1_epi64((11 << 40) + 4); + let c = _mm512_set1_epi64((12 << 40) + 3); + + let actual = _mm512_maskz_madd52lo_epu64(K, a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let mut expected = _mm512_set1_epi64(100055558127628); + expected = _mm512_mask_blend_epi64(K, _mm512_setzero_si512(), expected); + + assert_eq_m512i(expected, actual); + } + + #[simd_test(enable = "avxifma")] + unsafe fn test_mm256_madd52hi_avx_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_madd52hi_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm256_set1_epi64x(11030549757952); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm256_madd52hi_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_madd52hi_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm256_set1_epi64x(11030549757952); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm256_mask_madd52hi_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_mask_madd52hi_epu64(a, K, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let mut expected = _mm256_set1_epi64x(11030549757952); + expected = _mm256_mask_blend_epi64(K, a, expected); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm256_maskz_madd52hi_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_maskz_madd52hi_epu64(K, a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let mut expected = _mm256_set1_epi64x(11030549757952); + expected = _mm256_mask_blend_epi64(K, _mm256_setzero_si256(), expected); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = "avxifma")] + unsafe fn test_mm256_madd52lo_avx_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_madd52lo_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let expected = _mm256_set1_epi64x(100055558127628); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm256_madd52lo_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_madd52lo_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let expected = _mm256_set1_epi64x(100055558127628); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = 
"avx512ifma,avx512vl")] + unsafe fn test_mm256_mask_madd52lo_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_mask_madd52lo_epu64(a, K, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let mut expected = _mm256_set1_epi64x(100055558127628); + expected = _mm256_mask_blend_epi64(K, a, expected); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm256_maskz_madd52lo_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_maskz_madd52lo_epu64(K, a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let mut expected = _mm256_set1_epi64x(100055558127628); + expected = _mm256_mask_blend_epi64(K, _mm256_setzero_si256(), expected); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = "avxifma")] + unsafe fn test_mm_madd52hi_avx_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_madd52hi_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm_set1_epi64x(11030549757952); + + assert_eq_m128i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm_madd52hi_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_madd52hi_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm_set1_epi64x(11030549757952); + + assert_eq_m128i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm_mask_madd52hi_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_mask_madd52hi_epu64(a, K, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let mut expected = _mm_set1_epi64x(11030549757952); + expected = _mm_mask_blend_epi64(K, a, expected); + + assert_eq_m128i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm_maskz_madd52hi_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_maskz_madd52hi_epu64(K, a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let mut expected = _mm_set1_epi64x(11030549757952); + expected = _mm_mask_blend_epi64(K, _mm_setzero_si128(), expected); + + assert_eq_m128i(expected, actual); + } + + #[simd_test(enable = "avxifma")] + unsafe fn test_mm_madd52lo_avx_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_madd52lo_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let expected = _mm_set1_epi64x(100055558127628); + + assert_eq_m128i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm_madd52lo_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_madd52lo_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * 
((12 << 40) + 3)) % (1 << 52)) + let expected = _mm_set1_epi64x(100055558127628); + + assert_eq_m128i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm_mask_madd52lo_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_mask_madd52lo_epu64(a, K, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let mut expected = _mm_set1_epi64x(100055558127628); + expected = _mm_mask_blend_epi64(K, a, expected); + + assert_eq_m128i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm_maskz_madd52lo_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_maskz_madd52lo_epu64(K, a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let mut expected = _mm_set1_epi64x(100055558127628); + expected = _mm_mask_blend_epi64(K, _mm_setzero_si128(), expected); + + assert_eq_m128i(expected, actual); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vbmi.rs b/library/stdarch/crates/core_arch/src/x86/avx512vbmi.rs new file mode 100644 index 0000000000000..3527ccc9e44a9 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avx512vbmi.rs @@ -0,0 +1,960 @@ +use crate::core_arch::{simd::*, x86::*}; +use crate::intrinsics::simd::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_epi8&expand=4262) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b +pub fn _mm512_permutex2var_epi8(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpermi2b(a.as_i8x64(), idx.as_i8x64(), b.as_i8x64())) } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_epi8&expand=4259) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2b))] +pub fn _mm512_mask_permutex2var_epi8( + a: __m512i, + k: __mmask64, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64(); + transmute(simd_select_bitmask(k, permute, a.as_i8x64())) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
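+///
+/// A minimal usage sketch (the constants and mask below are illustrative
+/// assumptions, not values from the test suite). It presumes the caller is
+/// already compiled with `avx512vbmi` enabled, e.g. inside a
+/// `#[target_feature(enable = "avx512vbmi")]` function, which is why the
+/// block is marked `ignore`:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let a = _mm512_set1_epi8(1);
+/// let b = _mm512_set1_epi8(2);
+/// // Per byte lane, index bit 6 selects the source (`a` when clear, `b` when
+/// // set) and the low 6 bits select the byte inside it. 0x4000 per 16-bit
+/// // element alternates the byte indices 0 and 64, i.e. a[0], b[0], a[0], ...
+/// let idx = _mm512_set1_epi16(0x4000);
+/// // Only the low 32 byte lanes are selected; the rest are zeroed by the mask.
+/// let r = _mm512_maskz_permutex2var_epi8(0xFFFF_FFFF, a, idx, b);
+/// // r = [1, 2, 1, 2, ..., 1, 2, 0, 0, ..., 0]
+/// ```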
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_epi8&expand=4261) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b +pub fn _mm512_maskz_permutex2var_epi8( + k: __mmask64, + a: __m512i, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64(); + transmute(simd_select_bitmask(k, permute, i8x64::ZERO)) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_epi8&expand=4260) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2b))] +pub fn _mm512_mask2_permutex2var_epi8( + a: __m512i, + idx: __m512i, + k: __mmask64, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64(); + transmute(simd_select_bitmask(k, permute, idx.as_i8x64())) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_epi8&expand=4258) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b +pub fn _mm256_permutex2var_epi8(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpermi2b256(a.as_i8x32(), idx.as_i8x32(), b.as_i8x32())) } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_epi8&expand=4255) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2b))] +pub fn _mm256_mask_permutex2var_epi8( + a: __m256i, + k: __mmask32, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32(); + transmute(simd_select_bitmask(k, permute, a.as_i8x32())) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_epi8&expand=4257) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b +pub fn _mm256_maskz_permutex2var_epi8( + k: __mmask32, + a: __m256i, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32(); + transmute(simd_select_bitmask(k, permute, i8x32::ZERO)) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_epi8&expand=4256) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2b))] +pub fn _mm256_mask2_permutex2var_epi8( + a: __m256i, + idx: __m256i, + k: __mmask32, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32(); + transmute(simd_select_bitmask(k, permute, idx.as_i8x32())) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_epi8&expand=4254) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b +pub fn _mm_permutex2var_epi8(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpermi2b128(a.as_i8x16(), idx.as_i8x16(), b.as_i8x16())) } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_epi8&expand=4251) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2b))] +pub fn _mm_mask_permutex2var_epi8(a: __m128i, k: __mmask16, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16(); + transmute(simd_select_bitmask(k, permute, a.as_i8x16())) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_epi8&expand=4253) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b +pub fn _mm_maskz_permutex2var_epi8(k: __mmask16, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16(); + transmute(simd_select_bitmask(k, permute, i8x16::ZERO)) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_epi8&expand=4252) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2b))] +pub fn _mm_mask2_permutex2var_epi8(a: __m128i, idx: __m128i, k: __mmask16, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16(); + transmute(simd_select_bitmask(k, permute, idx.as_i8x16())) + } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi8&expand=4316) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm512_permutexvar_epi8(idx: __m512i, a: __m512i) -> __m512i { + unsafe { transmute(vpermb(a.as_i8x64(), idx.as_i8x64())) } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_epi8&expand=4314) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm512_mask_permutexvar_epi8( + src: __m512i, + k: __mmask64, + idx: __m512i, + a: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64(); + transmute(simd_select_bitmask(k, permute, src.as_i8x64())) + } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_epi8&expand=4315) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm512_maskz_permutexvar_epi8(k: __mmask64, idx: __m512i, a: __m512i) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64(); + transmute(simd_select_bitmask(k, permute, i8x64::ZERO)) + } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. 
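+///
+/// A small sketch of a cross-lane byte reversal (the helper arrays are
+/// illustrative assumptions). It presumes `avx512vbmi` and `avx512vl` are
+/// already enabled for the caller, so the block is marked `ignore`:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// // Index i selects byte `idx[i] & 0x1F` of `a`, across the whole 256-bit
+/// // register (unlike `_mm256_shuffle_epi8`, which cannot cross 128-bit lanes).
+/// let bytes: [i8; 32] = core::array::from_fn(|i| i as i8);      // 0, 1, ..., 31
+/// let rev: [i8; 32] = core::array::from_fn(|i| (31 - i) as i8); // 31, 30, ..., 0
+/// let a = unsafe { _mm256_loadu_si256(bytes.as_ptr().cast()) };
+/// let idx = unsafe { _mm256_loadu_si256(rev.as_ptr().cast()) };
+/// let r = _mm256_permutexvar_epi8(idx, a); // bytes of `a` in reverse order
+/// ```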
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_epi8&expand=4313) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm256_permutexvar_epi8(idx: __m256i, a: __m256i) -> __m256i { + unsafe { transmute(vpermb256(a.as_i8x32(), idx.as_i8x32())) } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_epi8&expand=4311) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm256_mask_permutexvar_epi8( + src: __m256i, + k: __mmask32, + idx: __m256i, + a: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32(); + transmute(simd_select_bitmask(k, permute, src.as_i8x32())) + } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_epi8&expand=4312) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm256_maskz_permutexvar_epi8(k: __mmask32, idx: __m256i, a: __m256i) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32(); + transmute(simd_select_bitmask(k, permute, i8x32::ZERO)) + } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutexvar_epi8&expand=4310) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm_permutexvar_epi8(idx: __m128i, a: __m128i) -> __m128i { + unsafe { transmute(vpermb128(a.as_i8x16(), idx.as_i8x16())) } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutexvar_epi8&expand=4308) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm_mask_permutexvar_epi8(src: __m128i, k: __mmask16, idx: __m128i, a: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutexvar_epi8(idx, a).as_i8x16(); + transmute(simd_select_bitmask(k, permute, src.as_i8x16())) + } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutexvar_epi8&expand=4309) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm_maskz_permutexvar_epi8(k: __mmask16, idx: __m128i, a: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutexvar_epi8(idx, a).as_i8x16(); + transmute(simd_select_bitmask(k, permute, i8x16::ZERO)) + } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_multishift_epi64_epi8&expand=4026) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm512_multishift_epi64_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpmultishiftqb(a.as_i8x64(), b.as_i8x64())) } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_multishift_epi64_epi8&expand=4024) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm512_mask_multishift_epi64_epi8( + src: __m512i, + k: __mmask64, + a: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, multishift, src.as_i8x64())) + } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_multishift_epi64_epi8&expand=4025) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm512_maskz_multishift_epi64_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, multishift, i8x64::ZERO)) + } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst. 
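+///
+/// A minimal sketch with assumed constants, showing that byte-aligned shift
+/// controls reduce to a plain copy. It presumes `avx512vbmi` and `avx512vl`
+/// are enabled for the caller, hence the `ignore` marker:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// // Each control byte in `a` picks a bit offset (taken modulo 64) inside the
+/// // matching 64-bit element of `b`; the 8 bits starting there, with
+/// // wrap-around, become the corresponding output byte.
+/// let ctrl = _mm256_set1_epi64x(0x3830_2820_1810_0800); // offsets 0, 8, ..., 56
+/// let data = _mm256_set1_epi64x(0x0123_4567_89AB_CDEFu64 as i64);
+/// let r = _mm256_multishift_epi64_epi8(ctrl, data);
+/// // Byte-aligned offsets reproduce `data` unchanged; offsets that are not
+/// // multiples of 8 extract bytes straddling byte boundaries instead.
+/// ```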
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_multishift_epi64_epi8&expand=4023) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm256_multishift_epi64_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpmultishiftqb256(a.as_i8x32(), b.as_i8x32())) } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_multishift_epi64_epi8&expand=4021) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm256_mask_multishift_epi64_epi8( + src: __m256i, + k: __mmask32, + a: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, multishift, src.as_i8x32())) + } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_multishift_epi64_epi8&expand=4022) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm256_maskz_multishift_epi64_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, multishift, i8x32::ZERO)) + } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm_multishift_epi64_epi8&expand=4020) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm_multishift_epi64_epi8(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpmultishiftqb128(a.as_i8x16(), b.as_i8x16())) } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_multishift_epi64_epi8&expand=4018) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm_mask_multishift_epi64_epi8( + src: __m128i, + k: __mmask16, + a: __m128i, + b: __m128i, +) -> __m128i { + unsafe { + let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, multishift, src.as_i8x16())) + } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_multishift_epi64_epi8&expand=4019) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm_maskz_multishift_epi64_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, multishift, i8x16::ZERO)) + } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.vpermi2var.qi.512"] + fn vpermi2b(a: i8x64, idx: i8x64, b: i8x64) -> i8x64; + #[link_name = "llvm.x86.avx512.vpermi2var.qi.256"] + fn vpermi2b256(a: i8x32, idx: i8x32, b: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx512.vpermi2var.qi.128"] + fn vpermi2b128(a: i8x16, idx: i8x16, b: i8x16) -> i8x16; + + #[link_name = "llvm.x86.avx512.permvar.qi.512"] + fn vpermb(a: i8x64, idx: i8x64) -> i8x64; + #[link_name = "llvm.x86.avx512.permvar.qi.256"] + fn vpermb256(a: i8x32, idx: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx512.permvar.qi.128"] + fn vpermb128(a: i8x16, idx: i8x16) -> i8x16; + + #[link_name = "llvm.x86.avx512.pmultishift.qb.512"] + fn vpmultishiftqb(a: i8x64, b: i8x64) -> i8x64; + #[link_name = "llvm.x86.avx512.pmultishift.qb.256"] + fn vpmultishiftqb256(a: i8x32, b: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx512.pmultishift.qb.128"] + fn vpmultishiftqb128(a: i8x16, b: i8x16) -> i8x16; +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + #[rustfmt::skip] + let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6, + 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 15, 1<<6, 16, 1<<6, + 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6, + 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6); + let b = _mm512_set1_epi8(100); + let r = _mm512_permutex2var_epi8(a, idx, b); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100, + 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 
100, 47, 100, + 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100, + 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_mask_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + #[rustfmt::skip] + let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6, + 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 15, 1<<6, 16, 1<<6, + 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6, + 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6); + let b = _mm512_set1_epi8(100); + let r = _mm512_mask_permutex2var_epi8(a, 0, idx, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutex2var_epi8( + a, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + idx, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100, + 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 100, 47, 100, + 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100, + 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_maskz_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + #[rustfmt::skip] + let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6, + 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 15, 1<<6, 16, 1<<6, + 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6, + 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6); + let b = _mm512_set1_epi8(100); + let r = _mm512_maskz_permutex2var_epi8(0, a, idx, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutex2var_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + idx, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100, + 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 100, 47, 100, + 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100, + 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_mask2_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + #[rustfmt::skip] + let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6, + 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 
15, 1<<6, 16, 1<<6, + 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6, + 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6); + let b = _mm512_set1_epi8(100); + let r = _mm512_mask2_permutex2var_epi8(a, idx, 0, b); + assert_eq_m512i(r, idx); + let r = _mm512_mask2_permutex2var_epi8( + a, + idx, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100, + 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 100, 47, 100, + 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100, + 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm256_set1_epi8(100); + let r = _mm256_permutex2var_epi8(a, idx, b); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_mask_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm256_set1_epi8(100); + let r = _mm256_mask_permutex2var_epi8(a, 0, idx, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_permutex2var_epi8(a, 0b11111111_11111111_11111111_11111111, idx, b); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_maskz_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm256_set1_epi8(100); + let r = _mm256_maskz_permutex2var_epi8(0, a, idx, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_permutex2var_epi8(0b11111111_11111111_11111111_11111111, a, idx, b); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_mask2_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm256_set1_epi8(100); + let r = _mm256_mask2_permutex2var_epi8(a, idx, 0, b); + assert_eq_m256i(r, idx); + let r = _mm256_mask2_permutex2var_epi8(a, idx, 0b11111111_11111111_11111111_11111111, b); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_permutex2var_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4); + let b = _mm_set1_epi8(100); + let r = _mm_permutex2var_epi8(a, idx, b); + let e = _mm_set_epi8( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_mask_permutex2var_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4); + let b = _mm_set1_epi8(100); + let r = _mm_mask_permutex2var_epi8(a, 0, idx, b); + assert_eq_m128i(r, a); + let r = _mm_mask_permutex2var_epi8(a, 0b11111111_11111111, idx, b); + let e = _mm_set_epi8( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_maskz_permutex2var_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4); + let b = _mm_set1_epi8(100); + let r = _mm_maskz_permutex2var_epi8(0, a, idx, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_permutex2var_epi8(0b11111111_11111111, a, idx, b); + let e = _mm_set_epi8( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_mask2_permutex2var_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4); + let b = _mm_set1_epi8(100); + let r = _mm_mask2_permutex2var_epi8(a, idx, 0, b); + assert_eq_m128i(r, idx); + let r = _mm_mask2_permutex2var_epi8(a, idx, 0b11111111_11111111, b); + let e = _mm_set_epi8( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_permutexvar_epi8() { + let idx = _mm512_set1_epi8(1); + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = 
_mm512_permutexvar_epi8(idx, a); + let e = _mm512_set1_epi8(62); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_mask_permutexvar_epi8() { + let idx = _mm512_set1_epi8(1); + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_mask_permutexvar_epi8(a, 0, idx, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutexvar_epi8( + a, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + idx, + a, + ); + let e = _mm512_set1_epi8(62); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_maskz_permutexvar_epi8() { + let idx = _mm512_set1_epi8(1); + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_maskz_permutexvar_epi8(0, idx, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutexvar_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + idx, + a, + ); + let e = _mm512_set1_epi8(62); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_permutexvar_epi8() { + let idx = _mm256_set1_epi8(1); + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_permutexvar_epi8(idx, a); + let e = _mm256_set1_epi8(30); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_mask_permutexvar_epi8() { + let idx = _mm256_set1_epi8(1); + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_mask_permutexvar_epi8(a, 0, idx, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_permutexvar_epi8(a, 0b11111111_11111111_11111111_11111111, idx, a); + let e = _mm256_set1_epi8(30); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_maskz_permutexvar_epi8() { + let idx = _mm256_set1_epi8(1); + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_maskz_permutexvar_epi8(0, idx, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_permutexvar_epi8(0b11111111_11111111_11111111_11111111, idx, a); + let e = _mm256_set1_epi8(30); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_permutexvar_epi8() { + let idx = _mm_set1_epi8(1); + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_permutexvar_epi8(idx, a); + let e = _mm_set1_epi8(14); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_mask_permutexvar_epi8() { + let idx = _mm_set1_epi8(1); + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_mask_permutexvar_epi8(a, 0, idx, a); + assert_eq_m128i(r, a); + 
let r = _mm_mask_permutexvar_epi8(a, 0b11111111_11111111, idx, a); + let e = _mm_set1_epi8(14); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_maskz_permutexvar_epi8() { + let idx = _mm_set1_epi8(1); + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_permutexvar_epi8(0, idx, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_permutexvar_epi8(0b11111111_11111111, idx, a); + let e = _mm_set1_epi8(14); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_multishift_epi64_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_multishift_epi64_epi8(a, b); + let e = _mm512_set1_epi8(1 << 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_mask_multishift_epi64_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_mask_multishift_epi64_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_multishift_epi64_epi8( + a, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + let e = _mm512_set1_epi8(1 << 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_maskz_multishift_epi64_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_maskz_multishift_epi64_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_multishift_epi64_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + let e = _mm512_set1_epi8(1 << 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_multishift_epi64_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let r = _mm256_multishift_epi64_epi8(a, b); + let e = _mm256_set1_epi8(1 << 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_mask_multishift_epi64_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let r = _mm256_mask_multishift_epi64_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_multishift_epi64_epi8(a, 0b11111111_11111111_11111111_11111111, a, b); + let e = _mm256_set1_epi8(1 << 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_maskz_multishift_epi64_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let r = _mm256_maskz_multishift_epi64_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_multishift_epi64_epi8(0b11111111_11111111_11111111_11111111, a, b); + let e = _mm256_set1_epi8(1 << 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_multishift_epi64_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let r = _mm_multishift_epi64_epi8(a, b); + let e = _mm_set1_epi8(1 << 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_mask_multishift_epi64_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let r = _mm_mask_multishift_epi64_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_multishift_epi64_epi8(a, 0b11111111_11111111, a, b); + let e = _mm_set1_epi8(1 << 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_maskz_multishift_epi64_epi8() { + let a = 
_mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let r = _mm_maskz_multishift_epi64_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_multishift_epi64_epi8(0b11111111_11111111, a, b); + let e = _mm_set1_epi8(1 << 7); + assert_eq_m128i(r, e); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs b/library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs new file mode 100644 index 0000000000000..c722f7b370ffe --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs @@ -0,0 +1,3941 @@ +use crate::{ + core_arch::{simd::*, x86::*}, + intrinsics::simd::*, +}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpexpandw))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_expandloadu_epi16( + src: __m512i, + k: __mmask32, + mem_addr: *const i16, +) -> __m512i { + transmute(expandloadw_512(mem_addr, src.as_i16x32(), k)) +} + +/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpexpandw))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_expandloadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i { + _mm512_mask_expandloadu_epi16(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandw))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_expandloadu_epi16( + src: __m256i, + k: __mmask16, + mem_addr: *const i16, +) -> __m256i { + transmute(expandloadw_256(mem_addr, src.as_i16x16(), k)) +} + +/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
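+///
+/// A minimal sketch (buffer contents and mask are assumed values) showing how
+/// the expanding load scatters a packed buffer into the selected lanes while
+/// reading only `k.count_ones()` elements from memory. It presumes
+/// `avx512vbmi2` and `avx512vl` are enabled for the caller, hence the
+/// `ignore` marker:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let src: [i16; 8] = [10, 20, 30, 40, 50, 60, 70, 80];
+/// // Four mask bits are set, so exactly four contiguous i16 values are read
+/// // and placed in lanes 0, 2, 4 and 6; all other lanes are zeroed.
+/// let k: __mmask16 = 0b0000_0000_0101_0101;
+/// let r = unsafe { _mm256_maskz_expandloadu_epi16(k, src.as_ptr()) };
+/// // r = [10, 0, 20, 0, 30, 0, 40, 0, 0, 0, 0, 0, 0, 0, 0, 0] (lane 0 first)
+/// ```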
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandw))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_expandloadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i { + _mm256_mask_expandloadu_epi16(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandw))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_expandloadu_epi16( + src: __m128i, + k: __mmask8, + mem_addr: *const i16, +) -> __m128i { + transmute(expandloadw_128(mem_addr, src.as_i16x8(), k)) +} + +/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandw))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_expandloadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i { + _mm_mask_expandloadu_epi16(_mm_setzero_si128(), k, mem_addr) +} + +/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpexpandb))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_expandloadu_epi8( + src: __m512i, + k: __mmask64, + mem_addr: *const i8, +) -> __m512i { + transmute(expandloadb_512(mem_addr, src.as_i8x64(), k)) +} + +/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpexpandb))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_expandloadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i { + _mm512_mask_expandloadu_epi8(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandb))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_expandloadu_epi8( + src: __m256i, + k: __mmask32, + mem_addr: *const i8, +) -> __m256i { + transmute(expandloadb_256(mem_addr, src.as_i8x32(), k)) +} + +/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandb))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_expandloadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i { + _mm256_mask_expandloadu_epi8(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandb))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_expandloadu_epi8( + src: __m128i, + k: __mmask16, + mem_addr: *const i8, +) -> __m128i { + transmute(expandloadb_128(mem_addr, src.as_i8x16(), k)) +} + +/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandb))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_expandloadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i { + _mm_mask_expandloadu_epi8(_mm_setzero_si128(), k, mem_addr) +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
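+///
+/// A minimal left-pack sketch with assumed data, illustrating that only the
+/// selected lanes are written and that they land contiguously at `base_addr`.
+/// It presumes `avx512vbmi2` is enabled for the caller, hence the `ignore`
+/// marker:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let data: [i16; 32] = core::array::from_fn(|i| i as i16); // 0, 1, ..., 31
+/// let v = unsafe { _mm512_loadu_epi16(data.as_ptr()) };
+/// let mut out = [-1i16; 32];
+/// // Keep only the even lanes: their 16 values are stored back to back,
+/// // while `out[16..]` is left untouched.
+/// let k: __mmask32 = 0x5555_5555;
+/// unsafe { _mm512_mask_compressstoreu_epi16(out.as_mut_ptr(), k, v) };
+/// // out = [0, 2, 4, ..., 30, -1, -1, ..., -1]
+/// ```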
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub unsafe fn _mm512_mask_compressstoreu_epi16(base_addr: *mut i16, k: __mmask32, a: __m512i) { + vcompressstorew(base_addr as *mut _, a.as_i16x32(), k) +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub unsafe fn _mm256_mask_compressstoreu_epi16(base_addr: *mut i16, k: __mmask16, a: __m256i) { + vcompressstorew256(base_addr as *mut _, a.as_i16x16(), k) +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub unsafe fn _mm_mask_compressstoreu_epi16(base_addr: *mut i16, k: __mmask8, a: __m128i) { + vcompressstorew128(base_addr as *mut _, a.as_i16x8(), k) +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub unsafe fn _mm512_mask_compressstoreu_epi8(base_addr: *mut i8, k: __mmask64, a: __m512i) { + vcompressstoreb(base_addr, a.as_i8x64(), k) +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub unsafe fn _mm256_mask_compressstoreu_epi8(base_addr: *mut i8, k: __mmask32, a: __m256i) { + vcompressstoreb256(base_addr, a.as_i8x32(), k) +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub unsafe fn _mm_mask_compressstoreu_epi8(base_addr: *mut i8, k: __mmask16, a: __m128i) { + vcompressstoreb128(base_addr, a.as_i8x16(), k) +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_epi16&expand=1192) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub fn _mm512_mask_compress_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressw(a.as_i16x32(), src.as_i16x32(), k)) } +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_epi16&expand=1193) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub fn _mm512_maskz_compress_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressw(a.as_i16x32(), i16x32::ZERO, k)) } +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_epi16&expand=1190) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub fn _mm256_mask_compress_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressw256(a.as_i16x16(), src.as_i16x16(), k)) } +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_epi16&expand=1191) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub fn _mm256_maskz_compress_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressw256(a.as_i16x16(), i16x16::ZERO, k)) } +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. 
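+///
+/// A minimal sketch with assumed inputs, showing the in-register left-pack:
+/// the active lanes are packed toward lane 0 and the tail is filled from
+/// `src`. It presumes `avx512vbmi2` and `avx512vl` are enabled for the
+/// caller, hence the `ignore` marker:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); // lane 0 = 1, ..., lane 7 = 8
+/// let src = _mm_set1_epi16(-1);
+/// // The active lanes 1, 3, 5 and 7 (values 2, 4, 6 and 8) are packed into
+/// // lanes 0..=3; the remaining lanes are passed through from `src`.
+/// let r = _mm_mask_compress_epi16(src, 0b1010_1010, a);
+/// // r = [2, 4, 6, 8, -1, -1, -1, -1] (lane 0 first)
+/// ```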
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_epi16&expand=1188) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub fn _mm_mask_compress_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressw128(a.as_i16x8(), src.as_i16x8(), k)) } +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_epi16&expand=1189) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub fn _mm_maskz_compress_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressw128(a.as_i16x8(), i16x8::ZERO, k)) } +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_epi8&expand=1210) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub fn _mm512_mask_compress_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressb(a.as_i8x64(), src.as_i8x64(), k)) } +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_epi8&expand=1211) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub fn _mm512_maskz_compress_epi8(k: __mmask64, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressb(a.as_i8x64(), i8x64::ZERO, k)) } +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_epi8&expand=1208) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub fn _mm256_mask_compress_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressb256(a.as_i8x32(), src.as_i8x32(), k)) } +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. 
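// ---------------------------------------------------------------------------
// Hedged usage sketch, not part of the stdarch diff above: the register form
// of compress. With a zeromask, the active lanes are packed into the low end
// of the result and everything above them is zeroed. The wrapper name, the
// detection pattern, and helpers like _mm_set_epi16/_mm_storeu_si128 are
// illustrative assumptions from core::arch, not from this file.
#[cfg(target_arch = "x86_64")]
fn compress_register_sketch() {
    use core::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512vbmi2") || !is_x86_feature_detected!("avx512vl") {
        return;
    }
    unsafe {
        // lane i holds the value i (_mm_set_epi16 lists lanes high to low)
        let a = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
        // bits 1 and 3 of the mask are set, so lanes 1 and 3 are active
        let r = _mm_maskz_compress_epi16(0b0000_1010, a);
        let mut got = [0i16; 8];
        _mm_storeu_si128(got.as_mut_ptr() as *mut __m128i, r);
        // active values 1 and 3 are packed into the low lanes, rest zeroed
        assert_eq!(got, [1, 3, 0, 0, 0, 0, 0, 0]);
    }
}
// ---------------------------------------------------------------------------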
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_epi8&expand=1209) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub fn _mm256_maskz_compress_epi8(k: __mmask32, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressb256(a.as_i8x32(), i8x32::ZERO, k)) } +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_epi8&expand=1206) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub fn _mm_mask_compress_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressb128(a.as_i8x16(), src.as_i8x16(), k)) } +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_epi8&expand=1207) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub fn _mm_maskz_compress_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressb128(a.as_i8x16(), i8x16::ZERO, k)) } +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_epi16&expand=2310) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub fn _mm512_mask_expand_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandw(a.as_i16x32(), src.as_i16x32(), k)) } +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_epi16&expand=2311) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub fn _mm512_maskz_expand_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandw(a.as_i16x32(), i16x32::ZERO, k)) } +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_epi16&expand=2308) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub fn _mm256_mask_expand_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandw256(a.as_i16x16(), src.as_i16x16(), k)) } +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_epi16&expand=2309) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub fn _mm256_maskz_expand_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandw256(a.as_i16x16(), i16x16::ZERO, k)) } +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_epi16&expand=2306) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub fn _mm_mask_expand_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandw128(a.as_i16x8(), src.as_i16x8(), k)) } +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_epi16&expand=2307) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub fn _mm_maskz_expand_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandw128(a.as_i16x8(), i16x8::ZERO, k)) } +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_epi8&expand=2328) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub fn _mm512_mask_expand_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandb(a.as_i8x64(), src.as_i8x64(), k)) } +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
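// ---------------------------------------------------------------------------
// Hedged usage sketch, not part of the stdarch diff above: expand is the
// inverse of compress. Consecutive low lanes of `a` are scattered to the
// positions selected by the mask; with maskz the remaining lanes are zeroed.
// Wrapper name, detection pattern, and core::arch helpers are assumptions.
#[cfg(target_arch = "x86_64")]
fn expand_register_sketch() {
    use core::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512vbmi2") || !is_x86_feature_detected!("avx512vl") {
        return;
    }
    unsafe {
        let a = _mm_set_epi16(0, 0, 0, 0, 0, 0, 20, 10); // lane0 = 10, lane1 = 20
        // bits 2 and 5 are set, so lane 2 receives 10 and lane 5 receives 20
        let r = _mm_maskz_expand_epi16(0b0010_0100, a);
        let mut got = [0i16; 8];
        _mm_storeu_si128(got.as_mut_ptr() as *mut __m128i, r);
        assert_eq!(got, [0, 0, 10, 0, 0, 20, 0, 0]);
    }
}
// ---------------------------------------------------------------------------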
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_epi8&expand=2329) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub fn _mm512_maskz_expand_epi8(k: __mmask64, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandb(a.as_i8x64(), i8x64::ZERO, k)) } +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_epi8&expand=2326) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub fn _mm256_mask_expand_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandb256(a.as_i8x32(), src.as_i8x32(), k)) } +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_epi8&expand=2327) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub fn _mm256_maskz_expand_epi8(k: __mmask32, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandb256(a.as_i8x32(), i8x32::ZERO, k)) } +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_epi8&expand=2324) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub fn _mm_mask_expand_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandb128(a.as_i8x16(), src.as_i8x16(), k)) } +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_epi8&expand=2325) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandb128(a.as_i8x16(), i8x16::ZERO, k)) } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldv_epi64&expand=5087) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm512_shldv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshldvq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8())) } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldv_epi64&expand=5085) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm512_mask_shldv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi64(a, b, c).as_i64x8(); + transmute(simd_select_bitmask(k, shf, a.as_i64x8())) + } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldv_epi64&expand=5086) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm512_maskz_shldv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi64(a, b, c).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldv_epi64&expand=5084) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm256_shldv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshldvq256(a.as_i64x4(), b.as_i64x4(), c.as_i64x4())) } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldv_epi64&expand=5082) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm256_mask_shldv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi64(a, b, c).as_i64x4(); + transmute(simd_select_bitmask(k, shf, a.as_i64x4())) + } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldv_epi64&expand=5083) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm256_maskz_shldv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi64(a, b, c).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldv_epi64&expand=5081) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm_shldv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshldvq128(a.as_i64x2(), b.as_i64x2(), c.as_i64x2())) } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldv_epi64&expand=5079) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm_mask_shldv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi64(a, b, c).as_i64x2(); + transmute(simd_select_bitmask(k, shf, a.as_i64x2())) + } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldv_epi64&expand=5080) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm_maskz_shldv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi64(a, b, c).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldv_epi32&expand=5078) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm512_shldv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshldvd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16())) } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldv_epi32&expand=5076) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm512_mask_shldv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi32(a, b, c).as_i32x16(); + transmute(simd_select_bitmask(k, shf, a.as_i32x16())) + } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldv_epi32&expand=5077) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm512_maskz_shldv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi32(a, b, c).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst. 
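// ---------------------------------------------------------------------------
// Hedged usage sketch, not part of the stdarch diff above: it relates the
// variable left funnel shift to its scalar equivalent, using the 128-bit
// 64-bit-lane form defined earlier. Per the description above, each lane is
// the upper half of ((a:b) << (c % 64)), i.e. `a` shifted left with `b`
// supplying the incoming low bits. Names and helpers are assumptions.
#[cfg(target_arch = "x86_64")]
fn shldv_sketch() {
    use core::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512vbmi2") || !is_x86_feature_detected!("avx512vl") {
        return;
    }
    unsafe {
        let a = _mm_set1_epi64x(0x0123_4567_89AB_CDEFu64 as i64);
        let b = _mm_set1_epi64x(0xF000_0000_0000_0000u64 as i64);
        let c = _mm_set1_epi64x(4);
        let r = _mm_shldv_epi64(a, b, c);
        // scalar reference: (a << 4) | (b >> (64 - 4))
        let mut got = [0i64; 2];
        _mm_storeu_si128(got.as_mut_ptr() as *mut __m128i, r);
        assert_eq!(got, [0x1234_5678_9ABC_DEFFu64 as i64; 2]);
    }
}
// ---------------------------------------------------------------------------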
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldv_epi32&expand=5075) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm256_shldv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshldvd256(a.as_i32x8(), b.as_i32x8(), c.as_i32x8())) } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldv_epi32&expand=5073) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm256_mask_shldv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi32(a, b, c).as_i32x8(); + transmute(simd_select_bitmask(k, shf, a.as_i32x8())) + } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldv_epi32&expand=5074) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm256_maskz_shldv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi32(a, b, c).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldv_epi32&expand=5072) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm_shldv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshldvd128(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldv_epi32&expand=5070) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm_mask_shldv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi32(a, b, c).as_i32x4(); + transmute(simd_select_bitmask(k, shf, a.as_i32x4())) + } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldv_epi32&expand=5071) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm_maskz_shldv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi32(a, b, c).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldv_epi16&expand=5069) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm512_shldv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshldvw(a.as_i16x32(), b.as_i16x32(), c.as_i16x32())) } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldv_epi16&expand=5067) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm512_mask_shldv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi16(a, b, c).as_i16x32(); + transmute(simd_select_bitmask(k, shf, a.as_i16x32())) + } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldv_epi16&expand=5068) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm512_maskz_shldv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi16(a, b, c).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldv_epi16&expand=5066) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm256_shldv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshldvw256(a.as_i16x16(), b.as_i16x16(), c.as_i16x16())) } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldv_epi16&expand=5064) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm256_mask_shldv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi16(a, b, c).as_i16x16(); + transmute(simd_select_bitmask(k, shf, a.as_i16x16())) + } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldv_epi16&expand=5065) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm256_maskz_shldv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi16(a, b, c).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst. 
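// ---------------------------------------------------------------------------
// Hedged aside, not part of the stdarch diff above: when both data operands
// are the same register, the left funnel shift degenerates into a per-lane
// rotate left. Sketch using the 256-bit 16-bit-lane form defined above; the
// wrapper name, detection pattern, and core::arch helpers are assumptions
// (any AVX-512 CPU also supports the AVX helpers used here).
#[cfg(target_arch = "x86_64")]
fn shldv_rotate_sketch() {
    use core::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512vbmi2") || !is_x86_feature_detected!("avx512vl") {
        return;
    }
    unsafe {
        let x = _mm256_set1_epi16(0x8001u16 as i16);
        let amt = _mm256_set1_epi16(1);
        // (x << 1) | (x >> 15) per lane, i.e. rotate left by 1 -> 0x0003
        let r = _mm256_shldv_epi16(x, x, amt);
        let mut got = [0i16; 16];
        _mm256_storeu_si256(got.as_mut_ptr() as *mut __m256i, r);
        assert_eq!(got, [0x0003i16; 16]);
    }
}
// ---------------------------------------------------------------------------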
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldv_epi16&expand=5063) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm_shldv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshldvw128(a.as_i16x8(), b.as_i16x8(), c.as_i16x8())) } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldv_epi16&expand=5061) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm_mask_shldv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi16(a, b, c).as_i16x8(); + transmute(simd_select_bitmask(k, shf, a.as_i16x8())) + } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldv_epi16&expand=5062) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm_maskz_shldv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi16(a, b, c).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdv_epi64&expand=5141) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm512_shrdv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshrdvq(b.as_i64x8(), a.as_i64x8(), c.as_i64x8())) } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdv_epi64&expand=5139) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm512_mask_shrdv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8(); + transmute(simd_select_bitmask(k, shf, a.as_i64x8())) + } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdv_epi64&expand=5140) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm512_maskz_shrdv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdv_epi64&expand=5138) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm256_shrdv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshrdvq256(b.as_i64x4(), a.as_i64x4(), c.as_i64x4())) } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdv_epi64&expand=5136) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm256_mask_shrdv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4(); + transmute(simd_select_bitmask(k, shf, a.as_i64x4())) + } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdv_epi64&expand=5137) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm256_maskz_shrdv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdv_epi64&expand=5135) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm_shrdv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshrdvq128(b.as_i64x2(), a.as_i64x2(), c.as_i64x2())) } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdv_epi64&expand=5133) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm_mask_shrdv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shrdv_epi64(a, b, c).as_i64x2(); + transmute(simd_select_bitmask(k, shf, a.as_i64x2())) + } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdv_epi64&expand=5134) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm_maskz_shrdv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shrdv_epi64(a, b, c).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst. 
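// ---------------------------------------------------------------------------
// Hedged usage sketch, not part of the stdarch diff above: the right funnel
// shift, using the 128-bit 64-bit-lane form defined above. Per the
// description, each lane is the lower half of ((b:a) >> (c % 64)), i.e. `a`
// shifted right with `b` supplying the incoming high bits. Names and
// core::arch helpers are assumptions.
#[cfg(target_arch = "x86_64")]
fn shrdv_sketch() {
    use core::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512vbmi2") || !is_x86_feature_detected!("avx512vl") {
        return;
    }
    unsafe {
        let a = _mm_set1_epi64x(0x0000_0000_0000_00FF);
        let b = _mm_set1_epi64x(0x0000_0000_0000_0001);
        let c = _mm_set1_epi64x(8);
        let r = _mm_shrdv_epi64(a, b, c);
        // scalar reference: (a >> 8) | (b << (64 - 8))
        let mut got = [0i64; 2];
        _mm_storeu_si128(got.as_mut_ptr() as *mut __m128i, r);
        assert_eq!(got, [0x0100_0000_0000_0000i64; 2]);
    }
}
// ---------------------------------------------------------------------------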
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdv_epi32&expand=5132) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm512_shrdv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshrdvd(b.as_i32x16(), a.as_i32x16(), c.as_i32x16())) } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdv_epi32&expand=5130) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm512_mask_shrdv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16(); + transmute(simd_select_bitmask(k, shf, a.as_i32x16())) + } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdv_epi32&expand=5131) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm512_maskz_shrdv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdv_epi32&expand=5129) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm256_shrdv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshrdvd256(b.as_i32x8(), a.as_i32x8(), c.as_i32x8())) } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdv_epi32&expand=5127) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm256_mask_shrdv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8(); + transmute(simd_select_bitmask(k, shf, a.as_i32x8())) + } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdv_epi32&expand=5128) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm256_maskz_shrdv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdv_epi32&expand=5126) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm_shrdv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshrdvd128(b.as_i32x4(), a.as_i32x4(), c.as_i32x4())) } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdv_epi32&expand=5124) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm_mask_shrdv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shrdv_epi32(a, b, c).as_i32x4(); + transmute(simd_select_bitmask(k, shf, a.as_i32x4())) + } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdv_epi32&expand=5125) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm_maskz_shrdv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shrdv_epi32(a, b, c).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdv_epi16&expand=5123) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub fn _mm512_shrdv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshrdvw(b.as_i16x32(), a.as_i16x32(), c.as_i16x32())) } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdv_epi16&expand=5121) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub fn _mm512_mask_shrdv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32(); + transmute(simd_select_bitmask(k, shf, a.as_i16x32())) + } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdv_epi16&expand=5122) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub fn _mm512_maskz_shrdv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdv_epi16&expand=5120) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub fn _mm256_shrdv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshrdvw256(b.as_i16x16(), a.as_i16x16(), c.as_i16x16())) } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdv_epi16&expand=5118) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub fn _mm256_mask_shrdv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16(); + transmute(simd_select_bitmask(k, shf, a.as_i16x16())) + } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdv_epi16&expand=5119) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub fn _mm256_maskz_shrdv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdv_epi16&expand=5117) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub fn _mm_shrdv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshrdvw128(b.as_i16x8(), a.as_i16x8(), c.as_i16x8())) } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdv_epi16&expand=5115)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub fn _mm_mask_shrdv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_shrdv_epi16(a, b, c).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, a.as_i16x8()))
+    }
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdv_epi16&expand=5116)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_shrdv_epi16(a, b, c).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
+    }
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldi_epi64&expand=5060)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shldi_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_shldv_epi64(a, b, _mm512_set1_epi64(IMM8 as i64))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldi_epi64&expand=5058)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shldi_epi64<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shldi_epi64::<IMM8>(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+    }
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldi_epi64&expand=5059)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shldi_epi64<const IMM8: i32>(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shldi_epi64::<IMM8>(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
+    }
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldi_epi64&expand=5057)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_shldi_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_shldv_epi64(a, b, _mm256_set1_epi64x(IMM8 as i64))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldi_epi64&expand=5055)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shldi_epi64<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shldi_epi64::<IMM8>(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+    }
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldi_epi64&expand=5056)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shldi_epi64<const IMM8: i32>(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shldi_epi64::<IMM8>(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
+    }
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldi_epi64&expand=5054)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_shldi_epi64<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_shldv_epi64(a, b, _mm_set1_epi64x(IMM8 as i64))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldi_epi64&expand=5052)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_shldi_epi64<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shldi_epi64::<IMM8>(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+    }
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldi_epi64&expand=5053)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_shldi_epi64<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shldi_epi64::<IMM8>(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
+    }
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldi_epi32&expand=5051)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shldi_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_shldv_epi32(a, b, _mm512_set1_epi32(IMM8))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
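+///
+/// Illustrative sketch with hypothetical values: with `IMM8 = 4`, every selected 32-bit lane is `(a << 4) | (b >> 28)`, while unselected lanes are copied from `src`.
+///
+/// ```ignore
+/// // Sketch only: assumes the avx512vbmi2 target feature is available.
+/// let src = _mm512_set1_epi32(200);
+/// let a = _mm512_set1_epi32(1);
+/// let b = _mm512_set1_epi32(1 << 31);
+/// // Selected lanes: (1 << 4) | (0x8000_0000u32 >> 28) = 16 | 8 = 24.
+/// let r = _mm512_mask_shldi_epi32::<4>(src, 0b11111111_11111111, a, b);
+/// // All sixteen mask bits are set here, so every lane of r is 24.
+/// ```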
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldi_epi32&expand=5049)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shldi_epi32<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shldi_epi32::<IMM8>(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+    }
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldi_epi32&expand=5050)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shldi_epi32<const IMM8: i32>(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shldi_epi32::<IMM8>(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
+    }
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldi_epi32&expand=5048)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_shldi_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_shldv_epi32(a, b, _mm256_set1_epi32(IMM8))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldi_epi32&expand=5046)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shldi_epi32<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shldi_epi32::<IMM8>(a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+    }
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldi_epi32&expand=5047)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shldi_epi32<const IMM8: i32>(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shldi_epi32::<IMM8>(a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, shf, i32x8::ZERO))
+    }
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldi_epi32&expand=5045)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_shldi_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_shldv_epi32(a, b, _mm_set1_epi32(IMM8))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldi_epi32&expand=5043)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_shldi_epi32<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shldi_epi32::<IMM8>(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
+    }
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldi_epi32&expand=5044)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_shldi_epi32<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shldi_epi32::<IMM8>(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, shf, i32x4::ZERO))
+    }
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
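+///
+/// Illustrative sketch with hypothetical values: with `IMM8 = 4`, every 16-bit lane is `(a << 4) | (b >> 12)`.
+///
+/// ```ignore
+/// // Sketch only: assumes the avx512vbmi2 target feature is available.
+/// let a = _mm512_set1_epi16(1);
+/// let b = _mm512_set1_epi16(1 << 15);
+/// // Each lane: (1 << 4) | (0x8000u16 >> 12) = 16 | 8 = 24.
+/// let r = _mm512_shldi_epi16::<4>(a, b);
+/// ```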
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldi_epi16&expand=5042)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shldi_epi16<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_shldv_epi16(a, b, _mm512_set1_epi16(IMM8 as i16))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldi_epi16&expand=5040)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shldi_epi16<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shldi_epi16::<IMM8>(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+    }
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldi_epi16&expand=5041)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shldi_epi16<const IMM8: i32>(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shldi_epi16::<IMM8>(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
+    }
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldi_epi16&expand=5039)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_shldi_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_shldv_epi16(a, b, _mm256_set1_epi16(IMM8 as i16))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldi_epi16&expand=5037)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shldi_epi16<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask16,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shldi_epi16::<IMM8>(a, b).as_i16x16();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
+    }
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldi_epi16&expand=5038)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shldi_epi16<const IMM8: i32>(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shldi_epi16::<IMM8>(a, b).as_i16x16();
+        transmute(simd_select_bitmask(k, shf, i16x16::ZERO))
+    }
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldi_epi16&expand=5036)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_shldi_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_shldv_epi16(a, b, _mm_set1_epi16(IMM8 as i16))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldi_epi16&expand=5034)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_shldi_epi16<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shldi_epi16::<IMM8>(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+    }
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldi_epi16&expand=5035)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_shldi_epi16<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shldi_epi16::<IMM8>(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
+    }
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdi_epi64&expand=5114)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shrdi_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_shrdv_epi64(a, b, _mm512_set1_epi64(IMM8 as i64))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdi_epi64&expand=5112)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shrdi_epi64<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shrdi_epi64::<IMM8>(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+    }
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdi_epi64&expand=5113)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 255))] //should be vpshrdq
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shrdi_epi64<const IMM8: i32>(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shrdi_epi64::<IMM8>(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
+    }
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
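+///
+/// Illustrative sketch with hypothetical values: with `IMM8 = 4`, every 64-bit lane is `(a >> 4) | (b << 60)`, i.e. bits shifted out of `a` are refilled from the low bits of `b`.
+///
+/// ```ignore
+/// // Sketch only: assumes avx512vbmi2 and avx512vl are available.
+/// let a = _mm256_set1_epi64x(32);
+/// let b = _mm256_set1_epi64x(0);
+/// // Each lane: (32 >> 4) | (0 << 60) = 2.
+/// let r = _mm256_shrdi_epi64::<4>(a, b);
+/// ```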
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdi_epi64&expand=5111)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_shrdi_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_shrdv_epi64(a, b, _mm256_set1_epi64x(IMM8 as i64))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdi_epi64&expand=5109)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shrdi_epi64<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shrdi_epi64::<IMM8>(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+    }
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdi_epi64&expand=5110)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shrdi_epi64<const IMM8: i32>(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shrdi_epi64::<IMM8>(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
+    }
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdi_epi64&expand=5108)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_shrdi_epi64<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_shrdv_epi64(a, b, _mm_set1_epi64x(IMM8 as i64))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdi_epi64&expand=5106)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_shrdi_epi64<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shrdi_epi64::<IMM8>(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+    }
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdi_epi64&expand=5107)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_shrdi_epi64<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shrdi_epi64::<IMM8>(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
+    }
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdi_epi32&expand=5105)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shrdi_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_shrdv_epi32(a, b, _mm512_set1_epi32(IMM8))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdi_epi32&expand=5103)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shrdi_epi32<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shrdi_epi32::<IMM8>(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+    }
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
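+///
+/// Illustrative sketch with hypothetical values: with `IMM8 = 4`, every selected 32-bit lane is `(a >> 4) | (b << 28)`, and lanes whose mask bit is clear are zeroed.
+///
+/// ```ignore
+/// // Sketch only: assumes the avx512vbmi2 target feature is available.
+/// let a = _mm512_set1_epi32(32);
+/// let b = _mm512_set1_epi32(0);
+/// // Selected lanes: (32 >> 4) | (0 << 28) = 2.
+/// let r = _mm512_maskz_shrdi_epi32::<4>(0b11111111_11111111, a, b);
+/// ```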
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdi_epi32&expand=5104)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shrdi_epi32<const IMM8: i32>(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shrdi_epi32::<IMM8>(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
+    }
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdi_epi32&expand=5102)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_shrdi_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_shrdv_epi32(a, b, _mm256_set1_epi32(IMM8))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdi_epi32&expand=5100)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shrdi_epi32<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shrdi_epi32::<IMM8>(a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+    }
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdi_epi32&expand=5101)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shrdi_epi32<const IMM8: i32>(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shrdi_epi32::<IMM8>(a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, shf, i32x8::ZERO))
+    }
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdi_epi32&expand=5099)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_shrdi_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_shrdv_epi32(a, b, _mm_set1_epi32(IMM8))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdi_epi32&expand=5097)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_shrdi_epi32<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shrdi_epi32::<IMM8>(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
+    }
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdi_epi32&expand=5098)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_shrdi_epi32<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shrdi_epi32::<IMM8>(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, shf, i32x4::ZERO))
+    }
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdi_epi16&expand=5096)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shrdi_epi16<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_shrdv_epi16(a, b, _mm512_set1_epi16(IMM8 as i16))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdi_epi16&expand=5094)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shrdi_epi16<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shrdi_epi16::<IMM8>(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+    }
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdi_epi16&expand=5095)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shrdi_epi16<const IMM8: i32>(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shrdi_epi16::<IMM8>(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
+    }
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdi_epi16&expand=5093)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_shrdi_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_shrdv_epi16(a, b, _mm256_set1_epi16(IMM8 as i16))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdi_epi16&expand=5091)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shrdi_epi16<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask16,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shrdi_epi16::<IMM8>(a, b).as_i16x16();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
+    }
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdi_epi16&expand=5092)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shrdi_epi16<const IMM8: i32>(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shrdi_epi16::<IMM8>(a, b).as_i16x16();
+        transmute(simd_select_bitmask(k, shf, i16x16::ZERO))
+    }
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdi_epi16&expand=5090)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_shrdi_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_shrdv_epi16(a, b, _mm_set1_epi16(IMM8 as i16))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdi_epi16&expand=5088)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_shrdi_epi16<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shrdi_epi16::<IMM8>(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+    }
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
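+///
+/// Illustrative sketch with hypothetical values: with `IMM8 = 4`, every selected 16-bit lane is `(a >> 4) | (b << 12)`, and lanes whose mask bit is clear are zeroed.
+///
+/// ```ignore
+/// // Sketch only: assumes avx512vbmi2 and avx512vl are available.
+/// let a = _mm_set1_epi16(32);
+/// let b = _mm_set1_epi16(0);
+/// // Selected lanes: (32 >> 4) | (0 << 12) = 2.
+/// let r = _mm_maskz_shrdi_epi16::<4>(0b11111111, a, b);
+/// ```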
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdi_epi16&expand=5089)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_shrdi_epi16<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shrdi_epi16::<IMM8>(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
+    }
+}
+
+#[allow(improper_ctypes)]
+unsafe extern "C" {
+    #[link_name = "llvm.x86.avx512.mask.compress.store.w.512"]
+    fn vcompressstorew(mem: *mut i8, data: i16x32, mask: u32);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.w.256"]
+    fn vcompressstorew256(mem: *mut i8, data: i16x16, mask: u16);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.w.128"]
+    fn vcompressstorew128(mem: *mut i8, data: i16x8, mask: u8);
+
+    #[link_name = "llvm.x86.avx512.mask.compress.store.b.512"]
+    fn vcompressstoreb(mem: *mut i8, data: i8x64, mask: u64);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.b.256"]
+    fn vcompressstoreb256(mem: *mut i8, data: i8x32, mask: u32);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.b.128"]
+    fn vcompressstoreb128(mem: *mut i8, data: i8x16, mask: u16);
+
+    #[link_name = "llvm.x86.avx512.mask.compress.w.512"]
+    fn vpcompressw(a: i16x32, src: i16x32, mask: u32) -> i16x32;
+    #[link_name = "llvm.x86.avx512.mask.compress.w.256"]
+    fn vpcompressw256(a: i16x16, src: i16x16, mask: u16) -> i16x16;
+    #[link_name = "llvm.x86.avx512.mask.compress.w.128"]
+    fn vpcompressw128(a: i16x8, src: i16x8, mask: u8) -> i16x8;
+
+    #[link_name = "llvm.x86.avx512.mask.compress.b.512"]
+    fn vpcompressb(a: i8x64, src: i8x64, mask: u64) -> i8x64;
+    #[link_name = "llvm.x86.avx512.mask.compress.b.256"]
+    fn vpcompressb256(a: i8x32, src: i8x32, mask: u32) -> i8x32;
+    #[link_name = "llvm.x86.avx512.mask.compress.b.128"]
+    fn vpcompressb128(a: i8x16, src: i8x16, mask: u16) -> i8x16;
+
+    #[link_name = "llvm.x86.avx512.mask.expand.w.512"]
+    fn vpexpandw(a: i16x32, src: i16x32, mask: u32) -> i16x32;
+    #[link_name = "llvm.x86.avx512.mask.expand.w.256"]
+    fn vpexpandw256(a: i16x16, src: i16x16, mask: u16) -> i16x16;
+    #[link_name = "llvm.x86.avx512.mask.expand.w.128"]
+    fn vpexpandw128(a: i16x8, src: i16x8, mask: u8) -> i16x8;
+
+    #[link_name = "llvm.x86.avx512.mask.expand.b.512"]
+    fn vpexpandb(a: i8x64, src: i8x64, mask: u64) -> i8x64;
+    #[link_name = "llvm.x86.avx512.mask.expand.b.256"]
+    fn vpexpandb256(a: i8x32, src: i8x32, mask: u32) -> i8x32;
+    #[link_name = "llvm.x86.avx512.mask.expand.b.128"]
+    fn vpexpandb128(a: i8x16, src: i8x16, mask: u16) -> i8x16;
+
+    #[link_name = "llvm.fshl.v8i64"]
+    fn vpshldvq(a: i64x8, b: i64x8, c: i64x8) -> i64x8;
+    #[link_name = "llvm.fshl.v4i64"]
+    fn vpshldvq256(a: i64x4, b: i64x4, c: i64x4) -> i64x4;
+    #[link_name = "llvm.fshl.v2i64"]
+    fn vpshldvq128(a: i64x2, b: i64x2, c: i64x2) -> i64x2;
+    #[link_name = "llvm.fshl.v16i32"]
+    fn vpshldvd(a: i32x16, b: i32x16, c: i32x16) -> i32x16;
+    #[link_name = "llvm.fshl.v8i32"]
+    fn vpshldvd256(a: i32x8, b: i32x8, c: i32x8) -> i32x8;
+    #[link_name = "llvm.fshl.v4i32"]
+    fn vpshldvd128(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
+    #[link_name = "llvm.fshl.v32i16"]
+    fn vpshldvw(a: i16x32, b: i16x32, c: i16x32) -> i16x32;
+    #[link_name = "llvm.fshl.v16i16"]
+    fn vpshldvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16;
+
#[link_name = "llvm.fshl.v8i16"] + fn vpshldvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8; + + #[link_name = "llvm.fshr.v8i64"] + fn vpshrdvq(a: i64x8, b: i64x8, c: i64x8) -> i64x8; + #[link_name = "llvm.fshr.v4i64"] + fn vpshrdvq256(a: i64x4, b: i64x4, c: i64x4) -> i64x4; + #[link_name = "llvm.fshr.v2i64"] + fn vpshrdvq128(a: i64x2, b: i64x2, c: i64x2) -> i64x2; + #[link_name = "llvm.fshr.v16i32"] + fn vpshrdvd(a: i32x16, b: i32x16, c: i32x16) -> i32x16; + #[link_name = "llvm.fshr.v8i32"] + fn vpshrdvd256(a: i32x8, b: i32x8, c: i32x8) -> i32x8; + #[link_name = "llvm.fshr.v4i32"] + fn vpshrdvd128(a: i32x4, b: i32x4, c: i32x4) -> i32x4; + #[link_name = "llvm.fshr.v32i16"] + fn vpshrdvw(a: i16x32, b: i16x32, c: i16x32) -> i16x32; + #[link_name = "llvm.fshr.v16i16"] + fn vpshrdvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16; + #[link_name = "llvm.fshr.v8i16"] + fn vpshrdvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8; + + #[link_name = "llvm.x86.avx512.mask.expand.load.b.128"] + fn expandloadb_128(mem_addr: *const i8, a: i8x16, mask: u16) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.expand.load.w.128"] + fn expandloadw_128(mem_addr: *const i16, a: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.expand.load.b.256"] + fn expandloadb_256(mem_addr: *const i8, a: i8x32, mask: u32) -> i8x32; + #[link_name = "llvm.x86.avx512.mask.expand.load.w.256"] + fn expandloadw_256(mem_addr: *const i16, a: i16x16, mask: u16) -> i16x16; + #[link_name = "llvm.x86.avx512.mask.expand.load.b.512"] + fn expandloadb_512(mem_addr: *const i8, a: i8x64, mask: u64) -> i8x64; + #[link_name = "llvm.x86.avx512.mask.expand.load.w.512"] + fn expandloadw_512(mem_addr: *const i16, a: i16x32, mask: u32) -> i16x32; +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + use crate::hint::black_box; + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_compress_epi16() { + let src = _mm512_set1_epi16(200); + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_mask_compress_epi16(src, 0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_compress_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_maskz_compress_epi16(0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_compress_epi16() { + let src = _mm256_set1_epi16(200); + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_mask_compress_epi16(src, 0b01010101_01010101, a); + let e = _mm256_set_epi16( + 200, 200, 200, 200, 200, 200, 200, 200, 1, 3, 5, 7, 9, 11, 13, 15, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_compress_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, 15); + let r = _mm256_maskz_compress_epi16(0b01010101_01010101, a); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_compress_epi16() { + let src = _mm_set1_epi16(200); + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_mask_compress_epi16(src, 0b01010101, a); + let e = _mm_set_epi16(200, 200, 200, 200, 1, 3, 5, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_compress_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_compress_epi16(0b01010101, a); + let e = _mm_set_epi16(0, 0, 0, 0, 1, 3, 5, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_compress_epi8() { + let src = _mm512_set1_epi8(100); + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_mask_compress_epi8( + src, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_compress_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_maskz_compress_epi8( + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_compress_epi8() { + let src = _mm256_set1_epi8(100); + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_mask_compress_epi8(src, 0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_compress_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_maskz_compress_epi8(0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_compress_epi8() { + let src = _mm_set1_epi8(100); + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_mask_compress_epi8(src, 0b01010101_01010101, a); + let e = _mm_set_epi8( + 100, 100, 100, 100, 100, 100, 100, 100, 1, 3, 5, 7, 9, 11, 13, 15, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_compress_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_compress_epi8(0b01010101_01010101, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_expand_epi16() { + let src = _mm512_set1_epi16(200); + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_mask_expand_epi16(src, 0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 200, 16, 200, 17, 200, 18, 200, 19, 200, 20, 200, 21, 200, 22, 200, 23, + 200, 24, 200, 25, 200, 26, 200, 27, 200, 28, 200, 29, 200, 30, 200, 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_expand_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_maskz_expand_epi16(0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, + 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_expand_epi16() { + let src = _mm256_set1_epi16(200); + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_mask_expand_epi16(src, 0b01010101_01010101, a); + let e = _mm256_set_epi16( + 200, 8, 200, 9, 200, 10, 200, 11, 200, 12, 200, 13, 200, 14, 200, 15, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_expand_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_expand_epi16(0b01010101_01010101, a); + let e = _mm256_set_epi16(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_expand_epi16() { + let src = _mm_set1_epi16(200); + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_mask_expand_epi16(src, 0b01010101, a); + let e = _mm_set_epi16(200, 4, 200, 5, 200, 6, 200, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_expand_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_expand_epi16(0b01010101, a); + let e = _mm_set_epi16(0, 4, 0, 5, 0, 6, 0, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_expand_epi8() { + let src = _mm512_set1_epi8(100); + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 
25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_mask_expand_epi8( + src, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 100, 32, 100, 33, 100, 34, 100, 35, 100, 36, 100, 37, 100, 38, 100, 39, + 100, 40, 100, 41, 100, 42, 100, 43, 100, 44, 100, 45, 100, 46, 100, 47, + 100, 48, 100, 49, 100, 50, 100, 51, 100, 52, 100, 53, 100, 54, 100, 55, + 100, 56, 100, 57, 100, 58, 100, 59, 100, 60, 100, 61, 100, 62, 100, 63, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_expand_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_maskz_expand_epi8( + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 32, 0, 33, 0, 34, 0, 35, 0, 36, 0, 37, 0, 38, 0, 39, + 0, 40, 0, 41, 0, 42, 0, 43, 0, 44, 0, 45, 0, 46, 0, 47, + 0, 48, 0, 49, 0, 50, 0, 51, 0, 52, 0, 53, 0, 54, 0, 55, + 0, 56, 0, 57, 0, 58, 0, 59, 0, 60, 0, 61, 0, 62, 0, 63, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_expand_epi8() { + let src = _mm256_set1_epi8(100); + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_mask_expand_epi8(src, 0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 100, 16, 100, 17, 100, 18, 100, 19, 100, 20, 100, 21, 100, 22, 100, 23, + 100, 24, 100, 25, 100, 26, 100, 27, 100, 28, 100, 29, 100, 30, 100, 31, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_expand_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_maskz_expand_epi8(0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, + 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_expand_epi8() { + let src = _mm_set1_epi8(100); + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_mask_expand_epi8(src, 0b01010101_01010101, a); + let e = _mm_set_epi8( + 100, 8, 100, 9, 100, 10, 100, 11, 100, 12, 100, 13, 100, 14, 100, 15, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_expand_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_expand_epi8(0b01010101_01010101, a); + let e = _mm_set_epi8(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldv_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1 << 63); + let c = _mm512_set1_epi64(2); + let r = 
_mm512_shldv_epi64(a, b, c); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldv_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1 << 63); + let c = _mm512_set1_epi64(2); + let r = _mm512_mask_shldv_epi64(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldv_epi64(a, 0b11111111, b, c); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldv_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1 << 63); + let c = _mm512_set1_epi64(2); + let r = _mm512_maskz_shldv_epi64(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldv_epi64(0b11111111, a, b, c); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldv_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1 << 63); + let c = _mm256_set1_epi64x(2); + let r = _mm256_shldv_epi64(a, b, c); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldv_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1 << 63); + let c = _mm256_set1_epi64x(2); + let r = _mm256_mask_shldv_epi64(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldv_epi64(a, 0b00001111, b, c); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldv_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1 << 63); + let c = _mm256_set1_epi64x(2); + let r = _mm256_maskz_shldv_epi64(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldv_epi64(0b00001111, a, b, c); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldv_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1 << 63); + let c = _mm_set1_epi64x(2); + let r = _mm_shldv_epi64(a, b, c); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldv_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1 << 63); + let c = _mm_set1_epi64x(2); + let r = _mm_mask_shldv_epi64(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shldv_epi64(a, 0b00000011, b, c); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldv_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1 << 63); + let c = _mm_set1_epi64x(2); + let r = _mm_maskz_shldv_epi64(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldv_epi64(0b00000011, a, b, c); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldv_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1 << 31); + let c = _mm512_set1_epi32(2); + let r = _mm512_shldv_epi32(a, b, c); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldv_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1 << 31); + let c = _mm512_set1_epi32(2); + let r = _mm512_mask_shldv_epi32(a, 0, b, c); + assert_eq_m512i(r, a); + let r = 
_mm512_mask_shldv_epi32(a, 0b11111111_11111111, b, c); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldv_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1 << 31); + let c = _mm512_set1_epi32(2); + let r = _mm512_maskz_shldv_epi32(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldv_epi32(0b11111111_11111111, a, b, c); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldv_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1 << 31); + let c = _mm256_set1_epi32(2); + let r = _mm256_shldv_epi32(a, b, c); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldv_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1 << 31); + let c = _mm256_set1_epi32(2); + let r = _mm256_mask_shldv_epi32(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldv_epi32(a, 0b11111111, b, c); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldv_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1 << 31); + let c = _mm256_set1_epi32(2); + let r = _mm256_maskz_shldv_epi32(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldv_epi32(0b11111111, a, b, c); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldv_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1 << 31); + let c = _mm_set1_epi32(2); + let r = _mm_shldv_epi32(a, b, c); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldv_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1 << 31); + let c = _mm_set1_epi32(2); + let r = _mm_mask_shldv_epi32(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shldv_epi32(a, 0b00001111, b, c); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldv_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1 << 31); + let c = _mm_set1_epi32(2); + let r = _mm_maskz_shldv_epi32(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldv_epi32(0b00001111, a, b, c); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldv_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1 << 15); + let c = _mm512_set1_epi16(2); + let r = _mm512_shldv_epi16(a, b, c); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldv_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1 << 15); + let c = _mm512_set1_epi16(2); + let r = _mm512_mask_shldv_epi16(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldv_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1 << 15); + let c = _mm512_set1_epi16(2); + let r = _mm512_maskz_shldv_epi16(0, a, b, c); + 
assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldv_epi16(0b11111111_11111111_11111111_11111111, a, b, c); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldv_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1 << 15); + let c = _mm256_set1_epi16(2); + let r = _mm256_shldv_epi16(a, b, c); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldv_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1 << 15); + let c = _mm256_set1_epi16(2); + let r = _mm256_mask_shldv_epi16(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldv_epi16(a, 0b11111111_11111111, b, c); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldv_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1 << 15); + let c = _mm256_set1_epi16(2); + let r = _mm256_maskz_shldv_epi16(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldv_epi16(0b11111111_11111111, a, b, c); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldv_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1 << 15); + let c = _mm_set1_epi16(2); + let r = _mm_shldv_epi16(a, b, c); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldv_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1 << 15); + let c = _mm_set1_epi16(2); + let r = _mm_mask_shldv_epi16(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shldv_epi16(a, 0b11111111, b, c); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldv_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1 << 15); + let c = _mm_set1_epi16(2); + let r = _mm_maskz_shldv_epi16(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldv_epi16(0b11111111, a, b, c); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdv_epi64() { + let a = _mm512_set1_epi64(2); + let b = _mm512_set1_epi64(8); + let c = _mm512_set1_epi64(1); + let r = _mm512_shrdv_epi64(a, b, c); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdv_epi64() { + let a = _mm512_set1_epi64(2); + let b = _mm512_set1_epi64(8); + let c = _mm512_set1_epi64(1); + let r = _mm512_mask_shrdv_epi64(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdv_epi64(a, 0b11111111, b, c); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdv_epi64() { + let a = _mm512_set1_epi64(2); + let b = _mm512_set1_epi64(8); + let c = _mm512_set1_epi64(1); + let r = _mm512_maskz_shrdv_epi64(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdv_epi64(0b11111111, a, b, c); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdv_epi64() { + let a = _mm256_set1_epi64x(2); + let b = _mm256_set1_epi64x(8); + let c = _mm256_set1_epi64x(1); + let r = 
_mm256_shrdv_epi64(a, b, c); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdv_epi64() { + let a = _mm256_set1_epi64x(2); + let b = _mm256_set1_epi64x(8); + let c = _mm256_set1_epi64x(1); + let r = _mm256_mask_shrdv_epi64(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdv_epi64(a, 0b00001111, b, c); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdv_epi64() { + let a = _mm256_set1_epi64x(2); + let b = _mm256_set1_epi64x(8); + let c = _mm256_set1_epi64x(1); + let r = _mm256_maskz_shrdv_epi64(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdv_epi64(0b00001111, a, b, c); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdv_epi64() { + let a = _mm_set1_epi64x(2); + let b = _mm_set1_epi64x(8); + let c = _mm_set1_epi64x(1); + let r = _mm_shrdv_epi64(a, b, c); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdv_epi64() { + let a = _mm_set1_epi64x(2); + let b = _mm_set1_epi64x(8); + let c = _mm_set1_epi64x(1); + let r = _mm_mask_shrdv_epi64(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdv_epi64(a, 0b00000011, b, c); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdv_epi64() { + let a = _mm_set1_epi64x(2); + let b = _mm_set1_epi64x(8); + let c = _mm_set1_epi64x(1); + let r = _mm_maskz_shrdv_epi64(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdv_epi64(0b00000011, a, b, c); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdv_epi32() { + let a = _mm512_set1_epi32(2); + let b = _mm512_set1_epi32(8); + let c = _mm512_set1_epi32(1); + let r = _mm512_shrdv_epi32(a, b, c); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdv_epi32() { + let a = _mm512_set1_epi32(2); + let b = _mm512_set1_epi32(8); + let c = _mm512_set1_epi32(1); + let r = _mm512_mask_shrdv_epi32(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdv_epi32(a, 0b11111111_11111111, b, c); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdv_epi32() { + let a = _mm512_set1_epi32(2); + let b = _mm512_set1_epi32(8); + let c = _mm512_set1_epi32(1); + let r = _mm512_maskz_shrdv_epi32(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdv_epi32(0b11111111_11111111, a, b, c); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdv_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(8); + let c = _mm256_set1_epi32(1); + let r = _mm256_shrdv_epi32(a, b, c); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdv_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(8); + let c = _mm256_set1_epi32(1); + let r = _mm256_mask_shrdv_epi32(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdv_epi32(a, 0b11111111, b, 
c); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdv_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(8); + let c = _mm256_set1_epi32(1); + let r = _mm256_maskz_shrdv_epi32(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdv_epi32(0b11111111, a, b, c); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdv_epi32() { + let a = _mm_set1_epi32(2); + let b = _mm_set1_epi32(8); + let c = _mm_set1_epi32(1); + let r = _mm_shrdv_epi32(a, b, c); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdv_epi32() { + let a = _mm_set1_epi32(2); + let b = _mm_set1_epi32(8); + let c = _mm_set1_epi32(1); + let r = _mm_mask_shrdv_epi32(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdv_epi32(a, 0b00001111, b, c); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdv_epi32() { + let a = _mm_set1_epi32(2); + let b = _mm_set1_epi32(8); + let c = _mm_set1_epi32(1); + let r = _mm_maskz_shrdv_epi32(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdv_epi32(0b00001111, a, b, c); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdv_epi16() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(8); + let c = _mm512_set1_epi16(1); + let r = _mm512_shrdv_epi16(a, b, c); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdv_epi16() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(8); + let c = _mm512_set1_epi16(1); + let r = _mm512_mask_shrdv_epi16(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdv_epi16() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(8); + let c = _mm512_set1_epi16(1); + let r = _mm512_maskz_shrdv_epi16(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdv_epi16(0b11111111_11111111_11111111_11111111, a, b, c); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdv_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(8); + let c = _mm256_set1_epi16(1); + let r = _mm256_shrdv_epi16(a, b, c); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdv_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(8); + let c = _mm256_set1_epi16(1); + let r = _mm256_mask_shrdv_epi16(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdv_epi16(a, 0b11111111_11111111, b, c); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdv_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(8); + let c = _mm256_set1_epi16(1); + let r = _mm256_maskz_shrdv_epi16(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = 
_mm256_maskz_shrdv_epi16(0b11111111_11111111, a, b, c); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdv_epi16() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(8); + let c = _mm_set1_epi16(1); + let r = _mm_shrdv_epi16(a, b, c); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdv_epi16() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(8); + let c = _mm_set1_epi16(1); + let r = _mm_mask_shrdv_epi16(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdv_epi16(a, 0b11111111, b, c); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdv_epi16() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(8); + let c = _mm_set1_epi16(1); + let r = _mm_maskz_shrdv_epi16(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdv_epi16(0b11111111, a, b, c); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldi_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1 << 63); + let r = _mm512_shldi_epi64::<2>(a, b); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldi_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1 << 63); + let r = _mm512_mask_shldi_epi64::<2>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldi_epi64::<2>(a, 0b11111111, a, b); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldi_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1 << 63); + let r = _mm512_maskz_shldi_epi64::<2>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldi_epi64::<2>(0b11111111, a, b); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldi_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1 << 63); + let r = _mm256_shldi_epi64::<2>(a, b); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldi_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1 << 63); + let r = _mm256_mask_shldi_epi64::<2>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldi_epi64::<2>(a, 0b00001111, a, b); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldi_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1 << 63); + let r = _mm256_maskz_shldi_epi64::<2>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldi_epi64::<2>(0b00001111, a, b); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldi_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1 << 63); + let r = _mm_shldi_epi64::<2>(a, b); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldi_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1 << 63); + let r = 
_mm_mask_shldi_epi64::<2>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_shldi_epi64::<2>(a, 0b00000011, a, b); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldi_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1 << 63); + let r = _mm_maskz_shldi_epi64::<2>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldi_epi64::<2>(0b00000011, a, b); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldi_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1 << 31); + let r = _mm512_shldi_epi32::<2>(a, b); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldi_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1 << 31); + let r = _mm512_mask_shldi_epi32::<2>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldi_epi32::<2>(a, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldi_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1 << 31); + let r = _mm512_maskz_shldi_epi32::<2>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldi_epi32::<2>(0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldi_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1 << 31); + let r = _mm256_shldi_epi32::<2>(a, b); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldi_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1 << 31); + let r = _mm256_mask_shldi_epi32::<2>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldi_epi32::<2>(a, 0b11111111, a, b); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldi_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1 << 31); + let r = _mm256_maskz_shldi_epi32::<2>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldi_epi32::<2>(0b11111111, a, b); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldi_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1 << 31); + let r = _mm_shldi_epi32::<2>(a, b); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldi_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1 << 31); + let r = _mm_mask_shldi_epi32::<2>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_shldi_epi32::<2>(a, 0b00001111, a, b); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldi_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1 << 31); + let r = _mm_maskz_shldi_epi32::<2>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldi_epi32::<2>(0b00001111, a, b); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn 
test_mm512_shldi_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1 << 15); + let r = _mm512_shldi_epi16::<2>(a, b); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldi_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1 << 15); + let r = _mm512_mask_shldi_epi16::<2>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldi_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a, b); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldi_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1 << 15); + let r = _mm512_maskz_shldi_epi16::<2>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldi_epi16::<2>(0b11111111_11111111_11111111_11111111, a, b); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldi_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1 << 15); + let r = _mm256_shldi_epi16::<2>(a, b); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldi_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1 << 15); + let r = _mm256_mask_shldi_epi16::<2>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldi_epi16::<2>(a, 0b11111111_11111111, a, b); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldi_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1 << 15); + let r = _mm256_maskz_shldi_epi16::<2>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldi_epi16::<2>(0b11111111_11111111, a, b); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldi_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1 << 15); + let r = _mm_shldi_epi16::<2>(a, b); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldi_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1 << 15); + let r = _mm_mask_shldi_epi16::<2>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_shldi_epi16::<2>(a, 0b11111111, a, b); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldi_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1 << 15); + let r = _mm_maskz_shldi_epi16::<2>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldi_epi16::<2>(0b11111111, a, b); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdi_epi64() { + let a = _mm512_set1_epi64(2); + let b = _mm512_set1_epi64(8); + let r = _mm512_shrdi_epi64::<1>(a, b); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdi_epi64() { + let a = _mm512_set1_epi64(2); + let b = _mm512_set1_epi64(8); + let r = _mm512_mask_shrdi_epi64::<1>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdi_epi64::<1>(a, 0b11111111, a, b); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + 
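    // Illustrative scalar sketch of the shld/shrd semantics exercised by the
    // surrounding tests (plain u64 arithmetic only, no intrinsics; the test name
    // below is purely illustrative). shldv/shldi model "concatenate and shift
    // left": (a:b) is a 2*w-bit value with `a` in the high half, shifted left by
    // the count, keeping the high w bits. shrdv/shrdi shift (b:a) right and keep
    // the low w bits. The constants 6 and 1 asserted throughout these tests
    // follow directly from that model.
    #[test]
    fn scalar_model_of_shld_shrd() {
        // shld(a = 1, b = 1 << 63, count = 2) = (a << 2) | (b >> 62) = 4 | 2 = 6
        let (a, b, count) = (1u64, 1u64 << 63, 2u32);
        assert_eq!((a << count) | (b >> (64 - count)), 6);

        // shrd(a = 2, b = 8, count = 1) = (a >> 1) | (b << 63) = 1 | 0 = 1
        let (a, b, count) = (2u64, 8u64, 1u32);
        assert_eq!((a >> count) | (b << (64 - count)), 1);
    }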
#[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdi_epi64() { + let a = _mm512_set1_epi64(2); + let b = _mm512_set1_epi64(8); + let r = _mm512_maskz_shrdi_epi64::<1>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdi_epi64::<1>(0b11111111, a, b); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdi_epi64() { + let a = _mm256_set1_epi64x(2); + let b = _mm256_set1_epi64x(8); + let r = _mm256_shrdi_epi64::<1>(a, b); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdi_epi64() { + let a = _mm256_set1_epi64x(2); + let b = _mm256_set1_epi64x(8); + let r = _mm256_mask_shrdi_epi64::<1>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdi_epi64::<1>(a, 0b00001111, a, b); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdi_epi64() { + let a = _mm256_set1_epi64x(2); + let b = _mm256_set1_epi64x(8); + let r = _mm256_maskz_shrdi_epi64::<1>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdi_epi64::<1>(0b00001111, a, b); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdi_epi64() { + let a = _mm_set1_epi64x(2); + let b = _mm_set1_epi64x(8); + let r = _mm_shrdi_epi64::<1>(a, b); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdi_epi64() { + let a = _mm_set1_epi64x(2); + let b = _mm_set1_epi64x(8); + let r = _mm_mask_shrdi_epi64::<1>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdi_epi64::<1>(a, 0b00000011, a, b); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdi_epi64() { + let a = _mm_set1_epi64x(2); + let b = _mm_set1_epi64x(8); + let r = _mm_maskz_shrdi_epi64::<1>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdi_epi64::<1>(0b00000011, a, b); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdi_epi32() { + let a = _mm512_set1_epi32(2); + let b = _mm512_set1_epi32(8); + let r = _mm512_shrdi_epi32::<1>(a, b); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdi_epi32() { + let a = _mm512_set1_epi32(2); + let b = _mm512_set1_epi32(8); + let r = _mm512_mask_shrdi_epi32::<1>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdi_epi32::<1>(a, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdi_epi32() { + let a = _mm512_set1_epi32(2); + let b = _mm512_set1_epi32(8); + let r = _mm512_maskz_shrdi_epi32::<1>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdi_epi32::<1>(0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdi_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(8); + let r = _mm256_shrdi_epi32::<1>(a, b); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + 
#[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdi_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(8); + let r = _mm256_mask_shrdi_epi32::<1>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdi_epi32::<1>(a, 0b11111111, a, b); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdi_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(8); + let r = _mm256_maskz_shrdi_epi32::<1>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdi_epi32::<1>(0b11111111, a, b); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdi_epi32() { + let a = _mm_set1_epi32(2); + let b = _mm_set1_epi32(8); + let r = _mm_shrdi_epi32::<1>(a, b); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdi_epi32() { + let a = _mm_set1_epi32(2); + let b = _mm_set1_epi32(8); + let r = _mm_mask_shrdi_epi32::<1>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdi_epi32::<1>(a, 0b00001111, a, b); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdi_epi32() { + let a = _mm_set1_epi32(2); + let b = _mm_set1_epi32(8); + let r = _mm_maskz_shrdi_epi32::<1>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdi_epi32::<1>(0b00001111, a, b); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdi_epi16() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(8); + let r = _mm512_shrdi_epi16::<1>(a, b); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdi_epi16() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(8); + let r = _mm512_mask_shrdi_epi16::<1>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdi_epi16::<1>(a, 0b11111111_11111111_11111111_11111111, a, b); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdi_epi16() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(8); + let r = _mm512_maskz_shrdi_epi16::<1>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdi_epi16::<1>(0b11111111_11111111_11111111_11111111, a, b); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdi_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(8); + let r = _mm256_shrdi_epi16::<1>(a, b); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdi_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(8); + let r = _mm256_mask_shrdi_epi16::<1>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdi_epi16::<1>(a, 0b11111111_11111111, a, b); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdi_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(8); + let r = _mm256_maskz_shrdi_epi16::<1>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + 
let r = _mm256_maskz_shrdi_epi16::<1>(0b11111111_11111111, a, b); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdi_epi16() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(8); + let r = _mm_shrdi_epi16::<1>(a, b); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdi_epi16() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(8); + let r = _mm_mask_shrdi_epi16::<1>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdi_epi16::<1>(a, 0b11111111, a, b); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdi_epi16() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(8); + let r = _mm_maskz_shrdi_epi16::<1>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdi_epi16::<1>(0b11111111, a, b); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_expandloadu_epi16() { + let src = _mm512_set1_epi16(42); + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010_11110000_00001111; + let r = _mm512_mask_expandloadu_epi16(src, m, black_box(p)); + let e = _mm512_set_epi16( + 16, 15, 14, 42, 13, 42, 42, 42, 12, 11, 42, 42, 10, 42, 9, 42, 8, 7, 6, 5, 42, 42, 42, + 42, 42, 42, 42, 42, 4, 3, 2, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_expandloadu_epi16() { + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010_11110000_00001111; + let r = _mm512_maskz_expandloadu_epi16(m, black_box(p)); + let e = _mm512_set_epi16( + 16, 15, 14, 0, 13, 0, 0, 0, 12, 11, 0, 0, 10, 0, 9, 0, 8, 7, 6, 5, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 3, 2, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_expandloadu_epi16() { + let src = _mm256_set1_epi16(42); + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm256_mask_expandloadu_epi16(src, m, black_box(p)); + let e = _mm256_set_epi16(8, 7, 6, 42, 5, 42, 42, 42, 4, 3, 42, 42, 2, 42, 1, 42); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_expandloadu_epi16() { + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm256_maskz_expandloadu_epi16(m, black_box(p)); + let e = _mm256_set_epi16(8, 7, 6, 0, 5, 0, 0, 0, 4, 3, 0, 0, 2, 0, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_expandloadu_epi16() { + let src = _mm_set1_epi16(42); + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm_mask_expandloadu_epi16(src, m, black_box(p)); + let e = _mm_set_epi16(4, 3, 2, 42, 1, 42, 42, 42); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_expandloadu_epi16() { + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11101000; + 
let r = _mm_maskz_expandloadu_epi16(m, black_box(p)); + let e = _mm_set_epi16(4, 3, 2, 0, 1, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_expandloadu_epi8() { + let src = _mm512_set1_epi8(42); + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010_11110000_00001111_11111111_00000000_10101010_01010101; + let r = _mm512_mask_expandloadu_epi8(src, m, black_box(p)); + let e = _mm512_set_epi8( + 32, 31, 30, 42, 29, 42, 42, 42, 28, 27, 42, 42, 26, 42, 25, 42, 24, 23, 22, 21, 42, 42, + 42, 42, 42, 42, 42, 42, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 42, 42, 42, 42, + 42, 42, 42, 42, 8, 42, 7, 42, 6, 42, 5, 42, 42, 4, 42, 3, 42, 2, 42, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_expandloadu_epi8() { + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010_11110000_00001111_11111111_00000000_10101010_01010101; + let r = _mm512_maskz_expandloadu_epi8(m, black_box(p)); + let e = _mm512_set_epi8( + 32, 31, 30, 0, 29, 0, 0, 0, 28, 27, 0, 0, 26, 0, 25, 0, 24, 23, 22, 21, 0, 0, 0, 0, 0, + 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, + 7, 0, 6, 0, 5, 0, 0, 4, 0, 3, 0, 2, 0, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_expandloadu_epi8() { + let src = _mm256_set1_epi8(42); + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010_11110000_00001111; + let r = _mm256_mask_expandloadu_epi8(src, m, black_box(p)); + let e = _mm256_set_epi8( + 16, 15, 14, 42, 13, 42, 42, 42, 12, 11, 42, 42, 10, 42, 9, 42, 8, 7, 6, 5, 42, 42, 42, + 42, 42, 42, 42, 42, 4, 3, 2, 1, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_expandloadu_epi8() { + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010_11110000_00001111; + let r = _mm256_maskz_expandloadu_epi8(m, black_box(p)); + let e = _mm256_set_epi8( + 16, 15, 14, 0, 13, 0, 0, 0, 12, 11, 0, 0, 10, 0, 9, 0, 8, 7, 6, 5, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 3, 2, 1, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_expandloadu_epi8() { + let src = _mm_set1_epi8(42); + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm_mask_expandloadu_epi8(src, m, black_box(p)); + let e = _mm_set_epi8(8, 7, 6, 42, 5, 42, 42, 42, 4, 3, 42, 42, 2, 42, 1, 42); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_expandloadu_epi8() { + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p 
= a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm_maskz_expandloadu_epi8(m, black_box(p)); + let e = _mm_set_epi8(8, 7, 6, 0, 5, 0, 0, 0, 4, 3, 0, 0, 2, 0, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_compressstoreu_epi16() { + let a = _mm512_set_epi16( + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + ); + let mut r = [0_i16; 32]; + _mm512_mask_compressstoreu_epi16(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i16; 32]); + _mm512_mask_compressstoreu_epi16(r.as_mut_ptr(), 0b11110000_11001010_11111111_00000000, a); + assert_eq!( + &r, + &[ + 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 23, 24, 29, 30, 31, 32, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0 + ] + ); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_epi16() { + let a = _mm256_set_epi16(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + let mut r = [0_i16; 16]; + _mm256_mask_compressstoreu_epi16(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i16; 16]); + _mm256_mask_compressstoreu_epi16(r.as_mut_ptr(), 0b11110000_11001010, a); + assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_epi16() { + let a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + let mut r = [0_i16; 8]; + _mm_mask_compressstoreu_epi16(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i16; 8]); + _mm_mask_compressstoreu_epi16(r.as_mut_ptr(), 0b11110000, a); + assert_eq!(&r, &[5, 6, 7, 8, 0, 0, 0, 0]); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_compressstoreu_epi8() { + let a = _mm512_set_epi8( + 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, + 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, + 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + ); + let mut r = [0_i8; 64]; + _mm512_mask_compressstoreu_epi8(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i8; 64]); + _mm512_mask_compressstoreu_epi8( + r.as_mut_ptr(), + 0b11110000_11001010_11111111_00000000_10101010_01010101_11110000_00001111, + a, + ); + assert_eq!( + &r, + &[ + 1, 2, 3, 4, 13, 14, 15, 16, 17, 19, 21, 23, 26, 28, 30, 32, 41, 42, 43, 44, 45, 46, + 47, 48, 50, 52, 55, 56, 61, 62, 63, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ] + ); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_epi8() { + let a = _mm256_set_epi8( + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + ); + let mut r = [0_i8; 32]; + _mm256_mask_compressstoreu_epi8(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i8; 32]); + _mm256_mask_compressstoreu_epi8(r.as_mut_ptr(), 0b11110000_11001010_11111111_00000000, a); + assert_eq!( + &r, + &[ + 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 23, 24, 29, 30, 31, 32, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0 + ] + ); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_epi8() { + let a = _mm_set_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + let mut r = [0_i8; 16]; + _mm_mask_compressstoreu_epi8(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i8; 16]); + _mm_mask_compressstoreu_epi8(r.as_mut_ptr(), 0b11110000_11001010, a); + assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 
15, 16, 0, 0, 0, 0, 0, 0, 0, 0]); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs b/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs new file mode 100644 index 0000000000000..93ea01cbb45b3 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs @@ -0,0 +1,1699 @@ +use crate::core_arch::{simd::*, x86::*}; +use crate::intrinsics::simd::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpwssd_epi32&expand=2219) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpdpwssd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpwssd_epi32&expand=2220) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm512_mask_dpwssd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpwssd_epi32&expand=2221) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm512_maskz_dpwssd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. 
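// A scalar sketch of the vpdpwssd lane computation described in these doc comments
// (illustrative only; `a16` and `b16` are hypothetical names for the signed 16-bit
// halves of each 32-bit lane, and the non-saturating forms accumulate with ordinary
// wrapping i32 arithmetic):
//
//     dst[i] = src[i]
//            + (a16[2 * i] as i32) * (b16[2 * i] as i32)
//            + (a16[2 * i + 1] as i32) * (b16[2 * i + 1] as i32)
//
// e.g. with src = 0 and every 16-bit half of a and b equal to 1, each lane of dst
// becomes 1 * 1 + 1 * 1 = 2.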
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_avx_epi32&expand=2713) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_epi32&expand=2216) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpwssd_epi32&expand=2217) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm256_mask_dpwssd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpwssd_epi32&expand=2218) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm256_maskz_dpwssd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_avx_epi32&expand=2712) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_epi32&expand=2213) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpwssd_epi32&expand=2214) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpwssd_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpwssd_epi32&expand=2215) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpwssd_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpwssds_epi32&expand=2228) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpdpwssds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpwssds_epi32&expand=2229) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm512_mask_dpwssds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpwssds_epi32&expand=2230) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm512_maskz_dpwssds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_avx_epi32&expand=2726) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. 
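// The trailing `s` (vpdpwssds) only changes the final accumulation: the pair of
// products is added to src with signed saturation instead of wrapping. A scalar
// sketch, with `p0` and `p1` standing for the two intermediate i32 products
// (names are illustrative):
//
//     let sum = p0 as i64 + p1 as i64 + src[i] as i64;
//     dst[i] = sum.clamp(i32::MIN as i64, i32::MAX as i64) as i32;
//
// e.g. src[i] = i32::MAX with both products equal to 1 saturates to i32::MAX
// rather than wrapping around to i32::MIN + 1.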
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_epi32&expand=2225) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpwssds_epi32&expand=2226) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm256_mask_dpwssds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpwssds_epi32&expand=2227) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm256_maskz_dpwssds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_avx_epi32&expand=2725) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_epi32&expand=2222) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpwssds_epi32&expand=2223) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpwssds_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpwssds_epi32&expand=2224) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm_maskz_dpwssds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpwssds_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpbusd_epi32&expand=2201) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpdpbusd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpbusd_epi32&expand=2202) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm512_mask_dpbusd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpbusd_epi32&expand=2203) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm512_maskz_dpbusd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_avx_epi32&expand=2683) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_epi32&expand=2198) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpbusd_epi32&expand=2199) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm256_mask_dpbusd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpbusd_epi32&expand=2200) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm256_maskz_dpbusd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_avx_epi32&expand=2682) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_epi32&expand=2195) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpbusd_epi32&expand=2196) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpbusd_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpbusd_epi32&expand=2197) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpbusd_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpbusds_epi32&expand=2210) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpdpbusds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpbusds_epi32&expand=2211) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm512_mask_dpbusds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpbusds_epi32&expand=2212) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm512_maskz_dpbusds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_avx_epi32&expand=2696) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_epi32&expand=2207) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpbusds_epi32&expand=2208) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm256_mask_dpbusds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpbusds_epi32&expand=2209) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm256_maskz_dpbusds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_avx_epi32&expand=2695) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_epi32&expand=2204) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpbusds_epi32&expand=2205) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpbusds_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpbusds_epi32&expand=2206) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm_maskz_dpbusds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpbusds_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssd_epi32&expand=2674) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbssd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbssd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssd_epi32&expand=2675) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbssd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbssd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssds_epi32&expand=2676) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbssds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbssds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssds_epi32&expand=2677) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbssds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbssds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsud_epi32&expand=2678) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbsud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsud_epi32&expand=2679) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbsud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsuds_epi32&expand=2680) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbsuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsuds_epi32&expand=2681) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbsuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuud_epi32&expand=2708) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbuud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuud_epi32&expand=2709) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbuud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuuds_epi32&expand=2710) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbuuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuuds_epi32&expand=2711) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbuuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsud_epi32&expand=2738) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwsud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsud_epi32&expand=2739) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwsud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsuds_epi32&expand=2740) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwsuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsuds_epi32&expand=2741) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwsuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusd_epi32&expand=2742) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwusd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwusd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusd_epi32&expand=2743) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwusd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwusd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusds_epi32&expand=2744) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwusds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwusds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusds_epi32&expand=2745) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwusds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwusds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuud_epi32&expand=2746) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwuud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuud_epi32&expand=2747) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwuud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuuds_epi32&expand=2748) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwuuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuuds_epi32&expand=2749) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwuuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpwuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.vpdpwssd.512"] + fn vpdpwssd(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.vpdpwssd.256"] + fn vpdpwssd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.vpdpwssd.128"] + fn vpdpwssd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.vpdpwssds.512"] + fn vpdpwssds(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.vpdpwssds.256"] + fn vpdpwssds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.vpdpwssds.128"] + fn vpdpwssds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.vpdpbusd.512"] + fn vpdpbusd(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.vpdpbusd.256"] + fn vpdpbusd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.vpdpbusd.128"] + fn vpdpbusd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.vpdpbusds.512"] + fn vpdpbusds(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.vpdpbusds.256"] + fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.vpdpbusds.128"] + fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx2.vpdpbssd.128"] + fn vpdpbssd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbssd.256"] + fn vpdpbssd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbssds.128"] + fn vpdpbssds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbssds.256"] + fn vpdpbssds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbsud.128"] + fn vpdpbsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbsud.256"] + fn vpdpbsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbsuds.128"] + fn vpdpbsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbsuds.256"] + fn vpdpbsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbuud.128"] + fn vpdpbuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbuud.256"] + fn vpdpbuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbuuds.128"] + fn vpdpbuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbuuds.256"] + fn vpdpbuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwsud.128"] + fn vpdpwsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwsud.256"] + fn vpdpwsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwsuds.128"] + fn vpdpwsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwsuds.256"] + fn vpdpwsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + 
+ #[link_name = "llvm.x86.avx2.vpdpwusd.128"] + fn vpdpwusd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwusd.256"] + fn vpdpwusd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwusds.128"] + fn vpdpwusds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwusds.256"] + fn vpdpwusds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwuud.128"] + fn vpdpwuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwuud.256"] + fn vpdpwuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwuuds.128"] + fn vpdpwuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwuuds.256"] + fn vpdpwuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; +} + +#[cfg(test)] +mod tests { + + use crate::core_arch::x86::*; + use stdarch_test::simd_test; + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_dpwssd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 16 | 1 << 0); + let b = _mm512_set1_epi32(1 << 16 | 1 << 0); + let r = _mm512_dpwssd_epi32(src, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_mask_dpwssd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 16 | 1 << 0); + let b = _mm512_set1_epi32(1 << 16 | 1 << 0); + let r = _mm512_mask_dpwssd_epi32(src, 0b00000000_00000000, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_dpwssd_epi32(src, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_maskz_dpwssd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 16 | 1 << 0); + let b = _mm512_set1_epi32(1 << 16 | 1 << 0); + let r = _mm512_maskz_dpwssd_epi32(0b00000000_00000000, src, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_dpwssd_epi32(0b11111111_11111111, src, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpwssd_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwssd_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_dpwssd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwssd_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_mask_dpwssd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_mask_dpwssd_epi32(src, 0b00000000, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_dpwssd_epi32(src, 0b11111111, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_maskz_dpwssd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_maskz_dpwssd_epi32(0b00000000, src, a, b); + assert_eq_m256i(r, 
_mm256_setzero_si256()); + let r = _mm256_maskz_dpwssd_epi32(0b11111111, src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpwssd_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwssd_avx_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_dpwssd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwssd_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_mask_dpwssd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_mask_dpwssd_epi32(src, 0b00000000, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_dpwssd_epi32(src, 0b00001111, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_maskz_dpwssd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_maskz_dpwssd_epi32(0b00000000, src, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_dpwssd_epi32(0b00001111, src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_dpwssds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 16 | 1 << 0); + let b = _mm512_set1_epi32(1 << 16 | 1 << 0); + let r = _mm512_dpwssds_epi32(src, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_mask_dpwssds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 16 | 1 << 0); + let b = _mm512_set1_epi32(1 << 16 | 1 << 0); + let r = _mm512_mask_dpwssds_epi32(src, 0b00000000_00000000, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_dpwssds_epi32(src, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_maskz_dpwssds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 16 | 1 << 0); + let b = _mm512_set1_epi32(1 << 16 | 1 << 0); + let r = _mm512_maskz_dpwssds_epi32(0b00000000_00000000, src, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_dpwssds_epi32(0b11111111_11111111, src, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpwssds_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwssds_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_dpwssds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwssds_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_mask_dpwssds_epi32() { + let src = 
_mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_mask_dpwssds_epi32(src, 0b00000000, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_dpwssds_epi32(src, 0b11111111, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_maskz_dpwssds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_maskz_dpwssds_epi32(0b00000000, src, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_dpwssds_epi32(0b11111111, src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpwssds_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwssds_avx_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_dpwssds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwssds_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_mask_dpwssds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_mask_dpwssds_epi32(src, 0b00000000, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_dpwssds_epi32(src, 0b00001111, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_maskz_dpwssds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_maskz_dpwssds_epi32(0b00000000, src, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_dpwssds_epi32(0b00001111, src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_dpbusd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm512_dpbusd_epi32(src, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_mask_dpbusd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm512_mask_dpbusd_epi32(src, 0b00000000_00000000, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_dpbusd_epi32(src, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_maskz_dpbusd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm512_maskz_dpbusd_epi32(0b00000000_00000000, src, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_dpbusd_epi32(0b11111111_11111111, src, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + 
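+    // Illustrative scalar models of one 32-bit lane of the VNNI accumulate steps
+    // exercised by these tests (a sketch based on the doc-comment semantics above;
+    // the helper names are illustrative only and not part of any public API).
+    #[allow(dead_code)]
+    fn dpwssd_lane(src: i32, a: i32, b: i32) -> i32 {
+        // Two signed 16-bit products per lane, accumulated into `src`:
+        // with a = b = (1 << 16 | 1) and src = 1 this yields 1 + 1*1 + 1*1 = 3,
+        // matching the expected values in the word-sized tests.
+        let (a0, a1) = (a as i16 as i32, (a >> 16) as i16 as i32);
+        let (b0, b1) = (b as i16 as i32, (b >> 16) as i16 as i32);
+        src.wrapping_add(a0 * b0).wrapping_add(a1 * b1)
+    }
+
+    #[allow(dead_code)]
+    fn dpbusd_lane(src: i32, a: i32, b: i32) -> i32 {
+        // Four byte products per lane (bytes of `a` unsigned, bytes of `b` signed),
+        // accumulated into `src`: with a = b = 0x01010101 and src = 1 this yields
+        // 1 + 4*1 = 5, matching the expected values in the byte-sized tests.
+        (0..4).fold(src, |acc, i| {
+            let au = ((a >> (8 * i)) & 0xff) as i32;
+            let bs = (b >> (8 * i)) as i8 as i32;
+            acc.wrapping_add(au * bs)
+        })
+    }
+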
#[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpbusd_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbusd_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_dpbusd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbusd_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_mask_dpbusd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_mask_dpbusd_epi32(src, 0b00000000, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_dpbusd_epi32(src, 0b11111111, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_maskz_dpbusd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_maskz_dpbusd_epi32(0b00000000, src, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_dpbusd_epi32(0b11111111, src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpbusd_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbusd_avx_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_dpbusd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbusd_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_mask_dpbusd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_mask_dpbusd_epi32(src, 0b00000000, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_dpbusd_epi32(src, 0b00001111, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_maskz_dpbusd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_maskz_dpbusd_epi32(0b00000000, src, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_dpbusd_epi32(0b00001111, src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_dpbusds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm512_dpbusds_epi32(src, a, b); + let e = 
_mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_mask_dpbusds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm512_mask_dpbusds_epi32(src, 0b00000000_00000000, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_dpbusds_epi32(src, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_maskz_dpbusds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm512_maskz_dpbusds_epi32(0b00000000_00000000, src, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_dpbusds_epi32(0b11111111_11111111, src, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpbusds_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbusds_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_dpbusds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbusds_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_mask_dpbusds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_mask_dpbusds_epi32(src, 0b00000000, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_dpbusds_epi32(src, 0b11111111, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_maskz_dpbusds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_maskz_dpbusds_epi32(0b00000000, src, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_dpbusds_epi32(0b11111111, src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpbusds_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbusds_avx_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_dpbusds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbusds_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_mask_dpbusds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 
<< 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_mask_dpbusds_epi32(src, 0b00000000, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_dpbusds_epi32(src, 0b00001111, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_maskz_dpbusds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_maskz_dpbusds_epi32(0b00000000, src, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_dpbusds_epi32(0b00001111, src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbssd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbssd_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbssd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbssd_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbssds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbssds_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbssds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbssds_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbsud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbsud_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbsud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbsud_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbsuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbsuds_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbsuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbsuds_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn 
test_mm_dpbuud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbuud_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbuud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbuud_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbuuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbuuds_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbuuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbuuds_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwsud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwsud_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwsud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwsud_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwsuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwsuds_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwsuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwsuds_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwusd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwusd_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwusd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwusd_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwusds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwusds_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwusds_epi32() { + let src = 
_mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwusds_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwuud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwuud_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwuud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwuud_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwuuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwuuds_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwuuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwuuds_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vpopcntdq.rs b/library/stdarch/crates/core_arch/src/x86/avx512vpopcntdq.rs new file mode 100644 index 0000000000000..e47a14b24dfc7 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avx512vpopcntdq.rs @@ -0,0 +1,573 @@ +//! Vectorized Population Count Instructions for Double- and Quadwords (VPOPCNTDQ) +//! +//! The intrinsics here correspond to those in the `immintrin.h` C header. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf + +use crate::core_arch::simd::*; +use crate::core_arch::x86::__m128i; +use crate::core_arch::x86::__m256i; +use crate::core_arch::x86::__m512i; +use crate::core_arch::x86::__mmask8; +use crate::core_arch::x86::__mmask16; +use crate::intrinsics::simd::{simd_ctpop, simd_select_bitmask}; +use crate::mem::transmute; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm512_popcnt_epi32(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctpop(a.as_i32x16())) } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
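// A note on the dot-product tests that close out the VNNI module above: every
// 32-bit lane of `a` and `b` is built as four bytes (or two words) equal to 1,
// so each lane's dot product is 4 (or 2), accumulated onto src = 1, giving the
// expected 5 (or 3). A minimal scalar sketch of one lane of the unsigned-by-
// signed byte variant; the helper name here is illustrative:
fn dpbusds_lane(src: i32, a: u32, b: u32) -> i32 {
    let sum: i32 = (0..4)
        .map(|i| {
            let ua = ((a >> (8 * i)) & 0xff) as u8 as i32; // zero-extended byte of a
            let sb = ((b >> (8 * i)) & 0xff) as i8 as i32; // sign-extended byte of b
            ua * sb
        })
        .sum();
    // saturating accumulate, as in the *_dpbusds_* family
    (src as i64 + sum as i64).clamp(i32::MIN as i64, i32::MAX as i64) as i32
}
// dpbusds_lane(1, 0x0101_0101, 0x0101_0101) == 5, matching `e` in those tests.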
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm512_maskz_popcnt_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x16()), + i32x16::ZERO, + )) + } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm512_mask_popcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x16()), + src.as_i32x16(), + )) + } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm256_popcnt_epi32(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctpop(a.as_i32x8())) } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm256_maskz_popcnt_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x8()), + i32x8::ZERO, + )) + } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm256_mask_popcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x8()), + src.as_i32x8(), + )) + } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. 
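// The mask_/maskz_ pairs above differ only in what fills a lane whose mask bit
// is clear: mask_ keeps the corresponding lane of `src`, maskz_ writes zero;
// lanes whose bit is set always receive the popcount. A hedged scalar sketch
// of that per-lane selection (helper name illustrative):
fn select_lane(computed: i32, src: i32, k: u16, lane: u32, zero_masked: bool) -> i32 {
    if (k >> lane) & 1 == 1 {
        computed // mask bit set: take the computed result
    } else if zero_masked {
        0 // maskz_ flavour
    } else {
        src // mask_ flavour
    }
}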
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm_popcnt_epi32(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctpop(a.as_i32x4())) } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm_maskz_popcnt_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x4()), + i32x4::ZERO, + )) + } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm_mask_popcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x4()), + src.as_i32x4(), + )) + } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm512_popcnt_epi64(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctpop(a.as_i64x8())) } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm512_maskz_popcnt_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x8()), + i64x8::ZERO, + )) + } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm512_mask_popcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x8()), + src.as_i64x8(), + )) + } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm256_popcnt_epi64(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctpop(a.as_i64x4())) } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm256_maskz_popcnt_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x4()), + i64x4::ZERO, + )) + } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm256_mask_popcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x4()), + src.as_i64x4(), + )) + } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm_popcnt_epi64(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctpop(a.as_i64x2())) } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm_maskz_popcnt_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x2()), + i64x2::ZERO, + )) + } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm_mask_popcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x2()), + src.as_i64x2(), + )) + } +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "avx512vpopcntdq,avx512f")] + unsafe fn test_mm512_popcnt_epi32() { + let test_data = _mm512_set_epi32( + 0, + 1, + -1, + 2, + 7, + 0xFF_FE, + 0x7F_FF_FF_FF, + -100, + 0x40_00_00_00, + 103, + 371, + 552, + 432_948, + 818_826_998, + 255, + 256, + ); + let actual_result = _mm512_popcnt_epi32(test_data); + let reference_result = + _mm512_set_epi32(0, 1, 32, 1, 3, 15, 31, 28, 1, 5, 6, 3, 10, 17, 8, 1); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f")] + unsafe fn test_mm512_mask_popcnt_epi32() { + let test_data = _mm512_set_epi32( + 0, + 1, + -1, + 2, + 7, + 0xFF_FE, + 0x7F_FF_FF_FF, + -100, + 0x40_00_00_00, + 103, + 371, + 552, + 432_948, + 818_826_998, + 255, + 256, + ); + let mask = 0xFF_00; + let actual_result = _mm512_mask_popcnt_epi32(test_data, mask, test_data); + let reference_result = _mm512_set_epi32( + 0, + 1, + 32, + 1, + 3, + 15, + 31, + 28, + 0x40_00_00_00, + 103, + 371, + 552, + 432_948, + 818_826_998, + 255, + 256, + ); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f")] + unsafe fn test_mm512_maskz_popcnt_epi32() { + let test_data = _mm512_set_epi32( + 0, + 1, + -1, + 2, + 7, + 0xFF_FE, + 0x7F_FF_FF_FF, + -100, + 0x40_00_00_00, + 103, + 371, + 552, + 432_948, + 818_826_998, + 255, + 256, + ); + let mask = 0xFF_00; + let actual_result = _mm512_maskz_popcnt_epi32(mask, test_data); + let reference_result = _mm512_set_epi32(0, 1, 32, 1, 3, 15, 31, 28, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] + unsafe fn test_mm256_popcnt_epi32() { + let test_data = _mm256_set_epi32(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100); + let actual_result = _mm256_popcnt_epi32(test_data); + let reference_result = _mm256_set_epi32(0, 1, 32, 1, 3, 15, 31, 28); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] + unsafe fn test_mm256_mask_popcnt_epi32() { + let test_data = _mm256_set_epi32(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100); + let mask = 0xF0; + let actual_result = 
_mm256_mask_popcnt_epi32(test_data, mask, test_data); + let reference_result = _mm256_set_epi32(0, 1, 32, 1, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] + unsafe fn test_mm256_maskz_popcnt_epi32() { + let test_data = _mm256_set_epi32(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100); + let mask = 0xF0; + let actual_result = _mm256_maskz_popcnt_epi32(mask, test_data); + let reference_result = _mm256_set_epi32(0, 1, 32, 1, 0, 0, 0, 0); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] + unsafe fn test_mm_popcnt_epi32() { + let test_data = _mm_set_epi32(0, 1, -1, -100); + let actual_result = _mm_popcnt_epi32(test_data); + let reference_result = _mm_set_epi32(0, 1, 32, 28); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] + unsafe fn test_mm_mask_popcnt_epi32() { + let test_data = _mm_set_epi32(0, 1, -1, -100); + let mask = 0xE; + let actual_result = _mm_mask_popcnt_epi32(test_data, mask, test_data); + let reference_result = _mm_set_epi32(0, 1, 32, -100); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] + unsafe fn test_mm_maskz_popcnt_epi32() { + let test_data = _mm_set_epi32(0, 1, -1, -100); + let mask = 0xE; + let actual_result = _mm_maskz_popcnt_epi32(mask, test_data); + let reference_result = _mm_set_epi32(0, 1, 32, 0); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f")] + unsafe fn test_mm512_popcnt_epi64() { + let test_data = _mm512_set_epi64(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100); + let actual_result = _mm512_popcnt_epi64(test_data); + let reference_result = _mm512_set_epi64(0, 1, 64, 1, 3, 15, 63, 60); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f")] + unsafe fn test_mm512_mask_popcnt_epi64() { + let test_data = _mm512_set_epi64(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100); + let mask = 0xF0; + let actual_result = _mm512_mask_popcnt_epi64(test_data, mask, test_data); + let reference_result = + _mm512_set_epi64(0, 1, 64, 1, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f")] + unsafe fn test_mm512_maskz_popcnt_epi64() { + let test_data = _mm512_set_epi64(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100); + let mask = 0xF0; + let actual_result = _mm512_maskz_popcnt_epi64(mask, test_data); + let reference_result = _mm512_set_epi64(0, 1, 64, 1, 0, 0, 0, 0); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512vl")] + unsafe fn test_mm256_popcnt_epi64() { + let test_data = _mm256_set_epi64x(0, 1, -1, -100); + let actual_result = _mm256_popcnt_epi64(test_data); + let reference_result = _mm256_set_epi64x(0, 1, 64, 60); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512vl")] + unsafe fn test_mm256_mask_popcnt_epi64() { + let test_data = _mm256_set_epi64x(0, 1, -1, -100); + let mask = 0xE; + let actual_result = _mm256_mask_popcnt_epi64(test_data, mask, test_data); + let reference_result = _mm256_set_epi64x(0, 1, 64, -100); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512vl")] 
+ unsafe fn test_mm256_maskz_popcnt_epi64() { + let test_data = _mm256_set_epi64x(0, 1, -1, -100); + let mask = 0xE; + let actual_result = _mm256_maskz_popcnt_epi64(mask, test_data); + let reference_result = _mm256_set_epi64x(0, 1, 64, 0); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512vl")] + unsafe fn test_mm_popcnt_epi64() { + let test_data = _mm_set_epi64x(0, 1); + let actual_result = _mm_popcnt_epi64(test_data); + let reference_result = _mm_set_epi64x(0, 1); + assert_eq_m128i(actual_result, reference_result); + let test_data = _mm_set_epi64x(-1, -100); + let actual_result = _mm_popcnt_epi64(test_data); + let reference_result = _mm_set_epi64x(64, 60); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512vl")] + unsafe fn test_mm_mask_popcnt_epi64() { + let test_data = _mm_set_epi64x(0, -100); + let mask = 0x2; + let actual_result = _mm_mask_popcnt_epi64(test_data, mask, test_data); + let reference_result = _mm_set_epi64x(0, -100); + assert_eq_m128i(actual_result, reference_result); + let test_data = _mm_set_epi64x(-1, 1); + let mask = 0x2; + let actual_result = _mm_mask_popcnt_epi64(test_data, mask, test_data); + let reference_result = _mm_set_epi64x(64, 1); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512vl")] + unsafe fn test_mm_maskz_popcnt_epi64() { + let test_data = _mm_set_epi64x(0, 1); + let mask = 0x2; + let actual_result = _mm_maskz_popcnt_epi64(mask, test_data); + let reference_result = _mm_set_epi64x(0, 0); + assert_eq_m128i(actual_result, reference_result); + let test_data = _mm_set_epi64x(-1, -100); + let mask = 0x2; + let actual_result = _mm_maskz_popcnt_epi64(mask, test_data); + let reference_result = _mm_set_epi64x(64, 0); + assert_eq_m128i(actual_result, reference_result); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/avxneconvert.rs b/library/stdarch/crates/core_arch/src/x86/avxneconvert.rs new file mode 100644 index 0000000000000..b92ec823ec64e --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avxneconvert.rs @@ -0,0 +1,371 @@ +use crate::arch::asm; +use crate::core_arch::x86::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location +/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vbcstnebf162ps))] +#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] +pub unsafe fn _mm_bcstnebf16_ps(a: *const bf16) -> __m128 { + bcstnebf162ps_128(a) +} + +/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location +/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) floating-point +/// elements, and store the results in dst. 
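// The reference results in the popcnt tests above simply count bits of the
// two's complement encoding: -100 as u32 is 0xFFFF_FF9C, whose low byte
// 0x9C = 0b1001_1100 contributes 4 set bits and whose upper 24 bits are all
// ones, hence 24 + 4 = 28 (and 56 + 4 = 60 in the 64-bit case). A quick
// cross-check against the portable counterparts:
fn popcnt_expectations() {
    assert_eq!((-1i32 as u32).count_ones(), 32);
    assert_eq!((-100i32 as u32).count_ones(), 28);
    assert_eq!(0x7F_FF_FF_FFu32.count_ones(), 31);
    assert_eq!((-100i64 as u64).count_ones(), 60);
}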
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vbcstnebf162ps))] +#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] +pub unsafe fn _mm256_bcstnebf16_ps(a: *const bf16) -> __m256 { + bcstnebf162ps_256(a) +} + +/// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting +/// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision +/// (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vbcstnesh2ps))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_bcstnesh_ps(a: *const f16) -> __m128 { + bcstnesh2ps_128(a) +} + +/// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting +/// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision +/// (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vbcstnesh2ps))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_bcstnesh_ps(a: *const f16) -> __m256 { + bcstnesh2ps_256(a) +} + +/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneebf162ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 { + transmute(cvtneebf162ps_128(a)) +} + +/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneebf162ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 { + transmute(cvtneebf162ps_256(a)) +} + +/// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. 
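// BF16 keeps the sign and 8-bit exponent of an IEEE f32 and truncates the
// mantissa to 7 bits, so widening bf16 -> f32 is exact: place the 16 bits in
// the high half of a u32. A minimal sketch of what the bcst* intrinsics above
// do per destination lane (helper names illustrative):
fn bf16_bits_to_f32(bits: u16) -> f32 {
    f32::from_bits((bits as u32) << 16)
}
fn bcst_bf16_sketch(bits: u16) -> [f32; 4] {
    [bf16_bits_to_f32(bits); 4] // broadcast the widened value to every lane
}
// bf16_bits_to_f32(0b0_01111111_0000000) == 1.0, the pattern the tests in
// this file call BF16_ONE.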
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneeph2ps))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_cvtneeph_ps(a: *const __m128h) -> __m128 { + transmute(cvtneeph2ps_128(a)) +} + +/// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneeph2ps))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_cvtneeph_ps(a: *const __m256h) -> __m256 { + transmute(cvtneeph2ps_256(a)) +} + +/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneobf162ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 { + transmute(cvtneobf162ps_128(a)) +} + +/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneobf162ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 { + transmute(cvtneobf162ps_256(a)) +} + +/// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneoph2ps))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_cvtneoph_ps(a: *const __m128h) -> __m128 { + transmute(cvtneoph2ps_128(a)) +} + +/// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. 
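// "Even-indexed" (cvtnee*) versus "odd-indexed" (cvtneo*): out of 8 packed
// 16-bit inputs, the even variants widen elements 0, 2, 4, 6 and the odd
// variants widen elements 1, 3, 5, 7, which is why the tests below recover
// 1, 3, 5, 7 and 2, 4, 6, 8 from an input of 1..=8. An index-only sketch:
fn even_odd_split<T: Copy>(src: &[T; 8]) -> ([T; 4], [T; 4]) {
    (
        [src[0], src[2], src[4], src[6]], // cvtnee*-style selection
        [src[1], src[3], src[5], src[7]], // cvtneo*-style selection
    )
}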
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneoph2ps))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_cvtneoph_ps(a: *const __m256h) -> __m256 { + transmute(cvtneoph2ps_256(a)) +} + +/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point +/// elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneps2bf16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh { + unsafe { + let mut dst: __m128bh; + asm!( + "{{vex}}vcvtneps2bf16 {dst},{src}", + dst = lateout(xmm_reg) dst, + src = in(xmm_reg) a, + options(pure, nomem, nostack, preserves_flags) + ); + dst + } +} + +/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point +/// elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneps2bf16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh { + unsafe { + let mut dst: __m128bh; + asm!( + "{{vex}}vcvtneps2bf16 {dst},{src}", + dst = lateout(xmm_reg) dst, + src = in(ymm_reg) a, + options(pure, nomem, nostack, preserves_flags) + ); + dst + } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.vbcstnebf162ps128"] + fn bcstnebf162ps_128(a: *const bf16) -> __m128; + #[link_name = "llvm.x86.vbcstnebf162ps256"] + fn bcstnebf162ps_256(a: *const bf16) -> __m256; + #[link_name = "llvm.x86.vbcstnesh2ps128"] + fn bcstnesh2ps_128(a: *const f16) -> __m128; + #[link_name = "llvm.x86.vbcstnesh2ps256"] + fn bcstnesh2ps_256(a: *const f16) -> __m256; + + #[link_name = "llvm.x86.vcvtneebf162ps128"] + fn cvtneebf162ps_128(a: *const __m128bh) -> __m128; + #[link_name = "llvm.x86.vcvtneebf162ps256"] + fn cvtneebf162ps_256(a: *const __m256bh) -> __m256; + #[link_name = "llvm.x86.vcvtneeph2ps128"] + fn cvtneeph2ps_128(a: *const __m128h) -> __m128; + #[link_name = "llvm.x86.vcvtneeph2ps256"] + fn cvtneeph2ps_256(a: *const __m256h) -> __m256; + + #[link_name = "llvm.x86.vcvtneobf162ps128"] + fn cvtneobf162ps_128(a: *const __m128bh) -> __m128; + #[link_name = "llvm.x86.vcvtneobf162ps256"] + fn cvtneobf162ps_256(a: *const __m256bh) -> __m256; + #[link_name = "llvm.x86.vcvtneoph2ps128"] + fn cvtneoph2ps_128(a: *const __m128h) -> __m128; + #[link_name = "llvm.x86.vcvtneoph2ps256"] + fn cvtneoph2ps_256(a: *const __m256h) -> __m256; +} + +#[cfg(test)] +mod tests { + use crate::core_arch::simd::{u16x4, u16x8}; + use crate::core_arch::x86::*; + use crate::mem::transmute_copy; + use std::ptr::addr_of; + use stdarch_test::simd_test; + + const BF16_ONE: u16 = 0b0_01111111_0000000; + const BF16_TWO: u16 = 0b0_10000000_0000000; + const BF16_THREE: u16 = 0b0_10000000_1000000; + const BF16_FOUR: u16 = 0b0_10000001_0000000; + const BF16_FIVE: u16 = 0b0_10000001_0100000; + const BF16_SIX: u16 = 0b0_10000001_1000000; + const BF16_SEVEN: u16 = 0b0_10000001_1100000; + const BF16_EIGHT: 
u16 = 0b0_10000010_0000000; + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_bcstnebf16_ps() { + let a = bf16::from_bits(BF16_ONE); + let r = _mm_bcstnebf16_ps(addr_of!(a)); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_bcstnebf16_ps() { + let a = bf16::from_bits(BF16_ONE); + let r = _mm256_bcstnebf16_ps(addr_of!(a)); + let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_bcstnesh_ps() { + let a = 1.0_f16; + let r = _mm_bcstnesh_ps(addr_of!(a)); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_bcstnesh_ps() { + let a = 1.0_f16; + let r = _mm256_bcstnesh_ps(addr_of!(a)); + let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_cvtneebf16_ps() { + let a = __m128bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let r = _mm_cvtneebf16_ps(addr_of!(a)); + let e = _mm_setr_ps(1., 3., 5., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_cvtneebf16_ps() { + let a = __m256bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let r = _mm256_cvtneebf16_ps(addr_of!(a)); + let e = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_cvtneeph_ps() { + let a = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); + let r = _mm_cvtneeph_ps(addr_of!(a)); + let e = _mm_setr_ps(1., 3., 5., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_cvtneeph_ps() { + let a = __m256h([ + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ]); + let r = _mm256_cvtneeph_ps(addr_of!(a)); + let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_cvtneobf16_ps() { + let a = __m128bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let r = _mm_cvtneobf16_ps(addr_of!(a)); + let e = _mm_setr_ps(2., 4., 6., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_cvtneobf16_ps() { + let a = __m256bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let r = _mm256_cvtneobf16_ps(addr_of!(a)); + let e = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_cvtneoph_ps() { + let a = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); + let r = _mm_cvtneoph_ps(addr_of!(a)); + let e = _mm_setr_ps(2., 4., 6., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_cvtneoph_ps() { + let a = __m256h([ + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ]); + let r = _mm256_cvtneoph_ps(addr_of!(a)); + let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.); + assert_eq_m256(r, e); + } + + 
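// The BF16_* constants above are exactly the top 16 bits of the matching f32
// encodings (3.0f32 is 0x4040_0000, so its bf16 pattern is 0x4040, i.e.
// 0b0_10000000_1000000). Truncation is exact for 1.0..=8.0, so a quick
// self-check is possible with plain f32 bit twiddling:
fn bf16_constants_check() {
    let pairs: [(f32, u16); 4] = [
        (1.0, 0b0_01111111_0000000),
        (2.0, 0b0_10000000_0000000),
        (3.0, 0b0_10000000_1000000),
        (8.0, 0b0_10000010_0000000),
    ];
    for (f, bits) in pairs {
        assert_eq!((f.to_bits() >> 16) as u16, bits);
    }
}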
#[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_cvtneps_avx_pbh() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let r: u16x4 = transmute_copy(&_mm_cvtneps_avx_pbh(a)); + let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR); + assert_eq!(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_cvtneps_avx_pbh() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r: u16x8 = transmute(_mm256_cvtneps_avx_pbh(a)); + let e = u16x8::new( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + assert_eq!(r, e); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/bmi1.rs b/library/stdarch/crates/core_arch/src/x86/bmi1.rs new file mode 100644 index 0000000000000..eb7242944abcb --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/bmi1.rs @@ -0,0 +1,198 @@ +//! Bit Manipulation Instruction (BMI) Set 1.0. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions +//! available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [wikipedia_bmi]: https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Extracts bits in range [`start`, `start` + `length`) from `a` into +/// the least significant bits of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bextr_u32) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(bextr))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { + _bextr2_u32(a, (start & 0xff_u32) | ((len & 0xff_u32) << 8_u32)) +} + +/// Extracts bits of `a` specified by `control` into +/// the least significant bits of the result. +/// +/// Bits `[7,0]` of `control` specify the index to the first bit in the range +/// to be extracted, and bits `[15,8]` specify the length of the range. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bextr2_u32) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(bextr))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _bextr2_u32(a: u32, control: u32) -> u32 { + unsafe { x86_bmi_bextr_32(a, control) } +} + +/// Bitwise logical `AND` of inverted `a` with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_andn_u32) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(andn))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _andn_u32(a: u32, b: u32) -> u32 { + !a & b +} + +/// Extracts lowest set isolated bit. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_blsi_u32) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(blsi))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _blsi_u32(x: u32) -> u32 { + x & x.wrapping_neg() +} + +/// Gets mask up to lowest set bit. 
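// _bextr_u32 above packs `start` into bits 7:0 and `len` into bits 15:8 of the
// control word consumed by _bextr2_u32, and the result is the bit field
// [start, start + len) moved down to bit 0. A portable sketch for in-range
// controls (start < 32, len <= 32); the out-of-range cases the instruction
// also defines are not modelled here:
fn bextr_sketch(a: u32, start: u32, len: u32) -> u32 {
    (((a as u64) >> start) & ((1u64 << len) - 1)) as u32
}
// bextr_sketch(0b0101_0000, 4, 4) == 0b0101, the case exercised by the test
// further down.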
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_blsmsk_u32) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(blsmsk))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _blsmsk_u32(x: u32) -> u32 { + x ^ (x.wrapping_sub(1_u32)) +} + +/// Resets the lowest set bit of `x`. +/// +/// If `x` is sets CF. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_blsr_u32) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(blsr))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _blsr_u32(x: u32) -> u32 { + x & (x.wrapping_sub(1)) +} + +/// Counts the number of trailing least significant zero bits. +/// +/// When the source operand is `0`, it returns its size in bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tzcnt_u16) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(tzcnt))] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub fn _tzcnt_u16(x: u16) -> u16 { + x.trailing_zeros() as u16 +} + +/// Counts the number of trailing least significant zero bits. +/// +/// When the source operand is `0`, it returns its size in bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tzcnt_u32) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(tzcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _tzcnt_u32(x: u32) -> u32 { + x.trailing_zeros() +} + +/// Counts the number of trailing least significant zero bits. +/// +/// When the source operand is `0`, it returns its size in bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_tzcnt_32) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(tzcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_tzcnt_32(x: u32) -> i32 { + x.trailing_zeros() as i32 +} + +unsafe extern "C" { + #[link_name = "llvm.x86.bmi.bextr.32"] + fn x86_bmi_bextr_32(x: u32, y: u32) -> u32; +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "bmi1")] + unsafe fn test_bextr_u32() { + let r = _bextr_u32(0b0101_0000u32, 4, 4); + assert_eq!(r, 0b0000_0101u32); + } + + #[simd_test(enable = "bmi1")] + unsafe fn test_andn_u32() { + assert_eq!(_andn_u32(0, 0), 0); + assert_eq!(_andn_u32(0, 1), 1); + assert_eq!(_andn_u32(1, 0), 0); + assert_eq!(_andn_u32(1, 1), 0); + + let r = _andn_u32(0b0000_0000u32, 0b0000_0000u32); + assert_eq!(r, 0b0000_0000u32); + + let r = _andn_u32(0b0000_0000u32, 0b1111_1111u32); + assert_eq!(r, 0b1111_1111u32); + + let r = _andn_u32(0b1111_1111u32, 0b0000_0000u32); + assert_eq!(r, 0b0000_0000u32); + + let r = _andn_u32(0b1111_1111u32, 0b1111_1111u32); + assert_eq!(r, 0b0000_0000u32); + + let r = _andn_u32(0b0100_0000u32, 0b0101_1101u32); + assert_eq!(r, 0b0001_1101u32); + } + + #[simd_test(enable = "bmi1")] + unsafe fn test_blsi_u32() { + assert_eq!(_blsi_u32(0b1101_0000u32), 0b0001_0000u32); + } + + #[simd_test(enable = "bmi1")] + unsafe fn test_blsmsk_u32() { + let r = _blsmsk_u32(0b0011_0000u32); + assert_eq!(r, 0b0001_1111u32); + } + + #[simd_test(enable = "bmi1")] + unsafe fn test_blsr_u32() { + // TODO: test the behavior when the input is `0`. 
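        // Going by the portable fallbacks above, a zero input would give
        // _blsr_u32(0) == 0 (0 & u32::MAX), _blsi_u32(0) == 0 and
        // _blsmsk_u32(0) == u32::MAX; a hedged check for the blsr case:
        assert_eq!(_blsr_u32(0), 0);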
+ let r = _blsr_u32(0b0011_0000u32); + assert_eq!(r, 0b0010_0000u32); + } + + #[simd_test(enable = "bmi1")] + unsafe fn test_tzcnt_u16() { + assert_eq!(_tzcnt_u16(0b0000_0001u16), 0u16); + assert_eq!(_tzcnt_u16(0b0000_0000u16), 16u16); + assert_eq!(_tzcnt_u16(0b1001_0000u16), 4u16); + } + + #[simd_test(enable = "bmi1")] + unsafe fn test_tzcnt_u32() { + assert_eq!(_tzcnt_u32(0b0000_0001u32), 0u32); + assert_eq!(_tzcnt_u32(0b0000_0000u32), 32u32); + assert_eq!(_tzcnt_u32(0b1001_0000u32), 4u32); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/bmi2.rs b/library/stdarch/crates/core_arch/src/x86/bmi2.rs new file mode 100644 index 0000000000000..83cf650923f7a --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/bmi2.rs @@ -0,0 +1,133 @@ +//! Bit Manipulation Instruction (BMI) Set 2.0. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions +//! available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [wikipedia_bmi]: +//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Unsigned multiply without affecting flags. +/// +/// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with +/// the low half and the high half of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mulx_u32) +#[inline] +// LLVM BUG (should be mulxl): https://bugs.llvm.org/show_bug.cgi?id=34232 +#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))] +#[cfg_attr(all(test, target_arch = "x86"), assert_instr(mul))] +#[target_feature(enable = "bmi2")] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mulx_u32(a: u32, b: u32, hi: &mut u32) -> u32 { + let result: u64 = (a as u64) * (b as u64); + *hi = (result >> 32) as u32; + result as u32 +} + +/// Zeroes higher bits of `a` >= `index`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bzhi_u32) +#[inline] +#[target_feature(enable = "bmi2")] +#[cfg_attr(test, assert_instr(bzhi))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _bzhi_u32(a: u32, index: u32) -> u32 { + unsafe { x86_bmi2_bzhi_32(a, index) } +} + +/// Scatter contiguous low order bits of `a` to the result at the positions +/// specified by the `mask`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_pdep_u32) +#[inline] +#[target_feature(enable = "bmi2")] +#[cfg_attr(test, assert_instr(pdep))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _pdep_u32(a: u32, mask: u32) -> u32 { + unsafe { x86_bmi2_pdep_32(a, mask) } +} + +/// Gathers the bits of `x` specified by the `mask` into the contiguous low +/// order bit positions of the result. 
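// A portable sketch of the pdep/pext semantics described above: walk the mask
// from bit 0 upward and, for each set mask bit, either deposit the next low
// bit of `a` at that position (pdep) or gather that bit of `a` into the next
// low result bit (pext). Illustrative only; the intrinsics compile to single
// instructions:
fn pdep_sketch(a: u32, mut mask: u32) -> u32 {
    let (mut out, mut k) = (0u32, 0u32);
    while mask != 0 {
        let bit = mask & mask.wrapping_neg(); // lowest set bit of the mask
        if (a >> k) & 1 != 0 {
            out |= bit;
        }
        k += 1;
        mask &= mask - 1; // clear that mask bit
    }
    out
}
fn pext_sketch(a: u32, mut mask: u32) -> u32 {
    let (mut out, mut k) = (0u32, 0u32);
    while mask != 0 {
        let bit = mask & mask.wrapping_neg();
        if a & bit != 0 {
            out |= 1 << k;
        }
        k += 1;
        mask &= mask - 1;
    }
    out
}
// e.g. pext_sketch(0b1011_1110_1001_0011, 0b0110_0011_1000_0101) == 0b0011_0101,
// the pair exercised by the bmi2 tests further down.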
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_pext_u32) +#[inline] +#[target_feature(enable = "bmi2")] +#[cfg_attr(test, assert_instr(pext))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _pext_u32(a: u32, mask: u32) -> u32 { + unsafe { x86_bmi2_pext_32(a, mask) } +} + +unsafe extern "C" { + #[link_name = "llvm.x86.bmi.bzhi.32"] + fn x86_bmi2_bzhi_32(x: u32, y: u32) -> u32; + #[link_name = "llvm.x86.bmi.pdep.32"] + fn x86_bmi2_pdep_32(x: u32, y: u32) -> u32; + #[link_name = "llvm.x86.bmi.pext.32"] + fn x86_bmi2_pext_32(x: u32, y: u32) -> u32; +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "bmi2")] + unsafe fn test_pext_u32() { + let n = 0b1011_1110_1001_0011u32; + + let m0 = 0b0110_0011_1000_0101u32; + let s0 = 0b0000_0000_0011_0101u32; + + let m1 = 0b1110_1011_1110_1111u32; + let s1 = 0b0001_0111_0100_0011u32; + + assert_eq!(_pext_u32(n, m0), s0); + assert_eq!(_pext_u32(n, m1), s1); + } + + #[simd_test(enable = "bmi2")] + unsafe fn test_pdep_u32() { + let n = 0b1011_1110_1001_0011u32; + + let m0 = 0b0110_0011_1000_0101u32; + let s0 = 0b0000_0010_0000_0101u32; + + let m1 = 0b1110_1011_1110_1111u32; + let s1 = 0b1110_1001_0010_0011u32; + + assert_eq!(_pdep_u32(n, m0), s0); + assert_eq!(_pdep_u32(n, m1), s1); + } + + #[simd_test(enable = "bmi2")] + unsafe fn test_bzhi_u32() { + let n = 0b1111_0010u32; + let s = 0b0001_0010u32; + assert_eq!(_bzhi_u32(n, 5), s); + } + + #[simd_test(enable = "bmi2")] + unsafe fn test_mulx_u32() { + let a: u32 = 4_294_967_200; + let b: u32 = 2; + let mut hi = 0; + let lo = _mulx_u32(a, b, &mut hi); + /* + result = 8589934400 + = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64 + ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + assert_eq!(lo, 0b1111_1111_1111_1111_1111_1111_0100_0000u32); + assert_eq!(hi, 0b0001u32); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/bswap.rs b/library/stdarch/crates/core_arch/src/x86/bswap.rs new file mode 100644 index 0000000000000..0db9acbd0ddf8 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/bswap.rs @@ -0,0 +1,28 @@ +//! Byte swap intrinsics. +#![allow(clippy::module_name_repetitions)] + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Returns an integer with the reversed byte order of x +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bswap) +#[inline] +#[cfg_attr(test, assert_instr(bswap))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _bswap(x: i32) -> i32 { + x.swap_bytes() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bswap() { + unsafe { + assert_eq!(_bswap(0x0EADBE0F), 0x0FBEAD0E); + assert_eq!(_bswap(0x00000000), 0x00000000); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/bt.rs b/library/stdarch/crates/core_arch/src/x86/bt.rs new file mode 100644 index 0000000000000..06cc2833f4e6d --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/bt.rs @@ -0,0 +1,147 @@ +use crate::arch::asm; +#[cfg(test)] +use stdarch_test::assert_instr; + +// x32 wants to use a 32-bit address size, but asm! defaults to using the full +// register name (e.g. rax). We have to explicitly override the placeholder to +// use the 32-bit register name in that case. +#[cfg(target_pointer_width = "32")] +macro_rules! bt { + ($inst:expr) => { + concat!($inst, " {b:e}, ({p:e})") + }; +} +#[cfg(target_pointer_width = "64")] +macro_rules! 
bt { + ($inst:expr) => { + concat!($inst, " {b:e}, ({p})") + }; +} + +/// Returns the bit in position `b` of the memory addressed by `p`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittest) +#[inline] +#[cfg_attr(test, assert_instr(bt))] +#[stable(feature = "simd_x86_bittest", since = "1.55.0")] +pub unsafe fn _bittest(p: *const i32, b: i32) -> u8 { + let r: u8; + asm!( + bt!("btl"), + "setc {r}", + p = in(reg) p, + b = in(reg) b, + r = out(reg_byte) r, + options(readonly, nostack, pure, att_syntax) + ); + r +} + +/// Returns the bit in position `b` of the memory addressed by `p`, then sets the bit to `1`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittestandset) +#[inline] +#[cfg_attr(test, assert_instr(bts))] +#[stable(feature = "simd_x86_bittest", since = "1.55.0")] +pub unsafe fn _bittestandset(p: *mut i32, b: i32) -> u8 { + let r: u8; + asm!( + bt!("btsl"), + "setc {r}", + p = in(reg) p, + b = in(reg) b, + r = out(reg_byte) r, + options(nostack, att_syntax) + ); + r +} + +/// Returns the bit in position `b` of the memory addressed by `p`, then resets that bit to `0`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittestandreset) +#[inline] +#[cfg_attr(test, assert_instr(btr))] +#[stable(feature = "simd_x86_bittest", since = "1.55.0")] +pub unsafe fn _bittestandreset(p: *mut i32, b: i32) -> u8 { + let r: u8; + asm!( + bt!("btrl"), + "setc {r}", + p = in(reg) p, + b = in(reg) b, + r = out(reg_byte) r, + options(nostack, att_syntax) + ); + r +} + +/// Returns the bit in position `b` of the memory addressed by `p`, then inverts that bit. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittestandcomplement) +#[inline] +#[cfg_attr(test, assert_instr(btc))] +#[stable(feature = "simd_x86_bittest", since = "1.55.0")] +pub unsafe fn _bittestandcomplement(p: *mut i32, b: i32) -> u8 { + let r: u8; + asm!( + bt!("btcl"), + "setc {r}", + p = in(reg) p, + b = in(reg) b, + r = out(reg_byte) r, + options(nostack, att_syntax) + ); + r +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + + #[test] + #[cfg_attr(miri, ignore)] // Uses inline assembly + fn test_bittest() { + unsafe { + let a = 0b0101_0000i32; + assert_eq!(_bittest(&a as _, 4), 1); + assert_eq!(_bittest(&a as _, 5), 0); + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Uses inline assembly + fn test_bittestandset() { + unsafe { + let mut a = 0b0101_0000i32; + assert_eq!(_bittestandset(&mut a as _, 4), 1); + assert_eq!(_bittestandset(&mut a as _, 4), 1); + assert_eq!(_bittestandset(&mut a as _, 5), 0); + assert_eq!(_bittestandset(&mut a as _, 5), 1); + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Uses inline assembly + fn test_bittestandreset() { + unsafe { + let mut a = 0b0101_0000i32; + assert_eq!(_bittestandreset(&mut a as _, 4), 1); + assert_eq!(_bittestandreset(&mut a as _, 4), 0); + assert_eq!(_bittestandreset(&mut a as _, 5), 0); + assert_eq!(_bittestandreset(&mut a as _, 5), 0); + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Uses inline assembly + fn test_bittestandcomplement() { + unsafe { + let mut a = 0b0101_0000i32; + assert_eq!(_bittestandcomplement(&mut a as _, 4), 1); + assert_eq!(_bittestandcomplement(&mut a as _, 4), 0); + assert_eq!(_bittestandcomplement(&mut a as _, 4), 1); + assert_eq!(_bittestandcomplement(&mut a as _, 5), 0); + assert_eq!(_bittestandcomplement(&mut a as _, 
5), 1); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/cpuid.rs b/library/stdarch/crates/core_arch/src/x86/cpuid.rs new file mode 100644 index 0000000000000..0634f10a99fdc --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/cpuid.rs @@ -0,0 +1,112 @@ +//! `cpuid` intrinsics +#![allow(clippy::module_name_repetitions)] + +use crate::arch::asm; +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Result of the `cpuid` instruction. +#[allow(clippy::missing_inline_in_public_items)] +// ^^ the derived impl of Debug for CpuidResult is not #[inline] and that's OK. +#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub struct CpuidResult { + /// EAX register. + #[stable(feature = "simd_x86", since = "1.27.0")] + pub eax: u32, + /// EBX register. + #[stable(feature = "simd_x86", since = "1.27.0")] + pub ebx: u32, + /// ECX register. + #[stable(feature = "simd_x86", since = "1.27.0")] + pub ecx: u32, + /// EDX register. + #[stable(feature = "simd_x86", since = "1.27.0")] + pub edx: u32, +} + +/// Returns the result of the `cpuid` instruction for a given `leaf` (`EAX`) +/// and `sub_leaf` (`ECX`). +/// +/// The highest-supported leaf value is returned by the first tuple argument of +/// [`__get_cpuid_max(0)`](fn.__get_cpuid_max.html). For leaves containing +/// sub-leaves, the second tuple argument returns the highest-supported +/// sub-leaf value. +/// +/// The [CPUID Wikipedia page][wiki_cpuid] contains how to query which +/// information using the `EAX` and `ECX` registers, and the interpretation of +/// the results returned in `EAX`, `EBX`, `ECX`, and `EDX`. +/// +/// The references are: +/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +/// Instruction Set Reference, A-Z][intel64_ref]. +/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +/// System Instructions][amd64_ref]. +/// +/// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID +/// [intel64_ref]: https://cdrdv2-public.intel.com/671110/325383-sdm-vol-2abcd.pdf +/// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +#[inline] +#[cfg_attr(test, assert_instr(cpuid))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult { + let eax; + let ebx; + let ecx; + let edx; + + // LLVM sometimes reserves `ebx` for its internal use, we so we need to use + // a scratch register for it instead. + #[cfg(target_arch = "x86")] + { + asm!( + "mov {0}, ebx", + "cpuid", + "xchg {0}, ebx", + out(reg) ebx, + inout("eax") leaf => eax, + inout("ecx") sub_leaf => ecx, + out("edx") edx, + options(nostack, preserves_flags), + ); + } + #[cfg(target_arch = "x86_64")] + { + asm!( + "mov {0:r}, rbx", + "cpuid", + "xchg {0:r}, rbx", + out(reg) ebx, + inout("eax") leaf => eax, + inout("ecx") sub_leaf => ecx, + out("edx") edx, + options(nostack, preserves_flags), + ); + } + CpuidResult { eax, ebx, ecx, edx } +} + +/// See [`__cpuid_count`](fn.__cpuid_count.html). +#[inline] +#[cfg_attr(test, assert_instr(cpuid))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn __cpuid(leaf: u32) -> CpuidResult { + __cpuid_count(leaf, 0) +} + +/// Returns the highest-supported `leaf` (`EAX`) and sub-leaf (`ECX`) `cpuid` +/// values. +/// +/// If `cpuid` is supported, and `leaf` is zero, then the first tuple argument +/// contains the highest `leaf` value that `cpuid` supports. 
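As a usage sketch (not part of this patch): leaf 0 reports the highest supported basic leaf in `EAX` and the 12-byte vendor string in `EBX`, `EDX`, `ECX`, so the three intrinsics are typically combined like this:

```rust
// Sketch: read the CPU vendor string via leaf 0 (x86_64 only; `vendor_string`
// is a hypothetical helper, not part of this patch).
#[cfg(target_arch = "x86_64")]
fn vendor_string() -> String {
    use std::arch::x86_64::{__cpuid, __get_cpuid_max};
    // `cpuid` is architecturally guaranteed on x86_64, so calling it here is sound.
    let (max_leaf, _) = unsafe { __get_cpuid_max(0) };
    assert!(max_leaf >= 1);
    let r = unsafe { __cpuid(0) };
    // Leaf 0 returns the 12-byte vendor string in EBX, EDX, ECX (in that order).
    let bytes: Vec<u8> = [r.ebx, r.edx, r.ecx]
        .iter()
        .flat_map(|reg| reg.to_le_bytes())
        .collect();
    String::from_utf8_lossy(&bytes).into_owned()
}

#[cfg(target_arch = "x86_64")]
fn main() {
    println!("vendor: {}", vendor_string()); // e.g. "GenuineIntel" or "AuthenticAMD"
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```

Feature flags live in other leaves (for example leaf 1, and leaf 7 with sub-leaves), queried the same way through `__cpuid_count`.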
For `leaf`s +/// containing sub-leafs, the second tuple argument contains the +/// highest-supported sub-leaf value. +/// +/// See also [`__cpuid`](fn.__cpuid.html) and +/// [`__cpuid_count`](fn.__cpuid_count.html). +#[inline] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn __get_cpuid_max(leaf: u32) -> (u32, u32) { + let CpuidResult { eax, ebx, .. } = __cpuid(leaf); + (eax, ebx) +} diff --git a/library/stdarch/crates/core_arch/src/x86/eflags.rs b/library/stdarch/crates/core_arch/src/x86/eflags.rs new file mode 100644 index 0000000000000..5ae656db38768 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/eflags.rs @@ -0,0 +1,86 @@ +//! `i386` intrinsics + +use crate::arch::asm; + +/// Reads EFLAGS. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__readeflags) +#[cfg(target_arch = "x86")] +#[inline(always)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.29.0", + note = "See issue #51810 - use inline assembly instead" +)] +#[doc(hidden)] +pub unsafe fn __readeflags() -> u32 { + let eflags: u32; + asm!("pushfd", "pop {}", out(reg) eflags, options(nomem, att_syntax)); + eflags +} + +/// Reads EFLAGS. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__readeflags) +#[cfg(target_arch = "x86_64")] +#[inline(always)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.29.0", + note = "See issue #51810 - use inline assembly instead" +)] +#[doc(hidden)] +pub unsafe fn __readeflags() -> u64 { + let eflags: u64; + asm!("pushfq", "pop {}", out(reg) eflags, options(nomem, att_syntax)); + eflags +} + +/// Write EFLAGS. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__writeeflags) +#[cfg(target_arch = "x86")] +#[inline(always)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.29.0", + note = "See issue #51810 - use inline assembly instead" +)] +#[doc(hidden)] +pub unsafe fn __writeeflags(eflags: u32) { + asm!("push {}", "popfd", in(reg) eflags, options(nomem, att_syntax)); +} + +/// Write EFLAGS. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__writeeflags) +#[cfg(target_arch = "x86_64")] +#[inline(always)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.29.0", + note = "See issue #51810 - use inline assembly instead" +)] +#[doc(hidden)] +pub unsafe fn __writeeflags(eflags: u64) { + asm!("push {}", "popfq", in(reg) eflags, options(nomem, att_syntax)); +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + + #[test] + #[cfg_attr(miri, ignore)] // Uses inline assembly + #[allow(deprecated)] + fn test_readeflags() { + unsafe { + // reads eflags, writes them back, reads them again, + // and compare for equality: + let v = __readeflags(); + __writeeflags(v); + let u = __readeflags(); + assert_eq!(v, u); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/f16c.rs b/library/stdarch/crates/core_arch/src/x86/f16c.rs new file mode 100644 index 0000000000000..7686b317d4d49 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/f16c.rs @@ -0,0 +1,149 @@ +//! [F16C intrinsics]. +//! +//! 
[F16C intrinsics]: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=fp16&expand=1769 + +use crate::core_arch::{simd::*, x86::*}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.x86.vcvtph2ps.128"] + fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4; + #[link_name = "llvm.x86.vcvtph2ps.256"] + fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8; + #[link_name = "llvm.x86.vcvtps2ph.128"] + fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8; + #[link_name = "llvm.x86.vcvtps2ph.256"] + fn llvm_vcvtps2ph_256(a: f32x8, rounding: i32) -> i16x8; +} + +/// Converts the 4 x 16-bit half-precision float values in the lowest 64-bit of +/// the 128-bit vector `a` into 4 x 32-bit float values stored in a 128-bit wide +/// vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_ps) +#[inline] +#[target_feature(enable = "f16c")] +#[cfg_attr(test, assert_instr("vcvtph2ps"))] +#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] +pub fn _mm_cvtph_ps(a: __m128i) -> __m128 { + unsafe { transmute(llvm_vcvtph2ps_128(transmute(a))) } +} + +/// Converts the 8 x 16-bit half-precision float values in the 128-bit vector +/// `a` into 8 x 32-bit float values stored in a 256-bit wide vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_ps) +#[inline] +#[target_feature(enable = "f16c")] +#[cfg_attr(test, assert_instr("vcvtph2ps"))] +#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] +pub fn _mm256_cvtph_ps(a: __m128i) -> __m256 { + unsafe { transmute(llvm_vcvtph2ps_256(transmute(a))) } +} + +/// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x +/// 16-bit half-precision float values stored in the lowest 64-bit of a 128-bit +/// vector. +/// +/// Rounding is done according to the `imm_rounding` parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_ph) +#[inline] +#[target_feature(enable = "f16c")] +#[cfg_attr(test, assert_instr("vcvtps2ph", IMM_ROUNDING = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] +pub fn _mm_cvtps_ph(a: __m128) -> __m128i { + static_assert_uimm_bits!(IMM_ROUNDING, 3); + unsafe { + let a = a.as_f32x4(); + let r = llvm_vcvtps2ph_128(a, IMM_ROUNDING); + transmute(r) + } +} + +/// Converts the 8 x 32-bit float values in the 256-bit vector `a` into 8 x +/// 16-bit half-precision float values stored in a 128-bit wide vector. 
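A minimal usage sketch for the two 128-bit conversions, gated on runtime detection (illustrative only; `roundtrip_f16` is a hypothetical helper, not part of this patch):

```rust
// Narrow four f32 values to half precision and widen them back.
#[cfg(target_arch = "x86_64")]
fn roundtrip_f16(vals: [f32; 4]) -> [f32; 4] {
    use std::arch::x86_64::*;
    assert!(is_x86_feature_detected!("f16c"), "f16c not available");
    unsafe {
        let v = _mm_set_ps(vals[3], vals[2], vals[1], vals[0]);
        // Narrow to 4 x f16 (stored in the low 64 bits of an __m128i), then widen back.
        let half: __m128i = _mm_cvtps_ph::<_MM_FROUND_CUR_DIRECTION>(v);
        let back: __m128 = _mm_cvtph_ps(half);
        let mut out = [0.0f32; 4];
        _mm_storeu_ps(out.as_mut_ptr(), back);
        out
    }
}

#[cfg(target_arch = "x86_64")]
fn main() {
    // 1.0..4.0 are exactly representable in f16, so the round trip is lossless.
    assert_eq!(roundtrip_f16([1.0, 2.0, 3.0, 4.0]), [1.0, 2.0, 3.0, 4.0]);
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```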
+/// +/// Rounding is done according to the `imm_rounding` parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_ph) +#[inline] +#[target_feature(enable = "f16c")] +#[cfg_attr(test, assert_instr("vcvtps2ph", IMM_ROUNDING = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] +pub fn _mm256_cvtps_ph(a: __m256) -> __m128i { + static_assert_uimm_bits!(IMM_ROUNDING, 3); + unsafe { + let a = a.as_f32x8(); + let r = llvm_vcvtps2ph_256(a, IMM_ROUNDING); + transmute(r) + } +} + +#[cfg(test)] +mod tests { + use crate::{core_arch::x86::*, mem::transmute}; + use stdarch_test::simd_test; + + const F16_ONE: i16 = 0x3c00; + const F16_TWO: i16 = 0x4000; + const F16_THREE: i16 = 0x4200; + const F16_FOUR: i16 = 0x4400; + const F16_FIVE: i16 = 0x4500; + const F16_SIX: i16 = 0x4600; + const F16_SEVEN: i16 = 0x4700; + const F16_EIGHT: i16 = 0x4800; + + #[simd_test(enable = "f16c")] + unsafe fn test_mm_cvtph_ps() { + let a = _mm_set_epi16(0, 0, 0, 0, F16_ONE, F16_TWO, F16_THREE, F16_FOUR); + let r = _mm_cvtph_ps(a); + let e = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "f16c")] + unsafe fn test_mm256_cvtph_ps() { + let a = _mm_set_epi16( + F16_ONE, F16_TWO, F16_THREE, F16_FOUR, F16_FIVE, F16_SIX, F16_SEVEN, F16_EIGHT, + ); + let r = _mm256_cvtph_ps(a); + let e = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "f16c")] + unsafe fn test_mm_cvtps_ph() { + let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_cvtps_ph::<_MM_FROUND_CUR_DIRECTION>(a); + let e = _mm_set_epi16(0, 0, 0, 0, F16_ONE, F16_TWO, F16_THREE, F16_FOUR); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "f16c")] + unsafe fn test_mm256_cvtps_ph() { + let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_cvtps_ph::<_MM_FROUND_CUR_DIRECTION>(a); + let e = _mm_set_epi16( + F16_ONE, F16_TWO, F16_THREE, F16_FOUR, F16_FIVE, F16_SIX, F16_SEVEN, F16_EIGHT, + ); + assert_eq_m128i(r, e); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/fma.rs b/library/stdarch/crates/core_arch/src/x86/fma.rs new file mode 100644 index 0000000000000..d3988422b9a4d --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/fma.rs @@ -0,0 +1,816 @@ +//! Fused Multiply-Add instruction set (FMA) +//! +//! The FMA instruction set is an extension to the 128 and 256-bit SSE +//! instructions in the x86 microprocessor instruction set to perform fused +//! multiply–add (FMA) operations. +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +//! System Instructions][amd64_ref]. +//! +//! Wikipedia's [FMA][wiki_fma] page provides a quick overview of the +//! instructions available. +//! +//! 
[intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate + +use crate::core_arch::x86::*; +use crate::intrinsics::simd::{simd_fma, simd_neg}; +use crate::intrinsics::{fmaf32, fmaf64}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and add the intermediate result to packed elements in `c`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_fma(a, b, c) } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and add the intermediate result to packed elements in `c`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_fma(a, b, c) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and add the intermediate result to packed elements in `c`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_fma(a, b, c) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and add the intermediate result to packed elements in `c`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_fma(a, b, c) } +} + +/// Multiplies the lower double-precision (64-bit) floating-point elements in +/// `a` and `b`, and add the intermediate result to the lower element in `c`. +/// Stores the result in the lower element of the returned value, and copy the +/// upper element from `a` to the upper elements of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_sd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + simd_insert!( + a, + 0, + fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), _mm_cvtsd_f64(c)) + ) + } +} + +/// Multiplies the lower single-precision (32-bit) floating-point elements in +/// `a` and `b`, and add the intermediate result to the lower element in `c`. 
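What "fused" buys is a single rounding step for the whole multiply-add. A scalar aside using `f32::mul_add`, which applies the same single-rounding semantics as `vfmadd`, shows where the fused and unfused results diverge:

```rust
// The fused multiply-add rounds once, so the 2^-46 term survives; the unfused
// form rounds the product to 1.0 first and that information is lost.
fn main() {
    let a = 1.0f32 + f32::EPSILON; // 1 + 2^-23
    let b = 1.0f32 - f32::EPSILON; // 1 - 2^-23
    let c = -1.0f32;

    let unfused = a * b + c;     // (1 - 2^-46) rounds to 1.0, then + c gives 0.0
    let fused = a.mul_add(b, c); // exact result -2^-46, as a fused operation computes

    assert_eq!(unfused, 0.0);
    assert_eq!(fused, -(f32::EPSILON * f32::EPSILON)); // -2^-46
}
```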
+/// Stores the result in the lower element of the returned value, and copy the +/// 3 upper elements from `a` to the upper elements of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ss) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + simd_insert!( + a, + 0, + fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), _mm_cvtss_f32(c)) + ) + } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and alternatively add and subtract packed elements in `c` to/from +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [2, 1]) + } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and alternatively add and subtract packed elements in `c` to/from +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [4, 1, 6, 3]) + } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and alternatively add and subtract packed elements in `c` to/from +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmaddsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [4, 1, 6, 3]) + } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and alternatively add and subtract packed elements in `c` to/from +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmaddsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) + } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the intermediate result. 
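Reading off the `simd_shuffle!` indices above: even lanes take the subtract result, odd lanes take the add result. A scalar model of `_mm_fmaddsub_ps` (illustration only), checked against the values used in `test_mm_fmaddsub_ps` further down:

```rust
// Even lanes compute a*b - c, odd lanes compute a*b + c.
fn fmaddsub_ps_model(a: [f32; 4], b: [f32; 4], c: [f32; 4]) -> [f32; 4] {
    let mut r = [0.0f32; 4];
    for i in 0..4 {
        r[i] = if i % 2 == 0 {
            a[i].mul_add(b[i], -c[i]) // even lane: subtract c
        } else {
            a[i].mul_add(b[i], c[i])  // odd lane: add c
        };
    }
    r
}

fn main() {
    // Same values as the `test_mm_fmaddsub_ps` test in this file.
    let (a, b, c) = ([1., 2., 3., 4.], [5., 3., 7., 2.], [4., 9., 1., 7.]);
    assert_eq!(fmaddsub_ps_model(a, b, c), [1., 15., 20., 15.]);
}
```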
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsub213ps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsub213ps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiplies the lower double-precision (64-bit) floating-point elements in +/// `a` and `b`, and subtract the lower element in `c` from the intermediate +/// result. Store the result in the lower element of the returned value, and +/// copy the upper element from `a` to the upper elements of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_sd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + simd_insert!( + a, + 0, + fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), -_mm_cvtsd_f64(c)) + ) + } +} + +/// Multiplies the lower single-precision (32-bit) floating-point elements in +/// `a` and `b`, and subtract the lower element in `c` from the intermediate +/// result. Store the result in the lower element of the returned value, and +/// copy the 3 upper elements from `a` to the upper elements of the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ss) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + simd_insert!( + a, + 0, + fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), -_mm_cvtss_f32(c)) + ) + } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and alternatively subtract and add packed elements in `c` from/to +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [0, 3]) + } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and alternatively subtract and add packed elements in `c` from/to +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [0, 5, 2, 7]) + } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and alternatively subtract and add packed elements in `c` from/to +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmsubadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [0, 5, 2, 7]) + } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and alternatively subtract and add packed elements in `c` from/to +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmsubadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [0, 9, 2, 11, 4, 13, 6, 15]) + } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and add the negated intermediate result to packed elements in `c`. 
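For orientation, the four FMA families defined in this file differ only in which operands are negated before the single rounding step; a scalar summary (illustrative only):

```rust
// Per-lane formulas of the four families, written out as scalar f64.
fn fmadd(a: f64, b: f64, c: f64) -> f64 { a.mul_add(b, c) }      //  (a*b) + c
fn fmsub(a: f64, b: f64, c: f64) -> f64 { a.mul_add(b, -c) }     //  (a*b) - c
fn fnmadd(a: f64, b: f64, c: f64) -> f64 { (-a).mul_add(b, c) }  // -(a*b) + c
fn fnmsub(a: f64, b: f64, c: f64) -> f64 { (-a).mul_add(b, -c) } // -(a*b) - c

fn main() {
    // Matches the first lane of the corresponding `_pd` tests: a = 1, b = 5, c = 4.
    assert_eq!(fmadd(1., 5., 4.), 9.);
    assert_eq!(fmsub(1., 5., 4.), 1.);
    assert_eq!(fnmadd(1., 5., 4.), -1.);
    assert_eq!(fnmsub(1., 5., 4.), -9.);
}
```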
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_fma(simd_neg(a), b, c) } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and add the negated intermediate result to packed elements in `c`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_fma(simd_neg(a), b, c) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and add the negated intermediate result to packed elements in `c`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_fma(simd_neg(a), b, c) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and add the negated intermediate result to packed elements in `c`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fnmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_fma(simd_neg(a), b, c) } +} + +/// Multiplies the lower double-precision (64-bit) floating-point elements in +/// `a` and `b`, and add the negated intermediate result to the lower element +/// in `c`. Store the result in the lower element of the returned value, and +/// copy the upper element from `a` to the upper elements of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_sd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + simd_insert!( + a, + 0, + fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), _mm_cvtsd_f64(c)) + ) + } +} + +/// Multiplies the lower single-precision (32-bit) floating-point elements in +/// `a` and `b`, and add the negated intermediate result to the lower element +/// in `c`. Store the result in the lower element of the returned value, and +/// copy the 3 upper elements from `a` to the upper elements of the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ss) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + simd_insert!( + a, + 0, + fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), _mm_cvtss_f32(c)) + ) + } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the negated intermediate +/// result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the negated intermediate +/// result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the negated intermediate +/// result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the negated intermediate +/// result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fnmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiplies the lower double-precision (64-bit) floating-point elements in +/// `a` and `b`, and subtract packed elements in `c` from the negated +/// intermediate result. Store the result in the lower element of the returned +/// value, and copy the upper element from `a` to the upper elements of the +/// result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_sd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + simd_insert!( + a, + 0, + fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), -_mm_cvtsd_f64(c)) + ) + } +} + +/// Multiplies the lower single-precision (32-bit) floating-point elements in +/// `a` and `b`, and subtract packed elements in `c` from the negated +/// intermediate result. Store the result in the lower element of the +/// returned value, and copy the 3 upper elements from `a` to the upper +/// elements of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ss) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + simd_insert!( + a, + 0, + fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), -_mm_cvtss_f32(c)) + ) + } +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmadd_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(9., 15.); + assert_eq_m128d(_mm_fmadd_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmadd_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 3., 7., 2.); + let c = _mm256_setr_pd(4., 9., 1., 7.); + let r = _mm256_setr_pd(9., 15., 22., 15.); + assert_eq_m256d(_mm256_fmadd_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmadd_ps() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(9., 15., 22., 15.); + assert_eq_m128(_mm_fmadd_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmadd_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); + let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); + let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); + let r = _mm256_setr_ps(9., 15., 22., 15., -5., -49., -2., -31.); + assert_eq_m256(_mm256_fmadd_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmadd_sd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(9., 2.); + assert_eq_m128d(_mm_fmadd_sd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmadd_ss() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(9., 2., 3., 4.); + assert_eq_m128(_mm_fmadd_ss(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmaddsub_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(1., 15.); + assert_eq_m128d(_mm_fmaddsub_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmaddsub_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 3., 7., 2.); + let c = _mm256_setr_pd(4., 9., 1., 7.); + let r = _mm256_setr_pd(1., 15., 20., 15.); + assert_eq_m256d(_mm256_fmaddsub_pd(a, b, c), r); + } + + 
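A usage sketch of the usual runtime-dispatch pattern around these intrinsics, with a scalar fallback (illustrative only; `fma4` is a hypothetical helper, not part of this patch):

```rust
#[cfg(target_arch = "x86_64")]
fn fma4(a: [f32; 4], b: [f32; 4], c: [f32; 4]) -> [f32; 4] {
    use std::arch::x86_64::*;
    let mut out = [0.0f32; 4];
    if is_x86_feature_detected!("fma") {
        // Fast path: one packed vfmadd over all four lanes.
        unsafe {
            let (va, vb, vc) = (
                _mm_loadu_ps(a.as_ptr()),
                _mm_loadu_ps(b.as_ptr()),
                _mm_loadu_ps(c.as_ptr()),
            );
            _mm_storeu_ps(out.as_mut_ptr(), _mm_fmadd_ps(va, vb, vc));
        }
    } else {
        // Scalar fallback with the same fused semantics per lane.
        for i in 0..4 {
            out[i] = a[i].mul_add(b[i], c[i]);
        }
    }
    out
}

#[cfg(target_arch = "x86_64")]
fn main() {
    assert_eq!(
        fma4([1., 2., 3., 4.], [5., 3., 7., 2.], [4., 9., 1., 7.]),
        [9., 15., 22., 15.]
    );
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```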
#[simd_test(enable = "fma")] + unsafe fn test_mm_fmaddsub_ps() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(1., 15., 20., 15.); + assert_eq_m128(_mm_fmaddsub_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmaddsub_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); + let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); + let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); + let r = _mm256_setr_ps(1., 15., 20., 15., 5., -49., 2., -31.); + assert_eq_m256(_mm256_fmaddsub_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmsub_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(1., -3.); + assert_eq_m128d(_mm_fmsub_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmsub_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 3., 7., 2.); + let c = _mm256_setr_pd(4., 9., 1., 7.); + let r = _mm256_setr_pd(1., -3., 20., 1.); + assert_eq_m256d(_mm256_fmsub_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmsub_ps() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(1., -3., 20., 1.); + assert_eq_m128(_mm_fmsub_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmsub_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); + let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); + let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); + let r = _mm256_setr_ps(1., -3., 20., 1., 5., -71., 2., -25.); + assert_eq_m256(_mm256_fmsub_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmsub_sd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(1., 2.); + assert_eq_m128d(_mm_fmsub_sd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmsub_ss() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(1., 2., 3., 4.); + assert_eq_m128(_mm_fmsub_ss(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmsubadd_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(9., -3.); + assert_eq_m128d(_mm_fmsubadd_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmsubadd_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 3., 7., 2.); + let c = _mm256_setr_pd(4., 9., 1., 7.); + let r = _mm256_setr_pd(9., -3., 22., 1.); + assert_eq_m256d(_mm256_fmsubadd_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmsubadd_ps() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(9., -3., 22., 1.); + assert_eq_m128(_mm_fmsubadd_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmsubadd_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); + let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); + let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); + let r = _mm256_setr_ps(9., -3., 22., 1., -5., -71., -2., -25.); + assert_eq_m256(_mm256_fmsubadd_ps(a, b, c), r); + } + + #[simd_test(enable = 
"fma")] + unsafe fn test_mm_fnmadd_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(-1., 3.); + assert_eq_m128d(_mm_fnmadd_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fnmadd_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 3., 7., 2.); + let c = _mm256_setr_pd(4., 9., 1., 7.); + let r = _mm256_setr_pd(-1., 3., -20., -1.); + assert_eq_m256d(_mm256_fnmadd_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fnmadd_ps() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(-1., 3., -20., -1.); + assert_eq_m128(_mm_fnmadd_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fnmadd_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); + let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); + let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); + let r = _mm256_setr_ps(-1., 3., -20., -1., -5., 71., -2., 25.); + assert_eq_m256(_mm256_fnmadd_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fnmadd_sd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(-1., 2.); + assert_eq_m128d(_mm_fnmadd_sd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fnmadd_ss() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(-1., 2., 3., 4.); + assert_eq_m128(_mm_fnmadd_ss(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fnmsub_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(-9., -15.); + assert_eq_m128d(_mm_fnmsub_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fnmsub_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 3., 7., 2.); + let c = _mm256_setr_pd(4., 9., 1., 7.); + let r = _mm256_setr_pd(-9., -15., -22., -15.); + assert_eq_m256d(_mm256_fnmsub_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fnmsub_ps() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(-9., -15., -22., -15.); + assert_eq_m128(_mm_fnmsub_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fnmsub_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); + let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); + let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); + let r = _mm256_setr_ps(-9., -15., -22., -15., 5., 49., 2., 31.); + assert_eq_m256(_mm256_fnmsub_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fnmsub_sd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(-9., 2.); + assert_eq_m128d(_mm_fnmsub_sd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fnmsub_ss() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(-9., 2., 3., 4.); + assert_eq_m128(_mm_fnmsub_ss(a, b, c), r); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/fxsr.rs b/library/stdarch/crates/core_arch/src/x86/fxsr.rs new file mode 100644 index 0000000000000..71fd52ca14963 
--- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/fxsr.rs @@ -0,0 +1,88 @@ +//! FXSR floating-point context fast save and restore. + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.fxsave"] + fn fxsave(p: *mut u8); + #[link_name = "llvm.x86.fxrstor"] + fn fxrstor(p: *const u8); +} + +/// Saves the `x87` FPU, `MMX` technology, `XMM`, and `MXCSR` registers to the +/// 512-byte-long 16-byte-aligned memory region `mem_addr`. +/// +/// A misaligned destination operand raises a general-protection (#GP) or an +/// alignment check exception (#AC). +/// +/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor]. +/// +/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html +/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_fxsave) +#[inline] +#[target_feature(enable = "fxsr")] +#[cfg_attr(test, assert_instr(fxsave))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _fxsave(mem_addr: *mut u8) { + fxsave(mem_addr) +} + +/// Restores the `XMM`, `MMX`, `MXCSR`, and `x87` FPU registers from the +/// 512-byte-long 16-byte-aligned memory region `mem_addr`. +/// +/// The contents of this memory region should have been written to by a +/// previous +/// `_fxsave` or `_fxsave64` intrinsic. +/// +/// A misaligned destination operand raises a general-protection (#GP) or an +/// alignment check exception (#AC). +/// +/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor]. +/// +/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html +/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_fxrstor) +#[inline] +#[target_feature(enable = "fxsr")] +#[cfg_attr(test, assert_instr(fxrstor))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _fxrstor(mem_addr: *const u8) { + fxrstor(mem_addr) +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use std::{cmp::PartialEq, fmt}; + use stdarch_test::simd_test; + + #[repr(align(16))] + struct FxsaveArea { + data: [u8; 512], // 512 bytes + } + + impl FxsaveArea { + fn new() -> FxsaveArea { + FxsaveArea { data: [0; 512] } + } + fn ptr(&mut self) -> *mut u8 { + self.data.as_mut_ptr() + } + } + + #[simd_test(enable = "fxsr")] + #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri + unsafe fn test_fxsave() { + let mut a = FxsaveArea::new(); + let mut b = FxsaveArea::new(); + + fxsr::_fxsave(a.ptr()); + fxsr::_fxrstor(a.ptr()); + fxsr::_fxsave(b.ptr()); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/gfni.rs b/library/stdarch/crates/core_arch/src/x86/gfni.rs new file mode 100644 index 0000000000000..9386684abaef6 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/gfni.rs @@ -0,0 +1,1549 @@ +//! Galois Field New Instructions (GFNI) +//! +//! The intrinsics here correspond to those in the `immintrin.h` C header. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! 
[intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf + +use crate::core_arch::simd::i8x16; +use crate::core_arch::simd::i8x32; +use crate::core_arch::simd::i8x64; +use crate::core_arch::x86::__m128i; +use crate::core_arch::x86::__m256i; +use crate::core_arch::x86::__m512i; +use crate::core_arch::x86::__mmask16; +use crate::core_arch::x86::__mmask32; +use crate::core_arch::x86::__mmask64; +use crate::intrinsics::simd::simd_select_bitmask; +use crate::mem::transmute; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.vgf2p8affineinvqb.512"] + fn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64; + #[link_name = "llvm.x86.vgf2p8affineinvqb.256"] + fn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32; + #[link_name = "llvm.x86.vgf2p8affineinvqb.128"] + fn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16; + #[link_name = "llvm.x86.vgf2p8affineqb.512"] + fn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64; + #[link_name = "llvm.x86.vgf2p8affineqb.256"] + fn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32; + #[link_name = "llvm.x86.vgf2p8affineqb.128"] + fn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16; + #[link_name = "llvm.x86.vgf2p8mulb.512"] + fn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64; + #[link_name = "llvm.x86.vgf2p8mulb.256"] + fn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32; + #[link_name = "llvm.x86.vgf2p8mulb.128"] + fn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16; +} + +// LLVM requires AVX512BW for a lot of these instructions, see +// https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457 +// however our tests also require the target feature list to match Intel's +// which *doesn't* require AVX512BW but only AVX512F, so we added the redundant AVX512F +// requirement (for now) +// also see +// https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h +// for forcing GFNI, BW and optionally VL extension + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64())) } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
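A reference model of the per-byte multiplication these intrinsics perform (illustration only; the instruction applies this to every byte lane at once):

```rust
// GF(2^8) multiplication with the reduction polynomial x^8 + x^4 + x^3 + x + 1 (0x11B).
fn gf2p8_mul(mut a: u8, mut b: u8) -> u8 {
    let mut p = 0u8;
    for _ in 0..8 {
        if b & 1 != 0 {
            p ^= a; // add (XOR) the current multiple of `a`
        }
        let carry = a & 0x80 != 0;
        a <<= 1;
        if carry {
            a ^= 0x1B; // reduce modulo x^8 + x^4 + x^3 + x + 1
        }
        b >>= 1;
    }
    p
}

fn main() {
    // Every nonzero byte has an inverse in this field; 0x53 * 0xCA = 1 is the
    // classic AES example pair.
    assert_eq!(gf2p8_mul(0x53, 0xCA), 0x01);
    assert_eq!(gf2p8_mul(0x02, 0x80), 0x1B); // x * x^7 = x^8 ≡ x^4 + x^3 + x + 1
}
```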
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm512_mask_gf2p8mul_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()), + src.as_i8x64(), + )) + } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + let zero = i8x64::ZERO; + unsafe { + transmute(simd_select_bitmask( + k, + vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()), + zero, + )) + } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32())) } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm256_mask_gf2p8mul_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()), + src.as_i8x32(), + )) + } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
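The `mask`/`maskz` variants throughout this file share one writemask convention, modelled below in plain Rust (an illustration of what `simd_select_bitmask` selects, not part of this patch):

```rust
// `mask` keeps `src` where the mask bit is 0; `maskz` zeroes those lanes instead.
fn mask_lanes(src: &[u8; 8], computed: &[u8; 8], k: u8) -> [u8; 8] {
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { computed[i] } else { src[i] })
}

fn maskz_lanes(computed: &[u8; 8], k: u8) -> [u8; 8] {
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { computed[i] } else { 0 })
}

fn main() {
    let src = [9; 8];
    let computed = [1, 2, 3, 4, 5, 6, 7, 8];
    assert_eq!(mask_lanes(&src, &computed, 0b0000_0101), [1, 9, 3, 9, 9, 9, 9, 9]);
    assert_eq!(maskz_lanes(&computed, 0b0000_0101), [1, 0, 3, 0, 0, 0, 0, 0]);
}
```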
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + let zero = i8x32::ZERO; + unsafe { + transmute(simd_select_bitmask( + k, + vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()), + zero, + )) + } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(gf2p8mulb))] +pub fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16())) } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm_mask_gf2p8mul_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()), + src.as_i8x16(), + )) + } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let zero = i8x16::ZERO; + transmute(simd_select_bitmask( + k, + vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()), + zero, + )) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. 
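Structurally, the affine transform computes, for each input byte `x`, output bit `i` as the parity of `row_i AND x`, XORed with bit `i` of the immediate `b`. The sketch below models a single byte with the matrix rows given explicitly; how the eight rows are packed into each 64-bit lane of `a` follows Intel's convention and is deliberately not modelled here (see the linked documentation):

```rust
// One-byte model of an 8x8 affine map over GF(2): out = A*x + b.
fn gf2_affine_byte(rows: [u8; 8], x: u8, b: u8) -> u8 {
    let mut out = 0u8;
    for i in 0..8 {
        // Dot product of row i with x over GF(2) is the parity of the AND.
        let parity = (rows[i] & x).count_ones() as u8 & 1;
        out |= (parity ^ ((b >> i) & 1)) << i;
    }
    out
}

fn main() {
    // With the identity matrix (row i selects bit i) and b = 0, the map is the identity.
    let identity: [u8; 8] = core::array::from_fn(|i| 1u8 << i);
    assert_eq!(gf2_affine_byte(identity, 0xA5, 0x00), 0xA5);
}
```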
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_gf2p8affine_epi64_epi8(x: __m512i, a: __m512i) -> __m512i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x64(); + let a = a.as_i8x64(); + unsafe { + let r = vgf2p8affineqb_512(x, a, b); + transmute(r) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_gf2p8affine_epi64_epi8( + k: __mmask64, + x: __m512i, + a: __m512i, +) -> __m512i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let zero = i8x64::ZERO; + let x = x.as_i8x64(); + let a = a.as_i8x64(); + unsafe { + let r = vgf2p8affineqb_512(x, a, b); + transmute(simd_select_bitmask(k, r, zero)) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_gf2p8affine_epi64_epi8( + src: __m512i, + k: __mmask64, + x: __m512i, + a: __m512i, +) -> __m512i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x64(); + let a = a.as_i8x64(); + unsafe { + let r = vgf2p8affineqb_512(x, a, b); + transmute(simd_select_bitmask(k, r, src.as_i8x64())) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_gf2p8affine_epi64_epi8(x: __m256i, a: __m256i) -> __m256i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x32(); + let a = a.as_i8x32(); + unsafe { + let r = vgf2p8affineqb_256(x, a, b); + transmute(r) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_gf2p8affine_epi64_epi8( + k: __mmask32, + x: __m256i, + a: __m256i, +) -> __m256i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let zero = i8x32::ZERO; + let x = x.as_i8x32(); + let a = a.as_i8x32(); + unsafe { + let r = vgf2p8affineqb_256(x, a, b); + transmute(simd_select_bitmask(k, r, zero)) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_gf2p8affine_epi64_epi8( + src: __m256i, + k: __mmask32, + x: __m256i, + a: __m256i, +) -> __m256i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x32(); + let a = a.as_i8x32(); + unsafe { + let r = vgf2p8affineqb_256(x, a, b); + transmute(simd_select_bitmask(k, r, src.as_i8x32())) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(gf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_gf2p8affine_epi64_epi8(x: __m128i, a: __m128i) -> __m128i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x16(); + let a = a.as_i8x16(); + unsafe { + let r = vgf2p8affineqb_128(x, a, b); + transmute(r) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_gf2p8affine_epi64_epi8( + k: __mmask16, + x: __m128i, + a: __m128i, +) -> __m128i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let zero = i8x16::ZERO; + let x = x.as_i8x16(); + let a = a.as_i8x16(); + unsafe { + let r = vgf2p8affineqb_128(x, a, b); + transmute(simd_select_bitmask(k, r, zero)) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_gf2p8affine_epi64_epi8( + src: __m128i, + k: __mmask16, + x: __m128i, + a: __m128i, +) -> __m128i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x16(); + let a = a.as_i8x16(); + unsafe { + let r = vgf2p8affineqb_128(x, a, b); + transmute(simd_select_bitmask(k, r, src.as_i8x16())) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_gf2p8affineinv_epi64_epi8(x: __m512i, a: __m512i) -> __m512i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x64(); + let a = a.as_i8x64(); + unsafe { + let r = vgf2p8affineinvqb_512(x, a, b); + transmute(r) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_gf2p8affineinv_epi64_epi8( + k: __mmask64, + x: __m512i, + a: __m512i, +) -> __m512i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let zero = i8x64::ZERO; + let x = x.as_i8x64(); + let a = a.as_i8x64(); + unsafe { + let r = vgf2p8affineinvqb_512(x, a, b); + transmute(simd_select_bitmask(k, r, zero)) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_gf2p8affineinv_epi64_epi8( + src: __m512i, + k: __mmask64, + x: __m512i, + a: __m512i, +) -> __m512i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x64(); + let a = a.as_i8x64(); + unsafe { + let r = vgf2p8affineinvqb_512(x, a, b); + transmute(simd_select_bitmask(k, r, src.as_i8x64())) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. 
+/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_gf2p8affineinv_epi64_epi8(x: __m256i, a: __m256i) -> __m256i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x32(); + let a = a.as_i8x32(); + unsafe { + let r = vgf2p8affineinvqb_256(x, a, b); + transmute(r) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_gf2p8affineinv_epi64_epi8( + k: __mmask32, + x: __m256i, + a: __m256i, +) -> __m256i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let zero = i8x32::ZERO; + let x = x.as_i8x32(); + let a = a.as_i8x32(); + unsafe { + let r = vgf2p8affineinvqb_256(x, a, b); + transmute(simd_select_bitmask(k, r, zero)) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_gf2p8affineinv_epi64_epi8( + src: __m256i, + k: __mmask32, + x: __m256i, + a: __m256i, +) -> __m256i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x32(); + let a = a.as_i8x32(); + unsafe { + let r = vgf2p8affineinvqb_256(x, a, b); + transmute(simd_select_bitmask(k, r, src.as_i8x32())) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(gf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_gf2p8affineinv_epi64_epi8(x: __m128i, a: __m128i) -> __m128i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x16(); + let a = a.as_i8x16(); + unsafe { + let r = vgf2p8affineinvqb_128(x, a, b); + transmute(r) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_gf2p8affineinv_epi64_epi8( + k: __mmask16, + x: __m128i, + a: __m128i, +) -> __m128i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let zero = i8x16::ZERO; + let x = x.as_i8x16(); + let a = a.as_i8x16(); + unsafe { + let r = vgf2p8affineinvqb_128(x, a, b); + transmute(simd_select_bitmask(k, r, zero)) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. 
+/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_gf2p8affineinv_epi64_epi8( + src: __m128i, + k: __mmask16, + x: __m128i, + a: __m128i, +) -> __m128i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x16(); + let a = a.as_i8x16(); + unsafe { + let r = vgf2p8affineinvqb_128(x, a, b); + transmute(simd_select_bitmask(k, r, src.as_i8x16())) + } +} + +#[cfg(test)] +mod tests { + // The constants in the tests below are just bit patterns. They should not + // be interpreted as integers; signedness does not make sense for them, but + // __mXXXi happens to be defined in terms of signed integers. + #![allow(overflowing_literals)] + + use core::hint::black_box; + use core::intrinsics::size_of; + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + fn mulbyte(left: u8, right: u8) -> u8 { + // this implementation follows the description in + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8 + const REDUCTION_POLYNOMIAL: u16 = 0x11b; + let left: u16 = left.into(); + let right: u16 = right.into(); + let mut carryless_product: u16 = 0; + + // Carryless multiplication + for i in 0..8 { + if ((left >> i) & 0x01) != 0 { + carryless_product ^= right << i; + } + } + + // reduction, adding in "0" where appropriate to clear out high bits + // note that REDUCTION_POLYNOMIAL is zero in this context + for i in (8..=14).rev() { + if ((carryless_product >> i) & 0x01) != 0 { + carryless_product ^= REDUCTION_POLYNOMIAL << (i - 8); + } + } + + carryless_product as u8 + } + + const NUM_TEST_WORDS_512: usize = 4; + const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2; + const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2; + const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64; + const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2; + const NUM_BYTES: usize = 256; + const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16; + const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2; + const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2; + + fn parity(input: u8) -> u8 { + let mut accumulator = 0; + for i in 0..8 { + accumulator ^= (input >> i) & 0x01; + } + accumulator + } + + fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 { + // this implementation follows the description in + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8 + let mut accumulator = 0; + + for bit in 0..8 { + accumulator |= parity(x & matrix.to_le_bytes()[bit]) << (7 - bit); + } + + accumulator ^ b + } + + fn generate_affine_mul_test_data( + immediate: u8, + ) -> ( + [u64; NUM_TEST_WORDS_64], + [u8; NUM_TEST_ENTRIES], + [u8; NUM_TEST_ENTRIES], + ) { + let mut left: [u64; NUM_TEST_WORDS_64] = [0; NUM_TEST_WORDS_64]; + let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; + let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; + + for i in 0..NUM_TEST_WORDS_64 { + left[i] = (i as u64) * 103 * 101; + for j in 0..8 { + let j64 = j as u64; + right[i * 8 + j] = ((left[i] + 
j64) % 256) as u8; + result[i * 8 + j] = mat_vec_multiply_affine(left[i], right[i * 8 + j], immediate); + } + } + + (left, right, result) + } + + fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) { + let mut input: [u8; NUM_BYTES] = [0; NUM_BYTES]; + let mut result: [u8; NUM_BYTES] = [0; NUM_BYTES]; + + for i in 0..NUM_BYTES { + input[i] = (i % 256) as u8; + result[i] = if i == 0 { 0 } else { 1 }; + } + + (input, result) + } + + const AES_S_BOX: [u8; NUM_BYTES] = [ + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, + 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, + 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, + 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, + 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, + 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, + 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, + 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, + 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, + 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, + 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, + 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, + 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25, + 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, + 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, + 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, + 0x16, + ]; + + fn generate_byte_mul_test_data() -> ( + [u8; NUM_TEST_ENTRIES], + [u8; NUM_TEST_ENTRIES], + [u8; NUM_TEST_ENTRIES], + ) { + let mut left: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; + let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; + let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; + + for i in 0..NUM_TEST_ENTRIES { + left[i] = (i % 256) as u8; + right[i] = left[i].wrapping_mul(101); + result[i] = mulbyte(left[i], right[i]); + } + + (left, right, result) + } + + #[target_feature(enable = "sse2")] + #[stable(feature = "stdarch_x86_avx512", since = "1.89")] + unsafe fn load_m128i_word(data: &[T], word_index: usize) -> __m128i { + let byte_offset = word_index * 16 / size_of::(); + let pointer = data.as_ptr().add(byte_offset) as *const __m128i; + _mm_loadu_si128(black_box(pointer)) + } + + #[target_feature(enable = "avx")] + #[stable(feature = "stdarch_x86_avx512", since = "1.89")] + unsafe fn load_m256i_word(data: &[T], word_index: usize) -> __m256i { + let byte_offset = word_index * 32 / size_of::(); + let pointer = data.as_ptr().add(byte_offset) as *const __m256i; + _mm256_loadu_si256(black_box(pointer)) + } + + #[target_feature(enable = "avx512f")] + #[stable(feature = "stdarch_x86_avx512", since = "1.89")] + unsafe fn load_m512i_word(data: &[T], word_index: usize) -> __m512i { + let byte_offset = word_index * 64 / size_of::(); + let pointer = data.as_ptr().add(byte_offset) as *const _; + _mm512_loadu_si512(black_box(pointer)) + 
} + + #[simd_test(enable = "gfni,avx512f")] + unsafe fn test_mm512_gf2p8mul_epi8() { + let (left, right, expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_512 { + let left = load_m512i_word(&left, i); + let right = load_m512i_word(&right, i); + let expected = load_m512i_word(&expected, i); + let result = _mm512_gf2p8mul_epi8(left, right); + assert_eq_m512i(result, expected); + } + } + + #[simd_test(enable = "gfni,avx512bw")] + unsafe fn test_mm512_maskz_gf2p8mul_epi8() { + let (left, right, _expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_512 { + let left = load_m512i_word(&left, i); + let right = load_m512i_word(&right, i); + let result_zero = _mm512_maskz_gf2p8mul_epi8(0, left, right); + assert_eq_m512i(result_zero, _mm512_setzero_si512()); + let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; + let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; + let expected_result = _mm512_gf2p8mul_epi8(left, right); + let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right); + let expected_masked = + _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result); + assert_eq_m512i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw")] + unsafe fn test_mm512_mask_gf2p8mul_epi8() { + let (left, right, _expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_512 { + let left = load_m512i_word(&left, i); + let right = load_m512i_word(&right, i); + let result_left = _mm512_mask_gf2p8mul_epi8(left, 0, left, right); + assert_eq_m512i(result_left, left); + let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; + let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; + let expected_result = _mm512_gf2p8mul_epi8(left, right); + let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right); + let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result); + assert_eq_m512i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx")] + unsafe fn test_mm256_gf2p8mul_epi8() { + let (left, right, expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_256 { + let left = load_m256i_word(&left, i); + let right = load_m256i_word(&right, i); + let expected = load_m256i_word(&expected, i); + let result = _mm256_gf2p8mul_epi8(left, right); + assert_eq_m256i(result, expected); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_gf2p8mul_epi8() { + let (left, right, _expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_256 { + let left = load_m256i_word(&left, i); + let right = load_m256i_word(&right, i); + let result_zero = _mm256_maskz_gf2p8mul_epi8(0, left, right); + assert_eq_m256i(result_zero, _mm256_setzero_si256()); + let mask_bytes: __mmask32 = 0x0F_F0_FF_00; + const MASK_WORDS: i32 = 0b01_10_11_00; + let expected_result = _mm256_gf2p8mul_epi8(left, right); + let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right); + let expected_masked = + _mm256_blend_epi32::(_mm256_setzero_si256(), expected_result); + assert_eq_m256i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_gf2p8mul_epi8() { + let (left, right, _expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_256 { + let left = load_m256i_word(&left, i); + let right = load_m256i_word(&right, i); + let result_left = _mm256_mask_gf2p8mul_epi8(left, 0, left, right); + assert_eq_m256i(result_left, left); + 
let mask_bytes: __mmask32 = 0x0F_F0_FF_00; + const MASK_WORDS: i32 = 0b01_10_11_00; + let expected_result = _mm256_gf2p8mul_epi8(left, right); + let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right); + let expected_masked = _mm256_blend_epi32::(left, expected_result); + assert_eq_m256i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni")] + unsafe fn test_mm_gf2p8mul_epi8() { + let (left, right, expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_128 { + let left = load_m128i_word(&left, i); + let right = load_m128i_word(&right, i); + let expected = load_m128i_word(&expected, i); + let result = _mm_gf2p8mul_epi8(left, right); + assert_eq_m128i(result, expected); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_gf2p8mul_epi8() { + let (left, right, _expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_128 { + let left = load_m128i_word(&left, i); + let right = load_m128i_word(&right, i); + let result_zero = _mm_maskz_gf2p8mul_epi8(0, left, right); + assert_eq_m128i(result_zero, _mm_setzero_si128()); + let mask_bytes: __mmask16 = 0x0F_F0; + const MASK_WORDS: i32 = 0b01_10; + let expected_result = _mm_gf2p8mul_epi8(left, right); + let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right); + let expected_masked = + _mm_blend_epi32::(_mm_setzero_si128(), expected_result); + assert_eq_m128i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm_mask_gf2p8mul_epi8() { + let (left, right, _expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_128 { + let left = load_m128i_word(&left, i); + let right = load_m128i_word(&right, i); + let result_left = _mm_mask_gf2p8mul_epi8(left, 0, left, right); + assert_eq_m128i(result_left, left); + let mask_bytes: __mmask16 = 0x0F_F0; + const MASK_WORDS: i32 = 0b01_10; + let expected_result = _mm_gf2p8mul_epi8(left, right); + let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right); + let expected_masked = _mm_blend_epi32::(left, expected_result); + assert_eq_m128i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512f")] + unsafe fn test_mm512_gf2p8affine_epi64_epi8() { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + const IDENTITY_BYTE: i32 = 0; + let constant: i64 = 0; + const CONSTANT_BYTE: i32 = 0x63; + let identity = _mm512_set1_epi64(identity); + let constant = _mm512_set1_epi64(constant); + let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8); + + let (bytes, more_bytes, _) = generate_byte_mul_test_data(); + let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_512 { + let data = load_m512i_word(&bytes, i); + let result = _mm512_gf2p8affine_epi64_epi8::(data, identity); + assert_eq_m512i(result, data); + let result = _mm512_gf2p8affine_epi64_epi8::(data, constant); + assert_eq_m512i(result, constant_reference); + let data = load_m512i_word(&more_bytes, i); + let result = _mm512_gf2p8affine_epi64_epi8::(data, identity); + assert_eq_m512i(result, data); + let result = _mm512_gf2p8affine_epi64_epi8::(data, constant); + assert_eq_m512i(result, constant_reference); + + let matrix = load_m512i_word(&matrices, i); + let vector = load_m512i_word(&vectors, i); + let reference = load_m512i_word(&references, i); + + let result = _mm512_gf2p8affine_epi64_epi8::(vector, matrix); + assert_eq_m512i(result, reference); + } + } + + 
#[simd_test(enable = "gfni,avx512bw")] + unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_512 { + let matrix = load_m512i_word(&matrices, i); + let vector = load_m512i_word(&vectors, i); + let result_zero = + _mm512_maskz_gf2p8affine_epi64_epi8::(0, vector, matrix); + assert_eq_m512i(result_zero, _mm512_setzero_si512()); + let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; + let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; + let expected_result = _mm512_gf2p8affine_epi64_epi8::(vector, matrix); + let result_masked = + _mm512_maskz_gf2p8affine_epi64_epi8::(mask_bytes, vector, matrix); + let expected_masked = + _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result); + assert_eq_m512i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw")] + unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_512 { + let left = load_m512i_word(&vectors, i); + let right = load_m512i_word(&matrices, i); + let result_left = + _mm512_mask_gf2p8affine_epi64_epi8::(left, 0, left, right); + assert_eq_m512i(result_left, left); + let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; + let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; + let expected_result = _mm512_gf2p8affine_epi64_epi8::(left, right); + let result_masked = + _mm512_mask_gf2p8affine_epi64_epi8::(left, mask_bytes, left, right); + let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result); + assert_eq_m512i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx")] + unsafe fn test_mm256_gf2p8affine_epi64_epi8() { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + const IDENTITY_BYTE: i32 = 0; + let constant: i64 = 0; + const CONSTANT_BYTE: i32 = 0x63; + let identity = _mm256_set1_epi64x(identity); + let constant = _mm256_set1_epi64x(constant); + let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8); + + let (bytes, more_bytes, _) = generate_byte_mul_test_data(); + let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_256 { + let data = load_m256i_word(&bytes, i); + let result = _mm256_gf2p8affine_epi64_epi8::(data, identity); + assert_eq_m256i(result, data); + let result = _mm256_gf2p8affine_epi64_epi8::(data, constant); + assert_eq_m256i(result, constant_reference); + let data = load_m256i_word(&more_bytes, i); + let result = _mm256_gf2p8affine_epi64_epi8::(data, identity); + assert_eq_m256i(result, data); + let result = _mm256_gf2p8affine_epi64_epi8::(data, constant); + assert_eq_m256i(result, constant_reference); + + let matrix = load_m256i_word(&matrices, i); + let vector = load_m256i_word(&vectors, i); + let reference = load_m256i_word(&references, i); + + let result = _mm256_gf2p8affine_epi64_epi8::(vector, matrix); + assert_eq_m256i(result, reference); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_256 { + let matrix = load_m256i_word(&matrices, i); + let vector = load_m256i_word(&vectors, i); + let result_zero = + 
_mm256_maskz_gf2p8affine_epi64_epi8::(0, vector, matrix); + assert_eq_m256i(result_zero, _mm256_setzero_si256()); + let mask_bytes: __mmask32 = 0xFF_0F_F0_00; + const MASK_WORDS: i32 = 0b11_01_10_00; + let expected_result = _mm256_gf2p8affine_epi64_epi8::(vector, matrix); + let result_masked = + _mm256_maskz_gf2p8affine_epi64_epi8::(mask_bytes, vector, matrix); + let expected_masked = + _mm256_blend_epi32::(_mm256_setzero_si256(), expected_result); + assert_eq_m256i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_256 { + let left = load_m256i_word(&vectors, i); + let right = load_m256i_word(&matrices, i); + let result_left = + _mm256_mask_gf2p8affine_epi64_epi8::(left, 0, left, right); + assert_eq_m256i(result_left, left); + let mask_bytes: __mmask32 = 0xFF_0F_F0_00; + const MASK_WORDS: i32 = 0b11_01_10_00; + let expected_result = _mm256_gf2p8affine_epi64_epi8::(left, right); + let result_masked = + _mm256_mask_gf2p8affine_epi64_epi8::(left, mask_bytes, left, right); + let expected_masked = _mm256_blend_epi32::(left, expected_result); + assert_eq_m256i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni")] + unsafe fn test_mm_gf2p8affine_epi64_epi8() { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + const IDENTITY_BYTE: i32 = 0; + let constant: i64 = 0; + const CONSTANT_BYTE: i32 = 0x63; + let identity = _mm_set1_epi64x(identity); + let constant = _mm_set1_epi64x(constant); + let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8); + + let (bytes, more_bytes, _) = generate_byte_mul_test_data(); + let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_128 { + let data = load_m128i_word(&bytes, i); + let result = _mm_gf2p8affine_epi64_epi8::(data, identity); + assert_eq_m128i(result, data); + let result = _mm_gf2p8affine_epi64_epi8::(data, constant); + assert_eq_m128i(result, constant_reference); + let data = load_m128i_word(&more_bytes, i); + let result = _mm_gf2p8affine_epi64_epi8::(data, identity); + assert_eq_m128i(result, data); + let result = _mm_gf2p8affine_epi64_epi8::(data, constant); + assert_eq_m128i(result, constant_reference); + + let matrix = load_m128i_word(&matrices, i); + let vector = load_m128i_word(&vectors, i); + let reference = load_m128i_word(&references, i); + + let result = _mm_gf2p8affine_epi64_epi8::(vector, matrix); + assert_eq_m128i(result, reference); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_128 { + let matrix = load_m128i_word(&matrices, i); + let vector = load_m128i_word(&vectors, i); + let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::(0, vector, matrix); + assert_eq_m128i(result_zero, _mm_setzero_si128()); + let mask_bytes: __mmask16 = 0x0F_F0; + const MASK_WORDS: i32 = 0b01_10; + let expected_result = _mm_gf2p8affine_epi64_epi8::(vector, matrix); + let result_masked = + _mm_maskz_gf2p8affine_epi64_epi8::(mask_bytes, vector, matrix); + let expected_masked = + _mm_blend_epi32::(_mm_setzero_si128(), expected_result); + assert_eq_m128i(result_masked, expected_masked); + } + } + + 
#[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_128 { + let left = load_m128i_word(&vectors, i); + let right = load_m128i_word(&matrices, i); + let result_left = + _mm_mask_gf2p8affine_epi64_epi8::(left, 0, left, right); + assert_eq_m128i(result_left, left); + let mask_bytes: __mmask16 = 0x0F_F0; + const MASK_WORDS: i32 = 0b01_10; + let expected_result = _mm_gf2p8affine_epi64_epi8::(left, right); + let result_masked = + _mm_mask_gf2p8affine_epi64_epi8::(left, mask_bytes, left, right); + let expected_masked = _mm_blend_epi32::(left, expected_result); + assert_eq_m128i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512f")] + unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + const IDENTITY_BYTE: i32 = 0; + const CONSTANT_BYTE: i32 = 0x63; + let identity = _mm512_set1_epi64(identity); + + // validate inversion + let (inputs, results) = generate_inv_tests_data(); + + for i in 0..NUM_BYTES_WORDS_512 { + let input = load_m512i_word(&inputs, i); + let reference = load_m512i_word(&results, i); + let result = _mm512_gf2p8affineinv_epi64_epi8::(input, identity); + let remultiplied = _mm512_gf2p8mul_epi8(result, input); + assert_eq_m512i(remultiplied, reference); + } + + // validate subsequent affine operation + let (matrices, vectors, _affine_expected) = + generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_512 { + let vector = load_m512i_word(&vectors, i); + let matrix = load_m512i_word(&matrices, i); + + let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::(vector, identity); + let reference = _mm512_gf2p8affine_epi64_epi8::(inv_vec, matrix); + let result = _mm512_gf2p8affineinv_epi64_epi8::(vector, matrix); + assert_eq_m512i(result, reference); + } + + // validate everything by virtue of checking against the AES SBox + const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; + let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX); + + for i in 0..NUM_BYTES_WORDS_512 { + let reference = load_m512i_word(&AES_S_BOX, i); + let input = load_m512i_word(&inputs, i); + let result = _mm512_gf2p8affineinv_epi64_epi8::(input, sbox_matrix); + assert_eq_m512i(result, reference); + } + } + + #[simd_test(enable = "gfni,avx512bw")] + unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_512 { + let matrix = load_m512i_word(&matrices, i); + let vector = load_m512i_word(&vectors, i); + let result_zero = + _mm512_maskz_gf2p8affineinv_epi64_epi8::(0, vector, matrix); + assert_eq_m512i(result_zero, _mm512_setzero_si512()); + let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; + let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; + let expected_result = _mm512_gf2p8affineinv_epi64_epi8::(vector, matrix); + let result_masked = + _mm512_maskz_gf2p8affineinv_epi64_epi8::(mask_bytes, vector, matrix); + let expected_masked = + _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result); + assert_eq_m512i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw")] + unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = 
generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_512 { + let left = load_m512i_word(&vectors, i); + let right = load_m512i_word(&matrices, i); + let result_left = + _mm512_mask_gf2p8affineinv_epi64_epi8::(left, 0, left, right); + assert_eq_m512i(result_left, left); + let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; + let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; + let expected_result = _mm512_gf2p8affineinv_epi64_epi8::(left, right); + let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::( + left, mask_bytes, left, right, + ); + let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result); + assert_eq_m512i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx")] + unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + const IDENTITY_BYTE: i32 = 0; + const CONSTANT_BYTE: i32 = 0x63; + let identity = _mm256_set1_epi64x(identity); + + // validate inversion + let (inputs, results) = generate_inv_tests_data(); + + for i in 0..NUM_BYTES_WORDS_256 { + let input = load_m256i_word(&inputs, i); + let reference = load_m256i_word(&results, i); + let result = _mm256_gf2p8affineinv_epi64_epi8::(input, identity); + let remultiplied = _mm256_gf2p8mul_epi8(result, input); + assert_eq_m256i(remultiplied, reference); + } + + // validate subsequent affine operation + let (matrices, vectors, _affine_expected) = + generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_256 { + let vector = load_m256i_word(&vectors, i); + let matrix = load_m256i_word(&matrices, i); + + let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::(vector, identity); + let reference = _mm256_gf2p8affine_epi64_epi8::(inv_vec, matrix); + let result = _mm256_gf2p8affineinv_epi64_epi8::(vector, matrix); + assert_eq_m256i(result, reference); + } + + // validate everything by virtue of checking against the AES SBox + const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; + let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX); + + for i in 0..NUM_BYTES_WORDS_256 { + let reference = load_m256i_word(&AES_S_BOX, i); + let input = load_m256i_word(&inputs, i); + let result = _mm256_gf2p8affineinv_epi64_epi8::(input, sbox_matrix); + assert_eq_m256i(result, reference); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_256 { + let matrix = load_m256i_word(&matrices, i); + let vector = load_m256i_word(&vectors, i); + let result_zero = + _mm256_maskz_gf2p8affineinv_epi64_epi8::(0, vector, matrix); + assert_eq_m256i(result_zero, _mm256_setzero_si256()); + let mask_bytes: __mmask32 = 0xFF_0F_F0_00; + const MASK_WORDS: i32 = 0b11_01_10_00; + let expected_result = _mm256_gf2p8affineinv_epi64_epi8::(vector, matrix); + let result_masked = + _mm256_maskz_gf2p8affineinv_epi64_epi8::(mask_bytes, vector, matrix); + let expected_masked = + _mm256_blend_epi32::(_mm256_setzero_si256(), expected_result); + assert_eq_m256i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_256 { + let left = load_m256i_word(&vectors, i); + let 
right = load_m256i_word(&matrices, i); + let result_left = + _mm256_mask_gf2p8affineinv_epi64_epi8::(left, 0, left, right); + assert_eq_m256i(result_left, left); + let mask_bytes: __mmask32 = 0xFF_0F_F0_00; + const MASK_WORDS: i32 = 0b11_01_10_00; + let expected_result = _mm256_gf2p8affineinv_epi64_epi8::(left, right); + let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::( + left, mask_bytes, left, right, + ); + let expected_masked = _mm256_blend_epi32::(left, expected_result); + assert_eq_m256i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni")] + unsafe fn test_mm_gf2p8affineinv_epi64_epi8() { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + const IDENTITY_BYTE: i32 = 0; + const CONSTANT_BYTE: i32 = 0x63; + let identity = _mm_set1_epi64x(identity); + + // validate inversion + let (inputs, results) = generate_inv_tests_data(); + + for i in 0..NUM_BYTES_WORDS_128 { + let input = load_m128i_word(&inputs, i); + let reference = load_m128i_word(&results, i); + let result = _mm_gf2p8affineinv_epi64_epi8::(input, identity); + let remultiplied = _mm_gf2p8mul_epi8(result, input); + assert_eq_m128i(remultiplied, reference); + } + + // validate subsequent affine operation + let (matrices, vectors, _affine_expected) = + generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_128 { + let vector = load_m128i_word(&vectors, i); + let matrix = load_m128i_word(&matrices, i); + + let inv_vec = _mm_gf2p8affineinv_epi64_epi8::(vector, identity); + let reference = _mm_gf2p8affine_epi64_epi8::(inv_vec, matrix); + let result = _mm_gf2p8affineinv_epi64_epi8::(vector, matrix); + assert_eq_m128i(result, reference); + } + + // validate everything by virtue of checking against the AES SBox + const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; + let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX); + + for i in 0..NUM_BYTES_WORDS_128 { + let reference = load_m128i_word(&AES_S_BOX, i); + let input = load_m128i_word(&inputs, i); + let result = _mm_gf2p8affineinv_epi64_epi8::(input, sbox_matrix); + assert_eq_m128i(result, reference); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_128 { + let matrix = load_m128i_word(&matrices, i); + let vector = load_m128i_word(&vectors, i); + let result_zero = + _mm_maskz_gf2p8affineinv_epi64_epi8::(0, vector, matrix); + assert_eq_m128i(result_zero, _mm_setzero_si128()); + let mask_bytes: __mmask16 = 0x0F_F0; + const MASK_WORDS: i32 = 0b01_10; + let expected_result = _mm_gf2p8affineinv_epi64_epi8::(vector, matrix); + let result_masked = + _mm_maskz_gf2p8affineinv_epi64_epi8::(mask_bytes, vector, matrix); + let expected_masked = + _mm_blend_epi32::(_mm_setzero_si128(), expected_result); + assert_eq_m128i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_128 { + let left = load_m128i_word(&vectors, i); + let right = load_m128i_word(&matrices, i); + let result_left = + _mm_mask_gf2p8affineinv_epi64_epi8::(left, 0, left, right); + assert_eq_m128i(result_left, left); + let mask_bytes: __mmask16 = 0x0F_F0; + const MASK_WORDS: i32 = 0b01_10; + let 
expected_result = _mm_gf2p8affineinv_epi64_epi8::(left, right); + let result_masked = + _mm_mask_gf2p8affineinv_epi64_epi8::(left, mask_bytes, left, right); + let expected_masked = _mm_blend_epi32::(left, expected_result); + assert_eq_m128i(result_masked, expected_masked); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/kl.rs b/library/stdarch/crates/core_arch/src/x86/kl.rs new file mode 100644 index 0000000000000..eb9eb83f4115c --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/kl.rs @@ -0,0 +1,526 @@ +//! AES Key Locker Intrinsics +//! +//! The Intrinsics here correspond to those in the `keylockerintrin.h` C header. + +use crate::core_arch::x86::__m128i; +use crate::ptr; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[repr(C, packed)] +struct EncodeKey128Output(u32, __m128i, __m128i, __m128i, __m128i, __m128i, __m128i); + +#[repr(C, packed)] +struct EncodeKey256Output( + u32, + __m128i, + __m128i, + __m128i, + __m128i, + __m128i, + __m128i, + __m128i, +); + +#[repr(C, packed)] +struct AesOutput(u8, __m128i); + +#[repr(C, packed)] +struct WideAesOutput( + u8, + __m128i, + __m128i, + __m128i, + __m128i, + __m128i, + __m128i, + __m128i, + __m128i, +); + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.x86.loadiwkey"] + fn loadiwkey(integrity_key: __m128i, key_lo: __m128i, key_hi: __m128i, control: u32); + + #[link_name = "llvm.x86.encodekey128"] + fn encodekey128(key_metadata: u32, key: __m128i) -> EncodeKey128Output; + #[link_name = "llvm.x86.encodekey256"] + fn encodekey256(key_metadata: u32, key_lo: __m128i, key_hi: __m128i) -> EncodeKey256Output; + + #[link_name = "llvm.x86.aesenc128kl"] + fn aesenc128kl(data: __m128i, handle: *const u8) -> AesOutput; + #[link_name = "llvm.x86.aesdec128kl"] + fn aesdec128kl(data: __m128i, handle: *const u8) -> AesOutput; + #[link_name = "llvm.x86.aesenc256kl"] + fn aesenc256kl(data: __m128i, handle: *const u8) -> AesOutput; + #[link_name = "llvm.x86.aesdec256kl"] + fn aesdec256kl(data: __m128i, handle: *const u8) -> AesOutput; + + #[link_name = "llvm.x86.aesencwide128kl"] + fn aesencwide128kl( + handle: *const u8, + i0: __m128i, + i1: __m128i, + i2: __m128i, + i3: __m128i, + i4: __m128i, + i5: __m128i, + i6: __m128i, + i7: __m128i, + ) -> WideAesOutput; + #[link_name = "llvm.x86.aesdecwide128kl"] + fn aesdecwide128kl( + handle: *const u8, + i0: __m128i, + i1: __m128i, + i2: __m128i, + i3: __m128i, + i4: __m128i, + i5: __m128i, + i6: __m128i, + i7: __m128i, + ) -> WideAesOutput; + #[link_name = "llvm.x86.aesencwide256kl"] + fn aesencwide256kl( + handle: *const u8, + i0: __m128i, + i1: __m128i, + i2: __m128i, + i3: __m128i, + i4: __m128i, + i5: __m128i, + i6: __m128i, + i7: __m128i, + ) -> WideAesOutput; + #[link_name = "llvm.x86.aesdecwide256kl"] + fn aesdecwide256kl( + handle: *const u8, + i0: __m128i, + i1: __m128i, + i2: __m128i, + i3: __m128i, + i4: __m128i, + i5: __m128i, + i6: __m128i, + i7: __m128i, + ) -> WideAesOutput; +} + +/// Load internal wrapping key (IWKey). The 32-bit unsigned integer `control` specifies IWKey's KeySource +/// and whether backing up the key is permitted. IWKey's 256-bit encryption key is loaded from `key_lo` +/// and `key_hi`. +/// +/// - `control[0]`: NoBackup bit. If set, the IWKey cannot be backed up. +/// - `control[1:4]`: KeySource bits. These bits specify the encoding method of the IWKey. 
The only +/// allowed values are `0` (AES GCM SIV wrapping algorithm with the specified key) and `1` (AES GCM +/// SIV wrapping algorithm with random keys enforced by hardware). After calling `_mm_loadiwkey` with +/// KeySource set to `1`, software must check `ZF` to ensure that the key was loaded successfully. +/// Using any other value may result in a General Protection Exception. +/// - `control[5:31]`: Reserved for future use, must be set to `0`. +/// +/// Note that setting the NoBackup bit and using the KeySource value `1` requires hardware support. These +/// permissions can be found by calling `__cpuid(0x19)` and checking the `ECX[0:1]` bits. Failing to follow +/// these restrictions may result in a General Protection Exception. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadiwkey) +#[inline] +#[target_feature(enable = "kl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(loadiwkey))] +pub unsafe fn _mm_loadiwkey( + control: u32, + integrity_key: __m128i, + key_lo: __m128i, + key_hi: __m128i, +) { + loadiwkey(integrity_key, key_lo, key_hi, control); +} + +/// Wrap a 128-bit AES key into a 384-bit key handle and stores it in `handle`. Returns the `control` +/// parameter used to create the IWKey. +/// +/// - `key_params[0]`: If set, this key can only be used by the Kernel. +/// - `key_params[1]`: If set, this key can not be used to encrypt. +/// - `key_params[2]`: If set, this key can not be used to decrypt. +/// - `key_params[31:3]`: Reserved for future use, must be set to `0`. +/// +/// Note that these restrictions need hardware support, and the supported restrictions can be found by +/// calling `__cpuid(0x19)` and checking the `EAX[0:2]` bits. Failing to follow these restrictions may +/// result in a General Protection Exception. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_encodekey128_u32) +#[inline] +#[target_feature(enable = "kl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(encodekey128))] +pub unsafe fn _mm_encodekey128_u32(key_params: u32, key: __m128i, handle: *mut u8) -> u32 { + let EncodeKey128Output(control, key0, key1, key2, _, _, _) = encodekey128(key_params, key); + ptr::write_unaligned(handle.cast(), [key0, key1, key2]); + control +} + +/// Wrap a 256-bit AES key into a 512-bit key handle and stores it in `handle`. Returns the `control` +/// parameter used to create the IWKey. +/// +/// - `key_params[0]`: If set, this key can only be used by the Kernel. +/// - `key_params[1]`: If set, this key can not be used to encrypt. +/// - `key_params[2]`: If set, this key can not be used to decrypt. +/// - `key_params[31:3]`: Reserved for future use, must be set to `0`. +/// +/// Note that these restrictions need hardware support, and the supported restrictions can be found by +/// calling `__cpuid(0x19)` and checking the `EAX[0:2]` bits. Failing to follow these restrictions may +/// result in a General Protection Exception. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_encodekey256_u32) +#[inline] +#[target_feature(enable = "kl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(encodekey256))] +pub unsafe fn _mm_encodekey256_u32( + key_params: u32, + key_lo: __m128i, + key_hi: __m128i, + handle: *mut u8, +) -> u32 { + let EncodeKey256Output(control, key0, key1, key2, key3, _, _, _) = + encodekey256(key_params, key_lo, key_hi); + ptr::write_unaligned(handle.cast(), [key0, key1, key2, key3]); + control +} + +/// Encrypt 10 rounds of unsigned 8-bit integers in `input` using 128-bit AES key specified in the +/// 384-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc128kl_u8) +#[inline] +#[target_feature(enable = "kl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesenc128kl))] +pub unsafe fn _mm_aesenc128kl_u8(output: *mut __m128i, input: __m128i, handle: *const u8) -> u8 { + let AesOutput(status, result) = aesenc128kl(input, handle); + *output = result; + status +} + +/// Decrypt 10 rounds of unsigned 8-bit integers in `input` using 128-bit AES key specified in the +/// 384-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec128kl_u8) +#[inline] +#[target_feature(enable = "kl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesdec128kl))] +pub unsafe fn _mm_aesdec128kl_u8(output: *mut __m128i, input: __m128i, handle: *const u8) -> u8 { + let AesOutput(status, result) = aesdec128kl(input, handle); + *output = result; + status +} + +/// Encrypt 14 rounds of unsigned 8-bit integers in `input` using 256-bit AES key specified in the +/// 512-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc256kl_u8) +#[inline] +#[target_feature(enable = "kl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesenc256kl))] +pub unsafe fn _mm_aesenc256kl_u8(output: *mut __m128i, input: __m128i, handle: *const u8) -> u8 { + let AesOutput(status, result) = aesenc256kl(input, handle); + *output = result; + status +} + +/// Decrypt 14 rounds of unsigned 8-bit integers in `input` using 256-bit AES key specified in the +/// 512-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. 
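A sketch (editorial) chaining the single-block intrinsics above: encrypt one block with a wrapped 128-bit key, then decrypt it again, treating a non-zero status byte as a handle violation. The helper name is illustrative, and `handle` is assumed to come from `_mm_encodekey128_u32`.

```rust
use std::arch::x86_64::*; // x86_64 assumed for this sketch

#[target_feature(enable = "kl")]
unsafe fn roundtrip_one_block(block: __m128i, handle: &[u8; 48]) -> Option<__m128i> {
    let mut ciphertext = _mm_setzero_si128();
    if _mm_aesenc128kl_u8(&mut ciphertext, block, handle.as_ptr()) != 0 {
        return None; // status 1: handle violation
    }
    let mut plaintext = _mm_setzero_si128();
    if _mm_aesdec128kl_u8(&mut plaintext, ciphertext, handle.as_ptr()) != 0 {
        return None;
    }
    Some(plaintext) // equals `block` when both calls succeed
}
```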
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec256kl_u8) +#[inline] +#[target_feature(enable = "kl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesdec256kl))] +pub unsafe fn _mm_aesdec256kl_u8(output: *mut __m128i, input: __m128i, handle: *const u8) -> u8 { + let AesOutput(status, result) = aesdec256kl(input, handle); + *output = result; + status +} + +/// Encrypt 10 rounds of 8 groups of unsigned 8-bit integers in `input` using 128-bit AES key specified +/// in the 384-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesencwide128kl_u8) +#[inline] +#[target_feature(enable = "widekl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesencwide128kl))] +pub unsafe fn _mm_aesencwide128kl_u8( + output: *mut __m128i, + input: *const __m128i, + handle: *const u8, +) -> u8 { + let input = &*ptr::slice_from_raw_parts(input, 8); + let WideAesOutput(status, out0, out1, out2, out3, out4, out5, out6, out7) = aesencwide128kl( + handle, input[0], input[1], input[2], input[3], input[4], input[5], input[6], input[7], + ); + *output.cast() = [out0, out1, out2, out3, out4, out5, out6, out7]; + status +} + +/// Decrypt 10 rounds of 8 groups of unsigned 8-bit integers in `input` using 128-bit AES key specified +/// in the 384-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdecwide128kl_u8) +#[inline] +#[target_feature(enable = "widekl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesdecwide128kl))] +pub unsafe fn _mm_aesdecwide128kl_u8( + output: *mut __m128i, + input: *const __m128i, + handle: *const u8, +) -> u8 { + let input = &*ptr::slice_from_raw_parts(input, 8); + let WideAesOutput(status, out0, out1, out2, out3, out4, out5, out6, out7) = aesdecwide128kl( + handle, input[0], input[1], input[2], input[3], input[4], input[5], input[6], input[7], + ); + *output.cast() = [out0, out1, out2, out3, out4, out5, out6, out7]; + status +} + +/// Encrypt 14 rounds of 8 groups of unsigned 8-bit integers in `input` using 256-bit AES key specified +/// in the 512-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. 
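A sketch (editorial) of the wide variants, which process eight blocks per call. It reuses the in-place input/output pattern that the tests at the end of this file also use; the helper name is illustrative and the handle is assumed to come from `_mm_encodekey128_u32`.

```rust
use std::arch::x86_64::*; // x86_64 assumed for this sketch

/// Encrypts eight 128-bit blocks in place; returns the status byte (0 = success).
#[target_feature(enable = "widekl")]
unsafe fn encrypt_eight_blocks(blocks: &mut [__m128i; 8], handle: &[u8; 48]) -> u8 {
    // All eight inputs are read before any output is written, so input and
    // output may alias the same buffer.
    _mm_aesencwide128kl_u8(blocks.as_mut_ptr(), blocks.as_ptr(), handle.as_ptr())
}
```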
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesencwide256kl_u8) +#[inline] +#[target_feature(enable = "widekl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesencwide256kl))] +pub unsafe fn _mm_aesencwide256kl_u8( + output: *mut __m128i, + input: *const __m128i, + handle: *const u8, +) -> u8 { + let input = &*ptr::slice_from_raw_parts(input, 8); + let WideAesOutput(status, out0, out1, out2, out3, out4, out5, out6, out7) = aesencwide256kl( + handle, input[0], input[1], input[2], input[3], input[4], input[5], input[6], input[7], + ); + *output.cast() = [out0, out1, out2, out3, out4, out5, out6, out7]; + status +} + +/// Decrypt 14 rounds of 8 groups of unsigned 8-bit integers in `input` using 256-bit AES key specified +/// in the 512-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdecwide256kl_u8) +#[inline] +#[target_feature(enable = "widekl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesdecwide256kl))] +pub unsafe fn _mm_aesdecwide256kl_u8( + output: *mut __m128i, + input: *const __m128i, + handle: *const u8, +) -> u8 { + let input = &*ptr::slice_from_raw_parts(input, 8); + let WideAesOutput(status, out0, out1, out2, out3, out4, out5, out6, out7) = aesdecwide256kl( + handle, input[0], input[1], input[2], input[3], input[4], input[5], input[6], input[7], + ); + *output.cast() = [out0, out1, out2, out3, out4, out5, out6, out7]; + status +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use stdarch_test::simd_test; + + #[target_feature(enable = "kl")] + unsafe fn encodekey128() -> [u8; 48] { + let mut handle = [0; 48]; + let _ = _mm_encodekey128_u32(0, _mm_setzero_si128(), handle.as_mut_ptr()); + handle + } + + #[target_feature(enable = "kl")] + unsafe fn encodekey256() -> [u8; 64] { + let mut handle = [0; 64]; + let _ = _mm_encodekey256_u32( + 0, + _mm_setzero_si128(), + _mm_setzero_si128(), + handle.as_mut_ptr(), + ); + handle + } + + #[simd_test(enable = "kl")] + unsafe fn test_mm_encodekey128_u32() { + encodekey128(); + } + + #[simd_test(enable = "kl")] + unsafe fn test_mm_encodekey256_u32() { + encodekey256(); + } + + #[simd_test(enable = "kl")] + unsafe fn test_mm_aesenc128kl_u8() { + let mut buffer = _mm_setzero_si128(); + let key = encodekey128(); + + for _ in 0..100 { + let status = _mm_aesenc128kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesdec128kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + + assert_eq_m128i(buffer, _mm_setzero_si128()); + } + + #[simd_test(enable = "kl")] + unsafe fn test_mm_aesdec128kl_u8() { + let mut buffer = _mm_setzero_si128(); + let key = encodekey128(); + + for _ in 0..100 { + let status = _mm_aesdec128kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesenc128kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + + assert_eq_m128i(buffer, _mm_setzero_si128()); + } + + #[simd_test(enable = "kl")] + unsafe fn test_mm_aesenc256kl_u8() { + let mut buffer = _mm_setzero_si128(); + let key = 
encodekey256(); + + for _ in 0..100 { + let status = _mm_aesenc256kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesdec256kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + + assert_eq_m128i(buffer, _mm_setzero_si128()); + } + + #[simd_test(enable = "kl")] + unsafe fn test_mm_aesdec256kl_u8() { + let mut buffer = _mm_setzero_si128(); + let key = encodekey256(); + + for _ in 0..100 { + let status = _mm_aesdec256kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesenc256kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + + assert_eq_m128i(buffer, _mm_setzero_si128()); + } + + #[simd_test(enable = "widekl")] + unsafe fn test_mm_aesencwide128kl_u8() { + let mut buffer = [_mm_setzero_si128(); 8]; + let key = encodekey128(); + + for _ in 0..100 { + let status = _mm_aesencwide128kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesdecwide128kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + + for elem in buffer { + assert_eq_m128i(elem, _mm_setzero_si128()); + } + } + + #[simd_test(enable = "widekl")] + unsafe fn test_mm_aesdecwide128kl_u8() { + let mut buffer = [_mm_setzero_si128(); 8]; + let key = encodekey128(); + + for _ in 0..100 { + let status = _mm_aesdecwide128kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesencwide128kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + + for elem in buffer { + assert_eq_m128i(elem, _mm_setzero_si128()); + } + } + + #[simd_test(enable = "widekl")] + unsafe fn test_mm_aesencwide256kl_u8() { + let mut buffer = [_mm_setzero_si128(); 8]; + let key = encodekey256(); + + for _ in 0..100 { + let status = _mm_aesencwide256kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesdecwide256kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + + for elem in buffer { + assert_eq_m128i(elem, _mm_setzero_si128()); + } + } + + #[simd_test(enable = "widekl")] + unsafe fn test_mm_aesdecwide256kl_u8() { + let mut buffer = [_mm_setzero_si128(); 8]; + let key = encodekey256(); + + for _ in 0..100 { + let status = _mm_aesdecwide256kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesencwide256kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + + for elem in buffer { + assert_eq_m128i(elem, _mm_setzero_si128()); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/macros.rs b/library/stdarch/crates/core_arch/src/x86/macros.rs new file mode 100644 index 0000000000000..9b9c24a447ec7 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/macros.rs @@ -0,0 +1,98 @@ +//! Utility macros. + +// Helper macro used to trigger const eval errors when the const generic immediate value `imm` is +// not a round number. +#[allow(unused)] +macro_rules! static_assert_rounding { + ($imm:ident) => { + static_assert!( + $imm == 4 || $imm == 8 || $imm == 9 || $imm == 10 || $imm == 11, + "Invalid IMM value" + ) + }; +} + +// Helper macro used to trigger const eval errors when the const generic immediate value `imm` is +// not a sae number. 
+#[allow(unused)] +macro_rules! static_assert_sae { + ($imm:ident) => { + static_assert!($imm == 4 || $imm == 8, "Invalid IMM value") + }; +} + +// Helper macro used to trigger const eval errors when the const generic immediate value `imm` is +// not an extended rounding number +#[allow(unused)] +macro_rules! static_assert_extended_rounding { + ($imm: ident) => { + static_assert!(($imm & 7) < 5 && ($imm & !15) == 0, "Invalid IMM value") + }; +} + +// Helper macro used to trigger const eval errors when the const generic immediate value `imm` is +// not a mantissas sae number. +#[allow(unused)] +macro_rules! static_assert_mantissas_sae { + ($imm:ident) => { + static_assert!($imm == 4 || $imm == 8 || $imm == 12, "Invalid IMM value") + }; +} + +// Helper macro used to trigger const eval errors when the const generic immediate value `SCALE` is +// not valid for gather instructions: the only valid scale values are 1, 2, 4 and 8. +#[allow(unused)] +macro_rules! static_assert_imm8_scale { + ($imm:ident) => { + static_assert!( + $imm == 1 || $imm == 2 || $imm == 4 || $imm == 8, + "Invalid SCALE value" + ) + }; +} + +#[cfg(test)] +macro_rules! assert_approx_eq { + ($a:expr, $b:expr, $eps:expr) => {{ + let (a, b) = (&$a, &$b); + assert!( + (*a - *b).abs() < $eps, + "assertion failed: `(left !== right)` \ + (left: `{:?}`, right: `{:?}`, expect diff: `{:?}`, real diff: `{:?}`)", + *a, + *b, + $eps, + (*a - *b).abs() + ); + }}; +} + +// x86-32 wants to use a 32-bit address size, but asm! defaults to using the full +// register name (e.g. rax). We have to explicitly override the placeholder to +// use the 32-bit register name in that case. + +#[cfg(target_pointer_width = "32")] +macro_rules! vpl { + ($inst:expr) => { + concat!($inst, ", [{p:e}]") + }; +} +#[cfg(target_pointer_width = "64")] +macro_rules! vpl { + ($inst:expr) => { + concat!($inst, ", [{p}]") + }; +} + +#[cfg(target_pointer_width = "32")] +macro_rules! vps { + ($inst1:expr, $inst2:expr) => { + concat!($inst1, " [{p:e}]", $inst2) + }; +} +#[cfg(target_pointer_width = "64")] +macro_rules! vps { + ($inst1:expr, $inst2:expr) => { + concat!($inst1, " [{p}]", $inst2) + }; +} diff --git a/library/stdarch/crates/core_arch/src/x86/mod.rs b/library/stdarch/crates/core_arch/src/x86/mod.rs new file mode 100644 index 0000000000000..8897258c7dc24 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/mod.rs @@ -0,0 +1,776 @@ +//! `x86` and `x86_64` intrinsics. + +use crate::mem::transmute; + +#[macro_use] +mod macros; + +types! { + #![stable(feature = "simd_x86", since = "1.27.0")] + + /// 128-bit wide integer vector type, x86-specific + /// + /// This type is the same as the `__m128i` type defined by Intel, + /// representing a 128-bit SIMD register. Usage of this type typically + /// corresponds to the `sse` and up target features for x86/x86_64. + /// + /// Internally this type may be viewed as: + /// + /// * `i8x16` - sixteen `i8` variables packed together + /// * `i16x8` - eight `i16` variables packed together + /// * `i32x4` - four `i32` variables packed together + /// * `i64x2` - two `i64` variables packed together + /// + /// (as well as unsigned versions). Each intrinsic may interpret the + /// internal bits differently, check the documentation of the intrinsic + /// to see how it's being used. + /// + /// The in-memory representation of this type is the same as the one of an + /// equivalent array (i.e. 
the in-memory order of elements is the same, and + /// there is no padding); however, the alignment is different and equal to + /// the size of the type. Note that the ABI for function calls may *not* be + /// the same. + /// + /// Note that this means that an instance of `__m128i` typically just means + /// a "bag of bits" which is left up to interpretation at the point of use. + /// + /// Most intrinsics using `__m128i` are prefixed with `_mm_` and the + /// integer types tend to correspond to suffixes like "epi8" or "epi32". + /// + /// # Examples + /// + /// ``` + /// #[cfg(target_arch = "x86")] + /// use std::arch::x86::*; + /// #[cfg(target_arch = "x86_64")] + /// use std::arch::x86_64::*; + /// + /// # fn main() { + /// # #[target_feature(enable = "sse2")] + /// # #[allow(unused_unsafe)] // temporary, to unstick CI + /// # unsafe fn foo() { unsafe { + /// let all_bytes_zero = _mm_setzero_si128(); + /// let all_bytes_one = _mm_set1_epi8(1); + /// let four_i32 = _mm_set_epi32(1, 2, 3, 4); + /// # }} + /// # if is_x86_feature_detected!("sse2") { unsafe { foo() } } + /// # } + /// ``` + pub struct __m128i(2 x i64); + + /// 128-bit wide set of four `f32` types, x86-specific + /// + /// This type is the same as the `__m128` type defined by Intel, + /// representing a 128-bit SIMD register which internally is consisted of + /// four packed `f32` instances. Usage of this type typically corresponds + /// to the `sse` and up target features for x86/x86_64. + /// + /// Note that unlike `__m128i`, the integer version of the 128-bit + /// registers, this `__m128` type has *one* interpretation. Each instance + /// of `__m128` always corresponds to `f32x4`, or four `f32` types packed + /// together. + /// + /// The in-memory representation of this type is the same as the one of an + /// equivalent array (i.e. the in-memory order of elements is the same, and + /// there is no padding); however, the alignment is different and equal to + /// the size of the type. Note that the ABI for function calls may *not* be + /// the same. + /// + /// Most intrinsics using `__m128` are prefixed with `_mm_` and are + /// suffixed with "ps" (or otherwise contain "ps"). Not to be confused with + /// "pd" which is used for `__m128d`. + /// + /// # Examples + /// + /// ``` + /// #[cfg(target_arch = "x86")] + /// use std::arch::x86::*; + /// #[cfg(target_arch = "x86_64")] + /// use std::arch::x86_64::*; + /// + /// # fn main() { + /// # #[target_feature(enable = "sse")] + /// # #[allow(unused_unsafe)] // temporary, to unstick CI + /// # unsafe fn foo() { unsafe { + /// let four_zeros = _mm_setzero_ps(); + /// let four_ones = _mm_set1_ps(1.0); + /// let four_floats = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + /// # }} + /// # if is_x86_feature_detected!("sse") { unsafe { foo() } } + /// # } + /// ``` + pub struct __m128(4 x f32); + + /// 128-bit wide set of two `f64` types, x86-specific + /// + /// This type is the same as the `__m128d` type defined by Intel, + /// representing a 128-bit SIMD register which internally is consisted of + /// two packed `f64` instances. Usage of this type typically corresponds + /// to the `sse` and up target features for x86/x86_64. + /// + /// Note that unlike `__m128i`, the integer version of the 128-bit + /// registers, this `__m128d` type has *one* interpretation. Each instance + /// of `__m128d` always corresponds to `f64x2`, or two `f64` types packed + /// together. + /// + /// The in-memory representation of this type is the same as the one of an + /// equivalent array (i.e. 
the in-memory order of elements is the same, and + /// there is no padding); however, the alignment is different and equal to + /// the size of the type. Note that the ABI for function calls may *not* be + /// the same. + /// + /// Most intrinsics using `__m128d` are prefixed with `_mm_` and are + /// suffixed with "pd" (or otherwise contain "pd"). Not to be confused with + /// "ps" which is used for `__m128`. + /// + /// # Examples + /// + /// ``` + /// #[cfg(target_arch = "x86")] + /// use std::arch::x86::*; + /// #[cfg(target_arch = "x86_64")] + /// use std::arch::x86_64::*; + /// + /// # fn main() { + /// # #[target_feature(enable = "sse2")] + /// # #[allow(unused_unsafe)] // temporary, to unstick CI + /// # unsafe fn foo() { unsafe { + /// let two_zeros = _mm_setzero_pd(); + /// let two_ones = _mm_set1_pd(1.0); + /// let two_floats = _mm_set_pd(1.0, 2.0); + /// # }} + /// # if is_x86_feature_detected!("sse2") { unsafe { foo() } } + /// # } + /// ``` + pub struct __m128d(2 x f64); + + /// 256-bit wide integer vector type, x86-specific + /// + /// This type is the same as the `__m256i` type defined by Intel, + /// representing a 256-bit SIMD register. Usage of this type typically + /// corresponds to the `avx` and up target features for x86/x86_64. + /// + /// Internally this type may be viewed as: + /// + /// * `i8x32` - thirty two `i8` variables packed together + /// * `i16x16` - sixteen `i16` variables packed together + /// * `i32x8` - eight `i32` variables packed together + /// * `i64x4` - four `i64` variables packed together + /// + /// (as well as unsigned versions). Each intrinsic may interpret the + /// internal bits differently, check the documentation of the intrinsic + /// to see how it's being used. + /// + /// The in-memory representation of this type is the same as the one of an + /// equivalent array (i.e. the in-memory order of elements is the same, and + /// there is no padding); however, the alignment is different and equal to + /// the size of the type. Note that the ABI for function calls may *not* be + /// the same. + /// + /// Note that this means that an instance of `__m256i` typically just means + /// a "bag of bits" which is left up to interpretation at the point of use. + /// + /// # Examples + /// + /// ``` + /// #[cfg(target_arch = "x86")] + /// use std::arch::x86::*; + /// #[cfg(target_arch = "x86_64")] + /// use std::arch::x86_64::*; + /// + /// # fn main() { + /// # #[target_feature(enable = "avx")] + /// # #[allow(unused_unsafe)] // temporary, to unstick CI + /// # unsafe fn foo() { unsafe { + /// let all_bytes_zero = _mm256_setzero_si256(); + /// let all_bytes_one = _mm256_set1_epi8(1); + /// let eight_i32 = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + /// # }} + /// # if is_x86_feature_detected!("avx") { unsafe { foo() } } + /// # } + /// ``` + pub struct __m256i(4 x i64); + + /// 256-bit wide set of eight `f32` types, x86-specific + /// + /// This type is the same as the `__m256` type defined by Intel, + /// representing a 256-bit SIMD register which internally is consisted of + /// eight packed `f32` instances. Usage of this type typically corresponds + /// to the `avx` and up target features for x86/x86_64. + /// + /// Note that unlike `__m256i`, the integer version of the 256-bit + /// registers, this `__m256` type has *one* interpretation. Each instance + /// of `__m256` always corresponds to `f32x8`, or eight `f32` types packed + /// together. 
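To make the "bag of bits" remark above concrete, a small sketch (editorial, not from the patch): the same `__m128i` value is consumed at whatever lane width the chosen intrinsic implies.

```rust
use std::arch::x86_64::*; // x86_64 assumed for this sketch

#[target_feature(enable = "sse2")]
unsafe fn bag_of_bits() {
    // Built as four i32 lanes, each 0x01010101 ...
    let v = _mm_set1_epi32(0x0101_0101);
    // ... then treated as sixteen i8 lanes by a byte-wise add ...
    let doubled = _mm_add_epi8(v, v);
    // ... and read back through the low 32-bit lane: every byte became 0x02.
    assert_eq!(_mm_cvtsi128_si32(doubled), 0x0202_0202);
}
```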
+ /// + /// The in-memory representation of this type is the same as the one of an + /// equivalent array (i.e. the in-memory order of elements is the same, and + /// there is no padding between two consecutive elements); however, the + /// alignment is different and equal to the size of the type. Note that the + /// ABI for function calls may *not* be the same. + /// + /// Most intrinsics using `__m256` are prefixed with `_mm256_` and are + /// suffixed with "ps" (or otherwise contain "ps"). Not to be confused with + /// "pd" which is used for `__m256d`. + /// + /// # Examples + /// + /// ``` + /// #[cfg(target_arch = "x86")] + /// use std::arch::x86::*; + /// #[cfg(target_arch = "x86_64")] + /// use std::arch::x86_64::*; + /// + /// # fn main() { + /// # #[target_feature(enable = "avx")] + /// # #[allow(unused_unsafe)] // temporary, to unstick CI + /// # unsafe fn foo() { unsafe { + /// let eight_zeros = _mm256_setzero_ps(); + /// let eight_ones = _mm256_set1_ps(1.0); + /// let eight_floats = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + /// # }} + /// # if is_x86_feature_detected!("avx") { unsafe { foo() } } + /// # } + /// ``` + pub struct __m256(8 x f32); + + /// 256-bit wide set of four `f64` types, x86-specific + /// + /// This type is the same as the `__m256d` type defined by Intel, + /// representing a 256-bit SIMD register which internally is consisted of + /// four packed `f64` instances. Usage of this type typically corresponds + /// to the `avx` and up target features for x86/x86_64. + /// + /// Note that unlike `__m256i`, the integer version of the 256-bit + /// registers, this `__m256d` type has *one* interpretation. Each instance + /// of `__m256d` always corresponds to `f64x4`, or four `f64` types packed + /// together. + /// + /// The in-memory representation of this type is the same as the one of an + /// equivalent array (i.e. the in-memory order of elements is the same, and + /// there is no padding); however, the alignment is different and equal to + /// the size of the type. Note that the ABI for function calls may *not* be + /// the same. + /// + /// Most intrinsics using `__m256d` are prefixed with `_mm256_` and are + /// suffixed with "pd" (or otherwise contain "pd"). Not to be confused with + /// "ps" which is used for `__m256`. + /// + /// # Examples + /// + /// ``` + /// #[cfg(target_arch = "x86")] + /// use std::arch::x86::*; + /// #[cfg(target_arch = "x86_64")] + /// use std::arch::x86_64::*; + /// + /// # fn main() { + /// # #[target_feature(enable = "avx")] + /// # #[allow(unused_unsafe)] // temporary, to unstick CI + /// # unsafe fn foo() { unsafe { + /// let four_zeros = _mm256_setzero_pd(); + /// let four_ones = _mm256_set1_pd(1.0); + /// let four_floats = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); + /// # }} + /// # if is_x86_feature_detected!("avx") { unsafe { foo() } } + /// # } + /// ``` + pub struct __m256d(4 x f64); +} + +types! { + #![stable(feature = "simd_avx512_types", since = "1.72.0")] + + /// 512-bit wide integer vector type, x86-specific + /// + /// This type is the same as the `__m512i` type defined by Intel, + /// representing a 512-bit SIMD register. Usage of this type typically + /// corresponds to the `avx512*` and up target features for x86/x86_64. 
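The AVX-512 types introduced here carry no inline examples the way the 128/256-bit types above do; as a hedged, editorial sketch, the same detect-then-dispatch pattern applies to them as well.

```rust
use std::arch::x86_64::*; // x86_64 assumed for this sketch

#[target_feature(enable = "avx512f")]
unsafe fn splat_sixteen(x: i32) -> __m512i {
    // A __m512i viewed as sixteen i32 lanes, all equal to `x`.
    _mm512_set1_epi32(x)
}

fn demo() {
    // Runtime dispatch, mirroring the doc examples for the smaller types.
    if is_x86_feature_detected!("avx512f") {
        let _v = unsafe { splat_sixteen(7) };
    }
}
```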
+ /// + /// Internally this type may be viewed as: + /// + /// * `i8x64` - sixty-four `i8` variables packed together + /// * `i16x32` - thirty-two `i16` variables packed together + /// * `i32x16` - sixteen `i32` variables packed together + /// * `i64x8` - eight `i64` variables packed together + /// + /// (as well as unsigned versions). Each intrinsic may interpret the + /// internal bits differently, check the documentation of the intrinsic + /// to see how it's being used. + /// + /// The in-memory representation of this type is the same as the one of an + /// equivalent array (i.e. the in-memory order of elements is the same, and + /// there is no padding); however, the alignment is different and equal to + /// the size of the type. Note that the ABI for function calls may *not* be + /// the same. + /// + /// Note that this means that an instance of `__m512i` typically just means + /// a "bag of bits" which is left up to interpretation at the point of use. + pub struct __m512i(8 x i64); + + /// 512-bit wide set of sixteen `f32` types, x86-specific + /// + /// This type is the same as the `__m512` type defined by Intel, + /// representing a 512-bit SIMD register which internally is consisted of + /// eight packed `f32` instances. Usage of this type typically corresponds + /// to the `avx512*` and up target features for x86/x86_64. + /// + /// Note that unlike `__m512i`, the integer version of the 512-bit + /// registers, this `__m512` type has *one* interpretation. Each instance + /// of `__m512` always corresponds to `f32x16`, or sixteen `f32` types + /// packed together. + /// + /// The in-memory representation of this type is the same as the one of an + /// equivalent array (i.e. the in-memory order of elements is the same, and + /// there is no padding between two consecutive elements); however, the + /// alignment is different and equal to the size of the type. Note that the + /// ABI for function calls may *not* be the same. + /// + /// Most intrinsics using `__m512` are prefixed with `_mm512_` and are + /// suffixed with "ps" (or otherwise contain "ps"). Not to be confused with + /// "pd" which is used for `__m512d`. + pub struct __m512(16 x f32); + + /// 512-bit wide set of eight `f64` types, x86-specific + /// + /// This type is the same as the `__m512d` type defined by Intel, + /// representing a 512-bit SIMD register which internally is consisted of + /// eight packed `f64` instances. Usage of this type typically corresponds + /// to the `avx` and up target features for x86/x86_64. + /// + /// Note that unlike `__m512i`, the integer version of the 512-bit + /// registers, this `__m512d` type has *one* interpretation. Each instance + /// of `__m512d` always corresponds to `f64x8`, or eight `f64` types packed + /// together. + /// + /// The in-memory representation of this type is the same as the one of an + /// equivalent array (i.e. the in-memory order of elements is the same, and + /// there is no padding between two consecutive elements); however, the + /// alignment is different and equal to the size of the type. Note that the + /// ABI for function calls may *not* be the same. + /// + /// Most intrinsics using `__m512d` are prefixed with `_mm512_` and are + /// suffixed with "pd" (or otherwise contain "pd"). Not to be confused with + /// "ps" which is used for `__m512`. + pub struct __m512d(8 x f64); +} + +types! 
{ + #![stable(feature = "stdarch_x86_avx512", since = "1.89")] + + /// 128-bit wide set of eight `u16` types, x86-specific + /// + /// This type is representing a 128-bit SIMD register which internally is consisted of + /// eight packed `u16` instances. Its purpose is for bf16 related intrinsic + /// implementations. + /// + /// The in-memory representation of this type is the same as the one of an + /// equivalent array (i.e. the in-memory order of elements is the same, and + /// there is no padding); however, the alignment is different and equal to + /// the size of the type. Note that the ABI for function calls may *not* be + /// the same. + pub struct __m128bh(8 x u16); + + /// 256-bit wide set of 16 `u16` types, x86-specific + /// + /// This type is the same as the `__m256bh` type defined by Intel, + /// representing a 256-bit SIMD register which internally is consisted of + /// 16 packed `u16` instances. Its purpose is for bf16 related intrinsic + /// implementations. + /// + /// The in-memory representation of this type is the same as the one of an + /// equivalent array (i.e. the in-memory order of elements is the same, and + /// there is no padding); however, the alignment is different and equal to + /// the size of the type. Note that the ABI for function calls may *not* be + /// the same. + pub struct __m256bh(16 x u16); + + /// 512-bit wide set of 32 `u16` types, x86-specific + /// + /// This type is the same as the `__m512bh` type defined by Intel, + /// representing a 512-bit SIMD register which internally is consisted of + /// 32 packed `u16` instances. Its purpose is for bf16 related intrinsic + /// implementations. + /// + /// The in-memory representation of this type is the same as the one of an + /// equivalent array (i.e. the in-memory order of elements is the same, and + /// there is no padding); however, the alignment is different and equal to + /// the size of the type. Note that the ABI for function calls may *not* be + /// the same. + pub struct __m512bh(32 x u16); +} + +types! { + #![unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] + + /// 128-bit wide set of 8 `f16` types, x86-specific + /// + /// This type is the same as the `__m128h` type defined by Intel, + /// representing a 128-bit SIMD register which internally is consisted of + /// 8 packed `f16` instances. its purpose is for f16 related intrinsic + /// implementations. + /// + /// The in-memory representation of this type is the same as the one of an + /// equivalent array (i.e. the in-memory order of elements is the same, and + /// there is no padding); however, the alignment is different and equal to + /// the size of the type. Note that the ABI for function calls may *not* be + /// the same. + pub struct __m128h(8 x f16); + + /// 256-bit wide set of 16 `f16` types, x86-specific + /// + /// This type is the same as the `__m256h` type defined by Intel, + /// representing a 256-bit SIMD register which internally is consisted of + /// 16 packed `f16` instances. its purpose is for f16 related intrinsic + /// implementations. + /// + /// The in-memory representation of this type is the same as the one of an + /// equivalent array (i.e. the in-memory order of elements is the same, and + /// there is no padding); however, the alignment is different and equal to + /// the size of the type. Note that the ABI for function calls may *not* be + /// the same. 
+ pub struct __m256h(16 x f16); + + /// 512-bit wide set of 32 `f16` types, x86-specific + /// + /// This type is the same as the `__m512h` type defined by Intel, + /// representing a 512-bit SIMD register which internally is consisted of + /// 32 packed `f16` instances. its purpose is for f16 related intrinsic + /// implementations. + /// + /// The in-memory representation of this type is the same as the one of an + /// equivalent array (i.e. the in-memory order of elements is the same, and + /// there is no padding); however, the alignment is different and equal to + /// the size of the type. Note that the ABI for function calls may *not* be + /// the same. + pub struct __m512h(32 x f16); +} + +/// The BFloat16 type used in AVX-512 intrinsics. +#[repr(transparent)] +#[derive(Copy, Clone, Debug)] +#[allow(non_camel_case_types)] +#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] +pub struct bf16(u16); + +impl bf16 { + /// Raw transmutation from `u16` + #[inline] + #[must_use] + #[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] + pub const fn from_bits(bits: u16) -> bf16 { + bf16(bits) + } + + /// Raw transmutation to `u16` + #[inline] + #[must_use = "this returns the result of the operation, without modifying the original"] + #[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] + pub const fn to_bits(self) -> u16 { + self.0 + } +} + +/// The `__mmask64` type used in AVX-512 intrinsics, a 64-bit integer +#[allow(non_camel_case_types)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub type __mmask64 = u64; + +/// The `__mmask32` type used in AVX-512 intrinsics, a 32-bit integer +#[allow(non_camel_case_types)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub type __mmask32 = u32; + +/// The `__mmask16` type used in AVX-512 intrinsics, a 16-bit integer +#[allow(non_camel_case_types)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub type __mmask16 = u16; + +/// The `__mmask8` type used in AVX-512 intrinsics, a 8-bit integer +#[allow(non_camel_case_types)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub type __mmask8 = u8; + +/// The `_MM_CMPINT_ENUM` type used to specify comparison operations in AVX-512 intrinsics. +#[allow(non_camel_case_types)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub type _MM_CMPINT_ENUM = i32; + +/// The `MM_MANTISSA_NORM_ENUM` type used to specify mantissa normalized operations in AVX-512 intrinsics. +#[allow(non_camel_case_types)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub type _MM_MANTISSA_NORM_ENUM = i32; + +/// The `MM_MANTISSA_SIGN_ENUM` type used to specify mantissa signed operations in AVX-512 intrinsics. +#[allow(non_camel_case_types)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub type _MM_MANTISSA_SIGN_ENUM = i32; + +/// The `MM_PERM_ENUM` type used to specify shuffle operations in AVX-512 intrinsics. +#[allow(non_camel_case_types)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub type _MM_PERM_ENUM = i32; + +#[cfg(test)] +mod test; +#[cfg(test)] +pub use self::test::*; + +macro_rules! as_transmute { + ($from:ty => $as_from:ident, $($as_to:ident -> $to:ident),* $(,)?) 
=> { + impl $from {$( + #[inline] + pub(crate) fn $as_to(self) -> crate::core_arch::simd::$to { + unsafe { transmute(self) } + } + )*} + $( + impl crate::core_arch::simd::$to { + #[inline] + pub(crate) fn $as_from(self) -> $from { + unsafe { transmute(self) } + } + } + )* + }; +} + +as_transmute!(__m128i => + as_m128i, + as_u8x16 -> u8x16, + as_u16x8 -> u16x8, + as_u32x4 -> u32x4, + as_u64x2 -> u64x2, + as_i8x16 -> i8x16, + as_i16x8 -> i16x8, + as_i32x4 -> i32x4, + as_i64x2 -> i64x2, +); +as_transmute!(__m256i => + as_m256i, + as_u8x32 -> u8x32, + as_u16x16 -> u16x16, + as_u32x8 -> u32x8, + as_u64x4 -> u64x4, + as_i8x32 -> i8x32, + as_i16x16 -> i16x16, + as_i32x8 -> i32x8, + as_i64x4 -> i64x4, +); +as_transmute!(__m512i => + as_m512i, + as_u8x64 -> u8x64, + as_u16x32 -> u16x32, + as_u32x16 -> u32x16, + as_u64x8 -> u64x8, + as_i8x64 -> i8x64, + as_i16x32 -> i16x32, + as_i32x16 -> i32x16, + as_i64x8 -> i64x8, +); + +as_transmute!(__m128 => as_m128, as_f32x4 -> f32x4); +as_transmute!(__m128d => as_m128d, as_f64x2 -> f64x2); +as_transmute!(__m256 => as_m256, as_f32x8 -> f32x8); +as_transmute!(__m256d => as_m256d, as_f64x4 -> f64x4); +as_transmute!(__m512 => as_m512, as_f32x16 -> f32x16); +as_transmute!(__m512d => as_m512d, as_f64x8 -> f64x8); + +as_transmute!(__m128bh => + as_m128bh, + as_u16x8 -> u16x8, + as_u32x4 -> u32x4, + as_i16x8 -> i16x8, + as_i32x4 -> i32x4, +); +as_transmute!(__m256bh => + as_m256bh, + as_u16x16 -> u16x16, + as_u32x8 -> u32x8, + as_i16x16 -> i16x16, + as_i32x8 -> i32x8, +); +as_transmute!(__m512bh => + as_m512bh, + as_u16x32 -> u16x32, + as_u32x16 -> u32x16, + as_i16x32 -> i16x32, + as_i32x16 -> i32x16, +); + +as_transmute!(__m128h => as_m128h, as_f16x8 -> f16x8); +as_transmute!(__m256h => as_m256h, as_f16x16 -> f16x16); +as_transmute!(__m512h => as_m512h, as_f16x32 -> f16x32); + +mod eflags; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::eflags::*; + +mod fxsr; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::fxsr::*; + +mod bswap; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::bswap::*; + +mod rdtsc; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::rdtsc::*; + +mod cpuid; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::cpuid::*; +mod xsave; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::xsave::*; + +mod sse; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::sse::*; +mod sse2; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::sse2::*; +mod sse3; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::sse3::*; +mod ssse3; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::ssse3::*; +mod sse41; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::sse41::*; +mod sse42; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::sse42::*; +mod avx; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::avx::*; +mod avx2; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::avx2::*; +mod fma; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::fma::*; + +mod abm; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::abm::*; +mod bmi1; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::bmi1::*; + +mod bmi2; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::bmi2::*; + +mod sse4a; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::sse4a::*; + +mod tbm; +#[stable(feature = "simd_x86", since = 
"1.27.0")] +pub use self::tbm::*; + +mod pclmulqdq; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::pclmulqdq::*; + +mod aes; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::aes::*; + +mod rdrand; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::rdrand::*; + +mod sha; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::sha::*; + +mod adx; +#[stable(feature = "simd_x86_adx", since = "1.33.0")] +pub use self::adx::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +mod avx512f; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::avx512f::*; + +mod avx512bw; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::avx512bw::*; + +mod avx512cd; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::avx512cd::*; + +mod avx512dq; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::avx512dq::*; + +mod avx512ifma; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::avx512ifma::*; + +mod avx512vbmi; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::avx512vbmi::*; + +mod avx512vbmi2; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::avx512vbmi2::*; + +mod avx512vnni; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::avx512vnni::*; + +mod avx512bitalg; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::avx512bitalg::*; + +mod gfni; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::gfni::*; + +mod avx512vpopcntdq; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::avx512vpopcntdq::*; + +mod vaes; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::vaes::*; + +mod vpclmulqdq; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::vpclmulqdq::*; + +mod bt; +#[stable(feature = "simd_x86_bittest", since = "1.55.0")] +pub use self::bt::*; + +mod rtm; +#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] +pub use self::rtm::*; + +mod f16c; +#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] +pub use self::f16c::*; + +mod avx512bf16; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::avx512bf16::*; + +mod avxneconvert; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::avxneconvert::*; + +mod avx512fp16; +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub use self::avx512fp16::*; + +mod kl; +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +pub use self::kl::*; diff --git a/library/stdarch/crates/core_arch/src/x86/pclmulqdq.rs b/library/stdarch/crates/core_arch/src/x86/pclmulqdq.rs new file mode 100644 index 0000000000000..cce6a51e2cd63 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/pclmulqdq.rs @@ -0,0 +1,66 @@ +//! Carry-less Multiplication (CLMUL) +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241). +//! +//! 
[intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf + +use crate::core_arch::x86::__m128i; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.pclmulqdq"] + fn pclmulqdq(a: __m128i, round_key: __m128i, imm8: u8) -> __m128i; +} + +/// Performs a carry-less multiplication of two 64-bit polynomials over the +/// finite field GF(2). +/// +/// The immediate byte is used for determining which halves of `a` and `b` +/// should be used. Immediate bits other than 0 and 4 are ignored. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128) +#[inline] +#[target_feature(enable = "pclmulqdq")] +#[cfg_attr(test, assert_instr(pclmul, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_clmulepi64_si128(a: __m128i, b: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pclmulqdq(a, b, IMM8 as u8) } +} + +#[cfg(test)] +mod tests { + // The constants in the tests below are just bit patterns. They should not + // be interpreted as integers; signedness does not make sense for them, but + // __m128i happens to be defined in terms of signed integers. + #![allow(overflowing_literals)] + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "pclmulqdq")] + unsafe fn test_mm_clmulepi64_si128() { + // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf + let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d); + let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d); + let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451); + let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315); + let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9); + let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed); + + assert_eq_m128i(_mm_clmulepi64_si128::<0x00>(a, b), r00); + assert_eq_m128i(_mm_clmulepi64_si128::<0x10>(a, b), r01); + assert_eq_m128i(_mm_clmulepi64_si128::<0x01>(a, b), r10); + assert_eq_m128i(_mm_clmulepi64_si128::<0x11>(a, b), r11); + + let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000); + let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000); + assert_eq_m128i(_mm_clmulepi64_si128::<0x00>(a0, a0), r); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/rdrand.rs b/library/stdarch/crates/core_arch/src/x86/rdrand.rs new file mode 100644 index 0000000000000..50097915213b9 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/rdrand.rs @@ -0,0 +1,75 @@ +//! RDRAND and RDSEED instructions for returning random numbers from an Intel +//! on-chip hardware random number generator which has been seeded by an +//! on-chip entropy source. +#![allow(clippy::module_name_repetitions)] + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.x86.rdrand.16"] + fn x86_rdrand16_step() -> (u16, i32); + #[link_name = "llvm.x86.rdrand.32"] + fn x86_rdrand32_step() -> (u32, i32); + #[link_name = "llvm.x86.rdseed.16"] + fn x86_rdseed16_step() -> (u16, i32); + #[link_name = "llvm.x86.rdseed.32"] + fn x86_rdseed32_step() -> (u32, i32); +} + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Read a hardware generated 16-bit random value and store the result in val. 
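For `_mm_clmulepi64_si128` above, the immediate's two significant bits pick which 64-bit half of each operand enters the carry-less product (bit 0 selects from `a`, bit 4 from `b`); a small editorial sketch of all four selections, with an illustrative helper name:

```rust
use std::arch::x86_64::*; // x86_64 assumed for this sketch

#[target_feature(enable = "pclmulqdq")]
unsafe fn clmul_all_quadrants(a: __m128i, b: __m128i) -> [__m128i; 4] {
    [
        _mm_clmulepi64_si128::<0x00>(a, b), // low  half of a, low  half of b
        _mm_clmulepi64_si128::<0x01>(a, b), // high half of a, low  half of b
        _mm_clmulepi64_si128::<0x10>(a, b), // low  half of a, high half of b
        _mm_clmulepi64_si128::<0x11>(a, b), // high half of a, high half of b
    ]
}
```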
+/// Returns 1 if a random value was generated, and 0 otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdrand16_step) +#[inline] +#[target_feature(enable = "rdrand")] +#[cfg_attr(test, assert_instr(rdrand))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 { + let (v, flag) = x86_rdrand16_step(); + *val = v; + flag +} + +/// Read a hardware generated 32-bit random value and store the result in val. +/// Returns 1 if a random value was generated, and 0 otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdrand32_step) +#[inline] +#[target_feature(enable = "rdrand")] +#[cfg_attr(test, assert_instr(rdrand))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _rdrand32_step(val: &mut u32) -> i32 { + let (v, flag) = x86_rdrand32_step(); + *val = v; + flag +} + +/// Read a 16-bit NIST SP800-90B and SP800-90C compliant random value and store +/// in val. Return 1 if a random value was generated, and 0 otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdseed16_step) +#[inline] +#[target_feature(enable = "rdseed")] +#[cfg_attr(test, assert_instr(rdseed))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _rdseed16_step(val: &mut u16) -> i32 { + let (v, flag) = x86_rdseed16_step(); + *val = v; + flag +} + +/// Read a 32-bit NIST SP800-90B and SP800-90C compliant random value and store +/// in val. Return 1 if a random value was generated, and 0 otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdseed32_step) +#[inline] +#[target_feature(enable = "rdseed")] +#[cfg_attr(test, assert_instr(rdseed))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _rdseed32_step(val: &mut u32) -> i32 { + let (v, flag) = x86_rdseed32_step(); + *val = v; + flag +} diff --git a/library/stdarch/crates/core_arch/src/x86/rdtsc.rs b/library/stdarch/crates/core_arch/src/x86/rdtsc.rs new file mode 100644 index 0000000000000..3b348153d602d --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/rdtsc.rs @@ -0,0 +1,79 @@ +//! RDTSC instructions. + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Reads the current value of the processor’s time-stamp counter. +/// +/// The processor monotonically increments the time-stamp counter MSR +/// every clock cycle and resets it to 0 whenever the processor is +/// reset. +/// +/// The RDTSC instruction is not a serializing instruction. It does +/// not necessarily wait until all previous instructions have been +/// executed before reading the counter. Similarly, subsequent +/// instructions may begin execution before the read operation is +/// performed. +/// +/// On processors that support the Intel 64 architecture, the +/// high-order 32 bits of each of RAX and RDX are cleared. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdtsc) +#[inline] +#[cfg_attr(test, assert_instr(rdtsc))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _rdtsc() -> u64 { + rdtsc() +} + +/// Reads the current value of the processor’s time-stamp counter and +/// the `IA32_TSC_AUX MSR`. 
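For the `_rdrand*`/`_rdseed*` step functions defined just above, the integer return is a success flag, so callers conventionally retry a bounded number of times; a sketch (editorial, with an arbitrary retry count):

```rust
use std::arch::x86_64::*; // x86_64 assumed for this sketch

/// Tries up to ten times to obtain a hardware random `u32`.
#[target_feature(enable = "rdrand")]
unsafe fn next_hw_u32() -> Option<u32> {
    let mut value = 0u32;
    for _ in 0..10 {
        // Returns 1 when `value` was filled with a fresh random number.
        if _rdrand32_step(&mut value) == 1 {
            return Some(value);
        }
    }
    None // generator temporarily exhausted
}
```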
+/// +/// The processor monotonically increments the time-stamp counter MSR +/// every clock cycle and resets it to 0 whenever the processor is +/// reset. +/// +/// The RDTSCP instruction waits until all previous instructions have +/// been executed before reading the counter. However, subsequent +/// instructions may begin execution before the read operation is +/// performed. +/// +/// On processors that support the Intel 64 architecture, the +/// high-order 32 bits of each of RAX, RDX, and RCX are cleared. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__rdtscp) +#[inline] +#[cfg_attr(test, assert_instr(rdtscp))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn __rdtscp(aux: *mut u32) -> u64 { + let (tsc, auxval) = rdtscp(); + *aux = auxval; + tsc +} + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.x86.rdtsc"] + fn rdtsc() -> u64; + #[link_name = "llvm.x86.rdtscp"] + fn rdtscp() -> (u64, u32); +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use stdarch_test::simd_test; + + #[simd_test(enable = "sse2")] + unsafe fn test_rdtsc() { + let r = _rdtsc(); + assert_ne!(r, 0); // The chances of this being 0 are infinitesimal + } + + #[simd_test(enable = "sse2")] + unsafe fn test_rdtscp() { + let mut aux = 0; + let r = __rdtscp(&mut aux); + assert_ne!(r, 0); // The chances of this being 0 are infinitesimal + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/rtm.rs b/library/stdarch/crates/core_arch/src/x86/rtm.rs new file mode 100644 index 0000000000000..b807305d6aa8f --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/rtm.rs @@ -0,0 +1,174 @@ +//! Intel's Restricted Transactional Memory (RTM). +//! +//! This CPU feature is available on Intel Broadwell or later CPUs (and some Haswell). +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [Wikipedia][wikipedia_rtm] provides a quick overview of the assembly instructions, and +//! Intel's [programming considerations][intel_consid] details what sorts of instructions within a +//! transaction are likely to cause an abort. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [wikipedia_rtm]: https://en.wikipedia.org/wiki/Transactional_Synchronization_Extensions#Restricted_Transactional_Memory +//! [intel_consid]: https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-intel-transactional-synchronization-extensions-intel-tsx-programming-considerations + +#[cfg(test)] +use stdarch_test::assert_instr; + +unsafe extern "C" { + #[link_name = "llvm.x86.xbegin"] + fn x86_xbegin() -> i32; + #[link_name = "llvm.x86.xend"] + fn x86_xend(); + #[link_name = "llvm.x86.xabort"] + fn x86_xabort(imm8: i8); + #[link_name = "llvm.x86.xtest"] + fn x86_xtest() -> i32; +} + +/// Transaction successfully started. +#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] +pub const _XBEGIN_STARTED: u32 = !0; + +/// Transaction explicitly aborted with xabort. The parameter passed to xabort is available with +/// `_xabort_code(status)`. +#[allow(clippy::identity_op)] +#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] +pub const _XABORT_EXPLICIT: u32 = 1 << 0; + +/// Transaction retry is possible. 
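A sketch (editorial) of the usual cycle-measurement pattern with the `_rdtsc`/`__rdtscp` intrinsics above; as their docs note, `RDTSC` is not serializing, so careful measurements normally add explicit serialization around the region, which is omitted here for brevity.

```rust
use std::arch::x86_64::*; // x86_64 assumed for this sketch

/// Rough cycle count spent in `f` (illustrative; no extra serialization).
unsafe fn cycles_of(f: impl FnOnce()) -> u64 {
    let start = _rdtsc();
    f();
    let mut aux = 0u32;
    // RDTSCP waits for preceding instructions to execute before reading.
    let end = __rdtscp(&mut aux);
    end.wrapping_sub(start)
}
```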
+#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] +pub const _XABORT_RETRY: u32 = 1 << 1; + +/// Transaction abort due to a memory conflict with another thread. +#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] +pub const _XABORT_CONFLICT: u32 = 1 << 2; + +/// Transaction abort due to the transaction using too much memory. +#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] +pub const _XABORT_CAPACITY: u32 = 1 << 3; + +/// Transaction abort due to a debug trap. +#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] +pub const _XABORT_DEBUG: u32 = 1 << 4; + +/// Transaction abort in a inner nested transaction. +#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] +pub const _XABORT_NESTED: u32 = 1 << 5; + +/// Specifies the start of a restricted transactional memory (RTM) code region and returns a value +/// indicating status. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xbegin) +#[inline] +#[target_feature(enable = "rtm")] +#[cfg_attr(test, assert_instr(xbegin))] +#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] +pub unsafe fn _xbegin() -> u32 { + x86_xbegin() as _ +} + +/// Specifies the end of a restricted transactional memory (RTM) code region. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xend) +#[inline] +#[target_feature(enable = "rtm")] +#[cfg_attr(test, assert_instr(xend))] +#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] +pub unsafe fn _xend() { + x86_xend() +} + +/// Forces a restricted transactional memory (RTM) region to abort. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xabort) +#[inline] +#[target_feature(enable = "rtm")] +#[cfg_attr(test, assert_instr(xabort, IMM8 = 0x0))] +#[rustc_legacy_const_generics(0)] +#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] +pub unsafe fn _xabort() { + static_assert_uimm_bits!(IMM8, 8); + x86_xabort(IMM8 as i8) +} + +/// Queries whether the processor is executing in a transactional region identified by restricted +/// transactional memory (RTM) or hardware lock elision (HLE). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xtest) +#[inline] +#[target_feature(enable = "rtm")] +#[cfg_attr(test, assert_instr(xtest))] +#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] +pub unsafe fn _xtest() -> u8 { + x86_xtest() as _ +} + +/// Retrieves the parameter passed to [`_xabort`] when [`_xbegin`]'s status has the +/// `_XABORT_EXPLICIT` flag set. 
+#[inline]
+#[unstable(feature = "stdarch_x86_rtm", issue = "111138")]
+pub const fn _xabort_code(status: u32) -> u32 {
+    (status >> 24) & 0xFF
+}
+
+#[cfg(test)]
+mod tests {
+    use stdarch_test::simd_test;
+
+    use crate::core_arch::x86::*;
+
+    #[simd_test(enable = "rtm")]
+    unsafe fn test_xbegin() {
+        let mut x = 0;
+        for _ in 0..10 {
+            let code = _xbegin();
+            if code == _XBEGIN_STARTED {
+                x += 1;
+                _xend();
+                assert_eq!(x, 1);
+                break;
+            }
+            assert_eq!(x, 0);
+        }
+    }
+
+    #[simd_test(enable = "rtm")]
+    unsafe fn test_xabort() {
+        const ABORT_CODE: u32 = 42;
+        // aborting outside a transactional region does nothing
+        _xabort::<ABORT_CODE>();
+
+        for _ in 0..10 {
+            let mut x = 0;
+            let code = rtm::_xbegin();
+            if code == _XBEGIN_STARTED {
+                x += 1;
+                rtm::_xabort::<ABORT_CODE>();
+            } else if code & _XABORT_EXPLICIT != 0 {
+                let test_abort_code = rtm::_xabort_code(code);
+                assert_eq!(test_abort_code, ABORT_CODE);
+            }
+            assert_eq!(x, 0);
+        }
+    }
+
+    #[simd_test(enable = "rtm")]
+    unsafe fn test_xtest() {
+        assert_eq!(_xtest(), 0);
+
+        for _ in 0..10 {
+            let code = rtm::_xbegin();
+            if code == _XBEGIN_STARTED {
+                let in_tx = _xtest();
+                rtm::_xend();
+
+                // putting the assert inside the transaction would abort the transaction on fail
+                // without any output/panic/etc
+                assert_eq!(in_tx, 1);
+                break;
+            }
+        }
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/sha.rs b/library/stdarch/crates/core_arch/src/x86/sha.rs
new file mode 100644
index 0000000000000..da568c449a6be
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/sha.rs
@@ -0,0 +1,732 @@
+use crate::core_arch::{simd::*, x86::*};
+
+#[allow(improper_ctypes)]
+unsafe extern "C" {
+    #[link_name = "llvm.x86.sha1msg1"]
+    fn sha1msg1(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sha1msg2"]
+    fn sha1msg2(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sha1nexte"]
+    fn sha1nexte(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sha1rnds4"]
+    fn sha1rnds4(a: i32x4, b: i32x4, c: i8) -> i32x4;
+    #[link_name = "llvm.x86.sha256msg1"]
+    fn sha256msg1(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sha256msg2"]
+    fn sha256msg2(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sha256rnds2"]
+    fn sha256rnds2(a: i32x4, b: i32x4, k: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsha512msg1"]
+    fn vsha512msg1(a: i64x4, b: i64x2) -> i64x4;
+    #[link_name = "llvm.x86.vsha512msg2"]
+    fn vsha512msg2(a: i64x4, b: i64x4) -> i64x4;
+    #[link_name = "llvm.x86.vsha512rnds2"]
+    fn vsha512rnds2(a: i64x4, b: i64x4, k: i64x2) -> i64x4;
+    #[link_name = "llvm.x86.vsm3msg1"]
+    fn vsm3msg1(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm3msg2"]
+    fn vsm3msg2(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm3rnds2"]
+    fn vsm3rnds2(a: i32x4, b: i32x4, c: i32x4, d: i32) -> i32x4;
+    #[link_name = "llvm.x86.vsm4key4128"]
+    fn vsm4key4128(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm4key4256"]
+    fn vsm4key4256(a: i32x8, b: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.vsm4rnds4128"]
+    fn vsm4rnds4128(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm4rnds4256"]
+    fn vsm4rnds4256(a: i32x8, b: i32x8) -> i32x8;
+}
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Performs an intermediate calculation for the next four SHA1 message values
+/// (unsigned 32-bit integers) using previous message values from `a` and `b`,
+/// and returns the result.
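+///
+/// In the SHA-1 message schedule this intrinsic is normally paired with
+/// `_mm_sha1msg2_epu32`; a sketch of that idiom (the `next_four_w` helper is
+/// hypothetical, and the `sha` target feature is assumed to be available):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// // w0 = W[0..4], w1 = W[4..8], w2 = W[8..12], w3 = W[12..16]
+/// #[target_feature(enable = "sha")]
+/// fn next_four_w(w0: __m128i, w1: __m128i, w2: __m128i, w3: __m128i) -> __m128i {
+///     let t = _mm_sha1msg1_epu32(w0, w1); // intermediate value from W[0..8]
+///     let t = _mm_xor_si128(t, w2);       // mix in W[8..12]
+///     _mm_sha1msg2_epu32(t, w3)           // finish with W[12..16], yielding W[16..20]
+/// }
+/// ```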
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1msg1_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1msg1))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_sha1msg1_epu32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(sha1msg1(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Performs the final calculation for the next four SHA1 message values
+/// (unsigned 32-bit integers) using the intermediate result in `a` and the
+/// previous message values in `b`, and returns the result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1msg2_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1msg2))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_sha1msg2_epu32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(sha1msg2(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Calculates SHA1 state variable E after four rounds of operation from the
+/// current SHA1 state variable `a`, adds that value to the scheduled values
+/// (unsigned 32-bit integers) in `b`, and returns the result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1nexte_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1nexte))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_sha1nexte_epu32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(sha1nexte(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Performs four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D)
+/// from `a` and some pre-computed sum of the next 4 round message values
+/// (unsigned 32-bit integers), and state variable E from `b`, and returns the
+/// updated SHA1 state (A,B,C,D). `FUNC` contains the logic functions and round
+/// constants.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1rnds4_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1rnds4, FUNC = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_sha1rnds4_epu32<const FUNC: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(FUNC, 2);
+    unsafe { transmute(sha1rnds4(a.as_i32x4(), b.as_i32x4(), FUNC as i8)) }
+}
+
+/// Performs an intermediate calculation for the next four SHA256 message values
+/// (unsigned 32-bit integers) using previous message values from `a` and `b`,
+/// and returns the result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256msg1_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha256msg1))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_sha256msg1_epu32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(sha256msg1(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Performs the final calculation for the next four SHA256 message values
+/// (unsigned 32-bit integers) using previous message values from `a` and `b`,
+/// and returns the result.
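+///
+/// In the SHA-256 message schedule this intrinsic finishes the work started by
+/// `_mm_sha256msg1_epu32`; a sketch of the usual idiom (the `next_four_w` helper is
+/// hypothetical, and the `sha` and `ssse3` target features are assumed):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// // w0 = W[0..4], w1 = W[4..8], w2 = W[8..12], w3 = W[12..16]
+/// #[target_feature(enable = "sha,ssse3")]
+/// fn next_four_w(w0: __m128i, w1: __m128i, w2: __m128i, w3: __m128i) -> __m128i {
+///     let t = _mm_sha256msg1_epu32(w0, w1);                   // sigma0 contributions
+///     let t = _mm_add_epi32(t, _mm_alignr_epi8::<4>(w3, w2)); // add W[9..13]
+///     _mm_sha256msg2_epu32(t, w3)                             // sigma1 contributions, W[16..20]
+/// }
+/// ```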
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256msg2_epu32) +#[inline] +#[target_feature(enable = "sha")] +#[cfg_attr(test, assert_instr(sha256msg2))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sha256msg2_epu32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(sha256msg2(a.as_i32x4(), b.as_i32x4())) } +} + +/// Performs 2 rounds of SHA256 operation using an initial SHA256 state +/// (C,D,G,H) from `a`, an initial SHA256 state (A,B,E,F) from `b`, and a +/// pre-computed sum of the next 2 round message values (unsigned 32-bit +/// integers) and the corresponding round constants from `k`, and store the +/// updated SHA256 state (A,B,E,F) in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256rnds2_epu32) +#[inline] +#[target_feature(enable = "sha")] +#[cfg_attr(test, assert_instr(sha256rnds2))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sha256rnds2_epu32(a: __m128i, b: __m128i, k: __m128i) -> __m128i { + unsafe { transmute(sha256rnds2(a.as_i32x4(), b.as_i32x4(), k.as_i32x4())) } +} + +/// This intrinsic is one of the two SHA512 message scheduling instructions. +/// The intrinsic performs an intermediate calculation for the next four SHA512 +/// message qwords. The calculated results are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg1_epi64) +#[inline] +#[target_feature(enable = "sha512,avx")] +#[cfg_attr(test, assert_instr(vsha512msg1))] +#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] +pub fn _mm256_sha512msg1_epi64(a: __m256i, b: __m128i) -> __m256i { + unsafe { transmute(vsha512msg1(a.as_i64x4(), b.as_i64x2())) } +} + +/// This intrinsic is one of the two SHA512 message scheduling instructions. +/// The intrinsic performs the final calculation for the next four SHA512 message +/// qwords. The calculated results are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg2_epi64) +#[inline] +#[target_feature(enable = "sha512,avx")] +#[cfg_attr(test, assert_instr(vsha512msg2))] +#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] +pub fn _mm256_sha512msg2_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vsha512msg2(a.as_i64x4(), b.as_i64x4())) } +} + +/// This intrinsic performs two rounds of SHA512 operation using initial SHA512 state +/// `(C,D,G,H)` from `a`, an initial SHA512 state `(A,B,E,F)` from `b`, and a +/// pre-computed sum of the next two round message qwords and the corresponding +/// round constants from `c` (only the two lower qwords of the third operand). The +/// updated SHA512 state `(A,B,E,F)` is written to dst, and dst can be used as the +/// updated state `(C,D,G,H)` in later rounds. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512rnds2_epi64) +#[inline] +#[target_feature(enable = "sha512,avx")] +#[cfg_attr(test, assert_instr(vsha512rnds2))] +#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] +pub fn _mm256_sha512rnds2_epi64(a: __m256i, b: __m256i, k: __m128i) -> __m256i { + unsafe { transmute(vsha512rnds2(a.as_i64x4(), b.as_i64x4(), k.as_i64x2())) } +} + +/// This is one of the two SM3 message scheduling intrinsics. 
The intrinsic performs
+/// an initial calculation for the next four SM3 message words. The calculated results
+/// are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3msg1_epi32)
+#[inline]
+#[target_feature(enable = "sm3,avx")]
+#[cfg_attr(test, assert_instr(vsm3msg1))]
+#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")]
+pub fn _mm_sm3msg1_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { transmute(vsm3msg1(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) }
+}
+
+/// This is one of the two SM3 message scheduling intrinsics. The intrinsic performs
+/// the final calculation for the next four SM3 message words. The calculated results
+/// are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3msg2_epi32)
+#[inline]
+#[target_feature(enable = "sm3,avx")]
+#[cfg_attr(test, assert_instr(vsm3msg2))]
+#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")]
+pub fn _mm_sm3msg2_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { transmute(vsm3msg2(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) }
+}
+
+/// The intrinsic performs two rounds of SM3 operation using initial SM3 state `(C, D, G, H)`
+/// from `a`, an initial SM3 state `(A, B, E, F)` from `b` and pre-computed words from
+/// `c`. `a` with the initial SM3 state `(C, D, G, H)` assumes input of non-rotated left variables
+/// from the previous state. The updated SM3 state `(A, B, E, F)` is written to `a`. The `imm8`
+/// should contain the even round number for the first of the two rounds computed by this instruction.
+/// The computation masks the `imm8` value by ANDing it with `0x3E` so that only even round numbers
+/// from 0 through 62 are used for this operation. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3rnds2_epi32)
+#[inline]
+#[target_feature(enable = "sm3,avx")]
+#[cfg_attr(test, assert_instr(vsm3rnds2, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")]
+pub fn _mm_sm3rnds2_epi32<const IMM8: i32>(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    static_assert!(
+        IMM8 == (IMM8 & 0x3e),
+        "IMM8 must be an even number in the range `0..=62`"
+    );
+    unsafe { transmute(vsm3rnds2(a.as_i32x4(), b.as_i32x4(), c.as_i32x4(), IMM8)) }
+}
+
+/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent
+/// 128-bit lanes. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm4key4_epi32)
+#[inline]
+#[target_feature(enable = "sm4,avx")]
+#[cfg_attr(test, assert_instr(vsm4key4))]
+#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")]
+pub fn _mm_sm4key4_epi32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vsm4key4128(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent
+/// 128-bit lanes. The calculated results are stored in dst.
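+///
+/// The 256-bit form is purely lane-wise: it behaves like two independent
+/// `_mm_sm4key4_epi32` calls on the low and high 128-bit halves. A sketch
+/// (hypothetical helper; assumes the `sm4` and `avx` target features):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "sm4,avx")]
+/// fn expand_two_lanes(a_lo: __m128i, a_hi: __m128i, b_lo: __m128i, b_hi: __m128i) -> __m256i {
+///     let a = _mm256_set_m128i(a_hi, a_lo);
+///     let b = _mm256_set_m128i(b_hi, b_lo);
+///     // Equivalent to running `_mm_sm4key4_epi32` on each 128-bit lane.
+///     _mm256_sm4key4_epi32(a, b)
+/// }
+/// ```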
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sm4key4_epi32) +#[inline] +#[target_feature(enable = "sm4,avx")] +#[cfg_attr(test, assert_instr(vsm4key4))] +#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] +pub fn _mm256_sm4key4_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vsm4key4256(a.as_i32x8(), b.as_i32x8())) } +} + +/// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent +/// 128-bit lanes. The calculated results are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm4rnds4_epi32) +#[inline] +#[target_feature(enable = "sm4,avx")] +#[cfg_attr(test, assert_instr(vsm4rnds4))] +#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] +pub fn _mm_sm4rnds4_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vsm4rnds4128(a.as_i32x4(), b.as_i32x4())) } +} + +/// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent +/// 128-bit lanes. The calculated results are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sm4rnds4_epi32) +#[inline] +#[target_feature(enable = "sm4,avx")] +#[cfg_attr(test, assert_instr(vsm4rnds4))] +#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] +pub fn _mm256_sm4rnds4_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vsm4rnds4256(a.as_i32x8(), b.as_i32x8())) } +} + +#[cfg(test)] +mod tests { + use crate::{ + core_arch::{simd::*, x86::*}, + hint::black_box, + }; + use stdarch_test::simd_test; + + #[simd_test(enable = "sha")] + #[allow(overflowing_literals)] + unsafe fn test_mm_sha1msg1_epu32() { + let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); + let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); + let expected = _mm_set_epi64x(0x98829f34f74ad457, 0xda2b1a44d0b5ad3c); + let r = _mm_sha1msg1_epu32(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sha")] + #[allow(overflowing_literals)] + unsafe fn test_mm_sha1msg2_epu32() { + let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); + let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); + let expected = _mm_set_epi64x(0xf714b202d863d47d, 0x90c30d946b3d3b35); + let r = _mm_sha1msg2_epu32(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sha")] + #[allow(overflowing_literals)] + unsafe fn test_mm_sha1nexte_epu32() { + let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); + let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); + let expected = _mm_set_epi64x(0x2589d5be923f82a4, 0x59f111f13956c25b); + let r = _mm_sha1nexte_epu32(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sha")] + #[allow(overflowing_literals)] + unsafe fn test_mm_sha1rnds4_epu32() { + let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); + let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); + let expected = _mm_set_epi64x(0x32b13cd8322f5268, 0xc54420862bd9246f); + let r = _mm_sha1rnds4_epu32::<0>(a, b); + assert_eq_m128i(r, expected); + + let expected = _mm_set_epi64x(0x6d4c43e56a3c25d9, 0xa7e00fb775cbd3fe); + let r = _mm_sha1rnds4_epu32::<1>(a, b); + assert_eq_m128i(r, expected); + + let expected = _mm_set_epi64x(0xb304e383c01222f4, 0x66f6b3b1f89d8001); + let r = _mm_sha1rnds4_epu32::<2>(a, b); + 
assert_eq_m128i(r, expected); + + let expected = _mm_set_epi64x(0x8189b758bfabfa79, 0xdb08f6e78cae098b); + let r = _mm_sha1rnds4_epu32::<3>(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sha")] + #[allow(overflowing_literals)] + unsafe fn test_mm_sha256msg1_epu32() { + let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); + let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); + let expected = _mm_set_epi64x(0xeb84973fd5cda67d, 0x2857b88f406b09ee); + let r = _mm_sha256msg1_epu32(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sha")] + #[allow(overflowing_literals)] + unsafe fn test_mm_sha256msg2_epu32() { + let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); + let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); + let expected = _mm_set_epi64x(0xb58777ce887fd851, 0x15d1ec8b73ac8450); + let r = _mm_sha256msg2_epu32(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sha")] + #[allow(overflowing_literals)] + unsafe fn test_mm_sha256rnds2_epu32() { + let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); + let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); + let k = _mm_set_epi64x(0, 0x12835b01d807aa98); + let expected = _mm_set_epi64x(0xd3063037effb15ea, 0x187ee3db0d6d1d19); + let r = _mm_sha256rnds2_epu32(a, b, k); + assert_eq_m128i(r, expected); + } + + static DATA_64: [u64; 10] = [ + 0x0011223344556677, + 0x8899aabbccddeeff, + 0xffeeddccbbaa9988, + 0x7766554433221100, + 0x0123456789abcdef, + 0xfedcba9876543210, + 0x02468ace13579bdf, + 0xfdb97531eca86420, + 0x048c159d26ae37bf, + 0xfb73ea62d951c840, + ]; + + #[simd_test(enable = "sha512,avx")] + unsafe fn test_mm256_sha512msg1_epi64() { + fn s0(word: u64) -> u64 { + word.rotate_right(1) ^ word.rotate_right(8) ^ (word >> 7) + } + + let A = &DATA_64[0..4]; + let B = &DATA_64[4..6]; + + let a = _mm256_loadu_si256(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + + let r = _mm256_sha512msg1_epi64(a, b); + + let e = _mm256_setr_epi64x( + A[0].wrapping_add(s0(A[1])) as _, + A[1].wrapping_add(s0(A[2])) as _, + A[2].wrapping_add(s0(A[3])) as _, + A[3].wrapping_add(s0(B[0])) as _, + ); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "sha512,avx")] + unsafe fn test_mm256_sha512msg2_epi64() { + fn s1(word: u64) -> u64 { + word.rotate_right(19) ^ word.rotate_right(61) ^ (word >> 6) + } + + let A = &DATA_64[0..4]; + let B = &DATA_64[4..8]; + + let a = _mm256_loadu_si256(A.as_ptr().cast()); + let b = _mm256_loadu_si256(B.as_ptr().cast()); + + let r = _mm256_sha512msg2_epi64(a, b); + + let e0 = A[0].wrapping_add(s1(B[2])); + let e1 = A[1].wrapping_add(s1(B[3])); + let e = _mm256_setr_epi64x( + e0 as _, + e1 as _, + A[2].wrapping_add(s1(e0)) as _, + A[3].wrapping_add(s1(e1)) as _, + ); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "sha512,avx")] + unsafe fn test_mm256_sha512rnds2_epi64() { + fn cap_sigma0(word: u64) -> u64 { + word.rotate_right(28) ^ word.rotate_right(34) ^ word.rotate_right(39) + } + + fn cap_sigma1(word: u64) -> u64 { + word.rotate_right(14) ^ word.rotate_right(18) ^ word.rotate_right(41) + } + + fn maj(a: u64, b: u64, c: u64) -> u64 { + (a & b) ^ (a & c) ^ (b & c) + } + + fn ch(e: u64, f: u64, g: u64) -> u64 { + (e & f) ^ (g & !e) + } + + let A = &DATA_64[0..4]; + let B = &DATA_64[4..8]; + let K = &DATA_64[8..10]; + + let a = _mm256_loadu_si256(A.as_ptr().cast()); + let b = _mm256_loadu_si256(B.as_ptr().cast()); + let k = _mm_loadu_si128(K.as_ptr().cast()); + + let r = 
_mm256_sha512rnds2_epi64(a, b, k); + + let mut array = [B[3], B[2], A[3], A[2], B[1], B[0], A[1], A[0]]; + for i in 0..2 { + let new_d = ch(array[4], array[5], array[6]) + .wrapping_add(cap_sigma1(array[4])) + .wrapping_add(K[i]) + .wrapping_add(array[7]); + array[7] = new_d + .wrapping_add(maj(array[0], array[1], array[2])) + .wrapping_add(cap_sigma0(array[0])); + array[3] = new_d.wrapping_add(array[3]); + array.rotate_right(1); + } + let e = _mm256_setr_epi64x(array[5] as _, array[4] as _, array[1] as _, array[0] as _); + + assert_eq_m256i(r, e); + } + + static DATA_32: [u32; 16] = [ + 0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff, 0xffeeddcc, 0xbbaa9988, 0x77665544, + 0x33221100, 0x01234567, 0x89abcdef, 0xfedcba98, 0x76543210, 0x02468ace, 0x13579bdf, + 0xfdb97531, 0xeca86420, + ]; + + #[simd_test(enable = "sm3,avx")] + unsafe fn test_mm_sm3msg1_epi32() { + fn p1(x: u32) -> u32 { + x ^ x.rotate_left(15) ^ x.rotate_left(23) + } + let A = &DATA_32[0..4]; + let B = &DATA_32[4..8]; + let C = &DATA_32[8..12]; + + let a = _mm_loadu_si128(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + let c = _mm_loadu_si128(C.as_ptr().cast()); + + let r = _mm_sm3msg1_epi32(a, b, c); + + let e = _mm_setr_epi32( + p1(A[0] ^ C[0] ^ B[0].rotate_left(15)) as _, + p1(A[1] ^ C[1] ^ B[1].rotate_left(15)) as _, + p1(A[2] ^ C[2] ^ B[2].rotate_left(15)) as _, + p1(A[3] ^ C[3]) as _, + ); + + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sm3,avx")] + unsafe fn test_mm_sm3msg2_epi32() { + let A = &DATA_32[0..4]; + let B = &DATA_32[4..8]; + let C = &DATA_32[8..12]; + + let a = _mm_loadu_si128(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + let c = _mm_loadu_si128(C.as_ptr().cast()); + + let r = _mm_sm3msg2_epi32(a, b, c); + + let e0 = B[0].rotate_left(7) ^ C[0] ^ A[0]; + let e = _mm_setr_epi32( + e0 as _, + (B[1].rotate_left(7) ^ C[1] ^ A[1]) as _, + (B[2].rotate_left(7) ^ C[2] ^ A[2]) as _, + (B[3].rotate_left(7) + ^ C[3] + ^ A[3] + ^ e0.rotate_left(6) + ^ e0.rotate_left(15) + ^ e0.rotate_left(30)) as _, + ); + + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sm3,avx")] + unsafe fn test_mm_sm3rnds2_epi32() { + fn p0(x: u32) -> u32 { + x ^ x.rotate_left(9) ^ x.rotate_left(17) + } + fn ff(x: u32, y: u32, z: u32, round: u32) -> u32 { + if round < 16 { + x ^ y ^ z + } else { + (x & y) | (x & z) | (y & z) + } + } + fn gg(x: u32, y: u32, z: u32, round: u32) -> u32 { + if round < 16 { + x ^ y ^ z + } else { + (x & y) | (!x & z) + } + } + + const ROUND: u32 = 30; + + let A = &DATA_32[0..4]; + let B = &DATA_32[4..8]; + let C = &DATA_32[8..12]; + + let a = _mm_loadu_si128(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + let c = _mm_loadu_si128(C.as_ptr().cast()); + + let r = _mm_sm3rnds2_epi32::<{ ROUND as i32 }>(a, b, c); + + let CONST: u32 = if ROUND < 16 { 0x79cc4519 } else { 0x7a879d8a }; + + let mut array = [ + B[3], + B[2], + A[3].rotate_left(9), + A[2].rotate_left(9), + B[1], + B[0], + A[1].rotate_left(19), + A[0].rotate_left(19), + ]; + + for i in 0..2 { + let s1 = array[0] + .rotate_left(12) + .wrapping_add(array[4]) + .wrapping_add(CONST.rotate_left(ROUND as u32 + i as u32)) + .rotate_left(7); + let s2 = s1 ^ array[0].rotate_left(12); + + let t1 = ff(array[0], array[1], array[2], ROUND) + .wrapping_add(array[3]) + .wrapping_add(s2) + .wrapping_add(C[i] ^ C[i + 2]); + let t2 = gg(array[4], array[5], array[6], ROUND) + .wrapping_add(array[7]) + .wrapping_add(s1) + .wrapping_add(C[i]); + + array[3] = array[2]; + array[2] = array[1].rotate_left(9); + 
array[1] = array[0]; + array[0] = t1; + array[7] = array[6]; + array[6] = array[5].rotate_left(19); + array[5] = array[4]; + array[4] = p0(t2); + } + + let e = _mm_setr_epi32(array[5] as _, array[4] as _, array[1] as _, array[0] as _); + + assert_eq_m128i(r, e); + } + + fn lower_t(x: u32) -> u32 { + static SBOX: [u8; 256] = [ + 0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, + 0x2C, 0x05, 0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, + 0x49, 0x86, 0x06, 0x99, 0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, + 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, 0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, + 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, 0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, + 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, 0x68, 0x6B, 0x81, 0xB2, + 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, 0x1E, 0x24, + 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, + 0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, + 0xC8, 0x9E, 0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, + 0xF9, 0x61, 0x15, 0xA1, 0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, + 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, 0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, + 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, 0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, + 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, 0x8D, 0x1B, 0xAF, 0x92, + 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, 0x0A, 0xC1, + 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, + 0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, + 0xC6, 0x84, 0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, + 0xD7, 0xCB, 0x39, 0x48, + ]; + + ((SBOX[(x >> 24) as usize] as u32) << 24) + | ((SBOX[((x >> 16) & 0xff) as usize] as u32) << 16) + | ((SBOX[((x >> 8) & 0xff) as usize] as u32) << 8) + | (SBOX[(x & 0xff) as usize] as u32) + } + + #[simd_test(enable = "sm4,avx")] + unsafe fn test_mm_sm4key4_epi32() { + fn l_key(x: u32) -> u32 { + x ^ x.rotate_left(13) ^ x.rotate_left(23) + } + fn f_key(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 { + x0 ^ l_key(lower_t(x1 ^ x2 ^ x3 ^ rk)) + } + + let A = &DATA_32[0..4]; + let B = &DATA_32[4..8]; + + let a = _mm_loadu_si128(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + + let r = _mm_sm4key4_epi32(a, b); + + let e0 = f_key(A[0], A[1], A[2], A[3], B[0]); + let e1 = f_key(A[1], A[2], A[3], e0, B[1]); + let e2 = f_key(A[2], A[3], e0, e1, B[2]); + let e3 = f_key(A[3], e0, e1, e2, B[3]); + let e = _mm_setr_epi32(e0 as _, e1 as _, e2 as _, e3 as _); + + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sm4,avx")] + unsafe fn test_mm256_sm4key4_epi32() { + let a_low = _mm_loadu_si128(DATA_32.as_ptr().cast()); + let a_high = _mm_loadu_si128(DATA_32[4..].as_ptr().cast()); + let b_low = _mm_loadu_si128(DATA_32[8..].as_ptr().cast()); + let b_high = _mm_loadu_si128(DATA_32[12..].as_ptr().cast()); + + let a = _mm256_set_m128i(a_high, a_low); + let b = _mm256_set_m128i(b_high, b_low); + + let r = _mm256_sm4key4_epi32(a, b); + + let e_low = _mm_sm4key4_epi32(a_low, b_low); + let e_high = _mm_sm4key4_epi32(a_high, b_high); + let e = _mm256_set_m128i(e_high, e_low); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "sm4,avx")] + unsafe fn test_mm_sm4rnds4_epi32() { + fn l_rnd(x: u32) -> u32 { + x 
^ x.rotate_left(2) ^ x.rotate_left(10) ^ x.rotate_left(18) ^ x.rotate_left(24) + } + fn f_rnd(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 { + x0 ^ l_rnd(lower_t(x1 ^ x2 ^ x3 ^ rk)) + } + + let A = &DATA_32[0..4]; + let B = &DATA_32[4..8]; + + let a = _mm_loadu_si128(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + + let r = _mm_sm4rnds4_epi32(a, b); + + let e0 = f_rnd(A[0], A[1], A[2], A[3], B[0]); + let e1 = f_rnd(A[1], A[2], A[3], e0, B[1]); + let e2 = f_rnd(A[2], A[3], e0, e1, B[2]); + let e3 = f_rnd(A[3], e0, e1, e2, B[3]); + let e = _mm_setr_epi32(e0 as _, e1 as _, e2 as _, e3 as _); + + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sm4,avx")] + unsafe fn test_mm256_sm4rnds4_epi32() { + let a_low = _mm_loadu_si128(DATA_32.as_ptr().cast()); + let a_high = _mm_loadu_si128(DATA_32[4..].as_ptr().cast()); + let b_low = _mm_loadu_si128(DATA_32[8..].as_ptr().cast()); + let b_high = _mm_loadu_si128(DATA_32[12..].as_ptr().cast()); + + let a = _mm256_set_m128i(a_high, a_low); + let b = _mm256_set_m128i(b_high, b_low); + + let r = _mm256_sm4rnds4_epi32(a, b); + + let e_low = _mm_sm4rnds4_epi32(a_low, b_low); + let e_high = _mm_sm4rnds4_epi32(a_high, b_high); + let e = _mm256_set_m128i(e_high, e_low); + + assert_eq_m256i(r, e); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/sse.rs b/library/stdarch/crates/core_arch/src/x86/sse.rs new file mode 100644 index 0000000000000..1eca66adc2c6a --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/sse.rs @@ -0,0 +1,3338 @@ +//! Streaming SIMD Extensions (SSE) + +use crate::{ + core_arch::{simd::*, x86::*}, + intrinsics::simd::*, + intrinsics::sqrtf32, + mem, ptr, +}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Adds the first component of `a` and `b`, the other components are copied +/// from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(addss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_add_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) + _mm_cvtss_f32(b)) } +} + +/// Adds packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(addps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_add_ps(a: __m128, b: __m128) -> __m128 { + unsafe { simd_add(a, b) } +} + +/// Subtracts the first component of `b` from `a`, the other components are +/// copied from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(subss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) - _mm_cvtss_f32(b)) } +} + +/// Subtracts packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. 
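+///
+/// A minimal usage sketch (assumes a target where the `sse` feature is enabled,
+/// e.g. `x86_64`):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let a = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
+/// let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+/// // Element-wise subtraction: r holds [9.0, 18.0, 27.0, 36.0].
+/// let r = _mm_sub_ps(a, b);
+/// ```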
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(subps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { simd_sub(a, b) }
+}
+
+/// Multiplies the first component of `a` and `b`, the other components are
+/// copied from `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(mulss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
+    unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) * _mm_cvtss_f32(b)) }
+}
+
+/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(mulps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { simd_mul(a, b) }
+}
+
+/// Divides the first component of `a` by the first component of `b`, the other
+/// components are copied from `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(divss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
+    unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) / _mm_cvtss_f32(b)) }
+}
+
+/// Divides packed single-precision (32-bit) floating-point elements in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(divps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { simd_div(a, b) }
+}
+
+/// Returns the square root of the first single-precision (32-bit)
+/// floating-point element in `a`, the other elements are unchanged.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(sqrtss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_sqrt_ss(a: __m128) -> __m128 {
+    unsafe { simd_insert!(a, 0, sqrtf32(_mm_cvtss_f32(a))) }
+}
+
+/// Returns the square root of packed single-precision (32-bit) floating-point
+/// elements in `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(sqrtps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_sqrt_ps(a: __m128) -> __m128 {
+    unsafe { simd_fsqrt(a) }
+}
+
+/// Returns the approximate reciprocal of the first single-precision
+/// (32-bit) floating-point element in `a`, the other elements are unchanged.
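+///
+/// The result is an approximation, not an exact division. A sketch (assumes a
+/// target where the `sse` feature is enabled):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let a = _mm_setr_ps(4.0, 10.0, 20.0, 30.0);
+/// // Lane 0 is approximately 0.25 (the relative error of RCPSS is at most
+/// // about 1.5 * 2^-12); lanes 1..3 are copied from `a` unchanged.
+/// let r = _mm_rcp_ss(a);
+/// ```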
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(rcpss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_rcp_ss(a: __m128) -> __m128 { + unsafe { rcpss(a) } +} + +/// Returns the approximate reciprocal of packed single-precision (32-bit) +/// floating-point elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(rcpps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_rcp_ps(a: __m128) -> __m128 { + unsafe { rcpps(a) } +} + +/// Returns the approximate reciprocal square root of the first single-precision +/// (32-bit) floating-point element in `a`, the other elements are unchanged. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(rsqrtss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_rsqrt_ss(a: __m128) -> __m128 { + unsafe { rsqrtss(a) } +} + +/// Returns the approximate reciprocal square root of packed single-precision +/// (32-bit) floating-point elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(rsqrtps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_rsqrt_ps(a: __m128) -> __m128 { + unsafe { rsqrtps(a) } +} + +/// Compares the first single-precision (32-bit) floating-point element of `a` +/// and `b`, and return the minimum value in the first element of the return +/// value, the other elements are copied from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(minss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_min_ss(a: __m128, b: __m128) -> __m128 { + unsafe { minss(a, b) } +} + +/// Compares packed single-precision (32-bit) floating-point elements in `a` and +/// `b`, and return the corresponding minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(minps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_min_ps(a: __m128, b: __m128) -> __m128 { + // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`. + unsafe { minps(a, b) } +} + +/// Compares the first single-precision (32-bit) floating-point element of `a` +/// and `b`, and return the maximum value in the first element of the return +/// value, the other elements are copied from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(maxss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_max_ss(a: __m128, b: __m128) -> __m128 { + unsafe { maxss(a, b) } +} + +/// Compares packed single-precision (32-bit) floating-point elements in `a` and +/// `b`, and return the corresponding maximum values. 
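+///
+/// Like the underlying `MAXPS` instruction (and unlike a plain IEEE maximum),
+/// each lane returns the second operand when either input is NaN or when both
+/// inputs are zero. A sketch (assumes a target where `sse` is enabled):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let a = _mm_setr_ps(1.0, f32::NAN, -0.0, 3.0);
+/// let b = _mm_setr_ps(2.0, 5.0, 0.0, f32::NAN);
+/// // r = [2.0, 5.0, 0.0, NaN]: lane 1 takes `b` because `a` is NaN,
+/// // lane 2 takes `b` because both inputs are zero, lane 3 is NaN from `b`.
+/// let r = _mm_max_ps(a, b);
+/// ```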
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(maxps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_max_ps(a: __m128, b: __m128) -> __m128 { + // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`. + unsafe { maxps(a, b) } +} + +/// Bitwise AND of packed single-precision (32-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps) +#[inline] +#[target_feature(enable = "sse")] +// i586 only seems to generate plain `and` instructions, so ignore it. +#[cfg_attr( + all(test, any(target_arch = "x86_64", target_feature = "sse2")), + assert_instr(andps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_and_ps(a: __m128, b: __m128) -> __m128 { + unsafe { + let a: __m128i = mem::transmute(a); + let b: __m128i = mem::transmute(b); + mem::transmute(simd_and(a, b)) + } +} + +/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point +/// elements. +/// +/// Computes `!a & b` for each bit in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps) +#[inline] +#[target_feature(enable = "sse")] +// i586 only seems to generate plain `not` and `and` instructions, so ignore +// it. +#[cfg_attr( + all(test, any(target_arch = "x86_64", target_feature = "sse2")), + assert_instr(andnps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 { + unsafe { + let a: __m128i = mem::transmute(a); + let b: __m128i = mem::transmute(b); + let mask: __m128i = mem::transmute(i32x4::splat(-1)); + mem::transmute(simd_and(simd_xor(mask, a), b)) + } +} + +/// Bitwise OR of packed single-precision (32-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps) +#[inline] +#[target_feature(enable = "sse")] +// i586 only seems to generate plain `or` instructions, so we ignore it. +#[cfg_attr( + all(test, any(target_arch = "x86_64", target_feature = "sse2")), + assert_instr(orps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_or_ps(a: __m128, b: __m128) -> __m128 { + unsafe { + let a: __m128i = mem::transmute(a); + let b: __m128i = mem::transmute(b); + mem::transmute(simd_or(a, b)) + } +} + +/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point +/// elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps) +#[inline] +#[target_feature(enable = "sse")] +// i586 only seems to generate plain `xor` instructions, so we ignore it. +#[cfg_attr( + all(test, any(target_arch = "x86_64", target_feature = "sse2")), + assert_instr(xorps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 { + unsafe { + let a: __m128i = mem::transmute(a); + let b: __m128i = mem::transmute(b); + mem::transmute(simd_xor(a, b)) + } +} + +/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of +/// the result will be `0xffffffff` if the two inputs are equal, or `0` +/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`. 
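+///
+/// A sketch of the mask-style result (assumes a target where `sse` is enabled):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let a = _mm_setr_ps(1.0, 7.0, 8.0, 9.0);
+/// let b = _mm_setr_ps(1.0, 0.0, 0.0, 0.0);
+/// // Lane 0 becomes the all-ones bit pattern (which reads back as NaN when
+/// // viewed as an `f32`) because 1.0 == 1.0; lanes 1..3 are copied from `a`.
+/// let r = _mm_cmpeq_ss(a, b);
+/// ```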
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpeqss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 0) } +} + +/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits +/// of the result will be `0xffffffff` if `a.extract(0)` is less than +/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the +/// upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpltss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 1) } +} + +/// Compares the lowest `f32` of both inputs for less than or equal. The lowest +/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than +/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result +/// are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpless))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 2) } +} + +/// Compares the lowest `f32` of both inputs for greater than. The lowest 32 +/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater +/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result +/// are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpltss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_shuffle!(a, cmpss(b, a, 1), [4, 1, 2, 3]) } +} + +/// Compares the lowest `f32` of both inputs for greater than or equal. The +/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is +/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits +/// of the result are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpless))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_shuffle!(a, cmpss(b, a, 2), [4, 1, 2, 3]) } +} + +/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits +/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to +/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the +/// upper 96 bits of `a`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpneqss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 4) } +} + +/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32 +/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than +/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the +/// upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpnltss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 5) } +} + +/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The +/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not +/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits +/// of the result are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpnless))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 6) } +} + +/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32 +/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater +/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are +/// the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpnltss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_shuffle!(a, cmpss(b, a, 5), [4, 1, 2, 3]) } +} + +/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The +/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not +/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 +/// bits of the result are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpnless))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_shuffle!(a, cmpss(b, a, 6), [4, 1, 2, 3]) } +} + +/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of +/// the result will be `0xffffffff` if neither of `a.extract(0)` or +/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result +/// are the upper 96 bits of `a`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpordss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 7) } +} + +/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits +/// of the result will be `0xffffffff` if any of `a.extract(0)` or +/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result +/// are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpunordss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 3) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input elements +/// were equal, or `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpeqps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(a, b, 0) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is less than the corresponding element in `b`, or `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpltps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(a, b, 1) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is less than or equal to the corresponding element in `b`, or `0` +/// otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpleps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(a, b, 2) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is greater than the corresponding element in `b`, or `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpltps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(b, a, 1) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. 
+/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is greater than or equal to the corresponding element in `b`, or `0` +/// otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpleps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(b, a, 2) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input elements +/// are **not** equal, or `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpneqps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(a, b, 4) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is **not** less than the corresponding element in `b`, or `0` +/// otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpnltps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(a, b, 5) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is **not** less than or equal to the corresponding element in `b`, or +/// `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpnleps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(a, b, 6) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is **not** greater than the corresponding element in `b`, or `0` +/// otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpnltps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(b, a, 5) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is **not** greater than or equal to the corresponding element in `b`, +/// or `0` otherwise. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnleps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { cmpps(b, a, 6) }
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// Returns four floats that have one of two possible bit patterns. The element
+/// in the output vector will be `0xffffffff` if the input elements in `a` and
+/// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpordps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { cmpps(b, a, 7) }
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// Returns four floats that have one of two possible bit patterns. The element
+/// in the output vector will be `0xffffffff` if the input elements in `a` and
+/// `b` are unordered (i.e., at least one of them is a NaN), or 0 otherwise.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpunordps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { cmpps(b, a, 3) }
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if they are equal, or `0` otherwise.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
+    unsafe { comieq_ss(a, b) }
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
+    unsafe { comilt_ss(a, b) }
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
+    unsafe { comile_ss(a, b) }
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is greater than the one from `b`, or `0`
+/// otherwise.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(comiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 { + unsafe { comigt_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if the value from `a` is greater than or equal to the one from `b`, or +/// `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(comiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_comige_ss(a: __m128, b: __m128) -> i32 { + unsafe { comige_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if they are **not** equal, or `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(comiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 { + unsafe { comineq_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if they are equal, or `0` otherwise. This instruction will not signal +/// an exception if either argument is a quiet NaN. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 { + unsafe { ucomieq_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. +/// This instruction will not signal an exception if either argument is a quiet +/// NaN. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 { + unsafe { ucomilt_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if the value from `a` is less than or equal to the one from `b`, or `0` +/// otherwise. This instruction will not signal an exception if either argument +/// is a quiet NaN. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 { + unsafe { ucomile_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if the value from `a` is greater than the one from `b`, or `0` +/// otherwise. This instruction will not signal an exception if either argument +/// is a quiet NaN. 
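+///
+/// A hypothetical sketch (not part of the original documentation): with a NaN
+/// operand the comparison is unordered and quietly yields `0`.
+///
+/// ```ignore
+/// assert_eq!(_mm_ucomigt_ss(_mm_set_ss(f32::NAN), _mm_set_ss(1.0)), 0);
+/// assert_eq!(_mm_ucomigt_ss(_mm_set_ss(2.0), _mm_set_ss(1.0)), 1);
+/// ```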
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 { + unsafe { ucomigt_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if the value from `a` is greater than or equal to the one from `b`, or +/// `0` otherwise. This instruction will not signal an exception if either +/// argument is a quiet NaN. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 { + unsafe { ucomige_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if they are **not** equal, or `0` otherwise. This instruction will not +/// signal an exception if either argument is a quiet NaN. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { + unsafe { ucomineq_ss(a, b) } +} + +/// Converts the lowest 32 bit float in the input vector to a 32 bit integer. +/// +/// The result is rounded according to the current rounding mode. If the result +/// cannot be represented as a 32 bit integer the result will be `0x8000_0000` +/// (`i32::MIN`). +/// +/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cvtss2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtss_si32(a: __m128) -> i32 { + unsafe { cvtss2si(a) } +} + +/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cvtss2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvt_ss2si(a: __m128) -> i32 { + _mm_cvtss_si32(a) +} + +/// Converts the lowest 32 bit float in the input vector to a 32 bit integer +/// with +/// truncation. +/// +/// The result is rounded always using truncation (round towards zero). If the +/// result cannot be represented as a 32 bit integer the result will be +/// `0x8000_0000` (`i32::MIN`). +/// +/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cvttss2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvttss_si32(a: __m128) -> i32 { + unsafe { cvttss2si(a) } +} + +/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html). 
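+///
+/// For illustration (a sketch, not part of the original documentation),
+/// truncation differs from the default round-to-nearest behavior of
+/// [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html):
+///
+/// ```ignore
+/// let x = _mm_set_ss(2.7);
+/// assert_eq!(_mm_cvttss_si32(x), 2); // truncate toward zero
+/// assert_eq!(_mm_cvtss_si32(x), 3);  // round to nearest (default MXCSR mode)
+/// ```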
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cvttss2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtt_ss2si(a: __m128) -> i32 { + _mm_cvttss_si32(a) +} + +/// Extracts the lowest 32 bit float from the input vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32) +#[inline] +#[target_feature(enable = "sse")] +// No point in using assert_instrs. In Unix x86_64 calling convention this is a +// no-op, and on msvc it's just a `mov`. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtss_f32(a: __m128) -> f32 { + unsafe { simd_extract!(a, 0) } +} + +/// Converts a 32 bit integer to a 32 bit float. The result vector is the input +/// vector `a` with the lowest 32 bit float replaced by the converted integer. +/// +/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit +/// input). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cvtsi2ss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 { + unsafe { cvtsi2ss(a, b) } +} + +/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cvtsi2ss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 { + _mm_cvtsi32_ss(a, b) +} + +/// Construct a `__m128` with the lowest element set to `a` and the rest set to +/// zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set_ss(a: f32) -> __m128 { + __m128([a, 0.0, 0.0, 0.0]) +} + +/// Construct a `__m128` with all element set to `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(shufps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set1_ps(a: f32) -> __m128 { + __m128([a, a, a, a]) +} + +/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(shufps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set_ps1(a: f32) -> __m128 { + _mm_set1_ps(a) +} + +/// Construct a `__m128` from four floating point values highest to lowest. +/// +/// Note that `a` will be the highest 32 bits of the result, and `d` the +/// lowest. This matches the standard way of writing bit patterns on x86: +/// +/// ```text +/// bit 127 .. 96 95 .. 64 63 .. 32 31 .. 
0 +/// +---------+---------+---------+---------+ +/// | a | b | c | d | result +/// +---------+---------+---------+---------+ +/// ``` +/// +/// Alternatively: +/// +/// ```text +/// let v = _mm_set_ps(d, c, b, a); +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(unpcklps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { + __m128([d, c, b, a]) +} + +/// Construct a `__m128` from four floating point values lowest to highest. +/// +/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32 +/// bits of the result, and `d` the highest. +/// +/// ```text +/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d)); +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr( + all(test, any(target_env = "msvc", target_arch = "x86_64")), + assert_instr(unpcklps) +)] +// On a 32-bit architecture on non-msvc it just copies the operands from the stack. +#[cfg_attr( + all(test, all(not(target_env = "msvc"), target_arch = "x86")), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { + __m128([a, b, c, d]) +} + +/// Construct a `__m128` with all elements initialized to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(xorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_setzero_ps() -> __m128 { + const { unsafe { mem::zeroed() } } +} + +/// A utility function for creating masks to use with Intel shuffle and +/// permute intrinsics. +#[inline] +#[allow(non_snake_case)] +#[unstable(feature = "stdarch_x86_mm_shuffle", issue = "111147")] +pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 { + ((z << 6) | (y << 4) | (x << 2) | w) as i32 +} + +/// Shuffles packed single-precision (32-bit) floating-point elements in `a` and +/// `b` using `MASK`. +/// +/// The lower half of result takes values from `a` and the higher half from +/// `b`. Mask is split to 2 control bits each to index the element from inputs. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps) +/// +/// Note that there appears to be a mistake within Intel's Intrinsics Guide. +/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32` +/// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_). +/// Performing an implicit type conversion between an unsigned integer and a signed integer +/// does not cause a problem in C, however Rust's commitment to strong typing does not allow this. 
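+///
+/// A hypothetical usage sketch (not part of the original documentation); the
+/// mask literal below could equivalently be built with `_MM_SHUFFLE`:
+///
+/// ```ignore
+/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+/// // The low two result lanes come from `a` (indices 0 and 2), the high two
+/// // from `b` (indices 1 and 3), so `r` holds [1.0, 3.0, 6.0, 8.0].
+/// let r = _mm_shuffle_ps::<0b11_01_10_00>(a, b);
+/// ```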
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(shufps, MASK = 3))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
+    static_assert_uimm_bits!(MASK, 8);
+    unsafe {
+        simd_shuffle!(
+            a,
+            b,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11) + 4,
+                ((MASK as u32 >> 6) & 0b11) + 4,
+            ],
+        )
+    }
+}
+
+/// Unpacks and interleaves single-precision (32-bit) floating-point elements
+/// from the higher half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(unpckhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { simd_shuffle!(a, b, [2, 6, 3, 7]) }
+}
+
+/// Unpacks and interleaves single-precision (32-bit) floating-point elements
+/// from the lower half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(unpcklps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { simd_shuffle!(a, b, [0, 4, 1, 5]) }
+}
+
+/// Combines the higher half of `a` and `b`. The higher half of `b` occupies
+/// the lower half of the result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movhlps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
+    // TODO: figure out why this is a different instruction on msvc?
+    unsafe { simd_shuffle!(a, b, [6, 7, 2, 3]) }
+}
+
+/// Combines the lower half of `a` and `b`. The lower half of `b` occupies the
+/// higher half of the result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movlhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { simd_shuffle!(a, b, [0, 1, 4, 5]) }
+}
+
+/// Returns a mask of the most significant bit of each element in `a`.
+///
+/// The mask is stored in the 4 least significant bits of the return value.
+/// All other bits are set to `0`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movmskps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_movemask_ps(a: __m128) -> i32 {
+    // Propagate the highest bit to the rest, because simd_bitmask
+    // requires all-1 or all-0.
+    unsafe {
+        let mask: i32x4 = simd_lt(transmute(a), i32x4::ZERO);
+        simd_bitmask::<i32x4, u8>(mask).into()
+    }
+}
+
+/// Construct a `__m128` with the lowest element read from `p` and the other
+/// elements set to zero.
+///
+/// This corresponds to instructions `VMOVSS` / `MOVSS`.
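+///
+/// A hypothetical usage sketch (not part of the original documentation):
+///
+/// ```ignore
+/// let x = 1.5f32;
+/// // Safety: `&x` is a valid, readable pointer to an `f32`.
+/// let v = unsafe { _mm_load_ss(&x) };
+/// assert_eq!(_mm_cvtss_f32(v), 1.5);
+/// ```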
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 { + __m128([*p, 0.0, 0.0, 0.0]) +} + +/// Construct a `__m128` by duplicating the value read from `p` into all +/// elements. +/// +/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some +/// shuffling. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 { + let a = *p; + __m128([a, a, a, a]) +} + +/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { + _mm_load1_ps(p) +} + +/// Loads four `f32` values from *aligned* memory into a `__m128`. If the +/// pointer is not aligned to a 128-bit boundary (16 bytes) a general +/// protection fault will be triggered (fatal program crash). +/// +/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned +/// memory. +/// +/// This corresponds to instructions `VMOVAPS` / `MOVAPS`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps) +#[inline] +#[target_feature(enable = "sse")] +// FIXME: Rust doesn't emit alignment attributes for MSVC x86-32. Ref https://github.com/rust-lang/rust/pull/139261 +// All aligned load/store intrinsics are affected +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { + *(p as *const __m128) +} + +/// Loads four `f32` values from memory into a `__m128`. There are no +/// restrictions +/// on memory alignment. For aligned memory +/// [`_mm_load_ps`](fn._mm_load_ps.html) +/// may be faster. +/// +/// This corresponds to instructions `VMOVUPS` / `MOVUPS`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movups))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { + // Note: Using `*p` would require `f32` alignment, but `movups` has no + // alignment restrictions. + let mut dst = _mm_undefined_ps(); + ptr::copy_nonoverlapping( + p as *const u8, + ptr::addr_of_mut!(dst) as *mut u8, + mem::size_of::<__m128>(), + ); + dst +} + +/// Loads four `f32` values from aligned memory into a `__m128` in reverse +/// order. +/// +/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general +/// protection fault will be triggered (fatal program crash). 
+/// +/// Functionally equivalent to the following code sequence (assuming `p` +/// satisfies the alignment restrictions): +/// +/// ```text +/// let a0 = *p; +/// let a1 = *p.add(1); +/// let a2 = *p.add(2); +/// let a3 = *p.add(3); +/// __m128::new(a3, a2, a1, a0) +/// ``` +/// +/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some +/// shuffling. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { + let a = _mm_load_ps(p); + simd_shuffle!(a, a, [3, 2, 1, 0]) +} + +/// Stores the lowest 32 bit float of `a` into memory. +/// +/// This intrinsic corresponds to the `MOVSS` instruction. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { + *p = simd_extract!(a, 0); +} + +/// Stores the lowest 32 bit float of `a` repeated four times into *aligned* +/// memory. +/// +/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general +/// protection fault will be triggered (fatal program crash). +/// +/// Functionally equivalent to the following code sequence (assuming `p` +/// satisfies the alignment restrictions): +/// +/// ```text +/// let x = a.extract(0); +/// *p = x; +/// *p.add(1) = x; +/// *p.add(2) = x; +/// *p.add(3) = x; +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { + let b: __m128 = simd_shuffle!(a, a, [0, 0, 0, 0]); + *(p as *mut __m128) = b; +} + +/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { + _mm_store1_ps(p, a); +} + +/// Stores four 32-bit floats into *aligned* memory. +/// +/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general +/// protection fault will be triggered (fatal program crash). +/// +/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned +/// memory. +/// +/// This corresponds to instructions `VMOVAPS` / `MOVAPS`. 
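+///
+/// A hedged sketch of providing 16-byte-aligned storage (the `Align16`
+/// wrapper below is an assumption for illustration, not part of this API):
+///
+/// ```ignore
+/// #[repr(align(16))]
+/// struct Align16([f32; 4]);
+///
+/// let mut buf = Align16([0.0; 4]);
+/// // Safety: `buf` is 16-byte aligned and large enough for a `__m128`.
+/// unsafe { _mm_store_ps(buf.0.as_mut_ptr(), _mm_setr_ps(1.0, 2.0, 3.0, 4.0)) };
+/// assert_eq!(buf.0, [1.0, 2.0, 3.0, 4.0]);
+/// ```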
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { + *(p as *mut __m128) = a; +} + +/// Stores four 32-bit floats into memory. There are no restrictions on memory +/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be +/// faster. +/// +/// This corresponds to instructions `VMOVUPS` / `MOVUPS`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movups))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { + ptr::copy_nonoverlapping( + ptr::addr_of!(a) as *const u8, + p as *mut u8, + mem::size_of::<__m128>(), + ); +} + +/// Stores four 32-bit floats into *aligned* memory in reverse order. +/// +/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general +/// protection fault will be triggered (fatal program crash). +/// +/// Functionally equivalent to the following code sequence (assuming `p` +/// satisfies the alignment restrictions): +/// +/// ```text +/// *p = a.extract(3); +/// *p.add(1) = a.extract(2); +/// *p.add(2) = a.extract(1); +/// *p.add(3) = a.extract(0); +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { + let b: __m128 = simd_shuffle!(a, a, [3, 2, 1, 0]); + *(p as *mut __m128) = b; +} + +/// Returns a `__m128` with the first component from `b` and the remaining +/// components from `a`. +/// +/// In other words for any `a` and `b`: +/// ```text +/// _mm_move_ss(a, b) == a.replace(0, b.extract(0)) +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_move_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_shuffle!(a, b, [4, 1, 2, 3]) } +} + +/// Performs a serializing operation on all non-temporal ("streaming") store instructions that +/// were issued by the current thread prior to this instruction. +/// +/// Guarantees that every non-temporal store instruction that precedes this fence, in program order, is +/// ordered before any load or store instruction which follows the fence in +/// synchronization order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence) +/// (but note that Intel is only documenting the hardware-level concerns related to this +/// instruction; the Intel documentation does not take into account the extra concerns that arise +/// because the Rust memory model is different from the x86 memory model.) 
+/// +/// # Safety of non-temporal stores +/// +/// After using any non-temporal store intrinsic, but before any other access to the memory that the +/// intrinsic mutates, a call to `_mm_sfence` must be performed on the thread that used the +/// intrinsic. +/// +/// Non-temporal stores behave very different from regular stores. For the purpose of the Rust +/// memory model, these stores are happening asynchronously in a background thread. This means a +/// non-temporal store can cause data races with other accesses, even other accesses on the same +/// thread. It also means that cross-thread synchronization does not work as expected: let's say the +/// intrinsic is called on thread T1, and T1 performs synchronization with some other thread T2. The +/// non-temporal store acts as if it happened not in T1 but in a different thread T3, and T2 has not +/// synchronized with T3! Calling `_mm_sfence` makes the current thread wait for and synchronize +/// with all the non-temporal stores previously started on this thread, which means in particular +/// that subsequent synchronization with other threads will then work as intended again. +/// +/// The general pattern to use non-temporal stores correctly is to call `_mm_sfence` before your +/// code jumps back to code outside your library. This ensures all stores inside your function +/// are synchronized-before the return, and thus transitively synchronized-before everything +/// the caller does after your function returns. +// +// The following is not a doc comment since it's not clear whether we want to put this into the +// docs, but it should be written out somewhere. +// +// Formally, we consider non-temporal stores and sfences to be opaque blobs that the compiler cannot +// inspect, and that behave like the following functions. This explains where the docs above come +// from. +// ``` +// #[thread_local] +// static mut PENDING_NONTEMP_WRITES = AtomicUsize::new(0); +// +// pub unsafe fn nontemporal_store(ptr: *mut T, val: T) { +// PENDING_NONTEMP_WRITES.fetch_add(1, Relaxed); +// // Spawn a thread that will eventually do our write. +// // We need to fetch a pointer to this thread's pending-write +// // counter, so that we can access it from the background thread. +// let pending_writes = addr_of!(PENDING_NONTEMP_WRITES); +// // If this was actual Rust code we'd have to do some extra work +// // because `ptr`, `val`, `pending_writes` are all `!Send`. We skip that here. +// std::thread::spawn(move || { +// // Do the write in the background thread. +// ptr.write(val); +// // Register the write as done. Crucially, this is `Release`, so it +// // syncs-with the `Acquire in `sfence`. +// (&*pending_writes).fetch_sub(1, Release); +// }); +// } +// +// pub fn sfence() { +// unsafe { +// // Wait until there are no more pending writes. +// while PENDING_NONTEMP_WRITES.load(Acquire) > 0 {} +// } +// } +// ``` +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(sfence))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_sfence() { + sfence() +} + +/// Gets the unsigned 32-bit value of the MXCSR control and status register. 
+/// +/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust +/// floating-point operations may or may not result in this register getting updated with exception +/// state, and the register can change between two invocations of this function even when no +/// floating-point operations appear in the source code (since floating-point operations appearing +/// earlier or later can be reordered). +/// +/// If you need to perform some floating-point operations and check whether they raised an +/// exception, use an inline assembly block for the entire sequence of operations. +/// +/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(stmxcsr))] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _mm_getcsr() -> u32 { + unsafe { + let mut result = 0_i32; + stmxcsr(ptr::addr_of_mut!(result) as *mut i8); + result as u32 + } +} + +/// Sets the MXCSR register with the 32-bit unsigned integer value. +/// +/// This register controls how SIMD instructions handle floating point +/// operations. Modifying this register only affects the current thread. +/// +/// It contains several groups of flags: +/// +/// * *Exception flags* report which exceptions occurred since last they were reset. +/// +/// * *Masking flags* can be used to mask (ignore) certain exceptions. By default +/// these flags are all set to 1, so all exceptions are masked. When +/// an exception is masked, the processor simply sets the exception flag and +/// continues the operation. If the exception is unmasked, the flag is also set +/// but additionally an exception handler is invoked. +/// +/// * *Rounding mode flags* control the rounding mode of floating point +/// instructions. +/// +/// * The *denormals-are-zero mode flag* turns all numbers which would be +/// denormalized (exponent bits are all zeros) into zeros. +/// +/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to +/// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and +/// will optimize accordingly. This even applies when the register is altered and later reset to its +/// original value without any floating-point operations appearing in the source code between those +/// operations (since floating-point operations appearing earlier or later can be reordered). +/// +/// If you need to perform some floating-point operations under a different masking flags, rounding +/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the +/// original MXCSR register state before the end of the block. +/// +/// ## Exception Flags +/// +/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing +/// Infinity by Infinity). +/// +/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized +/// number. Mainly this can cause loss of precision. +/// +/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred. +/// +/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a +/// result was too large to be represented (e.g., an `f32` with absolute +/// value greater than `2^128`). 
+///
+/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
+///   result was too small to be represented in a normalized way (e.g., an
+///   `f32` with absolute value smaller than `2^-126`.)
+///
+/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
+///   precision exception). This means some precision was lost due to rounding.
+///   For example, the fraction `1/3` cannot be represented accurately in a
+///   32 or 64 bit float and computing it would cause this exception to be
+///   raised. Precision exceptions are very common, so they are usually masked.
+///
+/// Exception flags can be read and set using the convenience functions
+/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
+/// check if an operation caused some overflow:
+///
+/// ```rust,ignore
+/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
+/// // perform calculations
+/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
+///     // handle overflow
+/// }
+/// ```
+///
+/// ## Masking Flags
+///
+/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
+/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
+/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
+///
+/// A single masking bit can be set via
+///
+/// ```rust,ignore
+/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
+/// ```
+///
+/// However, since mask bits are by default all set to 1, it is more common to
+/// want to *disable* certain bits. For example, to unmask the underflow
+/// exception, use:
+///
+/// ```rust,ignore
+/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow exception
+/// ```
+///
+/// Warning: an unmasked exception will cause an exception handler to be
+/// called. The standard handler will simply terminate the process. So, in this
+/// case any underflow exception would terminate the current process with
+/// something like `signal: 8, SIGFPE: erroneous arithmetic operation`.
+///
+/// ## Rounding Mode
+///
+/// The rounding mode is described using two bits. It can be read and set using
+/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
+/// `_MM_SET_ROUNDING_MODE(mode)`.
+///
+/// The rounding modes are:
+///
+/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
+///   value. If two values are equally close, round to even (i.e., least
+///   significant bit will be zero).
+///
+/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
+///
+/// * `_MM_ROUND_UP`: Round toward positive Infinity.
+///
+/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
+///
+/// Example:
+///
+/// ```rust,ignore
+/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
+/// ```
+///
+/// ## Denormals-are-zero/Flush-to-zero Mode
+///
+/// If this bit is set, values that would be denormalized will be set to zero
+/// instead. This is turned off by default.
+/// +/// You can read and enable/disable this mode via the helper functions +/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`: +/// +/// ```rust,ignore +/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default) +/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on +/// ``` +/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(ldmxcsr))] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _mm_setcsr(val: u32) { + ldmxcsr(ptr::addr_of!(val) as *const i8); +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_EXCEPT_INVALID: u32 = 0x0001; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_EXCEPT_DENORM: u32 = 0x0002; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_EXCEPT_INEXACT: u32 = 0x0020; +/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_EXCEPT_MASK: u32 = 0x003f; + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_MASK_INVALID: u32 = 0x0080; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_MASK_DENORM: u32 = 0x0100; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_MASK_DIV_ZERO: u32 = 0x0200; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_MASK_OVERFLOW: u32 = 0x0400; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_MASK_UNDERFLOW: u32 = 0x0800; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_MASK_INEXACT: u32 = 0x1000; +/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_MASK_MASK: u32 = 0x1f80; + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_ROUND_NEAREST: u32 = 0x0000; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_ROUND_DOWN: u32 = 0x2000; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_ROUND_UP: u32 = 0x4000; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000; + +/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_ROUND_MASK: u32 = 0x6000; + +/// See 
[`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000; + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { + _mm_getcsr() & _MM_MASK_MASK +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { + _mm_getcsr() & _MM_EXCEPT_MASK +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { + _mm_getcsr() & _MM_FLUSH_ZERO_MASK +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { + _mm_getcsr() & _MM_ROUND_MASK +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { + _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | (x & _MM_MASK_MASK)) +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { + _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | (x & _MM_EXCEPT_MASK)) +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { + _mm_setcsr((_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | (x & _MM_FLUSH_ZERO_MASK)) +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) { + _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | (x & _MM_ROUND_MASK)) +} + +/// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_HINT_T0: i32 = 3; + +/// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_HINT_T1: i32 = 2; + +/// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_HINT_T2: i32 = 1; + +/// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_HINT_NTA: i32 = 0; + +/// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_HINT_ET0: i32 = 7; + +/// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_HINT_ET1: i32 = 6; + +/// Fetch the cache line that contains address `p` using the given `STRATEGY`. +/// +/// The `STRATEGY` must be one of: +/// +/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the +/// cache hierarchy. +/// +/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher. +/// +/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or +/// an implementation-specific choice (e.g., L2 if there is no L3). +/// +/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the +/// non-temporal access (NTA) hint. It may be a place closer than main memory +/// but outside of the cache hierarchy. This is used to reduce access latency +/// without polluting the cache. 
+///
+/// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and
+///   [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
+///   and `_MM_HINT_T1` but indicate an anticipation to write to the address.
+///
+/// The actual implementation depends on the particular CPU. This instruction
+/// is considered a hint, so the CPU is also free to simply ignore the request.
+///
+/// The amount of prefetched data depends on the cache line size of the
+/// specific CPU, but it will be at least 32 bytes.
+///
+/// Common caveats:
+///
+/// * Most modern CPUs already automatically prefetch data based on predicted
+///   access patterns.
+///
+/// * Data is usually not fetched if this would cause a TLB miss or a page
+///   fault.
+///
+/// * Too much prefetching can cause unnecessary cache evictions.
+///
+/// * Prefetching may also fail if there are not enough memory-subsystem
+///   resources (e.g., request buffers).
+///
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
+#[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
+#[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
+#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
+    static_assert_uimm_bits!(STRATEGY, 3);
+    // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
+    // `locality` and `rw` are based on our `STRATEGY`.
+    prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
+}
+
+/// Returns a vector of type `__m128` with indeterminate elements.
+/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_undefined_ps() -> __m128 {
+    const { unsafe { mem::zeroed() } }
+}
+
+/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
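+///
+/// A hypothetical usage sketch (not part of the original documentation):
+///
+/// ```ignore
+/// let mut r0 = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+/// let mut r1 = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+/// let mut r2 = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
+/// let mut r3 = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
+/// _MM_TRANSPOSE4_PS(&mut r0, &mut r1, &mut r2, &mut r3);
+/// // r0 now holds the first column of the original matrix: [1.0, 5.0, 9.0, 13.0]
+/// ```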
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS) +#[inline] +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _MM_TRANSPOSE4_PS( + row0: &mut __m128, + row1: &mut __m128, + row2: &mut __m128, + row3: &mut __m128, +) { + let tmp0 = _mm_unpacklo_ps(*row0, *row1); + let tmp2 = _mm_unpacklo_ps(*row2, *row3); + let tmp1 = _mm_unpackhi_ps(*row0, *row1); + let tmp3 = _mm_unpackhi_ps(*row2, *row3); + + *row0 = _mm_movelh_ps(tmp0, tmp2); + *row1 = _mm_movehl_ps(tmp2, tmp0); + *row2 = _mm_movelh_ps(tmp1, tmp3); + *row3 = _mm_movehl_ps(tmp3, tmp1); +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.sse.rcp.ss"] + fn rcpss(a: __m128) -> __m128; + #[link_name = "llvm.x86.sse.rcp.ps"] + fn rcpps(a: __m128) -> __m128; + #[link_name = "llvm.x86.sse.rsqrt.ss"] + fn rsqrtss(a: __m128) -> __m128; + #[link_name = "llvm.x86.sse.rsqrt.ps"] + fn rsqrtps(a: __m128) -> __m128; + #[link_name = "llvm.x86.sse.min.ss"] + fn minss(a: __m128, b: __m128) -> __m128; + #[link_name = "llvm.x86.sse.min.ps"] + fn minps(a: __m128, b: __m128) -> __m128; + #[link_name = "llvm.x86.sse.max.ss"] + fn maxss(a: __m128, b: __m128) -> __m128; + #[link_name = "llvm.x86.sse.max.ps"] + fn maxps(a: __m128, b: __m128) -> __m128; + #[link_name = "llvm.x86.sse.cmp.ps"] + fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128; + #[link_name = "llvm.x86.sse.comieq.ss"] + fn comieq_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.comilt.ss"] + fn comilt_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.comile.ss"] + fn comile_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.comigt.ss"] + fn comigt_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.comige.ss"] + fn comige_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.comineq.ss"] + fn comineq_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.ucomieq.ss"] + fn ucomieq_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.ucomilt.ss"] + fn ucomilt_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.ucomile.ss"] + fn ucomile_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.ucomigt.ss"] + fn ucomigt_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.ucomige.ss"] + fn ucomige_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.ucomineq.ss"] + fn ucomineq_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.cvtss2si"] + fn cvtss2si(a: __m128) -> i32; + #[link_name = "llvm.x86.sse.cvttss2si"] + fn cvttss2si(a: __m128) -> i32; + #[link_name = "llvm.x86.sse.cvtsi2ss"] + fn cvtsi2ss(a: __m128, b: i32) -> __m128; + #[link_name = "llvm.x86.sse.sfence"] + fn sfence(); + #[link_name = "llvm.x86.sse.stmxcsr"] + fn stmxcsr(p: *mut i8); + #[link_name = "llvm.x86.sse.ldmxcsr"] + fn ldmxcsr(p: *const i8); + #[link_name = "llvm.prefetch"] + fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32); + #[link_name = "llvm.x86.sse.cmp.ss"] + fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128; +} + +/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint. +/// +/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection +/// exception _may_ be generated. 
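+///
+/// A hedged usage sketch (illustrative only; the `Align16` wrapper is an
+/// assumption, not part of this API; see the safety notes below):
+///
+/// ```ignore
+/// #[repr(align(16))]
+/// struct Align16([f32; 4]);
+///
+/// let mut out = Align16([0.0; 4]);
+/// unsafe {
+///     _mm_stream_ps(out.0.as_mut_ptr(), _mm_set1_ps(1.0));
+///     // Make the non-temporal store visible before any other access.
+///     _mm_sfence();
+/// }
+/// assert_eq!(out.0, [1.0; 4]);
+/// ```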
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps) +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movntps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { + crate::arch::asm!( + vps!("movntps", ",{a}"), + p = in(reg) mem_addr, + a = in(xmm_reg) a, + options(nostack, preserves_flags), + ); +} + +#[cfg(test)] +mod tests { + use crate::{hint::black_box, mem::transmute, ptr}; + use std::boxed; + use stdarch_test::simd_test; + + use crate::core_arch::{simd::*, x86::*}; + + const NAN: f32 = f32::NAN; + + #[simd_test(enable = "sse")] + unsafe fn test_mm_add_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_add_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_add_ss() { + let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_add_ss(a, b); + assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_sub_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_sub_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_sub_ss() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_sub_ss(a, b); + assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_mul_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_mul_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_mul_ss() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_mul_ss(a, b); + assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_div_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0); + let r = _mm_div_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_div_ss() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_div_ss(a, b); + assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_sqrt_ss() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_sqrt_ss(a); + let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_sqrt_ps() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_sqrt_ps(a); + let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + 
unsafe fn test_mm_rcp_ss() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_rcp_ss(a); + let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0); + let rel_err = 0.00048828125; + assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err); + for i in 1..4 { + assert_eq!(get_m128(r, i), get_m128(e, i)); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_rcp_ps() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_rcp_ps(a); + let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215); + let rel_err = 0.00048828125; + for i in 0..4 { + assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_rsqrt_ss() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_rsqrt_ss(a); + let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0); + let rel_err = 0.00048828125; + for i in 0..4 { + assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_rsqrt_ps() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_rsqrt_ps(a); + let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845); + let rel_err = 0.00048828125; + for i in 0..4 { + assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_min_ss() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_min_ss(a, b); + assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_min_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_min_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); + + // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min` + // is lowered by the llvm codegen backend to `llvm.minnum.v*` llvm intrinsic. This intrinsic + // doesn't specify how -0.0 is handled. Unfortunately it happens to behave different from + // the `minps` x86 instruction on x86. The `llvm.minnum.v*` llvm intrinsic equals + // `r1` to `a` and `r2` to `b`. + let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); + let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_min_ps(a, b)); + let r2: [u8; 16] = transmute(_mm_min_ps(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_max_ss() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_max_ss(a, b); + assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_max_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_max_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0)); + + // Check SSE-specific semantics for -0.0 handling. 
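+        // (`maxps` returns its second operand when the two inputs compare
+        // equal, and -0.0 == 0.0 under IEEE comparison, so the operand order
+        // decides which zero comes back; the asserts below rely on that.)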
+ let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); + let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_max_ps(a, b)); + let r2: [u8; 16] = transmute(_mm_max_ps(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_and_ps() { + let a = transmute(u32x4::splat(0b0011)); + let b = transmute(u32x4::splat(0b0101)); + let r = _mm_and_ps(*black_box(&a), *black_box(&b)); + let e = transmute(u32x4::splat(0b0001)); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_andnot_ps() { + let a = transmute(u32x4::splat(0b0011)); + let b = transmute(u32x4::splat(0b0101)); + let r = _mm_andnot_ps(*black_box(&a), *black_box(&b)); + let e = transmute(u32x4::splat(0b0100)); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_or_ps() { + let a = transmute(u32x4::splat(0b0011)); + let b = transmute(u32x4::splat(0b0101)); + let r = _mm_or_ps(*black_box(&a), *black_box(&b)); + let e = transmute(u32x4::splat(0b0111)); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_xor_ps() { + let a = transmute(u32x4::splat(0b0011)); + let b = transmute(u32x4::splat(0b0101)); + let r = _mm_xor_ps(*black_box(&a), *black_box(&b)); + let e = transmute(u32x4::splat(0b0110)); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpeq_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0); + let r: u32x4 = transmute(_mm_cmpeq_ss(a, b)); + let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0)); + assert_eq!(r, e); + + let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2)); + let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0)); + assert_eq!(r2, e2); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmplt_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = 0u32; // a.extract(0) < b.extract(0) + let c1 = 0u32; // a.extract(0) < c.extract(0) + let d1 = !0u32; // a.extract(0) < d.extract(0) + + let rb: u32x4 = transmute(_mm_cmplt_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmplt_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmplt_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmple_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = 0u32; // a.extract(0) <= b.extract(0) + let c1 = !0u32; // a.extract(0) <= c.extract(0) + let d1 = !0u32; // a.extract(0) <= d.extract(0) + + let rb: u32x4 = transmute(_mm_cmple_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmple_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmple_ss(a, d)); + 
let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpgt_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) > b.extract(0) + let c1 = 0u32; // a.extract(0) > c.extract(0) + let d1 = 0u32; // a.extract(0) > d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpge_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) >= b.extract(0) + let c1 = !0u32; // a.extract(0) >= c.extract(0) + let d1 = 0u32; // a.extract(0) >= d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpge_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpge_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpge_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpneq_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) != b.extract(0) + let c1 = 0u32; // a.extract(0) != c.extract(0) + let d1 = !0u32; // a.extract(0) != d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpnlt_ss() { + // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there + // must be a difference. It may have to do with behavior in the + // presence of NaNs (signaling or quiet). If so, we should add tests + // for those. 
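+ // The difference is indeed NaN handling: `_mm_cmpnlt_ss` ("not less than") returns
+ // all-ones in lane 0 when either operand is NaN, whereas `_mm_cmpge_ss` returns
+ // all-zeros for such unordered inputs.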
+ + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) >= b.extract(0) + let c1 = !0u32; // a.extract(0) >= c.extract(0) + let d1 = 0u32; // a.extract(0) >= d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpnle_ss() { + // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there + // must be a difference. It may have to do with behavior in the + // presence + // of NaNs (signaling or quiet). If so, we should add tests for those. + + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) > b.extract(0) + let c1 = 0u32; // a.extract(0) > c.extract(0) + let d1 = 0u32; // a.extract(0) > d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpngt_ss() { + // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there + // must be a difference. It may have to do with behavior in the + // presence of NaNs (signaling or quiet). If so, we should add tests + // for those. + + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = 0u32; // a.extract(0) <= b.extract(0) + let c1 = !0u32; // a.extract(0) <= c.extract(0) + let d1 = !0u32; // a.extract(0) <= d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpnge_ss() { + // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there + // must be a difference. It may have to do with behavior in the + // presence of NaNs (signaling or quiet). If so, we should add tests + // for those. 
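+ // Here too the behaviors diverge only for NaN operands: `_mm_cmpnge_ss` returns
+ // all-ones in lane 0 when either operand is NaN, while `_mm_cmplt_ss` returns all-zeros.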
+ + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = 0u32; // a.extract(0) < b.extract(0) + let c1 = 0u32; // a.extract(0) < c.extract(0) + let d1 = !0u32; // a.extract(0) < d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpord_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) ord b.extract(0) + let c1 = 0u32; // a.extract(0) ord c.extract(0) + let d1 = !0u32; // a.extract(0) ord d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpord_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpord_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpord_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpunord_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = 0u32; // a.extract(0) unord b.extract(0) + let c1 = !0u32; // a.extract(0) unord c.extract(0) + let d1 = 0u32; // a.extract(0) unord d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpeq_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(fls, fls, tru, fls); + let r: u32x4 = transmute(_mm_cmpeq_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmplt_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(tru, fls, fls, fls); + let r: u32x4 = transmute(_mm_cmplt_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmple_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0); + let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(tru, fls, tru, fls); + let r: u32x4 = transmute(_mm_cmple_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpgt_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = 
_mm_setr_ps(15.0, 20.0, 1.0, 42.0); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(fls, tru, fls, fls); + let r: u32x4 = transmute(_mm_cmpgt_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpge_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(fls, tru, tru, fls); + let r: u32x4 = transmute(_mm_cmpge_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpneq_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(tru, tru, fls, tru); + let r: u32x4 = transmute(_mm_cmpneq_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpnlt_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(fls, tru, tru, tru); + let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpnle_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(fls, tru, fls, tru); + let r: u32x4 = transmute(_mm_cmpnle_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpngt_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(tru, fls, tru, tru); + let r: u32x4 = transmute(_mm_cmpngt_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpnge_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(tru, fls, fls, tru); + let r: u32x4 = transmute(_mm_cmpnge_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpord_ps() { + let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); + let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(tru, fls, fls, fls); + let r: u32x4 = transmute(_mm_cmpord_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpunord_ps() { + let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); + let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(fls, tru, tru, tru); + let r: u32x4 = transmute(_mm_cmpunord_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_comieq_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 0, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_comieq_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_comilt_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[0i32, 1, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_comilt_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + 
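+ // Added sketch (not part of the original suite): with a NaN operand the scalar
+ // comparison predicates exercised in this module return 0, except the `neq`
+ // variants, which treat unordered operands as "not equal" and return 1.
+ #[simd_test(enable = "sse")]
+ unsafe fn test_scalar_compare_nan_sketch() {
+ let a = _mm_set_ss(NAN);
+ let b = _mm_set_ss(1.0);
+ assert_eq!(_mm_comieq_ss(a, b), 0);
+ assert_eq!(_mm_ucomieq_ss(a, b), 0);
+ assert_eq!(_mm_comilt_ss(a, b), 0);
+ assert_eq!(_mm_ucomilt_ss(a, b), 0);
+ assert_eq!(_mm_comineq_ss(a, b), 1);
+ assert_eq!(_mm_ucomineq_ss(a, b), 1);
+ }
+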
#[simd_test(enable = "sse")] + unsafe fn test_mm_comile_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 1, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_comile_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_comigt_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 0, 1, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_comige_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_comineq_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[0i32, 1, 1, 1]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_comineq_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_ucomieq_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 0, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomieq_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_ucomilt_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[0i32, 1, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomilt_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_ucomile_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 1, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomile_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_ucomigt_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[0i32, 0, 1, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomigt_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_ucomige_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 0, 1, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomige_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn 
test_mm_ucomineq_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[0i32, 1, 1, 1]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomineq_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cvtss_si32() { + let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1]; + let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520]; + for i in 0..inputs.len() { + let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0); + let e = result[i]; + let r = _mm_cvtss_si32(x); + assert_eq!( + e, r, + "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}", + i, x, r, e + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cvttss_si32() { + let inputs = &[ + (42.0f32, 42i32), + (-31.4, -31), + (-33.5, -33), + (-34.5, -34), + (10.999, 10), + (-5.99, -5), + (4.0e10, i32::MIN), + (4.0e-10, 0), + (NAN, i32::MIN), + (2147483500.1, 2147483520), + ]; + for (i, &(xi, e)) in inputs.iter().enumerate() { + let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); + let r = _mm_cvttss_si32(x); + assert_eq!( + e, r, + "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}", + i, x, r, e + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cvtsi32_ss() { + let inputs = &[ + (4555i32, 4555.0f32), + (322223333, 322223330.0), + (-432, -432.0), + (-322223333, -322223330.0), + ]; + + for &(x, f) in inputs.iter() { + let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtsi32_ss(a, x); + let e = _mm_setr_ps(f, 6.0, 7.0, 8.0); + assert_eq_m128(e, r); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cvtss_f32() { + let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0); + assert_eq!(_mm_cvtss_f32(a), 312.0134); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_set_ss() { + let r = _mm_set_ss(black_box(4.25)); + assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_set1_ps() { + let r1 = _mm_set1_ps(black_box(4.25)); + let r2 = _mm_set_ps1(black_box(4.25)); + assert_eq!(get_m128(r1, 0), 4.25); + assert_eq!(get_m128(r1, 1), 4.25); + assert_eq!(get_m128(r1, 2), 4.25); + assert_eq!(get_m128(r1, 3), 4.25); + assert_eq!(get_m128(r2, 0), 4.25); + assert_eq!(get_m128(r2, 1), 4.25); + assert_eq!(get_m128(r2, 2), 4.25); + assert_eq!(get_m128(r2, 3), 4.25); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_set_ps() { + let r = _mm_set_ps( + black_box(1.0), + black_box(2.0), + black_box(3.0), + black_box(4.0), + ); + assert_eq!(get_m128(r, 0), 4.0); + assert_eq!(get_m128(r, 1), 3.0); + assert_eq!(get_m128(r, 2), 2.0); + assert_eq!(get_m128(r, 3), 1.0); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_setr_ps() { + let r = _mm_setr_ps( + black_box(1.0), + black_box(2.0), + black_box(3.0), + black_box(4.0), + ); + assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_setzero_ps() { + let r = *black_box(&_mm_setzero_ps()); + assert_eq_m128(r, _mm_set1_ps(0.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_MM_SHUFFLE() { + assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11); + assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00); + assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_shuffle_ps() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + 
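+ // The two low index fields of the immediate select lanes from `a`, the two high fields
+ // from `b`: 0b00_01_01_11 picks a[3], a[1], b[1], b[0], i.e. 4.0, 2.0, 6.0, 5.0.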
let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b); + assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_unpackhi_ps() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let r = _mm_unpackhi_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_unpacklo_ps() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let r = _mm_unpacklo_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_movehl_ps() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let r = _mm_movehl_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_movelh_ps() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let r = _mm_movelh_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_load_ss() { + let a = 42.0f32; + let r = _mm_load_ss(ptr::addr_of!(a)); + assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_load1_ps() { + let a = 42.0f32; + let r = _mm_load1_ps(ptr::addr_of!(a)); + assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_load_ps() { + let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + + let mut p = vals.as_ptr(); + let mut fixup = 0.0f32; + + // Make sure p is aligned, otherwise we might get a + // (signal: 11, SIGSEGV: invalid memory reference) + + let unalignment = (p as usize) & 0xf; + if unalignment != 0 { + let delta = (16 - unalignment) >> 2; + fixup = delta as f32; + p = p.add(delta); + } + + let r = _mm_load_ps(p); + let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup)); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_loadu_ps() { + let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = vals.as_ptr().add(3); + let r = _mm_loadu_ps(black_box(p)); + assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_loadr_ps() { + let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + + let mut p = vals.as_ptr(); + let mut fixup = 0.0f32; + + // Make sure p is aligned, otherwise we might get a + // (signal: 11, SIGSEGV: invalid memory reference) + + let unalignment = (p as usize) & 0xf; + if unalignment != 0 { + let delta = (16 - unalignment) >> 2; + fixup = delta as f32; + p = p.add(delta); + } + + let r = _mm_loadr_ps(p); + let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup)); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_store_ss() { + let mut vals = [0.0f32; 8]; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + _mm_store_ss(vals.as_mut_ptr().add(1), a); + + assert_eq!(vals[0], 0.0); + assert_eq!(vals[1], 1.0); + assert_eq!(vals[2], 0.0); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_store1_ps() { + let mut vals = [0.0f32; 8]; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + + let mut ofs = 0; + let mut p = vals.as_mut_ptr(); + + if (p as usize) & 0xf != 0 { + ofs = (16 - ((p as usize) & 0xf)) >> 2; + p = p.add(ofs); + } + + _mm_store1_ps(p, *black_box(&a)); + + if ofs > 0 { + assert_eq!(vals[ofs - 1], 0.0); + } + assert_eq!(vals[ofs + 0], 1.0); + 
assert_eq!(vals[ofs + 1], 1.0); + assert_eq!(vals[ofs + 2], 1.0); + assert_eq!(vals[ofs + 3], 1.0); + assert_eq!(vals[ofs + 4], 0.0); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_store_ps() { + let mut vals = [0.0f32; 8]; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + + let mut ofs = 0; + let mut p = vals.as_mut_ptr(); + + // Align p to 16-byte boundary + if (p as usize) & 0xf != 0 { + ofs = (16 - ((p as usize) & 0xf)) >> 2; + p = p.add(ofs); + } + + _mm_store_ps(p, *black_box(&a)); + + if ofs > 0 { + assert_eq!(vals[ofs - 1], 0.0); + } + assert_eq!(vals[ofs + 0], 1.0); + assert_eq!(vals[ofs + 1], 2.0); + assert_eq!(vals[ofs + 2], 3.0); + assert_eq!(vals[ofs + 3], 4.0); + assert_eq!(vals[ofs + 4], 0.0); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_storer_ps() { + let mut vals = [0.0f32; 8]; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + + let mut ofs = 0; + let mut p = vals.as_mut_ptr(); + + // Align p to 16-byte boundary + if (p as usize) & 0xf != 0 { + ofs = (16 - ((p as usize) & 0xf)) >> 2; + p = p.add(ofs); + } + + _mm_storer_ps(p, *black_box(&a)); + + if ofs > 0 { + assert_eq!(vals[ofs - 1], 0.0); + } + assert_eq!(vals[ofs + 0], 4.0); + assert_eq!(vals[ofs + 1], 3.0); + assert_eq!(vals[ofs + 2], 2.0); + assert_eq!(vals[ofs + 3], 1.0); + assert_eq!(vals[ofs + 4], 0.0); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_storeu_ps() { + let mut vals = [0.0f32; 8]; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + + let mut ofs = 0; + let mut p = vals.as_mut_ptr(); + + // Make sure p is **not** aligned to 16-byte boundary + if (p as usize) & 0xf == 0 { + ofs = 1; + p = p.add(1); + } + + _mm_storeu_ps(p, *black_box(&a)); + + if ofs > 0 { + assert_eq!(vals[ofs - 1], 0.0); + } + assert_eq!(vals[ofs + 0], 1.0); + assert_eq!(vals[ofs + 1], 2.0); + assert_eq!(vals[ofs + 2], 3.0); + assert_eq!(vals[ofs + 3], 4.0); + assert_eq!(vals[ofs + 4], 0.0); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_move_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + + let r = _mm_move_ss(a, b); + let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0); + assert_eq_m128(e, r); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_movemask_ps() { + let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0)); + assert_eq!(r, 0b0101); + + let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0)); + assert_eq!(r, 0b0111); + } + + #[simd_test(enable = "sse")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + #[cfg_attr(miri, ignore)] + unsafe fn test_mm_sfence() { + _mm_sfence(); + } + + #[simd_test(enable = "sse")] + unsafe fn test_MM_TRANSPOSE4_PS() { + let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0); + let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0); + + _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d); + + assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0)); + assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0)); + assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0)); + assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0)); + } + + #[repr(align(16))] + struct Memory { + pub data: [f32; 4], + } + + #[simd_test(enable = "sse")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] + unsafe fn test_mm_stream_ps() { + let a = _mm_set1_ps(7.0); + let mut mem = Memory { data: [-1.0; 4] }; + + _mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a); + for 
i in 0..4 { + assert_eq!(mem.data[i], get_m128(a, i)); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs new file mode 100644 index 0000000000000..3dabcde18ce9e --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs @@ -0,0 +1,5253 @@ +//! Streaming SIMD Extensions 2 (SSE2) + +#[cfg(test)] +use stdarch_test::assert_instr; + +use crate::{ + core_arch::{simd::*, x86::*}, + intrinsics::simd::*, + intrinsics::sqrtf64, + mem, ptr, +}; + +/// Provides a hint to the processor that the code sequence is a spin-wait loop. +/// +/// This can help improve the performance and power consumption of spin-wait +/// loops. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause) +#[inline] +#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_pause() { + // note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without + // the SSE2 target-feature - therefore it does not require any target features + pause() +} + +/// Invalidates and flushes the cache line that contains `p` from all levels of +/// the cache hierarchy. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(clflush))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_clflush(p: *const u8) { + clflush(p) +} + +/// Performs a serializing operation on all load-from-memory instructions +/// that were issued prior to this instruction. +/// +/// Guarantees that every load instruction that precedes, in program order, is +/// globally visible before any load instruction which follows the fence in +/// program order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(lfence))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_lfence() { + lfence() +} + +/// Performs a serializing operation on all load-from-memory and store-to-memory +/// instructions that were issued prior to this instruction. +/// +/// Guarantees that every memory access that precedes, in program order, the +/// memory fence instruction is globally visible before any memory instruction +/// which follows the fence in program order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(mfence))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_mfence() { + mfence() +} + +/// Adds packed 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(paddb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) } +} + +/// Adds packed 16-bit integers in `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(paddw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) } +} + +/// Adds packed 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(paddd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_add(a.as_i32x4(), b.as_i32x4())) } +} + +/// Adds packed 64-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(paddq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) } +} + +/// Adds packed 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(paddsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) } +} + +/// Adds packed 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(paddsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) } +} + +/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(paddusb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) } +} + +/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(paddusw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) } +} + +/// Averages packed unsigned 8-bit integers in `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pavgb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = simd_cast::<_, u16x16>(a.as_u8x16()); + let b = simd_cast::<_, u16x16>(b.as_u8x16()); + let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1)); + transmute(simd_cast::<_, u8x16>(r)) + } +} +
+/// Averages packed unsigned 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pavgw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = simd_cast::<_, u32x8>(a.as_u16x8()); + let b = simd_cast::<_, u32x8>(b.as_u16x8()); + let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1)); + transmute(simd_cast::<_, u16x8>(r)) + } +} +
+/// Multiplies and then horizontally adds signed 16-bit integers in `a` and `b`. +/// +/// Multiplies packed signed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs of +/// intermediate 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pmaddwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) } +} +
+/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pmaxsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = a.as_i16x8(); + let b = b.as_i16x8(); + transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b)) + } +} +
+/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the +/// packed maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pmaxub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = a.as_u8x16(); + let b = b.as_u8x16(); + transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b)) + } +} +
+/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pminsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = a.as_i16x8(); + let b = b.as_i16x8(); + transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b)) + } +} +
+/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the +/// packed minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pminub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = a.as_u8x16(); + let b = b.as_u8x16(); + transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b)) + } +} +
+/// Multiplies the packed 16-bit integers in `a` and `b`. +/// +/// The multiplication produces intermediate 32-bit integers, and returns the +/// high 16 bits of the intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pmulhw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = simd_cast::<_, i32x8>(a.as_i16x8()); + let b = simd_cast::<_, i32x8>(b.as_i16x8()); + let r = simd_shr(simd_mul(a, b), i32x8::splat(16)); + transmute(simd_cast::<i32x8, i16x8>(r)) + } +} +
+/// Multiplies the packed unsigned 16-bit integers in `a` and `b`. +/// +/// The multiplication produces intermediate 32-bit integers, and returns the +/// high 16 bits of the intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pmulhuw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = simd_cast::<_, u32x8>(a.as_u16x8()); + let b = simd_cast::<_, u32x8>(b.as_u16x8()); + let r = simd_shr(simd_mul(a, b), u32x8::splat(16)); + transmute(simd_cast::<u32x8, u16x8>(r)) + } +} +
+/// Multiplies the packed 16-bit integers in `a` and `b`. +/// +/// The multiplication produces intermediate 32-bit integers, and returns the +/// low 16 bits of the intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pmullw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) } +} +
+/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element +/// in `a` and `b`. +/// +/// Returns the unsigned 64-bit results. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pmuludq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let mask = u64x2::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) + } +} + +/// Sum the absolute differences of packed unsigned 8-bit integers. +/// +/// Computes the absolute differences of packed unsigned 8-bit integers in `a` +/// and `b`, then horizontally sum each consecutive 8 differences to produce +/// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in +/// the low 16 bits of 64-bit elements returned. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psadbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(psadbw(a.as_u8x16(), b.as_u8x16())) } +} + +/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psubb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) } +} + +/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psubw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) } +} + +/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psubd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) } +} + +/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psubq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) } +} + +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` +/// using saturation. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psubsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) } +} + +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` +/// using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psubsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) } +} + +/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psubusb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) } +} + +/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psubusw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) } +} + +/// Shifts `a` left by `IMM8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_slli_si128(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { _mm_slli_si128_impl::(a) } +} + +/// Implementation detail: converts the immediate argument of the +/// `_mm_slli_si128` intrinsic into a compile-time constant. +#[inline] +#[target_feature(enable = "sse2")] +unsafe fn _mm_slli_si128_impl(a: __m128i) -> __m128i { + const fn mask(shift: i32, i: u32) -> u32 { + let shift = shift as u32 & 0xff; + if shift > 15 { i } else { 16 - shift + i } + } + transmute::(simd_shuffle!( + i8x16::ZERO, + a.as_i8x16(), + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + ], + )) +} + +/// Shifts `a` left by `IMM8` bytes while shifting in zeros. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_bslli_si128(a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + _mm_slli_si128_impl::(a) + } +} + +/// Shifts `a` right by `IMM8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_bsrli_si128(a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + _mm_srli_si128_impl::(a) + } +} + +/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psllw, IMM8 = 7))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_slli_epi16(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16))) + } + } +} + +/// Shifts packed 16-bit integers in `a` left by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psllw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(psllw(a.as_i16x8(), count.as_i16x8())) } +} + +/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pslld, IMM8 = 7))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_slli_epi32(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32))) + } + } +} + +/// Shifts packed 32-bit integers in `a` left by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pslld))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(pslld(a.as_i32x4(), count.as_i32x4())) } +} + +/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psllq, IMM8 = 7))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_slli_epi64(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64))) + } + } +} + +/// Shifts packed 64-bit integers in `a` left by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psllq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(psllq(a.as_i64x2(), count.as_i64x2())) } +} + +/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign +/// bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psraw, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_srai_epi16(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) } +} + +/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign +/// bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psraw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(psraw(a.as_i16x8(), count.as_i16x8())) } +} + +/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign +/// bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psrad, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_srai_epi32(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) } +} + +/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign +/// bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psrad))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(psrad(a.as_i32x4(), count.as_i32x4())) } +} + +/// Shifts `a` right by `IMM8` bytes while shifting in zeros. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_srli_si128(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { _mm_srli_si128_impl::(a) } +} + +/// Implementation detail: converts the immediate argument of the +/// `_mm_srli_si128` intrinsic into a compile-time constant. +#[inline] +#[target_feature(enable = "sse2")] +unsafe fn _mm_srli_si128_impl(a: __m128i) -> __m128i { + const fn mask(shift: i32, i: u32) -> u32 { + if (shift as u32) > 15 { + i + 16 + } else { + i + (shift as u32) + } + } + let x: i8x16 = simd_shuffle!( + a.as_i8x16(), + i8x16::ZERO, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + ], + ); + transmute(x) +} + +/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psrlw, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_srli_epi16(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16))) + } + } +} + +/// Shifts packed 16-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psrlw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(psrlw(a.as_i16x8(), count.as_i16x8())) } +} + +/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psrld, IMM8 = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_srli_epi32(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32))) + } + } +} + +/// Shifts packed 32-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(psrld))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(psrld(a.as_i32x4(), count.as_i32x4())) } +} + +/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in +/// zeros. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe {
+        if IMM8 >= 64 {
+            _mm_setzero_si128()
+        } else {
+            transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
+        }
+    }
+}
+
+/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrlq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
+    unsafe { transmute(psrlq(a.as_i64x2(), count.as_i64x2())) }
+}
+
+/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(andps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { simd_and(a, b) }
+}
+
+/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and
+/// then AND with `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(andnps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { simd_and(simd_xor(_mm_set1_epi8(-1), a), b) }
+}
+
+/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(orps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { simd_or(a, b) }
+}
+
+/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(xorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { simd_xor(a, b) }
+}
+
+/// Compares packed 8-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpeqb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16())) }
+}
+
+/// Compares packed 16-bit integers in `a` and `b` for equality.
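+
+// --- Editorial sketch (not part of the upstream diff): the `_mm_cmpeq_*`
+// family returns all-ones/all-zeros lane masks, which combine with the
+// bitwise ops above into a branch-free per-lane select (helper name is
+// illustrative):
+//
+// #[target_feature(enable = "sse2")]
+// unsafe fn select_where_eq(a: __m128i, b: __m128i, t: __m128i, f: __m128i) -> __m128i {
+//     let mask = _mm_cmpeq_epi8(a, b);          // 0xFF where a == b, else 0x00
+//     _mm_or_si128(
+//         _mm_and_si128(mask, t),               // take `t` in equal lanes
+//         _mm_andnot_si128(mask, f),            // take `f` elsewhere
+//     )
+// }
+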
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpeqw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8())) }
+}
+
+/// Compares packed 32-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpeqd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Compares packed 8-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16())) }
+}
+
+/// Compares packed 16-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8())) }
+}
+
+/// Compares packed 32-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Compares packed 8-bit integers in `a` and `b` for less-than.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16())) }
+}
+
+/// Compares packed 16-bit integers in `a` and `b` for less-than.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8())) }
+}
+
+/// Compares packed 32-bit integers in `a` and `b` for less-than.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pcmpgtd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute::(simd_lt(a.as_i32x4(), b.as_i32x4())) } +} + +/// Converts the lower two packed 32-bit integers in `a` to packed +/// double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cvtdq2pd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepi32_pd(a: __m128i) -> __m128d { + unsafe { + let a = a.as_i32x4(); + simd_cast::(simd_shuffle!(a, a, [0, 1])) + } +} + +/// Returns `a` with its lower element replaced by `b` after converting it to +/// an `f64`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cvtsi2sd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { + unsafe { simd_insert!(a, 0, b as f64) } +} + +/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cvtdq2ps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { + unsafe { transmute(simd_cast::<_, f32x4>(a.as_i32x4())) } +} + +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cvtps2dq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtps_epi32(a: __m128) -> __m128i { + unsafe { transmute(cvtps2dq(a)) } +} + +/// Returns a vector whose lowest element is `a` and all higher elements are +/// `0`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtsi32_si128(a: i32) -> __m128i { + unsafe { transmute(i32x4::new(a, 0, 0, 0)) } +} + +/// Returns the lowest element of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtsi128_si32(a: __m128i) -> i32 { + unsafe { simd_extract!(a.as_i32x4(), 0) } +} + +/// Sets packed 64-bit integers with the supplied values, from highest to +/// lowest. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x) +#[inline] +#[target_feature(enable = "sse2")] +// no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { + unsafe { transmute(i64x2::new(e0, e1)) } +} + +/// Sets packed 32-bit integers with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32) +#[inline] +#[target_feature(enable = "sse2")] +// no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { + unsafe { transmute(i32x4::new(e0, e1, e2, e3)) } +} + +/// Sets packed 16-bit integers with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16) +#[inline] +#[target_feature(enable = "sse2")] +// no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16, +) -> __m128i { + unsafe { transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) } +} + +/// Sets packed 8-bit integers with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8) +#[inline] +#[target_feature(enable = "sse2")] +// no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8, +) -> __m128i { + unsafe { + #[rustfmt::skip] + transmute(i8x16::new( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + )) + } +} + +/// Broadcasts 64-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x) +#[inline] +#[target_feature(enable = "sse2")] +// no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set1_epi64x(a: i64) -> __m128i { + _mm_set_epi64x(a, a) +} + +/// Broadcasts 32-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32) +#[inline] +#[target_feature(enable = "sse2")] +// no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set1_epi32(a: i32) -> __m128i { + _mm_set_epi32(a, a, a, a) +} + +/// Broadcasts 16-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16) +#[inline] +#[target_feature(enable = "sse2")] +// no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set1_epi16(a: i16) -> __m128i { + _mm_set_epi16(a, a, a, a, a, a, a, a) +} + +/// Broadcasts 8-bit integer `a` to all elements. 
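+
+// --- Editorial note (not part of the upstream diff): the `_mm_set_epi*`
+// constructors take their arguments from the highest element down to `e0`,
+// so `e0` lands in lane 0. Illustrative sketch:
+//
+// #[target_feature(enable = "sse2")]
+// unsafe fn set_order_example() -> i32 {
+//     let v = _mm_set_epi32(33, 22, 11, 0); // lanes, low to high: [0, 11, 22, 33]
+//     _mm_cvtsi128_si32(v)                  // reads lane 0, i.e. returns 0
+// }
+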
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8) +#[inline] +#[target_feature(enable = "sse2")] +// no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set1_epi8(a: i8) -> __m128i { + _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +/// Sets packed 32-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32) +#[inline] +#[target_feature(enable = "sse2")] +// no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { + _mm_set_epi32(e0, e1, e2, e3) +} + +/// Sets packed 16-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16) +#[inline] +#[target_feature(enable = "sse2")] +// no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_setr_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16, +) -> __m128i { + _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7) +} + +/// Sets packed 8-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8) +#[inline] +#[target_feature(enable = "sse2")] +// no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_setr_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8, +) -> __m128i { + #[rustfmt::skip] + _mm_set_epi8( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ) +} + +/// Returns a vector with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(xorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_setzero_si128() -> __m128i { + const { unsafe { mem::zeroed() } } +} + +/// Loads 64-bit integer from memory into first element of returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i { + _mm_set_epi64x(0, ptr::read_unaligned(mem_addr as *const i64)) +} + +/// Loads 128-bits of integer data from memory into a new vector. +/// +/// `mem_addr` must be aligned on a 16-byte boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i { + *mem_addr +} + +/// Loads 128-bits of integer data from memory into a new vector. +/// +/// `mem_addr` does not need to be aligned on any particular boundary. 
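+
+// --- Editorial sketch (not part of the upstream diff): `_mm_load_si128`
+// requires 16-byte alignment, while `_mm_loadu_si128` / `_mm_storeu_si128`
+// accept any address, so an unaligned 16-byte copy can look like this
+// (function name illustrative; both pointers must be valid for 16 bytes):
+//
+// #[target_feature(enable = "sse2")]
+// unsafe fn copy_16_bytes(src: *const u8, dst: *mut u8) {
+//     let v = _mm_loadu_si128(src as *const __m128i); // unaligned load
+//     _mm_storeu_si128(dst as *mut __m128i, v);       // unaligned store
+// }
+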
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movups))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i { + let mut dst: __m128i = _mm_undefined_si128(); + ptr::copy_nonoverlapping( + mem_addr as *const u8, + ptr::addr_of_mut!(dst) as *mut u8, + mem::size_of::<__m128i>(), + ); + dst +} + +/// Conditionally store 8-bit integer elements from `a` into memory using +/// `mask`. +/// +/// Elements are not stored when the highest bit is not set in the +/// corresponding element. +/// +/// `mem_addr` should correspond to a 128-bit memory location and does not need +/// to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(maskmovdqu))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) { + maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr) +} + +/// Stores 128-bits of integer data from `a` into memory. +/// +/// `mem_addr` must be aligned on a 16-byte boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) { + *mem_addr = a; +} + +/// Stores 128-bits of integer data from `a` into memory. +/// +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { + mem_addr.write_unaligned(a); +} + +/// Stores the lower 64-bit integer `a` to a memory location. +/// +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { + ptr::copy_nonoverlapping(ptr::addr_of!(a) as *const u8, mem_addr as *mut u8, 8); +} + +/// Stores a 128-bit integer vector to a 128-bit aligned memory location. +/// To minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128) +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. 
+/// +/// See [`_mm_sfence`] for details. +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movntdq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { + crate::arch::asm!( + vps!("movntdq", ",{a}"), + p = in(reg) mem_addr, + a = in(xmm_reg) a, + options(nostack, preserves_flags), + ); +} + +/// Stores a 32-bit integer value in the specified memory location. +/// To minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32) +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movnti))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) { + crate::arch::asm!( + vps!("movnti", ",{a:e}"), // `:e` for 32bit value + p = in(reg) mem_addr, + a = in(reg) a, + options(nostack, preserves_flags), + ); +} + +/// Returns a vector where the low element is extracted from `a` and its upper +/// element is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64) +#[inline] +#[target_feature(enable = "sse2")] +// FIXME movd on msvc, movd on i686 +#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_move_epi64(a: __m128i) -> __m128i { + unsafe { + let r: i64x2 = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 2]); + transmute(r) + } +} + +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using signed saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(packsswb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(packsswb(a.as_i16x8(), b.as_i16x8())) } +} + +/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using signed saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(packssdw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(packssdw(a.as_i32x4(), b.as_i32x4())) } +} + +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using unsigned saturation. 
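+
+// --- Editorial sketch (not part of the upstream diff): per the
+// "Safety of non-temporal stores" notes on `_mm_stream_si128` and
+// `_mm_stream_si32` above, a non-temporal store should be followed by
+// `_mm_sfence` before the written memory is handed to any other code
+// (function name illustrative):
+//
+// #[target_feature(enable = "sse2")]
+// unsafe fn stream_one(dst: *mut __m128i, v: __m128i) {
+//     _mm_stream_si128(dst, v); // non-temporal store, bypasses the caches
+//     _mm_sfence();             // order the store before anything that follows
+// }
+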
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(packuswb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(packuswb(a.as_i16x8(), b.as_i16x8())) }
+}
+
+/// Returns the `imm8` element of `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pextrw, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
+    static_assert_uimm_bits!(IMM8, 3);
+    unsafe { simd_extract!(a.as_u16x8(), IMM8 as u32, u16) as i32 }
+}
+
+/// Returns a new vector where the `imm8` element of `a` is replaced with `i`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pinsrw, IMM8 = 7))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 3);
+    unsafe { transmute(simd_insert!(a.as_i16x8(), IMM8 as u32, i as i16)) }
+}
+
+/// Returns a mask of the most significant bit of each element in `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmovmskb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_movemask_epi8(a: __m128i) -> i32 {
+    unsafe {
+        let z = i8x16::ZERO;
+        let m: i8x16 = simd_lt(a.as_i8x16(), z);
+        simd_bitmask::<_, u16>(m) as u32 as i32
+    }
+}
+
+/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pshufd, IMM8 = 9))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe {
+        let a = a.as_i32x4();
+        let x: i32x4 = simd_shuffle!(
+            a,
+            a,
+            [
+                IMM8 as u32 & 0b11,
+                (IMM8 as u32 >> 2) & 0b11,
+                (IMM8 as u32 >> 4) & 0b11,
+                (IMM8 as u32 >> 6) & 0b11,
+            ],
+        );
+        transmute(x)
+    }
+}
+
+/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
+/// `IMM8`.
+///
+/// Puts the results in the high 64 bits of the returned vector, with the low 64
+/// bits being copied from `a`.
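+
+// --- Editorial sketch (not part of the upstream diff): `_mm_shuffle_epi32`
+// consumes its control byte two bits per destination lane (lane 0 from bits
+// 1:0, lane 1 from bits 3:2, and so on), so 0b00_01_10_11 reverses the lanes:
+//
+// #[target_feature(enable = "sse2")]
+// unsafe fn reverse_i32_lanes(a: __m128i) -> __m128i {
+//     _mm_shuffle_epi32::<0b00_01_10_11>(a) // result, low to high: [a3, a2, a1, a0]
+// }
+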
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pshufhw, IMM8 = 9))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_shufflehi_epi16(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + let a = a.as_i16x8(); + let x: i16x8 = simd_shuffle!( + a, + a, + [ + 0, + 1, + 2, + 3, + (IMM8 as u32 & 0b11) + 4, + ((IMM8 as u32 >> 2) & 0b11) + 4, + ((IMM8 as u32 >> 4) & 0b11) + 4, + ((IMM8 as u32 >> 6) & 0b11) + 4, + ], + ); + transmute(x) + } +} + +/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in +/// `IMM8`. +/// +/// Put the results in the low 64 bits of the returned vector, with the high 64 +/// bits being copied from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(pshuflw, IMM8 = 9))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_shufflelo_epi16(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + let a = a.as_i16x8(); + let x: i16x8 = simd_shuffle!( + a, + a, + [ + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + 4, + 5, + 6, + 7, + ], + ); + transmute(x) + } +} + +/// Unpacks and interleave 8-bit integers from the high half of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(punpckhbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { + unsafe { + transmute::(simd_shuffle!( + a.as_i8x16(), + b.as_i8x16(), + [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31], + )) + } +} + +/// Unpacks and interleave 16-bit integers from the high half of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(punpckhwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]); + transmute::(x) + } +} + +/// Unpacks and interleave 32-bit integers from the high half of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(unpckhps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute::(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) } +} + +/// Unpacks and interleave 64-bit integers from the high half of `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(unpckhpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute::(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [1, 3])) } +} + +/// Unpacks and interleave 8-bit integers from the low half of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(punpcklbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { + unsafe { + transmute::(simd_shuffle!( + a.as_i8x16(), + b.as_i8x16(), + [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23], + )) + } +} + +/// Unpacks and interleave 16-bit integers from the low half of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(punpcklwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]); + transmute::(x) + } +} + +/// Unpacks and interleave 32-bit integers from the low half of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(unpcklps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute::(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) } +} + +/// Unpacks and interleave 64-bit integers from the low half of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movlhps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute::(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [0, 2])) } +} + +/// Returns a new vector with the low element of `a` replaced by the sum of the +/// low elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(addsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) } +} + +/// Adds packed double-precision (64-bit) floating-point elements in `a` and +/// `b`. 
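+
+// --- Editorial sketch (not part of the upstream diff): interleaving with a
+// zero vector is the usual SSE2 way to widen unsigned lanes; here the low
+// eight `u8` lanes of `a` become eight zero-extended `u16` lanes:
+//
+// #[target_feature(enable = "sse2")]
+// unsafe fn widen_low_u8_to_u16(a: __m128i) -> __m128i {
+//     _mm_unpacklo_epi8(a, _mm_setzero_si128()) // bytes: [a0, 0, a1, 0, ..., a7, 0]
+// }
+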
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(addpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { simd_add(a, b) } +} + +/// Returns a new vector with the low element of `a` replaced by the result of +/// diving the lower element of `a` by the lower element of `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(divsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) } +} + +/// Divide packed double-precision (64-bit) floating-point elements in `a` by +/// packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(divpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { simd_div(a, b) } +} + +/// Returns a new vector with the low element of `a` replaced by the maximum +/// of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(maxsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { maxsd(a, b) } +} + +/// Returns a new vector with the maximum values from corresponding elements in +/// `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(maxpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { maxpd(a, b) } +} + +/// Returns a new vector with the low element of `a` replaced by the minimum +/// of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(minsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { minsd(a, b) } +} + +/// Returns a new vector with the minimum values from corresponding elements in +/// `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(minpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { minpd(a, b) } +} + +/// Returns a new vector with the low element of `a` replaced by multiplying the +/// low elements of `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(mulsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(mulpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { simd_mul(a, b) } +} + +/// Returns a new vector with the low element of `a` replaced by the square +/// root of the lower element `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(sqrtsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { simd_insert!(a, 0, sqrtf64(_mm_cvtsd_f64(b))) } +} + +/// Returns a new vector with the square root of each of the values in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(sqrtpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sqrt_pd(a: __m128d) -> __m128d { + unsafe { simd_fsqrt(a) } +} + +/// Returns a new vector with the low element of `a` replaced by subtracting the +/// low element by `b` from the low element of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(subsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in `b` +/// from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(subpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { simd_sub(a, b) } +} + +/// Computes the bitwise AND of packed double-precision (64-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(andps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_and_si128(a, b)) + } +} + +/// Computes the bitwise NOT of `a` and then AND with `b`. 
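+
+// --- Editorial sketch (not part of the upstream diff): the `_mm_and_pd` /
+// `_mm_andnot_pd` family operates on the raw bit patterns of the `f64` lanes,
+// so clearing the sign bit gives a branch-free absolute value (the usual
+// `-0.0` sign-mask trick; function name illustrative):
+//
+// #[target_feature(enable = "sse2")]
+// unsafe fn abs_pd(x: __m128d) -> __m128d {
+//     _mm_andnot_pd(_mm_set1_pd(-0.0), x) // (!sign_bit_mask) & x, per lane
+// }
+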
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(andnps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_andnot_si128(a, b)) + } +} + +/// Computes the bitwise OR of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(orps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_or_si128(a, b)) + } +} + +/// Computes the bitwise XOR of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(xorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_xor_si128(a, b)) + } +} + +/// Returns a new vector with the low element of `a` replaced by the equality +/// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpeqsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmpsd(a, b, 0) } +} + +/// Returns a new vector with the low element of `a` replaced by the less-than +/// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpltsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmpsd(a, b, 1) } +} + +/// Returns a new vector with the low element of `a` replaced by the +/// less-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmplesd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmpsd(a, b, 2) } +} + +/// Returns a new vector with the low element of `a` replaced by the +/// greater-than comparison of the lower elements of `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpltsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { simd_insert!(_mm_cmplt_sd(b, a), 1, simd_extract!(a, 1, f64)) } +} + +/// Returns a new vector with the low element of `a` replaced by the +/// greater-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmplesd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { simd_insert!(_mm_cmple_sd(b, a), 1, simd_extract!(a, 1, f64)) } +} + +/// Returns a new vector with the low element of `a` replaced by the result +/// of comparing both of the lower elements of `a` and `b` to `NaN`. If +/// neither are equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` +/// otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpordsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmpsd(a, b, 7) } +} + +/// Returns a new vector with the low element of `a` replaced by the result of +/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is +/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpunordsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmpsd(a, b, 3) } +} + +/// Returns a new vector with the low element of `a` replaced by the not-equal +/// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpneqsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmpsd(a, b, 4) } +} + +/// Returns a new vector with the low element of `a` replaced by the +/// not-less-than comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpnltsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmpsd(a, b, 5) } +} + +/// Returns a new vector with the low element of `a` replaced by the +/// not-less-than-or-equal comparison of the lower elements of `a` and `b`. 
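+
+// --- Editorial sketch (not part of the upstream diff): `_mm_cmpunord_sd`
+// sets the low lane to all-ones exactly when either low element is NaN, so
+// paired with `_mm_movemask_pd` it makes a branch-free NaN test
+// (function name illustrative):
+//
+// #[target_feature(enable = "sse2")]
+// unsafe fn low_lane_unordered(a: __m128d, b: __m128d) -> bool {
+//     (_mm_movemask_pd(_mm_cmpunord_sd(a, b)) & 1) != 0
+// }
+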
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpnlesd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmpsd(a, b, 6) } +} + +/// Returns a new vector with the low element of `a` replaced by the +/// not-greater-than comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpnltsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { simd_insert!(_mm_cmpnlt_sd(b, a), 1, simd_extract!(a, 1, f64)) } +} + +/// Returns a new vector with the low element of `a` replaced by the +/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpnlesd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { simd_insert!(_mm_cmpnle_sd(b, a), 1, simd_extract!(a, 1, f64)) } +} + +/// Compares corresponding elements in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpeqpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmppd(a, b, 0) } +} + +/// Compares corresponding elements in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpltpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmppd(a, b, 1) } +} + +/// Compares corresponding elements in `a` and `b` for less-than-or-equal +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmplepd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmppd(a, b, 2) } +} + +/// Compares corresponding elements in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpltpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d { + _mm_cmplt_pd(b, a) +} + +/// Compares corresponding elements in `a` and `b` for greater-than-or-equal. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmplepd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d { + _mm_cmple_pd(b, a) +} + +/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpordpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmppd(a, b, 7) } +} + +/// Compares corresponding elements in `a` and `b` to see if either is `NaN`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpunordpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmppd(a, b, 3) } +} + +/// Compares corresponding elements in `a` and `b` for not-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpneqpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmppd(a, b, 4) } +} + +/// Compares corresponding elements in `a` and `b` for not-less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpnltpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmppd(a, b, 5) } +} + +/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpnlepd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { cmppd(a, b, 6) } +} + +/// Compares corresponding elements in `a` and `b` for not-greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpnltpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d { + _mm_cmpnlt_pd(b, a) +} + +/// Compares corresponding elements in `a` and `b` for +/// not-greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cmpnlepd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d { + _mm_cmpnle_pd(b, a) +} + +/// Compares the lower element of `a` and `b` for equality. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(comisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 { + unsafe { comieqsd(a, b) } +} + +/// Compares the lower element of `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(comisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 { + unsafe { comiltsd(a, b) } +} + +/// Compares the lower element of `a` and `b` for less-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(comisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 { + unsafe { comilesd(a, b) } +} + +/// Compares the lower element of `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(comisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 { + unsafe { comigtsd(a, b) } +} + +/// Compares the lower element of `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(comisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 { + unsafe { comigesd(a, b) } +} + +/// Compares the lower element of `a` and `b` for not-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(comisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 { + unsafe { comineqsd(a, b) } +} + +/// Compares the lower element of `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(ucomisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 { + unsafe { ucomieqsd(a, b) } +} + +/// Compares the lower element of `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(ucomisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 { + unsafe { ucomiltsd(a, b) } +} + +/// Compares the lower element of `a` and `b` for less-than-or-equal. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(ucomisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 { + unsafe { ucomilesd(a, b) } +} + +/// Compares the lower element of `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(ucomisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 { + unsafe { ucomigtsd(a, b) } +} + +/// Compares the lower element of `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(ucomisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 { + unsafe { ucomigesd(a, b) } +} + +/// Compares the lower element of `a` and `b` for not-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(ucomisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 { + unsafe { ucomineqsd(a, b) } +} + +/// Converts packed double-precision (64-bit) floating-point elements in `a` to +/// packed single-precision (32-bit) floating-point elements +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cvtpd2ps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtpd_ps(a: __m128d) -> __m128 { + unsafe { + let r = simd_cast::<_, f32x2>(a.as_f64x2()); + let zero = f32x2::ZERO; + transmute::(simd_shuffle!(r, zero, [0, 1, 2, 3])) + } +} + +/// Converts packed single-precision (32-bit) floating-point elements in `a` to +/// packed +/// double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cvtps2pd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtps_pd(a: __m128) -> __m128d { + unsafe { + let a = a.as_f32x4(); + transmute(simd_cast::(simd_shuffle!(a, a, [0, 1]))) + } +} + +/// Converts packed double-precision (64-bit) floating-point elements in `a` to +/// packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cvtpd2dq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i { + unsafe { transmute(cvtpd2dq(a)) } +} + +/// Converts the lower double-precision (64-bit) floating-point element in a to +/// a 32-bit integer. 
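+
+// --- Editorial note (not part of the upstream diff): `_mm_cvtpd_ps` narrows
+// the two `f64` lanes into the low two `f32` lanes and zeroes the upper two,
+// so a round trip through `_mm_cvtps_pd` keeps only the low pair:
+//
+// #[target_feature(enable = "sse2")]
+// unsafe fn narrow_then_widen(a: __m128d) -> __m128d {
+//     _mm_cvtps_pd(_mm_cvtpd_ps(a)) // same lanes as `a`, rounded through f32
+// }
+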
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cvtsd_si32(a: __m128d) -> i32 {
+    unsafe { cvtsd2si(a) }
+}
+
+/// Converts the lower double-precision (64-bit) floating-point element in `b`
+/// to a single-precision (32-bit) floating-point element, stores the result in
+/// the lower element of the return value, and copies the upper elements from
+/// `a` to the upper elements of the return value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsd2ss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
+    unsafe { cvtsd2ss(a, b) }
+}
+
+/// Returns the lower double-precision (64-bit) floating-point element of `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cvtsd_f64(a: __m128d) -> f64 {
+    unsafe { simd_extract!(a, 0) }
+}
+
+/// Converts the lower single-precision (32-bit) floating-point element in `b`
+/// to a double-precision (64-bit) floating-point element, stores the result in
+/// the lower element of the return value, and copies the upper element from
+/// `a` to the upper element of the return value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtss2sd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
+    unsafe { cvtss2sd(a, b) }
+}
+
+/// Converts packed double-precision (64-bit) floating-point elements in `a` to
+/// packed 32-bit integers with truncation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvttpd2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
+    unsafe { transmute(cvttpd2dq(a)) }
+}
+
+/// Converts the lower double-precision (64-bit) floating-point element in `a`
+/// to a 32-bit integer with truncation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvttsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cvttsd_si32(a: __m128d) -> i32 {
+    unsafe { cvttsd2si(a) }
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to
+/// packed 32-bit integers with truncation.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cvttps2dq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvttps_epi32(a: __m128) -> __m128i { + unsafe { transmute(cvttps2dq(a)) } +} + +/// Copies double-precision (64-bit) floating-point element `a` to the lower +/// element of the packed 64-bit return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set_sd(a: f64) -> __m128d { + _mm_set_pd(0.0, a) +} + +/// Broadcasts double-precision (64-bit) floating-point value a to all elements +/// of the return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set1_pd(a: f64) -> __m128d { + _mm_set_pd(a, a) +} + +/// Broadcasts double-precision (64-bit) floating-point value a to all elements +/// of the return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set_pd1(a: f64) -> __m128d { + _mm_set_pd(a, a) +} + +/// Sets packed double-precision (64-bit) floating-point elements in the return +/// value with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_set_pd(a: f64, b: f64) -> __m128d { + __m128d([b, a]) +} + +/// Sets packed double-precision (64-bit) floating-point elements in the return +/// value with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_setr_pd(a: f64, b: f64) -> __m128d { + _mm_set_pd(b, a) +} + +/// Returns packed double-precision (64-bit) floating-point elements with all +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(xorp))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_setzero_pd() -> __m128d { + const { unsafe { mem::zeroed() } } +} + +/// Returns a mask of the most significant bit of each element in `a`. +/// +/// The mask is stored in the 2 least significant bits of the return value. +/// All other bits are set to `0`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movmskpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_movemask_pd(a: __m128d) -> i32 { + // Propagate the highest bit to the rest, because simd_bitmask + // requires all-1 or all-0. 
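+    // `simd_lt(x, 0)` over the lanes reinterpreted as signed i64 yields all-ones
+    // exactly when the sign bit (the MSB of each double) is set, so `simd_bitmask`
+    // can then pack the two lane sign bits into bits 0 and 1 of the result.
+    // For example, `_mm_movemask_pd(_mm_setr_pd(-1.0, 2.0))` returns `0b01`.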
+    unsafe {
+        let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO);
+        simd_bitmask::<i64x2, u8>(mask).into()
+    }
+}
+
+/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
+/// floating-point elements) from memory into the returned vector.
+/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(
+    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
+    assert_instr(movaps)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
+    *(mem_addr as *const __m128d)
+}
+
+/// Loads a 64-bit double-precision value to the low element of a
+/// 128-bit vector of `[2 x double]` and clears the upper element.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
+    _mm_setr_pd(*mem_addr, 0.)
+}
+
+/// Loads a double-precision value into the high-order bits of a 128-bit
+/// vector of `[2 x double]`. The low-order bits are copied from the low-order
+/// bits of the first operand.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
+    _mm_setr_pd(simd_extract!(a, 0), *mem_addr)
+}
+
+/// Loads a double-precision value into the low-order bits of a 128-bit
+/// vector of `[2 x double]`. The high-order bits are copied from the
+/// high-order bits of the first operand.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movlps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
+    _mm_setr_pd(*mem_addr, simd_extract!(a, 1))
+}
+
+/// Stores a 128-bit floating point vector of `[2 x double]` to a 128-bit
+/// aligned memory location.
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd)
+///
+/// # Safety of non-temporal stores
+///
+/// After using this intrinsic, but before any other access to the memory that this intrinsic
+/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
+/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
+/// return.
+///
+/// See [`_mm_sfence`] for details.
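+// A rough usage sketch of the fencing rule above (hypothetical `dst_ptr`, and
+// the destination must be 16-byte aligned in real code):
+//
+//     unsafe {
+//         _mm_stream_pd(dst_ptr, _mm_set1_pd(1.0));
+//         // ... possibly more non-temporal stores ...
+//         _mm_sfence(); // required before the stored memory is accessed again
+//     }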
+#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movntpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) { + crate::arch::asm!( + vps!("movntpd", ",{a}"), + p = in(reg) mem_addr, + a = in(xmm_reg) a, + options(nostack, preserves_flags), + ); +} + +/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a +/// memory location. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movlps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) { + *mem_addr = simd_extract!(a, 0) +} + +/// Stores 128-bits (composed of 2 packed double-precision (64-bit) +/// floating-point elements) from `a` into memory. `mem_addr` must be aligned +/// on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) { + *(mem_addr as *mut __m128d) = a; +} + +/// Stores 128-bits (composed of 2 packed double-precision (64-bit) +/// floating-point elements) from `a` into memory. +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) { + mem_addr.cast::<__m128d>().write_unaligned(a); +} + +/// Store 16-bit integer from the first element of a into memory. +/// +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub unsafe fn _mm_storeu_si16(mem_addr: *mut u8, a: __m128i) { + ptr::write_unaligned(mem_addr as *mut i16, simd_extract(a.as_i16x8(), 0)) +} + +/// Store 32-bit integer from the first element of a into memory. +/// +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub unsafe fn _mm_storeu_si32(mem_addr: *mut u8, a: __m128i) { + ptr::write_unaligned(mem_addr as *mut i32, simd_extract(a.as_i32x4(), 0)) +} + +/// Store 64-bit integer from the first element of a into memory. +/// +/// `mem_addr` does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub unsafe fn _mm_storeu_si64(mem_addr: *mut u8, a: __m128i) { + ptr::write_unaligned(mem_addr as *mut i64, simd_extract(a.as_i64x2(), 0)) +} + +/// Stores the lower double-precision (64-bit) floating-point element from `a` +/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a +/// 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) { + let b: __m128d = simd_shuffle!(a, a, [0, 0]); + *(mem_addr as *mut __m128d) = b; +} + +/// Stores the lower double-precision (64-bit) floating-point element from `a` +/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a +/// 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) { + let b: __m128d = simd_shuffle!(a, a, [0, 0]); + *(mem_addr as *mut __m128d) = b; +} + +/// Stores 2 double-precision (64-bit) floating-point elements from `a` into +/// memory in reverse order. +/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection +/// exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) { + let b: __m128d = simd_shuffle!(a, a, [1, 0]); + *(mem_addr as *mut __m128d) = b; +} + +/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a +/// memory location. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movhps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) { + *mem_addr = simd_extract!(a, 1); +} + +/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a +/// memory location. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movlps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) { + *mem_addr = simd_extract!(a, 0); +} + +/// Loads a double-precision (64-bit) floating-point element from memory +/// into both elements of returned vector. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd) +#[inline] +#[target_feature(enable = "sse2")] +// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d { + let d = *mem_addr; + _mm_setr_pd(d, d) +} + +/// Loads a double-precision (64-bit) floating-point element from memory +/// into both elements of returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1) +#[inline] +#[target_feature(enable = "sse2")] +// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d { + _mm_load1_pd(mem_addr) +} + +/// Loads 2 double-precision (64-bit) floating-point elements from memory into +/// the returned vector in reverse order. `mem_addr` must be aligned on a +/// 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d { + let a = _mm_load_pd(mem_addr); + simd_shuffle!(a, a, [1, 0]) +} + +/// Loads 128-bits (composed of 2 packed double-precision (64-bit) +/// floating-point elements) from memory into the returned vector. +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movups))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d { + let mut dst = _mm_undefined_pd(); + ptr::copy_nonoverlapping( + mem_addr as *const u8, + ptr::addr_of_mut!(dst) as *mut u8, + mem::size_of::<__m128d>(), + ); + dst +} + +/// Loads unaligned 16-bits of integer data from memory into new vector. +/// +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub unsafe fn _mm_loadu_si16(mem_addr: *const u8) -> __m128i { + transmute(i16x8::new( + ptr::read_unaligned(mem_addr as *const i16), + 0, + 0, + 0, + 0, + 0, + 0, + 0, + )) +} + +/// Loads unaligned 32-bits of integer data from memory into new vector. +/// +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32) +#[inline] +#[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub unsafe fn _mm_loadu_si32(mem_addr: *const u8) -> __m128i { + transmute(i32x4::new( + ptr::read_unaligned(mem_addr as *const i32), + 0, + 0, + 0, + )) +} + +/// Loads unaligned 64-bits of integer data from memory into new vector. 
+///
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
+pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
+    transmute(i64x2::new(ptr::read_unaligned(mem_addr as *const i64), 0))
+}
+
+/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
+/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
+/// parameter as a specifier.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(shufps, MASK = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
+    static_assert_uimm_bits!(MASK, 8);
+    unsafe { simd_shuffle!(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2]) }
+}
+
+/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
+/// 64 bits are set to the lower 64 bits of the second parameter. The upper
+/// 64 bits are set to the upper 64 bits of the first parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
+    unsafe { _mm_setr_pd(simd_extract!(b, 0), simd_extract!(a, 1)) }
+}
+
+/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
+/// floating-point vector of `[4 x float]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_castpd_ps(a: __m128d) -> __m128 {
+    unsafe { transmute(a) }
+}
+
+/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
+/// integer vector.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_castpd_si128(a: __m128d) -> __m128i {
+    unsafe { transmute(a) }
+}
+
+/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
+/// floating-point vector of `[2 x double]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_castps_pd(a: __m128) -> __m128d {
+    unsafe { transmute(a) }
+}
+
+/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
+/// integer vector.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_castps_si128(a: __m128) -> __m128i {
+    unsafe { transmute(a) }
+}
+
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of `[2 x double]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_castsi128_pd(a: __m128i) -> __m128d {
+    unsafe { transmute(a) }
+}
+
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of `[4 x float]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_castsi128_ps(a: __m128i) -> __m128 {
+    unsafe { transmute(a) }
+}
+
+/// Returns a vector of type `__m128d` with indeterminate elements.
+/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_undefined_pd() -> __m128d {
+    const { unsafe { mem::zeroed() } }
+}
+
+/// Returns a vector of type `__m128i` with indeterminate elements.
+/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_undefined_si128() -> __m128i {
+    const { unsafe { mem::zeroed() } }
+}
+
+/// The resulting `__m128d` element is composed of the high-order values of
+/// the two `__m128d` interleaved input elements, i.e.:
+///
+/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
+/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(unpckhpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
+    unsafe { simd_shuffle!(a, b, [1, 3]) }
+}
+
+/// The resulting `__m128d` element is composed of the low-order values of
+/// the two `__m128d` interleaved input elements, i.e.:
+///
+/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
+/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movlhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
+    unsafe { simd_shuffle!(a, b, [0, 2]) }
+}
+
+#[allow(improper_ctypes)]
+unsafe extern "C" {
+    #[link_name = "llvm.x86.sse2.pause"]
+    fn pause();
+    #[link_name = "llvm.x86.sse2.clflush"]
+    fn clflush(p: *const u8);
+    #[link_name =
"llvm.x86.sse2.lfence"] + fn lfence(); + #[link_name = "llvm.x86.sse2.mfence"] + fn mfence(); + #[link_name = "llvm.x86.sse2.pmadd.wd"] + fn pmaddwd(a: i16x8, b: i16x8) -> i32x4; + #[link_name = "llvm.x86.sse2.psad.bw"] + fn psadbw(a: u8x16, b: u8x16) -> u64x2; + #[link_name = "llvm.x86.sse2.psll.w"] + fn psllw(a: i16x8, count: i16x8) -> i16x8; + #[link_name = "llvm.x86.sse2.psll.d"] + fn pslld(a: i32x4, count: i32x4) -> i32x4; + #[link_name = "llvm.x86.sse2.psll.q"] + fn psllq(a: i64x2, count: i64x2) -> i64x2; + #[link_name = "llvm.x86.sse2.psra.w"] + fn psraw(a: i16x8, count: i16x8) -> i16x8; + #[link_name = "llvm.x86.sse2.psra.d"] + fn psrad(a: i32x4, count: i32x4) -> i32x4; + #[link_name = "llvm.x86.sse2.psrl.w"] + fn psrlw(a: i16x8, count: i16x8) -> i16x8; + #[link_name = "llvm.x86.sse2.psrl.d"] + fn psrld(a: i32x4, count: i32x4) -> i32x4; + #[link_name = "llvm.x86.sse2.psrl.q"] + fn psrlq(a: i64x2, count: i64x2) -> i64x2; + #[link_name = "llvm.x86.sse2.cvtps2dq"] + fn cvtps2dq(a: __m128) -> i32x4; + #[link_name = "llvm.x86.sse2.maskmov.dqu"] + fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8); + #[link_name = "llvm.x86.sse2.packsswb.128"] + fn packsswb(a: i16x8, b: i16x8) -> i8x16; + #[link_name = "llvm.x86.sse2.packssdw.128"] + fn packssdw(a: i32x4, b: i32x4) -> i16x8; + #[link_name = "llvm.x86.sse2.packuswb.128"] + fn packuswb(a: i16x8, b: i16x8) -> u8x16; + #[link_name = "llvm.x86.sse2.max.sd"] + fn maxsd(a: __m128d, b: __m128d) -> __m128d; + #[link_name = "llvm.x86.sse2.max.pd"] + fn maxpd(a: __m128d, b: __m128d) -> __m128d; + #[link_name = "llvm.x86.sse2.min.sd"] + fn minsd(a: __m128d, b: __m128d) -> __m128d; + #[link_name = "llvm.x86.sse2.min.pd"] + fn minpd(a: __m128d, b: __m128d) -> __m128d; + #[link_name = "llvm.x86.sse2.cmp.sd"] + fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d; + #[link_name = "llvm.x86.sse2.cmp.pd"] + fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d; + #[link_name = "llvm.x86.sse2.comieq.sd"] + fn comieqsd(a: __m128d, b: __m128d) -> i32; + #[link_name = "llvm.x86.sse2.comilt.sd"] + fn comiltsd(a: __m128d, b: __m128d) -> i32; + #[link_name = "llvm.x86.sse2.comile.sd"] + fn comilesd(a: __m128d, b: __m128d) -> i32; + #[link_name = "llvm.x86.sse2.comigt.sd"] + fn comigtsd(a: __m128d, b: __m128d) -> i32; + #[link_name = "llvm.x86.sse2.comige.sd"] + fn comigesd(a: __m128d, b: __m128d) -> i32; + #[link_name = "llvm.x86.sse2.comineq.sd"] + fn comineqsd(a: __m128d, b: __m128d) -> i32; + #[link_name = "llvm.x86.sse2.ucomieq.sd"] + fn ucomieqsd(a: __m128d, b: __m128d) -> i32; + #[link_name = "llvm.x86.sse2.ucomilt.sd"] + fn ucomiltsd(a: __m128d, b: __m128d) -> i32; + #[link_name = "llvm.x86.sse2.ucomile.sd"] + fn ucomilesd(a: __m128d, b: __m128d) -> i32; + #[link_name = "llvm.x86.sse2.ucomigt.sd"] + fn ucomigtsd(a: __m128d, b: __m128d) -> i32; + #[link_name = "llvm.x86.sse2.ucomige.sd"] + fn ucomigesd(a: __m128d, b: __m128d) -> i32; + #[link_name = "llvm.x86.sse2.ucomineq.sd"] + fn ucomineqsd(a: __m128d, b: __m128d) -> i32; + #[link_name = "llvm.x86.sse2.cvtpd2dq"] + fn cvtpd2dq(a: __m128d) -> i32x4; + #[link_name = "llvm.x86.sse2.cvtsd2si"] + fn cvtsd2si(a: __m128d) -> i32; + #[link_name = "llvm.x86.sse2.cvtsd2ss"] + fn cvtsd2ss(a: __m128, b: __m128d) -> __m128; + #[link_name = "llvm.x86.sse2.cvtss2sd"] + fn cvtss2sd(a: __m128d, b: __m128) -> __m128d; + #[link_name = "llvm.x86.sse2.cvttpd2dq"] + fn cvttpd2dq(a: __m128d) -> i32x4; + #[link_name = "llvm.x86.sse2.cvttsd2si"] + fn cvttsd2si(a: __m128d) -> i32; + #[link_name = "llvm.x86.sse2.cvttps2dq"] 
+ fn cvttps2dq(a: __m128) -> i32x4; +} + +#[cfg(test)] +mod tests { + use crate::{ + core_arch::{simd::*, x86::*}, + hint::black_box, + }; + use std::{ + boxed, f32, f64, + mem::{self, transmute}, + ptr, + }; + use stdarch_test::simd_test; + + const NAN: f64 = f64::NAN; + + #[test] + fn test_mm_pause() { + unsafe { _mm_pause() } + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_clflush() { + let x = 0_u8; + _mm_clflush(ptr::addr_of!(x)); + } + + #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + #[cfg_attr(miri, ignore)] + unsafe fn test_mm_lfence() { + _mm_lfence(); + } + + #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + #[cfg_attr(miri, ignore)] + unsafe fn test_mm_mfence() { + _mm_mfence(); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_add_epi8() { + let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + let r = _mm_add_epi8(a, b); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_add_epi8_overflow() { + let a = _mm_set1_epi8(0x7F); + let b = _mm_set1_epi8(1); + let r = _mm_add_epi8(a, b); + assert_eq_m128i(r, _mm_set1_epi8(-128)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_add_epi16() { + let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_add_epi16(a, b); + let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_add_epi32() { + let a = _mm_setr_epi32(0, 1, 2, 3); + let b = _mm_setr_epi32(4, 5, 6, 7); + let r = _mm_add_epi32(a, b); + let e = _mm_setr_epi32(4, 6, 8, 10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_add_epi64() { + let a = _mm_setr_epi64x(0, 1); + let b = _mm_setr_epi64x(2, 3); + let r = _mm_add_epi64(a, b); + let e = _mm_setr_epi64x(2, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_adds_epi8() { + let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + let r = _mm_adds_epi8(a, b); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_adds_epi8_saturate_positive() { + let a = _mm_set1_epi8(0x7F); + let b = _mm_set1_epi8(1); + let r = _mm_adds_epi8(a, b); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_adds_epi8_saturate_negative() { + let a = _mm_set1_epi8(-0x80); + let b = _mm_set1_epi8(-1); + let r = _mm_adds_epi8(a, b); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_adds_epi16() { + let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_adds_epi16(a, b); + let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_adds_epi16_saturate_positive() { + let a = _mm_set1_epi16(0x7FFF); + let b = _mm_set1_epi16(1); + 
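+        // 0x7FFF + 1 would wrap to -0x8000; saturating addition clamps the
+        // result at i16::MAX, so every lane stays equal to `a`.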
let r = _mm_adds_epi16(a, b); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_adds_epi16_saturate_negative() { + let a = _mm_set1_epi16(-0x8000); + let b = _mm_set1_epi16(-1); + let r = _mm_adds_epi16(a, b); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_adds_epu8() { + let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + let r = _mm_adds_epu8(a, b); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_adds_epu8_saturate() { + let a = _mm_set1_epi8(!0); + let b = _mm_set1_epi8(1); + let r = _mm_adds_epu8(a, b); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_adds_epu16() { + let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_adds_epu16(a, b); + let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_adds_epu16_saturate() { + let a = _mm_set1_epi16(!0); + let b = _mm_set1_epi16(1); + let r = _mm_adds_epu16(a, b); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_avg_epu8() { + let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9)); + let r = _mm_avg_epu8(a, b); + assert_eq_m128i(r, _mm_set1_epi8(6)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_avg_epu16() { + let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9)); + let r = _mm_avg_epu16(a, b); + assert_eq_m128i(r, _mm_set1_epi16(6)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_madd_epi16() { + let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm_madd_epi16(a, b); + let e = _mm_setr_epi32(29, 81, 149, 233); + assert_eq_m128i(r, e); + + // Test large values. + // MIN*MIN+MIN*MIN will overflow into i32::MIN. 
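+        // Concretely: i16::MIN * i16::MIN = 2^30, and 2^30 + 2^30 = 2^31 wraps
+        // to i32::MIN in the 32-bit accumulator, while
+        // i16::MAX * i16::MAX + i16::MAX * i16::MAX = 0x7FFE0002 still fits.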
+ let a = _mm_setr_epi16( + i16::MAX, + i16::MAX, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MAX, + 0, + 0, + ); + let b = _mm_setr_epi16( + i16::MAX, + i16::MAX, + i16::MIN, + i16::MIN, + i16::MAX, + i16::MIN, + 0, + 0, + ); + let r = _mm_madd_epi16(a, b); + let e = _mm_setr_epi32(0x7FFE0002, i32::MIN, -0x7FFF0000, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_max_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(-1); + let r = _mm_max_epi16(a, b); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_max_epu8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(!0); + let r = _mm_max_epu8(a, b); + assert_eq_m128i(r, b); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_min_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(-1); + let r = _mm_min_epi16(a, b); + assert_eq_m128i(r, b); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_min_epu8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(!0); + let r = _mm_min_epu8(a, b); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_mulhi_epi16() { + let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001)); + let r = _mm_mulhi_epi16(a, b); + assert_eq_m128i(r, _mm_set1_epi16(-16)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_mulhi_epu16() { + let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001)); + let r = _mm_mulhi_epu16(a, b); + assert_eq_m128i(r, _mm_set1_epi16(15)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_mullo_epi16() { + let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001)); + let r = _mm_mullo_epi16(a, b); + assert_eq_m128i(r, _mm_set1_epi16(-17960)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_mul_epu32() { + let a = _mm_setr_epi64x(1_000_000_000, 1 << 34); + let b = _mm_setr_epi64x(1_000_000_000, 1 << 35); + let r = _mm_mul_epu32(a, b); + let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_sad_epu8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8, + 1, 2, 3, 4, + 155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8, + 1, 2, 3, 4, + ); + let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2); + let r = _mm_sad_epu8(a, b); + let e = _mm_setr_epi64x(1020, 614); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_sub_epi8() { + let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(6)); + let r = _mm_sub_epi8(a, b); + assert_eq_m128i(r, _mm_set1_epi8(-1)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_sub_epi16() { + let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(6)); + let r = _mm_sub_epi16(a, b); + assert_eq_m128i(r, _mm_set1_epi16(-1)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_sub_epi32() { + let (a, b) = (_mm_set1_epi32(5), _mm_set1_epi32(6)); + let r = _mm_sub_epi32(a, b); + assert_eq_m128i(r, _mm_set1_epi32(-1)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_sub_epi64() { + let (a, b) = (_mm_set1_epi64x(5), _mm_set1_epi64x(6)); + let r = _mm_sub_epi64(a, b); + assert_eq_m128i(r, _mm_set1_epi64x(-1)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_subs_epi8() { + let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2)); + let r = _mm_subs_epi8(a, b); + assert_eq_m128i(r, _mm_set1_epi8(3)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_subs_epi8_saturate_positive() { + let a = _mm_set1_epi8(0x7F); + 
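+        // 0x7F - (-1) would be 0x80, one past i8::MAX; saturating subtraction
+        // clamps the result back to 0x7F, leaving `a` unchanged.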
let b = _mm_set1_epi8(-1); + let r = _mm_subs_epi8(a, b); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_subs_epi8_saturate_negative() { + let a = _mm_set1_epi8(-0x80); + let b = _mm_set1_epi8(1); + let r = _mm_subs_epi8(a, b); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_subs_epi16() { + let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2)); + let r = _mm_subs_epi16(a, b); + assert_eq_m128i(r, _mm_set1_epi16(3)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_subs_epi16_saturate_positive() { + let a = _mm_set1_epi16(0x7FFF); + let b = _mm_set1_epi16(-1); + let r = _mm_subs_epi16(a, b); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_subs_epi16_saturate_negative() { + let a = _mm_set1_epi16(-0x8000); + let b = _mm_set1_epi16(1); + let r = _mm_subs_epi16(a, b); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_subs_epu8() { + let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2)); + let r = _mm_subs_epu8(a, b); + assert_eq_m128i(r, _mm_set1_epi8(3)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_subs_epu8_saturate() { + let a = _mm_set1_epi8(0); + let b = _mm_set1_epi8(1); + let r = _mm_subs_epu8(a, b); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_subs_epu16() { + let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2)); + let r = _mm_subs_epu16(a, b); + assert_eq_m128i(r, _mm_set1_epi16(3)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_subs_epu16_saturate() { + let a = _mm_set1_epi16(0); + let b = _mm_set1_epi16(1); + let r = _mm_subs_epu16(a, b); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_slli_si128() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + let r = _mm_slli_si128::<1>(a); + let e = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + let r = _mm_slli_si128::<15>(a); + let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); + assert_eq_m128i(r, e); + + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + let r = _mm_slli_si128::<16>(a); + assert_eq_m128i(r, _mm_set1_epi8(0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_slli_epi16() { + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_slli_epi16::<4>(a); + assert_eq_m128i( + r, + _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0), + ); + let r = _mm_slli_epi16::<16>(a); + assert_eq_m128i(r, _mm_set1_epi16(0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_sll_epi16() { + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i( + r, + _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0), + ); + let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m128i(r, _mm_set1_epi16(0)); + let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi16(0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_slli_epi32() { + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_slli_epi32::<4>(a); + 
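+        // A left shift by 4 multiplies each 32-bit lane by 16, e.g.
+        // 0xEEEE << 4 == 0xEEEE0; a shift count of 32 or more zeroes every lane.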
assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0)); + let r = _mm_slli_epi32::<32>(a); + assert_eq_m128i(r, _mm_set1_epi32(0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_sll_epi32() { + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0)); + let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m128i(r, _mm_set1_epi32(0)); + let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi32(0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_slli_epi64() { + let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm_slli_epi64::<4>(a); + assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0)); + let r = _mm_slli_epi64::<64>(a); + assert_eq_m128i(r, _mm_set1_epi64x(0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_sll_epi64() { + let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0)); + let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); + let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_srai_epi16() { + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_srai_epi16::<4>(a); + assert_eq_m128i( + r, + _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10), + ); + let r = _mm_srai_epi16::<16>(a); + assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_sra_epi16() { + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i( + r, + _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10), + ); + let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1)); + let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_srai_epi32() { + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_srai_epi32::<4>(a); + assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000)); + let r = _mm_srai_epi32::<32>(a); + assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_sra_epi32() { + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000)); + let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1)); + let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_srli_si128() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + let r = _mm_srli_si128::<1>(a); + 
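+        // `_mm_srli_si128::<1>` shifts the whole 128-bit value right by one
+        // byte: result byte `i` is input byte `i + 1`, and a zero byte shifts
+        // in at the top.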
#[rustfmt::skip] + let e = _mm_setr_epi8( + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, + ); + assert_eq_m128i(r, e); + + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + let r = _mm_srli_si128::<15>(a); + let e = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + let r = _mm_srli_si128::<16>(a); + assert_eq_m128i(r, _mm_set1_epi8(0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_srli_epi16() { + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_srli_epi16::<4>(a); + assert_eq_m128i( + r, + _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0), + ); + let r = _mm_srli_epi16::<16>(a); + assert_eq_m128i(r, _mm_set1_epi16(0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_srl_epi16() { + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i( + r, + _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0), + ); + let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m128i(r, _mm_set1_epi16(0)); + let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi16(0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_srli_epi32() { + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_srli_epi32::<4>(a); + assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000)); + let r = _mm_srli_epi32::<32>(a); + assert_eq_m128i(r, _mm_set1_epi32(0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_srl_epi32() { + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000)); + let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m128i(r, _mm_set1_epi32(0)); + let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi32(0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_srli_epi64() { + let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm_srli_epi64::<4>(a); + assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000)); + let r = _mm_srli_epi64::<64>(a); + assert_eq_m128i(r, _mm_set1_epi64x(0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_srl_epi64() { + let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000)); + let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); + let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_and_si128() { + let a = _mm_set1_epi8(5); + let b = _mm_set1_epi8(3); + let r = _mm_and_si128(a, b); + assert_eq_m128i(r, _mm_set1_epi8(1)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_andnot_si128() { + let a = _mm_set1_epi8(5); + let b = _mm_set1_epi8(3); + let r = _mm_andnot_si128(a, b); + assert_eq_m128i(r, _mm_set1_epi8(2)); + } + + #[simd_test(enable = 
"sse2")] + unsafe fn test_mm_or_si128() { + let a = _mm_set1_epi8(5); + let b = _mm_set1_epi8(3); + let r = _mm_or_si128(a, b); + assert_eq_m128i(r, _mm_set1_epi8(7)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_xor_si128() { + let a = _mm_set1_epi8(5); + let b = _mm_set1_epi8(3); + let r = _mm_xor_si128(a, b); + assert_eq_m128i(r, _mm_set1_epi8(6)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpeq_epi8() { + let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_cmpeq_epi8(a, b); + #[rustfmt::skip] + assert_eq_m128i( + r, + _mm_setr_epi8( + 0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ) + ); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpeq_epi16() { + let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0); + let r = _mm_cmpeq_epi16(a, b); + assert_eq_m128i(r, _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpeq_epi32() { + let a = _mm_setr_epi32(0, 1, 2, 3); + let b = _mm_setr_epi32(3, 2, 2, 0); + let r = _mm_cmpeq_epi32(a, b); + assert_eq_m128i(r, _mm_setr_epi32(0, 0, !0, 0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpgt_epi8() { + let a = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + let b = _mm_set1_epi8(0); + let r = _mm_cmpgt_epi8(a, b); + let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpgt_epi16() { + let a = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0); + let b = _mm_set1_epi16(0); + let r = _mm_cmpgt_epi16(a, b); + let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpgt_epi32() { + let a = _mm_set_epi32(5, 0, 0, 0); + let b = _mm_set1_epi32(0); + let r = _mm_cmpgt_epi32(a, b); + assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmplt_epi8() { + let a = _mm_set1_epi8(0); + let b = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm_cmplt_epi8(a, b); + let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmplt_epi16() { + let a = _mm_set1_epi16(0); + let b = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0); + let r = _mm_cmplt_epi16(a, b); + let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmplt_epi32() { + let a = _mm_set1_epi32(0); + let b = _mm_set_epi32(5, 0, 0, 0); + let r = _mm_cmplt_epi32(a, b); + assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtepi32_pd() { + let a = _mm_set_epi32(35, 25, 15, 5); + let r = _mm_cvtepi32_pd(a); + assert_eq_m128d(r, _mm_setr_pd(5.0, 15.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtsi32_sd() { + let a = _mm_set1_pd(3.5); + let r = _mm_cvtsi32_sd(a, 5); + assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtepi32_ps() { + let a = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm_cvtepi32_ps(a); + assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtps_epi32() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_cvtps_epi32(a); + 
assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtsi32_si128() { + let r = _mm_cvtsi32_si128(5); + assert_eq_m128i(r, _mm_setr_epi32(5, 0, 0, 0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtsi128_si32() { + let r = _mm_cvtsi128_si32(_mm_setr_epi32(5, 0, 0, 0)); + assert_eq!(r, 5); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_set_epi64x() { + let r = _mm_set_epi64x(0, 1); + assert_eq_m128i(r, _mm_setr_epi64x(1, 0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_set_epi32() { + let r = _mm_set_epi32(0, 1, 2, 3); + assert_eq_m128i(r, _mm_setr_epi32(3, 2, 1, 0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_set_epi16() { + let r = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_set_epi8() { + #[rustfmt::skip] + let r = _mm_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + ); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_set1_epi64x() { + let r = _mm_set1_epi64x(1); + assert_eq_m128i(r, _mm_set1_epi64x(1)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_set1_epi32() { + let r = _mm_set1_epi32(1); + assert_eq_m128i(r, _mm_set1_epi32(1)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_set1_epi16() { + let r = _mm_set1_epi16(1); + assert_eq_m128i(r, _mm_set1_epi16(1)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_set1_epi8() { + let r = _mm_set1_epi8(1); + assert_eq_m128i(r, _mm_set1_epi8(1)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_setr_epi32() { + let r = _mm_setr_epi32(0, 1, 2, 3); + assert_eq_m128i(r, _mm_setr_epi32(0, 1, 2, 3)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_setr_epi16() { + let r = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_setr_epi8() { + #[rustfmt::skip] + let r = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + ); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_setzero_si128() { + let r = _mm_setzero_si128(); + assert_eq_m128i(r, _mm_set1_epi64x(0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_loadl_epi64() { + let a = _mm_setr_epi64x(6, 5); + let r = _mm_loadl_epi64(ptr::addr_of!(a)); + assert_eq_m128i(r, _mm_setr_epi64x(6, 0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_load_si128() { + let a = _mm_set_epi64x(5, 6); + let r = _mm_load_si128(ptr::addr_of!(a) as *const _); + assert_eq_m128i(a, r); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_loadu_si128() { + let a = _mm_set_epi64x(5, 6); + let r = _mm_loadu_si128(ptr::addr_of!(a) as *const _); + assert_eq_m128i(a, r); + } + + #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] + unsafe fn test_mm_maskmoveu_si128() { + let a = _mm_set1_epi8(9); + #[rustfmt::skip] + let mask = _mm_set_epi8( + 0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + ); + let mut r = _mm_set1_epi8(0); + _mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as 
*mut i8);
+        let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_store_si128() {
+        let a = _mm_set1_epi8(9);
+        let mut r = _mm_set1_epi8(0);
+        _mm_store_si128(&mut r, a);
+        assert_eq_m128i(r, a);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_storeu_si128() {
+        let a = _mm_set1_epi8(9);
+        let mut r = _mm_set1_epi8(0);
+        _mm_storeu_si128(&mut r, a);
+        assert_eq_m128i(r, a);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_storel_epi64() {
+        let a = _mm_setr_epi64x(2, 9);
+        let mut r = _mm_set1_epi8(0);
+        _mm_storel_epi64(&mut r, a);
+        assert_eq_m128i(r, _mm_setr_epi64x(2, 0));
+    }
+
+    #[simd_test(enable = "sse2")]
+    // Miri cannot support this until it is clear how it fits in the Rust memory model
+    // (non-temporal store)
+    #[cfg_attr(miri, ignore)]
+    unsafe fn test_mm_stream_si128() {
+        let a = _mm_setr_epi32(1, 2, 3, 4);
+        let mut r = _mm_undefined_si128();
+        _mm_stream_si128(ptr::addr_of_mut!(r), a);
+        assert_eq_m128i(r, a);
+    }
+
+    #[simd_test(enable = "sse2")]
+    // Miri cannot support this until it is clear how it fits in the Rust memory model
+    // (non-temporal store)
+    #[cfg_attr(miri, ignore)]
+    unsafe fn test_mm_stream_si32() {
+        let a: i32 = 7;
+        let mut mem = boxed::Box::<i32>::new(-1);
+        _mm_stream_si32(ptr::addr_of_mut!(*mem), a);
+        assert_eq!(a, *mem);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_move_epi64() {
+        let a = _mm_setr_epi64x(5, 6);
+        let r = _mm_move_epi64(a);
+        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_packs_epi16() {
+        let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
+        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
+        let r = _mm_packs_epi16(a, b);
+        #[rustfmt::skip]
+        assert_eq_m128i(
+            r,
+            _mm_setr_epi8(
+                0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
+            )
+        );
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_packs_epi32() {
+        let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
+        let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
+        let r = _mm_packs_epi32(a, b);
+        assert_eq_m128i(
+            r,
+            _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF),
+        );
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_packus_epi16() {
+        let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
+        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
+        let r = _mm_packus_epi16(a, b);
+        assert_eq_m128i(
+            r,
+            _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0),
+        );
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_extract_epi16() {
+        let a = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
+        let r1 = _mm_extract_epi16::<0>(a);
+        let r2 = _mm_extract_epi16::<3>(a);
+        assert_eq!(r1, 0xFFFF);
+        assert_eq!(r2, 3);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_insert_epi16() {
+        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let r = _mm_insert_epi16::<0>(a, 9);
+        let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_movemask_epi8() {
+        #[rustfmt::skip]
+        let a = _mm_setr_epi8(
+            0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
+            0b0101, 0b1111_0000u8 as i8, 0, 0,
+            0, 0b1011_0101u8 as i8, 0b1111_0000u8 as i8, 0b0101,
+            0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
+        );
+        let r = _mm_movemask_epi8(a);
+        assert_eq!(r, 0b10100110_00100101);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_shuffle_epi32() {
+        let a = _mm_setr_epi32(5, 10, 15, 20);
let r = _mm_shuffle_epi32::<0b00_01_01_11>(a); + let e = _mm_setr_epi32(20, 10, 10, 5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_shufflehi_epi16() { + let a = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20); + let r = _mm_shufflehi_epi16::<0b00_01_01_11>(a); + let e = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_shufflelo_epi16() { + let a = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4); + let r = _mm_shufflelo_epi16::<0b00_01_01_11>(a); + let e = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_unpackhi_epi8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + let r = _mm_unpackhi_epi8(a, b); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_unpackhi_epi16() { + let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_unpackhi_epi16(a, b); + let e = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_unpackhi_epi32() { + let a = _mm_setr_epi32(0, 1, 2, 3); + let b = _mm_setr_epi32(4, 5, 6, 7); + let r = _mm_unpackhi_epi32(a, b); + let e = _mm_setr_epi32(2, 6, 3, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_unpackhi_epi64() { + let a = _mm_setr_epi64x(0, 1); + let b = _mm_setr_epi64x(2, 3); + let r = _mm_unpackhi_epi64(a, b); + let e = _mm_setr_epi64x(1, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_unpacklo_epi8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + let r = _mm_unpacklo_epi8(a, b); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_unpacklo_epi16() { + let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_unpacklo_epi16(a, b); + let e = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_unpacklo_epi32() { + let a = _mm_setr_epi32(0, 1, 2, 3); + let b = _mm_setr_epi32(4, 5, 6, 7); + let r = _mm_unpacklo_epi32(a, b); + let e = _mm_setr_epi32(0, 4, 1, 5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_unpacklo_epi64() { + let a = _mm_setr_epi64x(0, 1); + let b = _mm_setr_epi64x(2, 3); + let r = _mm_unpacklo_epi64(a, b); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_add_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_add_sd(a, b); + assert_eq_m128d(r, _mm_setr_pd(6.0, 2.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_add_pd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_add_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(6.0, 
12.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_div_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_div_sd(a, b); + assert_eq_m128d(r, _mm_setr_pd(0.2, 2.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_div_pd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_div_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(0.2, 0.2)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_max_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_max_sd(a, b); + assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_max_pd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_max_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0)); + + // Check SSE(2)-specific semantics for -0.0 handling. + let a = _mm_setr_pd(-0.0, 0.0); + let b = _mm_setr_pd(0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_max_pd(a, b)); + let r2: [u8; 16] = transmute(_mm_max_pd(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_min_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_min_sd(a, b); + assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_min_pd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_min_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0)); + + // Check SSE(2)-specific semantics for -0.0 handling. + let a = _mm_setr_pd(-0.0, 0.0); + let b = _mm_setr_pd(0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_min_pd(a, b)); + let r2: [u8; 16] = transmute(_mm_min_pd(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_mul_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_mul_sd(a, b); + assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_mul_pd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_mul_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(5.0, 20.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_sqrt_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_sqrt_sd(a, b); + assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_sqrt_pd() { + let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0)); + assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt())); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_sub_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_sub_sd(a, b); + assert_eq_m128d(r, _mm_setr_pd(-4.0, 2.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_sub_pd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_sub_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(-4.0, -8.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_and_pd() { + let a = transmute(u64x2::splat(5)); + let b = transmute(u64x2::splat(3)); + let r = _mm_and_pd(a, b); + let e = transmute(u64x2::splat(1)); + assert_eq_m128d(r, e); + } + + 
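+    // NOTE: the expected values in these bitwise `*_pd` tests fall out of plain
+    // integer bit arithmetic on the `u64x2::splat` inputs. With a = 5 (0b101) and
+    // b = 3 (0b011): a & b == 1 (test above), !a & b == 2 (ANDNOT, test below),
+    // a | b == 7, and a ^ b == 6.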
#[simd_test(enable = "sse2")] + unsafe fn test_mm_andnot_pd() { + let a = transmute(u64x2::splat(5)); + let b = transmute(u64x2::splat(3)); + let r = _mm_andnot_pd(a, b); + let e = transmute(u64x2::splat(2)); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_or_pd() { + let a = transmute(u64x2::splat(5)); + let b = transmute(u64x2::splat(3)); + let r = _mm_or_pd(a, b); + let e = transmute(u64x2::splat(7)); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_xor_pd() { + let a = transmute(u64x2::splat(5)); + let b = transmute(u64x2::splat(3)); + let r = _mm_xor_pd(a, b); + let e = transmute(u64x2::splat(6)); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpeq_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); + let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmplt_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); + let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmple_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); + let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpgt_sd() { + let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); + let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpge_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); + let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpord_sd() { + let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64); + let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpunord_sd() { + let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); + let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpneq_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); + let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpnlt_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64); + let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpnle_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64); + let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpngt_sd() { + let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0)); + 
let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64); + let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpnge_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64); + let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpeq_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(!0, 0); + let r = transmute::<_, __m128i>(_mm_cmpeq_pd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmplt_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(0, !0); + let r = transmute::<_, __m128i>(_mm_cmplt_pd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmple_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(!0, !0); + let r = transmute::<_, __m128i>(_mm_cmple_pd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpgt_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(0, 0); + let r = transmute::<_, __m128i>(_mm_cmpgt_pd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpge_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(!0, 0); + let r = transmute::<_, __m128i>(_mm_cmpge_pd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpord_pd() { + let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(0, !0); + let r = transmute::<_, __m128i>(_mm_cmpord_pd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpunord_pd() { + let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(!0, 0); + let r = transmute::<_, __m128i>(_mm_cmpunord_pd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpneq_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(!0, !0); + let r = transmute::<_, __m128i>(_mm_cmpneq_pd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpnlt_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(0, 0); + let r = transmute::<_, __m128i>(_mm_cmpnlt_pd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpnle_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(0, 0); + let r = transmute::<_, __m128i>(_mm_cmpnle_pd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpngt_pd() { + let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(0, !0); + let r = transmute::<_, __m128i>(_mm_cmpngt_pd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cmpnge_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(0, !0); + let r = transmute::<_, __m128i>(_mm_cmpnge_pd(a, b)); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_comieq_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + 
assert!(_mm_comieq_sd(a, b) != 0); + + let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_comieq_sd(a, b) == 0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_comilt_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_comilt_sd(a, b) == 0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_comile_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_comile_sd(a, b) != 0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_comigt_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_comigt_sd(a, b) == 0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_comige_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_comige_sd(a, b) != 0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_comineq_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_comineq_sd(a, b) == 0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_ucomieq_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_ucomieq_sd(a, b) != 0); + + let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0)); + assert!(_mm_ucomieq_sd(a, b) == 0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_ucomilt_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_ucomilt_sd(a, b) == 0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_ucomile_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_ucomile_sd(a, b) != 0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_ucomigt_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_ucomigt_sd(a, b) == 0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_ucomige_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_ucomige_sd(a, b) != 0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_ucomineq_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_ucomineq_sd(a, b) == 0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_movemask_pd() { + let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0)); + assert_eq!(r, 0b01); + + let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0)); + assert_eq!(r, 0b11); + } + + #[repr(align(16))] + struct Memory { + data: [f64; 4], + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_load_pd() { + let mem = Memory { + data: [1.0f64, 2.0, 3.0, 4.0], + }; + let vals = &mem.data; + let d = vals.as_ptr(); + + let r = _mm_load_pd(d); + assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_load_sd() { + let a = 1.; + let expected = _mm_setr_pd(a, 0.); + let r = _mm_load_sd(&a); + assert_eq_m128d(r, expected); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_loadh_pd() { + let a = _mm_setr_pd(1., 2.); + let b = 3.; + let expected = _mm_setr_pd(_mm_cvtsd_f64(a), 3.); + let r = _mm_loadh_pd(a, &b); + assert_eq_m128d(r, expected); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_loadl_pd() { + let a = _mm_setr_pd(1., 2.); + let b = 3.; + let expected = _mm_setr_pd(3., get_m128d(a, 1)); + let r = _mm_loadl_pd(a, &b); + assert_eq_m128d(r, expected); + } + + #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] + unsafe fn test_mm_stream_pd() { + 
#[repr(align(128))] + struct Memory { + pub data: [f64; 2], + } + let a = _mm_set1_pd(7.0); + let mut mem = Memory { data: [-1.0; 2] }; + + _mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a); + for i in 0..2 { + assert_eq!(mem.data[i], get_m128d(a, i)); + } + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_store_sd() { + let mut dest = 0.; + let a = _mm_setr_pd(1., 2.); + _mm_store_sd(&mut dest, a); + assert_eq!(dest, _mm_cvtsd_f64(a)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_store_pd() { + let mut mem = Memory { data: [0.0f64; 4] }; + let vals = &mut mem.data; + let a = _mm_setr_pd(1.0, 2.0); + let d = vals.as_mut_ptr(); + + _mm_store_pd(d, *black_box(&a)); + assert_eq!(vals[0], 1.0); + assert_eq!(vals[1], 2.0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_storeu_pd() { + let mut mem = Memory { data: [0.0f64; 4] }; + let vals = &mut mem.data; + let a = _mm_setr_pd(1.0, 2.0); + + let mut ofs = 0; + let mut p = vals.as_mut_ptr(); + + // Make sure p is **not** aligned to 16-byte boundary + if (p as usize) & 0xf == 0 { + ofs = 1; + p = p.add(1); + } + + _mm_storeu_pd(p, *black_box(&a)); + + if ofs > 0 { + assert_eq!(vals[ofs - 1], 0.0); + } + assert_eq!(vals[ofs + 0], 1.0); + assert_eq!(vals[ofs + 1], 2.0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_storeu_si16() { + let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let mut r = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16); + _mm_storeu_si16(ptr::addr_of_mut!(r).cast(), a); + let e = _mm_setr_epi16(1, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_storeu_si32() { + let a = _mm_setr_epi32(1, 2, 3, 4); + let mut r = _mm_setr_epi32(5, 6, 7, 8); + _mm_storeu_si32(ptr::addr_of_mut!(r).cast(), a); + let e = _mm_setr_epi32(1, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_storeu_si64() { + let a = _mm_setr_epi64x(1, 2); + let mut r = _mm_setr_epi64x(3, 4); + _mm_storeu_si64(ptr::addr_of_mut!(r).cast(), a); + let e = _mm_setr_epi64x(1, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_store1_pd() { + let mut mem = Memory { data: [0.0f64; 4] }; + let vals = &mut mem.data; + let a = _mm_setr_pd(1.0, 2.0); + let d = vals.as_mut_ptr(); + + _mm_store1_pd(d, *black_box(&a)); + assert_eq!(vals[0], 1.0); + assert_eq!(vals[1], 1.0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_store_pd1() { + let mut mem = Memory { data: [0.0f64; 4] }; + let vals = &mut mem.data; + let a = _mm_setr_pd(1.0, 2.0); + let d = vals.as_mut_ptr(); + + _mm_store_pd1(d, *black_box(&a)); + assert_eq!(vals[0], 1.0); + assert_eq!(vals[1], 1.0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_storer_pd() { + let mut mem = Memory { data: [0.0f64; 4] }; + let vals = &mut mem.data; + let a = _mm_setr_pd(1.0, 2.0); + let d = vals.as_mut_ptr(); + + _mm_storer_pd(d, *black_box(&a)); + assert_eq!(vals[0], 2.0); + assert_eq!(vals[1], 1.0); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_storeh_pd() { + let mut dest = 0.; + let a = _mm_setr_pd(1., 2.); + _mm_storeh_pd(&mut dest, a); + assert_eq!(dest, get_m128d(a, 1)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_storel_pd() { + let mut dest = 0.; + let a = _mm_setr_pd(1., 2.); + _mm_storel_pd(&mut dest, a); + assert_eq!(dest, _mm_cvtsd_f64(a)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_loadr_pd() { + let mut mem = Memory { + data: [1.0f64, 2.0, 3.0, 4.0], + }; + let vals = &mut mem.data; + 
let d = vals.as_ptr(); + + let r = _mm_loadr_pd(d); + assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_loadu_pd() { + let mut mem = Memory { + data: [1.0f64, 2.0, 3.0, 4.0], + }; + let vals = &mut mem.data; + let mut d = vals.as_ptr(); + + // make sure d is not aligned to 16-byte boundary + let mut offset = 0; + if (d as usize) & 0xf == 0 { + offset = 1; + d = d.add(offset); + } + + let r = _mm_loadu_pd(d); + let e = _mm_add_pd(_mm_setr_pd(1.0, 2.0), _mm_set1_pd(offset as f64)); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_loadu_si16() { + let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm_loadu_si16(ptr::addr_of!(a) as *const _); + assert_eq_m128i(r, _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_loadu_si32() { + let a = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm_loadu_si32(ptr::addr_of!(a) as *const _); + assert_eq_m128i(r, _mm_setr_epi32(1, 0, 0, 0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_loadu_si64() { + let a = _mm_setr_epi64x(5, 6); + let r = _mm_loadu_si64(ptr::addr_of!(a) as *const _); + assert_eq_m128i(r, _mm_setr_epi64x(5, 0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtpd_ps() { + let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0)); + assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0)); + + let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0)); + assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0)); + + let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN)); + assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0)); + + let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64)); + assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtps_pd() { + let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0)); + assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0)); + + let r = _mm_cvtps_pd(_mm_setr_ps( + f32::MAX, + f32::INFINITY, + f32::NEG_INFINITY, + f32::MIN, + )); + assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtpd_epi32() { + let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0)); + assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0)); + + let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0)); + assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0)); + + let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN)); + assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0)); + + let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY)); + assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0)); + + let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN)); + assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtsd_si32() { + let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0)); + assert_eq!(r, -2); + + let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN)); + assert_eq!(r, i32::MIN); + + let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN)); + assert_eq!(r, i32::MIN); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtsd_ss() { + let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4); + let b = _mm_setr_pd(2.0, -5.0); + + let r = _mm_cvtsd_ss(a, b); + + assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4)); + + let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY); + let b = _mm_setr_pd(f64::INFINITY, -5.0); + + let r = _mm_cvtsd_ss(a, b); + + assert_eq_m128( + r, + 
_mm_setr_ps( + f32::INFINITY, + f32::NEG_INFINITY, + f32::MAX, + f32::NEG_INFINITY, + ), + ); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtsd_f64() { + let r = _mm_cvtsd_f64(_mm_setr_pd(-1.1, 2.2)); + assert_eq!(r, -1.1); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtss_sd() { + let a = _mm_setr_pd(-1.1, 2.2); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + + let r = _mm_cvtss_sd(a, b); + assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2)); + + let a = _mm_setr_pd(-1.1, f64::INFINITY); + let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0); + + let r = _mm_cvtss_sd(a, b); + assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvttpd_epi32() { + let a = _mm_setr_pd(-1.1, 2.2); + let r = _mm_cvttpd_epi32(a); + assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0)); + + let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN); + let r = _mm_cvttpd_epi32(a); + assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvttsd_si32() { + let a = _mm_setr_pd(-1.1, 2.2); + let r = _mm_cvttsd_si32(a); + assert_eq!(r, -1); + + let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN); + let r = _mm_cvttsd_si32(a); + assert_eq!(r, i32::MIN); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvttps_epi32() { + let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6); + let r = _mm_cvttps_epi32(a); + assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6)); + + let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX); + let r = _mm_cvttps_epi32(a); + assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_set_sd() { + let r = _mm_set_sd(-1.0_f64); + assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_set1_pd() { + let r = _mm_set1_pd(-1.0_f64); + assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_set_pd1() { + let r = _mm_set_pd1(-2.0_f64); + assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_set_pd() { + let r = _mm_set_pd(1.0_f64, 5.0_f64); + assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_setr_pd() { + let r = _mm_setr_pd(1.0_f64, -5.0_f64); + assert_eq_m128d(r, _mm_setr_pd(1.0_f64, -5.0_f64)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_setzero_pd() { + let r = _mm_setzero_pd(); + assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_load1_pd() { + let d = -5.0; + let r = _mm_load1_pd(&d); + assert_eq_m128d(r, _mm_setr_pd(d, d)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_load_pd1() { + let d = -5.0; + let r = _mm_load_pd1(&d); + assert_eq_m128d(r, _mm_setr_pd(d, d)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_unpackhi_pd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(3.0, 4.0); + let r = _mm_unpackhi_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_unpacklo_pd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(3.0, 4.0); + let r = _mm_unpacklo_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_shuffle_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(3., 4.); + let expected = _mm_setr_pd(1., 3.); + let r = 
_mm_shuffle_pd::<0b00_00_00_00>(a, b); + assert_eq_m128d(r, expected); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_move_sd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(3., 4.); + let expected = _mm_setr_pd(3., 2.); + let r = _mm_move_sd(a, b); + assert_eq_m128d(r, expected); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_castpd_ps() { + let a = _mm_set1_pd(0.); + let expected = _mm_set1_ps(0.); + let r = _mm_castpd_ps(a); + assert_eq_m128(r, expected); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_castpd_si128() { + let a = _mm_set1_pd(0.); + let expected = _mm_set1_epi64x(0); + let r = _mm_castpd_si128(a); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_castps_pd() { + let a = _mm_set1_ps(0.); + let expected = _mm_set1_pd(0.); + let r = _mm_castps_pd(a); + assert_eq_m128d(r, expected); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_castps_si128() { + let a = _mm_set1_ps(0.); + let expected = _mm_set1_epi32(0); + let r = _mm_castps_si128(a); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_castsi128_pd() { + let a = _mm_set1_epi64x(0); + let expected = _mm_set1_pd(0.); + let r = _mm_castsi128_pd(a); + assert_eq_m128d(r, expected); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_castsi128_ps() { + let a = _mm_set1_epi32(0); + let expected = _mm_set1_ps(0.); + let r = _mm_castsi128_ps(a); + assert_eq_m128(r, expected); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/sse3.rs b/library/stdarch/crates/core_arch/src/x86/sse3.rs new file mode 100644 index 0000000000000..7a32cfe472d43 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/sse3.rs @@ -0,0 +1,262 @@ +//! Streaming SIMD Extensions 3 (SSE3) + +use crate::core_arch::{simd::*, x86::*}; +use crate::intrinsics::simd::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Alternatively add and subtract packed single-precision (32-bit) +/// floating-point elements in `a` to/from packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_ps) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(addsubps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { + unsafe { + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [4, 1, 6, 3]) + } +} + +/// Alternatively add and subtract packed double-precision (64-bit) +/// floating-point elements in `a` to/from packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(addsubpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [2, 1]) + } +} + +/// Horizontally adds adjacent pairs of double-precision (64-bit) +/// floating-point elements in `a` and `b`, and pack the results. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(haddpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { haddpd(a, b) } +} + +/// Horizontally adds adjacent pairs of single-precision (32-bit) +/// floating-point elements in `a` and `b`, and pack the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(haddps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 { + unsafe { haddps(a, b) } +} + +/// Horizontally subtract adjacent pairs of double-precision (64-bit) +/// floating-point elements in `a` and `b`, and pack the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(hsubpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { hsubpd(a, b) } +} + +/// Horizontally adds adjacent pairs of single-precision (32-bit) +/// floating-point elements in `a` and `b`, and pack the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(hsubps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 { + unsafe { hsubps(a, b) } +} + +/// Loads 128-bits of integer data from unaligned memory. +/// This intrinsic may perform better than `_mm_loadu_si128` +/// when the data crosses a cache line boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(lddqu))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i { + transmute(lddqu(mem_addr as *const _)) +} + +/// Duplicate the low double-precision (64-bit) floating-point element +/// from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(movddup))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_movedup_pd(a: __m128d) -> __m128d { + unsafe { simd_shuffle!(a, a, [0, 0]) } +} + +/// Loads a double-precision (64-bit) floating-point element from memory +/// into both elements of return vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(movddup))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d { + _mm_load1_pd(mem_addr) +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements +/// from `a`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(movshdup))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_movehdup_ps(a: __m128) -> __m128 { + unsafe { simd_shuffle!(a, a, [1, 1, 3, 3]) } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements +/// from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(movsldup))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_moveldup_ps(a: __m128) -> __m128 { + unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.sse3.hadd.pd"] + fn haddpd(a: __m128d, b: __m128d) -> __m128d; + #[link_name = "llvm.x86.sse3.hadd.ps"] + fn haddps(a: __m128, b: __m128) -> __m128; + #[link_name = "llvm.x86.sse3.hsub.pd"] + fn hsubpd(a: __m128d, b: __m128d) -> __m128d; + #[link_name = "llvm.x86.sse3.hsub.ps"] + fn hsubps(a: __m128, b: __m128) -> __m128; + #[link_name = "llvm.x86.sse3.ldu.dq"] + fn lddqu(mem_addr: *const i8) -> i8x16; +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_addsub_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_addsub_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(99.0, 25.0, 0.0, -15.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_addsub_pd() { + let a = _mm_setr_pd(-1.0, 5.0); + let b = _mm_setr_pd(-100.0, 20.0); + let r = _mm_addsub_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(99.0, 25.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_hadd_pd() { + let a = _mm_setr_pd(-1.0, 5.0); + let b = _mm_setr_pd(-100.0, 20.0); + let r = _mm_hadd_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(4.0, -80.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_hadd_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_hadd_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(4.0, -10.0, -80.0, -5.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_hsub_pd() { + let a = _mm_setr_pd(-1.0, 5.0); + let b = _mm_setr_pd(-100.0, 20.0); + let r = _mm_hsub_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(-6.0, -120.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_hsub_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_hsub_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(-6.0, 10.0, -120.0, 5.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_lddqu_si128() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + let r = _mm_lddqu_si128(&a); + assert_eq_m128i(a, r); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_movedup_pd() { + let a = _mm_setr_pd(-1.0, 5.0); + let r = _mm_movedup_pd(a); + assert_eq_m128d(r, _mm_setr_pd(-1.0, -1.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_movehdup_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let r = _mm_movehdup_ps(a); + assert_eq_m128(r, _mm_setr_ps(5.0, 5.0, -10.0, -10.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_moveldup_ps() { + let a = _mm_setr_ps(-1.0, 
5.0, 0.0, -10.0); + let r = _mm_moveldup_ps(a); + assert_eq_m128(r, _mm_setr_ps(-1.0, -1.0, 0.0, 0.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_loaddup_pd() { + let d = -5.0; + let r = _mm_loaddup_pd(&d); + assert_eq_m128d(r, _mm_setr_pd(d, d)); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/sse41.rs b/library/stdarch/crates/core_arch/src/x86/sse41.rs new file mode 100644 index 0000000000000..9aa200dfc07ab --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/sse41.rs @@ -0,0 +1,1941 @@ +//! Streaming SIMD Extensions 4.1 (SSE4.1) + +use crate::core_arch::{simd::*, x86::*}; +use crate::intrinsics::simd::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +// SSE4 rounding constants +/// round to nearest +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00; +/// round down +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01; +/// round up +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_TO_POS_INF: i32 = 0x02; +/// truncate +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_TO_ZERO: i32 = 0x03; +/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE` +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04; +/// do not suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_RAISE_EXC: i32 = 0x00; +/// suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_NO_EXC: i32 = 0x08; +/// round to nearest and do not suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_NINT: i32 = 0x00; +/// round down and do not suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF; +/// round up and do not suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF; +/// truncate and do not suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO; +/// use MXCSR.RC and do not suppress exceptions; see +/// `vendor::_MM_SET_ROUNDING_MODE` +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION; +/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE` +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION; + +/// Blend packed 8-bit integers from `a` and `b` using `mask` +/// +/// The high bit of each corresponding mask byte determines the selection. +/// If the high bit is set, the element of `b` is selected. +/// Otherwise, the element of `a` is selected. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pblendvb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i { + unsafe { + let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::ZERO); + transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16())) + } +} + +/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`. 
+///
+/// The mask bits determine the selection. A clear bit selects the
+/// corresponding element of `a`, and a set bit the corresponding
+/// element of `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe {
+        transmute::<i16x8, _>(simd_shuffle!(
+            a.as_i16x8(),
+            b.as_i16x8(),
+            [
+                [0, 8][IMM8 as usize & 1],
+                [1, 9][(IMM8 >> 1) as usize & 1],
+                [2, 10][(IMM8 >> 2) as usize & 1],
+                [3, 11][(IMM8 >> 3) as usize & 1],
+                [4, 12][(IMM8 >> 4) as usize & 1],
+                [5, 13][(IMM8 >> 5) as usize & 1],
+                [6, 14][(IMM8 >> 6) as usize & 1],
+                [7, 15][(IMM8 >> 7) as usize & 1],
+            ]
+        ))
+    }
+}
+
+/// Blend packed double-precision (64-bit) floating-point elements from `a`
+/// and `b` using `mask`
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(blendvpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
+    unsafe {
+        let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::ZERO);
+        transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
+    }
+}
+
+/// Blend packed single-precision (32-bit) floating-point elements from `a`
+/// and `b` using `mask`
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(blendvps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
+    unsafe {
+        let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::ZERO);
+        transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
+    }
+}
+
+/// Blend packed double-precision (64-bit) floating-point elements from `a`
+/// and `b` using control mask `IMM2`
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+// Note: LLVM7 prefers the single-precision floating-point domain when possible
+// see https://bugs.llvm.org/show_bug.cgi?id=38195
+// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
+#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
+    static_assert_uimm_bits!(IMM2, 2);
+    unsafe {
+        transmute::<f64x2, _>(simd_shuffle!(
+            a.as_f64x2(),
+            b.as_f64x2(),
+            [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]]
+        ))
+    }
+}
+
+/// Blend packed single-precision (32-bit) floating-point elements from `a`
+/// and `b` using mask `IMM4`
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
+    static_assert_uimm_bits!(IMM4, 4);
+    unsafe {
+        transmute::<f32x4, _>(simd_shuffle!(
+            a.as_f32x4(),
+            b.as_f32x4(),
+            [
+                [0, 4][IMM4 as usize & 1],
+                [1, 5][(IMM4 >> 1) as usize & 1],
+                [2, 6][(IMM4 >> 2) as usize & 1],
+                [3, 7][(IMM4 >> 3) as usize & 1],
+            ]
+        ))
+    }
+}
+
+/// Extracts a single-precision (32-bit) floating-point element from `a`,
+/// selected with `IMM8`. The returned `i32` stores the float's bit-pattern,
+/// and may be converted back to a floating point number via casting.
+///
+/// # Example
+/// ```rust
+/// # #[cfg(target_arch = "x86")]
+/// # use std::arch::x86::*;
+/// # #[cfg(target_arch = "x86_64")]
+/// # use std::arch::x86_64::*;
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.1") {
+/// # #[target_feature(enable = "sse4.1")]
+/// # #[allow(unused_unsafe)] // FIXME remove after stdarch bump in rustc
+/// # unsafe fn worker() { unsafe {
+/// let mut float_store = vec![1.0, 1.0, 2.0, 3.0];
+/// let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0);
+/// let x: i32 = _mm_extract_ps::<2>(simd_floats);
+/// float_store.push(f32::from_bits(x as u32));
+/// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]);
+/// # }}
+/// # unsafe { worker() }
+/// # }
+/// # }
+/// ```
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(extractps, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
+    static_assert_uimm_bits!(IMM8, 2);
+    unsafe { simd_extract!(a, IMM8 as u32, f32).to_bits() as i32 }
+}
+
+/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
+/// integer containing the zero-extended integer data.
+///
+/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
+    static_assert_uimm_bits!(IMM8, 4);
+    unsafe { simd_extract!(a.as_u8x16(), IMM8 as u32, u8) as i32 }
+}
+
+/// Extracts an 32-bit integer from `a` selected with `IMM8`
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(extractps, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
+    static_assert_uimm_bits!(IMM8, 2);
+    unsafe { simd_extract!(a.as_i32x4(), IMM8 as u32, i32) }
+}
+
+/// Select a single value in `b` to store at some position in `a`,
+/// Then zero elements according to `IMM8`.
+///
+/// `IMM8` specifies which bits from operand `b` will be copied, which bits in
+/// the result they will be copied to, and which bits in the result will be
+/// cleared. The following assignments are made:
+///
+/// * Bits `[7:6]` specify the bits to copy from operand `b`:
+///     - `00`: Selects bits `[31:0]` from operand `b`.
+///     - `01`: Selects bits `[63:32]` from operand `b`.
+///     - `10`: Selects bits `[95:64]` from operand `b`.
+///     - `11`: Selects bits `[127:96]` from operand `b`.
+///
+/// * Bits `[5:4]` specify the bits in the result to which the selected bits
+/// from operand `b` are copied:
+///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
+///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
+///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
+///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
+///
+/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
+/// element is cleared.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe { insertps(a, b, IMM8 as u8) }
+}
+
+/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
+/// location specified by `IMM8`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 4);
+    unsafe { transmute(simd_insert!(a.as_i8x16(), IMM8 as u32, i as i8)) }
+}
+
+/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
+/// location specified by `IMM8`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 2);
+    unsafe { transmute(simd_insert!(a.as_i32x4(), IMM8 as u32, i)) }
+}
+
+/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
+/// values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_i8x16();
+        let b = b.as_i8x16();
+        transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
+/// maximum.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_u16x8();
+        let b = b.as_u16x8();
+        transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
+/// values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_i32x4();
+        let b = b.as_i32x4();
+        transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
+/// maximum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_u32x4();
+        let b = b.as_u32x4();
+        transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
+/// values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pminsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_i8x16();
+        let b = b.as_i8x16();
+        transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
+/// minimum.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pminuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_u16x8();
+        let b = b.as_u16x8();
+        transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
+/// values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pminsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_i32x4();
+        let b = b.as_i32x4();
+        transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
+/// minimum values.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pminud))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = a.as_u32x4(); + let b = b.as_u32x4(); + transmute(simd_select::(simd_lt(a, b), a, b)) + } +} + +/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using unsigned saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(packusdw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(packusdw(a.as_i32x4(), b.as_i32x4())) } +} + +/// Compares packed 64-bit integers in `a` and `b` for equality +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pcmpeqq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) } +} + +/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovsxbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i8x16(); + let a: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute(simd_cast::<_, i16x8>(a)) + } +} + +/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovsxbd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i8x16(); + let a: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); + transmute(simd_cast::<_, i32x4>(a)) + } +} + +/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed +/// 64-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovsxbq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i8x16(); + let a: i8x2 = simd_shuffle!(a, a, [0, 1]); + transmute(simd_cast::<_, i64x2>(a)) + } +} + +/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovsxwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i16x8(); + let a: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); + 
transmute(simd_cast::<_, i32x4>(a)) + } +} + +/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovsxwq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i16x8(); + let a: i16x2 = simd_shuffle!(a, a, [0, 1]); + transmute(simd_cast::<_, i64x2>(a)) + } +} + +/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovsxdq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i32x4(); + let a: i32x2 = simd_shuffle!(a, a, [0, 1]); + transmute(simd_cast::<_, i64x2>(a)) + } +} + +/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 16-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovzxbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i { + unsafe { + let a = a.as_u8x16(); + let a: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute(simd_cast::<_, i16x8>(a)) + } +} + +/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 32-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovzxbd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i { + unsafe { + let a = a.as_u8x16(); + let a: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); + transmute(simd_cast::<_, i32x4>(a)) + } +} + +/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 64-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovzxbq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i { + unsafe { + let a = a.as_u8x16(); + let a: u8x2 = simd_shuffle!(a, a, [0, 1]); + transmute(simd_cast::<_, i64x2>(a)) + } +} + +/// Zeroes extend packed unsigned 16-bit integers in `a` +/// to packed 32-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovzxwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i { + unsafe { + let a = a.as_u16x8(); + let a: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); + transmute(simd_cast::<_, i32x4>(a)) + } +} + +/// Zeroes extend packed unsigned 16-bit integers in `a` +/// to packed 64-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64) +#[inline] +#[target_feature(enable = "sse4.1")] 
+#[cfg_attr(test, assert_instr(pmovzxwq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i { + unsafe { + let a = a.as_u16x8(); + let a: u16x2 = simd_shuffle!(a, a, [0, 1]); + transmute(simd_cast::<_, i64x2>(a)) + } +} + +/// Zeroes extend packed unsigned 32-bit integers in `a` +/// to packed 64-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovzxdq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i { + unsafe { + let a = a.as_u32x4(); + let a: u32x2 = simd_shuffle!(a, a, [0, 1]); + transmute(simd_cast::<_, i64x2>(a)) + } +} + +/// Returns the dot product of two __m128d vectors. +/// +/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask. +/// If a condition mask bit is zero, the corresponding multiplication is +/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of +/// the dot product will be stored in the return value component. Otherwise if +/// the broadcast mask bit is zero then the return component will be zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_dp_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + dppd(a, b, IMM8 as u8) + } +} + +/// Returns the dot product of two __m128 vectors. +/// +/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask. +/// If a condition mask bit is zero, the corresponding multiplication is +/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of +/// the dot product will be stored in the return value component. Otherwise if +/// the broadcast mask bit is zero then the return component will be zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_dp_ps(a: __m128, b: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { dpps(a, b, IMM8 as u8) } +} + +/// Round the packed double-precision (64-bit) floating-point elements in `a` +/// down to an integer value, and stores the results as packed double-precision +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_floor_pd(a: __m128d) -> __m128d { + unsafe { simd_floor(a) } +} + +/// Round the packed single-precision (32-bit) floating-point elements in `a` +/// down to an integer value, and stores the results as packed single-precision +/// floating-point elements. 
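+///
+/// A minimal usage sketch (illustrative values only):
+///
+/// ```ignore
+/// // Each lane is rounded toward negative infinity.
+/// let r = _mm_floor_ps(_mm_setr_ps(2.5, -0.5, 8.5, 16.1));
+/// // r == _mm_setr_ps(2.0, -1.0, 8.0, 16.0)
+/// ```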
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_floor_ps(a: __m128) -> __m128 { + unsafe { simd_floor(a) } +} + +/// Round the lower double-precision (64-bit) floating-point element in `b` +/// down to an integer value, store the result as a double-precision +/// floating-point element in the lower element of the intrinsic result, +/// and copies the upper element from `a` to the upper element of the intrinsic +/// result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { roundsd(a, b, _MM_FROUND_FLOOR) } +} + +/// Round the lower single-precision (32-bit) floating-point element in `b` +/// down to an integer value, store the result as a single-precision +/// floating-point element in the lower element of the intrinsic result, +/// and copies the upper 3 packed elements from `a` to the upper elements +/// of the intrinsic result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 { + unsafe { roundss(a, b, _MM_FROUND_FLOOR) } +} + +/// Round the packed double-precision (64-bit) floating-point elements in `a` +/// up to an integer value, and stores the results as packed double-precision +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ceil_pd(a: __m128d) -> __m128d { + unsafe { simd_ceil(a) } +} + +/// Round the packed single-precision (32-bit) floating-point elements in `a` +/// up to an integer value, and stores the results as packed single-precision +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ceil_ps(a: __m128) -> __m128 { + unsafe { simd_ceil(a) } +} + +/// Round the lower double-precision (64-bit) floating-point element in `b` +/// up to an integer value, store the result as a double-precision +/// floating-point element in the lower element of the intrinsic result, +/// and copies the upper element from `a` to the upper element +/// of the intrinsic result. 
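+///
+/// A minimal usage sketch (illustrative values only):
+///
+/// ```ignore
+/// let r = _mm_ceil_sd(_mm_setr_pd(1.5, 3.5), _mm_setr_pd(-2.5, -4.5));
+/// // lower lane: ceil(-2.5) == -2.0; upper lane is copied from `a`: 3.5
+/// // r == _mm_setr_pd(-2.0, 3.5)
+/// ```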
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { roundsd(a, b, _MM_FROUND_CEIL) } +} + +/// Round the lower single-precision (32-bit) floating-point element in `b` +/// up to an integer value, store the result as a single-precision +/// floating-point element in the lower element of the intrinsic result, +/// and copies the upper 3 packed elements from `a` to the upper elements +/// of the intrinsic result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 { + unsafe { roundss(a, b, _MM_FROUND_CEIL) } +} + +/// Round the packed double-precision (64-bit) floating-point elements in `a` +/// using the `ROUNDING` parameter, and stores the results as packed +/// double-precision floating-point elements. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_round_pd(a: __m128d) -> __m128d { + static_assert_uimm_bits!(ROUNDING, 4); + unsafe { roundpd(a, ROUNDING) } +} + +/// Round the packed single-precision (32-bit) floating-point elements in `a` +/// using the `ROUNDING` parameter, and stores the results as packed +/// single-precision floating-point elements. 
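+///
+/// A short usage sketch (the available rounding-mode constants are listed just
+/// below; values are illustrative):
+///
+/// ```ignore
+/// // Truncate every lane toward zero.
+/// let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(_mm_setr_ps(2.25, 4.75, -1.75, -4.25));
+/// // r == _mm_setr_ps(2.0, 4.0, -1.0, -4.0)
+/// ```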
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_round_ps(a: __m128) -> __m128 { + static_assert_uimm_bits!(ROUNDING, 4); + unsafe { roundps(a, ROUNDING) } +} + +/// Round the lower double-precision (64-bit) floating-point element in `b` +/// using the `ROUNDING` parameter, store the result as a double-precision +/// floating-point element in the lower element of the intrinsic result, +/// and copies the upper element from `a` to the upper element of the intrinsic +/// result. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_uimm_bits!(ROUNDING, 4); + unsafe { roundsd(a, b, ROUNDING) } +} + +/// Round the lower single-precision (32-bit) floating-point element in `b` +/// using the `ROUNDING` parameter, store the result as a single-precision +/// floating-point element in the lower element of the intrinsic result, +/// and copies the upper 3 packed elements from `a` to the upper elements +/// of the intrinsic result. 
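+///
+/// A minimal sketch of the scalar behaviour (rounding-mode constants are
+/// listed below; values are illustrative):
+///
+/// ```ignore
+/// let r = _mm_round_ss::<_MM_FROUND_TO_NEAREST_INT>(
+///     _mm_setr_ps(1.5, 3.5, 7.5, 15.5),
+///     _mm_setr_ps(-1.75, -4.5, -8.5, -16.5),
+/// );
+/// // lower lane: round(-1.75) == -2.0; upper three lanes are copied from `a`
+/// // r == _mm_setr_ps(-2.0, 3.5, 7.5, 15.5)
+/// ```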
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_uimm_bits!(ROUNDING, 4); + unsafe { roundss(a, b, ROUNDING) } +} + +/// Finds the minimum unsigned 16-bit element in the 128-bit __m128i vector, +/// returning a vector containing its value in its first position, and its +/// index +/// in its second position; all other elements are set to zero. +/// +/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW` +/// instruction. +/// +/// Arguments: +/// +/// * `a` - A 128-bit vector of type `__m128i`. +/// +/// Returns: +/// +/// A 128-bit value where: +/// +/// * bits `[15:0]` - contain the minimum value found in parameter `a`, +/// * bits `[18:16]` - contain the index of the minimum value +/// * remaining bits are set to `0`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(phminposuw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_minpos_epu16(a: __m128i) -> __m128i { + unsafe { transmute(phminposuw(a.as_u16x8())) } +} + +/// Multiplies the low 32-bit integers from each packed 64-bit +/// element in `a` and `b`, and returns the signed 64-bit result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmuldq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2())); + let b = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2())); + transmute(simd_mul(a, b)) + } +} + +/// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate +/// 64-bit integers, and returns the lowest 32-bit, whatever they might be, +/// reinterpreted as a signed integer. While `pmulld __m128i::splat(2), +/// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping +/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would +/// return a negative number. 
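+///
+/// A minimal sketch of the wrapping behaviour described above (illustrative):
+///
+/// ```ignore
+/// let r = _mm_mullo_epi32(_mm_set1_epi32(i32::MAX), _mm_set1_epi32(2));
+/// // In wrapping 32-bit arithmetic i32::MAX * 2 == -2, so every lane is -2.
+/// // r == _mm_set1_epi32(-2)
+/// ```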
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmulld))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_mul(a.as_i32x4(), b.as_i32x4())) } +} + +/// Subtracts 8-bit unsigned integer values and computes the absolute +/// values of the differences to the corresponding bits in the destination. +/// Then sums of the absolute differences are returned according to the bit +/// fields in the immediate operand. +/// +/// The following algorithm is performed: +/// +/// ```ignore +/// i = IMM8[2] * 4 +/// j = IMM8[1:0] * 4 +/// for k := 0 to 7 +/// d0 = abs(a[i + k + 0] - b[j + 0]) +/// d1 = abs(a[i + k + 1] - b[j + 1]) +/// d2 = abs(a[i + k + 2] - b[j + 2]) +/// d3 = abs(a[i + k + 3] - b[j + 3]) +/// r[k] = d0 + d1 + d2 + d3 +/// ``` +/// +/// Arguments: +/// +/// * `a` - A 128-bit vector of type `__m128i`. +/// * `b` - A 128-bit vector of type `__m128i`. +/// * `IMM8` - An 8-bit immediate operand specifying how the absolute +/// differences are to be calculated +/// * Bit `[2]` specify the offset for operand `a` +/// * Bits `[1:0]` specify the offset for operand `b` +/// +/// Returns: +/// +/// * A `__m128i` vector containing the sums of the sets of absolute +/// differences between both operands. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 3); + unsafe { transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8)) } +} + +/// Tests whether the specified bits in a 128-bit integer vector are all +/// zeros. +/// +/// Arguments: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// * `mask` - A 128-bit integer vector selecting which bits to test in +/// operand `a`. +/// +/// Returns: +/// +/// * `1` - if the specified bits are all zeros, +/// * `0` - otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { + unsafe { ptestz(a.as_i64x2(), mask.as_i64x2()) } +} + +/// Tests whether the specified bits in a 128-bit integer vector are all +/// ones. +/// +/// Arguments: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// * `mask` - A 128-bit integer vector selecting which bits to test in +/// operand `a`. +/// +/// Returns: +/// +/// * `1` - if the specified bits are all ones, +/// * `0` - otherwise. 
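+///
+/// A minimal usage sketch (illustrative values):
+///
+/// ```ignore
+/// // Every bit set in `mask` is also set in `a`, so the carry flag is set.
+/// let r = _mm_testc_si128(_mm_set1_epi8(0b101), _mm_set1_epi8(0b100));
+/// // r == 1
+/// ```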
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 { + unsafe { ptestc(a.as_i64x2(), mask.as_i64x2()) } +} + +/// Tests whether the specified bits in a 128-bit integer vector are +/// neither all zeros nor all ones. +/// +/// Arguments: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// * `mask` - A 128-bit integer vector selecting which bits to test in +/// operand `a`. +/// +/// Returns: +/// +/// * `1` - if the specified bits are neither all zeros nor all ones, +/// * `0` - otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 { + unsafe { ptestnzc(a.as_i64x2(), mask.as_i64x2()) } +} + +/// Tests whether the specified bits in a 128-bit integer vector are all +/// zeros. +/// +/// Arguments: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// * `mask` - A 128-bit integer vector selecting which bits to test in +/// operand `a`. +/// +/// Returns: +/// +/// * `1` - if the specified bits are all zeros, +/// * `0` - otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 { + _mm_testz_si128(a, mask) +} + +/// Tests whether the specified bits in `a` 128-bit integer vector are all +/// ones. +/// +/// Argument: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// +/// Returns: +/// +/// * `1` - if the bits specified in the operand are all set to 1, +/// * `0` - otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pcmpeqd))] +#[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_test_all_ones(a: __m128i) -> i32 { + _mm_testc_si128(a, _mm_cmpeq_epi32(a, a)) +} + +/// Tests whether the specified bits in a 128-bit integer vector are +/// neither all zeros nor all ones. +/// +/// Arguments: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// * `mask` - A 128-bit integer vector selecting which bits to test in +/// operand `a`. +/// +/// Returns: +/// +/// * `1` - if the specified bits are neither all zeros nor all ones, +/// * `0` - otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { + _mm_testnzc_si128(a, mask) +} + +/// Load 128-bits of integer data from memory into dst. 
mem_addr must be aligned on a 16-byte +/// boundary or a general-protection exception may be generated. To minimize caching, the data +/// is flagged as non-temporal (unlikely to be used again soon) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(movntdqa))] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i { + let dst: __m128i; + crate::arch::asm!( + vpl!("movntdqa {a}"), + a = out(xmm_reg) dst, + p = in(reg) mem_addr, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.sse41.insertps"] + fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128; + #[link_name = "llvm.x86.sse41.packusdw"] + fn packusdw(a: i32x4, b: i32x4) -> u16x8; + #[link_name = "llvm.x86.sse41.dppd"] + fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d; + #[link_name = "llvm.x86.sse41.dpps"] + fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128; + #[link_name = "llvm.x86.sse41.round.pd"] + fn roundpd(a: __m128d, rounding: i32) -> __m128d; + #[link_name = "llvm.x86.sse41.round.ps"] + fn roundps(a: __m128, rounding: i32) -> __m128; + #[link_name = "llvm.x86.sse41.round.sd"] + fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d; + #[link_name = "llvm.x86.sse41.round.ss"] + fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128; + #[link_name = "llvm.x86.sse41.phminposuw"] + fn phminposuw(a: u16x8) -> u16x8; + #[link_name = "llvm.x86.sse41.mpsadbw"] + fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8; + #[link_name = "llvm.x86.sse41.ptestz"] + fn ptestz(a: i64x2, mask: i64x2) -> i32; + #[link_name = "llvm.x86.sse41.ptestc"] + fn ptestc(a: i64x2, mask: i64x2) -> i32; + #[link_name = "llvm.x86.sse41.ptestnzc"] + fn ptestnzc(a: i64x2, mask: i64x2) -> i32; +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use std::mem; + use stdarch_test::simd_test; + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_blendv_epi8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + #[rustfmt::skip] + let mask = _mm_setr_epi8( + 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, + ); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, + ); + assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_blendv_pd() { + let a = _mm_set1_pd(0.0); + let b = _mm_set1_pd(1.0); + let mask = transmute(_mm_setr_epi64x(0, -1)); + let r = _mm_blendv_pd(a, b, mask); + let e = _mm_setr_pd(0.0, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_blendv_ps() { + let a = _mm_set1_ps(0.0); + let b = _mm_set1_ps(1.0); + let mask = transmute(_mm_setr_epi32(0, -1, 0, -1)); + let r = _mm_blendv_ps(a, b, mask); + let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_blend_pd() { + let a = _mm_set1_pd(0.0); + let b = _mm_set1_pd(1.0); + let r = _mm_blend_pd::<0b10>(a, b); + let e = _mm_setr_pd(0.0, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_blend_ps() { + let a = 
_mm_set1_ps(0.0); + let b = _mm_set1_ps(1.0); + let r = _mm_blend_ps::<0b1010>(a, b); + let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_blend_epi16() { + let a = _mm_set1_epi16(0); + let b = _mm_set1_epi16(1); + let r = _mm_blend_epi16::<0b1010_1100>(a, b); + let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_extract_ps() { + let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0); + let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32); + assert_eq!(r, 1.0); + let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32); + assert_eq!(r, 3.0); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_extract_epi8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + -1, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 + ); + let r1 = _mm_extract_epi8::<0>(a); + let r2 = _mm_extract_epi8::<3>(a); + assert_eq!(r1, 0xFF); + assert_eq!(r2, 3); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_extract_epi32() { + let a = _mm_setr_epi32(0, 1, 2, 3); + let r = _mm_extract_epi32::<1>(a); + assert_eq!(r, 1); + let r = _mm_extract_epi32::<3>(a); + assert_eq!(r, 3); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_insert_ps() { + let a = _mm_set1_ps(1.0); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_insert_ps::<0b11_00_1100>(a, b); + let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0); + assert_eq_m128(r, e); + + // Zeroing takes precedence over copied value + let a = _mm_set1_ps(1.0); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_insert_ps::<0b11_00_0001>(a, b); + let e = _mm_setr_ps(0.0, 1.0, 1.0, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_insert_epi8() { + let a = _mm_set1_epi8(0); + let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm_insert_epi8::<1>(a, 32); + assert_eq_m128i(r, e); + let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0); + let r = _mm_insert_epi8::<14>(a, 32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_insert_epi32() { + let a = _mm_set1_epi32(0); + let e = _mm_setr_epi32(0, 32, 0, 0); + let r = _mm_insert_epi32::<1>(a, 32); + assert_eq_m128i(r, e); + let e = _mm_setr_epi32(0, 0, 0, 32); + let r = _mm_insert_epi32::<3>(a, 32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_max_epi8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 4, 5, 8, 9, 12, 13, 16, + 17, 20, 21, 24, 25, 28, 29, 32, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31, + ); + let r = _mm_max_epi8(a, b); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 2, 4, 6, 8, 10, 12, 14, 16, + 18, 20, 22, 24, 26, 28, 30, 32, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_max_epu16() { + let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm_max_epu16(a, b); + let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_max_epi32() { + let a = _mm_setr_epi32(1, 4, 5, 8); + let b = _mm_setr_epi32(2, 3, 6, 7); + let r = _mm_max_epi32(a, b); + let e = _mm_setr_epi32(2, 4, 6, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_max_epu32() { + let a = _mm_setr_epi32(1, 4, 5, 8); + let b = 
_mm_setr_epi32(2, 3, 6, 7); + let r = _mm_max_epu32(a, b); + let e = _mm_setr_epi32(2, 4, 6, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_min_epi8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 4, 5, 8, 9, 12, 13, 16, + 17, 20, 21, 24, 25, 28, 29, 32, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31, + ); + let r = _mm_min_epi8(a, b); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m128i(r, e); + + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, -4, -5, 8, -9, -12, 13, -16, + 17, 20, 21, 24, 25, 28, 29, 32, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 2, -3, -6, 7, -10, -11, 14, -15, + 18, 19, 22, 23, 26, 27, 30, 31, + ); + let r = _mm_min_epi8(a, b); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 1, -4, -6, 7, -10, -12, 13, -16, + 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_min_epu16() { + let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm_min_epu16(a, b); + let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_min_epi32() { + let a = _mm_setr_epi32(1, 4, 5, 8); + let b = _mm_setr_epi32(2, 3, 6, 7); + let r = _mm_min_epi32(a, b); + let e = _mm_setr_epi32(1, 3, 5, 7); + assert_eq_m128i(r, e); + + let a = _mm_setr_epi32(-1, 4, 5, -7); + let b = _mm_setr_epi32(-2, 3, -6, 8); + let r = _mm_min_epi32(a, b); + let e = _mm_setr_epi32(-2, 3, -6, -7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_min_epu32() { + let a = _mm_setr_epi32(1, 4, 5, 8); + let b = _mm_setr_epi32(2, 3, 6, 7); + let r = _mm_min_epu32(a, b); + let e = _mm_setr_epi32(1, 3, 5, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_packus_epi32() { + let a = _mm_setr_epi32(1, 2, 3, 4); + let b = _mm_setr_epi32(-1, -2, -3, -4); + let r = _mm_packus_epi32(a, b); + let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cmpeq_epi64() { + let a = _mm_setr_epi64x(0, 1); + let b = _mm_setr_epi64x(0, 0); + let r = _mm_cmpeq_epi64(a, b); + let e = _mm_setr_epi64x(-1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepi8_epi16() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepi8_epi16(a); + let e = _mm_set1_epi16(10); + assert_eq_m128i(r, e); + let a = _mm_set1_epi8(-10); + let r = _mm_cvtepi8_epi16(a); + let e = _mm_set1_epi16(-10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepi8_epi32() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepi8_epi32(a); + let e = _mm_set1_epi32(10); + assert_eq_m128i(r, e); + let a = _mm_set1_epi8(-10); + let r = _mm_cvtepi8_epi32(a); + let e = _mm_set1_epi32(-10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepi8_epi64() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepi8_epi64(a); + let e = _mm_set1_epi64x(10); + assert_eq_m128i(r, e); + let a = _mm_set1_epi8(-10); + let r = _mm_cvtepi8_epi64(a); + let e = _mm_set1_epi64x(-10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepi16_epi32() { + let a = _mm_set1_epi16(10); + let r = _mm_cvtepi16_epi32(a); + let e = 
_mm_set1_epi32(10); + assert_eq_m128i(r, e); + let a = _mm_set1_epi16(-10); + let r = _mm_cvtepi16_epi32(a); + let e = _mm_set1_epi32(-10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepi16_epi64() { + let a = _mm_set1_epi16(10); + let r = _mm_cvtepi16_epi64(a); + let e = _mm_set1_epi64x(10); + assert_eq_m128i(r, e); + let a = _mm_set1_epi16(-10); + let r = _mm_cvtepi16_epi64(a); + let e = _mm_set1_epi64x(-10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepi32_epi64() { + let a = _mm_set1_epi32(10); + let r = _mm_cvtepi32_epi64(a); + let e = _mm_set1_epi64x(10); + assert_eq_m128i(r, e); + let a = _mm_set1_epi32(-10); + let r = _mm_cvtepi32_epi64(a); + let e = _mm_set1_epi64x(-10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepu8_epi16() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepu8_epi16(a); + let e = _mm_set1_epi16(10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepu8_epi32() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepu8_epi32(a); + let e = _mm_set1_epi32(10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepu8_epi64() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepu8_epi64(a); + let e = _mm_set1_epi64x(10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepu16_epi32() { + let a = _mm_set1_epi16(10); + let r = _mm_cvtepu16_epi32(a); + let e = _mm_set1_epi32(10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepu16_epi64() { + let a = _mm_set1_epi16(10); + let r = _mm_cvtepu16_epi64(a); + let e = _mm_set1_epi64x(10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepu32_epi64() { + let a = _mm_set1_epi32(10); + let r = _mm_cvtepu32_epi64(a); + let e = _mm_set1_epi64x(10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_dp_pd() { + let a = _mm_setr_pd(2.0, 3.0); + let b = _mm_setr_pd(1.0, 4.0); + let e = _mm_setr_pd(14.0, 0.0); + assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_dp_ps() { + let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0); + let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0); + let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0); + assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_floor_pd() { + let a = _mm_setr_pd(2.5, 4.5); + let r = _mm_floor_pd(a); + let e = _mm_setr_pd(2.0, 4.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_floor_ps() { + let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); + let r = _mm_floor_ps(a); + let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_floor_sd() { + let a = _mm_setr_pd(2.5, 4.5); + let b = _mm_setr_pd(-1.5, -3.5); + let r = _mm_floor_sd(a, b); + let e = _mm_setr_pd(-2.0, 4.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_floor_ss() { + let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); + let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5); + let r = _mm_floor_ss(a, b); + let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_ceil_pd() { + let a = _mm_setr_pd(1.5, 3.5); + let r = _mm_ceil_pd(a); + let e = _mm_setr_pd(2.0, 4.0); + assert_eq_m128d(r, e); + } + + 
#[simd_test(enable = "sse4.1")] + unsafe fn test_mm_ceil_ps() { + let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); + let r = _mm_ceil_ps(a); + let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_ceil_sd() { + let a = _mm_setr_pd(1.5, 3.5); + let b = _mm_setr_pd(-2.5, -4.5); + let r = _mm_ceil_sd(a, b); + let e = _mm_setr_pd(-2.0, 3.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_ceil_ss() { + let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); + let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5); + let r = _mm_ceil_ss(a, b); + let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_round_pd() { + let a = _mm_setr_pd(1.25, 3.75); + let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a); + let e = _mm_setr_pd(1.0, 4.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_round_ps() { + let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25); + let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a); + let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_round_sd() { + let a = _mm_setr_pd(1.5, 3.5); + let b = _mm_setr_pd(-2.5, -4.5); + let r = _mm_round_sd::<_MM_FROUND_TO_NEAREST_INT>(a, b); + let e = _mm_setr_pd(-2.0, 3.5); + assert_eq_m128d(r, e); + + let a = _mm_setr_pd(1.5, 3.5); + let b = _mm_setr_pd(-2.5, -4.5); + let r = _mm_round_sd::<_MM_FROUND_TO_NEG_INF>(a, b); + let e = _mm_setr_pd(-3.0, 3.5); + assert_eq_m128d(r, e); + + let a = _mm_setr_pd(1.5, 3.5); + let b = _mm_setr_pd(-2.5, -4.5); + let r = _mm_round_sd::<_MM_FROUND_TO_POS_INF>(a, b); + let e = _mm_setr_pd(-2.0, 3.5); + assert_eq_m128d(r, e); + + let a = _mm_setr_pd(1.5, 3.5); + let b = _mm_setr_pd(-2.5, -4.5); + let r = _mm_round_sd::<_MM_FROUND_TO_ZERO>(a, b); + let e = _mm_setr_pd(-2.0, 3.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_round_ss() { + let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); + let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); + let r = _mm_round_ss::<_MM_FROUND_TO_NEAREST_INT>(a, b); + let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); + assert_eq_m128(r, e); + + let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); + let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); + let r = _mm_round_ss::<_MM_FROUND_TO_NEG_INF>(a, b); + let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); + assert_eq_m128(r, e); + + let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); + let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); + let r = _mm_round_ss::<_MM_FROUND_TO_POS_INF>(a, b); + let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5); + assert_eq_m128(r, e); + + let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); + let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); + let r = _mm_round_ss::<_MM_FROUND_TO_ZERO>(a, b); + let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_minpos_epu16_1() { + let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66); + let r = _mm_minpos_epu16(a); + let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_minpos_epu16_2() { + let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66); + let r = _mm_minpos_epu16(a); + let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_minpos_epu16_3() { + // Case where the minimum value is repeated + let a = _mm_setr_epi16(23, 18, 
44, 97, 50, 13, 67, 13); + let r = _mm_minpos_epu16(a); + let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_mul_epi32() { + { + let a = _mm_setr_epi32(1, 1, 1, 1); + let b = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm_mul_epi32(a, b); + let e = _mm_setr_epi64x(1, 3); + assert_eq_m128i(r, e); + } + { + let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */); + let b = _mm_setr_epi32( + -20, -256, /* ignored */ + 666666, 666666, /* ignored */ + ); + let r = _mm_mul_epi32(a, b); + let e = _mm_setr_epi64x(-300, 823043843622); + assert_eq_m128i(r, e); + } + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_mullo_epi32() { + { + let a = _mm_setr_epi32(1, 1, 1, 1); + let b = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm_mullo_epi32(a, b); + let e = _mm_setr_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + { + let a = _mm_setr_epi32(15, -2, 1234567, 99999); + let b = _mm_setr_epi32(-20, -256, 666666, -99999); + let r = _mm_mullo_epi32(a, b); + // Attention, most significant bit in r[2] is treated + // as a sign bit: + // 1234567 * 666666 = -1589877210 + let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409); + assert_eq_m128i(r, e); + } + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_minpos_epu16() { + let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3); + let r = _mm_minpos_epu16(a); + let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_mpsadbw_epu8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + + let r = _mm_mpsadbw_epu8::<0b000>(a, a); + let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); + assert_eq_m128i(r, e); + + let r = _mm_mpsadbw_epu8::<0b001>(a, a); + let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12); + assert_eq_m128i(r, e); + + let r = _mm_mpsadbw_epu8::<0b100>(a, a); + let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44); + assert_eq_m128i(r, e); + + let r = _mm_mpsadbw_epu8::<0b101>(a, a); + let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); + assert_eq_m128i(r, e); + + let r = _mm_mpsadbw_epu8::<0b111>(a, a); + let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_testz_si128() { + let a = _mm_set1_epi8(1); + let mask = _mm_set1_epi8(0); + let r = _mm_testz_si128(a, mask); + assert_eq!(r, 1); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b110); + let r = _mm_testz_si128(a, mask); + assert_eq!(r, 0); + let a = _mm_set1_epi8(0b011); + let mask = _mm_set1_epi8(0b100); + let r = _mm_testz_si128(a, mask); + assert_eq!(r, 1); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_testc_si128() { + let a = _mm_set1_epi8(-1); + let mask = _mm_set1_epi8(0); + let r = _mm_testc_si128(a, mask); + assert_eq!(r, 1); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b110); + let r = _mm_testc_si128(a, mask); + assert_eq!(r, 0); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b100); + let r = _mm_testc_si128(a, mask); + assert_eq!(r, 1); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_testnzc_si128() { + let a = _mm_set1_epi8(0); + let mask = _mm_set1_epi8(1); + let r = _mm_testnzc_si128(a, mask); + assert_eq!(r, 0); + let a = _mm_set1_epi8(-1); + let mask = _mm_set1_epi8(0); + let r = _mm_testnzc_si128(a, mask); + assert_eq!(r, 0); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b110); + 
let r = _mm_testnzc_si128(a, mask); + assert_eq!(r, 1); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b101); + let r = _mm_testnzc_si128(a, mask); + assert_eq!(r, 0); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_test_all_zeros() { + let a = _mm_set1_epi8(1); + let mask = _mm_set1_epi8(0); + let r = _mm_test_all_zeros(a, mask); + assert_eq!(r, 1); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b110); + let r = _mm_test_all_zeros(a, mask); + assert_eq!(r, 0); + let a = _mm_set1_epi8(0b011); + let mask = _mm_set1_epi8(0b100); + let r = _mm_test_all_zeros(a, mask); + assert_eq!(r, 1); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_test_all_ones() { + let a = _mm_set1_epi8(-1); + let r = _mm_test_all_ones(a); + assert_eq!(r, 1); + let a = _mm_set1_epi8(0b101); + let r = _mm_test_all_ones(a); + assert_eq!(r, 0); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_test_mix_ones_zeros() { + let a = _mm_set1_epi8(0); + let mask = _mm_set1_epi8(1); + let r = _mm_test_mix_ones_zeros(a, mask); + assert_eq!(r, 0); + let a = _mm_set1_epi8(-1); + let mask = _mm_set1_epi8(0); + let r = _mm_test_mix_ones_zeros(a, mask); + assert_eq!(r, 0); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b110); + let r = _mm_test_mix_ones_zeros(a, mask); + assert_eq!(r, 1); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b101); + let r = _mm_test_mix_ones_zeros(a, mask); + assert_eq!(r, 0); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_stream_load_si128() { + let a = _mm_set_epi64x(5, 6); + let r = _mm_stream_load_si128(core::ptr::addr_of!(a) as *const _); + assert_eq_m128i(a, r); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/sse42.rs b/library/stdarch/crates/core_arch/src/x86/sse42.rs new file mode 100644 index 0000000000000..83c51f2b70ebb --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/sse42.rs @@ -0,0 +1,798 @@ +//! Streaming SIMD Extensions 4.2 (SSE4.2) +//! +//! Extends SSE4.1 with STTNI (String and Text New Instructions). 
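+//!
+//! The string-comparison intrinsics in this module take an 8-bit immediate
+//! control value that is typically built by OR-ing together one constant from
+//! each of the `_SIDD_*` groups defined below (data format, comparison
+//! operation, polarity, and output selection).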
+ +#[cfg(test)] +use stdarch_test::assert_instr; + +use crate::{ + core_arch::{simd::*, x86::*}, + intrinsics::simd::*, +}; + +/// String contains unsigned 8-bit characters *(Default)* +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_UBYTE_OPS: i32 = 0b0000_0000; +/// String contains unsigned 16-bit characters +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_UWORD_OPS: i32 = 0b0000_0001; +/// String contains signed 8-bit characters +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_SBYTE_OPS: i32 = 0b0000_0010; +/// String contains unsigned 16-bit characters +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_SWORD_OPS: i32 = 0b0000_0011; + +/// For each character in `a`, find if it is in `b` *(Default)* +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_CMP_EQUAL_ANY: i32 = 0b0000_0000; +/// For each character in `a`, determine if +/// `b[0] <= c <= b[1] or b[1] <= c <= b[2]...` +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_CMP_RANGES: i32 = 0b0000_0100; +/// The strings defined by `a` and `b` are equal +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_CMP_EQUAL_EACH: i32 = 0b0000_1000; +/// Search for the defined substring in the target +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_CMP_EQUAL_ORDERED: i32 = 0b0000_1100; + +/// Do not negate results *(Default)* +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_POSITIVE_POLARITY: i32 = 0b0000_0000; +/// Negates results +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_NEGATIVE_POLARITY: i32 = 0b0001_0000; +/// Do not negate results before the end of the string +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_MASKED_POSITIVE_POLARITY: i32 = 0b0010_0000; +/// Negates results only before the end of the string +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_MASKED_NEGATIVE_POLARITY: i32 = 0b0011_0000; + +/// **Index only**: return the least significant bit *(Default)* +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_LEAST_SIGNIFICANT: i32 = 0b0000_0000; +/// **Index only**: return the most significant bit +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_MOST_SIGNIFICANT: i32 = 0b0100_0000; + +/// **Mask only**: return the bit mask +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_BIT_MASK: i32 = 0b0000_0000; +/// **Mask only**: return the byte mask +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_UNIT_MASK: i32 = 0b0100_0000; + +/// Compares packed strings with implicit lengths in `a` and `b` using the +/// control in `IMM8`, and return the generated mask. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpistrm, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpistrm(a: __m128i, b: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { transmute(pcmpistrm128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8)) } +} + +/// Compares packed strings with implicit lengths in `a` and `b` using the +/// control in `IMM8` and return the generated index. Similar to +/// [`_mm_cmpestri`] with the exception that [`_mm_cmpestri`] requires the +/// lengths of `a` and `b` to be explicitly specified. 
+/// +/// # Control modes +/// +/// The control specified by `IMM8` may be one or more of the following. +/// +/// ## Data size and signedness +/// +/// - [`_SIDD_UBYTE_OPS`] - Default +/// - [`_SIDD_UWORD_OPS`] +/// - [`_SIDD_SBYTE_OPS`] +/// - [`_SIDD_SWORD_OPS`] +/// +/// ## Comparison options +/// - [`_SIDD_CMP_EQUAL_ANY`] - Default +/// - [`_SIDD_CMP_RANGES`] +/// - [`_SIDD_CMP_EQUAL_EACH`] +/// - [`_SIDD_CMP_EQUAL_ORDERED`] +/// +/// ## Result polarity +/// - [`_SIDD_POSITIVE_POLARITY`] - Default +/// - [`_SIDD_NEGATIVE_POLARITY`] +/// +/// ## Bit returned +/// - [`_SIDD_LEAST_SIGNIFICANT`] - Default +/// - [`_SIDD_MOST_SIGNIFICANT`] +/// +/// # Examples +/// +/// Finds a substring using [`_SIDD_CMP_EQUAL_ORDERED`] +/// +/// ``` +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("sse4.2") { +/// # #[target_feature(enable = "sse4.2")] +/// # unsafe fn worker() { +/// let haystack = b"This is a long string of text data\r\n\tthat extends +/// multiple lines"; +/// let needle = b"\r\n\t\0\0\0\0\0\0\0\0\0\0\0\0\0"; +/// +/// let a = unsafe { _mm_loadu_si128(needle.as_ptr() as *const _) }; +/// let hop = 16; +/// let mut indexes = Vec::new(); +/// +/// // Chunk the haystack into 16 byte chunks and find +/// // the first "\r\n\t" in the chunk. +/// for (i, chunk) in haystack.chunks(hop).enumerate() { +/// let b = unsafe { _mm_loadu_si128(chunk.as_ptr() as *const _) }; +/// let idx = _mm_cmpistri(a, b, _SIDD_CMP_EQUAL_ORDERED); +/// if idx != 16 { +/// indexes.push((idx as usize) + (i * hop)); +/// } +/// } +/// assert_eq!(indexes, vec![34]); +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// The `_mm_cmpistri` intrinsic may also be used to find the existence of +/// one or more of a given set of characters in the haystack. +/// +/// ``` +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("sse4.2") { +/// # #[target_feature(enable = "sse4.2")] +/// # unsafe fn worker() { +/// // Ensure your input is 16 byte aligned +/// let password = b"hunter2\0\0\0\0\0\0\0\0\0"; +/// let special_chars = b"!@#$%^&*()[]:;<>"; +/// +/// // Load the input +/// let a = unsafe { _mm_loadu_si128(special_chars.as_ptr() as *const _) }; +/// let b = unsafe { _mm_loadu_si128(password.as_ptr() as *const _) }; +/// +/// // Use _SIDD_CMP_EQUAL_ANY to find the index of any bytes in b +/// let idx = _mm_cmpistri(a.into(), b.into(), _SIDD_CMP_EQUAL_ANY); +/// +/// if idx < 16 { +/// println!("Congrats! Your password contains a special character"); +/// # panic!("{:?} does not contain a special character", password); +/// } else { +/// println!("Your password should contain a special character"); +/// } +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// Finds the index of the first character in the haystack that is within a +/// range of characters. +/// +/// ``` +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("sse4.2") { +/// # #[target_feature(enable = "sse4.2")] +/// # unsafe fn worker() { +/// # let b = b":;<=>?@[\\]^_`abc"; +/// # let b = unsafe { _mm_loadu_si128(b.as_ptr() as *const _) }; +/// +/// // Specify the ranges of values to be searched for [A-Za-z0-9]. 
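+/// // Each consecutive pair of bytes in `a` defines one inclusive range, so the
+/// // pairs below are ('A', 'Z'), ('a', 'z') and ('0', '9'); the trailing NUL
+/// // bytes terminate the implicit-length string.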
+/// let a = b"AZaz09\0\0\0\0\0\0\0\0\0\0"; +/// let a = unsafe { _mm_loadu_si128(a.as_ptr() as *const _) }; +/// +/// // Use _SIDD_CMP_RANGES to find the index of first byte in ranges. +/// // Which in this case will be the first alpha numeric byte found +/// // in the string. +/// let idx = _mm_cmpistri(a, b, _SIDD_CMP_RANGES); +/// +/// if idx < 16 { +/// println!("Found an alpha numeric character"); +/// # assert_eq!(idx, 13); +/// } else { +/// println!("Did not find an alpha numeric character"); +/// } +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// Working with 16-bit characters. +/// +/// ``` +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("sse4.2") { +/// # #[target_feature(enable = "sse4.2")] +/// # unsafe fn worker() { +/// # let mut some_utf16_words = [0u16; 8]; +/// # let mut more_utf16_words = [0u16; 8]; +/// # '❤'.encode_utf16(&mut some_utf16_words); +/// # '𝕊'.encode_utf16(&mut more_utf16_words); +/// // Load the input +/// let a = unsafe { _mm_loadu_si128(some_utf16_words.as_ptr() as *const _) }; +/// let b = unsafe { _mm_loadu_si128(more_utf16_words.as_ptr() as *const _) }; +/// +/// // Specify _SIDD_UWORD_OPS to compare words instead of bytes, and +/// // use _SIDD_CMP_EQUAL_EACH to compare the two strings. +/// let idx = _mm_cmpistri(a, b, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_EACH); +/// +/// if idx == 0 { +/// println!("16-bit unicode strings were equal!"); +/// # panic!("Strings should not be equal!") +/// } else { +/// println!("16-bit unicode strings were not equal!"); +/// } +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpistri(a: __m128i, b: __m128i) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpistri128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } +} + +/// Compares packed strings with implicit lengths in `a` and `b` using the +/// control in `IMM8`, and return `1` if any character in `b` was null. +/// and `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpistrz(a: __m128i, b: __m128i) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpistriz128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } +} + +/// Compares packed strings with implicit lengths in `a` and `b` using the +/// control in `IMM8`, and return `1` if the resulting mask was non-zero, +/// and `0` otherwise. 
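+///
+/// # Examples
+///
+/// An illustrative sketch (hypothetical input strings, using the same
+/// runtime-detection scaffolding as the `_mm_cmpistri` examples above): with
+/// `_SIDD_CMP_EQUAL_ANY`, the returned flag answers "does the text contain
+/// any byte from the set?".
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.2") {
+/// # #[target_feature(enable = "sse4.2")]
+/// # unsafe fn worker() {
+/// // The set of bytes to search for; the implicit length stops at the first NUL.
+/// let vowels = b"aeiou\0\0\0\0\0\0\0\0\0\0\0";
+/// let a = unsafe { _mm_loadu_si128(vowels.as_ptr() as *const _) };
+///
+/// // No byte of this text is a vowel, so the resulting mask is zero.
+/// let b = unsafe { _mm_loadu_si128(b"bcdfg hjklmnpqrs".as_ptr() as *const _) };
+/// assert_eq!(_mm_cmpistrc(a, b, _SIDD_CMP_EQUAL_ANY), 0);
+///
+/// // This text does contain vowels, so the flag is set.
+/// let c = unsafe { _mm_loadu_si128(b"hello, world!!!!".as_ptr() as *const _) };
+/// assert_eq!(_mm_cmpistrc(a, c, _SIDD_CMP_EQUAL_ANY), 1);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```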
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpistrc(a: __m128i, b: __m128i) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpistric128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } +} + +/// Compares packed strings with implicit lengths in `a` and `b` using the +/// control in `IMM8`, and returns `1` if any character in `a` was null, +/// and `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpistrs(a: __m128i, b: __m128i) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpistris128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } +} + +/// Compares packed strings with implicit lengths in `a` and `b` using the +/// control in `IMM8`, and return bit `0` of the resulting bit mask. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpistro(a: __m128i, b: __m128i) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpistrio128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } +} + +/// Compares packed strings with implicit lengths in `a` and `b` using the +/// control in `IMM8`, and return `1` if `b` did not contain a null +/// character and the resulting mask was zero, and `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpistra(a: __m128i, b: __m128i) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpistria128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } +} + +/// Compares packed strings in `a` and `b` with lengths `la` and `lb` +/// using the control in `IMM8`, and return the generated mask. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpestrm, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpestrm(a: __m128i, la: i32, b: __m128i, lb: i32) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { transmute(pcmpestrm128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8)) } +} + +/// Compares packed strings `a` and `b` with lengths `la` and `lb` using the +/// control in `IMM8` and return the generated index. Similar to +/// [`_mm_cmpistri`] with the exception that [`_mm_cmpistri`] implicitly +/// determines the length of `a` and `b`. +/// +/// # Control modes +/// +/// The control specified by `IMM8` may be one or more of the following. 
+/// +/// ## Data size and signedness +/// +/// - [`_SIDD_UBYTE_OPS`] - Default +/// - [`_SIDD_UWORD_OPS`] +/// - [`_SIDD_SBYTE_OPS`] +/// - [`_SIDD_SWORD_OPS`] +/// +/// ## Comparison options +/// - [`_SIDD_CMP_EQUAL_ANY`] - Default +/// - [`_SIDD_CMP_RANGES`] +/// - [`_SIDD_CMP_EQUAL_EACH`] +/// - [`_SIDD_CMP_EQUAL_ORDERED`] +/// +/// ## Result polarity +/// - [`_SIDD_POSITIVE_POLARITY`] - Default +/// - [`_SIDD_NEGATIVE_POLARITY`] +/// +/// ## Bit returned +/// - [`_SIDD_LEAST_SIGNIFICANT`] - Default +/// - [`_SIDD_MOST_SIGNIFICANT`] +/// +/// # Examples +/// +/// ``` +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("sse4.2") { +/// # #[target_feature(enable = "sse4.2")] +/// # unsafe fn worker() { +/// +/// // The string we want to find a substring in +/// let haystack = b"Split \r\n\t line "; +/// +/// // The string we want to search for with some +/// // extra bytes we do not want to search for. +/// let needle = b"\r\n\t ignore this "; +/// +/// let a = unsafe { _mm_loadu_si128(needle.as_ptr() as *const _) }; +/// let b = unsafe { _mm_loadu_si128(haystack.as_ptr() as *const _) }; +/// +/// // Note: We explicitly specify we only want to search `b` for the +/// // first 3 characters of a. +/// let idx = _mm_cmpestri(a, 3, b, 15, _SIDD_CMP_EQUAL_ORDERED); +/// +/// assert_eq!(idx, 6); +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// [`_SIDD_UBYTE_OPS`]: constant._SIDD_UBYTE_OPS.html +/// [`_SIDD_UWORD_OPS`]: constant._SIDD_UWORD_OPS.html +/// [`_SIDD_SBYTE_OPS`]: constant._SIDD_SBYTE_OPS.html +/// [`_SIDD_SWORD_OPS`]: constant._SIDD_SWORD_OPS.html +/// [`_SIDD_CMP_EQUAL_ANY`]: constant._SIDD_CMP_EQUAL_ANY.html +/// [`_SIDD_CMP_RANGES`]: constant._SIDD_CMP_RANGES.html +/// [`_SIDD_CMP_EQUAL_EACH`]: constant._SIDD_CMP_EQUAL_EACH.html +/// [`_SIDD_CMP_EQUAL_ORDERED`]: constant._SIDD_CMP_EQUAL_ORDERED.html +/// [`_SIDD_POSITIVE_POLARITY`]: constant._SIDD_POSITIVE_POLARITY.html +/// [`_SIDD_NEGATIVE_POLARITY`]: constant._SIDD_NEGATIVE_POLARITY.html +/// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html +/// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html +/// [`_mm_cmpistri`]: fn._mm_cmpistri.html +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpestri(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpestri128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } +} + +/// Compares packed strings in `a` and `b` with lengths `la` and `lb` +/// using the control in `IMM8`, and return `1` if any character in +/// `b` was null, and `0` otherwise. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpestrz(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpestriz128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } +} + +/// Compares packed strings in `a` and `b` with lengths `la` and `lb` +/// using the control in `IMM8`, and return `1` if the resulting mask +/// was non-zero, and `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpestrc(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpestric128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } +} + +/// Compares packed strings in `a` and `b` with lengths `la` and `lb` +/// using the control in `IMM8`, and return `1` if any character in +/// a was null, and `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpestrs(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpestris128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } +} + +/// Compares packed strings in `a` and `b` with lengths `la` and `lb` +/// using the control in `IMM8`, and return bit `0` of the resulting +/// bit mask. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpestro(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpestrio128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } +} + +/// Compares packed strings in `a` and `b` with lengths `la` and `lb` +/// using the control in `IMM8`, and return `1` if `b` did not +/// contain a null character and the resulting mask was zero, and `0` +/// otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpestra(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpestria128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } +} + +/// Starting with the initial value in `crc`, return the accumulated +/// CRC32-C value for unsigned 8-bit integer `v`. 
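+///
+/// # Examples
+///
+/// A small sketch of the usual CRC-32C (Castagnoli) convention built on top
+/// of this intrinsic: seed the state with `0xFFFF_FFFF`, feed each byte, and
+/// invert the result. The expected value is the well-known CRC-32C check
+/// value for the ASCII string `"123456789"`.
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.2") {
+/// # #[target_feature(enable = "sse4.2")]
+/// # unsafe fn worker() {
+/// let data = b"123456789";
+///
+/// let mut state = 0xFFFF_FFFF_u32;
+/// for &byte in data {
+///     state = _mm_crc32_u8(state, byte);
+/// }
+///
+/// assert_eq!(state ^ 0xFFFF_FFFF, 0xE306_9283);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```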
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(crc32))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_crc32_u8(crc: u32, v: u8) -> u32 { + unsafe { crc32_32_8(crc, v) } +} + +/// Starting with the initial value in `crc`, return the accumulated +/// CRC32-C value for unsigned 16-bit integer `v`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(crc32))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_crc32_u16(crc: u32, v: u16) -> u32 { + unsafe { crc32_32_16(crc, v) } +} + +/// Starting with the initial value in `crc`, return the accumulated +/// CRC32-C value for unsigned 32-bit integer `v`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(crc32))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_crc32_u32(crc: u32, v: u32) -> u32 { + unsafe { crc32_32_32(crc, v) } +} + +/// Compares packed 64-bit integers in `a` and `b` for greater-than, +/// return the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi64) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpgtq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpgt_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_gt::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + // SSE 4.2 string and text comparison ops + #[link_name = "llvm.x86.sse42.pcmpestrm128"] + fn pcmpestrm128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> u8x16; + #[link_name = "llvm.x86.sse42.pcmpestri128"] + fn pcmpestri128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpestriz128"] + fn pcmpestriz128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpestric128"] + fn pcmpestric128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpestris128"] + fn pcmpestris128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpestrio128"] + fn pcmpestrio128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpestria128"] + fn pcmpestria128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpistrm128"] + fn pcmpistrm128(a: i8x16, b: i8x16, imm8: i8) -> i8x16; + #[link_name = "llvm.x86.sse42.pcmpistri128"] + fn pcmpistri128(a: i8x16, b: i8x16, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpistriz128"] + fn pcmpistriz128(a: i8x16, b: i8x16, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpistric128"] + fn pcmpistric128(a: i8x16, b: i8x16, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpistris128"] + fn pcmpistris128(a: i8x16, b: i8x16, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpistrio128"] + fn pcmpistrio128(a: i8x16, b: i8x16, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpistria128"] + fn pcmpistria128(a: i8x16, b: i8x16, imm8: i8) -> i32; + // SSE 4.2 CRC instructions + #[link_name = 
"llvm.x86.sse42.crc32.32.8"] + fn crc32_32_8(crc: u32, v: u8) -> u32; + #[link_name = "llvm.x86.sse42.crc32.32.16"] + fn crc32_32_16(crc: u32, v: u16) -> u32; + #[link_name = "llvm.x86.sse42.crc32.32.32"] + fn crc32_32_32(crc: u32, v: u32) -> u32; +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + use std::ptr; + + // Currently one cannot `load` a &[u8] that is less than 16 + // in length. This makes loading strings less than 16 in length + // a bit difficult. Rather than `load` and mutate the __m128i, + // it is easier to memcpy the given string to a local slice with + // length 16 and `load` the local slice. + #[target_feature(enable = "sse4.2")] + unsafe fn str_to_m128i(s: &[u8]) -> __m128i { + assert!(s.len() <= 16); + let slice = &mut [0u8; 16]; + ptr::copy_nonoverlapping(s.as_ptr(), slice.as_mut_ptr(), s.len()); + _mm_loadu_si128(slice.as_ptr() as *const _) + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpistrm() { + let a = str_to_m128i(b"Hello! Good-Bye!"); + let b = str_to_m128i(b"hello! good-bye!"); + let i = _mm_cmpistrm::<_SIDD_UNIT_MASK>(a, b); + #[rustfmt::skip] + let res = _mm_setr_epi8( + 0x00, !0, !0, !0, !0, !0, !0, 0x00, + !0, !0, !0, !0, 0x00, !0, !0, !0, + ); + assert_eq_m128i(i, res); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpistri() { + let a = str_to_m128i(b"Hello"); + let b = str_to_m128i(b" Hello "); + let i = _mm_cmpistri::<_SIDD_CMP_EQUAL_ORDERED>(a, b); + assert_eq!(3, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpistrz() { + let a = str_to_m128i(b""); + let b = str_to_m128i(b"Hello"); + let i = _mm_cmpistrz::<_SIDD_CMP_EQUAL_ORDERED>(a, b); + assert_eq!(1, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpistrc() { + let a = str_to_m128i(b" "); + let b = str_to_m128i(b" ! 
"); + let i = _mm_cmpistrc::<_SIDD_UNIT_MASK>(a, b); + assert_eq!(1, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpistrs() { + let a = str_to_m128i(b"Hello"); + let b = str_to_m128i(b""); + let i = _mm_cmpistrs::<_SIDD_CMP_EQUAL_ORDERED>(a, b); + assert_eq!(1, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpistro() { + #[rustfmt::skip] + let a_bytes = _mm_setr_epi8( + 0x00, 0x47, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, + 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); + #[rustfmt::skip] + let b_bytes = _mm_setr_epi8( + 0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, + 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); + let a = a_bytes; + let b = b_bytes; + let i = _mm_cmpistro::<{ _SIDD_UWORD_OPS | _SIDD_UNIT_MASK }>(a, b); + assert_eq!(0, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpistra() { + let a = str_to_m128i(b""); + let b = str_to_m128i(b"Hello!!!!!!!!!!!"); + let i = _mm_cmpistra::<_SIDD_UNIT_MASK>(a, b); + assert_eq!(1, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpestrm() { + let a = str_to_m128i(b"Hello!"); + let b = str_to_m128i(b"Hello."); + let i = _mm_cmpestrm::<_SIDD_UNIT_MASK>(a, 5, b, 5); + #[rustfmt::skip] + let r = _mm_setr_epi8( + !0, !0, !0, !0, !0, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + ); + assert_eq_m128i(i, r); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpestri() { + let a = str_to_m128i(b"bar - garbage"); + let b = str_to_m128i(b"foobar"); + let i = _mm_cmpestri::<_SIDD_CMP_EQUAL_ORDERED>(a, 3, b, 6); + assert_eq!(3, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpestrz() { + let a = str_to_m128i(b""); + let b = str_to_m128i(b"Hello"); + let i = _mm_cmpestrz::<_SIDD_CMP_EQUAL_ORDERED>(a, 16, b, 6); + assert_eq!(1, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpestrc() { + let va = str_to_m128i(b"!!!!!!!!"); + let vb = str_to_m128i(b" "); + let i = _mm_cmpestrc::<_SIDD_UNIT_MASK>(va, 7, vb, 7); + assert_eq!(0, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpestrs() { + #[rustfmt::skip] + let a_bytes = _mm_setr_epi8( + 0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, + 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); + let a = a_bytes; + let b = _mm_set1_epi8(0x00); + let i = _mm_cmpestrs::<_SIDD_UWORD_OPS>(a, 8, b, 0); + assert_eq!(0, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpestro() { + let a = str_to_m128i(b"Hello"); + let b = str_to_m128i(b"World"); + let i = _mm_cmpestro::<_SIDD_UBYTE_OPS>(a, 5, b, 5); + assert_eq!(0, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpestra() { + let a = str_to_m128i(b"Cannot match a"); + let b = str_to_m128i(b"Null after 14"); + let i = _mm_cmpestra::<{ _SIDD_CMP_EQUAL_EACH | _SIDD_UNIT_MASK }>(a, 14, b, 16); + assert_eq!(1, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_crc32_u8() { + let crc = 0x2aa1e72b; + let v = 0x2a; + let i = _mm_crc32_u8(crc, v); + assert_eq!(i, 0xf24122e4); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_crc32_u16() { + let crc = 0x8ecec3b5; + let v = 0x22b; + let i = _mm_crc32_u16(crc, v); + assert_eq!(i, 0x13bb2fb); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_crc32_u32() { + let crc = 0xae2912c8; + let v = 0x845fed; + let i = _mm_crc32_u32(crc, v); + assert_eq!(i, 0xffae2ed1); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpgt_epi64() { + let a = _mm_setr_epi64x(0, 0x2a); + let b = _mm_set1_epi64x(0x00); + 
let i = _mm_cmpgt_epi64(a, b); + assert_eq_m128i(i, _mm_setr_epi64x(0x00, 0xffffffffffffffffu64 as i64)); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/sse4a.rs b/library/stdarch/crates/core_arch/src/x86/sse4a.rs new file mode 100644 index 0000000000000..051b77d02dfe0 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/sse4a.rs @@ -0,0 +1,243 @@ +//! `i686`'s Streaming SIMD Extensions 4a (`SSE4a`) + +use crate::core_arch::{simd::*, x86::*}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.sse4a.extrq"] + fn extrq(x: i64x2, y: i8x16) -> i64x2; + #[link_name = "llvm.x86.sse4a.extrqi"] + fn extrqi(x: i64x2, len: u8, idx: u8) -> i64x2; + #[link_name = "llvm.x86.sse4a.insertq"] + fn insertq(x: i64x2, y: i64x2) -> i64x2; + #[link_name = "llvm.x86.sse4a.insertqi"] + fn insertqi(x: i64x2, y: i64x2, len: u8, idx: u8) -> i64x2; + #[link_name = "llvm.x86.sse4a.movnt.sd"] + fn movntsd(x: *mut f64, y: __m128d); + #[link_name = "llvm.x86.sse4a.movnt.ss"] + fn movntss(x: *mut f32, y: __m128); +} + +/// Extracts the bit range specified by `y` from the lower 64 bits of `x`. +/// +/// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The +/// `[5:0]` bits of `y` specify the length of the bit-range to extract. All +/// other bits are ignored. +/// +/// If the length is zero, it is interpreted as `64`. If the length and index +/// are zero, the lower 64 bits of `x` are extracted. +/// +/// If `length == 0 && index > 0` or `length + index > 64` the result is +/// undefined. +#[inline] +#[target_feature(enable = "sse4a")] +#[cfg_attr(test, assert_instr(extrq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { + unsafe { transmute(extrq(x.as_i64x2(), y.as_i8x16())) } +} + +/// Extracts the specified bits from the lower 64 bits of the 128-bit integer vector operand at the +/// index `idx` and of the length `len`. +/// +/// `idx` specifies the index of the LSB. `len` specifies the number of bits to extract. If length +/// and index are both zero, bits `[63:0]` of parameter `x` are extracted. It is a compile-time error +/// for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero. +/// +/// Returns a 128-bit integer vector whose lower 64 bits contain the extracted bits. +#[inline] +#[target_feature(enable = "sse4a")] +#[cfg_attr(test, assert_instr(extrq, LEN = 5, IDX = 5))] +#[rustc_legacy_const_generics(1, 2)] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub fn _mm_extracti_si64(x: __m128i) -> __m128i { + // LLVM mentions that it is UB if these are not satisfied + static_assert_uimm_bits!(LEN, 6); + static_assert_uimm_bits!(IDX, 6); + static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64)); + unsafe { transmute(extrqi(x.as_i64x2(), LEN as u8, IDX as u8)) } +} + +/// Inserts the `[length:0]` bits of `y` into `x` at `index`. +/// +/// The bits of `y`: +/// +/// - `[69:64]` specify the `length`, +/// - `[77:72]` specify the index. +/// +/// If the `length` is zero it is interpreted as `64`. If `index + length > 64` +/// or `index > 0 && length == 0` the result is undefined. 
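+///
+/// # Examples
+///
+/// A sketch with hand-picked constants mirroring the bit layout above
+/// (`length = 4`, `index = 8`, so the low 4 bits of `y` replace bits
+/// `[11:8]` of `x`):
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4a") {
+/// # #[target_feature(enable = "sse4a")]
+/// # unsafe fn worker() {
+/// let x = _mm_set_epi64x(0, 0b1010_1010_1010);
+/// // Lower 64 bits of `y`: the field to insert; upper 64 bits: the
+/// // descriptor with bits [5:0] = length and bits [13:8] = index.
+/// let descriptor: i64 = (8 << 8) | 4;
+/// let y = _mm_set_epi64x(descriptor, 0b0110);
+///
+/// let r = _mm_insert_si64(x, y);
+///
+/// let mut out = [0u64; 2];
+/// unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, r) };
+/// assert_eq!(out[0], 0b0110_1010_1010);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```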
+#[inline] +#[target_feature(enable = "sse4a")] +#[cfg_attr(test, assert_instr(insertq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { + unsafe { transmute(insertq(x.as_i64x2(), y.as_i64x2())) } +} + +/// Inserts the `len` least-significant bits from the lower 64 bits of the 128-bit integer vector operand `y` into +/// the lower 64 bits of the 128-bit integer vector operand `x` at the index `idx` and of the length `len`. +/// +/// `idx` specifies the index of the LSB. `len` specifies the number of bits to insert. If length and index +/// are both zero, bits `[63:0]` of parameter `x` are replaced with bits `[63:0]` of parameter `y`. It is a +/// compile-time error for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero. +#[inline] +#[target_feature(enable = "sse4a")] +#[cfg_attr(test, assert_instr(insertq, LEN = 5, IDX = 5))] +#[rustc_legacy_const_generics(2, 3)] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub fn _mm_inserti_si64(x: __m128i, y: __m128i) -> __m128i { + // LLVM mentions that it is UB if these are not satisfied + static_assert_uimm_bits!(LEN, 6); + static_assert_uimm_bits!(IDX, 6); + static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64)); + unsafe { transmute(insertqi(x.as_i64x2(), y.as_i64x2(), LEN as u8, IDX as u8)) } +} + +/// Non-temporal store of `a.0` into `p`. +/// +/// Writes 64-bit data to a memory location without polluting the caches. +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. +#[inline] +#[target_feature(enable = "sse4a")] +#[cfg_attr(test, assert_instr(movntsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { + movntsd(p, a); +} + +/// Non-temporal store of `a.0` into `p`. +/// +/// Writes 32-bit data to a memory location without polluting the caches. +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. 
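+///
+/// # Examples
+///
+/// A minimal sketch of the store-then-fence pattern described above
+/// (illustrative buffer and values):
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4a") {
+/// # #[target_feature(enable = "sse4a")]
+/// # unsafe fn worker() {
+/// let mut dst = [0.0_f32; 4];
+/// let v = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
+///
+/// // Non-temporal store of the lowest lane (1.0) into `dst[0]`...
+/// unsafe { _mm_stream_ss(dst.as_mut_ptr(), v) };
+/// // ...followed by the fence required before the memory is touched again.
+/// _mm_sfence();
+///
+/// assert_eq!(dst, [1.0, 0.0, 0.0, 0.0]);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```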
+#[inline] +#[target_feature(enable = "sse4a")] +#[cfg_attr(test, assert_instr(movntss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) { + movntss(p, a); +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use stdarch_test::simd_test; + + #[simd_test(enable = "sse4a")] + unsafe fn test_mm_extract_si64() { + let b = 0b0110_0000_0000_i64; + // ^^^^ bit range extracted + let x = _mm_setr_epi64x(b, 0); + let v = 0b001000___00___000100_i64; + // ^idx: 2^3 = 8 ^length = 2^2 = 4 + let y = _mm_setr_epi64x(v, 0); + let e = _mm_setr_epi64x(0b0110_i64, 0); + let r = _mm_extract_si64(x, y); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4a")] + unsafe fn test_mm_extracti_si64() { + let a = _mm_setr_epi64x(0x0123456789abcdef, 0); + let r = _mm_extracti_si64::<8, 8>(a); + let e = _mm_setr_epi64x(0xcd, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4a")] + unsafe fn test_mm_insert_si64() { + let i = 0b0110_i64; + // ^^^^ bit range inserted + let z = 0b1010_1010_1010i64; + // ^^^^ bit range replaced + let e = 0b0110_1010_1010i64; + // ^^^^ replaced 1010 with 0110 + let x = _mm_setr_epi64x(z, 0); + let expected = _mm_setr_epi64x(e, 0); + let v = 0b001000___00___000100_i64; + // ^idx: 2^3 = 8 ^length = 2^2 = 4 + let y = _mm_setr_epi64x(i, v); + let r = _mm_insert_si64(x, y); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sse4a")] + unsafe fn test_mm_inserti_si64() { + let a = _mm_setr_epi64x(0x0123456789abcdef, 0); + let b = _mm_setr_epi64x(0x0011223344556677, 0); + let r = _mm_inserti_si64::<8, 8>(a, b); + let e = _mm_setr_epi64x(0x0123456789ab77ef, 0); + assert_eq_m128i(r, e); + } + + #[repr(align(16))] + struct MemoryF64 { + data: [f64; 2], + } + + #[simd_test(enable = "sse4a")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] + unsafe fn test_mm_stream_sd() { + let mut mem = MemoryF64 { + data: [1.0_f64, 2.0], + }; + { + let vals = &mut mem.data; + let d = vals.as_mut_ptr(); + + let x = _mm_setr_pd(3.0, 4.0); + + _mm_stream_sd(d, x); + } + assert_eq!(mem.data[0], 3.0); + assert_eq!(mem.data[1], 2.0); + } + + #[repr(align(16))] + struct MemoryF32 { + data: [f32; 4], + } + + #[simd_test(enable = "sse4a")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] + unsafe fn test_mm_stream_ss() { + let mut mem = MemoryF32 { + data: [1.0_f32, 2.0, 3.0, 4.0], + }; + { + let vals = &mut mem.data; + let d = vals.as_mut_ptr(); + + let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + + _mm_stream_ss(d, x); + } + assert_eq!(mem.data[0], 5.0); + assert_eq!(mem.data[1], 2.0); + assert_eq!(mem.data[2], 3.0); + assert_eq!(mem.data[3], 4.0); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/ssse3.rs b/library/stdarch/crates/core_arch/src/x86/ssse3.rs new file mode 100644 index 0000000000000..2be182e88f4ba --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/ssse3.rs @@ -0,0 +1,656 @@ +//! Supplemental Streaming SIMD Extensions 3 (SSSE3) + +use crate::{ + core_arch::{simd::*, x86::*}, + intrinsics::simd::*, +}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Computes the absolute value of packed 8-bit signed integers in `a` and +/// return the unsigned results. 
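+///
+/// # Examples
+///
+/// A small sketch (illustrative values; the result is checked with the SSE2
+/// compare-and-movemask idiom because `__m128i` has no `PartialEq`). Note
+/// that, since the result is unsigned, the absolute value of `i8::MIN`
+/// keeps the bit pattern `0x80`.
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("ssse3") {
+/// # #[target_feature(enable = "ssse3")]
+/// # unsafe fn worker() {
+/// let r = _mm_abs_epi8(_mm_set1_epi8(-5));
+/// // Every lane of `r` equals 5, so all 16 byte comparisons succeed.
+/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi8(r, _mm_set1_epi8(5))), 0xFFFF);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```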
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(pabsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_abs_epi8(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i8x16(); + let zero = i8x16::ZERO; + let r = simd_select::(simd_lt(a, zero), simd_neg(a), a); + transmute(r) + } +} + +/// Computes the absolute value of each of the packed 16-bit signed integers in +/// `a` and +/// return the 16-bit unsigned integer +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(pabsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_abs_epi16(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i16x8(); + let zero = i16x8::ZERO; + let r = simd_select::(simd_lt(a, zero), simd_neg(a), a); + transmute(r) + } +} + +/// Computes the absolute value of each of the packed 32-bit signed integers in +/// `a` and +/// return the 32-bit unsigned integer +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(pabsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_abs_epi32(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i32x4(); + let zero = i32x4::ZERO; + let r = simd_select::(simd_lt(a, zero), simd_neg(a), a); + transmute(r) + } +} + +/// Shuffles bytes from `a` according to the content of `b`. +/// +/// The last 4 bits of each byte of `b` are used as addresses +/// into the 16 bytes of `a`. +/// +/// In addition, if the highest significant bit of a byte of `b` +/// is set, the respective destination byte is set to 0. +/// +/// Picturing `a` and `b` as `[u8; 16]`, `_mm_shuffle_epi8` is +/// logically equivalent to: +/// +/// ``` +/// fn mm_shuffle_epi8(a: [u8; 16], b: [u8; 16]) -> [u8; 16] { +/// let mut r = [0u8; 16]; +/// for i in 0..16 { +/// // if the most significant bit of b is set, +/// // then the destination byte is set to 0. +/// if b[i] & 0x80 == 0u8 { +/// r[i] = a[(b[i] % 16) as usize]; +/// } +/// } +/// r +/// } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(pshufb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(pshufb128(a.as_u8x16(), b.as_u8x16())) } +} + +/// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, +/// shift the result right by `n` bytes, and returns the low 16 bytes. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(palignr, IMM8 = 15))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_alignr_epi8(a: __m128i, b: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + // If palignr is shifting the pair of vectors more than the size of two + // lanes, emit zero. 
+ if IMM8 > 32 { + return _mm_setzero_si128(); + } + // If palignr is shifting the pair of input vectors more than one lane, + // but less than two lanes, convert to shifting in zeroes. + let (a, b) = if IMM8 > 16 { + (_mm_setzero_si128(), a) + } else { + (a, b) + }; + const fn mask(shift: u32, i: u32) -> u32 { + if shift > 32 { + // Unused, but needs to be a valid index. + i + } else if shift > 16 { + shift - 16 + i + } else { + shift + i + } + } + unsafe { + let r: i8x16 = simd_shuffle!( + b.as_i8x16(), + a.as_i8x16(), + [ + mask(IMM8 as u32, 0), + mask(IMM8 as u32, 1), + mask(IMM8 as u32, 2), + mask(IMM8 as u32, 3), + mask(IMM8 as u32, 4), + mask(IMM8 as u32, 5), + mask(IMM8 as u32, 6), + mask(IMM8 as u32, 7), + mask(IMM8 as u32, 8), + mask(IMM8 as u32, 9), + mask(IMM8 as u32, 10), + mask(IMM8 as u32, 11), + mask(IMM8 as u32, 12), + mask(IMM8 as u32, 13), + mask(IMM8 as u32, 14), + mask(IMM8 as u32, 15), + ], + ); + transmute(r) + } +} + +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of `[8 x i16]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(phaddw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(phaddw128(a.as_i16x8(), b.as_i16x8())) } +} + +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of `[8 x i16]`. Positive sums greater than 7FFFh are +/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(phaddsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(phaddsw128(a.as_i16x8(), b.as_i16x8())) } +} + +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of `[4 x i32]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(phaddd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(phaddd128(a.as_i32x4(), b.as_i32x4())) } +} + +/// Horizontally subtract the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of `[8 x i16]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(phsubw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(phsubw128(a.as_i16x8(), b.as_i16x8())) } +} + +/// Horizontally subtract the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of `[8 x i16]`. Positive differences greater than +/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are +/// saturated to 8000h. 
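+///
+/// # Examples
+///
+/// A short sketch (illustrative values) showing one ordinary pairwise
+/// difference and one that saturates instead of wrapping:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("ssse3") {
+/// # #[target_feature(enable = "ssse3")]
+/// # unsafe fn worker() {
+/// let a = _mm_setr_epi16(1, 2, i16::MIN, 1, 0, 0, 0, 0);
+/// let b = _mm_setr_epi16(10, 4, 0, 0, 0, 0, 0, 0);
+/// let r = _mm_hsubs_epi16(a, b);
+///
+/// // Pairs from `a`: 1 - 2 = -1, and MIN - 1 saturates to MIN (it would
+/// // wrap to MAX without saturation); pairs from `b`: 6, 0, 0, 0.
+/// let expected = _mm_setr_epi16(-1, i16::MIN, 0, 0, 6, 0, 0, 0);
+/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi16(r, expected)), 0xFFFF);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```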
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(phsubsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(phsubsw128(a.as_i16x8(), b.as_i16x8())) } +} + +/// Horizontally subtract the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of `[4 x i32]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(phsubd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(phsubd128(a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiplies corresponding pairs of packed 8-bit unsigned integer +/// values contained in the first source operand and packed 8-bit signed +/// integer values contained in the second source operand, add pairs of +/// contiguous products with signed saturation, and writes the 16-bit sums to +/// the corresponding bits in the destination. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(pmaddubsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(pmaddubsw128(a.as_u8x16(), b.as_i8x16())) } +} + +/// Multiplies packed 16-bit signed integer values, truncate the 32-bit +/// product to the 18 most significant bits by right-shifting, round the +/// truncated value by adding 1, and write bits `[16:1]` to the destination. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(pmulhrsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(pmulhrsw128(a.as_i16x8(), b.as_i16x8())) } +} + +/// Negates packed 8-bit integers in `a` when the corresponding signed 8-bit +/// integer in `b` is negative, and returns the result. +/// Elements in result are zeroed out when the corresponding element in `b` +/// is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(psignb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(psignb128(a.as_i8x16(), b.as_i8x16())) } +} + +/// Negates packed 16-bit integers in `a` when the corresponding signed 16-bit +/// integer in `b` is negative, and returns the results. +/// Elements in result are zeroed out when the corresponding element in `b` +/// is zero. 
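+///
+/// # Examples
+///
+/// A brief sketch (illustrative values) covering the three cases: a negative,
+/// a zero, and a positive control element in `b`:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("ssse3") {
+/// # #[target_feature(enable = "ssse3")]
+/// # unsafe fn worker() {
+/// let a = _mm_setr_epi16(1, -2, 3, 4, 0, 0, 0, 0);
+/// let b = _mm_setr_epi16(-1, 1, 0, 5, 0, 0, 0, 0);
+/// let r = _mm_sign_epi16(a, b);
+///
+/// // b < 0 negates, b == 0 zeroes, b > 0 copies the element of `a`.
+/// let expected = _mm_setr_epi16(-1, -2, 0, 4, 0, 0, 0, 0);
+/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi16(r, expected)), 0xFFFF);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```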
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(psignw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(psignw128(a.as_i16x8(), b.as_i16x8())) } +} + +/// Negates packed 32-bit integers in `a` when the corresponding signed 32-bit +/// integer in `b` is negative, and returns the results. +/// Element in result are zeroed out when the corresponding element in `b` +/// is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32) +#[inline] +#[target_feature(enable = "ssse3")] +#[cfg_attr(test, assert_instr(psignd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(psignd128(a.as_i32x4(), b.as_i32x4())) } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.ssse3.pshuf.b.128"] + fn pshufb128(a: u8x16, b: u8x16) -> u8x16; + + #[link_name = "llvm.x86.ssse3.phadd.w.128"] + fn phaddw128(a: i16x8, b: i16x8) -> i16x8; + + #[link_name = "llvm.x86.ssse3.phadd.sw.128"] + fn phaddsw128(a: i16x8, b: i16x8) -> i16x8; + + #[link_name = "llvm.x86.ssse3.phadd.d.128"] + fn phaddd128(a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.ssse3.phsub.w.128"] + fn phsubw128(a: i16x8, b: i16x8) -> i16x8; + + #[link_name = "llvm.x86.ssse3.phsub.sw.128"] + fn phsubsw128(a: i16x8, b: i16x8) -> i16x8; + + #[link_name = "llvm.x86.ssse3.phsub.d.128"] + fn phsubd128(a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.ssse3.pmadd.ub.sw.128"] + fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8; + + #[link_name = "llvm.x86.ssse3.pmul.hr.sw.128"] + fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8; + + #[link_name = "llvm.x86.ssse3.psign.b.128"] + fn psignb128(a: i8x16, b: i8x16) -> i8x16; + + #[link_name = "llvm.x86.ssse3.psign.w.128"] + fn psignw128(a: i16x8, b: i16x8) -> i16x8; + + #[link_name = "llvm.x86.ssse3.psign.d.128"] + fn psignd128(a: i32x4, b: i32x4) -> i32x4; +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "ssse3")] + unsafe fn test_mm_abs_epi8() { + let r = _mm_abs_epi8(_mm_set1_epi8(-5)); + assert_eq_m128i(r, _mm_set1_epi8(5)); + } + + #[simd_test(enable = "ssse3")] + unsafe fn test_mm_abs_epi16() { + let r = _mm_abs_epi16(_mm_set1_epi16(-5)); + assert_eq_m128i(r, _mm_set1_epi16(5)); + } + + #[simd_test(enable = "ssse3")] + unsafe fn test_mm_abs_epi32() { + let r = _mm_abs_epi32(_mm_set1_epi32(-5)); + assert_eq_m128i(r, _mm_set1_epi32(5)); + } + + #[simd_test(enable = "ssse3")] + unsafe fn test_mm_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 4, 128_u8 as i8, 4, 3, + 24, 12, 6, 19, + 12, 5, 5, 10, + 4, 1, 8, 0, + ); + let expected = _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1); + let r = _mm_shuffle_epi8(a, b); + assert_eq_m128i(r, expected); + + // Test indices greater than 15 wrapping around + let b = _mm_add_epi8(b, _mm_set1_epi8(32)); + let r = _mm_shuffle_epi8(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "ssse3")] + unsafe fn test_mm_alignr_epi8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 
16, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 4, 63, 4, 3, + 24, 12, 6, 19, + 12, 5, 5, 10, + 4, 1, 8, 0, + ); + let r = _mm_alignr_epi8::<33>(a, b); + assert_eq_m128i(r, _mm_set1_epi8(0)); + + let r = _mm_alignr_epi8::<17>(a, b); + #[rustfmt::skip] + let expected = _mm_setr_epi8( + 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 0, + ); + assert_eq_m128i(r, expected); + + let r = _mm_alignr_epi8::<16>(a, b); + assert_eq_m128i(r, a); + + let r = _mm_alignr_epi8::<15>(a, b); + #[rustfmt::skip] + let expected = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + assert_eq_m128i(r, expected); + + let r = _mm_alignr_epi8::<0>(a, b); + assert_eq_m128i(r, b); + } + + #[simd_test(enable = "ssse3")] + unsafe fn test_mm_hadd_epi16() { + let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19); + let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 36, 25); + let r = _mm_hadd_epi16(a, b); + assert_eq_m128i(r, expected); + + // Test wrapping on overflow + let a = _mm_setr_epi16(i16::MAX, 1, i16::MAX, 2, i16::MAX, 3, i16::MAX, 4); + let b = _mm_setr_epi16(i16::MIN, -1, i16::MIN, -2, i16::MIN, -3, i16::MIN, -4); + let expected = _mm_setr_epi16( + i16::MIN, + i16::MIN + 1, + i16::MIN + 2, + i16::MIN + 3, + i16::MAX, + i16::MAX - 1, + i16::MAX - 2, + i16::MAX - 3, + ); + let r = _mm_hadd_epi16(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "ssse3")] + unsafe fn test_mm_hadds_epi16() { + let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_setr_epi16(4, 128, 4, 3, 32767, 1, -32768, -1); + let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 32767, -32768); + let r = _mm_hadds_epi16(a, b); + assert_eq_m128i(r, expected); + + // Test saturating on overflow + let a = _mm_setr_epi16(i16::MAX, 1, i16::MAX, 2, i16::MAX, 3, i16::MAX, 4); + let b = _mm_setr_epi16(i16::MIN, -1, i16::MIN, -2, i16::MIN, -3, i16::MIN, -4); + let expected = _mm_setr_epi16( + i16::MAX, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MIN, + ); + let r = _mm_hadds_epi16(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "ssse3")] + unsafe fn test_mm_hadd_epi32() { + let a = _mm_setr_epi32(1, 2, 3, 4); + let b = _mm_setr_epi32(4, 128, 4, 3); + let expected = _mm_setr_epi32(3, 7, 132, 7); + let r = _mm_hadd_epi32(a, b); + assert_eq_m128i(r, expected); + + // Test wrapping on overflow + let a = _mm_setr_epi32(i32::MAX, 1, i32::MAX, 2); + let b = _mm_setr_epi32(i32::MIN, -1, i32::MIN, -2); + let expected = _mm_setr_epi32(i32::MIN, i32::MIN + 1, i32::MAX, i32::MAX - 1); + let r = _mm_hadd_epi32(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "ssse3")] + unsafe fn test_mm_hsub_epi16() { + let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19); + let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 12, -13); + let r = _mm_hsub_epi16(a, b); + assert_eq_m128i(r, expected); + + // Test wrapping on overflow + let a = _mm_setr_epi16(i16::MAX, -1, i16::MAX, -2, i16::MAX, -3, i16::MAX, -4); + let b = _mm_setr_epi16(i16::MIN, 1, i16::MIN, 2, i16::MIN, 3, i16::MIN, 4); + let expected = _mm_setr_epi16( + i16::MIN, + i16::MIN + 1, + i16::MIN + 2, + i16::MIN + 3, + i16::MAX, + i16::MAX - 1, + i16::MAX - 2, + i16::MAX - 3, + ); + let r = _mm_hsub_epi16(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "ssse3")] + unsafe fn test_mm_hsubs_epi16() { + let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let b = 
_mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1); + let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 32767, -32768); + let r = _mm_hsubs_epi16(a, b); + assert_eq_m128i(r, expected); + + // Test saturating on overflow + let a = _mm_setr_epi16(i16::MAX, -1, i16::MAX, -2, i16::MAX, -3, i16::MAX, -4); + let b = _mm_setr_epi16(i16::MIN, 1, i16::MIN, 2, i16::MIN, 3, i16::MIN, 4); + let expected = _mm_setr_epi16( + i16::MAX, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MIN, + ); + let r = _mm_hsubs_epi16(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "ssse3")] + unsafe fn test_mm_hsub_epi32() { + let a = _mm_setr_epi32(1, 2, 3, 4); + let b = _mm_setr_epi32(4, 128, 4, 3); + let expected = _mm_setr_epi32(-1, -1, -124, 1); + let r = _mm_hsub_epi32(a, b); + assert_eq_m128i(r, expected); + + // Test wrapping on overflow + let a = _mm_setr_epi32(i32::MAX, -1, i32::MAX, -2); + let b = _mm_setr_epi32(i32::MIN, 1, i32::MIN, 2); + let expected = _mm_setr_epi32(i32::MIN, i32::MIN + 1, i32::MAX, i32::MAX - 1); + let r = _mm_hsub_epi32(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "ssse3")] + unsafe fn test_mm_maddubs_epi16() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 4, 63, 4, 3, + 24, 12, 6, 19, + 12, 5, 5, 10, + 4, 1, 8, 0, + ); + let expected = _mm_setr_epi16(130, 24, 192, 194, 158, 175, 66, 120); + let r = _mm_maddubs_epi16(a, b); + assert_eq_m128i(r, expected); + + // Test widening and saturation + #[rustfmt::skip] + let a = _mm_setr_epi8( + u8::MAX as i8, u8::MAX as i8, + u8::MAX as i8, u8::MAX as i8, + u8::MAX as i8, u8::MAX as i8, + 100, 100, 0, 0, + 0, 0, 0, 0, 0, 0, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + i8::MAX, i8::MAX, + i8::MAX, i8::MIN, + i8::MIN, i8::MIN, + 50, 15, 0, 0, 0, + 0, 0, 0, 0, 0, + ); + let expected = _mm_setr_epi16(i16::MAX, -255, i16::MIN, 6500, 0, 0, 0, 0); + let r = _mm_maddubs_epi16(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "ssse3")] + unsafe fn test_mm_mulhrs_epi16() { + let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1); + let expected = _mm_setr_epi16(0, 0, 0, 0, 5, 0, -7, 0); + let r = _mm_mulhrs_epi16(a, b); + assert_eq_m128i(r, expected); + + // Test extreme values + let a = _mm_setr_epi16(i16::MAX, i16::MIN, i16::MIN, 0, 0, 0, 0, 0); + let b = _mm_setr_epi16(i16::MAX, i16::MIN, i16::MAX, 0, 0, 0, 0, 0); + let expected = _mm_setr_epi16(i16::MAX - 1, i16::MIN, -i16::MAX, 0, 0, 0, 0, 0); + let r = _mm_mulhrs_epi16(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "ssse3")] + unsafe fn test_mm_sign_epi8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, -14, -15, 16, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 4, 63, -4, 3, 24, 12, -6, -19, + 12, 5, -5, 10, 4, 1, -8, 0, + ); + #[rustfmt::skip] + let expected = _mm_setr_epi8( + 1, 2, -3, 4, 5, 6, -7, -8, + 9, 10, -11, 12, 13, -14, 15, 0, + ); + let r = _mm_sign_epi8(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "ssse3")] + unsafe fn test_mm_sign_epi16() { + let a = _mm_setr_epi16(1, 2, 3, 4, -5, -6, 7, 8); + let b = _mm_setr_epi16(4, 128, 0, 3, 1, -1, -2, 1); + let expected = _mm_setr_epi16(1, 2, 0, 4, -5, 6, -7, 8); + let r = _mm_sign_epi16(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "ssse3")] + unsafe fn 
test_mm_sign_epi32() { + let a = _mm_setr_epi32(-1, 2, 3, 4); + let b = _mm_setr_epi32(1, -1, 1, 0); + let expected = _mm_setr_epi32(-1, -2, 3, 0); + let r = _mm_sign_epi32(a, b); + assert_eq_m128i(r, expected); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/tbm.rs b/library/stdarch/crates/core_arch/src/x86/tbm.rs new file mode 100644 index 0000000000000..a245e693284fb --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/tbm.rs @@ -0,0 +1,225 @@ +//! Trailing Bit Manipulation (TBM) instruction set. +//! +//! The reference is [AMD64 Architecture Programmer's Manual, Volume 3: +//! General-Purpose and System Instructions][amd64_ref]. +//! +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the available +//! instructions. +//! +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wikipedia_bmi]: +//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 + +#[cfg(test)] +use stdarch_test::assert_instr; + +unsafe extern "C" { + #[link_name = "llvm.x86.tbm.bextri.u32"] + fn bextri_u32(a: u32, control: u32) -> u32; +} + +/// Extracts bits of `a` specified by `control` into +/// the least significant bits of the result. +/// +/// Bits `[7,0]` of `control` specify the index to the first bit in the range to +/// be extracted, and bits `[15,8]` specify the length of the range. For any bit +/// position in the specified range that lie beyond the MSB of the source operand, +/// zeroes will be written. If the range is empty, the result is zero. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(bextr, CONTROL = 0x0404))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub unsafe fn _bextri_u32(a: u32) -> u32 { + static_assert_uimm_bits!(CONTROL, 16); + unsafe { bextri_u32(a, CONTROL) } +} + +/// Clears all bits below the least significant zero bit of `x`. +/// +/// If there is no zero bit in `x`, it returns zero. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(blcfill))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _blcfill_u32(x: u32) -> u32 { + x & (x.wrapping_add(1)) +} + +/// Sets all bits of `x` to 1 except for the least significant zero bit. +/// +/// If there is no zero bit in `x`, it sets all bits. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(blci))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _blci_u32(x: u32) -> u32 { + x | !x.wrapping_add(1) +} + +/// Sets the least significant zero bit of `x` and clears all other bits. +/// +/// If there is no zero bit in `x`, it returns zero. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(blcic))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _blcic_u32(x: u32) -> u32 { + !x & x.wrapping_add(1) +} + +/// Sets the least significant zero bit of `x` and clears all bits above +/// that bit. +/// +/// If there is no zero bit in `x`, it sets all the bits. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(blcmsk))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _blcmsk_u32(x: u32) -> u32 { + x ^ x.wrapping_add(1) +} + +/// Sets the least significant zero bit of `x`. +/// +/// If there is no zero bit in `x`, it returns `x`. 
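+///
+/// # Examples
+///
+/// A tiny sketch (illustrative value; the body only runs on CPUs reporting
+/// the AMD `tbm` feature): the lowest zero bit of `0b0101_0001` is bit 1,
+/// and `_blcs_u32` sets it.
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("tbm") {
+/// # #[target_feature(enable = "tbm")]
+/// # unsafe fn worker() {
+/// assert_eq!(unsafe { _blcs_u32(0b0101_0001) }, 0b0101_0011);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```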
+#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(blcs))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _blcs_u32(x: u32) -> u32 { + x | x.wrapping_add(1) +} + +/// Sets all bits of `x` below the least significant one. +/// +/// If there is no set bit in `x`, it sets all the bits. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(blsfill))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _blsfill_u32(x: u32) -> u32 { + x | x.wrapping_sub(1) +} + +/// Clears least significant bit and sets all other bits. +/// +/// If there is no set bit in `x`, it sets all the bits. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(blsic))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _blsic_u32(x: u32) -> u32 { + !x | x.wrapping_sub(1) +} + +/// Clears all bits below the least significant zero of `x` and sets all other +/// bits. +/// +/// If the least significant bit of `x` is `0`, it sets all bits. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(t1mskc))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _t1mskc_u32(x: u32) -> u32 { + !x | x.wrapping_add(1) +} + +/// Sets all bits below the least significant one of `x` and clears all other +/// bits. +/// +/// If the least significant bit of `x` is 1, it returns zero. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(tzmsk))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _tzmsk_u32(x: u32) -> u32 { + !x & x.wrapping_sub(1) +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "tbm")] + unsafe fn test_bextri_u32() { + assert_eq!(_bextri_u32::<0x0404>(0b0101_0000u32), 0b0000_0101u32); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blcfill_u32() { + assert_eq!(_blcfill_u32(0b0101_0111u32), 0b0101_0000u32); + assert_eq!(_blcfill_u32(0b1111_1111u32), 0u32); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blci_u32() { + assert_eq!( + _blci_u32(0b0101_0000u32), + 0b1111_1111_1111_1111_1111_1111_1111_1110u32 + ); + assert_eq!( + _blci_u32(0b1111_1111u32), + 0b1111_1111_1111_1111_1111_1110_1111_1111u32 + ); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blcic_u32() { + assert_eq!(_blcic_u32(0b0101_0001u32), 0b0000_0010u32); + assert_eq!(_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blcmsk_u32() { + assert_eq!(_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32); + assert_eq!(_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blcs_u32() { + assert_eq!(_blcs_u32(0b0101_0001u32), 0b0101_0011u32); + assert_eq!(_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blsfill_u32() { + assert_eq!(_blsfill_u32(0b0101_0100u32), 0b0101_0111u32); + assert_eq!( + _blsfill_u32(0u32), + 0b1111_1111_1111_1111_1111_1111_1111_1111u32 + ); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blsic_u32() { + assert_eq!( + _blsic_u32(0b0101_0100u32), + 0b1111_1111_1111_1111_1111_1111_1111_1011u32 + ); + assert_eq!( + _blsic_u32(0u32), + 0b1111_1111_1111_1111_1111_1111_1111_1111u32 + ); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_t1mskc_u32() { + assert_eq!( + _t1mskc_u32(0b0101_0111u32), + 0b1111_1111_1111_1111_1111_1111_1111_1000u32 + ); + assert_eq!( + _t1mskc_u32(0u32), + 0b1111_1111_1111_1111_1111_1111_1111_1111u32 
+ ); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_tzmsk_u32() { + assert_eq!(_tzmsk_u32(0b0101_1000u32), 0b0000_0111u32); + assert_eq!(_tzmsk_u32(0b0101_1001u32), 0b0000_0000u32); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/test.rs b/library/stdarch/crates/core_arch/src/x86/test.rs new file mode 100644 index 0000000000000..fec25ce2bc7ce --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/test.rs @@ -0,0 +1,168 @@ +//! Utilities used in testing the x86 intrinsics + +use crate::core_arch::x86::*; +use std::mem::transmute; + +#[track_caller] +#[target_feature(enable = "sse2")] +pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) { + assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b)) +} + +#[track_caller] +#[target_feature(enable = "sse2")] +pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) { + if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 { + panic!("{:?} != {:?}", a, b); + } +} + +#[target_feature(enable = "sse2")] +pub unsafe fn get_m128d(a: __m128d, idx: usize) -> f64 { + transmute::<_, [f64; 2]>(a)[idx] +} + +#[track_caller] +#[target_feature(enable = "sse")] +pub unsafe fn assert_eq_m128(a: __m128, b: __m128) { + let r = _mm_cmpeq_ps(a, b); + if _mm_movemask_ps(r) != 0b1111 { + panic!("{:?} != {:?}", a, b); + } +} + +#[target_feature(enable = "sse")] +pub unsafe fn get_m128(a: __m128, idx: usize) -> f32 { + transmute::<_, [f32; 4]>(a)[idx] +} + +#[track_caller] +#[target_feature(enable = "avx512fp16,avx512vl")] +pub unsafe fn assert_eq_m128h(a: __m128h, b: __m128h) { + let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); + if r != 0b1111_1111 { + panic!("{:?} != {:?}", a, b); + } +} + +// not actually an intrinsic but useful in various tests as we proted from +// `i64x2::new` which is backwards from `_mm_set_epi64x` +#[target_feature(enable = "sse2")] +pub unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i { + _mm_set_epi64x(b, a) +} + +#[track_caller] +#[target_feature(enable = "avx")] +pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) { + assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b)) +} + +#[track_caller] +#[target_feature(enable = "avx")] +pub unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) { + let cmp = _mm256_cmp_pd::<_CMP_EQ_OQ>(a, b); + if _mm256_movemask_pd(cmp) != 0b1111 { + panic!("{:?} != {:?}", a, b); + } +} + +#[target_feature(enable = "avx")] +pub unsafe fn get_m256d(a: __m256d, idx: usize) -> f64 { + transmute::<_, [f64; 4]>(a)[idx] +} + +#[track_caller] +#[target_feature(enable = "avx")] +pub unsafe fn assert_eq_m256(a: __m256, b: __m256) { + let cmp = _mm256_cmp_ps::<_CMP_EQ_OQ>(a, b); + if _mm256_movemask_ps(cmp) != 0b11111111 { + panic!("{:?} != {:?}", a, b); + } +} + +#[target_feature(enable = "avx")] +pub unsafe fn get_m256(a: __m256, idx: usize) -> f32 { + transmute::<_, [f32; 8]>(a)[idx] +} + +#[track_caller] +#[target_feature(enable = "avx512fp16,avx512vl")] +pub unsafe fn assert_eq_m256h(a: __m256h, b: __m256h) { + let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); + if r != 0b11111111_11111111 { + panic!("{:?} != {:?}", a, b); + } +} + +#[target_feature(enable = "avx512f")] +pub unsafe fn get_m512(a: __m512, idx: usize) -> f32 { + transmute::<_, [f32; 16]>(a)[idx] +} + +#[target_feature(enable = "avx512f")] +pub unsafe fn get_m512d(a: __m512d, idx: usize) -> f64 { + transmute::<_, [f64; 8]>(a)[idx] +} + +#[target_feature(enable = "avx512f")] +pub unsafe fn get_m512i(a: __m512i, idx: usize) -> i64 { + transmute::<_, [i64; 8]>(a)[idx] +} + +// These intrinsics doesn't exist on x86 b/c 
it requires a 64-bit register, +// which doesn't exist on x86! +#[cfg(target_arch = "x86")] +mod x86_polyfill { + use crate::core_arch::x86::*; + use crate::intrinsics::simd::*; + + #[rustc_legacy_const_generics(2)] + pub unsafe fn _mm_insert_epi64(a: __m128i, val: i64) -> __m128i { + static_assert_uimm_bits!(INDEX, 1); + transmute(simd_insert!(a.as_i64x2(), INDEX as u32, val)) + } + + #[target_feature(enable = "avx2")] + #[rustc_legacy_const_generics(2)] + pub unsafe fn _mm256_insert_epi64(a: __m256i, val: i64) -> __m256i { + static_assert_uimm_bits!(INDEX, 2); + transmute(simd_insert!(a.as_i64x4(), INDEX as u32, val)) + } +} + +#[cfg(target_arch = "x86_64")] +mod x86_polyfill { + pub use crate::core_arch::x86_64::{_mm_insert_epi64, _mm256_insert_epi64}; +} +pub use self::x86_polyfill::*; + +#[track_caller] +pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { + assert_eq!(transmute::<_, [i32; 16]>(a), transmute::<_, [i32; 16]>(b)) +} + +#[track_caller] +pub unsafe fn assert_eq_m512(a: __m512, b: __m512) { + let cmp = _mm512_cmp_ps_mask::<_CMP_EQ_OQ>(a, b); + if cmp != 0b11111111_11111111 { + panic!("{:?} != {:?}", a, b); + } +} + +#[track_caller] +pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) { + let cmp = _mm512_cmp_pd_mask::<_CMP_EQ_OQ>(a, b); + if cmp != 0b11111111 { + panic!("{:?} != {:?}", a, b); + } +} + +#[track_caller] +#[target_feature(enable = "avx512fp16")] +pub unsafe fn assert_eq_m512h(a: __m512h, b: __m512h) { + let r = _mm512_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); + if r != 0b11111111_11111111_11111111_11111111 { + panic!("{:?} != {:?}", a, b); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/vaes.rs b/library/stdarch/crates/core_arch/src/x86/vaes.rs new file mode 100644 index 0000000000000..b1fe193e3f5d7 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/vaes.rs @@ -0,0 +1,340 @@ +//! Vectorized AES Instructions (VAES) +//! +//! The intrinsics here correspond to those in the `immintrin.h` C header. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf + +use crate::core_arch::x86::__m256i; +use crate::core_arch::x86::__m512i; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.aesni.aesenc.256"] + fn aesenc_256(a: __m256i, round_key: __m256i) -> __m256i; + #[link_name = "llvm.x86.aesni.aesenclast.256"] + fn aesenclast_256(a: __m256i, round_key: __m256i) -> __m256i; + #[link_name = "llvm.x86.aesni.aesdec.256"] + fn aesdec_256(a: __m256i, round_key: __m256i) -> __m256i; + #[link_name = "llvm.x86.aesni.aesdeclast.256"] + fn aesdeclast_256(a: __m256i, round_key: __m256i) -> __m256i; + #[link_name = "llvm.x86.aesni.aesenc.512"] + fn aesenc_512(a: __m512i, round_key: __m512i) -> __m512i; + #[link_name = "llvm.x86.aesni.aesenclast.512"] + fn aesenclast_512(a: __m512i, round_key: __m512i) -> __m512i; + #[link_name = "llvm.x86.aesni.aesdec.512"] + fn aesdec_512(a: __m512i, round_key: __m512i) -> __m512i; + #[link_name = "llvm.x86.aesni.aesdeclast.512"] + fn aesdeclast_512(a: __m512i, round_key: __m512i) -> __m512i; +} + +/// Performs one round of an AES encryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. 
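// Illustrative sketch (not from the patch above): the `assert_eq_m128`-style helpers
// in test.rs compare float vectors by reducing a lane-wise equality mask with
// `movemask`, so "all lanes equal" is one specific bit pattern (0b1111 for __m128).
// The helper name below is made up for the example; the intrinsics are std::arch ones.
#[cfg(target_arch = "x86_64")]
fn all_lanes_equal(a: [f32; 4], b: [f32; 4]) -> bool {
    use std::arch::x86_64::{_mm_cmpeq_ps, _mm_loadu_ps, _mm_movemask_ps};
    // SAFETY: SSE is part of the x86_64 baseline, so these intrinsics are always usable.
    unsafe {
        let va = _mm_loadu_ps(a.as_ptr());
        let vb = _mm_loadu_ps(b.as_ptr());
        // Every equal lane contributes one bit to the 4-bit mask.
        _mm_movemask_ps(_mm_cmpeq_ps(va, vb)) == 0b1111
    }
}

#[cfg(target_arch = "x86_64")]
fn main() {
    assert!(all_lanes_equal([1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]));
    assert!(!all_lanes_equal([1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 0.0]));
}
#[cfg(not(target_arch = "x86_64"))]
fn main() {}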
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_aesenc_epi128) +#[inline] +#[target_feature(enable = "vaes")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesenc))] +pub fn _mm256_aesenc_epi128(a: __m256i, round_key: __m256i) -> __m256i { + unsafe { aesenc_256(a, round_key) } +} + +/// Performs the last round of an AES encryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_aesenclast_epi128) +#[inline] +#[target_feature(enable = "vaes")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesenclast))] +pub fn _mm256_aesenclast_epi128(a: __m256i, round_key: __m256i) -> __m256i { + unsafe { aesenclast_256(a, round_key) } +} + +/// Performs one round of an AES decryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_aesdec_epi128) +#[inline] +#[target_feature(enable = "vaes")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesdec))] +pub fn _mm256_aesdec_epi128(a: __m256i, round_key: __m256i) -> __m256i { + unsafe { aesdec_256(a, round_key) } +} + +/// Performs the last round of an AES decryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_aesdeclast_epi128) +#[inline] +#[target_feature(enable = "vaes")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesdeclast))] +pub fn _mm256_aesdeclast_epi128(a: __m256i, round_key: __m256i) -> __m256i { + unsafe { aesdeclast_256(a, round_key) } +} + +/// Performs one round of an AES encryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_aesenc_epi128) +#[inline] +#[target_feature(enable = "vaes,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesenc))] +pub fn _mm512_aesenc_epi128(a: __m512i, round_key: __m512i) -> __m512i { + unsafe { aesenc_512(a, round_key) } +} + +/// Performs the last round of an AES encryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_aesenclast_epi128) +#[inline] +#[target_feature(enable = "vaes,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesenclast))] +pub fn _mm512_aesenclast_epi128(a: __m512i, round_key: __m512i) -> __m512i { + unsafe { aesenclast_512(a, round_key) } +} + +/// Performs one round of an AES decryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. 
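// Illustrative usage sketch (not from the patch above): running one AES round on two
// independent 128-bit states at once with `_mm256_aesenc_epi128`. Assumes a CPU that
// reports AVX2 + VAES at runtime and a toolchain where these intrinsics are available
// (the attributes above mark them stable since 1.89). The function name is made up.
#[cfg(target_arch = "x86_64")]
fn two_state_aes_round(states: [u64; 4], round_keys: [u64; 4]) -> [u64; 4] {
    use std::arch::x86_64::*;
    assert!(is_x86_feature_detected!("avx2") && is_x86_feature_detected!("vaes"));
    // SAFETY: the required CPU features were just verified at runtime.
    unsafe {
        let s = _mm256_loadu_si256(states.as_ptr() as *const __m256i);
        let k = _mm256_loadu_si256(round_keys.as_ptr() as *const __m256i);
        // Each 128-bit lane is an independent (state, round key) pair.
        let r = _mm256_aesenc_epi128(s, k);
        let mut out = [0u64; 4];
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
        out
    }
}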
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_aesdec_epi128) +#[inline] +#[target_feature(enable = "vaes,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesdec))] +pub fn _mm512_aesdec_epi128(a: __m512i, round_key: __m512i) -> __m512i { + unsafe { aesdec_512(a, round_key) } +} + +/// Performs the last round of an AES decryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_aesdeclast_epi128) +#[inline] +#[target_feature(enable = "vaes,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesdeclast))] +pub fn _mm512_aesdeclast_epi128(a: __m512i, round_key: __m512i) -> __m512i { + unsafe { aesdeclast_512(a, round_key) } +} + +#[cfg(test)] +mod tests { + // The constants in the tests below are just bit patterns. They should not + // be interpreted as integers; signedness does not make sense for them, but + // __mXXXi happens to be defined in terms of signed integers. + #![allow(overflowing_literals)] + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + // the first parts of these tests are straight ports from the AES-NI tests + // the second parts directly compare the two, for inputs that are different across lanes + // and "more random" than the standard test vectors + // ideally we'd be using quickcheck here instead + + #[target_feature(enable = "avx2")] + unsafe fn helper_for_256_vaes( + linear: unsafe fn(__m128i, __m128i) -> __m128i, + vectorized: unsafe fn(__m256i, __m256i) -> __m256i, + ) { + let a = _mm256_set_epi64x( + 0xDCB4DB3657BF0B7D, + 0x18DB0601068EDD9F, + 0xB76B908233200DC5, + 0xE478235FA8E22D5E, + ); + let k = _mm256_set_epi64x( + 0x672F6F105A94CEA7, + 0x8298B8FFCA5F829C, + 0xA3927047B3FB61D8, + 0x978093862CDE7187, + ); + let mut a_decomp = [_mm_setzero_si128(); 2]; + a_decomp[0] = _mm256_extracti128_si256::<0>(a); + a_decomp[1] = _mm256_extracti128_si256::<1>(a); + let mut k_decomp = [_mm_setzero_si128(); 2]; + k_decomp[0] = _mm256_extracti128_si256::<0>(k); + k_decomp[1] = _mm256_extracti128_si256::<1>(k); + let r = vectorized(a, k); + let mut e_decomp = [_mm_setzero_si128(); 2]; + for i in 0..2 { + e_decomp[i] = linear(a_decomp[i], k_decomp[i]); + } + assert_eq_m128i(_mm256_extracti128_si256::<0>(r), e_decomp[0]); + assert_eq_m128i(_mm256_extracti128_si256::<1>(r), e_decomp[1]); + } + + #[target_feature(enable = "sse2")] + unsafe fn setup_state_key(broadcast: unsafe fn(__m128i) -> T) -> (T, T) { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx. + let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); + let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee); + (broadcast(a), broadcast(k)) + } + + #[target_feature(enable = "avx2")] + unsafe fn setup_state_key_256() -> (__m256i, __m256i) { + setup_state_key(_mm256_broadcastsi128_si256) + } + + #[target_feature(enable = "avx512f")] + unsafe fn setup_state_key_512() -> (__m512i, __m512i) { + setup_state_key(_mm512_broadcast_i32x4) + } + + #[simd_test(enable = "vaes,avx512vl")] + unsafe fn test_mm256_aesdec_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx. 
+ let (a, k) = setup_state_key_256(); + let e = _mm_set_epi64x(0x044e4f5176fec48f, 0xb57ecfa381da39ee); + let e = _mm256_broadcastsi128_si256(e); + let r = _mm256_aesdec_epi128(a, k); + assert_eq_m256i(r, e); + + helper_for_256_vaes(_mm_aesdec_si128, _mm256_aesdec_epi128); + } + + #[simd_test(enable = "vaes,avx512vl")] + unsafe fn test_mm256_aesdeclast_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714178.aspx. + let (a, k) = setup_state_key_256(); + let e = _mm_set_epi64x(0x36cad57d9072bf9e, 0xf210dd981fa4a493); + let e = _mm256_broadcastsi128_si256(e); + let r = _mm256_aesdeclast_epi128(a, k); + assert_eq_m256i(r, e); + + helper_for_256_vaes(_mm_aesdeclast_si128, _mm256_aesdeclast_epi128); + } + + #[simd_test(enable = "vaes,avx512vl")] + unsafe fn test_mm256_aesenc_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc664810.aspx. + // they are repeated appropriately + let (a, k) = setup_state_key_256(); + let e = _mm_set_epi64x(0x16ab0e57dfc442ed, 0x28e4ee1884504333); + let e = _mm256_broadcastsi128_si256(e); + let r = _mm256_aesenc_epi128(a, k); + assert_eq_m256i(r, e); + + helper_for_256_vaes(_mm_aesenc_si128, _mm256_aesenc_epi128); + } + + #[simd_test(enable = "vaes,avx512vl")] + unsafe fn test_mm256_aesenclast_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714136.aspx. + let (a, k) = setup_state_key_256(); + let e = _mm_set_epi64x(0xb6dd7df25d7ab320, 0x4b04f98cf4c860f8); + let e = _mm256_broadcastsi128_si256(e); + let r = _mm256_aesenclast_epi128(a, k); + assert_eq_m256i(r, e); + + helper_for_256_vaes(_mm_aesenclast_si128, _mm256_aesenclast_epi128); + } + + #[target_feature(enable = "avx512f")] + unsafe fn helper_for_512_vaes( + linear: unsafe fn(__m128i, __m128i) -> __m128i, + vectorized: unsafe fn(__m512i, __m512i) -> __m512i, + ) { + let a = _mm512_set_epi64( + 0xDCB4DB3657BF0B7D, + 0x18DB0601068EDD9F, + 0xB76B908233200DC5, + 0xE478235FA8E22D5E, + 0xAB05CFFA2621154C, + 0x1171B47A186174C9, + 0x8C6B6C0E7595CEC9, + 0xBE3E7D4934E961BD, + ); + let k = _mm512_set_epi64( + 0x672F6F105A94CEA7, + 0x8298B8FFCA5F829C, + 0xA3927047B3FB61D8, + 0x978093862CDE7187, + 0xB1927AB22F31D0EC, + 0xA9A5DA619BE4D7AF, + 0xCA2590F56884FDC6, + 0x19BE9F660038BDB5, + ); + let mut a_decomp = [_mm_setzero_si128(); 4]; + a_decomp[0] = _mm512_extracti32x4_epi32::<0>(a); + a_decomp[1] = _mm512_extracti32x4_epi32::<1>(a); + a_decomp[2] = _mm512_extracti32x4_epi32::<2>(a); + a_decomp[3] = _mm512_extracti32x4_epi32::<3>(a); + let mut k_decomp = [_mm_setzero_si128(); 4]; + k_decomp[0] = _mm512_extracti32x4_epi32::<0>(k); + k_decomp[1] = _mm512_extracti32x4_epi32::<1>(k); + k_decomp[2] = _mm512_extracti32x4_epi32::<2>(k); + k_decomp[3] = _mm512_extracti32x4_epi32::<3>(k); + let r = vectorized(a, k); + let mut e_decomp = [_mm_setzero_si128(); 4]; + for i in 0..4 { + e_decomp[i] = linear(a_decomp[i], k_decomp[i]); + } + assert_eq_m128i(_mm512_extracti32x4_epi32::<0>(r), e_decomp[0]); + assert_eq_m128i(_mm512_extracti32x4_epi32::<1>(r), e_decomp[1]); + assert_eq_m128i(_mm512_extracti32x4_epi32::<2>(r), e_decomp[2]); + assert_eq_m128i(_mm512_extracti32x4_epi32::<3>(r), e_decomp[3]); + } + + #[simd_test(enable = "vaes,avx512f")] + unsafe fn test_mm512_aesdec_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx. 
+ let (a, k) = setup_state_key_512(); + let e = _mm_set_epi64x(0x044e4f5176fec48f, 0xb57ecfa381da39ee); + let e = _mm512_broadcast_i32x4(e); + let r = _mm512_aesdec_epi128(a, k); + assert_eq_m512i(r, e); + + helper_for_512_vaes(_mm_aesdec_si128, _mm512_aesdec_epi128); + } + + #[simd_test(enable = "vaes,avx512f")] + unsafe fn test_mm512_aesdeclast_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714178.aspx. + let (a, k) = setup_state_key_512(); + let e = _mm_set_epi64x(0x36cad57d9072bf9e, 0xf210dd981fa4a493); + let e = _mm512_broadcast_i32x4(e); + let r = _mm512_aesdeclast_epi128(a, k); + assert_eq_m512i(r, e); + + helper_for_512_vaes(_mm_aesdeclast_si128, _mm512_aesdeclast_epi128); + } + + #[simd_test(enable = "vaes,avx512f")] + unsafe fn test_mm512_aesenc_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc664810.aspx. + let (a, k) = setup_state_key_512(); + let e = _mm_set_epi64x(0x16ab0e57dfc442ed, 0x28e4ee1884504333); + let e = _mm512_broadcast_i32x4(e); + let r = _mm512_aesenc_epi128(a, k); + assert_eq_m512i(r, e); + + helper_for_512_vaes(_mm_aesenc_si128, _mm512_aesenc_epi128); + } + + #[simd_test(enable = "vaes,avx512f")] + unsafe fn test_mm512_aesenclast_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714136.aspx. + let (a, k) = setup_state_key_512(); + let e = _mm_set_epi64x(0xb6dd7df25d7ab320, 0x4b04f98cf4c860f8); + let e = _mm512_broadcast_i32x4(e); + let r = _mm512_aesenclast_epi128(a, k); + assert_eq_m512i(r, e); + + helper_for_512_vaes(_mm_aesenclast_si128, _mm512_aesenclast_epi128); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/vpclmulqdq.rs b/library/stdarch/crates/core_arch/src/x86/vpclmulqdq.rs new file mode 100644 index 0000000000000..b1f23bd2f45c1 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/vpclmulqdq.rs @@ -0,0 +1,260 @@ +//! Vectorized Carry-less Multiplication (VCLMUL) +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241). +//! +//! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf + +use crate::core_arch::x86::__m256i; +use crate::core_arch::x86::__m512i; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.pclmulqdq.256"] + fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i; + #[link_name = "llvm.x86.pclmulqdq.512"] + fn pclmulqdq_512(a: __m512i, round_key: __m512i, imm8: u8) -> __m512i; +} + +// for some odd reason on x86_64 we generate the correct long name instructions +// but on i686 we generate the short name + imm8 +// so we need to special-case on that... + +/// Performs a carry-less multiplication of two 64-bit polynomials over the +/// finite field GF(2) - in each of the 4 128-bit lanes. +/// +/// The immediate byte is used for determining which halves of each lane `a` and `b` +/// should be used. Immediate bits other than 0 and 4 are ignored. +/// All lanes share immediate byte. 
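// Reference sketch (not from the patch above): what each 128-bit lane of VPCLMULQDQ
// computes, written as plain Rust. Carry-less multiplication treats the operands as
// polynomials over GF(2): partial products are combined with XOR instead of addition.
fn clmul_u64(a: u64, b: u64) -> u128 {
    let a = a as u128;
    let mut acc = 0u128;
    for i in 0..64 {
        // XOR in a shifted copy of `a` for every set bit of `b`; no carries propagate.
        if (b >> i) & 1 == 1 {
            acc ^= a << i;
        }
    }
    acc
}

fn main() {
    // Multiplying by x (0b10) shifts the polynomial up by one.
    assert_eq!(clmul_u64(0b1011, 0b10), 0b1_0110);
    // (x + 1)^2 = x^2 + 1 over GF(2): the cross terms cancel.
    assert_eq!(clmul_u64(0b11, 0b11), 0b101);
}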
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_clmulepi64_epi128) +#[inline] +#[target_feature(enable = "vpclmulqdq,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +// technically according to Intel's documentation we don't need avx512f here, however LLVM gets confused otherwise +#[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_clmulepi64_epi128(a: __m512i, b: __m512i) -> __m512i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pclmulqdq_512(a, b, IMM8 as u8) } +} + +/// Performs a carry-less multiplication of two 64-bit polynomials over the +/// finite field GF(2) - in each of the 2 128-bit lanes. +/// +/// The immediate byte is used for determining which halves of each lane `a` and `b` +/// should be used. Immediate bits other than 0 and 4 are ignored. +/// All lanes share immediate byte. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_clmulepi64_epi128) +#[inline] +#[target_feature(enable = "vpclmulqdq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_clmulepi64_epi128(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pclmulqdq_256(a, b, IMM8 as u8) } +} + +#[cfg(test)] +mod tests { + // The constants in the tests below are just bit patterns. They should not + // be interpreted as integers; signedness does not make sense for them, but + // __mXXXi happens to be defined in terms of signed integers. + #![allow(overflowing_literals)] + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + macro_rules! verify_kat_pclmul { + ($broadcast:ident, $clmul:ident, $assert:ident) => { + // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf + let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d); + let a = $broadcast(a); + let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d); + let b = $broadcast(b); + let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451); + let r00 = $broadcast(r00); + let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315); + let r01 = $broadcast(r01); + let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9); + let r10 = $broadcast(r10); + let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed); + let r11 = $broadcast(r11); + + $assert($clmul::<0x00>(a, b), r00); + $assert($clmul::<0x10>(a, b), r01); + $assert($clmul::<0x01>(a, b), r10); + $assert($clmul::<0x11>(a, b), r11); + + let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000); + let a0 = $broadcast(a0); + let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000); + let r = $broadcast(r); + $assert($clmul::<0x00>(a0, a0), r); + } + } + + macro_rules! unroll { + ($target:ident[4] = $op:ident::<4>($source:ident);) => { + $target[3] = $op::<3>($source); + $target[2] = $op::<2>($source); + unroll! {$target[2] = $op::<2>($source);} + }; + ($target:ident[2] = $op:ident::<2>($source:ident);) => { + $target[1] = $op::<1>($source); + $target[0] = $op::<0>($source); + }; + (assert_eq_m128i($op:ident::<4>($vec_res:ident),$lin_res:ident[4]);) => { + assert_eq_m128i($op::<3>($vec_res), $lin_res[3]); + assert_eq_m128i($op::<2>($vec_res), $lin_res[2]); + unroll! 
{assert_eq_m128i($op::<2>($vec_res),$lin_res[2]);} + }; + (assert_eq_m128i($op:ident::<2>($vec_res:ident),$lin_res:ident[2]);) => { + assert_eq_m128i($op::<1>($vec_res), $lin_res[1]); + assert_eq_m128i($op::<0>($vec_res), $lin_res[0]); + }; + } + + // this function tests one of the possible 4 instances + // with different inputs across lanes + #[target_feature(enable = "vpclmulqdq,avx512f")] + unsafe fn verify_512_helper( + linear: unsafe fn(__m128i, __m128i) -> __m128i, + vectorized: unsafe fn(__m512i, __m512i) -> __m512i, + ) { + let a = _mm512_set_epi64( + 0xDCB4DB3657BF0B7D, + 0x18DB0601068EDD9F, + 0xB76B908233200DC5, + 0xE478235FA8E22D5E, + 0xAB05CFFA2621154C, + 0x1171B47A186174C9, + 0x8C6B6C0E7595CEC9, + 0xBE3E7D4934E961BD, + ); + let b = _mm512_set_epi64( + 0x672F6F105A94CEA7, + 0x8298B8FFCA5F829C, + 0xA3927047B3FB61D8, + 0x978093862CDE7187, + 0xB1927AB22F31D0EC, + 0xA9A5DA619BE4D7AF, + 0xCA2590F56884FDC6, + 0x19BE9F660038BDB5, + ); + + let mut a_decomp = [_mm_setzero_si128(); 4]; + unroll! {a_decomp[4] = _mm512_extracti32x4_epi32::<4>(a);} + let mut b_decomp = [_mm_setzero_si128(); 4]; + unroll! {b_decomp[4] = _mm512_extracti32x4_epi32::<4>(b);} + + let r = vectorized(a, b); + let mut e_decomp = [_mm_setzero_si128(); 4]; + for i in 0..4 { + e_decomp[i] = linear(a_decomp[i], b_decomp[i]); + } + unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32::<4>(r),e_decomp[4]);} + } + + // this function tests one of the possible 4 instances + // with different inputs across lanes for the VL version + #[target_feature(enable = "vpclmulqdq,avx512vl")] + unsafe fn verify_256_helper( + linear: unsafe fn(__m128i, __m128i) -> __m128i, + vectorized: unsafe fn(__m256i, __m256i) -> __m256i, + ) { + let a = _mm512_set_epi64( + 0xDCB4DB3657BF0B7D, + 0x18DB0601068EDD9F, + 0xB76B908233200DC5, + 0xE478235FA8E22D5E, + 0xAB05CFFA2621154C, + 0x1171B47A186174C9, + 0x8C6B6C0E7595CEC9, + 0xBE3E7D4934E961BD, + ); + let b = _mm512_set_epi64( + 0x672F6F105A94CEA7, + 0x8298B8FFCA5F829C, + 0xA3927047B3FB61D8, + 0x978093862CDE7187, + 0xB1927AB22F31D0EC, + 0xA9A5DA619BE4D7AF, + 0xCA2590F56884FDC6, + 0x19BE9F660038BDB5, + ); + + let mut a_decomp = [_mm_setzero_si128(); 2]; + unroll! {a_decomp[2] = _mm512_extracti32x4_epi32::<2>(a);} + let mut b_decomp = [_mm_setzero_si128(); 2]; + unroll! {b_decomp[2] = _mm512_extracti32x4_epi32::<2>(b);} + + let r = vectorized( + _mm512_extracti64x4_epi64::<0>(a), + _mm512_extracti64x4_epi64::<0>(b), + ); + let mut e_decomp = [_mm_setzero_si128(); 2]; + for i in 0..2 { + e_decomp[i] = linear(a_decomp[i], b_decomp[i]); + } + unroll! 
{assert_eq_m128i(_mm256_extracti128_si256::<2>(r),e_decomp[2]);} + } + + #[simd_test(enable = "vpclmulqdq,avx512f")] + unsafe fn test_mm512_clmulepi64_epi128() { + verify_kat_pclmul!( + _mm512_broadcast_i32x4, + _mm512_clmulepi64_epi128, + assert_eq_m512i + ); + + verify_512_helper( + |a, b| _mm_clmulepi64_si128::<0x00>(a, b), + |a, b| _mm512_clmulepi64_epi128::<0x00>(a, b), + ); + verify_512_helper( + |a, b| _mm_clmulepi64_si128::<0x01>(a, b), + |a, b| _mm512_clmulepi64_epi128::<0x01>(a, b), + ); + verify_512_helper( + |a, b| _mm_clmulepi64_si128::<0x10>(a, b), + |a, b| _mm512_clmulepi64_epi128::<0x10>(a, b), + ); + verify_512_helper( + |a, b| _mm_clmulepi64_si128::<0x11>(a, b), + |a, b| _mm512_clmulepi64_epi128::<0x11>(a, b), + ); + } + + #[simd_test(enable = "vpclmulqdq,avx512vl")] + unsafe fn test_mm256_clmulepi64_epi128() { + verify_kat_pclmul!( + _mm256_broadcastsi128_si256, + _mm256_clmulepi64_epi128, + assert_eq_m256i + ); + + verify_256_helper( + |a, b| _mm_clmulepi64_si128::<0x00>(a, b), + |a, b| _mm256_clmulepi64_epi128::<0x00>(a, b), + ); + verify_256_helper( + |a, b| _mm_clmulepi64_si128::<0x01>(a, b), + |a, b| _mm256_clmulepi64_epi128::<0x01>(a, b), + ); + verify_256_helper( + |a, b| _mm_clmulepi64_si128::<0x10>(a, b), + |a, b| _mm256_clmulepi64_epi128::<0x10>(a, b), + ); + verify_256_helper( + |a, b| _mm_clmulepi64_si128::<0x11>(a, b), + |a, b| _mm256_clmulepi64_epi128::<0x11>(a, b), + ); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/xsave.rs b/library/stdarch/crates/core_arch/src/x86/xsave.rs new file mode 100644 index 0000000000000..10266662e13ec --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/xsave.rs @@ -0,0 +1,233 @@ +//! `i586`'s `xsave` and `xsaveopt` target feature intrinsics +#![allow(clippy::module_name_repetitions)] + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.xsave"] + fn xsave(p: *mut u8, hi: u32, lo: u32); + #[link_name = "llvm.x86.xrstor"] + fn xrstor(p: *const u8, hi: u32, lo: u32); + #[link_name = "llvm.x86.xsetbv"] + fn xsetbv(v: u32, hi: u32, lo: u32); + #[link_name = "llvm.x86.xgetbv"] + fn xgetbv(v: u32) -> i64; + #[link_name = "llvm.x86.xsaveopt"] + fn xsaveopt(p: *mut u8, hi: u32, lo: u32); + #[link_name = "llvm.x86.xsavec"] + fn xsavec(p: *mut u8, hi: u32, lo: u32); + #[link_name = "llvm.x86.xsaves"] + fn xsaves(p: *mut u8, hi: u32, lo: u32); + #[link_name = "llvm.x86.xrstors"] + fn xrstors(p: *const u8, hi: u32, lo: u32); +} + +/// Performs a full or partial save of the enabled processor states to memory at +/// `mem_addr`. +/// +/// State is saved based on bits `[62:0]` in `save_mask` and XCR0. +/// `mem_addr` must be aligned on a 64-byte boundary. +/// +/// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of +/// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsave) +#[inline] +#[target_feature(enable = "xsave")] +#[cfg_attr(test, assert_instr(xsave))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) { + xsave(mem_addr, (save_mask >> 32) as u32, save_mask as u32); +} + +/// Performs a full or partial restore of the enabled processor states using +/// the state information stored in memory at `mem_addr`. +/// +/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and +/// `mem_addr.HEADER.XSTATE_BV`. 
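// Minimal sketch (not from the patch above): the wrappers in xsave.rs hand the 64-bit
// state mask to the LLVM intrinsics as two 32-bit halves; this is the exact split.
fn split_mask(save_mask: u64) -> (u32, u32) {
    // (hi, lo) = bits 63..32 and 31..0 of the requested-feature bitmap.
    ((save_mask >> 32) as u32, save_mask as u32)
}

fn main() {
    // Requesting x87 + SSE + AVX state (XCR0 bits 0..=2) only populates the low half.
    assert_eq!(split_mask(0b111), (0, 0b111));
    assert_eq!(split_mask(u64::MAX), (u32::MAX, u32::MAX));
}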
`mem_addr` must be aligned on a 64-byte +/// boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xrstor) +#[inline] +#[target_feature(enable = "xsave")] +#[cfg_attr(test, assert_instr(xrstor))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) { + xrstor(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32); +} + +/// `XFEATURE_ENABLED_MASK` for `XCR` +/// +/// This intrinsic maps to `XSETBV` instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _XCR_XFEATURE_ENABLED_MASK: u32 = 0; + +/// Copies 64-bits from `val` to the extended control register (`XCR`) specified +/// by `a`. +/// +/// Currently only `XFEATURE_ENABLED_MASK` `XCR` is supported. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsetbv) +#[inline] +#[target_feature(enable = "xsave")] +#[cfg_attr(test, assert_instr(xsetbv))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xsetbv(a: u32, val: u64) { + xsetbv(a, (val >> 32) as u32, val as u32); +} + +/// Reads the contents of the extended control register `XCR` +/// specified in `xcr_no`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xgetbv) +#[inline] +#[target_feature(enable = "xsave")] +#[cfg_attr(test, assert_instr(xgetbv))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xgetbv(xcr_no: u32) -> u64 { + xgetbv(xcr_no) as u64 +} + +/// Performs a full or partial save of the enabled processor states to memory at +/// `mem_addr`. +/// +/// State is saved based on bits `[62:0]` in `save_mask` and `XCR0`. +/// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize +/// the manner in which data is saved. The performance of this instruction will +/// be equal to or better than using the `XSAVE` instruction. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsaveopt) +#[inline] +#[target_feature(enable = "xsave,xsaveopt")] +#[cfg_attr(test, assert_instr(xsaveopt))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) { + xsaveopt(mem_addr, (save_mask >> 32) as u32, save_mask as u32); +} + +/// Performs a full or partial save of the enabled processor states to memory +/// at `mem_addr`. +/// +/// `xsavec` differs from `xsave` in that it uses compaction and that it may +/// use init optimization. State is saved based on bits `[62:0]` in `save_mask` +/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsavec) +#[inline] +#[target_feature(enable = "xsave,xsavec")] +#[cfg_attr(test, assert_instr(xsavec))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) { + xsavec(mem_addr, (save_mask >> 32) as u32, save_mask as u32); +} + +/// Performs a full or partial save of the enabled processor states to memory at +/// `mem_addr` +/// +/// `xsaves` differs from xsave in that it can save state components +/// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the +/// modified optimization. State is saved based on bits `[62:0]` in `save_mask` +/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. 
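// Illustrative sketch (not from the patch above): reading XCR0 through `_xgetbv` and
// decoding its low bits. The bit positions (0 = x87, 1 = SSE, 2 = AVX) follow the
// XCR0 layout in the Intel SDM; the runtime check makes the unsafe call sound.
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::{_XCR_XFEATURE_ENABLED_MASK, _xgetbv};
    if is_x86_feature_detected!("xsave") {
        // SAFETY: the `xsave` feature was verified at runtime.
        let xcr0 = unsafe { _xgetbv(_XCR_XFEATURE_ENABLED_MASK) };
        println!("x87 state enabled: {}", xcr0 & 0b001 != 0);
        println!("SSE state enabled: {}", xcr0 & 0b010 != 0);
        println!("AVX state enabled: {}", xcr0 & 0b100 != 0);
    }
}
#[cfg(not(target_arch = "x86_64"))]
fn main() {}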
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsaves) +#[inline] +#[target_feature(enable = "xsave,xsaves")] +#[cfg_attr(test, assert_instr(xsaves))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) { + xsaves(mem_addr, (save_mask >> 32) as u32, save_mask as u32); +} + +/// Performs a full or partial restore of the enabled processor states using the +/// state information stored in memory at `mem_addr`. +/// +/// `xrstors` differs from `xrstor` in that it can restore state components +/// corresponding to bits set in the `IA32_XSS` `MSR`; `xrstors` cannot restore +/// from an `xsave` area in which the extended region is in the standard form. +/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and +/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte +/// boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xrstors) +#[inline] +#[target_feature(enable = "xsave,xsaves")] +#[cfg_attr(test, assert_instr(xrstors))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xrstors(mem_addr: *const u8, rs_mask: u64) { + xrstors(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32); +} + +#[cfg(test)] +mod tests { + use std::{fmt, prelude::v1::*}; + + use crate::core_arch::x86::*; + use stdarch_test::simd_test; + + #[repr(align(64))] + #[derive(Debug)] + struct XsaveArea { + // max size for 256-bit registers is 800 bytes: + // see https://software.intel.com/en-us/node/682996 + // max size for 512-bit registers is 2560 bytes: + // FIXME: add source + data: [u8; 2560], + } + + impl XsaveArea { + fn new() -> XsaveArea { + XsaveArea { data: [0; 2560] } + } + fn ptr(&mut self) -> *mut u8 { + self.data.as_mut_ptr() + } + } + + #[simd_test(enable = "xsave")] + #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri + unsafe fn test_xsave() { + let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers + let mut a = XsaveArea::new(); + let mut b = XsaveArea::new(); + + _xsave(a.ptr(), m); + _xrstor(a.ptr(), m); + _xsave(b.ptr(), m); + } + + #[simd_test(enable = "xsave")] + #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri + unsafe fn test_xgetbv() { + let xcr_n: u32 = _XCR_XFEATURE_ENABLED_MASK; + + let xcr: u64 = _xgetbv(xcr_n); + let xcr_cpy: u64 = _xgetbv(xcr_n); + assert_eq!(xcr, xcr_cpy); + } + + #[simd_test(enable = "xsave,xsaveopt")] + #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri + unsafe fn test_xsaveopt() { + let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers + let mut a = XsaveArea::new(); + let mut b = XsaveArea::new(); + + _xsaveopt(a.ptr(), m); + _xrstor(a.ptr(), m); + _xsaveopt(b.ptr(), m); + } + + #[simd_test(enable = "xsave,xsavec")] + #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri + unsafe fn test_xsavec() { + let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers + let mut a = XsaveArea::new(); + let mut b = XsaveArea::new(); + + _xsavec(a.ptr(), m); + _xrstor(a.ptr(), m); + _xsavec(b.ptr(), m); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/abm.rs b/library/stdarch/crates/core_arch/src/x86_64/abm.rs new file mode 100644 index 0000000000000..bf59cc4632182 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/abm.rs @@ -0,0 +1,62 @@ +//! Advanced Bit Manipulation (ABM) instructions +//! +//! 
The POPCNT and LZCNT have their own CPUID bits to indicate support. +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +//! System Instructions][amd64_ref]. +//! +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions +//! available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wikipedia_bmi]: +//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Counts the leading most significant zero bits. +/// +/// When the operand is zero, it returns its size in bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_lzcnt_u64) +#[inline] +#[target_feature(enable = "lzcnt")] +#[cfg_attr(test, assert_instr(lzcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _lzcnt_u64(x: u64) -> u64 { + x.leading_zeros() as u64 +} + +/// Counts the bits that are set. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_popcnt64) +#[inline] +#[target_feature(enable = "popcnt")] +#[cfg_attr(test, assert_instr(popcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _popcnt64(x: i64) -> i32 { + x.count_ones() as i32 +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::arch::x86_64::*; + + #[simd_test(enable = "lzcnt")] + unsafe fn test_lzcnt_u64() { + assert_eq!(_lzcnt_u64(0b0101_1010), 57); + } + + #[simd_test(enable = "popcnt")] + unsafe fn test_popcnt64() { + assert_eq!(_popcnt64(0b0101_1010), 4); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/adx.rs b/library/stdarch/crates/core_arch/src/x86_64/adx.rs new file mode 100644 index 0000000000000..bdc534b5a525b --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/adx.rs @@ -0,0 +1,154 @@ +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.x86.addcarry.64"] + fn llvm_addcarry_u64(a: u8, b: u64, c: u64) -> (u8, u64); + #[link_name = "llvm.x86.addcarryx.u64"] + fn llvm_addcarryx_u64(a: u8, b: u64, c: u64, d: *mut u64) -> u8; + #[link_name = "llvm.x86.subborrow.64"] + fn llvm_subborrow_u64(a: u8, b: u64, c: u64) -> (u8, u64); +} + +/// Adds unsigned 64-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in` +/// (carry or overflow flag), and store the unsigned 64-bit result in `out`, and the carry-out +/// is returned (carry or overflow flag). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_addcarry_u64) +#[inline] +#[cfg_attr(test, assert_instr(adc))] +#[stable(feature = "simd_x86_adx", since = "1.33.0")] +pub unsafe fn _addcarry_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 { + let (a, b) = llvm_addcarry_u64(c_in, a, b); + *out = b; + a +} + +/// Adds unsigned 64-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in` +/// (carry or overflow flag), and store the unsigned 64-bit result in `out`, and +/// the carry-out is returned (carry or overflow flag). 
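// Illustrative sketch (not from the patch above): chaining `_addcarry_u64` to add two
// 128-bit values held as (low, high) u64 limbs, which is the typical use of the
// carry-in/carry-out byte described above. The function name is made up.
#[cfg(target_arch = "x86_64")]
fn add_u128(a: [u64; 2], b: [u64; 2]) -> ([u64; 2], bool) {
    use std::arch::x86_64::_addcarry_u64;
    let (mut lo, mut hi) = (0u64, 0u64);
    // SAFETY: no target feature is required; ADC is part of the x86_64 baseline.
    let carry = unsafe {
        let c = _addcarry_u64(0, a[0], b[0], &mut lo);
        _addcarry_u64(c, a[1], b[1], &mut hi)
    };
    ([lo, hi], carry != 0)
}

#[cfg(target_arch = "x86_64")]
fn main() {
    // (2^64 - 1) + 1: the carry propagates into the high limb.
    assert_eq!(add_u128([u64::MAX, 0], [1, 0]), ([0, 1], false));
    // Overflow of the full 128-bit sum is reported through the final carry-out.
    assert_eq!(add_u128([u64::MAX, u64::MAX], [1, 0]), ([0, 0], true));
}
#[cfg(not(target_arch = "x86_64"))]
fn main() {}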
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_addcarryx_u64) +#[inline] +#[target_feature(enable = "adx")] +#[cfg_attr(test, assert_instr(adc))] +#[stable(feature = "simd_x86_adx", since = "1.33.0")] +pub unsafe fn _addcarryx_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 { + llvm_addcarryx_u64(c_in, a, b, out as *mut _) +} + +/// Adds unsigned 64-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in`. +/// (carry or overflow flag), and store the unsigned 64-bit result in `out`, and +/// the carry-out is returned (carry or overflow flag). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_subborrow_u64) +#[inline] +#[cfg_attr(test, assert_instr(sbb))] +#[stable(feature = "simd_x86_adx", since = "1.33.0")] +pub unsafe fn _subborrow_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 { + let (a, b) = llvm_subborrow_u64(c_in, a, b); + *out = b; + a +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86_64::*; + + #[test] + fn test_addcarry_u64() { + unsafe { + let a = u64::MAX; + let mut out = 0; + + let r = _addcarry_u64(0, a, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 0); + + let r = _addcarry_u64(0, a, 0, &mut out); + assert_eq!(r, 0); + assert_eq!(out, a); + + let r = _addcarry_u64(1, a, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 1); + + let r = _addcarry_u64(1, a, 0, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 0); + + let r = _addcarry_u64(0, 3, 4, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 7); + + let r = _addcarry_u64(1, 3, 4, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 8); + } + } + + #[simd_test(enable = "adx")] + unsafe fn test_addcarryx_u64() { + let a = u64::MAX; + let mut out = 0; + + let r = _addcarry_u64(0, a, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 0); + + let r = _addcarry_u64(0, a, 0, &mut out); + assert_eq!(r, 0); + assert_eq!(out, a); + + let r = _addcarry_u64(1, a, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 1); + + let r = _addcarry_u64(1, a, 0, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 0); + + let r = _addcarry_u64(0, 3, 4, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 7); + + let r = _addcarry_u64(1, 3, 4, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 8); + } + + #[test] + fn test_subborrow_u64() { + unsafe { + let a = u64::MAX; + let mut out = 0; + + let r = _subborrow_u64(0, 0, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, a); + + let r = _subborrow_u64(0, 0, 0, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 0); + + let r = _subborrow_u64(1, 0, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, a - 1); + + let r = _subborrow_u64(1, 0, 0, &mut out); + assert_eq!(r, 1); + assert_eq!(out, a); + + let r = _subborrow_u64(0, 7, 3, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 4); + + let r = _subborrow_u64(1, 7, 3, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 3); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/amx.rs b/library/stdarch/crates/core_arch/src/x86_64/amx.rs new file mode 100644 index 0000000000000..4b33c0ab6c155 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/amx.rs @@ -0,0 +1,622 @@ +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Load tile configuration from a 64-byte memory location specified by mem_addr. +/// The tile configuration format is specified below, and includes the tile type pallette, +/// the number of bytes per row, and the number of rows. 
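// Sketch (not from the patch above): building the 64-byte configuration block that
// `_tile_loadconfig` consumes. The offsets mirror the `__tilecfg` test struct at the
// bottom of this file (palette at byte 0, start row at byte 1, bytes-per-row entries
// from offset 16, row counts from offset 48, reserved bytes zero); the helper name and
// the single-tile setup are illustrative only.
fn tile0_config(colsb: u16, rows: u8) -> [u8; 64] {
    let mut cfg = [0u8; 64];
    cfg[0] = 1; // palette 1 (palette 0 is the init state)
    // Byte 1 is start_row; bytes 2..16 are reserved and stay zero.
    cfg[16..18].copy_from_slice(&colsb.to_le_bytes()); // bytes per row of tile 0
    cfg[48] = rows; // number of rows of tile 0
    cfg
}

fn main() {
    // A full 16-row x 64-byte tile 0, matching the configuration used in the tests.
    let cfg = tile0_config(64, 16);
    assert_eq!((cfg[0], cfg[16], cfg[17], cfg[48]), (1, 64, 0, 16));
}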
If the specified palette_id is zero,
+/// that signifies the init state for both the tile config and the tile data, and the tiles are zeroed.
+/// Any invalid configurations will result in a #GP fault.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_loadconfig&ig_expand=6875)
+#[inline]
+#[target_feature(enable = "amx-tile")]
+#[cfg_attr(test, assert_instr(ldtilecfg))]
+#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
+pub unsafe fn _tile_loadconfig(mem_addr: *const u8) {
+    ldtilecfg(mem_addr);
+}
+
+/// Stores the current tile configuration to a 64-byte memory location specified by mem_addr.
+/// The tile configuration format is specified below, and includes the tile type palette,
+/// the number of bytes per row, and the number of rows. If tiles are not configured, all zeroes will be stored to memory.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_storeconfig&ig_expand=6879)
+#[inline]
+#[target_feature(enable = "amx-tile")]
+#[cfg_attr(test, assert_instr(sttilecfg))]
+#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
+pub unsafe fn _tile_storeconfig(mem_addr: *mut u8) {
+    sttilecfg(mem_addr);
+}
+
+/// Load tile rows from memory specified by base address and stride into destination tile dst using the tile configuration previously configured via _tile_loadconfig.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_loadd&ig_expand=6877)
+#[inline]
+#[rustc_legacy_const_generics(0)]
+#[target_feature(enable = "amx-tile")]
+#[cfg_attr(test, assert_instr(tileloadd, DST = 0))]
+#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
+pub unsafe fn _tile_loadd<const DST: i32>(base: *const u8, stride: usize) {
+    static_assert_uimm_bits!(DST, 3);
+    tileloadd64(DST as i8, base, stride);
+}
+
+/// Release the tile configuration to return to the init state, which releases all storage it currently holds.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_release&ig_expand=6878)
+#[inline]
+#[target_feature(enable = "amx-tile")]
+#[cfg_attr(test, assert_instr(tilerelease))]
+#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
+pub unsafe fn _tile_release() {
+    tilerelease();
+}
+
+/// Store the tile specified by src to memory specified by base address and stride using the tile configuration previously configured via _tile_loadconfig.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_stored&ig_expand=6881)
+#[inline]
+#[rustc_legacy_const_generics(0)]
+#[target_feature(enable = "amx-tile")]
+#[cfg_attr(test, assert_instr(tilestored, DST = 0))]
+#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
+pub unsafe fn _tile_stored<const DST: i32>(base: *mut u8, stride: usize) {
+    static_assert_uimm_bits!(DST, 3);
+    tilestored64(DST as i8, base, stride);
+}
+
+/// Load tile rows from memory specified by base address and stride into destination tile dst using the tile configuration
+/// previously configured via _tile_loadconfig. This intrinsic provides a hint to the implementation that the data will
+/// likely not be reused in the near future and the data caching can be optimized accordingly.
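// Sketch (not from the patch above): how the `base`/`stride` pair used by
// `_tile_loadd` and `_tile_stored` walks memory. Row `r` of the tile starts at
// `base + r * stride` and spans the configured bytes-per-row. Plain Rust, no AMX needed.
fn gather_tile_rows(base: &[u8], stride: usize, rows: usize, colsb: usize) -> Vec<Vec<u8>> {
    (0..rows)
        .map(|r| base[r * stride..r * stride + colsb].to_vec())
        .collect()
}

fn main() {
    // A 4-row tile of 8 bytes per row read out of a buffer with an 8-byte stride.
    let flat: Vec<u8> = (0u8..32).collect();
    let tile = gather_tile_rows(&flat, 8, 4, 8);
    assert_eq!(tile[1], (8u8..16).collect::<Vec<u8>>());
}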
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_stream_loadd&ig_expand=6883) +#[inline] +#[rustc_legacy_const_generics(0)] +#[target_feature(enable = "amx-tile")] +#[cfg_attr(test, assert_instr(tileloaddt1, DST = 0))] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_stream_loadd(base: *const u8, stride: usize) { + static_assert_uimm_bits!(DST, 3); + tileloaddt164(DST as i8, base, stride); +} + +/// Zero the tile specified by tdest. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_zero&ig_expand=6885) +#[inline] +#[rustc_legacy_const_generics(0)] +#[target_feature(enable = "amx-tile")] +#[cfg_attr(test, assert_instr(tilezero, DST = 0))] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_zero() { + static_assert_uimm_bits!(DST, 3); + tilezero(DST as i8); +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in dst, and store the 32-bit result back to tile dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpbf16ps&ig_expand=6864) +#[inline] +#[rustc_legacy_const_generics(0, 1, 2)] +#[target_feature(enable = "amx-bf16")] +#[cfg_attr(test, assert_instr(tdpbf16ps, DST = 0, A = 1, B = 2))] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_dpbf16ps() { + static_assert_uimm_bits!(DST, 3); + static_assert_uimm_bits!(A, 3); + static_assert_uimm_bits!(B, 3); + tdpbf16ps(DST as i8, A as i8, B as i8); +} + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding +/// signed 8-bit integers in b, producing 4 intermediate 32-bit results. +/// Sum these 4 results with the corresponding 32-bit integer in dst, and store the 32-bit result back to tile dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpbssd&ig_expand=6866) +#[inline] +#[rustc_legacy_const_generics(0, 1, 2)] +#[target_feature(enable = "amx-int8")] +#[cfg_attr(test, assert_instr(tdpbssd, DST = 0, A = 1, B = 2))] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_dpbssd() { + static_assert_uimm_bits!(DST, 3); + static_assert_uimm_bits!(A, 3); + static_assert_uimm_bits!(B, 3); + tdpbssd(DST as i8, A as i8, B as i8); +} + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding +/// unsigned 8-bit integers in b, producing 4 intermediate 32-bit results. +/// Sum these 4 results with the corresponding 32-bit integer in dst, and store the 32-bit result back to tile dst. 
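// Reference sketch (not from the patch above): a BF16 value is the upper 16 bits of an
// f32, and `_tile_dpbf16ps` multiplies adjacent BF16 pairs and accumulates into f32.
// The truncating conversion below is only a model; the tests later use `_mm_cvtness_sbh`,
// which rounds. Small powers of two are exact either way.
fn f32_to_bf16_bits(x: f32) -> u16 {
    (x.to_bits() >> 16) as u16
}

fn bf16_bits_to_f32(bits: u16) -> f32 {
    f32::from_bits((bits as u32) << 16)
}

fn dpbf16_cell(acc: f32, a: [u16; 2], b: [u16; 2]) -> f32 {
    // One destination element: two adjacent BF16 products accumulated into f32.
    acc + bf16_bits_to_f32(a[0]) * bf16_bits_to_f32(b[0])
        + bf16_bits_to_f32(a[1]) * bf16_bits_to_f32(b[1])
}

fn main() {
    let (one, two) = (f32_to_bf16_bits(1.0), f32_to_bf16_bits(2.0));
    assert_eq!(dpbf16_cell(0.0, [one, one], [two, two]), 4.0);
}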
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpbsud&ig_expand=6868) +#[inline] +#[rustc_legacy_const_generics(0, 1, 2)] +#[target_feature(enable = "amx-int8")] +#[cfg_attr(test, assert_instr(tdpbsud, DST = 0, A = 1, B = 2))] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_dpbsud() { + static_assert_uimm_bits!(DST, 3); + static_assert_uimm_bits!(A, 3); + static_assert_uimm_bits!(B, 3); + tdpbsud(DST as i8, A as i8, B as i8); +} + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding +/// signed 8-bit integers in b, producing 4 intermediate 32-bit results. +/// Sum these 4 results with the corresponding 32-bit integer in dst, and store the 32-bit result back to tile dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpbusd&ig_expand=6870) +#[inline] +#[rustc_legacy_const_generics(0, 1, 2)] +#[target_feature(enable = "amx-int8")] +#[cfg_attr(test, assert_instr(tdpbusd, DST = 0, A = 1, B = 2))] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_dpbusd() { + static_assert_uimm_bits!(DST, 3); + static_assert_uimm_bits!(A, 3); + static_assert_uimm_bits!(B, 3); + tdpbusd(DST as i8, A as i8, B as i8); +} + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding +/// unsigned 8-bit integers in b, producing 4 intermediate 32-bit results. +/// Sum these 4 results with the corresponding 32-bit integer in dst, and store the 32-bit result back to tile dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpbuud&ig_expand=6872) +#[inline] +#[rustc_legacy_const_generics(0, 1, 2)] +#[target_feature(enable = "amx-int8")] +#[cfg_attr(test, assert_instr(tdpbuud, DST = 0, A = 1, B = 2))] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_dpbuud() { + static_assert_uimm_bits!(DST, 3); + static_assert_uimm_bits!(A, 3); + static_assert_uimm_bits!(B, 3); + tdpbuud(DST as i8, A as i8, B as i8); +} + +/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in dst, and store the 32-bit result back to tile dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpfp16ps&ig_expand=6874) +#[inline] +#[rustc_legacy_const_generics(0, 1, 2)] +#[target_feature(enable = "amx-fp16")] +#[cfg_attr(test, assert_instr(tdpfp16ps, DST = 0, A = 1, B = 2))] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_dpfp16ps() { + static_assert_uimm_bits!(DST, 3); + static_assert_uimm_bits!(A, 3); + static_assert_uimm_bits!(B, 3); + tdpfp16ps(DST as i8, A as i8, B as i8); +} + +/// Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. +/// Each dword element in input tiles a and b is interpreted as a complex number with FP16 real part and FP16 imaginary part. +/// Calculates the imaginary part of the result. 
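// Reference sketch (not from the patch above): the four 8-bit dot-product tiles above
// differ only in how each byte is widened before the multiply: `ss` sign-extends both,
// `su`/`us` mix the two, `uu` zero-extends both. One destination cell looks like this.
fn dp_cell(acc: i32, a: [u8; 4], b: [u8; 4], a_signed: bool, b_signed: bool) -> i32 {
    let widen = |byte: u8, signed: bool| if signed { byte as i8 as i32 } else { byte as i32 };
    let dot: i32 = (0..4).map(|i| widen(a[i], a_signed) * widen(b[i], b_signed)).sum();
    acc.wrapping_add(dot)
}

fn main() {
    let (a, b) = ([0xFFu8; 4], [0x02u8; 4]); // 0xFF is -1 signed, 255 unsigned
    assert_eq!(dp_cell(0, a, b, true, true), -8); // dpbssd-style
    assert_eq!(dp_cell(0, a, b, false, true), 2040); // dpbusd-style (a unsigned, b signed)
    assert_eq!(dp_cell(0, a, b, false, false), 2040); // dpbuud-style
}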
For each possible combination of (row of a, column of b), +/// it performs a set of multiplication and accumulations on all corresponding complex numbers (one from a and one from b). +/// The imaginary part of the a element is multiplied with the real part of the corresponding b element, and the real part of +/// the a element is multiplied with the imaginary part of the corresponding b elements. The two accumulated results are added, +/// and then accumulated into the corresponding row and column of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_cmmimfp16ps&ig_expand=6860) +#[inline] +#[rustc_legacy_const_generics(0, 1, 2)] +#[target_feature(enable = "amx-complex")] +#[cfg_attr(test, assert_instr(tcmmimfp16ps, DST = 0, A = 1, B = 2))] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_cmmimfp16ps() { + static_assert_uimm_bits!(DST, 3); + static_assert_uimm_bits!(A, 3); + static_assert_uimm_bits!(B, 3); + tcmmimfp16ps(DST as i8, A as i8, B as i8); +} + +/// Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. +/// Each dword element in input tiles a and b is interpreted as a complex number with FP16 real part and FP16 imaginary part. +/// Calculates the real part of the result. For each possible combination of (row of a, column of b), +/// it performs a set of multiplication and accumulations on all corresponding complex numbers (one from a and one from b). +/// The real part of the a element is multiplied with the real part of the corresponding b element, and the negated imaginary part of +/// the a element is multiplied with the imaginary part of the corresponding b elements. +/// The two accumulated results are added, and then accumulated into the corresponding row and column of dst. 
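// Reference sketch (not from the patch above): the complex arithmetic that is split
// across `_tile_cmmrlfp16ps` (real part) and `_tile_cmmimfp16ps` (imaginary part),
// written out for one pair of complex numbers, in f32 for readability.
fn cmm_parts(a: (f32, f32), b: (f32, f32)) -> (f32, f32) {
    let ((ar, ai), (br, bi)) = (a, b);
    let real = ar * br - ai * bi; // real product minus the imaginary product
    let imag = ai * br + ar * bi; // the two cross products, added
    (real, imag)
}

fn main() {
    // (1 + 2i) * (3 + 4i) = -5 + 10i
    assert_eq!(cmm_parts((1.0, 2.0), (3.0, 4.0)), (-5.0, 10.0));
}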
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_cmmrlfp16ps&ig_expand=6862) +#[inline] +#[rustc_legacy_const_generics(0, 1, 2)] +#[target_feature(enable = "amx-complex")] +#[cfg_attr(test, assert_instr(tcmmrlfp16ps, DST = 0, A = 1, B = 2))] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_cmmrlfp16ps() { + static_assert_uimm_bits!(DST, 3); + static_assert_uimm_bits!(A, 3); + static_assert_uimm_bits!(B, 3); + tcmmrlfp16ps(DST as i8, A as i8, B as i8); +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.ldtilecfg"] + fn ldtilecfg(mem_addr: *const u8); + #[link_name = "llvm.x86.sttilecfg"] + fn sttilecfg(mem_addr: *mut u8); + #[link_name = "llvm.x86.tileloadd64"] + fn tileloadd64(dst: i8, base: *const u8, stride: usize); + #[link_name = "llvm.x86.tileloaddt164"] + fn tileloaddt164(dst: i8, base: *const u8, stride: usize); + #[link_name = "llvm.x86.tilerelease"] + fn tilerelease(); + #[link_name = "llvm.x86.tilestored64"] + fn tilestored64(dst: i8, base: *mut u8, stride: usize); + #[link_name = "llvm.x86.tilezero"] + fn tilezero(dst: i8); + #[link_name = "llvm.x86.tdpbf16ps"] + fn tdpbf16ps(dst: i8, a: i8, b: i8); + #[link_name = "llvm.x86.tdpbuud"] + fn tdpbuud(dst: i8, a: i8, b: i8); + #[link_name = "llvm.x86.tdpbusd"] + fn tdpbusd(dst: i8, a: i8, b: i8); + #[link_name = "llvm.x86.tdpbsud"] + fn tdpbsud(dst: i8, a: i8, b: i8); + #[link_name = "llvm.x86.tdpbssd"] + fn tdpbssd(dst: i8, a: i8, b: i8); + #[link_name = "llvm.x86.tdpfp16ps"] + fn tdpfp16ps(dst: i8, a: i8, b: i8); + #[link_name = "llvm.x86.tcmmimfp16ps"] + fn tcmmimfp16ps(dst: i8, a: i8, b: i8); + #[link_name = "llvm.x86.tcmmrlfp16ps"] + fn tcmmrlfp16ps(dst: i8, a: i8, b: i8); +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::_mm_cvtness_sbh; + use crate::core_arch::x86_64::*; + use core::mem::transmute; + use stdarch_test::simd_test; + #[cfg(target_os = "linux")] + use syscalls::{Sysno, syscall}; + + #[allow(non_camel_case_types)] + #[repr(packed)] + #[derive(Copy, Clone, Default, Debug, PartialEq)] + struct __tilecfg { + /// 0 `or` 1 + palette: u8, + start_row: u8, + /// reserved, must be zero + reserved_a0: [u8; 14], + /// number of bytes of one row in each tile + colsb: [u16; 8], + /// reserved, must be zero + reserved_b0: [u16; 8], + /// number of rows in each tile + rows: [u8; 8], + /// reserved, must be zero + reserved_c0: [u8; 8], + } + + impl __tilecfg { + fn new(palette: u8, start_row: u8, colsb: [u16; 8], rows: [u8; 8]) -> Self { + Self { + palette, + start_row, + reserved_a0: [0u8; 14], + colsb, + reserved_b0: [0u16; 8], + rows, + reserved_c0: [0u8; 8], + } + } + + const fn as_ptr(&self) -> *const u8 { + self as *const Self as *const u8 + } + + fn as_mut_ptr(&mut self) -> *mut u8 { + self as *mut Self as *mut u8 + } + } + + #[cfg(not(target_os = "linux"))] + #[target_feature(enable = "amx-tile")] + fn _init_amx() {} + + #[cfg(target_os = "linux")] + #[target_feature(enable = "amx-tile")] + #[inline] + unsafe fn _init_amx() { + let mut ret: usize; + let mut xfeatures: usize = 0; + ret = syscall!(Sysno::arch_prctl, 0x1022, &mut xfeatures as *mut usize) + .expect("arch_prctl ARCH_GET_XCOMP_PERM syscall failed"); + if ret != 0 { + panic!("Failed to get XFEATURES"); + } else { + match 0b11 & (xfeatures >> 17) { + 0 => panic!("AMX is not available"), + 1 => { + ret = syscall!(Sysno::arch_prctl, 0x1023, 18) + .expect("arch_prctl ARCH_REQ_XCOMP_PERM syscall failed"); + if ret != 0 { + 
panic!("Failed to enable AMX"); + } + } + 3 => {} + _ => unreachable!(), + } + } + } + + #[simd_test(enable = "amx-tile")] + unsafe fn test_tile_loadconfig() { + let config = __tilecfg::default(); + _tile_loadconfig(config.as_ptr()); + _tile_release(); + } + + #[simd_test(enable = "amx-tile")] + unsafe fn test_tile_storeconfig() { + let config = __tilecfg::new(1, 0, [32; 8], [8; 8]); + _tile_loadconfig(config.as_ptr()); + let mut _config = __tilecfg::default(); + _tile_storeconfig(_config.as_mut_ptr()); + _tile_release(); + assert_eq!(config, _config); + } + + #[simd_test(enable = "amx-tile")] + unsafe fn test_tile_zero() { + _init_amx(); + let mut config = __tilecfg::default(); + config.palette = 1; + config.colsb[0] = 64; + config.rows[0] = 16; + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + let mut out = [[1_i8; 64]; 16]; + _tile_stored::<0>(&mut out as *mut [i8; 64] as *mut u8, 64); + _tile_release(); + assert_eq!(out, [[0; 64]; 16]); + } + + #[simd_test(enable = "amx-tile")] + unsafe fn test_tile_stored() { + _init_amx(); + let mut config = __tilecfg::default(); + config.palette = 1; + config.colsb[0] = 64; + config.rows[0] = 16; + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + let mut out = [[1_i8; 64]; 16]; + _tile_stored::<0>(&mut out as *mut [i8; 64] as *mut u8, 64); + _tile_release(); + assert_eq!(out, [[0; 64]; 16]); + } + + #[simd_test(enable = "amx-tile")] + unsafe fn test_tile_loadd() { + _init_amx(); + let mut config = __tilecfg::default(); + config.palette = 1; + config.colsb[0] = 64; + config.rows[0] = 16; + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + let mat = [1_i8; 1024]; + _tile_loadd::<0>(&mat as *const i8 as *const u8, 64); + let mut out = [[0_i8; 64]; 16]; + _tile_stored::<0>(&mut out as *mut [i8; 64] as *mut u8, 64); + _tile_release(); + assert_eq!(out, [[1; 64]; 16]); + } + + #[simd_test(enable = "amx-tile")] + unsafe fn test_tile_stream_loadd() { + _init_amx(); + let mut config = __tilecfg::default(); + config.palette = 1; + config.colsb[0] = 64; + config.rows[0] = 16; + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + let mat = [1_i8; 1024]; + _tile_stream_loadd::<0>(&mat as *const i8 as *const u8, 64); + let mut out = [[0_i8; 64]; 16]; + _tile_stored::<0>(&mut out as *mut [i8; 64] as *mut u8, 64); + _tile_release(); + assert_eq!(out, [[1; 64]; 16]); + } + + #[simd_test(enable = "amx-tile")] + unsafe fn test_tile_release() { + _tile_release(); + } + + #[simd_test(enable = "amx-bf16,avx512f")] + unsafe fn test_tile_dpbf16ps() { + _init_amx(); + let bf16_1: u16 = _mm_cvtness_sbh(1.0).to_bits(); + let bf16_2: u16 = _mm_cvtness_sbh(2.0).to_bits(); + let ones: [u8; 1024] = transmute([bf16_1; 512]); + let twos: [u8; 1024] = transmute([bf16_2; 512]); + let mut res = [[0f32; 16]; 16]; + let mut config = __tilecfg::default(); + config.palette = 1; + (0..=2).for_each(|i| { + config.colsb[i] = 64; + config.rows[i] = 16; + }); + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + _tile_loadd::<1>(&ones as *const u8, 64); + _tile_loadd::<2>(&twos as *const u8, 64); + _tile_dpbf16ps::<0, 1, 2>(); + _tile_stored::<0>(&mut res as *mut [f32; 16] as *mut u8, 64); + _tile_release(); + assert_eq!(res, [[64f32; 16]; 16]); + } + + #[simd_test(enable = "amx-int8")] + unsafe fn test_tile_dpbssd() { + _init_amx(); + let ones = [-1_i8; 1024]; + let twos = [-2_i8; 1024]; + let mut res = [[0_i32; 16]; 16]; + let mut config = __tilecfg::default(); + config.palette = 1; + (0..=2).for_each(|i| { + config.colsb[i] = 64; + config.rows[i] = 16; 
+ }); + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + _tile_loadd::<1>(&ones as *const i8 as *const u8, 64); + _tile_loadd::<2>(&twos as *const i8 as *const u8, 64); + _tile_dpbssd::<0, 1, 2>(); + _tile_stored::<0>(&mut res as *mut [i32; 16] as *mut u8, 64); + _tile_release(); + assert_eq!(res, [[128_i32; 16]; 16]); + } + + #[simd_test(enable = "amx-int8")] + unsafe fn test_tile_dpbsud() { + _init_amx(); + let ones = [-1_i8; 1024]; + let twos = [2_u8; 1024]; + let mut res = [[0_i32; 16]; 16]; + let mut config = __tilecfg::default(); + config.palette = 1; + (0..=2).for_each(|i| { + config.colsb[i] = 64; + config.rows[i] = 16; + }); + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + _tile_loadd::<1>(&ones as *const i8 as *const u8, 64); + _tile_loadd::<2>(&twos as *const u8, 64); + _tile_dpbsud::<0, 1, 2>(); + _tile_stored::<0>(&mut res as *mut [i32; 16] as *mut u8, 64); + _tile_release(); + assert_eq!(res, [[-128_i32; 16]; 16]); + } + + #[simd_test(enable = "amx-int8")] + unsafe fn test_tile_dpbusd() { + _init_amx(); + let ones = [1_u8; 1024]; + let twos = [-2_i8; 1024]; + let mut res = [[0_i32; 16]; 16]; + let mut config = __tilecfg::default(); + config.palette = 1; + (0..=2).for_each(|i| { + config.colsb[i] = 64; + config.rows[i] = 16; + }); + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + _tile_loadd::<1>(&ones as *const u8, 64); + _tile_loadd::<2>(&twos as *const i8 as *const u8, 64); + _tile_dpbusd::<0, 1, 2>(); + _tile_stored::<0>(&mut res as *mut [i32; 16] as *mut u8, 64); + _tile_release(); + assert_eq!(res, [[-128_i32; 16]; 16]); + } + + #[simd_test(enable = "amx-int8")] + unsafe fn test_tile_dpbuud() { + _init_amx(); + let ones = [1_u8; 1024]; + let twos = [2_u8; 1024]; + let mut res = [[0_i32; 16]; 16]; + let mut config = __tilecfg::default(); + config.palette = 1; + (0..=2).for_each(|i| { + config.colsb[i] = 64; + config.rows[i] = 16; + }); + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + _tile_loadd::<1>(&ones as *const u8, 64); + _tile_loadd::<2>(&twos as *const u8, 64); + _tile_dpbuud::<0, 1, 2>(); + _tile_stored::<0>(&mut res as *mut [i32; 16] as *mut u8, 64); + _tile_release(); + assert_eq!(res, [[128_i32; 16]; 16]); + } + + #[simd_test(enable = "amx-fp16")] + unsafe fn test_tile_dpfp16ps() { + _init_amx(); + let ones = [1f16; 512]; + let twos = [2f16; 512]; + let mut res = [[0f32; 16]; 16]; + let mut config = __tilecfg::default(); + config.palette = 1; + (0..=2).for_each(|i| { + config.colsb[i] = 64; + config.rows[i] = 16; + }); + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + _tile_loadd::<1>(&ones as *const f16 as *const u8, 64); + _tile_loadd::<2>(&twos as *const f16 as *const u8, 64); + _tile_dpfp16ps::<0, 1, 2>(); + _tile_stored::<0>(&mut res as *mut [f32; 16] as *mut u8, 64); + _tile_release(); + assert_eq!(res, [[64f32; 16]; 16]); + } + + #[simd_test(enable = "amx-complex")] + unsafe fn test_tile_cmmimfp16ps() { + _init_amx(); + let ones = [1f16; 512]; + let twos = [2f16; 512]; + let mut res = [[0f32; 16]; 16]; + let mut config = __tilecfg::default(); + config.palette = 1; + (0..=2).for_each(|i| { + config.colsb[i] = 64; + config.rows[i] = 16; + }); + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + _tile_loadd::<1>(&ones as *const f16 as *const u8, 64); + _tile_loadd::<2>(&twos as *const f16 as *const u8, 64); + _tile_cmmimfp16ps::<0, 1, 2>(); + _tile_stored::<0>(&mut res as *mut [f32; 16] as *mut u8, 64); + _tile_release(); + assert_eq!(res, [[64f32; 16]; 16]); + } + + #[simd_test(enable = 
"amx-complex")] + unsafe fn test_tile_cmmrlfp16ps() { + _init_amx(); + let ones = [1f16; 512]; + let twos = [2f16; 512]; + let mut res = [[0f32; 16]; 16]; + let mut config = __tilecfg::default(); + config.palette = 1; + (0..=2).for_each(|i| { + config.colsb[i] = 64; + config.rows[i] = 16; + }); + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + _tile_loadd::<1>(&ones as *const f16 as *const u8, 64); + _tile_loadd::<2>(&twos as *const f16 as *const u8, 64); + _tile_cmmrlfp16ps::<0, 1, 2>(); + _tile_stored::<0>(&mut res as *mut [f32; 16] as *mut u8, 64); + _tile_release(); + assert_eq!(res, [[0f32; 16]; 16]); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx.rs b/library/stdarch/crates/core_arch/src/x86_64/avx.rs new file mode 100644 index 0000000000000..b494385e4a616 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/avx.rs @@ -0,0 +1,65 @@ +//! Advanced Vector Extensions (AVX) +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. - [AMD64 Architecture +//! Programmer's Manual, Volume 3: General-Purpose and System +//! Instructions][amd64_ref]. +//! +//! [Wikipedia][wiki] provides a quick overview of the instructions available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions + +use crate::{core_arch::x86::*, mem::transmute}; + +/// Copies `a` to result, and insert the 64-bit integer `i` into result +/// at the location specified by `index`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi64) +#[inline] +#[rustc_legacy_const_generics(2)] +#[target_feature(enable = "avx")] +// This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_insert_epi64(a: __m256i, i: i64) -> __m256i { + static_assert_uimm_bits!(INDEX, 2); + unsafe { transmute(simd_insert!(a.as_i64x4(), INDEX as u32, i)) } +} + +/// Extracts a 64-bit integer from `a`, selected with `INDEX`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi64) +#[inline] +#[target_feature(enable = "avx")] +#[rustc_legacy_const_generics(1)] +// This intrinsic has no corresponding instruction. 
+#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_extract_epi64(a: __m256i) -> i64 { + static_assert_uimm_bits!(INDEX, 2); + unsafe { simd_extract!(a.as_i64x4(), INDEX as u32) } +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::arch::x86_64::*; + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_insert_epi64() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let r = _mm256_insert_epi64::<3>(a, 0); + let e = _mm256_setr_epi64x(1, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx")] + unsafe fn test_mm256_extract_epi64() { + let a = _mm256_setr_epi64x(0, 1, 2, 3); + let r = _mm256_extract_epi64::<3>(a); + assert_eq!(r, 3); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86_64/avx512bw.rs new file mode 100644 index 0000000000000..466c36ef31e5c --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/avx512bw.rs @@ -0,0 +1,45 @@ +use crate::core_arch::x86::*; + +/// Convert 64-bit mask a into an integer value, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtmask64_u64) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _cvtmask64_u64(a: __mmask64) -> u64 { + a +} + +/// Convert integer value a into an 64-bit mask, and store the result in k. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu64_mask64) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _cvtu64_mask64(a: u64) -> __mmask64 { + a +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::{x86::*, x86_64::*}; + + #[simd_test(enable = "avx512bw")] + unsafe fn test_cvtmask64_u64() { + let a: __mmask64 = 0b11001100_00110011_01100110_10011001; + let r = _cvtmask64_u64(a); + let e: u64 = 0b11001100_00110011_01100110_10011001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_cvtu64_mask64() { + let a: u64 = 0b11001100_00110011_01100110_10011001; + let r = _cvtu64_mask64(a); + let e: __mmask64 = 0b11001100_00110011_01100110_10011001; + assert_eq!(r, e); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs new file mode 100644 index 0000000000000..934c9e2812c42 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs @@ -0,0 +1,13014 @@ +use crate::{ + core_arch::{simd::*, x86::*, x86_64::*}, + mem::transmute, +}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_i64&expand=1792) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2si))] +pub fn _mm_cvtsd_i64(a: __m128d) -> i64 { + _mm_cvtsd_si64(a) +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst. 
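// A small sketch of the rounding behaviour, assuming the default MXCSR rounding mode
// (round-to-nearest-even): halfway cases go to the even integer, unlike an `as` cast,
// which truncates toward zero. `cvt_rounds_to_even` is an illustrative helper, not
// part of the crate.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn cvt_rounds_to_even() {
    use core::arch::x86_64::*;
    assert_eq!(_mm_cvtsd_i64(_mm_set_sd(2.5)), 2); // ties to even
    assert_eq!(_mm_cvtsd_i64(_mm_set_sd(3.5)), 4);
    assert_eq!(3.5f64 as i64, 3);                  // the cast truncates instead
}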
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_i64&expand=1894) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2si))] +pub fn _mm_cvtss_i64(a: __m128) -> i64 { + _mm_cvtss_si64(a) +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_u64&expand=1902) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2usi))] +pub fn _mm_cvtss_u64(a: __m128) -> u64 { + unsafe { vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_u64&expand=1800) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2usi))] +pub fn _mm_cvtsd_u64(a: __m128d) -> u64 { + unsafe { vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvti64_ss&expand=1643) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsi2ss))] +pub fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 { + unsafe { + let b = b as f32; + simd_insert!(a, 0, b) + } +} + +/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvti64_sd&expand=1644) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsi2sd))] +pub fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d { + unsafe { + let b = b as f64; + simd_insert!(a, 0, b) + } +} + +/// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtu64_ss&expand=2035) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtusi2ss))] +pub fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 { + unsafe { + let b = b as f32; + simd_insert!(a, 0, b) + } +} + +/// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
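// Sketch of the "replace lane 0, keep the rest" pattern that these scalar conversions
// share, assuming avx512f is available; `lane0_replace` is an illustrative helper,
// not part of the crate.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn lane0_replace() {
    use core::arch::x86_64::*;
    let a = _mm_setr_ps(9.0, 1.0, 2.0, 3.0);
    let r = _mm_cvti64_ss(a, 7);       // only lane 0 changes: 7.0, 1.0, 2.0, 3.0
    assert_eq!(_mm_cvtss_f32(r), 7.0);
}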
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtu64_sd&expand=2034) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtusi2sd))] +pub fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d { + unsafe { + let b = b as f64; + simd_insert!(a, 0, b) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_i64&expand=2016) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttsd2si))] +pub fn _mm_cvttsd_i64(a: __m128d) -> i64 { + unsafe { vcvttsd2si64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_u64&expand=2021) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttsd2usi))] +pub fn _mm_cvttsd_u64(a: __m128d) -> u64 { + unsafe { vcvttsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=#text=_mm_cvttss_i64&expand=2023) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttss2si))] +pub fn _mm_cvttss_i64(a: __m128) -> i64 { + unsafe { vcvttss2si64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_u64&expand=2027) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttss2usi))] +pub fn _mm_cvttss_u64(a: __m128) -> u64 { + unsafe { vcvttss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_roundi64_sd&expand=1313) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundi64_sd(a: __m128d, b: i64) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let r = vcvtsi2sd64(a, b, ROUNDING); + transmute(r) + } +} + +/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_roundsi64_sd&expand=1367) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let r = vcvtsi2sd64(a, b, ROUNDING); + transmute(r) + } +} + +/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
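// Usage sketch for the explicit-rounding forms: ROUNDING is a const generic built
// from the _MM_FROUND_* flags listed above. The value below is the smallest positive
// i64 with no exact f64 representation, so the chosen mode is observable.
// `round_control` is an illustrative helper, not part of the crate.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn round_control() {
    use core::arch::x86_64::*;
    let a = _mm_setzero_pd();
    let x = (1_i64 << 53) + 1; // 9007199254740993
    let down = _mm_cvt_roundi64_sd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a, x);
    let up = _mm_cvt_roundi64_sd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a, x);
    assert_eq!(_mm_cvtsd_f64(down), 9007199254740992.0);
    assert_eq!(_mm_cvtsd_f64(up), 9007199254740994.0);
}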
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_roundi64_ss&expand=1314) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundi64_ss(a: __m128, b: i64) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let r = vcvtsi2ss64(a, b, ROUNDING); + transmute(r) + } +} + +/// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_roundu64_sd&expand=1379) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtusi2sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundu64_sd(a: __m128d, b: u64) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let r = vcvtusi2sd64(a, b, ROUNDING); + transmute(r) + } +} + +/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_roundsi64_ss&expand=1368) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundsi64_ss(a: __m128, b: i64) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let r = vcvtsi2ss64(a, b, ROUNDING); + transmute(r) + } +} + +/// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_roundu64_ss&expand=1380) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundu64_ss(a: __m128, b: u64) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let r = vcvtusi2ss64(a, b, ROUNDING); + transmute(r) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_roundsd_si64&expand=1360) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundsd_si64(a: __m128d) -> i64 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = 
a.as_f64x2(); + vcvtsd2si64(a, ROUNDING) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_roundsd_i64&expand=1358) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundsd_i64(a: __m128d) -> i64 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + vcvtsd2si64(a, ROUNDING) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_roundsd_u64&expand=1365) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundsd_u64(a: __m128d) -> u64 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + vcvtsd2usi64(a, ROUNDING) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_roundss_si64&expand=1375) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundss_si64(a: __m128) -> i64 { + unsafe { + static_assert_rounding!(ROUNDING); 
+ let a = a.as_f32x4(); + vcvtss2si64(a, ROUNDING) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_roundss_i64&expand=1370) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundss_i64(a: __m128) -> i64 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + vcvtss2si64(a, ROUNDING) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_roundss_u64&expand=1377) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundss_u64(a: __m128) -> u64 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + vcvtss2usi64(a, ROUNDING) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_roundsd_si64&expand=1931) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundsd_si64(a: __m128d) -> i64 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + vcvttsd2si64(a, SAE) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
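// Usage sketch for the truncating (cvtt_*) forms: here the const generic is an SAE
// flag rather than a rounding mode - static_assert_sae! accepts only
// _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC, and the conversion always truncates
// toward zero. `truncate_with_sae` is an illustrative helper, not part of the crate.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn truncate_with_sae() {
    use core::arch::x86_64::*;
    let a = _mm_set_sd(-2.9);
    // NO_EXC suppresses reporting of the inexact-result exception this conversion
    // would otherwise raise; the numeric result is the same either way.
    assert_eq!(_mm_cvtt_roundsd_si64::<_MM_FROUND_NO_EXC>(a), -2);
}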
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_roundsd_i64&expand=1929) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundsd_i64(a: __m128d) -> i64 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + vcvttsd2si64(a, SAE) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_roundsd_u64&expand=1933) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttsd2usi, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundsd_u64(a: __m128d) -> u64 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + vcvttsd2usi64(a, SAE) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_roundss_i64&expand=1935) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundss_i64(a: __m128) -> i64 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + vcvttss2si64(a, SAE) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_roundss_si64&expand=1937) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundss_si64(a: __m128) -> i64 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + vcvttss2si64(a, SAE) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
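// Sketch of why separate unsigned conversions exist: values in [2^63, 2^64) fit in a
// u64 but overflow an i64, and the *usi forms handle them directly.
// `unsigned_range` is an illustrative helper, not part of the crate.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn unsigned_range() {
    use core::arch::x86_64::*;
    let big = _mm_set_sd(9_223_372_036_854_775_808.0); // 2^63, exactly representable
    assert_eq!(_mm_cvttsd_u64(big), 1u64 << 63);
    assert_eq!(_mm_cvttsd_u64(_mm_set_sd(3.9)), 3);    // truncation toward zero
}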
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_roundss_u64&expand=1939) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttss2usi, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundss_u64(a: __m128) -> u64 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + vcvttss2usi64(a, SAE) + } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.vcvtss2si64"] + fn vcvtss2si64(a: f32x4, rounding: i32) -> i64; + #[link_name = "llvm.x86.avx512.vcvtss2usi64"] + fn vcvtss2usi64(a: f32x4, rounding: i32) -> u64; + #[link_name = "llvm.x86.avx512.vcvtsd2si64"] + fn vcvtsd2si64(a: f64x2, rounding: i32) -> i64; + #[link_name = "llvm.x86.avx512.vcvtsd2usi64"] + fn vcvtsd2usi64(a: f64x2, rounding: i32) -> u64; + + #[link_name = "llvm.x86.avx512.cvtsi2ss64"] + fn vcvtsi2ss64(a: f32x4, b: i64, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.cvtsi2sd64"] + fn vcvtsi2sd64(a: f64x2, b: i64, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.cvtusi642ss"] + fn vcvtusi2ss64(a: f32x4, b: u64, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.cvtusi642sd"] + fn vcvtusi2sd64(a: f64x2, b: u64, rounding: i32) -> f64x2; + + #[link_name = "llvm.x86.avx512.cvttss2si64"] + fn vcvttss2si64(a: f32x4, rounding: i32) -> i64; + #[link_name = "llvm.x86.avx512.cvttss2usi64"] + fn vcvttss2usi64(a: f32x4, rounding: i32) -> u64; + #[link_name = "llvm.x86.avx512.cvttsd2si64"] + fn vcvttsd2si64(a: f64x2, rounding: i32) -> i64; + #[link_name = "llvm.x86.avx512.cvttsd2usi64"] + fn vcvttsd2usi64(a: f64x2, rounding: i32) -> u64; +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + use crate::core_arch::x86_64::*; + use crate::hint::black_box; + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_abs_epi64() { + let a = _mm512_set_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32); + let r = _mm512_abs_epi64(a); + let e = _mm512_set_epi64(0, 1, 1, i64::MAX, i64::MAX.wrapping_add(1), 100, 100, 32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_abs_epi64() { + let a = _mm512_set_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32); + let r = _mm512_mask_abs_epi64(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_abs_epi64(a, 0b11111111, a); + let e = _mm512_set_epi64(0, 1, 1, i64::MAX, i64::MIN, 100, 100, 32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_abs_epi64() { + let a = _mm512_set_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32); + let r = _mm512_maskz_abs_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_abs_epi64(0b11111111, a); + let e = _mm512_set_epi64(0, 1, 1, i64::MAX, i64::MIN, 100, 100, 32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_abs_epi64() { + let a = _mm256_set_epi64x(i64::MAX, i64::MIN, 100, -100); + let r = _mm256_abs_epi64(a); + let e = _mm256_set_epi64x(i64::MAX, i64::MAX.wrapping_add(1), 100, 100); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_abs_epi64() { + let a = _mm256_set_epi64x(i64::MAX, i64::MIN, 100, -100); + let r = _mm256_mask_abs_epi64(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_abs_epi64(a, 0b00001111, a); + let e = 
_mm256_set_epi64x(i64::MAX, i64::MAX.wrapping_add(1), 100, 100); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_abs_epi64() { + let a = _mm256_set_epi64x(i64::MAX, i64::MIN, 100, -100); + let r = _mm256_maskz_abs_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_abs_epi64(0b00001111, a); + let e = _mm256_set_epi64x(i64::MAX, i64::MAX.wrapping_add(1), 100, 100); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_abs_epi64() { + let a = _mm_set_epi64x(i64::MAX, i64::MIN); + let r = _mm_abs_epi64(a); + let e = _mm_set_epi64x(i64::MAX, i64::MAX.wrapping_add(1)); + assert_eq_m128i(r, e); + let a = _mm_set_epi64x(100, -100); + let r = _mm_abs_epi64(a); + let e = _mm_set_epi64x(100, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_abs_epi64() { + let a = _mm_set_epi64x(i64::MAX, i64::MIN); + let r = _mm_mask_abs_epi64(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_abs_epi64(a, 0b00000011, a); + let e = _mm_set_epi64x(i64::MAX, i64::MAX.wrapping_add(1)); + assert_eq_m128i(r, e); + let a = _mm_set_epi64x(100, -100); + let r = _mm_mask_abs_epi64(a, 0b00000011, a); + let e = _mm_set_epi64x(100, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_abs_epi64() { + let a = _mm_set_epi64x(i64::MAX, i64::MIN); + let r = _mm_maskz_abs_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_abs_epi64(0b00000011, a); + let e = _mm_set_epi64x(i64::MAX, i64::MAX.wrapping_add(1)); + assert_eq_m128i(r, e); + let a = _mm_set_epi64x(100, -100); + let r = _mm_maskz_abs_epi64(0b00000011, a); + let e = _mm_set_epi64x(100, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_abs_pd() { + let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.); + let r = _mm512_abs_pd(a); + let e = _mm512_setr_pd(0., 1., 1., f64::MAX, f64::MAX, 100., 100., 32.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_abs_pd() { + let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.); + let r = _mm512_mask_abs_pd(a, 0, a); + assert_eq_m512d(r, a); + let r = _mm512_mask_abs_pd(a, 0b00001111, a); + let e = _mm512_setr_pd(0., 1., 1., f64::MAX, f64::MIN, 100., -100., -32.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mov_epi64() { + let src = _mm512_set1_epi64(1); + let a = _mm512_set1_epi64(2); + let r = _mm512_mask_mov_epi64(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_mov_epi64(src, 0b11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mov_epi64() { + let a = _mm512_set1_epi64(2); + let r = _mm512_maskz_mov_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mov_epi64(0b11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_mov_epi64() { + let src = _mm256_set1_epi64x(1); + let a = _mm256_set1_epi64x(2); + let r = _mm256_mask_mov_epi64(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_mov_epi64(src, 0b00001111, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_mov_epi64() { + let a = _mm256_set1_epi64x(2); + let r = _mm256_maskz_mov_epi64(0, a); + assert_eq_m256i(r, 
_mm256_setzero_si256()); + let r = _mm256_maskz_mov_epi64(0b00001111, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_mov_epi64() { + let src = _mm_set1_epi64x(1); + let a = _mm_set1_epi64x(2); + let r = _mm_mask_mov_epi64(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_mov_epi64(src, 0b00000011, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_mov_epi64() { + let a = _mm_set1_epi64x(2); + let r = _mm_maskz_mov_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mov_epi64(0b00000011, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mov_pd() { + let src = _mm512_set1_pd(1.); + let a = _mm512_set1_pd(2.); + let r = _mm512_mask_mov_pd(src, 0, a); + assert_eq_m512d(r, src); + let r = _mm512_mask_mov_pd(src, 0b11111111, a); + assert_eq_m512d(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mov_pd() { + let a = _mm512_set1_pd(2.); + let r = _mm512_maskz_mov_pd(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_mov_pd(0b11111111, a); + assert_eq_m512d(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_mov_pd() { + let src = _mm256_set1_pd(1.); + let a = _mm256_set1_pd(2.); + let r = _mm256_mask_mov_pd(src, 0, a); + assert_eq_m256d(r, src); + let r = _mm256_mask_mov_pd(src, 0b00001111, a); + assert_eq_m256d(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_mov_pd() { + let a = _mm256_set1_pd(2.); + let r = _mm256_maskz_mov_pd(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_mov_pd(0b00001111, a); + assert_eq_m256d(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_mov_pd() { + let src = _mm_set1_pd(1.); + let a = _mm_set1_pd(2.); + let r = _mm_mask_mov_pd(src, 0, a); + assert_eq_m128d(r, src); + let r = _mm_mask_mov_pd(src, 0b00000011, a); + assert_eq_m128d(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_mov_pd() { + let a = _mm_set1_pd(2.); + let r = _mm_maskz_mov_pd(0, a); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_mov_pd(0b00000011, a); + assert_eq_m128d(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_add_epi64() { + let a = _mm512_setr_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32); + let b = _mm512_set1_epi64(1); + let r = _mm512_add_epi64(a, b); + let e = _mm512_setr_epi64(1, 2, 0, i64::MIN, i64::MIN + 1, 101, -99, -31); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_add_epi64() { + let a = _mm512_setr_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32); + let b = _mm512_set1_epi64(1); + let r = _mm512_mask_add_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_add_epi64(a, 0b00001111, a, b); + let e = _mm512_setr_epi64(1, 2, 0, i64::MIN, i64::MIN, 100, -100, -32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_add_epi64() { + let a = _mm512_setr_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32); + let b = _mm512_set1_epi64(1); + let r = _mm512_maskz_add_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_add_epi64(0b00001111, a, b); + let e = _mm512_setr_epi64(1, 2, 0, i64::MIN, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_add_epi64() { 
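// The masked integer adds below are plain wrapping (two's-complement) adds: with all
// four mask bits set, i64::MAX + 1 wraps around to i64::MIN, which is why the expected
// vectors contain i64::MIN rather than a saturated value.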
+ let a = _mm256_set_epi64x(1, -1, i64::MAX, i64::MIN); + let b = _mm256_set1_epi64x(1); + let r = _mm256_mask_add_epi64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_add_epi64(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(2, 0, i64::MIN, i64::MIN + 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_add_epi64() { + let a = _mm256_set_epi64x(1, -1, i64::MAX, i64::MIN); + let b = _mm256_set1_epi64x(1); + let r = _mm256_maskz_add_epi64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_add_epi64(0b00001111, a, b); + let e = _mm256_set_epi64x(2, 0, i64::MIN, i64::MIN + 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_add_epi64() { + let a = _mm_set_epi64x(i64::MAX, i64::MIN); + let b = _mm_set1_epi64x(1); + let r = _mm_mask_add_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_add_epi64(a, 0b00000011, a, b); + let e = _mm_set_epi64x(i64::MIN, i64::MIN + 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_add_epi64() { + let a = _mm_set_epi64x(i64::MAX, i64::MIN); + let b = _mm_set1_epi64x(1); + let r = _mm_maskz_add_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_add_epi64(0b00000011, a, b); + let e = _mm_set_epi64x(i64::MIN, i64::MIN + 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_add_pd() { + let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.); + let b = _mm512_set1_pd(1.); + let r = _mm512_add_pd(a, b); + let e = _mm512_setr_pd(1., 2., 0., f64::MAX, f64::MIN + 1., 101., -99., -31.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_add_pd() { + let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.); + let b = _mm512_set1_pd(1.); + let r = _mm512_mask_add_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_add_pd(a, 0b00001111, a, b); + let e = _mm512_setr_pd(1., 2., 0., f64::MAX, f64::MIN, 100., -100., -32.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_add_pd() { + let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.); + let b = _mm512_set1_pd(1.); + let r = _mm512_maskz_add_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_add_pd(0b00001111, a, b); + let e = _mm512_setr_pd(1., 2., 0., f64::MAX, 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_add_pd() { + let a = _mm256_set_pd(1., -1., f64::MAX, f64::MIN); + let b = _mm256_set1_pd(1.); + let r = _mm256_mask_add_pd(a, 0, a, b); + assert_eq_m256d(r, a); + let r = _mm256_mask_add_pd(a, 0b00001111, a, b); + let e = _mm256_set_pd(2., 0., f64::MAX, f64::MIN + 1.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_add_pd() { + let a = _mm256_set_pd(1., -1., f64::MAX, f64::MIN); + let b = _mm256_set1_pd(1.); + let r = _mm256_maskz_add_pd(0, a, b); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_add_pd(0b00001111, a, b); + let e = _mm256_set_pd(2., 0., f64::MAX, f64::MIN + 1.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_add_pd() { + let a = _mm_set_pd(f64::MAX, f64::MIN); + let b = _mm_set1_pd(1.); + let r = _mm_mask_add_pd(a, 0, a, b); + assert_eq_m128d(r, a); 
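// With both mask bits set every lane takes the computed sum; note that f64::MAX + 1.0
// rounds back to f64::MAX (1.0 is far below half an ulp at that magnitude), so the
// high lane of the expected value is unchanged, and f64::MIN + 1. is absorbed the
// same way.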
+ let r = _mm_mask_add_pd(a, 0b00000011, a, b); + let e = _mm_set_pd(f64::MAX, f64::MIN + 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_add_pd() { + let a = _mm_set_pd(f64::MAX, f64::MIN); + let b = _mm_set1_pd(1.); + let r = _mm_maskz_add_pd(0, a, b); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_add_pd(0b00000011, a, b); + let e = _mm_set_pd(f64::MAX, f64::MIN + 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sub_epi64() { + let a = _mm512_setr_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32); + let b = _mm512_set1_epi64(1); + let r = _mm512_sub_epi64(a, b); + let e = _mm512_setr_epi64(-1, 0, -2, i64::MAX - 1, i64::MAX, 99, -101, -33); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sub_epi64() { + let a = _mm512_setr_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32); + let b = _mm512_set1_epi64(1); + let r = _mm512_mask_sub_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_sub_epi64(a, 0b00001111, a, b); + let e = _mm512_setr_epi64(-1, 0, -2, i64::MAX - 1, i64::MIN, 100, -100, -32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sub_epi64() { + let a = _mm512_setr_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32); + let b = _mm512_set1_epi64(1); + let r = _mm512_maskz_sub_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sub_epi64(0b00001111, a, b); + let e = _mm512_setr_epi64(-1, 0, -2, i64::MAX - 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sub_epi64() { + let a = _mm256_set_epi64x(1, -1, i64::MAX, i64::MIN); + let b = _mm256_set1_epi64x(1); + let r = _mm256_mask_sub_epi64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_sub_epi64(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(0, -2, i64::MAX - 1, i64::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sub_epi64() { + let a = _mm256_set_epi64x(1, -1, i64::MAX, i64::MIN); + let b = _mm256_set1_epi64x(1); + let r = _mm256_maskz_sub_epi64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sub_epi64(0b00001111, a, b); + let e = _mm256_set_epi64x(0, -2, i64::MAX - 1, i64::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sub_epi64() { + let a = _mm_set_epi64x(i64::MAX, i64::MIN); + let b = _mm_set1_epi64x(1); + let r = _mm_mask_sub_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_sub_epi64(a, 0b00000011, a, b); + let e = _mm_set_epi64x(i64::MAX - 1, i64::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sub_epi64() { + let a = _mm_set_epi64x(i64::MAX, i64::MIN); + let b = _mm_set1_epi64x(1); + let r = _mm_maskz_sub_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sub_epi64(0b00000011, a, b); + let e = _mm_set_epi64x(i64::MAX - 1, i64::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sub_pd() { + let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.); + let b = _mm512_set1_pd(1.); + let r = _mm512_sub_pd(a, b); + let e = _mm512_setr_pd(-1., 0., -2., f64::MAX - 1., f64::MIN, 99., -101., -33.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_mask_sub_pd() { + let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.); + let b = _mm512_set1_pd(1.); + let r = _mm512_mask_sub_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_sub_pd(a, 0b00001111, a, b); + let e = _mm512_setr_pd(-1., 0., -2., f64::MAX - 1., f64::MIN, 100., -100., -32.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sub_pd() { + let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.); + let b = _mm512_set1_pd(1.); + let r = _mm512_maskz_sub_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_sub_pd(0b00001111, a, b); + let e = _mm512_setr_pd(-1., 0., -2., f64::MAX - 1., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sub_pd() { + let a = _mm256_set_pd(1., -1., f64::MAX, f64::MIN); + let b = _mm256_set1_pd(1.); + let r = _mm256_mask_sub_pd(a, 0, a, b); + assert_eq_m256d(r, a); + let r = _mm256_mask_sub_pd(a, 0b00001111, a, b); + let e = _mm256_set_pd(0., -2., f64::MAX - 1., f64::MIN); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sub_pd() { + let a = _mm256_set_pd(1., -1., f64::MAX, f64::MIN); + let b = _mm256_set1_pd(1.); + let r = _mm256_maskz_sub_pd(0, a, b); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_sub_pd(0b00001111, a, b); + let e = _mm256_set_pd(0., -2., f64::MAX - 1., f64::MIN); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sub_pd() { + let a = _mm_set_pd(f64::MAX, f64::MIN); + let b = _mm_set1_pd(1.); + let r = _mm_mask_sub_pd(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_sub_pd(a, 0b00000011, a, b); + let e = _mm_set_pd(f64::MAX - 1., f64::MIN); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sub_pd() { + let a = _mm_set_pd(f64::MAX, f64::MIN); + let b = _mm_set1_pd(1.); + let r = _mm_maskz_sub_pd(0, a, b); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_sub_pd(0b00000011, a, b); + let e = _mm_set_pd(f64::MAX - 1., f64::MIN); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mul_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mul_epi32(a, b); + let e = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mul_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_mul_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_mul_epi32(a, 0b00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 | 1 << 32, 1 | 1 << 32, 1 | 1 << 32, 1 | 1 << 32, + 7, 5, 3, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mul_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_mul_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mul_epi32(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 7, 5, 3, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_mul_epi32() { + let a = _mm256_set1_epi32(1); + 
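// mul_epi32 multiplies only the even-indexed (low) 32-bit element of each 64-bit lane
// and sign-extends the product to 64 bits; with `a` all ones the result is just those
// elements of `b`, which is why the expected vectors are built from every other
// 32-bit value.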
let b = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_mask_mul_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_mul_epi32(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(2, 4, 6, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_mul_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_mul_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mul_epi32(0b00001111, a, b); + let e = _mm256_set_epi64x(2, 4, 6, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_mul_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_mask_mul_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_mul_epi32(a, 0b00000011, a, b); + let e = _mm_set_epi64x(2, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_mul_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_maskz_mul_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mul_epi32(0b00000011, a, b); + let e = _mm_set_epi64x(2, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mul_epu32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mul_epu32(a, b); + let e = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mul_epu32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_mul_epu32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_mul_epu32(a, 0b00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 | 1 << 32, 1 | 1 << 32, 1 | 1 << 32, 1 | 1 << 32, + 7, 5, 3, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mul_epu32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_mul_epu32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mul_epu32(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 7, 5, 3, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_mul_epu32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_mask_mul_epu32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_mul_epu32(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(2, 4, 6, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_mul_epu32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_mul_epu32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mul_epu32(0b00001111, a, b); + let e = _mm256_set_epi64x(2, 4, 6, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_mul_epu32() { + let a = _mm_set1_epi32(1); + let b = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_mask_mul_epu32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_mul_epu32(a, 0b00000011, a, b); + let e = _mm_set_epi64x(2, 4); + 
assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_mul_epu32() { + let a = _mm_set1_epi32(1); + let b = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_maskz_mul_epu32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mul_epu32(0b00000011, a, b); + let e = _mm_set_epi64x(2, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mullox_epi64() { + let a = _mm512_setr_epi64(0, 1, i64::MAX, i64::MIN, i64::MAX, 100, -100, -32); + let b = _mm512_set1_epi64(2); + let r = _mm512_mullox_epi64(a, b); + let e = _mm512_setr_epi64(0, 2, -2, 0, -2, 200, -200, -64); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mullox_epi64() { + let a = _mm512_setr_epi64(0, 1, i64::MAX, i64::MIN, i64::MAX, 100, -100, -32); + let b = _mm512_set1_epi64(2); + let r = _mm512_mask_mullox_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_mullox_epi64(a, 0b00001111, a, b); + let e = _mm512_setr_epi64(0, 2, -2, 0, i64::MAX, 100, -100, -32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mul_pd() { + let a = _mm512_setr_pd(0., 1., f64::MAX, f64::MIN, f64::MAX, f64::MIN, -100., -32.); + let b = _mm512_set1_pd(2.); + let r = _mm512_mul_pd(a, b); + #[rustfmt::skip] + let e = _mm512_setr_pd( + 0., 2., f64::INFINITY, f64::NEG_INFINITY, + f64::INFINITY, f64::NEG_INFINITY, -200., -64., + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mul_pd() { + let a = _mm512_setr_pd(0., 1., f64::MAX, f64::MIN, f64::MAX, f64::MIN, -100., -32.); + let b = _mm512_set1_pd(2.); + let r = _mm512_mask_mul_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_mul_pd(a, 0b00001111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_pd( + 0., 2., f64::INFINITY, f64::NEG_INFINITY, + f64::MAX, f64::MIN, -100., -32., + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mul_pd() { + let a = _mm512_setr_pd(0., 1., f64::MAX, f64::MIN, f64::MAX, f64::MIN, -100., -32.); + let b = _mm512_set1_pd(2.); + let r = _mm512_maskz_mul_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_mul_pd(0b00001111, a, b); + let e = _mm512_setr_pd(0., 2., f64::INFINITY, f64::NEG_INFINITY, 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_mul_pd() { + let a = _mm256_set_pd(0., 1., f64::MAX, f64::MIN); + let b = _mm256_set1_pd(2.); + let r = _mm256_mask_mul_pd(a, 0, a, b); + assert_eq_m256d(r, a); + let r = _mm256_mask_mul_pd(a, 0b00001111, a, b); + let e = _mm256_set_pd(0., 2., f64::INFINITY, f64::NEG_INFINITY); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_mul_pd() { + let a = _mm256_set_pd(0., 1., f64::MAX, f64::MIN); + let b = _mm256_set1_pd(2.); + let r = _mm256_maskz_mul_pd(0, a, b); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_mul_pd(0b00001111, a, b); + let e = _mm256_set_pd(0., 2., f64::INFINITY, f64::NEG_INFINITY); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_mul_pd() { + let a = _mm_set_pd(f64::MAX, f64::MIN); + let b = _mm_set1_pd(2.); + let r = _mm_mask_mul_pd(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_mul_pd(a, 0b00000011, a, b); + let e = _mm_set_pd(f64::INFINITY, f64::NEG_INFINITY); + assert_eq_m128d(r, e); + } + + 
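// The floating-point `mul_pd` expectations above rely on IEEE 754 overflow behaviour: + // f64::MIN is -f64::MAX, so f64::MAX * 2. rounds to f64::INFINITY and f64::MIN * 2. to + // f64::NEG_INFINITY, which keeps every expected lane exactly representable. + +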
#[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_mul_pd() { + let a = _mm_set_pd(f64::MAX, f64::MIN); + let b = _mm_set1_pd(2.); + let r = _mm_maskz_mul_pd(0, a, b); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_mul_pd(0b00000011, a, b); + let e = _mm_set_pd(f64::INFINITY, f64::NEG_INFINITY); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_div_pd() { + let a = _mm512_setr_pd(0., 1., f64::MAX, f64::MIN, f64::MAX, f64::MIN, -100., -32.); + let b = _mm512_setr_pd(2., 2., 0., 0., 0., 0., 2., 2.); + let r = _mm512_div_pd(a, b); + #[rustfmt::skip] + let e = _mm512_setr_pd( + 0., 0.5, f64::INFINITY, f64::NEG_INFINITY, + f64::INFINITY, f64::NEG_INFINITY, -50., -16., + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_div_pd() { + let a = _mm512_setr_pd(0., 1., f64::MAX, f64::MIN, f64::MAX, f64::MIN, -100., -32.); + let b = _mm512_setr_pd(2., 2., 0., 0., 0., 0., 2., 2.); + let r = _mm512_mask_div_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_div_pd(a, 0b00001111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_pd( + 0., 0.5, f64::INFINITY, f64::NEG_INFINITY, + f64::MAX, f64::MIN, -100., -32., + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_div_pd() { + let a = _mm512_setr_pd(0., 1., f64::MAX, f64::MIN, f64::MAX, f64::MIN, -100., -32.); + let b = _mm512_setr_pd(2., 2., 0., 0., 0., 0., 2., 2.); + let r = _mm512_maskz_div_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_div_pd(0b00001111, a, b); + let e = _mm512_setr_pd(0., 0.5, f64::INFINITY, f64::NEG_INFINITY, 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_div_pd() { + let a = _mm256_set_pd(0., 1., f64::MAX, f64::MIN); + let b = _mm256_set_pd(2., 2., 0., 0.); + let r = _mm256_mask_div_pd(a, 0, a, b); + assert_eq_m256d(r, a); + let r = _mm256_mask_div_pd(a, 0b00001111, a, b); + let e = _mm256_set_pd(0., 0.5, f64::INFINITY, f64::NEG_INFINITY); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_div_pd() { + let a = _mm256_set_pd(0., 1., f64::MAX, f64::MIN); + let b = _mm256_set_pd(2., 2., 0., 0.); + let r = _mm256_maskz_div_pd(0, a, b); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_div_pd(0b00001111, a, b); + let e = _mm256_set_pd(0., 0.5, f64::INFINITY, f64::NEG_INFINITY); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_div_pd() { + let a = _mm_set_pd(f64::MAX, f64::MIN); + let b = _mm_set_pd(0., 0.); + let r = _mm_mask_div_pd(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_div_pd(a, 0b00000011, a, b); + let e = _mm_set_pd(f64::INFINITY, f64::NEG_INFINITY); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_div_pd() { + let a = _mm_set_pd(f64::MAX, f64::MIN); + let b = _mm_set_pd(0., 0.); + let r = _mm_maskz_div_pd(0, a, b); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_div_pd(0b00000011, a, b); + let e = _mm_set_pd(f64::INFINITY, f64::NEG_INFINITY); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_max_epi64() { + let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_max_epi64(a, b); + let e = _mm512_setr_epi64(7, 6, 5, 4, 4, 5, 6, 7); + assert_eq_m512i(r, e); + } 
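+ + // The masked variants follow the usual AVX-512 convention: bit j of the mask selects lane j + // (lane 0 is the first argument of _mm512_setr_* and the last argument of _mm512_set_*). + // `_mask_` intrinsics copy unselected lanes from the `src` argument, while `_maskz_` intrinsics + // zero them; e.g. a mask of 0b00001111 updates only the low four lanes and leaves the upper + // four as `src` (or zero).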
+ + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_epi64() { + let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_max_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_max_epi64(a, 0b00001111, a, b); + let e = _mm512_setr_epi64(7, 6, 5, 4, 4, 5, 6, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_epi64() { + let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_max_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_max_epi64(0b00001111, a, b); + let e = _mm512_setr_epi64(7, 6, 5, 4, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_max_epi64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let b = _mm256_set_epi64x(3, 2, 1, 0); + let r = _mm256_max_epi64(a, b); + let e = _mm256_set_epi64x(3, 2, 2, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_epi64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let b = _mm256_set_epi64x(3, 2, 1, 0); + let r = _mm256_mask_max_epi64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_max_epi64(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(3, 2, 2, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_epi64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let b = _mm256_set_epi64x(3, 2, 1, 0); + let r = _mm256_maskz_max_epi64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_max_epi64(0b00001111, a, b); + let e = _mm256_set_epi64x(3, 2, 2, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_max_epi64() { + let a = _mm_set_epi64x(2, 3); + let b = _mm_set_epi64x(3, 2); + let r = _mm_max_epi64(a, b); + let e = _mm_set_epi64x(3, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_epi64() { + let a = _mm_set_epi64x(2, 3); + let b = _mm_set_epi64x(3, 2); + let r = _mm_mask_max_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_max_epi64(a, 0b00000011, a, b); + let e = _mm_set_epi64x(3, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_epi64() { + let a = _mm_set_epi64x(2, 3); + let b = _mm_set_epi64x(3, 2); + let r = _mm_maskz_max_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_max_epi64(0b00000011, a, b); + let e = _mm_set_epi64x(3, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_max_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm512_max_pd(a, b); + let e = _mm512_setr_pd(7., 6., 5., 4., 4., 5., 6., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm512_mask_max_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_max_pd(a, 0b00001111, a, b); + let e = _mm512_setr_pd(7., 6., 5., 4., 4., 5., 6., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 
7.); + let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm512_maskz_max_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_max_pd(0b00001111, a, b); + let e = _mm512_setr_pd(7., 6., 5., 4., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_pd() { + let a = _mm256_set_pd(0., 1., 2., 3.); + let b = _mm256_set_pd(3., 2., 1., 0.); + let r = _mm256_mask_max_pd(a, 0, a, b); + assert_eq_m256d(r, a); + let r = _mm256_mask_max_pd(a, 0b00001111, a, b); + let e = _mm256_set_pd(3., 2., 2., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_pd() { + let a = _mm256_set_pd(0., 1., 2., 3.); + let b = _mm256_set_pd(3., 2., 1., 0.); + let r = _mm256_maskz_max_pd(0, a, b); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_max_pd(0b00001111, a, b); + let e = _mm256_set_pd(3., 2., 2., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_pd() { + let a = _mm_set_pd(2., 3.); + let b = _mm_set_pd(3., 2.); + let r = _mm_mask_max_pd(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_max_pd(a, 0b00000011, a, b); + let e = _mm_set_pd(3., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_pd() { + let a = _mm_set_pd(2., 3.); + let b = _mm_set_pd(3., 2.); + let r = _mm_maskz_max_pd(0, a, b); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_max_pd(0b00000011, a, b); + let e = _mm_set_pd(3., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_max_epu64() { + let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_max_epu64(a, b); + let e = _mm512_setr_epi64(7, 6, 5, 4, 4, 5, 6, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_epu64() { + let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_max_epu64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_max_epu64(a, 0b00001111, a, b); + let e = _mm512_setr_epi64(7, 6, 5, 4, 4, 5, 6, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_epu64() { + let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_max_epu64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_max_epu64(0b00001111, a, b); + let e = _mm512_setr_epi64(7, 6, 5, 4, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_max_epu64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let b = _mm256_set_epi64x(3, 2, 1, 0); + let r = _mm256_max_epu64(a, b); + let e = _mm256_set_epi64x(3, 2, 2, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_epu64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let b = _mm256_set_epi64x(3, 2, 1, 0); + let r = _mm256_mask_max_epu64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_max_epu64(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(3, 2, 2, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_epu64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let b = _mm256_set_epi64x(3, 2, 1, 0); + let r = 
_mm256_maskz_max_epu64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_max_epu64(0b00001111, a, b); + let e = _mm256_set_epi64x(3, 2, 2, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_max_epu64() { + let a = _mm_set_epi64x(2, 3); + let b = _mm_set_epi64x(3, 2); + let r = _mm_max_epu64(a, b); + let e = _mm_set_epi64x(3, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_epu64() { + let a = _mm_set_epi64x(2, 3); + let b = _mm_set_epi64x(3, 2); + let r = _mm_mask_max_epu64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_max_epu64(a, 0b00000011, a, b); + let e = _mm_set_epi64x(3, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_epu64() { + let a = _mm_set_epi64x(2, 3); + let b = _mm_set_epi64x(3, 2); + let r = _mm_maskz_max_epu64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_max_epu64(0b00000011, a, b); + let e = _mm_set_epi64x(3, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_min_epi64() { + let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_min_epi64(a, b); + let e = _mm512_setr_epi64(0, 1, 2, 3, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_epi64() { + let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_min_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_min_epi64(a, 0b00001111, a, b); + let e = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_epi64() { + let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_min_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_min_epi64(0b00001111, a, b); + let e = _mm512_setr_epi64(0, 1, 2, 3, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_min_epi64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let b = _mm256_set_epi64x(3, 2, 1, 0); + let r = _mm256_min_epi64(a, b); + let e = _mm256_set_epi64x(0, 1, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_epi64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let b = _mm256_set_epi64x(3, 2, 1, 0); + let r = _mm256_mask_min_epi64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_min_epi64(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(0, 1, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_epi64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let b = _mm256_set_epi64x(3, 2, 1, 0); + let r = _mm256_maskz_min_epi64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_min_epi64(0b00001111, a, b); + let e = _mm256_set_epi64x(0, 1, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_min_epi64() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set_epi64x(3, 2); + let r = _mm_min_epi64(a, b); + let e = _mm_set_epi64x(0, 1); + assert_eq_m128i(r, e); + let a = _mm_set_epi64x(2, 3); + let b = _mm_set_epi64x(1, 0); + let r = _mm_min_epi64(a, b); + let e 
= _mm_set_epi64x(1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epi64() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set_epi64x(3, 2); + let r = _mm_mask_min_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epi64(a, 0b00000011, a, b); + let e = _mm_set_epi64x(0, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epi64() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set_epi64x(3, 2); + let r = _mm_maskz_min_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epi64(0b00000011, a, b); + let e = _mm_set_epi64x(0, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_min_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm512_min_pd(a, b); + let e = _mm512_setr_pd(0., 1., 2., 3., 3., 2., 1., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm512_mask_min_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_min_pd(a, 0b00001111, a, b); + let e = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm512_maskz_min_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_min_pd(0b00001111, a, b); + let e = _mm512_setr_pd(0., 1., 2., 3., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_pd() { + let a = _mm256_set_pd(0., 1., 2., 3.); + let b = _mm256_set_pd(3., 2., 1., 0.); + let r = _mm256_mask_min_pd(a, 0, a, b); + assert_eq_m256d(r, a); + let r = _mm256_mask_min_pd(a, 0b00001111, a, b); + let e = _mm256_set_pd(0., 1., 1., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_pd() { + let a = _mm256_set_pd(0., 1., 2., 3.); + let b = _mm256_set_pd(3., 2., 1., 0.); + let r = _mm256_maskz_min_pd(0, a, b); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_min_pd(0b00001111, a, b); + let e = _mm256_set_pd(0., 1., 1., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_pd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(1., 0.); + let r = _mm_mask_min_pd(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_min_pd(a, 0b00000011, a, b); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_pd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(1., 0.); + let r = _mm_maskz_min_pd(0, a, b); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_min_pd(0b00000011, a, b); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_min_epu64() { + let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_min_epu64(a, b); + let e = _mm512_setr_epi64(0, 1, 2, 3, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_mask_min_epu64() { + let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_min_epu64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_min_epu64(a, 0b00001111, a, b); + let e = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_epu64() { + let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_min_epu64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_min_epu64(0b00001111, a, b); + let e = _mm512_setr_epi64(0, 1, 2, 3, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_min_epu64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let b = _mm256_set_epi64x(3, 2, 1, 0); + let r = _mm256_min_epu64(a, b); + let e = _mm256_set_epi64x(0, 1, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_epu64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let b = _mm256_set_epi64x(3, 2, 1, 0); + let r = _mm256_mask_min_epu64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_min_epu64(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(0, 1, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_epu64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let b = _mm256_set_epi64x(3, 2, 1, 0); + let r = _mm256_maskz_min_epu64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_min_epu64(0b00001111, a, b); + let e = _mm256_set_epi64x(0, 1, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_min_epu64() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set_epi64x(1, 0); + let r = _mm_min_epu64(a, b); + let e = _mm_set_epi64x(0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epu64() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set_epi64x(1, 0); + let r = _mm_mask_min_epu64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epu64(a, 0b00000011, a, b); + let e = _mm_set_epi64x(0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epu64() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set_epi64x(1, 0); + let r = _mm_maskz_min_epu64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epu64(0b00000011, a, b); + let e = _mm_set_epi64x(0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sqrt_pd() { + let a = _mm512_setr_pd(0., 1., 4., 9., 16., 25., 36., 49.); + let r = _mm512_sqrt_pd(a); + let e = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sqrt_pd() { + let a = _mm512_setr_pd(0., 1., 4., 9., 16., 25., 36., 49.); + let r = _mm512_mask_sqrt_pd(a, 0, a); + assert_eq_m512d(r, a); + let r = _mm512_mask_sqrt_pd(a, 0b00001111, a); + let e = _mm512_setr_pd(0., 1., 2., 3., 16., 25., 36., 49.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sqrt_pd() { + let a = _mm512_setr_pd(0., 1., 4., 9., 16., 25., 36., 49.); + let r = _mm512_maskz_sqrt_pd(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_sqrt_pd(0b00001111, a); + let e = 
_mm512_setr_pd(0., 1., 2., 3., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sqrt_pd() { + let a = _mm256_set_pd(0., 1., 4., 9.); + let r = _mm256_mask_sqrt_pd(a, 0, a); + assert_eq_m256d(r, a); + let r = _mm256_mask_sqrt_pd(a, 0b00001111, a); + let e = _mm256_set_pd(0., 1., 2., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sqrt_pd() { + let a = _mm256_set_pd(0., 1., 4., 9.); + let r = _mm256_maskz_sqrt_pd(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_sqrt_pd(0b00001111, a); + let e = _mm256_set_pd(0., 1., 2., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sqrt_pd() { + let a = _mm_set_pd(0., 1.); + let r = _mm_mask_sqrt_pd(a, 0, a); + assert_eq_m128d(r, a); + let r = _mm_mask_sqrt_pd(a, 0b00000011, a); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sqrt_pd() { + let a = _mm_set_pd(0., 1.); + let r = _mm_maskz_sqrt_pd(0, a); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_sqrt_pd(0b00000011, a); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmadd_pd() { + let a = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.); + let r = _mm512_fmadd_pd(a, b, c); + let e = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmadd_pd() { + let a = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.); + let r = _mm512_mask_fmadd_pd(a, 0, b, c); + assert_eq_m512d(r, a); + let r = _mm512_mask_fmadd_pd(a, 0b00001111, b, c); + let e = _mm512_setr_pd(1., 2., 3., 4., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmadd_pd() { + let a = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.); + let r = _mm512_maskz_fmadd_pd(0, a, b, c); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_fmadd_pd(0b00001111, a, b, c); + let e = _mm512_setr_pd(1., 2., 3., 4., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmadd_pd() { + let a = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_setr_pd(1., 1., 1., 1., 2., 2., 2., 2.); + let r = _mm512_mask3_fmadd_pd(a, b, c, 0); + assert_eq_m512d(r, c); + let r = _mm512_mask3_fmadd_pd(a, b, c, 0b00001111); + let e = _mm512_setr_pd(1., 2., 3., 4., 2., 2., 2., 2.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fmadd_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_mask_fmadd_pd(a, 0, b, c); + assert_eq_m256d(r, a); + let r = _mm256_mask_fmadd_pd(a, 0b00001111, b, c); + let e = _mm256_set_pd(1., 2., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fmadd_pd() { + let a = 
_mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_maskz_fmadd_pd(0, a, b, c); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_fmadd_pd(0b00001111, a, b, c); + let e = _mm256_set_pd(1., 2., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fmadd_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_mask3_fmadd_pd(a, b, c, 0); + assert_eq_m256d(r, c); + let r = _mm256_mask3_fmadd_pd(a, b, c, 0b00001111); + let e = _mm256_set_pd(1., 2., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fmadd_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_mask_fmadd_pd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fmadd_pd(a, 0b00000011, b, c); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fmadd_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_maskz_fmadd_pd(0, a, b, c); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_fmadd_pd(0b00000011, a, b, c); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fmadd_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_mask3_fmadd_pd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fmadd_pd(a, b, c, 0b00000011); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmsub_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_set1_pd(1.); + let r = _mm512_fmsub_pd(a, b, c); + let e = _mm512_setr_pd(-1., 0., 1., 2., 3., 4., 5., 6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmsub_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_set1_pd(1.); + let r = _mm512_mask_fmsub_pd(a, 0, b, c); + assert_eq_m512d(r, a); + let r = _mm512_mask_fmsub_pd(a, 0b00001111, b, c); + let e = _mm512_setr_pd(-1., 0., 1., 2., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmsub_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_set1_pd(1.); + let r = _mm512_maskz_fmsub_pd(0, a, b, c); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_fmsub_pd(0b00001111, a, b, c); + let e = _mm512_setr_pd(-1., 0., 1., 2., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmsub_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_setr_pd(1., 1., 1., 1., 2., 2., 2., 2.); + let r = _mm512_mask3_fmsub_pd(a, b, c, 0); + assert_eq_m512d(r, c); + let r = _mm512_mask3_fmsub_pd(a, b, c, 0b00001111); + let e = _mm512_setr_pd(-1., 0., 1., 2., 2., 2., 2., 2.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fmsub_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_mask_fmsub_pd(a, 0, b, c); + 
assert_eq_m256d(r, a); + let r = _mm256_mask_fmsub_pd(a, 0b00001111, b, c); + let e = _mm256_set_pd(-1., 0., 1., 2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fmsub_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_maskz_fmsub_pd(0, a, b, c); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_fmsub_pd(0b00001111, a, b, c); + let e = _mm256_set_pd(-1., 0., 1., 2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fmsub_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_mask3_fmsub_pd(a, b, c, 0); + assert_eq_m256d(r, c); + let r = _mm256_mask3_fmsub_pd(a, b, c, 0b00001111); + let e = _mm256_set_pd(-1., 0., 1., 2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fmsub_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_mask_fmsub_pd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fmsub_pd(a, 0b00000011, b, c); + let e = _mm_set_pd(-1., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fmsub_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_maskz_fmsub_pd(0, a, b, c); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_fmsub_pd(0b00000011, a, b, c); + let e = _mm_set_pd(-1., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fmsub_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_mask3_fmsub_pd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fmsub_pd(a, b, c, 0b00000011); + let e = _mm_set_pd(-1., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmaddsub_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_set1_pd(1.); + let r = _mm512_fmaddsub_pd(a, b, c); + let e = _mm512_setr_pd(-1., 2., 1., 4., 3., 6., 5., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmaddsub_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_set1_pd(1.); + let r = _mm512_mask_fmaddsub_pd(a, 0, b, c); + assert_eq_m512d(r, a); + let r = _mm512_mask_fmaddsub_pd(a, 0b00001111, b, c); + let e = _mm512_setr_pd(-1., 2., 1., 4., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmaddsub_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_set1_pd(1.); + let r = _mm512_maskz_fmaddsub_pd(0, a, b, c); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_fmaddsub_pd(0b00001111, a, b, c); + let e = _mm512_setr_pd(-1., 2., 1., 4., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmaddsub_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_setr_pd(1., 1., 1., 1., 2., 2., 2., 2.); + let r = _mm512_mask3_fmaddsub_pd(a, b, c, 0); + assert_eq_m512d(r, c); + let r = _mm512_mask3_fmaddsub_pd(a, b, c, 0b00001111); + let e = _mm512_setr_pd(-1., 2., 1., 
4., 2., 2., 2., 2.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fmaddsub_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_mask_fmaddsub_pd(a, 0, b, c); + assert_eq_m256d(r, a); + let r = _mm256_mask_fmaddsub_pd(a, 0b00001111, b, c); + let e = _mm256_set_pd(1., 0., 3., 2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fmaddsub_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_maskz_fmaddsub_pd(0, a, b, c); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_fmaddsub_pd(0b00001111, a, b, c); + let e = _mm256_set_pd(1., 0., 3., 2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fmaddsub_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_mask3_fmaddsub_pd(a, b, c, 0); + assert_eq_m256d(r, c); + let r = _mm256_mask3_fmaddsub_pd(a, b, c, 0b00001111); + let e = _mm256_set_pd(1., 0., 3., 2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fmaddsub_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_mask_fmaddsub_pd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fmaddsub_pd(a, 0b00000011, b, c); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fmaddsub_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_maskz_fmaddsub_pd(0, a, b, c); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_fmaddsub_pd(0b00000011, a, b, c); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fmaddsub_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_mask3_fmaddsub_pd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fmaddsub_pd(a, b, c, 0b00000011); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmsubadd_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_set1_pd(1.); + let r = _mm512_fmsubadd_pd(a, b, c); + let e = _mm512_setr_pd(1., 0., 3., 2., 5., 4., 7., 6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmsubadd_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_set1_pd(1.); + let r = _mm512_mask_fmsubadd_pd(a, 0, b, c); + assert_eq_m512d(r, a); + let r = _mm512_mask_fmsubadd_pd(a, 0b00001111, b, c); + let e = _mm512_setr_pd(1., 0., 3., 2., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmsubadd_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_set1_pd(1.); + let r = _mm512_maskz_fmsubadd_pd(0, a, b, c); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_fmsubadd_pd(0b00001111, a, b, c); + let e = _mm512_setr_pd(1., 0., 3., 2., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_mask3_fmsubadd_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_setr_pd(1., 1., 1., 1., 2., 2., 2., 2.); + let r = _mm512_mask3_fmsubadd_pd(a, b, c, 0); + assert_eq_m512d(r, c); + let r = _mm512_mask3_fmsubadd_pd(a, b, c, 0b00001111); + let e = _mm512_setr_pd(1., 0., 3., 2., 2., 2., 2., 2.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fmsubadd_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_mask_fmsubadd_pd(a, 0, b, c); + assert_eq_m256d(r, a); + let r = _mm256_mask_fmsubadd_pd(a, 0b00001111, b, c); + let e = _mm256_set_pd(-1., 2., 1., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fmsubadd_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_maskz_fmsubadd_pd(0, a, b, c); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_fmsubadd_pd(0b00001111, a, b, c); + let e = _mm256_set_pd(-1., 2., 1., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fmsubadd_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_mask3_fmsubadd_pd(a, b, c, 0); + assert_eq_m256d(r, c); + let r = _mm256_mask3_fmsubadd_pd(a, b, c, 0b00001111); + let e = _mm256_set_pd(-1., 2., 1., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fmsubadd_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_mask_fmsubadd_pd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fmsubadd_pd(a, 0b00000011, b, c); + let e = _mm_set_pd(-1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fmsubadd_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_maskz_fmsubadd_pd(0, a, b, c); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_fmsubadd_pd(0b00000011, a, b, c); + let e = _mm_set_pd(-1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fmsubadd_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_mask3_fmsubadd_pd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fmsubadd_pd(a, b, c, 0b00000011); + let e = _mm_set_pd(-1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fnmadd_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_set1_pd(1.); + let r = _mm512_fnmadd_pd(a, b, c); + let e = _mm512_setr_pd(1., 0., -1., -2., -3., -4., -5., -6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fnmadd_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_set1_pd(1.); + let r = _mm512_mask_fnmadd_pd(a, 0, b, c); + assert_eq_m512d(r, a); + let r = _mm512_mask_fnmadd_pd(a, 0b00001111, b, c); + let e = _mm512_setr_pd(1., 0., -1., -2., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fnmadd_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 
4., 5., 6., 7.); + let c = _mm512_set1_pd(1.); + let r = _mm512_maskz_fnmadd_pd(0, a, b, c); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_fnmadd_pd(0b00001111, a, b, c); + let e = _mm512_setr_pd(1., 0., -1., -2., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fnmadd_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_setr_pd(1., 1., 1., 1., 2., 2., 2., 2.); + let r = _mm512_mask3_fnmadd_pd(a, b, c, 0); + assert_eq_m512d(r, c); + let r = _mm512_mask3_fnmadd_pd(a, b, c, 0b00001111); + let e = _mm512_setr_pd(1., 0., -1., -2., 2., 2., 2., 2.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fnmadd_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_mask_fnmadd_pd(a, 0, b, c); + assert_eq_m256d(r, a); + let r = _mm256_mask_fnmadd_pd(a, 0b00001111, b, c); + let e = _mm256_set_pd(1., 0., -1., -2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fnmadd_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_maskz_fnmadd_pd(0, a, b, c); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_fnmadd_pd(0b00001111, a, b, c); + let e = _mm256_set_pd(1., 0., -1., -2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fnmadd_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_mask3_fnmadd_pd(a, b, c, 0); + assert_eq_m256d(r, c); + let r = _mm256_mask3_fnmadd_pd(a, b, c, 0b00001111); + let e = _mm256_set_pd(1., 0., -1., -2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fnmadd_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_mask_fnmadd_pd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fnmadd_pd(a, 0b00000011, b, c); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fnmadd_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_maskz_fnmadd_pd(0, a, b, c); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_fnmadd_pd(0b00000011, a, b, c); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fnmadd_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_mask3_fnmadd_pd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fnmadd_pd(a, b, c, 0b00000011); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fnmsub_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_set1_pd(1.); + let r = _mm512_fnmsub_pd(a, b, c); + let e = _mm512_setr_pd(-1., -2., -3., -4., -5., -6., -7., -8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fnmsub_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_set1_pd(1.); + let r = _mm512_mask_fnmsub_pd(a, 0, b, c); + assert_eq_m512d(r, a); + 
let r = _mm512_mask_fnmsub_pd(a, 0b00001111, b, c); + let e = _mm512_setr_pd(-1., -2., -3., -4., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fnmsub_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_set1_pd(1.); + let r = _mm512_maskz_fnmsub_pd(0, a, b, c); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_fnmsub_pd(0b00001111, a, b, c); + let e = _mm512_setr_pd(-1., -2., -3., -4., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fnmsub_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm512_setr_pd(1., 1., 1., 1., 2., 2., 2., 2.); + let r = _mm512_mask3_fnmsub_pd(a, b, c, 0); + assert_eq_m512d(r, c); + let r = _mm512_mask3_fnmsub_pd(a, b, c, 0b00001111); + let e = _mm512_setr_pd(-1., -2., -3., -4., 2., 2., 2., 2.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fnmsub_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_mask_fnmsub_pd(a, 0, b, c); + assert_eq_m256d(r, a); + let r = _mm256_mask_fnmsub_pd(a, 0b00001111, b, c); + let e = _mm256_set_pd(-1., -2., -3., -4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fnmsub_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_maskz_fnmsub_pd(0, a, b, c); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_fnmsub_pd(0b00001111, a, b, c); + let e = _mm256_set_pd(-1., -2., -3., -4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fnmsub_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set_pd(0., 1., 2., 3.); + let c = _mm256_set1_pd(1.); + let r = _mm256_mask3_fnmsub_pd(a, b, c, 0); + assert_eq_m256d(r, c); + let r = _mm256_mask3_fnmsub_pd(a, b, c, 0b00001111); + let e = _mm256_set_pd(-1., -2., -3., -4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fnmsub_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_mask_fnmsub_pd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fnmsub_pd(a, 0b00000011, b, c); + let e = _mm_set_pd(-1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fnmsub_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_maskz_fnmsub_pd(0, a, b, c); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_fnmsub_pd(0b00000011, a, b, c); + let e = _mm_set_pd(-1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fnmsub_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set_pd(0., 1.); + let c = _mm_set1_pd(1.); + let r = _mm_mask3_fnmsub_pd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fnmsub_pd(a, b, c, 0b00000011); + let e = _mm_set_pd(-1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_rcp14_pd() { + let a = _mm512_set1_pd(3.); + let r = _mm512_rcp14_pd(a); + let e = _mm512_set1_pd(0.3333320617675781); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_rcp14_pd() { 
+ let a = _mm512_set1_pd(3.); + let r = _mm512_mask_rcp14_pd(a, 0, a); + assert_eq_m512d(r, a); + let r = _mm512_mask_rcp14_pd(a, 0b11110000, a); + #[rustfmt::skip] + let e = _mm512_setr_pd( + 3., 3., 3., 3., + 0.3333320617675781, 0.3333320617675781, 0.3333320617675781, 0.3333320617675781, + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_rcp14_pd() { + let a = _mm512_set1_pd(3.); + let r = _mm512_maskz_rcp14_pd(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_rcp14_pd(0b11110000, a); + #[rustfmt::skip] + let e = _mm512_setr_pd( + 0., 0., 0., 0., + 0.3333320617675781, 0.3333320617675781, 0.3333320617675781, 0.3333320617675781, + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rcp14_pd() { + let a = _mm256_set1_pd(3.); + let r = _mm256_rcp14_pd(a); + let e = _mm256_set1_pd(0.3333320617675781); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_rcp14_pd() { + let a = _mm256_set1_pd(3.); + let r = _mm256_mask_rcp14_pd(a, 0, a); + assert_eq_m256d(r, a); + let r = _mm256_mask_rcp14_pd(a, 0b00001111, a); + let e = _mm256_set1_pd(0.3333320617675781); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_rcp14_pd() { + let a = _mm256_set1_pd(3.); + let r = _mm256_maskz_rcp14_pd(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_rcp14_pd(0b00001111, a); + let e = _mm256_set1_pd(0.3333320617675781); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rcp14_pd() { + let a = _mm_set1_pd(3.); + let r = _mm_rcp14_pd(a); + let e = _mm_set1_pd(0.3333320617675781); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_rcp14_pd() { + let a = _mm_set1_pd(3.); + let r = _mm_mask_rcp14_pd(a, 0, a); + assert_eq_m128d(r, a); + let r = _mm_mask_rcp14_pd(a, 0b00000011, a); + let e = _mm_set1_pd(0.3333320617675781); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_rcp14_pd() { + let a = _mm_set1_pd(3.); + let r = _mm_maskz_rcp14_pd(0, a); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_rcp14_pd(0b00000011, a); + let e = _mm_set1_pd(0.3333320617675781); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_rsqrt14_pd() { + let a = _mm512_set1_pd(3.); + let r = _mm512_rsqrt14_pd(a); + let e = _mm512_set1_pd(0.5773391723632813); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_rsqrt14_pd() { + let a = _mm512_set1_pd(3.); + let r = _mm512_mask_rsqrt14_pd(a, 0, a); + assert_eq_m512d(r, a); + let r = _mm512_mask_rsqrt14_pd(a, 0b11110000, a); + #[rustfmt::skip] + let e = _mm512_setr_pd( + 3., 3., 3., 3., + 0.5773391723632813, 0.5773391723632813, 0.5773391723632813, 0.5773391723632813, + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_rsqrt14_pd() { + let a = _mm512_set1_pd(3.); + let r = _mm512_maskz_rsqrt14_pd(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_rsqrt14_pd(0b11110000, a); + #[rustfmt::skip] + let e = _mm512_setr_pd( + 0., 0., 0., 0., + 0.5773391723632813, 0.5773391723632813, 0.5773391723632813, 0.5773391723632813, + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rsqrt14_pd() { + let a = _mm256_set1_pd(3.); + 
let r = _mm256_rsqrt14_pd(a); + let e = _mm256_set1_pd(0.5773391723632813); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_rsqrt14_pd() { + let a = _mm256_set1_pd(3.); + let r = _mm256_mask_rsqrt14_pd(a, 0, a); + assert_eq_m256d(r, a); + let r = _mm256_mask_rsqrt14_pd(a, 0b00001111, a); + let e = _mm256_set1_pd(0.5773391723632813); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_rsqrt14_pd() { + let a = _mm256_set1_pd(3.); + let r = _mm256_maskz_rsqrt14_pd(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_rsqrt14_pd(0b00001111, a); + let e = _mm256_set1_pd(0.5773391723632813); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rsqrt14_pd() { + let a = _mm_set1_pd(3.); + let r = _mm_rsqrt14_pd(a); + let e = _mm_set1_pd(0.5773391723632813); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_rsqrt14_pd() { + let a = _mm_set1_pd(3.); + let r = _mm_mask_rsqrt14_pd(a, 0, a); + assert_eq_m128d(r, a); + let r = _mm_mask_rsqrt14_pd(a, 0b00000011, a); + let e = _mm_set1_pd(0.5773391723632813); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_rsqrt14_pd() { + let a = _mm_set1_pd(3.); + let r = _mm_maskz_rsqrt14_pd(0, a); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_rsqrt14_pd(0b00000011, a); + let e = _mm_set1_pd(0.5773391723632813); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_getexp_pd() { + let a = _mm512_set1_pd(3.); + let r = _mm512_getexp_pd(a); + let e = _mm512_set1_pd(1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_getexp_pd() { + let a = _mm512_set1_pd(3.); + let r = _mm512_mask_getexp_pd(a, 0, a); + assert_eq_m512d(r, a); + let r = _mm512_mask_getexp_pd(a, 0b11110000, a); + let e = _mm512_setr_pd(3., 3., 3., 3., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_getexp_pd() { + let a = _mm512_set1_pd(3.); + let r = _mm512_maskz_getexp_pd(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_getexp_pd(0b11110000, a); + let e = _mm512_setr_pd(0., 0., 0., 0., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_getexp_pd() { + let a = _mm256_set1_pd(3.); + let r = _mm256_getexp_pd(a); + let e = _mm256_set1_pd(1.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_getexp_pd() { + let a = _mm256_set1_pd(3.); + let r = _mm256_mask_getexp_pd(a, 0, a); + assert_eq_m256d(r, a); + let r = _mm256_mask_getexp_pd(a, 0b00001111, a); + let e = _mm256_set1_pd(1.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_getexp_pd() { + let a = _mm256_set1_pd(3.); + let r = _mm256_maskz_getexp_pd(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_getexp_pd(0b00001111, a); + let e = _mm256_set1_pd(1.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_getexp_pd() { + let a = _mm_set1_pd(3.); + let r = _mm_getexp_pd(a); + let e = _mm_set1_pd(1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_getexp_pd() { + let a = _mm_set1_pd(3.); + let r = _mm_mask_getexp_pd(a, 0, a); 
+ assert_eq_m128d(r, a); + let r = _mm_mask_getexp_pd(a, 0b00000011, a); + let e = _mm_set1_pd(1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_getexp_pd() { + let a = _mm_set1_pd(3.); + let r = _mm_maskz_getexp_pd(0, a); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_getexp_pd(0b00000011, a); + let e = _mm_set1_pd(1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_roundscale_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_roundscale_pd::<0b00_00_00_00>(a); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_roundscale_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_mask_roundscale_pd::<0b00_00_00_00>(a, 0, a); + let e = _mm512_set1_pd(1.1); + assert_eq_m512d(r, e); + let r = _mm512_mask_roundscale_pd::<0b00_00_00_00>(a, 0b11111111, a); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_roundscale_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_maskz_roundscale_pd::<0b00_00_00_00>(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_roundscale_pd::<0b00_00_00_00>(0b11111111, a); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_roundscale_pd() { + let a = _mm256_set1_pd(1.1); + let r = _mm256_roundscale_pd::<0b00_00_00_00>(a); + let e = _mm256_set1_pd(1.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_roundscale_pd() { + let a = _mm256_set1_pd(1.1); + let r = _mm256_mask_roundscale_pd::<0b00_00_00_00>(a, 0, a); + assert_eq_m256d(r, a); + let r = _mm256_mask_roundscale_pd::<0b00_00_00_00>(a, 0b00001111, a); + let e = _mm256_set1_pd(1.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_roundscale_pd() { + let a = _mm256_set1_pd(1.1); + let r = _mm256_maskz_roundscale_pd::<0b00_00_00_00>(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_roundscale_pd::<0b00_00_00_00>(0b00001111, a); + let e = _mm256_set1_pd(1.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_roundscale_pd() { + let a = _mm_set1_pd(1.1); + let r = _mm_roundscale_pd::<0b00_00_00_00>(a); + let e = _mm_set1_pd(1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_roundscale_pd() { + let a = _mm_set1_pd(1.1); + let r = _mm_mask_roundscale_pd::<0b00_00_00_00>(a, 0, a); + let e = _mm_set1_pd(1.1); + assert_eq_m128d(r, e); + let r = _mm_mask_roundscale_pd::<0b00_00_00_00>(a, 0b00000011, a); + let e = _mm_set1_pd(1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_roundscale_pd() { + let a = _mm_set1_pd(1.1); + let r = _mm_maskz_roundscale_pd::<0b00_00_00_00>(0, a); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_roundscale_pd::<0b00_00_00_00>(0b00000011, a); + let e = _mm_set1_pd(1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_scalef_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = _mm512_scalef_pd(a, b); + let e = _mm512_set1_pd(8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_scalef_pd() { + let a = _mm512_set1_pd(1.); + let b = 
_mm512_set1_pd(3.); + let r = _mm512_mask_scalef_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_scalef_pd(a, 0b11110000, a, b); + let e = _mm512_set_pd(8., 8., 8., 8., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_scalef_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = _mm512_maskz_scalef_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_scalef_pd(0b11110000, a, b); + let e = _mm512_set_pd(8., 8., 8., 8., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_scalef_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set1_pd(3.); + let r = _mm256_scalef_pd(a, b); + let e = _mm256_set1_pd(8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_scalef_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set1_pd(3.); + let r = _mm256_mask_scalef_pd(a, 0, a, b); + assert_eq_m256d(r, a); + let r = _mm256_mask_scalef_pd(a, 0b00001111, a, b); + let e = _mm256_set1_pd(8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_scalef_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set1_pd(3.); + let r = _mm256_maskz_scalef_pd(0, a, b); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_scalef_pd(0b00001111, a, b); + let e = _mm256_set1_pd(8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_scalef_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_scalef_pd(a, b); + let e = _mm_set1_pd(8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_scalef_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_scalef_pd(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_scalef_pd(a, 0b00000011, a, b); + let e = _mm_set1_pd(8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_scalef_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_maskz_scalef_pd(0, a, b); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_scalef_pd(0b00000011, a, b); + let e = _mm_set1_pd(8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fixupimm_pd() { + let a = _mm512_set1_pd(f64::NAN); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_fixupimm_pd::<5>(a, b, c); + let e = _mm512_set1_pd(0.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fixupimm_pd() { + let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_mask_fixupimm_pd::<5>(a, 0b11110000, b, c); + let e = _mm512_set_pd(0., 0., 0., 0., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fixupimm_pd() { + let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_maskz_fixupimm_pd::<5>(0b11110000, a, b, c); + let e = _mm512_set_pd(0., 0., 0., 0., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_fixupimm_pd() { + let a = 
_mm256_set1_pd(f64::NAN); + let b = _mm256_set1_pd(f64::MAX); + let c = _mm256_set1_epi64x(i32::MAX as i64); + let r = _mm256_fixupimm_pd::<5>(a, b, c); + let e = _mm256_set1_pd(0.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fixupimm_pd() { + let a = _mm256_set1_pd(f64::NAN); + let b = _mm256_set1_pd(f64::MAX); + let c = _mm256_set1_epi64x(i32::MAX as i64); + let r = _mm256_mask_fixupimm_pd::<5>(a, 0b00001111, b, c); + let e = _mm256_set1_pd(0.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fixupimm_pd() { + let a = _mm256_set1_pd(f64::NAN); + let b = _mm256_set1_pd(f64::MAX); + let c = _mm256_set1_epi64x(i32::MAX as i64); + let r = _mm256_maskz_fixupimm_pd::<5>(0b00001111, a, b, c); + let e = _mm256_set1_pd(0.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_fixupimm_pd() { + let a = _mm_set1_pd(f64::NAN); + let b = _mm_set1_pd(f64::MAX); + let c = _mm_set1_epi64x(i32::MAX as i64); + let r = _mm_fixupimm_pd::<5>(a, b, c); + let e = _mm_set1_pd(0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fixupimm_pd() { + let a = _mm_set1_pd(f64::NAN); + let b = _mm_set1_pd(f64::MAX); + let c = _mm_set1_epi64x(i32::MAX as i64); + let r = _mm_mask_fixupimm_pd::<5>(a, 0b00000011, b, c); + let e = _mm_set1_pd(0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fixupimm_pd() { + let a = _mm_set1_pd(f64::NAN); + let b = _mm_set1_pd(f64::MAX); + let c = _mm_set1_epi64x(i32::MAX as i64); + let r = _mm_maskz_fixupimm_pd::<5>(0b00000011, a, b, c); + let e = _mm_set1_pd(0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_ternarylogic_epi64() { + let a = _mm512_set1_epi64(1 << 2); + let b = _mm512_set1_epi64(1 << 1); + let c = _mm512_set1_epi64(1 << 0); + let r = _mm512_ternarylogic_epi64::<8>(a, b, c); + let e = _mm512_set1_epi64(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_ternarylogic_epi64() { + let src = _mm512_set1_epi64(1 << 2); + let a = _mm512_set1_epi64(1 << 1); + let b = _mm512_set1_epi64(1 << 0); + let r = _mm512_mask_ternarylogic_epi64::<8>(src, 0, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_ternarylogic_epi64::<8>(src, 0b11111111, a, b); + let e = _mm512_set1_epi64(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_ternarylogic_epi64() { + let a = _mm512_set1_epi64(1 << 2); + let b = _mm512_set1_epi64(1 << 1); + let c = _mm512_set1_epi64(1 << 0); + let r = _mm512_maskz_ternarylogic_epi64::<8>(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_ternarylogic_epi64::<8>(0b11111111, a, b, c); + let e = _mm512_set1_epi64(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_ternarylogic_epi64() { + let a = _mm256_set1_epi64x(1 << 2); + let b = _mm256_set1_epi64x(1 << 1); + let c = _mm256_set1_epi64x(1 << 0); + let r = _mm256_ternarylogic_epi64::<8>(a, b, c); + let e = _mm256_set1_epi64x(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_ternarylogic_epi64() { + let src = _mm256_set1_epi64x(1 << 2); + let a = _mm256_set1_epi64x(1 << 1); + let b = _mm256_set1_epi64x(1 << 0); + let r = _mm256_mask_ternarylogic_epi64::<8>(src, 0, a, b); + 
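// Editor's sketch (illustrative only, not part of the upstream stdarch diff):
// VPTERNLOG treats IMM8 as an 8-entry truth table indexed by
// (bit_of_first_src << 2) | (bit_of_second << 1) | bit_of_third, so IMM8 == 8
// (0b0000_1000) computes !a & b & c. The operands in these tests each have a
// single, distinct bit set, so every lane of the expected result is 0.
// `ternarylogic_model` is an illustrative helper, not a stdarch API.
fn ternarylogic_model(imm8: u8, a: u64, b: u64, c: u64) -> u64 {
    let mut out = 0u64;
    for bit in 0..64 {
        let idx = (((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) | ((c >> bit) & 1);
        out |= ((imm8 as u64 >> idx) & 1) << bit;
    }
    out
}
assert_eq!(ternarylogic_model(8, 1 << 2, 1 << 1, 1 << 0), 0);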
assert_eq_m256i(r, src); + let r = _mm256_mask_ternarylogic_epi64::<8>(src, 0b00001111, a, b); + let e = _mm256_set1_epi64x(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_ternarylogic_epi64() { + let a = _mm256_set1_epi64x(1 << 2); + let b = _mm256_set1_epi64x(1 << 1); + let c = _mm256_set1_epi64x(1 << 0); + let r = _mm256_maskz_ternarylogic_epi64::<9>(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_ternarylogic_epi64::<8>(0b00001111, a, b, c); + let e = _mm256_set1_epi64x(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_ternarylogic_epi64() { + let a = _mm_set1_epi64x(1 << 2); + let b = _mm_set1_epi64x(1 << 1); + let c = _mm_set1_epi64x(1 << 0); + let r = _mm_ternarylogic_epi64::<8>(a, b, c); + let e = _mm_set1_epi64x(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_ternarylogic_epi64() { + let src = _mm_set1_epi64x(1 << 2); + let a = _mm_set1_epi64x(1 << 1); + let b = _mm_set1_epi64x(1 << 0); + let r = _mm_mask_ternarylogic_epi64::<8>(src, 0, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_ternarylogic_epi64::<8>(src, 0b00000011, a, b); + let e = _mm_set1_epi64x(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_ternarylogic_epi64() { + let a = _mm_set1_epi64x(1 << 2); + let b = _mm_set1_epi64x(1 << 1); + let c = _mm_set1_epi64x(1 << 0); + let r = _mm_maskz_ternarylogic_epi64::<9>(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_ternarylogic_epi64::<8>(0b00000011, a, b, c); + let e = _mm_set1_epi64x(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_getmant_pd() { + let a = _mm512_set1_pd(10.); + let r = _mm512_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a); + let e = _mm512_set1_pd(1.25); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_getmant_pd() { + let a = _mm512_set1_pd(10.); + let r = _mm512_mask_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a); + assert_eq_m512d(r, a); + let r = _mm512_mask_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11110000, a); + let e = _mm512_setr_pd(10., 10., 10., 10., 1.25, 1.25, 1.25, 1.25); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_getmant_pd() { + let a = _mm512_set1_pd(10.); + let r = _mm512_maskz_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11110000, a); + let e = _mm512_setr_pd(0., 0., 0., 0., 1.25, 1.25, 1.25, 1.25); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_getmant_pd() { + let a = _mm256_set1_pd(10.); + let r = _mm256_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a); + let e = _mm256_set1_pd(1.25); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_getmant_pd() { + let a = _mm256_set1_pd(10.); + let r = _mm256_mask_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a); + assert_eq_m256d(r, a); + let r = _mm256_mask_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b00001111, a); + let e = _mm256_set1_pd(1.25); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_getmant_pd() { + let a = 
_mm256_set1_pd(10.); + let r = _mm256_maskz_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b00001111, a); + let e = _mm256_set1_pd(1.25); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_getmant_pd() { + let a = _mm_set1_pd(10.); + let r = _mm_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a); + let e = _mm_set1_pd(1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_getmant_pd() { + let a = _mm_set1_pd(10.); + let r = _mm_mask_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a); + assert_eq_m128d(r, a); + let r = _mm_mask_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b00000011, a); + let e = _mm_set1_pd(1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_getmant_pd() { + let a = _mm_set1_pd(10.); + let r = _mm_maskz_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b00000011, a); + let e = _mm_set1_pd(1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtps_pd() { + let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvtps_pd(a); + let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtps_pd() { + let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm512_set1_pd(0.); + let r = _mm512_mask_cvtps_pd(src, 0, a); + assert_eq_m512d(r, src); + let r = _mm512_mask_cvtps_pd(src, 0b00001111, a); + let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtps_pd() { + let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvtps_pd(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_cvtps_pd(0b00001111, a); + let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtpslo_pd() { + let v2 = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 100., 100., 100., 100., 100., 100., 100., 100., + ); + let r = _mm512_cvtpslo_pd(v2); + let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtpslo_pd() { + let v2 = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 100., 100., 100., 100., 100., 100., 100., 100., + ); + let src = _mm512_set1_pd(0.); + let r = _mm512_mask_cvtpslo_pd(src, 0, v2); + assert_eq_m512d(r, src); + let r = _mm512_mask_cvtpslo_pd(src, 0b00001111, v2); + let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtpd_ps() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvtpd_ps(a); + let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtpd_ps() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm256_set1_ps(0.); + let r = _mm512_mask_cvtpd_ps(src, 0, a); 
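// Editor's sketch (illustrative only, not part of the upstream stdarch diff):
// with _MM_MANT_NORM_1_2 the getmant tests above expect the mantissa
// normalized into [1.0, 2.0), i.e. |x| / 2^floor(log2(|x|)); 10.0 = 1.25 * 2^3,
// hence the 1.25 expectation. `scalar_getmant_norm_1_2` is an illustrative
// helper, not a stdarch API.
fn scalar_getmant_norm_1_2(x: f64) -> f64 {
    let m = x.abs();
    m / m.log2().floor().exp2()
}
assert_eq!(scalar_getmant_norm_1_2(10.0), 1.25);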
+ assert_eq_m256(r, src); + let r = _mm512_mask_cvtpd_ps(src, 0b00001111, a); + let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 0., 0., 0., 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtpd_ps() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvtpd_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm512_maskz_cvtpd_ps(0b00001111, a); + let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 0., 0., 0., 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtpd_ps() { + let a = _mm256_set_pd(4., -5.5, 6., -7.5); + let src = _mm_set1_ps(0.); + let r = _mm256_mask_cvtpd_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm256_mask_cvtpd_ps(src, 0b00001111, a); + let e = _mm_set_ps(4., -5.5, 6., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtpd_ps() { + let a = _mm256_set_pd(4., -5.5, 6., -7.5); + let r = _mm256_maskz_cvtpd_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm256_maskz_cvtpd_ps(0b00001111, a); + let e = _mm_set_ps(4., -5.5, 6., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtpd_ps() { + let a = _mm_set_pd(6., -7.5); + let src = _mm_set1_ps(0.); + let r = _mm_mask_cvtpd_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_cvtpd_ps(src, 0b00000011, a); + let e = _mm_set_ps(0., 0., 6., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtpd_ps() { + let a = _mm_set_pd(6., -7.5); + let r = _mm_maskz_cvtpd_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_cvtpd_ps(0b00000011, a); + let e = _mm_set_ps(0., 0., 6., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvtpd_epi32(a); + let e = _mm256_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm256_set1_epi32(0); + let r = _mm512_mask_cvtpd_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtpd_epi32(src, 0b11111111, a); + let e = _mm256_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvtpd_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtpd_epi32(0b11111111, a); + let e = _mm256_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtpd_epi32() { + let a = _mm256_set_pd(4., -5.5, 6., -7.5); + let src = _mm_set1_epi32(0); + let r = _mm256_mask_cvtpd_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtpd_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(4, -6, 6, -8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtpd_epi32() { + let a = _mm256_set_pd(4., -5.5, 6., -7.5); + let r = _mm256_maskz_cvtpd_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtpd_epi32(0b00001111, a); + let e = _mm_set_epi32(4, -6, 6, -8); + 
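// Editor's sketch (illustrative only, not part of the upstream stdarch diff):
// cvtpd_epi32 rounds with the default MXCSR mode, round-to-nearest-even, so
// the halfway inputs in these tests land on the even neighbour (-1.5 -> -2,
// -3.5 -> -4, -5.5 -> -6, -7.5 -> -8). Note also that `_mm_set_epi32` lists
// lanes from the highest index down, while the `setr` constructors list
// lane 0 first. `round_nearest_even` is an illustrative helper, not a
// stdarch API.
fn round_nearest_even(x: f64) -> i32 {
    x.round_ties_even() as i32
}
assert_eq!(round_nearest_even(-5.5), -6);
assert_eq!(round_nearest_even(-7.5), -8);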
assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtpd_epi32() { + let a = _mm_set_pd(6., -7.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvtpd_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtpd_epi32(src, 0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, -8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtpd_epi32() { + let a = _mm_set_pd(6., -7.5); + let r = _mm_maskz_cvtpd_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtpd_epi32(0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, -8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtpd_epu32() { + let a = _mm512_setr_pd(0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5); + let r = _mm512_cvtpd_epu32(a); + let e = _mm256_setr_epi32(0, 2, 2, 4, 4, 6, 6, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtpd_epu32() { + let a = _mm512_setr_pd(0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5); + let src = _mm256_set1_epi32(0); + let r = _mm512_mask_cvtpd_epu32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtpd_epu32(src, 0b11111111, a); + let e = _mm256_setr_epi32(0, 2, 2, 4, 4, 6, 6, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtpd_epu32() { + let a = _mm512_setr_pd(0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5); + let r = _mm512_maskz_cvtpd_epu32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtpd_epu32(0b11111111, a); + let e = _mm256_setr_epi32(0, 2, 2, 4, 4, 6, 6, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtpd_epu32() { + let a = _mm256_set_pd(4., 5.5, 6., 7.5); + let r = _mm256_cvtpd_epu32(a); + let e = _mm_set_epi32(4, 6, 6, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtpd_epu32() { + let a = _mm256_set_pd(4., 5.5, 6., 7.5); + let src = _mm_set1_epi32(0); + let r = _mm256_mask_cvtpd_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtpd_epu32(src, 0b00001111, a); + let e = _mm_set_epi32(4, 6, 6, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtpd_epu32() { + let a = _mm256_set_pd(4., 5.5, 6., 7.5); + let r = _mm256_maskz_cvtpd_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtpd_epu32(0b00001111, a); + let e = _mm_set_epi32(4, 6, 6, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtpd_epu32() { + let a = _mm_set_pd(6., 7.5); + let r = _mm_cvtpd_epu32(a); + let e = _mm_set_epi32(0, 0, 6, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtpd_epu32() { + let a = _mm_set_pd(6., 7.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvtpd_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtpd_epu32(src, 0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtpd_epu32() { + let a = _mm_set_pd(6., 7.5); + let r = _mm_maskz_cvtpd_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtpd_epu32(0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtpd_pslo() { + let v2 
= _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvtpd_pslo(v2); + let e = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtpd_pslo() { + let v2 = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvtpd_pslo(src, 0, v2); + assert_eq_m512(r, src); + let r = _mm512_mask_cvtpd_pslo(src, 0b00001111, v2); + let e = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi8_epi64(a); + let e = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi64(-1); + let r = _mm512_mask_cvtepi8_epi64(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepi8_epi64(src, 0b00001111, a); + let e = _mm512_set_epi64(-1, -1, -1, -1, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi8_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepi8_epi64(0b00001111, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi64x(-1); + let r = _mm256_mask_cvtepi8_epi64(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepi8_epi64(src, 0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepi8_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepi8_epi64(0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi64x(-1); + let r = _mm_mask_cvtepi8_epi64(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi8_epi64(src, 0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepi8_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi8_epi64(0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepu8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu8_epi64(a); + let e = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15); + 
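// Editor's sketch (illustrative only, not part of the upstream stdarch diff):
// the epi8 -> epi64 widenings read only the low 8 byte lanes of the 128-bit
// source. `_mm_set_epi8` lists lanes from the highest index down, so lanes
// 0..=7 hold 15, 14, ..., 8; sign- or zero-extending them gives the 8..15
// pattern asserted here (printed high-to-low by `_mm512_set_epi64`).
let low_lanes: [i8; 8] = [15, 14, 13, 12, 11, 10, 9, 8]; // lane 0 first
let widened: Vec<i64> = low_lanes.iter().map(|&b| i64::from(b)).collect();
assert_eq!(widened, vec![15, 14, 13, 12, 11, 10, 9, 8]);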
assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepu8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi64(-1); + let r = _mm512_mask_cvtepu8_epi64(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepu8_epi64(src, 0b00001111, a); + let e = _mm512_set_epi64(-1, -1, -1, -1, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepu8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepu8_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepu8_epi64(0b00001111, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi64x(-1); + let r = _mm256_mask_cvtepu8_epi64(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu8_epi64(src, 0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepu8_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu8_epi64(0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi64x(-1); + let r = _mm_mask_cvtepu8_epi64(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu8_epi64(src, 0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepu8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepu8_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepu8_epi64(0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi16_epi64(a); + let e = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi64(-1); + let r = _mm512_mask_cvtepi16_epi64(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepi16_epi64(src, 0b00001111, a); + let e = _mm512_set_epi64(-1, -1, -1, -1, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi16_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepi16_epi64(0b00001111, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 
15); + let src = _mm256_set1_epi64x(-1); + let r = _mm256_mask_cvtepi16_epi64(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepi16_epi64(src, 0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepi16_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepi16_epi64(0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi64x(-1); + let r = _mm_mask_cvtepi16_epi64(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi16_epi64(src, 0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepi16_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi16_epi64(0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepu16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu16_epi64(a); + let e = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepu16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi64(-1); + let r = _mm512_mask_cvtepu16_epi64(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepu16_epi64(src, 0b00001111, a); + let e = _mm512_set_epi64(-1, -1, -1, -1, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepu16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepu16_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepu16_epi64(0b00001111, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi64x(-1); + let r = _mm256_mask_cvtepu16_epi64(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu16_epi64(src, 0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepu16_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu16_epi64(0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi64x(-1); + let r = _mm_mask_cvtepu16_epi64(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu16_epi64(src, 0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm_maskz_cvtepu16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepu16_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepu16_epi64(0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi32_epi64() { + let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi32_epi64(a); + let e = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi32_epi64() { + let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi64(-1); + let r = _mm512_mask_cvtepi32_epi64(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepi32_epi64(src, 0b00001111, a); + let e = _mm512_set_epi64(-1, -1, -1, -1, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi32_epi64() { + let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepi32_epi64(0b00001111, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi32_epi64() { + let a = _mm_set_epi32(8, 9, 10, 11); + let src = _mm256_set1_epi64x(-1); + let r = _mm256_mask_cvtepi32_epi64(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepi32_epi64(src, 0b00001111, a); + let e = _mm256_set_epi64x(8, 9, 10, 11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi32_epi64() { + let a = _mm_set_epi32(8, 9, 10, 11); + let r = _mm256_maskz_cvtepi32_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepi32_epi64(0b00001111, a); + let e = _mm256_set_epi64x(8, 9, 10, 11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_epi64() { + let a = _mm_set_epi32(8, 9, 10, 11); + let src = _mm_set1_epi64x(0); + let r = _mm_mask_cvtepi32_epi64(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi32_epi64(src, 0b00000011, a); + let e = _mm_set_epi64x(10, 11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi32_epi64() { + let a = _mm_set_epi32(8, 9, 10, 11); + let r = _mm_maskz_cvtepi32_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi32_epi64(0b00000011, a); + let e = _mm_set_epi64x(10, 11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepu32_epi64() { + let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu32_epi64(a); + let e = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepu32_epi64() { + let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi64(-1); + let r = _mm512_mask_cvtepu32_epi64(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepu32_epi64(src, 0b00001111, a); + let e = _mm512_set_epi64(-1, -1, -1, -1, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepu32_epi64() { + let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let r = 
_mm512_maskz_cvtepu32_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepu32_epi64(0b00001111, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu32_epi64() { + let a = _mm_set_epi32(12, 13, 14, 15); + let src = _mm256_set1_epi64x(-1); + let r = _mm256_mask_cvtepu32_epi64(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu32_epi64(src, 0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu32_epi64() { + let a = _mm_set_epi32(12, 13, 14, 15); + let r = _mm256_maskz_cvtepu32_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu32_epi64(0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu32_epi64() { + let a = _mm_set_epi32(12, 13, 14, 15); + let src = _mm_set1_epi64x(-1); + let r = _mm_mask_cvtepu32_epi64(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu32_epi64(src, 0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepu32_epi64() { + let a = _mm_set_epi32(12, 13, 14, 15); + let r = _mm_maskz_cvtepu32_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepu32_epi64(0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi32_pd() { + let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi32_pd(a); + let e = _mm512_set_pd(8., 9., 10., 11., 12., 13., 14., 15.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi32_pd() { + let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_pd(-1.); + let r = _mm512_mask_cvtepi32_pd(src, 0, a); + assert_eq_m512d(r, src); + let r = _mm512_mask_cvtepi32_pd(src, 0b00001111, a); + let e = _mm512_set_pd(-1., -1., -1., -1., 12., 13., 14., 15.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi32_pd() { + let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_pd(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_cvtepi32_pd(0b00001111, a); + let e = _mm512_set_pd(0., 0., 0., 0., 12., 13., 14., 15.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); + let src = _mm256_set1_pd(-1.); + let r = _mm256_mask_cvtepi32_pd(src, 0, a); + assert_eq_m256d(r, src); + let r = _mm256_mask_cvtepi32_pd(src, 0b00001111, a); + let e = _mm256_set_pd(12., 13., 14., 15.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); + let r = _mm256_maskz_cvtepi32_pd(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_cvtepi32_pd(0b00001111, a); + let e = _mm256_set_pd(12., 13., 14., 15.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); + let src = _mm_set1_pd(-1.); + let r = 
_mm_mask_cvtepi32_pd(src, 0, a); + assert_eq_m128d(r, src); + let r = _mm_mask_cvtepi32_pd(src, 0b00000011, a); + let e = _mm_set_pd(14., 15.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); + let r = _mm_maskz_cvtepi32_pd(0, a); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_cvtepi32_pd(0b00000011, a); + let e = _mm_set_pd(14., 15.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepu32_pd() { + let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu32_pd(a); + let e = _mm512_set_pd(8., 9., 10., 11., 12., 13., 14., 15.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepu32_pd() { + let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_pd(-1.); + let r = _mm512_mask_cvtepu32_pd(src, 0, a); + assert_eq_m512d(r, src); + let r = _mm512_mask_cvtepu32_pd(src, 0b00001111, a); + let e = _mm512_set_pd(-1., -1., -1., -1., 12., 13., 14., 15.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepu32_pd() { + let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepu32_pd(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_cvtepu32_pd(0b00001111, a); + let e = _mm512_set_pd(0., 0., 0., 0., 12., 13., 14., 15.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtepu32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); + let r = _mm256_cvtepu32_pd(a); + let e = _mm256_set_pd(12., 13., 14., 15.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); + let src = _mm256_set1_pd(-1.); + let r = _mm256_mask_cvtepu32_pd(src, 0, a); + assert_eq_m256d(r, src); + let r = _mm256_mask_cvtepu32_pd(src, 0b00001111, a); + let e = _mm256_set_pd(12., 13., 14., 15.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); + let r = _mm256_maskz_cvtepu32_pd(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_cvtepu32_pd(0b00001111, a); + let e = _mm256_set_pd(12., 13., 14., 15.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtepu32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); + let r = _mm_cvtepu32_pd(a); + let e = _mm_set_pd(14., 15.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); + let src = _mm_set1_pd(-1.); + let r = _mm_mask_cvtepu32_pd(src, 0, a); + assert_eq_m128d(r, src); + let r = _mm_mask_cvtepu32_pd(src, 0b00000011, a); + let e = _mm_set_pd(14., 15.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepu32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); + let r = _mm_maskz_cvtepu32_pd(0, a); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_cvtepu32_pd(0b00000011, a); + let e = _mm_set_pd(14., 15.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi32lo_pd() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi32lo_pd(a); + let e = _mm512_set_pd(8., 9., 10., 
11., 12., 13., 14., 15.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi32lo_pd() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_pd(-1.); + let r = _mm512_mask_cvtepi32lo_pd(src, 0, a); + assert_eq_m512d(r, src); + let r = _mm512_mask_cvtepi32lo_pd(src, 0b00001111, a); + let e = _mm512_set_pd(-1., -1., -1., -1., 12., 13., 14., 15.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepu32lo_pd() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu32lo_pd(a); + let e = _mm512_set_pd(8., 9., 10., 11., 12., 13., 14., 15.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepu32lo_pd() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_pd(-1.); + let r = _mm512_mask_cvtepu32lo_pd(src, 0, a); + assert_eq_m512d(r, src); + let r = _mm512_mask_cvtepu32lo_pd(src, 0b00001111, a); + let e = _mm512_set_pd(-1., -1., -1., -1., 12., 13., 14., 15.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi64_epi32() { + let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi64_epi32(a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi64_epi32() { + let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi32(-1); + let r = _mm512_mask_cvtepi64_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtepi64_epi32(src, 0b00001111, a); + let e = _mm256_set_epi32(-1, -1, -1, -1, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi64_epi32() { + let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi64_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtepi64_epi32(0b00001111, a); + let e = _mm256_set_epi32(0, 0, 0, 0, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtepi64_epi32() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_cvtepi64_epi32(a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi64_epi32() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let src = _mm_set1_epi32(0); + let r = _mm256_mask_cvtepi64_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtepi64_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi64_epi32() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_maskz_cvtepi64_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtepi64_epi32(0b00001111, a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtepi64_epi32() { + let a = _mm_set_epi64x(3, 4); + let r = _mm_cvtepi64_epi32(a); + let e = _mm_set_epi32(0, 0, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi64_epi32() { + let a = _mm_set_epi64x(3, 4); + let src = _mm_set1_epi32(0); + let r = 
_mm_mask_cvtepi64_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi64_epi32(src, 0b00000011, a); + let e = _mm_set_epi32(0, 0, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi64_epi32() { + let a = _mm_set_epi64x(3, 4); + let r = _mm_maskz_cvtepi64_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi64_epi32(0b00000011, a); + let e = _mm_set_epi32(0, 0, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi64_epi16() { + let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi64_epi16(a); + let e = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi64_epi16() { + let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi16(-1); + let r = _mm512_mask_cvtepi64_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_cvtepi64_epi16(src, 0b00001111, a); + let e = _mm_set_epi16(-1, -1, -1, -1, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi64_epi16() { + let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi64_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtepi64_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtepi64_epi16() { + let a = _mm256_set_epi64x(12, 13, 14, 15); + let r = _mm256_cvtepi64_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi64_epi16() { + let a = _mm256_set_epi64x(12, 13, 14, 15); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvtepi64_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtepi64_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi64_epi16() { + let a = _mm256_set_epi64x(12, 13, 14, 15); + let r = _mm256_maskz_cvtepi64_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtepi64_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtepi64_epi16() { + let a = _mm_set_epi64x(14, 15); + let r = _mm_cvtepi64_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi64_epi16() { + let a = _mm_set_epi64x(14, 15); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtepi64_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi64_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi64_epi16() { + let a = _mm_set_epi64x(14, 15); + let r = _mm_maskz_cvtepi64_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi64_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi64_epi8() { + let a = 
_mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi64_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi64_epi8() { + let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let r = _mm512_mask_cvtepi64_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_cvtepi64_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi64_epi8() { + let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi64_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtepi64_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtepi64_epi8() { + let a = _mm256_set_epi64x(12, 13, 14, 15); + let r = _mm256_cvtepi64_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi64_epi8() { + let a = _mm256_set_epi64x(12, 13, 14, 15); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtepi64_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtepi64_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi64_epi8() { + let a = _mm256_set_epi64x(12, 13, 14, 15); + let r = _mm256_maskz_cvtepi64_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtepi64_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtepi64_epi8() { + let a = _mm_set_epi64x(14, 15); + let r = _mm_cvtepi64_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi64_epi8() { + let a = _mm_set_epi64x(14, 15); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtepi64_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi64_epi8(src, 0b00000011, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi64_epi8() { + let a = _mm_set_epi64x(14, 15); + let r = _mm_maskz_cvtepi64_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi64_epi8(0b00000011, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtsepi64_epi32() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX); + let r = _mm512_cvtsepi64_epi32(a); + let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, i32::MIN, i32::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi64_epi32() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX); + let src = 
_mm256_set1_epi32(-1); + let r = _mm512_mask_cvtsepi64_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtsepi64_epi32(src, 0b00001111, a); + let e = _mm256_set_epi32(-1, -1, -1, -1, 4, 5, i32::MIN, i32::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtsepi64_epi32() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX); + let r = _mm512_maskz_cvtsepi64_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtsepi64_epi32(0b00001111, a); + let e = _mm256_set_epi32(0, 0, 0, 0, 4, 5, i32::MIN, i32::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtsepi64_epi32() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let r = _mm256_cvtsepi64_epi32(a); + let e = _mm_set_epi32(4, 5, i32::MIN, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi64_epi32() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let src = _mm_set1_epi32(-1); + let r = _mm256_mask_cvtsepi64_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi64_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(4, 5, i32::MIN, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi64_epi32() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let r = _mm256_maskz_cvtsepi64_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtsepi64_epi32(0b00001111, a); + let e = _mm_set_epi32(4, 5, i32::MIN, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtsepi64_epi32() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let r = _mm_cvtsepi64_epi32(a); + let e = _mm_set_epi32(0, 0, i32::MIN, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi64_epi32() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvtsepi64_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi64_epi32(src, 0b00000011, a); + let e = _mm_set_epi32(0, 0, i32::MIN, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi64_epi32() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let r = _mm_maskz_cvtsepi64_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi64_epi32(0b00000011, a); + let e = _mm_set_epi32(0, 0, i32::MIN, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtsepi64_epi16() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX); + let r = _mm512_cvtsepi64_epi16(a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, i16::MIN, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi64_epi16() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX); + let src = _mm_set1_epi16(-1); + let r = _mm512_mask_cvtsepi64_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_cvtsepi64_epi16(src, 0b00001111, a); + let e = _mm_set_epi16(-1, -1, -1, -1, 4, 5, i16::MIN, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtsepi64_epi16() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX); + let r = _mm512_maskz_cvtsepi64_epi16(0, a); + 
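// Editor's sketch (illustrative only, not part of the upstream stdarch diff):
// the `cvtsepi64_*` family narrows with signed saturation (i64::MIN/MAX clamp
// to the signed destination range), while the `cvtusepi64_*` family below
// treats source lanes as unsigned and clamps to the unsigned destination
// range, so i64::MIN (2^63 as u64) becomes u32::MAX, printed as -1 by
// `_mm256_set_epi32`. The two helpers are illustrative, not stdarch APIs.
fn narrow_signed_sat_i16(x: i64) -> i16 {
    x.clamp(i16::MIN as i64, i16::MAX as i64) as i16
}
fn narrow_unsigned_sat_u32(x: i64) -> u32 {
    (x as u64).min(u32::MAX as u64) as u32
}
assert_eq!(narrow_signed_sat_i16(i64::MIN), i16::MIN);
assert_eq!(narrow_signed_sat_i16(i64::MAX), i16::MAX);
assert_eq!(narrow_unsigned_sat_u32(i64::MIN), u32::MAX);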
assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtsepi64_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, i16::MIN, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtsepi64_epi16() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let r = _mm256_cvtsepi64_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, i16::MIN, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi64_epi16() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvtsepi64_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi64_epi16(src, 0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, i16::MIN, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi64_epi16() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let r = _mm256_maskz_cvtsepi64_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtsepi64_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, i16::MIN, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtsepi64_epi16() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let r = _mm_cvtsepi64_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, i16::MIN, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi64_epi16() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtsepi64_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi64_epi16(src, 0b00000011, a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, i16::MIN, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi64_epi16() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let r = _mm_maskz_cvtsepi64_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi64_epi16(0b00000011, a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, i16::MIN, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtsepi64_epi8() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX); + let r = _mm512_cvtsepi64_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, i8::MIN, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi64_epi8() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX); + let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let r = _mm512_mask_cvtsepi64_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_cvtsepi64_epi8(src, 0b00001111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + -1, -1, -1, -1, + 4, 5, i8::MIN, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtsepi64_epi8() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX); + let r = _mm512_maskz_cvtsepi64_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtsepi64_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, i8::MIN, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm256_cvtsepi64_epi8() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let r = _mm256_cvtsepi64_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, i8::MIN, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi64_epi8() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtsepi64_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi64_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, i8::MIN, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi64_epi8() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let r = _mm256_maskz_cvtsepi64_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtsepi64_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, i8::MIN, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtsepi64_epi8() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let r = _mm_cvtsepi64_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi64_epi8() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtsepi64_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi64_epi8(src, 0b00000011, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi64_epi8() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let r = _mm_maskz_cvtsepi64_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi64_epi8(0b00000011, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtusepi64_epi32() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN); + let r = _mm512_cvtusepi64_epi32(a); + let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi64_epi32() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN); + let src = _mm256_set1_epi32(-1); + let r = _mm512_mask_cvtusepi64_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtusepi64_epi32(src, 0b00001111, a); + let e = _mm256_set_epi32(-1, -1, -1, -1, 4, 5, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtusepi64_epi32() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN); + let r = _mm512_maskz_cvtusepi64_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtusepi64_epi32(0b00001111, a); + let e = _mm256_set_epi32(0, 0, 0, 0, 4, 5, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtusepi64_epi32() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let r = _mm256_cvtusepi64_epi32(a); + let e = _mm_set_epi32(4, 5, 6, u32::MAX as i32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi64_epi32() { + let 
a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let src = _mm_set1_epi32(0); + let r = _mm256_mask_cvtusepi64_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtusepi64_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, u32::MAX as i32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtusepi64_epi32() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let r = _mm256_maskz_cvtusepi64_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtusepi64_epi32(0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, u32::MAX as i32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtusepi64_epi32() { + let a = _mm_set_epi64x(6, i64::MAX); + let r = _mm_cvtusepi64_epi32(a); + let e = _mm_set_epi32(0, 0, 6, u32::MAX as i32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi64_epi32() { + let a = _mm_set_epi64x(6, i64::MAX); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvtusepi64_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi64_epi32(src, 0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, u32::MAX as i32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi64_epi32() { + let a = _mm_set_epi64x(6, i64::MAX); + let r = _mm_maskz_cvtusepi64_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi64_epi32(0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, u32::MAX as i32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtusepi64_epi16() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN); + let r = _mm512_cvtusepi64_epi16(a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi64_epi16() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN); + let src = _mm_set1_epi16(-1); + let r = _mm512_mask_cvtusepi64_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_cvtusepi64_epi16(src, 0b00001111, a); + let e = _mm_set_epi16(-1, -1, -1, -1, 4, 5, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtusepi64_epi16() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN); + let r = _mm512_maskz_cvtusepi64_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtusepi64_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtusepi64_epi16() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let r = _mm256_cvtusepi64_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi64_epi16() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvtusepi64_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtusepi64_epi16(src, 0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtusepi64_epi16() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let r = _mm256_maskz_cvtusepi64_epi16(0, a); + 
assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtusepi64_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtusepi64_epi16() { + let a = _mm_set_epi64x(6, i64::MAX); + let r = _mm_cvtusepi64_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 6, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi64_epi16() { + let a = _mm_set_epi64x(6, i64::MAX); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtusepi64_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi64_epi16(src, 0b00000011, a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 6, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi64_epi16() { + let a = _mm_set_epi64x(6, i64::MAX); + let r = _mm_maskz_cvtusepi64_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi64_epi16(0b00000011, a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 6, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtusepi64_epi8() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN); + let r = _mm512_cvtusepi64_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi64_epi8() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN); + let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let r = _mm512_mask_cvtusepi64_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_cvtusepi64_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 4, 5, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtusepi64_epi8() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN); + let r = _mm512_maskz_cvtusepi64_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtusepi64_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtusepi64_epi8() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let r = _mm256_cvtusepi64_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi64_epi8() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtusepi64_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtusepi64_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtusepi64_epi8() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let r = _mm256_maskz_cvtusepi64_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtusepi64_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtusepi64_epi8() { + let a 
= _mm_set_epi64x(6, i64::MAX); + let r = _mm_cvtusepi64_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi64_epi8() { + let a = _mm_set_epi64x(6, i64::MAX); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtusepi64_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi64_epi8(src, 0b00000011, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi64_epi8() { + let a = _mm_set_epi64x(6, i64::MAX); + let r = _mm_maskz_cvtusepi64_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi64_epi8(0b00000011, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtt_roundpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(a); + let e = _mm256_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtt_roundpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm256_set1_epi32(0); + let r = _mm512_mask_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); + let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtt_roundpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(0b00001111, a); + let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtt_roundpd_epu32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(a); + let e = _mm256_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtt_roundpd_epu32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm256_set1_epi32(0); + let r = _mm512_mask_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); + let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtt_roundpd_epu32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(0b00001111, a); + let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvttpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvttpd_epi32(a); + let e = _mm256_setr_epi32(0, -1, 2, -3, 
4, -5, 6, -7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvttpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm256_set1_epi32(0); + let r = _mm512_mask_cvttpd_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvttpd_epi32(src, 0b00001111, a); + let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvttpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvttpd_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvttpd_epi32(0b00001111, a); + let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvttpd_epi32() { + let a = _mm256_setr_pd(4., -5.5, 6., -7.5); + let src = _mm_set1_epi32(0); + let r = _mm256_mask_cvttpd_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvttpd_epi32(src, 0b00001111, a); + let e = _mm_setr_epi32(4, -5, 6, -7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvttpd_epi32() { + let a = _mm256_setr_pd(4., -5.5, 6., -7.5); + let r = _mm256_maskz_cvttpd_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvttpd_epi32(0b00001111, a); + let e = _mm_setr_epi32(4, -5, 6, -7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvttpd_epi32() { + let a = _mm_set_pd(6., -7.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvttpd_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvttpd_epi32(src, 0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, -7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvttpd_epi32() { + let a = _mm_set_pd(6., -7.5); + let r = _mm_maskz_cvttpd_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvttpd_epi32(0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, -7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvttpd_epu32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvttpd_epu32(a); + let e = _mm256_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvttpd_epu32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm256_set1_epi32(0); + let r = _mm512_mask_cvttpd_epu32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvttpd_epu32(src, 0b00001111, a); + let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvttpd_epu32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvttpd_epu32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvttpd_epu32(0b00001111, a); + let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvttpd_epu32() { + let a = _mm256_set_pd(4., 5.5, 6., 7.5); + let r = _mm256_cvttpd_epu32(a); + let e = _mm_set_epi32(4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvttpd_epu32() { + let a = 
_mm256_set_pd(4., 5.5, 6., 7.5); + let src = _mm_set1_epi32(0); + let r = _mm256_mask_cvttpd_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvttpd_epu32(src, 0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvttpd_epu32() { + let a = _mm256_set_pd(4., 5.5, 6., 7.5); + let r = _mm256_maskz_cvttpd_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvttpd_epu32(0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvttpd_epu32() { + let a = _mm_set_pd(6., 7.5); + let r = _mm_cvttpd_epu32(a); + let e = _mm_set_epi32(0, 0, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvttpd_epu32() { + let a = _mm_set_pd(6., 7.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvttpd_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvttpd_epu32(src, 0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvttpd_epu32() { + let a = _mm_set_pd(6., 7.5); + let r = _mm_maskz_cvttpd_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvttpd_epu32(0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_add_round_pd() { + let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007); + let b = _mm512_set1_pd(-1.); + let r = _mm512_add_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_setr_pd(7., 8.5, 9., 10.5, 11., 12.5, 13., -1.0); + assert_eq_m512d(r, e); + let r = _mm512_add_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_setr_pd(7., 8.5, 9., 10.5, 11., 12.5, 13., -0.9999999999999999); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_add_round_pd() { + let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007); + let b = _mm512_set1_pd(-1.); + let r = _mm512_mask_add_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + assert_eq_m512d(r, a); + let r = _mm512_mask_add_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11110000, a, b, + ); + let e = _mm512_setr_pd(8., 9.5, 10., 11.5, 11., 12.5, 13., -1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_add_round_pd() { + let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007); + let b = _mm512_set1_pd(-1.); + let r = + _mm512_maskz_add_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_add_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11110000, a, b, + ); + let e = _mm512_setr_pd(0., 0., 0., 0., 11., 12.5, 13., -1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sub_round_pd() { + let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007); + let b = _mm512_set1_pd(1.); + let r = _mm512_sub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_setr_pd(7., 8.5, 9., 10.5, 11., 12.5, 13., -1.0); + assert_eq_m512d(r, e); + let r = _mm512_sub_round_pd::<{ 
_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_setr_pd(7., 8.5, 9., 10.5, 11., 12.5, 13., -0.9999999999999999); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sub_round_pd() { + let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007); + let b = _mm512_set1_pd(1.); + let r = _mm512_mask_sub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + assert_eq_m512d(r, a); + let r = _mm512_mask_sub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11110000, a, b, + ); + let e = _mm512_setr_pd(8., 9.5, 10., 11.5, 11., 12.5, 13., -1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sub_round_pd() { + let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007); + let b = _mm512_set1_pd(1.); + let r = + _mm512_maskz_sub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_sub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11110000, a, b, + ); + let e = _mm512_setr_pd(0., 0., 0., 0., 11., 12.5, 13., -1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mul_round_pd() { + let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.); + let b = _mm512_set1_pd(0.1); + let r = _mm512_mul_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_setr_pd( + 0.8, + 0.9500000000000001, + 1., + 1.1500000000000001, + 1.2000000000000002, + 1.35, + 1.4000000000000001, + 0., + ); + assert_eq_m512d(r, e); + let r = _mm512_mul_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_setr_pd(0.8, 0.95, 1.0, 1.15, 1.2, 1.3499999999999999, 1.4, 0.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mul_round_pd() { + let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.); + let b = _mm512_set1_pd(0.1); + let r = _mm512_mask_mul_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + assert_eq_m512d(r, a); + let r = _mm512_mask_mul_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11110000, a, b, + ); + let e = _mm512_setr_pd( + 8., + 9.5, + 10., + 11.5, + 1.2000000000000002, + 1.35, + 1.4000000000000001, + 0., + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mul_round_pd() { + let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.); + let b = _mm512_set1_pd(0.1); + let r = + _mm512_maskz_mul_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_mul_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11110000, a, b, + ); + let e = _mm512_setr_pd( + 0., + 0., + 0., + 0., + 1.2000000000000002, + 1.35, + 1.4000000000000001, + 0., + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_div_round_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = _mm512_div_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_pd(0.3333333333333333); + assert_eq_m512d(r, e); + let r = _mm512_div_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_pd(0.3333333333333333); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_mask_div_round_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = _mm512_mask_div_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + assert_eq_m512d(r, a); + let r = _mm512_mask_div_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11110000, a, b, + ); + let e = _mm512_setr_pd( + 1., + 1., + 1., + 1., + 0.3333333333333333, + 0.3333333333333333, + 0.3333333333333333, + 0.3333333333333333, + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_div_round_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = + _mm512_maskz_div_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_div_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11110000, a, b, + ); + let e = _mm512_setr_pd( + 0., + 0., + 0., + 0., + 0.3333333333333333, + 0.3333333333333333, + 0.3333333333333333, + 0.3333333333333333, + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sqrt_round_pd() { + let a = _mm512_set1_pd(3.); + let r = _mm512_sqrt_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set1_pd(1.7320508075688772); + assert_eq_m512d(r, e); + let r = _mm512_sqrt_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set1_pd(1.7320508075688774); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sqrt_round_pd() { + let a = _mm512_set1_pd(3.); + let r = + _mm512_mask_sqrt_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 0, a); + assert_eq_m512d(r, a); + let r = _mm512_mask_sqrt_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11110000, a, + ); + let e = _mm512_setr_pd( + 3., + 3., + 3., + 3., + 1.7320508075688772, + 1.7320508075688772, + 1.7320508075688772, + 1.7320508075688772, + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sqrt_round_pd() { + let a = _mm512_set1_pd(3.); + let r = + _mm512_maskz_sqrt_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_sqrt_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11110000, a, + ); + let e = _mm512_setr_pd( + 0., + 0., + 0., + 0., + 1.7320508075688772, + 1.7320508075688772, + 1.7320508075688772, + 1.7320508075688772, + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmadd_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = _mm512_fmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_pd(-1.); + assert_eq_m512d(r, e); + let r = _mm512_fmadd_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_pd(-0.9999999999999999); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmadd_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = _mm512_mask_fmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512d(r, a); + let r = _mm512_mask_fmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b00001111, b, c, + ); + let e = 
_mm512_setr_pd( + -1., + -1., + -1., + -1., + 0.000000000000000007, + 0.000000000000000007, + 0.000000000000000007, + 0.000000000000000007, + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmadd_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = _mm512_maskz_fmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_fmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00001111, a, b, c, + ); + let e = _mm512_setr_pd(-1., -1., -1., -1., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmadd_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = _mm512_mask3_fmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512d(r, c); + let r = _mm512_mask3_fmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b00001111, + ); + let e = _mm512_setr_pd(-1., -1., -1., -1., -1., -1., -1., -1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmsub_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(1.); + let r = _mm512_fmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_pd(-1.); + assert_eq_m512d(r, e); + let r = _mm512_fmsub_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_pd(-0.9999999999999999); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmsub_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(1.); + let r = _mm512_mask_fmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512d(r, a); + let r = _mm512_mask_fmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b00001111, b, c, + ); + let e = _mm512_setr_pd( + -1., + -1., + -1., + -1., + 0.000000000000000007, + 0.000000000000000007, + 0.000000000000000007, + 0.000000000000000007, + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmsub_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(1.); + let r = _mm512_maskz_fmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_fmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00001111, a, b, c, + ); + let e = _mm512_setr_pd(-1., -1., -1., -1., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmsub_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(1.); + let r = _mm512_mask3_fmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512d(r, c); + let r = _mm512_mask3_fmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b00001111, + ); + let e = _mm512_setr_pd(-1., -1., -1., -1., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe 
fn test_mm512_fmaddsub_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = + _mm512_fmaddsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_setr_pd(1., -1., 1., -1., 1., -1., 1., -1.); + assert_eq_m512d(r, e); + let r = _mm512_fmaddsub_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_setr_pd( + 1., + -0.9999999999999999, + 1., + -0.9999999999999999, + 1., + -0.9999999999999999, + 1., + -0.9999999999999999, + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmaddsub_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = _mm512_mask_fmaddsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512d(r, a); + let r = _mm512_mask_fmaddsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b00001111, b, c, + ); + let e = _mm512_setr_pd( + 1., + -1., + 1., + -1., + 0.000000000000000007, + 0.000000000000000007, + 0.000000000000000007, + 0.000000000000000007, + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmaddsub_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = _mm512_maskz_fmaddsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_fmaddsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00001111, a, b, c, + ); + let e = _mm512_setr_pd(1., -1., 1., -1., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmaddsub_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = _mm512_mask3_fmaddsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512d(r, c); + let r = _mm512_mask3_fmaddsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b00001111, + ); + let e = _mm512_setr_pd(1., -1., 1., -1., -1., -1., -1., -1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmsubadd_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = + _mm512_fmsubadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_setr_pd(-1., 1., -1., 1., -1., 1., -1., 1.); + assert_eq_m512d(r, e); + let r = _mm512_fmsubadd_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_setr_pd( + -0.9999999999999999, + 1., + -0.9999999999999999, + 1., + -0.9999999999999999, + 1., + -0.9999999999999999, + 1., + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmsubadd_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = _mm512_mask_fmsubadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512d(r, a); + let r = _mm512_mask_fmsubadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b00001111, b, c, + ); + let e = _mm512_setr_pd( + -1., + 1., + -1., + 1., + 0.000000000000000007, + 
0.000000000000000007, + 0.000000000000000007, + 0.000000000000000007, + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmsubadd_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = _mm512_maskz_fmsubadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_fmsubadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00001111, a, b, c, + ); + let e = _mm512_setr_pd(-1., 1., -1., 1., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmsubadd_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = _mm512_mask3_fmsubadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512d(r, c); + let r = _mm512_mask3_fmsubadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b00001111, + ); + let e = _mm512_setr_pd(-1., 1., -1., 1., -1., -1., -1., -1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fnmadd_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(1.); + let r = + _mm512_fnmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_pd(1.); + assert_eq_m512d(r, e); + let r = _mm512_fnmadd_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_pd(0.9999999999999999); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fnmadd_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(1.); + let r = _mm512_mask_fnmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512d(r, a); + let r = _mm512_mask_fnmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b00001111, b, c, + ); + let e = _mm512_setr_pd( + 1., + 1., + 1., + 1., + 0.000000000000000007, + 0.000000000000000007, + 0.000000000000000007, + 0.000000000000000007, + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fnmadd_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(1.); + let r = _mm512_maskz_fnmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_fnmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00001111, a, b, c, + ); + let e = _mm512_setr_pd(1., 1., 1., 1., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fnmadd_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(1.); + let r = _mm512_mask3_fnmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512d(r, c); + let r = _mm512_mask3_fnmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b00001111, + ); + let e = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fnmsub_round_pd() { + let a = 
_mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = + _mm512_fnmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_pd(1.); + assert_eq_m512d(r, e); + let r = _mm512_fnmsub_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_pd(0.9999999999999999); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fnmsub_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = _mm512_mask_fnmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512d(r, a); + let r = _mm512_mask_fnmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b00001111, b, c, + ); + let e = _mm512_setr_pd( + 1., + 1., + 1., + 1., + 0.000000000000000007, + 0.000000000000000007, + 0.000000000000000007, + 0.000000000000000007, + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fnmsub_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = _mm512_maskz_fnmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_fnmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00001111, a, b, c, + ); + let e = _mm512_setr_pd(1., 1., 1., 1., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fnmsub_round_pd() { + let a = _mm512_set1_pd(0.000000000000000007); + let b = _mm512_set1_pd(1.); + let c = _mm512_set1_pd(-1.); + let r = _mm512_mask3_fnmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512d(r, c); + let r = _mm512_mask3_fnmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b00001111, + ); + let e = _mm512_setr_pd(1., 1., 1., 1., -1., -1., -1., -1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_max_round_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm512_max_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm512_setr_pd(7., 6., 5., 4., 4., 5., 6., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_round_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm512_mask_max_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_max_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, 0b00001111, a, b); + let e = _mm512_setr_pd(7., 6., 5., 4., 4., 5., 6., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_round_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm512_maskz_max_round_pd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_max_round_pd::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a, b); + let e = _mm512_setr_pd(7., 6., 5., 4., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_min_round_pd() { + let a = 
_mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm512_min_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm512_setr_pd(0., 1., 2., 3., 3., 2., 1., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_round_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm512_mask_min_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_min_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, 0b00001111, a, b); + let e = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_round_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm512_maskz_min_round_pd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_min_round_pd::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a, b); + let e = _mm512_setr_pd(0., 1., 2., 3., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_getexp_round_pd() { + let a = _mm512_set1_pd(3.); + let r = _mm512_getexp_round_pd::<_MM_FROUND_CUR_DIRECTION>(a); + let e = _mm512_set1_pd(1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_getexp_round_pd() { + let a = _mm512_set1_pd(3.); + let r = _mm512_mask_getexp_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a); + assert_eq_m512d(r, a); + let r = _mm512_mask_getexp_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11110000, a); + let e = _mm512_setr_pd(3., 3., 3., 3., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_getexp_round_pd() { + let a = _mm512_set1_pd(3.); + let r = _mm512_maskz_getexp_round_pd::<_MM_FROUND_CUR_DIRECTION>(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_getexp_round_pd::<_MM_FROUND_CUR_DIRECTION>(0b11110000, a); + let e = _mm512_setr_pd(0., 0., 0., 0., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_roundscale_round_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_roundscale_round_pd::<0, _MM_FROUND_CUR_DIRECTION>(a); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_roundscale_round_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_mask_roundscale_round_pd::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a); + let e = _mm512_set1_pd(1.1); + assert_eq_m512d(r, e); + let r = _mm512_mask_roundscale_round_pd::<0, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_roundscale_round_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_maskz_roundscale_round_pd::<0, _MM_FROUND_CUR_DIRECTION>(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_roundscale_round_pd::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111, a); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_scalef_round_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = _mm512_scalef_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = 
_mm512_set1_pd(8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_scalef_round_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = _mm512_mask_scalef_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + assert_eq_m512d(r, a); + let r = _mm512_mask_scalef_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11110000, a, b, + ); + let e = _mm512_set_pd(8., 8., 8., 8., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_scalef_round_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = _mm512_maskz_scalef_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, + ); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_scalef_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11110000, a, b, + ); + let e = _mm512_set_pd(8., 8., 8., 8., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fixupimm_round_pd() { + let a = _mm512_set1_pd(f64::NAN); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_fixupimm_round_pd::<5, _MM_FROUND_CUR_DIRECTION>(a, b, c); + let e = _mm512_set1_pd(0.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fixupimm_round_pd() { + let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_mask_fixupimm_round_pd::<5, _MM_FROUND_CUR_DIRECTION>(a, 0b11110000, b, c); + let e = _mm512_set_pd(0., 0., 0., 0., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fixupimm_round_pd() { + let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_maskz_fixupimm_round_pd::<5, _MM_FROUND_CUR_DIRECTION>(0b11110000, a, b, c); + let e = _mm512_set_pd(0., 0., 0., 0., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_getmant_round_pd() { + let a = _mm512_set1_pd(10.); + let r = _mm512_getmant_round_pd::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a); + let e = _mm512_set1_pd(1.25); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_getmant_round_pd() { + let a = _mm512_set1_pd(10.); + let r = _mm512_mask_getmant_round_pd::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a, 0, a); + assert_eq_m512d(r, a); + let r = _mm512_mask_getmant_round_pd::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a, 0b11110000, a); + let e = _mm512_setr_pd(10., 10., 10., 10., 1.25, 1.25, 1.25, 1.25); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_getmant_round_pd() { + let a = _mm512_set1_pd(10.); + let r = _mm512_maskz_getmant_round_pd::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_getmant_round_pd::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(0b11110000, a); + let e = _mm512_setr_pd(0., 0., 0., 0., 1.25, 1.25, 1.25, 1.25); + assert_eq_m512d(r, e); + } + + 
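// Editor's note -- an illustrative sketch, not part of the upstream diff: every
// `_mask_*` test above merges the computed result against `src` and every
// `_maskz_*` test merges against zero, lane by lane, under the write mask. The
// plain-Rust model below (hypothetical helper names `mask_merge` / `maskz_merge`,
// fixed here to 8-lane f64 vectors in `setr` lane order) shows the selection rule
// these assertions rely on.
fn mask_merge(src: &[f64; 8], computed: &[f64; 8], mask: u8) -> [f64; 8] {
    // Lane i takes the computed result when bit i of the mask is set,
    // otherwise it keeps the corresponding lane of `src`.
    let mut out = *src;
    for i in 0..8 {
        if mask & (1 << i) != 0 {
            out[i] = computed[i];
        }
    }
    out
}

fn maskz_merge(computed: &[f64; 8], mask: u8) -> [f64; 8] {
    // The `maskz` form is the same selection with an implicit all-zero source,
    // which is why the mask == 0 case is compared against `_mm512_setzero_pd()`.
    mask_merge(&[0.0; 8], computed, mask)
}
// For example, mask_merge(&[1.0; 8], &[8.0; 8], 0b11110000) yields
// [1., 1., 1., 1., 8., 8., 8., 8.], i.e. the lane-order view of the expected
// value in test_mm512_mask_scalef_round_pd above.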
#[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundps_pd() { + let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(a); + let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundps_pd() { + let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm512_set1_pd(0.); + let r = _mm512_mask_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(src, 0, a); + assert_eq_m512d(r, src); + let r = _mm512_mask_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a); + let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundps_pd() { + let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a); + let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundpd_ps() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(a); + let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundpd_ps() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm256_set1_ps(0.); + let r = _mm512_mask_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(src, 0, a); + assert_eq_m256(r, src); + let r = _mm512_mask_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a); + let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 0., 0., 0., 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundpd_ps() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm512_maskz_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a); + let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 0., 0., 0., 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(a); + let e = _mm256_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm256_set1_epi32(0); + let r = _mm512_mask_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a); + let e = _mm256_setr_epi32(0, -2, 2, -4, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a); + let e = 
_mm256_setr_epi32(0, -2, 2, -4, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundpd_epu32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(a); + let e = _mm256_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundpd_epu32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm256_set1_epi32(0); + let r = _mm512_mask_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a); + let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundpd_epu32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a); + let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_pd() { + assert_eq_m512d(_mm512_setzero_pd(), _mm512_set1_pd(0.)); + } + + unsafe fn test_mm512_set1_epi64() { + let r = _mm512_set_epi64(2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, _mm512_set1_epi64(2)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_pd() { + let expected = _mm512_set_pd(2., 2., 2., 2., 2., 2., 2., 2.); + assert_eq_m512d(expected, _mm512_set1_pd(2.)); + } + + unsafe fn test_mm512_set4_epi64() { + let r = _mm512_set_epi64(4, 3, 2, 1, 4, 3, 2, 1); + assert_eq_m512i(r, _mm512_set4_epi64(4, 3, 2, 1)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set4_pd() { + let r = _mm512_set_pd(4., 3., 2., 1., 4., 3., 2., 1.); + assert_eq_m512d(r, _mm512_set4_pd(4., 3., 2., 1.)); + } + + unsafe fn test_mm512_setr4_epi64() { + let r = _mm512_set_epi64(4, 3, 2, 1, 4, 3, 2, 1); + assert_eq_m512i(r, _mm512_setr4_epi64(1, 2, 3, 4)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr4_pd() { + let r = _mm512_set_pd(4., 3., 2., 1., 4., 3., 2., 1.); + assert_eq_m512d(r, _mm512_setr4_pd(1., 2., 3., 4.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let m = _mm512_cmplt_pd_mask(a, b); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01100110; + let r = _mm512_mask_cmplt_pd_mask(mask, a, b); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpnlt_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + assert_eq!(_mm512_cmpnlt_pd_mask(a, b), !_mm512_cmplt_pd_mask(a, b)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpnlt_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 
0b01111010; + assert_eq!(_mm512_mask_cmpnlt_pd_mask(mask, a, b), 0b01111010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + assert_eq!(_mm512_cmple_pd_mask(a, b), 0b00100101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01111010; + assert_eq!(_mm512_mask_cmple_pd_mask(mask, a, b), 0b00100000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpnle_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let m = _mm512_cmpnle_pd_mask(b, a); + assert_eq!(m, 0b00001101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpnle_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01100110; + let r = _mm512_mask_cmpnle_pd_mask(mask, b, a); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_pd_mask() { + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.); + let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.); + let m = _mm512_cmpeq_pd_mask(b, a); + assert_eq!(m, 0b11001101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_pd_mask() { + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.); + let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.); + let mask = 0b01111010; + let r = _mm512_mask_cmpeq_pd_mask(mask, b, a); + assert_eq!(r, 0b01001000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_pd_mask() { + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.); + let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.); + let m = _mm512_cmpneq_pd_mask(b, a); + assert_eq!(m, 0b00110010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_pd_mask() { + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.); + let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.); + let mask = 0b01111010; + let r = _mm512_mask_cmpneq_pd_mask(mask, b, a); + assert_eq!(r, 0b00110010) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_pd_mask() { + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let m = _mm512_cmp_pd_mask::<_CMP_LT_OQ>(a, b); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_pd_mask() { + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01100110; + let r = _mm512_mask_cmp_pd_mask::<_CMP_LT_OQ>(mask, a, b); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmp_pd_mask() { + let a = _mm256_set_pd(0., 1., -1., 13.); + let b = _mm256_set1_pd(1.); + let m = _mm256_cmp_pd_mask::<_CMP_LT_OQ>(a, b); + assert_eq!(m, 0b00001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmp_pd_mask() { + let a = _mm256_set_pd(0., 1., -1., 13.); + let b = 
_mm256_set1_pd(1.); + let mask = 0b11111111; + let r = _mm256_mask_cmp_pd_mask::<_CMP_LT_OQ>(mask, a, b); + assert_eq!(r, 0b00001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmp_pd_mask() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set1_pd(1.); + let m = _mm_cmp_pd_mask::<_CMP_LT_OQ>(a, b); + assert_eq!(m, 0b00000010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmp_pd_mask() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set1_pd(1.); + let mask = 0b11111111; + let r = _mm_mask_cmp_pd_mask::<_CMP_LT_OQ>(mask, a, b); + assert_eq!(r, 0b00000010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_round_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let m = _mm512_cmp_round_pd_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(a, b); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_round_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01100110; + let r = _mm512_mask_cmp_round_pd_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(mask, a, b); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpord_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.); + #[rustfmt::skip] + let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.); + let m = _mm512_cmpord_pd_mask(a, b); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpord_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.); + #[rustfmt::skip] + let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.); + let mask = 0b11000011; + let m = _mm512_mask_cmpord_pd_mask(mask, a, b); + assert_eq!(m, 0b00000001); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpunord_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.); + #[rustfmt::skip] + let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.); + let m = _mm512_cmpunord_pd_mask(a, b); + + assert_eq!(m, 0b11111010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpunord_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.); + #[rustfmt::skip] + let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.); + let mask = 0b00001111; + let m = _mm512_mask_cmpunord_pd_mask(mask, a, b); + assert_eq!(m, 0b000001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let m = _mm512_cmplt_epu64_mask(a, b); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + let r = _mm512_mask_cmplt_epu64_mask(mask, a, b); + assert_eq!(r, 0b01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm256_cmplt_epu64_mask() { + let a = _mm256_set_epi64x(0, 1, 2, 100); + let b = _mm256_set1_epi64x(2); + let r = _mm256_cmplt_epu64_mask(a, b); + assert_eq!(r, 0b00001100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmplt_epu64_mask() { + let a = _mm256_set_epi64x(0, 1, 2, 100); + let b = _mm256_set1_epi64x(2); + let mask = 0b11111111; + let r = _mm256_mask_cmplt_epu64_mask(mask, a, b); + assert_eq!(r, 0b00001100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmplt_epu64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set1_epi64x(2); + let r = _mm_cmplt_epu64_mask(a, b); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmplt_epu64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set1_epi64x(2); + let mask = 0b11111111; + let r = _mm_mask_cmplt_epu64_mask(mask, a, b); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpgt_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let m = _mm512_cmpgt_epu64_mask(b, a); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpgt_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + let r = _mm512_mask_cmpgt_epu64_mask(mask, b, a); + assert_eq!(r, 0b01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpgt_epu64_mask() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let b = _mm256_set1_epi64x(1); + let r = _mm256_cmpgt_epu64_mask(a, b); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epu64_mask() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let b = _mm256_set1_epi64x(1); + let mask = 0b11111111; + let r = _mm256_mask_cmpgt_epu64_mask(mask, a, b); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpgt_epu64_mask() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set1_epi64x(1); + let r = _mm_cmpgt_epu64_mask(a, b); + assert_eq!(r, 0b00000001); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpgt_epu64_mask() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set1_epi64x(1); + let mask = 0b11111111; + let r = _mm_mask_cmpgt_epu64_mask(mask, a, b); + assert_eq!(r, 0b00000001); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + assert_eq!( + _mm512_cmple_epu64_mask(a, b), + !_mm512_cmpgt_epu64_mask(a, b) + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + assert_eq!(_mm512_mask_cmple_epu64_mask(mask, a, b), 0b01111010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmple_epu64_mask() { + let a = _mm256_set_epi64x(0, 1, 2, 1); + let b = _mm256_set1_epi64x(1); + let r = _mm256_cmple_epu64_mask(a, b); + assert_eq!(r, 0b00001101) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmple_epu64_mask() { + let a = _mm256_set_epi64x(0, 1, 2, 1); + let b = _mm256_set1_epi64x(1); + let mask = 0b11111111; + 
let r = _mm256_mask_cmple_epu64_mask(mask, a, b); + assert_eq!(r, 0b00001101) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmple_epu64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set1_epi64x(1); + let r = _mm_cmple_epu64_mask(a, b); + assert_eq!(r, 0b00000011) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmple_epu64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set1_epi64x(1); + let mask = 0b11111111; + let r = _mm_mask_cmple_epu64_mask(mask, a, b); + assert_eq!(r, 0b00000011) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpge_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + assert_eq!( + _mm512_cmpge_epu64_mask(a, b), + !_mm512_cmplt_epu64_mask(a, b) + ); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpge_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b11111111; + let r = _mm512_mask_cmpge_epu64_mask(mask, a, b); + assert_eq!(r, 0b00110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpge_epu64_mask() { + let a = _mm256_set_epi64x(0, 1, 2, u64::MAX as i64); + let b = _mm256_set1_epi64x(1); + let r = _mm256_cmpge_epu64_mask(a, b); + assert_eq!(r, 0b00000111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epu64_mask() { + let a = _mm256_set_epi64x(0, 1, 2, u64::MAX as i64); + let b = _mm256_set1_epi64x(1); + let mask = 0b11111111; + let r = _mm256_mask_cmpge_epu64_mask(mask, a, b); + assert_eq!(r, 0b00000111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpge_epu64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set1_epi64x(1); + let r = _mm_cmpge_epu64_mask(a, b); + assert_eq!(r, 0b00000001); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpge_epu64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set1_epi64x(1); + let mask = 0b11111111; + let r = _mm_mask_cmpge_epu64_mask(mask, a, b); + assert_eq!(r, 0b00000001); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let m = _mm512_cmpeq_epu64_mask(b, a); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm512_mask_cmpeq_epu64_mask(mask, b, a); + assert_eq!(r, 0b01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpeq_epu64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, u64::MAX as i64); + let b = _mm256_set_epi64x(0, 1, 13, 42); + let m = _mm256_cmpeq_epu64_mask(b, a); + assert_eq!(m, 0b00001100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epu64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, u64::MAX as i64); + let b = _mm256_set_epi64x(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm256_mask_cmpeq_epu64_mask(mask, b, a); + assert_eq!(r, 0b00001100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpeq_epu64_mask() { + let a = _mm_set_epi64x(0, 1); + let b 
= _mm_set_epi64x(0, 1); + let m = _mm_cmpeq_epu64_mask(b, a); + assert_eq!(m, 0b00000011); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epu64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set_epi64x(0, 1); + let mask = 0b11111111; + let r = _mm_mask_cmpeq_epu64_mask(mask, b, a); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let m = _mm512_cmpneq_epu64_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epu64_mask(b, a)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, -100, 100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm512_mask_cmpneq_epu64_mask(mask, b, a); + assert_eq!(r, 0b00110010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpneq_epu64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, u64::MAX as i64); + let b = _mm256_set_epi64x(0, 1, 13, 42); + let r = _mm256_cmpneq_epu64_mask(b, a); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epu64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, u64::MAX as i64); + let b = _mm256_set_epi64x(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm256_mask_cmpneq_epu64_mask(mask, b, a); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpneq_epu64_mask() { + let a = _mm_set_epi64x(-1, u64::MAX as i64); + let b = _mm_set_epi64x(13, 42); + let r = _mm_cmpneq_epu64_mask(b, a); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpneq_epu64_mask() { + let a = _mm_set_epi64x(-1, u64::MAX as i64); + let b = _mm_set_epi64x(13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpneq_epu64_mask(mask, b, a); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let m = _mm512_cmp_epu64_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + let r = _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmp_epu64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, 100); + let b = _mm256_set1_epi64x(1); + let m = _mm256_cmp_epu64_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00001000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmp_epu64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, 100); + let b = _mm256_set1_epi64x(1); + let mask = 0b11111111; + let r = _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00001000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmp_epu64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set1_epi64x(1); + let m = _mm_cmp_epu64_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00000010); + } + + 
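+ // Note on reading the masked compare variants used throughout these tests: the
+ // `_mm*_mask_cmp*_mask` forms AND the raw comparison result with the incoming
+ // mask, so in `test_mm512_mask_cmp_epu64_mask` above the unmasked result
+ // 0b11001111 combined with the mask 0b01111010 gives 0b01001010.
+ // `_MM_CMPINT_LT` is one of the eight `_MM_CMPINT_*` predicates accepted by the
+ // generic `cmp` intrinsics; the named forms (`cmplt`, `cmple`, ...) correspond
+ // to fixed predicates.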
#[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmp_epu64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set1_epi64x(1); + let mask = 0b11111111; + let r = _mm_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00000010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let m = _mm512_cmplt_epi64_mask(a, b); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01100110; + let r = _mm512_mask_cmplt_epi64_mask(mask, a, b); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmplt_epi64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, -13); + let b = _mm256_set1_epi64x(-1); + let r = _mm256_cmplt_epi64_mask(a, b); + assert_eq!(r, 0b00000001); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmplt_epi64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, -13); + let b = _mm256_set1_epi64x(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmplt_epi64_mask(mask, a, b); + assert_eq!(r, 0b00000001); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmplt_epi64_mask() { + let a = _mm_set_epi64x(-1, -13); + let b = _mm_set1_epi64x(-1); + let r = _mm_cmplt_epi64_mask(a, b); + assert_eq!(r, 0b00000001); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmplt_epi64_mask() { + let a = _mm_set_epi64x(-1, -13); + let b = _mm_set1_epi64x(-1); + let mask = 0b11111111; + let r = _mm_mask_cmplt_epi64_mask(mask, a, b); + assert_eq!(r, 0b00000001); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpgt_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let m = _mm512_cmpgt_epi64_mask(b, a); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpgt_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01100110; + let r = _mm512_mask_cmpgt_epi64_mask(mask, b, a); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpgt_epi64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, 13); + let b = _mm256_set1_epi64x(-1); + let r = _mm256_cmpgt_epi64_mask(a, b); + assert_eq!(r, 0b00001101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epi64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, 13); + let b = _mm256_set1_epi64x(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmpgt_epi64_mask(mask, a, b); + assert_eq!(r, 0b00001101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpgt_epi64_mask() { + let a = _mm_set_epi64x(0, -1); + let b = _mm_set1_epi64x(-1); + let r = _mm_cmpgt_epi64_mask(a, b); + assert_eq!(r, 0b00000010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpgt_epi64_mask() { + let a = _mm_set_epi64x(0, -1); + let b = _mm_set1_epi64x(-1); + let mask = 0b11111111; + let r = _mm_mask_cmpgt_epi64_mask(mask, a, b); + assert_eq!(r, 0b00000010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, 
i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + assert_eq!( + _mm512_cmple_epi64_mask(a, b), + !_mm512_cmpgt_epi64_mask(a, b) + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + assert_eq!(_mm512_mask_cmple_epi64_mask(mask, a, b), 0b00110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmple_epi64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, i64::MAX); + let b = _mm256_set1_epi64x(-1); + let r = _mm256_cmple_epi64_mask(a, b); + assert_eq!(r, 0b00000010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmple_epi64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, i64::MAX); + let b = _mm256_set1_epi64x(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmple_epi64_mask(mask, a, b); + assert_eq!(r, 0b00000010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmple_epi64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set1_epi64x(1); + let r = _mm_cmple_epi64_mask(a, b); + assert_eq!(r, 0b00000011) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmple_epi64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set1_epi64x(1); + let mask = 0b11111111; + let r = _mm_mask_cmple_epi64_mask(mask, a, b); + assert_eq!(r, 0b00000011) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpge_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + assert_eq!( + _mm512_cmpge_epi64_mask(a, b), + !_mm512_cmplt_epi64_mask(a, b) + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpge_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b11111111; + let r = _mm512_mask_cmpge_epi64_mask(mask, a, b); + assert_eq!(r, 0b11111010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpge_epi64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, i64::MAX); + let b = _mm256_set1_epi64x(-1); + let r = _mm256_cmpge_epi64_mask(a, b); + assert_eq!(r, 0b00001111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epi64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, i64::MAX); + let b = _mm256_set1_epi64x(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmpge_epi64_mask(mask, a, b); + assert_eq!(r, 0b00001111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpge_epi64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set1_epi64x(-1); + let r = _mm_cmpge_epi64_mask(a, b); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpge_epi64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set1_epi64x(-1); + let mask = 0b11111111; + let r = _mm_mask_cmpge_epi64_mask(mask, a, b); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let m = _mm512_cmpeq_epi64_mask(b, a); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let 
b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm512_mask_cmpeq_epi64_mask(mask, b, a); + assert_eq!(r, 0b01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpeq_epi64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, 13); + let b = _mm256_set_epi64x(0, 1, 13, 42); + let m = _mm256_cmpeq_epi64_mask(b, a); + assert_eq!(m, 0b00001100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epi64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, 13); + let b = _mm256_set_epi64x(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm256_mask_cmpeq_epi64_mask(mask, b, a); + assert_eq!(r, 0b00001100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpeq_epi64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set_epi64x(0, 1); + let m = _mm_cmpeq_epi64_mask(b, a); + assert_eq!(m, 0b00000011); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epi64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set_epi64x(0, 1); + let mask = 0b11111111; + let r = _mm_mask_cmpeq_epi64_mask(mask, b, a); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_epi64() { + let r = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m512i(r, _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_epi64() { + let r = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m512i(r, _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let m = _mm512_cmpneq_epi64_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epi64_mask(b, a)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, -100, 100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm512_mask_cmpneq_epi64_mask(mask, b, a); + assert_eq!(r, 0b00110010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpneq_epi64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, 13); + let b = _mm256_set_epi64x(0, 1, 13, 42); + let r = _mm256_cmpneq_epi64_mask(b, a); + assert_eq!(r, 0b00000011) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epi64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, 13); + let b = _mm256_set_epi64x(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm256_mask_cmpneq_epi64_mask(mask, b, a); + assert_eq!(r, 0b00000011) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpneq_epi64_mask() { + let a = _mm_set_epi64x(-1, 13); + let b = _mm_set_epi64x(13, 42); + let r = _mm_cmpneq_epi64_mask(b, a); + assert_eq!(r, 0b00000011) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpneq_epi64_mask() { + let a = _mm_set_epi64x(-1, 13); + let b = _mm_set_epi64x(13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpneq_epi64_mask(mask, b, a); + assert_eq!(r, 0b00000011) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let m = _mm512_cmp_epi64_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00000101); + } + + 
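+ // The `epi64` comparisons in this block are signed, while the `epu64`
+ // comparisons earlier reinterpret the same 64-bit lanes as unsigned. With
+ // `b = _mm512_set1_epi64(-1)` the signed tests only flag lanes that are truly
+ // below -1 (e.g. i64::MIN, -100), whereas the unsigned tests see -1 as
+ // u64::MAX and therefore flag almost every lane, which is why the expected
+ // masks differ so much between the two families.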
#[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01100110; + let r = _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmp_epi64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, 13); + let b = _mm256_set1_epi64x(1); + let m = _mm256_cmp_epi64_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmp_epi64_mask() { + let a = _mm256_set_epi64x(0, 1, -1, 13); + let b = _mm256_set1_epi64x(1); + let mask = 0b11111111; + let r = _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmp_epi64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set1_epi64x(1); + let m = _mm_cmp_epi64_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00000010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmp_epi64_mask() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set1_epi64x(1); + let mask = 0b11111111; + let r = _mm_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00000010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i32gather_pd::<8>(index, arr.as_ptr()); + assert_eq_m512d(r, _mm512_setr_pd(0., 16., 32., 48., 64., 80., 96., 112.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + let src = _mm512_set1_pd(2.); + let mask = 0b10101010; + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is word-addressing + let r = _mm512_mask_i32gather_pd::<8>(src, mask, index, arr.as_ptr()); + assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i64gather_pd::<8>(index, arr.as_ptr()); + assert_eq_m512d(r, _mm512_setr_pd(0., 16., 32., 48., 64., 80., 96., 112.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + let src = _mm512_set1_pd(2.); + let mask = 0b10101010; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is word-addressing + let r = _mm512_mask_i64gather_pd::<8>(src, mask, index, arr.as_ptr()); + assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i64gather_ps::<4>(index, arr.as_ptr()); + assert_eq_m256(r, _mm256_setr_ps(0., 16., 32., 48., 64., 80., 96., 112.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64gather_ps() { + let arr: [f32; 128] = 
core::array::from_fn(|i| i as f32); + let src = _mm256_set1_ps(2.); + let mask = 0b10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 4 is word-addressing + let r = _mm512_mask_i64gather_ps::<4>(src, mask, index, arr.as_ptr()); + assert_eq_m256(r, _mm256_setr_ps(2., 16., 2., 48., 2., 80., 2., 112.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32gather_epi64() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + // A multiplier of 8 is word-addressing + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i32gather_epi64::<8>(index, arr.as_ptr()); + assert_eq_m512i(r, _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_epi64() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + let src = _mm512_set1_epi64(2); + let mask = 0b10101010; + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is word-addressing + let r = _mm512_mask_i32gather_epi64::<8>(src, mask, index, arr.as_ptr()); + assert_eq_m512i(r, _mm512_setr_epi64(2, 16, 2, 48, 2, 80, 2, 112)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64gather_epi64() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + // A multiplier of 8 is word-addressing + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i64gather_epi64::<8>(index, arr.as_ptr()); + assert_eq_m512i(r, _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64gather_epi64() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + let src = _mm512_set1_epi64(2); + let mask = 0b10101010; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is word-addressing + let r = _mm512_mask_i64gather_epi64::<8>(src, mask, index, arr.as_ptr()); + assert_eq_m512i(r, _mm512_setr_epi64(2, 16, 2, 48, 2, 80, 2, 112)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64gather_epi32() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + // A multiplier of 8 is word-addressing + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i64gather_epi32::<8>(index, arr.as_ptr() as *const i32); + assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64gather_epi32() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + let src = _mm256_set1_epi32(2); + let mask = 0b10101010; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is word-addressing + let r = _mm512_mask_i64gather_epi32::<8>(src, mask, index, arr.as_ptr() as *const i32); + assert_eq_m256i(r, _mm256_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32scatter_pd() { + let mut arr = [0f64; 128]; + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 8 is word-addressing + _mm512_i32scatter_pd::<8>(arr.as_mut_ptr(), index, src); + let mut expected = [0f64; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as f64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] 
+ unsafe fn test_mm512_mask_i32scatter_pd() { + let mut arr = [0f64; 128]; + let mask = 0b10101010; + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 8 is word-addressing + _mm512_mask_i32scatter_pd::<8>(arr.as_mut_ptr(), mask, index, src); + let mut expected = [0f64; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2. * (i + 1) as f64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64scatter_pd() { + let mut arr = [0f64; 128]; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 8 is word-addressing + _mm512_i64scatter_pd::<8>(arr.as_mut_ptr(), index, src); + let mut expected = [0f64; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as f64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64scatter_pd() { + let mut arr = [0f64; 128]; + let mask = 0b10101010; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 8 is word-addressing + _mm512_mask_i64scatter_pd::<8>(arr.as_mut_ptr(), mask, index, src); + let mut expected = [0f64; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2. * (i + 1) as f64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64scatter_ps() { + let mut arr = [0f32; 128]; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 4 is word-addressing + _mm512_i64scatter_ps::<4>(arr.as_mut_ptr(), index, src); + let mut expected = [0f32; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64scatter_ps() { + let mut arr = [0f32; 128]; + let mask = 0b10101010; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 4 is word-addressing + _mm512_mask_i64scatter_ps::<4>(arr.as_mut_ptr(), mask, index, src); + let mut expected = [0f32; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2. 
* (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32scatter_epi64() { + let mut arr = [0i64; 128]; + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 8 is word-addressing + _mm512_i32scatter_epi64::<8>(arr.as_mut_ptr(), index, src); + let mut expected = [0i64; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as i64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32scatter_epi64() { + let mut arr = [0i64; 128]; + let mask = 0b10101010; + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 8 is word-addressing + _mm512_mask_i32scatter_epi64::<8>(arr.as_mut_ptr(), mask, index, src); + let mut expected = [0i64; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2 * (i + 1) as i64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64scatter_epi64() { + let mut arr = [0i64; 128]; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 8 is word-addressing + _mm512_i64scatter_epi64::<8>(arr.as_mut_ptr(), index, src); + let mut expected = [0i64; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as i64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64scatter_epi64() { + let mut arr = [0i64; 128]; + let mask = 0b10101010; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 8 is word-addressing + _mm512_mask_i64scatter_epi64::<8>(arr.as_mut_ptr(), mask, index, src); + let mut expected = [0i64; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2 * (i + 1) as i64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64scatter_epi32() { + let mut arr = [0i32; 128]; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 4 is word-addressing + _mm512_i64scatter_epi32::<4>(arr.as_mut_ptr(), index, src); + let mut expected = [0i32; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64scatter_epi32() { + let mut arr = [0i32; 128]; + let mask = 0b10101010; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 4 is word-addressing + _mm512_mask_i64scatter_epi32::<4>(arr.as_mut_ptr(), mask, index, src); + let mut expected = [0i32; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2 * (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32logather_epi64() { + let base_addr: [i64; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let r = _mm512_i32logather_epi64::<8>(vindex, base_addr.as_ptr()); + let expected = _mm512_setr_epi64(2, 3, 4, 5, 6, 7, 8, 1); + assert_eq_m512i(expected, r); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32logather_epi64() { + let base_addr: [i64; 8] = [1, 2, 
3, 4, 5, 6, 7, 8]; + let src = _mm512_setr_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let r = _mm512_mask_i32logather_epi64::<8>(src, 0b01010101, vindex, base_addr.as_ptr()); + let expected = _mm512_setr_epi64(2, 10, 4, 12, 6, 14, 8, 16); + assert_eq_m512i(expected, r); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32logather_pd() { + let base_addr: [f64; 8] = [1., 2., 3., 4., 5., 6., 7., 8.]; + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let r = _mm512_i32logather_pd::<8>(vindex, base_addr.as_ptr()); + let expected = _mm512_setr_pd(2., 3., 4., 5., 6., 7., 8., 1.); + assert_eq_m512d(expected, r); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32logather_pd() { + let base_addr: [f64; 8] = [1., 2., 3., 4., 5., 6., 7., 8.]; + let src = _mm512_setr_pd(9., 10., 11., 12., 13., 14., 15., 16.); + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let r = _mm512_mask_i32logather_pd::<8>(src, 0b01010101, vindex, base_addr.as_ptr()); + let expected = _mm512_setr_pd(2., 10., 4., 12., 6., 14., 8., 16.); + assert_eq_m512d(expected, r); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32loscatter_epi64() { + let mut base_addr: [i64; 8] = [0; 8]; + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let src = _mm512_setr_epi64(2, 3, 4, 5, 6, 7, 8, 1); + _mm512_i32loscatter_epi64::<8>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1, 2, 3, 4, 5, 6, 7, 8]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32loscatter_epi64() { + let mut base_addr: [i64; 8] = [0; 8]; + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let src = _mm512_setr_epi64(2, 3, 4, 5, 6, 7, 8, 1); + _mm512_mask_i32loscatter_epi64::<8>(base_addr.as_mut_ptr(), 0b01010101, vindex, src); + let expected = [0, 2, 0, 4, 0, 6, 0, 8]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32loscatter_pd() { + let mut base_addr: [f64; 8] = [0.; 8]; + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let src = _mm512_setr_pd(2., 3., 4., 5., 6., 7., 8., 1.); + _mm512_i32loscatter_pd::<8>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1., 2., 3., 4., 5., 6., 7., 8.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32loscatter_pd() { + let mut base_addr: [f64; 8] = [0.; 8]; + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let src = _mm512_setr_pd(2., 3., 4., 5., 6., 7., 8., 1.); + _mm512_mask_i32loscatter_pd::<8>(base_addr.as_mut_ptr(), 0b01010101, vindex, src); + let expected = [0., 2., 0., 4., 0., 6., 0., 8.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i32gather_epi32() { + let base_addr: [i32; 4] = [1, 2, 3, 4]; + let src = _mm_setr_epi32(5, 6, 7, 8); + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let r = _mm_mmask_i32gather_epi32::<4>(src, 0b0101, vindex, base_addr.as_ptr()); + let expected = _mm_setr_epi32(2, 6, 4, 8); + assert_eq_m128i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i32gather_epi64() { + let base_addr: [i64; 2] = [1, 2]; + let src = _mm_setr_epi64x(5, 6); + let 
vindex = _mm_setr_epi32(1, 0, -1, -1); + let r = _mm_mmask_i32gather_epi64::<8>(src, 0b01, vindex, base_addr.as_ptr()); + let expected = _mm_setr_epi64x(2, 6); + assert_eq_m128i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i32gather_pd() { + let base_addr: [f64; 2] = [1., 2.]; + let src = _mm_setr_pd(5., 6.); + let vindex = _mm_setr_epi32(1, 0, -1, -1); + let r = _mm_mmask_i32gather_pd::<8>(src, 0b01, vindex, base_addr.as_ptr()); + let expected = _mm_setr_pd(2., 6.); + assert_eq_m128d(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i32gather_ps() { + let base_addr: [f32; 4] = [1., 2., 3., 4.]; + let src = _mm_setr_ps(5., 6., 7., 8.); + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let r = _mm_mmask_i32gather_ps::<4>(src, 0b0101, vindex, base_addr.as_ptr()); + let expected = _mm_setr_ps(2., 6., 4., 8.); + assert_eq_m128(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i64gather_epi32() { + let base_addr: [i32; 2] = [1, 2]; + let src = _mm_setr_epi32(5, 6, 7, 8); + let vindex = _mm_setr_epi64x(1, 0); + let r = _mm_mmask_i64gather_epi32::<4>(src, 0b01, vindex, base_addr.as_ptr()); + let expected = _mm_setr_epi32(2, 6, 0, 0); + assert_eq_m128i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i64gather_epi64() { + let base_addr: [i64; 2] = [1, 2]; + let src = _mm_setr_epi64x(5, 6); + let vindex = _mm_setr_epi64x(1, 0); + let r = _mm_mmask_i64gather_epi64::<8>(src, 0b01, vindex, base_addr.as_ptr()); + let expected = _mm_setr_epi64x(2, 6); + assert_eq_m128i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i64gather_pd() { + let base_addr: [f64; 2] = [1., 2.]; + let src = _mm_setr_pd(5., 6.); + let vindex = _mm_setr_epi64x(1, 0); + let r = _mm_mmask_i64gather_pd::<8>(src, 0b01, vindex, base_addr.as_ptr()); + let expected = _mm_setr_pd(2., 6.); + assert_eq_m128d(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i64gather_ps() { + let base_addr: [f32; 2] = [1., 2.]; + let src = _mm_setr_ps(5., 6., 7., 8.); + let vindex = _mm_setr_epi64x(1, 0); + let r = _mm_mmask_i64gather_ps::<4>(src, 0b01, vindex, base_addr.as_ptr()); + let expected = _mm_setr_ps(2., 6., 0., 0.); + assert_eq_m128(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i32gather_epi32() { + let base_addr: [i32; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; + let src = _mm256_setr_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); + let r = _mm256_mmask_i32gather_epi32::<4>(src, 0b01010101, vindex, base_addr.as_ptr()); + let expected = _mm256_setr_epi32(2, 10, 4, 12, 6, 14, 8, 16); + assert_eq_m256i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i32gather_epi64() { + let base_addr: [i64; 4] = [1, 2, 3, 4]; + let src = _mm256_setr_epi64x(9, 10, 11, 12); + let vindex = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm256_mmask_i32gather_epi64::<8>(src, 0b0101, vindex, base_addr.as_ptr()); + let expected = _mm256_setr_epi64x(2, 10, 4, 12); + assert_eq_m256i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i32gather_pd() { + let base_addr: [f64; 4] = [1., 2., 3., 4.]; + let src = _mm256_setr_pd(9., 10., 11., 12.); + let vindex = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm256_mmask_i32gather_pd::<8>(src, 0b0101, vindex, base_addr.as_ptr()); + let 
expected = _mm256_setr_pd(2., 10., 4., 12.); + assert_eq_m256d(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i32gather_ps() { + let base_addr: [f32; 8] = [1., 2., 3., 4., 5., 6., 7., 8.]; + let src = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); + let r = _mm256_mmask_i32gather_ps::<4>(src, 0b01010101, vindex, base_addr.as_ptr()); + let expected = _mm256_setr_ps(2., 10., 4., 12., 6., 14., 8., 16.); + assert_eq_m256(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i64gather_epi32() { + let base_addr: [i32; 4] = [1, 2, 3, 4]; + let src = _mm_setr_epi32(9, 10, 11, 12); + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let r = _mm256_mmask_i64gather_epi32::<4>(src, 0b0101, vindex, base_addr.as_ptr()); + let expected = _mm_setr_epi32(2, 10, 4, 12); + assert_eq_m128i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i64gather_epi64() { + let base_addr: [i64; 4] = [1, 2, 3, 4]; + let src = _mm256_setr_epi64x(9, 10, 11, 12); + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let r = _mm256_mmask_i64gather_epi64::<8>(src, 0b0101, vindex, base_addr.as_ptr()); + let expected = _mm256_setr_epi64x(2, 10, 4, 12); + assert_eq_m256i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i64gather_pd() { + let base_addr: [f64; 4] = [1., 2., 3., 4.]; + let src = _mm256_setr_pd(9., 10., 11., 12.); + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let r = _mm256_mmask_i64gather_pd::<8>(src, 0b0101, vindex, base_addr.as_ptr()); + let expected = _mm256_setr_pd(2., 10., 4., 12.); + assert_eq_m256d(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i64gather_ps() { + let base_addr: [f32; 4] = [1., 2., 3., 4.]; + let src = _mm_setr_ps(9., 10., 11., 12.); + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let r = _mm256_mmask_i64gather_ps::<4>(src, 0b0101, vindex, base_addr.as_ptr()); + let expected = _mm_setr_ps(2., 10., 4., 12.); + assert_eq_m128(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i32scatter_epi32() { + let mut base_addr: [i32; 4] = [0; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm_setr_epi32(2, 3, 4, 1); + _mm_i32scatter_epi32::<4>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1, 2, 3, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i32scatter_epi32() { + let mut base_addr: [i32; 4] = [0; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm_setr_epi32(2, 3, 4, 1); + _mm_mask_i32scatter_epi32::<4>(base_addr.as_mut_ptr(), 0b0101, vindex, src); + let expected = [0, 2, 0, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i32scatter_epi64() { + let mut base_addr: [i64; 2] = [0; 2]; + let vindex = _mm_setr_epi32(1, 0, -1, -1); + let src = _mm_setr_epi64x(2, 1); + _mm_i32scatter_epi64::<8>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1, 2]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i32scatter_epi64() { + let mut base_addr: [i64; 2] = [0; 2]; + let vindex = _mm_setr_epi32(1, 0, -1, -1); + let src = _mm_setr_epi64x(2, 1); + _mm_mask_i32scatter_epi64::<8>(base_addr.as_mut_ptr(), 0b01, vindex, src); + let expected = [0, 2]; + assert_eq!(expected, base_addr); + } + + 
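+ // The const generic on these gather/scatter intrinsics is the scale in bytes
+ // applied to each index: lane i accesses `base_addr + index[i] * SCALE`, so
+ // SCALE = 8 for 64-bit elements (and 4 for 32-bit elements) turns the indices
+ // into element offsets, which is what the "word-addressing" comments above
+ // refer to. The masked forms only touch lanes whose mask bit is set and leave
+ // the remaining destination memory unchanged, hence the zeroed slots in the
+ // expected arrays.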
#[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i32scatter_pd() { + let mut base_addr: [f64; 2] = [0.; 2]; + let vindex = _mm_setr_epi32(1, 0, -1, -1); + let src = _mm_setr_pd(2., 1.); + _mm_i32scatter_pd::<8>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1., 2.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i32scatter_pd() { + let mut base_addr: [f64; 2] = [0.; 2]; + let vindex = _mm_setr_epi32(1, 0, -1, -1); + let src = _mm_setr_pd(2., 1.); + _mm_mask_i32scatter_pd::<8>(base_addr.as_mut_ptr(), 0b01, vindex, src); + let expected = [0., 2.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i32scatter_ps() { + let mut base_addr: [f32; 4] = [0.; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm_setr_ps(2., 3., 4., 1.); + _mm_i32scatter_ps::<4>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1., 2., 3., 4.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i32scatter_ps() { + let mut base_addr: [f32; 4] = [0.; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm_setr_ps(2., 3., 4., 1.); + _mm_mask_i32scatter_ps::<4>(base_addr.as_mut_ptr(), 0b0101, vindex, src); + let expected = [0., 2., 0., 4.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i64scatter_epi32() { + let mut base_addr: [i32; 2] = [0; 2]; + let vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_epi32(2, 1, -1, -1); + _mm_i64scatter_epi32::<4>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1, 2]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i64scatter_epi32() { + let mut base_addr: [i32; 2] = [0; 2]; + let vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_epi32(2, 1, -1, -1); + _mm_mask_i64scatter_epi32::<4>(base_addr.as_mut_ptr(), 0b01, vindex, src); + let expected = [0, 2]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i64scatter_epi64() { + let mut base_addr: [i64; 2] = [0; 2]; + let vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_epi64x(2, 1); + _mm_i64scatter_epi64::<8>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1, 2]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i64scatter_epi64() { + let mut base_addr: [i64; 2] = [0; 2]; + let vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_epi64x(2, 1); + _mm_mask_i64scatter_epi64::<8>(base_addr.as_mut_ptr(), 0b01, vindex, src); + let expected = [0, 2]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i64scatter_pd() { + let mut base_addr: [f64; 2] = [0.; 2]; + let vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_pd(2., 1.); + _mm_i64scatter_pd::<8>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1., 2.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i64scatter_pd() { + let mut base_addr: [f64; 2] = [0.; 2]; + let vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_pd(2., 1.); + _mm_mask_i64scatter_pd::<8>(base_addr.as_mut_ptr(), 0b01, vindex, src); + let expected = [0., 2.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i64scatter_ps() { + let mut base_addr: [f32; 2] = [0.; 2]; + let 
vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_ps(2., 1., -1., -1.); + _mm_i64scatter_ps::<4>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1., 2.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i64scatter_ps() { + let mut base_addr: [f32; 2] = [0.; 2]; + let vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_ps(2., 1., -1., -1.); + _mm_mask_i64scatter_ps::<4>(base_addr.as_mut_ptr(), 0b01, vindex, src); + let expected = [0., 2.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_i32scatter_epi32() { + let mut base_addr: [i32; 8] = [0; 8]; + let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); + let src = _mm256_setr_epi32(2, 3, 4, 5, 6, 7, 8, 1); + _mm256_i32scatter_epi32::<4>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1, 2, 3, 4, 5, 6, 7, 8]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i32scatter_epi32() { + let mut base_addr: [i32; 8] = [0; 8]; + let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); + let src = _mm256_setr_epi32(2, 3, 4, 5, 6, 7, 8, 1); + _mm256_mask_i32scatter_epi32::<4>(base_addr.as_mut_ptr(), 0b01010101, vindex, src); + let expected = [0, 2, 0, 4, 0, 6, 0, 8]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_i32scatter_epi64() { + let mut base_addr: [i64; 4] = [0; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm256_setr_epi64x(2, 3, 4, 1); + _mm256_i32scatter_epi64::<8>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1, 2, 3, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i32scatter_epi64() { + let mut base_addr: [i64; 4] = [0; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm256_setr_epi64x(2, 3, 4, 1); + _mm256_mask_i32scatter_epi64::<8>(base_addr.as_mut_ptr(), 0b0101, vindex, src); + let expected = [0, 2, 0, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_i32scatter_pd() { + let mut base_addr: [f64; 4] = [0.; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm256_setr_pd(2., 3., 4., 1.); + _mm256_i32scatter_pd::<8>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1., 2., 3., 4.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i32scatter_pd() { + let mut base_addr: [f64; 4] = [0.; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm256_setr_pd(2., 3., 4., 1.); + _mm256_mask_i32scatter_pd::<8>(base_addr.as_mut_ptr(), 0b0101, vindex, src); + let expected = [0., 2., 0., 4.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_i32scatter_ps() { + let mut base_addr: [f32; 8] = [0.; 8]; + let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); + let src = _mm256_setr_ps(2., 3., 4., 5., 6., 7., 8., 1.); + _mm256_i32scatter_ps::<4>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1., 2., 3., 4., 5., 6., 7., 8.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i32scatter_ps() { + let mut base_addr: [f32; 8] = [0.; 8]; + let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); + let src = _mm256_setr_ps(2., 3., 4., 5., 6., 7., 8., 1.); + _mm256_mask_i32scatter_ps::<4>(base_addr.as_mut_ptr(), 0b01010101, vindex, 
src); + let expected = [0., 2., 0., 4., 0., 6., 0., 8.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_i64scatter_epi32() { + let mut base_addr: [i32; 4] = [0; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm_setr_epi32(2, 3, 4, 1); + _mm256_i64scatter_epi32::<4>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1, 2, 3, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i64scatter_epi32() { + let mut base_addr: [i32; 4] = [0; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm_setr_epi32(2, 3, 4, 1); + _mm256_mask_i64scatter_epi32::<4>(base_addr.as_mut_ptr(), 0b0101, vindex, src); + let expected = [0, 2, 0, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_i64scatter_epi64() { + let mut base_addr: [i64; 4] = [0; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm256_setr_epi64x(2, 3, 4, 1); + _mm256_i64scatter_epi64::<8>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1, 2, 3, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i64scatter_epi64() { + let mut base_addr: [i64; 4] = [0; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm256_setr_epi64x(2, 3, 4, 1); + _mm256_mask_i64scatter_epi64::<8>(base_addr.as_mut_ptr(), 0b0101, vindex, src); + let expected = [0, 2, 0, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_i64scatter_pd() { + let mut base_addr: [f64; 4] = [0.; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm256_setr_pd(2., 3., 4., 1.); + _mm256_i64scatter_pd::<8>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1., 2., 3., 4.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i64scatter_pd() { + let mut base_addr: [f64; 4] = [0.; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm256_setr_pd(2., 3., 4., 1.); + _mm256_mask_i64scatter_pd::<8>(base_addr.as_mut_ptr(), 0b0101, vindex, src); + let expected = [0., 2., 0., 4.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_i64scatter_ps() { + let mut base_addr: [f32; 4] = [0.; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm_setr_ps(2., 3., 4., 1.); + _mm256_i64scatter_ps::<4>(base_addr.as_mut_ptr(), vindex, src); + let expected = [1., 2., 3., 4.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i64scatter_ps() { + let mut base_addr: [f32; 4] = [0.; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm_setr_ps(2., 3., 4., 1.); + _mm256_mask_i64scatter_ps::<4>(base_addr.as_mut_ptr(), 0b0101, vindex, src); + let expected = [0., 2., 0., 4.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_rol_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 63, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let r = _mm512_rol_epi64::<1>(a); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 << 0, 1 << 33, 1 << 33, 1 << 33, + 1 << 33, 1 << 33, 1 << 33, 1 << 33, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_rol_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 
<< 63, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let r = _mm512_mask_rol_epi64::<1>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_rol_epi64::<1>(a, 0b11111111, a); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 << 0, 1 << 33, 1 << 33, 1 << 33, + 1 << 33, 1 << 33, 1 << 33, 1 << 33, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_rol_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 63, + ); + let r = _mm512_maskz_rol_epi64::<1>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_rol_epi64::<1>(0b00001111, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 33, 1 << 33, 1 << 33, 1 << 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rol_epi64() { + let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32); + let r = _mm256_rol_epi64::<1>(a); + let e = _mm256_set_epi64x(1 << 0, 1 << 33, 1 << 33, 1 << 33); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_rol_epi64() { + let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32); + let r = _mm256_mask_rol_epi64::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_rol_epi64::<1>(a, 0b00001111, a); + let e = _mm256_set_epi64x(1 << 0, 1 << 33, 1 << 33, 1 << 33); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_rol_epi64() { + let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32); + let r = _mm256_maskz_rol_epi64::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_rol_epi64::<1>(0b00001111, a); + let e = _mm256_set_epi64x(1 << 0, 1 << 33, 1 << 33, 1 << 33); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rol_epi64() { + let a = _mm_set_epi64x(1 << 63, 1 << 32); + let r = _mm_rol_epi64::<1>(a); + let e = _mm_set_epi64x(1 << 0, 1 << 33); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_rol_epi64() { + let a = _mm_set_epi64x(1 << 63, 1 << 32); + let r = _mm_mask_rol_epi64::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_rol_epi64::<1>(a, 0b00000011, a); + let e = _mm_set_epi64x(1 << 0, 1 << 33); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_rol_epi64() { + let a = _mm_set_epi64x(1 << 63, 1 << 32); + let r = _mm_maskz_rol_epi64::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_rol_epi64::<1>(0b00000011, a); + let e = _mm_set_epi64x(1 << 0, 1 << 33); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_ror_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 0, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let r = _mm512_ror_epi64::<1>(a); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 << 63, 1 << 31, 1 << 31, 1 << 31, + 1 << 31, 1 << 31, 1 << 31, 1 << 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_ror_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 0, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let r = _mm512_mask_ror_epi64::<1>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_ror_epi64::<1>(a, 0b11111111, a); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 << 63, 1 << 31, 1 << 31, 
1 << 31, + 1 << 31, 1 << 31, 1 << 31, 1 << 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_ror_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 0, + ); + let r = _mm512_maskz_ror_epi64::<1>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_ror_epi64::<1>(0b00001111, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 31, 1 << 31, 1 << 31, 1 << 63); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_ror_epi64() { + let a = _mm256_set_epi64x(1 << 0, 1 << 32, 1 << 32, 1 << 32); + let r = _mm256_ror_epi64::<1>(a); + let e = _mm256_set_epi64x(1 << 63, 1 << 31, 1 << 31, 1 << 31); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_ror_epi64() { + let a = _mm256_set_epi64x(1 << 0, 1 << 32, 1 << 32, 1 << 32); + let r = _mm256_mask_ror_epi64::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_ror_epi64::<1>(a, 0b00001111, a); + let e = _mm256_set_epi64x(1 << 63, 1 << 31, 1 << 31, 1 << 31); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_ror_epi64() { + let a = _mm256_set_epi64x(1 << 0, 1 << 32, 1 << 32, 1 << 32); + let r = _mm256_maskz_ror_epi64::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_ror_epi64::<1>(0b00001111, a); + let e = _mm256_set_epi64x(1 << 63, 1 << 31, 1 << 31, 1 << 31); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_ror_epi64() { + let a = _mm_set_epi64x(1 << 0, 1 << 32); + let r = _mm_ror_epi64::<1>(a); + let e = _mm_set_epi64x(1 << 63, 1 << 31); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_ror_epi64() { + let a = _mm_set_epi64x(1 << 0, 1 << 32); + let r = _mm_mask_ror_epi64::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_ror_epi64::<1>(a, 0b00000011, a); + let e = _mm_set_epi64x(1 << 63, 1 << 31); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_ror_epi64() { + let a = _mm_set_epi64x(1 << 0, 1 << 32); + let r = _mm_maskz_ror_epi64::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_ror_epi64::<1>(0b00000011, a); + let e = _mm_set_epi64x(1 << 63, 1 << 31); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_slli_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 63, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let r = _mm512_slli_epi64::<1>(a); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 0, 1 << 33, 1 << 33, 1 << 33, + 1 << 33, 1 << 33, 1 << 33, 1 << 33, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_slli_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 63, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let r = _mm512_mask_slli_epi64::<1>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_slli_epi64::<1>(a, 0b11111111, a); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 0, 1 << 33, 1 << 33, 1 << 33, + 1 << 33, 1 << 33, 1 << 33, 1 << 33, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_slli_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 63, 
+ ); + let r = _mm512_maskz_slli_epi64::<1>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_slli_epi64::<1>(0b00001111, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 33, 1 << 33, 1 << 33, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_slli_epi64() { + let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32); + let r = _mm256_mask_slli_epi64::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_slli_epi64::<1>(a, 0b00001111, a); + let e = _mm256_set_epi64x(0, 1 << 33, 1 << 33, 1 << 33); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_slli_epi64() { + let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32); + let r = _mm256_maskz_slli_epi64::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_slli_epi64::<1>(0b00001111, a); + let e = _mm256_set_epi64x(0, 1 << 33, 1 << 33, 1 << 33); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_slli_epi64() { + let a = _mm_set_epi64x(1 << 63, 1 << 32); + let r = _mm_mask_slli_epi64::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_slli_epi64::<1>(a, 0b00000011, a); + let e = _mm_set_epi64x(0, 1 << 33); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_slli_epi64() { + let a = _mm_set_epi64x(1 << 63, 1 << 32); + let r = _mm_maskz_slli_epi64::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_slli_epi64::<1>(0b00000011, a); + let e = _mm_set_epi64x(0, 1 << 33); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srli_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 0, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let r = _mm512_srli_epi64::<1>(a); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 0, 1 << 31, 1 << 31, 1 << 31, + 1 << 31, 1 << 31, 1 << 31, 1 << 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srli_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 0, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let r = _mm512_mask_srli_epi64::<1>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_srli_epi64::<1>(a, 0b11111111, a); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 0, 1 << 31, 1 << 31, 1 << 31, + 1 << 31, 1 << 31, 1 << 31, 1 << 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srli_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 0, + ); + let r = _mm512_maskz_srli_epi64::<1>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srli_epi64::<1>(0b00001111, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 31, 1 << 31, 1 << 31, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_srli_epi64() { + let a = _mm256_set_epi64x(1 << 5, 0, 0, 0); + let r = _mm256_mask_srli_epi64::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_srli_epi64::<1>(a, 0b00001111, a); + let e = _mm256_set_epi64x(1 << 4, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_srli_epi64() { + let a = _mm256_set_epi64x(1 << 5, 0, 0, 0); + let r = _mm256_maskz_srli_epi64::<1>(0, a); + assert_eq_m256i(r, 
_mm256_setzero_si256()); + let r = _mm256_maskz_srli_epi64::<1>(0b00001111, a); + let e = _mm256_set_epi64x(1 << 4, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_srli_epi64() { + let a = _mm_set_epi64x(1 << 5, 0); + let r = _mm_mask_srli_epi64::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_srli_epi64::<1>(a, 0b00000011, a); + let e = _mm_set_epi64x(1 << 4, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_srli_epi64() { + let a = _mm_set_epi64x(1 << 5, 0); + let r = _mm_maskz_srli_epi64::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srli_epi64::<1>(0b00000011, a); + let e = _mm_set_epi64x(1 << 4, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_rolv_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 63, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let b = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_rolv_epi64(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 << 32, 1 << 0, 1 << 34, 1 << 35, + 1 << 36, 1 << 37, 1 << 38, 1 << 39, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_rolv_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 63, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let b = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_mask_rolv_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_rolv_epi64(a, 0b11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 << 32, 1 << 0, 1 << 34, 1 << 35, + 1 << 36, 1 << 37, 1 << 38, 1 << 39, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_rolv_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 62, + ); + let b = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 2); + let r = _mm512_maskz_rolv_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_rolv_epi64(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 36, 1 << 37, 1 << 38, 1 << 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rolv_epi64() { + let a = _mm256_set_epi64x(1 << 32, 1 << 63, 1 << 32, 1 << 32); + let b = _mm256_set_epi64x(0, 1, 2, 3); + let r = _mm256_rolv_epi64(a, b); + let e = _mm256_set_epi64x(1 << 32, 1 << 0, 1 << 34, 1 << 35); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_rolv_epi64() { + let a = _mm256_set_epi64x(1 << 32, 1 << 63, 1 << 32, 1 << 32); + let b = _mm256_set_epi64x(0, 1, 2, 3); + let r = _mm256_mask_rolv_epi64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_rolv_epi64(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(1 << 32, 1 << 0, 1 << 34, 1 << 35); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_rolv_epi64() { + let a = _mm256_set_epi64x(1 << 32, 1 << 63, 1 << 32, 1 << 32); + let b = _mm256_set_epi64x(0, 1, 2, 3); + let r = _mm256_maskz_rolv_epi64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_rolv_epi64(0b00001111, a, b); + let e = _mm256_set_epi64x(1 << 32, 1 << 0, 1 << 34, 1 << 35); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rolv_epi64() { + let a = 
_mm_set_epi64x(1 << 32, 1 << 63); + let b = _mm_set_epi64x(0, 1); + let r = _mm_rolv_epi64(a, b); + let e = _mm_set_epi64x(1 << 32, 1 << 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_rolv_epi64() { + let a = _mm_set_epi64x(1 << 32, 1 << 63); + let b = _mm_set_epi64x(0, 1); + let r = _mm_mask_rolv_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_rolv_epi64(a, 0b00000011, a, b); + let e = _mm_set_epi64x(1 << 32, 1 << 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_rolv_epi64() { + let a = _mm_set_epi64x(1 << 32, 1 << 63); + let b = _mm_set_epi64x(0, 1); + let r = _mm_maskz_rolv_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_rolv_epi64(0b00000011, a, b); + let e = _mm_set_epi64x(1 << 32, 1 << 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_rorv_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 0, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let b = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_rorv_epi64(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 << 32, 1 << 63, 1 << 30, 1 << 29, + 1 << 28, 1 << 27, 1 << 26, 1 << 25, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_rorv_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 0, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let b = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_mask_rorv_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_rorv_epi64(a, 0b11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 << 32, 1 << 63, 1 << 30, 1 << 29, + 1 << 28, 1 << 27, 1 << 26, 1 << 25, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_rorv_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 0, + ); + let b = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 2); + let r = _mm512_maskz_rorv_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_rorv_epi64(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 28, 1 << 27, 1 << 26, 1 << 62); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rorv_epi64() { + let a = _mm256_set_epi64x(1 << 32, 1 << 0, 1 << 32, 1 << 32); + let b = _mm256_set_epi64x(0, 1, 2, 3); + let r = _mm256_rorv_epi64(a, b); + let e = _mm256_set_epi64x(1 << 32, 1 << 63, 1 << 30, 1 << 29); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_rorv_epi64() { + let a = _mm256_set_epi64x(1 << 32, 1 << 0, 1 << 32, 1 << 32); + let b = _mm256_set_epi64x(0, 1, 2, 3); + let r = _mm256_mask_rorv_epi64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_rorv_epi64(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(1 << 32, 1 << 63, 1 << 30, 1 << 29); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_rorv_epi64() { + let a = _mm256_set_epi64x(1 << 32, 1 << 0, 1 << 32, 1 << 32); + let b = _mm256_set_epi64x(0, 1, 2, 3); + let r = _mm256_maskz_rorv_epi64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_rorv_epi64(0b00001111, a, b); + let e = _mm256_set_epi64x(1 << 32, 1 << 63, 1 << 30, 1 << 29); + assert_eq_m256i(r, e); + } + 
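+ // Note on the rotate tests in this block: `rol`/`ror` and the per-element `rolv`/`rorv`
+ // variants wrap bits around within each 64-bit lane, so `1 << 63` rotated left by one
+ // becomes `1 << 0`. The `slli`/`srli` and `sll`/`srl` shift tests elsewhere in this module
+ // discard the shifted-out bits instead, which is why the same input yields 0 there.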
+ #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rorv_epi64() { + let a = _mm_set_epi64x(1 << 32, 1 << 0); + let b = _mm_set_epi64x(0, 1); + let r = _mm_rorv_epi64(a, b); + let e = _mm_set_epi64x(1 << 32, 1 << 63); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_rorv_epi64() { + let a = _mm_set_epi64x(1 << 32, 1 << 0); + let b = _mm_set_epi64x(0, 1); + let r = _mm_mask_rorv_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_rorv_epi64(a, 0b00000011, a, b); + let e = _mm_set_epi64x(1 << 32, 1 << 63); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_rorv_epi64() { + let a = _mm_set_epi64x(1 << 32, 1 << 0); + let b = _mm_set_epi64x(0, 1); + let r = _mm_maskz_rorv_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_rorv_epi64(0b00000011, a, b); + let e = _mm_set_epi64x(1 << 32, 1 << 63); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sllv_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 63, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let count = _mm512_set_epi64(0, 2, 2, 3, 4, 5, 6, 7); + let r = _mm512_sllv_epi64(a, count); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 << 32, 0, 1 << 34, 1 << 35, + 1 << 36, 1 << 37, 1 << 38, 1 << 39, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sllv_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 32, 1 << 63, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let count = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_mask_sllv_epi64(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_sllv_epi64(a, 0b11111111, a, count); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 << 32, 1 << 33, 0, 1 << 35, + 1 << 36, 1 << 37, 1 << 38, 1 << 39, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sllv_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 63, + ); + let count = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 1); + let r = _mm512_maskz_sllv_epi64(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sllv_epi64(0b00001111, a, count); + let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 36, 1 << 37, 1 << 38, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sllv_epi64() { + let a = _mm256_set_epi64x(1 << 32, 1 << 32, 1 << 63, 1 << 32); + let count = _mm256_set_epi64x(0, 1, 2, 3); + let r = _mm256_mask_sllv_epi64(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_sllv_epi64(a, 0b00001111, a, count); + let e = _mm256_set_epi64x(1 << 32, 1 << 33, 0, 1 << 35); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sllv_epi64() { + let a = _mm256_set_epi64x(1 << 32, 1 << 32, 1 << 63, 1 << 32); + let count = _mm256_set_epi64x(0, 1, 2, 3); + let r = _mm256_maskz_sllv_epi64(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sllv_epi64(0b00001111, a, count); + let e = _mm256_set_epi64x(1 << 32, 1 << 33, 0, 1 << 35); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sllv_epi64() { + let a = _mm_set_epi64x(1 << 63, 1 << 32); + let count = _mm_set_epi64x(2, 3); + let r = _mm_mask_sllv_epi64(a, 
0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_sllv_epi64(a, 0b00000011, a, count); + let e = _mm_set_epi64x(0, 1 << 35); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sllv_epi64() { + let a = _mm_set_epi64x(1 << 63, 1 << 32); + let count = _mm_set_epi64x(2, 3); + let r = _mm_maskz_sllv_epi64(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sllv_epi64(0b00000011, a, count); + let e = _mm_set_epi64x(0, 1 << 35); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srlv_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 0, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let count = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_srlv_epi64(a, count); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 << 32, 0, 1 << 30, 1 << 29, + 1 << 28, 1 << 27, 1 << 26, 1 << 25, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srlv_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 0, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let count = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_mask_srlv_epi64(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_srlv_epi64(a, 0b11111111, a, count); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 << 32, 0, 1 << 30, 1 << 29, + 1 << 28, 1 << 27, 1 << 26, 1 << 25, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srlv_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 0, + ); + let count = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_maskz_srlv_epi64(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srlv_epi64(0b00001111, a, count); + let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 28, 1 << 27, 1 << 26, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_srlv_epi64() { + let a = _mm256_set_epi64x(1 << 5, 0, 0, 0); + let count = _mm256_set1_epi64x(1); + let r = _mm256_mask_srlv_epi64(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_srlv_epi64(a, 0b00001111, a, count); + let e = _mm256_set_epi64x(1 << 4, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_srlv_epi64() { + let a = _mm256_set_epi64x(1 << 5, 0, 0, 0); + let count = _mm256_set1_epi64x(1); + let r = _mm256_maskz_srlv_epi64(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srlv_epi64(0b00001111, a, count); + let e = _mm256_set_epi64x(1 << 4, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_srlv_epi64() { + let a = _mm_set_epi64x(1 << 5, 0); + let count = _mm_set1_epi64x(1); + let r = _mm_mask_srlv_epi64(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_srlv_epi64(a, 0b00000011, a, count); + let e = _mm_set_epi64x(1 << 4, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_srlv_epi64() { + let a = _mm_set_epi64x(1 << 5, 0); + let count = _mm_set1_epi64x(1); + let r = _mm_maskz_srlv_epi64(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srlv_epi64(0b00000011, a, count); + let e = _mm_set_epi64x(1 << 4, 0); + assert_eq_m128i(r, e); + } + + 
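+ // Note on the `sll`/`srl`/`sra` tests that follow: unlike the per-element `sllv`, `srlv`,
+ // and `srav` forms above, these apply one count, taken from the low 64 bits of an
+ // `__m128i`, to every lane. `test_mm512_sll_epi64` exercises this by putting a value only
+ // in the high half of the count vector, which must leave the input unchanged.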
#[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sll_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 63, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let count = _mm_set_epi64x(0, 1); + let r = _mm512_sll_epi64(a, count); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 0, 1 << 33, 1 << 33, 1 << 33, + 1 << 33, 1 << 33, 1 << 33, 1 << 33, + ); + assert_eq_m512i(r, e); + let count = _mm_set_epi64x(1, 0); + let r = _mm512_sll_epi64(a, count); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sll_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 63, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let count = _mm_set_epi64x(0, 1); + let r = _mm512_mask_sll_epi64(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_sll_epi64(a, 0b11111111, a, count); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 0, 1 << 33, 1 << 33, 1 << 33, + 1 << 33, 1 << 33, 1 << 33, 1 << 33, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sll_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 63, + ); + let count = _mm_set_epi64x(0, 1); + let r = _mm512_maskz_sll_epi64(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sll_epi64(0b00001111, a, count); + let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 33, 1 << 33, 1 << 33, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sll_epi64() { + let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32); + let count = _mm_set_epi64x(0, 1); + let r = _mm256_mask_sll_epi64(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_sll_epi64(a, 0b00001111, a, count); + let e = _mm256_set_epi64x(0, 1 << 33, 1 << 33, 1 << 33); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sll_epi64() { + let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32); + let count = _mm_set_epi64x(0, 1); + let r = _mm256_maskz_sll_epi64(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sll_epi64(0b00001111, a, count); + let e = _mm256_set_epi64x(0, 1 << 33, 1 << 33, 1 << 33); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sll_epi64() { + let a = _mm_set_epi64x(1 << 63, 1 << 32); + let count = _mm_set_epi64x(0, 1); + let r = _mm_mask_sll_epi64(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_sll_epi64(a, 0b00000011, a, count); + let e = _mm_set_epi64x(0, 1 << 33); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sll_epi64() { + let a = _mm_set_epi64x(1 << 63, 1 << 32); + let count = _mm_set_epi64x(0, 1); + let r = _mm_maskz_sll_epi64(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sll_epi64(0b00000011, a, count); + let e = _mm_set_epi64x(0, 1 << 33); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srl_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 0, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let count = _mm_set_epi64x(0, 1); + let r = _mm512_srl_epi64(a, count); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 0, 1 << 31, 1 << 31, 1 << 31, + 1 << 31, 1 << 31, 1 << 31, 1 << 31, + ); + assert_eq_m512i(r, e); + } + + 
#[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srl_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 0, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + ); + let count = _mm_set_epi64x(0, 1); + let r = _mm512_mask_srl_epi64(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_srl_epi64(a, 0b11111111, a, count); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 0, 1 << 31, 1 << 31, 1 << 31, + 1 << 31, 1 << 31, 1 << 31, 1 << 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srl_epi64() { + #[rustfmt::skip] + let a = _mm512_set_epi64( + 1 << 32, 1 << 32, 1 << 32, 1 << 32, + 1 << 32, 1 << 32, 1 << 32, 1 << 0, + ); + let count = _mm_set_epi64x(0, 1); + let r = _mm512_maskz_srl_epi64(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srl_epi64(0b00001111, a, count); + let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 31, 1 << 31, 1 << 31, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_srl_epi64() { + let a = _mm256_set_epi64x(1 << 5, 0, 0, 0); + let count = _mm_set_epi64x(0, 1); + let r = _mm256_mask_srl_epi64(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_srl_epi64(a, 0b00001111, a, count); + let e = _mm256_set_epi64x(1 << 4, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_srl_epi64() { + let a = _mm256_set_epi64x(1 << 5, 0, 0, 0); + let count = _mm_set_epi64x(0, 1); + let r = _mm256_maskz_srl_epi64(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srl_epi64(0b00001111, a, count); + let e = _mm256_set_epi64x(1 << 4, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_srl_epi64() { + let a = _mm_set_epi64x(1 << 5, 0); + let count = _mm_set_epi64x(0, 1); + let r = _mm_mask_srl_epi64(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_srl_epi64(a, 0b00000011, a, count); + let e = _mm_set_epi64x(1 << 4, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_srl_epi64() { + let a = _mm_set_epi64x(1 << 5, 0); + let count = _mm_set_epi64x(0, 1); + let r = _mm_maskz_srl_epi64(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srl_epi64(0b00000011, a, count); + let e = _mm_set_epi64x(1 << 4, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sra_epi64() { + let a = _mm512_set_epi64(1, -8, 0, 0, 0, 0, 15, -16); + let count = _mm_set_epi64x(0, 2); + let r = _mm512_sra_epi64(a, count); + let e = _mm512_set_epi64(0, -2, 0, 0, 0, 0, 3, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sra_epi64() { + let a = _mm512_set_epi64(1, -8, 0, 0, 0, 0, 15, -16); + let count = _mm_set_epi64x(0, 2); + let r = _mm512_mask_sra_epi64(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_sra_epi64(a, 0b11111111, a, count); + let e = _mm512_set_epi64(0, -2, 0, 0, 0, 0, 3, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sra_epi64() { + let a = _mm512_set_epi64(1, -8, 0, 0, 0, 0, 15, -16); + let count = _mm_set_epi64x(0, 2); + let r = _mm512_maskz_sra_epi64(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sra_epi64(0b00001111, a, count); + let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 3, -4); + 
assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_sra_epi64() { + let a = _mm256_set_epi64x(1 << 5, 0, 0, 0); + let count = _mm_set_epi64x(0, 1); + let r = _mm256_sra_epi64(a, count); + let e = _mm256_set_epi64x(1 << 4, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sra_epi64() { + let a = _mm256_set_epi64x(1 << 5, 0, 0, 0); + let count = _mm_set_epi64x(0, 1); + let r = _mm256_mask_sra_epi64(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_sra_epi64(a, 0b00001111, a, count); + let e = _mm256_set_epi64x(1 << 4, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sra_epi64() { + let a = _mm256_set_epi64x(1 << 5, 0, 0, 0); + let count = _mm_set_epi64x(0, 1); + let r = _mm256_maskz_sra_epi64(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sra_epi64(0b00001111, a, count); + let e = _mm256_set_epi64x(1 << 4, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_sra_epi64() { + let a = _mm_set_epi64x(1 << 5, 0); + let count = _mm_set_epi64x(0, 1); + let r = _mm_sra_epi64(a, count); + let e = _mm_set_epi64x(1 << 4, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sra_epi64() { + let a = _mm_set_epi64x(1 << 5, 0); + let count = _mm_set_epi64x(0, 1); + let r = _mm_mask_sra_epi64(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_sra_epi64(a, 0b00000011, a, count); + let e = _mm_set_epi64x(1 << 4, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sra_epi64() { + let a = _mm_set_epi64x(1 << 5, 0); + let count = _mm_set_epi64x(0, 1); + let r = _mm_maskz_sra_epi64(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sra_epi64(0b00000011, a, count); + let e = _mm_set_epi64x(1 << 4, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srav_epi64() { + let a = _mm512_set_epi64(1, -8, 0, 0, 0, 0, 15, -16); + let count = _mm512_set_epi64(2, 2, 0, 0, 0, 0, 2, 1); + let r = _mm512_srav_epi64(a, count); + let e = _mm512_set_epi64(0, -2, 0, 0, 0, 0, 3, -8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srav_epi64() { + let a = _mm512_set_epi64(1, -8, 0, 0, 0, 0, 15, -16); + let count = _mm512_set_epi64(2, 2, 0, 0, 0, 0, 2, 1); + let r = _mm512_mask_srav_epi64(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_srav_epi64(a, 0b11111111, a, count); + let e = _mm512_set_epi64(0, -2, 0, 0, 0, 0, 3, -8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srav_epi64() { + let a = _mm512_set_epi64(1, -8, 0, 0, 0, 0, 15, -16); + let count = _mm512_set_epi64(2, 2, 0, 0, 0, 0, 2, 1); + let r = _mm512_maskz_srav_epi64(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srav_epi64(0b00001111, a, count); + let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 3, -8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_srav_epi64() { + let a = _mm256_set_epi64x(1 << 5, 0, 0, 0); + let count = _mm256_set1_epi64x(1); + let r = _mm256_srav_epi64(a, count); + let e = _mm256_set_epi64x(1 << 4, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm256_mask_srav_epi64() { + let a = _mm256_set_epi64x(1 << 5, 0, 0, 0); + let count = _mm256_set1_epi64x(1); + let r = _mm256_mask_srav_epi64(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_srav_epi64(a, 0b00001111, a, count); + let e = _mm256_set_epi64x(1 << 4, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_srav_epi64() { + let a = _mm256_set_epi64x(1 << 5, 0, 0, 0); + let count = _mm256_set1_epi64x(1); + let r = _mm256_maskz_srav_epi64(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srav_epi64(0b00001111, a, count); + let e = _mm256_set_epi64x(1 << 4, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_srav_epi64() { + let a = _mm_set_epi64x(1 << 5, 0); + let count = _mm_set1_epi64x(1); + let r = _mm_srav_epi64(a, count); + let e = _mm_set_epi64x(1 << 4, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_srav_epi64() { + let a = _mm_set_epi64x(1 << 5, 0); + let count = _mm_set1_epi64x(1); + let r = _mm_mask_srav_epi64(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_srav_epi64(a, 0b00000011, a, count); + let e = _mm_set_epi64x(1 << 4, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_srav_epi64() { + let a = _mm_set_epi64x(1 << 5, 0); + let count = _mm_set1_epi64x(1); + let r = _mm_maskz_srav_epi64(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srav_epi64(0b00000011, a, count); + let e = _mm_set_epi64x(1 << 4, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srai_epi64() { + let a = _mm512_set_epi64(1, -4, 15, 0, 0, 0, 0, -16); + let r = _mm512_srai_epi64::<2>(a); + let e = _mm512_set_epi64(0, -1, 3, 0, 0, 0, 0, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srai_epi64() { + let a = _mm512_set_epi64(1, -4, 15, 0, 0, 0, 0, -16); + let r = _mm512_mask_srai_epi64::<2>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_srai_epi64::<2>(a, 0b11111111, a); + let e = _mm512_set_epi64(0, -1, 3, 0, 0, 0, 0, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srai_epi64() { + let a = _mm512_set_epi64(1, -4, 15, 0, 0, 0, 0, -16); + let r = _mm512_maskz_srai_epi64::<2>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srai_epi64::<2>(0b00001111, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_srai_epi64() { + let a = _mm256_set_epi64x(1 << 5, 0, 0, 0); + let r = _mm256_srai_epi64::<1>(a); + let e = _mm256_set_epi64x(1 << 4, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_srai_epi64() { + let a = _mm256_set_epi64x(1 << 5, 0, 0, 0); + let r = _mm256_mask_srai_epi64::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_srai_epi64::<1>(a, 0b00001111, a); + let e = _mm256_set_epi64x(1 << 4, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_srai_epi64() { + let a = _mm256_set_epi64x(1 << 5, 0, 0, 0); + let r = _mm256_maskz_srai_epi64::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srai_epi64::<1>(0b00001111, a); + let e = _mm256_set_epi64x(1 
<< 4, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_srai_epi64() { + let a = _mm_set_epi64x(1 << 5, 0); + let r = _mm_srai_epi64::<1>(a); + let e = _mm_set_epi64x(1 << 4, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_srai_epi64() { + let a = _mm_set_epi64x(1 << 5, 0); + let r = _mm_mask_srai_epi64::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_srai_epi64::<1>(a, 0b00000011, a); + let e = _mm_set_epi64x(1 << 4, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_srai_epi64() { + let a = _mm_set_epi64x(1 << 5, 0); + let r = _mm_maskz_srai_epi64::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srai_epi64::<1>(0b00000011, a); + let e = _mm_set_epi64x(1 << 4, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permute_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_permute_pd::<0b11_11_11_11>(a); + let e = _mm512_setr_pd(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permute_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_mask_permute_pd::<0b11_11_11_11>(a, 0, a); + assert_eq_m512d(r, a); + let r = _mm512_mask_permute_pd::<0b11_11_11_11>(a, 0b11111111, a); + let e = _mm512_setr_pd(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permute_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_maskz_permute_pd::<0b11_11_11_11>(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_permute_pd::<0b11_11_11_11>(0b11111111, a); + let e = _mm512_setr_pd(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permute_pd() { + let a = _mm256_set_pd(3., 2., 1., 0.); + let r = _mm256_mask_permute_pd::<0b11_11>(a, 0, a); + assert_eq_m256d(r, a); + let r = _mm256_mask_permute_pd::<0b11_11>(a, 0b00001111, a); + let e = _mm256_set_pd(3., 3., 1., 1.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permute_pd() { + let a = _mm256_set_pd(3., 2., 1., 0.); + let r = _mm256_maskz_permute_pd::<0b11_11>(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_permute_pd::<0b11_11>(0b00001111, a); + let e = _mm256_set_pd(3., 3., 1., 1.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_permute_pd() { + let a = _mm_set_pd(1., 0.); + let r = _mm_mask_permute_pd::<0b11>(a, 0, a); + assert_eq_m128d(r, a); + let r = _mm_mask_permute_pd::<0b11>(a, 0b00000011, a); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_permute_pd() { + let a = _mm_set_pd(1., 0.); + let r = _mm_maskz_permute_pd::<0b11>(0, a); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_permute_pd::<0b11>(0b00000011, a); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutex_epi64() { + let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_permutex_epi64::<0b11_11_11_11>(a); + let e = _mm512_setr_epi64(3, 3, 3, 3, 7, 7, 7, 7); + assert_eq_m512i(r, e); + } + + 
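+ // Note for the masked permute tests below (and masked tests throughout this module):
+ // mask bit `i` controls element `i`, and `_mm512_set_epi64` lists its arguments from
+ // element 7 down to element 0 (`_mm512_setr_epi64` lists element 0 first), so a mask of
+ // `0b00001111` selects the last four arguments of a `set` constructor. `mask_` variants
+ // keep the matching `src` element where a bit is clear; `maskz_` variants zero it.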
#[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutex_epi64() { + let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_mask_permutex_epi64::<0b11_11_11_11>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutex_epi64::<0b11_11_11_11>(a, 0b11111111, a); + let e = _mm512_setr_epi64(3, 3, 3, 3, 7, 7, 7, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutex_epi64() { + let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_maskz_permutex_epi64::<0b11_11_11_11>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutex_epi64::<0b11_11_11_11>(0b11111111, a); + let e = _mm512_setr_epi64(3, 3, 3, 3, 7, 7, 7, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_permutex_epi64() { + let a = _mm256_set_epi64x(3, 2, 1, 0); + let r = _mm256_permutex_epi64::<0b11_11_11_11>(a); + let e = _mm256_set_epi64x(3, 3, 3, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutex_epi64() { + let a = _mm256_set_epi64x(3, 2, 1, 0); + let r = _mm256_mask_permutex_epi64::<0b11_11_11_11>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_permutex_epi64::<0b11_11_11_11>(a, 0b00001111, a); + let e = _mm256_set_epi64x(3, 3, 3, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm256_maskz_permutex_epi64() { + let a = _mm256_set_epi64x(3, 2, 1, 0); + let r = _mm256_maskz_permutex_epi64::<0b11_11_11_11>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_permutex_epi64::<0b11_11_11_11>(0b00001111, a); + let e = _mm256_set_epi64x(3, 3, 3, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutex_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_permutex_pd::<0b11_11_11_11>(a); + let e = _mm512_setr_pd(3., 3., 3., 3., 7., 7., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutex_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_mask_permutex_pd::<0b11_11_11_11>(a, 0, a); + assert_eq_m512d(r, a); + let r = _mm512_mask_permutex_pd::<0b11_11_11_11>(a, 0b11111111, a); + let e = _mm512_setr_pd(3., 3., 3., 3., 7., 7., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutex_pd() { + let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_maskz_permutex_pd::<0b11_11_11_11>(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_permutex_pd::<0b11_11_11_11>(0b11111111, a); + let e = _mm512_setr_pd(3., 3., 3., 3., 7., 7., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_permutex_pd() { + let a = _mm256_set_pd(0., 1., 2., 3.); + let r = _mm256_permutex_pd::<0b11_11_11_11>(a); + let e = _mm256_set_pd(0., 0., 0., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutex_pd() { + let a = _mm256_set_pd(0., 1., 2., 3.); + let r = _mm256_mask_permutex_pd::<0b11_11_11_11>(a, 0, a); + assert_eq_m256d(r, a); + let r = _mm256_mask_permutex_pd::<0b11_11_11_11>(a, 0b00001111, a); + let e = _mm256_set_pd(0., 0., 0., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutex_pd() { + let a = _mm256_set_pd(0., 1., 2., 3.); 
+ let r = _mm256_maskz_permutex_pd::<0b11_11_11_11>(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_permutex_pd::<0b11_11_11_11>(0b00001111, a); + let e = _mm256_set_pd(0., 0., 0., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutevar_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_set1_epi64(0b1); + let r = _mm512_permutevar_pd(a, b); + let e = _mm512_set_pd(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutevar_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_set1_epi64(0b1); + let r = _mm512_mask_permutevar_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_permutevar_pd(a, 0b11111111, a, b); + let e = _mm512_set_pd(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutevar_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_set1_epi64(0b1); + let r = _mm512_maskz_permutevar_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_permutevar_pd(0b00001111, a, b); + let e = _mm512_set_pd(0., 0., 0., 0., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutevar_pd() { + let a = _mm256_set_pd(0., 1., 2., 3.); + let b = _mm256_set1_epi64x(0b1); + let r = _mm256_mask_permutevar_pd(a, 0, a, b); + assert_eq_m256d(r, a); + let r = _mm256_mask_permutevar_pd(a, 0b00001111, a, b); + let e = _mm256_set_pd(1., 1., 3., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutevar_pd() { + let a = _mm256_set_pd(0., 1., 2., 3.); + let b = _mm256_set1_epi64x(0b1); + let r = _mm256_maskz_permutevar_pd(0, a, b); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_permutevar_pd(0b00001111, a, b); + let e = _mm256_set_pd(1., 1., 3., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_permutevar_pd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set1_epi64x(0b1); + let r = _mm_mask_permutevar_pd(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_permutevar_pd(a, 0b00000011, a, b); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_permutevar_pd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set1_epi64x(0b1); + let r = _mm_maskz_permutevar_pd(0, a, b); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_permutevar_pd(0b00000011, a, b); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutexvar_epi64() { + let idx = _mm512_set1_epi64(1); + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_permutexvar_epi64(idx, a); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutexvar_epi64() { + let idx = _mm512_set1_epi64(1); + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_mask_permutexvar_epi64(a, 0, idx, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutexvar_epi64(a, 0b11111111, idx, a); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutexvar_epi64() { + let idx = _mm512_set1_epi64(1); + let a = 
_mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_maskz_permutexvar_epi64(0, idx, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutexvar_epi64(0b00001111, idx, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 6, 6, 6, 6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_permutexvar_epi64() { + let idx = _mm256_set1_epi64x(1); + let a = _mm256_set_epi64x(0, 1, 2, 3); + let r = _mm256_permutexvar_epi64(idx, a); + let e = _mm256_set1_epi64x(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutexvar_epi64() { + let idx = _mm256_set1_epi64x(1); + let a = _mm256_set_epi64x(0, 1, 2, 3); + let r = _mm256_mask_permutexvar_epi64(a, 0, idx, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_permutexvar_epi64(a, 0b00001111, idx, a); + let e = _mm256_set1_epi64x(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutexvar_epi64() { + let idx = _mm256_set1_epi64x(1); + let a = _mm256_set_epi64x(0, 1, 2, 3); + let r = _mm256_maskz_permutexvar_epi64(0, idx, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_permutexvar_epi64(0b00001111, idx, a); + let e = _mm256_set1_epi64x(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutexvar_pd() { + let idx = _mm512_set1_epi64(1); + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_permutexvar_pd(idx, a); + let e = _mm512_set1_pd(6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutexvar_pd() { + let idx = _mm512_set1_epi64(1); + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_mask_permutexvar_pd(a, 0, idx, a); + assert_eq_m512d(r, a); + let r = _mm512_mask_permutexvar_pd(a, 0b11111111, idx, a); + let e = _mm512_set1_pd(6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutexvar_pd() { + let idx = _mm512_set1_epi64(1); + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_maskz_permutexvar_pd(0, idx, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_permutexvar_pd(0b00001111, idx, a); + let e = _mm512_set_pd(0., 0., 0., 0., 6., 6., 6., 6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_permutexvar_pd() { + let idx = _mm256_set1_epi64x(1); + let a = _mm256_set_pd(0., 1., 2., 3.); + let r = _mm256_permutexvar_pd(idx, a); + let e = _mm256_set1_pd(2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutexvar_pd() { + let idx = _mm256_set1_epi64x(1); + let a = _mm256_set_pd(0., 1., 2., 3.); + let r = _mm256_mask_permutexvar_pd(a, 0, idx, a); + assert_eq_m256d(r, a); + let r = _mm256_mask_permutexvar_pd(a, 0b00001111, idx, a); + let e = _mm256_set1_pd(2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutexvar_pd() { + let idx = _mm256_set1_epi64x(1); + let a = _mm256_set_pd(0., 1., 2., 3.); + let r = _mm256_maskz_permutexvar_pd(0, idx, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_permutexvar_pd(0b00001111, idx, a); + let e = _mm256_set1_pd(2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutex2var_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let idx = 
_mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm512_set1_epi64(100); + let r = _mm512_permutex2var_epi64(a, idx, b); + let e = _mm512_set_epi64(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutex2var_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm512_set1_epi64(100); + let r = _mm512_mask_permutex2var_epi64(a, 0, idx, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutex2var_epi64(a, 0b11111111, idx, b); + let e = _mm512_set_epi64(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutex2var_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm512_set1_epi64(100); + let r = _mm512_maskz_permutex2var_epi64(0, a, idx, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutex2var_epi64(0b00001111, a, idx, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 4, 100, 3, 100); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask2_permutex2var_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm512_set_epi64(1000, 1 << 3, 2000, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm512_set1_epi64(100); + let r = _mm512_mask2_permutex2var_epi64(a, idx, 0, b); + assert_eq_m512i(r, idx); + let r = _mm512_mask2_permutex2var_epi64(a, idx, 0b00001111, b); + let e = _mm512_set_epi64(1000, 1 << 3, 2000, 1 << 3, 4, 100, 3, 100); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_permutex2var_epi64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2); + let b = _mm256_set1_epi64x(100); + let r = _mm256_permutex2var_epi64(a, idx, b); + let e = _mm256_set_epi64x(2, 100, 1, 100); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutex2var_epi64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2); + let b = _mm256_set1_epi64x(100); + let r = _mm256_mask_permutex2var_epi64(a, 0, idx, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_permutex2var_epi64(a, 0b00001111, idx, b); + let e = _mm256_set_epi64x(2, 100, 1, 100); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutex2var_epi64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2); + let b = _mm256_set1_epi64x(100); + let r = _mm256_maskz_permutex2var_epi64(0, a, idx, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_permutex2var_epi64(0b00001111, a, idx, b); + let e = _mm256_set_epi64x(2, 100, 1, 100); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask2_permutex2var_epi64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2); + let b = _mm256_set1_epi64x(100); + let r = _mm256_mask2_permutex2var_epi64(a, idx, 0, b); + assert_eq_m256i(r, idx); + let r = _mm256_mask2_permutex2var_epi64(a, idx, 0b00001111, b); + let e = _mm256_set_epi64x(2, 100, 1, 100); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_permutex2var_epi64() { + let a = _mm_set_epi64x(0, 
1); + let idx = _mm_set_epi64x(1, 1 << 1); + let b = _mm_set1_epi64x(100); + let r = _mm_permutex2var_epi64(a, idx, b); + let e = _mm_set_epi64x(0, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_permutex2var_epi64() { + let a = _mm_set_epi64x(0, 1); + let idx = _mm_set_epi64x(1, 1 << 1); + let b = _mm_set1_epi64x(100); + let r = _mm_mask_permutex2var_epi64(a, 0, idx, b); + assert_eq_m128i(r, a); + let r = _mm_mask_permutex2var_epi64(a, 0b00000011, idx, b); + let e = _mm_set_epi64x(0, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_permutex2var_epi64() { + let a = _mm_set_epi64x(0, 1); + let idx = _mm_set_epi64x(1, 1 << 1); + let b = _mm_set1_epi64x(100); + let r = _mm_maskz_permutex2var_epi64(0, a, idx, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_permutex2var_epi64(0b00000011, a, idx, b); + let e = _mm_set_epi64x(0, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask2_permutex2var_epi64() { + let a = _mm_set_epi64x(0, 1); + let idx = _mm_set_epi64x(1, 1 << 1); + let b = _mm_set1_epi64x(100); + let r = _mm_mask2_permutex2var_epi64(a, idx, 0, b); + assert_eq_m128i(r, idx); + let r = _mm_mask2_permutex2var_epi64(a, idx, 0b00000011, b); + let e = _mm_set_epi64x(0, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutex2var_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm512_set1_pd(100.); + let r = _mm512_permutex2var_pd(a, idx, b); + let e = _mm512_set_pd(6., 100., 5., 100., 4., 100., 3., 100.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutex2var_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm512_set1_pd(100.); + let r = _mm512_mask_permutex2var_pd(a, 0, idx, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_permutex2var_pd(a, 0b11111111, idx, b); + let e = _mm512_set_pd(6., 100., 5., 100., 4., 100., 3., 100.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutex2var_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm512_set1_pd(100.); + let r = _mm512_maskz_permutex2var_pd(0, a, idx, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_permutex2var_pd(0b00001111, a, idx, b); + let e = _mm512_set_pd(0., 0., 0., 0., 4., 100., 3., 100.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask2_permutex2var_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm512_set1_pd(100.); + let r = _mm512_mask2_permutex2var_pd(a, idx, 0, b); + assert_eq_m512d(r, _mm512_castsi512_pd(idx)); + let r = _mm512_mask2_permutex2var_pd(a, idx, 0b11111111, b); + let e = _mm512_set_pd(6., 100., 5., 100., 4., 100., 3., 100.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_permutex2var_pd() { + let a = _mm256_set_pd(0., 1., 2., 3.); + let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2); + let b = _mm256_set1_pd(100.); + let r = _mm256_permutex2var_pd(a, idx, b); + let e = 
_mm256_set_pd(2., 100., 1., 100.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutex2var_pd() { + let a = _mm256_set_pd(0., 1., 2., 3.); + let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2); + let b = _mm256_set1_pd(100.); + let r = _mm256_mask_permutex2var_pd(a, 0, idx, b); + assert_eq_m256d(r, a); + let r = _mm256_mask_permutex2var_pd(a, 0b00001111, idx, b); + let e = _mm256_set_pd(2., 100., 1., 100.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutex2var_pd() { + let a = _mm256_set_pd(0., 1., 2., 3.); + let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2); + let b = _mm256_set1_pd(100.); + let r = _mm256_maskz_permutex2var_pd(0, a, idx, b); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_permutex2var_pd(0b00001111, a, idx, b); + let e = _mm256_set_pd(2., 100., 1., 100.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask2_permutex2var_pd() { + let a = _mm256_set_pd(0., 1., 2., 3.); + let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2); + let b = _mm256_set1_pd(100.); + let r = _mm256_mask2_permutex2var_pd(a, idx, 0, b); + assert_eq_m256d(r, _mm256_castsi256_pd(idx)); + let r = _mm256_mask2_permutex2var_pd(a, idx, 0b00001111, b); + let e = _mm256_set_pd(2., 100., 1., 100.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_permutex2var_pd() { + let a = _mm_set_pd(0., 1.); + let idx = _mm_set_epi64x(1, 1 << 1); + let b = _mm_set1_pd(100.); + let r = _mm_permutex2var_pd(a, idx, b); + let e = _mm_set_pd(0., 100.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_permutex2var_pd() { + let a = _mm_set_pd(0., 1.); + let idx = _mm_set_epi64x(1, 1 << 1); + let b = _mm_set1_pd(100.); + let r = _mm_mask_permutex2var_pd(a, 0, idx, b); + assert_eq_m128d(r, a); + let r = _mm_mask_permutex2var_pd(a, 0b00000011, idx, b); + let e = _mm_set_pd(0., 100.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_permutex2var_pd() { + let a = _mm_set_pd(0., 1.); + let idx = _mm_set_epi64x(1, 1 << 1); + let b = _mm_set1_pd(100.); + let r = _mm_maskz_permutex2var_pd(0, a, idx, b); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_permutex2var_pd(0b00000011, a, idx, b); + let e = _mm_set_pd(0., 100.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask2_permutex2var_pd() { + let a = _mm_set_pd(0., 1.); + let idx = _mm_set_epi64x(1, 1 << 1); + let b = _mm_set1_pd(100.); + let r = _mm_mask2_permutex2var_pd(a, idx, 0, b); + assert_eq_m128d(r, _mm_castsi128_pd(idx)); + let r = _mm_mask2_permutex2var_pd(a, idx, 0b00000011, b); + let e = _mm_set_pd(0., 100.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_pd() { + let a = _mm256_set_pd(1., 4., 5., 8.); + let b = _mm256_set_pd(2., 3., 6., 7.); + let r = _mm256_mask_shuffle_pd::<0b11_11_11_11>(a, 0, a, b); + assert_eq_m256d(r, a); + let r = _mm256_mask_shuffle_pd::<0b11_11_11_11>(a, 0b00001111, a, b); + let e = _mm256_set_pd(2., 1., 6., 5.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_pd() { + let a = _mm256_set_pd(1., 4., 5., 8.); + let b = _mm256_set_pd(2., 3., 6., 7.); + let r = _mm256_maskz_shuffle_pd::<0b11_11_11_11>(0, a, b); + assert_eq_m256d(r, 
_mm256_setzero_pd()); + let r = _mm256_maskz_shuffle_pd::<0b11_11_11_11>(0b00001111, a, b); + let e = _mm256_set_pd(2., 1., 6., 5.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_shuffle_pd() { + let a = _mm_set_pd(1., 4.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_shuffle_pd::<0b11_11_11_11>(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_shuffle_pd::<0b11_11_11_11>(a, 0b00000011, a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_shuffle_pd() { + let a = _mm_set_pd(1., 4.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_shuffle_pd::<0b11_11_11_11>(0, a, b); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_shuffle_pd::<0b11_11_11_11>(0b00000011, a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_i64x2() { + let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm512_shuffle_i64x2::<0b00_00_00_00>(a, b); + let e = _mm512_setr_epi64(1, 4, 1, 4, 2, 3, 2, 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_i64x2() { + let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm512_mask_shuffle_i64x2::<0b00_00_00_00>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_shuffle_i64x2::<0b00_00_00_00>(a, 0b11111111, a, b); + let e = _mm512_setr_epi64(1, 4, 1, 4, 2, 3, 2, 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_i64x2() { + let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm512_maskz_shuffle_i64x2::<0b00_00_00_00>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shuffle_i64x2::<0b00_00_00_00>(0b00001111, a, b); + let e = _mm512_setr_epi64(1, 4, 1, 4, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_shuffle_i64x2() { + let a = _mm256_set_epi64x(1, 4, 5, 8); + let b = _mm256_set_epi64x(2, 3, 6, 7); + let r = _mm256_shuffle_i64x2::<0b00>(a, b); + let e = _mm256_set_epi64x(6, 7, 5, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_i64x2() { + let a = _mm256_set_epi64x(1, 4, 5, 8); + let b = _mm256_set_epi64x(2, 3, 6, 7); + let r = _mm256_mask_shuffle_i64x2::<0b00>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shuffle_i64x2::<0b00>(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(6, 7, 5, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_i64x2() { + let a = _mm256_set_epi64x(1, 4, 5, 8); + let b = _mm256_set_epi64x(2, 3, 6, 7); + let r = _mm256_maskz_shuffle_i64x2::<0b00>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shuffle_i64x2::<0b00>(0b00001111, a, b); + let e = _mm256_set_epi64x(6, 7, 5, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_f64x2() { + let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm512_setr_pd(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm512_shuffle_f64x2::<0b00_00_00_00>(a, b); + let e = _mm512_setr_pd(1., 4., 1., 4., 2., 3., 2., 3.); + assert_eq_m512d(r, e); + 
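+ // An all-zero immediate selects 128-bit chunk 0 from each source: the low two result chunks repeat a's chunk 0 and the high two repeat b's chunk 0.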
} + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_f64x2() { + let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm512_setr_pd(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm512_mask_shuffle_f64x2::<0b00_00_00_00>(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_shuffle_f64x2::<0b00_00_00_00>(a, 0b11111111, a, b); + let e = _mm512_setr_pd(1., 4., 1., 4., 2., 3., 2., 3.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_f64x2() { + let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm512_setr_pd(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm512_maskz_shuffle_f64x2::<0b00_00_00_00>(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_shuffle_f64x2::<0b00_00_00_00>(0b00001111, a, b); + let e = _mm512_setr_pd(1., 4., 1., 4., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_shuffle_f64x2() { + let a = _mm256_set_pd(1., 4., 5., 8.); + let b = _mm256_set_pd(2., 3., 6., 7.); + let r = _mm256_shuffle_f64x2::<0b00>(a, b); + let e = _mm256_set_pd(6., 7., 5., 8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_f64x2() { + let a = _mm256_set_pd(1., 4., 5., 8.); + let b = _mm256_set_pd(2., 3., 6., 7.); + let r = _mm256_mask_shuffle_f64x2::<0b00>(a, 0, a, b); + assert_eq_m256d(r, a); + let r = _mm256_mask_shuffle_f64x2::<0b00>(a, 0b00001111, a, b); + let e = _mm256_set_pd(6., 7., 5., 8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_f64x2() { + let a = _mm256_set_pd(1., 4., 5., 8.); + let b = _mm256_set_pd(2., 3., 6., 7.); + let r = _mm256_maskz_shuffle_f64x2::<0b00>(0, a, b); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_shuffle_f64x2::<0b00>(0b00001111, a, b); + let e = _mm256_set_pd(6., 7., 5., 8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_movedup_pd() { + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_movedup_pd(a); + let e = _mm512_setr_pd(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_movedup_pd() { + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_mask_movedup_pd(a, 0, a); + assert_eq_m512d(r, a); + let r = _mm512_mask_movedup_pd(a, 0b11111111, a); + let e = _mm512_setr_pd(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_movedup_pd() { + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_movedup_pd(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_movedup_pd(0b00001111, a); + let e = _mm512_setr_pd(1., 1., 3., 3., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_movedup_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_mask_movedup_pd(a, 0, a); + assert_eq_m256d(r, a); + let r = _mm256_mask_movedup_pd(a, 0b00001111, a); + let e = _mm256_set_pd(2., 2., 4., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_movedup_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_maskz_movedup_pd(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = 
_mm256_maskz_movedup_pd(0b00001111, a); + let e = _mm256_set_pd(2., 2., 4., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_movedup_pd() { + let a = _mm_set_pd(1., 2.); + let r = _mm_mask_movedup_pd(a, 0, a); + assert_eq_m128d(r, a); + let r = _mm_mask_movedup_pd(a, 0b00000011, a); + let e = _mm_set_pd(2., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_movedup_pd() { + let a = _mm_set_pd(1., 2.); + let r = _mm_maskz_movedup_pd(0, a); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_movedup_pd(0b00000011, a); + let e = _mm_set_pd(2., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_inserti64x4() { + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_setr_epi64x(17, 18, 19, 20); + let r = _mm512_inserti64x4::<1>(a, b); + let e = _mm512_setr_epi64(1, 2, 3, 4, 17, 18, 19, 20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_inserti64x4() { + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_setr_epi64x(17, 18, 19, 20); + let r = _mm512_mask_inserti64x4::<1>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_inserti64x4::<1>(a, 0b11111111, a, b); + let e = _mm512_setr_epi64(1, 2, 3, 4, 17, 18, 19, 20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_inserti64x4() { + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_setr_epi64x(17, 18, 19, 20); + let r = _mm512_maskz_inserti64x4::<1>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_inserti64x4::<1>(0b00001111, a, b); + let e = _mm512_setr_epi64(1, 2, 3, 4, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_insertf64x4() { + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_pd(17., 18., 19., 20.); + let r = _mm512_insertf64x4::<1>(a, b); + let e = _mm512_setr_pd(1., 2., 3., 4., 17., 18., 19., 20.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_insertf64x4() { + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_pd(17., 18., 19., 20.); + let r = _mm512_mask_insertf64x4::<1>(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_insertf64x4::<1>(a, 0b11111111, a, b); + let e = _mm512_setr_pd(1., 2., 3., 4., 17., 18., 19., 20.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_insertf64x4() { + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_pd(17., 18., 19., 20.); + let r = _mm512_maskz_insertf64x4::<1>(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_insertf64x4::<1>(0b00001111, a, b); + let e = _mm512_setr_pd(1., 2., 3., 4., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd128_pd512() { + let a = _mm_setr_pd(17., 18.); + let r = _mm512_castpd128_pd512(a); + assert_eq_m128d(_mm512_castpd512_pd128(r), a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd256_pd512() { + let a = _mm256_setr_pd(17., 18., 19., 20.); + let r = _mm512_castpd256_pd512(a); + assert_eq_m256d(_mm512_castpd512_pd256(r), a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_zextpd128_pd512() { + let a = _mm_setr_pd(17., 18.); + let r = _mm512_zextpd128_pd512(a); + let e = 
_mm512_setr_pd(17., 18., 0., 0., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_zextpd256_pd512() { + let a = _mm256_setr_pd(17., 18., 19., 20.); + let r = _mm512_zextpd256_pd512(a); + let e = _mm512_setr_pd(17., 18., 19., 20., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd512_pd128() { + let a = _mm512_setr_pd(17., 18., -1., -1., -1., -1., -1., -1.); + let r = _mm512_castpd512_pd128(a); + let e = _mm_setr_pd(17., 18.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd512_pd256() { + let a = _mm512_setr_pd(17., 18., 19., 20., -1., -1., -1., -1.); + let r = _mm512_castpd512_pd256(a); + let e = _mm256_setr_pd(17., 18., 19., 20.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd_ps() { + let a = _mm512_set1_pd(1.); + let r = _mm512_castpd_ps(a); + let e = _mm512_set_ps( + 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, + 1.875, 0.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd_si512() { + let a = _mm512_set1_pd(1.); + let r = _mm512_castpd_si512(a); + let e = _mm512_set_epi32( + 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, + 0, 1072693248, 0, 1072693248, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi128_si512() { + let a = _mm_setr_epi64x(17, 18); + let r = _mm512_castsi128_si512(a); + assert_eq_m128i(_mm512_castsi512_si128(r), a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi256_si512() { + let a = _mm256_setr_epi64x(17, 18, 19, 20); + let r = _mm512_castsi256_si512(a); + assert_eq_m256i(_mm512_castsi512_si256(r), a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_zextsi128_si512() { + let a = _mm_setr_epi64x(17, 18); + let r = _mm512_zextsi128_si512(a); + let e = _mm512_setr_epi64(17, 18, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_zextsi256_si512() { + let a = _mm256_setr_epi64x(17, 18, 19, 20); + let r = _mm512_zextsi256_si512(a); + let e = _mm512_setr_epi64(17, 18, 19, 20, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi512_si128() { + let a = _mm512_setr_epi64(17, 18, -1, -1, -1, -1, -1, -1); + let r = _mm512_castsi512_si128(a); + let e = _mm_setr_epi64x(17, 18); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi512_si256() { + let a = _mm512_setr_epi64(17, 18, 19, 20, -1, -1, -1, -1); + let r = _mm512_castsi512_si256(a); + let e = _mm256_setr_epi64x(17, 18, 19, 20); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi512_ps() { + let a = _mm512_set1_epi64(1 << 62); + let r = _mm512_castsi512_ps(a); + let e = _mm512_set_ps( + 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi512_pd() { + let a = _mm512_set1_epi64(1 << 62); + let r = _mm512_castsi512_pd(a); + let e = _mm512_set_pd(2., 2., 2., 2., 2., 2., 2., 2.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcastq_epi64() { + let a = _mm_setr_epi64x(17, 18); + let r = _mm512_broadcastq_epi64(a); + let e = _mm512_set1_epi64(17); + 
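+ // broadcastq replicates the lowest 64-bit lane of `a` (17, placed in lane 0 by `_mm_setr_epi64x`) across all eight lanes.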
assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcastq_epi64() { + let src = _mm512_set1_epi64(18); + let a = _mm_setr_epi64x(17, 18); + let r = _mm512_mask_broadcastq_epi64(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_broadcastq_epi64(src, 0b11111111, a); + let e = _mm512_set1_epi64(17); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcastq_epi64() { + let a = _mm_setr_epi64x(17, 18); + let r = _mm512_maskz_broadcastq_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_broadcastq_epi64(0b00001111, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 17, 17, 17, 17); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_broadcastq_epi64() { + let src = _mm256_set1_epi64x(18); + let a = _mm_set_epi64x(17, 18); + let r = _mm256_mask_broadcastq_epi64(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_broadcastq_epi64(src, 0b00001111, a); + let e = _mm256_set1_epi64x(18); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_broadcastq_epi64() { + let a = _mm_set_epi64x(17, 18); + let r = _mm256_maskz_broadcastq_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_broadcastq_epi64(0b00001111, a); + let e = _mm256_set1_epi64x(18); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_broadcastq_epi64() { + let src = _mm_set1_epi64x(18); + let a = _mm_set_epi64x(17, 18); + let r = _mm_mask_broadcastq_epi64(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_broadcastq_epi64(src, 0b00000011, a); + let e = _mm_set1_epi64x(18); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_broadcastq_epi64() { + let a = _mm_set_epi64x(17, 18); + let r = _mm_maskz_broadcastq_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_broadcastq_epi64(0b00000011, a); + let e = _mm_set1_epi64x(18); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcastsd_pd() { + let a = _mm_set_pd(17., 18.); + let r = _mm512_broadcastsd_pd(a); + let e = _mm512_set1_pd(18.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcastsd_pd() { + let src = _mm512_set1_pd(18.); + let a = _mm_set_pd(17., 18.); + let r = _mm512_mask_broadcastsd_pd(src, 0, a); + assert_eq_m512d(r, src); + let r = _mm512_mask_broadcastsd_pd(src, 0b11111111, a); + let e = _mm512_set1_pd(18.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcastsd_pd() { + let a = _mm_set_pd(17., 18.); + let r = _mm512_maskz_broadcastsd_pd(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_broadcastsd_pd(0b00001111, a); + let e = _mm512_set_pd(0., 0., 0., 0., 18., 18., 18., 18.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_broadcastsd_pd() { + let src = _mm256_set1_pd(18.); + let a = _mm_set_pd(17., 18.); + let r = _mm256_mask_broadcastsd_pd(src, 0, a); + assert_eq_m256d(r, src); + let r = _mm256_mask_broadcastsd_pd(src, 0b00001111, a); + let e = _mm256_set1_pd(18.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_broadcastsd_pd() { + let a = _mm_set_pd(17., 18.); + let r = _mm256_maskz_broadcastsd_pd(0, a); + 
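+ // With an all-zero mask, the maskz form zeroes every lane regardless of the source.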
assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_broadcastsd_pd(0b00001111, a); + let e = _mm256_set1_pd(18.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcast_i64x4() { + let a = _mm256_set_epi64x(17, 18, 19, 20); + let r = _mm512_broadcast_i64x4(a); + let e = _mm512_set_epi64(17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcast_i64x4() { + let src = _mm512_set1_epi64(18); + let a = _mm256_set_epi64x(17, 18, 19, 20); + let r = _mm512_mask_broadcast_i64x4(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_broadcast_i64x4(src, 0b11111111, a); + let e = _mm512_set_epi64(17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcast_i64x4() { + let a = _mm256_set_epi64x(17, 18, 19, 20); + let r = _mm512_maskz_broadcast_i64x4(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_broadcast_i64x4(0b00001111, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 17, 18, 19, 20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcast_f64x4() { + let a = _mm256_set_pd(17., 18., 19., 20.); + let r = _mm512_broadcast_f64x4(a); + let e = _mm512_set_pd(17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcast_f64x4() { + let src = _mm512_set1_pd(18.); + let a = _mm256_set_pd(17., 18., 19., 20.); + let r = _mm512_mask_broadcast_f64x4(src, 0, a); + assert_eq_m512d(r, src); + let r = _mm512_mask_broadcast_f64x4(src, 0b11111111, a); + let e = _mm512_set_pd(17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcast_f64x4() { + let a = _mm256_set_pd(17., 18., 19., 20.); + let r = _mm512_maskz_broadcast_f64x4(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_broadcast_f64x4(0b00001111, a); + let e = _mm512_set_pd(0., 0., 0., 0., 17., 18., 19., 20.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_blend_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(2); + let r = _mm512_mask_blend_epi64(0b11110000, a, b); + let e = _mm512_set_epi64(2, 2, 2, 2, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_blend_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(2); + let r = _mm256_mask_blend_epi64(0b00001111, a, b); + let e = _mm256_set1_epi64x(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_blend_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(2); + let r = _mm_mask_blend_epi64(0b00000011, a, b); + let e = _mm_set1_epi64x(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_blend_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(2.); + let r = _mm512_mask_blend_pd(0b11110000, a, b); + let e = _mm512_set_pd(2., 2., 2., 2., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_blend_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set1_pd(2.); + let r = _mm256_mask_blend_pd(0b00001111, a, b); + let e = _mm256_set1_pd(2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] 
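+ // mask_blend takes lanes from `b` where the mask bit is set and from `a` otherwise; 0b11 picks `b` for both lanes, giving `_mm_set1_pd(2.)`.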
+ unsafe fn test_mm_mask_blend_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let r = _mm_mask_blend_pd(0b00000011, a, b); + let e = _mm_set1_pd(2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpackhi_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_unpackhi_epi64(a, b); + let e = _mm512_set_epi64(17, 1, 19, 3, 21, 5, 23, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpackhi_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_mask_unpackhi_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpackhi_epi64(a, 0b11111111, a, b); + let e = _mm512_set_epi64(17, 1, 19, 3, 21, 5, 23, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpackhi_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_maskz_unpackhi_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpackhi_epi64(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 21, 5, 23, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpackhi_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(17, 18, 19, 20); + let r = _mm256_mask_unpackhi_epi64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpackhi_epi64(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(17, 1, 19, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpackhi_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(17, 18, 19, 20); + let r = _mm256_maskz_unpackhi_epi64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpackhi_epi64(0b00001111, a, b); + let e = _mm256_set_epi64x(17, 1, 19, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpackhi_epi64() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_epi64x(17, 18); + let r = _mm_mask_unpackhi_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpackhi_epi64(a, 0b00000011, a, b); + let e = _mm_set_epi64x(17, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpackhi_epi64() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_epi64x(17, 18); + let r = _mm_maskz_unpackhi_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpackhi_epi64(0b00000011, a, b); + let e = _mm_set_epi64x(17, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpackhi_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_unpackhi_pd(a, b); + let e = _mm512_set_pd(17., 1., 19., 3., 21., 5., 23., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpackhi_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_mask_unpackhi_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_unpackhi_pd(a, 0b11111111, a, b); + let e = _mm512_set_pd(17., 1., 19., 3., 21., 
5., 23., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpackhi_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_maskz_unpackhi_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_unpackhi_pd(0b00001111, a, b); + let e = _mm512_set_pd(0., 0., 0., 0., 21., 5., 23., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpackhi_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_pd(17., 18., 19., 20.); + let r = _mm256_mask_unpackhi_pd(a, 0, a, b); + assert_eq_m256d(r, a); + let r = _mm256_mask_unpackhi_pd(a, 0b00001111, a, b); + let e = _mm256_set_pd(17., 1., 19., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpackhi_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_pd(17., 18., 19., 20.); + let r = _mm256_maskz_unpackhi_pd(0, a, b); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_unpackhi_pd(0b00001111, a, b); + let e = _mm256_set_pd(17., 1., 19., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpackhi_pd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(17., 18.); + let r = _mm_mask_unpackhi_pd(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_unpackhi_pd(a, 0b00000011, a, b); + let e = _mm_set_pd(17., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpackhi_pd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(17., 18.); + let r = _mm_maskz_unpackhi_pd(0, a, b); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_unpackhi_pd(0b00000011, a, b); + let e = _mm_set_pd(17., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpacklo_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_unpacklo_epi64(a, b); + let e = _mm512_set_epi64(18, 2, 20, 4, 22, 6, 24, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpacklo_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_mask_unpacklo_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpacklo_epi64(a, 0b11111111, a, b); + let e = _mm512_set_epi64(18, 2, 20, 4, 22, 6, 24, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpacklo_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_maskz_unpacklo_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpacklo_epi64(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 22, 6, 24, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpacklo_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(17, 18, 19, 20); + let r = _mm256_mask_unpacklo_epi64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpacklo_epi64(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(18, 2, 20, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm256_maskz_unpacklo_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(17, 18, 19, 20); + let r = _mm256_maskz_unpacklo_epi64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpacklo_epi64(0b00001111, a, b); + let e = _mm256_set_epi64x(18, 2, 20, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpacklo_epi64() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_epi64x(17, 18); + let r = _mm_mask_unpacklo_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpacklo_epi64(a, 0b00000011, a, b); + let e = _mm_set_epi64x(18, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpacklo_epi64() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_epi64x(17, 18); + let r = _mm_maskz_unpacklo_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpacklo_epi64(0b00000011, a, b); + let e = _mm_set_epi64x(18, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpacklo_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_unpacklo_pd(a, b); + let e = _mm512_set_pd(18., 2., 20., 4., 22., 6., 24., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpacklo_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_mask_unpacklo_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_unpacklo_pd(a, 0b11111111, a, b); + let e = _mm512_set_pd(18., 2., 20., 4., 22., 6., 24., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpacklo_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_maskz_unpacklo_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_unpacklo_pd(0b00001111, a, b); + let e = _mm512_set_pd(0., 0., 0., 0., 22., 6., 24., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpacklo_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_pd(17., 18., 19., 20.); + let r = _mm256_mask_unpacklo_pd(a, 0, a, b); + assert_eq_m256d(r, a); + let r = _mm256_mask_unpacklo_pd(a, 0b00001111, a, b); + let e = _mm256_set_pd(18., 2., 20., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpacklo_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_pd(17., 18., 19., 20.); + let r = _mm256_maskz_unpacklo_pd(0, a, b); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_unpacklo_pd(0b00001111, a, b); + let e = _mm256_set_pd(18., 2., 20., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpacklo_pd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(17., 18.); + let r = _mm_mask_unpacklo_pd(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_unpacklo_pd(a, 0b00000011, a, b); + let e = _mm_set_pd(18., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpacklo_pd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(17., 18.); + let r = _mm_maskz_unpacklo_pd(0, a, b); + assert_eq_m128d(r, 
_mm_setzero_pd()); + let r = _mm_maskz_unpacklo_pd(0b00000011, a, b); + let e = _mm_set_pd(18., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_alignr_epi64() { + let a = _mm512_set_epi64(8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm512_set_epi64(16, 15, 14, 13, 12, 11, 10, 9); + let r = _mm512_alignr_epi64::<0>(a, b); + assert_eq_m512i(r, b); + let r = _mm512_alignr_epi64::<8>(a, b); + assert_eq_m512i(r, b); + let r = _mm512_alignr_epi64::<1>(a, b); + let e = _mm512_set_epi64(1, 16, 15, 14, 13, 12, 11, 10); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_alignr_epi64() { + let a = _mm512_set_epi64(8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm512_set_epi64(16, 15, 14, 13, 12, 11, 10, 9); + let r = _mm512_mask_alignr_epi64::<1>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_alignr_epi64::<1>(a, 0b11111111, a, b); + let e = _mm512_set_epi64(1, 16, 15, 14, 13, 12, 11, 10); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_alignr_epi64() { + let a = _mm512_set_epi64(8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm512_set_epi64(16, 15, 14, 13, 12, 11, 10, 9); + let r = _mm512_maskz_alignr_epi64::<1>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_alignr_epi64::<1>(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 13, 12, 11, 10); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_alignr_epi64() { + let a = _mm256_set_epi64x(4, 3, 2, 1); + let b = _mm256_set_epi64x(8, 7, 6, 5); + let r = _mm256_alignr_epi64::<0>(a, b); + let e = _mm256_set_epi64x(8, 7, 6, 5); + assert_eq_m256i(r, e); + let r = _mm256_alignr_epi64::<1>(a, b); + let e = _mm256_set_epi64x(1, 8, 7, 6); + assert_eq_m256i(r, e); + let r = _mm256_alignr_epi64::<6>(a, b); + let e = _mm256_set_epi64x(2, 1, 8, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_alignr_epi64() { + let a = _mm256_set_epi64x(4, 3, 2, 1); + let b = _mm256_set_epi64x(8, 7, 6, 5); + let r = _mm256_mask_alignr_epi64::<1>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_alignr_epi64::<0>(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(8, 7, 6, 5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_alignr_epi64() { + let a = _mm256_set_epi64x(4, 3, 2, 1); + let b = _mm256_set_epi64x(8, 7, 6, 5); + let r = _mm256_maskz_alignr_epi64::<1>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_alignr_epi64::<0>(0b00001111, a, b); + let e = _mm256_set_epi64x(8, 7, 6, 5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_alignr_epi64() { + let a = _mm_set_epi64x(2, 1); + let b = _mm_set_epi64x(4, 3); + let r = _mm_alignr_epi64::<0>(a, b); + let e = _mm_set_epi64x(4, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_alignr_epi64() { + let a = _mm_set_epi64x(2, 1); + let b = _mm_set_epi64x(4, 3); + let r = _mm_mask_alignr_epi64::<1>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_alignr_epi64::<0>(a, 0b00000011, a, b); + let e = _mm_set_epi64x(4, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_alignr_epi64() { + let a = _mm_set_epi64x(2, 1); + let b = _mm_set_epi64x(4, 3); + let r = _mm_maskz_alignr_epi64::<1>(0, a, b); + assert_eq_m128i(r, 
_mm_setzero_si128()); + let r = _mm_maskz_alignr_epi64::<0>(0b00000011, a, b); + let e = _mm_set_epi64x(4, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_and_epi64() { + let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let r = _mm512_and_epi64(a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_and_epi64() { + let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let r = _mm512_mask_and_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_and_epi64(a, 0b01111111, a, b); + let e = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_and_epi64() { + let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let r = _mm512_maskz_and_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_and_epi64(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_and_epi64() { + let a = _mm256_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm256_set1_epi64x(1 << 0); + let r = _mm256_mask_and_epi64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_and_epi64(a, 0b00001111, a, b); + let e = _mm256_set1_epi64x(1 << 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_and_epi64() { + let a = _mm256_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm256_set1_epi64x(1 << 0); + let r = _mm256_maskz_and_epi64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_and_epi64(0b00001111, a, b); + let e = _mm256_set1_epi64x(1 << 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_and_epi64() { + let a = _mm_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm_set1_epi64x(1 << 0); + let r = _mm_mask_and_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_and_epi64(a, 0b00000011, a, b); + let e = _mm_set1_epi64x(1 << 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_and_epi64() { + let a = _mm_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm_set1_epi64x(1 << 0); + let r = _mm_maskz_and_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_and_epi64(0b00000011, a, b); + let e = _mm_set1_epi64x(1 << 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_and_si512() { + let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let r = _mm512_and_epi64(a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_or_epi64() { + let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let b = _mm512_set_epi64(1 << 13, 0, 
0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let r = _mm512_or_epi64(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 << 0 | 1 << 13 | 1 << 15, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_or_epi64() { + let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let r = _mm512_mask_or_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_or_epi64(a, 0b11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 << 0 | 1 << 13 | 1 << 15, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_or_epi64() { + let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let r = _mm512_maskz_or_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_or_epi64(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_or_epi64() { + let a = _mm256_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm256_set1_epi64x(1 << 13); + let r = _mm256_or_epi64(a, b); + let e = _mm256_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_or_epi64() { + let a = _mm256_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm256_set1_epi64x(1 << 13); + let r = _mm256_mask_or_epi64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_or_epi64(a, 0b00001111, a, b); + let e = _mm256_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_or_epi64() { + let a = _mm256_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm256_set1_epi64x(1 << 13); + let r = _mm256_maskz_or_epi64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_or_epi64(0b00001111, a, b); + let e = _mm256_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_or_epi64() { + let a = _mm_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm_set1_epi64x(1 << 13); + let r = _mm_or_epi64(a, b); + let e = _mm_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_or_epi64() { + let a = _mm_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm_set1_epi64x(1 << 13); + let r = _mm_mask_or_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_or_epi64(a, 0b00000011, a, b); + let e = _mm_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_or_epi64() { + let a = _mm_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm_set1_epi64x(1 << 13); + let r = _mm_maskz_or_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_or_epi64(0b00000011, a, b); + let e = _mm_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_or_si512() { + let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 
0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let r = _mm512_or_epi64(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 1 << 0 | 1 << 13 | 1 << 15, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_xor_epi64() { + let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let r = _mm512_xor_epi64(a, b); + let e = _mm512_set_epi64(1 << 0 | 1 << 13 | 1 << 15, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_xor_epi64() { + let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let r = _mm512_mask_xor_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_xor_epi64(a, 0b11111111, a, b); + let e = _mm512_set_epi64(1 << 0 | 1 << 13 | 1 << 15, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_xor_epi64() { + let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let r = _mm512_maskz_xor_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_xor_epi64(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_xor_epi64() { + let a = _mm256_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm256_set1_epi64x(1 << 13); + let r = _mm256_xor_epi64(a, b); + let e = _mm256_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_xor_epi64() { + let a = _mm256_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm256_set1_epi64x(1 << 13); + let r = _mm256_mask_xor_epi64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_xor_epi64(a, 0b00001111, a, b); + let e = _mm256_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_xor_epi64() { + let a = _mm256_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm256_set1_epi64x(1 << 13); + let r = _mm256_maskz_xor_epi64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_xor_epi64(0b00001111, a, b); + let e = _mm256_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_xor_epi64() { + let a = _mm_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm_set1_epi64x(1 << 13); + let r = _mm_xor_epi64(a, b); + let e = _mm_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_xor_epi64() { + let a = _mm_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm_set1_epi64x(1 << 13); + let r = _mm_mask_xor_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_xor_epi64(a, 0b00000011, a, b); + let e = _mm_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_xor_epi64() { + let a = _mm_set1_epi64x(1 << 0 | 1 << 15); + let b = _mm_set1_epi64x(1 << 13); + let r = _mm_maskz_xor_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = 
_mm_maskz_xor_epi64(0b00000011, a, b); + let e = _mm_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_xor_si512() { + let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); + let r = _mm512_xor_epi64(a, b); + let e = _mm512_set_epi64(1 << 0 | 1 << 13 | 1 << 15, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_andnot_epi64() { + let a = _mm512_set1_epi64(0); + let b = _mm512_set1_epi64(1 << 3 | 1 << 4); + let r = _mm512_andnot_epi64(a, b); + let e = _mm512_set1_epi64(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_andnot_epi64() { + let a = _mm512_set1_epi64(1 << 1 | 1 << 2); + let b = _mm512_set1_epi64(1 << 3 | 1 << 4); + let r = _mm512_mask_andnot_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_andnot_epi64(a, 0b11111111, a, b); + let e = _mm512_set1_epi64(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_andnot_epi64() { + let a = _mm512_set1_epi64(1 << 1 | 1 << 2); + let b = _mm512_set1_epi64(1 << 3 | 1 << 4); + let r = _mm512_maskz_andnot_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_andnot_epi64(0b00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi64( + 0, 0, 0, 0, + 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_andnot_epi64() { + let a = _mm256_set1_epi64x(1 << 1 | 1 << 2); + let b = _mm256_set1_epi64x(1 << 3 | 1 << 4); + let r = _mm256_mask_andnot_epi64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_andnot_epi64(a, 0b00001111, a, b); + let e = _mm256_set1_epi64x(1 << 3 | 1 << 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_andnot_epi64() { + let a = _mm256_set1_epi64x(1 << 1 | 1 << 2); + let b = _mm256_set1_epi64x(1 << 3 | 1 << 4); + let r = _mm256_maskz_andnot_epi64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_andnot_epi64(0b00001111, a, b); + let e = _mm256_set1_epi64x(1 << 3 | 1 << 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_andnot_epi64() { + let a = _mm_set1_epi64x(1 << 1 | 1 << 2); + let b = _mm_set1_epi64x(1 << 3 | 1 << 4); + let r = _mm_mask_andnot_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_andnot_epi64(a, 0b00000011, a, b); + let e = _mm_set1_epi64x(1 << 3 | 1 << 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_andnot_epi64() { + let a = _mm_set1_epi64x(1 << 1 | 1 << 2); + let b = _mm_set1_epi64x(1 << 3 | 1 << 4); + let r = _mm_maskz_andnot_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_andnot_epi64(0b00000011, a, b); + let e = _mm_set1_epi64x(1 << 3 | 1 << 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_andnot_si512() { + let a = _mm512_set1_epi64(0); + let b = _mm512_set1_epi64(1 << 3 | 1 << 4); + let r = _mm512_andnot_si512(a, b); + let e = _mm512_set1_epi64(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_add_epi64() { + 
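+ // reduce_add sums all eight 64-bit lanes, so eight lanes of 1 give 8.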
let a = _mm512_set1_epi64(1); + let e: i64 = _mm512_reduce_add_epi64(a); + assert_eq!(8, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_add_epi64() { + let a = _mm512_set1_epi64(1); + let e: i64 = _mm512_mask_reduce_add_epi64(0b11110000, a); + assert_eq!(4, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_add_pd() { + let a = _mm512_set1_pd(1.); + let e: f64 = _mm512_reduce_add_pd(a); + assert_eq!(8., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_add_pd() { + let a = _mm512_set1_pd(1.); + let e: f64 = _mm512_mask_reduce_add_pd(0b11110000, a); + assert_eq!(4., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_mul_epi64() { + let a = _mm512_set1_epi64(2); + let e: i64 = _mm512_reduce_mul_epi64(a); + assert_eq!(256, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_mul_epi64() { + let a = _mm512_set1_epi64(2); + let e: i64 = _mm512_mask_reduce_mul_epi64(0b11110000, a); + assert_eq!(16, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_mul_pd() { + let a = _mm512_set1_pd(2.); + let e: f64 = _mm512_reduce_mul_pd(a); + assert_eq!(256., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_mul_pd() { + let a = _mm512_set1_pd(2.); + let e: f64 = _mm512_mask_reduce_mul_pd(0b11110000, a); + assert_eq!(16., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_max_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: i64 = _mm512_reduce_max_epi64(a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_max_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: i64 = _mm512_mask_reduce_max_epi64(0b11110000, a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_max_epu64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: u64 = _mm512_reduce_max_epu64(a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_max_epu64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: u64 = _mm512_mask_reduce_max_epu64(0b11110000, a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_max_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let e: f64 = _mm512_reduce_max_pd(a); + assert_eq!(7., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_max_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let e: f64 = _mm512_mask_reduce_max_pd(0b11110000, a); + assert_eq!(3., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_min_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: i64 = _mm512_reduce_min_epi64(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_min_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: i64 = _mm512_mask_reduce_min_epi64(0b11110000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_min_epu64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: u64 = _mm512_reduce_min_epu64(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_min_epu64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: u64 = _mm512_mask_reduce_min_epu64(0b11110000, a); + assert_eq!(0, e); + } + + 
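+ // The floating-point min reductions below mirror the integer ones above; the masked variants only fold lanes whose mask bit is set.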
#[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_min_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let e: f64 = _mm512_reduce_min_pd(a); + assert_eq!(0., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_min_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let e: f64 = _mm512_mask_reduce_min_pd(0b11110000, a); + assert_eq!(0., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_and_epi64() { + let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2); + let e: i64 = _mm512_reduce_and_epi64(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_and_epi64() { + let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2); + let e: i64 = _mm512_mask_reduce_and_epi64(0b11110000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_or_epi64() { + let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2); + let e: i64 = _mm512_reduce_or_epi64(a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_or_epi64() { + let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2); + let e: i64 = _mm512_mask_reduce_or_epi64(0b11110000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_extractf64x4_pd() { + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_extractf64x4_pd::<1>(a); + let e = _mm256_setr_pd(5., 6., 7., 8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_extractf64x4_pd() { + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let src = _mm256_set1_pd(100.); + let r = _mm512_mask_extractf64x4_pd::<1>(src, 0, a); + assert_eq_m256d(r, src); + let r = _mm512_mask_extractf64x4_pd::<1>(src, 0b11111111, a); + let e = _mm256_setr_pd(5., 6., 7., 8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_extractf64x4_pd() { + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_extractf64x4_pd::<1>(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm512_maskz_extractf64x4_pd::<1>(0b00000001, a); + let e = _mm256_setr_pd(5., 0., 0., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_extracti64x4_epi64() { + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_extracti64x4_epi64::<0x1>(a); + let e = _mm256_setr_epi64x(5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_extracti64x4_epi64() { + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm256_set1_epi64x(100); + let r = _mm512_mask_extracti64x4_epi64::<0x1>(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_extracti64x4_epi64::<0x1>(src, 0b11111111, a); + let e = _mm256_setr_epi64x(5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_extracti64x4_epi64() { + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_extracti64x4_epi64::<0x1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_extracti64x4_epi64::<0x1>(0b00000001, a); + let e = _mm256_setr_epi64x(5, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compress_epi64() { + let src = _mm512_set1_epi64(200); + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_mask_compress_epi64(src, 0, a); + assert_eq_m512i(r, 
src); + let r = _mm512_mask_compress_epi64(src, 0b01010101, a); + let e = _mm512_set_epi64(200, 200, 200, 200, 1, 3, 5, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_compress_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_maskz_compress_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_compress_epi64(0b01010101, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 1, 3, 5, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compress_epi64() { + let src = _mm256_set1_epi64x(200); + let a = _mm256_set_epi64x(0, 1, 2, 3); + let r = _mm256_mask_compress_epi64(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_compress_epi64(src, 0b00000101, a); + let e = _mm256_set_epi64x(200, 200, 1, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_compress_epi64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let r = _mm256_maskz_compress_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_compress_epi64(0b00000101, a); + let e = _mm256_set_epi64x(0, 0, 1, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compress_epi64() { + let src = _mm_set1_epi64x(200); + let a = _mm_set_epi64x(0, 1); + let r = _mm_mask_compress_epi64(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_compress_epi64(src, 0b00000001, a); + let e = _mm_set_epi64x(200, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_compress_epi64() { + let a = _mm_set_epi64x(0, 1); + let r = _mm_maskz_compress_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_compress_epi64(0b00000001, a); + let e = _mm_set_epi64x(0, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compress_pd() { + let src = _mm512_set1_pd(200.); + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_mask_compress_pd(src, 0, a); + assert_eq_m512d(r, src); + let r = _mm512_mask_compress_pd(src, 0b01010101, a); + let e = _mm512_set_pd(200., 200., 200., 200., 1., 3., 5., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_compress_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_maskz_compress_pd(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_compress_pd(0b01010101, a); + let e = _mm512_set_pd(0., 0., 0., 0., 1., 3., 5., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compress_pd() { + let src = _mm256_set1_pd(200.); + let a = _mm256_set_pd(0., 1., 2., 3.); + let r = _mm256_mask_compress_pd(src, 0, a); + assert_eq_m256d(r, src); + let r = _mm256_mask_compress_pd(src, 0b00000101, a); + let e = _mm256_set_pd(200., 200., 1., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_compress_pd() { + let a = _mm256_set_pd(0., 1., 2., 3.); + let r = _mm256_maskz_compress_pd(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_compress_pd(0b00000101, a); + let e = _mm256_set_pd(0., 0., 1., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compress_pd() { + let src = _mm_set1_pd(200.); + let a = _mm_set_pd(0., 1.); + let r = 
_mm_mask_compress_pd(src, 0, a); + assert_eq_m128d(r, src); + let r = _mm_mask_compress_pd(src, 0b00000001, a); + let e = _mm_set_pd(200., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_compress_pd() { + let a = _mm_set_pd(0., 1.); + let r = _mm_maskz_compress_pd(0, a); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_compress_pd(0b00000001, a); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_expand_epi64() { + let src = _mm512_set1_epi64(200); + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_mask_expand_epi64(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_expand_epi64(src, 0b01010101, a); + let e = _mm512_set_epi64(200, 4, 200, 5, 200, 6, 200, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_expand_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_maskz_expand_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_expand_epi64(0b01010101, a); + let e = _mm512_set_epi64(0, 4, 0, 5, 0, 6, 0, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_expand_epi64() { + let src = _mm256_set1_epi64x(200); + let a = _mm256_set_epi64x(0, 1, 2, 3); + let r = _mm256_mask_expand_epi64(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_expand_epi64(src, 0b00000101, a); + let e = _mm256_set_epi64x(200, 2, 200, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_expand_epi64() { + let a = _mm256_set_epi64x(0, 1, 2, 3); + let r = _mm256_maskz_expand_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_expand_epi64(0b00000101, a); + let e = _mm256_set_epi64x(0, 2, 0, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_expand_epi64() { + let src = _mm_set1_epi64x(200); + let a = _mm_set_epi64x(0, 1); + let r = _mm_mask_expand_epi64(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_expand_epi64(src, 0b00000001, a); + let e = _mm_set_epi64x(200, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_expand_epi64() { + let a = _mm_set_epi64x(0, 1); + let r = _mm_maskz_expand_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_expand_epi64(0b00000001, a); + let e = _mm_set_epi64x(0, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_expand_pd() { + let src = _mm512_set1_pd(200.); + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_mask_expand_pd(src, 0, a); + assert_eq_m512d(r, src); + let r = _mm512_mask_expand_pd(src, 0b01010101, a); + let e = _mm512_set_pd(200., 4., 200., 5., 200., 6., 200., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_expand_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_maskz_expand_pd(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_expand_pd(0b01010101, a); + let e = _mm512_set_pd(0., 4., 0., 5., 0., 6., 0., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_expand_pd() { + let src = _mm256_set1_pd(200.); + let a = _mm256_set_pd(0., 1., 2., 3.); + let r = _mm256_mask_expand_pd(src, 0, a); + 
assert_eq_m256d(r, src); + let r = _mm256_mask_expand_pd(src, 0b00000101, a); + let e = _mm256_set_pd(200., 2., 200., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_expand_pd() { + let a = _mm256_set_pd(0., 1., 2., 3.); + let r = _mm256_maskz_expand_pd(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_expand_pd(0b00000101, a); + let e = _mm256_set_pd(0., 2., 0., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_expand_pd() { + let src = _mm_set1_pd(200.); + let a = _mm_set_pd(0., 1.); + let r = _mm_mask_expand_pd(src, 0, a); + assert_eq_m128d(r, src); + let r = _mm_mask_expand_pd(src, 0b00000001, a); + let e = _mm_set_pd(200., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_expand_pd() { + let a = _mm_set_pd(0., 1.); + let r = _mm_maskz_expand_pd(0, a); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_expand_pd(0b00000001, a); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_loadu_epi64() { + let a = &[4, 3, 2, 5, -8, -9, -64, -50]; + let p = a.as_ptr(); + let r = _mm512_loadu_epi64(black_box(p)); + let e = _mm512_setr_epi64(4, 3, 2, 5, -8, -9, -64, -50); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_loadu_epi64() { + let a = &[4, 3, 2, 5]; + let p = a.as_ptr(); + let r = _mm256_loadu_epi64(black_box(p)); + let e = _mm256_setr_epi64x(4, 3, 2, 5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_loadu_epi64() { + let a = &[4, 3]; + let p = a.as_ptr(); + let r = _mm_loadu_epi64(black_box(p)); + let e = _mm_setr_epi64x(4, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi64_storeu_epi16() { + let a = _mm512_set1_epi64(9); + let mut r = _mm_undefined_si128(); + _mm512_mask_cvtepi64_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set1_epi16(9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi64_storeu_epi16() { + let a = _mm256_set1_epi64x(9); + let mut r = _mm_set1_epi16(0); + _mm256_mask_cvtepi64_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 9, 9, 9, 9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi64_storeu_epi16() { + let a = _mm_set1_epi64x(9); + let mut r = _mm_set1_epi16(0); + _mm_mask_cvtepi64_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 9, 9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi64_storeu_epi16() { + let a = _mm512_set1_epi64(i64::MAX); + let mut r = _mm_undefined_si128(); + _mm512_mask_cvtsepi64_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set1_epi16(i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi64_storeu_epi16() { + let a = _mm256_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi16(0); + _mm256_mask_cvtsepi64_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm_mask_cvtsepi64_storeu_epi16() { + let a = _mm_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi16(0); + _mm_mask_cvtsepi64_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi64_storeu_epi16() { + let a = _mm512_set1_epi64(i64::MAX); + let mut r = _mm_undefined_si128(); + _mm512_mask_cvtusepi64_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set1_epi16(u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi64_storeu_epi16() { + let a = _mm256_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi16(0); + _mm256_mask_cvtusepi64_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set_epi16( + 0, + 0, + 0, + 0, + u16::MAX as i16, + u16::MAX as i16, + u16::MAX as i16, + u16::MAX as i16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi64_storeu_epi16() { + let a = _mm_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi16(0); + _mm_mask_cvtusepi64_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, u16::MAX as i16, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi64_storeu_epi8() { + let a = _mm512_set1_epi64(9); + let mut r = _mm_set1_epi8(0); + _mm512_mask_cvtepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi64_storeu_epi8() { + let a = _mm256_set1_epi64x(9); + let mut r = _mm_set1_epi8(0); + _mm256_mask_cvtepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi64_storeu_epi8() { + let a = _mm_set1_epi64x(9); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi64_storeu_epi8() { + let a = _mm512_set1_epi64(i64::MAX); + let mut r = _mm_set1_epi8(0); + _mm512_mask_cvtsepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi64_storeu_epi8() { + let a = _mm256_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi8(0); + _mm256_mask_cvtsepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi64_storeu_epi8() { + let a = _mm_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtsepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX); 
+ assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi64_storeu_epi8() { + let a = _mm512_set1_epi64(i64::MAX); + let mut r = _mm_set1_epi8(0); + _mm512_mask_cvtusepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi64_storeu_epi8() { + let a = _mm256_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi8(0); + _mm256_mask_cvtusepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi64_storeu_epi8() { + let a = _mm_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtusepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, u8::MAX as i8, u8::MAX as i8, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi64_storeu_epi32() { + let a = _mm512_set1_epi64(9); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtepi64_storeu_epi32(&mut r as *mut _ as *mut i32, 0b11111111, a); + let e = _mm256_set1_epi32(9); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi64_storeu_epi32() { + let a = _mm256_set1_epi64x(9); + let mut r = _mm_set1_epi32(0); + _mm256_mask_cvtepi64_storeu_epi32(&mut r as *mut _ as *mut i32, 0b11111111, a); + let e = _mm_set_epi32(9, 9, 9, 9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi64_storeu_epi32() { + let a = _mm_set1_epi64x(9); + let mut r = _mm_set1_epi16(0); + _mm_mask_cvtepi64_storeu_epi32(&mut r as *mut _ as *mut i32, 0b11111111, a); + let e = _mm_set_epi32(0, 0, 9, 9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi64_storeu_epi32() { + let a = _mm512_set1_epi64(i64::MAX); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtsepi64_storeu_epi32(&mut r as *mut _ as *mut i32, 0b11111111, a); + let e = _mm256_set1_epi32(i32::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi64_storeu_epi32() { + let a = _mm256_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi32(0); + _mm256_mask_cvtsepi64_storeu_epi32(&mut r as *mut _ as *mut i32, 0b00001111, a); + let e = _mm_set1_epi32(i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi64_storeu_epi32() { + let a = _mm_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi16(0); + _mm_mask_cvtsepi64_storeu_epi32(&mut r as *mut _ as *mut i32, 0b00000011, a); + let e = _mm_set_epi32(0, 0, i32::MAX, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi64_storeu_epi32() { + let a = _mm512_set1_epi64(i64::MAX); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtusepi64_storeu_epi32(&mut r as *mut _ as *mut i32, 0b11111111, a); + let e = _mm256_set1_epi32(u32::MAX as i32); + 
assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi64_storeu_epi32() { + let a = _mm256_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi32(0); + _mm256_mask_cvtusepi64_storeu_epi32(&mut r as *mut _ as *mut i32, 0b00001111, a); + let e = _mm_set1_epi32(u32::MAX as i32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi64_storeu_epi32() { + let a = _mm_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi16(0); + _mm_mask_cvtusepi64_storeu_epi32(&mut r as *mut _ as *mut i32, 0b00000011, a); + let e = _mm_set_epi32(0, 0, u32::MAX as i32, u32::MAX as i32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_storeu_epi64() { + let a = _mm512_set1_epi64(9); + let mut r = _mm512_set1_epi64(0); + _mm512_storeu_epi64(&mut r as *mut _ as *mut i64, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_storeu_epi64() { + let a = _mm256_set1_epi64x(9); + let mut r = _mm256_set1_epi64x(0); + _mm256_storeu_epi64(&mut r as *mut _ as *mut i64, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_storeu_epi64() { + let a = _mm_set1_epi64x(9); + let mut r = _mm_set1_epi64x(0); + _mm_storeu_epi64(&mut r as *mut _ as *mut i64, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_load_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 8], // 64 bytes + } + let a = Align { + data: [4, 3, 2, 5, -8, -9, -64, -50], + }; + let p = (a.data).as_ptr(); + let r = _mm512_load_epi64(black_box(p)); + let e = _mm512_setr_epi64(4, 3, 2, 5, -8, -9, -64, -50); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_load_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 4], + } + let a = Align { data: [4, 3, 2, 5] }; + let p = (a.data).as_ptr(); + let r = _mm256_load_epi64(black_box(p)); + let e = _mm256_set_epi64x(5, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_load_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 2], + } + let a = Align { data: [4, 3] }; + let p = (a.data).as_ptr(); + let r = _mm_load_epi64(black_box(p)); + let e = _mm_set_epi64x(3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_store_epi64() { + let a = _mm512_set1_epi64(9); + let mut r = _mm512_set1_epi64(0); + _mm512_store_epi64(&mut r as *mut _ as *mut i64, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_store_epi64() { + let a = _mm256_set1_epi64x(9); + let mut r = _mm256_set1_epi64x(0); + _mm256_store_epi64(&mut r as *mut _ as *mut i64, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_store_epi64() { + let a = _mm_set1_epi64x(9); + let mut r = _mm_set1_epi64x(0); + _mm_store_epi64(&mut r as *mut _ as *mut i64, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_load_pd() { + #[repr(align(64))] + struct Align { + data: [f64; 8], // 64 bytes + } + let a = Align { + data: [4., 3., 2., 5., -8., -9., -64., -50.], + }; + let p = (a.data).as_ptr(); + let r = _mm512_load_pd(black_box(p)); + let e = _mm512_setr_pd(4., 3., 2., 5., -8., -9., -64., -50.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_store_pd() { + let a 
= _mm512_set1_pd(9.); + let mut r = _mm512_undefined_pd(); + _mm512_store_pd(&mut r as *mut _ as *mut f64, a); + assert_eq_m512d(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_test_epi64_mask() { + let a = _mm512_set1_epi64(1 << 0); + let b = _mm512_set1_epi64(1 << 0 | 1 << 1); + let r = _mm512_test_epi64_mask(a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_test_epi64_mask() { + let a = _mm512_set1_epi64(1 << 0); + let b = _mm512_set1_epi64(1 << 0 | 1 << 1); + let r = _mm512_mask_test_epi64_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_test_epi64_mask(0b11111111, a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_test_epi64_mask() { + let a = _mm256_set1_epi64x(1 << 0); + let b = _mm256_set1_epi64x(1 << 0 | 1 << 1); + let r = _mm256_test_epi64_mask(a, b); + let e: __mmask8 = 0b00001111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_test_epi64_mask() { + let a = _mm256_set1_epi64x(1 << 0); + let b = _mm256_set1_epi64x(1 << 0 | 1 << 1); + let r = _mm256_mask_test_epi64_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm256_mask_test_epi64_mask(0b00001111, a, b); + let e: __mmask8 = 0b00001111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_test_epi64_mask() { + let a = _mm_set1_epi64x(1 << 0); + let b = _mm_set1_epi64x(1 << 0 | 1 << 1); + let r = _mm_test_epi64_mask(a, b); + let e: __mmask8 = 0b00000011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_test_epi64_mask() { + let a = _mm_set1_epi64x(1 << 0); + let b = _mm_set1_epi64x(1 << 0 | 1 << 1); + let r = _mm_mask_test_epi64_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm_mask_test_epi64_mask(0b00000011, a, b); + let e: __mmask8 = 0b00000011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_testn_epi64_mask() { + let a = _mm512_set1_epi64(1 << 0); + let b = _mm512_set1_epi64(1 << 0 | 1 << 1); + let r = _mm512_testn_epi64_mask(a, b); + let e: __mmask8 = 0b00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_testn_epi64_mask() { + let a = _mm512_set1_epi64(1 << 0); + let b = _mm512_set1_epi64(1 << 1); + let r = _mm512_mask_testn_epi64_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_testn_epi64_mask(0b11111111, a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_testn_epi64_mask() { + let a = _mm256_set1_epi64x(1 << 0); + let b = _mm256_set1_epi64x(1 << 1); + let r = _mm256_testn_epi64_mask(a, b); + let e: __mmask8 = 0b00001111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_testn_epi64_mask() { + let a = _mm256_set1_epi64x(1 << 0); + let b = _mm256_set1_epi64x(1 << 1); + let r = _mm256_mask_testn_epi64_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm256_mask_testn_epi64_mask(0b11111111, a, b); + let e: __mmask8 = 0b00001111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_testn_epi64_mask() { + let a = _mm_set1_epi64x(1 << 0); + let b = _mm_set1_epi64x(1 << 1); + let r = _mm_testn_epi64_mask(a, b); + let e: __mmask8 = 0b00000011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_testn_epi64_mask() { + let a = 
_mm_set1_epi64x(1 << 0); + let b = _mm_set1_epi64x(1 << 1); + let r = _mm_mask_testn_epi64_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm_mask_testn_epi64_mask(0b11111111, a, b); + let e: __mmask8 = 0b00000011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_set1_epi64() { + let src = _mm512_set1_epi64(2); + let a: i64 = 11; + let r = _mm512_mask_set1_epi64(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_set1_epi64(src, 0b11111111, a); + let e = _mm512_set1_epi64(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_set1_epi64() { + let a: i64 = 11; + let r = _mm512_maskz_set1_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_set1_epi64(0b11111111, a); + let e = _mm512_set1_epi64(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_set1_epi64() { + let src = _mm256_set1_epi64x(2); + let a: i64 = 11; + let r = _mm256_mask_set1_epi64(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_set1_epi64(src, 0b00001111, a); + let e = _mm256_set1_epi64x(11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_set1_epi64() { + let a: i64 = 11; + let r = _mm256_maskz_set1_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_set1_epi64(0b00001111, a); + let e = _mm256_set1_epi64x(11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_set1_epi64() { + let src = _mm_set1_epi64x(2); + let a: i64 = 11; + let r = _mm_mask_set1_epi64(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_set1_epi64(src, 0b00000011, a); + let e = _mm_set1_epi64x(11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_set1_epi64() { + let a: i64 = 11; + let r = _mm_maskz_set1_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_set1_epi64(0b00000011, a); + let e = _mm_set1_epi64x(11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtsd_i64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtsd_i64(a); + let e: i64 = -2; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtss_i64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtss_i64(a); + let e: i64 = -2; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundi64_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: i64 = 9; + let r = _mm_cvt_roundi64_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsi64_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: i64 = 9; + let r = _mm_cvt_roundsi64_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvti64_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: i64 = 9; + let r = _mm_cvti64_ss(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvti64_sd() { + let a = _mm_set_pd(1., -1.5); + let b: i64 = 9; + let r = _mm_cvti64_sd(a, b); + let e = _mm_set_pd(1., 9.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm_cvt_roundsd_si64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvt_roundsd_si64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: i64 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsd_i64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvt_roundsd_i64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: i64 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsd_u64() { + let a = _mm_set_pd(1., f64::MAX); + let r = _mm_cvt_roundsd_u64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: u64 = u64::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtsd_u64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtsd_u64(a); + let e: u64 = u64::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundss_i64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvt_roundss_i64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: i64 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundss_si64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvt_roundss_si64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: i64 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundss_u64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvt_roundss_u64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: u64 = u64::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtss_u64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtss_u64(a); + let e: u64 = u64::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttsd_i64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvttsd_i64(a); + let e: i64 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundsd_i64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtt_roundsd_i64::<_MM_FROUND_NO_EXC>(a); + let e: i64 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundsd_si64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtt_roundsd_si64::<_MM_FROUND_NO_EXC>(a); + let e: i64 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundsd_u64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtt_roundsd_u64::<_MM_FROUND_NO_EXC>(a); + let e: u64 = u64::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttsd_u64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvttsd_u64(a); + let e: u64 = u64::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttss_i64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvttss_i64(a); + let e: i64 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundss_i64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtt_roundss_i64::<_MM_FROUND_NO_EXC>(a); + let e: i64 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundss_si64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtt_roundss_si64::<_MM_FROUND_NO_EXC>(a); + let e: i64 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundss_u64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = 
_mm_cvtt_roundss_u64::<_MM_FROUND_NO_EXC>(a); + let e: u64 = u64::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttss_u64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvttss_u64(a); + let e: u64 = u64::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtu64_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: u64 = 9; + let r = _mm_cvtu64_ss(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtu64_sd() { + let a = _mm_set_pd(1., -1.5); + let b: u64 = 9; + let r = _mm_cvtu64_sd(a, b); + let e = _mm_set_pd(1., 9.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundu64_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: u64 = 9; + let r = _mm_cvt_roundu64_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundu64_sd() { + let a = _mm_set_pd(1., -1.5); + let b: u64 = 9; + let r = _mm_cvt_roundu64_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_pd(1., 9.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundi64_sd() { + let a = _mm_set_pd(1., -1.5); + let b: i64 = 9; + let r = _mm_cvt_roundi64_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_pd(1., 9.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsi64_sd() { + let a = _mm_set_pd(1., -1.5); + let b: i64 = 9; + let r = _mm_cvt_roundsi64_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_pd(1., 9.); + assert_eq_m128d(r, e); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx512fp16.rs b/library/stdarch/crates/core_arch/src/x86_64/avx512fp16.rs new file mode 100644 index 0000000000000..955c6ccc7526b --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/avx512fp16.rs @@ -0,0 +1,321 @@ +use crate::core_arch::x86::*; +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Convert the signed 64-bit integer b to a half-precision (16-bit) floating-point element, store the +/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements +/// of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti64_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtsi2sh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvti64_sh(a: __m128h, b: i64) -> __m128h { + unsafe { vcvtsi642sh(a, b, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the signed 64-bit integer b to a half-precision (16-bit) floating-point element, store the +/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements +/// of dst. 
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi64_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvt_roundi64_sh<const ROUNDING: i32>(a: __m128h, b: i64) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtsi642sh(a, b, ROUNDING) + } +} + +/// Convert the unsigned 64-bit integer b to a half-precision (16-bit) floating-point element, store the +/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements +/// of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtusi2sh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtu64_sh(a: __m128h, b: u64) -> __m128h { + unsafe { vcvtusi642sh(a, b, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the unsigned 64-bit integer b to a half-precision (16-bit) floating-point element, store the +/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements +/// of dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu64_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvt_roundu64_sh<const ROUNDING: i32>(a: __m128h, b: u64) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtusi642sh(a, b, ROUNDING) + } +} + +/// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit integer, and store +/// the result in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_i64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtsh2si))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtsh_i64(a: __m128h) -> i64 { + unsafe { vcvtsh2si64(a, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit integer, and store +/// the result in dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_i64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvt_roundsh_i64<const ROUNDING: i32>(a: __m128h) -> i64 { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtsh2si64(a, ROUNDING) + } +} + +/// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit unsigned integer, and store +/// the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_u64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtsh2usi))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtsh_u64(a: __m128h) -> u64 { + unsafe { vcvtsh2usi64(a, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit unsigned integer, and store +/// the result in dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_u64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtsh2usi, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvt_roundsh_u64<const ROUNDING: i32>(a: __m128h) -> u64 { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtsh2usi64(a, ROUNDING) + } +} + +/// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit integer with truncation, +/// and store the result in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_i64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttsh2si))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvttsh_i64(a: __m128h) -> i64 { + unsafe { vcvttsh2si64(a, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit integer with truncation, +/// and store the result in dst. +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_i64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtt_roundsh_i64<const SAE: i32>(a: __m128h) -> i64 { + unsafe { + static_assert_sae!(SAE); + vcvttsh2si64(a, SAE) + } +} + +/// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit unsigned integer with truncation, +/// and store the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_u64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttsh2usi))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvttsh_u64(a: __m128h) -> u64 { + unsafe { vcvttsh2usi64(a, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit unsigned integer with truncation, +/// and store the result in dst. +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_u64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtt_roundsh_u64<const SAE: i32>(a: __m128h) -> u64 { + unsafe { + static_assert_sae!(SAE); + vcvttsh2usi64(a, SAE) + } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512fp16.vcvtsi642sh"] + fn vcvtsi642sh(a: __m128h, b: i64, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.vcvtusi642sh"] + fn vcvtusi642sh(a: __m128h, b: u64, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.vcvtsh2si64"] + fn vcvtsh2si64(a: __m128h, rounding: i32) -> i64; + #[link_name = "llvm.x86.avx512fp16.vcvtsh2usi64"] + fn vcvtsh2usi64(a: __m128h, rounding: i32) -> u64; + #[link_name = "llvm.x86.avx512fp16.vcvttsh2si64"] + fn vcvttsh2si64(a: __m128h, sae: i32) -> i64; + #[link_name = "llvm.x86.avx512fp16.vcvttsh2usi64"] + fn vcvttsh2usi64(a: __m128h, sae: i32) -> u64; +} + +#[cfg(test)] +mod tests { + use crate::core_arch::{x86::*, x86_64::*}; + use stdarch_test::simd_test; + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvti64_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvti64_sh(a, 10); + let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvt_roundi64_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvt_roundi64_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10); + let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtu64_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtu64_sh(a, 10); + let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvt_roundu64_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvt_roundu64_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10); + let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsh_i64() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtsh_i64(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundsh_i64() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvt_roundsh_i64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsh_u64() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtsh_u64(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundsh_u64() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvt_roundsh_u64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvttsh_i64() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvttsh_i64(a); + assert_eq!(r, 1); + } + + 
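// Editor's note — illustrative sketch, not part of this patch: the conversion tests in
// this module use 1.0 in lane 0, where the rounding mode cannot change the result. The
// plain-Rust model below (made-up helper names, an f32 stand-in for the f16 source, and
// no MXCSR or exception behaviour) shows the difference the tests do not exercise:
// `cvt`-style intrinsics honour the selected rounding mode, while `cvtt`-style
// intrinsics always truncate toward zero.
fn cvt_round_to_nearest_model(x: f32) -> i64 {
    x.round_ties_even() as i64 // _MM_FROUND_TO_NEAREST_INT-style rounding
}

fn cvtt_model(x: f32) -> i64 {
    x.trunc() as i64 // truncation, as in _mm_cvttsh_i64 / _MM_FROUND_TO_ZERO
}

#[test]
fn cvt_vs_cvtt_models_differ_away_from_integers() {
    // On 1.0 both flavours agree, which is why the tests above can assert the
    // same value for _mm_cvtsh_i64 and _mm_cvttsh_i64.
    assert_eq!(cvt_round_to_nearest_model(1.0), 1);
    assert_eq!(cvtt_model(1.0), 1);
    // On -1.5 they part ways: round-to-nearest-even gives -2, truncation gives -1.
    assert_eq!(cvt_round_to_nearest_model(-1.5), -2);
    assert_eq!(cvtt_model(-1.5), -1);
}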
#[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtt_roundsh_i64() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtt_roundsh_i64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvttsh_u64() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvttsh_u64(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtt_roundsh_u64() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtt_roundsh_u64::<_MM_FROUND_NO_EXC>(a); + assert_eq!(r, 1); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/bmi.rs b/library/stdarch/crates/core_arch/src/x86_64/bmi.rs new file mode 100644 index 0000000000000..5d204d51ae6d9 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/bmi.rs @@ -0,0 +1,183 @@ +//! Bit Manipulation Instruction (BMI) Set 1.0. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions +//! available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [wikipedia_bmi]: https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Extracts bits in range [`start`, `start` + `length`) from `a` into +/// the least significant bits of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bextr_u64) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(bextr))] +#[cfg(not(target_arch = "x86"))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _bextr_u64(a: u64, start: u32, len: u32) -> u64 { + _bextr2_u64(a, ((start & 0xff) | ((len & 0xff) << 8)) as u64) +} + +/// Extracts bits of `a` specified by `control` into +/// the least significant bits of the result. +/// +/// Bits `[7,0]` of `control` specify the index to the first bit in the range +/// to be extracted, and bits `[15,8]` specify the length of the range. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bextr2_u64) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(bextr))] +#[cfg(not(target_arch = "x86"))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _bextr2_u64(a: u64, control: u64) -> u64 { + unsafe { x86_bmi_bextr_64(a, control) } +} + +/// Bitwise logical `AND` of inverted `a` with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_andn_u64) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(andn))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _andn_u64(a: u64, b: u64) -> u64 { + !a & b +} + +/// Extracts lowest set isolated bit. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_blsi_u64) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(blsi))] +#[cfg(not(target_arch = "x86"))] // generates lots of instructions +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _blsi_u64(x: u64) -> u64 { + x & x.wrapping_neg() +} + +/// Gets mask up to lowest set bit. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_blsmsk_u64) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(blsmsk))] +#[cfg(not(target_arch = "x86"))] // generates lots of instructions +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _blsmsk_u64(x: u64) -> u64 { + x ^ (x.wrapping_sub(1_u64)) +} + +/// Resets the lowest set bit of `x`. +/// +/// If `x` is `0`, sets CF. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_blsr_u64) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(blsr))] +#[cfg(not(target_arch = "x86"))] // generates lots of instructions +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _blsr_u64(x: u64) -> u64 { + x & (x.wrapping_sub(1)) +} + +/// Counts the number of trailing least significant zero bits. +/// +/// When the source operand is `0`, it returns its size in bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tzcnt_u64) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(tzcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _tzcnt_u64(x: u64) -> u64 { + x.trailing_zeros() as u64 +} + +/// Counts the number of trailing least significant zero bits. +/// +/// When the source operand is `0`, it returns its size in bits. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_tzcnt_64) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(tzcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_tzcnt_64(x: u64) -> i64 { + x.trailing_zeros() as i64 +} + +unsafe extern "C" { + #[link_name = "llvm.x86.bmi.bextr.64"] + fn x86_bmi_bextr_64(x: u64, y: u64) -> u64; +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::{x86::*, x86_64::*}; + + #[simd_test(enable = "bmi1")] + unsafe fn test_bextr_u64() { + let r = _bextr_u64(0b0101_0000u64, 4, 4); + assert_eq!(r, 0b0000_0101u64); + } + + #[simd_test(enable = "bmi1")] + unsafe fn test_andn_u64() { + assert_eq!(_andn_u64(0, 0), 0); + assert_eq!(_andn_u64(0, 1), 1); + assert_eq!(_andn_u64(1, 0), 0); + assert_eq!(_andn_u64(1, 1), 0); + + let r = _andn_u64(0b0000_0000u64, 0b0000_0000u64); + assert_eq!(r, 0b0000_0000u64); + + let r = _andn_u64(0b0000_0000u64, 0b1111_1111u64); + assert_eq!(r, 0b1111_1111u64); + + let r = _andn_u64(0b1111_1111u64, 0b0000_0000u64); + assert_eq!(r, 0b0000_0000u64); + + let r = _andn_u64(0b1111_1111u64, 0b1111_1111u64); + assert_eq!(r, 0b0000_0000u64); + + let r = _andn_u64(0b0100_0000u64, 0b0101_1101u64); + assert_eq!(r, 0b0001_1101u64); + } + + #[simd_test(enable = "bmi1")] + unsafe fn test_blsi_u64() { + assert_eq!(_blsi_u64(0b1101_0000u64), 0b0001_0000u64); + } + + #[simd_test(enable = "bmi1")] + unsafe fn test_blsmsk_u64() { + let r = _blsmsk_u64(0b0011_0000u64); + assert_eq!(r, 0b0001_1111u64); + } + + #[simd_test(enable = "bmi1")] + unsafe fn test_blsr_u64() { + // TODO: test the behavior when the input is `0`. + let r = _blsr_u64(0b0011_0000u64); + assert_eq!(r, 0b0010_0000u64); + } + + #[simd_test(enable = "bmi1")] + unsafe fn test_tzcnt_u64() { + assert_eq!(_tzcnt_u64(0b0000_0001u64), 0u64); + assert_eq!(_tzcnt_u64(0b0000_0000u64), 64u64); + assert_eq!(_tzcnt_u64(0b1001_0000u64), 4u64); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/bmi2.rs b/library/stdarch/crates/core_arch/src/x86_64/bmi2.rs new file mode 100644 index 0000000000000..ea9daf88574f0 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/bmi2.rs @@ -0,0 +1,139 @@ +//! Bit Manipulation Instruction (BMI) Set 2.0. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions +//! available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [wikipedia_bmi]: +//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Unsigned multiply without affecting flags. +/// +/// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with +/// the low half and the high half of the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mulx_u64) +#[inline] +#[cfg_attr(test, assert_instr(mul))] +#[target_feature(enable = "bmi2")] +#[cfg(not(target_arch = "x86"))] // calls an intrinsic +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mulx_u64(a: u64, b: u64, hi: &mut u64) -> u64 { + let result: u128 = (a as u128) * (b as u128); + *hi = (result >> 64) as u64; + result as u64 +} + +/// Zeroes higher bits of `a` >= `index`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bzhi_u64) +#[inline] +#[target_feature(enable = "bmi2")] +#[cfg_attr(test, assert_instr(bzhi))] +#[cfg(not(target_arch = "x86"))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _bzhi_u64(a: u64, index: u32) -> u64 { + unsafe { x86_bmi2_bzhi_64(a, index as u64) } +} + +/// Scatter contiguous low order bits of `a` to the result at the positions +/// specified by the `mask`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_pdep_u64) +#[inline] +#[target_feature(enable = "bmi2")] +#[cfg_attr(test, assert_instr(pdep))] +#[cfg(not(target_arch = "x86"))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _pdep_u64(a: u64, mask: u64) -> u64 { + unsafe { x86_bmi2_pdep_64(a, mask) } +} + +/// Gathers the bits of `x` specified by the `mask` into the contiguous low +/// order bit positions of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_pext_u64) +#[inline] +#[target_feature(enable = "bmi2")] +#[cfg_attr(test, assert_instr(pext))] +#[cfg(not(target_arch = "x86"))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _pext_u64(a: u64, mask: u64) -> u64 { + unsafe { x86_bmi2_pext_64(a, mask) } +} + +unsafe extern "C" { + #[link_name = "llvm.x86.bmi.bzhi.64"] + fn x86_bmi2_bzhi_64(x: u64, y: u64) -> u64; + #[link_name = "llvm.x86.bmi.pdep.64"] + fn x86_bmi2_pdep_64(x: u64, y: u64) -> u64; + #[link_name = "llvm.x86.bmi.pext.64"] + fn x86_bmi2_pext_64(x: u64, y: u64) -> u64; +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86_64::*; + + #[simd_test(enable = "bmi2")] + unsafe fn test_pext_u64() { + let n = 0b1011_1110_1001_0011u64; + + let m0 = 0b0110_0011_1000_0101u64; + let s0 = 0b0000_0000_0011_0101u64; + + let m1 = 0b1110_1011_1110_1111u64; + let s1 = 0b0001_0111_0100_0011u64; + + assert_eq!(_pext_u64(n, m0), s0); + assert_eq!(_pext_u64(n, m1), s1); + } + + #[simd_test(enable = "bmi2")] + unsafe fn test_pdep_u64() { + let n = 0b1011_1110_1001_0011u64; + + let m0 = 0b0110_0011_1000_0101u64; + let s0 = 0b0000_0010_0000_0101u64; + + let m1 = 0b1110_1011_1110_1111u64; + let s1 = 0b1110_1001_0010_0011u64; + + assert_eq!(_pdep_u64(n, m0), s0); + assert_eq!(_pdep_u64(n, m1), s1); + } + + #[simd_test(enable = "bmi2")] + unsafe fn test_bzhi_u64() { + let n = 0b1111_0010u64; + let s = 0b0001_0010u64; + assert_eq!(_bzhi_u64(n, 5), s); + } + + #[simd_test(enable = "bmi2")] + #[rustfmt::skip] + unsafe fn test_mulx_u64() { + let a: u64 = 9_223_372_036_854_775_800; + let b: u64 = 100; + let mut hi = 0; + let lo = _mulx_u64(a, b, &mut hi); + /* +result = 922337203685477580000 = +0b00110001_1111111111111111_1111111111111111_1111111111111111_1111110011100000 + ^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + assert_eq!( + lo, + 
0b11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u64 + ); + assert_eq!(hi, 0b00110001u64); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/bswap.rs b/library/stdarch/crates/core_arch/src/x86_64/bswap.rs new file mode 100644 index 0000000000000..62cd2948ce14d --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/bswap.rs @@ -0,0 +1,29 @@ +//! Byte swap intrinsics. + +#![allow(clippy::module_name_repetitions)] + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Returns an integer with the reversed byte order of x +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bswap64) +#[inline] +#[cfg_attr(test, assert_instr(bswap))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _bswap64(x: i64) -> i64 { + x.swap_bytes() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bswap64() { + unsafe { + assert_eq!(_bswap64(0x0EADBEEFFADECA0E), 0x0ECADEFAEFBEAD0E); + assert_eq!(_bswap64(0x0000000000000000), 0x0000000000000000); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/bt.rs b/library/stdarch/crates/core_arch/src/x86_64/bt.rs new file mode 100644 index 0000000000000..f9aa3e16ccdf0 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/bt.rs @@ -0,0 +1,147 @@ +use crate::arch::asm; +#[cfg(test)] +use stdarch_test::assert_instr; + +// x32 wants to use a 32-bit address size, but asm! defaults to using the full +// register name (e.g. rax). We have to explicitly override the placeholder to +// use the 32-bit register name in that case. +#[cfg(target_pointer_width = "32")] +macro_rules! bt { + ($inst:expr) => { + concat!($inst, " {b}, ({p:e})") + }; +} +#[cfg(target_pointer_width = "64")] +macro_rules! bt { + ($inst:expr) => { + concat!($inst, " {b}, ({p})") + }; +} + +/// Returns the bit in position `b` of the memory addressed by `p`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittest64) +#[inline] +#[cfg_attr(test, assert_instr(bt))] +#[stable(feature = "simd_x86_bittest", since = "1.55.0")] +pub unsafe fn _bittest64(p: *const i64, b: i64) -> u8 { + let r: u8; + asm!( + bt!("btq"), + "setc {r}", + p = in(reg) p, + b = in(reg) b, + r = out(reg_byte) r, + options(readonly, nostack, pure, att_syntax) + ); + r +} + +/// Returns the bit in position `b` of the memory addressed by `p`, then sets the bit to `1`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittestandset64) +#[inline] +#[cfg_attr(test, assert_instr(bts))] +#[stable(feature = "simd_x86_bittest", since = "1.55.0")] +pub unsafe fn _bittestandset64(p: *mut i64, b: i64) -> u8 { + let r: u8; + asm!( + bt!("btsq"), + "setc {r}", + p = in(reg) p, + b = in(reg) b, + r = out(reg_byte) r, + options(nostack, att_syntax) + ); + r +} + +/// Returns the bit in position `b` of the memory addressed by `p`, then resets that bit to `0`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittestandreset64) +#[inline] +#[cfg_attr(test, assert_instr(btr))] +#[stable(feature = "simd_x86_bittest", since = "1.55.0")] +pub unsafe fn _bittestandreset64(p: *mut i64, b: i64) -> u8 { + let r: u8; + asm!( + bt!("btrq"), + "setc {r}", + p = in(reg) p, + b = in(reg) b, + r = out(reg_byte) r, + options(nostack, att_syntax) + ); + r +} + +/// Returns the bit in position `b` of the memory addressed by `p`, then inverts that bit. 
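+///
+/// A minimal usage sketch (illustrative only):
+///
+/// ```
+/// use core::arch::x86_64::_bittestandcomplement64;
+///
+/// let mut x = 0b0100_0000i64;
+/// // SAFETY: `&mut x` is a valid, aligned pointer to an `i64`.
+/// let previous = unsafe { _bittestandcomplement64(&mut x as _, 6) };
+/// assert_eq!(previous, 1); // bit 6 was set...
+/// assert_eq!(x, 0);        // ...and has now been flipped to `0`
+/// ```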
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittestandcomplement64) +#[inline] +#[cfg_attr(test, assert_instr(btc))] +#[stable(feature = "simd_x86_bittest", since = "1.55.0")] +pub unsafe fn _bittestandcomplement64(p: *mut i64, b: i64) -> u8 { + let r: u8; + asm!( + bt!("btcq"), + "setc {r}", + p = in(reg) p, + b = in(reg) b, + r = out(reg_byte) r, + options(nostack, att_syntax) + ); + r +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86_64::*; + + #[test] + #[cfg_attr(miri, ignore)] // Uses inline assembly + fn test_bittest64() { + unsafe { + let a = 0b0101_0000i64; + assert_eq!(_bittest64(&a as _, 4), 1); + assert_eq!(_bittest64(&a as _, 5), 0); + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Uses inline assembly + fn test_bittestandset64() { + unsafe { + let mut a = 0b0101_0000i64; + assert_eq!(_bittestandset64(&mut a as _, 4), 1); + assert_eq!(_bittestandset64(&mut a as _, 4), 1); + assert_eq!(_bittestandset64(&mut a as _, 5), 0); + assert_eq!(_bittestandset64(&mut a as _, 5), 1); + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Uses inline assembly + fn test_bittestandreset64() { + unsafe { + let mut a = 0b0101_0000i64; + assert_eq!(_bittestandreset64(&mut a as _, 4), 1); + assert_eq!(_bittestandreset64(&mut a as _, 4), 0); + assert_eq!(_bittestandreset64(&mut a as _, 5), 0); + assert_eq!(_bittestandreset64(&mut a as _, 5), 0); + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Uses inline assembly + fn test_bittestandcomplement64() { + unsafe { + let mut a = 0b0101_0000i64; + assert_eq!(_bittestandcomplement64(&mut a as _, 4), 1); + assert_eq!(_bittestandcomplement64(&mut a as _, 4), 0); + assert_eq!(_bittestandcomplement64(&mut a as _, 4), 1); + assert_eq!(_bittestandcomplement64(&mut a as _, 5), 0); + assert_eq!(_bittestandcomplement64(&mut a as _, 5), 1); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/cmpxchg16b.rs b/library/stdarch/crates/core_arch/src/x86_64/cmpxchg16b.rs new file mode 100644 index 0000000000000..46a008245bf82 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/cmpxchg16b.rs @@ -0,0 +1,55 @@ +use crate::sync::atomic::Ordering; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Compares and exchange 16 bytes (128 bits) of data atomically. +/// +/// This intrinsic corresponds to the `cmpxchg16b` instruction on `x86_64` +/// processors. It performs an atomic compare-and-swap, updating the `ptr` +/// memory location to `val` if the current value in memory equals `old`. +/// +/// # Return value +/// +/// This function returns the previous value at the memory location. If it is +/// equal to `old` then the memory was updated to `new`. +/// +/// # Memory Orderings +/// +/// This atomic operation has the same semantics of memory orderings as +/// `AtomicUsize::compare_exchange` does, only operating on 16 bytes of memory +/// instead of just a pointer. +/// +/// The failure ordering must be [`Ordering::SeqCst`], [`Ordering::Acquire`] or +/// [`Ordering::Relaxed`]. +/// +/// For more information on memory orderings here see the `compare_exchange` +/// documentation for other `Atomic*` types in the standard library. +/// +/// # Unsafety +/// +/// This method is unsafe because it takes a raw pointer and will attempt to +/// read and possibly write the memory at the pointer. The pointer must also be +/// aligned on a 16-byte boundary. +/// +/// This method also requires the `cmpxchg16b` CPU feature to be available at +/// runtime to work correctly. 
If the CPU running the binary does not actually +/// support `cmpxchg16b` and the program enters an execution path that +/// eventually would reach this function the behavior is undefined. +#[inline] +#[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces +#[cfg_attr(test, assert_instr(cmpxchg16b, success = Ordering::SeqCst, failure = Ordering::SeqCst))] +#[target_feature(enable = "cmpxchg16b")] +#[stable(feature = "cmpxchg16b_intrinsic", since = "1.67.0")] +pub unsafe fn cmpxchg16b( + dst: *mut u128, + old: u128, + new: u128, + success: Ordering, + failure: Ordering, +) -> u128 { + debug_assert!(dst as usize % 16 == 0); + + let res = crate::sync::atomic::atomic_compare_exchange(dst, old, new, success, failure); + res.unwrap_or_else(|x| x) +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/fxsr.rs b/library/stdarch/crates/core_arch/src/x86_64/fxsr.rs new file mode 100644 index 0000000000000..a24b44fb1f7e3 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/fxsr.rs @@ -0,0 +1,88 @@ +//! FXSR floating-point context fast save and restore. + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.fxsave64"] + fn fxsave64(p: *mut u8); + #[link_name = "llvm.x86.fxrstor64"] + fn fxrstor64(p: *const u8); +} + +/// Saves the `x87` FPU, `MMX` technology, `XMM`, and `MXCSR` registers to the +/// 512-byte-long 16-byte-aligned memory region `mem_addr`. +/// +/// A misaligned destination operand raises a general-protection (#GP) or an +/// alignment check exception (#AC). +/// +/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor]. +/// +/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html +/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_fxsave64) +#[inline] +#[target_feature(enable = "fxsr")] +#[cfg_attr(test, assert_instr(fxsave64))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _fxsave64(mem_addr: *mut u8) { + fxsave64(mem_addr) +} + +/// Restores the `XMM`, `MMX`, `MXCSR`, and `x87` FPU registers from the +/// 512-byte-long 16-byte-aligned memory region `mem_addr`. +/// +/// The contents of this memory region should have been written to by a +/// previous +/// `_fxsave` or `_fxsave64` intrinsic. +/// +/// A misaligned destination operand raises a general-protection (#GP) or an +/// alignment check exception (#AC). +/// +/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor]. 
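+///
+/// A minimal save-then-restore sketch (illustrative only; it assumes the
+/// `fxsr` feature has been verified at runtime, and `FxArea` is just an
+/// ad-hoc aligned buffer type):
+///
+/// ```
+/// use core::arch::x86_64::{_fxrstor64, _fxsave64};
+///
+/// // The save area must be 512 bytes long and 16-byte aligned.
+/// #[repr(align(16))]
+/// struct FxArea([u8; 512]);
+///
+/// let mut area = FxArea([0; 512]);
+/// // SAFETY: assumes `fxsr` was detected; `area` is properly sized and aligned.
+/// unsafe {
+///     _fxsave64(area.0.as_mut_ptr());
+///     _fxrstor64(area.0.as_ptr());
+/// }
+/// ```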
+/// +/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html +/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_fxrstor64) +#[inline] +#[target_feature(enable = "fxsr")] +#[cfg_attr(test, assert_instr(fxrstor64))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _fxrstor64(mem_addr: *const u8) { + fxrstor64(mem_addr) +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86_64::*; + use std::{cmp::PartialEq, fmt}; + use stdarch_test::simd_test; + + #[repr(align(16))] + struct FxsaveArea { + data: [u8; 512], // 512 bytes + } + + impl FxsaveArea { + fn new() -> FxsaveArea { + FxsaveArea { data: [0; 512] } + } + fn ptr(&mut self) -> *mut u8 { + self.data.as_mut_ptr() + } + } + + #[simd_test(enable = "fxsr")] + #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri + unsafe fn test_fxsave64() { + let mut a = FxsaveArea::new(); + let mut b = FxsaveArea::new(); + + fxsr::_fxsave64(a.ptr()); + fxsr::_fxrstor64(a.ptr()); + fxsr::_fxsave64(b.ptr()); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/macros.rs b/library/stdarch/crates/core_arch/src/x86_64/macros.rs new file mode 100644 index 0000000000000..53f1d02bd3684 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/macros.rs @@ -0,0 +1,35 @@ +//! Utility macros. + +// Helper macro used to trigger const eval errors when the const generic immediate value `imm` is +// not a round number. +#[allow(unused)] +macro_rules! static_assert_rounding { + ($imm:ident) => { + static_assert!( + $imm == 4 || $imm == 8 || $imm == 9 || $imm == 10 || $imm == 11, + "Invalid IMM value" + ) + }; +} + +// Helper macro used to trigger const eval errors when the const generic immediate value `imm` is +// not a sae number. +#[allow(unused)] +macro_rules! static_assert_sae { + ($imm:ident) => { + static_assert!($imm == 4 || $imm == 8, "Invalid IMM value") + }; +} + +#[cfg(target_pointer_width = "32")] +macro_rules! vps { + ($inst1:expr, $inst2:expr) => { + concat!($inst1, " [{p:e}]", $inst2) + }; +} +#[cfg(target_pointer_width = "64")] +macro_rules! vps { + ($inst1:expr, $inst2:expr) => { + concat!($inst1, " [{p}]", $inst2) + }; +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/mod.rs b/library/stdarch/crates/core_arch/src/x86_64/mod.rs new file mode 100644 index 0000000000000..7d681882bef0c --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/mod.rs @@ -0,0 +1,83 @@ +//! 
`x86_64` intrinsics + +#[macro_use] +mod macros; + +mod fxsr; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::fxsr::*; + +mod sse; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::sse::*; + +mod sse2; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::sse2::*; + +mod sse41; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::sse41::*; + +mod sse42; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::sse42::*; + +mod xsave; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::xsave::*; + +mod abm; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::abm::*; + +mod avx; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::avx::*; + +mod bmi; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::bmi::*; +mod bmi2; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::bmi2::*; + +mod tbm; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::tbm::*; + +mod avx512f; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::avx512f::*; + +mod avx512bw; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub use self::avx512bw::*; + +mod bswap; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::bswap::*; + +mod rdrand; +#[stable(feature = "simd_x86", since = "1.27.0")] +pub use self::rdrand::*; + +mod cmpxchg16b; +#[stable(feature = "cmpxchg16b_intrinsic", since = "1.67.0")] +pub use self::cmpxchg16b::*; + +mod adx; +#[stable(feature = "simd_x86_adx", since = "1.33.0")] +pub use self::adx::*; + +mod bt; +#[stable(feature = "simd_x86_bittest", since = "1.55.0")] +pub use self::bt::*; + +mod avx512fp16; +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub use self::avx512fp16::*; + +mod amx; +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub use self::amx::*; diff --git a/library/stdarch/crates/core_arch/src/x86_64/rdrand.rs b/library/stdarch/crates/core_arch/src/x86_64/rdrand.rs new file mode 100644 index 0000000000000..42e907b4e478d --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/rdrand.rs @@ -0,0 +1,44 @@ +//! RDRAND and RDSEED instructions for returning random numbers from an Intel +//! on-chip hardware random number generator which has been seeded by an +//! on-chip entropy source. + +#![allow(clippy::module_name_repetitions)] + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.x86.rdrand.64"] + fn x86_rdrand64_step() -> (u64, i32); + #[link_name = "llvm.x86.rdseed.64"] + fn x86_rdseed64_step() -> (u64, i32); +} + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Read a hardware generated 64-bit random value and store the result in val. +/// Returns 1 if a random value was generated, and 0 otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdrand64_step) +#[inline] +#[target_feature(enable = "rdrand")] +#[cfg_attr(test, assert_instr(rdrand))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _rdrand64_step(val: &mut u64) -> i32 { + let (v, flag) = x86_rdrand64_step(); + *val = v; + flag +} + +/// Read a 64-bit NIST SP800-90B and SP800-90C compliant random value and store +/// in val. Return 1 if a random value was generated, and 0 otherwise. 
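+///
+/// A minimal usage sketch (illustrative only; it assumes `rdseed` has already
+/// been verified at runtime, e.g. with `is_x86_feature_detected!("rdseed")`):
+///
+/// ```
+/// use core::arch::x86_64::_rdseed64_step;
+///
+/// let mut value = 0u64;
+/// // The entropy source can be transiently exhausted, so retry a few times.
+/// for _ in 0..16 {
+///     // SAFETY: assumes `rdseed` was detected at runtime.
+///     let flag = unsafe { _rdseed64_step(&mut value) };
+///     if flag == 1 {
+///         break; // `value` now holds a hardware-seeded random number
+///     }
+/// }
+/// ```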
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdseed64_step) +#[inline] +#[target_feature(enable = "rdseed")] +#[cfg_attr(test, assert_instr(rdseed))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _rdseed64_step(val: &mut u64) -> i32 { + let (v, flag) = x86_rdseed64_step(); + *val = v; + flag +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse.rs b/library/stdarch/crates/core_arch/src/x86_64/sse.rs new file mode 100644 index 0000000000000..863c3cd2e7012 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/sse.rs @@ -0,0 +1,145 @@ +//! `x86_64` Streaming SIMD Extensions (SSE) + +use crate::core_arch::x86::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.sse.cvtss2si64"] + fn cvtss2si64(a: __m128) -> i64; + #[link_name = "llvm.x86.sse.cvttss2si64"] + fn cvttss2si64(a: __m128) -> i64; + #[link_name = "llvm.x86.sse.cvtsi642ss"] + fn cvtsi642ss(a: __m128, b: i64) -> __m128; +} + +/// Converts the lowest 32 bit float in the input vector to a 64 bit integer. +/// +/// The result is rounded according to the current rounding mode. If the result +/// cannot be represented as a 64 bit integer the result will be +/// `0x8000_0000_0000_0000` (`i64::MIN`) or trigger an invalid operation +/// floating point exception if unmasked (see +/// [`_mm_setcsr`](fn._mm_setcsr.html)). +/// +/// This corresponds to the `CVTSS2SI` instruction (with 64 bit output). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cvtss2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtss_si64(a: __m128) -> i64 { + unsafe { cvtss2si64(a) } +} + +/// Converts the lowest 32 bit float in the input vector to a 64 bit integer +/// with truncation. +/// +/// The result is rounded always using truncation (round towards zero). If the +/// result cannot be represented as a 64 bit integer the result will be +/// `0x8000_0000_0000_0000` (`i64::MIN`) or an invalid operation floating +/// point exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). +/// +/// This corresponds to the `CVTTSS2SI` instruction (with 64 bit output). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cvttss2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvttss_si64(a: __m128) -> i64 { + unsafe { cvttss2si64(a) } +} + +/// Converts a 64 bit integer to a 32 bit float. The result vector is the input +/// vector `a` with the lowest 32 bit float replaced by the converted integer. +/// +/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 64 bit +/// input). 
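+///
+/// A minimal usage sketch (illustrative only; `sse` is part of the `x86_64`
+/// baseline, but the call still goes through an `unsafe` block here):
+///
+/// ```
+/// use core::arch::x86_64::{_mm_cvtsi64_ss, _mm_cvtss_f32, _mm_setr_ps};
+///
+/// // SAFETY: `sse` is always available on `x86_64`.
+/// unsafe {
+///     let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+///     let r = _mm_cvtsi64_ss(a, 42);
+///     // Only the lowest lane is replaced by the converted integer.
+///     assert_eq!(_mm_cvtss_f32(r), 42.0);
+/// }
+/// ```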
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cvtsi2ss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtsi64_ss(a: __m128, b: i64) -> __m128 { + unsafe { cvtsi642ss(a, b) } +} + +#[cfg(test)] +mod tests { + use crate::core_arch::arch::x86_64::*; + use stdarch_test::simd_test; + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cvtss_si64() { + let inputs = &[ + (42.0f32, 42i64), + (-31.4, -31), + (-33.5, -34), + (-34.5, -34), + (4.0e10, 40_000_000_000), + (4.0e-10, 0), + (f32::NAN, i64::MIN), + (2147483500.1, 2147483520), + (9.223371e18, 9223370937343148032), + ]; + for (i, &(xi, e)) in inputs.iter().enumerate() { + let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); + let r = _mm_cvtss_si64(x); + assert_eq!( + e, r, + "TestCase #{} _mm_cvtss_si64({:?}) = {}, expected: {}", + i, x, r, e + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cvttss_si64() { + let inputs = &[ + (42.0f32, 42i64), + (-31.4, -31), + (-33.5, -33), + (-34.5, -34), + (10.999, 10), + (-5.99, -5), + (4.0e10, 40_000_000_000), + (4.0e-10, 0), + (f32::NAN, i64::MIN), + (2147483500.1, 2147483520), + (9.223371e18, 9223370937343148032), + (9.223372e18, i64::MIN), + ]; + for (i, &(xi, e)) in inputs.iter().enumerate() { + let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); + let r = _mm_cvttss_si64(x); + assert_eq!( + e, r, + "TestCase #{} _mm_cvttss_si64({:?}) = {}, expected: {}", + i, x, r, e + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cvtsi64_ss() { + let inputs = &[ + (4555i64, 4555.0f32), + (322223333, 322223330.0), + (-432, -432.0), + (-322223333, -322223330.0), + (9223372036854775807, 9.223372e18), + (-9223372036854775808, -9.223372e18), + ]; + + for &(x, f) in inputs { + let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtsi64_ss(a, x); + let e = _mm_setr_ps(f, 6.0, 7.0, 8.0); + assert_eq_m128(e, r); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse2.rs b/library/stdarch/crates/core_arch/src/x86_64/sse2.rs new file mode 100644 index 0000000000000..475e2d2a83cc3 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/sse2.rs @@ -0,0 +1,224 @@ +//! `x86_64`'s Streaming SIMD Extensions 2 (SSE2) + +use crate::core_arch::x86::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.sse2.cvtsd2si64"] + fn cvtsd2si64(a: __m128d) -> i64; + #[link_name = "llvm.x86.sse2.cvttsd2si64"] + fn cvttsd2si64(a: __m128d) -> i64; +} + +/// Converts the lower double-precision (64-bit) floating-point element in a to +/// a 64-bit integer. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cvtsd2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtsd_si64(a: __m128d) -> i64 { + unsafe { cvtsd2si64(a) } +} + +/// Alias for `_mm_cvtsd_si64` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cvtsd2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtsd_si64x(a: __m128d) -> i64 { + _mm_cvtsd_si64(a) +} + +/// Converts the lower double-precision (64-bit) floating-point element in `a` +/// to a 64-bit integer with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cvttsd2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvttsd_si64(a: __m128d) -> i64 { + unsafe { cvttsd2si64(a) } +} + +/// Alias for `_mm_cvttsd_si64` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cvttsd2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvttsd_si64x(a: __m128d) -> i64 { + _mm_cvttsd_si64(a) +} + +/// Stores a 64-bit integer value in the specified memory location. +/// To minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64) +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movnti))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) { + crate::arch::asm!( + vps!("movnti", ",{a}"), + p = in(reg) mem_addr, + a = in(reg) a, + options(nostack, preserves_flags), + ); +} + +/// Returns a vector whose lowest element is `a` and all higher elements are +/// `0`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtsi64_si128(a: i64) -> __m128i { + _mm_set_epi64x(0, a) +} + +/// Returns a vector whose lowest element is `a` and all higher elements are +/// `0`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtsi64x_si128(a: i64) -> __m128i { + _mm_cvtsi64_si128(a) +} + +/// Returns the lowest element of `a`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtsi128_si64(a: __m128i) -> i64 { + unsafe { simd_extract!(a.as_i64x2(), 0) } +} + +/// Returns the lowest element of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(movq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtsi128_si64x(a: __m128i) -> i64 { + _mm_cvtsi128_si64(a) +} + +/// Returns `a` with its lower element replaced by `b` after converting it to +/// an `f64`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cvtsi2sd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtsi64_sd(a: __m128d, b: i64) -> __m128d { + unsafe { simd_insert!(a, 0, b as f64) } +} + +/// Returns `a` with its lower element replaced by `b` after converting it to +/// an `f64`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd) +#[inline] +#[target_feature(enable = "sse2")] +#[cfg_attr(test, assert_instr(cvtsi2sd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtsi64x_sd(a: __m128d, b: i64) -> __m128d { + _mm_cvtsi64_sd(a, b) +} + +#[cfg(test)] +mod tests { + use crate::core_arch::arch::x86_64::*; + use std::boxed; + use std::ptr; + use stdarch_test::simd_test; + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtsd_si64() { + let r = _mm_cvtsd_si64(_mm_setr_pd(-2.0, 5.0)); + assert_eq!(r, -2_i64); + + let r = _mm_cvtsd_si64(_mm_setr_pd(f64::MAX, f64::MIN)); + assert_eq!(r, i64::MIN); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtsd_si64x() { + let r = _mm_cvtsd_si64x(_mm_setr_pd(f64::NAN, f64::NAN)); + assert_eq!(r, i64::MIN); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvttsd_si64() { + let a = _mm_setr_pd(-1.1, 2.2); + let r = _mm_cvttsd_si64(a); + assert_eq!(r, -1_i64); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvttsd_si64x() { + let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN); + let r = _mm_cvttsd_si64x(a); + assert_eq!(r, i64::MIN); + } + + #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] + unsafe fn test_mm_stream_si64() { + let a: i64 = 7; + let mut mem = boxed::Box::::new(-1); + _mm_stream_si64(ptr::addr_of_mut!(*mem), a); + assert_eq!(a, *mem); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtsi64_si128() { + let r = _mm_cvtsi64_si128(5); + assert_eq_m128i(r, _mm_setr_epi64x(5, 0)); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtsi128_si64() { + let r = _mm_cvtsi128_si64(_mm_setr_epi64x(5, 0)); + assert_eq!(r, 5); + } + + #[simd_test(enable = "sse2")] + unsafe fn test_mm_cvtsi64_sd() { + let a = _mm_set1_pd(3.5); + let r = _mm_cvtsi64_sd(a, 5); + assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5)); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse41.rs b/library/stdarch/crates/core_arch/src/x86_64/sse41.rs new file mode 100644 index 0000000000000..4b7d25f2144b0 --- /dev/null 
+++ b/library/stdarch/crates/core_arch/src/x86_64/sse41.rs
@@ -0,0 +1,59 @@
+//! `x86_64`'s Streaming SIMD Extensions 4.1 (SSE4.1)
+
+use crate::{core_arch::x86::*, mem::transmute};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Extracts a 64-bit integer from `a` selected with `IMM1`
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pextrq, IMM1 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_extract_epi64<const IMM1: i32>(a: __m128i) -> i64 {
+    static_assert_uimm_bits!(IMM1, 1);
+    unsafe { simd_extract!(a.as_i64x2(), IMM1 as u32) }
+}
+
+/// Returns a copy of `a` with the 64-bit integer from `i` inserted at a
+/// location specified by `IMM1`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pinsrq, IMM1 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_insert_epi64<const IMM1: i32>(a: __m128i, i: i64) -> __m128i {
+    static_assert_uimm_bits!(IMM1, 1);
+    unsafe { transmute(simd_insert!(a.as_i64x2(), IMM1 as u32, i)) }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::core_arch::arch::x86_64::*;
+    use stdarch_test::simd_test;
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_extract_epi64() {
+        let a = _mm_setr_epi64x(0, 1);
+        let r = _mm_extract_epi64::<1>(a);
+        assert_eq!(r, 1);
+        let r = _mm_extract_epi64::<0>(a);
+        assert_eq!(r, 0);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_insert_epi64() {
+        let a = _mm_set1_epi64x(0);
+        let e = _mm_setr_epi64x(0, 32);
+        let r = _mm_insert_epi64::<1>(a, 32);
+        assert_eq_m128i(r, e);
+        let e = _mm_setr_epi64x(32, 0);
+        let r = _mm_insert_epi64::<0>(a, 32);
+        assert_eq_m128i(r, e);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse42.rs b/library/stdarch/crates/core_arch/src/x86_64/sse42.rs
new file mode 100644
index 0000000000000..64a23b2b19210
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/sse42.rs
@@ -0,0 +1,37 @@
+//! `x86_64`'s Streaming SIMD Extensions 4.2 (SSE4.2)
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+unsafe extern "C" {
+    #[link_name = "llvm.x86.sse42.crc32.64.64"]
+    fn crc32_64_64(crc: u64, v: u64) -> u64;
+}
+
+/// Starting with the initial value in `crc`, return the accumulated
+/// CRC32-C value for unsigned 64-bit integer `v`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(crc32))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_crc32_u64(crc: u64, v: u64) -> u64 {
+    unsafe { crc32_64_64(crc, v) }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::core_arch::arch::x86_64::*;
+
+    use stdarch_test::simd_test;
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_crc32_u64() {
+        let crc = 0x7819dccd3e824;
+        let v = 0x2a22b845fed;
+        let i = _mm_crc32_u64(crc, v);
+        assert_eq!(i, 0xbb6cdc6c);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/tbm.rs b/library/stdarch/crates/core_arch/src/x86_64/tbm.rs
new file mode 100644
index 0000000000000..002e0059160b7
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/tbm.rs
@@ -0,0 +1,225 @@
+//! Trailing Bit Manipulation (TBM) instruction set.
+//!
+//! The reference is [AMD64 Architecture Programmer's Manual, Volume 3:
+//! General-Purpose and System Instructions][amd64_ref].
+//!
+//! [Wikipedia][wikipedia_bmi] provides a quick overview of the available
+//! instructions.
+//!
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wikipedia_bmi]:
+//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+unsafe extern "C" {
+    #[link_name = "llvm.x86.tbm.bextri.u64"]
+    fn bextri_u64(a: u64, control: u64) -> u64;
+}
+
+/// Extracts bits of `a` specified by `control` into
+/// the least significant bits of the result.
+///
+/// Bits `[7,0]` of `control` specify the index to the first bit in the range to
+/// be extracted, and bits `[15,8]` specify the length of the range. For any bit
+/// position in the specified range that lies beyond the MSB of the source operand,
+/// zeroes will be written. If the range is empty, the result is zero.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(bextr, CONTROL = 0x0404))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86_updates", since = "1.82.0")]
+pub unsafe fn _bextri_u64<const CONTROL: u64>(a: u64) -> u64 {
+    static_assert_uimm_bits!(CONTROL, 16);
+    unsafe { bextri_u64(a, CONTROL) }
+}
+
+/// Clears all bits below the least significant zero bit of `x`.
+///
+/// If there is no zero bit in `x`, it returns zero.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcfill))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcfill_u64(x: u64) -> u64 {
+    x & x.wrapping_add(1)
+}
+
+/// Sets all bits of `x` to 1 except for the least significant zero bit.
+///
+/// If there is no zero bit in `x`, it sets all bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blci))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blci_u64(x: u64) -> u64 {
+    x | !x.wrapping_add(1)
+}
+
+/// Sets the least significant zero bit of `x` and clears all other bits.
+///
+/// If there is no zero bit in `x`, it returns zero.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcic))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcic_u64(x: u64) -> u64 {
+    !x & x.wrapping_add(1)
+}
+
+/// Sets the least significant zero bit of `x` and clears all bits above
+/// that bit.
+///
+/// If there is no zero bit in `x`, it sets all the bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcmsk))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcmsk_u64(x: u64) -> u64 {
+    x ^ x.wrapping_add(1)
+}
+
+/// Sets the least significant zero bit of `x`.
+///
+/// If there is no zero bit in `x`, it returns `x`.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcs))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcs_u64(x: u64) -> u64 {
+    x | x.wrapping_add(1)
+}
+
+/// Sets all bits of `x` below the least significant one.
+///
+/// If there is no set bit in `x`, it sets all the bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blsfill))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsfill_u64(x: u64) -> u64 {
+    x | x.wrapping_sub(1)
+}
+
+/// Clears the least significant bit and sets all other bits.
+///
+/// If there is no set bit in `x`, it sets all the bits.
+#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(blsic))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _blsic_u64(x: u64) -> u64 { + !x | x.wrapping_sub(1) +} + +/// Clears all bits below the least significant zero of `x` and sets all other +/// bits. +/// +/// If the least significant bit of `x` is `0`, it sets all bits. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(t1mskc))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _t1mskc_u64(x: u64) -> u64 { + !x | x.wrapping_add(1) +} + +/// Sets all bits below the least significant one of `x` and clears all other +/// bits. +/// +/// If the least significant bit of `x` is 1, it returns zero. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(tzmsk))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _tzmsk_u64(x: u64) -> u64 { + !x & x.wrapping_sub(1) +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86_64::*; + + #[simd_test(enable = "tbm")] + unsafe fn test_bextri_u64() { + assert_eq!(_bextri_u64::<0x0404>(0b0101_0000u64), 0b0000_0101u64); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blcfill_u64() { + assert_eq!(_blcfill_u64(0b0101_0111u64), 0b0101_0000u64); + assert_eq!(_blcfill_u64(0b1111_1111u64), 0u64); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blci_u64() { + assert_eq!( + _blci_u64(0b0101_0000u64), + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64 + ); + assert_eq!( + _blci_u64(0b1111_1111u64), + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64 + ); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blcic_u64() { + assert_eq!(_blcic_u64(0b0101_0001u64), 0b0000_0010u64); + assert_eq!(_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blcmsk_u64() { + assert_eq!(_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64); + assert_eq!(_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blcs_u64() { + assert_eq!(_blcs_u64(0b0101_0001u64), 0b0101_0011u64); + assert_eq!(_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blsfill_u64() { + assert_eq!(_blsfill_u64(0b0101_0100u64), 0b0101_0111u64); + assert_eq!( + _blsfill_u64(0u64), + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64 + ); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blsic_u64() { + assert_eq!( + _blsic_u64(0b0101_0100u64), + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64 + ); + assert_eq!( + _blsic_u64(0u64), + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64 + ); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_t1mskc_u64() { + assert_eq!( + _t1mskc_u64(0b0101_0111u64), + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64 + ); + assert_eq!( + _t1mskc_u64(0u64), + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64 + ); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_tzmsk_u64() { + assert_eq!(_tzmsk_u64(0b0101_1000u64), 0b0000_0111u64); + assert_eq!(_tzmsk_u64(0b0101_1001u64), 0b0000_0000u64); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/xsave.rs b/library/stdarch/crates/core_arch/src/x86_64/xsave.rs new file mode 100644 index 0000000000000..ca2367307f8db --- 
/dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/xsave.rs @@ -0,0 +1,187 @@ +//! `x86_64`'s `xsave` and `xsaveopt` target feature intrinsics + +#![allow(clippy::module_name_repetitions)] + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.xsave64"] + fn xsave64(p: *mut u8, hi: u32, lo: u32); + #[link_name = "llvm.x86.xrstor64"] + fn xrstor64(p: *const u8, hi: u32, lo: u32); + #[link_name = "llvm.x86.xsaveopt64"] + fn xsaveopt64(p: *mut u8, hi: u32, lo: u32); + #[link_name = "llvm.x86.xsavec64"] + fn xsavec64(p: *mut u8, hi: u32, lo: u32); + #[link_name = "llvm.x86.xsaves64"] + fn xsaves64(p: *mut u8, hi: u32, lo: u32); + #[link_name = "llvm.x86.xrstors64"] + fn xrstors64(p: *const u8, hi: u32, lo: u32); +} + +/// Performs a full or partial save of the enabled processor states to memory at +/// `mem_addr`. +/// +/// State is saved based on bits `[62:0]` in `save_mask` and XCR0. +/// `mem_addr` must be aligned on a 64-byte boundary. +/// +/// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of +/// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsave64) +#[inline] +#[target_feature(enable = "xsave")] +#[cfg_attr(test, assert_instr(xsave64))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) { + xsave64(mem_addr, (save_mask >> 32) as u32, save_mask as u32); +} + +/// Performs a full or partial restore of the enabled processor states using +/// the state information stored in memory at `mem_addr`. +/// +/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and +/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte +/// boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xrstor64) +#[inline] +#[target_feature(enable = "xsave")] +#[cfg_attr(test, assert_instr(xrstor64))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) { + xrstor64(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32); +} + +/// Performs a full or partial save of the enabled processor states to memory at +/// `mem_addr`. +/// +/// State is saved based on bits `[62:0]` in `save_mask` and `XCR0`. +/// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize +/// the manner in which data is saved. The performance of this instruction will +/// be equal to or better than using the `XSAVE64` instruction. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsaveopt64) +#[inline] +#[target_feature(enable = "xsave,xsaveopt")] +#[cfg_attr(test, assert_instr(xsaveopt64))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) { + xsaveopt64(mem_addr, (save_mask >> 32) as u32, save_mask as u32); +} + +/// Performs a full or partial save of the enabled processor states to memory +/// at `mem_addr`. +/// +/// `xsavec` differs from `xsave` in that it uses compaction and that it may +/// use init optimization. State is saved based on bits `[62:0]` in `save_mask` +/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. 
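+///
+/// A minimal usage sketch (illustrative only; `XsaveBuf` is an ad-hoc type,
+/// the 4096-byte size is an assumption that must cover the state components
+/// enabled in `XCR0`, and `xsave`/`xsavec` support is assumed to have been
+/// verified at runtime):
+///
+/// ```
+/// use core::arch::x86_64::_xsavec64;
+///
+/// // The save area must be 64-byte aligned and large enough for the
+/// // components requested by the mask.
+/// #[repr(align(64))]
+/// struct XsaveBuf([u8; 4096]);
+///
+/// let mut area = XsaveBuf([0; 4096]);
+/// // SAFETY: assumes `xsave`/`xsavec` were detected; the buffer is aligned
+/// // and assumed large enough. An all-ones mask requests every enabled component.
+/// unsafe { _xsavec64(area.0.as_mut_ptr(), u64::MAX) };
+/// ```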
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsavec64) +#[inline] +#[target_feature(enable = "xsave,xsavec")] +#[cfg_attr(test, assert_instr(xsavec64))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) { + xsavec64(mem_addr, (save_mask >> 32) as u32, save_mask as u32); +} + +/// Performs a full or partial save of the enabled processor states to memory at +/// `mem_addr` +/// +/// `xsaves` differs from xsave in that it can save state components +/// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the +/// modified optimization. State is saved based on bits `[62:0]` in `save_mask` +/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsaves64) +#[inline] +#[target_feature(enable = "xsave,xsaves")] +#[cfg_attr(test, assert_instr(xsaves64))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) { + xsaves64(mem_addr, (save_mask >> 32) as u32, save_mask as u32); +} + +/// Performs a full or partial restore of the enabled processor states using the +/// state information stored in memory at `mem_addr`. +/// +/// `xrstors` differs from `xrstor` in that it can restore state components +/// corresponding to bits set in the `IA32_XSS` `MSR`; `xrstors` cannot restore +/// from an `xsave` area in which the extended region is in the standard form. +/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and +/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte +/// boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xrstors64) +#[inline] +#[target_feature(enable = "xsave,xsaves")] +#[cfg_attr(test, assert_instr(xrstors64))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xrstors64(mem_addr: *const u8, rs_mask: u64) { + xrstors64(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32); +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86_64::xsave; + use std::fmt; + use stdarch_test::simd_test; + + #[repr(align(64))] + #[derive(Debug)] + struct XsaveArea { + // max size for 256-bit registers is 800 bytes: + // see https://software.intel.com/en-us/node/682996 + // max size for 512-bit registers is 2560 bytes: + // FIXME: add source + data: [u8; 2560], + } + + impl XsaveArea { + fn new() -> XsaveArea { + XsaveArea { data: [0; 2560] } + } + fn ptr(&mut self) -> *mut u8 { + self.data.as_mut_ptr() + } + } + + #[simd_test(enable = "xsave")] + #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri + unsafe fn test_xsave64() { + let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers + let mut a = XsaveArea::new(); + let mut b = XsaveArea::new(); + + xsave::_xsave64(a.ptr(), m); + xsave::_xrstor64(a.ptr(), m); + xsave::_xsave64(b.ptr(), m); + } + + #[simd_test(enable = "xsave,xsaveopt")] + #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri + unsafe fn test_xsaveopt64() { + let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers + let mut a = XsaveArea::new(); + let mut b = XsaveArea::new(); + + xsave::_xsaveopt64(a.ptr(), m); + xsave::_xrstor64(a.ptr(), m); + xsave::_xsaveopt64(b.ptr(), m); + } + + #[simd_test(enable = "xsave,xsavec")] + #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri + unsafe 
fn test_xsavec64() { + let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers + let mut a = XsaveArea::new(); + let mut b = XsaveArea::new(); + + xsave::_xsavec64(a.ptr(), m); + xsave::_xrstor64(a.ptr(), m); + xsave::_xsavec64(b.ptr(), m); + } +} diff --git a/library/stdarch/crates/intrinsic-test/Cargo.toml b/library/stdarch/crates/intrinsic-test/Cargo.toml new file mode 100644 index 0000000000000..06051abc8d0d4 --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "intrinsic-test" +version = "0.1.0" +authors = ["Jamie Cunliffe ", + "James McGregor ", + "James Barford-Evans " + ] +license = "MIT OR Apache-2.0" +edition = "2024" + +[dependencies] +lazy_static = "1.4.0" +serde = { version = "1", features = ["derive"] } +serde_json = "1.0" +csv = "1.1" +clap = { version = "4.4", features = ["derive"] } +regex = "1.4.2" +log = "0.4.11" +pretty_env_logger = "0.5.0" +rayon = "1.5.0" +diff = "0.1.12" +itertools = "0.14.0" diff --git a/library/stdarch/crates/intrinsic-test/LICENSE-APACHE b/library/stdarch/crates/intrinsic-test/LICENSE-APACHE new file mode 100644 index 0000000000000..16fe87b06e802 --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/library/stdarch/crates/intrinsic-test/LICENSE-MIT b/library/stdarch/crates/intrinsic-test/LICENSE-MIT new file mode 100644 index 0000000000000..ef223ae2c7c0c --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2021-2023 The Rust Project Developers + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/library/stdarch/crates/intrinsic-test/README.md b/library/stdarch/crates/intrinsic-test/README.md new file mode 100644 index 0000000000000..260d59fca80f7 --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/README.md @@ -0,0 +1,23 @@ +Generate and run programs using equivalent C and Rust intrinsics, checking that +each produces the same result from random inputs. + +# Usage +``` +USAGE: + intrinsic-test [FLAGS] [OPTIONS] + +FLAGS: + --a32 Run tests for A32 instrinsics instead of A64 + --generate-only Regenerate test programs, but don't build or run them + -h, --help Prints help information + -V, --version Prints version information + +OPTIONS: + --cppcompiler The C++ compiler to use for compiling the c++ code [default: clang++] + --runner Run the C programs under emulation with this command + --skip Filename for a list of intrinsics to skip (one per line) + --toolchain The rust toolchain to use for building the rust code + +ARGS: + The input file containing the intrinsics +``` diff --git a/library/stdarch/crates/intrinsic-test/missing_aarch64.txt b/library/stdarch/crates/intrinsic-test/missing_aarch64.txt new file mode 100644 index 0000000000000..bbcfc40c69ab6 --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/missing_aarch64.txt @@ -0,0 +1,64 @@ +# Not implemented in stdarch yet +vbfdot_f32 +vbfdot_lane_f32 +vbfdot_laneq_f32 +vbfdotq_f32 +vbfdotq_lane_f32 +vbfdotq_laneq_f32 +vbfmlalbq_f32 +vbfmlalbq_lane_f32 +vbfmlalbq_laneq_f32 +vbfmlaltq_f32 +vbfmlaltq_lane_f32 +vbfmlaltq_laneq_f32 +vbfmmlaq_f32 + + +# Implemented in stdarch, but missing in Clang. +vrnd32xq_f64 +vrnd32zq_f64 +vrnd64xq_f64 +vrnd64zq_f64 +vamin_f32 +vaminq_f32 +vaminq_f64 +vamax_f32 +vamaxq_f32 +vamaxq_f64 +# LLVM select error, and missing in Clang. 
+vrnd32x_f64 +vrnd32z_f64 +vrnd64x_f64 +vrnd64z_f64 +vluti2_lane_p16 +vluti2_lane_p8 +vluti2_lane_s16 +vluti2_lane_s8 +vluti2_lane_u16 +vluti2_lane_u8 +vluti2q_lane_p16 +vluti2q_lane_p8 +vluti2q_lane_s16 +vluti2q_lane_s8 +vluti2q_lane_u16 +vluti2q_lane_u8 +vluti4q_lane_f16_x2 +vluti4q_lane_p16_x2 +vluti4q_lane_p8 +vluti4q_lane_s16_x2 +vluti4q_lane_s8 +vluti4q_lane_u16_x2 +vluti4q_lane_u8 +vluti4q_laneq_f16_x2 +vluti4q_laneq_p16_x2 +vluti4q_laneq_p8 +vluti4q_laneq_s16_x2 +vluti4q_laneq_s8 +vluti4q_laneq_u16_x2 +vluti4q_laneq_u8 + +# Broken in Clang +vcvth_s16_f16 +# FIXME: Broken output due to missing f16 printing support in Rust, see git blame for this line +vmulh_lane_f16 +vmulh_laneq_f16 diff --git a/library/stdarch/crates/intrinsic-test/missing_arm.txt b/library/stdarch/crates/intrinsic-test/missing_arm.txt new file mode 100644 index 0000000000000..04c09a27d90d4 --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/missing_arm.txt @@ -0,0 +1,320 @@ +# Not implemented in stdarch yet +vbfdot_f32 +vbfdot_lane_f32 +vbfdot_laneq_f32 +vbfdotq_f32 +vbfdotq_lane_f32 +vbfdotq_laneq_f32 +vbfmlalbq_f32 +vbfmlalbq_lane_f32 +vbfmlalbq_laneq_f32 +vbfmlaltq_f32 +vbfmlaltq_lane_f32 +vbfmlaltq_laneq_f32 +vbfmmlaq_f32 + +# Implemented in Clang and stdarch for A64 only even though CSV claims A32 support +vaddq_p64 +vbsl_p64 +vbslq_p64 +vceq_p64 +vceqq_p64 +vceqz_p64 +vceqzq_p64 +vcombine_p64 +vcopy_lane_p64 +vcopy_laneq_p64 +vcopyq_lane_p64 +vcopyq_laneq_p64 +vcreate_p64 +vdup_lane_p64 +vdup_n_p64 +vdupq_lane_p64 +vdupq_n_p64 +vext_p64 +vextq_p64 +vget_high_p64 +vget_lane_p64 +vget_low_p64 +vgetq_lane_p64 +vmovn_high_s16 +vmovn_high_s32 +vmovn_high_s64 +vmovn_high_u16 +vmovn_high_u32 +vmovn_high_u64 +vmull_high_p64 +vmull_p64 +vreinterpret_p16_p64 +vreinterpret_p64_f32 +vreinterpret_p64_p16 +vreinterpret_p64_p8 +vreinterpret_p64_s16 +vreinterpret_p64_s32 +vreinterpret_p64_s8 +vreinterpret_p64_u16 +vreinterpret_p64_u32 +vreinterpret_p64_u64 +vreinterpret_p64_u8 +vreinterpret_p8_p64 +vreinterpretq_f64_u64 +vreinterpretq_p128_f32 +vreinterpretq_p128_p16 +vreinterpretq_p128_p8 +vreinterpretq_p128_s16 +vreinterpretq_p128_s32 +vreinterpretq_p128_s64 +vreinterpretq_p128_s8 +vreinterpretq_p128_u16 +vreinterpretq_p128_u32 +vreinterpretq_p128_u64 +vreinterpretq_p128_u8 +vreinterpretq_p16_p64 +vreinterpretq_p64_f32 +vreinterpretq_p64_p16 +vreinterpretq_p64_p8 +vreinterpretq_p64_s16 +vreinterpretq_p64_s32 +vreinterpretq_p64_s64 +vreinterpretq_p64_s8 +vreinterpretq_p64_u16 +vreinterpretq_p64_u32 +vreinterpretq_p64_u64 +vreinterpretq_p64_u8 +vreinterpretq_p8_p64 +vreinterpretq_s16_p64 +vreinterpretq_s32_p64 +vreinterpretq_s64_p64 +vreinterpretq_s8_p64 +vreinterpretq_u16_p64 +vreinterpretq_u32_p64 +vreinterpretq_u64_p64 +vreinterpretq_u8_p64 +vreinterpret_s16_p64 +vreinterpret_s32_p64 +vreinterpret_s64_p64 +vreinterpret_s8_p64 +vreinterpret_u16_p64 +vreinterpret_u32_p64 +vreinterpret_u64_p64 +vreinterpret_u8_p64 +vrndn_f64 +vrndnq_f64 +vset_lane_p64 +vsetq_lane_p64 +vsli_n_p64 +vsliq_n_p64 +vsri_n_p64 +vsriq_n_p64 +vtst_p64 +vtstq_p64 +vaddh_f16 +vsubh_f16 +vabsh_f16 +vdivh_f16 +vmulh_f16 +vfmsh_f16 +vfmah_f16 +vminnmh_f16 +vmaxnmh_f16 +vrndh_f16 +vrndnh_f16 +vrndih_f16 +vrndah_f16 +vrndph_f16 +vrndmh_f16 +vrndxh_f16 +vsqrth_f16 +vnegh_f16 +vcvth_f16_s32 +vcvth_s32_f16 +vcvth_n_f16_s32 +vcvth_n_s32_f16 +vcvth_f16_u32 +vcvth_u32_f16 +vcvth_n_f16_u32 +vcvth_n_u32_f16 +vcvtah_s32_f16 +vcvtah_u32_f16 +vcvtmh_s32_f16 +vcvtmh_u32_f16 +vcvtpq_s16_f16 +vcvtpq_u16_f16 +vcvtp_s16_f16 +vcvtp_u16_f16 +vcvtph_s32_f16 
+vcvtph_u32_f16 +vcvtnh_u32_f16 +vcvtnh_s32_f16 +vfmlsl_low_f16 +vfmlslq_low_f16 +vfmlsl_high_f16 +vfmlslq_high_f16 +vfmlsl_lane_high_f16 +vfmlsl_laneq_high_f16 +vfmlslq_lane_high_f16 +vfmlslq_laneq_high_f16 +vfmlsl_lane_low_f16 +vfmlsl_laneq_low_f16 +vfmlslq_lane_low_f16 +vfmlslq_laneq_low_f16 +vfmlal_low_f16 +vfmlalq_low_f16 +vfmlal_high_f16 +vfmlalq_high_f16 +vfmlal_lane_low_f16 +vfmlal_laneq_low_f16 +vfmlalq_lane_low_f16 +vfmlalq_laneq_low_f16 +vfmlal_lane_high_f16 +vfmlal_laneq_high_f16 +vfmlalq_lane_high_f16 +vfmlalq_laneq_high_f16 +vreinterpret_f16_p64 +vreinterpretq_f16_p64 +vreinterpret_p64_f16 +vreinterpretq_p64_f16 +vreinterpret_p128_f16 +vreinterpretq_p128_f16 + +# Present in Clang header but triggers an ICE due to lack of backend support. +vcmla_f32 +vcmla_lane_f32 +vcmla_laneq_f32 +vcmla_rot180_f32 +vcmla_rot180_lane_f32 +vcmla_rot180_laneq_f32 +vcmla_rot270_f32 +vcmla_rot270_lane_f32 +vcmla_rot270_laneq_f32 +vcmla_rot90_f32 +vcmla_rot90_lane_f32 +vcmla_rot90_laneq_f32 +vcmlaq_f32 +vcmlaq_lane_f32 +vcmlaq_laneq_f32 +vcmlaq_rot180_f32 +vcmlaq_rot180_lane_f32 +vcmlaq_rot180_laneq_f32 +vcmlaq_rot270_f32 +vcmlaq_rot270_lane_f32 +vcmlaq_rot270_laneq_f32 +vcmlaq_rot90_f32 +vcmlaq_rot90_lane_f32 +vcmlaq_rot90_laneq_f32 +vcmla_f16 +vcmlaq_f16 +vcmla_laneq_f16 +vcmla_lane_f16 +vcmla_laneq_f16 +vcmlaq_lane_f16 +vcmlaq_laneq_f16 +vcmla_rot90_f16 +vcmlaq_rot90_f16 +vcmla_rot180_f16 +vcmlaq_rot180_f16 +vcmla_rot270_f16 +vcmlaq_rot270_f16 +vcmla_rot90_lane_f16 +vcmla_rot90_laneq_f16 +vcmlaq_rot90_lane_f16 +vcmlaq_rot90_laneq_f16 +vcmla_rot180_lane_f16 +vcmla_rot180_laneq_f16 +vcmlaq_rot180_lane_f16 +vcmlaq_rot180_laneq_f16 +vcmla_rot270_lane_f16 +vcmla_rot270_laneq_f16 +vcmlaq_rot270_lane_f16 +vcmlaq_rot270_laneq_f16 + +# Implemented in stdarch for A64 only, Clang support both A32/A64 +vadd_s64 +vadd_u64 +vcaddq_rot270_f32 +vcaddq_rot90_f32 +vcadd_rot270_f32 +vcadd_rot90_f32 +vcvtaq_s32_f32 +vcvtaq_u32_f32 +vcvta_s32_f32 +vcvta_u32_f32 +vcvtmq_s32_f32 +vcvtmq_u32_f32 +vcvtm_s32_f32 +vcvtm_u32_f32 +vcvtnq_s32_f32 +vcvtnq_u32_f32 +vcvtn_s32_f32 +vcvtn_u32_f32 +vcvtpq_s32_f32 +vcvtpq_u32_f32 +vcvtp_s32_f32 +vcvtp_u32_f32 +vqdmulh_lane_s16 +vqdmulh_lane_s32 +vqdmulhq_lane_s16 +vqdmulhq_lane_s32 +vrnda_f32 +vrnda_f32 +vrndaq_f32 +vrndaq_f32 +vrnd_f32 +vrnd_f32 +vrndi_f32 +vrndi_f32 +vrndiq_f32 +vrndiq_f32 +vrndm_f32 +vrndm_f32 +vrndmq_f32 +vrndmq_f32 +vrndns_f32 +vrndp_f32 +vrndpq_f32 +vrndq_f32 +vrndq_f32 +vrndx_f32 +vrndxq_f32 +vrnda_f16 +vrnda_f16 +vrndaq_f16 +vrndaq_f16 +vrnd_f16 +vrnd_f16 +vrndi_f16 +vrndi_f16 +vrndiq_f16 +vrndiq_f16 +vrndm_f16 +vrndm_f16 +vrndmq_f16 +vrndmq_f16 +vrndns_f16 +vrndp_f16 +vrndpq_f16 +vrndq_f16 +vrndx_f16 +vrndxq_f16 +vpmin_f16 +vpmax_f16 +vcaddq_rot270_f16 +vcaddq_rot90_f16 +vcadd_rot270_f16 +vcadd_rot90_f16 +vcvtm_s16_f16 +vcvtmq_s16_f16 +vcvtm_u16_f16 +vcvtmq_u16_f16 +vcvtaq_s16_f16 +vcvtaq_u16_f16 +vcvtnq_s16_f16 +vcvtnq_u16_f16 +vcvtn_s16_f16 +vcvtn_u16_f16 +vcvtaq_s16_f16 +vcvtaq_u16_f16 +vcvta_s16_f16 +vcvta_u16_f16 +vceqz_f16 +vceqzq_f16 diff --git a/library/stdarch/crates/intrinsic-test/src/arm/compile.rs b/library/stdarch/crates/intrinsic-test/src/arm/compile.rs new file mode 100644 index 0000000000000..8276cd87c1cbc --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/arm/compile.rs @@ -0,0 +1,64 @@ +use crate::common::compile_c::CompilationCommandBuilder; +use crate::common::gen_c::compile_c_programs; + +pub fn compile_c_arm( + intrinsics_name_list: &[String], + compiler: &str, + target: &str, + cxx_toolchain_dir: Option<&str>, +) -> bool 
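// Illustrative sketch (assumed example, not from the upstream sources): for a
// hypothetical intrinsic named `vabs_s8` on an aarch64 target, the command
// assembled by the builder below comes out roughly as
//
//     clang++ -march=armv8.6-a+crypto+crc+dotprod+fp16+faminmax+lut+sha3 \
//         -O2 -o c_programs/vabs_s8 c_programs/vabs_s8.cpp \
//         -ffp-contract=off -Wno-narrowing --target=aarch64-unknown-linux-gnu
//
// One such command is generated per intrinsic and handed to
// `compile_c_programs`, which runs them in parallel.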
{ + // -ffp-contract=off emulates Rust's approach of not fusing separate mul-add operations + let mut command = CompilationCommandBuilder::new() + .add_arch_flags(vec!["armv8.6-a", "crypto", "crc", "dotprod", "fp16"]) + .set_compiler(compiler) + .set_target(target) + .set_opt_level("2") + .set_cxx_toolchain_dir(cxx_toolchain_dir) + .set_project_root("c_programs") + .add_extra_flags(vec!["-ffp-contract=off", "-Wno-narrowing"]); + + if !target.contains("v7") { + command = command.add_arch_flags(vec!["faminmax", "lut", "sha3"]); + } + + /* + * clang++ cannot link an aarch64_be object file, so we invoke + * aarch64_be-unknown-linux-gnu's C++ linker. This ensures that we + * are testing the intrinsics against LLVM. + * + * Note: setting `--sysroot=<...>` which is the obvious thing to do + * does not work as it gets caught up with `#include_next ` + * not existing... + */ + if target.contains("aarch64_be") { + command = command + .set_linker( + cxx_toolchain_dir.unwrap_or("").to_string() + "/bin/aarch64_be-none-linux-gnu-g++", + ) + .set_include_paths(vec![ + "/include", + "/aarch64_be-none-linux-gnu/include", + "/aarch64_be-none-linux-gnu/include/c++/14.2.1", + "/aarch64_be-none-linux-gnu/include/c++/14.2.1/aarch64_be-none-linux-gnu", + "/aarch64_be-none-linux-gnu/include/c++/14.2.1/backward", + "/aarch64_be-none-linux-gnu/libc/usr/include", + ]); + } + + if !compiler.contains("clang") { + command = command.add_extra_flag("-flax-vector-conversions"); + } + + let compiler_commands = intrinsics_name_list + .iter() + .map(|intrinsic_name| { + command + .clone() + .set_input_name(intrinsic_name) + .set_output_name(intrinsic_name) + .make_string() + }) + .collect::>(); + + compile_c_programs(&compiler_commands) +} diff --git a/library/stdarch/crates/intrinsic-test/src/arm/config.rs b/library/stdarch/crates/intrinsic-test/src/arm/config.rs new file mode 100644 index 0000000000000..cee80374ae9d8 --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/arm/config.rs @@ -0,0 +1,122 @@ +pub fn build_notices(line_prefix: &str) -> String { + format!( + "\ +{line_prefix}This is a transient test file, not intended for distribution. Some aspects of the +{line_prefix}test are derived from a JSON specification, published under the same license as the +{line_prefix}`intrinsic-test` crate.\n +" + ) +} + +pub const POLY128_OSTREAM_DEF: &str = r#"std::ostream& operator<<(std::ostream& os, poly128_t value) { + std::stringstream temp; + do { + int n = value % 10; + value /= 10; + temp << n; + } while (value != 0); + std::string tempstr(temp.str()); + std::string res(tempstr.rbegin(), tempstr.rend()); + os << res; + return os; +}"#; + +// Format f16 values (and vectors containing them) in a way that is consistent with C. +pub const F16_FORMATTING_DEF: &str = r#" +/// Used to continue `Debug`ging SIMD types as `MySimd(1, 2, 3, 4)`, as they +/// were before moving to array-based simd. 
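// Illustrative note (assumed values): with the helpers below, an f16 vector is
// printed via the bit pattern of each lane, so a float16x4_t holding
// 1.0, 2.0, 3.0 and 4.0 debug-prints roughly as
//
//     float16x4_t(0x3c00, 0x4000, 0x4200, 0x4400)
//
// which lines up with the hexadecimal float16_t formatting used on the C side.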
+#[inline] +fn debug_simd_finish( + formatter: &mut core::fmt::Formatter<'_>, + type_name: &str, + array: &[T; N], +) -> core::fmt::Result { + core::fmt::Formatter::debug_tuple_fields_finish( + formatter, + type_name, + &core::array::from_fn::<&dyn core::fmt::Debug, N, _>(|i| &array[i]), + ) +} + +#[repr(transparent)] +struct Hex(T); + +impl core::fmt::Debug for Hex { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + ::fmt(&self.0, f) + } +} + +fn debug_f16(x: T) -> impl core::fmt::Debug { + Hex(x) +} + +trait DebugHexF16 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result; +} + +impl DebugHexF16 for f16 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "{:#06x?}", self.to_bits()) + } +} + +impl DebugHexF16 for float16x4_t { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [Hex; 4]>(*self) }; + debug_simd_finish(f, "float16x4_t", &array) + } +} + +impl DebugHexF16 for float16x8_t { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [Hex; 8]>(*self) }; + debug_simd_finish(f, "float16x8_t", &array) + } +} + +impl DebugHexF16 for float16x4x2_t { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + debug_simd_finish(f, "float16x4x2_t", &[Hex(self.0), Hex(self.1)]) + } +} +impl DebugHexF16 for float16x4x3_t { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + debug_simd_finish(f, "float16x4x3_t", &[Hex(self.0), Hex(self.1), Hex(self.2)]) + } +} +impl DebugHexF16 for float16x4x4_t { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + debug_simd_finish(f, "float16x4x4_t", &[Hex(self.0), Hex(self.1), Hex(self.2), Hex(self.3)]) + } +} + +impl DebugHexF16 for float16x8x2_t { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + debug_simd_finish(f, "float16x8x2_t", &[Hex(self.0), Hex(self.1)]) + } +} +impl DebugHexF16 for float16x8x3_t { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + debug_simd_finish(f, "float16x8x3_t", &[Hex(self.0), Hex(self.1), Hex(self.2)]) + } +} +impl DebugHexF16 for float16x8x4_t { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + debug_simd_finish(f, "float16x8x4_t", &[Hex(self.0), Hex(self.1), Hex(self.2), Hex(self.3)]) + } +} + "#; + +pub const AARCH_CONFIGURATIONS: &str = r#" +#![cfg_attr(target_arch = "arm", feature(stdarch_arm_neon_intrinsics))] +#![cfg_attr(target_arch = "arm", feature(stdarch_aarch32_crc32))] +#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_fcma))] +#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_dotprod))] +#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_i8mm))] +#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_sha3))] +#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_sm4))] +#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_ftts))] +#![feature(fmt_helpers_for_derive)] +#![feature(stdarch_neon_f16)] +"#; diff --git a/library/stdarch/crates/intrinsic-test/src/arm/intrinsic.rs b/library/stdarch/crates/intrinsic-test/src/arm/intrinsic.rs new file mode 100644 index 0000000000000..773dabf4d75b1 --- /dev/null +++ 
b/library/stdarch/crates/intrinsic-test/src/arm/intrinsic.rs @@ -0,0 +1,95 @@ +use crate::common::argument::ArgumentList; +use crate::common::indentation::Indentation; +use crate::common::intrinsic::{Intrinsic, IntrinsicDefinition}; +use crate::common::intrinsic_helpers::{IntrinsicType, IntrinsicTypeDefinition, TypeKind}; +use std::ops::Deref; + +#[derive(Debug, Clone, PartialEq)] +pub struct ArmIntrinsicType(pub IntrinsicType); + +impl Deref for ArmIntrinsicType { + type Target = IntrinsicType; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl IntrinsicDefinition for Intrinsic { + fn arguments(&self) -> ArgumentList { + self.arguments.clone() + } + + fn results(&self) -> ArmIntrinsicType { + self.results.clone() + } + + fn name(&self) -> String { + self.name.clone() + } + + /// Generates a std::cout for the intrinsics results that will match the + /// rust debug output format for the return type. The generated line assumes + /// there is an int i in scope which is the current pass number. + fn print_result_c(&self, indentation: Indentation, additional: &str) -> String { + let lanes = if self.results().num_vectors() > 1 { + (0..self.results().num_vectors()) + .map(|vector| { + format!( + r#""{ty}(" << {lanes} << ")""#, + ty = self.results().c_single_vector_type(), + lanes = (0..self.results().num_lanes()) + .map(move |idx| -> std::string::String { + format!( + "{cast}{lane_fn}(__return_value.val[{vector}], {lane})", + cast = self.results().c_promotion(), + lane_fn = self.results().get_lane_function(), + lane = idx, + vector = vector, + ) + }) + .collect::>() + .join(r#" << ", " << "#) + ) + }) + .collect::>() + .join(r#" << ", " << "#) + } else if self.results().num_lanes() > 1 { + (0..self.results().num_lanes()) + .map(|idx| -> std::string::String { + format!( + "{cast}{lane_fn}(__return_value, {lane})", + cast = self.results().c_promotion(), + lane_fn = self.results().get_lane_function(), + lane = idx + ) + }) + .collect::>() + .join(r#" << ", " << "#) + } else { + format!( + "{promote}cast<{cast}>(__return_value)", + cast = match self.results.kind() { + TypeKind::Float if self.results().inner_size() == 16 => "float16_t".to_string(), + TypeKind::Float if self.results().inner_size() == 32 => "float".to_string(), + TypeKind::Float if self.results().inner_size() == 64 => "double".to_string(), + TypeKind::Int => format!("int{}_t", self.results().inner_size()), + TypeKind::UInt => format!("uint{}_t", self.results().inner_size()), + TypeKind::Poly => format!("poly{}_t", self.results().inner_size()), + ty => todo!("print_result_c - Unknown type: {:#?}", ty), + }, + promote = self.results().c_promotion(), + ) + }; + + format!( + r#"{indentation}std::cout << "Result {additional}-" << i+1 << ": {ty}" << std::fixed << std::setprecision(150) << {lanes} << "{close}" << std::endl;"#, + ty = if self.results().is_simd() { + format!("{}(", self.results().c_type()) + } else { + String::from("") + }, + close = if self.results.is_simd() { ")" } else { "" }, + ) + } +} diff --git a/library/stdarch/crates/intrinsic-test/src/arm/json_parser.rs b/library/stdarch/crates/intrinsic-test/src/arm/json_parser.rs new file mode 100644 index 0000000000000..0ac47484b0193 --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/arm/json_parser.rs @@ -0,0 +1,137 @@ +use super::intrinsic::ArmIntrinsicType; +use crate::common::argument::{Argument, ArgumentList}; +use crate::common::constraint::Constraint; +use crate::common::intrinsic::Intrinsic; +use crate::common::intrinsic_helpers::{IntrinsicType, 
IntrinsicTypeDefinition}; +use serde::Deserialize; +use serde_json::Value; +use std::collections::HashMap; +use std::path::Path; + +#[derive(Deserialize, Debug)] +#[serde(deny_unknown_fields)] +struct ReturnType { + value: String, +} + +#[derive(Deserialize, Debug)] +#[serde(untagged, deny_unknown_fields)] +pub enum ArgPrep { + Register { + #[serde(rename = "register")] + #[allow(dead_code)] + reg: String, + }, + Immediate { + #[serde(rename = "minimum")] + min: i64, + #[serde(rename = "maximum")] + max: i64, + }, + Nothing {}, +} + +impl TryFrom for ArgPrep { + type Error = serde_json::Error; + + fn try_from(value: Value) -> Result { + serde_json::from_value(value) + } +} + +#[derive(Deserialize, Debug)] +struct JsonIntrinsic { + #[serde(rename = "SIMD_ISA")] + simd_isa: String, + name: String, + arguments: Vec, + return_type: ReturnType, + #[serde(rename = "Arguments_Preparation")] + args_prep: Option>, + #[serde(rename = "Architectures")] + architectures: Vec, +} + +pub fn get_neon_intrinsics( + filename: &Path, + target: &str, +) -> Result>, Box> { + let file = std::fs::File::open(filename)?; + let reader = std::io::BufReader::new(file); + let json: Vec = serde_json::from_reader(reader).expect("Couldn't parse JSON"); + + let parsed = json + .into_iter() + .filter_map(|intr| { + if intr.simd_isa == "Neon" { + Some(json_to_intrinsic(intr, target).expect("Couldn't parse JSON")) + } else { + None + } + }) + .collect(); + Ok(parsed) +} + +fn json_to_intrinsic( + mut intr: JsonIntrinsic, + target: &str, +) -> Result, Box> { + let name = intr.name.replace(['[', ']'], ""); + + let results = ArmIntrinsicType::from_c(&intr.return_type.value, target)?; + + let args = intr + .arguments + .into_iter() + .enumerate() + .map(|(i, arg)| { + let arg_name = Argument::::type_and_name_from_c(&arg).1; + let metadata = intr.args_prep.as_mut(); + let metadata = metadata.and_then(|a| a.remove(arg_name)); + let arg_prep: Option = metadata.and_then(|a| a.try_into().ok()); + let constraint: Option = arg_prep.and_then(|a| a.try_into().ok()); + + let mut arg = Argument::::from_c(i, &arg, target, constraint); + + // The JSON doesn't list immediates as const + let IntrinsicType { + ref mut constant, .. 
+ } = arg.ty.0; + if arg.name.starts_with("imm") { + *constant = true + } + arg + }) + .collect(); + + let arguments = ArgumentList:: { args }; + + Ok(Intrinsic { + name, + arguments, + results: *results, + arch_tags: intr.architectures, + }) +} + +/// ARM-specific +impl TryFrom for Constraint { + type Error = (); + + fn try_from(prep: ArgPrep) -> Result { + let parsed_ints = match prep { + ArgPrep::Immediate { min, max } => Ok((min, max)), + _ => Err(()), + }; + if let Ok((min, max)) = parsed_ints { + if min == max { + Ok(Constraint::Equal(min)) + } else { + Ok(Constraint::Range(min..max + 1)) + } + } else { + Err(()) + } + } +} diff --git a/library/stdarch/crates/intrinsic-test/src/arm/mod.rs b/library/stdarch/crates/intrinsic-test/src/arm/mod.rs new file mode 100644 index 0000000000000..6aaa49ff97f9b --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/arm/mod.rs @@ -0,0 +1,124 @@ +mod compile; +mod config; +mod intrinsic; +mod json_parser; +mod types; + +use crate::common::SupportedArchitectureTest; +use crate::common::cli::ProcessedCli; +use crate::common::compare::compare_outputs; +use crate::common::gen_rust::compile_rust_programs; +use crate::common::intrinsic::{Intrinsic, IntrinsicDefinition}; +use crate::common::intrinsic_helpers::TypeKind; +use crate::common::write_file::{write_c_testfiles, write_rust_testfiles}; +use compile::compile_c_arm; +use config::{AARCH_CONFIGURATIONS, F16_FORMATTING_DEF, POLY128_OSTREAM_DEF, build_notices}; +use intrinsic::ArmIntrinsicType; +use json_parser::get_neon_intrinsics; + +pub struct ArmArchitectureTest { + intrinsics: Vec>, + cli_options: ProcessedCli, +} + +impl SupportedArchitectureTest for ArmArchitectureTest { + fn create(cli_options: ProcessedCli) -> Box { + let a32 = cli_options.target.contains("v7"); + let mut intrinsics = get_neon_intrinsics(&cli_options.filename, &cli_options.target) + .expect("Error parsing input file"); + + intrinsics.sort_by(|a, b| a.name.cmp(&b.name)); + + let mut intrinsics = intrinsics + .into_iter() + // Not sure how we would compare intrinsic that returns void. + .filter(|i| i.results.kind() != TypeKind::Void) + .filter(|i| i.results.kind() != TypeKind::BFloat) + .filter(|i| !i.arguments.iter().any(|a| a.ty.kind() == TypeKind::BFloat)) + // Skip pointers for now, we would probably need to look at the return + // type to work out how many elements we need to point to. 
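// (Concrete illustrations, assuming the usual NEON names: store intrinsics such
// as vst1q_u8 were already dropped by the void-return filter above, and loads
// such as vld1q_u8 are dropped by the pointer-argument filter below.)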
+ .filter(|i| !i.arguments.iter().any(|a| a.is_ptr())) + .filter(|i| !i.arguments.iter().any(|a| a.ty.inner_size() == 128)) + .filter(|i| !cli_options.skip.contains(&i.name)) + .filter(|i| !(a32 && i.arch_tags == vec!["A64".to_string()])) + .collect::>(); + intrinsics.dedup(); + + Box::new(Self { + intrinsics, + cli_options, + }) + } + + fn build_c_file(&self) -> bool { + let compiler = self.cli_options.cpp_compiler.as_deref(); + let target = &self.cli_options.target; + let cxx_toolchain_dir = self.cli_options.cxx_toolchain_dir.as_deref(); + let c_target = "aarch64"; + + let intrinsics_name_list = write_c_testfiles( + &self + .intrinsics + .iter() + .map(|i| i as &dyn IntrinsicDefinition<_>) + .collect::>(), + target, + c_target, + &["arm_neon.h", "arm_acle.h", "arm_fp16.h"], + &build_notices("// "), + &[POLY128_OSTREAM_DEF], + ); + + match compiler { + None => true, + Some(compiler) => compile_c_arm( + intrinsics_name_list.as_slice(), + compiler, + target, + cxx_toolchain_dir, + ), + } + } + + fn build_rust_file(&self) -> bool { + let rust_target = if self.cli_options.target.contains("v7") { + "arm" + } else { + "aarch64" + }; + let target = &self.cli_options.target; + let toolchain = self.cli_options.toolchain.as_deref(); + let linker = self.cli_options.linker.as_deref(); + let intrinsics_name_list = write_rust_testfiles( + self.intrinsics + .iter() + .map(|i| i as &dyn IntrinsicDefinition<_>) + .collect::>(), + rust_target, + &build_notices("// "), + F16_FORMATTING_DEF, + AARCH_CONFIGURATIONS, + ); + + compile_rust_programs(intrinsics_name_list, toolchain, target, linker) + } + + fn compare_outputs(&self) -> bool { + if let Some(ref toolchain) = self.cli_options.toolchain { + let intrinsics_name_list = self + .intrinsics + .iter() + .map(|i| i.name.clone()) + .collect::>(); + + compare_outputs( + &intrinsics_name_list, + toolchain, + &self.cli_options.c_runner, + &self.cli_options.target, + ) + } else { + true + } + } +} diff --git a/library/stdarch/crates/intrinsic-test/src/arm/types.rs b/library/stdarch/crates/intrinsic-test/src/arm/types.rs new file mode 100644 index 0000000000000..9f3d6302f460c --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/arm/types.rs @@ -0,0 +1,195 @@ +use super::intrinsic::ArmIntrinsicType; +use crate::common::cli::Language; +use crate::common::intrinsic_helpers::{IntrinsicType, IntrinsicTypeDefinition, TypeKind}; + +impl IntrinsicTypeDefinition for ArmIntrinsicType { + /// Gets a string containing the typename for this type in C format. 
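// A few illustrative mappings (assumed inputs, not exhaustive):
//   kind: UInt,  bit_len: 8,  simd_len: Some(8), vec_len: None    -> "uint8x8_t"
//   kind: UInt,  bit_len: 8,  simd_len: Some(8), vec_len: Some(2) -> "uint8x8x2_t"
//   kind: Float, bit_len: 32, simd_len: None,    vec_len: None    -> "float32_t"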
+ fn c_type(&self) -> String { + let prefix = self.0.kind.c_prefix(); + let const_prefix = if self.0.constant { "const " } else { "" }; + + if let (Some(bit_len), simd_len, vec_len) = + (self.0.bit_len, self.0.simd_len, self.0.vec_len) + { + match (simd_len, vec_len) { + (None, None) => format!("{const_prefix}{prefix}{bit_len}_t"), + (Some(simd), None) => format!("{prefix}{bit_len}x{simd}_t"), + (Some(simd), Some(vec)) => format!("{prefix}{bit_len}x{simd}x{vec}_t"), + (None, Some(_)) => todo!("{:#?}", self), // Likely an invalid case + } + } else { + todo!("{:#?}", self) + } + } + + fn c_single_vector_type(&self) -> String { + if let (Some(bit_len), Some(simd_len)) = (self.0.bit_len, self.0.simd_len) { + format!( + "{prefix}{bit_len}x{simd_len}_t", + prefix = self.0.kind.c_prefix() + ) + } else { + unreachable!("Shouldn't be called on this type") + } + } + + fn rust_type(&self) -> String { + let rust_prefix = self.0.kind.rust_prefix(); + let c_prefix = self.0.kind.c_prefix(); + if self.0.ptr_constant { + self.c_type() + } else if let (Some(bit_len), simd_len, vec_len) = + (self.0.bit_len, self.0.simd_len, self.0.vec_len) + { + match (simd_len, vec_len) { + (None, None) => format!("{rust_prefix}{bit_len}"), + (Some(simd), None) => format!("{c_prefix}{bit_len}x{simd}_t"), + (Some(simd), Some(vec)) => format!("{c_prefix}{bit_len}x{simd}x{vec}_t"), + (None, Some(_)) => todo!("{:#?}", self), // Likely an invalid case + } + } else { + todo!("{:#?}", self) + } + } + + /// Determines the load function for this type. + fn get_load_function(&self, language: Language) -> String { + if let IntrinsicType { + kind: k, + bit_len: Some(bl), + simd_len, + vec_len, + target, + .. + } = &self.0 + { + let quad = if simd_len.unwrap_or(1) * bl > 64 { + "q" + } else { + "" + }; + + let choose_workaround = language == Language::C && target.contains("v7"); + format!( + "vld{len}{quad}_{type}{size}", + type = match k { + TypeKind::UInt => "u", + TypeKind::Int => "s", + TypeKind::Float => "f", + // The ACLE doesn't support 64-bit polynomial loads on Armv7 + // if armv7 and bl == 64, use "s", else "p" + TypeKind::Poly => if choose_workaround && *bl == 64 {"s"} else {"p"}, + x => todo!("get_load_function TypeKind: {:#?}", x), + }, + size = bl, + quad = quad, + len = vec_len.unwrap_or(1), + ) + } else { + todo!("get_load_function IntrinsicType: {:#?}", self) + } + } + + /// Determines the get lane function for this type. + fn get_lane_function(&self) -> String { + if let IntrinsicType { + kind: k, + bit_len: Some(bl), + simd_len, + .. 
+ } = &self.0 + { + let quad = if (simd_len.unwrap_or(1) * bl) > 64 { + "q" + } else { + "" + }; + format!( + "vget{quad}_lane_{type}{size}", + type = match k { + TypeKind::UInt => "u", + TypeKind::Int => "s", + TypeKind::Float => "f", + TypeKind::Poly => "p", + x => todo!("get_load_function TypeKind: {:#?}", x), + }, + size = bl, + quad = quad, + ) + } else { + todo!("get_lane_function IntrinsicType: {:#?}", self) + } + } + + fn from_c(s: &str, target: &str) -> Result, String> { + const CONST_STR: &str = "const"; + if let Some(s) = s.strip_suffix('*') { + let (s, constant) = match s.trim().strip_suffix(CONST_STR) { + Some(stripped) => (stripped, true), + None => (s, false), + }; + let s = s.trim_end(); + let temp_return = ArmIntrinsicType::from_c(s, target); + temp_return.map(|mut op| { + let edited = op.as_mut(); + edited.0.ptr = true; + edited.0.ptr_constant = constant; + op + }) + } else { + // [const ]TYPE[{bitlen}[x{simdlen}[x{vec_len}]]][_t] + let (mut s, constant) = match s.strip_prefix(CONST_STR) { + Some(stripped) => (stripped.trim(), true), + None => (s, false), + }; + s = s.strip_suffix("_t").unwrap_or(s); + let mut parts = s.split('x'); // [[{bitlen}], [{simdlen}], [{vec_len}] ] + let start = parts.next().ok_or("Impossible to parse type")?; + if let Some(digit_start) = start.find(|c: char| c.is_ascii_digit()) { + let (arg_kind, bit_len) = start.split_at(digit_start); + let arg_kind = arg_kind.parse::()?; + let bit_len = bit_len.parse::().map_err(|err| err.to_string())?; + let simd_len = match parts.next() { + Some(part) => Some( + part.parse::() + .map_err(|_| "Couldn't parse simd_len: {part}")?, + ), + None => None, + }; + let vec_len = match parts.next() { + Some(part) => Some( + part.parse::() + .map_err(|_| "Couldn't parse vec_len: {part}")?, + ), + None => None, + }; + Ok(Box::new(ArmIntrinsicType(IntrinsicType { + ptr: false, + ptr_constant: false, + constant, + kind: arg_kind, + bit_len: Some(bit_len), + simd_len, + vec_len, + target: target.to_string(), + }))) + } else { + let kind = start.parse::()?; + let bit_len = match kind { + TypeKind::Int => Some(32), + _ => None, + }; + Ok(Box::new(ArmIntrinsicType(IntrinsicType { + ptr: false, + ptr_constant: false, + constant, + kind: start.parse::()?, + bit_len, + simd_len: None, + vec_len: None, + target: target.to_string(), + }))) + } + } + } +} diff --git a/library/stdarch/crates/intrinsic-test/src/common/argument.rs b/library/stdarch/crates/intrinsic-test/src/common/argument.rs new file mode 100644 index 0000000000000..443ccb919f467 --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/common/argument.rs @@ -0,0 +1,209 @@ +use super::cli::Language; +use super::constraint::Constraint; +use super::indentation::Indentation; +use super::intrinsic_helpers::IntrinsicTypeDefinition; + +/// An argument for the intrinsic. +#[derive(Debug, PartialEq, Clone)] +pub struct Argument { + /// The argument's index in the intrinsic function call. + pub pos: usize, + /// The argument name. + pub name: String, + /// The type of the argument. 
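// Illustrative example (hypothetical argument string): parsing the C fragment
// "int8x8_t a" at position 0 yields roughly
//     Argument { pos: 0, name: "a", ty: /* int8x8_t */, constraint: None }
// with the type/name split taken at the last ' ' or '*' of the fragment.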
+ pub ty: T, + /// Any constraints that are on this argument + pub constraint: Option, +} + +impl Argument +where + T: IntrinsicTypeDefinition, +{ + pub fn to_c_type(&self) -> String { + self.ty.c_type() + } + + pub fn is_simd(&self) -> bool { + self.ty.is_simd() + } + + pub fn is_ptr(&self) -> bool { + self.ty.is_ptr() + } + + pub fn has_constraint(&self) -> bool { + self.constraint.is_some() + } + + pub fn type_and_name_from_c(arg: &str) -> (&str, &str) { + let split_index = arg + .rfind([' ', '*']) + .expect("Couldn't split type and argname"); + + (arg[..split_index + 1].trim_end(), &arg[split_index + 1..]) + } + + /// The binding keyword (e.g. "const" or "let") for the array of possible test inputs. + fn rust_vals_array_binding(&self) -> impl std::fmt::Display { + if self.ty.is_rust_vals_array_const() { + "const" + } else { + "let" + } + } + + /// The name (e.g. "A_VALS" or "a_vals") for the array of possible test inputs. + fn rust_vals_array_name(&self) -> impl std::fmt::Display { + if self.ty.is_rust_vals_array_const() { + format!("{}_VALS", self.name.to_uppercase()) + } else { + format!("{}_vals", self.name.to_lowercase()) + } + } + + pub fn from_c( + pos: usize, + arg: &str, + target: &str, + constraint: Option, + ) -> Argument { + let (ty, var_name) = Self::type_and_name_from_c(arg); + + let ty = + T::from_c(ty, target).unwrap_or_else(|_| panic!("Failed to parse argument '{arg}'")); + + Argument { + pos, + name: String::from(var_name), + ty: *ty, + constraint, + } + } + + fn as_call_param_c(&self) -> String { + self.ty.as_call_param_c(&self.name) + } +} + +#[derive(Debug, PartialEq, Clone)] +pub struct ArgumentList { + pub args: Vec>, +} + +impl ArgumentList +where + T: IntrinsicTypeDefinition, +{ + /// Converts the argument list into the call parameters for a C function call. + /// e.g. this would generate something like `a, &b, c` + pub fn as_call_param_c(&self) -> String { + self.iter() + .map(|arg| arg.as_call_param_c()) + .collect::>() + .join(", ") + } + + /// Converts the argument list into the call parameters for a Rust function. + /// e.g. this would generate something like `a, b, c` + pub fn as_call_param_rust(&self) -> String { + self.iter() + .filter(|a| !a.has_constraint()) + .map(|arg| arg.name.clone()) + .collect::>() + .join(", ") + } + + pub fn as_constraint_parameters_rust(&self) -> String { + self.iter() + .filter(|a| a.has_constraint()) + .map(|arg| arg.name.clone()) + .collect::>() + .join(", ") + } + + /// Creates a line for each argument that initializes an array for C from which `loads` argument + /// values can be loaded as a sliding window. + /// e.g `const int32x2_t a_vals = {0x3effffff, 0x3effffff, 0x3f7fffff}`, if loads=2. 
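// Sketch of the sliding window (sizes assumed for illustration): with
// loads = N and a single 2-lane vector argument, the generated array holds
// num_lanes * num_vectors + N - 1 scalars, and pass `i` of the test loop loads
// starting at `{name}_vals[i]`, so consecutive passes see overlapping inputs.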
+ pub fn gen_arglists_c(&self, indentation: Indentation, loads: u32) -> String { + self.iter() + .filter(|&arg| !arg.has_constraint()) + .map(|arg| { + format!( + "{indentation}const {ty} {name}_vals[] = {values};", + ty = arg.ty.c_scalar_type(), + name = arg.name, + values = arg.ty.populate_random(indentation, loads, &Language::C) + ) + }) + .collect::>() + .join("\n") + } + + /// Creates a line for each argument that initializes an array for Rust from which `loads` argument + /// values can be loaded as a sliding window, e.g `const A_VALS: [u32; 20] = [...];` + pub fn gen_arglists_rust(&self, indentation: Indentation, loads: u32) -> String { + self.iter() + .filter(|&arg| !arg.has_constraint()) + .map(|arg| { + format!( + "{indentation}{bind} {name}: [{ty}; {load_size}] = {values};", + bind = arg.rust_vals_array_binding(), + name = arg.rust_vals_array_name(), + ty = arg.ty.rust_scalar_type(), + load_size = arg.ty.num_lanes() * arg.ty.num_vectors() + loads - 1, + values = arg.ty.populate_random(indentation, loads, &Language::Rust) + ) + }) + .collect::>() + .join("\n") + } + + /// Creates a line for each argument that initializes the argument from an array `[arg]_vals` at + /// an offset `i` using a load intrinsic, in C. + /// e.g `uint8x8_t a = vld1_u8(&a_vals[i]);` + /// + /// ARM-specific + pub fn load_values_c(&self, indentation: Indentation) -> String { + self.iter() + .filter(|&arg| !arg.has_constraint()) + .map(|arg| { + format!( + "{indentation}{ty} {name} = cast<{ty}>({load}(&{name}_vals[i]));\n", + ty = arg.to_c_type(), + name = arg.name, + load = if arg.is_simd() { + arg.ty.get_load_function(Language::C) + } else { + "*".to_string() + } + ) + }) + .collect() + } + + /// Creates a line for each argument that initializes the argument from array `[ARG]_VALS` at + /// an offset `i` using a load intrinsic, in Rust. 
+ /// e.g `let a = vld1_u8(A_VALS.as_ptr().offset(i));` + pub fn load_values_rust(&self, indentation: Indentation) -> String { + self.iter() + .filter(|&arg| !arg.has_constraint()) + .map(|arg| { + format!( + "{indentation}let {name} = {load}({vals_name}.as_ptr().offset(i));\n", + name = arg.name, + vals_name = arg.rust_vals_array_name(), + load = if arg.is_simd() { + arg.ty.get_load_function(Language::Rust) + } else { + "*".to_string() + }, + ) + }) + .collect() + } + + pub fn iter(&self) -> std::slice::Iter<'_, Argument> { + self.args.iter() + } +} diff --git a/library/stdarch/crates/intrinsic-test/src/common/cli.rs b/library/stdarch/crates/intrinsic-test/src/common/cli.rs new file mode 100644 index 0000000000000..1d572723008df --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/common/cli.rs @@ -0,0 +1,113 @@ +use itertools::Itertools; +use std::path::PathBuf; + +#[derive(Debug, PartialEq)] +pub enum Language { + Rust, + C, +} + +pub enum FailureReason { + RunC(String), + RunRust(String), + Difference(String, String, String), +} + +/// Intrinsic test tool +#[derive(clap::Parser)] +#[command( + name = "Intrinsic test tool", + about = "Generates Rust and C programs for intrinsics and compares the output" +)] +pub struct Cli { + /// The input file containing the intrinsics + pub input: PathBuf, + + /// The rust toolchain to use for building the rust code + #[arg(long)] + pub toolchain: Option, + + /// The C++ compiler to use for compiling the c++ code + #[arg(long, default_value_t = String::from("clang++"))] + pub cppcompiler: String, + + /// Run the C programs under emulation with this command + #[arg(long)] + pub runner: Option, + + /// Filename for a list of intrinsics to skip (one per line) + #[arg(long)] + pub skip: Option, + + /// Regenerate test programs, but don't build or run them + #[arg(long)] + pub generate_only: bool, + + /// Pass a target the test suite + #[arg(long, default_value_t = String::from("armv7-unknown-linux-gnueabihf"))] + pub target: String, + + /// Set the linker + #[arg(long)] + pub linker: Option, + + /// Set the sysroot for the C++ compiler + #[arg(long)] + pub cxx_toolchain_dir: Option, +} + +pub struct ProcessedCli { + pub filename: PathBuf, + pub toolchain: Option, + pub cpp_compiler: Option, + pub c_runner: String, + pub target: String, + pub linker: Option, + pub cxx_toolchain_dir: Option, + pub skip: Vec, +} + +impl ProcessedCli { + pub fn new(cli_options: Cli) -> Self { + let filename = cli_options.input; + let c_runner = cli_options.runner.unwrap_or_default(); + let target = cli_options.target; + let linker = cli_options.linker; + let cxx_toolchain_dir = cli_options.cxx_toolchain_dir; + + let skip = if let Some(filename) = cli_options.skip { + let data = std::fs::read_to_string(&filename).expect("Failed to open file"); + data.lines() + .map(str::trim) + .filter(|s| !s.contains('#')) + .map(String::from) + .collect_vec() + } else { + Default::default() + }; + + let (toolchain, cpp_compiler) = if cli_options.generate_only { + (None, None) + } else { + ( + Some( + cli_options + .toolchain + .map_or_else(String::new, |t| format!("+{t}")), + ), + Some(cli_options.cppcompiler), + ) + }; + + Self { + toolchain, + cpp_compiler, + c_runner, + target, + linker, + cxx_toolchain_dir, + skip, + filename, + } + } +} diff --git a/library/stdarch/crates/intrinsic-test/src/common/compare.rs b/library/stdarch/crates/intrinsic-test/src/common/compare.rs new file mode 100644 index 0000000000000..815ccf89fc695 --- /dev/null +++ 
b/library/stdarch/crates/intrinsic-test/src/common/compare.rs @@ -0,0 +1,90 @@ +use super::cli::FailureReason; +use rayon::prelude::*; +use std::process::Command; + +pub fn compare_outputs( + intrinsic_name_list: &Vec, + toolchain: &str, + runner: &str, + target: &str, +) -> bool { + let intrinsics = intrinsic_name_list + .par_iter() + .filter_map(|intrinsic_name| { + let c = Command::new("sh") + .arg("-c") + .arg(format!("{runner} ./c_programs/{intrinsic_name}")) + .output(); + + let rust = Command::new("sh") + .current_dir("rust_programs") + .arg("-c") + .arg(format!( + "cargo {toolchain} run --target {target} --bin {intrinsic_name} --release", + )) + .env("RUSTFLAGS", "-Cdebuginfo=0") + .output(); + + let (c, rust) = match (c, rust) { + (Ok(c), Ok(rust)) => (c, rust), + a => panic!("{a:#?}"), + }; + + if !c.status.success() { + error!( + "Failed to run C program for intrinsic {intrinsic_name}\nstdout: {stdout}\nstderr: {stderr}", + stdout = std::str::from_utf8(&c.stdout).unwrap_or(""), + stderr = std::str::from_utf8(&c.stderr).unwrap_or(""), + ); + return Some(FailureReason::RunC(intrinsic_name.clone())); + } + + if !rust.status.success() { + error!( + "Failed to run Rust program for intrinsic {intrinsic_name}\nstdout: {stdout}\nstderr: {stderr}", + stdout = std::str::from_utf8(&rust.stdout).unwrap_or(""), + stderr = std::str::from_utf8(&rust.stderr).unwrap_or(""), + ); + return Some(FailureReason::RunRust(intrinsic_name.clone())); + } + + info!("Comparing intrinsic: {}", intrinsic_name); + + let c = std::str::from_utf8(&c.stdout) + .unwrap() + .to_lowercase() + .replace("-nan", "nan"); + let rust = std::str::from_utf8(&rust.stdout) + .unwrap() + .to_lowercase() + .replace("-nan", "nan"); + + if c == rust { + None + } else { + Some(FailureReason::Difference(intrinsic_name.clone(), c, rust)) + } + }) + .collect::>(); + + intrinsics.iter().for_each(|reason| match reason { + FailureReason::Difference(intrinsic, c, rust) => { + println!("Difference for intrinsic: {intrinsic}"); + let diff = diff::lines(c, rust); + diff.iter().for_each(|diff| match diff { + diff::Result::Left(c) => println!("C: {c}"), + diff::Result::Right(rust) => println!("Rust: {rust}"), + diff::Result::Both(_, _) => (), + }); + println!("****************************************************************"); + } + FailureReason::RunC(intrinsic) => { + println!("Failed to run C program for intrinsic {intrinsic}") + } + FailureReason::RunRust(intrinsic) => { + println!("Failed to run rust program for intrinsic {intrinsic}") + } + }); + println!("{} differences found", intrinsics.len()); + intrinsics.is_empty() +} diff --git a/library/stdarch/crates/intrinsic-test/src/common/compile_c.rs b/library/stdarch/crates/intrinsic-test/src/common/compile_c.rs new file mode 100644 index 0000000000000..aebb7b111e28c --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/common/compile_c.rs @@ -0,0 +1,154 @@ +#[derive(Clone)] +pub struct CompilationCommandBuilder { + compiler: String, + target: Option, + cxx_toolchain_dir: Option, + arch_flags: Vec, + optimization: String, + include_paths: Vec, + project_root: Option, + output: String, + input: String, + linker: Option, + extra_flags: Vec, +} + +impl CompilationCommandBuilder { + pub fn new() -> Self { + Self { + compiler: String::new(), + target: None, + cxx_toolchain_dir: None, + arch_flags: Vec::new(), + optimization: "2".to_string(), + include_paths: Vec::new(), + project_root: None, + output: String::new(), + input: String::new(), + linker: None, + extra_flags: Vec::new(), + } 
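// Typical use (hypothetical values): the setters below each consume and return
// `self`, so callers chain them and finish with `make_string()`, e.g.
//
//     CompilationCommandBuilder::new()
//         .set_compiler("clang++")
//         .set_opt_level("2")
//         .set_project_root("c_programs")
//         .set_input_name("vabs_s8")
//         .set_output_name("vabs_s8")
//         .make_string();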
+ } + + pub fn set_compiler(mut self, compiler: &str) -> Self { + self.compiler = compiler.to_string(); + self + } + + pub fn set_target(mut self, target: &str) -> Self { + self.target = Some(target.to_string()); + self + } + + pub fn set_cxx_toolchain_dir(mut self, path: Option<&str>) -> Self { + self.cxx_toolchain_dir = path.map(|p| p.to_string()); + self + } + + pub fn add_arch_flags(mut self, flags: Vec<&str>) -> Self { + let mut new_arch_flags = flags.into_iter().map(|v| v.to_string()).collect(); + self.arch_flags.append(&mut new_arch_flags); + + self + } + + pub fn set_opt_level(mut self, optimization: &str) -> Self { + self.optimization = optimization.to_string(); + self + } + + /// Sets a list of include paths for compilation. + /// The paths that are passed must be relative to the + /// "cxx_toolchain_dir" directory path. + pub fn set_include_paths(mut self, paths: Vec<&str>) -> Self { + self.include_paths = paths.into_iter().map(|path| path.to_string()).collect(); + self + } + + /// Sets the root path of all the generated test files. + pub fn set_project_root(mut self, path: &str) -> Self { + self.project_root = Some(path.to_string()); + self + } + + /// The name of the output executable, without any suffixes + pub fn set_output_name(mut self, path: &str) -> Self { + self.output = path.to_string(); + self + } + + /// The name of the input C file, without any suffixes + pub fn set_input_name(mut self, path: &str) -> Self { + self.input = path.to_string(); + self + } + + pub fn set_linker(mut self, linker: String) -> Self { + self.linker = Some(linker); + self + } + + pub fn add_extra_flags(mut self, flags: Vec<&str>) -> Self { + let mut flags: Vec = flags.into_iter().map(|f| f.to_string()).collect(); + self.extra_flags.append(&mut flags); + self + } + + pub fn add_extra_flag(self, flag: &str) -> Self { + self.add_extra_flags(vec![flag]) + } +} + +impl CompilationCommandBuilder { + pub fn make_string(self) -> String { + let arch_flags = self.arch_flags.join("+"); + let flags = std::env::var("CPPFLAGS").unwrap_or("".into()); + let project_root = self.project_root.unwrap_or_default(); + let project_root_str = project_root.as_str(); + let mut output = self.output.clone(); + if self.linker.is_some() { + output += ".o" + }; + let mut command = format!( + "{} {flags} -march={arch_flags} \ + -O{} \ + -o {project_root}/{} \ + {project_root}/{}.cpp", + self.compiler, self.optimization, output, self.input, + ); + + command = command + " " + self.extra_flags.join(" ").as_str(); + + if let Some(target) = &self.target { + command = command + " --target=" + target; + } + + if let (Some(linker), Some(cxx_toolchain_dir)) = (&self.linker, &self.cxx_toolchain_dir) { + let include_args = self + .include_paths + .iter() + .map(|path| "--include-directory=".to_string() + cxx_toolchain_dir + path) + .collect::>() + .join(" "); + + command = command + + " -c " + + include_args.as_str() + + " && " + + linker + + " " + + project_root_str + + "/" + + &output + + " -o " + + project_root_str + + "/" + + &self.output + + " && rm " + + project_root_str + + "/" + + &output; + } + command + } +} diff --git a/library/stdarch/crates/intrinsic-test/src/common/constraint.rs b/library/stdarch/crates/intrinsic-test/src/common/constraint.rs new file mode 100644 index 0000000000000..269fb7f90cb7e --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/common/constraint.rs @@ -0,0 +1,17 @@ +use serde::Deserialize; +use std::ops::Range; + +#[derive(Debug, PartialEq, Clone, Deserialize)] +pub enum Constraint { + 
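// `Equal(3)` pins an immediate to exactly 3, while `Range(0..4)` allows
// 0, 1, 2 and 3; both are expanded through `to_range` below.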
Equal(i64), + Range(Range), +} + +impl Constraint { + pub fn to_range(&self) -> Range { + match self { + Constraint::Equal(eq) => *eq..*eq + 1, + Constraint::Range(range) => range.clone(), + } + } +} diff --git a/library/stdarch/crates/intrinsic-test/src/common/gen_c.rs b/library/stdarch/crates/intrinsic-test/src/common/gen_c.rs new file mode 100644 index 0000000000000..84c28cc4bf439 --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/common/gen_c.rs @@ -0,0 +1,198 @@ +use itertools::Itertools; +use rayon::prelude::*; +use std::collections::BTreeMap; +use std::process::Command; + +use super::argument::Argument; +use super::indentation::Indentation; +use super::intrinsic::IntrinsicDefinition; +use super::intrinsic_helpers::IntrinsicTypeDefinition; + +// The number of times each intrinsic will be called. +const PASSES: u32 = 20; + +// Formats the main C program template with placeholders +pub fn format_c_main_template( + notices: &str, + header_files: &[&str], + arch_identifier: &str, + arch_specific_definitions: &[&str], + arglists: &str, + passes: &str, +) -> String { + format!( + r#"{notices}{header_files} +#include +#include +#include +#include + +template T1 cast(T2 x) {{ + static_assert(sizeof(T1) == sizeof(T2), "sizeof T1 and T2 must be the same"); + T1 ret{{}}; + memcpy(&ret, &x, sizeof(T1)); + return ret; +}} + +std::ostream& operator<<(std::ostream& os, float16_t value) {{ + uint16_t temp = 0; + memcpy(&temp, &value, sizeof(float16_t)); + std::stringstream ss; + ss << "0x" << std::setfill('0') << std::setw(4) << std::hex << temp; + os << ss.str(); + return os; +}} + +#ifdef __{arch_identifier}__ +{arch_specific_definitions} +#endif + +{arglists} + +int main(int argc, char **argv) {{ +{passes} + return 0; +}}"#, + header_files = header_files + .iter() + .map(|header| format!("#include <{header}>")) + .collect::>() + .join("\n"), + arch_specific_definitions = arch_specific_definitions.join("\n"), + ) +} + +pub fn compile_c_programs(compiler_commands: &[String]) -> bool { + compiler_commands + .par_iter() + .map(|compiler_command| { + let output = Command::new("sh").arg("-c").arg(compiler_command).output(); + if let Ok(output) = output { + if output.status.success() { + true + } else { + error!( + "Failed to compile code for intrinsics: \n\nstdout:\n{}\n\nstderr:\n{}", + std::str::from_utf8(&output.stdout).unwrap_or(""), + std::str::from_utf8(&output.stderr).unwrap_or("") + ); + false + } + } else { + error!("Command failed: {:#?}", output); + false + } + }) + .find_any(|x| !x) + .is_none() +} + +// Creates directory structure and file path mappings +pub fn setup_c_file_paths(identifiers: &Vec) -> BTreeMap<&String, String> { + let _ = std::fs::create_dir("c_programs"); + identifiers + .par_iter() + .map(|identifier| { + let c_filename = format!(r#"c_programs/{identifier}.cpp"#); + + (identifier, c_filename) + }) + .collect::>() +} + +pub fn generate_c_test_loop( + intrinsic: &dyn IntrinsicDefinition, + indentation: Indentation, + additional: &str, + passes: u32, + _target: &str, +) -> String { + let body_indentation = indentation.nested(); + format!( + "{indentation}for (int i=0; i<{passes}; i++) {{\n\ + {loaded_args}\ + {body_indentation}auto __return_value = {intrinsic_call}({args});\n\ + {print_result}\n\ + {indentation}}}", + loaded_args = intrinsic.arguments().load_values_c(body_indentation), + intrinsic_call = intrinsic.name(), + args = intrinsic.arguments().as_call_param_c(), + print_result = intrinsic.print_result_c(body_indentation, additional) + ) +} + +pub fn 
generate_c_constraint_blocks( + intrinsic: &dyn IntrinsicDefinition, + indentation: Indentation, + constraints: &[&Argument], + name: String, + target: &str, +) -> String { + if let Some((current, constraints)) = constraints.split_last() { + let range = current + .constraint + .iter() + .map(|c| c.to_range()) + .flat_map(|r| r.into_iter()); + + let body_indentation = indentation.nested(); + range + .map(|i| { + format!( + "{indentation}{{\n\ + {body_indentation}{ty} {name} = {val};\n\ + {pass}\n\ + {indentation}}}", + name = current.name, + ty = current.ty.c_type(), + val = i, + pass = generate_c_constraint_blocks( + intrinsic, + body_indentation, + constraints, + format!("{name}-{i}"), + target, + ) + ) + }) + .join("\n") + } else { + generate_c_test_loop(intrinsic, indentation, &name, PASSES, target) + } +} + +// Compiles C test programs using specified compiler +pub fn create_c_test_program( + intrinsic: &dyn IntrinsicDefinition, + header_files: &[&str], + target: &str, + c_target: &str, + notices: &str, + arch_specific_definitions: &[&str], +) -> String { + let arguments = intrinsic.arguments(); + let constraints = arguments + .iter() + .filter(|&i| i.has_constraint()) + .collect_vec(); + + let indentation = Indentation::default(); + format_c_main_template( + notices, + header_files, + c_target, + arch_specific_definitions, + intrinsic + .arguments() + .gen_arglists_c(indentation, PASSES) + .as_str(), + generate_c_constraint_blocks( + intrinsic, + indentation.nested(), + constraints.as_slice(), + Default::default(), + target, + ) + .as_str(), + ) +} diff --git a/library/stdarch/crates/intrinsic-test/src/common/gen_rust.rs b/library/stdarch/crates/intrinsic-test/src/common/gen_rust.rs new file mode 100644 index 0000000000000..a2878502ac944 --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/common/gen_rust.rs @@ -0,0 +1,243 @@ +use itertools::Itertools; +use rayon::prelude::*; +use std::collections::BTreeMap; +use std::fs::File; +use std::io::Write; +use std::process::Command; + +use super::argument::Argument; +use super::indentation::Indentation; +use super::intrinsic::{IntrinsicDefinition, format_f16_return_value}; +use super::intrinsic_helpers::IntrinsicTypeDefinition; + +// The number of times each intrinsic will be called. 
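// Illustrative shape of one generated test program (intrinsic and values are
// assumed examples): `format_rust_main_template` below expands to roughly
//
//     fn main() {
//         const A_VALS: [i8; 27] = [ /* random values */ ];
//         for i in 0..20 {
//             unsafe {
//                 let a = vld1_s8(A_VALS.as_ptr().offset(i));
//                 let __return_value = vabs_s8(a);
//                 println!("Result -{}: {:?}", i + 1, format_args!("{__return_value:.150?}"));
//             }
//         }
//     }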
+const PASSES: u32 = 20; + +pub fn format_rust_main_template( + notices: &str, + definitions: &str, + configurations: &str, + arch_definition: &str, + arglists: &str, + passes: &str, +) -> String { + format!( + r#"{notices}#![feature(simd_ffi)] +#![feature(link_llvm_intrinsics)] +#![feature(f16)] +{configurations} +{definitions} + +use core_arch::arch::{arch_definition}::*; + +fn main() {{ +{arglists} +{passes} +}} +"#, + ) +} + +pub fn compile_rust_programs( + binaries: Vec, + toolchain: Option<&str>, + target: &str, + linker: Option<&str>, +) -> bool { + let mut cargo = File::create("rust_programs/Cargo.toml").unwrap(); + cargo + .write_all( + format!( + r#"[package] +name = "intrinsic-test-programs" +version = "{version}" +authors = [{authors}] +license = "{license}" +edition = "2018" +[workspace] +[dependencies] +core_arch = {{ path = "../crates/core_arch" }} +{binaries}"#, + version = env!("CARGO_PKG_VERSION"), + authors = env!("CARGO_PKG_AUTHORS") + .split(":") + .format_with(", ", |author, fmt| fmt(&format_args!("\"{author}\""))), + license = env!("CARGO_PKG_LICENSE"), + binaries = binaries + .iter() + .map(|binary| { + format!( + r#"[[bin]] +name = "{binary}" +path = "{binary}/main.rs""#, + ) + }) + .collect::>() + .join("\n") + ) + .into_bytes() + .as_slice(), + ) + .unwrap(); + + let toolchain = match toolchain { + None => return true, + Some(t) => t, + }; + + /* If there has been a linker explicitly set from the command line then + * we want to set it via setting it in the RUSTFLAGS*/ + + let cargo_command = format!("cargo {toolchain} build --target {target} --release"); + + let mut command = Command::new("sh"); + command + .current_dir("rust_programs") + .arg("-c") + .arg(cargo_command); + + let mut rust_flags = "-Cdebuginfo=0".to_string(); + if let Some(linker) = linker { + rust_flags.push_str(" -C linker="); + rust_flags.push_str(linker); + rust_flags.push_str(" -C link-args=-static"); + + command.env("CPPFLAGS", "-fuse-ld=lld"); + } + + command.env("RUSTFLAGS", rust_flags); + let output = command.output(); + + if let Ok(output) = output { + if output.status.success() { + true + } else { + error!( + "Failed to compile code for rust intrinsics\n\nstdout:\n{}\n\nstderr:\n{}", + std::str::from_utf8(&output.stdout).unwrap_or(""), + std::str::from_utf8(&output.stderr).unwrap_or("") + ); + false + } + } else { + error!("Command failed: {:#?}", output); + false + } +} + +// Creates directory structure and file path mappings +pub fn setup_rust_file_paths(identifiers: &Vec) -> BTreeMap<&String, String> { + identifiers + .par_iter() + .map(|identifier| { + let rust_dir = format!("rust_programs/{identifier}"); + let _ = std::fs::create_dir_all(&rust_dir); + let rust_filename = format!("{rust_dir}/main.rs"); + + (identifier, rust_filename) + }) + .collect::>() +} + +pub fn generate_rust_test_loop( + intrinsic: &dyn IntrinsicDefinition, + indentation: Indentation, + additional: &str, + passes: u32, +) -> String { + let constraints = intrinsic.arguments().as_constraint_parameters_rust(); + let constraints = if !constraints.is_empty() { + format!("::<{constraints}>") + } else { + constraints + }; + + let return_value = format_f16_return_value(intrinsic); + let indentation2 = indentation.nested(); + let indentation3 = indentation2.nested(); + format!( + "{indentation}for i in 0..{passes} {{\n\ + {indentation2}unsafe {{\n\ + {loaded_args}\ + {indentation3}let __return_value = {intrinsic_call}{const}({args});\n\ + {indentation3}println!(\"Result {additional}-{{}}: {{:?}}\", i + 1, 
{return_value});\n\ + {indentation2}}}\n\ + {indentation}}}", + loaded_args = intrinsic.arguments().load_values_rust(indentation3), + intrinsic_call = intrinsic.name(), + const = constraints, + args = intrinsic.arguments().as_call_param_rust(), + ) +} + +pub fn generate_rust_constraint_blocks( + intrinsic: &dyn IntrinsicDefinition, + indentation: Indentation, + constraints: &[&Argument], + name: String, +) -> String { + if let Some((current, constraints)) = constraints.split_last() { + let range = current + .constraint + .iter() + .map(|c| c.to_range()) + .flat_map(|r| r.into_iter()); + + let body_indentation = indentation.nested(); + range + .map(|i| { + format!( + "{indentation}{{\n\ + {body_indentation}const {name}: {ty} = {val};\n\ + {pass}\n\ + {indentation}}}", + name = current.name, + ty = current.ty.rust_type(), + val = i, + pass = generate_rust_constraint_blocks( + intrinsic, + body_indentation, + constraints, + format!("{name}-{i}") + ) + ) + }) + .join("\n") + } else { + generate_rust_test_loop(intrinsic, indentation, &name, PASSES) + } +} + +// Top-level function to create complete test program +pub fn create_rust_test_program( + intrinsic: &dyn IntrinsicDefinition, + target: &str, + notice: &str, + definitions: &str, + cfg: &str, +) -> String { + let arguments = intrinsic.arguments(); + let constraints = arguments + .iter() + .filter(|i| i.has_constraint()) + .collect_vec(); + + let indentation = Indentation::default(); + format_rust_main_template( + notice, + definitions, + cfg, + target, + intrinsic + .arguments() + .gen_arglists_rust(indentation.nested(), PASSES) + .as_str(), + generate_rust_constraint_blocks( + intrinsic, + indentation.nested(), + &constraints, + Default::default(), + ) + .as_str(), + ) +} diff --git a/library/stdarch/crates/intrinsic-test/src/common/indentation.rs b/library/stdarch/crates/intrinsic-test/src/common/indentation.rs new file mode 100644 index 0000000000000..9ee331d7f7a3f --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/common/indentation.rs @@ -0,0 +1,22 @@ +//! Basic code formatting tools. +//! +//! We don't need perfect formatting for the generated tests, but simple indentation can make +//! debugging a lot easier. + +#[derive(Copy, Clone, Debug, Default)] +pub struct Indentation(u32); + +impl Indentation { + pub fn nested(self) -> Self { + Self(self.0 + 1) + } +} + +impl std::fmt::Display for Indentation { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + for _ in 0..self.0 { + write!(f, " ")?; + } + Ok(()) + } +} diff --git a/library/stdarch/crates/intrinsic-test/src/common/intrinsic.rs b/library/stdarch/crates/intrinsic-test/src/common/intrinsic.rs new file mode 100644 index 0000000000000..bc46ccfbac40c --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/common/intrinsic.rs @@ -0,0 +1,51 @@ +use super::argument::ArgumentList; +use super::indentation::Indentation; +use super::intrinsic_helpers::{IntrinsicTypeDefinition, TypeKind}; + +/// An intrinsic +#[derive(Debug, PartialEq, Clone)] +pub struct Intrinsic { + /// The function name of this intrinsic. + pub name: String, + + /// Any arguments for this intrinsic. + pub arguments: ArgumentList, + + /// The return type of this intrinsic. + pub results: T, + + /// Any architecture-specific tags. 
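+    /// (Free-form strings; what they contain is left to each architecture
+    /// backend.)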
+ pub arch_tags: Vec, +} + +pub trait IntrinsicDefinition +where + T: IntrinsicTypeDefinition, +{ + fn arguments(&self) -> ArgumentList; + + fn results(&self) -> T; + + fn name(&self) -> String; + + /// Generates a std::cout for the intrinsics results that will match the + /// rust debug output format for the return type. The generated line assumes + /// there is an int i in scope which is the current pass number. + fn print_result_c(&self, _indentation: Indentation, _additional: &str) -> String; +} + +pub fn format_f16_return_value( + intrinsic: &dyn IntrinsicDefinition, +) -> String { + // the `intrinsic-test` crate compares the output of C and Rust intrinsics. Currently, It uses + // a string representation of the output value to compare. In C, f16 values are currently printed + // as hexadecimal integers. Since https://github.com/rust-lang/rust/pull/127013, rust does print + // them as decimal floating point values. To keep the intrinsics tests working, for now, format + // vectors containing f16 values like C prints them. + let return_value = match intrinsic.results().kind() { + TypeKind::Float if intrinsic.results().inner_size() == 16 => "debug_f16(__return_value)", + _ => "format_args!(\"{__return_value:.150?}\")", + }; + + String::from(return_value) +} diff --git a/library/stdarch/crates/intrinsic-test/src/common/intrinsic_helpers.rs b/library/stdarch/crates/intrinsic-test/src/common/intrinsic_helpers.rs new file mode 100644 index 0000000000000..3d200b19461e1 --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/common/intrinsic_helpers.rs @@ -0,0 +1,296 @@ +use std::fmt; +use std::ops::Deref; +use std::str::FromStr; + +use itertools::Itertools as _; + +use super::cli::Language; +use super::indentation::Indentation; +use super::values::value_for_array; + +#[derive(Debug, PartialEq, Copy, Clone)] +pub enum TypeKind { + BFloat, + Float, + Int, + UInt, + Poly, + Void, +} + +impl FromStr for TypeKind { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "bfloat" => Ok(Self::BFloat), + "float" => Ok(Self::Float), + "int" => Ok(Self::Int), + "poly" => Ok(Self::Poly), + "uint" | "unsigned" => Ok(Self::UInt), + "void" => Ok(Self::Void), + _ => Err(format!("Impossible to parse argument kind {s}")), + } + } +} + +impl fmt::Display for TypeKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{}", + match self { + Self::BFloat => "bfloat", + Self::Float => "float", + Self::Int => "int", + Self::UInt => "uint", + Self::Poly => "poly", + Self::Void => "void", + } + ) + } +} + +impl TypeKind { + /// Gets the type part of a c typedef for a type that's in the form of {type}{size}_t. + pub fn c_prefix(&self) -> &str { + match self { + Self::Float => "float", + Self::Int => "int", + Self::UInt => "uint", + Self::Poly => "poly", + _ => unreachable!("Not used: {:#?}", self), + } + } + + /// Gets the rust prefix for the type kind i.e. i, u, f. + pub fn rust_prefix(&self) -> &str { + match self { + Self::Float => "f", + Self::Int => "i", + Self::UInt => "u", + Self::Poly => "u", + _ => unreachable!("Unused type kind: {:#?}", self), + } + } +} + +#[derive(Debug, PartialEq, Clone)] +pub struct IntrinsicType { + pub constant: bool, + + /// whether this object is a const pointer + pub ptr_constant: bool, + + pub ptr: bool, + + pub kind: TypeKind, + /// The bit length of this type (e.g. 32 for u32). + pub bit_len: Option, + + /// Length of the SIMD vector (i.e. 4 for uint32x4_t), A value of `None` + /// means this is not a simd type. 
A `None` can be assumed to be 1, + /// although in some places a distinction is needed between `u64` and + /// `uint64x1_t` this signals that. + pub simd_len: Option, + + /// The number of rows for SIMD matrices (i.e. 2 for uint8x8x2_t). + /// A value of `None` represents a type that does not contain any + /// rows encoded in the type (e.g. uint8x8_t). + /// A value of `None` can be assumed to be 1 though. + pub vec_len: Option, + + pub target: String, +} + +impl IntrinsicType { + pub fn kind(&self) -> TypeKind { + self.kind + } + + pub fn inner_size(&self) -> u32 { + if let Some(bl) = self.bit_len { + bl + } else { + unreachable!("") + } + } + + pub fn num_lanes(&self) -> u32 { + self.simd_len.unwrap_or(1) + } + + pub fn num_vectors(&self) -> u32 { + self.vec_len.unwrap_or(1) + } + + pub fn is_simd(&self) -> bool { + self.simd_len.is_some() || self.vec_len.is_some() + } + + pub fn is_ptr(&self) -> bool { + self.ptr + } + + pub fn c_scalar_type(&self) -> String { + format!( + "{prefix}{bits}_t", + prefix = self.kind().c_prefix(), + bits = self.inner_size() + ) + } + + pub fn rust_scalar_type(&self) -> String { + format!( + "{prefix}{bits}", + prefix = self.kind().rust_prefix(), + bits = self.inner_size() + ) + } + + pub fn c_promotion(&self) -> &str { + match *self { + IntrinsicType { + kind, + bit_len: Some(8), + .. + } => match kind { + TypeKind::Int => "(int)", + TypeKind::UInt => "(unsigned int)", + TypeKind::Poly => "(unsigned int)(uint8_t)", + _ => "", + }, + IntrinsicType { + kind: TypeKind::Poly, + bit_len: Some(bit_len), + .. + } => match bit_len { + 8 => unreachable!("handled above"), + 16 => "(uint16_t)", + 32 => "(uint32_t)", + 64 => "(uint64_t)", + 128 => "", + _ => panic!("invalid bit_len"), + }, + _ => "", + } + } + + pub fn populate_random( + &self, + indentation: Indentation, + loads: u32, + language: &Language, + ) -> String { + match self { + IntrinsicType { + bit_len: Some(bit_len @ (8 | 16 | 32 | 64)), + kind: kind @ (TypeKind::Int | TypeKind::UInt | TypeKind::Poly), + simd_len, + vec_len, + .. + } => { + let (prefix, suffix) = match language { + Language::Rust => ("[", "]"), + Language::C => ("{", "}"), + }; + let body_indentation = indentation.nested(); + format!( + "{prefix}\n{body}\n{indentation}{suffix}", + body = (0..(simd_len.unwrap_or(1) * vec_len.unwrap_or(1) + loads - 1)) + .format_with(",\n", |i, fmt| { + let src = value_for_array(*bit_len, i); + assert!(src == 0 || src.ilog2() < *bit_len); + if *kind == TypeKind::Int && (src >> (*bit_len - 1)) != 0 { + // `src` is a two's complement representation of a negative value. + let mask = !0u64 >> (64 - *bit_len); + let ones_compl = src ^ mask; + let twos_compl = ones_compl + 1; + if (twos_compl == src) && (language == &Language::C) { + // `src` is INT*_MIN. C requires `-0x7fffffff - 1` to avoid + // undefined literal overflow behaviour. + fmt(&format_args!("{body_indentation}-{ones_compl:#x} - 1")) + } else { + fmt(&format_args!("{body_indentation}-{twos_compl:#x}")) + } + } else { + fmt(&format_args!("{body_indentation}{src:#x}")) + } + }) + ) + } + IntrinsicType { + kind: TypeKind::Float, + bit_len: Some(bit_len @ (16 | 32 | 64)), + simd_len, + vec_len, + .. 
+ } => { + let (prefix, cast_prefix, cast_suffix, suffix) = match (language, bit_len) { + (&Language::Rust, 16) => ("[", "f16::from_bits(", ")", "]"), + (&Language::Rust, 32) => ("[", "f32::from_bits(", ")", "]"), + (&Language::Rust, 64) => ("[", "f64::from_bits(", ")", "]"), + (&Language::C, 16) => ("{", "cast(", ")", "}"), + (&Language::C, 32) => ("{", "cast(", ")", "}"), + (&Language::C, 64) => ("{", "cast(", ")", "}"), + _ => unreachable!(), + }; + format!( + "{prefix}\n{body}\n{indentation}{suffix}", + body = (0..(simd_len.unwrap_or(1) * vec_len.unwrap_or(1) + loads - 1)) + .format_with(",\n", |i, fmt| fmt(&format_args!( + "{indentation}{cast_prefix}{src:#x}{cast_suffix}", + indentation = indentation.nested(), + src = value_for_array(*bit_len, i) + ))) + ) + } + _ => unimplemented!("populate random: {:#?}", self), + } + } + + pub fn is_rust_vals_array_const(&self) -> bool { + match self { + // Floats have to be loaded at runtime for stable NaN conversion. + IntrinsicType { + kind: TypeKind::Float, + .. + } => false, + IntrinsicType { + kind: TypeKind::Int | TypeKind::UInt | TypeKind::Poly, + .. + } => true, + _ => unimplemented!(), + } + } + + pub fn as_call_param_c(&self, name: &String) -> String { + if self.ptr { + format!("&{name}") + } else { + name.clone() + } + } +} + +pub trait IntrinsicTypeDefinition: Deref { + /// Determines the load function for this type. + /// can be implemented in an `impl` block + fn get_load_function(&self, _language: Language) -> String; + + /// can be implemented in an `impl` block + fn get_lane_function(&self) -> String; + + /// can be implemented in an `impl` block + fn from_c(_s: &str, _target: &str) -> Result, String>; + + /// Gets a string containing the typename for this type in C format. + /// can be directly defined in `impl` blocks + fn c_type(&self) -> String; + + /// can be directly defined in `impl` blocks + fn c_single_vector_type(&self) -> String; + + /// can be defined in `impl` blocks + fn rust_type(&self) -> String; +} diff --git a/library/stdarch/crates/intrinsic-test/src/common/mod.rs b/library/stdarch/crates/intrinsic-test/src/common/mod.rs new file mode 100644 index 0000000000000..5d51d3460ecff --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/common/mod.rs @@ -0,0 +1,25 @@ +use cli::ProcessedCli; + +pub mod argument; +pub mod cli; +pub mod compare; +pub mod compile_c; +pub mod constraint; +pub mod gen_c; +pub mod gen_rust; +pub mod indentation; +pub mod intrinsic; +pub mod intrinsic_helpers; +pub mod values; +pub mod write_file; + +/// Architectures must support this trait +/// to be successfully tested. +pub trait SupportedArchitectureTest { + fn create(cli_options: ProcessedCli) -> Box + where + Self: Sized; + fn build_c_file(&self) -> bool; + fn build_rust_file(&self) -> bool; + fn compare_outputs(&self) -> bool; +} diff --git a/library/stdarch/crates/intrinsic-test/src/common/values.rs b/library/stdarch/crates/intrinsic-test/src/common/values.rs new file mode 100644 index 0000000000000..1b614a742ef8b --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/common/values.rs @@ -0,0 +1,120 @@ +/// Get a single value for an argument values array in a determistic way. 
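+///
+/// For example, `value_for_array(8, 3)` returns `0x03`; indices are taken
+/// modulo the table length, so `value_for_array(8, 20)` wraps around to
+/// `VALUES_8[0]` again.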
+/// * `bits`: The number of bits for the type, only 8, 16, 32, 64 are valid values +/// * `index`: The position in the array we are generating for +pub fn value_for_array(bits: u32, index: u32) -> u64 { + let index = index as usize; + match bits { + 8 => VALUES_8[index % VALUES_8.len()].into(), + 16 => VALUES_16[index % VALUES_16.len()].into(), + 32 => VALUES_32[index % VALUES_32.len()].into(), + 64 => VALUES_64[index % VALUES_64.len()], + _ => unimplemented!("value_for_array(bits: {bits}, ..)"), + } +} + +pub const VALUES_8: &[u8] = &[ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xf0, 0x80, 0x3b, 0xff, +]; + +pub const VALUES_16: &[u16] = &[ + 0x0000, // 0.0 + 0x0400, // The smallest normal value. + 0x37ff, // The value just below 0.5. + 0x3800, // 0.5 + 0x3801, // The value just above 0.5. + 0x3bff, // The value just below 1.0. + 0x3c00, // 1.0 + 0x3c01, // The value just above 1.0. + 0x3e00, // 1.5 + 0x4900, // 10 + 0x7bff, // The largest finite value. + 0x7c00, // Infinity. + // NaNs. + // - Quiet NaNs + 0x7f23, 0x7e00, // - Signalling NaNs + 0x7d23, 0x7c01, // Subnormals. + // - A recognisable bit pattern. + 0x0012, // - The largest subnormal value. + 0x03ff, // - The smallest subnormal value. + 0x0001, // The same values again, but negated. + 0x8000, 0x8400, 0xb7ff, 0xb800, 0xb801, 0xbbff, 0xbc00, 0xbc01, 0xbe00, 0xc900, 0xfbff, 0xfc00, + 0xff23, 0xfe00, 0xfd23, 0xfc01, 0x8012, 0x83ff, 0x8001, +]; + +pub const VALUES_32: &[u32] = &[ + // Simple values. + 0x00000000, // 0.0 + 0x00800000, // The smallest normal value. + 0x3effffff, // The value just below 0.5. + 0x3f000000, // 0.5 + 0x3f000001, // The value just above 0.5. + 0x3f7fffff, // The value just below 1.0. + 0x3f800000, // 1.0 + 0x3f800001, // The value just above 1.0. + 0x3fc00000, // 1.5 + 0x41200000, // 10 + 0x7f8fffff, // The largest finite value. + 0x7f800000, // Infinity. + // NaNs. + // - Quiet NaNs + 0x7fd23456, 0x7fc00000, // - Signalling NaNs + 0x7f923456, 0x7f800001, // Subnormals. + // - A recognisable bit pattern. + 0x00123456, // - The largest subnormal value. + 0x007fffff, // - The smallest subnormal value. + 0x00000001, // The same values again, but negated. + 0x80000000, 0x80800000, 0xbeffffff, 0xbf000000, 0xbf000001, 0xbf7fffff, 0xbf800000, 0xbf800001, + 0xbfc00000, 0xc1200000, 0xff8fffff, 0xff800000, 0xffd23456, 0xffc00000, 0xff923456, 0xff800001, + 0x80123456, 0x807fffff, 0x80000001, +]; + +pub const VALUES_64: &[u64] = &[ + // Simple values. + 0x0000000000000000, // 0.0 + 0x0010000000000000, // The smallest normal value. + 0x3fdfffffffffffff, // The value just below 0.5. + 0x3fe0000000000000, // 0.5 + 0x3fe0000000000001, // The value just above 0.5. + 0x3fefffffffffffff, // The value just below 1.0. + 0x3ff0000000000000, // 1.0 + 0x3ff0000000000001, // The value just above 1.0. + 0x3ff8000000000000, // 1.5 + 0x4024000000000000, // 10 + 0x7fefffffffffffff, // The largest finite value. + 0x7ff0000000000000, // Infinity. + // NaNs. + // - Quiet NaNs + 0x7ff923456789abcd, + 0x7ff8000000000000, + // - Signalling NaNs + 0x7ff123456789abcd, + 0x7ff0000000000000, + // Subnormals. + // - A recognisable bit pattern. + 0x000123456789abcd, + // - The largest subnormal value. + 0x000fffffffffffff, + // - The smallest subnormal value. + 0x0000000000000001, + // The same values again, but negated. 
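+    // (Each entry below is the corresponding positive entry above with the
+    // sign bit, 0x8000_0000_0000_0000, set.)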
+ 0x8000000000000000, + 0x8010000000000000, + 0xbfdfffffffffffff, + 0xbfe0000000000000, + 0xbfe0000000000001, + 0xbfefffffffffffff, + 0xbff0000000000000, + 0xbff0000000000001, + 0xbff8000000000000, + 0xc024000000000000, + 0xffefffffffffffff, + 0xfff0000000000000, + 0xfff923456789abcd, + 0xfff8000000000000, + 0xfff123456789abcd, + 0xfff0000000000000, + 0x800123456789abcd, + 0x800fffffffffffff, + 0x8000000000000001, +]; diff --git a/library/stdarch/crates/intrinsic-test/src/common/write_file.rs b/library/stdarch/crates/intrinsic-test/src/common/write_file.rs new file mode 100644 index 0000000000000..0ba3e829a6b80 --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/common/write_file.rs @@ -0,0 +1,66 @@ +use super::gen_c::create_c_test_program; +use super::gen_c::setup_c_file_paths; +use super::gen_rust::{create_rust_test_program, setup_rust_file_paths}; +use super::intrinsic::IntrinsicDefinition; +use super::intrinsic_helpers::IntrinsicTypeDefinition; +use std::fs::File; +use std::io::Write; + +pub fn write_file(filename: &String, code: String) { + let mut file = File::create(filename).unwrap(); + file.write_all(code.into_bytes().as_slice()).unwrap(); +} + +pub fn write_c_testfiles( + intrinsics: &Vec<&dyn IntrinsicDefinition>, + target: &str, + c_target: &str, + headers: &[&str], + notice: &str, + arch_specific_definitions: &[&str], +) -> Vec { + let intrinsics_name_list = intrinsics + .iter() + .map(|i| i.name().clone()) + .collect::>(); + let filename_mapping = setup_c_file_paths(&intrinsics_name_list); + + intrinsics.iter().for_each(|&i| { + let c_code = create_c_test_program( + i, + headers, + target, + c_target, + notice, + arch_specific_definitions, + ); + if let Some(filename) = filename_mapping.get(&i.name()) { + write_file(filename, c_code) + }; + }); + + intrinsics_name_list +} + +pub fn write_rust_testfiles( + intrinsics: Vec<&dyn IntrinsicDefinition>, + rust_target: &str, + notice: &str, + definitions: &str, + cfg: &str, +) -> Vec { + let intrinsics_name_list = intrinsics + .iter() + .map(|i| i.name().clone()) + .collect::>(); + let filename_mapping = setup_rust_file_paths(&intrinsics_name_list); + + intrinsics.iter().for_each(|&i| { + let rust_code = create_rust_test_program(i, rust_target, notice, definitions, cfg); + if let Some(filename) = filename_mapping.get(&i.name()) { + write_file(filename, rust_code) + } + }); + + intrinsics_name_list +} diff --git a/library/stdarch/crates/intrinsic-test/src/main.rs b/library/stdarch/crates/intrinsic-test/src/main.rs new file mode 100644 index 0000000000000..054138a0dba1a --- /dev/null +++ b/library/stdarch/crates/intrinsic-test/src/main.rs @@ -0,0 +1,42 @@ +#[macro_use] +extern crate log; + +mod arm; +mod common; + +use arm::ArmArchitectureTest; +use common::SupportedArchitectureTest; +use common::cli::{Cli, ProcessedCli}; + +fn main() { + pretty_env_logger::init(); + let args: Cli = clap::Parser::parse(); + let processed_cli_options = ProcessedCli::new(args); + + let test_environment_result: Option> = + match processed_cli_options.target.as_str() { + "aarch64-unknown-linux-gnu" + | "armv7-unknown-linux-gnueabihf" + | "aarch64_be-unknown-linux-gnu" => { + Some(ArmArchitectureTest::create(processed_cli_options)) + } + + _ => None, + }; + + if test_environment_result.is_none() { + std::process::exit(0); + } + + let test_environment = test_environment_result.unwrap(); + + if !test_environment.build_c_file() { + std::process::exit(2); + } + if !test_environment.build_rust_file() { + std::process::exit(3); + } + if 
!test_environment.compare_outputs() { + std::process::exit(1); + } +} diff --git a/library/stdarch/crates/simd-test-macro/Cargo.toml b/library/stdarch/crates/simd-test-macro/Cargo.toml new file mode 100644 index 0000000000000..8f9f9b13273cb --- /dev/null +++ b/library/stdarch/crates/simd-test-macro/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "simd-test-macro" +version = "0.1.0" +authors = ["Alex Crichton "] +edition = "2024" + +[lib] +proc-macro = true +test = false + +[dependencies] +proc-macro2 = "1.0" +quote = "1.0" +syn = { version = "2.0", features = ["full"] } diff --git a/library/stdarch/crates/simd-test-macro/src/lib.rs b/library/stdarch/crates/simd-test-macro/src/lib.rs new file mode 100644 index 0000000000000..18e4747d94d91 --- /dev/null +++ b/library/stdarch/crates/simd-test-macro/src/lib.rs @@ -0,0 +1,126 @@ +//! Implementation of the `#[simd_test]` macro +//! +//! This macro expands to a `#[test]` function which tests the local machine +//! for the appropriate cfg before calling the inner test function. +#![deny(rust_2018_idioms)] + +#[macro_use] +extern crate quote; + +use proc_macro2::{Ident, Literal, Span, TokenStream, TokenTree}; +use quote::ToTokens; +use std::env; + +fn string(s: &str) -> TokenTree { + Literal::string(s).into() +} + +#[proc_macro_attribute] +pub fn simd_test( + attr: proc_macro::TokenStream, + item: proc_macro::TokenStream, +) -> proc_macro::TokenStream { + let tokens = TokenStream::from(attr).into_iter().collect::>(); + if tokens.len() != 3 { + panic!("expected #[simd_test(enable = \"feature\")]"); + } + match &tokens[0] { + TokenTree::Ident(tt) if *tt == "enable" => {} + _ => panic!("expected #[simd_test(enable = \"feature\")]"), + } + match &tokens[1] { + TokenTree::Punct(tt) if tt.as_char() == '=' => {} + _ => panic!("expected #[simd_test(enable = \"feature\")]"), + } + let enable_feature = match &tokens[2] { + TokenTree::Literal(tt) => tt.to_string(), + _ => panic!("expected #[simd_test(enable = \"feature\")]"), + }; + let enable_feature = enable_feature.trim_start_matches('"').trim_end_matches('"'); + let target_features: Vec = enable_feature + .replace('+', "") + .split(',') + .map(String::from) + .collect(); + + let enable_feature = string(enable_feature); + let mut item = syn::parse_macro_input!(item as syn::ItemFn); + let item_attrs = std::mem::take(&mut item.attrs); + let name = &item.sig.ident; + + let target = env::var("TARGET").expect( + "TARGET environment variable should be set for rustc (e.g. 
TARGET=x86_64-apple-darwin cargo test)" + ); + let macro_test = match target + .split('-') + .next() + .unwrap_or_else(|| panic!("target triple contained no \"-\": {target}")) + { + "i686" | "x86_64" | "i586" => "is_x86_feature_detected", + "arm" | "armv7" => "is_arm_feature_detected", + "aarch64" | "arm64ec" | "aarch64_be" => "is_aarch64_feature_detected", + maybe_riscv if maybe_riscv.starts_with("riscv") => "is_riscv_feature_detected", + "powerpc" | "powerpcle" => "is_powerpc_feature_detected", + "powerpc64" | "powerpc64le" => "is_powerpc64_feature_detected", + "loongarch64" => "is_loongarch_feature_detected", + "s390x" => "is_s390x_feature_detected", + t => panic!("unknown target: {t}"), + }; + let macro_test = Ident::new(macro_test, Span::call_site()); + + let skipped_functions = env::var("STDARCH_TEST_SKIP_FUNCTION").unwrap_or_default(); + let skipped_features = env::var("STDARCH_TEST_SKIP_FEATURE").unwrap_or_default(); + + let mut name_str = &*name.to_string(); + if name_str.starts_with("test_") { + name_str = &name_str[5..]; + } + + let skip_this = skipped_functions + .split(',') + .map(str::trim) + .any(|s| s == name_str) + || skipped_features + .split(',') + .map(str::trim) + .any(|s| target_features.iter().any(|feature| s == feature)); + + let mut detect_missing_features = TokenStream::new(); + for feature in target_features { + let q = quote_spanned! { + proc_macro2::Span::call_site() => + if !#macro_test!(#feature) { + missing_features.push(#feature); + } + }; + q.to_tokens(&mut detect_missing_features); + } + + let maybe_ignore = if skip_this { + quote! { #[ignore] } + } else { + TokenStream::new() + }; + + let ret: TokenStream = quote_spanned! { + proc_macro2::Span::call_site() => + #[allow(non_snake_case)] + #[test] + #maybe_ignore + #(#item_attrs)* + fn #name() { + let mut missing_features = ::std::vec::Vec::new(); + #detect_missing_features + if missing_features.is_empty() { + let v = unsafe { #name() }; + return v; + } else { + ::stdarch_test::assert_skip_test_ok(stringify!(#name), &missing_features); + } + + #[target_feature(enable = #enable_feature)] + #item + } + }; + ret.into() +} diff --git a/library/stdarch/crates/std_detect/Cargo.toml b/library/stdarch/crates/std_detect/Cargo.toml new file mode 100644 index 0000000000000..368423065a231 --- /dev/null +++ b/library/stdarch/crates/std_detect/Cargo.toml @@ -0,0 +1,43 @@ +[package] +name = "std_detect" +version = "0.1.5" +authors = [ + "Alex Crichton ", + "Andrew Gallant ", + "Gonzalo Brito Gadeschi ", +] +description = "`std::detect` - Rust's standard library run-time CPU feature detection." 
+homepage = "https://github.com/rust-lang/stdarch" +repository = "https://github.com/rust-lang/stdarch" +readme = "README.md" +keywords = ["std", "run-time", "feature", "detection"] +categories = ["hardware-support"] +license = "MIT OR Apache-2.0" +edition = "2024" + +[badges] +is-it-maintained-issue-resolution = { repository = "rust-lang/stdarch" } +is-it-maintained-open-issues = { repository = "rust-lang/stdarch" } +maintenance = { status = "experimental" } + +[dependencies] +cfg-if = "1.0.0" + +# When built as part of libstd +core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" } +compiler_builtins = { version = "0.1.2", optional = true } +alloc = { version = "1.0.0", optional = true, package = "rustc-std-workspace-alloc" } + +[target.'cfg(not(windows))'.dependencies] +libc = { version = "0.2.0", optional = true, default-features = false } + +[features] +default = [ "std_detect_dlsym_getauxval", "std_detect_file_io" ] +std_detect_file_io = [ "libc" ] +std_detect_dlsym_getauxval = [ "libc" ] +std_detect_env_override = [ "libc" ] +rustc-dep-of-std = [ + "core", + "compiler_builtins", + "alloc", +] diff --git a/library/stdarch/crates/std_detect/LICENSE-APACHE b/library/stdarch/crates/std_detect/LICENSE-APACHE new file mode 100644 index 0000000000000..16fe87b06e802 --- /dev/null +++ b/library/stdarch/crates/std_detect/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/library/stdarch/crates/std_detect/LICENSE-MIT b/library/stdarch/crates/std_detect/LICENSE-MIT new file mode 100644 index 0000000000000..52d82415d8b60 --- /dev/null +++ b/library/stdarch/crates/std_detect/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2017 The Rust Project Developers + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/library/stdarch/crates/std_detect/README.md b/library/stdarch/crates/std_detect/README.md new file mode 100644 index 0000000000000..091f5542e0e87 --- /dev/null +++ b/library/stdarch/crates/std_detect/README.md @@ -0,0 +1,93 @@ +`std::detect` - Rust's standard library run-time CPU feature detection +======= + +The private `std::detect` module implements run-time feature detection in Rust's +standard library. This allows detecting whether the CPU the binary runs on +supports certain features, like SIMD instructions. + +# Usage + +`std::detect` APIs are available as part of `libstd`. Prefer using it via the +standard library than through this crate. Unstable features of `std::detect` are +available on nightly Rust behind various feature-gates. + +If you need run-time feature detection in `#[no_std]` environments, Rust `core` +library cannot help you. By design, Rust `core` is platform independent, but +performing run-time feature detection requires a certain level of cooperation +from the platform. + +You can then manually include `std_detect` as a dependency to get similar +run-time feature detection support than the one offered by Rust's standard +library. We intend to make `std_detect` more flexible and configurable in this +regard to better serve the needs of `#[no_std]` targets. + +# Features + +* `std_detect_dlsym_getauxval` (enabled by default, requires `libc`): Enable to +use `libc::dlsym` to query whether [`getauxval`] is linked into the binary. When +this is not the case, this feature allows other fallback methods to perform +run-time feature detection. When this feature is disabled, `std_detect` assumes +that [`getauxval`] is linked to the binary. If that is not the case the behavior +is undefined. + + Note: This feature is ignored on `*-linux-{gnu,musl,ohos}*` and `*-android*` targets + because we can safely assume `getauxval` is linked to the binary. + * `*-linux-gnu*` targets ([since Rust 1.64](https://blog.rust-lang.org/2022/08/01/Increasing-glibc-kernel-requirements.html)) + have glibc requirements higher than [glibc 2.16 that added `getauxval`](https://sourceware.org/legacy-ml/libc-announce/2012/msg00000.html). + * `*-linux-musl*` targets ([at least since Rust 1.15](https://github.com/rust-lang/rust/blob/1.15.0/src/ci/docker/x86_64-musl/build-musl.sh#L15)) + use musl newer than [musl 1.1.0 that added `getauxval`](https://git.musl-libc.org/cgit/musl/tree/WHATSNEW?h=v1.1.0#n1197) + * `*-linux-ohos*` targets use a [fork of musl 1.2](https://gitee.com/openharmony/docs/blob/master/en/application-dev/reference/native-lib/musl.md) + * `*-android*` targets ([since Rust 1.68](https://blog.rust-lang.org/2023/01/09/android-ndk-update-r25.html)) + have the minimum supported API level higher than [Android 4.3 (API level 18) that added `getauxval`](https://github.com/aosp-mirror/platform_bionic/blob/d3ebc2f7c49a9893b114124d4a6b315f3a328764/libc/include/sys/auxv.h#L49). + +* `std_detect_file_io` (enabled by default, requires `std`): Enable to perform run-time feature +detection using file APIs (e.g. 
`/proc/self/auxv`, etc.) if other more performant +methods fail. This feature requires `libstd` as a dependency, preventing the +crate from working on applications in which `std` is not available. + +[`getauxval`]: https://man7.org/linux/man-pages/man3/getauxval.3.html + +# Platform support + +* All `x86`/`x86_64` targets are supported on all platforms by querying the + `cpuid` instruction directly for the features supported by the hardware and + the operating system. `std_detect` assumes that the binary is an user-space + application. + +* Linux/Android: + * `arm{32, 64}`, `mips{32,64}{,el}`, `powerpc{32,64}{,le}`, `loongarch64`, `s390x`: + `std_detect` supports these on Linux by querying ELF auxiliary vectors (using `getauxval` + when available), and if that fails, by querying `/proc/self/auxv`. + * `arm64`: partial support for doing run-time feature detection by directly + querying `mrs` is implemented for Linux >= 4.11, but not enabled by default. + * `riscv{32,64}`: + `std_detect` supports these on Linux by querying `riscv_hwprobe`, and + by querying ELF auxiliary vectors (using `getauxval` when available). + +* FreeBSD: + * `arm32`, `powerpc64`: `std_detect` supports these on FreeBSD by querying ELF + auxiliary vectors using `sysctl`. + * `arm64`: run-time feature detection is implemented by directly querying `mrs`. + +* OpenBSD: + * `arm64`: run-time feature detection is implemented by querying `sysctl`. + +* Windows: + * `arm64`: run-time feature detection is implemented by querying `IsProcessorFeaturePresent`. + +# License + +This project is licensed under either of + + * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or + http://www.apache.org/licenses/LICENSE-2.0) + * MIT license ([LICENSE-MIT](LICENSE-MIT) or + http://opensource.org/licenses/MIT) + +at your option. + +# Contribution + +Unless you explicitly state otherwise, any contribution intentionally submitted +for inclusion in `std_detect` by you, as defined in the Apache-2.0 license, +shall be dual licensed as above, without any additional terms or conditions. diff --git a/library/stdarch/crates/std_detect/src/detect/arch/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/arch/aarch64.rs new file mode 100644 index 0000000000000..13570a25c1cfe --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/arch/aarch64.rs @@ -0,0 +1,259 @@ +//! Aarch64 run-time features. + +features! { + @TARGET: aarch64; + @CFG: any(target_arch = "aarch64", target_arch = "arm64ec"); + @MACRO_NAME: is_aarch64_feature_detected; + @MACRO_ATTRS: + /// This macro tests, at runtime, whether an `aarch64` feature is enabled on aarch64 platforms. + /// Currently most features are only supported on linux-based platforms. + /// + /// This macro takes one argument which is a string literal of the feature being tested for. + /// The feature names are mostly taken from their FEAT_* definitions in the [ARM Architecture + /// Reference Manual][docs]. 
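+    ///
+    /// A typical use (illustrative) guards feature-dependent code at run time:
+    ///
+    /// ```ignore
+    /// if is_aarch64_feature_detected!("lse") {
+    ///     // FEAT_LSE atomics can be used here.
+    /// } else {
+    ///     // Fall back to a generic implementation.
+    /// }
+    /// ```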
+ /// + /// ## Supported arguments + /// + /// * `"aes"` - FEAT_AES & FEAT_PMULL + /// * `"asimd"` or "neon" - FEAT_AdvSIMD + /// * `"bf16"` - FEAT_BF16 + /// * `"bti"` - FEAT_BTI + /// * `"crc"` - FEAT_CRC + /// * `"cssc"` - FEAT_CSSC + /// * `"dit"` - FEAT_DIT + /// * `"dotprod"` - FEAT_DotProd + /// * `"dpb"` - FEAT_DPB + /// * `"dpb2"` - FEAT_DPB2 + /// * `"ecv"` - FEAT_ECV + /// * `"f32mm"` - FEAT_F32MM + /// * `"f64mm"` - FEAT_F64MM + /// * `"faminmax"` - FEAT_FAMINMAX + /// * `"fcma"` - FEAT_FCMA + /// * `"fhm"` - FEAT_FHM + /// * `"flagm"` - FEAT_FLAGM + /// * `"flagm2"` - FEAT_FLAGM2 + /// * `"fp"` - FEAT_FP + /// * `"fp16"` - FEAT_FP16 + /// * `"fp8"` - FEAT_FP8 + /// * `"fp8dot2"` - FEAT_FP8DOT2 + /// * `"fp8dot4"` - FEAT_FP8DOT4 + /// * `"fp8fma"` - FEAT_FP8FMA + /// * `"fpmr"` - FEAT_FPMR + /// * `"frintts"` - FEAT_FRINTTS + /// * `"hbc"` - FEAT_HBC + /// * `"i8mm"` - FEAT_I8MM + /// * `"jsconv"` - FEAT_JSCVT + /// * `"lse"` - FEAT_LSE + /// * `"lse128"` - FEAT_LSE128 + /// * `"lse2"` - FEAT_LSE2 + /// * `"lut"` - FEAT_LUT + /// * `"mops"` - FEAT_MOPS + /// * `"mte"` - FEAT_MTE & FEAT_MTE2 + /// * `"paca"` - FEAT_PAuth (address authentication) + /// * `"pacg"` - FEAT_Pauth (generic authentication) + /// * `"pauth-lr"` - FEAT_PAuth_LR + /// * `"pmull"` - FEAT_PMULL + /// * `"rand"` - FEAT_RNG + /// * `"rcpc"` - FEAT_LRCPC + /// * `"rcpc2"` - FEAT_LRCPC2 + /// * `"rcpc3"` - FEAT_LRCPC3 + /// * `"rdm"` - FEAT_RDM + /// * `"sb"` - FEAT_SB + /// * `"sha2"` - FEAT_SHA1 & FEAT_SHA256 + /// * `"sha3"` - FEAT_SHA512 & FEAT_SHA3 + /// * `"sm4"` - FEAT_SM3 & FEAT_SM4 + /// * `"sme"` - FEAT_SME + /// * `"sme-b16b16"` - FEAT_SME_B16B16 + /// * `"sme-f16f16"` - FEAT_SME_F16F16 + /// * `"sme-f64f64"` - FEAT_SME_F64F64 + /// * `"sme-f8f16"` - FEAT_SME_F8F16 + /// * `"sme-f8f32"` - FEAT_SME_F8F32 + /// * `"sme-fa64"` - FEAT_SME_FA64 + /// * `"sme-i16i64"` - FEAT_SME_I16I64 + /// * `"sme-lutv2"` - FEAT_SME_LUTv2 + /// * `"sme2"` - FEAT_SME2 + /// * `"sme2p1"` - FEAT_SME2p1 + /// * `"ssbs"` - FEAT_SSBS & FEAT_SSBS2 + /// * `"ssve-fp8dot2"` - FEAT_SSVE_FP8DOT2 + /// * `"ssve-fp8dot4"` - FEAT_SSVE_FP8DOT4 + /// * `"ssve-fp8fma"` - FEAT_SSVE_FP8FMA + /// * `"sve"` - FEAT_SVE + /// * `"sve-b16b16"` - FEAT_SVE_B16B16 (SVE or SME Z-targeting instructions) + /// * `"sve2"` - FEAT_SVE2 + /// * `"sve2-aes"` - FEAT_SVE_AES & FEAT_SVE_PMULL128 (SVE2 AES crypto) + /// * `"sve2-bitperm"` - FEAT_SVE2_BitPerm + /// * `"sve2-sha3"` - FEAT_SVE2_SHA3 + /// * `"sve2-sm4"` - FEAT_SVE2_SM4 + /// * `"sve2p1"` - FEAT_SVE2p1 + /// * `"tme"` - FEAT_TME + /// * `"wfxt"` - FEAT_WFxT + /// + /// [docs]: https://developer.arm.com/documentation/ddi0487/latest + #[stable(feature = "simd_aarch64", since = "1.60.0")] + @BIND_FEATURE_NAME: "asimd"; "neon"; + @NO_RUNTIME_DETECTION: "ras"; + @NO_RUNTIME_DETECTION: "v8.1a"; + @NO_RUNTIME_DETECTION: "v8.2a"; + @NO_RUNTIME_DETECTION: "v8.3a"; + @NO_RUNTIME_DETECTION: "v8.4a"; + @NO_RUNTIME_DETECTION: "v8.5a"; + @NO_RUNTIME_DETECTION: "v8.6a"; + @NO_RUNTIME_DETECTION: "v8.7a"; + @NO_RUNTIME_DETECTION: "v8.8a"; + @NO_RUNTIME_DETECTION: "v8.9a"; + @NO_RUNTIME_DETECTION: "v9.1a"; + @NO_RUNTIME_DETECTION: "v9.2a"; + @NO_RUNTIME_DETECTION: "v9.3a"; + @NO_RUNTIME_DETECTION: "v9.4a"; + @NO_RUNTIME_DETECTION: "v9.5a"; + @NO_RUNTIME_DETECTION: "v9a"; + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] asimd: "neon"; + /// FEAT_AdvSIMD (Advanced SIMD/NEON) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] pmull: "pmull"; + implied by target_features: ["aes"]; + /// 
FEAT_PMULL (Polynomial Multiply) - Implied by `aes` target_feature + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] fp: "fp"; + implied by target_features: ["neon"]; + /// FEAT_FP (Floating point support) - Implied by `neon` target_feature + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] aes: "aes"; + /// FEAT_AES (AES SIMD instructions) & FEAT_PMULL (PMULL{2}, 64-bit operand variants) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] bf16: "bf16"; + /// FEAT_BF16 (BFloat16 type, plus MM instructions, plus ASIMD support) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] bti: "bti"; + /// FEAT_BTI (Branch Target Identification) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] crc: "crc"; + /// FEAT_CRC32 (Cyclic Redundancy Check) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] cssc: "cssc"; + /// FEAT_CSSC (Common Short Sequence Compression instructions) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] dit: "dit"; + /// FEAT_DIT (Data Independent Timing instructions) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] dpb: "dpb"; + /// FEAT_DPB (aka dcpop - data cache clean to point of persistence) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] dpb2: "dpb2"; + /// FEAT_DPB2 (aka dcpodp - data cache clean to point of deep persistence) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] dotprod: "dotprod"; + /// FEAT_DotProd (Vector Dot-Product - ASIMDDP) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] ecv: "ecv"; + /// FEAT_ECV (Enhanced Counter Virtualization) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] f32mm: "f32mm"; + /// FEAT_F32MM (single-precision matrix multiplication) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] f64mm: "f64mm"; + /// FEAT_F64MM (double-precision matrix multiplication) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] faminmax: "faminmax"; + /// FEAT_FAMINMAX (FAMIN and FAMAX SIMD/SVE/SME instructions) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] fcma: "fcma"; + /// FEAT_FCMA (float complex number operations) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] fhm: "fhm"; + /// FEAT_FHM (fp16 multiplication instructions) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] flagm: "flagm"; + /// FEAT_FLAGM (flag manipulation instructions) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] flagm2: "flagm2"; + /// FEAT_FLAGM2 (flag manipulation instructions) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] fp16: "fp16"; + /// FEAT_FP16 (Half-float support) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] fp8: "fp8"; + /// FEAT_FP8 (F8CVT Instructions) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] fp8dot2: "fp8dot2"; + /// FEAT_FP8DOT2 (F8DP2 Instructions) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] fp8dot4: "fp8dot4"; + /// FEAT_FP8DOT4 (F8DP4 Instructions) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] fp8fma: "fp8fma"; + /// FEAT_FP8FMA (F8FMA Instructions) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] fpmr: "fpmr"; + without cfg check: true; + /// 
FEAT_FPMR (Special-purpose AArch64-FPMR register) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] frintts: "frintts"; + /// FEAT_FRINTTS (float to integer rounding instructions) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] hbc: "hbc"; + /// FEAT_HBC (Hinted conditional branches) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] i8mm: "i8mm"; + /// FEAT_I8MM (integer matrix multiplication, plus ASIMD support) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] jsconv: "jsconv"; + /// FEAT_JSCVT (JavaScript float conversion instructions) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] lse: "lse"; + /// FEAT_LSE (Large System Extension - atomics) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] lse128: "lse128"; + /// FEAT_LSE128 (128-bit atomics) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] lse2: "lse2"; + /// FEAT_LSE2 (unaligned and register-pair atomics) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] lut: "lut"; + /// FEAT_LUT (Lookup Table Instructions) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] mops: "mops"; + /// FEAT_MOPS (Standardization of memory operations) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] mte: "mte"; + /// FEAT_MTE & FEAT_MTE2 (Memory Tagging Extension) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] paca: "paca"; + /// FEAT_PAuth (address authentication) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] pacg: "pacg"; + /// FEAT_PAuth (generic authentication) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] pauth_lr: "pauth-lr"; + /// FEAT_PAuth_LR + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] rand: "rand"; + /// FEAT_RNG (Random Number Generator) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] rcpc: "rcpc"; + /// FEAT_LRCPC (Release consistent Processor consistent) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] rcpc2: "rcpc2"; + /// FEAT_LRCPC2 (RCPC with immediate offsets) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] rcpc3: "rcpc3"; + /// FEAT_LRCPC3 (RCPC Instructions v3) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] rdm: "rdm"; + /// FEAT_RDM (Rounding Doubling Multiply - ASIMDRDM) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sb: "sb"; + /// FEAT_SB (speculation barrier) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sha2: "sha2"; + /// FEAT_SHA1 & FEAT_SHA256 (SHA1 & SHA2-256 instructions) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sha3: "sha3"; + /// FEAT_SHA512 & FEAT_SHA3 (SHA2-512 & SHA3 instructions) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sm4: "sm4"; + /// FEAT_SM3 & FEAT_SM4 (SM3 & SM4 instructions) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] sme: "sme"; + /// FEAT_SME (Scalable Matrix Extension) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] sme2: "sme2"; + /// FEAT_SME2 (SME Version 2) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] sme2p1: "sme2p1"; + /// FEAT_SME2p1 (SME Version 2.1) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] 
sme_b16b16: "sme-b16b16"; + /// FEAT_SME_B16B16 + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] sme_f16f16: "sme-f16f16"; + /// FEAT_SME_F16F16 (Non-widening half-precision FP16 to FP16 arithmetic for SME2) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] sme_f64f64: "sme-f64f64"; + /// FEAT_SME_F64F64 (Double-precision floating-point outer product instructions) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] sme_f8f16: "sme-f8f16"; + /// FEAT_SME_F8F16 + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] sme_f8f32: "sme-f8f32"; + /// FEAT_SME_F8F32 + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] sme_fa64: "sme-fa64"; + /// FEAT_SME_FA64 (Full A64 instruction set support in Streaming SVE mode) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] sme_i16i64: "sme-i16i64"; + /// FEAT_SME_I16I64 (16-bit to 64-bit integer widening outer product instructions) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] sme_lutv2: "sme-lutv2"; + /// FEAT_SME_LUTv2 (LUTI4 Instruction) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] ssbs: "ssbs"; + /// FEAT_SSBS & FEAT_SSBS2 (speculative store bypass safe) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] ssve_fp8dot2: "ssve-fp8dot2"; + /// FEAT_SSVE_FP8DOT2 + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] ssve_fp8dot4: "ssve-fp8dot4"; + /// FEAT_SSVE_FP8DOT4 + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] ssve_fp8fma: "ssve-fp8fma"; + /// FEAT_SSVE_FP8FMA + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sve: "sve"; + /// FEAT_SVE (Scalable Vector Extension) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sve2: "sve2"; + /// FEAT_SVE2 (Scalable Vector Extension 2) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] sve2p1: "sve2p1"; + /// FEAT_SVE2p1 (Scalable Vector Extension 2.1) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sve2_aes: "sve2-aes"; + /// FEAT_SVE_AES & FEAT_SVE_PMULL128 (SVE2 AES crypto) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] sve_b16b16: "sve-b16b16"; + /// FEAT_SVE_B16B16 (SVE or SME Z-targeting instructions) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sve2_bitperm: "sve2-bitperm"; + /// FEAT_SVE_BitPerm (SVE2 bit permutation instructions) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sve2_sha3: "sve2-sha3"; + /// FEAT_SVE_SHA3 (SVE2 SHA3 crypto) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sve2_sm4: "sve2-sm4"; + /// FEAT_SVE_SM4 (SVE2 SM4 crypto) + @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] tme: "tme"; + /// FEAT_TME (Transactional Memory Extensions) + @FEATURE: #[unstable(feature = "stdarch_aarch64_feature_detection", issue = "127764")] wfxt: "wfxt"; + /// FEAT_WFxT (WFET and WFIT Instructions) +} diff --git a/library/stdarch/crates/std_detect/src/detect/arch/arm.rs b/library/stdarch/crates/std_detect/src/detect/arch/arm.rs new file mode 100644 index 0000000000000..c3c8883ce3153 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/arch/arm.rs @@ -0,0 +1,29 @@ +//! 
Run-time feature detection on ARM Aarch32. + +features! { + @TARGET: arm; + @CFG: target_arch = "arm"; + @MACRO_NAME: is_arm_feature_detected; + @MACRO_ATTRS: + /// Checks if `arm` feature is enabled. + #[unstable(feature = "stdarch_arm_feature_detection", issue = "111190")] + @NO_RUNTIME_DETECTION: "v7"; + @NO_RUNTIME_DETECTION: "vfp2"; + @NO_RUNTIME_DETECTION: "vfp3"; + @NO_RUNTIME_DETECTION: "vfp4"; + @FEATURE: #[unstable(feature = "stdarch_arm_feature_detection", issue = "111190")] neon: "neon"; + /// ARM Advanced SIMD (NEON) - Aarch32 + @FEATURE: #[unstable(feature = "stdarch_arm_feature_detection", issue = "111190")] pmull: "pmull"; + without cfg check: true; + /// Polynomial Multiply + @FEATURE: #[unstable(feature = "stdarch_arm_feature_detection", issue = "111190")] crc: "crc"; + /// CRC32 (Cyclic Redundancy Check) + @FEATURE: #[unstable(feature = "stdarch_arm_feature_detection", issue = "111190")] aes: "aes"; + /// FEAT_AES (AES instructions) + @FEATURE: #[unstable(feature = "stdarch_arm_feature_detection", issue = "111190")] sha2: "sha2"; + /// FEAT_SHA1 & FEAT_SHA256 (SHA1 & SHA2-256 instructions) + @FEATURE: #[unstable(feature = "stdarch_arm_feature_detection", issue = "111190")] i8mm: "i8mm"; + /// FEAT_I8MM (integer matrix multiplication, plus ASIMD support) + @FEATURE: #[unstable(feature = "stdarch_arm_feature_detection", issue = "111190")] dotprod: "dotprod"; + /// FEAT_DotProd (Vector Dot-Product - ASIMDDP) +} diff --git a/library/stdarch/crates/std_detect/src/detect/arch/loongarch.rs b/library/stdarch/crates/std_detect/src/detect/arch/loongarch.rs new file mode 100644 index 0000000000000..e9d68f6a9bf7f --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/arch/loongarch.rs @@ -0,0 +1,51 @@ +//! Run-time feature detection on LoongArch. + +features! { + @TARGET: loongarch; + @CFG: target_arch = "loongarch64"; + @MACRO_NAME: is_loongarch_feature_detected; + @MACRO_ATTRS: + /// Checks if `loongarch` feature is enabled. 
+ /// Supported arguments are: + /// + /// * `"f"` + /// * `"d"` + /// * `"frecipe"` + /// * `"div32"` + /// * `"lsx"` + /// * `"lasx"` + /// * `"lam-bh"` + /// * `"lamcas"` + /// * `"ld-seq-sa"` + /// * `"scq"` + /// * `"lbt"` + /// * `"lvz"` + /// * `"ual"` + #[stable(feature = "stdarch_loongarch_feature", since = "1.89.0")] + @FEATURE: #[stable(feature = "stdarch_loongarch_feature", since = "1.89.0")] f: "f"; + /// F + @FEATURE: #[stable(feature = "stdarch_loongarch_feature", since = "1.89.0")] d: "d"; + /// D + @FEATURE: #[stable(feature = "stdarch_loongarch_feature", since = "1.89.0")] frecipe: "frecipe"; + /// Frecipe + @FEATURE: #[unstable(feature = "stdarch_loongarch_feature_detection", issue = "117425")] div32: "div32"; + /// Div32 + @FEATURE: #[stable(feature = "stdarch_loongarch_feature", since = "1.89.0")] lsx: "lsx"; + /// LSX + @FEATURE: #[stable(feature = "stdarch_loongarch_feature", since = "1.89.0")] lasx: "lasx"; + /// LASX + @FEATURE: #[unstable(feature = "stdarch_loongarch_feature_detection", issue = "117425")] lam_bh: "lam-bh"; + /// LAM-BH + @FEATURE: #[unstable(feature = "stdarch_loongarch_feature_detection", issue = "117425")] lamcas: "lamcas"; + /// LAM-CAS + @FEATURE: #[unstable(feature = "stdarch_loongarch_feature_detection", issue = "117425")] ld_seq_sa: "ld-seq-sa"; + /// LD-SEQ-SA + @FEATURE: #[unstable(feature = "stdarch_loongarch_feature_detection", issue = "117425")] scq: "scq"; + /// SCQ + @FEATURE: #[stable(feature = "stdarch_loongarch_feature", since = "1.89.0")] lbt: "lbt"; + /// LBT + @FEATURE: #[stable(feature = "stdarch_loongarch_feature", since = "1.89.0")] lvz: "lvz"; + /// LVZ + @FEATURE: #[unstable(feature = "stdarch_loongarch_feature_detection", issue = "117425")] ual: "ual"; + /// UAL +} diff --git a/library/stdarch/crates/std_detect/src/detect/arch/mips.rs b/library/stdarch/crates/std_detect/src/detect/arch/mips.rs new file mode 100644 index 0000000000000..e185fdfcaac6c --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/arch/mips.rs @@ -0,0 +1,12 @@ +//! Run-time feature detection on MIPS. + +features! { + @TARGET: mips; + @CFG: target_arch = "mips"; + @MACRO_NAME: is_mips_feature_detected; + @MACRO_ATTRS: + /// Checks if `mips` feature is enabled. + #[unstable(feature = "stdarch_mips_feature_detection", issue = "111188")] + @FEATURE: #[unstable(feature = "stdarch_mips_feature_detection", issue = "111188")] msa: "msa"; + /// MIPS SIMD Architecture (MSA) +} diff --git a/library/stdarch/crates/std_detect/src/detect/arch/mips64.rs b/library/stdarch/crates/std_detect/src/detect/arch/mips64.rs new file mode 100644 index 0000000000000..69fe4869d30eb --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/arch/mips64.rs @@ -0,0 +1,12 @@ +//! Run-time feature detection on MIPS64. + +features! { + @TARGET: mips64; + @CFG: target_arch = "mips64"; + @MACRO_NAME: is_mips64_feature_detected; + @MACRO_ATTRS: + /// Checks if `mips64` feature is enabled. + #[unstable(feature = "stdarch_mips_feature_detection", issue = "111188")] + @FEATURE: #[unstable(feature = "stdarch_mips_feature_detection", issue = "111188")] msa: "msa"; + /// MIPS SIMD Architecture (MSA) +} diff --git a/library/stdarch/crates/std_detect/src/detect/arch/mod.rs b/library/stdarch/crates/std_detect/src/detect/arch/mod.rs new file mode 100644 index 0000000000000..d5a13acc02826 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/arch/mod.rs @@ -0,0 +1,75 @@ +#![allow(dead_code)] + +use cfg_if::cfg_if; + +// Export the macros for all supported architectures. 
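Each architecture module pulled in below defines one `is_*_feature_detected!` macro; the `cfg_if!` chain at the end of this file re-exports the `Feature` plumbing for the architecture actually being compiled, while the macros for other architectures expand to a compile error. That is why downstream callers wrap each invocation in a matching `cfg`. A minimal sketch of such a caller (the `describe_simd_support` helper is hypothetical; `is_x86_feature_detected!` and `std::arch::is_aarch64_feature_detected!` are the real macro names produced by these modules):

```rust
// Hypothetical caller: reports which SIMD level a dispatcher would pick.
// Each macro only works on its own architecture, hence the cfg guards.
fn describe_simd_support() -> &'static str {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("avx2") {
            return "x86 with AVX2";
        }
        if is_x86_feature_detected!("sse4.2") {
            return "x86 with SSE4.2";
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        if std::arch::is_aarch64_feature_detected!("sve") {
            return "aarch64 with SVE";
        }
    }
    // Portable fallback: only the statically enabled target features apply.
    "baseline target features only"
}

fn main() {
    println!("{}", describe_simd_support());
}
```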
+#[macro_use] +mod x86; +#[macro_use] +mod arm; +#[macro_use] +mod aarch64; +#[macro_use] +mod riscv; +#[macro_use] +mod powerpc; +#[macro_use] +mod powerpc64; +#[macro_use] +mod mips; +#[macro_use] +mod mips64; +#[macro_use] +mod loongarch; +#[macro_use] +mod s390x; + +cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + #[stable(feature = "simd_x86", since = "1.27.0")] + pub use x86::*; + } else if #[cfg(target_arch = "arm")] { + #[unstable(feature = "stdarch_arm_feature_detection", issue = "111190")] + pub use arm::*; + } else if #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] { + #[stable(feature = "simd_aarch64", since = "1.60.0")] + pub use aarch64::*; + } else if #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] { + #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] + pub use riscv::*; + } else if #[cfg(target_arch = "powerpc")] { + #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] + pub use powerpc::*; + } else if #[cfg(target_arch = "powerpc64")] { + #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] + pub use powerpc64::*; + } else if #[cfg(target_arch = "mips")] { + #[unstable(feature = "stdarch_mips_feature_detection", issue = "111188")] + pub use mips::*; + } else if #[cfg(target_arch = "mips64")] { + #[unstable(feature = "stdarch_mips_feature_detection", issue = "111188")] + pub use mips64::*; + } else if #[cfg(target_arch = "loongarch64")] { + #[stable(feature = "stdarch_loongarch_feature", since = "1.89.0")] + pub use loongarch::*; + } else if #[cfg(target_arch = "s390x")] { + #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] + pub use s390x::*; + } else { + // Unimplemented architecture: + #[doc(hidden)] + pub(crate) enum Feature { + Null + } + #[doc(hidden)] + #[unstable(feature = "stdarch_internal", issue = "none")] + pub mod __is_feature_detected {} + + impl Feature { + #[doc(hidden)] + pub(crate) fn from_str(_s: &str) -> Result<Feature, ()> { Err(()) } + #[doc(hidden)] + pub(crate) fn to_str(self) -> &'static str { "" } + } + } +} diff --git a/library/stdarch/crates/std_detect/src/detect/arch/powerpc.rs b/library/stdarch/crates/std_detect/src/detect/arch/powerpc.rs new file mode 100644 index 0000000000000..c390993a48a69 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/arch/powerpc.rs @@ -0,0 +1,30 @@ +//! Run-time feature detection on PowerPC. + +features! { + @TARGET: powerpc; + @CFG: target_arch = "powerpc"; + @MACRO_NAME: is_powerpc_feature_detected; + @MACRO_ATTRS: + /// Checks if `powerpc` feature is enabled.
+ #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] altivec: "altivec"; + /// Altivec + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] vsx: "vsx"; + /// VSX + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] power8: "power8"; + without cfg check: true; + /// Power8 + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] power8_altivec: "power8-altivec"; + /// Power8 altivec + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] power8_vector: "power8-vector"; + /// Power8 vector + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] power8_crypto: "power8-crypto"; + /// Power8 crypto + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] power9: "power9"; + without cfg check: true; + /// Power9 + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] power9_altivec: "power9-altivec"; + /// Power9 altivec + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] power9_vector: "power9-vector"; + /// Power9 vector +} diff --git a/library/stdarch/crates/std_detect/src/detect/arch/powerpc64.rs b/library/stdarch/crates/std_detect/src/detect/arch/powerpc64.rs new file mode 100644 index 0000000000000..cf05baa6f799e --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/arch/powerpc64.rs @@ -0,0 +1,30 @@ +//! Run-time feature detection on PowerPC64. + +features! { + @TARGET: powerpc64; + @CFG: target_arch = "powerpc64"; + @MACRO_NAME: is_powerpc64_feature_detected; + @MACRO_ATTRS: + /// Checks if `powerpc` feature is enabled. + #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] altivec: "altivec"; + /// Altivec + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] vsx: "vsx"; + /// VSX + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] power8: "power8"; + without cfg check: true; + /// Power8 + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] power8_altivec: "power8-altivec"; + /// Power8 altivec + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] power8_vector: "power8-vector"; + /// Power8 vector + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] power8_crypto: "power8-crypto"; + /// Power8 crypto + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] power9: "power9"; + without cfg check: true; + /// Power9 + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] power9_altivec: "power9-altivec"; + /// Power9 altivec + @FEATURE: #[unstable(feature = "stdarch_powerpc_feature_detection", issue = "111191")] power9_vector: "power9-vector"; + /// Power9 vector +} diff --git a/library/stdarch/crates/std_detect/src/detect/arch/riscv.rs b/library/stdarch/crates/std_detect/src/detect/arch/riscv.rs new file mode 100644 index 0000000000000..b86190d7bbf0c --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/arch/riscv.rs @@ -0,0 +1,344 @@ +//! Run-time feature detection on RISC-V. + +features! 
{ + @TARGET: riscv; + @CFG: any(target_arch = "riscv32", target_arch = "riscv64"); + @MACRO_NAME: is_riscv_feature_detected; + @MACRO_ATTRS: + /// A macro to test at *runtime* whether instruction sets are available on + /// RISC-V platforms. + /// + /// RISC-V standard defined the base sets and the extension sets. + /// The base sets are RV32I, RV64I, RV32E or RV128I. Any RISC-V platform + /// must support one base set and/or multiple extension sets. + /// + /// Any RISC-V standard instruction sets can be in state of either ratified, + /// frozen or draft. The version and status of current standard instruction + /// sets can be checked out from preface section of the [ISA manual]. + /// + /// Platform may define and support their own custom instruction sets with + /// ISA prefix X. These sets are highly platform specific and should be + /// detected with their own platform support crates. + /// + /// [ISA manual]: https://riscv.org/specifications/ratified/ + /// + /// # Platform-specific/agnostic Behavior and Availability + /// + /// Runtime detection depends on the platform-specific feature detection + /// facility and its availability per feature is + /// highly platform/version-specific. + /// + /// Still, a best-effort attempt is performed to enable subset/dependent + /// features if a superset feature is enabled regardless of the platform. + /// For instance, if the A extension (`"a"`) is enabled, its subsets (the + /// Zalrsc and Zaamo extensions; `"zalrsc"` and `"zaamo"`) are also enabled. + /// Likewise, if the F extension (`"f"`) is enabled, one of its dependencies + /// (the Zicsr extension `"zicsr"`) is also enabled. + /// + /// # Unprivileged Specification + /// + /// The supported ratified RISC-V instruction sets are as follows: + /// + /// * RV32E: `"rv32e"` + /// * RV32I: `"rv32i"` + /// * RV64I: `"rv64i"` + /// * A: `"a"` + /// * Zaamo: `"zaamo"` + /// * Zalrsc: `"zalrsc"` + /// * B: `"b"` + /// * Zba: `"zba"` + /// * Zbb: `"zbb"` + /// * Zbs: `"zbs"` + /// * C: `"c"` + /// * Zca: `"zca"` + /// * Zcd: `"zcd"` (if D is enabled) + /// * Zcf: `"zcf"` (if F is enabled on RV32) + /// * D: `"d"` + /// * F: `"f"` + /// * M: `"m"` + /// * Q: `"q"` + /// * V: `"v"` + /// * Zve32x: `"zve32x"` + /// * Zve32f: `"zve32f"` + /// * Zve64x: `"zve64x"` + /// * Zve64f: `"zve64f"` + /// * Zve64d: `"zve64d"` + /// * Zicbom: `"zicbom"` + /// * Zicboz: `"zicboz"` + /// * Zicntr: `"zicntr"` + /// * Zicond: `"zicond"` + /// * Zicsr: `"zicsr"` + /// * Zifencei: `"zifencei"` + /// * Zihintntl: `"zihintntl"` + /// * Zihintpause: `"zihintpause"` + /// * Zihpm: `"zihpm"` + /// * Zimop: `"zimop"` + /// * Zacas: `"zacas"` + /// * Zawrs: `"zawrs"` + /// * Zfa: `"zfa"` + /// * Zfbfmin: `"zfbfmin"` + /// * Zfh: `"zfh"` + /// * Zfhmin: `"zfhmin"` + /// * Zfinx: `"zfinx"` + /// * Zdinx: `"zdinx"` + /// * Zhinx: `"zhinx"` + /// * Zhinxmin: `"zhinxmin"` + /// * Zcb: `"zcb"` + /// * Zcmop: `"zcmop"` + /// * Zbc: `"zbc"` + /// * Zbkb: `"zbkb"` + /// * Zbkc: `"zbkc"` + /// * Zbkx: `"zbkx"` + /// * Zk: `"zk"` + /// * Zkn: `"zkn"` + /// * Zknd: `"zknd"` + /// * Zkne: `"zkne"` + /// * Zknh: `"zknh"` + /// * Zkr: `"zkr"` + /// * Zks: `"zks"` + /// * Zksed: `"zksed"` + /// * Zksh: `"zksh"` + /// * Zkt: `"zkt"` + /// * Zvbb: `"zvbb"` + /// * Zvbc: `"zvbc"` + /// * Zvfbfmin: `"zvfbfmin"` + /// * Zvfbfwma: `"zvfbfwma"` + /// * Zvfh: `"zvfh"` + /// * Zvfhmin: `"zvfhmin"` + /// * Zvkb: `"zvkb"` + /// * Zvkg: `"zvkg"` + /// * Zvkn: `"zvkn"` + /// * Zvkned: `"zvkned"` + /// * Zvknha: `"zvknha"` + /// * Zvknhb: `"zvknhb"` + 
/// * Zvknc: `"zvknc"` + /// * Zvkng: `"zvkng"` + /// * Zvks: `"zvks"` + /// * Zvksed: `"zvksed"` + /// * Zvksh: `"zvksh"` + /// * Zvksc: `"zvksc"` + /// * Zvksg: `"zvksg"` + /// * Zvkt: `"zvkt"` + /// * Ztso: `"ztso"` + /// + /// There's also bases and extensions marked as standard instruction set, + /// but they are in frozen or draft state. These instruction sets are also + /// reserved by this macro and can be detected in the future platforms. + /// + /// Draft RISC-V instruction sets: + /// + /// * RV128I: `"rv128i"` + /// * J: `"j"` + /// * P: `"p"` + /// * Zam: `"zam"` + /// + /// # Performance Hints + /// + /// The two features below define performance hints for unaligned + /// scalar/vector memory accesses, respectively. If enabled, it denotes that + /// corresponding unaligned memory access is reasonably fast. + /// + /// * `"unaligned-scalar-mem"` + /// * Runtime detection requires Linux kernel version 6.4 or later. + /// * `"unaligned-vector-mem"` + /// * Runtime detection requires Linux kernel version 6.13 or later. + #[stable(feature = "riscv_ratified", since = "1.78.0")] + + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] rv32i: "rv32i"; + without cfg check: true; + /// RV32I Base Integer Instruction Set + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] rv32e: "rv32e"; + without cfg check: true; + /// RV32E Base Integer Instruction Set + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] rv64i: "rv64i"; + without cfg check: true; + /// RV64I Base Integer Instruction Set + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] rv128i: "rv128i"; + without cfg check: true; + /// RV128I Base Integer Instruction Set + + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] unaligned_scalar_mem: "unaligned-scalar-mem"; + /// Has reasonably performant unaligned scalar + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] unaligned_vector_mem: "unaligned-vector-mem"; + /// Has reasonably performant unaligned vector + + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zicsr: "zicsr"; + /// "Zicsr" Extension for Control and Status Register (CSR) Instructions + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zicntr: "zicntr"; + /// "Zicntr" Extension for Base Counters and Timers + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zihpm: "zihpm"; + /// "Zihpm" Extension for Hardware Performance Counters + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zifencei: "zifencei"; + /// "Zifencei" Extension for Instruction-Fetch Fence + + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zihintntl: "zihintntl"; + /// "Zihintntl" Extension for Non-Temporal Locality Hints + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zihintpause: "zihintpause"; + /// "Zihintpause" Extension for Pause Hint + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zimop: "zimop"; + /// "Zimop" Extension for May-Be-Operations + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zicbom: "zicbom"; + /// "Zicbom" Extension for Cache-Block Management Instructions + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = 
"111192")] zicboz: "zicboz"; + /// "Zicboz" Extension for Cache-Block Zero Instruction + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zicond: "zicond"; + /// "Zicond" Extension for Integer Conditional Operations + + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] m: "m"; + /// "M" Extension for Integer Multiplication and Division + + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] a: "a"; + /// "A" Extension for Atomic Instructions + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zalrsc: "zalrsc"; + /// "Zalrsc" Extension for Load-Reserved/Store-Conditional Instructions + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zaamo: "zaamo"; + /// "Zaamo" Extension for Atomic Memory Operations + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zawrs: "zawrs"; + /// "Zawrs" Extension for Wait-on-Reservation-Set Instructions + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zacas: "zacas"; + /// "Zacas" Extension for Atomic Compare-and-Swap (CAS) Instructions + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zam: "zam"; + without cfg check: true; + /// "Zam" Extension for Misaligned Atomics + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] ztso: "ztso"; + /// "Ztso" Extension for Total Store Ordering + + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] f: "f"; + /// "F" Extension for Single-Precision Floating-Point + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] d: "d"; + /// "D" Extension for Double-Precision Floating-Point + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] q: "q"; + without cfg check: true; + /// "Q" Extension for Quad-Precision Floating-Point + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zfh: "zfh"; + /// "Zfh" Extension for Half-Precision Floating-Point + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zfhmin: "zfhmin"; + /// "Zfhmin" Extension for Minimal Half-Precision Floating-Point + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zfa: "zfa"; + /// "Zfa" Extension for Additional Floating-Point Instructions + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zfbfmin: "zfbfmin"; + /// "Zfbfmin" Extension for Scalar BF16 Converts + + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zfinx: "zfinx"; + /// "Zfinx" Extension for Single-Precision Floating-Point in Integer Registers + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zdinx: "zdinx"; + /// "Zdinx" Extension for Double-Precision Floating-Point in Integer Registers + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zhinx: "zhinx"; + /// "Zhinx" Extension for Half-Precision Floating-Point in Integer Registers + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zhinxmin: "zhinxmin"; + /// "Zhinxmin" Extension for Minimal Half-Precision Floating-Point in Integer Registers + + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] c: "c"; + /// "C" Extension for Compressed Instructions + @FEATURE: #[unstable(feature = 
"stdarch_riscv_feature_detection", issue = "111192")] zca: "zca"; + /// "Zca" Compressed Instructions excluding Floating-Point Loads/Stores + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zcf: "zcf"; + without cfg check: true; + /// "Zcf" Compressed Instructions for Single-Precision Floating-Point Loads/Stores on RV32 + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zcd: "zcd"; + without cfg check: true; + /// "Zcd" Compressed Instructions for Double-Precision Floating-Point Loads/Stores + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zcb: "zcb"; + /// "Zcb" Simple Code-size Saving Compressed Instructions + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zcmop: "zcmop"; + /// "Zcmop" Extension for Compressed May-Be-Operations + + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] b: "b"; + /// "B" Extension for Bit Manipulation + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zba: "zba"; + /// "Zba" Extension for Address Generation + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zbb: "zbb"; + /// "Zbb" Extension for Basic Bit-Manipulation + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zbc: "zbc"; + /// "Zbc" Extension for Carry-less Multiplication + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zbs: "zbs"; + /// "Zbs" Extension for Single-Bit Instructions + + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zbkb: "zbkb"; + /// "Zbkb" Extension for Bit-Manipulation for Cryptography + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zbkc: "zbkc"; + /// "Zbkc" Extension for Carry-less Multiplication for Cryptography + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zbkx: "zbkx"; + /// "Zbkx" Extension for Crossbar Permutations + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zknd: "zknd"; + /// "Zknd" Cryptography Extension for NIST Suite: AES Decryption + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zkne: "zkne"; + /// "Zkne" Cryptography Extension for NIST Suite: AES Encryption + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zknh: "zknh"; + /// "Zknh" Cryptography Extension for NIST Suite: Hash Function Instructions + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zksed: "zksed"; + /// "Zksed" Cryptography Extension for ShangMi Suite: SM4 Block Cipher Instructions + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zksh: "zksh"; + /// "Zksh" Cryptography Extension for ShangMi Suite: SM3 Hash Function Instructions + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zkr: "zkr"; + /// "Zkr" Entropy Source Extension + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zkn: "zkn"; + /// "Zkn" Cryptography Extension for NIST Algorithm Suite + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zks: "zks"; + /// "Zks" Cryptography Extension for ShangMi Algorithm Suite + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zk: "zk"; + /// "Zk" Cryptography Extension for Standard Scalar Cryptography + @FEATURE: #[stable(feature = "riscv_ratified", since = "1.78.0")] zkt: "zkt"; + /// "Zkt" Cryptography Extension for Data Independent Execution Latency + + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = 
"111192")] v: "v"; + /// "V" Extension for Vector Operations + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zve32x: "zve32x"; + /// "Zve32x" Vector Extension for Embedded Processors (32-bit+; Integer) + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zve32f: "zve32f"; + /// "Zve32f" Vector Extension for Embedded Processors (32-bit+; with Single-Precision Floating-Point) + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zve64x: "zve64x"; + /// "Zve64x" Vector Extension for Embedded Processors (64-bit+; Integer) + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zve64f: "zve64f"; + /// "Zve64f" Vector Extension for Embedded Processors (64-bit+; with Single-Precision Floating-Point) + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zve64d: "zve64d"; + /// "Zve64d" Vector Extension for Embedded Processors (64-bit+; with Double-Precision Floating-Point) + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvfh: "zvfh"; + /// "Zvfh" Vector Extension for Half-Precision Floating-Point + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvfhmin: "zvfhmin"; + /// "Zvfhmin" Vector Extension for Minimal Half-Precision Floating-Point + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvfbfmin: "zvfbfmin"; + /// "Zvfbfmin" Vector Extension for BF16 Converts + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvfbfwma: "zvfbfwma"; + /// "Zvfbfwma" Vector Extension for BF16 Widening Multiply-Add + + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvbb: "zvbb"; + /// "Zvbb" Extension for Vector Basic Bit-Manipulation + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvbc: "zvbc"; + /// "Zvbc" Extension for Vector Carryless Multiplication + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvkb: "zvkb"; + /// "Zvkb" Extension for Vector Cryptography Bit-Manipulation + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvkg: "zvkg"; + /// "Zvkg" Cryptography Extension for Vector GCM/GMAC + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvkned: "zvkned"; + /// "Zvkned" Cryptography Extension for NIST Suite: Vector AES Block Cipher + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvknha: "zvknha"; + /// "Zvknha" Cryptography Extension for Vector SHA-2 Secure Hash (SHA-256) + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvknhb: "zvknhb"; + /// "Zvknhb" Cryptography Extension for Vector SHA-2 Secure Hash (SHA-256/512) + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvksed: "zvksed"; + /// "Zvksed" Cryptography Extension for ShangMi Suite: Vector SM4 Block Cipher + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvksh: "zvksh"; + /// "Zvksh" Cryptography Extension for ShangMi Suite: Vector SM3 Secure Hash + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvkn: "zvkn"; + /// "Zvkn" Cryptography Extension for NIST Algorithm Suite + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvknc: 
"zvknc"; + /// "Zvknc" Cryptography Extension for NIST Algorithm Suite with Carryless Multiply + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvkng: "zvkng"; + /// "Zvkng" Cryptography Extension for NIST Algorithm Suite with GCM + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvks: "zvks"; + /// "Zvks" Cryptography Extension for ShangMi Algorithm Suite + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvksc: "zvksc"; + /// "Zvksc" Cryptography Extension for ShangMi Algorithm Suite with Carryless Multiply + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvksg: "zvksg"; + /// "Zvksg" Cryptography Extension for ShangMi Algorithm Suite with GCM + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] zvkt: "zvkt"; + /// "Zvkt" Extension for Vector Data-Independent Execution Latency + + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] j: "j"; + without cfg check: true; + /// "J" Extension for Dynamically Translated Languages + @FEATURE: #[unstable(feature = "stdarch_riscv_feature_detection", issue = "111192")] p: "p"; + without cfg check: true; + /// "P" Extension for Packed-SIMD Instructions +} diff --git a/library/stdarch/crates/std_detect/src/detect/arch/s390x.rs b/library/stdarch/crates/std_detect/src/detect/arch/s390x.rs new file mode 100644 index 0000000000000..812607c6bcde9 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/arch/s390x.rs @@ -0,0 +1,45 @@ +//! Run-time feature detection on s390x. + +features! { + @TARGET: s390x; + @CFG: target_arch = "s390x"; + @MACRO_NAME: is_s390x_feature_detected; + @MACRO_ATTRS: + /// Checks if `s390x` feature is enabled. 
+ #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] + @FEATURE: #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] deflate_conversion: "deflate-conversion"; + /// s390x deflate-conversion facility + #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] + @FEATURE: #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] enhanced_sort: "enhanced-sort"; + /// s390x enhanced-sort facility + #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] + @FEATURE: #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] guarded_storage: "guarded-storage"; + /// s390x guarded-storage facility + #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] + @FEATURE: #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] high_word: "high-word"; + /// s390x high-word facility + #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] + @FEATURE: #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] nnp_assist: "nnp-assist"; + /// s390x nnp-assist facility + #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] + @FEATURE: #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] transactional_execution: "transactional-execution"; + /// s390x transactional-execution facility + #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] + @FEATURE: #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] vector: "vector"; + /// s390x vector facility + #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] + @FEATURE: #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] vector_enhancements_1: "vector-enhancements-1"; + /// s390x vector-enhancements-1 facility + #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] + @FEATURE: #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] vector_enhancements_2: "vector-enhancements-2"; + /// s390x vector-enhancements-2 facility + #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] + @FEATURE: #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] vector_packed_decimal: "vector-packed-decimal"; + /// s390x vector-packed-decimal facility + #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] + @FEATURE: #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] vector_packed_decimal_enhancement: "vector-packed-decimal-enhancement"; + /// s390x vector-packed-decimal-enhancement facility + #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] + @FEATURE: #[unstable(feature = "stdarch_s390x_feature_detection", issue = "135413")] vector_packed_decimal_enhancement_2: "vector-packed-decimal-enhancement-2"; + /// s390x vector-packed-decimal-enhancement-2 facility +} diff --git a/library/stdarch/crates/std_detect/src/detect/arch/x86.rs b/library/stdarch/crates/std_detect/src/detect/arch/x86.rs new file mode 100644 index 0000000000000..f23cfc334170f --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/arch/x86.rs @@ -0,0 +1,278 @@ +//! This module implements minimal run-time feature detection for x86. +//! +//! The features are detected using the `detect_features` function below. +//! This function uses the CPUID instruction to read the feature flags from the +//! 
CPU and encodes them in a `usize` where each bit position represents +//! whether a feature is available (bit is set) or unavailable (bit is cleared). +//! +//! The enum `Feature` is used to map bit positions to feature names, and the +//! `__crate::detect::check_for!` macro is used to map string literals (e.g., +//! "avx") to these bit positions (e.g., `Feature::avx`). +//! +//! The run-time feature detection is performed by the +//! `__crate::detect::check_for(Feature) -> bool` function. On its first call, +//! this function queries the CPU for the available features and stores them +//! in a global `AtomicUsize` variable. The query is performed by just checking +//! whether the feature bit in this global variable is set or cleared. + +features! { + @TARGET: x86; + @CFG: any(target_arch = "x86", target_arch = "x86_64"); + @MACRO_NAME: is_x86_feature_detected; + @MACRO_ATTRS: + /// A macro to test at *runtime* whether a CPU feature is available on + /// x86/x86-64 platforms. + /// + /// This macro is provided in the standard library and will detect at runtime + /// whether the specified CPU feature is available. This does **not** resolve at + /// compile time unless the specified feature is already enabled for the entire + /// crate. Runtime detection currently relies mostly on the `cpuid` instruction. + /// + /// This macro only takes one argument, which is a string literal of the feature + /// being tested for. The feature names supported are the lowercase versions of + /// the ones defined by Intel in [their documentation][docs]. + /// + /// ## Supported arguments + /// + /// This macro supports the same names that `#[target_feature]` supports. Unlike + /// `#[target_feature]`, however, this macro does not support names separated + /// with a comma. Instead, testing for multiple features must be done through + /// separate macro invocations for now.
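A concrete illustration of the one-feature-per-invocation rule above: a caller that needs both AVX2 and FMA issues two separate checks and combines the results itself (sketch only; the helper name is invented):

```rust
// Hypothetical helper: each feature string gets its own macro invocation,
// and the boolean results are combined by the caller.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn cpu_has_avx2_and_fma() -> bool {
    is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma")
}
```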
+ /// + /// Supported arguments are: + /// + /// * `"aes"` + /// * `"pclmulqdq"` + /// * `"rdrand"` + /// * `"rdseed"` + /// * `"tsc"` + /// * `"mmx"` + /// * `"sse"` + /// * `"sse2"` + /// * `"sse3"` + /// * `"ssse3"` + /// * `"sse4.1"` + /// * `"sse4.2"` + /// * `"sse4a"` + /// * `"sha"` + /// * `"avx"` + /// * `"avx2"` + /// * `"sha512"` + /// * `"sm3"` + /// * `"sm4"` + /// * `"avx512f"` + /// * `"avx512cd"` + /// * `"avx512er"` + /// * `"avx512pf"` + /// * `"avx512bw"` + /// * `"avx512dq"` + /// * `"avx512vl"` + /// * `"avx512ifma"` + /// * `"avx512vbmi"` + /// * `"avx512vpopcntdq"` + /// * `"avx512vbmi2"` + /// * `"gfni"` + /// * `"vaes"` + /// * `"vpclmulqdq"` + /// * `"avx512vnni"` + /// * `"avx512bitalg"` + /// * `"avx512bf16"` + /// * `"avx512vp2intersect"` + /// * `"avx512fp16"` + /// * `"avxvnni"` + /// * `"avxifma"` + /// * `"avxneconvert"` + /// * `"avxvnniint8"` + /// * `"avxvnniint16"` + /// * `"amx-tile"` + /// * `"amx-int8"` + /// * `"amx-bf16"` + /// * `"amx-fp16"` + /// * `"amx-complex"` + /// * `"amx-avx512"` + /// * `"amx-fp8"` + /// * `"amx-movrs"` + /// * `"amx-tf32"` + /// * `"amx-transpose"` + /// * `"f16c"` + /// * `"fma"` + /// * `"bmi1"` + /// * `"bmi2"` + /// * `"abm"` + /// * `"lzcnt"` + /// * `"tbm"` + /// * `"popcnt"` + /// * `"fxsr"` + /// * `"xsave"` + /// * `"xsaveopt"` + /// * `"xsaves"` + /// * `"xsavec"` + /// * `"cmpxchg16b"` + /// * `"kl"` + /// * `"widekl"` + /// * `"adx"` + /// * `"rtm"` + /// * `"movbe"` + /// * `"ermsb"` + /// * `"movrs"` + /// * `"xop"` + /// + /// [docs]: https://software.intel.com/sites/landingpage/IntrinsicsGuide + #[stable(feature = "simd_x86", since = "1.27.0")] + @BIND_FEATURE_NAME: "abm"; "lzcnt"; // abm is a synonym for lzcnt + @BIND_FEATURE_NAME: "avx512gfni"; "gfni"; #[deprecated(since = "1.67.0", note = "the `avx512gfni` feature has been renamed to `gfni`")]; + @BIND_FEATURE_NAME: "avx512vaes"; "vaes"; #[deprecated(since = "1.67.0", note = "the `avx512vaes` feature has been renamed to `vaes`")]; + @BIND_FEATURE_NAME: "avx512vpclmulqdq"; "vpclmulqdq"; #[deprecated(since = "1.67.0", note = "the `avx512vpclmulqdq` feature has been renamed to `vpclmulqdq`")]; + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] aes: "aes"; + /// AES (Advanced Encryption Standard New Instructions AES-NI) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] pclmulqdq: "pclmulqdq"; + /// CLMUL (Carry-less Multiplication) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] rdrand: "rdrand"; + /// RDRAND + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] rdseed: "rdseed"; + /// RDSEED + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] tsc: "tsc"; + without cfg check: true; + /// TSC (Time Stamp Counter) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] mmx: "mmx"; + without cfg check: true; + /// MMX (MultiMedia eXtensions) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] sse: "sse"; + /// SSE (Streaming SIMD Extensions) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] sse2: "sse2"; + /// SSE2 (Streaming SIMD Extensions 2) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] sse3: "sse3"; + /// SSE3 (Streaming SIMD Extensions 3) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] ssse3: "ssse3"; + /// SSSE3 (Supplemental Streaming SIMD Extensions 3) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] sse4_1: "sse4.1"; + /// SSE4.1 (Streaming SIMD Extensions 4.1) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] 
sse4_2: "sse4.2"; + /// SSE4.2 (Streaming SIMD Extensions 4.2) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] sse4a: "sse4a"; + /// SSE4a (Streaming SIMD Extensions 4a) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] sha: "sha"; + /// SHA + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx: "avx"; + /// AVX (Advanced Vector Extensions) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx2: "avx2"; + /// AVX2 (Advanced Vector Extensions 2) + @FEATURE: #[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] sha512: "sha512"; + /// SHA512 + @FEATURE: #[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] sm3: "sm3"; + /// SM3 + @FEATURE: #[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] sm4: "sm4"; + /// SM4 + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512f: "avx512f" ; + /// AVX-512 F (Foundation) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512cd: "avx512cd" ; + /// AVX-512 CD (Conflict Detection Instructions) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512er: "avx512er"; + without cfg check: true; + /// AVX-512 ER (Exponential and Reciprocal Instructions) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512pf: "avx512pf"; + without cfg check: true; + /// AVX-512 PF (Prefetch Instructions) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512bw: "avx512bw"; + /// AVX-512 BW (Byte and Word Instructions) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512dq: "avx512dq"; + /// AVX-512 DQ (Doubleword and Quadword) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512vl: "avx512vl"; + /// AVX-512 VL (Vector Length Extensions) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512ifma: "avx512ifma"; + /// AVX-512 IFMA (Integer Fused Multiply Add) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512vbmi: "avx512vbmi"; + /// AVX-512 VBMI (Vector Byte Manipulation Instructions) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512vpopcntdq: "avx512vpopcntdq"; + /// AVX-512 VPOPCNTDQ (Vector Population Count Doubleword and Quadword) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512vbmi2: "avx512vbmi2"; + /// AVX-512 VBMI2 (Additional byte, word, dword and qword capabilities) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] gfni: "gfni"; + /// AVX-512 GFNI (Galois Field New Instruction) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] vaes: "vaes"; + /// AVX-512 VAES (Vector AES instruction) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] vpclmulqdq: "vpclmulqdq"; + /// AVX-512 VPCLMULQDQ (Vector PCLMULQDQ instructions) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512vnni: "avx512vnni"; + /// AVX-512 VNNI (Vector Neural Network Instructions) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512bitalg: "avx512bitalg"; + /// AVX-512 BITALG (Support for VPOPCNT\[B,W\] and VPSHUFBITQMB) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512bf16: "avx512bf16"; + /// AVX-512 BF16 (BFLOAT16 instructions) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512vp2intersect: "avx512vp2intersect"; + /// AVX-512 P2INTERSECT + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512fp16: "avx512fp16"; + /// AVX-512 FP16 (FLOAT16 instructions) + @FEATURE: #[stable(feature = "avx512_target_feature", since = "1.89.0")]
avxifma: "avxifma"; + /// AVX-IFMA (Integer Fused Multiply Add) + @FEATURE: #[stable(feature = "avx512_target_feature", since = "1.89.0")] avxneconvert: "avxneconvert"; + /// AVX-NE-CONVERT (Exceptionless Convert) + @FEATURE: #[stable(feature = "avx512_target_feature", since = "1.89.0")] avxvnni: "avxvnni"; + /// AVX-VNNI (Vector Neural Network Instructions) + @FEATURE: #[stable(feature = "avx512_target_feature", since = "1.89.0")] avxvnniint16: "avxvnniint16"; + /// AVX-VNNI_INT8 (VNNI with 16-bit Integers) + @FEATURE: #[stable(feature = "avx512_target_feature", since = "1.89.0")] avxvnniint8: "avxvnniint8"; + /// AVX-VNNI_INT16 (VNNI with 8-bit integers) + @FEATURE: #[unstable(feature = "x86_amx_intrinsics", issue = "126622")] amx_tile: "amx-tile"; + /// AMX (Advanced Matrix Extensions) - Tile load/store + @FEATURE: #[unstable(feature = "x86_amx_intrinsics", issue = "126622")] amx_int8: "amx-int8"; + /// AMX-INT8 (Operations on 8-bit integers) + @FEATURE: #[unstable(feature = "x86_amx_intrinsics", issue = "126622")] amx_bf16: "amx-bf16"; + /// AMX-BF16 (BFloat16 Operations) + @FEATURE: #[unstable(feature = "x86_amx_intrinsics", issue = "126622")] amx_fp16: "amx-fp16"; + /// AMX-FP16 (Float16 Operations) + @FEATURE: #[unstable(feature = "x86_amx_intrinsics", issue = "126622")] amx_complex: "amx-complex"; + /// AMX-COMPLEX (Complex number Operations) + @FEATURE: #[unstable(feature = "x86_amx_intrinsics", issue = "126622")] amx_avx512: "amx-avx512"; + /// AMX-AVX512 (AVX512 operations extended to matrices) + @FEATURE: #[unstable(feature = "x86_amx_intrinsics", issue = "126622")] amx_fp8: "amx-fp8"; + /// AMX-FP8 (Float8 Operations) + @FEATURE: #[unstable(feature = "x86_amx_intrinsics", issue = "126622")] amx_movrs: "amx-movrs"; + /// AMX-MOVRS (Matrix MOVERS operations) + @FEATURE: #[unstable(feature = "x86_amx_intrinsics", issue = "126622")] amx_tf32: "amx-tf32"; + /// AMX-TF32 (TensorFloat32 Operations) + @FEATURE: #[unstable(feature = "x86_amx_intrinsics", issue = "126622")] amx_transpose: "amx-transpose"; + /// AMX-TRANSPOSE (Matrix Transpose Operations) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] f16c: "f16c"; + /// F16C (Conversions between IEEE-754 `binary16` and `binary32` formats) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] fma: "fma"; + /// FMA (Fused Multiply Add) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] bmi1: "bmi1" ; + /// BMI1 (Bit Manipulation Instructions 1) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] bmi2: "bmi2" ; + /// BMI2 (Bit Manipulation Instructions 2) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] lzcnt: "lzcnt"; + /// ABM (Advanced Bit Manipulation) / LZCNT (Leading Zero Count) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] tbm: "tbm"; + /// TBM (Trailing Bit Manipulation) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] popcnt: "popcnt"; + /// POPCNT (Population Count) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] fxsr: "fxsr"; + /// FXSR (Floating-point context fast save and restore) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] xsave: "xsave"; + /// XSAVE (Save Processor Extended States) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] xsaveopt: "xsaveopt"; + /// XSAVEOPT (Save Processor Extended States Optimized) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] xsaves: "xsaves"; + /// XSAVES (Save Processor Extended States Supervisor) + @FEATURE: #[stable(feature = "simd_x86", since = 
"1.27.0")] xsavec: "xsavec"; + /// XSAVEC (Save Processor Extended States Compacted) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] cmpxchg16b: "cmpxchg16b"; + /// CMPXCH16B (16-byte compare-and-swap instruction) + @FEATURE: #[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] kl: "kl"; + /// Intel Key Locker + @FEATURE: #[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] widekl: "widekl"; + /// Intel Key Locker Wide + @FEATURE: #[stable(feature = "simd_x86_adx", since = "1.33.0")] adx: "adx"; + /// ADX, Intel ADX (Multi-Precision Add-Carry Instruction Extensions) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] rtm: "rtm"; + /// RTM, Intel (Restricted Transactional Memory) + @FEATURE: #[stable(feature = "movbe_target_feature", since = "1.67.0")] movbe: "movbe"; + /// MOVBE (Move Data After Swapping Bytes) + @FEATURE: #[unstable(feature = "movrs_target_feature", issue = "137976")] movrs: "movrs"; + /// MOVRS (Move data with the read-shared hint) + @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] ermsb: "ermsb"; + /// ERMSB, Enhanced REP MOVSB and STOSB + @FEATURE: #[unstable(feature = "xop_target_feature", issue = "127208")] xop: "xop"; + /// XOP: eXtended Operations (AMD) +} diff --git a/library/stdarch/crates/std_detect/src/detect/bit.rs b/library/stdarch/crates/std_detect/src/detect/bit.rs new file mode 100644 index 0000000000000..6f06c5523e4fd --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/bit.rs @@ -0,0 +1,9 @@ +//! Bit manipulation utilities. + +/// Tests the `bit` of `x`. +#[allow(dead_code)] +#[inline] +pub(crate) fn test(x: usize, bit: u32) -> bool { + debug_assert!(bit < usize::BITS, "bit index out-of-bounds"); + x & (1 << bit) != 0 +} diff --git a/library/stdarch/crates/std_detect/src/detect/cache.rs b/library/stdarch/crates/std_detect/src/detect/cache.rs new file mode 100644 index 0000000000000..83bcedea612e6 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/cache.rs @@ -0,0 +1,223 @@ +//! Caches run-time feature detection so that it only needs to be computed +//! once. + +#![allow(dead_code)] // not used on all platforms + +use core::sync::atomic::Ordering; + +use core::sync::atomic::AtomicUsize; + +/// Sets the `bit` of `x`. +#[inline] +const fn set_bit(x: u128, bit: u32) -> u128 { + x | 1 << bit +} + +/// Tests the `bit` of `x`. +#[inline] +const fn test_bit(x: u128, bit: u32) -> bool { + x & (1 << bit) != 0 +} + +/// Unset the `bit of `x`. +#[inline] +const fn unset_bit(x: u128, bit: u32) -> u128 { + x & !(1 << bit) +} + +/// Maximum number of features that can be cached. +const CACHE_CAPACITY: u32 = 93; + +/// This type is used to initialize the cache +// The derived `Default` implementation will initialize the field to zero, +// which is what we want. +#[derive(Copy, Clone, Default, PartialEq, Eq)] +pub(crate) struct Initializer(u128); + +// NOTE: the `debug_assert!` would catch that we do not add more Features than +// the one fitting our cache. +impl Initializer { + /// Tests the `bit` of the cache. + #[inline] + pub(crate) fn test(self, bit: u32) -> bool { + debug_assert!( + bit < CACHE_CAPACITY, + "too many features, time to increase the cache size!" + ); + test_bit(self.0, bit) + } + + /// Sets the `bit` of the cache. + #[inline] + pub(crate) fn set(&mut self, bit: u32) { + debug_assert!( + bit < CACHE_CAPACITY, + "too many features, time to increase the cache size!" + ); + let v = self.0; + self.0 = set_bit(v, bit); + } + + /// Unsets the `bit` of the cache. 
+ #[inline] + pub(crate) fn unset(&mut self, bit: u32) { + debug_assert!( + bit < CACHE_CAPACITY, + "too many features, time to increase the cache size!" + ); + let v = self.0; + self.0 = unset_bit(v, bit); + } +} + +/// This global variable is a cache of the features supported by the CPU. +// Note: the third slot is only used in x86 +// Another Slot can be added if needed without any change to `Initializer` +static CACHE: [Cache; 3] = [ + Cache::uninitialized(), + Cache::uninitialized(), + Cache::uninitialized(), +]; + +/// Feature cache with capacity for `size_of::() * 8 - 1` features. +/// +/// Note: 0 is used to represent an uninitialized cache, and (at least) the most +/// significant bit is set on any cache which has been initialized. +/// +/// Note: we use `Relaxed` atomic operations, because we are only interested in +/// the effects of operations on a single memory location. That is, we only need +/// "modification order", and not the full-blown "happens before". +struct Cache(AtomicUsize); + +impl Cache { + const CAPACITY: u32 = (core::mem::size_of::() * 8 - 1) as u32; + const MASK: usize = (1 << Cache::CAPACITY) - 1; + const INITIALIZED_BIT: usize = 1usize << Cache::CAPACITY; + + /// Creates an uninitialized cache. + #[allow(clippy::declare_interior_mutable_const)] + const fn uninitialized() -> Self { + Cache(AtomicUsize::new(0)) + } + + /// Is the `bit` in the cache set? Returns `None` if the cache has not been initialized. + #[inline] + pub(crate) fn test(&self, bit: u32) -> Option { + let cached = self.0.load(Ordering::Relaxed); + if cached == 0 { + None + } else { + Some(test_bit(cached as u128, bit)) + } + } + + /// Initializes the cache. + #[inline] + fn initialize(&self, value: usize) -> usize { + debug_assert_eq!((value & !Cache::MASK), 0); + self.0 + .store(value | Cache::INITIALIZED_BIT, Ordering::Relaxed); + value + } +} + +cfg_if::cfg_if! { + if #[cfg(feature = "std_detect_env_override")] { + #[inline] + fn disable_features(disable: &[u8], value: &mut Initializer) { + if let Ok(disable) = core::str::from_utf8(disable) { + for v in disable.split(" ") { + let _ = super::Feature::from_str(v).map(|v| value.unset(v as u32)); + } + } + } + + #[inline] + fn initialize(mut value: Initializer) -> Initializer { + use core::ffi::CStr; + const RUST_STD_DETECT_UNSTABLE: &CStr = c"RUST_STD_DETECT_UNSTABLE"; + cfg_if::cfg_if! { + if #[cfg(windows)] { + use alloc::vec; + #[link(name = "kernel32")] + unsafe extern "system" { + fn GetEnvironmentVariableA(name: *const u8, buffer: *mut u8, size: u32) -> u32; + } + let len = unsafe { GetEnvironmentVariableA(RUST_STD_DETECT_UNSTABLE.as_ptr().cast::(), core::ptr::null_mut(), 0) }; + if len > 0 { + // +1 to include the null terminator. 
+ let mut env = vec![0; len as usize + 1]; + let len = unsafe { GetEnvironmentVariableA(RUST_STD_DETECT_UNSTABLE.as_ptr().cast::(), env.as_mut_ptr(), len + 1) }; + if len > 0 { + disable_features(&env[..len as usize], &mut value); + } + } + } else { + let env = unsafe { + libc::getenv(RUST_STD_DETECT_UNSTABLE.as_ptr()) + }; + if !env.is_null() { + let len = unsafe { libc::strlen(env) }; + let env = unsafe { core::slice::from_raw_parts(env as *const u8, len) }; + disable_features(env, &mut value); + } + } + } + do_initialize(value); + value + } + } else { + #[inline] + fn initialize(value: Initializer) -> Initializer { + do_initialize(value); + value + } + } +} + +#[inline] +fn do_initialize(value: Initializer) { + CACHE[0].initialize((value.0) as usize & Cache::MASK); + CACHE[1].initialize((value.0 >> Cache::CAPACITY) as usize & Cache::MASK); + CACHE[2].initialize((value.0 >> (2 * Cache::CAPACITY)) as usize & Cache::MASK); +} + +// We only have to detect features once, and it's fairly costly, so hint to LLVM +// that it should assume that cache hits are more common than misses (which is +// the point of caching). It's possibly unfortunate that this function needs to +// reach across modules like this to call `os::detect_features`, but it produces +// the best code out of several attempted variants. +// +// The `Initializer` that the cache was initialized with is returned, so that +// the caller can call `test()` on it without having to load the value from the +// cache again. +#[cold] +fn detect_and_initialize() -> Initializer { + initialize(super::os::detect_features()) +} + +/// Tests the `bit` of the storage. If the storage has not been initialized, +/// initializes it with the result of `os::detect_features()`. +/// +/// On its first invocation, it detects the CPU features and caches them in the +/// `CACHE` global variable as an `AtomicU64`. +/// +/// It uses the `Feature` variant to index into this variable as a bitset. If +/// the bit is set, the feature is enabled, and otherwise it is disabled. +/// +/// If the feature `std_detect_env_override` is enabled looks for the env +/// variable `RUST_STD_DETECT_UNSTABLE` and uses its content to disable +/// Features that would had been otherwise detected. +#[inline] +pub(crate) fn test(bit: u32) -> bool { + let (relative_bit, idx) = if bit < Cache::CAPACITY { + (bit, 0) + } else if bit < 2 * Cache::CAPACITY { + (bit - Cache::CAPACITY, 1) + } else { + (bit - 2 * Cache::CAPACITY, 2) + }; + CACHE[idx] + .test(relative_bit) + .unwrap_or_else(|| detect_and_initialize().test(bit)) +} diff --git a/library/stdarch/crates/std_detect/src/detect/macros.rs b/library/stdarch/crates/std_detect/src/detect/macros.rs new file mode 100644 index 0000000000000..a2994fb7daa7a --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/macros.rs @@ -0,0 +1,204 @@ +#[macro_export] +#[allow_internal_unstable(stdarch_internal)] +#[unstable(feature = "stdarch_internal", issue = "none")] +macro_rules! detect_feature { + ($feature:tt, $feature_lit:tt) => { + $crate::detect_feature!($feature, $feature_lit : $feature_lit) + }; + ($feature:tt, $feature_lit:tt : $($target_feature_lit:tt),*) => { + $(cfg!(target_feature = $target_feature_lit) ||)* + $crate::detect::__is_feature_detected::$feature() + }; + ($feature:tt, $feature_lit:tt, without cfg check: true) => { + $crate::detect::__is_feature_detected::$feature() + }; +} + +#[allow(unused_macros, reason = "it's used in the features! macro below")] +macro_rules! 
check_cfg_feature { + ($feature:tt, $feature_lit:tt) => { + check_cfg_feature!($feature, $feature_lit : $feature_lit) + }; + ($feature:tt, $feature_lit:tt : $($target_feature_lit:tt),*) => { + $(cfg!(target_feature = $target_feature_lit);)* + }; + ($feature:tt, $feature_lit:tt, without cfg check: $feature_cfg_check:literal) => { + #[allow(unexpected_cfgs, reason = $feature_lit)] + { cfg!(target_feature = $feature_lit) } + }; +} + +#[allow(unused)] +macro_rules! features { + ( + @TARGET: $target:ident; + @CFG: $cfg:meta; + @MACRO_NAME: $macro_name:ident; + @MACRO_ATTRS: $(#[$macro_attrs:meta])* + $(@BIND_FEATURE_NAME: $bind_feature:tt; $feature_impl:tt; $(#[$deprecate_attr:meta];)?)* + $(@NO_RUNTIME_DETECTION: $nort_feature:tt; )* + $(@FEATURE: #[$stability_attr:meta] $feature:ident: $feature_lit:tt; + $(without cfg check: $feature_cfg_check:tt;)? + $(implied by target_features: [$($target_feature_lit:tt),*];)? + $(#[$feature_comment:meta])*)* + ) => { + #[macro_export] + $(#[$macro_attrs])* + #[allow_internal_unstable(stdarch_internal)] + #[cfg($cfg)] + #[doc(cfg($cfg))] + macro_rules! $macro_name { + $( + ($feature_lit) => { + $crate::detect_feature!($feature, $feature_lit $(, without cfg check: $feature_cfg_check)? $(: $($target_feature_lit),*)?) + }; + )* + $( + ($bind_feature) => { + { + $( + #[$deprecate_attr] macro_rules! deprecated_feature { {} => {}; } + deprecated_feature! {}; + )? + $crate::$macro_name!($feature_impl) + } + }; + )* + $( + ($nort_feature) => { + compile_error!( + concat!( + stringify!($nort_feature), + " feature cannot be detected at run-time" + ) + ) + }; + )* + ($t:tt,) => { + $crate::$macro_name!($t); + }; + ($t:tt) => { + compile_error!( + concat!( + concat!("unknown ", stringify!($target)), + concat!(" target feature: ", $t) + ) + ) + }; + } + + $(#[$macro_attrs])* + #[macro_export] + #[cfg(not($cfg))] + #[doc(cfg($cfg))] + macro_rules! $macro_name { + $( + ($feature_lit) => { + compile_error!( + concat!( + r#"This macro cannot be used on the current target. + You can prevent it from being used in other architectures by + guarding it behind a cfg("#, + stringify!($cfg), + ")." + ) + ) + }; + )* + $( + ($bind_feature) => { $crate::$macro_name!($feature_impl) }; + )* + $( + ($nort_feature) => { + compile_error!( + concat!( + stringify!($nort_feature), + " feature cannot be detected at run-time" + ) + ) + }; + )* + ($t:tt,) => { + $crate::$macro_name!($t); + }; + ($t:tt) => { + compile_error!( + concat!( + concat!("unknown ", stringify!($target)), + concat!(" target feature: ", $t) + ) + ) + }; + } + + #[test] + #[deny(unexpected_cfgs)] + #[deny(unfulfilled_lint_expectations)] + fn unexpected_cfgs() { + $( + check_cfg_feature!($feature, $feature_lit $(, without cfg check: $feature_cfg_check)? $(: $($target_feature_lit),*)?); + )* + } + + /// Each variant denotes a position in a bitset for a particular feature. + /// + /// PLEASE: do not use this, it is an implementation detail subject + /// to change. 
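+ /// The discriminant of each variant is the bit index that the run-time detection cache uses for this feature (see `cache::test`).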
+ #[doc(hidden)] + #[allow(non_camel_case_types)] + #[derive(Copy, Clone)] + #[repr(u8)] + #[unstable(feature = "stdarch_internal", issue = "none")] + #[cfg($cfg)] + pub(crate) enum Feature { + $( + $(#[$feature_comment])* + $feature, + )* + + // Do not add variants after last: + _last + } + + #[cfg($cfg)] + impl Feature { + pub(crate) fn to_str(self) -> &'static str { + match self { + $(Feature::$feature => $feature_lit,)* + Feature::_last => unreachable!(), + } + } + + #[cfg(feature = "std_detect_env_override")] + pub(crate) fn from_str(s: &str) -> Result { + match s { + $($feature_lit => Ok(Feature::$feature),)* + _ => Err(()) + } + } + } + + /// Each function performs run-time feature detection for a single + /// feature. This allow us to use stability attributes on a per feature + /// basis. + /// + /// PLEASE: do not use this, it is an implementation detail subject + /// to change. + #[doc(hidden)] + #[cfg($cfg)] + #[unstable(feature = "stdarch_internal", issue = "none")] + pub mod __is_feature_detected { + $( + + /// PLEASE: do not use this, it is an implementation detail + /// subject to change. + #[inline] + #[doc(hidden)] + #[$stability_attr] + pub fn $feature() -> bool { + $crate::detect::check_for($crate::detect::Feature::$feature) + } + )* + } + }; +} diff --git a/library/stdarch/crates/std_detect/src/detect/mod.rs b/library/stdarch/crates/std_detect/src/detect/mod.rs new file mode 100644 index 0000000000000..8fd3d95793288 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/mod.rs @@ -0,0 +1,120 @@ +//! This module implements run-time feature detection. +//! +//! The `is_{arch}_feature_detected!("feature-name")` macros take the name of a +//! feature as a string-literal, and return a boolean indicating whether the +//! feature is enabled at run-time or not. +//! +//! These macros do two things: +//! * map the string-literal into an integer stored as a `Feature` enum, +//! * call a `os::check_for(x: Feature)` function that returns `true` if the +//! feature is enabled. +//! +//! The `Feature` enums are also implemented in the `arch/{target_arch}.rs` +//! modules. +//! +//! The `check_for` functions are, in general, Operating System dependent. Most +//! architectures do not allow user-space programs to query the feature bits +//! due to security concerns (x86 is the big exception). These functions are +//! implemented in the `os/{target_os}.rs` modules. + +use cfg_if::cfg_if; + +#[macro_use] +mod macros; + +mod arch; + +// This module needs to be public because the `is_{arch}_feature_detected!` +// macros expand calls to items within it in user crates. +#[doc(hidden)] +#[unstable(feature = "stdarch_internal", issue = "none")] +pub use self::arch::__is_feature_detected; + +pub(crate) use self::arch::Feature; + +mod bit; +mod cache; + +cfg_if! { + if #[cfg(miri)] { + // When running under miri all target-features that are not enabled at + // compile-time are reported as disabled at run-time. + // + // For features for which `cfg(target_feature)` returns true, + // this run-time detection logic is never called. + #[path = "os/other.rs"] + mod os; + } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + // On x86/x86_64 no OS specific functionality is required. 
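+ // Feature detection is done with the `cpuid` instruction, which can be executed directly from user space.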
+ #[path = "os/x86.rs"] + mod os; + } else if #[cfg(all(any(target_os = "linux", target_os = "android"), feature = "libc"))] { + #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] + #[path = "os/riscv.rs"] + mod riscv; + #[path = "os/linux/mod.rs"] + mod os; + } else if #[cfg(all(target_os = "freebsd", feature = "libc"))] { + #[cfg(target_arch = "aarch64")] + #[path = "os/aarch64.rs"] + mod aarch64; + #[path = "os/freebsd/mod.rs"] + mod os; + } else if #[cfg(all(target_os = "openbsd", target_arch = "aarch64", feature = "libc"))] { + #[allow(dead_code)] // we don't use code that calls the mrs instruction. + #[path = "os/aarch64.rs"] + mod aarch64; + #[path = "os/openbsd/aarch64.rs"] + mod os; + } else if #[cfg(all(target_os = "windows", any(target_arch = "aarch64", target_arch = "arm64ec")))] { + #[path = "os/windows/aarch64.rs"] + mod os; + } else if #[cfg(all(target_vendor = "apple", target_arch = "aarch64", feature = "libc"))] { + #[path = "os/darwin/aarch64.rs"] + mod os; + } else { + #[path = "os/other.rs"] + mod os; + } +} + +/// Performs run-time feature detection. +#[inline] +#[allow(dead_code)] +fn check_for(x: Feature) -> bool { + cache::test(x as u32) +} + +/// Returns an `Iterator` where +/// `Item.0` is the feature name, and `Item.1` is a `bool` which +/// is `true` if the feature is supported by the host and `false` otherwise. +#[unstable(feature = "stdarch_internal", issue = "none")] +pub fn features() -> impl Iterator { + cfg_if! { + if #[cfg(any( + target_arch = "x86", + target_arch = "x86_64", + target_arch = "arm", + target_arch = "aarch64", + target_arch = "arm64ec", + target_arch = "riscv32", + target_arch = "riscv64", + target_arch = "powerpc", + target_arch = "powerpc64", + target_arch = "mips", + target_arch = "mips64", + target_arch = "loongarch64", + target_arch = "s390x", + ))] { + (0_u8..Feature::_last as u8).map(|discriminant: u8| { + #[allow(bindings_with_variant_name)] // RISC-V has Feature::f + let f: Feature = unsafe { core::mem::transmute(discriminant) }; + let name: &'static str = f.to_str(); + let enabled: bool = check_for(f); + (name, enabled) + }) + } else { + None.into_iter() + } + } +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/os/aarch64.rs new file mode 100644 index 0000000000000..1ff2a17e6e1e5 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/aarch64.rs @@ -0,0 +1,130 @@ +//! Run-time feature detection for Aarch64 on any OS that emulates the mrs instruction. +//! +//! On FreeBSD >= 12.0, Linux >= 4.11 and other operating systems, it is possible to use +//! privileged system registers from userspace to check CPU feature support. +//! +//! AArch64 system registers ID_AA64ISAR0_EL1, ID_AA64PFR0_EL1, ID_AA64ISAR1_EL1 +//! have bits dedicated to features like AdvSIMD, CRC32, AES, atomics (LSE), etc. +//! Each part of the register indicates the level of support for a certain feature, e.g. +//! when ID_AA64ISAR0_EL1\[7:4\] is >= 1, AES is supported; when it's >= 2, PMULL is supported. +//! +//! For proper support of [SoCs where different cores have different capabilities](https://medium.com/@jadr2ddude/a-big-little-problem-a-tale-of-big-little-gone-wrong-e7778ce744bb), +//! the OS has to always report only the features supported by all cores, like [FreeBSD does](https://reviews.freebsd.org/D17137#393947). +//! +//! References: +//! +//! - [Zircon implementation](https://fuchsia.googlesource.com/zircon/+/master/kernel/arch/arm64/feature.cpp) +//! 
- [Linux documentation](https://www.kernel.org/doc/Documentation/arm64/cpu-feature-registers.txt) +//! - [ARM documentation](https://developer.arm.com/documentation/ddi0601/2022-12/AArch64-Registers?lang=en) + +use crate::detect::{Feature, cache}; +use core::arch::asm; + +/// Try to read the features from the system registers. +/// +/// This will cause SIGILL if the current OS is not trapping the mrs instruction. +pub(crate) fn detect_features() -> cache::Initializer { + // ID_AA64ISAR0_EL1 - Instruction Set Attribute Register 0 + let aa64isar0: u64; + unsafe { + asm!( + "mrs {}, ID_AA64ISAR0_EL1", + out(reg) aa64isar0, + options(pure, nomem, preserves_flags, nostack) + ); + } + + // ID_AA64ISAR1_EL1 - Instruction Set Attribute Register 1 + let aa64isar1: u64; + unsafe { + asm!( + "mrs {}, ID_AA64ISAR1_EL1", + out(reg) aa64isar1, + options(pure, nomem, preserves_flags, nostack) + ); + } + + // ID_AA64MMFR2_EL1 - AArch64 Memory Model Feature Register 2 + let aa64mmfr2: u64; + unsafe { + asm!( + "mrs {}, ID_AA64MMFR2_EL1", + out(reg) aa64mmfr2, + options(pure, nomem, preserves_flags, nostack) + ); + } + + // ID_AA64PFR0_EL1 - Processor Feature Register 0 + let aa64pfr0: u64; + unsafe { + asm!( + "mrs {}, ID_AA64PFR0_EL1", + out(reg) aa64pfr0, + options(pure, nomem, preserves_flags, nostack) + ); + } + + parse_system_registers(aa64isar0, aa64isar1, aa64mmfr2, Some(aa64pfr0)) +} + +pub(crate) fn parse_system_registers( + aa64isar0: u64, + aa64isar1: u64, + aa64mmfr2: u64, + aa64pfr0: Option, +) -> cache::Initializer { + let mut value = cache::Initializer::default(); + + let mut enable_feature = |f, enable| { + if enable { + value.set(f as u32); + } + }; + + // ID_AA64ISAR0_EL1 - Instruction Set Attribute Register 0 + enable_feature(Feature::pmull, bits_shift(aa64isar0, 7, 4) >= 2); + enable_feature(Feature::tme, bits_shift(aa64isar0, 27, 24) == 1); + enable_feature(Feature::lse, bits_shift(aa64isar0, 23, 20) >= 2); + enable_feature(Feature::crc, bits_shift(aa64isar0, 19, 16) >= 1); + + // ID_AA64PFR0_EL1 - Processor Feature Register 0 + if let Some(aa64pfr0) = aa64pfr0 { + let fp = bits_shift(aa64pfr0, 19, 16) < 0xF; + let fphp = bits_shift(aa64pfr0, 19, 16) >= 1; + let asimd = bits_shift(aa64pfr0, 23, 20) < 0xF; + let asimdhp = bits_shift(aa64pfr0, 23, 20) >= 1; + enable_feature(Feature::fp, fp); + enable_feature(Feature::fp16, fphp); + // SIMD support requires float support - if half-floats are + // supported, it also requires half-float support: + enable_feature(Feature::asimd, fp && asimd && (!fphp | asimdhp)); + // SIMD extensions require SIMD support: + enable_feature(Feature::aes, asimd && bits_shift(aa64isar0, 7, 4) >= 2); + let sha1 = bits_shift(aa64isar0, 11, 8) >= 1; + let sha2 = bits_shift(aa64isar0, 15, 12) >= 1; + enable_feature(Feature::sha2, asimd && sha1 && sha2); + enable_feature(Feature::rdm, asimd && bits_shift(aa64isar0, 31, 28) >= 1); + enable_feature( + Feature::dotprod, + asimd && bits_shift(aa64isar0, 47, 44) >= 1, + ); + enable_feature(Feature::sve, asimd && bits_shift(aa64pfr0, 35, 32) >= 1); + } + + // ID_AA64ISAR1_EL1 - Instruction Set Attribute Register 1 + // Check for either APA or API field + enable_feature(Feature::paca, bits_shift(aa64isar1, 11, 4) >= 1); + enable_feature(Feature::rcpc, bits_shift(aa64isar1, 23, 20) >= 1); + // Check for either GPA or GPI field + enable_feature(Feature::pacg, bits_shift(aa64isar1, 31, 24) >= 1); + + // ID_AA64MMFR2_EL1 - AArch64 Memory Model Feature Register 2 + enable_feature(Feature::lse2, bits_shift(aa64mmfr2, 35, 32) >= 
1); + + value +} + +#[inline] +fn bits_shift(x: u64, high: usize, low: usize) -> u64 { + (x >> low) & ((1 << (high - low + 1)) - 1) +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/darwin/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/os/darwin/aarch64.rs new file mode 100644 index 0000000000000..6699a66b1adf9 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/darwin/aarch64.rs @@ -0,0 +1,147 @@ +//! Run-time feature detection for aarch64 on Darwin (macOS/iOS/tvOS/watchOS/visionOS). +//! +//! + +use crate::detect::{Feature, cache}; +use core::ffi::CStr; + +#[inline] +fn _sysctlbyname(name: &CStr) -> bool { + use libc; + + let mut enabled: i32 = 0; + let mut enabled_len: usize = 4; + let enabled_ptr = &mut enabled as *mut i32 as *mut libc::c_void; + + let ret = unsafe { + libc::sysctlbyname( + name.as_ptr(), + enabled_ptr, + &mut enabled_len, + core::ptr::null_mut(), + 0, + ) + }; + + match ret { + 0 => enabled != 0, + _ => false, + } +} + +/// Try to read the features using sysctlbyname. +pub(crate) fn detect_features() -> cache::Initializer { + let mut value = cache::Initializer::default(); + + let mut enable_feature = |f, enable| { + if enable { + value.set(f as u32); + } + }; + + // Armv8.0 features not using the standard identifiers + let fp = _sysctlbyname(c"hw.optional.floatingpoint"); + let asimd = _sysctlbyname(c"hw.optional.AdvSIMD"); + let crc = _sysctlbyname(c"hw.optional.armv8_crc32"); + + // Armv8 and Armv9 features using the standard identifiers + let aes = _sysctlbyname(c"hw.optional.arm.FEAT_AES"); + let bf16 = _sysctlbyname(c"hw.optional.arm.FEAT_BF16"); + let bti = _sysctlbyname(c"hw.optional.arm.FEAT_BTI"); + let dit = _sysctlbyname(c"hw.optional.arm.FEAT_DIT"); + let dpb = _sysctlbyname(c"hw.optional.arm.FEAT_DPB"); + let dpb2 = _sysctlbyname(c"hw.optional.arm.FEAT_DPB2"); + let dotprod = _sysctlbyname(c"hw.optional.arm.FEAT_DotProd"); + let ecv = _sysctlbyname(c"hw.optional.arm.FEAT_ECV"); + let fcma = _sysctlbyname(c"hw.optional.arm.FEAT_FCMA"); + let fhm = _sysctlbyname(c"hw.optional.arm.FEAT_FHM"); + let fp16 = _sysctlbyname(c"hw.optional.arm.FEAT_FP16"); + let frintts = _sysctlbyname(c"hw.optional.arm.FEAT_FRINTTS"); + let flagm = _sysctlbyname(c"hw.optional.arm.FEAT_FlagM"); + let flagm2 = _sysctlbyname(c"hw.optional.arm.FEAT_FlagM2"); + let i8mm = _sysctlbyname(c"hw.optional.arm.FEAT_I8MM"); + let jsconv = _sysctlbyname(c"hw.optional.arm.FEAT_JSCVT"); + let rcpc = _sysctlbyname(c"hw.optional.arm.FEAT_LRCPC"); + let rcpc2 = _sysctlbyname(c"hw.optional.arm.FEAT_LRCPC2"); + let lse = _sysctlbyname(c"hw.optional.arm.FEAT_LSE"); + let lse2 = _sysctlbyname(c"hw.optional.arm.FEAT_LSE2"); + let pauth = _sysctlbyname(c"hw.optional.arm.FEAT_PAuth"); + let pmull = _sysctlbyname(c"hw.optional.arm.FEAT_PMULL"); + let rdm = _sysctlbyname(c"hw.optional.arm.FEAT_RDM"); + let sb = _sysctlbyname(c"hw.optional.arm.FEAT_SB"); + let sha1 = _sysctlbyname(c"hw.optional.arm.FEAT_SHA1"); + let sha256 = _sysctlbyname(c"hw.optional.arm.FEAT_SHA256"); + let sha3 = _sysctlbyname(c"hw.optional.arm.FEAT_SHA3"); + let sha512 = _sysctlbyname(c"hw.optional.arm.FEAT_SHA512"); + let sme = _sysctlbyname(c"hw.optional.arm.FEAT_SME"); + let sme2 = _sysctlbyname(c"hw.optional.arm.FEAT_SME2"); + let sme_f64f64 = _sysctlbyname(c"hw.optional.arm.FEAT_SME_F64F64"); + let sme_i16i64 = _sysctlbyname(c"hw.optional.arm.FEAT_SME_I16I64"); + let ssbs = _sysctlbyname(c"hw.optional.arm.FEAT_SSBS"); + let wfxt = _sysctlbyname(c"hw.optional.arm.FEAT_WFxT"); + + // The 
following features are not exposed by `is_aarch64_feature_detected`, + // but *are* reported by `sysctl`. They are here as documentation that they + // exist, and may potentially be exposed later. + /* + let afp = _sysctlbyname(c"hw.optional.arm.FEAT_AFP"); + let csv2 = _sysctlbyname(c"hw.optional.arm.FEAT_CSV2"); + let csv3 = _sysctlbyname(c"hw.optional.arm.FEAT_CSV3"); + let fpac = _sysctlbyname(c"hw.optional.arm.FEAT_FPAC"); + let pauth2 = _sysctlbyname(c"hw.optional.arm.FEAT_PAuth2"); + let rpres = _sysctlbyname(c"hw.optional.arm.FEAT_RPRES"); + let specres = _sysctlbyname(c"hw.optional.arm.FEAT_SPECRES"); + */ + + // The following "features" are reported by `sysctl` but are mandatory parts + // of SME or SME2, and so are not exposed separately by + // `is_aarch64_feature_detected`. They are here to document their + // existence, in case they're needed in the future. + /* + let sme_b16f32 = _sysctlbyname(c"hw.optional.arm.SME_B16F32"); + let sme_bi32i32 = _sysctlbyname(c"hw.optional.arm.SME_BI32I32"); + let sme_f16f32 = _sysctlbyname(c"hw.optional.arm.SME_F16F32"); + let sme_f32f32 = _sysctlbyname(c"hw.optional.arm.SME_F32F32"); + let sme_i16i32 = _sysctlbyname(c"hw.optional.arm.SME_I16I32"); + let sme_i8i32 = _sysctlbyname(c"hw.optional.arm.SME_I8I32"); + */ + + enable_feature(Feature::aes, aes && pmull); + enable_feature(Feature::asimd, asimd); + enable_feature(Feature::bf16, bf16); + enable_feature(Feature::bti, bti); + enable_feature(Feature::crc, crc); + enable_feature(Feature::dit, dit); + enable_feature(Feature::dotprod, dotprod); + enable_feature(Feature::dpb, dpb); + enable_feature(Feature::dpb2, dpb2); + enable_feature(Feature::ecv, ecv); + enable_feature(Feature::fcma, fcma); + enable_feature(Feature::fhm, fhm); + enable_feature(Feature::flagm, flagm); + enable_feature(Feature::flagm2, flagm2); + enable_feature(Feature::fp, fp); + enable_feature(Feature::fp16, fp16); + enable_feature(Feature::frintts, frintts); + enable_feature(Feature::i8mm, i8mm); + enable_feature(Feature::jsconv, jsconv); + enable_feature(Feature::lse, lse); + enable_feature(Feature::lse2, lse2); + enable_feature(Feature::paca, pauth); + enable_feature(Feature::pacg, pauth); + enable_feature(Feature::pmull, aes && pmull); + enable_feature(Feature::rcpc, rcpc); + enable_feature(Feature::rcpc2, rcpc2); + enable_feature(Feature::rdm, rdm); + enable_feature(Feature::sb, sb); + enable_feature(Feature::sha2, sha1 && sha256 && asimd); + enable_feature(Feature::sha3, sha512 && sha3 && asimd); + enable_feature(Feature::sme, sme); + enable_feature(Feature::sme2, sme2); + enable_feature(Feature::sme_f64f64, sme_f64f64); + enable_feature(Feature::sme_i16i64, sme_i16i64); + enable_feature(Feature::ssbs, ssbs); + enable_feature(Feature::wfxt, wfxt); + + value +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/freebsd/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/os/freebsd/aarch64.rs new file mode 100644 index 0000000000000..ccc48f536054d --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/freebsd/aarch64.rs @@ -0,0 +1,3 @@ +//! Run-time feature detection for Aarch64 on FreeBSD. + +pub(crate) use super::super::aarch64::detect_features; diff --git a/library/stdarch/crates/std_detect/src/detect/os/freebsd/arm.rs b/library/stdarch/crates/std_detect/src/detect/os/freebsd/arm.rs new file mode 100644 index 0000000000000..0a15156e1bd8d --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/freebsd/arm.rs @@ -0,0 +1,36 @@ +//! 
Run-time feature detection for ARM on FreeBSD + +use super::auxvec; +use crate::detect::{Feature, cache}; + +// Defined in machine/elf.h. +// https://github.com/freebsd/freebsd-src/blob/deb63adf945d446ed91a9d84124c71f15ae571d1/sys/arm/include/elf.h +const HWCAP_NEON: usize = 0x00001000; +const HWCAP2_AES: usize = 0x00000001; +const HWCAP2_PMULL: usize = 0x00000002; +const HWCAP2_SHA1: usize = 0x00000004; +const HWCAP2_SHA2: usize = 0x00000008; +const HWCAP2_CRC32: usize = 0x00000010; + +/// Try to read the features from the auxiliary vector +pub(crate) fn detect_features() -> cache::Initializer { + let mut value = cache::Initializer::default(); + let enable_feature = |value: &mut cache::Initializer, f, enable| { + if enable { + value.set(f as u32); + } + }; + + if let Ok(auxv) = auxvec::auxv() { + enable_feature(&mut value, Feature::neon, auxv.hwcap & HWCAP_NEON != 0); + enable_feature(&mut value, Feature::pmull, auxv.hwcap2 & HWCAP2_PMULL != 0); + enable_feature(&mut value, Feature::crc, auxv.hwcap2 & HWCAP2_CRC32 != 0); + enable_feature(&mut value, Feature::aes, auxv.hwcap2 & HWCAP2_AES != 0); + // SHA2 requires SHA1 & SHA2 features + let sha1 = auxv.hwcap2 & HWCAP2_SHA1 != 0; + let sha2 = auxv.hwcap2 & HWCAP2_SHA2 != 0; + enable_feature(&mut value, Feature::sha2, sha1 && sha2); + return value; + } + value +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/freebsd/auxvec.rs b/library/stdarch/crates/std_detect/src/detect/os/freebsd/auxvec.rs new file mode 100644 index 0000000000000..4e72bf22d76cd --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/freebsd/auxvec.rs @@ -0,0 +1,66 @@ +//! Parses ELF auxiliary vectors. +#![cfg_attr( + any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "powerpc64", + target_arch = "riscv64" + ), + allow(dead_code) +)] + +/// Cache HWCAP bitfields of the ELF Auxiliary Vector. +/// +/// If an entry cannot be read all the bits in the bitfield are set to zero. +/// This should be interpreted as all the features being disabled. +#[derive(Debug, Copy, Clone)] +pub(crate) struct AuxVec { + pub hwcap: usize, + pub hwcap2: usize, +} + +/// ELF Auxiliary Vector +/// +/// The auxiliary vector is a memory region in a running ELF program's stack +/// composed of (key: usize, value: usize) pairs. +/// +/// The keys used in the aux vector are platform dependent. For FreeBSD, they are +/// defined in [sys/elf_common.h][elf_common_h]. The hardware capabilities of a given +/// CPU can be queried with the `AT_HWCAP` and `AT_HWCAP2` keys. +/// +/// Note that run-time feature detection is not invoked for features that can +/// be detected at compile-time. +/// +/// [elf_common.h]: https://svnweb.freebsd.org/base/release/12.0.0/sys/sys/elf_common.h?revision=341707 +pub(crate) fn auxv() -> Result { + let hwcap = archauxv(libc::AT_HWCAP); + let hwcap2 = archauxv(libc::AT_HWCAP2); + // Zero could indicate that no features were detected, but it's also used to + // indicate an error. In particular, on many platforms AT_HWCAP2 will be + // legitimately zero, since it contains the most recent feature flags. + if hwcap != 0 || hwcap2 != 0 { + return Ok(AuxVec { hwcap, hwcap2 }); + } + Err(()) +} + +/// Tries to read the `key` from the auxiliary vector. 
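+/// On FreeBSD this is backed by `elf_aux_info`; when the lookup fails, zero is returned.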
+fn archauxv(key: libc::c_int) -> usize { + const OUT_LEN: libc::c_int = core::mem::size_of::() as libc::c_int; + let mut out: libc::c_ulong = 0; + unsafe { + // elf_aux_info is available on FreeBSD 12.0+ and 11.4+: + // https://github.com/freebsd/freebsd-src/commit/0b08ae2120cdd08c20a2b806e2fcef4d0a36c470 + // https://github.com/freebsd/freebsd-src/blob/release/11.4.0/sys/sys/auxv.h + // FreeBSD 11 support in std has been removed in Rust 1.75 (https://github.com/rust-lang/rust/pull/114521), + // so we can safely use this function. + let res = libc::elf_aux_info( + key, + &mut out as *mut libc::c_ulong as *mut libc::c_void, + OUT_LEN, + ); + // If elf_aux_info fails, `out` will be left at zero (which is the proper default value). + debug_assert!(res == 0 || out == 0); + } + out as usize +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/freebsd/mod.rs b/library/stdarch/crates/std_detect/src/detect/os/freebsd/mod.rs new file mode 100644 index 0000000000000..ade7fb6269d13 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/freebsd/mod.rs @@ -0,0 +1,22 @@ +//! Run-time feature detection on FreeBSD + +mod auxvec; + +cfg_if::cfg_if! { + if #[cfg(target_arch = "aarch64")] { + mod aarch64; + pub(crate) use self::aarch64::detect_features; + } else if #[cfg(target_arch = "arm")] { + mod arm; + pub(crate) use self::arm::detect_features; + } else if #[cfg(target_arch = "powerpc64")] { + mod powerpc; + pub(crate) use self::powerpc::detect_features; + } else { + use crate::detect::cache; + /// Performs run-time feature detection. + pub(crate) fn detect_features() -> cache::Initializer { + cache::Initializer::default() + } + } +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/freebsd/powerpc.rs b/library/stdarch/crates/std_detect/src/detect/os/freebsd/powerpc.rs new file mode 100644 index 0000000000000..d03af68cd0815 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/freebsd/powerpc.rs @@ -0,0 +1,21 @@ +//! Run-time feature detection for PowerPC on FreeBSD. + +use super::auxvec; +use crate::detect::{Feature, cache}; + +pub(crate) fn detect_features() -> cache::Initializer { + let mut value = cache::Initializer::default(); + let enable_feature = |value: &mut cache::Initializer, f, enable| { + if enable { + value.set(f as u32); + } + }; + + if let Ok(auxv) = auxvec::auxv() { + enable_feature(&mut value, Feature::altivec, auxv.hwcap & 0x10000000 != 0); + enable_feature(&mut value, Feature::vsx, auxv.hwcap & 0x00000080 != 0); + enable_feature(&mut value, Feature::power8, auxv.hwcap2 & 0x80000000 != 0); + return value; + } + value +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs new file mode 100644 index 0000000000000..22a9cefff7b83 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs @@ -0,0 +1,484 @@ +//! Run-time feature detection for Aarch64 on Linux. + +use super::auxvec; +use crate::detect::{Feature, bit, cache}; + +/// Try to read the features from the auxiliary vector. +pub(crate) fn detect_features() -> cache::Initializer { + #[cfg(target_os = "android")] + let is_exynos9810 = { + // Samsung Exynos 9810 has a bug that big and little cores have different + // ISAs. And on older Android (pre-9), the kernel incorrectly reports + // that features available only on some cores are available on all cores. 
+ // https://reviews.llvm.org/D114523 + let mut arch = [0_u8; libc::PROP_VALUE_MAX as usize]; + let len = unsafe { + libc::__system_property_get(c"ro.arch".as_ptr(), arch.as_mut_ptr() as *mut libc::c_char) + }; + // On Exynos, ro.arch is not available on Android 12+, but it is fine + // because Android 9+ includes the fix. + len > 0 && arch.starts_with(b"exynos9810") + }; + #[cfg(not(target_os = "android"))] + let is_exynos9810 = false; + + if let Ok(auxv) = auxvec::auxv() { + let hwcap: AtHwcap = auxv.into(); + return hwcap.cache(is_exynos9810); + } + cache::Initializer::default() +} + +/// These values are part of the platform-specific [asm/hwcap.h][hwcap] . +/// +/// The names match those used for cpuinfo. +/// +/// [hwcap]: https://github.com/torvalds/linux/blob/master/arch/arm64/include/uapi/asm/hwcap.h +#[derive(Debug, Default, PartialEq)] +struct AtHwcap { + // AT_HWCAP + fp: bool, + asimd: bool, + // evtstrm: No LLVM support. + aes: bool, + pmull: bool, + sha1: bool, + sha2: bool, + crc32: bool, + atomics: bool, + fphp: bool, + asimdhp: bool, + // cpuid: No LLVM support. + asimdrdm: bool, + jscvt: bool, + fcma: bool, + lrcpc: bool, + dcpop: bool, + sha3: bool, + sm3: bool, + sm4: bool, + asimddp: bool, + sha512: bool, + sve: bool, + fhm: bool, + dit: bool, + uscat: bool, + ilrcpc: bool, + flagm: bool, + ssbs: bool, + sb: bool, + paca: bool, + pacg: bool, + + // AT_HWCAP2 + dcpodp: bool, + sve2: bool, + sveaes: bool, + svepmull: bool, + svebitperm: bool, + svesha3: bool, + svesm4: bool, + flagm2: bool, + frint: bool, + // svei8mm: See i8mm feature. + svef32mm: bool, + svef64mm: bool, + // svebf16: See bf16 feature. + i8mm: bool, + bf16: bool, + // dgh: No LLVM support. + rng: bool, + bti: bool, + mte: bool, + ecv: bool, + // afp: bool, + // rpres: bool, + // mte3: bool, + sme: bool, + smei16i64: bool, + smef64f64: bool, + // smei8i32: bool, + // smef16f32: bool, + // smeb16f32: bool, + // smef32f32: bool, + smefa64: bool, + wfxt: bool, + // ebf16: bool, + // sveebf16: bool, + cssc: bool, + // rprfm: bool, + sve2p1: bool, + sme2: bool, + sme2p1: bool, + // smei16i32: bool, + // smebi32i32: bool, + smeb16b16: bool, + smef16f16: bool, + mops: bool, + hbc: bool, + sveb16b16: bool, + lrcpc3: bool, + lse128: bool, + fpmr: bool, + lut: bool, + faminmax: bool, + f8cvt: bool, + f8fma: bool, + f8dp4: bool, + f8dp2: bool, + f8e4m3: bool, + f8e5m2: bool, + smelutv2: bool, + smef8f16: bool, + smef8f32: bool, + smesf8fma: bool, + smesf8dp4: bool, + smesf8dp2: bool, + // pauthlr: bool, +} + +impl From for AtHwcap { + /// Reads AtHwcap from the auxiliary vector. 
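+ /// Each boolean field mirrors a single bit of `AT_HWCAP` or `AT_HWCAP2`, using the bit positions from the kernel's `asm/hwcap.h`.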
+ fn from(auxv: auxvec::AuxVec) -> Self { + AtHwcap { + fp: bit::test(auxv.hwcap, 0), + asimd: bit::test(auxv.hwcap, 1), + // evtstrm: bit::test(auxv.hwcap, 2), + aes: bit::test(auxv.hwcap, 3), + pmull: bit::test(auxv.hwcap, 4), + sha1: bit::test(auxv.hwcap, 5), + sha2: bit::test(auxv.hwcap, 6), + crc32: bit::test(auxv.hwcap, 7), + atomics: bit::test(auxv.hwcap, 8), + fphp: bit::test(auxv.hwcap, 9), + asimdhp: bit::test(auxv.hwcap, 10), + // cpuid: bit::test(auxv.hwcap, 11), + asimdrdm: bit::test(auxv.hwcap, 12), + jscvt: bit::test(auxv.hwcap, 13), + fcma: bit::test(auxv.hwcap, 14), + lrcpc: bit::test(auxv.hwcap, 15), + dcpop: bit::test(auxv.hwcap, 16), + sha3: bit::test(auxv.hwcap, 17), + sm3: bit::test(auxv.hwcap, 18), + sm4: bit::test(auxv.hwcap, 19), + asimddp: bit::test(auxv.hwcap, 20), + sha512: bit::test(auxv.hwcap, 21), + sve: bit::test(auxv.hwcap, 22), + fhm: bit::test(auxv.hwcap, 23), + dit: bit::test(auxv.hwcap, 24), + uscat: bit::test(auxv.hwcap, 25), + ilrcpc: bit::test(auxv.hwcap, 26), + flagm: bit::test(auxv.hwcap, 27), + ssbs: bit::test(auxv.hwcap, 28), + sb: bit::test(auxv.hwcap, 29), + paca: bit::test(auxv.hwcap, 30), + pacg: bit::test(auxv.hwcap, 31), + + // AT_HWCAP2 + dcpodp: bit::test(auxv.hwcap2, 0), + sve2: bit::test(auxv.hwcap2, 1), + sveaes: bit::test(auxv.hwcap2, 2), + svepmull: bit::test(auxv.hwcap2, 3), + svebitperm: bit::test(auxv.hwcap2, 4), + svesha3: bit::test(auxv.hwcap2, 5), + svesm4: bit::test(auxv.hwcap2, 6), + flagm2: bit::test(auxv.hwcap2, 7), + frint: bit::test(auxv.hwcap2, 8), + // svei8mm: bit::test(auxv.hwcap2, 9), + svef32mm: bit::test(auxv.hwcap2, 10), + svef64mm: bit::test(auxv.hwcap2, 11), + // svebf16: bit::test(auxv.hwcap2, 12), + i8mm: bit::test(auxv.hwcap2, 13), + bf16: bit::test(auxv.hwcap2, 14), + // dgh: bit::test(auxv.hwcap2, 15), + rng: bit::test(auxv.hwcap2, 16), + bti: bit::test(auxv.hwcap2, 17), + mte: bit::test(auxv.hwcap2, 18), + ecv: bit::test(auxv.hwcap2, 19), + // afp: bit::test(auxv.hwcap2, 20), + // rpres: bit::test(auxv.hwcap2, 21), + // mte3: bit::test(auxv.hwcap2, 22), + sme: bit::test(auxv.hwcap2, 23), + smei16i64: bit::test(auxv.hwcap2, 24), + smef64f64: bit::test(auxv.hwcap2, 25), + // smei8i32: bit::test(auxv.hwcap2, 26), + // smef16f32: bit::test(auxv.hwcap2, 27), + // smeb16f32: bit::test(auxv.hwcap2, 28), + // smef32f32: bit::test(auxv.hwcap2, 29), + smefa64: bit::test(auxv.hwcap2, 30), + wfxt: bit::test(auxv.hwcap2, 31), + // ebf16: bit::test(auxv.hwcap2, 32), + // sveebf16: bit::test(auxv.hwcap2, 33), + cssc: bit::test(auxv.hwcap2, 34), + // rprfm: bit::test(auxv.hwcap2, 35), + sve2p1: bit::test(auxv.hwcap2, 36), + sme2: bit::test(auxv.hwcap2, 37), + sme2p1: bit::test(auxv.hwcap2, 38), + // smei16i32: bit::test(auxv.hwcap2, 39), + // smebi32i32: bit::test(auxv.hwcap2, 40), + smeb16b16: bit::test(auxv.hwcap2, 41), + smef16f16: bit::test(auxv.hwcap2, 42), + mops: bit::test(auxv.hwcap2, 43), + hbc: bit::test(auxv.hwcap2, 44), + sveb16b16: bit::test(auxv.hwcap2, 45), + lrcpc3: bit::test(auxv.hwcap2, 46), + lse128: bit::test(auxv.hwcap2, 47), + fpmr: bit::test(auxv.hwcap2, 48), + lut: bit::test(auxv.hwcap2, 49), + faminmax: bit::test(auxv.hwcap2, 50), + f8cvt: bit::test(auxv.hwcap2, 51), + f8fma: bit::test(auxv.hwcap2, 52), + f8dp4: bit::test(auxv.hwcap2, 53), + f8dp2: bit::test(auxv.hwcap2, 54), + f8e4m3: bit::test(auxv.hwcap2, 55), + f8e5m2: bit::test(auxv.hwcap2, 56), + smelutv2: bit::test(auxv.hwcap2, 57), + smef8f16: bit::test(auxv.hwcap2, 58), + smef8f32: bit::test(auxv.hwcap2, 59), + smesf8fma: 
bit::test(auxv.hwcap2, 60), + smesf8dp4: bit::test(auxv.hwcap2, 61), + smesf8dp2: bit::test(auxv.hwcap2, 62), + // pauthlr: bit::test(auxv.hwcap2, ??), + } + } +} + +impl AtHwcap { + /// Initializes the cache from the feature -bits. + /// + /// The feature dependencies here come directly from LLVM's feature definitions: + /// https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AArch64/AArch64.td + fn cache(self, is_exynos9810: bool) -> cache::Initializer { + let mut value = cache::Initializer::default(); + { + let mut enable_feature = |f, enable| { + if enable { + value.set(f as u32); + } + }; + + // Samsung Exynos 9810 has a bug that big and little cores have different + // ISAs. And on older Android (pre-9), the kernel incorrectly reports + // that features available only on some cores are available on all cores. + // So, only check features that are known to be available on exynos-m3: + // $ rustc --print cfg --target aarch64-linux-android -C target-cpu=exynos-m3 | grep target_feature + // See also https://github.com/rust-lang/stdarch/pull/1378#discussion_r1103748342. + if is_exynos9810 { + enable_feature(Feature::fp, self.fp); + enable_feature(Feature::crc, self.crc32); + // ASIMD support requires float support - if half-floats are + // supported, it also requires half-float support: + let asimd = self.fp && self.asimd && (!self.fphp | self.asimdhp); + enable_feature(Feature::asimd, asimd); + // Cryptographic extensions require ASIMD + // AES also covers FEAT_PMULL + enable_feature(Feature::aes, self.aes && self.pmull && asimd); + enable_feature(Feature::sha2, self.sha1 && self.sha2 && asimd); + return value; + } + + enable_feature(Feature::fp, self.fp); + // Half-float support requires float support + enable_feature(Feature::fp16, self.fp && self.fphp); + // FHM (fp16fml in LLVM) requires half float support + enable_feature(Feature::fhm, self.fphp && self.fhm); + enable_feature(Feature::pmull, self.pmull); + enable_feature(Feature::crc, self.crc32); + enable_feature(Feature::lse, self.atomics); + enable_feature(Feature::lse2, self.uscat); + enable_feature(Feature::lse128, self.lse128 && self.atomics); + enable_feature(Feature::rcpc, self.lrcpc); + // RCPC2 (rcpc-immo in LLVM) requires RCPC support + let rcpc2 = self.ilrcpc && self.lrcpc; + enable_feature(Feature::rcpc2, rcpc2); + enable_feature(Feature::rcpc3, self.lrcpc3 && rcpc2); + enable_feature(Feature::dit, self.dit); + enable_feature(Feature::flagm, self.flagm); + enable_feature(Feature::flagm2, self.flagm2); + enable_feature(Feature::ssbs, self.ssbs); + enable_feature(Feature::sb, self.sb); + enable_feature(Feature::paca, self.paca); + enable_feature(Feature::pacg, self.pacg); + // enable_feature(Feature::pauth_lr, self.pauthlr); + enable_feature(Feature::dpb, self.dcpop); + enable_feature(Feature::dpb2, self.dcpodp); + enable_feature(Feature::rand, self.rng); + enable_feature(Feature::bti, self.bti); + enable_feature(Feature::mte, self.mte); + // jsconv requires float support + enable_feature(Feature::jsconv, self.jscvt && self.fp); + enable_feature(Feature::rdm, self.asimdrdm); + enable_feature(Feature::dotprod, self.asimddp); + enable_feature(Feature::frintts, self.frint); + + // FEAT_I8MM & FEAT_BF16 also include optional SVE components which linux exposes + // separately. We ignore that distinction here. 
+ enable_feature(Feature::i8mm, self.i8mm); + enable_feature(Feature::bf16, self.bf16); + + // ASIMD support requires float support - if half-floats are + // supported, it also requires half-float support: + let asimd = self.fp && self.asimd && (!self.fphp | self.asimdhp); + enable_feature(Feature::asimd, asimd); + // ASIMD extensions require ASIMD support: + enable_feature(Feature::fcma, self.fcma && asimd); + enable_feature(Feature::sve, self.sve && asimd); + + // SVE extensions require SVE & ASIMD + enable_feature(Feature::f32mm, self.svef32mm && self.sve && asimd); + enable_feature(Feature::f64mm, self.svef64mm && self.sve && asimd); + + // Cryptographic extensions require ASIMD + enable_feature(Feature::aes, self.aes && asimd); + enable_feature(Feature::sha2, self.sha1 && self.sha2 && asimd); + // SHA512/SHA3 require SHA1 & SHA256 + enable_feature( + Feature::sha3, + self.sha512 && self.sha3 && self.sha1 && self.sha2 && asimd, + ); + enable_feature(Feature::sm4, self.sm3 && self.sm4 && asimd); + + // SVE2 requires SVE + let sve2 = self.sve2 && self.sve && asimd; + enable_feature(Feature::sve2, sve2); + enable_feature(Feature::sve2p1, self.sve2p1 && sve2); + // SVE2 extensions require SVE2 and crypto features + enable_feature( + Feature::sve2_aes, + self.sveaes && self.svepmull && sve2 && self.aes, + ); + enable_feature( + Feature::sve2_sm4, + self.svesm4 && sve2 && self.sm3 && self.sm4, + ); + enable_feature( + Feature::sve2_sha3, + self.svesha3 && sve2 && self.sha512 && self.sha3 && self.sha1 && self.sha2, + ); + enable_feature(Feature::sve2_bitperm, self.svebitperm && self.sve2); + enable_feature(Feature::sve_b16b16, self.bf16 && self.sveb16b16); + enable_feature(Feature::hbc, self.hbc); + enable_feature(Feature::mops, self.mops); + enable_feature(Feature::ecv, self.ecv); + enable_feature(Feature::lut, self.lut); + enable_feature(Feature::cssc, self.cssc); + enable_feature(Feature::fpmr, self.fpmr); + enable_feature(Feature::faminmax, self.faminmax); + let fp8 = self.f8cvt && self.faminmax && self.lut && self.bf16; + enable_feature(Feature::fp8, fp8); + let fp8fma = self.f8fma && fp8; + enable_feature(Feature::fp8fma, fp8fma); + let fp8dot4 = self.f8dp4 && fp8fma; + enable_feature(Feature::fp8dot4, fp8dot4); + enable_feature(Feature::fp8dot2, self.f8dp2 && fp8dot4); + enable_feature(Feature::wfxt, self.wfxt); + let sme = self.sme && self.bf16; + enable_feature(Feature::sme, sme); + enable_feature(Feature::sme_i16i64, self.smei16i64 && sme); + enable_feature(Feature::sme_f64f64, self.smef64f64 && sme); + enable_feature(Feature::sme_fa64, self.smefa64 && sme && sve2); + let sme2 = self.sme2 && sme; + enable_feature(Feature::sme2, sme2); + enable_feature(Feature::sme2p1, self.sme2p1 && sme2); + enable_feature( + Feature::sme_b16b16, + sme2 && self.bf16 && self.sveb16b16 && self.smeb16b16, + ); + enable_feature(Feature::sme_f16f16, self.smef16f16 && sme2); + enable_feature(Feature::sme_lutv2, self.smelutv2); + let sme_f8f32 = self.smef8f32 && sme2 && fp8; + enable_feature(Feature::sme_f8f32, sme_f8f32); + enable_feature(Feature::sme_f8f16, self.smef8f16 && sme_f8f32); + let ssve_fp8fma = self.smesf8fma && sme2 && fp8; + enable_feature(Feature::ssve_fp8fma, ssve_fp8fma); + let ssve_fp8dot4 = self.smesf8dp4 && ssve_fp8fma; + enable_feature(Feature::ssve_fp8dot4, ssve_fp8dot4); + enable_feature(Feature::ssve_fp8dot2, self.smesf8dp2 && ssve_fp8dot4); + } + value + } +} + +#[cfg(target_endian = "little")] +#[cfg(test)] +mod tests { + use super::*; + + #[cfg(feature = "std_detect_file_io")] + 
mod auxv_from_file { + use super::auxvec::auxv_from_file; + use super::*; + // The baseline hwcaps used in the (artificial) auxv test files. + fn baseline_hwcaps() -> AtHwcap { + AtHwcap { + fp: true, + asimd: true, + aes: true, + pmull: true, + sha1: true, + sha2: true, + crc32: true, + atomics: true, + fphp: true, + asimdhp: true, + asimdrdm: true, + lrcpc: true, + dcpop: true, + asimddp: true, + ssbs: true, + ..AtHwcap::default() + } + } + + #[test] + fn linux_empty_hwcap2_aarch64() { + let file = concat!( + env!("CARGO_MANIFEST_DIR"), + "/src/detect/test_data/linux-empty-hwcap2-aarch64.auxv" + ); + println!("file: {file}"); + let v = auxv_from_file(file).unwrap(); + println!("HWCAP : 0x{:0x}", v.hwcap); + println!("HWCAP2: 0x{:0x}", v.hwcap2); + assert_eq!(AtHwcap::from(v), baseline_hwcaps()); + } + #[test] + fn linux_no_hwcap2_aarch64() { + let file = concat!( + env!("CARGO_MANIFEST_DIR"), + "/src/detect/test_data/linux-no-hwcap2-aarch64.auxv" + ); + println!("file: {file}"); + let v = auxv_from_file(file).unwrap(); + println!("HWCAP : 0x{:0x}", v.hwcap); + println!("HWCAP2: 0x{:0x}", v.hwcap2); + assert_eq!(AtHwcap::from(v), baseline_hwcaps()); + } + #[test] + fn linux_hwcap2_aarch64() { + let file = concat!( + env!("CARGO_MANIFEST_DIR"), + "/src/detect/test_data/linux-hwcap2-aarch64.auxv" + ); + println!("file: {file}"); + let v = auxv_from_file(file).unwrap(); + println!("HWCAP : 0x{:0x}", v.hwcap); + println!("HWCAP2: 0x{:0x}", v.hwcap2); + assert_eq!( + AtHwcap::from(v), + AtHwcap { + // Some other HWCAP bits. + paca: true, + pacg: true, + // HWCAP2-only bits. + dcpodp: true, + frint: true, + rng: true, + bti: true, + mte: true, + ..baseline_hwcaps() + } + ); + } + } +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs new file mode 100644 index 0000000000000..bbb173227d07f --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs @@ -0,0 +1,34 @@ +//! Run-time feature detection for ARM on Linux. + +use super::auxvec; +use crate::detect::{Feature, bit, cache}; + +/// Try to read the features from the auxiliary vector. +pub(crate) fn detect_features() -> cache::Initializer { + let mut value = cache::Initializer::default(); + let enable_feature = |value: &mut cache::Initializer, f, enable| { + if enable { + value.set(f as u32); + } + }; + + // The values are part of the platform-specific [asm/hwcap.h][hwcap] + // + // [hwcap]: https://github.com/torvalds/linux/blob/master/arch/arm/include/uapi/asm/hwcap.h + if let Ok(auxv) = auxvec::auxv() { + enable_feature(&mut value, Feature::i8mm, bit::test(auxv.hwcap, 27)); + enable_feature(&mut value, Feature::dotprod, bit::test(auxv.hwcap, 24)); + enable_feature(&mut value, Feature::neon, bit::test(auxv.hwcap, 12)); + enable_feature(&mut value, Feature::pmull, bit::test(auxv.hwcap2, 1)); + enable_feature(&mut value, Feature::crc, bit::test(auxv.hwcap2, 4)); + enable_feature(&mut value, Feature::aes, bit::test(auxv.hwcap2, 0)); + // SHA2 requires SHA1 & SHA2 features + enable_feature( + &mut value, + Feature::sha2, + bit::test(auxv.hwcap2, 2) && bit::test(auxv.hwcap2, 3), + ); + return value; + } + value +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs new file mode 100644 index 0000000000000..c30379ff06554 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs @@ -0,0 +1,339 @@ +//! 
Parses ELF auxiliary vectors. +#![allow(dead_code)] + +pub(crate) const AT_NULL: usize = 0; + +/// Key to access the CPU Hardware capabilities bitfield. +pub(crate) const AT_HWCAP: usize = 16; +/// Key to access the CPU Hardware capabilities 2 bitfield. +#[cfg(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64", + target_arch = "s390x", +))] +pub(crate) const AT_HWCAP2: usize = 26; + +/// Cache HWCAP bitfields of the ELF Auxiliary Vector. +/// +/// If an entry cannot be read all the bits in the bitfield are set to zero. +/// This should be interpreted as all the features being disabled. +#[derive(Debug, Copy, Clone)] +#[cfg_attr(test, derive(PartialEq))] +pub(crate) struct AuxVec { + pub hwcap: usize, + #[cfg(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64", + target_arch = "s390x", + ))] + pub hwcap2: usize, +} + +/// ELF Auxiliary Vector +/// +/// The auxiliary vector is a memory region in a running ELF program's stack +/// composed of (key: usize, value: usize) pairs. +/// +/// The keys used in the aux vector are platform dependent. For Linux, they are +/// defined in [linux/auxvec.h][auxvec_h]. The hardware capabilities of a given +/// CPU can be queried with the `AT_HWCAP` and `AT_HWCAP2` keys. +/// +/// There is no perfect way of reading the auxiliary vector. +/// +/// - If the `std_detect_dlsym_getauxval` cargo feature is enabled, this will use +/// `getauxval` if its linked to the binary, and otherwise proceed to a fallback implementation. +/// When `std_detect_dlsym_getauxval` is disabled, this will assume that `getauxval` is +/// linked to the binary - if that is not the case the behavior is undefined. +/// - Otherwise, if the `std_detect_file_io` cargo feature is enabled, it will +/// try to read `/proc/self/auxv`. +/// - If that fails, this function returns an error. +/// +/// Note that run-time feature detection is not invoked for features that can +/// be detected at compile-time. +/// +/// Note: The `std_detect_dlsym_getauxval` cargo feature is ignored on +/// `*-linux-{gnu,musl,ohos}*` and `*-android*` targets because we can safely assume `getauxval` +/// is linked to the binary. +/// - `*-linux-gnu*` targets ([since Rust 1.64](https://blog.rust-lang.org/2022/08/01/Increasing-glibc-kernel-requirements.html)) +/// have glibc requirements higher than [glibc 2.16 that added `getauxval`](https://sourceware.org/legacy-ml/libc-announce/2012/msg00000.html). +/// - `*-linux-musl*` targets ([at least since Rust 1.15](https://github.com/rust-lang/rust/blob/1.15.0/src/ci/docker/x86_64-musl/build-musl.sh#L15)) +/// use musl newer than [musl 1.1.0 that added `getauxval`](https://git.musl-libc.org/cgit/musl/tree/WHATSNEW?h=v1.1.0#n1197) +/// - `*-linux-ohos*` targets use a [fork of musl 1.2](https://gitee.com/openharmony/docs/blob/master/en/application-dev/reference/native-lib/musl.md) +/// - `*-android*` targets ([since Rust 1.68](https://blog.rust-lang.org/2023/01/09/android-ndk-update-r25.html)) +/// have the minimum supported API level higher than [Android 4.3 (API level 18) that added `getauxval`](https://github.com/aosp-mirror/platform_bionic/blob/d3ebc2f7c49a9893b114124d4a6b315f3a328764/libc/include/sys/auxv.h#L49). +/// +/// For more information about when `getauxval` is available check the great +/// [`auxv` crate documentation][auxv_docs]. 
+/// +/// [auxvec_h]: https://github.com/torvalds/linux/blob/master/include/uapi/linux/auxvec.h +/// [auxv_docs]: https://docs.rs/auxv/0.3.3/auxv/ +pub(crate) fn auxv() -> Result { + // Try to call a getauxval function. + if let Ok(hwcap) = getauxval(AT_HWCAP) { + // Targets with only AT_HWCAP: + #[cfg(any( + target_arch = "riscv32", + target_arch = "riscv64", + target_arch = "mips", + target_arch = "mips64", + target_arch = "loongarch64", + ))] + { + // Zero could indicate that no features were detected, but it's also used to indicate + // an error. In either case, try the fallback. + if hwcap != 0 { + return Ok(AuxVec { hwcap }); + } + } + + // Targets with AT_HWCAP and AT_HWCAP2: + #[cfg(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64", + target_arch = "s390x", + ))] + { + if let Ok(hwcap2) = getauxval(AT_HWCAP2) { + // Zero could indicate that no features were detected, but it's also used to indicate + // an error. In particular, on many platforms AT_HWCAP2 will be legitimately zero, + // since it contains the most recent feature flags. Use the fallback only if no + // features were detected at all. + if hwcap != 0 || hwcap2 != 0 { + return Ok(AuxVec { hwcap, hwcap2 }); + } + } + } + + // Intentionnaly not used + let _ = hwcap; + } + + #[cfg(feature = "std_detect_file_io")] + { + // If calling getauxval fails, try to read the auxiliary vector from + // its file: + auxv_from_file("/proc/self/auxv") + } + #[cfg(not(feature = "std_detect_file_io"))] + { + Err(()) + } +} + +/// Tries to read the `key` from the auxiliary vector by calling the +/// `getauxval` function. If the function is not linked, this function return `Err`. +fn getauxval(key: usize) -> Result { + type F = unsafe extern "C" fn(libc::c_ulong) -> libc::c_ulong; + cfg_if::cfg_if! { + if #[cfg(all( + feature = "std_detect_dlsym_getauxval", + not(all( + target_os = "linux", + any(target_env = "gnu", target_env = "musl", target_env = "ohos"), + )), + not(target_os = "android"), + ))] { + let ffi_getauxval: F = unsafe { + let ptr = libc::dlsym(libc::RTLD_DEFAULT, c"getauxval".as_ptr()); + if ptr.is_null() { + return Err(()); + } + core::mem::transmute(ptr) + }; + } else { + let ffi_getauxval: F = libc::getauxval; + } + } + Ok(unsafe { ffi_getauxval(key as libc::c_ulong) as usize }) +} + +/// Tries to read the auxiliary vector from the `file`. If this fails, this +/// function returns `Err`. +#[cfg(feature = "std_detect_file_io")] +pub(super) fn auxv_from_file(file: &str) -> Result { + let file = super::read_file(file)?; + + // See . + // + // The auxiliary vector contains at most 34 (key,value) fields: from + // `AT_MINSIGSTKSZ` to `AT_NULL`, but its number may increase. + let len = file.len(); + let mut buf = alloc::vec![0_usize; 1 + len / core::mem::size_of::()]; + unsafe { + core::ptr::copy_nonoverlapping(file.as_ptr(), buf.as_mut_ptr() as *mut u8, len); + } + + auxv_from_buf(&buf) +} + +/// Tries to interpret the `buffer` as an auxiliary vector. If that fails, this +/// function returns `Err`. 
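+/// The buffer is scanned as `(key, value)` pairs and the scan stops at the `AT_NULL` entry.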
+#[cfg(feature = "std_detect_file_io")] +fn auxv_from_buf(buf: &[usize]) -> Result { + // Targets with only AT_HWCAP: + #[cfg(any( + target_arch = "riscv32", + target_arch = "riscv64", + target_arch = "mips", + target_arch = "mips64", + target_arch = "loongarch64", + ))] + { + for el in buf.chunks(2) { + match el[0] { + AT_NULL => break, + AT_HWCAP => return Ok(AuxVec { hwcap: el[1] }), + _ => (), + } + } + } + // Targets with AT_HWCAP and AT_HWCAP2: + #[cfg(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64", + target_arch = "s390x", + ))] + { + let mut hwcap = None; + // For some platforms, AT_HWCAP2 was added recently, so let it default to zero. + let mut hwcap2 = 0; + for el in buf.chunks(2) { + match el[0] { + AT_NULL => break, + AT_HWCAP => hwcap = Some(el[1]), + AT_HWCAP2 => hwcap2 = el[1], + _ => (), + } + } + + if let Some(hwcap) = hwcap { + return Ok(AuxVec { hwcap, hwcap2 }); + } + } + // Suppress unused variable + let _ = buf; + Err(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + // FIXME: on mips/mips64 getauxval returns 0, and /proc/self/auxv + // does not always contain the AT_HWCAP key under qemu. + #[cfg(any( + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64", + target_arch = "s390x", + ))] + #[test] + fn auxv_crate() { + let v = auxv(); + if let Ok(hwcap) = getauxval(AT_HWCAP) { + let rt_hwcap = v.expect("failed to find hwcap key").hwcap; + assert_eq!(rt_hwcap, hwcap); + } + + // Targets with AT_HWCAP and AT_HWCAP2: + #[cfg(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64", + target_arch = "s390x", + ))] + { + if let Ok(hwcap2) = getauxval(AT_HWCAP2) { + let rt_hwcap2 = v.expect("failed to find hwcap2 key").hwcap2; + assert_eq!(rt_hwcap2, hwcap2); + } + } + } + + #[test] + fn auxv_dump() { + if let Ok(auxvec) = auxv() { + println!("{:?}", auxvec); + } else { + println!("both getauxval() and reading /proc/self/auxv failed!"); + } + } + + #[cfg(feature = "std_detect_file_io")] + cfg_if::cfg_if! { + if #[cfg(target_arch = "arm")] { + #[test] + fn linux_rpi3() { + let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/linux-rpi3.auxv"); + println!("file: {file}"); + let v = auxv_from_file(file).unwrap(); + assert_eq!(v.hwcap, 4174038); + assert_eq!(v.hwcap2, 16); + } + + #[test] + fn linux_macos_vb() { + let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxv"); + println!("file: {file}"); + // The file contains HWCAP but not HWCAP2. In that case, we treat HWCAP2 as zero. + let v = auxv_from_file(file).unwrap(); + assert_eq!(v.hwcap, 126614527); + assert_eq!(v.hwcap2, 0); + } + } else if #[cfg(target_arch = "aarch64")] { + #[cfg(target_endian = "little")] + #[test] + fn linux_artificial_aarch64() { + let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/linux-artificial-aarch64.auxv"); + println!("file: {file}"); + let v = auxv_from_file(file).unwrap(); + assert_eq!(v.hwcap, 0x0123456789abcdef); + assert_eq!(v.hwcap2, 0x02468ace13579bdf); + } + #[cfg(target_endian = "little")] + #[test] + fn linux_no_hwcap2_aarch64() { + let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/linux-no-hwcap2-aarch64.auxv"); + println!("file: {file}"); + let v = auxv_from_file(file).unwrap(); + // An absent HWCAP2 is treated as zero, and does not prevent acceptance of HWCAP. 
+ assert_ne!(v.hwcap, 0); + assert_eq!(v.hwcap2, 0); + } + } + } + + #[test] + #[cfg(feature = "std_detect_file_io")] + fn auxv_dump_procfs() { + if let Ok(auxvec) = auxv_from_file("/proc/self/auxv") { + println!("{:?}", auxvec); + } else { + println!("reading /proc/self/auxv failed!"); + } + } + + #[cfg(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64", + target_arch = "s390x", + ))] + #[test] + #[cfg(feature = "std_detect_file_io")] + fn auxv_crate_procfs() { + if let Ok(procfs_auxv) = auxv_from_file("/proc/self/auxv") { + assert_eq!(auxv().unwrap(), procfs_auxv); + } + } +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/loongarch.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/loongarch.rs new file mode 100644 index 0000000000000..14cc7a7318354 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/linux/loongarch.rs @@ -0,0 +1,68 @@ +//! Run-time feature detection for LoongArch on Linux. + +use super::auxvec; +use crate::detect::{Feature, bit, cache}; +use core::arch::asm; + +/// Try to read the features from the auxiliary vector. +pub(crate) fn detect_features() -> cache::Initializer { + let mut value = cache::Initializer::default(); + let enable_feature = |value: &mut cache::Initializer, feature, enable| { + if enable { + value.set(feature as u32); + } + }; + + // The values are part of the platform-specific [cpucfg] + // + // [cpucfg]: LoongArch Reference Manual Volume 1: Basic Architecture v1.1 + let cpucfg2: usize; + unsafe { + asm!( + "cpucfg {}, {}", + out(reg) cpucfg2, in(reg) 2, + options(pure, nomem, preserves_flags, nostack) + ); + } + let cpucfg3: usize; + unsafe { + asm!( + "cpucfg {}, {}", + out(reg) cpucfg3, in(reg) 3, + options(pure, nomem, preserves_flags, nostack) + ); + } + enable_feature(&mut value, Feature::frecipe, bit::test(cpucfg2, 25)); + enable_feature(&mut value, Feature::div32, bit::test(cpucfg2, 26)); + enable_feature(&mut value, Feature::lam_bh, bit::test(cpucfg2, 27)); + enable_feature(&mut value, Feature::lamcas, bit::test(cpucfg2, 28)); + enable_feature(&mut value, Feature::scq, bit::test(cpucfg2, 30)); + enable_feature(&mut value, Feature::ld_seq_sa, bit::test(cpucfg3, 23)); + + // The values are part of the platform-specific [asm/hwcap.h][hwcap] + // + // [hwcap]: https://github.com/torvalds/linux/blob/master/arch/loongarch/include/uapi/asm/hwcap.h + if let Ok(auxv) = auxvec::auxv() { + enable_feature( + &mut value, + Feature::f, + bit::test(cpucfg2, 1) && bit::test(auxv.hwcap, 3), + ); + enable_feature( + &mut value, + Feature::d, + bit::test(cpucfg2, 2) && bit::test(auxv.hwcap, 3), + ); + enable_feature(&mut value, Feature::lsx, bit::test(auxv.hwcap, 4)); + enable_feature(&mut value, Feature::lasx, bit::test(auxv.hwcap, 5)); + enable_feature( + &mut value, + Feature::lbt, + bit::test(auxv.hwcap, 10) && bit::test(auxv.hwcap, 11) && bit::test(auxv.hwcap, 12), + ); + enable_feature(&mut value, Feature::lvz, bit::test(auxv.hwcap, 9)); + enable_feature(&mut value, Feature::ual, bit::test(auxv.hwcap, 2)); + return value; + } + value +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/mips.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/mips.rs new file mode 100644 index 0000000000000..0cfa8869887ee --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/linux/mips.rs @@ -0,0 +1,23 @@ +//! Run-time feature detection for MIPS on Linux. 
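+//! Only the `msa` feature is detected here, from bit 1 of `AT_HWCAP`.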
+
+use super::auxvec;
+use crate::detect::{Feature, bit, cache};
+
+/// Try to read the features from the auxiliary vector.
+pub(crate) fn detect_features() -> cache::Initializer {
+    let mut value = cache::Initializer::default();
+    let enable_feature = |value: &mut cache::Initializer, f, enable| {
+        if enable {
+            value.set(f as u32);
+        }
+    };
+
+    // The values are part of the platform-specific [asm/hwcap.h][hwcap]
+    //
+    // [hwcap]: https://github.com/torvalds/linux/blob/master/arch/mips/include/uapi/asm/hwcap.h
+    if let Ok(auxv) = auxvec::auxv() {
+        enable_feature(&mut value, Feature::msa, bit::test(auxv.hwcap, 1));
+        return value;
+    }
+    value
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/mod.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/mod.rs
new file mode 100644
index 0000000000000..8c689d0b1f0e8
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/mod.rs
@@ -0,0 +1,67 @@
+//! Run-time feature detection on Linux
+//!
+#[cfg(feature = "std_detect_file_io")]
+use alloc::vec::Vec;
+
+mod auxvec;
+
+#[cfg(feature = "std_detect_file_io")]
+fn read_file(path: &str) -> Result<Vec<u8>, ()> {
+    let mut path = Vec::from(path.as_bytes());
+    path.push(0);
+
+    unsafe {
+        let file = libc::open(path.as_ptr() as *const libc::c_char, libc::O_RDONLY);
+        if file == -1 {
+            return Err(());
+        }
+
+        let mut data = Vec::new();
+        loop {
+            data.reserve(4096);
+            let spare = data.spare_capacity_mut();
+            match libc::read(file, spare.as_mut_ptr() as *mut _, spare.len()) {
+                -1 => {
+                    libc::close(file);
+                    return Err(());
+                }
+                0 => break,
+                n => data.set_len(data.len() + n as usize),
+            }
+        }
+
+        libc::close(file);
+        Ok(data)
+    }
+}
+
+cfg_if::cfg_if! {
+    if #[cfg(target_arch = "aarch64")] {
+        mod aarch64;
+        pub(crate) use self::aarch64::detect_features;
+    } else if #[cfg(target_arch = "arm")] {
+        mod arm;
+        pub(crate) use self::arm::detect_features;
+    } else if #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] {
+        mod riscv;
+        pub(crate) use self::riscv::detect_features;
+    } else if #[cfg(any(target_arch = "mips", target_arch = "mips64"))] {
+        mod mips;
+        pub(crate) use self::mips::detect_features;
+    } else if #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))] {
+        mod powerpc;
+        pub(crate) use self::powerpc::detect_features;
+    } else if #[cfg(target_arch = "loongarch64")] {
+        mod loongarch;
+        pub(crate) use self::loongarch::detect_features;
+    } else if #[cfg(target_arch = "s390x")] {
+        mod s390x;
+        pub(crate) use self::s390x::detect_features;
+    } else {
+        use crate::detect::cache;
+        /// Performs run-time feature detection.
+        pub(crate) fn detect_features() -> cache::Initializer {
+            cache::Initializer::default()
+        }
+    }
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/powerpc.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/powerpc.rs
new file mode 100644
index 0000000000000..6a4f7e715d932
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/powerpc.rs
@@ -0,0 +1,35 @@
+//! Run-time feature detection for PowerPC on Linux.
+
+use super::auxvec;
+use crate::detect::{Feature, cache};
+
+/// Try to read the features from the auxiliary vector.
+pub(crate) fn detect_features() -> cache::Initializer { + let mut value = cache::Initializer::default(); + let enable_feature = |value: &mut cache::Initializer, f, enable| { + if enable { + value.set(f as u32); + } + }; + + // The values are part of the platform-specific [asm/cputable.h][cputable] + // + // [cputable]: https://github.com/torvalds/linux/blob/master/arch/powerpc/include/uapi/asm/cputable.h + if let Ok(auxv) = auxvec::auxv() { + // note: the PowerPC values are the mask to do the test (instead of the + // index of the bit to test like in ARM and Aarch64) + enable_feature(&mut value, Feature::altivec, auxv.hwcap & 0x10000000 != 0); + enable_feature(&mut value, Feature::vsx, auxv.hwcap & 0x00000080 != 0); + let power8_features = auxv.hwcap2 & 0x80000000 != 0; + enable_feature(&mut value, Feature::power8, power8_features); + enable_feature(&mut value, Feature::power8_altivec, power8_features); + enable_feature(&mut value, Feature::power8_crypto, power8_features); + enable_feature(&mut value, Feature::power8_vector, power8_features); + let power9_features = auxv.hwcap2 & 0x00800000 != 0; + enable_feature(&mut value, Feature::power9, power9_features); + enable_feature(&mut value, Feature::power9_altivec, power9_features); + enable_feature(&mut value, Feature::power9_vector, power9_features); + return value; + } + value +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/riscv.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/riscv.rs new file mode 100644 index 0000000000000..5506ff31fc792 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/linux/riscv.rs @@ -0,0 +1,330 @@ +//! Run-time feature detection for RISC-V on Linux. +//! +//! On RISC-V, detection using auxv only supports single-letter extensions. +//! So, we use riscv_hwprobe that supports multi-letter extensions if available. +//! + +use core::ptr; + +use super::super::riscv::imply_features; +use super::auxvec; +use crate::detect::{Feature, bit, cache}; + +// See +// for runtime status query constants. +const PR_RISCV_V_GET_CONTROL: libc::c_int = 70; +const PR_RISCV_V_VSTATE_CTRL_ON: libc::c_int = 2; +const PR_RISCV_V_VSTATE_CTRL_CUR_MASK: libc::c_int = 3; + +// See +// for riscv_hwprobe struct and hardware probing constants. 
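+//
+// Each probe is a key/value pair: the caller fills in the keys it is interested
+// in, the kernel writes back the matching values, and any key the kernel does
+// not recognize is overwritten with -1.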
+ +#[repr(C)] +struct riscv_hwprobe { + key: i64, + value: u64, +} + +#[allow(non_upper_case_globals)] +const __NR_riscv_hwprobe: libc::c_long = 258; + +const RISCV_HWPROBE_KEY_BASE_BEHAVIOR: i64 = 3; +const RISCV_HWPROBE_BASE_BEHAVIOR_IMA: u64 = 1 << 0; + +const RISCV_HWPROBE_KEY_IMA_EXT_0: i64 = 4; +const RISCV_HWPROBE_IMA_FD: u64 = 1 << 0; +const RISCV_HWPROBE_IMA_C: u64 = 1 << 1; +const RISCV_HWPROBE_IMA_V: u64 = 1 << 2; +const RISCV_HWPROBE_EXT_ZBA: u64 = 1 << 3; +const RISCV_HWPROBE_EXT_ZBB: u64 = 1 << 4; +const RISCV_HWPROBE_EXT_ZBS: u64 = 1 << 5; +const RISCV_HWPROBE_EXT_ZICBOZ: u64 = 1 << 6; +const RISCV_HWPROBE_EXT_ZBC: u64 = 1 << 7; +const RISCV_HWPROBE_EXT_ZBKB: u64 = 1 << 8; +const RISCV_HWPROBE_EXT_ZBKC: u64 = 1 << 9; +const RISCV_HWPROBE_EXT_ZBKX: u64 = 1 << 10; +const RISCV_HWPROBE_EXT_ZKND: u64 = 1 << 11; +const RISCV_HWPROBE_EXT_ZKNE: u64 = 1 << 12; +const RISCV_HWPROBE_EXT_ZKNH: u64 = 1 << 13; +const RISCV_HWPROBE_EXT_ZKSED: u64 = 1 << 14; +const RISCV_HWPROBE_EXT_ZKSH: u64 = 1 << 15; +const RISCV_HWPROBE_EXT_ZKT: u64 = 1 << 16; +const RISCV_HWPROBE_EXT_ZVBB: u64 = 1 << 17; +const RISCV_HWPROBE_EXT_ZVBC: u64 = 1 << 18; +const RISCV_HWPROBE_EXT_ZVKB: u64 = 1 << 19; +const RISCV_HWPROBE_EXT_ZVKG: u64 = 1 << 20; +const RISCV_HWPROBE_EXT_ZVKNED: u64 = 1 << 21; +const RISCV_HWPROBE_EXT_ZVKNHA: u64 = 1 << 22; +const RISCV_HWPROBE_EXT_ZVKNHB: u64 = 1 << 23; +const RISCV_HWPROBE_EXT_ZVKSED: u64 = 1 << 24; +const RISCV_HWPROBE_EXT_ZVKSH: u64 = 1 << 25; +const RISCV_HWPROBE_EXT_ZVKT: u64 = 1 << 26; +const RISCV_HWPROBE_EXT_ZFH: u64 = 1 << 27; +const RISCV_HWPROBE_EXT_ZFHMIN: u64 = 1 << 28; +const RISCV_HWPROBE_EXT_ZIHINTNTL: u64 = 1 << 29; +const RISCV_HWPROBE_EXT_ZVFH: u64 = 1 << 30; +const RISCV_HWPROBE_EXT_ZVFHMIN: u64 = 1 << 31; +const RISCV_HWPROBE_EXT_ZFA: u64 = 1 << 32; +const RISCV_HWPROBE_EXT_ZTSO: u64 = 1 << 33; +const RISCV_HWPROBE_EXT_ZACAS: u64 = 1 << 34; +const RISCV_HWPROBE_EXT_ZICOND: u64 = 1 << 35; +const RISCV_HWPROBE_EXT_ZIHINTPAUSE: u64 = 1 << 36; +const RISCV_HWPROBE_EXT_ZVE32X: u64 = 1 << 37; +const RISCV_HWPROBE_EXT_ZVE32F: u64 = 1 << 38; +const RISCV_HWPROBE_EXT_ZVE64X: u64 = 1 << 39; +const RISCV_HWPROBE_EXT_ZVE64F: u64 = 1 << 40; +const RISCV_HWPROBE_EXT_ZVE64D: u64 = 1 << 41; +const RISCV_HWPROBE_EXT_ZIMOP: u64 = 1 << 42; +const RISCV_HWPROBE_EXT_ZCA: u64 = 1 << 43; +const RISCV_HWPROBE_EXT_ZCB: u64 = 1 << 44; +const RISCV_HWPROBE_EXT_ZCD: u64 = 1 << 45; +const RISCV_HWPROBE_EXT_ZCF: u64 = 1 << 46; +const RISCV_HWPROBE_EXT_ZCMOP: u64 = 1 << 47; +const RISCV_HWPROBE_EXT_ZAWRS: u64 = 1 << 48; +// Excluded because it only reports the existence of `prctl`-based pointer masking control. 
+// const RISCV_HWPROBE_EXT_SUPM: u64 = 1 << 49; +const RISCV_HWPROBE_EXT_ZICNTR: u64 = 1 << 50; +const RISCV_HWPROBE_EXT_ZIHPM: u64 = 1 << 51; +const RISCV_HWPROBE_EXT_ZFBFMIN: u64 = 1 << 52; +const RISCV_HWPROBE_EXT_ZVFBFMIN: u64 = 1 << 53; +const RISCV_HWPROBE_EXT_ZVFBFWMA: u64 = 1 << 54; +const RISCV_HWPROBE_EXT_ZICBOM: u64 = 1 << 55; +const RISCV_HWPROBE_EXT_ZAAMO: u64 = 1 << 56; +const RISCV_HWPROBE_EXT_ZALRSC: u64 = 1 << 57; + +const RISCV_HWPROBE_KEY_CPUPERF_0: i64 = 5; +const RISCV_HWPROBE_MISALIGNED_FAST: u64 = 3; +const RISCV_HWPROBE_MISALIGNED_MASK: u64 = 7; + +const RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF: i64 = 9; +const RISCV_HWPROBE_MISALIGNED_SCALAR_FAST: u64 = 3; + +const RISCV_HWPROBE_KEY_MISALIGNED_VECTOR_PERF: i64 = 10; +const RISCV_HWPROBE_MISALIGNED_VECTOR_FAST: u64 = 3; + +// syscall returns an unsupported error if riscv_hwprobe is not supported, +// so we can safely use this function on older versions of Linux. +fn _riscv_hwprobe(out: &mut [riscv_hwprobe]) -> bool { + unsafe fn __riscv_hwprobe( + pairs: *mut riscv_hwprobe, + pair_count: libc::size_t, + cpu_set_size: libc::size_t, + cpus: *mut libc::c_ulong, + flags: libc::c_uint, + ) -> libc::c_long { + unsafe { + libc::syscall( + __NR_riscv_hwprobe, + pairs, + pair_count, + cpu_set_size, + cpus, + flags, + ) + } + } + + let len = out.len(); + unsafe { __riscv_hwprobe(out.as_mut_ptr(), len, 0, ptr::null_mut(), 0) == 0 } +} + +/// Read list of supported features from (1) the auxiliary vector +/// and (2) the results of `riscv_hwprobe` and `prctl` system calls. +pub(crate) fn detect_features() -> cache::Initializer { + let mut value = cache::Initializer::default(); + let mut enable_feature = |feature, enable| { + if enable { + value.set(feature as u32); + } + }; + + // Use auxiliary vector to enable single-letter ISA extensions. + // The values are part of the platform-specific [asm/hwcap.h][hwcap] + // + // [hwcap]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/riscv/include/uapi/asm/hwcap.h?h=v6.15 + let auxv = auxvec::auxv().expect("read auxvec"); // should not fail on RISC-V platform + let mut has_i = bit::test(auxv.hwcap, (b'i' - b'a').into()); + #[allow(clippy::eq_op)] + enable_feature(Feature::a, bit::test(auxv.hwcap, (b'a' - b'a').into())); + enable_feature(Feature::c, bit::test(auxv.hwcap, (b'c' - b'a').into())); + enable_feature(Feature::d, bit::test(auxv.hwcap, (b'd' - b'a').into())); + enable_feature(Feature::f, bit::test(auxv.hwcap, (b'f' - b'a').into())); + enable_feature(Feature::m, bit::test(auxv.hwcap, (b'm' - b'a').into())); + let has_v = bit::test(auxv.hwcap, (b'v' - b'a').into()); + let mut is_v_set = false; + + // Use riscv_hwprobe syscall to query more extensions and + // performance-related capabilities. + 'hwprobe: { + let mut out = [ + riscv_hwprobe { + key: RISCV_HWPROBE_KEY_BASE_BEHAVIOR, + value: 0, + }, + riscv_hwprobe { + key: RISCV_HWPROBE_KEY_IMA_EXT_0, + value: 0, + }, + riscv_hwprobe { + key: RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF, + value: 0, + }, + riscv_hwprobe { + key: RISCV_HWPROBE_KEY_MISALIGNED_VECTOR_PERF, + value: 0, + }, + riscv_hwprobe { + key: RISCV_HWPROBE_KEY_CPUPERF_0, + value: 0, + }, + ]; + if !_riscv_hwprobe(&mut out) { + break 'hwprobe; + } + + // Query scalar/vector misaligned behavior. 
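+        // A key of -1 in the result means the kernel did not recognize that probe
+        // key; if the newer MISALIGNED_SCALAR_PERF key is unavailable, fall back
+        // to the older CPUPERF_0 interface below.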
+ if out[2].key != -1 { + enable_feature( + Feature::unaligned_scalar_mem, + out[2].value == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST, + ); + } else if out[4].key != -1 { + // Deprecated method for fallback + enable_feature( + Feature::unaligned_scalar_mem, + out[4].value & RISCV_HWPROBE_MISALIGNED_MASK == RISCV_HWPROBE_MISALIGNED_FAST, + ); + } + if out[3].key != -1 { + enable_feature( + Feature::unaligned_vector_mem, + out[3].value == RISCV_HWPROBE_MISALIGNED_VECTOR_FAST, + ); + } + + // Query whether "I" base and extensions "M" and "A" (as in the ISA + // manual version 2.2) are enabled. "I" base at that time corresponds + // to "I", "Zicsr", "Zicntr" and "Zifencei" (as in the ISA manual version + // 20240411). + // This is a current requirement of + // `RISCV_HWPROBE_KEY_IMA_EXT_0`-based tests. + let has_ima = (out[0].key != -1) && (out[0].value & RISCV_HWPROBE_BASE_BEHAVIOR_IMA != 0); + if !has_ima { + break 'hwprobe; + } + has_i |= has_ima; + enable_feature(Feature::zicsr, has_ima); + enable_feature(Feature::zicntr, has_ima); + enable_feature(Feature::zifencei, has_ima); + enable_feature(Feature::m, has_ima); + enable_feature(Feature::a, has_ima); + + // Enable features based on `RISCV_HWPROBE_KEY_IMA_EXT_0`. + if out[1].key == -1 { + break 'hwprobe; + } + let ima_ext_0 = out[1].value; + let test = |mask| (ima_ext_0 & mask) != 0; + + enable_feature(Feature::d, test(RISCV_HWPROBE_IMA_FD)); // F is implied. + enable_feature(Feature::c, test(RISCV_HWPROBE_IMA_C)); + + enable_feature(Feature::zicntr, test(RISCV_HWPROBE_EXT_ZICNTR)); + enable_feature(Feature::zihpm, test(RISCV_HWPROBE_EXT_ZIHPM)); + + enable_feature(Feature::zihintntl, test(RISCV_HWPROBE_EXT_ZIHINTNTL)); + enable_feature(Feature::zihintpause, test(RISCV_HWPROBE_EXT_ZIHINTPAUSE)); + enable_feature(Feature::zimop, test(RISCV_HWPROBE_EXT_ZIMOP)); + enable_feature(Feature::zicbom, test(RISCV_HWPROBE_EXT_ZICBOM)); + enable_feature(Feature::zicboz, test(RISCV_HWPROBE_EXT_ZICBOZ)); + enable_feature(Feature::zicond, test(RISCV_HWPROBE_EXT_ZICOND)); + + enable_feature(Feature::zalrsc, test(RISCV_HWPROBE_EXT_ZALRSC)); + enable_feature(Feature::zaamo, test(RISCV_HWPROBE_EXT_ZAAMO)); + enable_feature(Feature::zawrs, test(RISCV_HWPROBE_EXT_ZAWRS)); + enable_feature(Feature::zacas, test(RISCV_HWPROBE_EXT_ZACAS)); + enable_feature(Feature::ztso, test(RISCV_HWPROBE_EXT_ZTSO)); + + enable_feature(Feature::zba, test(RISCV_HWPROBE_EXT_ZBA)); + enable_feature(Feature::zbb, test(RISCV_HWPROBE_EXT_ZBB)); + enable_feature(Feature::zbs, test(RISCV_HWPROBE_EXT_ZBS)); + enable_feature(Feature::zbc, test(RISCV_HWPROBE_EXT_ZBC)); + + enable_feature(Feature::zbkb, test(RISCV_HWPROBE_EXT_ZBKB)); + enable_feature(Feature::zbkc, test(RISCV_HWPROBE_EXT_ZBKC)); + enable_feature(Feature::zbkx, test(RISCV_HWPROBE_EXT_ZBKX)); + enable_feature(Feature::zknd, test(RISCV_HWPROBE_EXT_ZKND)); + enable_feature(Feature::zkne, test(RISCV_HWPROBE_EXT_ZKNE)); + enable_feature(Feature::zknh, test(RISCV_HWPROBE_EXT_ZKNH)); + enable_feature(Feature::zksed, test(RISCV_HWPROBE_EXT_ZKSED)); + enable_feature(Feature::zksh, test(RISCV_HWPROBE_EXT_ZKSH)); + enable_feature(Feature::zkt, test(RISCV_HWPROBE_EXT_ZKT)); + + enable_feature(Feature::zcmop, test(RISCV_HWPROBE_EXT_ZCMOP)); + enable_feature(Feature::zca, test(RISCV_HWPROBE_EXT_ZCA)); + enable_feature(Feature::zcf, test(RISCV_HWPROBE_EXT_ZCF)); + enable_feature(Feature::zcd, test(RISCV_HWPROBE_EXT_ZCD)); + enable_feature(Feature::zcb, test(RISCV_HWPROBE_EXT_ZCB)); + + enable_feature(Feature::zfh, 
test(RISCV_HWPROBE_EXT_ZFH)); + enable_feature(Feature::zfhmin, test(RISCV_HWPROBE_EXT_ZFHMIN)); + enable_feature(Feature::zfa, test(RISCV_HWPROBE_EXT_ZFA)); + enable_feature(Feature::zfbfmin, test(RISCV_HWPROBE_EXT_ZFBFMIN)); + + // Use prctl (if any) to determine whether the vector extension + // is enabled on the current thread (assuming the entire process + // share the same status). If prctl fails (e.g. QEMU userland emulator + // as of version 9.2.3), use auxiliary vector to retrieve the default + // vector status on the process startup. + let has_vectors = { + let v_status = unsafe { libc::prctl(PR_RISCV_V_GET_CONTROL) }; + if v_status >= 0 { + (v_status & PR_RISCV_V_VSTATE_CTRL_CUR_MASK) == PR_RISCV_V_VSTATE_CTRL_ON + } else { + has_v + } + }; + if has_vectors { + enable_feature(Feature::v, test(RISCV_HWPROBE_IMA_V)); + enable_feature(Feature::zve32x, test(RISCV_HWPROBE_EXT_ZVE32X)); + enable_feature(Feature::zve32f, test(RISCV_HWPROBE_EXT_ZVE32F)); + enable_feature(Feature::zve64x, test(RISCV_HWPROBE_EXT_ZVE64X)); + enable_feature(Feature::zve64f, test(RISCV_HWPROBE_EXT_ZVE64F)); + enable_feature(Feature::zve64d, test(RISCV_HWPROBE_EXT_ZVE64D)); + + enable_feature(Feature::zvbb, test(RISCV_HWPROBE_EXT_ZVBB)); + enable_feature(Feature::zvbc, test(RISCV_HWPROBE_EXT_ZVBC)); + enable_feature(Feature::zvkb, test(RISCV_HWPROBE_EXT_ZVKB)); + enable_feature(Feature::zvkg, test(RISCV_HWPROBE_EXT_ZVKG)); + enable_feature(Feature::zvkned, test(RISCV_HWPROBE_EXT_ZVKNED)); + enable_feature(Feature::zvknha, test(RISCV_HWPROBE_EXT_ZVKNHA)); + enable_feature(Feature::zvknhb, test(RISCV_HWPROBE_EXT_ZVKNHB)); + enable_feature(Feature::zvksed, test(RISCV_HWPROBE_EXT_ZVKSED)); + enable_feature(Feature::zvksh, test(RISCV_HWPROBE_EXT_ZVKSH)); + enable_feature(Feature::zvkt, test(RISCV_HWPROBE_EXT_ZVKT)); + + enable_feature(Feature::zvfh, test(RISCV_HWPROBE_EXT_ZVFH)); + enable_feature(Feature::zvfhmin, test(RISCV_HWPROBE_EXT_ZVFHMIN)); + enable_feature(Feature::zvfbfmin, test(RISCV_HWPROBE_EXT_ZVFBFMIN)); + enable_feature(Feature::zvfbfwma, test(RISCV_HWPROBE_EXT_ZVFBFWMA)); + } + is_v_set = true; + }; + + // Set V purely depending on the auxiliary vector + // only if no fine-grained vector extension detection is available. + if !is_v_set { + enable_feature(Feature::v, has_v); + } + + // Handle base ISA. + // If future RV128I is supported, implement with `enable_feature` here. + // Note that we should use `target_arch` instead of `target_pointer_width` + // to avoid misdetection caused by experimental ABIs such as RV64ILP32. + #[cfg(target_arch = "riscv64")] + enable_feature(Feature::rv64i, has_i); + #[cfg(target_arch = "riscv32")] + enable_feature(Feature::rv32i, has_i); + + imply_features(value) +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/s390x.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/s390x.rs new file mode 100644 index 0000000000000..5cc35fd3025fa --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/linux/s390x.rs @@ -0,0 +1,132 @@ +//! Run-time feature detection for s390x on Linux. 
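+//! The HWCAP bits are first gathered into an `AtHwcap` struct and then mapped to
+//! stdarch features, so the kernel bit numbering is kept in one place.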
+
+use super::auxvec;
+use crate::detect::{Feature, bit, cache};
+
+/// Try to read the features from the auxiliary vector.
+pub(crate) fn detect_features() -> cache::Initializer {
+    if let Ok(auxv) = auxvec::auxv() {
+        let hwcap: AtHwcap = auxv.into();
+        return hwcap.cache();
+    }
+
+    cache::Initializer::default()
+}
+
+/// These values are part of the platform-specific [asm/elf.h][kernel], and are a selection of the
+/// fields found in the [Facility Indications].
+///
+/// [Facility Indications]: https://www.ibm.com/support/pages/sites/default/files/2021-05/SA22-7871-10.pdf#page=63
+/// [kernel]: https://github.com/torvalds/linux/blob/b62cef9a5c673f1b8083159f5dc03c1c5daced2f/arch/s390/include/asm/elf.h#L129
+#[derive(Debug, Default, PartialEq)]
+struct AtHwcap {
+    esan3: bool,
+    zarch: bool,
+    stfle: bool,
+    msa: bool,
+    ldisp: bool,
+    eimm: bool,
+    dfp: bool,
+    hpage: bool,
+    etf3eh: bool,
+    high_gprs: bool,
+    te: bool,
+    vxrs: bool,
+    vxrs_bcd: bool,
+    vxrs_ext: bool,
+    gs: bool,
+    vxrs_ext2: bool,
+    vxrs_pde: bool,
+    sort: bool,
+    dflt: bool,
+    vxrs_pde2: bool,
+    nnpa: bool,
+    pci_mio: bool,
+    sie: bool,
+}
+
+impl From<auxvec::AuxVec> for AtHwcap {
+    /// Reads AtHwcap from the auxiliary vector.
+    fn from(auxv: auxvec::AuxVec) -> Self {
+        AtHwcap {
+            esan3: bit::test(auxv.hwcap, 0),
+            zarch: bit::test(auxv.hwcap, 1),
+            stfle: bit::test(auxv.hwcap, 2),
+            msa: bit::test(auxv.hwcap, 3),
+            ldisp: bit::test(auxv.hwcap, 4),
+            eimm: bit::test(auxv.hwcap, 5),
+            dfp: bit::test(auxv.hwcap, 6),
+            hpage: bit::test(auxv.hwcap, 7),
+            etf3eh: bit::test(auxv.hwcap, 8),
+            high_gprs: bit::test(auxv.hwcap, 9),
+            te: bit::test(auxv.hwcap, 10),
+            vxrs: bit::test(auxv.hwcap, 11),
+            vxrs_bcd: bit::test(auxv.hwcap, 12),
+            vxrs_ext: bit::test(auxv.hwcap, 13),
+            gs: bit::test(auxv.hwcap, 14),
+            vxrs_ext2: bit::test(auxv.hwcap, 15),
+            vxrs_pde: bit::test(auxv.hwcap, 16),
+            sort: bit::test(auxv.hwcap, 17),
+            dflt: bit::test(auxv.hwcap, 18),
+            vxrs_pde2: bit::test(auxv.hwcap, 19),
+            nnpa: bit::test(auxv.hwcap, 20),
+            pci_mio: bit::test(auxv.hwcap, 21),
+            sie: bit::test(auxv.hwcap, 22),
+        }
+    }
+}
+
+impl AtHwcap {
+    /// Initializes the cache from the feature bits.
+ fn cache(self) -> cache::Initializer { + let mut value = cache::Initializer::default(); + { + let mut enable_feature = |f, enable| { + if enable { + value.set(f as u32); + } + }; + + // vector and related + + // bit 129 of the extended facility list + enable_feature(Feature::vector, self.vxrs); + + // bit 135 of the extended facility list + enable_feature(Feature::vector_enhancements_1, self.vxrs_ext); + + // bit 148 of the extended facility list + enable_feature(Feature::vector_enhancements_2, self.vxrs_ext2); + + // bit 134 of the extended facility list + enable_feature(Feature::vector_packed_decimal, self.vxrs_bcd); + + // bit 152 of the extended facility list + enable_feature(Feature::vector_packed_decimal_enhancement, self.vxrs_pde); + + // bit 192 of the extended facility list + enable_feature(Feature::vector_packed_decimal_enhancement_2, self.vxrs_pde2); + + // bit 165 of the extended facility list + enable_feature(Feature::nnp_assist, self.nnpa); + + // others + + // bit 45 of the extended facility list + enable_feature(Feature::high_word, self.high_gprs); + + // bit 73 of the extended facility list + enable_feature(Feature::transactional_execution, self.te); + + // bit 133 of the extended facility list + enable_feature(Feature::guarded_storage, self.gs); + + // bit 150 of the extended facility list + enable_feature(Feature::enhanced_sort, self.sort); + + // bit 151 of the extended facility list + enable_feature(Feature::deflate_conversion, self.dflt); + } + value + } +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/openbsd/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/os/openbsd/aarch64.rs new file mode 100644 index 0000000000000..cfe4ad10ad643 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/openbsd/aarch64.rs @@ -0,0 +1,55 @@ +//! Run-time feature detection for Aarch64 on OpenBSD. +//! +//! OpenBSD doesn't trap the mrs instruction, but exposes the system registers through sysctl. +//! https://github.com/openbsd/src/commit/d335af936b9d7dd9cf655cae1ce19560c45de6c8 +//! https://github.com/golang/go/commit/cd54ef1f61945459486e9eea2f016d99ef1da925 + +use crate::detect::cache; +use core::{mem::MaybeUninit, ptr}; + +// Defined in machine/cpu.h. +// https://github.com/openbsd/src/blob/72ccc03bd11da614f31f7ff76e3f6fce99bc1c79/sys/arch/arm64/include/cpu.h#L25-L40 +const CPU_ID_AA64ISAR0: libc::c_int = 2; +const CPU_ID_AA64ISAR1: libc::c_int = 3; +const CPU_ID_AA64MMFR2: libc::c_int = 7; +const CPU_ID_AA64PFR0: libc::c_int = 8; + +/// Try to read the features from the system registers. +pub(crate) fn detect_features() -> cache::Initializer { + // ID_AA64ISAR0_EL1 and ID_AA64ISAR1_EL1 are supported on OpenBSD 7.1+. + // https://github.com/openbsd/src/commit/d335af936b9d7dd9cf655cae1ce19560c45de6c8 + // Others are supported on OpenBSD 7.3+. + // https://github.com/openbsd/src/commit/c7654cd65262d532212f65123ee3905ba200365c + // sysctl returns an unsupported error if operation is not supported, + // so we can safely use this function on older versions of OpenBSD. + let aa64isar0 = sysctl64(&[libc::CTL_MACHDEP, CPU_ID_AA64ISAR0]).unwrap_or(0); + let aa64isar1 = sysctl64(&[libc::CTL_MACHDEP, CPU_ID_AA64ISAR1]).unwrap_or(0); + let aa64mmfr2 = sysctl64(&[libc::CTL_MACHDEP, CPU_ID_AA64MMFR2]).unwrap_or(0); + // Do not use unwrap_or(0) because in fp and asimd fields, 0 indicates that + // the feature is available. 
+    let aa64pfr0 = sysctl64(&[libc::CTL_MACHDEP, CPU_ID_AA64PFR0]);
+
+    super::aarch64::parse_system_registers(aa64isar0, aa64isar1, aa64mmfr2, aa64pfr0)
+}
+
+#[inline]
+fn sysctl64(mib: &[libc::c_int]) -> Option<u64> {
+    const OUT_LEN: libc::size_t = core::mem::size_of::<u64>();
+    let mut out = MaybeUninit::<u64>::uninit();
+    let mut out_len = OUT_LEN;
+    let res = unsafe {
+        libc::sysctl(
+            mib.as_ptr(),
+            mib.len() as libc::c_uint,
+            out.as_mut_ptr() as *mut libc::c_void,
+            &mut out_len,
+            ptr::null_mut(),
+            0,
+        )
+    };
+    if res == -1 || out_len != OUT_LEN {
+        return None;
+    }
+    // SAFETY: we've checked that sysctl was successful and `out` was filled.
+    Some(unsafe { out.assume_init() })
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/other.rs b/library/stdarch/crates/std_detect/src/detect/os/other.rs
new file mode 100644
index 0000000000000..091fafc4ebf4d
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/other.rs
@@ -0,0 +1,8 @@
+//! Other operating systems
+
+use crate::detect::cache;
+
+#[allow(dead_code)]
+pub(crate) fn detect_features() -> cache::Initializer {
+    cache::Initializer::default()
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/riscv.rs b/library/stdarch/crates/std_detect/src/detect/os/riscv.rs
new file mode 100644
index 0000000000000..4c59ede80293e
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/riscv.rs
@@ -0,0 +1,203 @@
+//! Run-time feature detection utility for RISC-V.
+//!
+//! On RISC-V, full feature detection needs the help of one or more
+//! feature detection mechanisms (usually provided by the operating system).
+//!
+//! The RISC-V architecture defines many extensions, and some depend on others.
+//! More importantly, some of them cannot be enabled without resolving such
+//! dependencies due to the limited set of features that such mechanisms provide.
+//!
+//! This module provides an OS-independent utility to process such relations
+//! between RISC-V extensions.
+
+use crate::detect::{Feature, cache};
+
+/// Imply features by the given set of enabled features.
+///
+/// Note that it does not perform any consistency checks, including the existence of
+/// conflicting extensions and/or complicated requirements. Eliminating such
+/// inconsistencies is the responsibility of the feature detection logic and
+/// its provider(s).
+pub(crate) fn imply_features(mut value: cache::Initializer) -> cache::Initializer {
+    loop {
+        // Check convergence of the feature flags later.
+        let prev = value;
+
+        // Expect that the optimizer turns repeated operations into
+        // a smaller number of bit-manipulation operations.
+        macro_rules! imply {
+            // Regular implication:
+            // A1 => (B1[, B2...]), A2 => (B1[, B2...]) and so on.
+            ($($from: ident)|+ => $($to: ident)&+) => {
+                if [$(Feature::$from as u32),+].iter().any(|&x| value.test(x)) {
+                    $(
+                        value.set(Feature::$to as u32);
+                    )+
+                }
+            };
+            // Implication with multiple requirements:
+            // A1 && A2 ... => (B1[, B2...]).
+            ($($from: ident)&+ => $($to: ident)&+) => {
+                if [$(Feature::$from as u32),+].iter().all(|&x| value.test(x)) {
+                    $(
+                        value.set(Feature::$to as u32);
+                    )+
+                }
+            };
+        }
+        macro_rules! group {
+            ($group: ident == $($member: ident)&+) => {
+                // Forward implication as defined in the specifications.
+                imply!($group => $($member)&+);
+                // Reverse implication to the "group extension" from its members.
+                // This is not part of the specifications but is convenient for
+                // feature detection and is implemented in e.g. LLVM.
+ imply!($($member)&+ => $group); + }; + } + + /* + If a dependency/implication is not explicitly stated in the + specification, it is denoted as a comment as follows: + "defined as subset": + The latter extension is described as a subset of the former + (but the evidence is weak). + "functional": + The former extension is functionally a superset of the latter + (no direct references though). + */ + + imply!(zvbb => zvkb); + + // Certain set of vector cryptography extensions form a group. + group!(zvkn == zvkned & zvknhb & zvkb & zvkt); + group!(zvknc == zvkn & zvbc); + group!(zvkng == zvkn & zvkg); + group!(zvks == zvksed & zvksh & zvkb & zvkt); + group!(zvksc == zvks & zvbc); + group!(zvksg == zvks & zvkg); + + imply!(zvknhb => zvknha); // functional + + // For vector cryptography, Zvknhb and Zvbc require integer arithmetic + // with EEW=64 (Zve64x) while others not depending on them + // require EEW=32 (Zve32x). + imply!(zvknhb | zvbc => zve64x); + imply!(zvbb | zvkb | zvkg | zvkned | zvknha | zvksed | zvksh => zve32x); + + imply!(zbc => zbkc); // defined as subset + group!(zkn == zbkb & zbkc & zbkx & zkne & zknd & zknh); + group!(zks == zbkb & zbkc & zbkx & zksed & zksh); + group!(zk == zkn & zkr & zkt); + + imply!(zacas => zaamo); + group!(a == zalrsc & zaamo); + + group!(b == zba & zbb & zbs); + + imply!(zcf => zca & f); + imply!(zcd => zca & d); + imply!(zcmop | zcb => zca); + + imply!(zhinx => zhinxmin); + imply!(zdinx | zhinxmin => zfinx); + + imply!(zvfh => zvfhmin); // functional + imply!(zvfh => zve32f & zfhmin); + imply!(zvfhmin => zve32f); + imply!(zvfbfwma => zvfbfmin & zfbfmin); + imply!(zvfbfmin => zve32f); + + imply!(v => zve64d); + imply!(zve64d => zve64f & d); + imply!(zve64f => zve64x & zve32f); + imply!(zve64x => zve32x); + imply!(zve32f => zve32x & f); + + imply!(zfh => zfhmin); + imply!(q => d); + imply!(d | zfhmin | zfa => f); + imply!(zfbfmin => f); // and some of (not all) "Zfh" instructions. + + // Relatively complex implication rules from the "C" extension. + imply!(c => zca); + imply!(c & d => zcd); + #[cfg(target_arch = "riscv32")] + imply!(c & f => zcf); + + imply!(zicntr | zihpm | f | zfinx | zve32x => zicsr); + + // Loop until the feature flags converge. 
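+        // `imply!` and `group!` only ever set bits, so `value` grows monotonically
+        // and the loop terminates as soon as one pass adds nothing new.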
+ if prev == value { + return value; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn simple_direct() { + let mut value = cache::Initializer::default(); + value.set(Feature::f as u32); + // F (and other extensions with CSRs) -> Zicsr + assert!(imply_features(value).test(Feature::zicsr as u32)); + } + + #[test] + fn simple_indirect() { + let mut value = cache::Initializer::default(); + value.set(Feature::q as u32); + // Q -> D, D -> F, F -> Zicsr + assert!(imply_features(value).test(Feature::zicsr as u32)); + } + + #[test] + fn complex_zcd() { + let mut value = cache::Initializer::default(); + // C & D -> Zcd + value.set(Feature::c as u32); + assert!(!imply_features(value).test(Feature::zcd as u32)); + value.set(Feature::d as u32); + assert!(imply_features(value).test(Feature::zcd as u32)); + } + + #[test] + fn group_simple_forward() { + let mut value = cache::Initializer::default(); + // A -> Zalrsc & Zaamo (forward implication) + value.set(Feature::a as u32); + let value = imply_features(value); + assert!(value.test(Feature::zalrsc as u32)); + assert!(value.test(Feature::zaamo as u32)); + } + + #[test] + fn group_simple_backward() { + let mut value = cache::Initializer::default(); + // Zalrsc & Zaamo -> A (reverse implication) + value.set(Feature::zalrsc as u32); + value.set(Feature::zaamo as u32); + assert!(imply_features(value).test(Feature::a as u32)); + } + + #[test] + fn group_complex_convergence() { + let mut value = cache::Initializer::default(); + // Needs 3 iterations to converge + // (and 4th iteration for convergence checking): + // 1. [Zvksc] -> Zvks & Zvbc + // 2. Zvks -> Zvksed & Zvksh & Zvkb & Zvkt + // 3a. [Zvkned] & [Zvknhb] & [Zvkb] & Zvkt -> {Zvkn} + // 3b. Zvkn & Zvbc -> {Zvknc} + value.set(Feature::zvksc as u32); + value.set(Feature::zvkned as u32); + value.set(Feature::zvknhb as u32); + value.set(Feature::zvkb as u32); + let value = imply_features(value); + assert!(value.test(Feature::zvkn as u32)); + assert!(value.test(Feature::zvknc as u32)); + } +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/windows/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/os/windows/aarch64.rs new file mode 100644 index 0000000000000..937f9f26eedc1 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/windows/aarch64.rs @@ -0,0 +1,125 @@ +//! Run-time feature detection for Aarch64 on Windows. + +use crate::detect::{Feature, cache}; + +/// Try to read the features using IsProcessorFeaturePresent. +pub(crate) fn detect_features() -> cache::Initializer { + type DWORD = u32; + type BOOL = i32; + + const FALSE: BOOL = 0; + // The following Microsoft documents isn't updated for aarch64. 
+ // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent + // These are defined in winnt.h of Windows SDK + const PF_ARM_VFP_32_REGISTERS_AVAILABLE: u32 = 18; + const PF_ARM_NEON_INSTRUCTIONS_AVAILABLE: u32 = 19; + const PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE: u32 = 30; + const PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE: u32 = 31; + const PF_ARM_V81_ATOMIC_INSTRUCTIONS_AVAILABLE: u32 = 34; + const PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE: u32 = 43; + const PF_ARM_V83_JSCVT_INSTRUCTIONS_AVAILABLE: u32 = 44; + const PF_ARM_V83_LRCPC_INSTRUCTIONS_AVAILABLE: u32 = 45; + const PF_ARM_SVE_INSTRUCTIONS_AVAILABLE: u32 = 46; + const PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE: u32 = 47; + const PF_ARM_SVE2_1_INSTRUCTIONS_AVAILABLE: u32 = 48; + const PF_ARM_SVE_AES_INSTRUCTIONS_AVAILABLE: u32 = 49; + const PF_ARM_SVE_PMULL128_INSTRUCTIONS_AVAILABLE: u32 = 50; + const PF_ARM_SVE_BITPERM_INSTRUCTIONS_AVAILABLE: u32 = 51; + // const PF_ARM_SVE_BF16_INSTRUCTIONS_AVAILABLE: u32 = 52; + // const PF_ARM_SVE_EBF16_INSTRUCTIONS_AVAILABLE: u32 = 53; + const PF_ARM_SVE_B16B16_INSTRUCTIONS_AVAILABLE: u32 = 54; + const PF_ARM_SVE_SHA3_INSTRUCTIONS_AVAILABLE: u32 = 55; + const PF_ARM_SVE_SM4_INSTRUCTIONS_AVAILABLE: u32 = 56; + // const PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE: u32 = 57; + // const PF_ARM_SVE_F32MM_INSTRUCTIONS_AVAILABLE: u32 = 58; + // const PF_ARM_SVE_F64MM_INSTRUCTIONS_AVAILABLE: u32 = 59; + + unsafe extern "system" { + fn IsProcessorFeaturePresent(ProcessorFeature: DWORD) -> BOOL; + } + + let mut value = cache::Initializer::default(); + { + let mut enable_feature = |f, enable| { + if enable { + value.set(f as u32); + } + }; + + // Some features may be supported on current CPU, + // but no way to detect it by OS API. + // Also, we require unsafe block for the extern "system" calls. 
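+        // IsProcessorFeaturePresent reports FALSE for feature constants it does
+        // not recognize as well as for features the CPU lacks, so probing every
+        // constant unconditionally is harmless.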
+ unsafe { + enable_feature( + Feature::fp, + IsProcessorFeaturePresent(PF_ARM_VFP_32_REGISTERS_AVAILABLE) != FALSE, + ); + enable_feature( + Feature::asimd, + IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != FALSE, + ); + enable_feature( + Feature::crc, + IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) != FALSE, + ); + enable_feature( + Feature::lse, + IsProcessorFeaturePresent(PF_ARM_V81_ATOMIC_INSTRUCTIONS_AVAILABLE) != FALSE, + ); + enable_feature( + Feature::dotprod, + IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) != FALSE, + ); + enable_feature( + Feature::jsconv, + IsProcessorFeaturePresent(PF_ARM_V83_JSCVT_INSTRUCTIONS_AVAILABLE) != FALSE, + ); + enable_feature( + Feature::rcpc, + IsProcessorFeaturePresent(PF_ARM_V83_LRCPC_INSTRUCTIONS_AVAILABLE) != FALSE, + ); + enable_feature( + Feature::sve, + IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) != FALSE, + ); + enable_feature( + Feature::sve2, + IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) != FALSE, + ); + enable_feature( + Feature::sve2p1, + IsProcessorFeaturePresent(PF_ARM_SVE2_1_INSTRUCTIONS_AVAILABLE) != FALSE, + ); + enable_feature( + Feature::sve2_aes, + IsProcessorFeaturePresent(PF_ARM_SVE_AES_INSTRUCTIONS_AVAILABLE) != FALSE + && IsProcessorFeaturePresent(PF_ARM_SVE_PMULL128_INSTRUCTIONS_AVAILABLE) + != FALSE, + ); + enable_feature( + Feature::sve2_bitperm, + IsProcessorFeaturePresent(PF_ARM_SVE_BITPERM_INSTRUCTIONS_AVAILABLE) != FALSE, + ); + enable_feature( + Feature::sve_b16b16, + IsProcessorFeaturePresent(PF_ARM_SVE_B16B16_INSTRUCTIONS_AVAILABLE) != FALSE, + ); + enable_feature( + Feature::sve2_sha3, + IsProcessorFeaturePresent(PF_ARM_SVE_SHA3_INSTRUCTIONS_AVAILABLE) != FALSE, + ); + enable_feature( + Feature::sve2_sm4, + IsProcessorFeaturePresent(PF_ARM_SVE_SM4_INSTRUCTIONS_AVAILABLE) != FALSE, + ); + // PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE means aes, sha1, sha2 and + // pmull support + let crypto = + IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != FALSE; + enable_feature(Feature::aes, crypto); + enable_feature(Feature::pmull, crypto); + enable_feature(Feature::sha2, crypto); + } + } + value +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/x86.rs b/library/stdarch/crates/std_detect/src/detect/os/x86.rs new file mode 100644 index 0000000000000..8565c2f85e246 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/os/x86.rs @@ -0,0 +1,335 @@ +//! x86 run-time feature detection is OS independent. + +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use core::mem; + +use crate::detect::{Feature, bit, cache}; + +/// Run-time feature detection on x86 works by using the CPUID instruction. +/// +/// The [CPUID Wikipedia page][wiki_cpuid] contains +/// all the information about which flags to set to query which values, and in +/// which registers these are reported. +/// +/// The definitive references are: +/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +/// Instruction Set Reference, A-Z][intel64_ref]. +/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +/// System Instructions][amd64_ref]. 
+/// +/// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID +/// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +/// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +#[allow(clippy::similar_names)] +pub(crate) fn detect_features() -> cache::Initializer { + let mut value = cache::Initializer::default(); + + if cfg!(target_env = "sgx") { + // doesn't support this because it is untrusted data + return value; + } + + // Calling `__cpuid`/`__cpuid_count` from here on is safe because the CPU + // has `cpuid` support. + + // 0. EAX = 0: Basic Information: + // - EAX returns the "Highest Function Parameter", that is, the maximum + // leaf value for subsequent calls of `cpuinfo` in range [0, + // 0x8000_0000]. - The vendor ID is stored in 12 u8 ascii chars, + // returned in EBX, EDX, and ECX (in that order): + let (max_basic_leaf, vendor_id) = unsafe { + let CpuidResult { + eax: max_basic_leaf, + ebx, + ecx, + edx, + } = __cpuid(0); + let vendor_id: [[u8; 4]; 3] = [ebx.to_ne_bytes(), edx.to_ne_bytes(), ecx.to_ne_bytes()]; + let vendor_id: [u8; 12] = mem::transmute(vendor_id); + (max_basic_leaf, vendor_id) + }; + + if max_basic_leaf < 1 { + // Earlier Intel 486, CPUID not implemented + return value; + } + + // EAX = 1, ECX = 0: Queries "Processor Info and Feature Bits"; + // Contains information about most x86 features. + let CpuidResult { + ecx: proc_info_ecx, + edx: proc_info_edx, + .. + } = unsafe { __cpuid(0x0000_0001_u32) }; + + // EAX = 7: Queries "Extended Features"; + // Contains information about bmi,bmi2, and avx2 support. + let ( + extended_features_ebx, + extended_features_ecx, + extended_features_edx, + extended_features_eax_leaf_1, + extended_features_edx_leaf_1, + ) = if max_basic_leaf >= 7 { + let CpuidResult { ebx, ecx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) }; + let CpuidResult { + eax: eax_1, + edx: edx_1, + .. + } = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) }; + (ebx, ecx, edx, eax_1, edx_1) + } else { + (0, 0, 0, 0, 0) // CPUID does not support "Extended Features" + }; + + // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported + // - EAX returns the max leaf value for extended information, that is, + // `cpuid` calls in range [0x8000_0000; u32::MAX]: + let CpuidResult { + eax: extended_max_basic_leaf, + .. + } = unsafe { __cpuid(0x8000_0000_u32) }; + + // EAX = 0x8000_0001, ECX=0: Queries "Extended Processor Info and Feature + // Bits" + let extended_proc_info_ecx = if extended_max_basic_leaf >= 1 { + let CpuidResult { ecx, .. 
} = unsafe { __cpuid(0x8000_0001_u32) }; + ecx + } else { + 0 + }; + + { + // borrows value till the end of this scope: + let mut enable = |r, rb, f| { + let present = bit::test(r as usize, rb); + if present { + value.set(f as u32); + } + present + }; + + enable(proc_info_ecx, 0, Feature::sse3); + enable(proc_info_ecx, 1, Feature::pclmulqdq); + enable(proc_info_ecx, 9, Feature::ssse3); + enable(proc_info_ecx, 13, Feature::cmpxchg16b); + enable(proc_info_ecx, 19, Feature::sse4_1); + enable(proc_info_ecx, 20, Feature::sse4_2); + enable(proc_info_ecx, 22, Feature::movbe); + enable(proc_info_ecx, 23, Feature::popcnt); + enable(proc_info_ecx, 25, Feature::aes); + let f16c = enable(proc_info_ecx, 29, Feature::f16c); + enable(proc_info_ecx, 30, Feature::rdrand); + enable(extended_features_ebx, 18, Feature::rdseed); + enable(extended_features_ebx, 19, Feature::adx); + enable(extended_features_ebx, 11, Feature::rtm); + enable(proc_info_edx, 4, Feature::tsc); + enable(proc_info_edx, 23, Feature::mmx); + enable(proc_info_edx, 24, Feature::fxsr); + enable(proc_info_edx, 25, Feature::sse); + enable(proc_info_edx, 26, Feature::sse2); + enable(extended_features_ebx, 29, Feature::sha); + + enable(extended_features_ecx, 8, Feature::gfni); + enable(extended_features_ecx, 9, Feature::vaes); + enable(extended_features_ecx, 10, Feature::vpclmulqdq); + + enable(extended_features_ebx, 3, Feature::bmi1); + enable(extended_features_ebx, 8, Feature::bmi2); + + enable(extended_features_ebx, 9, Feature::ermsb); + + enable(extended_features_eax_leaf_1, 31, Feature::movrs); + + // Detect if CPUID.19h available + if bit::test(extended_features_ecx as usize, 23) { + let CpuidResult { ebx, .. } = unsafe { __cpuid(0x19) }; + enable(ebx, 0, Feature::kl); + enable(ebx, 2, Feature::widekl); + } + + // `XSAVE` and `AVX` support: + let cpu_xsave = bit::test(proc_info_ecx as usize, 26); + if cpu_xsave { + // 0. Here the CPU supports `XSAVE`. + + // 1. Detect `OSXSAVE`, that is, whether the OS is AVX enabled and + // supports saving the state of the AVX/AVX2 vector registers on + // context-switches, see: + // + // - [intel: is avx enabled?][is_avx_enabled], + // - [mozilla: sse.cpp][mozilla_sse_cpp]. + // + // [is_avx_enabled]: https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled + // [mozilla_sse_cpp]: https://hg.mozilla.org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190 + let cpu_osxsave = bit::test(proc_info_ecx as usize, 27); + + if cpu_osxsave { + // 2. The OS must have signaled the CPU that it supports saving and + // restoring the: + // + // * SSE -> `XCR0.SSE[1]` + // * AVX -> `XCR0.AVX[2]` + // * AVX-512 -> `XCR0.AVX-512[7:5]`. + // * AMX -> `XCR0.AMX[18:17]` + // + // by setting the corresponding bits of `XCR0` to `1`. + // + // This is safe because the CPU supports `xsave` + // and the OS has set `osxsave`. 
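+                // XCR0 layout (Intel SDM Vol. 1, ch. 13): bit 1 = SSE state,
+                // bit 2 = AVX state, bits 7:5 = AVX-512 state, bits 18:17 = AMX
+                // state; the masks below test exactly these bit groups.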
+ let xcr0 = unsafe { _xgetbv(0) }; + // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`: + let os_avx_support = xcr0 & 6 == 6; + // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`: + let os_avx512_support = xcr0 & 0xe0 == 0xe0; + // Test `XCR0.AMX[18:17]` with the mask `0b110_0000_0000_0000_0000 == 0x60000` + let os_amx_support = xcr0 & 0x60000 == 0x60000; + + // Only if the OS and the CPU support saving/restoring the AVX + // registers we enable `xsave` support: + if os_avx_support { + // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED + // FEATURES" in the "Intel® 64 and IA-32 Architectures Software + // Developer’s Manual, Volume 1: Basic Architecture": + // + // "Software enables the XSAVE feature set by setting + // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4 + // instruction). If this bit is 0, execution of any of XGETBV, + // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV + // causes an invalid-opcode exception (#UD)" + // + enable(proc_info_ecx, 26, Feature::xsave); + + // For `xsaveopt`, `xsavec`, and `xsaves` we need to query: + // Processor Extended State Enumeration Sub-leaf (EAX = 0DH, + // ECX = 1): + if max_basic_leaf >= 0xd { + let CpuidResult { + eax: proc_extended_state1_eax, + .. + } = unsafe { __cpuid_count(0xd_u32, 1) }; + enable(proc_extended_state1_eax, 0, Feature::xsaveopt); + enable(proc_extended_state1_eax, 1, Feature::xsavec); + enable(proc_extended_state1_eax, 3, Feature::xsaves); + } + + // FMA (uses 256-bit wide registers): + let fma = enable(proc_info_ecx, 12, Feature::fma); + + // And AVX/AVX2: + enable(proc_info_ecx, 28, Feature::avx); + enable(extended_features_ebx, 5, Feature::avx2); + + // "Short" versions of AVX512 instructions + enable(extended_features_eax_leaf_1, 4, Feature::avxvnni); + enable(extended_features_eax_leaf_1, 23, Feature::avxifma); + enable(extended_features_edx_leaf_1, 4, Feature::avxvnniint8); + enable(extended_features_edx_leaf_1, 5, Feature::avxneconvert); + enable(extended_features_edx_leaf_1, 10, Feature::avxvnniint16); + + enable(extended_features_eax_leaf_1, 0, Feature::sha512); + enable(extended_features_eax_leaf_1, 1, Feature::sm3); + enable(extended_features_eax_leaf_1, 2, Feature::sm4); + + // For AVX-512 the OS also needs to support saving/restoring + // the extended state, only then we enable AVX-512 support: + // Also, Rust makes `avx512f` imply `fma` and `f16c`, because + // otherwise the assembler is broken. But Intel doesn't guarantee + // that `fma` and `f16c` are available with `avx512f`, so we + // need to check for them separately. 
+ if os_avx512_support && f16c && fma { + enable(extended_features_ebx, 16, Feature::avx512f); + enable(extended_features_ebx, 17, Feature::avx512dq); + enable(extended_features_ebx, 21, Feature::avx512ifma); + enable(extended_features_ebx, 26, Feature::avx512pf); + enable(extended_features_ebx, 27, Feature::avx512er); + enable(extended_features_ebx, 28, Feature::avx512cd); + enable(extended_features_ebx, 30, Feature::avx512bw); + enable(extended_features_ebx, 31, Feature::avx512vl); + enable(extended_features_ecx, 1, Feature::avx512vbmi); + enable(extended_features_ecx, 6, Feature::avx512vbmi2); + enable(extended_features_ecx, 11, Feature::avx512vnni); + enable(extended_features_ecx, 12, Feature::avx512bitalg); + enable(extended_features_ecx, 14, Feature::avx512vpopcntdq); + enable(extended_features_edx, 8, Feature::avx512vp2intersect); + enable(extended_features_edx, 23, Feature::avx512fp16); + enable(extended_features_eax_leaf_1, 5, Feature::avx512bf16); + } + } + + if os_amx_support { + enable(extended_features_edx, 24, Feature::amx_tile); + enable(extended_features_edx, 25, Feature::amx_int8); + enable(extended_features_edx, 22, Feature::amx_bf16); + enable(extended_features_eax_leaf_1, 21, Feature::amx_fp16); + enable(extended_features_edx_leaf_1, 8, Feature::amx_complex); + + if max_basic_leaf >= 0x1e { + let CpuidResult { + eax: amx_feature_flags_eax, + .. + } = unsafe { __cpuid_count(0x1e_u32, 1) }; + + enable(amx_feature_flags_eax, 4, Feature::amx_fp8); + enable(amx_feature_flags_eax, 5, Feature::amx_transpose); + enable(amx_feature_flags_eax, 6, Feature::amx_tf32); + enable(amx_feature_flags_eax, 7, Feature::amx_avx512); + enable(amx_feature_flags_eax, 8, Feature::amx_movrs); + } + } + } + } + + // This detects ABM on AMD CPUs and LZCNT on Intel CPUs. + // On intel CPUs with popcnt, lzcnt implements the + // "missing part" of ABM, so we map both to the same + // internal feature. + // + // The `is_x86_feature_detected!("lzcnt")` macro then + // internally maps to Feature::abm. + enable(extended_proc_info_ecx, 5, Feature::lzcnt); + + // As Hygon Dhyana originates from AMD technology and shares most of the architecture with + // AMD's family 17h, but with different CPU Vendor ID("HygonGenuine")/Family series + // number(Family 18h). + // + // For CPUID feature bits, Hygon Dhyana(family 18h) share the same definition with AMD + // family 17h. + // + // Related AMD CPUID specification is https://www.amd.com/system/files/TechDocs/25481.pdf. + // Related Hygon kernel patch can be found on + // http://lkml.kernel.org/r/5ce86123a7b9dad925ac583d88d2f921040e859b.1538583282.git.puwen@hygon.cn + if vendor_id == *b"AuthenticAMD" || vendor_id == *b"HygonGenuine" { + // These features are available on AMD arch CPUs: + enable(extended_proc_info_ecx, 6, Feature::sse4a); + enable(extended_proc_info_ecx, 21, Feature::tbm); + enable(extended_proc_info_ecx, 11, Feature::xop); + } + } + + // Unfortunately, some Skylake chips erroneously report support for BMI1 and + // BMI2 without actual support. These chips don't support AVX, and it seems + // that all Intel chips with non-erroneous support BMI do (I didn't check + // other vendors), so we can disable these flags for chips that don't also + // report support for AVX. 
+ // + // It's possible this will pessimize future chips that do support BMI and + // not AVX, but this seems minor compared to a hard crash you get when + // executing an unsupported instruction (to put it another way, it's safe + // for us to under-report CPU features, but not to over-report them). Still, + // to limit any impact this may have in the future, we only do this for + // Intel chips, as it's a bug only present in their chips. + // + // This bug is documented as `SKL052` in the errata section of this document: + // http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/desktop-6th-gen-core-family-spec-update.pdf + if vendor_id == *b"GenuineIntel" && !value.test(Feature::avx as u32) { + value.unset(Feature::bmi1 as u32); + value.unset(Feature::bmi2 as u32); + } + + value +} diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-artificial-aarch64.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-artificial-aarch64.auxv new file mode 100644 index 0000000000000..ec826afcf3817 Binary files /dev/null and b/library/stdarch/crates/std_detect/src/detect/test_data/linux-artificial-aarch64.auxv differ diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-empty-hwcap2-aarch64.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-empty-hwcap2-aarch64.auxv new file mode 100644 index 0000000000000..95537b73f2069 Binary files /dev/null and b/library/stdarch/crates/std_detect/src/detect/test_data/linux-empty-hwcap2-aarch64.auxv differ diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-hwcap2-aarch64.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-hwcap2-aarch64.auxv new file mode 100644 index 0000000000000..1d87264b22190 Binary files /dev/null and b/library/stdarch/crates/std_detect/src/detect/test_data/linux-hwcap2-aarch64.auxv differ diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-no-hwcap2-aarch64.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-no-hwcap2-aarch64.auxv new file mode 100644 index 0000000000000..35f01cc767c50 Binary files /dev/null and b/library/stdarch/crates/std_detect/src/detect/test_data/linux-no-hwcap2-aarch64.auxv differ diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-rpi3.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-rpi3.auxv new file mode 100644 index 0000000000000..0538e661f63ad Binary files /dev/null and b/library/stdarch/crates/std_detect/src/detect/test_data/linux-rpi3.auxv differ diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxv new file mode 100644 index 0000000000000..75abc02d17813 Binary files /dev/null and b/library/stdarch/crates/std_detect/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxv differ diff --git a/library/stdarch/crates/std_detect/src/lib.rs b/library/stdarch/crates/std_detect/src/lib.rs new file mode 100644 index 0000000000000..ab1b77bad5bef --- /dev/null +++ b/library/stdarch/crates/std_detect/src/lib.rs @@ -0,0 +1,36 @@ +//! Run-time feature detection for the Rust standard library. +//! +//! To detect whether a feature is enabled in the system running the binary +//! use one of the appropriate macro for the target: +//! +//! * `x86` and `x86_64`: [`is_x86_feature_detected`] +//! * `arm`: [`is_arm_feature_detected`] +//! 
* `aarch64`: [`is_aarch64_feature_detected`] +//! * `riscv`: [`is_riscv_feature_detected`] +//! * `mips`: [`is_mips_feature_detected`] +//! * `mips64`: [`is_mips64_feature_detected`] +//! * `powerpc`: [`is_powerpc_feature_detected`] +//! * `powerpc64`: [`is_powerpc64_feature_detected`] +//! * `loongarch`: [`is_loongarch_feature_detected`] +//! * `s390x`: [`is_s390x_feature_detected`] + +#![unstable(feature = "stdarch_internal", issue = "none")] +#![feature(staged_api, doc_cfg, allow_internal_unstable)] +#![deny(rust_2018_idioms)] +#![allow(clippy::shadow_reuse)] +#![cfg_attr(test, allow(unused_imports))] +#![no_std] +#![allow(internal_features)] + +#[cfg(test)] +#[macro_use] +extern crate std; + +// rust-lang/rust#83888: removing `extern crate` gives an error that `vec_spare> +#[cfg_attr(feature = "std_detect_file_io", allow(unused_extern_crates))] +#[cfg(feature = "std_detect_file_io")] +extern crate alloc; + +#[doc(hidden)] +#[unstable(feature = "stdarch_internal", issue = "none")] +pub mod detect; diff --git a/library/stdarch/crates/std_detect/tests/cpu-detection.rs b/library/stdarch/crates/std_detect/tests/cpu-detection.rs new file mode 100644 index 0000000000000..7976aedc75850 --- /dev/null +++ b/library/stdarch/crates/std_detect/tests/cpu-detection.rs @@ -0,0 +1,354 @@ +#![allow(internal_features)] +#![feature(stdarch_internal)] +#![cfg_attr(target_arch = "arm", feature(stdarch_arm_feature_detection))] +#![cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + feature(stdarch_aarch64_feature_detection) +)] +#![cfg_attr( + any(target_arch = "riscv32", target_arch = "riscv64"), + feature(stdarch_riscv_feature_detection) +)] +#![cfg_attr(target_arch = "powerpc", feature(stdarch_powerpc_feature_detection))] +#![cfg_attr(target_arch = "powerpc64", feature(stdarch_powerpc_feature_detection))] +#![cfg_attr(target_arch = "s390x", feature(stdarch_s390x_feature_detection))] +#![allow(clippy::unwrap_used, clippy::use_debug, clippy::print_stdout)] + +#[cfg_attr( + any( + target_arch = "arm", + target_arch = "aarch64", + target_arch = "arm64ec", + target_arch = "riscv32", + target_arch = "riscv64", + target_arch = "powerpc", + target_arch = "powerpc64", + target_arch = "s390x", + ), + macro_use +)] +extern crate std_detect; + +#[test] +fn all() { + for (f, e) in std_detect::detect::features() { + println!("{f}: {e}"); + } +} + +#[test] +#[cfg(all(target_arch = "arm", target_os = "freebsd"))] +fn arm_freebsd() { + println!("neon: {}", is_arm_feature_detected!("neon")); + println!("pmull: {}", is_arm_feature_detected!("pmull")); + println!("crc: {}", is_arm_feature_detected!("crc")); + println!("aes: {}", is_arm_feature_detected!("aes")); + println!("sha2: {}", is_arm_feature_detected!("sha2")); +} + +#[test] +#[cfg(all(target_arch = "arm", any(target_os = "linux", target_os = "android")))] +fn arm_linux() { + println!("neon: {}", is_arm_feature_detected!("neon")); + println!("pmull: {}", is_arm_feature_detected!("pmull")); + println!("crc: {}", is_arm_feature_detected!("crc")); + println!("aes: {}", is_arm_feature_detected!("aes")); + println!("sha2: {}", is_arm_feature_detected!("sha2")); + println!("dotprod: {}", is_arm_feature_detected!("dotprod")); + println!("i8mm: {}", is_arm_feature_detected!("i8mm")); +} + +#[test] +#[cfg(all( + target_arch = "aarch64", + any(target_os = "linux", target_os = "android") +))] +fn aarch64_linux() { + println!("asimd: {}", is_aarch64_feature_detected!("asimd")); + println!("neon: {}", is_aarch64_feature_detected!("neon")); + println!("pmull: {}", 
is_aarch64_feature_detected!("pmull")); + println!("fp: {}", is_aarch64_feature_detected!("fp")); + println!("fp16: {}", is_aarch64_feature_detected!("fp16")); + println!("sve: {}", is_aarch64_feature_detected!("sve")); + println!("crc: {}", is_aarch64_feature_detected!("crc")); + println!("lse: {}", is_aarch64_feature_detected!("lse")); + println!("lse2: {}", is_aarch64_feature_detected!("lse2")); + println!("lse128: {}", is_aarch64_feature_detected!("lse128")); + println!("rdm: {}", is_aarch64_feature_detected!("rdm")); + println!("rcpc: {}", is_aarch64_feature_detected!("rcpc")); + println!("rcpc2: {}", is_aarch64_feature_detected!("rcpc2")); + println!("rcpc3: {}", is_aarch64_feature_detected!("rcpc3")); + println!("dotprod: {}", is_aarch64_feature_detected!("dotprod")); + println!("tme: {}", is_aarch64_feature_detected!("tme")); + println!("fhm: {}", is_aarch64_feature_detected!("fhm")); + println!("dit: {}", is_aarch64_feature_detected!("dit")); + println!("flagm: {}", is_aarch64_feature_detected!("flagm")); + println!("flagm2: {}", is_aarch64_feature_detected!("flagm2")); + println!("ssbs: {}", is_aarch64_feature_detected!("ssbs")); + println!("sb: {}", is_aarch64_feature_detected!("sb")); + println!("paca: {}", is_aarch64_feature_detected!("paca")); + println!("pacg: {}", is_aarch64_feature_detected!("pacg")); + // println!("pauth-lr: {}", is_aarch64_feature_detected!("pauth-lr")); + println!("dpb: {}", is_aarch64_feature_detected!("dpb")); + println!("dpb2: {}", is_aarch64_feature_detected!("dpb2")); + println!("sve-b16b16: {}", is_aarch64_feature_detected!("sve-b16b16")); + println!("sve2: {}", is_aarch64_feature_detected!("sve2")); + println!("sve2p1: {}", is_aarch64_feature_detected!("sve2p1")); + println!("sve2-aes: {}", is_aarch64_feature_detected!("sve2-aes")); + println!("sve2-sm4: {}", is_aarch64_feature_detected!("sve2-sm4")); + println!("sve2-sha3: {}", is_aarch64_feature_detected!("sve2-sha3")); + println!( + "sve2-bitperm: {}", + is_aarch64_feature_detected!("sve2-bitperm") + ); + println!("frintts: {}", is_aarch64_feature_detected!("frintts")); + println!("i8mm: {}", is_aarch64_feature_detected!("i8mm")); + println!("f32mm: {}", is_aarch64_feature_detected!("f32mm")); + println!("f64mm: {}", is_aarch64_feature_detected!("f64mm")); + println!("bf16: {}", is_aarch64_feature_detected!("bf16")); + println!("rand: {}", is_aarch64_feature_detected!("rand")); + println!("bti: {}", is_aarch64_feature_detected!("bti")); + println!("mte: {}", is_aarch64_feature_detected!("mte")); + println!("jsconv: {}", is_aarch64_feature_detected!("jsconv")); + println!("fcma: {}", is_aarch64_feature_detected!("fcma")); + println!("aes: {}", is_aarch64_feature_detected!("aes")); + println!("sha2: {}", is_aarch64_feature_detected!("sha2")); + println!("sha3: {}", is_aarch64_feature_detected!("sha3")); + println!("sm4: {}", is_aarch64_feature_detected!("sm4")); + println!("hbc: {}", is_aarch64_feature_detected!("hbc")); + println!("mops: {}", is_aarch64_feature_detected!("mops")); + println!("ecv: {}", is_aarch64_feature_detected!("ecv")); + println!("cssc: {}", is_aarch64_feature_detected!("cssc")); + println!("fpmr: {}", is_aarch64_feature_detected!("fpmr")); + println!("lut: {}", is_aarch64_feature_detected!("lut")); + println!("faminmax: {}", is_aarch64_feature_detected!("faminmax")); + println!("fp8: {}", is_aarch64_feature_detected!("fp8")); + println!("fp8fma: {}", is_aarch64_feature_detected!("fp8fma")); + println!("fp8dot4: {}", is_aarch64_feature_detected!("fp8dot4")); + 
println!("fp8dot2: {}", is_aarch64_feature_detected!("fp8dot2")); + println!("wfxt: {}", is_aarch64_feature_detected!("wfxt")); + println!("sme: {}", is_aarch64_feature_detected!("sme")); + println!("sme-b16b16: {}", is_aarch64_feature_detected!("sme-b16b16")); + println!("sme-i16i64: {}", is_aarch64_feature_detected!("sme-i16i64")); + println!("sme-f64f64: {}", is_aarch64_feature_detected!("sme-f64f64")); + println!("sme-fa64: {}", is_aarch64_feature_detected!("sme-fa64")); + println!("sme2: {}", is_aarch64_feature_detected!("sme2")); + println!("sme2p1: {}", is_aarch64_feature_detected!("sme2p1")); + println!("sme-f16f16: {}", is_aarch64_feature_detected!("sme-f16f16")); + println!("sme-lutv2: {}", is_aarch64_feature_detected!("sme-lutv2")); + println!("sme-f8f16: {}", is_aarch64_feature_detected!("sme-f8f16")); + println!("sme-f8f32: {}", is_aarch64_feature_detected!("sme-f8f32")); + println!( + "ssve-fp8fma: {}", + is_aarch64_feature_detected!("ssve-fp8fma") + ); + println!( + "ssve-fp8dot4: {}", + is_aarch64_feature_detected!("ssve-fp8dot4") + ); + println!( + "ssve-fp8dot2: {}", + is_aarch64_feature_detected!("ssve-fp8dot2") + ); +} + +#[test] +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm64ec"), + target_os = "windows" +))] +fn aarch64_windows() { + println!("asimd: {:?}", is_aarch64_feature_detected!("asimd")); + println!("fp: {:?}", is_aarch64_feature_detected!("fp")); + println!("crc: {:?}", is_aarch64_feature_detected!("crc")); + println!("lse: {:?}", is_aarch64_feature_detected!("lse")); + println!("dotprod: {:?}", is_aarch64_feature_detected!("dotprod")); + println!("jsconv: {:?}", is_aarch64_feature_detected!("jsconv")); + println!("rcpc: {:?}", is_aarch64_feature_detected!("rcpc")); + println!("aes: {:?}", is_aarch64_feature_detected!("aes")); + println!("pmull: {:?}", is_aarch64_feature_detected!("pmull")); + println!("sha2: {:?}", is_aarch64_feature_detected!("sha2")); +} + +#[test] +#[cfg(all( + target_arch = "aarch64", + any(target_os = "freebsd", target_os = "openbsd") +))] +fn aarch64_bsd() { + println!("asimd: {:?}", is_aarch64_feature_detected!("asimd")); + println!("pmull: {:?}", is_aarch64_feature_detected!("pmull")); + println!("fp: {:?}", is_aarch64_feature_detected!("fp")); + println!("fp16: {:?}", is_aarch64_feature_detected!("fp16")); + println!("sve: {:?}", is_aarch64_feature_detected!("sve")); + println!("crc: {:?}", is_aarch64_feature_detected!("crc")); + println!("lse: {:?}", is_aarch64_feature_detected!("lse")); + println!("lse2: {:?}", is_aarch64_feature_detected!("lse2")); + println!("rdm: {:?}", is_aarch64_feature_detected!("rdm")); + println!("rcpc: {:?}", is_aarch64_feature_detected!("rcpc")); + println!("dotprod: {:?}", is_aarch64_feature_detected!("dotprod")); + println!("tme: {:?}", is_aarch64_feature_detected!("tme")); + println!("paca: {:?}", is_aarch64_feature_detected!("paca")); + println!("pacg: {:?}", is_aarch64_feature_detected!("pacg")); + println!("aes: {:?}", is_aarch64_feature_detected!("aes")); + println!("sha2: {:?}", is_aarch64_feature_detected!("sha2")); +} + +#[test] +#[cfg(all(target_arch = "aarch64", target_vendor = "apple"))] +fn aarch64_darwin() { + println!("asimd: {:?}", is_aarch64_feature_detected!("asimd")); + println!("fp: {:?}", is_aarch64_feature_detected!("fp")); + println!("fp16: {:?}", is_aarch64_feature_detected!("fp16")); + println!("pmull: {:?}", is_aarch64_feature_detected!("pmull")); + println!("crc: {:?}", is_aarch64_feature_detected!("crc")); + println!("lse: {:?}", 
is_aarch64_feature_detected!("lse")); + println!("lse2: {:?}", is_aarch64_feature_detected!("lse2")); + println!("rdm: {:?}", is_aarch64_feature_detected!("rdm")); + println!("rcpc: {:?}", is_aarch64_feature_detected!("rcpc")); + println!("rcpc2: {:?}", is_aarch64_feature_detected!("rcpc2")); + println!("dotprod: {:?}", is_aarch64_feature_detected!("dotprod")); + println!("fhm: {:?}", is_aarch64_feature_detected!("fhm")); + println!("flagm: {:?}", is_aarch64_feature_detected!("flagm")); + println!("ssbs: {:?}", is_aarch64_feature_detected!("ssbs")); + println!("sb: {:?}", is_aarch64_feature_detected!("sb")); + println!("paca: {:?}", is_aarch64_feature_detected!("paca")); + println!("dpb: {:?}", is_aarch64_feature_detected!("dpb")); + println!("dpb2: {:?}", is_aarch64_feature_detected!("dpb2")); + println!("frintts: {:?}", is_aarch64_feature_detected!("frintts")); + println!("i8mm: {:?}", is_aarch64_feature_detected!("i8mm")); + println!("bf16: {:?}", is_aarch64_feature_detected!("bf16")); + println!("bti: {:?}", is_aarch64_feature_detected!("bti")); + println!("fcma: {:?}", is_aarch64_feature_detected!("fcma")); + println!("jsconv: {:?}", is_aarch64_feature_detected!("jsconv")); + println!("aes: {:?}", is_aarch64_feature_detected!("aes")); + println!("sha2: {:?}", is_aarch64_feature_detected!("sha2")); + println!("sha3: {:?}", is_aarch64_feature_detected!("sha3")); +} + +#[test] +#[cfg(all( + any(target_arch = "riscv32", target_arch = "riscv64"), + any(target_os = "linux", target_os = "android") +))] +fn riscv_linux() { + println!("rv32i: {}", is_riscv_feature_detected!("rv32i")); + println!("rv32e: {}", is_riscv_feature_detected!("rv32e")); + println!("rv64i: {}", is_riscv_feature_detected!("rv64i")); + println!("rv128i: {}", is_riscv_feature_detected!("rv128i")); + println!( + "unaligned-scalar-mem: {}", + is_riscv_feature_detected!("unaligned-scalar-mem") + ); + println!( + "unaligned-vector-mem: {}", + is_riscv_feature_detected!("unaligned-vector-mem") + ); + println!("zicsr: {}", is_riscv_feature_detected!("zicsr")); + println!("zicntr: {}", is_riscv_feature_detected!("zicntr")); + println!("zihpm: {}", is_riscv_feature_detected!("zihpm")); + println!("zifencei: {}", is_riscv_feature_detected!("zifencei")); + println!("zihintntl: {}", is_riscv_feature_detected!("zihintntl")); + println!("zihintpause: {}", is_riscv_feature_detected!("zihintpause")); + println!("zimop: {}", is_riscv_feature_detected!("zimop")); + println!("zicbom: {}", is_riscv_feature_detected!("zicbom")); + println!("zicboz: {}", is_riscv_feature_detected!("zicboz")); + println!("zicond: {}", is_riscv_feature_detected!("zicond")); + println!("m: {}", is_riscv_feature_detected!("m")); + println!("a: {}", is_riscv_feature_detected!("a")); + println!("zalrsc: {}", is_riscv_feature_detected!("zalrsc")); + println!("zaamo: {}", is_riscv_feature_detected!("zaamo")); + println!("zawrs: {}", is_riscv_feature_detected!("zawrs")); + println!("zacas: {}", is_riscv_feature_detected!("zacas")); + println!("zam: {}", is_riscv_feature_detected!("zam")); + println!("ztso: {}", is_riscv_feature_detected!("ztso")); + println!("f: {}", is_riscv_feature_detected!("f")); + println!("d: {}", is_riscv_feature_detected!("d")); + println!("q: {}", is_riscv_feature_detected!("q")); + println!("zfh: {}", is_riscv_feature_detected!("zfh")); + println!("zfhmin: {}", is_riscv_feature_detected!("zfhmin")); + println!("zfa: {}", is_riscv_feature_detected!("zfa")); + println!("zfbfmin: {}", is_riscv_feature_detected!("zfbfmin")); + println!("zfinx: 
{}", is_riscv_feature_detected!("zfinx")); + println!("zdinx: {}", is_riscv_feature_detected!("zdinx")); + println!("zhinx: {}", is_riscv_feature_detected!("zhinx")); + println!("zhinxmin: {}", is_riscv_feature_detected!("zhinxmin")); + println!("c: {}", is_riscv_feature_detected!("c")); + println!("zca: {}", is_riscv_feature_detected!("zca")); + println!("zcf: {}", is_riscv_feature_detected!("zcf")); + println!("zcd: {}", is_riscv_feature_detected!("zcd")); + println!("zcb: {}", is_riscv_feature_detected!("zcb")); + println!("zcmop: {}", is_riscv_feature_detected!("zcmop")); + println!("b: {}", is_riscv_feature_detected!("b")); + println!("zba: {}", is_riscv_feature_detected!("zba")); + println!("zbb: {}", is_riscv_feature_detected!("zbb")); + println!("zbc: {}", is_riscv_feature_detected!("zbc")); + println!("zbs: {}", is_riscv_feature_detected!("zbs")); + println!("zbkb: {}", is_riscv_feature_detected!("zbkb")); + println!("zbkc: {}", is_riscv_feature_detected!("zbkc")); + println!("zbkx: {}", is_riscv_feature_detected!("zbkx")); + println!("zknd: {}", is_riscv_feature_detected!("zknd")); + println!("zkne: {}", is_riscv_feature_detected!("zkne")); + println!("zknh: {}", is_riscv_feature_detected!("zknh")); + println!("zksed: {}", is_riscv_feature_detected!("zksed")); + println!("zksh: {}", is_riscv_feature_detected!("zksh")); + println!("zkr: {}", is_riscv_feature_detected!("zkr")); + println!("zkn: {}", is_riscv_feature_detected!("zkn")); + println!("zks: {}", is_riscv_feature_detected!("zks")); + println!("zk: {}", is_riscv_feature_detected!("zk")); + println!("zkt: {}", is_riscv_feature_detected!("zkt")); + println!("v: {}", is_riscv_feature_detected!("v")); + println!("zve32x: {}", is_riscv_feature_detected!("zve32x")); + println!("zve32f: {}", is_riscv_feature_detected!("zve32f")); + println!("zve64x: {}", is_riscv_feature_detected!("zve64x")); + println!("zve64f: {}", is_riscv_feature_detected!("zve64f")); + println!("zve64d: {}", is_riscv_feature_detected!("zve64d")); + println!("zvfh: {}", is_riscv_feature_detected!("zvfh")); + println!("zvfhmin: {}", is_riscv_feature_detected!("zvfhmin")); + println!("zvfbfmin: {}", is_riscv_feature_detected!("zvfbfmin")); + println!("zvfbfwma: {}", is_riscv_feature_detected!("zvfbfwma")); + println!("zvbb: {}", is_riscv_feature_detected!("zvbb")); + println!("zvbc: {}", is_riscv_feature_detected!("zvbc")); + println!("zvkb: {}", is_riscv_feature_detected!("zvkb")); + println!("zvkg: {}", is_riscv_feature_detected!("zvkg")); + println!("zvkned: {}", is_riscv_feature_detected!("zvkned")); + println!("zvknha: {}", is_riscv_feature_detected!("zvknha")); + println!("zvknhb: {}", is_riscv_feature_detected!("zvknhb")); + println!("zvksed: {}", is_riscv_feature_detected!("zvksed")); + println!("zvksh: {}", is_riscv_feature_detected!("zvksh")); + println!("zvkn: {}", is_riscv_feature_detected!("zvkn")); + println!("zvknc: {}", is_riscv_feature_detected!("zvknc")); + println!("zvkng: {}", is_riscv_feature_detected!("zvkng")); + println!("zvks: {}", is_riscv_feature_detected!("zvks")); + println!("zvksc: {}", is_riscv_feature_detected!("zvksc")); + println!("zvksg: {}", is_riscv_feature_detected!("zvksg")); + println!("zvkt: {}", is_riscv_feature_detected!("zvkt")); + println!("j: {}", is_riscv_feature_detected!("j")); + println!("p: {}", is_riscv_feature_detected!("p")); +} + +#[test] +#[cfg(all(target_arch = "powerpc", target_os = "linux"))] +fn powerpc_linux() { + println!("altivec: {}", is_powerpc_feature_detected!("altivec")); + println!("vsx: {}", 
is_powerpc_feature_detected!("vsx")); + println!("power8: {}", is_powerpc_feature_detected!("power8")); +} + +#[test] +#[cfg(all( + target_arch = "powerpc64", + any(target_os = "linux", target_os = "freebsd"), +))] +fn powerpc64_linux_or_freebsd() { + println!("altivec: {}", is_powerpc64_feature_detected!("altivec")); + println!("vsx: {}", is_powerpc64_feature_detected!("vsx")); + println!("power8: {}", is_powerpc64_feature_detected!("power8")); + println!("power9: {}", is_powerpc64_feature_detected!("power9")); +} + +#[test] +#[cfg(all(target_arch = "s390x", target_os = "linux",))] +fn s390x_linux() { + println!("vector: {}", is_s390x_feature_detected!("vector")); +} diff --git a/library/stdarch/crates/std_detect/tests/macro_trailing_commas.rs b/library/stdarch/crates/std_detect/tests/macro_trailing_commas.rs new file mode 100644 index 0000000000000..fa3a23c796817 --- /dev/null +++ b/library/stdarch/crates/std_detect/tests/macro_trailing_commas.rs @@ -0,0 +1,107 @@ +#![allow(internal_features)] +#![cfg_attr( + any( + target_arch = "arm", + target_arch = "aarch64", + target_arch = "arm64ec", + target_arch = "x86", + target_arch = "x86_64", + target_arch = "powerpc", + target_arch = "powerpc64", + target_arch = "s390x", + target_arch = "riscv32", + target_arch = "riscv64", + target_arch = "loongarch64" + ), + feature(stdarch_internal) +)] +#![cfg_attr(target_arch = "arm", feature(stdarch_arm_feature_detection))] +#![cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + feature(stdarch_aarch64_feature_detection) +)] +#![cfg_attr( + any(target_arch = "powerpc", target_arch = "powerpc64"), + feature(stdarch_powerpc_feature_detection) +)] +#![cfg_attr(target_arch = "s390x", feature(stdarch_s390x_feature_detection))] +#![cfg_attr( + any(target_arch = "riscv32", target_arch = "riscv64"), + feature(stdarch_riscv_feature_detection) +)] +#![cfg_attr( + target_arch = "loongarch64", + feature(stdarch_loongarch_feature_detection) +)] + +#[cfg(any( + target_arch = "arm", + target_arch = "aarch64", + target_arch = "arm64ec", + target_arch = "x86", + target_arch = "x86_64", + target_arch = "powerpc", + target_arch = "powerpc64", + target_arch = "s390x", + target_arch = "riscv32", + target_arch = "riscv64", + target_arch = "loongarch64" +))] +#[macro_use] +extern crate std_detect; + +#[test] +#[cfg(target_arch = "arm")] +fn arm() { + let _ = is_arm_feature_detected!("neon"); + let _ = is_arm_feature_detected!("neon",); +} + +#[test] +#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] +fn aarch64() { + let _ = is_aarch64_feature_detected!("fp"); + let _ = is_aarch64_feature_detected!("fp",); +} + +#[test] +#[cfg(target_arch = "loongarch64")] +fn loongarch64() { + let _ = is_loongarch_feature_detected!("lsx"); + let _ = is_loongarch_feature_detected!("lsx",); +} + +#[test] +#[cfg(target_arch = "powerpc")] +fn powerpc() { + let _ = is_powerpc_feature_detected!("altivec"); + let _ = is_powerpc_feature_detected!("altivec",); +} + +#[test] +#[cfg(target_arch = "powerpc64")] +fn powerpc64() { + let _ = is_powerpc64_feature_detected!("altivec"); + let _ = is_powerpc64_feature_detected!("altivec",); +} + +#[test] +#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] +fn riscv() { + let _ = is_riscv_feature_detected!("zk"); + let _ = is_riscv_feature_detected!("zk",); +} + +#[test] +#[cfg(target_arch = "s390x")] +fn s390x() { + let _ = is_s390x_feature_detected!("vector"); + let _ = is_s390x_feature_detected!("vector",); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = 
"x86_64"))] +fn x86() { + let _ = is_x86_feature_detected!("sse"); + let _ = is_x86_feature_detected!("sse",); +} diff --git a/library/stdarch/crates/std_detect/tests/x86-specific.rs b/library/stdarch/crates/std_detect/tests/x86-specific.rs new file mode 100644 index 0000000000000..04080f639c90d --- /dev/null +++ b/library/stdarch/crates/std_detect/tests/x86-specific.rs @@ -0,0 +1,119 @@ +#![cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#![allow(internal_features)] +#![feature( + stdarch_internal, + sha512_sm_x86, + x86_amx_intrinsics, + xop_target_feature, + keylocker_x86, + movrs_target_feature +)] + +#[macro_use] +extern crate std_detect; + +#[test] +fn dump() { + println!("aes: {:?}", is_x86_feature_detected!("aes")); + println!("pclmulqdq: {:?}", is_x86_feature_detected!("pclmulqdq")); + println!("rdrand: {:?}", is_x86_feature_detected!("rdrand")); + println!("rdseed: {:?}", is_x86_feature_detected!("rdseed")); + println!("tsc: {:?}", is_x86_feature_detected!("tsc")); + println!("sse: {:?}", is_x86_feature_detected!("sse")); + println!("sse2: {:?}", is_x86_feature_detected!("sse2")); + println!("sse3: {:?}", is_x86_feature_detected!("sse3")); + println!("ssse3: {:?}", is_x86_feature_detected!("ssse3")); + println!("sse4.1: {:?}", is_x86_feature_detected!("sse4.1")); + println!("sse4.2: {:?}", is_x86_feature_detected!("sse4.2")); + println!("sse4a: {:?}", is_x86_feature_detected!("sse4a")); + println!("sha: {:?}", is_x86_feature_detected!("sha")); + println!("f16c: {:?}", is_x86_feature_detected!("f16c")); + println!("avx: {:?}", is_x86_feature_detected!("avx")); + println!("avx2: {:?}", is_x86_feature_detected!("avx2")); + println!("sha512: {:?}", is_x86_feature_detected!("sha512")); + println!("sm3: {:?}", is_x86_feature_detected!("sm3")); + println!("sm4: {:?}", is_x86_feature_detected!("sm4")); + println!("avx512f: {:?}", is_x86_feature_detected!("avx512f")); + println!("avx512cd: {:?}", is_x86_feature_detected!("avx512cd")); + println!("avx512er: {:?}", is_x86_feature_detected!("avx512er")); + println!("avx512pf: {:?}", is_x86_feature_detected!("avx512pf")); + println!("avx512bw: {:?}", is_x86_feature_detected!("avx512bw")); + println!("avx512dq: {:?}", is_x86_feature_detected!("avx512dq")); + println!("avx512vl: {:?}", is_x86_feature_detected!("avx512vl")); + println!("avx512_ifma: {:?}", is_x86_feature_detected!("avx512ifma")); + println!("avx512vbmi {:?}", is_x86_feature_detected!("avx512vbmi")); + println!( + "avx512_vpopcntdq: {:?}", + is_x86_feature_detected!("avx512vpopcntdq") + ); + println!("avx512vbmi2: {:?}", is_x86_feature_detected!("avx512vbmi2")); + println!("gfni: {:?}", is_x86_feature_detected!("gfni")); + println!("vaes: {:?}", is_x86_feature_detected!("vaes")); + println!("vpclmulqdq: {:?}", is_x86_feature_detected!("vpclmulqdq")); + println!("avx512vnni: {:?}", is_x86_feature_detected!("avx512vnni")); + println!( + "avx512bitalg: {:?}", + is_x86_feature_detected!("avx512bitalg") + ); + println!("avx512bf16: {:?}", is_x86_feature_detected!("avx512bf16")); + println!( + "avx512vp2intersect: {:?}", + is_x86_feature_detected!("avx512vp2intersect") + ); + println!("avx512fp16: {:?}", is_x86_feature_detected!("avx512fp16")); + println!("fma: {:?}", is_x86_feature_detected!("fma")); + println!("abm: {:?}", is_x86_feature_detected!("abm")); + println!("bmi: {:?}", is_x86_feature_detected!("bmi1")); + println!("bmi2: {:?}", is_x86_feature_detected!("bmi2")); + println!("tbm: {:?}", is_x86_feature_detected!("tbm")); + println!("popcnt: {:?}", 
is_x86_feature_detected!("popcnt")); + println!("lzcnt: {:?}", is_x86_feature_detected!("lzcnt")); + println!("fxsr: {:?}", is_x86_feature_detected!("fxsr")); + println!("xsave: {:?}", is_x86_feature_detected!("xsave")); + println!("xsaveopt: {:?}", is_x86_feature_detected!("xsaveopt")); + println!("xsaves: {:?}", is_x86_feature_detected!("xsaves")); + println!("xsavec: {:?}", is_x86_feature_detected!("xsavec")); + println!("cmpxchg16b: {:?}", is_x86_feature_detected!("cmpxchg16b")); + println!("adx: {:?}", is_x86_feature_detected!("adx")); + println!("rtm: {:?}", is_x86_feature_detected!("rtm")); + println!("movbe: {:?}", is_x86_feature_detected!("movbe")); + println!("avxvnni: {:?}", is_x86_feature_detected!("avxvnni")); + println!("avxvnniint8: {:?}", is_x86_feature_detected!("avxvnniint8")); + println!( + "avxneconvert: {:?}", + is_x86_feature_detected!("avxneconvert") + ); + println!("avxifma: {:?}", is_x86_feature_detected!("avxifma")); + println!( + "avxvnniint16: {:?}", + is_x86_feature_detected!("avxvnniint16") + ); + println!("amx-bf16: {:?}", is_x86_feature_detected!("amx-bf16")); + println!("amx-tile: {:?}", is_x86_feature_detected!("amx-tile")); + println!("amx-int8: {:?}", is_x86_feature_detected!("amx-int8")); + println!("amx-fp16: {:?}", is_x86_feature_detected!("amx-fp16")); + println!("amx-complex: {:?}", is_x86_feature_detected!("amx-complex")); + println!("xop: {:?}", is_x86_feature_detected!("xop")); + println!("kl: {:?}", is_x86_feature_detected!("kl")); + println!("widekl: {:?}", is_x86_feature_detected!("widekl")); + println!("movrs: {:?}", is_x86_feature_detected!("movrs")); + println!("amx-fp8: {:?}", is_x86_feature_detected!("amx-fp8")); + println!( + "amx-transpose: {:?}", + is_x86_feature_detected!("amx-transpose") + ); + println!("amx-tf32: {:?}", is_x86_feature_detected!("amx-tf32")); + println!("amx-avx512: {:?}", is_x86_feature_detected!("amx-avx512")); + println!("amx-movrs: {:?}", is_x86_feature_detected!("amx-movrs")); +} + +#[test] +#[allow(deprecated)] +fn x86_deprecated() { + println!("avx512gfni {:?}", is_x86_feature_detected!("avx512gfni")); + println!("avx512vaes {:?}", is_x86_feature_detected!("avx512vaes")); + println!( + "avx512vpclmulqdq {:?}", + is_x86_feature_detected!("avx512vpclmulqdq") + ); +} diff --git a/library/stdarch/crates/stdarch-gen-arm/Cargo.toml b/library/stdarch/crates/stdarch-gen-arm/Cargo.toml new file mode 100644 index 0000000000000..899296d25ea7e --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "stdarch-gen-arm" +version = "0.1.0" +authors = ["Luca Vizzarro ", + "Jamie Cunliffe ", + "Adam Gemmell ", + "James Barford-Evans "] +license = "MIT OR Apache-2.0" +edition = "2024" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +itertools = "0.14.0" +lazy_static = "1.4.0" +proc-macro2 = "1.0" +quote = "1.0" +regex = "1.5" +serde = { version = "1.0", features = ["derive"] } +serde_with = "1.14" +serde_yaml = "0.8" +walkdir = "2.3.2" diff --git a/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml new file mode 100644 index 0000000000000..f658267b9a19b --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -0,0 +1,14198 @@ +arch_cfgs: + - arch_name: aarch64 + target_feature: [neon] + llvm_prefix: llvm.aarch64.neon +# Generate big endian shuffles +auto_big_endian: true + +# We do not want to 
automatically generate signed/unsigned casts +auto_llvm_sign_conversion: false + +# Repeatedly used anchors +# #[stable(feature = "neon_intrinsics", since = "1.59.0")] +neon-stable: &neon-stable + FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + +# #[cfg(not(target_arch = "arm"))] +target-not-arm: &target-not-arm + FnCall: [cfg, [{ FnCall: [not, ['target_arch = "arm"']]}]] + +# #[cfg_attr(all(test, not(target_env = "msvc"))] +msvc-disabled: &msvc-disabled + FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]] + +# all(test, target_arch = "arm") +test-is-arm: &test-is-arm + FnCall: [all, [test, 'target_arch = "arm"']] + +# #[target_feature(enable = "neon,aes")] +neon-aes: &neon-aes + FnCall: [target_feature, ['enable = "neon,aes"']] + +# #[target_feature(enable = "neon,i8mm")] +neon-i8mm: &neon-i8mm + FnCall: [target_feature, ['enable = "neon,i8mm"']] + +# #[target_feature(enable = "neon,fp16")] +neon-fp16: &neon-fp16 + FnCall: [target_feature, ['enable = "neon,fp16"']] + +# #[cfg_attr(not(target_arch = "arm"), target_feature(enable = "fhm"))] +enable-fhm: &enable-fhm + FnCall: [cfg_attr, [{ FnCall: [not, ['target_arch = "arm"']]}, { FnCall: [target_feature, ['enable = "fhm"']] }]] + +enable-fcma: &enable-fcma + FnCall: [cfg_attr, [{ FnCall: [not, ['target_arch = "arm"']]}, { FnCall: [target_feature, ['enable = "fcma"']] }]] + +# #[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +neon-unstable-fcma: &neon-unstable-fcma + FnCall: [unstable, ['feature = "stdarch_neon_fcma"', 'issue = "117222"']] + +aarch64-crc-stable: &aarch64-crc-stable + FnCall: [stable, ['feature = "stdarch_aarch64_crc32"', 'since = "1.80.0"']] + +# #[unstable(feature = "stdarch_neon_f16", issue = "136306")] +neon-unstable-f16: &neon-unstable-f16 + FnCall: [unstable, ['feature = "stdarch_neon_f16"', 'issue = "136306"']] + +# #[unstable(feature = "stdarch_neon_feat_lut", issue = "138050")] +neon-unstable-feat-lut: &neon-unstable-feat-lut + FnCall: [unstable, ['feature = "stdarch_neon_feat_lut"', 'issue = "138050"']] + +# #[cfg(target_endian = "little")] +little-endian: &little-endian + FnCall: [cfg, ['target_endian = "little"']] + +# #[cfg(target_endian = "big")] +big-endian: &big-endian + FnCall: [cfg, ['target_endian = "big"']] + +intrinsics: + - name: "vaddd_{type}" + doc: Add + arguments: ["a: {type}", "b: {type}"] + return_type: "{type}" + attr: [*neon-stable] + assert_instr: [nop] + safety: safe + types: + - i64 + - u64 + compose: + - MethodCall: + - a + - wrapping_add + - - b + + - name: "veor3{neon_type.no}" + doc: Three-way exclusive OR + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,sha3"']] + - FnCall: [stable, ['feature = "stdarch_neon_sha3"', 'since = "1.79.0"']] + assert_instr: [eor3] + safety: safe + types: + - int8x16_t + - int16x8_t + - int32x4_t + - int64x2_t + compose: + - LLVMLink: + name: "llvm.aarch64.crypto.eor3s.{neon_type}" + links: + - link: "llvm.aarch64.crypto.eor3s.{neon_type}" + arch: aarch64,arm64ec + + - name: "veor3{neon_type.no}" + doc: Three-way exclusive OR + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,sha3"']] + - FnCall: [stable, ['feature = "stdarch_neon_sha3"', 'since = "1.79.0"']] + assert_instr: [eor3] + safety: safe + types: + - uint8x16_t + - uint16x8_t + - uint32x4_t + - uint64x2_t + compose: + - LLVMLink: + name: 
"llvm.aarch64.crypto.eor3u.{neon_type}" + links: + - link: "llvm.aarch64.crypto.eor3u.{neon_type}" + arch: aarch64,arm64ec + + - name: "vabd{neon_type.no}" + doc: Absolute difference between the arguments of Floating + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [fabd] + safety: safe + types: + - float64x1_t + - float64x2_t + compose: + - LLVMLink: + name: "fabd.{neon_type}" + links: + - link: "llvm.aarch64.neon.fabd.{neon_type}" + arch: aarch64,arm64ec + + - name: "vabd{type[0]}" + doc: "Floating-point absolute difference" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[1]}" + attr: [*neon-stable] + assert_instr: [fabd] + safety: safe + types: + - ['s_f32', 'f32'] + - ['d_f64', 'f64'] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - "vabd_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - FnCall: ["vdup_n_{type[1]}", [b]] + - 0 + + - name: "vabd{type[0]}" + doc: "Floating-point absolute difference" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[1]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fabd] + safety: safe + types: + - ['h_f16', 'f16'] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - "vabd_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - FnCall: ["vdup_n_{type[1]}", [b]] + - 0 + + - name: "vabdl_high{neon_type[0].noq}" + doc: Signed Absolute difference Long + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: [*neon-stable] + assert_instr: [sabdl] + safety: safe + types: + - [int8x16_t, int16x8_t, int8x8_t, uint8x8_t] + compose: + - Let: + - c + - "{neon_type[2]}" + - FnCall: + - simd_shuffle! + - - a + - a + - [8, 9, 10, 11, 12, 13, 14, 15] + - Let: + - d + - "{neon_type[2]}" + - FnCall: + - simd_shuffle! + - - b + - b + - [8, 9, 10, 11, 12, 13, 14, 15] + - Let: + - e + - "{neon_type[3]}" + - FnCall: + - simd_cast + - - FnCall: + - "vabd_{neon_type[0]}" + - - c + - d + - FnCall: + - simd_cast + - - e + + - name: "vabdl_high{neon_type[0].noq}" + doc: Signed Absolute difference Long + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: + - stable + - - 'feature = "neon_intrinsics"' + - 'since = "1.59.0"' + assert_instr: [sabdl] + safety: safe + types: + - [int16x8_t, int32x4_t, int16x4_t, uint16x4_t] + compose: + - Let: + - c + - "{neon_type[2]}" + - FnCall: + - simd_shuffle! + - - a + - a + - [4, 5, 6, 7] + - Let: + - d + - "{neon_type[2]}" + - FnCall: + - simd_shuffle! + - - b + - b + - [4, 5, 6, 7] + - Let: + - e + - "{neon_type[3]}" + - FnCall: + - simd_cast + - - FnCall: + - "vabd_{neon_type[0]}" + - - c + - d + - FnCall: + - simd_cast + - - e + + - name: "vabdl_high{neon_type[0].noq}" + doc: Signed Absolute difference Long + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: + - stable + - - 'feature = "neon_intrinsics"' + - 'since = "1.59.0"' + assert_instr: [sabdl] + safety: safe + types: + - [int32x4_t, int64x2_t, int32x2_t, uint32x2_t] + compose: + - Let: + - c + - "{neon_type[2]}" + - FnCall: + - simd_shuffle! + - - a + - a + - [2, 3] + - Let: + - d + - "{neon_type[2]}" + - FnCall: + - simd_shuffle! 
+ - - b + - b + - [2, 3] + - Let: + - e + - "{neon_type[3]}" + - FnCall: + - simd_cast + - - FnCall: + - "vabd_{neon_type[0]}" + - - c + - d + - FnCall: + - simd_cast + - - e + + - name: "vceq{neon_type[0].no}" + doc: "Compare bitwise Equal (vector)" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmeq]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint64x1_t, uint64x1_t] + - [uint64x2_t, uint64x2_t] + - [int64x1_t, uint64x1_t] + - [int64x2_t, uint64x2_t] + - [poly64x1_t, uint64x1_t] + - [poly64x2_t, uint64x2_t] + compose: + - FnCall: [simd_eq, [a, b]] + + - name: "vceq{neon_type[0].no}" + doc: "Floating-point compare equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmeq]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x1_t, uint64x1_t] + - [float64x2_t, uint64x2_t] + compose: + - FnCall: [simd_eq, [a, b]] + + - name: "vceq{type[0]}" + doc: "Floating-point compare equal" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s_f32", "f32", "u32"] + - ["d_f64", "f64", "u64"] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - "vceq_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - FnCall: ["vdup_n_{type[1]}", [b]] + - '0' + + + - name: "vceq{type[0]}" + doc: "Floating-point compare equal" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["h_f16", "f16", "u16"] + compose: + - FnCall: + - simd_extract! 
+ - - FnCall: + - "vceq_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - FnCall: ["vdup_n_{type[1]}", [b]] + - '0' + + - name: "vceqd_{type[0]}" + doc: "Compare bitwise equal" + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i64", "u64", "s64"] + - ["u64", "u64", "u64"] + compose: + - FnCall: + - transmute + - - FnCall: + - "vceq_{type[2]}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vtst{neon_type[0].no}" + doc: "Signed compare bitwise Test bits nonzero" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmtst]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int64x1_t, uint64x1_t, 'i64x1', 'i64x1::new(0)'] + - [int64x2_t, uint64x2_t, 'i64x2', 'i64x2::new(0, 0)'] + - [poly64x1_t, uint64x1_t, 'i64x1', 'i64x1::new(0)'] + - [poly64x2_t, uint64x2_t, 'i64x2', 'i64x2::new(0, 0)'] + compose: + - Let: [c, "{neon_type[0]}", {FnCall: [simd_and, [a, b]]}] + - Let: [d, "{type[2]}", "{type[3]}"] + - FnCall: [simd_ne, [c, {FnCall: [transmute, [d]]}]] + + - name: "vtstd_{type[0]}" + doc: "Compare bitwise test bits nonzero" + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tst]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i64", "u64", "s64"] + - ["u64", "u64", "u64"] + compose: + - FnCall: + - transmute + - - FnCall: + - "vtst_{type[2]}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vuqadd{type[0]}" + doc: "Signed saturating accumulate of unsigned value" + arguments: ["a: {type[1]}", "b: {type[2]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [suqadd]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s_s32", "i32", "u32"] + - ["d_s64", "i64", "u64"] + compose: + - LLVMLink: + name: "vuqadd{type[0]}" + links: + - link: "llvm.aarch64.neon.suqadd.{type[1]}" + arch: aarch64,arm64ec + + - name: "vuqadd{type[0]}" + doc: "Signed saturating accumulate of unsigned value" + arguments: ["a: {type[1]}", "b: {type[2]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [suqadd]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["b_s8", "i8", "u8", "s8"] + - ["h_s16", "i16", "u16", "s16"] + compose: + - FnCall: + - simd_extract! 
+ - - FnCall: + - "vuqadd_{type[3]}" + - - FnCall: ["vdup_n_{type[3]}", [a]] + - FnCall: ["vdup_n_{type[2]}", [b]] + - '0' + + - name: "vabs{neon_type.no}" + doc: "Floating-point absolute value" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fabs]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - float64x1_t + - float64x2_t + compose: + - FnCall: [simd_fabs, [a]] + + - name: "vcgt{neon_type[0].no}" + doc: "Compare signed greater than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmgt]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int64x1_t, uint64x1_t] + - [int64x2_t, uint64x2_t] + compose: + - FnCall: [simd_gt, [a, b]] + + - name: "vcgt{neon_type.no}" + doc: "Compare unsigned greater than" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmhi]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - uint64x1_t + - uint64x2_t + compose: + - FnCall: [simd_gt, [a, b]] + + - name: "vcgt{neon_type[0].no}" + doc: "Floating-point compare greater than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmgt]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x1_t, uint64x1_t] + - [float64x2_t, uint64x2_t] + compose: + - FnCall: [simd_gt, [a, b]] + + - name: "vcgt{type[0]}" + doc: "Floating-point compare greater than" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s_f32", "f32", "u32"] + - ["d_f64", "f64", "u64"] + compose: + - FnCall: + - 'simd_extract!' + - - FnCall: + - "vcgt_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - FnCall: ["vdup_n_{type[1]}", [b]] + - '0' + + + - name: "vcgt{type[0]}" + doc: "Floating-point compare greater than" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["h_f16", "f16", "u16"] + compose: + - FnCall: + - 'simd_extract!' 
+ - - FnCall: + - "vcgt_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - FnCall: ["vdup_n_{type[1]}", [b]] + - '0' + + - name: "vclt{neon_type[0].no}" + doc: "Compare signed less than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmgt]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int64x1_t, uint64x1_t] + - [int64x2_t, uint64x2_t] + compose: + - FnCall: [simd_lt, [a, b]] + + - name: "vcle{neon_type[0].no}" + doc: "Compare signed less than or equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmge]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int64x1_t, uint64x1_t] + - [int64x2_t, uint64x2_t] + compose: + - FnCall: [simd_le, [a, b]] + + - name: "vcle{neon_type[0].no}" + doc: "Floating-point compare less than or equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmge]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x1_t, uint64x1_t] + - [float64x2_t, uint64x2_t] + compose: + - FnCall: [simd_le, [a, b]] + + - name: "vcle{type[0]}" + doc: "Floating-point compare less than or equal" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s_f32", "f32", "u32"] + - ["d_f64", "f64", "u64"] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - "vcle_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - FnCall: ["vdup_n_{type[1]}", [b]] + - '0' + + + - name: "vcle{type[0]}" + doc: "Floating-point compare less than or equal" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["h_f16", "f16", "u16"] + compose: + - FnCall: + - simd_extract! 
+ - - FnCall: + - "vcle_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - FnCall: ["vdup_n_{type[1]}", [b]] + - '0' + + - name: "vcge{neon_type[0].no}" + doc: "Compare signed greater than or equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmge]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int64x1_t, uint64x1_t] + - [int64x2_t, uint64x2_t] + compose: + - FnCall: [simd_ge, [a, b]] + + - name: "vcgez{neon_type[0].no}" + doc: "Compare signed greater than or equal to zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmge]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, uint8x8_t, i8x8, 'i8x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [int8x16_t, uint8x16_t, i8x16, 'i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)'] + - [int16x4_t, uint16x4_t, i16x4, 'i16x4::new(0, 0, 0, 0)'] + - [int16x8_t, uint16x8_t, i16x8, 'i16x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [int32x2_t, uint32x2_t, i32x2, 'i32x2::new(0, 0)'] + - [int32x4_t, uint32x4_t, i32x4, 'i32x4::new(0, 0, 0, 0)'] + - [int64x1_t, uint64x1_t, i64x1, 'i64x1::new(0)'] + - [int64x2_t, uint64x2_t, i64x2, 'i64x2::new(0, 0)'] + compose: + - Let: [b, "{type[2]}", "{type[3]}"] + - FnCall: [simd_ge, [a, {FnCall: [transmute, [b]]}]] + + - name: "vcgezd_s64" + doc: "Compare signed greater than or equal to zero" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i64", "u64"] + compose: + - FnCall: + - transmute + - - FnCall: + - vcgez_s64 + - - FnCall: [transmute, [a]] + + - name: "vclez{neon_type[0].no}" + doc: "Compare signed less than or equal to zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmle]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, uint8x8_t, i8x8, 'i8x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [int8x16_t, uint8x16_t, i8x16, 'i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)'] + - [int16x4_t, uint16x4_t, i16x4, 'i16x4::new(0, 0, 0, 0)'] + - [int16x8_t, uint16x8_t, i16x8, 'i16x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [int32x2_t, uint32x2_t, i32x2, 'i32x2::new(0, 0)'] + - [int32x4_t, uint32x4_t, i32x4, 'i32x4::new(0, 0, 0, 0)'] + - [int64x1_t, uint64x1_t, i64x1, 'i64x1::new(0)'] + - [int64x2_t, uint64x2_t, i64x2, 'i64x2::new(0, 0)'] + compose: + - Let: [b, "{type[2]}", "{type[3]}"] + - FnCall: + - simd_le + - - a + - FnCall: [transmute, [b]] + + - name: "vclez{neon_type[0].no}" + doc: "Floating-point compare less than or equal to zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmle]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, uint32x2_t, f32x2, 'f32x2::new(0.0, 0.0)'] + - [float32x4_t, uint32x4_t, f32x4, 'f32x4::new(0.0, 0.0, 0.0, 0.0)'] + - [float64x1_t, uint64x1_t, f64, '0.0'] + - [float64x2_t, uint64x2_t, f64x2, 'f64x2::new(0.0, 0.0)'] + compose: + - Let: [b, "{type[2]}", "{type[3]}"] + - FnCall: + - simd_le + - - a + - 
FnCall: [transmute, [b]] + + - name: "vclez{type[0]}" + doc: "Floating-point compare less than or equal to zero" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s_f32", "f32", "u32"] + - ["d_f64", "f64", "u64"] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - "vclez_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - '0' + + - name: "vclez{type[0]}" + doc: "Floating-point compare less than or equal to zero" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["h_f16", "f16", "u16"] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - "vclez_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - '0' + + - name: "vcltz{neon_type[0].no}" + doc: "Compare signed less than zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmlt]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, uint8x8_t, i8x8, 'i8x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [int8x16_t, uint8x16_t, i8x16, 'i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)'] + - [int16x4_t, uint16x4_t, i16x4, 'i16x4::new(0, 0, 0, 0)'] + - [int16x8_t, uint16x8_t, i16x8, 'i16x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [int32x2_t, uint32x2_t, i32x2, 'i32x2::new(0, 0)'] + - [int32x4_t, uint32x4_t, i32x4, 'i32x4::new(0, 0, 0, 0)'] + - [int64x1_t, uint64x1_t, i64x1, 'i64x1::new(0)'] + - [int64x2_t, uint64x2_t, i64x2, 'i64x2::new(0, 0)'] + compose: + - Let: [b, "{type[2]}", "{type[3]}"] + - FnCall: + - simd_lt + - - a + - FnCall: [transmute, [b]] + + - name: "vcltz{neon_type[0].no}" + doc: "Floating-point compare less than zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmlt]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, uint32x2_t, f32x2, 'f32x2::new(0.0, 0.0)'] + - [float32x4_t, uint32x4_t, f32x4, 'f32x4::new(0.0, 0.0, 0.0, 0.0)'] + - [float64x1_t, uint64x1_t, f64, '0.0'] + - [float64x2_t, uint64x2_t, f64x2, 'f64x2::new(0.0, 0.0)'] + compose: + - Let: [b, "{type[2]}", "{type[3]}"] + - FnCall: + - simd_lt + - - a + - FnCall: [transmute, [b]] + + - name: "vcltz{type[0]}" + doc: "Floating-point compare less than zero" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s_f32", "f32", "u32"] + - ["d_f64", "f64", "u64"] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - "vcltz_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - '0' + + - name: "vcltz{type[0]}" + doc: "Floating-point compare less than zero" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["h_f16", "f16", "u16"] + compose: + - FnCall: + - simd_extract! 
+ - - FnCall: + - "vcltz_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - '0' + + - name: "vcltzd_s64" + doc: "Compare less than zero" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [asr]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i64", "u64"] + compose: + - FnCall: + - transmute + - - FnCall: + - vcltz_s64 + - - FnCall: [transmute, [a]] + + - name: "vcagt{neon_type[0].no}" + doc: "Floating-point absolute compare greater than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [facgt]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x1_t, uint64x1_t] + - [float64x2_t, uint64x2_t] + compose: + - LLVMLink: + name: "vcagt{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.facgt.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vcagt{type[0]}" + doc: "Floating-point absolute compare greater than" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [facgt]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s_f32", "f32", "u32", i32] + - ["d_f64", "f64", "u64", i64] + compose: + - LLVMLink: + name: "vcagt{type[0]}" + links: + - link: "llvm.aarch64.neon.facgt.{type[3]}.{type[1]}" + arch: aarch64,arm64ec + + - name: "vcagt{type[0]}" + doc: "Floating-point absolute compare greater than" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [facgt]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["h_f16", "f16", "u16", i32] + compose: + - LLVMLink: + name: "vcagt{type[0]}" + return_type: "{type[3]}" + links: + - link: "llvm.aarch64.neon.facgt.{type[3]}.{type[1]}" + arch: aarch64,arm64ec + - 'unsafe {{ _vcagth_f16(a, b) as u16 }}' + + - name: "vcage{neon_type[0].no}" + doc: "Floating-point absolute compare greater than or equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [facge]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x1_t, uint64x1_t] + - [float64x2_t, uint64x2_t] + compose: + - LLVMLink: + name: "vcage{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.facge.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vcage{type[0]}" + doc: "Floating-point absolute compare greater than or equal" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [facge]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s_f32", "f32", "u32", i32] + - ["d_f64", "f64", "u64", i64] + compose: + - LLVMLink: + name: "vcage{type[0]}" + links: + - link: "llvm.aarch64.neon.facge.{type[3]}.{type[1]}" + arch: aarch64,arm64ec + + + - name: "vcage{type[0]}" + doc: "Floating-point absolute compare greater than or equal" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [facge]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["h_f16", 
"f16", "u16", i32] + compose: + - LLVMLink: + name: "vcage{type[0]}" + return_type: "{type[3]}" + links: + - link: "llvm.aarch64.neon.facge.{type[3]}.{type[1]}" + arch: aarch64,arm64ec + - "unsafe {{ _vcageh_f16(a, b) as u16 }}" + + - name: "vcalt{neon_type[0].no}" + doc: "Floating-point absolute compare less than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [facgt]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x1_t, uint64x1_t] + - [float64x2_t, uint64x2_t] + compose: + - FnCall: ["vcagt{neon_type[0].no}", [b, a]] + + - name: "vcalt{type[0]}" + doc: "Floating-point absolute compare less than" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [facgt]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s_f32", "f32", "u32"] + - ["d_f64", "f64", "u64"] + compose: + - FnCall: ["vcagt{type[0]}", [b, a]] + + - name: "vcalt{type[0]}" + doc: "Floating-point absolute compare less than" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [facgt]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["h_f16", "f16", "u16"] + compose: + - FnCall: ["vcagt{type[0]}", [b, a]] + + - name: "vcale{neon_type[0].no}" + doc: "Floating-point absolute compare less than or equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [facge]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x1_t, uint64x1_t] + - [float64x2_t, uint64x2_t] + compose: + - FnCall: ["vcage{neon_type[0].no}", [b, a]] + + - name: "vcale{type[0]}" + doc: "Floating-point absolute compare less than or equal" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [facge]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s_f32", "f32", "u32"] + - ["d_f64", "f64", "u64"] + compose: + - FnCall: ["vcage{type[0]}", [b, a]] + + - name: "vcale{type[0]}" + doc: "Floating-point absolute compare less than or equal" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [facge]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["h_f16", "f16", "u16"] + compose: + - FnCall: ["vcage{type[0]}", [b, a]] + + - name: "vcvt{neon_type[1].no}_{neon_type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [scvtf]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int64x1_t, float64x1_t] + - [int64x2_t, float64x2_t] + compose: + - FnCall: [simd_cast, [a]] + + - name: "vcvt{type[0]}_{type[3]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [scvtf]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s_f32", "i32", 
"f32", s32] + - ["d_f64", "i64", "f64", s64] + compose: + - Identifier: ["a as {type[2]}", Symbol] + + - name: "vcvt{neon_type[1].no}_{neon_type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ucvtf]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint64x1_t, float64x1_t] + - [uint64x2_t, float64x2_t] + compose: + - FnCall: [simd_cast, [a]] + + - name: "vcvt{type[2]}_{type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ucvtf]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["u32", "f32", "s_f32"] + - ["u64", "f64", "d_f64"] + compose: + - Identifier: ["a as {type[1]}", Symbol] + + - name: "vcvt{neon_type[1].N}_{neon_type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [scvtf, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int64x1_t, float64x1_t] + - [int64x2_t, float64x2_t] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 64']] + - LLVMLink: + name: "vcvt{neon_type[1].N}_{neon_type[0]}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.vcvtfxs2fp.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{neon_type[1].N}_{neon_type[0]}", [a, N], [], true] + + + - name: "vcvt{type[2]}_n_{type[1]}_{type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [scvtf, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - [i32, f16, 'h'] + - [i64, f16, 'h'] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 16']] + - LLVMLink: + name: "vcvt{type[2]}_n_{type[1]}_{type[0]}" + arguments: + - "a: {type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.vcvtfxs2fp.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{type[2]}_n_{type[1]}_{type[0]}", [a, N], [], true] + + + - name: "vcvt{type[2]}_n_{type[1]}_{type[0]}" + doc: "Floating-point convert to fixed-point, rounding toward zero" + arguments: ["a: {type[0]}"] + return_type: "{type[4]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzs, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - [f16, s16, 'h', i32, i16] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 16']] + - "vcvt{type[2]}_n_{type[3]}_{type[0]}::(a) as i16" + + - name: "vcvt{type[2]}_n_{type[1]}_{type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ucvtf, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - [u32, f16, 'h'] + - [u64, f16, 'h'] + compose: + - FnCall: 
[static_assert!, ['N >= 1 && N <= 16']] + - LLVMLink: + name: "vcvt{type[2]}_n_{type[1]}_{type[0]}" + arguments: + - "a: {type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.vcvtfxu2fp.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{type[2]}_n_{type[1]}_{type[0]}", [a, N], [], true] + + + - name: "vcvt{type[2]}_n_{type[1]}_{type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [scvtf, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - [i16, f16, 'h', 'i32', 'as i32'] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 16']] + - "vcvt{type[2]}_n_{type[1]}_{type[3]}::(a {type[4]}) as {type[1]}" + + + - name: "vcvt{type[2]}_n_{type[1]}_{type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ucvtf, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - [u16, f16, 'h', u32] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 16']] + - "vcvt{type[2]}_n_{type[1]}_{type[3]}::(a as {type[3]}) as {type[1]}" + + + - name: "vcvt{type[2]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [scvtf, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - ["i32", "f32", 's_n_f32_s32', 'N >= 1 && N <= 32'] + - ["i64", "f64", 'd_n_f64_s64', 'N >= 1 && N <= 64'] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 64']] + - LLVMLink: + name: "vcvt{type[2]}" + arguments: + - "a: {type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.vcvtfxs2fp.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{type[2]}", [a, N], [], true] + + + - name: "vcvt{neon_type[1].N}_{neon_type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ucvtf, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [uint64x1_t, float64x1_t] + - [uint64x2_t, float64x2_t] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 64']] + - LLVMLink: + name: "vcvt{neon_type[1].N}_{neon_type[0]}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.vcvtfxu2fp.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{neon_type[1].N}_{neon_type[0]}", ["a", N], [], true] + + - name: "vcvt{type[2]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ucvtf, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - ["u32", "f32", 's_n_f32_u32', 'N >= 1 && N <= 32'] + - ["u64", "f64", 'd_n_f64_u64', 'N >= 1 && N <= 64'] + compose: + 
- FnCall: [static_assert!, ["{type[3]}"]] + - LLVMLink: + name: "vcvt{type[2]}" + arguments: + - "a: {type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.vcvtfxu2fp.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{type[2]}", ["a", N], [], true] + + - name: "vcvt{type[2]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzs]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["f32", "i32", "s_s32_f32", "32"] + - ["f64", "i64", "d_s64_f64", "64"] + compose: + - Identifier: ["a as i{type[3]}", Symbol] + + - name: "vcvt{type[2]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzu]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["f32", "u32", "s_u32_f32"] + - ["f64", "u64", "d_u64_f64"] + compose: + - Identifier: ["a as {type[1]}", Symbol] + + + - name: "vcvt{type[2]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {type[3]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [scvtf]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["s16", "f16", "h_f16_s16", i16] + - ["s32", "f16", "h_f16_s32", i32] + - ["s64", "f16", "h_f16_s64", i64] + compose: + - Identifier: ["a as {type[1]}", Symbol] + + - name: "vcvt{type[2]}_{type[1]}_{type[0]}" + doc: "Floating-point convert to signed fixed-point" + arguments: ["a: {type[0]}"] + return_type: "{type[3]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzs]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "s16", "h", i16, 'a as i16'] + - ["f16", "s32", "h", i32, 'a as i32'] + - ["f16", "s64", "h", i64, 'a as i64'] + compose: + - Identifier: ["{type[4]}", Symbol] + + - name: "vcvt{type[2]}_{type[1]}_{type[0]}" + doc: "Floating-point convert to unsigned fixed-point" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzu]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "u16", "h", 'a as u16'] + - ["f16", "u32", "h", 'a as u32'] + - ["f16", "u64", "h", 'a as u64'] + compose: + - Identifier: ["{type[3]}", Symbol] + + + - name: "vcvt{type[2]}" + doc: "Unsigned fixed-point convert to floating-point" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ucvtf]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["u16", "f16", "h_f16_u16"] + - ["u32", "f16", "h_f16_u32"] + - ["u64", "f16", "h_f16_u64"] + compose: + - Identifier: ["a as {type[1]}", Symbol] + + + - name: "vcvt_f64_f32" + doc: "Floating-point convert to higher precision long" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, float64x2_t] + compose: + - FnCall: [simd_cast, [a]] + + - name: "vcvt_high_f64_f32" + doc: "Floating-point convert to higher precision long" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtl]]}]] + - FnCall: 
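Editor's note: several of the scalar fcvtzs/fcvtzu entries above are composed as plain `a as i32` / `a as u64` casts, converting float to integer. Since Rust 1.45 a float-to-integer `as` cast truncates toward zero and saturates at the target range, with NaN mapping to 0, which matches the instruction's behaviour. For reference:

```rust
// Behaviour of Rust float -> integer `as` casts, as relied on by the
// `a as i{N}` compositions above.
fn main() {
    assert_eq!(3.9_f32 as i32, 3);              // truncation toward zero
    assert_eq!((-3.9_f32) as i32, -3);
    assert_eq!(f32::INFINITY as i32, i32::MAX); // saturation at the range
    assert_eq!((-1.0_f32) as u32, 0);
    assert_eq!(f32::NAN as i32, 0);             // NaN converts to 0
}
```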
[stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x4_t, float64x2_t] + compose: + - Let: + - b + - float32x2_t + - FnCall: + - simd_shuffle! + - - a + - a + - '[2, 3]' + - FnCall: [simd_cast, [b]] + + - name: "vcvt_high_f16_f32" + doc: "Floating-point convert to lower precision" + arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtn2]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x8_t, float16x4_t, float32x4_t] + compose: + - FnCall: + - vcombine_f16 + - - a + - FnCall: [vcvt_f16_f32, [b]] + + - name: "vcvt_high_f32_f16" + doc: "Floating-point convert to higher precision" + arguments: ["a: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtl2]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float32x4_t, float16x8_t] + compose: + - FnCall: + - vcvt_f32_f16 + - - FnCall: [vget_high_f16, [a]] + + + - name: "vcvt_f32_f64" + doc: "Floating-point convert" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtn]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x2_t, float32x2_t] + compose: + - FnCall: [simd_cast, [a]] + + - name: "vcvt_high_f32_f64" + doc: "Floating-point convert to lower precision narrow" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtn]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, float64x2_t, float32x4_t] + compose: + - FnCall: + - simd_shuffle! + - - a + - FnCall: [simd_cast, [b]] + - '[0, 1, 2, 3]' + + - name: "vcvtx_f32_f64" + doc: "Floating-point convert to lower precision narrow, rounding to odd" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtxn]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x2_t, float32x2_t] + compose: + - LLVMLink: + name: "vcvtx_f32_f64" + links: + - link: "llvm.aarch64.neon.fcvtxn.v2f32.v2f64" + arch: aarch64,arm64ec + + - name: "vcvtxd_f32_f64" + doc: "Floating-point convert to lower precision narrow, rounding to odd" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtxn]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["f64", "f32"] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - vcvtx_f32_f64 + - - FnCall: [vdupq_n_f64, [a]] + - '0' + + - name: "vcvtx_high_f32_f64" + doc: "Floating-point convert to lower precision narrow, rounding to odd" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtxn]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, float64x2_t, float32x4_t] + compose: + - FnCall: + - simd_shuffle! 
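Editor's note: the `_high_` widening entries above shuffle out the upper half of the source vector and then widen each lane. A portable sketch of the `vcvt_high_f64_f32` composition (illustrative only; the real intrinsic works on `float32x4_t`/`float64x2_t`):

```rust
// Take the upper two f32 lanes (simd_shuffle!(a, a, [2, 3])) and widen
// each to f64 (simd_cast).
fn cvt_high_f64_f32(a: [f32; 4]) -> [f64; 2] {
    let hi = [a[2], a[3]];
    [hi[0] as f64, hi[1] as f64]
}

fn main() {
    assert_eq!(cvt_high_f64_f32([1.0, 2.0, 3.5, -4.25]), [3.5, -4.25]);
}
```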
+ - - a + - FnCall: [vcvtx_f32_f64, [b]] + - '[0, 1, 2, 3]' + + - name: "vcvt{type[2]}" + doc: "Floating-point convert to fixed-point, rounding toward zero" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzs, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [float64x1_t, int64x1_t, _n_s64_f64, '64'] + - [float64x2_t, int64x2_t, q_n_s64_f64, '64'] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= {type[3]}']] + - LLVMLink: + name: "vcvt{type[2]}" + arguments: ["a: {type[0]}", "n: i32"] + links: + - link: "llvm.aarch64.neon.vcvtfp2fxs.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{type[2]}", [a, N], [], true] + + - name: "vcvt{type[2]}" + doc: "Floating-point convert to fixed-point, rounding toward zero" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzs, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - ["f32", "i32", s_n_s32_f32, '32'] + - ["f64", "i64", d_n_s64_f64, '64'] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= {type[3]}']] + - LLVMLink: + name: "vcvt{type[2]}" + arguments: ["a: {type[0]}", "n: i32"] + links: + - link: "llvm.aarch64.neon.vcvtfp2fxs.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{type[2]}", [a, N], [], true] + + + - name: "vcvt{type[2]}_n_{type[1]}_{type[0]}" + doc: "Floating-point convert to fixed-point, rounding toward zero" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzs, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - ["f16", "i32", 'h', '16'] + - ["f16", "i64", 'h', '16'] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= {type[3]}']] + - LLVMLink: + name: "vcvt{type[2]}_n_{type[1]}_{type[0]}" + arguments: ["a: {type[0]}", "n: i32"] + links: + - link: "llvm.aarch64.neon.vcvtfp2fxs.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{type[2]}_n_{type[1]}_{type[0]}", [a, N], [], true] + + + - name: "vcvt{type[2]}_n_{type[1]}_{type[0]}" + doc: "Floating-point convert to fixed-point, rounding toward zero" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzu, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - [f16, u16, 'h', u32] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 16']] + - "vcvt{type[2]}_n_{type[3]}_{type[0]}::(a) as {type[1]}" + + + - name: "vcvt{type[2]}_n_{type[1]}_{type[0]}" + doc: "Floating-point convert to fixed-point, rounding toward zero" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzu, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - ["f16", "u32", 'h', '16'] + - ["f16", "u64", 'h', '16'] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= {type[3]}']] + - LLVMLink: + 
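Editor's note: the fcvtzs `_n_` entries above go the other way, floating point to fixed point with `N` fractional bits, rounding toward zero. A portable sketch under the assumption that a saturating `as` cast is an adequate stand-in for the instruction's clamping:

```rust
// Float -> fixed-point with N fractional bits: scale by 2^N, truncate.
fn float_to_fixed_n<const N: i32>(a: f32) -> i32 {
    assert!(N >= 1 && N <= 32); // analogue of the spec's static_assert!
    (a * (1i64 << N) as f32) as i32
}

fn main() {
    assert_eq!(float_to_fixed_n::<16>(1.5), 0x0001_8000);
    assert_eq!(float_to_fixed_n::<16>(-0.5), -0x8000);
}
```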
name: "vcvt{type[2]}_n_{type[1]}_{type[0]}" + arguments: ["a: {type[0]}", "n: i32"] + links: + - link: "llvm.aarch64.neon.vcvtfp2fxu.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{type[2]}_n_{type[1]}_{type[0]}", [a, N], [], true] + + - name: "vcvt{type[2]}" + doc: "Floating-point convert to fixed-point, rounding toward zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzu, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [float64x1_t, uint64x1_t, _n_u64_f64, '64'] + - [float64x2_t, uint64x2_t, q_n_u64_f64, '64'] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= {type[3]}']] + - LLVMLink: + name: "vcvt{type[2]}" + arguments: ["a: {type[0]}", "n: i32"] + links: + - link: "llvm.aarch64.neon.vcvtfp2fxu.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{type[2]}", [a, N], [], true] + + - name: "vcvt{type[2]}" + doc: "Floating-point convert to fixed-point, rounding toward zero" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzu, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - ["f32", "u32", s_n_u32_f32, '32'] + - ["f64", "u64", d_n_u64_f64, '64'] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= {type[3]}']] + - LLVMLink: + name: "vcvt{type[2]}" + arguments: ["a: {type[0]}", "n: i32"] + links: + - link: "llvm.aarch64.neon.vcvtfp2fxu.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{type[2]}", [a, N], [], true] + + - name: "vcvta{type[2]}" + doc: "Floating-point convert to signed integer, rounding to nearest with ties to away" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtas]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, int32x2_t, _s32_f32] + - [float32x4_t, int32x4_t, q_s32_f32] + - [float64x1_t, int64x1_t, _s64_f64] + - [float64x2_t, int64x2_t, q_s64_f64] + compose: + - LLVMLink: + name: "vcvta{type[2]}" + links: + - link: "llvm.aarch64.neon.fcvtas.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvta{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to signed integer, rounding to nearest with ties to away" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtas]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, int16x4_t] + - [float16x8_t, int16x8_t] + compose: + - LLVMLink: + name: "vcvta_{neon_type[1]}_{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fcvtas.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vcvta{type[2]}" + doc: "Floating-point convert to integer, rounding to nearest with ties to away" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtas]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["f32", "i32", 's_s32_f32'] + - ["f64", "i64", 'd_s64_f64'] + compose: + - LLVMLink: + name: "vcvta{type[2]}" + links: 
+ - link: "llvm.aarch64.neon.fcvtas.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvta{type[2]}" + doc: "Floating-point convert to integer, rounding to nearest with ties to away" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtau]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "u32", 'h_u32_f16'] + - ["f16", "u64", 'h_u64_f16'] + + compose: + - LLVMLink: + name: "vcvta{type[2]}" + links: + - link: "llvm.aarch64.neon.fcvtau.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvta{type[2]}" + doc: "Floating-point convert to integer, rounding to nearest with ties to away" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtas]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "i32", 'h_s32_f16'] + - ["f16", "i64", 'h_s64_f16'] + compose: + - LLVMLink: + name: "vcvta{type[2]}" + return_type: "{type[1]}" + links: + - link: "llvm.aarch64.neon.fcvtas.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvta{type[2]}" + doc: "Floating-point convert to integer, rounding to nearest with ties to away" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtas]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "i16", 'h_s16_f16', 's32'] + compose: + - 'vcvtah_{type[3]}_f16(a) as i16' + + - name: "vcvta{type[2]}" + doc: "Floating-point convert to integer, rounding to nearest with ties to away" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtau]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "u16", 'h_u16_f16', 'u32'] + compose: + - 'vcvtah_{type[3]}_f16(a) as u16' + + - name: "vcvta{type[2]}" + doc: "Floating-point convert to integer, rounding to nearest with ties to away" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtau]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["f32", "u32", 's_u32_f32'] + - ["f64", "u64", 'd_u64_f64'] + compose: + - LLVMLink: + name: "vcvta{type[2]}" + links: + - link: "llvm.aarch64.neon.fcvtau.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtn{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to signed integer, rounding to nearest with ties to even" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtns]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, int32x2_t] + - [float32x4_t, int32x4_t] + - [float64x1_t, int64x1_t] + - [float64x2_t, int64x2_t] + compose: + - LLVMLink: + name: "vcvtn{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fcvtns.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtn{type[2]}" + doc: "Floating-point convert to signed integer, rounding to nearest with ties to even" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtns]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["f32", "i32", 's_s32_f32'] + - ["f64", "i64", 'd_s64_f64'] + 
compose: + - LLVMLink: + name: "vcvtn{type[2]}" + links: + - link: "llvm.aarch64.neon.fcvtns.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvtn{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to signed integer, rounding to nearest with ties to even" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtns]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, int16x4_t] + - [float16x8_t, int16x8_t] + compose: + - LLVMLink: + name: "vcvtn{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fcvtns.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvtn{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to unsigned integer, rounding to nearest with ties to even" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtnu]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t] + - [float16x8_t, uint16x8_t] + compose: + - LLVMLink: + name: "vcvtn{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fcvtnu.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtn{type[2]}_{type[1]}_{type[0]}" + doc: "Floating-point convert to integer, rounding to nearest with ties to even" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtns]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "i32", 'h'] + - ["f16", "i64", 'h'] + compose: + - LLVMLink: + name: "vcvtm{type[2]}_{type[1]}_{type[0]}" + return_type: "{type[1]}" + links: + - link: "llvm.aarch64.neon.fcvtns.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtn{type[2]}_{type[1]}_{type[0]}" + doc: "Floating-point convert to integer, rounding to nearest with ties to even" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtns]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "i16", 'h', 'i32'] + compose: + - 'vcvtnh_{type[3]}_f16(a) as i16' + + + - name: "vcvtn{type[2]}_{type[1]}_{type[0]}" + doc: "Floating-point convert to unsigned integer, rounding to nearest with ties to even" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtnu]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "u32", 'h'] + - ["f16", "u64", 'h'] + compose: + - LLVMLink: + name: "vcvtm{type[2]}_{type[1]}_{type[0]}" + return_type: "{type[1]}" + links: + - link: "llvm.aarch64.neon.fcvtnu.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtn{type[2]}_{type[1]}_{type[0]}" + doc: "Floating-point convert to unsigned integer, rounding to nearest with ties to even" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtnu]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "u16", 'h', 'u32'] + compose: + - 'vcvtnh_{type[3]}_f16(a) as u16' + + - name: "vcvtm{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to signed integer, rounding toward minus infinity" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtms]]}]] + - FnCall: [stable, ['feature = 
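Editor's note: the fcvtns/fcvtnu entries above round to nearest with ties to even. `f32::round_ties_even` (stable since Rust 1.77) follows the same rule, so a portable sketch of the scalar form is simply:

```rust
fn cvtn_s32_f32(a: f32) -> i32 {
    a.round_ties_even() as i32
}

fn main() {
    assert_eq!(cvtn_s32_f32(2.5), 2);   // tie rounds to the even neighbour
    assert_eq!(cvtn_s32_f32(3.5), 4);
    assert_eq!(cvtn_s32_f32(-2.5), -2);
}
```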
"neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, int32x2_t] + - [float32x4_t, int32x4_t] + - [float64x1_t, int64x1_t] + - [float64x2_t, int64x2_t] + compose: + - LLVMLink: + name: "vcvtm{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fcvtms.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvtm{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to signed integer, rounding toward minus infinity" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtms]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, int16x4_t] + - [float16x8_t, int16x8_t] + compose: + - LLVMLink: + name: "vcvtm{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fcvtms.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvtm{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to unsigned integer, rounding toward minus infinity" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtmu]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t] + - [float16x8_t, uint16x8_t] + compose: + - LLVMLink: + name: "vcvtm{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fcvtmu.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvtm{type[2]}" + doc: "Floating-point convert to signed integer, rounding toward minus infinity" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtms]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["f32", "i32", 's_s32_f32'] + - ["f64", "i64", 'd_s64_f64'] + compose: + - LLVMLink: + name: "vcvtm{type[2]}" + links: + - link: "llvm.aarch64.neon.fcvtms.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtp{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to signed integer, rounding toward plus infinity" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtps]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, int32x2_t] + - [float32x4_t, int32x4_t] + - [float64x1_t, int64x1_t] + - [float64x2_t, int64x2_t] + compose: + - LLVMLink: + name: "vcvtp{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fcvtps.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtp{type[2]}" + doc: "Floating-point convert to signed integer, rounding toward plus infinity" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtps]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["f32", "i32", 's_s32_f32'] + - ["f64", "i64", 'd_s64_f64'] + compose: + - LLVMLink: + name: "vcvtp{type[2]}" + links: + - link: "llvm.aarch64.neon.fcvtps.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtn{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to unsigned integer, rounding to nearest with ties to even" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtnu]]}]] + - FnCall: 
[stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, uint32x2_t] + - [float32x4_t, uint32x4_t] + - [float64x1_t, uint64x1_t] + - [float64x2_t, uint64x2_t] + compose: + - LLVMLink: + name: "vcvtn{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fcvtnu.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtn{type[2]}" + doc: "Floating-point convert to unsigned integer, rounding to nearest with ties to even" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtnu]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["f32", "u32", 's_u32_f32'] + - ["f64", "u64", 'd_u64_f64'] + compose: + - LLVMLink: + name: "vcvtn{type[2]}" + links: + - link: "llvm.aarch64.neon.fcvtnu.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtm{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to unsigned integer, rounding toward minus infinity" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtmu]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, uint32x2_t] + - [float32x4_t, uint32x4_t] + - [float64x1_t, uint64x1_t] + - [float64x2_t, uint64x2_t] + compose: + - LLVMLink: + name: "vcvtm{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fcvtmu.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtm{type[2]}" + doc: "Floating-point convert to unsigned integer, rounding toward minus infinity" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtmu]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["f32", "u32", s_u32_f32] + - ["f64", "u64", d_u64_f64] + compose: + - LLVMLink: + name: "vcvtm{type[2]}" + links: + - link: "llvm.aarch64.neon.fcvtmu.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtp{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to unsigned integer, rounding toward plus infinity" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtpu]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, uint32x2_t] + - [float32x4_t, uint32x4_t] + - [float64x1_t, uint64x1_t] + - [float64x2_t, uint64x2_t] + compose: + - LLVMLink: + name: "vcvtp{neon_type[1].no}_{neon_type[1]}" + links: + - link: "llvm.aarch64.neon.fcvtpu.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtp{type[2]}" + doc: "Floating-point convert to unsigned integer, rounding toward plus infinity" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtpu]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["f32", "u32", s_u32_f32, 'i32'] + - ["f64", "u64", d_u64_f64, 'u64'] + compose: + - LLVMLink: + name: "vcvtp{type[2]}" + arguments: + - "a: {type[0]}" + return_type: "{type[1]}" + links: + - link: "llvm.aarch64.neon.fcvtpu.{type[3]}.{type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvtp{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to signed integer, rounding to plus 
infinity" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtps]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, int16x4_t] + - [float16x8_t, int16x8_t] + compose: + - LLVMLink: + name: "vcvtp{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fcvtps.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvtp{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to unsigned integer, rounding to plus infinity" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtpu]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t] + - [float16x8_t, uint16x8_t] + compose: + - LLVMLink: + name: "vcvtp{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fcvtpu.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvtp{type[2]}_{type[1]}_{type[0]}" + doc: "Floating-point convert to integer, rounding to plus infinity" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtps]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "i32", 'h'] + - ["f16", "i64", 'h'] + compose: + - LLVMLink: + name: "vcvtp{type[2]}_{type[1]}_{type[0]}" + return_type: "{type[1]}" + links: + - link: "llvm.aarch64.neon.fcvtps.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtp{type[2]}_{type[1]}_{type[0]}" + doc: "Floating-point convert to integer, rounding to plus infinity" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtps]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "i16", 'h', 'i32'] + compose: + - 'vcvtph_{type[3]}_f16(a) as i16' + + - name: "vcvtp{type[2]}_{type[1]}_{type[0]}" + doc: "Floating-point convert to unsigned integer, rounding to plus infinity" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtpu]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "u32", 'h'] + - ["f16", "u64", 'h'] + compose: + - LLVMLink: + name: "vcvtp{type[2]}_{type[1]}_{type[0]}" + return_type: "{type[1]}" + links: + - link: "llvm.aarch64.neon.fcvtpu.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtp{type[2]}_{type[1]}_{type[0]}" + doc: "Floating-point convert to unsigned integer, rounding to plus infinity" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtpu]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "u16", 'h', 'u32'] + compose: + - 'vcvtph_{type[3]}_f16(a) as u16' + + - name: "vdup{neon_type.laneq_nox}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [dup, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - poly64x2_t + - float64x2_t + compose: + - FnCall: [static_assert_uimm_bits!, [N, 1]] + - FnCall: [simd_shuffle!, [a, a, '[N as u32, N as u32]']] + + - name: "vdup{neon_type[1].lane_nox}" + doc: "Set all vector lanes to the 
same value" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [dup, 'N = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [poly64x1_t, poly64x2_t] + - [float64x1_t, float64x2_t] + compose: + - FnCall: [static_assert!, ['N == 0']] + - FnCall: [simd_shuffle!, [a, a, '[N as u32, N as u32]']] + + - name: "vdup{neon_type.lane_nox}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'N = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - poly64x1_t + - float64x1_t + compose: + - FnCall: [static_assert!, ['N == 0']] + - Identifier: [a, Symbol] + + - name: "vdupd{neon_type[0].lane_nox}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'N = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int64x1_t, "i64"] + - [uint64x1_t, "u64"] + - [float64x1_t, "f64"] + compose: + - FnCall: [static_assert!, ['N == 0']] + - FnCall: [simd_extract!, [a, 'N as u32']] + + - name: "vdup_laneq_{neon_type[0]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [poly64x2_t, poly64x1_t, 'u64'] + - [float64x2_t, float64x1_t, 'f64'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, 1]] + - FnCall: + - "transmute::<{type[2]}, _>" + - - FnCall: [simd_extract!, [a, 'N as u32']] + + - name: "vdup{type[2]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int32x2_t, "i32", s_lane_s32] + - [int64x2_t, "i64", d_laneq_s64] + - [uint32x2_t, "u32", s_lane_u32] + - [uint64x2_t, "u64", d_laneq_u64] + - [float32x2_t, "f32", s_lane_f32] + - [float64x2_t, "f64", d_laneq_f64] + compose: + - FnCall: [static_assert_uimm_bits!, [N, 1]] + - FnCall: [simd_extract!, [a, 'N as u32']] + + - name: "vdup{type[2]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'N = 4']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x8_t, "i8", b_lane_s8] + - [int16x8_t, "i16", h_laneq_s16] + - [uint8x8_t, "u8", b_lane_u8] + - [uint16x8_t, "u16", h_laneq_u16] + - [poly8x8_t, "p8", b_lane_p8] + - 
[poly16x8_t, "p16", h_laneq_p16] + compose: + - FnCall: [static_assert_uimm_bits!, [N, 3]] + - FnCall: [simd_extract!, [a, 'N as u32']] + + + - name: "vdup{type[2]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - [float16x4_t, "f16", h_lane_f16] + compose: + - FnCall: [static_assert_uimm_bits!, [N, 2]] + - FnCall: [simd_extract!, [a, 'N as u32']] + + + - name: "vdup{type[2]}" + doc: "Extract an element from a vector" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'N = 4']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - [float16x8_t, "f16", h_laneq_f16] + compose: + - FnCall: [static_assert_uimm_bits!, [N, 4]] + - FnCall: [simd_extract!, [a, 'N as u32']] + + + - name: "vdup{type[2]}" + doc: "Extract an element from a vector" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'N = 8']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x16_t, "i8", b_laneq_s8] + - [uint8x16_t, "u8", b_laneq_u8] + - [poly8x16_t, "p8", b_laneq_p8] + compose: + - FnCall: [static_assert_uimm_bits!, [N, 4]] + - FnCall: [simd_extract!, [a, 'N as u32']] + + - name: "vdup{type[2]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x4_t, "i16", h_lane_s16] + - [int32x4_t, "i32", s_laneq_s32] + - [uint16x4_t, "u16", h_lane_u16] + - [uint32x4_t, "u32", s_laneq_u32] + - [poly16x4_t, "p16", h_lane_p16] + - [float32x4_t, "f32", s_laneq_f32] + compose: + - FnCall: [static_assert_uimm_bits!, [N, 2]] + - FnCall: [simd_extract!, [a, 'N as u32']] + + - name: "vext{neon_type[0].no}" + doc: "Extract vector from pair of vectors" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ext, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [poly64x2_t, ' static_assert_uimm_bits!(N, 1);', 'unsafe { match N & 0b1 { 0 => simd_shuffle!(a, b, [0, 1]), 1 => simd_shuffle!(a, b, [1, 2]), _ => unreachable_unchecked(), } }'] + - [float64x2_t, ' static_assert_uimm_bits!(N, 1);', 'unsafe { match N & 0b1 { 0 => simd_shuffle!(a, b, [0, 1]), 1 => simd_shuffle!(a, b, [1, 2]), _ => unreachable_unchecked(), } }'] + compose: + - Identifier: ["{type[1]}", Symbol] + - Identifier: ["{type[2]}", Symbol] + + - name: "vmla{neon_type.no}" + doc: "Floating-point multiply-add to accumulator" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, 
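Editor's note: the vext composition above concatenates the two inputs and takes a window of two lanes starting at lane `N`, which is why `N` is constrained to one bit for two-lane vectors. A portable sketch:

```rust
// vextq semantics for 2-lane f64 vectors: window of width 2 over [a, b].
fn ext_f64x2<const N: usize>(a: [f64; 2], b: [f64; 2]) -> [f64; 2] {
    assert!(N < 2); // analogue of static_assert_uimm_bits!(N, 1)
    let c = [a[0], a[1], b[0], b[1]];
    [c[N], c[N + 1]]
}

fn main() {
    assert_eq!(ext_f64x2::<0>([1.0, 2.0], [3.0, 4.0]), [1.0, 2.0]);
    assert_eq!(ext_f64x2::<1>([1.0, 2.0], [3.0, 4.0]), [2.0, 3.0]);
}
```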
[test, {FnCall: [assert_instr, [fmul]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - float64x1_t + - float64x2_t + compose: + - FnCall: [simd_add, [a, {FnCall: [simd_mul, [b, c]]}]] + + - name: "vmlal_high_{neon_type[1]}" + doc: "Signed multiply-add long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [smlal2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int16x8_t, int8x16_t, int8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', '[8, 9, 10, 11, 12, 13, 14, 15]'] + - [int32x4_t, int16x8_t, int16x4_t, '[4, 5, 6, 7]', '[4, 5, 6, 7]'] + - [int64x2_t, int32x4_t, int32x2_t, '[2, 3]', '[2, 3]'] + compose: + - Let: [b, "{neon_type[2]}", {FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]}] + - Let: [c, "{neon_type[2]}", {FnCall: [simd_shuffle!, [c, c, "{type[4]}"]]}] + - FnCall: ["vmlal_{neon_type[2]}", [a, b, c]] + + - name: "vmlal_high_{neon_type[1]}" + doc: "Unsigned multiply-add long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [umlal2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint16x8_t, uint8x16_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint32x4_t, uint16x8_t, uint16x4_t, '[4, 5, 6, 7]'] + - [uint64x2_t, uint32x4_t, uint32x2_t, '[2, 3]'] + compose: + - Let: + - b + - "{neon_type[2]}" + - FnCall: [simd_shuffle!, [b, b, "{type[3]}"]] + - Let: + - c + - "{neon_type[2]}" + - FnCall: [simd_shuffle!, [c, c, "{type[3]}"]] + - FnCall: ["vmlal_{neon_type[1]}", [a, b, c]] + + - name: "vmlsl_high_{neon_type[1]}" + doc: "Signed multiply-subtract long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [smlsl2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int16x8_t, int8x16_t, int8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]'] + - [int32x4_t, int16x8_t, int16x4_t, '[4, 5, 6, 7]'] + - [int64x2_t, int32x4_t, int32x2_t, '[2, 3]'] + compose: + - Let: + - b + - "{neon_type[2]}" + - FnCall: [simd_shuffle!, [b, b, "{type[3]}"]] + - Let: + - c + - "{neon_type[2]}" + - FnCall: [simd_shuffle!, [c, c, "{type[3]}"]] + - FnCall: ["vmlsl_{neon_type[1]}", [a, b, c]] + + - name: "vmlsl_high_{neon_type[1]}" + doc: "Unsigned multiply-subtract long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [umlsl2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint16x8_t, uint8x16_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint32x4_t, uint16x8_t, uint16x4_t, '[4, 5, 6, 7]'] + - [uint64x2_t, uint32x4_t, uint32x2_t, '[2, 3]'] + compose: + - Let: [b, "{neon_type[2]}", {FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]}] + - Let: [c, "{neon_type[2]}", {FnCall: [simd_shuffle!, [c, c, "{type[3]}"]]}] + - FnCall: ["vmlsl_{neon_type[1]}", [a, b, c]] + + - name: "vmovn_high{neon_type[1].noq}" + doc: Extract narrow + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: [*neon-stable] + assert_instr: [xtn2] + safety: safe 
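Editor's note: the vmlal_high entries above shuffle out the upper halves of `b` and `c`, widen them, multiply, and accumulate into `a` (smlal2/umlal2); the vmlsl_high entries subtract instead. A portable sketch of the signed i16 form:

```rust
fn mlal_high_s16(a: [i32; 4], b: [i16; 8], c: [i16; 8]) -> [i32; 4] {
    let mut out = a;
    for i in 0..4 {
        // Upper four lanes of b and c, widened to i32, multiply-accumulate.
        out[i] += b[4 + i] as i32 * c[4 + i] as i32;
    }
    out
}

fn main() {
    let a = [1, 1, 1, 1];
    let b = [0, 0, 0, 0, 10, 20, 30, 40];
    let c = [0, 0, 0, 0, 2, 2, 2, 2];
    assert_eq!(mlal_high_s16(a, b, c), [21, 41, 61, 81]);
}
```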
+ types: + - [int8x8_t, int16x8_t, int8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [int16x4_t, int32x4_t, int16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [int32x2_t, int64x2_t, int32x4_t, '[0, 1, 2, 3]'] + - [uint8x8_t, uint16x8_t, uint8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint16x4_t, uint32x4_t, uint16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [uint32x2_t, uint64x2_t, uint32x4_t, '[0, 1, 2, 3]'] + compose: + - Let: + - c + - "{neon_type[0]}" + - FnCall: + - simd_cast + - - b + - FnCall: + - simd_shuffle! + - - a + - c + - "{type[3]}" + + - name: "vneg{neon_type.no}" + doc: Negate + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [neg] + safety: safe + types: + - int64x1_t + - int64x2_t + compose: + - FnCall: + - simd_neg + - - a + + - name: "vnegd_s64" + doc: Negate + arguments: ["a: {type}"] + return_type: "{type}" + attr: [*neon-stable] + assert_instr: [neg] + safety: safe + types: + - i64 + compose: + - MethodCall: [a, wrapping_neg, []] + + + - name: "vnegh_{type}" + doc: Negate + arguments: ["a: {type}"] + return_type: "{type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fneg] + safety: safe + types: + - f16 + compose: + - '-a' + + - name: "vneg{neon_type.no}" + doc: Negate + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: + - stable + - - 'feature = "neon_intrinsics"' + - 'since = "1.59.0"' + assert_instr: [fneg] + safety: safe + types: + - float64x1_t + - float64x2_t + compose: + - FnCall: + - simd_neg + - - a + + - name: "vqneg{type[1]}" + doc: Signed saturating negate + arguments: ["a: {type[0]}"] + return_type: "{type[0]}" + attr: [*neon-stable] + assert_instr: [sqneg] + safety: safe + types: + - [i8, 'b_s8', 's8'] + - [i16, 'h_s16', 's16'] + - [i32, 's_s32', 's32'] + - [i64, 'd_s64', 's64'] + compose: + - FnCall: + - 'simd_extract!' 
+ - - FnCall: + - 'vqneg_{type[2]}' + - - FnCall: ['vdup_n_{type[2]}', [a]] + - 0 + + - name: "vqneg{neon_type[0].no}" + doc: Signed saturating negate + arguments: ["a: {type[0]}"] + return_type: "{type[0]}" + attr: [*neon-stable] + assert_instr: [sqneg] + safety: safe + types: + - [int64x1_t, 'i64'] + - [int64x2_t, 'i64'] + compose: + - LLVMLink: + name: "sqneg.{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.sqneg.v{neon_type[0].lane}{type[1]}" + arch: aarch64,arm64ec + + - name: "vqsub{type[1]}" + doc: Saturating subtract + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[0]}" + attr: [*neon-stable] + assert_instr: [sqsub] + safety: safe + types: + - [i32, 's_s32', 'i32'] + - [i64, 'd_s64', 'i64'] + compose: + - LLVMLink: + name: "sqsub.{type[0]}" + links: + - link: "llvm.aarch64.neon.sqsub.{type[2]}" + arch: aarch64,arm64ec + + - name: "vqsub{type[1]}" + doc: Saturating subtract + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[0]}" + attr: [*neon-stable] + assert_instr: [uqsub] + safety: safe + types: + - [u32, 's_u32', 'i32'] + - [u64, 'd_u64', 'i64'] + compose: + - LLVMLink: + name: "uqsub.{type[0]}" + links: + - link: "llvm.aarch64.neon.uqsub.{type[2]}" + arch: aarch64,arm64ec + + - name: "vqsub{type[3]}" + doc: Saturating subtract + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[0]}" + attr: [*neon-stable] + assert_instr: [sqsub] + safety: safe + types: + - [i8, int8x8_t, s8, 'b_s8'] + - [i16, int16x4_t, s16, 'h_s16'] + compose: + - Let: + - a + - "{neon_type[1]}" + - FnCall: + - "vdup_n_{type[2]}" + - - a + - Let: + - b + - "{neon_type[1]}" + - FnCall: + - "vdup_n_{type[2]}" + - - b + - FnCall: + - 'simd_extract!' + - - FnCall: + - "vqsub_{type[2]}" + - - a + - b + - "0" + + - name: "vqsub{type[3]}" + doc: Saturating subtract + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[0]}" + attr: [*neon-stable] + assert_instr: [uqsub] + safety: safe + types: + - [u8, uint8x8_t, u8, 'b_u8'] + - [u16, uint16x4_t, u16, 'h_u16'] + compose: + - Let: + - a + - "{neon_type[1]}" + - FnCall: + - "vdup_n_{type[2]}" + - - a + - Let: + - b + - "{neon_type[1]}" + - FnCall: + - "vdup_n_{type[2]}" + - - b + - FnCall: + - 'simd_extract!' 
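Editor's note: the vqneg*/vqsub* entries above are saturating operations; results clamp to the type's range instead of wrapping. Rust's integer types expose the same behaviour directly, which is enough to illustrate the semantics:

```rust
fn main() {
    assert_eq!(i8::MIN.saturating_neg(), i8::MAX);   // sqneg on -128 clamps to 127
    assert_eq!(100i8.saturating_sub(-100), i8::MAX); // sqsub clamps at 127
    assert_eq!(10u8.saturating_sub(20), 0);          // uqsub clamps at 0
}
```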
+ - - FnCall: + - "vqsub_{type[2]}" + - - a + - b + - "0" + + - name: "vrbit{neon_type.no}" + doc: Reverse bit order + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [rbit] + safety: safe + types: + - int8x8_t + - int8x16_t + compose: + - FnCall: + - simd_bitreverse + - - a + + - name: "vrbit{neon_type[0].no}" + doc: Reverse bit order + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: [*neon-stable] + assert_instr: [rbit] + safety: safe + types: + - [uint8x8_t, int8x8_t] + - [uint8x16_t, int8x16_t] + - [poly8x8_t, int8x8_t] + - [poly8x16_t, int8x16_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vrbit{neon_type[1].no}" + - - FnCall: [transmute, [a]] + + - name: "vrndx{neon_type.no}" + doc: "Floating-point round to integral exact, using current rounding mode" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [frintx] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x1_t + - float64x2_t + compose: + - LLVMLink: + name: "llvm.rint.{neon_type}" + links: + - link: "llvm.rint.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vrndx{neon_type.no}" + doc: "Floating-point round to integral exact, using current rounding mode" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [frintx] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "llvm.rint.{neon_type}" + links: + - link: "llvm.rint.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vrndx{type[1]}{type[0]}" + doc: "Floating-point round to integral, using current rounding mode" + arguments: ["a: {type[0]}"] + return_type: "{type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [frintx] + safety: safe + types: + - [f16, 'h_'] + compose: + - FnCall: [round_ties_even_f16, [a]] + + + - name: "vrnda{neon_type.no}" + doc: "Floating-point round to integral, to nearest with ties to away" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [frinta] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x1_t + - float64x2_t + compose: + - FnCall: [simd_round, [a]] + + + - name: "vrnda{neon_type.no}" + doc: "Floating-point round to integral, to nearest with ties to away" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [frinta] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - FnCall: [simd_round, [a]] + + + - name: "vrnda{type[1]}{type[0]}" + doc: "Floating-point round to integral, to nearest with ties to away" + arguments: ["a: {type[0]}"] + return_type: "{type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [frinta] + safety: safe + types: + - [f16, 'h_'] + compose: + - FnCall: [roundf16, [a], [], true] + + - name: "vrndn{neon_type.no}" + doc: "Floating-point round to integral, to nearest with ties to even" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [frintn] + safety: safe + types: + - float64x1_t + - float64x2_t + compose: + - LLVMLink: + name: "frintn.{neon_type}" + links: + - link: "llvm.roundeven.{neon_type}" + arch: aarch64,arm64ec + + - name: "vrndns_{type}" + doc: "Floating-point round to integral, to nearest with ties to even" + arguments: ["a: {type}"] + return_type: "{type}" + attr: [*neon-stable] + assert_instr: [frintn] + safety: safe + 
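Editor's note: vrbit* reverses the bit order within each byte lane. For a single byte that is exactly `u8::reverse_bits`, shown here applied per lane:

```rust
fn rbit_u8x8(a: [u8; 8]) -> [u8; 8] {
    a.map(|x| x.reverse_bits())
}

fn main() {
    assert_eq!(rbit_u8x8([0b0000_0001; 8])[0], 0b1000_0000);
    assert_eq!(0b1100_0000u8.reverse_bits(), 0b0000_0011);
}
```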
types: + - f32 + compose: + - LLVMLink: + name: "roundeven.{type}" + links: + - link: "llvm.roundeven.{type}" + arch: aarch64,arm64ec + + - name: "vrndn{type[1]}{type[0]}" + doc: "Floating-point round to integral, toward minus infinity" + arguments: ["a: {type[0]}"] + return_type: "{type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [frintn] + safety: safe + types: + - [f16, 'h_'] + compose: + - LLVMLink: + name: "llvm.roundeven.{type[0]}" + links: + - link: "llvm.roundeven.{type[0]}" + arch: aarch64,arm64ec + + - name: "vrndm{neon_type.no}" + doc: "Floating-point round to integral, toward minus infinity" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [frintm] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x1_t + - float64x2_t + compose: + - FnCall: [simd_floor, [a]] + + + - name: "vrndm{neon_type.no}" + doc: "Floating-point round to integral, toward minus infinity" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [frintm] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - FnCall: [simd_floor, [a]] + + + - name: "vrndm{type[1]}{type[0]}" + doc: "Floating-point round to integral, toward minus infinity" + arguments: ["a: {type[0]}"] + return_type: "{type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [frintm] + safety: safe + types: + - [f16, 'h_'] + compose: + - FnCall: [floorf16, [a], [], true] + + + + - name: "vrndp{neon_type.no}" + doc: "Floating-point round to integral, toward plus infinity" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [frintp] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x1_t + - float64x2_t + compose: + - FnCall: [simd_ceil, [a]] + + + - name: "vrndp{neon_type.no}" + doc: "Floating-point round to integral, toward plus infinity" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [frintp] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - FnCall: [simd_ceil, [a]] + + - name: "vrndp{type[1]}{type[0]}" + doc: "Floating-point round to integral, toward plus infinity" + arguments: ["a: {type[0]}"] + return_type: "{type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [frintp] + safety: safe + types: + - [f16, 'h_'] + compose: + - FnCall: [ceilf16, [a], [], true] + + - name: "vrnd{neon_type.no}" + doc: "Floating-point round to integral, toward zero" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [frintz] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x1_t + - float64x2_t + compose: + - FnCall: [simd_trunc, [a]] + + - name: "vrnd{neon_type.no}" + doc: "Floating-point round to integral, toward zero" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [frintz] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - FnCall: [simd_trunc, [a]] + + + - name: "vrnd{type[1]}{type[0]}" + doc: "Floating-point round to integral, to nearest with ties to away" + arguments: ["a: {type[0]}"] + return_type: "{type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [frintz] + safety: safe + types: + - [f16, 'h_'] + compose: + - FnCall: [truncf16, [a], [], true] + + + - name: "vrndi{neon_type.no}" + doc: "Floating-point round to 
integral, using current rounding mode" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [frinti] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x1_t + - float64x2_t + compose: + - LLVMLink: + name: "llvm.nearbyint.{neon_type}" + links: + - link: "llvm.nearbyint.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vrndi{neon_type.no}" + doc: "Floating-point round to integral, using current rounding mode" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [frinti] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "llvm.nearbyint.{neon_type}" + links: + - link: "llvm.nearbyint.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vrndi{type[1]}{type[0]}" + doc: "Floating-point round to integral, using current rounding mode" + arguments: ["a: {type[0]}"] + return_type: "{type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + # TODO: double check me + assert_instr: [frinti] + safety: safe + types: + - [f16, 'h_'] + compose: + - LLVMLink: + name: "llvm.nearbyint.{type[0]}" + links: + - link: "llvm.nearbyint.{type[0]}" + arch: aarch64,arm64ec + + - name: "vqadd{type[1]}" + doc: Saturating add + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[0]}" + attr: [*neon-stable] + assert_instr: [uqadd] + safety: safe + types: + - [u32, 's_u32', i32] + - [u64, 'd_u64', i64] + compose: + - LLVMLink: + name: "uqadd.{type[2]}" + links: + - link: "llvm.aarch64.neon.uqadd.{type[2]}" + arch: aarch64,arm64ec + + - name: "vqadd{type[1]}" + doc: Saturating add + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[0]}" + attr: [*neon-stable] + assert_instr: [sqadd] + safety: safe + types: + - [i32, 's_s32', i32] + - [i64, 'd_s64', i64] + compose: + - LLVMLink: + name: "uqadd.{type[2]}" + links: + - link: "llvm.aarch64.neon.sqadd.{type[2]}" + arch: aarch64,arm64ec + + - name: "vqadd{type[2]}" + doc: Saturating add + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[0]}" + attr: [*neon-stable] + assert_instr: [sqadd] + safety: safe + types: + - [i8, int8x8_t, 'b_s8'] + - [i16, int16x4_t, 'h_s16'] + compose: + - Let: + - a + - "{neon_type[1]}" + - FnCall: + - "vdup_n_{type[0]}" + - - a + - Let: + - b + - "{neon_type[1]}" + - FnCall: + - "vdup_n_{type[0]}" + - - b + - FnCall: + - simd_extract! + - - FnCall: + - "vqadd_{type[0]}" + - - a + - b + - "0" + + - name: "vqadd{type[2]}" + doc: Saturating add + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[0]}" + attr: [*neon-stable] + assert_instr: [uqadd] + safety: safe + types: + - [u8, uint8x8_t, 'b_u8'] + - [u16, uint16x4_t, 'h_u16'] + compose: + - Let: + - a + - "{neon_type[1]}" + - FnCall: + - "vdup_n_{type[0]}" + - - a + - Let: + - b + - "{neon_type[1]}" + - FnCall: + - "vdup_n_{type[0]}" + - - b + - FnCall: + - simd_extract! 
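Editor's note: the vrnd* entries above round to an integral value while staying in floating point; each rounding mode maps onto a standard Rust float method, which is a convenient way to read the table of variants:

```rust
fn main() {
    let x = 2.5_f64;
    assert_eq!(x.floor(), 2.0);           // vrndm*, frintm: toward minus infinity
    assert_eq!(x.ceil(), 3.0);            // vrndp*, frintp: toward plus infinity
    assert_eq!(x.trunc(), 2.0);           // vrnd*,  frintz: toward zero
    assert_eq!(x.round(), 3.0);           // vrnda*, frinta: ties away from zero
    assert_eq!(x.round_ties_even(), 2.0); // vrndn*, frintn: ties to even
}
```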
+ - - FnCall: + - "vqadd_{type[0]}" + - - a + - b + - "0" + + - name: "vld1{neon_type[1].no}" + doc: "Load multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: [*neon-stable] + assert_instr: [ld1] + safety: + unsafe: [neon] + types: + - ["*const f64", float64x1x2_t] + - ["*const f64", float64x2x2_t] + - ["*const f64", float64x1x3_t] + - ["*const f64", float64x2x3_t] + - ["*const f64", float64x1x4_t] + - ["*const f64", float64x2x4_t] + compose: + - LLVMLink: + name: "vld1{neon_type[1].no}" + links: + - link: "llvm.aarch64.neon.ld1x{neon_type[1].tuple}.v{neon_type[1].lane}f{neon_type[1].base}.p0" + arch: aarch64,arm64ec + + - name: "vld2{neon_type[1].lane_nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const i8", int8x16x2_t, i8, int8x16_t, "4"] + - ["*const i64", int64x2x2_t, i64, int64x2_t, "1"] + - ["*const f64", float64x2x2_t, f64, float64x2_t, "1"] + compose: + - FnCall: + - "static_assert_uimm_bits!" + - - LANE + - "{type[4]}" + - LLVMLink: + name: "vld2.{neon_type[1]}" + arguments: + - "a: {neon_type[3]}" + - "b: {neon_type[3]}" + - "n: i64" + - "ptr: *const i8" + links: + - link: "llvm.aarch64.neon.ld2lane.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: ["_vld2{neon_type[1].lane_nox}", ["b.0", "b.1", "LANE as i64", "a as _"]] + + - name: "vld2{neon_type[1].lane_nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const i64", int64x1x2_t, i64, int64x1_t] + - ["*const f64", float64x1x2_t, f64, float64x1_t] + compose: + - FnCall: ["static_assert!", ['LANE == 0']] + - LLVMLink: + name: "vld2.{neon_type[1]}" + arguments: + - "a: {neon_type[3]}" + - "b: {neon_type[3]}" + - "n: i64" + - "ptr: *const i8" + links: + - link: "llvm.aarch64.neon.ld2lane.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: ["_vld2{neon_type[1].lane_nox}", ["b.0", "b.1", "LANE as i64", "a as _"]] + + - name: "vld2{neon_type[1].lane_nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const u64", uint64x1x2_t, int64x1x2_t] + compose: + - FnCall: + - "static_assert!" 
+ - - 'LANE == 0' + - FnCall: + - transmute + - - FnCall: + - "vld2{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vld2{neon_type[1].lane_nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const p64", poly64x1x2_t, int64x1x2_t] + compose: + - FnCall: + - "static_assert!" + - - 'LANE == 0' + - FnCall: + - transmute + - - FnCall: + - "vld2{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vld2{neon_type[1].lane_nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const u8", uint8x16x2_t, int8x16x2_t, "4"] + - ["*const p8", poly8x16x2_t, int8x16x2_t, "4"] + - ["*const u64", uint64x2x2_t, int64x2x2_t, "1"] + compose: + - FnCall: + - "static_assert_uimm_bits!" + - - LANE + - "{type[3]}" + - FnCall: + - transmute + - - FnCall: + - "vld2{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vld2{neon_type[1].lane_nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const p64", poly64x2x2_t, int64x2x2_t, "1"] + compose: + - FnCall: ["static_assert_uimm_bits!", [LANE, '{type[3]}']] + - FnCall: + - transmute + - - FnCall: + - "vld2{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vld2{neon_type[1].nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: [*neon-stable] + assert_instr: [ld2] + safety: + unsafe: [neon] + types: + - ["*const f64", float64x2x2_t, f64, float64x2_t] + - ["*const i64", int64x2x2_t, i64, int64x2_t] + compose: + - LLVMLink: + name: "vld2.{neon_type[1]}" + arguments: + - "ptr: *const {neon_type[3]}" + links: + - link: "llvm.aarch64.neon.ld2.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld2{neon_type[1].nox}" + - - "a as _" + + - name: "vld2{neon_type[1].nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: [*neon-stable] + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - ["*const f64", float64x1x2_t, f64, float64x1_t] + compose: + - LLVMLink: + name: "vld2.{neon_type[1]}" + arguments: + - "ptr: *const {neon_type[3]}" + links: + - link: "llvm.aarch64.neon.ld2.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld2{neon_type[1].nox}" + - - "a as _" + + - name: "vld2{neon_type[1].nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}"] + 
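
The `vld2*` entries above generate both the whole-structure loads (for example `vld2q_f64`) and the single-lane form (`vld2q_lane_f64`). A hedged sketch of how the generated functions are used; `demo_vld2` and the `vgetq_lane_f64` extraction helper are illustrative additions from the existing NEON API, not content of the diff.

```rust
// Illustrative sketch only; demo_vld2 is a made-up name.
#[cfg(target_arch = "aarch64")]
fn demo_vld2() {
    use core::arch::aarch64::*;
    let interleaved: [f64; 4] = [1.0, 2.0, 3.0, 4.0];
    unsafe {
        // LD2 de-interleaves: element 0 of each pair goes to .0, element 1 to .1.
        let pair: float64x2x2_t = vld2q_f64(interleaved.as_ptr());
        assert_eq!(vgetq_lane_f64::<0>(pair.0), 1.0);
        assert_eq!(vgetq_lane_f64::<0>(pair.1), 2.0);

        // LD2 (single structure, to one lane): overwrite lane 1 of both vectors
        // from two consecutive f64 values, leaving lane 0 untouched.
        let updated = vld2q_lane_f64::<1>([9.0, 10.0].as_ptr(), pair);
        assert_eq!(vgetq_lane_f64::<1>(updated.0), 9.0);
        assert_eq!(vgetq_lane_f64::<1>(updated.1), 10.0);
    }
}
```
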
return_type: "{neon_type[1]}" + attr: [*neon-stable] + assert_instr: [ld2] + safety: + unsafe: [neon] + types: + - ["*const u64", uint64x2x2_t, int64x2x2_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld2{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + + - name: "vld2{neon_type[1].nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - *neon-stable + assert_instr: [ld2] + safety: + unsafe: [neon] + types: + - ["*const p64", poly64x2x2_t, int64x2x2_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld2{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + + - name: "vld2{neon_type[1].dup_nox}" + doc: Load single 2-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: [ld2r] + safety: + unsafe: [neon] + types: + - ["*const i64", int64x2x2_t, i64] + - ["*const f64", float64x1x2_t, f64] + - ["*const f64", float64x2x2_t, f64] + compose: + - LLVMLink: + name: "vld2dup.{neon_type[1]}" + arguments: + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.ld2r.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld2{neon_type[1].dup_nox}" + - - "a as _" + + - name: "vld2{neon_type[1].dup_nox}" + doc: Load single 2-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: [ld2r] + safety: + unsafe: [neon] + types: + - ["*const u64", uint64x2x2_t, int64x2x2_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld2{neon_type[2].dup_nox}" + - - FnCall: + - transmute + - - a + + - name: "vld2{neon_type[1].dup_nox}" + doc: Load single 2-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-aes + - *neon-stable + assert_instr: [ld2r] + safety: + unsafe: [neon] + types: + - ["*const p64", poly64x2x2_t, int64x2x2_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld2{neon_type[2].dup_nox}" + - - FnCall: + - transmute + - - a + + - name: "vld3{neon_type[1].lane_nox}" + doc: "Load multiple 3-element structures to two registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ['*const i8', int8x16x3_t, int8x16_t, i8, '3'] + - ['*const i64', int64x2x3_t, int64x2_t, i64, '1'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[4]}']] + - LLVMLink: + name: 'ld3lane.{neon_type[2]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'n: i64' + - 'ptr: *const i8' + links: + - link: 'llvm.aarch64.neon.ld3lane.v{neon_type[1].lane}{type[3]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld3{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'LANE as i64', 'a as _']] + + - name: "vld3{neon_type[1].lane_nox}" + doc: "Load multiple 3-element structures to three registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ['*const f64', 
float64x2x3_t, float64x2_t, f64, '1'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[4]}']] + - LLVMLink: + name: 'ld3lane.{neon_type[2]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'n: i64' + - 'ptr: *const i8' + links: + - link: 'llvm.aarch64.neon.ld3lane.v{neon_type[1].lane}{type[3]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld3{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'LANE as i64', 'a as _']] + + - name: "vld3{neon_type[1].lane_nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ['*const f64', float64x1x3_t, float64x1_t, f64] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - LLVMLink: + name: 'vld3.{neon_type[2]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'n: i64' + - 'ptr: *const i8' + links: + - link: 'llvm.aarch64.neon.ld3lane.v{neon_type[1].lane}{type[3]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld3{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'LANE as i64', 'a as _']] + + - name: "vld3{neon_type[1].lane_nox}" + doc: "Load multiple 3-element structures to two registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ['*const i64', int64x1x3_t, int64x1_t, i64] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - LLVMLink: + name: 'vld3.{neon_type[2]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'n: i64' + - 'ptr: *const i8' + links: + - link: 'llvm.aarch64.neon.ld3lane.v{neon_type[1].lane}{type[3]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld3{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'LANE as i64', 'a as _']] + + - name: "vld3{neon_type[1].lane_nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ['*const p8', poly8x16x3_t, int8x16x3_t, '4'] + - ['*const u8', uint8x16x3_t, int8x16x3_t, '4'] + - ['*const u64', uint64x2x3_t, int64x2x3_t, '1'] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', '{type[3]}']] + - FnCall: + - transmute + - - FnCall: + - 'vld3{neon_type[2].lane_nox}::' + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vld3{neon_type[1].lane_nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ['*const u64', uint64x1x3_t, int64x1x3_t, '1'] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: + - transmute + - - FnCall: + - 'vld3{neon_type[2].lane_nox}::' + - - FnCall: 
[transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vld3{neon_type[1].lane_nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ['*const p64', poly64x2x3_t, int64x2x3_t] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', 1]] + - FnCall: + - transmute + - - FnCall: + - 'vld3{neon_type[2].lane_nox}::' + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vld3{neon_type[1].lane_nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ['*const p64', poly64x1x3_t, int64x1x3_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: + - transmute + - - FnCall: + - 'vld3{neon_type[2].lane_nox}::' + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vld3{neon_type[1].nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: [*neon-stable] + safety: + unsafe: [neon] + assert_instr: [ld3] + types: + - ['*const i64', int64x2x3_t, '*const int64x2_t', i64] + - ['*const f64', float64x2x3_t, '*const float64x2_t', f64] + compose: + - LLVMLink: + name: 'vld3{neon_type[1].nox}' + arguments: + - 'ptr: {type[2]}' + links: + - link: 'llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[3]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld3{neon_type[1].nox}', ['a as _']] + + - name: "vld3{neon_type[1].nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: [*neon-stable] + safety: + unsafe: [neon] + assert_instr: [nop] + types: + - ['*const f64', float64x1x3_t, '*const float64x1_t', f64] + compose: + - LLVMLink: + name: 'vld3{neon_type[1].nox}' + arguments: + - 'ptr: {type[2]}' + links: + - link: 'llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[3]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld3{neon_type[1].nox}', ['a as _']] + + - name: "vld3{neon_type[1].nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: [*neon-stable] + safety: + unsafe: [neon] + assert_instr: [ld3] + types: + - ['*const u64', uint64x2x3_t, int64x2x3_t] + compose: + - FnCall: + - transmute + - - FnCall: + - 'vld3{neon_type[2].nox}' + - - FnCall: + - transmute + - - a + + - name: "vld3{neon_type[1].nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - *neon-stable + safety: + unsafe: [neon] + assert_instr: [ld3] + types: + - ['*const p64', poly64x2x3_t, int64x2x3_t] + compose: + - FnCall: + - transmute + - - FnCall: + - 'vld3{neon_type[2].nox}' + - - FnCall: + - transmute + - - a + + - name: "vld3{neon_type[1].dup_nox}" + doc: Load single 3-element structure and replicate to all lanes of three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: 
[*neon-stable] + assert_instr: [ld3r] + safety: + unsafe: [neon] + types: + - ["*const i64", int64x2x3_t, i64] + - ["*const f64", float64x1x3_t, f64] + - ["*const f64", float64x2x3_t, f64] + compose: + - LLVMLink: + name: 'ld3r{neon_type[1].dup_nox}' + arguments: + - 'ptr: {type[0]}' + links: + - link: 'llvm.aarch64.neon.ld3r.v{neon_type[1].lane}{type[2]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld3{neon_type[1].dup_nox}', ['a as _']] + + - name: "vld3{neon_type[1].dup_nox}" + doc: Load single 3-element structure and replicate to all lanes of three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: [*neon-stable] + assert_instr: [ld3r] + safety: + unsafe: [neon] + types: + - ["*const u64", uint64x2x3_t, int64x2x3_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld3{neon_type[2].dup_nox}" + - - FnCall: + - transmute + - - a + + - name: "vld3{neon_type[1].dup_nox}" + doc: Load single 3-element structure and replicate to all lanes of three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - *neon-stable + assert_instr: [ld3r] + safety: + unsafe: [neon] + types: + - ["*const p64", poly64x2x3_t, int64x2x3_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld3{neon_type[2].dup_nox}" + - - FnCall: + - transmute + - - a + + - name: "vld4{neon_type[1].nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-stable + assert_instr: [ld4] + safety: + unsafe: [neon] + types: + - ['*const f64', float64x2x4_t, f64, '*const float64x2_t'] + - ['*const i64', int64x2x4_t, i64, '*const int64x2_t'] + compose: + - LLVMLink: + name: 'vld4{neon_type[1].nox}' + arguments: + - 'ptr: {type[3]}' + links: + - link: 'llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld4{neon_type[1].nox}', ['a as _']] + + - name: "vld4{neon_type[1].nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-stable + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - ['*const f64', float64x1x4_t, f64, '*const float64x1_t'] + compose: + - LLVMLink: + name: 'vld4{neon_type[1].nox}' + arguments: + - 'ptr: {type[3]}' + links: + - link: 'llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld4{neon_type[1].nox}', ['a as _']] + + - name: "vld4{neon_type[1].nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: [*neon-stable] + assert_instr: [ld4] + safety: + unsafe: [neon] + types: + - ["*const u64", uint64x2x4_t, int64x2x4_t] + compose: + - FnCall: + - transmute + - - FnCall: + - 'vld4{neon_type[2].nox}' + - - FnCall: + - transmute + - - a + + - name: "vld4{neon_type[1].nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-stable + - *neon-aes + assert_instr: [ld4] + safety: + unsafe: [neon] + types: + - ["*const p64", poly64x2x4_t, int64x2x4_t] + compose: + - FnCall: + - transmute + - - FnCall: + - 'vld4{neon_type[2].nox}' + - - FnCall: + - transmute + - - a + + - name: "vld4{neon_type[1].lane_nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld4, 
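
For the `vld3*` and `vld3*_dup` definitions above, the generated intrinsics de-interleave or replicate 3-element structures. A sketch under the assumption that `vld3q_f64` and `vld3q_dup_f64` come out exactly as specified; `demo_vld3` is a made-up wrapper.

```rust
// Illustrative sketch only; not part of the generated bindings.
#[cfg(target_arch = "aarch64")]
fn demo_vld3() {
    use core::arch::aarch64::*;
    // Two interleaved 3-element structures in memory.
    let triples: [f64; 6] = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
    unsafe {
        // LD3 de-interleaves the triples across the three result vectors.
        let v: float64x2x3_t = vld3q_f64(triples.as_ptr());
        assert_eq!(vgetq_lane_f64::<0>(v.0), 1.0); // first element of first triple
        assert_eq!(vgetq_lane_f64::<1>(v.2), 6.0); // last element of second triple

        // LD3R replicates one 3-element structure to every lane of each vector.
        let dup: float64x2x3_t = vld3q_dup_f64(triples.as_ptr());
        assert_eq!(vgetq_lane_f64::<1>(dup.1), 2.0);
    }
}
```
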
'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ['*const i8', int8x16x4_t, int8x16_t, i8, '3'] + - ['*const i64', int64x2x4_t, int64x2_t, i64, '1'] + - ['*const f64', float64x2x4_t, float64x2_t, f64, '1'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[4]}']] + - LLVMLink: + name: 'ld4lane.{neon_type[2]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + - 'n: i64' + - 'ptr: *const i8' + links: + - link: 'llvm.aarch64.neon.ld4lane.v{neon_type[1].lane}{type[3]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld4{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'LANE as i64', 'a as _']] + + - name: "vld4{neon_type[1].lane_nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ['*const i64', int64x1x4_t, int64x1_t, i64] + - ['*const f64', float64x1x4_t, float64x1_t, f64] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - LLVMLink: + name: 'ld4lane.{neon_type[2]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + - 'n: i64' + - 'ptr: *const i8' + links: + - link: 'llvm.aarch64.neon.ld4lane.v{neon_type[1].lane}{type[3]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld4{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'LANE as i64', 'a as _']] + + - name: "vld4{neon_type[1].lane_nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ['*const p8', poly8x16x4_t, int8x16x4_t, '4'] + - ['*const u8', uint8x16x4_t, int8x16x4_t, '4'] + - ['*const u64', uint64x2x4_t, int64x2x4_t, '1'] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', '{type[3]}']] + - FnCall: + - transmute + - - FnCall: + - 'vld4{neon_type[2].lane_nox}::' + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vld4{neon_type[1].lane_nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ['*const p64', poly64x2x4_t, int64x2x4_t, '1'] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', '{type[3]}']] + - FnCall: + - transmute + - - FnCall: + - 'vld4{neon_type[2].lane_nox}::' + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vld4{neon_type[1].lane_nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + 
types: + - ['*const u64', uint64x1x4_t, int64x1x4_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: + - transmute + - - FnCall: + - 'vld4{neon_type[2].lane_nox}::' + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vld4{neon_type[1].lane_nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: + - target_feature + - - 'enable = "neon,aes"' + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - ['*const p64', poly64x1x4_t, int64x1x4_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: + - transmute + - - FnCall: + - 'vld4{neon_type[2].lane_nox}::' + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst1{neon_type[1].lane_nox}" + doc: "Store multiple single-element structures from one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + types: + - ['*mut f64', float64x1_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - Assign: + - "*a" + - FnCall: [simd_extract!, [b, 'LANE as u32']] + - Identifier: [';', Symbol] + + - name: "vst1{neon_type[1].lane_nox}" + doc: "Store multiple single-element structures from one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + types: + - ['*mut f64', float64x2_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '1']] + - Assign: + - "*a" + - FnCall: [simd_extract!, [b, 'LANE as u32']] + - Identifier: [';', Symbol] + + - name: "vst2{neon_type[1].nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + safety: + unsafe: [neon] + attr: + - *neon-stable + assert_instr: [st1] + types: + - ['f64', float64x1x2_t, float64x1_t] + compose: + - LLVMLink: + name: 'st2.{neon_type[1]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st2.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst2{neon_type[1].nox}', ['b.0', 'b.1', 'a as _']] + + - name: "vst2{neon_type[1].nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + safety: + unsafe: [neon] + attr: + - *neon-stable + assert_instr: [st2] + types: + - [i64, int64x2x2_t, int64x2_t] + - [f64, float64x2x2_t, float64x2_t] + compose: + - LLVMLink: + name: 'st2.{neon_type[1]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st2.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst2{neon_type[1].nox}', ['b.0', 'b.1', 'a as _']] + + - name: "vst2{neon_type[1].lane_nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st2, 'LANE = 0']]}]] + - 
FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [i64, int64x1x2_t, int64x1_t] + - [f64, float64x1x2_t, float64x1_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - LLVMLink: + name: 'st2.{neon_type[1]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'n: i64' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st2lane.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst2{neon_type[1].lane_nox}', ['b.0', 'b.1', 'LANE as i64', 'a as _']] + + - name: "vst2{neon_type[1].lane_nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [i8, int8x16x2_t, int8x16_t, '4'] + - [i64, int64x2x2_t, int64x2_t, '1'] + - [f64, float64x2x2_t, float64x2_t, '1'] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', "{type[3]}"]] + - LLVMLink: + name: 'st2.{neon_type[1]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'n: i64' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st2lane.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst2{neon_type[1].lane_nox}', ['b.0', 'b.1', 'LANE as i64', 'a as _']] + + - name: "vst2{neon_type[1].lane_nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [u8, uint8x16x2_t, int8x16x2_t, '4'] + - [u64, uint64x2x2_t, int64x2x2_t, '1'] + - [p8, poly8x16x2_t, int8x16x2_t, '4'] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', "{type[3]}"]] + - FnCall: + - "vst2{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst2{neon_type[1].lane_nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [u64, uint64x1x2_t, int64x1x2_t, '1'] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: + - "vst2{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst2{neon_type[1].nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-stable + assert_instr: [st2] + safety: + unsafe: [neon] + types: + - [u64, uint64x2x2_t, int64x2x2_t] + compose: + - FnCall: + - "vst2{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst2{neon_type[1].lane_nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-aes + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [p64, poly64x1x2_t, 
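
The store specs above mirror the loads: `vst1q_lane_f64` writes a single lane through a pointer and `vst2q_f64` re-interleaves a pair of registers back into memory. Illustrative sketch only; `demo_stores` is invented, and `vsetq_lane_f64`/`vdupq_n_f64` are assumed helpers from the existing NEON surface rather than this diff.

```rust
// Illustrative sketch only; demo_stores is a made-up name.
#[cfg(target_arch = "aarch64")]
fn demo_stores() {
    use core::arch::aarch64::*;
    unsafe {
        // ST1 (single lane): write lane 1 of a float64x2_t through a raw pointer.
        let v = vsetq_lane_f64::<1>(42.0, vdupq_n_f64(0.0));
        let mut out = 0.0f64;
        vst1q_lane_f64::<1>(&mut out, v);
        assert_eq!(out, 42.0);

        // ST2 interleaves the two registers back into memory (inverse of LD2).
        let pair = vld2q_f64([1.0, 2.0, 3.0, 4.0].as_ptr());
        let mut buf = [0.0f64; 4];
        vst2q_f64(buf.as_mut_ptr(), pair);
        assert_eq!(buf, [1.0, 2.0, 3.0, 4.0]);
    }
}
```
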
int64x1x2_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: + - "vst2{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst2{neon_type[1].lane_nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-aes + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [p64, poly64x2x2_t, int64x2x2_t] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', '1']] + - FnCall: + - "vst2{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst2{neon_type[1].nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-aes + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st2]]}]] + - *neon-stable + safety: + unsafe: [neon] + types: + - [p64, poly64x2x2_t, int64x2x2_t] + compose: + - FnCall: + - "vst2{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst3{neon_type[1].nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: [*neon-stable] + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - [f64, float64x1x3_t, float64x1_t] + compose: + - LLVMLink: + name: 'st3.{neon_type[1].nox}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st3.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst3{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'a as _']] + + - name: "vst3{neon_type[1].lane_nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-stable + - *neon-aes + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [p64, poly64x1x3_t, int64x1x3_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: + - "vst3{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst3{neon_type[1].lane_nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-stable + - *neon-aes + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [p64, poly64x2x3_t, int64x2x3_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '1']] + - FnCall: + - "vst3{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst3{neon_type[1].nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-stable + - *neon-aes + assert_instr: [st3] + safety: + unsafe: [neon] + types: + - [p64, poly64x2x3_t, int64x2x3_t] + compose: + - FnCall: + - "vst3{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst3{neon_type[1].nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: 
["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: [*neon-stable] + assert_instr: [st3] + safety: + unsafe: [neon] + types: + - [i64, int64x2x3_t, int64x2_t] + - [f64, float64x2x3_t, float64x2_t] + compose: + - LLVMLink: + name: 'st3.{neon_type[1].nox}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st3.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst3{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'a as _']] + + - name: "vst3{neon_type[1].nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: [*neon-stable] + assert_instr: [st3] + safety: + unsafe: [neon] + types: + - [u64, uint64x2x3_t, int64x2x3_t] + compose: + - FnCall: + - "vst3{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst3{neon_type[1].lane_nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-stable + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [u64, uint64x1x3_t, int64x1x3_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: + - "vst3{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst3{neon_type[1].lane_nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-stable + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [u8, uint8x16x3_t, int8x16x3_t, '4'] + - [u64, uint64x2x3_t, int64x2x3_t, '1'] + - [p8, poly8x16x3_t, int8x16x3_t, '4'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] + - FnCall: + - "vst3{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst3{neon_type[1].lane_nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-stable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [f64, float64x2x3_t, float64x2_t, '1'] + - [i8, int8x16x3_t, int8x16_t, '4'] + - [i64, int64x2x3_t, int64x2_t, '1'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] + - LLVMLink: + name: 'st3lane.{neon_type[1].nox}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'n: i64' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st3lane.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst3{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'LANE as i64', 'a as _']] + + - name: "vst3{neon_type[1].lane_nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-stable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [i64, int64x1x3_t, int64x1_t, '1'] + - [f64, float64x1x3_t, 
float64x1_t, '1'] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - LLVMLink: + name: 'st3lane.{neon_type[1].nox}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'n: i64' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st3lane.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst3{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'LANE as i64', 'a as _']] + + - name: "vst4{neon_type[1].nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: [*neon-stable] + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - [f64, float64x1x4_t, float64x1_t] + compose: + - LLVMLink: + name: 'st4.{neon_type[1].nox}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st4.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst4{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'a as _']] + + - name: "vst4{neon_type[1].lane_nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-stable + - *neon-aes + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [p64, poly64x1x4_t, int64x1x4_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: + - "vst4{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst4{neon_type[1].lane_nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-stable + - *neon-aes + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [p64, poly64x2x4_t, int64x2x4_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '1']] + - FnCall: + - "vst4{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst4{neon_type[1].nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-stable + - *neon-aes + assert_instr: [st4] + safety: + unsafe: [neon] + types: + - [p64, poly64x2x4_t, int64x2x4_t] + compose: + - FnCall: + - "vst4{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst4{neon_type[1].nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: [*neon-stable] + assert_instr: [st4] + safety: + unsafe: [neon] + types: + - [i64, int64x2x4_t, int64x2_t] + - [f64, float64x2x4_t, float64x2_t] + compose: + - LLVMLink: + name: 'st4.{neon_type[1].nox}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st4.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst4{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'a as _']] + + - name: "vst4{neon_type[1].nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: [*neon-stable] + assert_instr: [st4] + safety: + unsafe: [neon] + types: + - 
[u64, uint64x2x4_t, int64x2x4_t] + compose: + - FnCall: + - "vst4{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst4{neon_type[1].lane_nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-stable + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [u64, uint64x1x4_t, int64x1x4_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: + - "vst4{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst4{neon_type[1].lane_nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-stable + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [u8, uint8x16x4_t, int8x16x4_t, '4'] + - [u64, uint64x2x4_t, int64x2x4_t, '1'] + - [p8, poly8x16x4_t, int8x16x4_t, '4'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] + - FnCall: + - "vst4{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst4{neon_type[1].lane_nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-stable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [f64, float64x2x4_t, float64x2_t, '1'] + - [i8, int8x16x4_t, int8x16_t, '4'] + - [i64, int64x2x4_t, int64x2_t, '1'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] + - LLVMLink: + name: 'st4lane.{neon_type[1].nox}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + - 'n: i64' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st4lane.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst4{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'LANE as i64', 'a as _']] + + - name: "vst4{neon_type[1].lane_nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-stable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [i64, int64x1x4_t, int64x1_t, '1'] + - [f64, float64x1x4_t, float64x1_t, '1'] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - LLVMLink: + name: 'st4lane.{neon_type[1].nox}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + - 'n: i64' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st4lane.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst4{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'LANE as i64', 'a as _']] + + - name: "vusdot{neon_type[0].laneq_nox}" + doc: "Dot product index form with unsigned and signed integers" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-i8mm + - FnCall: [cfg_attr, [test, {FnCall: 
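
Analogously for the 4-element structures defined above, `vld4q_f64`/`vst4q_f64` round-trip interleaved data and `vst4q_lane_f64` stores a single structure from one lane. A tentative example; `demo_vst4` is invented for this note.

```rust
// Illustrative sketch only; not part of the diff.
#[cfg(target_arch = "aarch64")]
fn demo_vst4() {
    use core::arch::aarch64::*;
    let src: [f64; 8] = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
    let mut dst = [0.0f64; 8];
    unsafe {
        // LD4 followed by ST4 is an interleave/de-interleave round trip.
        let quad: float64x2x4_t = vld4q_f64(src.as_ptr());
        vst4q_f64(dst.as_mut_ptr(), quad);
        assert_eq!(src, dst);

        // ST4 (single lane): write only the four values that live in lane 0.
        let mut one = [0.0f64; 4];
        vst4q_lane_f64::<0>(one.as_mut_ptr(), quad);
        assert_eq!(one, [1.0, 2.0, 3.0, 4.0]);
    }
}
```
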
[assert_instr, [usdot, 'LANE = 3']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [unstable, ['feature = "stdarch_neon_i8mm"', 'issue = "117223"']] + static_defs: ["const LANE: i32"] + safety: safe + types: + - [int32x2_t, uint8x8_t, int8x16_t, '[LANE as u32, LANE as u32]'] + - [int32x4_t, uint8x16_t, int8x16_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '2']] + - Let: [c, int32x4_t, {FnCall: [transmute, [c]]}] + - Let: [c, "{neon_type[0]}", {FnCall: [simd_shuffle!, [c, c, "{type[3]}"]]}] + - FnCall: ["vusdot{neon_type[0].no}", [a, b, {FnCall: [transmute, [c]]}]] + + - name: "vsudot{neon_type[0].laneq_nox}" + doc: "Dot product index form with signed and unsigned integers" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-i8mm + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sudot, 'LANE = 3']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [unstable, ['feature = "stdarch_neon_i8mm"', 'issue = "117223"']] + static_defs: ["const LANE: i32"] + safety: safe + types: + - [int32x2_t, int8x8_t, uint8x16_t, '[LANE as u32, LANE as u32]', uint32x2_t] + - [int32x4_t, int8x16_t, uint8x16_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]', uint32x4_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, 2]] + - Let: + - c + - uint32x4_t + - FnCall: [transmute, [c]] + - Let: + - c + - "{type[4]}" + - FnCall: [simd_shuffle!, [c, c, "{type[3]}"]] + - FnCall: ["vusdot{neon_type[0].no}", [a, {FnCall: [transmute, [c]]}, b]] + + - name: "vmul{neon_type.no}" + doc: Multiply + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [fmul] + safety: safe + types: + - float64x1_t + - float64x2_t + compose: + - FnCall: [simd_mul, [a, b]] + + - name: "vmull_high{neon_type[0].noq}" + doc: Signed multiply long + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[3]}" + attr: [*neon-stable] + assert_instr: [smull2] + safety: safe + types: + - [int8x16_t, int8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', int16x8_t] + - [int16x8_t, int16x4_t, '[4, 5, 6, 7]', int32x4_t] + - [int32x4_t, int32x2_t, '[2, 3]', int64x2_t] + compose: + - Let: + - a + - "{neon_type[1]}" + - FnCall: [simd_shuffle!, [a, a, "{type[2]}"]] + - Let: + - b + - "{neon_type[1]}" + - FnCall: [simd_shuffle!, [b, b, "{type[2]}"]] + - FnCall: ["vmull_{neon_type[0]}", [a, b]] + + - name: "vmull_high{neon_type[0].noq}" + doc: "Unsigned multiply long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[3]}" + attr: [*neon-stable] + assert_instr: [umull2] + safety: safe + types: + - [uint8x16_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', uint16x8_t] + - [uint16x8_t, uint16x4_t, '[4, 5, 6, 7]', uint32x4_t] + - [uint32x4_t, uint32x2_t, '[2, 3]', uint64x2_t] + compose: + - Let: + - a + - "{neon_type[1]}" + - FnCall: [simd_shuffle!, [a, a, "{type[2]}"]] + - Let: + - b + - "{neon_type[1]}" + - FnCall: [simd_shuffle!, [b, b, "{type[2]}"]] + - FnCall: ["vmull_{neon_type[0]}", [a, b]] + + - name: "vmull_p64" + doc: "Polynomial multiply long" + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-aes + - *neon-stable + safety: safe + assert_instr: [pmull] + types: + - ["p64", "p128"] + compose: + - LLVMLink: + name: "pmull.{type[0]}" + return_type: "int8x16_t" + links: + - link: "llvm.aarch64.neon.pmull64" + arch: 
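
The `vmull_high*` entries compose the intrinsic as a shuffle of the upper halves followed by the existing `vmull_*`, so `vmull_high_s16(a, b)` should agree with `vmull_s16(vget_high_s16(a), vget_high_s16(b))`. A sketch of that equivalence; `demo_mull_high` is a made-up name, and `vget_high_s16`/`vmull_s16` come from the shared NEON API, not this diff.

```rust
// Illustrative sketch only.
#[cfg(target_arch = "aarch64")]
fn demo_mull_high() {
    use core::arch::aarch64::*;
    unsafe {
        let a = vdupq_n_s16(300);
        let b = vdupq_n_s16(300);
        // SMULL2: widening multiply of the upper four i16 lanes into i32 lanes,
        // exactly what the spec composes as shuffle-upper-half + vmull_s16.
        let wide: int32x4_t = vmull_high_s16(a, b);
        assert_eq!(vgetq_lane_s32::<0>(wide), 90_000); // would overflow i16
        let same = vmull_s16(vget_high_s16(a), vget_high_s16(b));
        assert_eq!(vgetq_lane_s32::<3>(same), 90_000);
    }
}
```
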
aarch64,arm64ec + - FnCall: [transmute, [{FnCall: ["_vmull_p64", [a, b]]}]] + + - name: "vmull_high{neon_type[0].noq}" + doc: "Polynomial multiply long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[3]}" + attr: + - *neon-stable + safety: safe + assert_instr: [pmull] + types: + - [poly8x16_t, poly8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', poly16x8_t] + compose: + - Let: + - a + - "{neon_type[1]}" + - FnCall: [simd_shuffle!, [a, a, "{type[2]}"]] + - Let: + - b + - "{neon_type[1]}" + - FnCall: [simd_shuffle!, [b, b, "{type[2]}"]] + - FnCall: ["vmull_{neon_type[0]}", [a, b]] + + - name: "vmull_high{neon_type[0].noq}" + doc: "Polynomial multiply long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-aes + - *neon-stable + safety: safe + assert_instr: [pmull] + types: + - [poly64x2_t, "p128"] + compose: + - FnCall: + - "vmull_{neon_type[0]}" + - - FnCall: [simd_extract!, [a, '1']] + - FnCall: [simd_extract!, [b, '1']] + + - name: "vmulx{neon_type.no}" + doc: Floating-point multiply extended + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [fmulx] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x1_t + - float64x2_t + compose: + - LLVMLink: + name: "fmulx.{neon_type.no}" + links: + - link: "llvm.aarch64.neon.fmulx.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vmulx{neon_type.no}" + doc: Floating-point multiply extended + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fmulx] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "fmulx.{neon_type.no}" + links: + - link: "llvm.aarch64.neon.fmulx.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vmulx{type[0]}" + doc: Floating-point multiply extended + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[1]}" + attr: [*neon-stable] + assert_instr: [fmulx] + safety: safe + types: + - ["s_f32", "f32"] + - ["d_f64", "f64"] + compose: + - LLVMLink: + name: "fmulx.{type[1]}" + links: + - link: "llvm.aarch64.neon.fmulx.{type[1]}" + arch: aarch64,arm64ec + + + - name: "vmulx{type[0]}" + doc: Floating-point multiply extended + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[1]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fmulx] + safety: safe + types: + - ["h_f16", "f16"] + compose: + - LLVMLink: + name: "fmulx.{type[1]}" + links: + - link: "llvm.aarch64.neon.fmulx.{type[1]}" + arch: aarch64,arm64ec + + + - name: "vmulx_lane_f64" + doc: Floating-point multiply extended + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmulx, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-stable + static_defs: ["const LANE: i32"] + safety: safe + types: + - float64x1_t + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: + - vmulx_f64 + - - a + - FnCall: + - 'transmute::' + - - FnCall: + - "simd_extract!" 
+ - - b + - 'LANE as u32' + + - name: "vmulx{type[0]}" + doc: Floating-point multiply extended + arguments: ["a: {type[1]}", "b: {neon_type[2]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmulx, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-stable + static_defs: ["const LANE: i32"] + safety: safe + types: + - ["q_lane_f64", float64x2_t, float64x1_t, "q_f64", '[LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: + - "vmulx{type[3]}" + - - a + - FnCall: + - "simd_shuffle!" + - - b + - b + - "{type[4]}" + + - name: "vmulx{type[0]}" + doc: Floating-point multiply extended + arguments: ["a: {type[1]}", "b: {neon_type[2]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmulx, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-stable + static_defs: ["const LANE: i32"] + safety: safe + types: + - ["d_lane_f64", "f64", float64x1_t, "d_f64", 'LANE as u32'] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: + - "vmulx{type[3]}" + - - a + - FnCall: + - "simd_extract!" + - - b + - "{type[4]}" + + - name: "vmulx_laneq_f64" + doc: Floating-point multiply extended + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmulx, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-stable + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float64x1_t, float64x2_t] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', '1']] + - FnCall: + - vmulx_f64 + - - a + - FnCall: + - 'transmute::' + - - FnCall: + - "simd_extract!" + - - b + - 'LANE as u32' + + - name: "vmulx{type[0]}" + doc: Floating-point multiply extended + arguments: ["a: {type[1]}", "b: {neon_type[2]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmulx, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-stable + static_defs: ["const LANE: i32"] + safety: safe + types: + - ['_lane_f32', float32x2_t, float32x2_t, '1', '_f32', '[LANE as u32, LANE as u32]'] + - ['_laneq_f32', float32x2_t, float32x4_t, '2', '_f32', '[LANE as u32, LANE as u32]'] + - ['q_lane_f32', float32x4_t, float32x2_t, '1', 'q_f32', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - ['q_laneq_f32', float32x4_t, float32x4_t, '2', 'q_f32', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - ['q_laneq_f64', float64x2_t, float64x2_t, '1', 'q_f64', '[LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', "{type[3]}"]] + - FnCall: + - "vmulx{type[4]}" + - - a + - FnCall: + - "simd_shuffle!" 
+ - - b + - b + - "{type[5]}" + + + - name: "vmulx{type[0]}" + doc: Floating-point multiply extended + arguments: ["a: {type[1]}", "b: {neon_type[2]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmulx, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ["const LANE: i32"] + safety: safe + types: + - ['_lane_f16', float16x4_t, float16x4_t, '2', '_f16', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - ['_laneq_f16', float16x4_t, float16x8_t, '3', '_f16', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - ['q_lane_f16', float16x8_t, float16x4_t, '2', 'q_f16', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - ['q_laneq_f16', float16x8_t, float16x8_t, '3', 'q_f16', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', "{type[3]}"]] + - FnCall: + - "vmulx{type[4]}" + - - a + - FnCall: + - "simd_shuffle!" + - - b + - b + - "{type[5]}" + + + - name: "vmulx{type[0]}" + doc: Floating-point multiply extended + arguments: ["a: {type[1]}", "b: {neon_type[2]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmulx, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-stable + static_defs: ["const LANE: i32"] + safety: safe + types: + - ['s_lane_f32', f32, float32x2_t, '1', 's_f32', 'LANE as u32'] + - ['s_laneq_f32', f32, float32x4_t, '2', 's_f32', 'LANE as u32'] + - ['d_laneq_f64', f64, float64x2_t, '1', 'd_f64', 'LANE as u32'] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', "{type[3]}"]] + - FnCall: + - "vmulx{type[4]}" + - - a + - FnCall: + - "simd_extract!" + - - b + - "{type[5]}" + + + - name: "vmulx{type[0]}" + doc: Floating-point multiply extended + arguments: ["a: {type[1]}", "b: {neon_type[2]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmulx, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ["const LANE: i32"] + safety: safe + types: + - ['h_lane_f16', f16, float16x4_t, '2', 'h_f16', "LANE as u32"] + - ['h_laneq_f16', f16, float16x8_t, '3', 'h_f16', "LANE as u32"] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', "{type[3]}"]] + - FnCall: + - "vmulx{type[4]}" + - - a + - FnCall: + - "simd_extract!" 
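
FMULX, generated by the `vmulx*` entries above, multiplies like `fmul` except that ±0 × ±∞ yields ±2.0 instead of NaN, and the lane forms merely pre-shuffle or pre-extract the second operand. A hedged sketch; `demo_mulx` is illustrative, and the special-case value follows my reading of the Arm description of FMULX.

```rust
// Illustrative sketch only; demo_mulx is a made-up name.
#[cfg(target_arch = "aarch64")]
fn demo_mulx() {
    use core::arch::aarch64::*;
    unsafe {
        // Scalar form: for ordinary inputs FMULX is just a multiply ...
        assert_eq!(vmulxd_f64(3.0, 4.0), 12.0);
        // ... but 0 * inf yields 2.0 (sign-adjusted) rather than NaN.
        assert_eq!(vmulxd_f64(0.0, f64::INFINITY), 2.0);

        // Lane form: multiply every lane of `a` by lane 1 of `b`, which the
        // spec composes as simd_shuffle! + vmulxq_f64.
        let a = vdupq_n_f64(2.0);
        let b = vsetq_lane_f64::<1>(5.0, vdupq_n_f64(1.0));
        let r = vmulxq_laneq_f64::<1>(a, b);
        assert_eq!(vgetq_lane_f64::<0>(r), 10.0);
    }
}
```
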
+ - - b + - "{type[5]}" + + + - name: "vmulx{neon_type[0].N}" + doc: "Vector multiply by scalar" + arguments: ["a: {neon_type[0]}", "b: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmulx]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, "f16"] + - [float16x8_t, "f16"] + compose: + - FnCall: + - vmulx{neon_type[0].no} + - - a + - FnCall: ["vdup{neon_type[0].N}", [b]] + + - name: "vfma{neon_type.no}" + doc: Floating-point fused Multiply-Add to accumulator(vector) + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-stable + assert_instr: [fmadd] + safety: safe + types: + - float64x1_t + compose: + - FnCall: [simd_fma, [b, c, a]] + + - name: "vfma{neon_type.no}" + doc: Floating-point fused Multiply-Add to accumulator(vector) + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-stable + assert_instr: [fmla] + safety: safe + types: + - float64x2_t + compose: + - FnCall: [simd_fma, [b, c, a]] + + - name: "vfma_n_f64" + doc: Floating-point fused Multiply-Add to accumulator(vector) + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-stable + assert_instr: [fmadd] + safety: safe + types: + - [float64x1_t, f64] + compose: + - FnCall: + - "vfma_f64" + - - a + - b + - FnCall: + - "vdup_n_f64" + - - c + + - name: "vfmaq_n_f64" + doc: Floating-point fused Multiply-Add to accumulator(vector) + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: + - stable + - - 'feature = "neon_intrinsics"' + - 'since = "1.59.0"' + assert_instr: [fmla] + safety: safe + types: + - [float64x2_t, f64] + compose: + - FnCall: + - "vfmaq_f64" + - - a + - b + - FnCall: + - "vdupq_n_f64" + - - c + + - name: "vfma{neon_type[0].N}" + doc: Floating-point fused Multiply-Subtract from accumulator. 
+ arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fmla] + safety: safe + types: + - [float16x4_t, f16] + - [float16x8_t, f16] + compose: + - FnCall: + - "vfma{neon_type[0].no}" + - - a + - b + - FnCall: + - "vdup{neon_type[0].N}" + - - c + + - name: "vdiv{neon_type.no}" + doc: "Divide" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [fdiv] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x1_t + - float64x2_t + compose: + - FnCall: [simd_div, [a, b]] + + - name: "vdiv{neon_type.no}" + doc: "Divide" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fdiv] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - FnCall: [simd_div, [a, b]] + + - name: "vdiv{type[1]}_{type[0]}" + doc: Divide + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [nop] + safety: safe + types: + - [f16, 'h'] + compose: + - 'a / b' + + - name: "vsub{neon_type.no}" + doc: "Subtract" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [fsub] + safety: safe + types: + - float64x1_t + - float64x2_t + compose: + - FnCall: [simd_sub, [a, b]] + + - name: "vsub{type[0]}" + doc: "Subtract" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[1]}" + attr: [*neon-stable] + assert_instr: [nop] + safety: safe + types: + - ['d_s64', 'i64'] + - ['d_u64', 'u64'] + compose: + - MethodCall: [a, wrapping_sub, [b]] + + - name: "vsub{type[0]}" + doc: "Subtract" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[1]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [nop] + safety: safe + types: + - ['h_f16', 'f16'] + compose: + - 'a - b' + + - name: "vaddv{neon_type[0].no}" + doc: Floating-point add across vector + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: + - stable + - - 'feature = "neon_intrinsics"' + - 'since = "1.59.0"' + assert_instr: [faddp] + safety: safe + types: + - [float32x2_t, f32] + - [float32x4_t, f32] + - [float64x2_t, f64] + compose: + - LLVMLink: + name: "faddv.{type[1]}.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.faddv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vaddlv{neon_type[0].no}" + doc: Signed Add Long across Vector + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: [*neon-stable] + assert_instr: [saddlv] + safety: safe + types: + - [int16x4_t, i32] + - [int16x8_t, i32] + - [int32x4_t, i64] + compose: + - LLVMLink: + name: "llvm.aarch64.neon.saddlv.{type[1]}.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.saddlv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vaddlv{neon_type.no}" + doc: Signed Add Long across Vector + arguments: ["a: {neon_type}"] + return_type: "i64" + attr: [*neon-stable] + assert_instr: [saddlp] + safety: safe + types: + - int32x2_t + compose: + - LLVMLink: + name: "llvm.aarch64.neon.saddlv.i64.v2i32" + links: + - link: "llvm.aarch64.neon.saddlv.i64.v2i32" + arch: aarch64,arm64ec + + - name: "vaddlv{neon_type[0].no}" + doc: Unsigned Add Long across Vector + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: [*neon-stable] + assert_instr: [uaddlv] + safety: safe + types: + - 
[uint16x4_t, u32, i32] + - [uint16x8_t, u32, i32] + - [uint32x4_t, u64, i64] + compose: + - LLVMLink: + name: "llvm.aarch64.neon.uaddlv.{type[2]}.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.uaddlv.{type[2]}.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ['_vaddlv{neon_type[0].no}', ['a'], [], true] + + - name: "vaddlv{neon_type[0].no}" + doc: Unsigned Add Long across Vector + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: [*neon-stable] + assert_instr: [uaddlp] + safety: safe + types: + - [uint32x2_t, u64, i64] + compose: + - LLVMLink: + name: "llvm.aarch64.neon.uaddlv.{type[2]}.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.uaddlv.{type[2]}.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ['_vaddlv{neon_type[0].no}', ['a'], [], true] + + - name: "vsubw_high{neon_type[1].noq}" + doc: Signed Subtract Wide + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: [*neon-stable] + assert_instr: [ssubw] + safety: safe + types: + - [int16x8_t, int8x16_t, int8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]'] + - [int32x4_t, int16x8_t, int16x4_t, '[4, 5, 6, 7]'] + - [int64x2_t, int32x4_t, int32x2_t, '[2, 3]'] + compose: + - Let: + - c + - "{neon_type[2]}" + - FnCall: [simd_shuffle!, [b, b, "{type[3]}"]] + - FnCall: + - simd_sub + - - a + - FnCall: [simd_cast, [c]] + + - name: "vsubw_high{neon_type[1].noq}" + doc: Unsigned Subtract Wide + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: [*neon-stable] + assert_instr: [usubw] + safety: safe + types: + - [uint16x8_t, uint8x16_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint32x4_t, uint16x8_t, uint16x4_t, '[4, 5, 6, 7]'] + - [uint64x2_t, uint32x4_t, uint32x2_t, '[2, 3]'] + compose: + - Let: + - c + - "{neon_type[2]}" + - FnCall: [simd_shuffle!, [b, b, "{type[3]}"]] + - FnCall: + - simd_sub + - - a + - FnCall: [simd_cast, [c]] + + - name: "vsubl_high{neon_type[0].noq}" + doc: "Signed Subtract Long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: [*neon-stable] + assert_instr: [ssubl] + safety: safe + types: + - [int8x16_t, int16x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', int8x8_t] + - [int16x8_t, int32x4_t, '[4, 5, 6, 7]', int16x4_t] + - [int32x4_t, int64x2_t, '[2, 3]', int32x2_t] + compose: + - Let: + - c + - "{neon_type[3]}" + - FnCall: [simd_shuffle!, [a, a, "{type[2]}"]] + - Let: + - d + - "{neon_type[1]}" + - FnCall: [simd_cast, [c]] + - Let: + - e + - "{neon_type[3]}" + - FnCall: [simd_shuffle!, [b, b, "{type[2]}"]] + - Let: + - f + - "{neon_type[1]}" + - FnCall: [simd_cast, [e]] + - FnCall: [simd_sub, [d, f]] + + - name: "vsubl_high{neon_type[0].noq}" + doc: "Unsigned Subtract Long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: [*neon-stable] + assert_instr: [usubl] + safety: safe + types: + - [uint8x16_t, uint16x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', uint8x8_t] + - [uint16x8_t, uint32x4_t, '[4, 5, 6, 7]', uint16x4_t] + - [uint32x4_t, uint64x2_t, '[2, 3]', uint32x2_t] + compose: + - Let: + - c + - "{neon_type[3]}" + - FnCall: [simd_shuffle!, [a, a, "{type[2]}"]] + - Let: + - d + - "{neon_type[1]}" + - FnCall: [simd_cast, [c]] + - Let: + - e + - "{neon_type[3]}" + - FnCall: [simd_shuffle!, [b, b, "{type[2]}"]] + - Let: + - f + - "{neon_type[1]}" + - FnCall: [simd_cast, [e]] + - FnCall: [simd_sub, [d, f]] + + - name: "vbcax{neon_type.no}" + doc: Bit clear and exclusive OR + arguments: ["a: {neon_type}", "b: {neon_type}", "c: 
{neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,sha3"']] + - FnCall: [stable, ['feature = "stdarch_neon_sha3"', 'since = "1.79.0"']] + assert_instr: [bcax] + safety: safe + types: + - int8x16_t + - int16x8_t + - int32x4_t + - int64x2_t + compose: + - LLVMLink: + name: "llvm.aarch64.crypto.bcaxs.{neon_type}" + links: + - link: "llvm.aarch64.crypto.bcaxs.{neon_type}" + arch: aarch64,arm64ec + + - name: "vbcax{neon_type.no}" + doc: Bit clear and exclusive OR + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,sha3"']] + - FnCall: [stable, ['feature = "stdarch_neon_sha3"', 'since = "1.79.0"']] + assert_instr: [bcax] + safety: safe + types: + - uint8x16_t + - uint16x8_t + - uint32x4_t + - uint64x2_t + compose: + - LLVMLink: + name: "llvm.aarch64.crypto.bcaxu.{neon_type}" + links: + - link: "llvm.aarch64.crypto.bcaxu.{neon_type}" + arch: aarch64,arm64ec + + - name: "vcadd{neon_type.rot270}" + doc: "Floating-point complex add" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - *neon-unstable-fcma + assert_instr: [fcadd] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x2_t + compose: + - LLVMLink: + name: "llvm.aarch64.neon.vcadd.rot270.{neon_type}" + links: + - link: "llvm.aarch64.neon.vcadd.rot270.{neon_type}" + arch: aarch64,arm64ec + + - name: "vcadd{neon_type.rot90}" + doc: "Floating-point complex add" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - *neon-unstable-fcma + assert_instr: [fcadd] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x2_t + compose: + - LLVMLink: + name: "llvm.aarch64.neon.vcadd.rot90.{neon_type}" + links: + - link: "llvm.aarch64.neon.vcadd.rot90.{neon_type}" + arch: aarch64,arm64ec + + - name: "vcadd{neon_type.rot270}" + doc: "Floating-point complex add" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-fp16 + - *enable-fcma + - *neon-unstable-f16 + assert_instr: [fcadd] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "vcadd.rot270.{neon_type}" + links: + - link: "llvm.aarch64.neon.vcadd.rot270.{neon_type}" + arch: aarch64,arm64ec + + - name: "vcadd{neon_type.rot90}" + doc: "Floating-point complex add" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-fp16 + - *enable-fcma + - *neon-unstable-f16 + assert_instr: [fcadd] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "vcadd.rot90.{neon_type}" + links: + - link: "llvm.aarch64.neon.vcadd.rot90.{neon_type}" + arch: aarch64,arm64ec + + - name: "vcmla{neon_type.no}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - *neon-unstable-fcma + assert_instr: [fcmla] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x2_t + compose: + - LLVMLink: + name: "llvm.aarch64.neon.vcmla.rot0.{neon_type}" + links: + - link: "llvm.aarch64.neon.vcmla.rot0.{neon_type}" + arch: aarch64,arm64ec + + - name: "vcmla{neon_type.no}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type}", "b: {neon_type}", 
"c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fcmla] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "llvm.aarch64.neon.vcmla.rot0.{neon_type}" + links: + - link: "llvm.aarch64.neon.vcmla.rot0.{neon_type}" + arch: aarch64,arm64ec + + - name: "vcmla{neon_type.rot90}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - *neon-unstable-fcma + assert_instr: [fcmla] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x2_t + compose: + - LLVMLink: + name: "llvm.aarch64.neon.vcmla.rot90.{neon_type}" + links: + - link: "llvm.aarch64.neon.vcmla.rot90.{neon_type}" + arch: aarch64,arm64ec + + - name: "vcmla{neon_type.rot90}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fcmla] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "llvm.aarch64.neon.vcmla.rot90.{neon_type}" + links: + - link: "llvm.aarch64.neon.vcmla.rot90.{neon_type}" + arch: aarch64,arm64ec + + - name: "vcmla{neon_type.rot270}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - *neon-unstable-fcma + assert_instr: [fcmla] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x2_t + compose: + - LLVMLink: + name: "llvm.aarch64.neon.vcmla.rot270.{neon_type}" + links: + - link: "llvm.aarch64.neon.vcmla.rot270.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vcmla{neon_type.rot270}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fcmla] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "llvm.aarch64.neon.vcmla.rot270.{neon_type}" + links: + - link: "llvm.aarch64.neon.vcmla.rot270.{neon_type}" + arch: aarch64,arm64ec + + - name: "vcmla{neon_type[0].laneq_nox}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-unstable-fcma + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, 1]] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]] + + - name: "vcmla{neon_type[0].laneq_nox}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + 
return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float16x8_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, 2]] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]] + + - name: "vcmla{neon_type[0].rot90_laneq}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-unstable-fcma + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, 1]] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]] + + - name: "vcmla{neon_type[0].rot90_laneq}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float16x8_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, 2]] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]] + + - name: "vcmla{neon_type[0].rot90_lane}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-unstable-fcma + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]] + + - name: 
"vcmla{neon_type[0].rot90_lane}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, 1]] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]] + + - name: "vcmla{neon_type.rot180}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - *neon-unstable-fcma + assert_instr: [fcmla] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x2_t + compose: + - LLVMLink: + name: "llvm.aarch64.neon.vcmla.rot180.{neon_type}" + links: + - link: "llvm.aarch64.neon.vcmla.rot180.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vcmla{neon_type.rot180}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fcmla] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "llvm.aarch64.neon.vcmla.rot180.{neon_type}" + links: + - link: "llvm.aarch64.neon.vcmla.rot180.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vcmla{neon_type[0].rot180_laneq}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-unstable-fcma + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, 1]] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]] + + - name: "vcmla{neon_type[0].rot180_laneq}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float16x8_t, float16x8_t, + '[2 * 
LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]' + ] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, 2]] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]] + + - name: "vcmla{type[3]}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-unstable-fcma + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]', '_rot180_lane_f32'] + - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]', 'q_rot180_lane_f32'] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]] + + - name: "vcmla{type[3]}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]', '_rot180_lane_f16'] + - [float16x8_t, float16x4_t, + '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]', 'q_rot180_lane_f16' + ] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, 1]] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]] + + - name: "vcmla{neon_type[0].rot270_laneq}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-unstable-fcma + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, 1]] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]] + + - name: "vcmla{neon_type[0].rot270_laneq}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-fp16 + - *neon-unstable-f16 + 
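In the lane variants above, `rustc_legacy_const_generics(3)` splices the Rust const generic `LANE` back into the fourth argument position of the C-style signature, `static_assert_uimm_bits!` bounds it at compile time, and the `[2 * LANE as u32, 2 * LANE as u32 + 1, ...]` shuffle broadcasts the selected (re, im) pair of `c` across the vector. The `vcmla*` lane forms themselves are nightly-only, so the call shape is shown here with a stable lane intrinsic instead (assuming an AArch64 target; the helper is illustrative):

    #[cfg(target_arch = "aarch64")]
    fn lane_const_generic_sketch() {
        use core::arch::aarch64::*;
        unsafe {
            let data = [10.0f32, 20.0, 30.0, 40.0];
            let a = vld1q_f32(data.as_ptr());
            // The lane index is a const generic passed via turbofish; the generated
            // static assert turns an out-of-range lane into a compile-time error.
            let b = vdupq_laneq_f32::<2>(a);
            assert_eq!(vgetq_lane_f32::<0>(b), 30.0);
            // vdupq_laneq_f32::<4>(a);  // would not compile: lane must be 0..=3
        }
    }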
static_defs: ["const LANE: i32"] + safety: safe + types: + - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float16x8_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, 2]] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]] + + - name: "vcmla{neon_type[0].lane_nox}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-unstable-fcma + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]] + + + - name: "vcmla{neon_type[0].lane_nox}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, 1]] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]] + + - name: "vcmla{neon_type[0].rot270_lane}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-unstable-fcma + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - Let: [c, "{neon_type[0]}", {FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]}] + - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]] + + + - name: "vcmla{neon_type[0].rot270_lane}" + doc: Floating-point complex multiply accumulate + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,fcma"']] + - FnCall: [cfg_attr, [test, {FnCall: 
[assert_instr, [fcmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, 1]] + - Let: [c, "{neon_type[0]}", {FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]}] + - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]] + + - name: "vdot{neon_type[0].laneq_nox}" + doc: Dot product arithmetic (indexed) + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + static_defs: ["const LANE: i32"] + attr: + - FnCall: [target_feature, ['enable = "neon,dotprod"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sdot, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [unstable, ['feature = "stdarch_neon_dotprod"', 'issue = "117224"']] + safety: safe + types: + - [int32x2_t, int8x8_t, int8x16_t, int32x4_t, '[LANE as u32, LANE as u32]'] + - [int32x4_t, int8x16_t, int8x16_t, int32x4_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '2']] + - Let: + - c + - "{neon_type[3]}" + - FnCall: [transmute, [c]] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, '{type[4]}']] + - FnCall: + - "vdot{neon_type[0].no}" + - - a + - b + - FnCall: [transmute, [c]] + + - name: "vdot{neon_type[0].laneq_nox}" + doc: Dot product arithmetic (indexed) + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + static_defs: ["const LANE: i32"] + attr: + - FnCall: [target_feature, ['enable = "neon,dotprod"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [udot, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [unstable, ['feature = "stdarch_neon_dotprod"', 'issue = "117224"']] + safety: safe + types: + - [uint32x2_t, uint8x8_t, uint8x16_t, uint32x4_t, '[LANE as u32, LANE as u32]'] + - [uint32x4_t, uint8x16_t, uint8x16_t, uint32x4_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '2']] + - Let: + - c + - "{neon_type[3]}" + - FnCall: [transmute, [c]] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, '{type[4]}']] + - FnCall: + - "vdot{neon_type[0].no}" + - - a + - b + - FnCall: [transmute, [c]] + + - name: "vmax{neon_type.no}" + doc: Maximum (vector) + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [fmax] + safety: safe + types: + - float64x1_t + - float64x2_t + compose: + - LLVMLink: + name: "fmax.{neon_type}" + links: + - link: "llvm.aarch64.neon.fmax.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vmaxh_{type}" + doc: Maximum (vector) + arguments: ["a: {type}", "b: {type}"] + return_type: "{type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fmax] + safety: safe + types: + - f16 + compose: + - LLVMLink: + name: "vmaxh.{neon_type}" + links: + - link: "llvm.aarch64.neon.fmax.{type}" + arch: aarch64,arm64ec + + + + - name: "vmaxnm{neon_type.no}" + doc: Floating-point Maximum Number (vector) + arguments: ["a: {neon_type}", "b: {neon_type}"] + 
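The `vmax*` entries above lower to FMAX, which propagates NaN, while the `vmaxnm*` entries lower to FMAXNM ("Maximum Number"), which returns the numeric operand when exactly one input is a quiet NaN; the `vminnm*` entries that follow mirror this for minimums. A small check of the difference, assuming an AArch64 target (helper name is illustrative):

    #[cfg(target_arch = "aarch64")]
    fn max_vs_maxnm_sketch() {
        use core::arch::aarch64::*;
        unsafe {
            let a = vdupq_n_f64(f64::NAN);
            let b = vdupq_n_f64(2.0);
            // FMAX: NaN in, NaN out.
            assert!(vgetq_lane_f64::<0>(vmaxq_f64(a, b)).is_nan());
            // FMAXNM: prefers the number when only one operand is NaN.
            assert_eq!(vgetq_lane_f64::<0>(vmaxnmq_f64(a, b)), 2.0);
        }
    }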
return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [fmaxnm] + safety: safe + types: + - float64x1_t + - float64x2_t + compose: + - LLVMLink: + name: "fmaxnm.{neon_type}" + links: + - link: "llvm.aarch64.neon.fmaxnm.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vmaxnmh_{type}" + doc: Floating-point Maximum Number + arguments: ["a: {type}", "b: {type}"] + return_type: "{type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fmaxnm] + safety: safe + types: + - f16 + compose: + - LLVMLink: + name: "vmaxh.{neon_type}" + links: + - link: "llvm.aarch64.neon.fmaxnm.{type}" + arch: aarch64,arm64ec + + + - name: "vminnmh_{type}" + doc: Floating-point Minimum Number + arguments: ["a: {type}", "b: {type}"] + return_type: "{type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fminnm] + safety: safe + types: + - f16 + compose: + - LLVMLink: + name: "vminh.{neon_type}" + links: + - link: "llvm.aarch64.neon.fminnm.{type}" + arch: aarch64,arm64ec + + + - name: "vmaxnmv{neon_type[0].no}" + doc: Floating-point maximum number across vector + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: [*neon-stable] + assert_instr: [fmaxnmp] + safety: safe + types: + - [float32x2_t, f32] + - [float64x2_t, f64] + compose: + - LLVMLink: + name: "fmaxnmv.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fmaxnmv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vmaxnmv{neon_type[0].no}" + doc: Floating-point maximum number across vector + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: [*neon-stable] + assert_instr: [fmaxnmv] + safety: safe + types: + - [float32x4_t, f32] + compose: + - LLVMLink: + name: "fmaxnmv.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fmaxnmv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + + - name: "vmaxnmv{neon_type[0].no}" + doc: Floating-point maximum number across vector + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fmaxnmv] + safety: safe + types: + - [float16x4_t, f16] + - [float16x8_t, f16] + compose: + - LLVMLink: + name: "fmaxnmv.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fmaxnmv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + + - name: "vminnmv{neon_type[0].no}" + doc: Floating-point minimum number across vector + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fminnmv] + safety: safe + types: + - [float16x4_t, f16] + - [float16x8_t, f16] + compose: + - LLVMLink: + name: "fminnmv.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fminnmv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + + - name: "vmaxv{neon_type[0].no}" + doc: Floating-point maximum number across vector + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fmaxv] + safety: safe + types: + - [float16x4_t, f16] + - [float16x8_t, f16] + compose: + - LLVMLink: + name: "fmaxv.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fmaxv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vminv{neon_type[0].no}" + doc: Floating-point minimum number across vector + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fminv] + safety: safe + types: + - [float16x4_t, f16] + - [float16x8_t, f16] + compose: + - LLVMLink: + name: "fminv.{neon_type[0]}" + links: + - link: 
"llvm.aarch64.neon.fminv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vpmax{type[0]}" + doc: "Floating-point maximum pairwise" + arguments: ["a: {neon_type[1]}"] + return_type: "{type[2]}" + attr: [*neon-stable] + assert_instr: [fmaxp] + safety: safe + types: + - ["s_f32", float32x2_t, f32] + - ["qd_f64", float64x2_t, f64] + compose: + - LLVMLink: + name: "fmaxv.{type[0]}" + links: + - link: "llvm.aarch64.neon.fmaxv.{type[2]}.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vmin{neon_type.no}" + doc: "Minimum (vector)" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [fmin] + safety: safe + types: + - float64x1_t + - float64x2_t + compose: + - LLVMLink: + name: "fmin.{neon_type}" + links: + - link: "llvm.aarch64.neon.fmin.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vminh_{type}" + doc: Minimum (vector) + arguments: ["a: {type}", "b: {type}"] + return_type: "{type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fmin] + safety: safe + types: + - f16 + compose: + - LLVMLink: + name: "vminh.{neon_type}" + links: + - link: "llvm.aarch64.neon.fmin.{type}" + arch: aarch64,arm64ec + + + - name: "vminnm{neon_type.no}" + doc: "Floating-point Minimum Number (vector)" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: [*neon-stable] + assert_instr: [fminnm] + safety: safe + types: + - float64x1_t + - float64x2_t + compose: + - LLVMLink: + name: "fminnm.{neon_type}" + links: + - link: "llvm.aarch64.neon.fminnm.{neon_type}" + arch: aarch64,arm64ec + + - name: "vminnmv{neon_type[0].no}" + doc: "Floating-point minimum number across vector" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fminnmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, "f32"] + - [float64x2_t, "f64"] + compose: + - LLVMLink: + name: "vminnmv.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fminnmv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vminnmv{neon_type[0].no}" + doc: "Floating-point minimum number across vector" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fminnmv]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x4_t, "f32"] + compose: + - LLVMLink: + name: "vminnmv.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fminnmv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vmovl_high{neon_type[0].noq}" + doc: Vector move + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: [*neon-stable] + assert_instr: [sxtl2] + safety: safe + types: + - [int8x16_t, int16x8_t, int8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]'] + - [int16x8_t, int32x4_t, int16x4_t, '[4, 5, 6, 7]'] + - [int32x4_t, int64x2_t, int32x2_t, '[2, 3]'] + compose: + - Let: + - a + - "{neon_type[2]}" + - FnCall: [simd_shuffle!, [a, a, "{type[3]}"]] + - FnCall: ["vmovl{neon_type[0].noq}", [a]] + + - name: "vmovl_high{neon_type[0].noq}" + doc: Vector move + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: [*neon-stable] + assert_instr: [uxtl2] + safety: safe + types: + - [uint8x16_t, uint16x8_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint16x8_t, uint32x4_t, uint16x4_t, '[4, 5, 6, 7]'] + - [uint32x4_t, uint64x2_t, uint32x2_t, '[2, 3]'] + 
compose: + - Let: + - a + - "{neon_type[2]}" + - FnCall: [simd_shuffle!, [a, a, "{type[3]}"]] + - FnCall: ["vmovl{neon_type[0].noq}", [a]] + + - name: "vpadd{neon_type.no}" + doc: Floating-point add pairwise + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{type}" + attr: [*neon-stable] + assert_instr: [faddp] + safety: safe + types: + - float32x4_t + - float64x2_t + compose: + - LLVMLink: + name: "faddp.{neon_type}" + links: + - link: "llvm.aarch64.neon.faddp.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vpadd{neon_type.no}" + doc: Floating-point add pairwise + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [faddp] + safety: safe + types: + - float16x8_t + compose: + - LLVMLink: + name: "faddp.{neon_type}" + links: + - link: "llvm.aarch64.neon.faddp.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vpmax{neon_type.no}" + doc: Floating-point add pairwise + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fmaxp] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "fmaxp.{neon_type}" + links: + - link: "llvm.aarch64.neon.fmaxp.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vpmaxnm{neon_type.no}" + doc: Floating-point add pairwise + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fmaxnmp] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "fmaxnmp.{neon_type}" + links: + - link: "llvm.aarch64.neon.fmaxnmp.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vpmin{neon_type.no}" + doc: Floating-point add pairwise + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fminp] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "fminp.{neon_type}" + links: + - link: "llvm.aarch64.neon.fminp.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vpminnm{neon_type.no}" + doc: Floating-point add pairwise + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{type}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fminnmp] + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "fminnmp.{neon_type}" + links: + - link: "llvm.aarch64.neon.fminnmp.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vpadd{type[0]}" + doc: "Floating-point add pairwise" + arguments: ["a: {neon_type[1]}"] + return_type: "{type[2]}" + attr: [*neon-stable] + assert_instr: [nop] + safety: safe + types: + - ["s_f32", float32x2_t, f32] + - ["d_f64", float64x2_t, f64] + compose: + - Let: + - a1 + - "{type[2]}" + - FnCall: [simd_extract!, [a, '0']] + - Let: + - a2 + - "{type[2]}" + - FnCall: [simd_extract!, [a, '1']] + - Identifier: ['a1 + a2', Symbol] + + - name: "vpmin{type[0]}" + doc: Floating-point minimum pairwise + arguments: ["a: {neon_type[1]}"] + return_type: "{type[2]}" + attr: [*neon-stable] + assert_instr: [fminp] + safety: safe + types: + - ["s_f32", float32x2_t, f32] + - ["qd_f64", float64x2_t, f64] + compose: + - LLVMLink: + name: "fminv.{type[2]}.{neon_type[1]}" + links: + - link: "llvm.aarch64.neon.fminv.{type[2]}.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vqdmullh_s16" + doc: "Signed saturating doubling multiply long" + arguments: ["a: {type[0]}", "b: {type[0]}"] + 
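Just above, `vpadds_f32`/`vpaddd_f64` are the single-vector pairwise adds: they extract both lanes and return `a1 + a2`. The `vqdmullh_s16` entry that starts here broadcasts both scalars, runs the vector `vqdmull_s16`, and extracts lane 0; the doubled product saturates instead of wrapping. A quick check, assuming an AArch64 target (helper name and values are illustrative):

    #[cfg(target_arch = "aarch64")]
    fn pairwise_and_qdmull_sketch() {
        use core::arch::aarch64::*;
        unsafe {
            // vpaddd_f64: lane 0 + lane 1 of a single float64x2_t.
            let data = [1.5f64, 2.5];
            assert_eq!(vpaddd_f64(vld1q_f64(data.as_ptr())), 4.0);

            // vqdmullh_s16: 2 * a * b, widened to i32 and saturated.
            assert_eq!(vqdmullh_s16(3, 4), 24);
            // 2 * (-32768)^2 overflows i32, so the result saturates.
            assert_eq!(vqdmullh_s16(i16::MIN, i16::MIN), i32::MAX);
        }
    }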
return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmull]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i16", "i32"] + compose: + - Let: [a, int16x4_t, {FnCall: [vdup_n_s16, [a]]}] + - Let: [b, int16x4_t, {FnCall: [vdup_n_s16, [b]]}] + - FnCall: [simd_extract!, [{FnCall: [vqdmull_s16, [a, b]]}, '0']] + + - name: "vqdmulls_s32" + doc: "Signed saturating doubling multiply long" + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmull]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i32", "i64"] + compose: + - LLVMLink: + name: "vqdmulls_s32" + links: + - link: "llvm.aarch64.neon.sqdmulls.scalar" + arch: aarch64,arm64ec + + - name: "vqdmull_high{neon_type[0].noq}" + doc: "Signed saturating doubling multiply long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmull2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int16x8_t, int32x4_t, int16x4_t, '[4, 5, 6, 7]'] + - [int32x4_t, int64x2_t, int32x2_t, '[2, 3]'] + compose: + - Let: [a, "{neon_type[2]}", {FnCall: [simd_shuffle!, [a, a, '{type[3]}']]}] + - Let: [b, "{neon_type[2]}", {FnCall: [simd_shuffle!, [b, b, '{type[3]}']]}] + - FnCall: ["vqdmull{neon_type[0].noq}", [a, b]] + + - name: "vqdmull_high_n_{type[1]}" + doc: "Signed saturating doubling multiply long" + arguments: ["a: {neon_type[0]}", "b: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmull2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int16x8_t, "i16", int32x4_t, int16x4_t, '[4, 5, 6, 7]'] + - [int32x4_t, "i32", int64x2_t, int32x2_t, '[2, 3]'] + compose: + - Let: [a, "{neon_type[3]}", {FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]}] + - Let: [b, "{neon_type[3]}", {FnCall: ["vdup_n{neon_type[0].noq}", [b]]}] + - FnCall: ["vqdmull{neon_type[0].noq}", [a, b]] + + - name: "vqdmull{type[3]}" + doc: "Signed saturating doubling multiply long" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmull, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - ["i16", int16x4_t, "i32", 'h_lane_s16', 'h_s16'] + - ["i32", int32x4_t, "i64", 's_laneq_s32', 's_s32'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, 2]] + - Let: [b, "{type[0]}", {FnCall: [simd_extract!, [b, 'N as u32']]}] + - FnCall: ["vqdmull{type[4]}", [a, b]] + + - name: "vqdmullh_laneq_s16" + doc: "Signed saturating doubling multiply long" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmull, N = 4]]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - ["i16", int16x8_t, "i32"] + compose: + - FnCall: [static_assert_uimm_bits!, [N, 3]] + - Let: [b, "{type[0]}", {FnCall: [simd_extract!, [b, 'N as u32']]}] + - FnCall: ["vqdmullh_s16", [a, b]] + + - 
name: "vqdmulls_lane_s32" + doc: "Signed saturating doubling multiply long" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmull, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - ["i32", int32x2_t, "i64"] + compose: + - FnCall: [static_assert_uimm_bits!, [N, 1]] + - Let: [b, "{type[0]}", {FnCall: [simd_extract!, [b, 'N as u32']]}] + - FnCall: ["vqdmulls_s32", [a, b]] + + - name: "vqdmull{type[6]}" + doc: "Signed saturating doubling multiply long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmull2, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x8_t, int16x4_t, int32x4_t, int16x4_t, '[4, 5, 6, 7]', '[N as u32, N as u32, N as u32, N as u32]', '_high_lane_s16'] + - [int32x4_t, int32x4_t, int64x2_t, int32x2_t, '[2, 3]', '[N as u32, N as u32]', '_high_laneq_s32'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, '2']] + - Let: [a, "{neon_type[3]}", {FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]}] + - Let: [b, "{neon_type[3]}", {FnCall: [simd_shuffle!, [b, b, "{type[5]}"]]}] + - FnCall: ["vqdmull{neon_type[0].noq}", [a, b]] + + - name: "vqdmull_high_lane_s32" + doc: "Signed saturating doubling multiply long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmull2, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int32x4_t, int32x2_t, int64x2_t, int32x2_t, '[2, 3]', '[N as u32, N as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, '1']] + - Let: [a, "{neon_type[3]}", {FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]}] + - Let: [b, "{neon_type[3]}", {FnCall: [simd_shuffle!, [b, b, "{type[5]}"]]}] + - FnCall: ["vqdmull{neon_type[0].noq}", [a, b]] + + - name: "vqdmull_high_laneq_s16" + doc: "Signed saturating doubling multiply long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmull2, N = 4]]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x8_t, int16x8_t, int32x4_t, int16x4_t, '[4, 5, 6, 7]', '[N as u32, N as u32, N as u32, N as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, '3']] + - Let: [a, "{neon_type[3]}", {FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]}] + - Let: [b, "{neon_type[3]}", {FnCall: [simd_shuffle!, [b, b, "{type[5]}"]]}] + - FnCall: ["vqdmull{neon_type[0].noq}", [a, b]] + + - name: "vqdmull_laneq_s16" + doc: "Vector saturating doubling long multiply by scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmull, 'N = 4']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: 
['const N: i32'] + safety: safe + types: + - [int16x4_t, int16x8_t, int32x4_t, '[N as u32, N as u32, N as u32, N as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, '3']] + - Let: [b, "{neon_type[0]}", {FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]}] + - FnCall: [vqdmull_s16, [a, b]] + + - name: "vqdmull_laneq_s32" + doc: "Vector saturating doubling long multiply by scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmull, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int32x2_t, int32x4_t, int64x2_t, '[N as u32, N as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, '2']] + - Let: [b, "{neon_type[0]}", {FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]}] + - FnCall: [vqdmull_s32, [a, b]] + + - name: "vqdmlal{type[4]}" + doc: "Signed saturating doubling multiply-add long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmlal2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int32x4_t, int16x8_t, int16x8_t, int32x4_t, _high_s16] + - [int64x2_t, int32x4_t, int32x4_t, int64x2_t, _high_s32] + - [int32x4_t, int16x8_t, "i16", int32x4_t, _high_n_s16] + - [int64x2_t, int32x4_t, "i32", int64x2_t, _high_n_s32] + compose: + - FnCall: ["vqadd{neon_type[0].no}", [a, {FnCall: ["vqdmull{type[4]}", [b, c]]}]] + + - name: "vqdmlal{type[4]}" + doc: "Signed saturating doubling multiply-add long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmlal2, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int32x4_t, int16x8_t, int16x4_t, int32x4_t, _high_lane_s16, '2'] + - [int32x4_t, int16x8_t, int16x8_t, int32x4_t, _high_laneq_s16, '3'] + - [int64x2_t, int32x4_t, int32x2_t, int64x2_t, _high_lane_s32, '1'] + - [int64x2_t, int32x4_t, int32x4_t, int64x2_t, _high_laneq_s32, '2'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[5]}"]] + - FnCall: ["vqadd{neon_type[0].no}", [a, {FnCall: ["vqdmull{type[4]}::", [b, c]]}]] + + - name: "vqdmlalh_{type[2]}" + doc: "Signed saturating doubling multiply-add long" + arguments: ["a: {type[0]}", "b: {type[1]}", "c: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmlal]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i32", "i16", "s16"] + compose: + - Let: [x, int32x4_t, {FnCall: [vqdmull_s16, [{FnCall: [vdup_n_s16, [b]]}, {FnCall: [vdup_n_s16, [c]]}]]}] + - FnCall: [vqadds_s32, [a, {FnCall: [simd_extract!, [x, 0]]}]] + + - name: "vqdmlals_s32" + doc: "Signed saturating doubling multiply-add long" + arguments: ["a: {type[0]}", "b: {type[1]}", "c: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmlal]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i64", "i32", "i32", "i64"] + compose: + - Let: [x, i64, {FnCall: [vqaddd_s64, 
[a, {FnCall: [vqdmulls_s32, [b, c]]}]]}] + - Identifier: ['x as i64', Symbol] + + - name: "vqdmlal{type[4]}" + doc: "Signed saturating doubling multiply-add long" + arguments: ["a: {type[0]}", "b: {type[1]}", "c: {neon_type[2]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmlal, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["i32", "i16", int16x4_t, "i32", h_lane_s16, '2', h_s16] + - ["i32", "i16", int16x8_t, "i32", h_laneq_s16, '3', h_s16] + - ["i64", "i32", int32x2_t, "i64", s_lane_s32, '1', s_s32] + - ["i64", "i32", int32x4_t, "i64", s_laneq_s32, '2', s_s32] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[5]}"]] + - FnCall: ["vqdmlal{type[6]}", [a, b, {FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + + - name: "vqdmlal_laneq_s16" + doc: "Vector widening saturating doubling multiply accumulate with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmlal, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int32x4_t, int16x4_t, int16x8_t, int32x4_t] + compose: + - FnCall: [static_assert_uimm_bits!, [N, '3']] + - FnCall: [vqaddq_s32, [a, {FnCall: ["vqdmull_laneq_s16::", [b, c]]}]] + + - name: "vqdmlal_laneq_s32" + doc: "Vector widening saturating doubling multiply accumulate with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmlal, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int64x2_t, int32x2_t, int32x4_t, int64x2_t] + compose: + - FnCall: [static_assert_uimm_bits!, [N, '2']] + - FnCall: [vqaddq_s64, [a, {FnCall: ["vqdmull_laneq_s32::", [b, c]]}]] + + - name: "vqdmlsl{type[4]}" + doc: "Signed saturating doubling multiply-subtract long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmlsl2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int32x4_t, int16x8_t, int16x8_t, int32x4_t, _high_s16] + - [int64x2_t, int32x4_t, int32x4_t, int64x2_t, _high_s32] + - [int32x4_t, int16x8_t, "i16", int32x4_t, _high_n_s16] + - [int64x2_t, int32x4_t, "i32", int64x2_t, _high_n_s32] + compose: + - FnCall: ["vqsub{neon_type[0].no}", [a, {FnCall: ["vqdmull{type[4]}", [b, c]]}]] + + - name: "vqdmlsl{type[4]}" + doc: "Signed saturating doubling multiply-subtract long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmlsl2, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int32x4_t, int16x8_t, int16x4_t, int32x4_t, '_high_lane_s16', '2'] + - [int32x4_t, int16x8_t, int16x8_t, int32x4_t, 
'_high_laneq_s16', '3'] + - [int64x2_t, int32x4_t, int32x2_t, int64x2_t, '_high_lane_s32', '1'] + - [int64x2_t, int32x4_t, int32x4_t, int64x2_t, '_high_laneq_s32', '2'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[5]}"]] + - FnCall: ["vqsub{neon_type[0].no}", [a, {FnCall: ["vqdmull{type[4]}::", [b, c]]}]] + + - name: "vqdmlslh_s16" + doc: "Signed saturating doubling multiply-subtract long" + arguments: ["a: {type[0]}", "b: {type[1]}", "c: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmlsl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i32", "i16"] + compose: + - Let: [x, int32x4_t, {FnCall: [vqdmull_s16, [{FnCall: [vdup_n_s16, [b]]}, {FnCall: [vdup_n_s16, [c]]}]]}] + - FnCall: [vqsubs_s32, [a, {FnCall: [simd_extract!, [x, '0']]}]] + + - name: "vqdmlsls_s32" + doc: "Signed saturating doubling multiply-subtract long" + arguments: ["a: {type[0]}", "b: {type[1]}", "c: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmlsl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i64", "i32", "i32", "i64"] + compose: + - Let: [x, i64, {FnCall: [vqsubd_s64, [a, {FnCall: [vqdmulls_s32, [b, c]]}]]}] + - Identifier: ['x as i64', Symbol] + + - name: "vqdmlsl{type[4]}" + doc: "Signed saturating doubling multiply-subtract long" + arguments: ["a: {type[0]}", "b: {type[1]}", "c: {neon_type[2]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmlsl, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["i32", "i16", int16x4_t, "i32", 'h_lane_s16', '2', 'h_s16'] + - ["i32", "i16", int16x8_t, "i32", 'h_laneq_s16', '3', 'h_s16'] + - ["i64", "i32", int32x2_t, "i64", 's_lane_s32', '1', 's_s32'] + - ["i64", "i32", int32x4_t, "i64", 's_laneq_s32', '2', 's_s32'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[5]}"]] + - FnCall: ["vqdmlsl{type[6]}", [a, b, {FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + + - name: "vqdmlsl_laneq_s16" + doc: "Vector widening saturating doubling multiply subtract with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmlsl, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int32x4_t, int16x4_t, int16x8_t, int32x4_t] + compose: + - FnCall: [static_assert_uimm_bits!, [N, '3']] + - FnCall: ["vqsubq_s32", [a, {FnCall: ["vqdmull_laneq_s16::", [b, c]]}]] + + - name: "vqdmlsl_laneq_s32" + doc: "Vector widening saturating doubling multiply subtract with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmlsl, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int64x2_t, int32x2_t, int32x4_t, int64x2_t] + compose: + - FnCall: [static_assert_uimm_bits!, [N, '2']] + - FnCall: [vqsubq_s64, [a, {FnCall: 
["vqdmull_laneq_s32::", [b, c]]}]] + + - name: "vqdmulh{type[4]}" + doc: "Signed saturating doubling multiply returning high half" + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmulh]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i16", "i16", "i16", int16x4_t, 'h_s16'] + - ["i32", "i32", "i32", int32x2_t, 's_s32'] + compose: + - Let: [a, "{neon_type[3]}", {FnCall: ["vdup_n{neon_type[3].no}", [a]]}] + - Let: [b, "{neon_type[3]}", {FnCall: ["vdup_n{neon_type[3].no}", [b]]}] + - FnCall: [simd_extract!, [{FnCall: ["vqdmulh{neon_type[3].no}", [a, b]]}, '0']] + + - name: "vqdmulhh{type[3]}" + doc: "Signed saturating doubling multiply returning high half" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmulh, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - ["i16", int16x4_t, "i16", '_lane_s16', '2'] + - ["i16", int16x8_t, "i16", '_laneq_s16', '3'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[4]}"]] + - Let: [b, 'i16', {FnCall: [simd_extract!, [b, 'N as u32']]}] + - FnCall: ['vqdmulhh_s16', [a, b]] + + - name: "vqdmulhs{type[3]}" + doc: "Signed saturating doubling multiply returning high half" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmulh, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - ["i32", int32x2_t, "i32", "_lane_s32", '1'] + - ["i32", int32x4_t, "i32", "_laneq_s32", '2'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[4]}"]] + - Let: [b, 'i32', {FnCall: [simd_extract!, [b, 'N as u32']]}] + - FnCall: ['vqdmulhs_s32', [a, b]] + + - name: "vqmovn_high{neon_type[1].noq}" + doc: "Signed saturating extract narrow" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqxtn2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, int16x8_t, int8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [int16x4_t, int32x4_t, int16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [int32x2_t, int64x2_t, int32x4_t, '[0, 1, 2, 3]'] + compose: + - FnCall: [simd_shuffle!, [a, {FnCall: ["vqmovn{neon_type[1].noq}", [b]]}, "{type[3]}"]] + + - name: "vqmovn_high{neon_type[1].noq}" + doc: "Signed saturating extract narrow" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uqxtn2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint8x8_t, uint16x8_t, uint8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint16x4_t, uint32x4_t, uint16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [uint32x2_t, uint64x2_t, uint32x4_t, '[0, 1, 2, 3]'] + compose: + - FnCall: [simd_shuffle!, [a, {FnCall: ["vqmovn{neon_type[1].noq}", [b]]}, "{type[3]}"]] + + - name: "vqmovn{type[2]}" + doc: "Saturating extract narrow" + arguments: ["a: 
{type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqxtn]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i16", "i8", 'h_s16', s16] + - ["i32", "i16", 's_s32', s32] + compose: + - FnCall: [simd_extract!, [{FnCall: ["vqmovn_{type[3]}", [{FnCall: ["vdupq_n_{type[3]}", [a]]}]]}, '0']] + + - name: "vqmovn{type[2]}" + doc: "Saturating extract narrow" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uqxtn]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["u16", "u8", 'h_u16', 'u16'] + - ["u32", "u16", 's_u32', 'u32'] + compose: + - FnCall: [simd_extract!, [{FnCall: ["vqmovn_{type[3]}", [{FnCall: ["vdupq_n_{type[3]}", [a]]}]]}, '0']] + + - name: "vqmovnd_s64" + doc: "Saturating extract narrow" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqxtn]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i64", "i32"] + compose: + - LLVMLink: + name: "vqmovnd_s64" + links: + - link: "llvm.aarch64.neon.scalar.sqxtn.i32.i64" + arch: aarch64,arm64ec + + - name: "vqmovnd_u64" + doc: "Saturating extract narrow" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uqxtn]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["u64", "u32"] + compose: + - LLVMLink: + name: "vqmovnd_u64" + links: + - link: "llvm.aarch64.neon.scalar.uqxtn.i32.i64" + arch: aarch64,arm64ec + + - name: "vqmovun{type[2]}" + doc: "Signed saturating extract unsigned narrow" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqxtun]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i16", "u8", 'h_s16', s16] + - ["i32", "u16", 's_s32', s32] + - ["i64", "u32", 'd_s64', s64] + compose: + - FnCall: [simd_extract!, [{FnCall: ["vqmovun_{type[3]}", [{FnCall: ["vdupq_n_{type[3]}", [a]]}]]}, '0']] + + - name: "vqmovun_high_{neon_type[1]}" + doc: "Signed saturating extract unsigned narrow" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqxtun2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint8x8_t, int16x8_t, uint8x16_t, s16, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint16x4_t, int32x4_t, uint16x8_t, s32, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [uint32x2_t, int64x2_t, uint32x4_t, s64, '[0, 1, 2, 3]'] + compose: + - FnCall: [simd_shuffle!, [a, {FnCall: ["vqmovun_{type[3]}", [b]]}, "{type[4]}"]] + + - name: "vqrdmulh{type[1]}" + doc: "Signed saturating rounding doubling multiply returning high half" + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrdmulh]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i16", 'h_s16', 's16'] + - ["i32", 's_s32', 's32'] + compose: + - FnCall: [simd_extract!, [{FnCall: ["vqrdmulh_{type[2]}", [{FnCall: ["vdup_n_{type[2]}", [a]]}, {FnCall: ["vdup_n_{type[2]}", [b]]}]]}, '0']] + + - name: 
"vqrdmulh{type[2]}" + doc: "Signed saturating rounding doubling multiply returning high half" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrdmulh, LANE = 1]]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["i16", int16x4_t, 'h_lane_s16', 'h_s16', '2'] + - ["i16", int16x8_t, 'h_laneq_s16', 'h_s16', '3'] + - ["i32", int32x2_t, 's_lane_s32', 's_s32', '1'] + - ["i32", int32x4_t, 's_laneq_s32', 's_s32', '2'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[4]}"]] + - FnCall: ["vqrdmulh{type[3]}", [a, {FnCall: [simd_extract!, [b, 'LANE as u32']]}]] + + - name: "vqrdmlah{neon_type.no}" + doc: "Signed saturating rounding doubling multiply accumulate returning high half" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "rdm"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrdmlah]]}]] + - FnCall: [stable, ['feature = "rdm_intrinsics"', 'since = "1.62.0"']] + safety: safe + types: + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + compose: + - LLVMLink: + name: "vqrdmlah{neon_type.no}" + links: + - link: "llvm.aarch64.neon.sqrdmlah.{neon_type}" + arch: aarch64,arm64ec + + - name: "vqrdmlah{type[3]}" + doc: "Signed saturating rounding doubling multiply accumulate returning high half" + arguments: ["a: {type[0]}", "b: {type[0]}", "c: {type[0]}"] + return_type: "{type[0]}" + attr: + - FnCall: [target_feature, ['enable = "rdm"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrdmlah]]}]] + - FnCall: [stable, ['feature = "rdm_intrinsics"', 'since = "1.62.0"']] + safety: safe + types: + - ["i16", int16x4_t, s16, 'h_s16'] + - ["i32", int32x2_t, s32, 's_s32'] + compose: + - Let: [a, "{neon_type[1]}", {FnCall: ["vdup_n_{type[2]}", [a]]}] + - Let: [b, "{neon_type[1]}", {FnCall: ["vdup_n_{type[2]}", [b]]}] + - Let: [c, "{neon_type[1]}", {FnCall: ["vdup_n_{type[2]}", [c]]}] + - FnCall: [simd_extract!, [{FnCall: ["vqrdmlah_{type[2]}", [a, b, c]]}, '0']] + + - name: "vqrdmlah{type[0]}" + doc: "Signed saturating rounding doubling multiply accumulate returning high half" + arguments: ["a: {type[1]}", "b: {type[2]}", "c: {neon_type[3]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [target_feature, ['enable = "rdm"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrdmlah, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "rdm_intrinsics"', 'since = "1.62.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [_lane_s16, int16x4_t, int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_laneq_s16, int16x4_t, int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_lane_s16, int16x8_t, int16x8_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_s16, int16x8_t, int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_lane_s32, int32x2_t, int32x2_t, int32x2_t, '1', '[LANE as u32, LANE as u32]'] + - [_laneq_s32, int32x2_t, int32x2_t, int32x4_t, '2', '[LANE as u32, LANE as u32]'] + - [q_lane_s32, int32x4_t, int32x4_t, int32x2_t, '1', 
'[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_s32, int32x4_t, int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[4]}']] + - Let: [c, "{type[1]}", {FnCall: [simd_shuffle!, [c, c, "{type[5]}"]]}] + - FnCall: ["vqrdmlah{neon_type[2].no}", [a, b, c]] + + - name: "vqrdmlah{type[4]}" + doc: "Signed saturating rounding doubling multiply accumulate returning high half" + arguments: ["a: {type[0]}", "b: {type[0]}", "c: {neon_type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [target_feature, ['enable = "rdm"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrdmlah, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "rdm_intrinsics"', 'since = "1.62.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["i16", int16x4_t, '2', "h_s16", h_lane_s16, h_s16] + - ["i16", int16x8_t, '3', "h_s16", h_laneq_s16, h_s16] + - ["i32", int32x2_t, '1', "s_s32", s_lane_s32, s_s32] + - ["i32", int32x4_t, '2', "s_s32", s_laneq_s32, s_s32] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - FnCall: ["vqrdmlah{type[5]}", [a, b, {FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + + - name: "vqrdmlsh{neon_type.no}" + doc: "Signed saturating rounding doubling multiply subtract returning high half" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "rdm"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrdmlsh]]}]] + - FnCall: [stable, ['feature = "rdm_intrinsics"', 'since = "1.62.0"']] + safety: safe + types: + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + compose: + - LLVMLink: + name: "vqrdmlsh{neon_type.no}" + links: + - link: "llvm.aarch64.neon.sqrdmlsh.{neon_type}" + arch: aarch64,arm64ec + + - name: "vqrdmlsh{type[1]}" + doc: "Signed saturating rounding doubling multiply subtract returning high half" + arguments: ["a: {type[0]}", "b: {type[0]}", "c: {type[0]}"] + return_type: "{type[0]}" + attr: + - FnCall: [target_feature, ['enable = "rdm"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrdmlsh]]}]] + - FnCall: [stable, ['feature = "rdm_intrinsics"', 'since = "1.62.0"']] + safety: safe + types: + - ["i16", "h_s16", int16x4_t, s16] + - ["i32", "s_s32", int32x2_t, s32] + compose: + - Let: [a, "{neon_type[2]}", {FnCall: ["vdup_n_{type[3]}", [a]]}] + - Let: [b, "{neon_type[2]}", {FnCall: ["vdup_n_{type[3]}", [b]]}] + - Let: [c, "{neon_type[2]}", {FnCall: ["vdup_n_{type[3]}", [c]]}] + - FnCall: [simd_extract!, [{FnCall: ["vqrdmlsh_{type[3]}", [a, b, c]]}, '0']] + + - name: "vqrdmlsh{type[0]}" + doc: "Signed saturating rounding doubling multiply subtract returning high half" + arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}", "c: {neon_type[3]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [target_feature, ['enable = "rdm"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrdmlsh, LANE = 1]]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "rdm_intrinsics"', 'since = "1.62.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [_lane_s16, int16x4_t, int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_laneq_s16, int16x4_t, int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_lane_s16, int16x8_t, int16x8_t, int16x4_t, '2', '[LANE as u32, LANE 
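The `vqrdmlah*`/`vqrdmlsh*` entries above (gated on the `rdm` feature) are the rounding multiply-accumulate and multiply-subtract variants: the lane forms broadcast the chosen lane of `c` and defer to the plain vector form, and the scalar forms dup/extract around it. A scalar sketch of the accumulate case, with the rounding constant placed as in the Arm pseudocode (helper name is illustrative only):

// "Signed saturating rounding doubling multiply accumulate returning high half" on i16.
fn sqrdmlah_h_model(acc: i16, b: i16, c: i16) -> i16 {
    // (acc << 16) + 2*b*c, rounded by adding 2^15, shifted back down, saturated.
    let wide = ((acc as i64) << 16) + 2 * (b as i64) * (c as i64) + (1i64 << 15);
    (wide >> 16).clamp(i16::MIN as i64, i16::MAX as i64) as i16
}
// sqrdmlah_h_model(100, 0, 0) == 100
// sqrdmlah_h_model(0, 16384, 16384) == 8192
// The vqrdmlsh* forms are the same shape with the doubled product subtracted.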
as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_s16, int16x8_t, int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_lane_s32, int32x2_t, int32x2_t, int32x2_t, '1', '[LANE as u32, LANE as u32]'] + - [_laneq_s32, int32x2_t, int32x2_t, int32x4_t, '2', '[LANE as u32, LANE as u32]'] + - [q_lane_s32, int32x4_t, int32x4_t, int32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_s32, int32x4_t, int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[4]}']] + - Let: [c, "{type[1]}", {FnCall: [simd_shuffle!, [c, c, "{type[5]}"]]}] + - FnCall: ["vqrdmlsh{neon_type[2].no}", [a, b, c]] + + - name: "vqrdmlsh{type[3]}" + doc: "Signed saturating rounding doubling multiply subtract returning high half" + arguments: ["a: {type[0]}", "b: {type[0]}", "c: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [target_feature, ['enable = "rdm"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrdmlsh, LANE = 1]]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "rdm_intrinsics"', 'since = "1.62.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["i16", int16x4_t, '2', h_lane_s16, h_s16] + - ["i16", int16x8_t, '3', h_laneq_s16, h_s16] + - ["i32", int32x2_t, '1', s_lane_s32, s_s32] + - ["i32", int32x4_t, '2', s_laneq_s32, s_s32] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - FnCall: ["vqrdmlsh{type[4]}", [a, b, {FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + + - name: "vqrshl{type[0]}" + doc: "Signed saturating rounding shift left" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrshl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ['s_s32', "i32"] + - ['d_s64', "i64"] + compose: + - LLVMLink: + name: "vqrshl{type[0]}" + links: + - link: "llvm.aarch64.neon.sqrshl.{type[1]}" + arch: aarch64,arm64ec + + - name: "vqrshl{type[1]}" + doc: "Signed saturating rounding shift left" + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrshl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i8", 'b_s8', int8x8_t, s8] + - ["i16", 'h_s16', int16x4_t, s16] + compose: + - Let: [a, "{neon_type[2]}", {FnCall: ["vdup_n_{type[3]}", [a]]}] + - Let: [b, "{neon_type[2]}", {FnCall: ["vdup_n_{type[3]}", [b]]}] + - FnCall: [simd_extract!, [{FnCall: ["vqrshl_{type[3]}", [a, b]]}, '0']] + + - name: "vqrshl{type[2]}" + doc: "Unsigned signed saturating rounding shift left" + arguments: ["a: {type[0]}", "b: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uqrshl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["u32", "i32", 's_u32'] + - ["u64", "i64", 'd_u64'] + compose: + - LLVMLink: + name: "vqrshl{type[2]}" + links: + - link: "llvm.aarch64.neon.uqrshl.{type[1]}" + arch: aarch64,arm64ec + + - name: "vqrshl{type[2]}" + doc: "Unsigned signed saturating rounding shift left" + arguments: ["a: {type[0]}", "b: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: 
[assert_instr, [uqrshl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["u8", "i8", "b_u8", uint8x8_t, int8x8_t, s8] + - ["u16", "i16", "h_u16", uint16x4_t, int16x4_t, s16] + compose: + - Let: [a, "{neon_type[3]}", {FnCall: ["vdup_n_{type[0]}", [a]]}] + - Let: [b, "{neon_type[4]}", {FnCall: ["vdup_n_{type[5]}", [b]]}] + - FnCall: [simd_extract!, [{FnCall: ["vqrshl_{type[0]}", [a, b]]}, '0']] + + - name: "vqrshrn{type[2]}" + doc: "Signed saturating rounded shift right narrow" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - ["i16", "i8", 'h_n_s16', 'N >= 1 && N <= 8', int16x8_t, q_n_s16] + - ["i32", "i16", 's_n_s32', 'N >= 1 && N <= 16', int32x4_t, q_n_s32] + - ["i64", "i32", 'd_n_s64', 'N >= 1 && N <= 32', int64x2_t, q_n_s64] + compose: + - FnCall: [static_assert!, ["{type[3]}"]] + - Let: [a, "{neon_type[4]}", {FnCall: ["vdup{type[5]}", [a]]}] + - FnCall: [simd_extract!, [{FnCall: ["vqrshrn_n{neon_type[4].noq}::", [a]]}, '0']] + + - name: "vqrshrn{type[3]}" + doc: "Signed saturating rounded shift right narrow" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrshrn2, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x8_t, int16x8_t, int8x16_t, '_high_n_s16', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]', 'N >= 1 && N <= 8'] + - [int16x4_t, int32x4_t, int16x8_t, '_high_n_s32', '[0, 1, 2, 3, 4, 5, 6, 7]', 'N >= 1 && N <= 16'] + - [int32x2_t, int64x2_t, int32x4_t, '_high_n_s64', '[0, 1, 2, 3]', 'N >= 1 && N <= 32'] + compose: + - FnCall: [static_assert!, ["{type[5]}"]] + - FnCall: [simd_shuffle!, [a, {FnCall: ["vqrshrn_n{neon_type[1].noq}::", [b]]}, "{type[4]}"]] + + - name: "vqrshrn{type[0]}" + doc: "Unsigned saturating rounded shift right narrow" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uqrshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [h_n_u16, u16, u8, 'N >= 1 && N <= 8', uint16x8_t, q_n_u16, _n_u16] + - [s_n_u32, u32, u16, 'N >= 1 && N <= 16', uint32x4_t, q_n_u32, _n_u32] + - [d_n_u64, u64, u32, 'N >= 1 && N <= 32', uint64x2_t, q_n_u64, _n_u64] + compose: + - FnCall: [static_assert!, ['{type[3]}']] + - Let: [a, "{neon_type[4]}", {FnCall: ["vdup{type[5]}", [a]]}] + - FnCall: [simd_extract!, [{FnCall: ["vqrshrn{type[6]}::", [a]]}, '0']] + + - name: "vqrshrn_high_n{neon_type[1].noq}" + doc: "Unsigned saturating rounded shift right narrow" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uqrshrn2, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [uint8x8_t, uint16x8_t, uint8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 
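The `vqrshrn*` entries above narrow with rounding: the scalar forms again broadcast, call the vector intrinsic with the same const generic `N`, and extract lane 0, and the `static_assert!` bounds pin `N` to the width of the destination element. A scalar sketch of the i16 -> i8 case (illustrative name; `N` mirrors the 1..=8 immediate):

// Saturating rounded shift right narrow: add 1 << (N - 1), shift, then clamp to i8.
fn sqrshrn_h_model<const N: u32>(a: i16) -> i8 {
    assert!((1..=8).contains(&N)); // the spec's static_assert!(N >= 1 && N <= 8)
    let rounded = ((a as i32) + (1 << (N - 1))) >> N;
    rounded.clamp(i8::MIN as i32, i8::MAX as i32) as i8
}
// sqrshrn_h_model::<2>(7) == 2              (7 / 4, rounded)
// sqrshrn_h_model::<1>(i16::MAX) == i8::MAX  (saturated)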
9, 10, 11, 12, 13, 14, 15]'] + - [uint16x4_t, uint32x4_t, uint16x8_t, 'N >= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [uint32x2_t, uint64x2_t, uint32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]'] + compose: + - FnCall: [static_assert!, ['{type[3]}']] + - FnCall: + - simd_shuffle! + - - a + - FnCall: + - "vqrshrn_n{neon_type[1].noq}::" + - - b + - "{type[4]}" + + - name: "vqrshrun{type[0]}" + doc: "Signed saturating rounded shift right unsigned narrow" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrshrun, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [h_n_s16, "i16", "u8", 'N >= 1 && N <= 8', int16x8_t, s16] + - [s_n_s32, "i32", "u16", 'N >= 1 && N <= 16', int32x4_t, s32] + - [d_n_s64, "i64", "u32", 'N >= 1 && N <= 32', int64x2_t, s64] + compose: + - FnCall: [static_assert!, ["{type[3]}"]] + - Let: + - a + - "{neon_type[4]}" + - FnCall: ["vdupq_n_{type[5]}", [a]] + - FnCall: + - simd_extract! + - - FnCall: + - "vqrshrun_n_{type[5]}::" + - - a + - '0' + + - name: "vqrshrun_high_n{neon_type[1].noq}" + doc: "Signed saturating rounded shift right unsigned narrow" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrshrun2, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [uint8x8_t, int16x8_t, uint8x16_t, 'N >= 1 && N <= 8', s16, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint16x4_t, int32x4_t, uint16x8_t, 'N >= 1 && N <= 16', s32, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [uint32x2_t, int64x2_t, uint32x4_t, 'N >= 1 && N <= 32', s64, '[0, 1, 2, 3]'] + compose: + - FnCall: [static_assert!, ["{type[3]}"]] + - FnCall: + - simd_shuffle! 
+ - - a + - FnCall: + - "vqrshrun_n_{type[4]}::" + - - b + - "{type[5]}" + + - name: "vqshld_{type}" + doc: "Signed saturating shift left" + arguments: ["a: {type}", "b: {type}"] + return_type: "{type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqshl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - i64 + compose: + - LLVMLink: + name: "vqshld{type}" + links: + - link: "llvm.aarch64.neon.sqshl.{type}" + arch: aarch64,arm64ec + + - name: "vqshl{type[0]}" + doc: "Signed saturating shift left" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqshl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [b_s8, "i8", int8x8_t] + - [h_s16, "i16", int16x4_t] + - [s_s32, "i32", int32x2_t] + compose: + - Let: + - c + - "{neon_type[2]}" + - FnCall: + - "vqshl{neon_type[2].noq}" + - - FnCall: ["vdup_n{neon_type[2].no}", [a]] + - FnCall: ["vdup_n{neon_type[2].no}", [b]] + - FnCall: [simd_extract!, [c, '0']] + + - name: "vqshl{type[0]}" + doc: "Signed saturating shift left" + arguments: ["a: {type[1]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqshl, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [b_n_s8, "i8", "3", s8] + - [h_n_s16, "i16", "4", s16] + - [s_n_s32, "i32", "5", s32] + - [d_n_s64, "i64", "6", s64] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[2]}"]] + - FnCall: + - simd_extract! + - - FnCall: + - "vqshl_n_{type[3]}::" + - - FnCall: ["vdup_n_{type[3]}", [a]] + - '0' + + - name: "vqshld_{type[0]}" + doc: "Unsigned saturating shift left" + arguments: ["a: {type[0]}", "b: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uqshl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["u64", "i64"] + compose: + - LLVMLink: + name: "vqshld{type[0]}" + links: + - link: "llvm.aarch64.neon.uqshl.{type[1]}" + arch: aarch64,arm64ec + + - name: "vqshl{type[0]}" + doc: "Unsigned saturating shift left" + arguments: ["a: {type[1]}", "b: {type[2]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uqshl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [b_u8, "u8", "i8", uint8x8_t, int8x8_t] + - [h_u16, "u16", "i16", uint16x4_t, int16x4_t] + - [s_u32, "u32", "i32", uint32x2_t, int32x2_t] + compose: + - Let: + - c + - "{neon_type[3]}" + - FnCall: + - "vqshl{neon_type[3].noq}" + - - FnCall: ["vdup{neon_type[3].N}", [a]] + - FnCall: ["vdup{neon_type[4].N}", [b]] + - FnCall: [simd_extract!, [c, '0']] + + - name: "vqshl{type[0]}" + doc: "Unsigned saturating shift left" + arguments: ["a: {type[1]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uqshl, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [b_n_u8, "u8", '3'] + - [h_n_u16, "u16", '4'] + - [s_n_u32, "u32", '5'] + - [d_n_u64, "u64", '6'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[2]}"]] + - FnCall: + - simd_extract! 
+ - - FnCall: ["vqshl_n_{type[1]}::", [{FnCall: ["vdup_n_{type[1]}", [a]]}]] + - '0' + + - name: "vqshrnd_n_s64" + doc: "Signed saturating shift right narrow" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - ["i64", "i32"] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 32']] + - LLVMLink: + name: "vqshrnd{type[1]}" + arguments: + - "a: {type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.sqshrn.{type[1]}" + arch: aarch64,arm64ec + - FnCall: ["_vqshrnd_n_s64", [a, N], [], true] + + - name: "vqshrn{type[0]}" + doc: "Signed saturating shift right narrow" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [h_n_s16, "i16", "i8", 'N >= 1 && N <= 8', s16] + - [s_n_s32, "i32", "i16", 'N >= 1 && N <= 16', s32] + compose: + - FnCall: [static_assert!, ["{type[3]}"]] + - FnCall: + - simd_extract! + - - FnCall: + - "vqshrn_n_{type[4]}::" + - - FnCall: ["vdupq_n_{type[4]}", [a]] + - '0' + + - name: "vqshrn{type[0]}" + doc: "Signed saturating shift right narrow" + arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}"] + return_type: "{neon_type[3]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqshrn2, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [_high_n_s16, int8x8_t, int16x8_t, int8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]', s16] + - [_high_n_s32, int16x4_t, int32x4_t, int16x8_t, 'N >= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]', s32] + - [_high_n_s64, int32x2_t, int64x2_t, int32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]', s64] + compose: + - FnCall: [static_assert!, ["{type[4]}"]] + - FnCall: + - simd_shuffle! 
+ - - a + - FnCall: ["vqshrn_n_{type[6]}::", [b]] + - "{type[5]}" + + - name: "vqshrnd_n_u64" + doc: "Unsigned saturating shift right narrow" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uqshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - ["u64", "u32"] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 32']] + - LLVMLink: + name: "vqshrnd_n_u64" + arguments: + - "a: u64" + - "n: i32" + links: + - link: "llvm.aarch64.neon.uqshrn.i32" + arch: aarch64,arm64ec + - FnCall: ["_vqshrnd_n_u64", ["a", N], [], true] + + - name: "vqshrn{type[0]}" + doc: "Unsigned saturating shift right narrow" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uqshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - ['h_n_u16', "u16", "u8", 'N >= 1 && N <= 8'] + - ['s_n_u32', "u32", "u16", 'N >= 1 && N <= 16'] + compose: + - FnCall: [static_assert!, ["{type[3]}"]] + - FnCall: + - "simd_extract!" + - - FnCall: + - "vqshrn_n_{type[1]}::" + - - FnCall: ["vdupq_n_{type[1]}", [a]] + - '0' + + - name: "vqshrn{type[0]}" + doc: "Unsigned saturating shift right narrow" + arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}"] + return_type: "{neon_type[3]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uqshrn2, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [_high_n_u16, uint8x8_t, uint16x8_t, uint8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [_high_n_u32, uint16x4_t, uint32x4_t, uint16x8_t, 'N >= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [_high_n_u64, uint32x2_t, uint64x2_t, uint32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]'] + compose: + - FnCall: [static_assert!, ["{type[4]}"]] + - FnCall: + - simd_shuffle! + - - a + - FnCall: ["vqshrn_n_{neon_type[2]}::", [b]] + - "{type[5]}" + + - name: "vqshrun{type[0]}" + doc: "Signed saturating shift right unsigned narrow" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqshrun, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [h_n_s16, "i16", "u8", 'N >= 1 && N <= 8', s16] + - [s_n_s32, "i32", "u16", 'N >= 1 && N <= 16', s32] + - [d_n_s64, "i64", "u32", 'N >= 1 && N <= 32', s64] + compose: + - FnCall: [static_assert!, ["{type[3]}"]] + - FnCall: + - simd_extract! 
+ - - FnCall: + - "vqshrun_n_{type[4]}::" + - - FnCall: ["vdupq_n_{type[4]}", [a]] + - '0' + + - name: "vqshrun_high_n_{neon_type[1]}" + doc: "Signed saturating shift right unsigned narrow" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqshrun2, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [uint8x8_t, int16x8_t, uint8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint16x4_t, int32x4_t, uint16x8_t, 'N >= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [uint32x2_t, int64x2_t, uint32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]'] + compose: + - FnCall: [static_assert!, ["{type[3]}"]] + - FnCall: + - simd_shuffle! + - - a + - FnCall: ["vqshrun_n_{neon_type[1]}::", [b]] + - "{type[4]}" + + - name: "vsqadd{type[0]}" + doc: "Unsigned saturating accumulate of signed value" + arguments: ["a: {type[1]}", "b: {type[2]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [usqadd]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [b_u8, "u8", "i8", s8] + - [h_u16, "u16", "i16", s16] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - "vsqadd_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - FnCall: ["vdup_n_{type[2]}", [b]] + - '0' + + - name: "vsqadd{type[0]}" + doc: "Unsigned saturating accumulate of signed value" + arguments: ["a: {type[1]}", "b: {type[2]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [usqadd]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [s_u32, "u32", "i32"] + - [d_u64, "u64", "i64"] + compose: + - LLVMLink: + name: "vsqadd{type[0]}" + links: + - link: "llvm.aarch64.neon.usqadd.{type[2]}" + arch: aarch64,arm64ec + + - name: "vsqrt{neon_type.no}" + doc: "Calculates the square root of each lane." + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fsqrt]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x1_t + - float64x2_t + compose: + - FnCall: [simd_fsqrt, [a]] + + - name: "vsqrt{neon_type.no}" + doc: "Calculates the square root of each lane." 
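The `vsqadd*` entries above accumulate a signed value into an unsigned one, saturating at both ends of the unsigned range; the 8- and 16-bit scalars go through the vector form, while the 32- and 64-bit scalars link to `llvm.aarch64.neon.usqadd` directly. A plain-Rust sketch (illustrative name):

// Unsigned saturating accumulate of signed value, byte-sized.
fn usqadd_b_model(a: u8, b: i8) -> u8 {
    ((a as i16) + (b as i16)).clamp(0, u8::MAX as i16) as u8
}
// usqadd_b_model(10, -20) == 0
// usqadd_b_model(250, 10) == 255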
+ arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fsqrt]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - FnCall: [simd_fsqrt, [a]] + + - name: "vsqrt{type[1]}{type[0]}" + doc: "Floating-point round to integral, using current rounding mode" + arguments: ["a: {type[0]}"] + return_type: "{type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fsqrt] + safety: safe + types: + - [f16, 'h_'] + compose: + - FnCall: [sqrtf16, [a], [], true] + + - name: "vrsqrts{type[0]}" + doc: "Floating-point reciprocal square root step" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [frsqrts]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [_f64, float64x1_t, v1f64] + - [q_f64, float64x2_t, v2f64] + compose: + - LLVMLink: + name: "vrsqrts{type[0]}" + links: + - link: "llvm.aarch64.neon.frsqrts.{type[2]}" + arch: aarch64,arm64ec + + - name: "vrsqrts{type[0]}" + doc: "Floating-point reciprocal square root step" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [frsqrts]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [s_f32, "f32"] + - [d_f64, "f64"] + compose: + - LLVMLink: + name: "vrsqrts{type[0]}" + links: + - link: "llvm.aarch64.neon.frsqrts.{type[1]}" + arch: aarch64,arm64ec + + + - name: "vrsqrts{type[0]}" + doc: "Floating-point reciprocal square root step" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[1]}" + attr: + - *neon-fp16 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [frsqrts]]}]] + - *neon-unstable-f16 + safety: safe + types: + - [h_f16, "f16"] + compose: + - LLVMLink: + name: "vrsqrts{type[0]}" + links: + - link: "llvm.aarch64.neon.frsqrts.{type[1]}" + arch: aarch64,arm64ec + + + - name: "vrecpe{type[0]}" + doc: "Reciprocal estimate." + arguments: ["a: {type[1]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [frecpe]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [_f64, float64x1_t, v1f64] + - [q_f64, float64x2_t, v2f64] + compose: + - LLVMLink: + name: "vrecpe{type[0]}" + links: + - link: "llvm.aarch64.neon.frecpe.{type[2]}" + arch: aarch64,arm64ec + + - name: "vrecpe{type[0]}" + doc: "Reciprocal estimate." + arguments: ["a: {type[1]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [frecpe]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [s_f32, "f32"] + - [d_f64, "f64"] + compose: + - LLVMLink: + name: "vrecpe{type[0]}" + links: + - link: "llvm.aarch64.neon.frecpe.{type[1]}" + arch: aarch64,arm64ec + + + - name: "vrecpe{type[0]}" + doc: "Reciprocal estimate." 
+ arguments: ["a: {type[1]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [frecpe]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [h_f16, "f16"] + compose: + - LLVMLink: + name: "vrecpe{type[0]}" + links: + - link: "llvm.aarch64.neon.frecpe.{type[1]}" + arch: aarch64,arm64ec + + + - name: "vrecps{type[0]}" + doc: "Floating-point reciprocal step" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [frecps]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [_f64, float64x1_t, v1f64] + - [q_f64, float64x2_t, v2f64] + compose: + - LLVMLink: + name: "vrecps{type[0]}" + links: + - link: "llvm.aarch64.neon.frecps.{type[2]}" + arch: aarch64,arm64ec + + - name: "vrecps{type[0]}" + doc: "Floating-point reciprocal step" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [frecps]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [s_f32, "f32"] + - [d_f64, "f64"] + compose: + - LLVMLink: + name: "vrecps{type[0]}" + links: + - link: "llvm.aarch64.neon.frecps.{type[1]}" + arch: aarch64,arm64ec + + + - name: "vrecps{type[0]}" + doc: "Floating-point reciprocal step" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [frecps]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [h_f16, "f16"] + compose: + - LLVMLink: + name: "vrecps{type[0]}" + links: + - link: "llvm.aarch64.neon.frecps.{type[1]}" + arch: aarch64,arm64ec + + + - name: "vrecpx{type[0]}" + doc: "Floating-point reciprocal exponent" + arguments: ["a: {type[1]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [frecpx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [s_f32, "f32"] + - [d_f64, "f64"] + compose: + - LLVMLink: + name: "vrecpxs{type[0]}" + links: + - link: "llvm.aarch64.neon.frecpx.{type[1]}" + arch: aarch64,arm64ec + + + - name: "vrecpx{type[0]}" + doc: "Floating-point reciprocal exponent" + arguments: ["a: {type[1]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [frecpx]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [h_f16, "f16"] + compose: + - LLVMLink: + name: "vrecpxs{type[0]}" + links: + - link: "llvm.aarch64.neon.frecpx.{type[1]}" + arch: aarch64,arm64ec + + + - name: "vreinterpret{neon_type[1].no}{neon_type[0].noq}" + doc: Vector reinterpret cast operation + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: [*neon-stable] + assert_instr: [nop] + safety: safe + types: + - [poly64x1_t, int64x1_t] + - [poly64x1_t, uint64x1_t] + - [int64x1_t, poly64x1_t] + - [uint64x1_t, poly64x1_t] + - [poly64x2_t, int64x2_t] + - [poly64x2_t, uint64x2_t] + - [int64x2_t, poly64x2_t] + - [uint64x2_t, poly64x2_t] + - [float64x1_t, int8x8_t] + - [float64x1_t, int16x4_t] + - [float64x1_t, int32x2_t] + - [float64x1_t, int64x1_t] + - [float64x2_t, int8x16_t] + - [float64x2_t, int16x8_t] + - [float64x2_t, int32x4_t] + - [float64x2_t, int64x2_t] + - [float64x1_t, uint8x8_t] + - [float64x1_t, uint16x4_t] + - [float64x1_t, uint32x2_t] + - [float64x1_t, uint64x1_t] + - [float64x2_t, uint8x16_t] + - [float64x2_t, uint16x8_t] 
+ - [float64x2_t, uint32x4_t] + - [float64x2_t, uint64x2_t] + - [float64x1_t, poly8x8_t] + - [float64x1_t, poly16x4_t] + - [float32x2_t, poly64x1_t] + - [float64x1_t, poly64x1_t] + - [float64x2_t, poly8x16_t] + - [float64x2_t, poly16x8_t] + - [float32x4_t, poly64x2_t] + - [float64x2_t, poly64x2_t] + - [float64x2_t, p128] + - [int8x8_t, float64x1_t] + - [int16x4_t, float64x1_t] + - [int32x2_t, float64x1_t] + - [int64x1_t, float64x1_t] + - [int8x16_t, float64x2_t] + - [int16x8_t, float64x2_t] + - [int32x4_t, float64x2_t] + - [int64x2_t, float64x2_t] + - [poly8x8_t, float64x1_t] + - [uint16x4_t, float64x1_t] + - [uint32x2_t, float64x1_t] + - [uint64x1_t, float64x1_t] + - [poly8x16_t, float64x2_t] + - [uint16x8_t, float64x2_t] + - [uint32x4_t, float64x2_t] + - [uint64x2_t, float64x2_t] + - [uint8x8_t, float64x1_t] + - [poly16x4_t, float64x1_t] + - [poly64x1_t, float64x1_t] + - [poly64x1_t, float32x2_t] + - [uint8x16_t, float64x2_t] + - [poly16x8_t, float64x2_t] + - [poly64x2_t, float64x2_t] + - [poly64x2_t, float32x4_t] + - [p128, float64x2_t] + - [float32x2_t, float64x1_t] + - [float64x1_t, float32x2_t] + - [float32x4_t, float64x2_t] + - [float64x2_t, float32x4_t] + compose: + - FnCall: [transmute, [a]] + + + - name: "vreinterpret{neon_type[1].no}{neon_type[0].noq}" + doc: Vector reinterpret cast operation + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [nop] + safety: safe + types: + - [float64x1_t, float16x4_t] + - [float16x4_t, float64x1_t] + # q + - [float64x2_t, float16x8_t] + - [float16x8_t, float64x2_t] + compose: + - FnCall: [transmute, [a]] + + + - name: "vrshld_s64" + doc: "Signed rounding shift left" + arguments: ["a: {type}", "b: {type}"] + return_type: "{type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [srshl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - "i64" + compose: + - LLVMLink: + name: "vrshld_{type}" + links: + - link: "llvm.aarch64.neon.srshl.{type}" + arch: aarch64,arm64ec + + - name: "vrshld_{type[0]}" + doc: "Unsigned rounding shift left" + arguments: ["a: {type[0]}", "b: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [urshl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["u64", "i64"] + compose: + - LLVMLink: + name: "vrshld_{type[0]}" + links: + - link: "llvm.aarch64.neon.urshl.{type[1]}" + arch: aarch64,arm64ec + + - name: "vrshrd_n_s64" + doc: "Signed rounding shift right" + arguments: ["a: {type[0]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [srshr, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - ["i64", 'N >= 1 && N <= 64', '-N as i64'] + compose: + - FnCall: [static_assert!, ["{type[1]}"]] + - FnCall: [vrshld_s64, [a, "{type[2]}"]] + + - name: "vrshrd_n_u64" + doc: "Unsigned rounding shift right" + arguments: ["a: {type}"] + return_type: "{type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [urshr, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - "u64" + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 64']] + - FnCall: ["vrshld_u64", [a, 
'-N as i64']] + + - name: "vrshrn_high_n_{neon_type[1]}" + doc: "Rounding shift right narrow" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [rshrn2, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x8_t, int16x8_t, int8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [int16x4_t, int32x4_t, int16x8_t, 'N >= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [int32x2_t, int64x2_t, int32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]'] + - [uint8x8_t, uint16x8_t, uint8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint16x4_t, uint32x4_t, uint16x8_t, 'N >= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [uint32x2_t, uint64x2_t, uint32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]'] + compose: + - FnCall: [static_assert!, ["{type[3]}"]] + - FnCall: + - simd_shuffle! + - - a + - FnCall: ["vrshrn_n_{neon_type[1]}::", [b]] + - "{type[4]}" + + - name: "vrsubhn_high_{neon_type[1]}" + doc: "Rounding subtract returning high narrow" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[3]}" + attr: + - *little-endian + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [rsubhn2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, int16x8_t, int16x8_t, int8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [int16x4_t, int32x4_t, int32x4_t, int16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [int32x2_t, int64x2_t, int64x2_t, int32x4_t, '[0, 1, 2, 3]'] + - [uint8x8_t, uint16x8_t, uint16x8_t, uint8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint16x4_t, uint32x4_t, uint32x4_t, uint16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [uint32x2_t, uint64x2_t, uint64x2_t, uint32x4_t, '[0, 1, 2, 3]'] + compose: + - Let: + - x + - "{neon_type[0]}" + - FnCall: ["vrsubhn_{neon_type[1]}", [b, c]] + - FnCall: [simd_shuffle!, [a, x, "{type[4]}"]] + + - name: "vrsubhn_high_{neon_type[1]}" + doc: "Rounding subtract returning high narrow" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[3]}" + attr: + - *big-endian + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [rsubhn]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, int16x8_t, int16x8_t, int8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [int16x4_t, int32x4_t, int32x4_t, int16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [int32x2_t, int64x2_t, int64x2_t, int32x4_t, '[0, 1, 2, 3]'] + - [uint8x8_t, uint16x8_t, uint16x8_t, uint8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint16x4_t, uint32x4_t, uint32x4_t, uint16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [uint32x2_t, uint64x2_t, uint64x2_t, uint32x4_t, '[0, 1, 2, 3]'] + compose: + - Let: + - x + - "{neon_type[0]}" + - FnCall: ["vrsubhn_{neon_type[1]}", [b, c]] + - FnCall: [simd_shuffle!, [a, x, "{type[4]}"]] + + - name: "vcopy{neon_type[0].lane_nox}" + doc: "Insert vector element from another vector element" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [mov, 'LANE1 = 0', 'LANE2 = 1']]}]] + - FnCall: 
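The rounding right-shift entries above exploit the fact that `srshl`/`urshl` take a signed shift amount: `vrshrd_n_s64` and `vrshrd_n_u64` simply forward `-N as i64` to the rounding left shift. Semantically that is a shift right by `N` with a rounding bias added first; a scalar sketch (illustrative name, computed in 128 bits so `N == 64` is well-defined):

// Rounding shift right of an i64 by a 1..=64 immediate.
fn rshr_d_model<const N: u32>(a: i64) -> i64 {
    assert!((1..=64).contains(&N)); // static_assert!(N >= 1 && N <= 64)
    (((a as i128) + (1i128 << (N - 1))) >> N) as i64
}
// rshr_d_model::<2>(7) == 2
// rshr_d_model::<2>(-7) == -2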
[rustc_legacy_const_generics, ['1', '3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE1: i32, const LANE2: i32'] + safety: safe + types: + - [int8x8_t, int8x8_t, int8x8_t, '3', '3', ' unsafe { match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [int16x4_t, int16x4_t, int16x4_t, '2', '2', ' unsafe { match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [int32x2_t, int32x2_t, int32x2_t, '1', '1', ' unsafe { match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [uint8x8_t, uint8x8_t, uint8x8_t, '3', '3', ' unsafe { match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [uint16x4_t, uint16x4_t, uint16x4_t, '2', '2', ' unsafe { match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [uint32x2_t, uint32x2_t, uint32x2_t, '1', '1', ' unsafe { match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [poly8x8_t, poly8x8_t, poly8x8_t, '3', '3', ' unsafe { match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [poly16x4_t, poly16x4_t, poly16x4_t, '2', '2', ' unsafe { match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as 
u32]), _ => unreachable_unchecked(), } }'] + - [float32x2_t, float32x2_t, float32x2_t, '1', '1', ' unsafe { match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE1, '{type[3]}']] + - FnCall: [static_assert_uimm_bits!, [LANE2, '{type[4]}']] + - Identifier: ["{type[5]}", Symbol] + + - name: "vcopy{neon_type[0].lane_nox}" + doc: "Insert vector element from another vector element" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [mov, 'LANE1 = 0', 'LANE2 = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['1', '3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE1: i32, const LANE2: i32'] + safety: safe + types: + - [int8x16_t, int8x8_t, int8x16_t, '4', '3', ' let b: int8x16_t = unsafe { simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) };', 'unsafe { match LANE1 & 0b1111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]), 8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]), 9 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]), 10 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]), 11 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]), 12 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]), 13 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]), 14 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]), 15 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [int16x8_t, int16x4_t, int16x8_t, '3', '2', ' let b: int16x8_t = unsafe { simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]) };', 'unsafe { match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [int32x4_t, int32x2_t, int32x4_t, '2', '1', ' let b: int32x4_t = unsafe { simd_shuffle!(b, b, 
[0, 1, 2, 3]) };', 'unsafe { match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [uint8x16_t, uint8x8_t, uint8x16_t, '4', '3', ' let b: uint8x16_t = unsafe { simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) };', 'unsafe { match LANE1 & 0b1111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]), 8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]), 9 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]), 10 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]), 11 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]), 12 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]), 13 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]), 14 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]), 15 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [uint16x8_t, uint16x4_t, uint16x8_t, '3', '2', ' let b: uint16x8_t = unsafe { simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]) };', 'unsafe { match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [uint32x4_t, uint32x2_t, uint32x4_t, '2', '1', ' let b: uint32x4_t = unsafe { simd_shuffle!(b, b, [0, 1, 2, 3]) };', 'unsafe { match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [poly8x16_t, poly8x8_t, poly8x16_t, '4', '3', ' let b: poly8x16_t = unsafe { simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) };', 'unsafe { match LANE1 & 0b1111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => 
simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]), 8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]), 9 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]), 10 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]), 11 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]), 12 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]), 13 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]), 14 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]), 15 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [poly16x8_t, poly16x4_t, poly16x8_t, '3', '2', ' let b: poly16x8_t = unsafe { simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]) };', 'unsafe { match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE1, '{type[3]}']] + - FnCall: [static_assert_uimm_bits!, [LANE2, '{type[4]}']] + - Identifier: ["{type[5]}", Symbol] + - Identifier: ["{type[6]}", Symbol] + + - name: "vcopy{neon_type[0].laneq_nox}" + doc: "Insert vector element from another vector element" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [mov, 'LANE1 = 0', 'LANE2 = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['1', '3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE1: i32, const LANE2: i32'] + safety: safe + types: + - [int8x16_t, int8x16_t, int8x16_t, '4', '4', ' unsafe { match LANE1 & 0b1111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 
16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]), 8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]), 9 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]), 10 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]), 11 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]), 12 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]), 13 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]), 14 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]), 15 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [int16x8_t, int16x8_t, int16x8_t, '3', '3', ' unsafe { match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [int32x4_t, int32x4_t, int32x4_t, '2', '2', ' unsafe { match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [int64x2_t, int64x2_t, int64x2_t, '1', '1', ' unsafe { match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [uint8x16_t, uint8x16_t, uint8x16_t, '4', '4', ' unsafe { match LANE1 & 0b1111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]), 8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]), 9 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]), 10 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]), 11 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]), 12 => 
simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]), 13 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]), 14 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]), 15 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [uint16x8_t, uint16x8_t, uint16x8_t, '3', '3', ' unsafe { match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [uint32x4_t, uint32x4_t, uint32x4_t, '2', '2', ' unsafe { match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [uint64x2_t, uint64x2_t, uint64x2_t, '1', '1', ' unsafe { match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [poly8x16_t, poly8x16_t, poly8x16_t, '4', '4', ' unsafe { match LANE1 & 0b1111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]), 8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]), 9 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]), 10 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]), 11 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]), 12 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]), 13 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]), 14 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]), 15 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [poly16x8_t, poly16x8_t, poly16x8_t, '3', '3', ' unsafe { match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => 
simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [poly64x2_t, poly64x2_t, poly64x2_t, '1', '1', ' unsafe { match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [float32x4_t, float32x4_t, float32x4_t, '2', '2', ' unsafe { match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [float64x2_t, float64x2_t, float64x2_t, '1', '1', ' unsafe { match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE1, '{type[3]}']] + - FnCall: [static_assert_uimm_bits!, [LANE2, '{type[4]}']] + - Identifier: ["{type[5]}", Symbol] + + - name: "vcopy{neon_type[0].laneq_nox}" + doc: "Insert vector element from another vector element" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [mov, 'LANE1 = 0', 'LANE2 = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['1', '3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE1: i32, const LANE2: i32'] + safety: safe + types: + - [int8x8_t, int8x16_t, int8x8_t, '3', '4', ' let a: int8x16_t = unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) };', 'unsafe { match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [int16x4_t, int16x8_t, int16x4_t, '2', '3', ' let a: int16x8_t = unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) };', 'unsafe { match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [int32x2_t, int32x4_t, int32x2_t, '1', '2', ' let a: int32x4_t = unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) };', 'unsafe { match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [uint8x8_t, uint8x16_t, uint8x8_t, '3', '4', ' let a: uint8x16_t = unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) };', 'unsafe { match LANE1 & 0b111 { 0 => 
simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [uint16x4_t, uint16x8_t, uint16x4_t, '2', '3', ' let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) };', 'unsafe { match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [uint32x2_t, uint32x4_t, uint32x2_t, '1', '2', 'let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) };', 'unsafe { match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [poly8x8_t, poly8x16_t, poly8x8_t, '3', '4', ' let a: poly8x16_t = unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) };', 'unsafe { match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [poly16x4_t, poly16x8_t, poly16x4_t, '2', '3', ' let a: poly16x8_t = unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) };', 'unsafe { match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [float32x2_t, float32x4_t, float32x2_t, '1', '2', ' let a: float32x4_t = unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) };', 'unsafe { match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE1, '{type[3]}']] + - FnCall: [static_assert_uimm_bits!, [LANE2, '{type[4]}']] + - Identifier: ["{type[5]}", Symbol] + - Identifier: ["{type[6]}", Symbol] + + - name: "vcopyq_lane_{neon_type[0]}" + doc: "Insert vector element from another vector element" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [mov, 'LANE1 = 1', 'LANE2 = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['1', '3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE1: i32, const LANE2: i32'] + safety: safe + types: + - [int64x2_t, int64x1_t, ' let b: int64x2_t = unsafe { simd_shuffle!(b, b, [0, 1]) };', 
'unsafe { match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [uint64x2_t, uint64x1_t, ' let b: uint64x2_t = unsafe { simd_shuffle!(b, b, [0, 1]) };', 'unsafe { match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [poly64x2_t, poly64x1_t, ' let b: poly64x2_t = unsafe { simd_shuffle!(b, b, [0, 1]) };', 'unsafe { match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + - [float64x2_t, float64x1_t, ' let b: float64x2_t = unsafe { simd_shuffle!(b, b, [0, 1]) };', 'unsafe { match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE1, '1']] + - FnCall: [static_assert!, ['LANE2 == 0']] + - Identifier: ['{type[2]}', Symbol] + - Identifier: ['{type[3]}', Symbol] + + - name: "vcopyq_lane_f32" + doc: "Insert vector element from another vector element" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [mov, 'LANE1 = 1', 'LANE2 = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['1', '3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE1: i32, const LANE2: i32'] + safety: safe + types: + - [float32x4_t, float32x2_t, ' let b: float32x4_t = unsafe { simd_shuffle!(b, b, [0, 1, 2, 3]) };', 'unsafe { match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), } }'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE1, 2]] + - FnCall: [static_assert_uimm_bits!, [LANE2, 1]] + - Identifier: ["{type[2]}", Symbol] + - Identifier: ["{type[3]}", Symbol] + + - name: "vcreate_f64" + doc: "Insert vector element from another vector element" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["u64", float64x1_t] + compose: + - FnCall: [transmute, [a]] + + - name: "vset_lane_f64" + doc: "Insert vector element from another vector element" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["f64", float64x1_t, float64x1_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: [simd_insert!, [b, 'LANE as u32', a]] + + - name: "vsetq_lane_f64" + doc: "Insert vector element from another vector element" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: 
i32'] + safety: safe + types: + - ["f64", float64x2_t, float64x2_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '1']] + - FnCall: [simd_insert!, [b, 'LANE as u32', a]] + + - name: "vshld_s64" + doc: "Signed Shift left" + arguments: ["a: {type}", "b: {type}"] + return_type: "{type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sshl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - "i64" + compose: + - FnCall: + - transmute + - - FnCall: + - vshl_s64 + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vshld_{type[0]}" + doc: "Unsigned Shift left" + arguments: ["a: {type[0]}", "b: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ushl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["u64", "i64"] + compose: + - FnCall: + - transmute + - - FnCall: + - vshl_u64 + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vshll_high_n_{neon_type[0]}" + doc: "Signed shift left long" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sshll2, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x16_t, int16x8_t, int8x8_t, 'N >= 0 && N <= 8', '[8, 9, 10, 11, 12, 13, 14, 15]'] + - [int16x8_t, int32x4_t, int16x4_t, 'N >= 0 && N <= 16', '[4, 5, 6, 7]'] + - [int32x4_t, int64x2_t, int32x2_t, 'N >= 0 && N <= 32', '[2, 3]'] + compose: + - FnCall: [static_assert!, ["{type[3]}"]] + - Let: [b, "{neon_type[2]}", {FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]}] + - FnCall: ["vshll_n_{neon_type[2]}::", [b]] + + - name: "vshll_high_n_{neon_type[0]}" + doc: "Signed shift left long" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ushll2, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [uint8x16_t, uint16x8_t, uint8x8_t, 'N >= 0 && N <= 8', '[8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint16x8_t, uint32x4_t, uint16x4_t, 'N >= 0 && N <= 16', '[4, 5, 6, 7]'] + - [uint32x4_t, uint64x2_t, uint32x2_t, 'N >= 0 && N <= 32', '[2, 3]'] + compose: + - FnCall: [static_assert!, ["{type[3]}"]] + - Let: [b, "{neon_type[2]}", {FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]}] + - FnCall: ["vshll_n_{neon_type[2]}::", [b]] + + - name: "vshrn_high_n_{neon_type[1]}" + doc: "Shift right narrow" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [shrn2, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x8_t, int16x8_t, int8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [int16x4_t, int32x4_t, int16x8_t, 'N >= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [int32x2_t, int64x2_t, int32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]'] + - [uint8x8_t, uint16x8_t, uint8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint16x4_t, uint32x4_t, uint16x8_t, 'N 
>= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [uint32x2_t, uint64x2_t, uint32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]'] + compose: + - FnCall: [static_assert!, ["{type[3]}"]] + - FnCall: + - simd_shuffle! + - - a + - FnCall: ["vshrn_n_{neon_type[1]}::", [b]] + - "{type[4]}" + + - name: "vsm3partw1{neon_type.no}" + doc: "SM3PARTW1" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,sm4"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sm3partw1]]}]] + - FnCall: [unstable, ['feature = "stdarch_neon_sm4"', 'issue = "117226"']] + safety: safe + types: + - uint32x4_t + compose: + - LLVMLink: + name: llvm.aarch64.crypto.sm3partw1 + links: + - link: "llvm.aarch64.crypto.sm3partw1" + arch: aarch64,arm64ec + + - name: "vsm3partw2{neon_type.no}" + doc: "SM3PARTW2" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,sm4"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sm3partw2]]}]] + - FnCall: [unstable, ['feature = "stdarch_neon_sm4"', 'issue = "117226"']] + safety: safe + types: + - uint32x4_t + compose: + - LLVMLink: + name: llvm.aarch64.crypto.sm3partw2 + links: + - link: "llvm.aarch64.crypto.sm3partw2" + arch: aarch64,arm64ec + + - name: "vsm3ss1{neon_type.no}" + doc: "SM3SS1" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,sm4"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sm3ss1]]}]] + - FnCall: [unstable, ['feature = "stdarch_neon_sm4"', 'issue = "117226"']] + safety: safe + types: + - uint32x4_t + compose: + - LLVMLink: + name: llvm.aarch64.crypto.sm3ss1 + links: + - link: "llvm.aarch64.crypto.sm3ss1" + arch: aarch64,arm64ec + + - name: "vsm4ekey{neon_type.no}" + doc: "SM4 key" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,sm4"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sm4ekey]]}]] + - FnCall: [unstable, ['feature = "stdarch_neon_sm4"', 'issue = "117226"']] + safety: safe + types: + - uint32x4_t + compose: + - LLVMLink: + name: llvm.aarch64.crypto.sm4ekey + links: + - link: "llvm.aarch64.crypto.sm4ekey" + arch: aarch64,arm64ec + + - name: "vsm4e{neon_type.no}" + doc: "SM4 encode" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,sm4"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sm4e]]}]] + - FnCall: [unstable, ['feature = "stdarch_neon_sm4"', 'issue = "117226"']] + safety: safe + types: + - uint32x4_t + compose: + - LLVMLink: + name: llvm.aarch64.crypto.sm4e + links: + - link: "llvm.aarch64.crypto.sm4e" + arch: aarch64,arm64ec + + - name: "vrax1{neon_type.no}" + doc: "Rotate and exclusive OR" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,sha3"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [rax1]]}]] + - FnCall: [stable, ['feature = "stdarch_neon_sha3"', 'since = "1.79.0"']] + safety: safe + types: + - uint64x2_t + compose: + - LLVMLink: + name: llvm.aarch64.crypto.rax1 + links: + - link: "llvm.aarch64.crypto.rax1" + arch: aarch64,arm64ec + + - name: "vsha512h{neon_type.no}" + doc: "SHA512 hash update part 1" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: 
{neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,sha3"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sha512h]]}]] + - FnCall: [stable, ['feature = "stdarch_neon_sha3"', 'since = "1.79.0"']] + safety: safe + types: + - uint64x2_t + compose: + - LLVMLink: + name: llvm.aarch64.crypto.sha512h + links: + - link: "llvm.aarch64.crypto.sha512h" + arch: aarch64,arm64ec + + - name: "vsha512h2{neon_type.no}" + doc: "SHA512 hash update part 2" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,sha3"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sha512h2]]}]] + - FnCall: [stable, ['feature = "stdarch_neon_sha3"', 'since = "1.79.0"']] + safety: safe + types: + - uint64x2_t + compose: + - LLVMLink: + name: llvm.aarch64.crypto.sha512h2 + links: + - link: "llvm.aarch64.crypto.sha512h2" + arch: aarch64,arm64ec + + - name: "vsha512su0{neon_type.no}" + doc: "SHA512 schedule update 0" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,sha3"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sha512su0]]}]] + - FnCall: [stable, ['feature = "stdarch_neon_sha3"', 'since = "1.79.0"']] + safety: safe + types: + - uint64x2_t + compose: + - LLVMLink: + name: llvm.aarch64.crypto.sha512su0 + links: + - link: "llvm.aarch64.crypto.sha512su0" + arch: aarch64,arm64ec + + - name: "vsha512su1{neon_type.no}" + doc: "SHA512 schedule update 1" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,sha3"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sha512su1]]}]] + - FnCall: [stable, ['feature = "stdarch_neon_sha3"', 'since = "1.79.0"']] + safety: safe + types: + - uint64x2_t + compose: + - LLVMLink: + name: llvm.aarch64.crypto.sha512su1 + links: + - link: "llvm.aarch64.crypto.sha512su1" + arch: aarch64,arm64ec + + - name: "vsm3tt{type[0]}" + doc: "{type[3]}" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [target_feature, ['enable = "neon,sm4"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, ['{type[2]}', 'IMM2 = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [unstable, ['feature = "stdarch_neon_sm4"', 'issue = "117226"']] + static_defs: ["const IMM2: i32"] + safety: safe + types: + - ['1aq_u32', uint32x4_t, 'sm3tt1a', 'SM3TT1A'] + - ['1bq_u32', uint32x4_t, 'sm3tt1b', 'SM3TT1B'] + - ['2aq_u32', uint32x4_t, 'sm3tt2a', 'SM3TT2A'] + - ['2bq_u32', uint32x4_t, 'sm3tt2b', 'SM3TT2B'] + compose: + - FnCall: ["static_assert_uimm_bits!", [IMM2, "2"]] + - LLVMLink: + name: "_vsm3tt{type[0]}" + arguments: + - "a: {neon_type[1]}" + - "b: {neon_type[1]}" + - "c: {neon_type[1]}" + - "n: i64" + links: + - link: "llvm.aarch64.crypto.{type[2]}" + arch: aarch64,arm64ec + - FnCall: + - "_vsm3tt{type[0]}" + - - "a" + - "b" + - "c" + - "IMM2 as i64" + - [] + - true + + - name: "vxarq_u64" + doc: "Exclusive OR and rotate" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,sha3"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, ['xar', 'IMM6 = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "stdarch_neon_sha3"', 'since = "1.79.0"']] + static_defs: 
["const IMM6: i32"] + safety: safe + types: + - uint64x2_t + compose: + - FnCall: ["static_assert_uimm_bits!", [IMM6, "6"]] + - LLVMLink: + name: "_vxarq_u64" + arguments: + - "a: {neon_type}" + - "b: {neon_type}" + - "n: i64" + links: + - link: "llvm.aarch64.crypto.xar" + arch: aarch64,arm64ec + - FnCall: + - "_vxarq_u64" + - - "a" + - "b" + - "IMM6 as i64" + - [] + - true + + - name: "vrnd32x{neon_type.no}" + doc: "Floating-point round to 32-bit integer, using current rounding mode" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,frintts"']] + - FnCall: [unstable, ['feature = "stdarch_neon_ftts"', 'issue = "117227"']] + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [frint32x]]}]] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x2_t + compose: + - LLVMLink: + name: "vrnd32x{neon_type.no}" + links: + - link: "llvm.aarch64.neon.frint32x.{neon_type}" + arch: aarch64,arm64ec + + - name: "vrnd32x{neon_type.no}" + doc: "Floating-point round to 32-bit integer, using current rounding mode" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,frintts"']] + - FnCall: [unstable, ['feature = "stdarch_neon_ftts"', 'issue = "117227"']] + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [frint32x]]}]] + safety: safe + types: + - float64x1_t + compose: + - LLVMLink: + name: "vrnd32x{neon_type.no}" + arguments: + - "a: f64" + return_type: "f64" + links: + - link: "llvm.aarch64.frint32x.f64" + arch: aarch64,arm64ec + - FnCall: + - transmute + - - FnCall: + - _vrnd32x_f64 + - - FnCall: [simd_extract!, [a, 0]] + + - name: "vrnd32z{neon_type.no}" + doc: "Floating-point round to 32-bit integer toward zero" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,frintts"']] + - FnCall: [unstable, ['feature = "stdarch_neon_ftts"', 'issue = "117227"']] + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [frint32z]]}]] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x2_t + compose: + - LLVMLink: + name: "vrnd32z{neon_type.no}" + links: + - link: "llvm.aarch64.neon.frint32z.{neon_type}" + arch: aarch64,arm64ec + + - name: "vrnd32z{neon_type.no}" + doc: "Floating-point round to 32-bit integer toward zero" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,frintts"']] + - FnCall: [unstable, ['feature = "stdarch_neon_ftts"', 'issue = "117227"']] + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [frint32z]]}]] + safety: safe + types: + - float64x1_t + compose: + - LLVMLink: + name: "vrnd32z{neon_type.no}" + arguments: + - "a: f64" + return_type: "f64" + links: + - link: "llvm.aarch64.frint32z.f64" + arch: aarch64,arm64ec + - FnCall: + - transmute + - - FnCall: [_vrnd32z_f64, [{FnCall: [simd_extract!, [a, 0]]}]] + + - name: "vrnd64x{neon_type.no}" + doc: "Floating-point round to 64-bit integer, using current rounding mode" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,frintts"']] + - FnCall: [unstable, ['feature = "stdarch_neon_ftts"', 'issue = "117227"']] + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: 
[not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [frint64x]]}]] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x2_t + compose: + - LLVMLink: + name: "vrnd64x{neon_type.no}" + links: + - link: "llvm.aarch64.neon.frint64x.{neon_type}" + arch: aarch64,arm64ec + + - name: "vrnd64x{neon_type.no}" + doc: "Floating-point round to 64-bit integer, using current rounding mode" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,frintts"']] + - FnCall: [unstable, ['feature = "stdarch_neon_ftts"', 'issue = "117227"']] + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [frint64x]]}]] + safety: safe + types: + - float64x1_t + compose: + - LLVMLink: + name: "vrnd64x{neon_type.no}" + arguments: + - "a: f64" + return_type: "f64" + links: + - link: "llvm.aarch64.frint64x.f64" + arch: aarch64,arm64ec + - FnCall: + - transmute + - - FnCall: [_vrnd64x_f64, [{FnCall: [simd_extract!, [a, 0]]}]] + + - name: "vrnd64z{neon_type.no}" + doc: "Floating-point round to 64-bit integer toward zero" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,frintts"']] + - FnCall: [unstable, ['feature = "stdarch_neon_ftts"', 'issue = "117227"']] + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [frint64z]]}]] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x2_t + compose: + - LLVMLink: + name: "vrnd64z{neon_type.no}" + links: + - link: "llvm.aarch64.neon.frint64z.{neon_type}" + arch: aarch64,arm64ec + + - name: "vrnd64z{neon_type.no}" + doc: "Floating-point round to 64-bit integer toward zero" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,frintts"']] + - FnCall: [unstable, ['feature = "stdarch_neon_ftts"', 'issue = "117227"']] + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [frint64z]]}]] + safety: safe + types: + - float64x1_t + compose: + - LLVMLink: + name: "vrnd64z{neon_type.no}" + arguments: + - "a: f64" + return_type: "f64" + links: + - link: "llvm.aarch64.frint64z.f64" + arch: aarch64,arm64ec + - FnCall: + - transmute + - - FnCall: [_vrnd64z_f64, [{FnCall: [simd_extract!, [a, 0]]}]] + + - name: "vtrn1{neon_type[0].no}" + doc: Transpose vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-stable + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [trn1]]}]] + safety: safe + types: + - [int8x8_t, '[0, 8, 2, 10, 4, 12, 6, 14]'] + - [int8x16_t, '[0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]'] + - [int16x4_t, '[0, 4, 2, 6]'] + - [int16x8_t, '[0, 8, 2, 10, 4, 12, 6, 14]'] + - [int32x4_t, '[0, 4, 2, 6]'] + - [uint8x8_t, '[0, 8, 2, 10, 4, 12, 6, 14]'] + - [uint8x16_t, '[0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]'] + - [uint16x4_t, '[0, 4, 2, 6]'] + - [uint16x8_t, '[0, 8, 2, 10, 4, 12, 6, 14]'] + - [uint32x4_t, '[0, 4, 2, 6]'] + - [poly8x8_t, '[0, 8, 2, 10, 4, 12, 6, 14]'] + - [poly8x16_t, '[0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]'] + - [poly16x4_t, '[0, 4, 2, 6]'] + - [poly16x8_t, '[0, 8, 2, 10, 4, 12, 6, 14]'] + - [float32x4_t, '[0, 4, 2, 6]'] + compose: + - FnCall: ["simd_shuffle!", [a, b, "{type[1]}"]] + + + - name: 
"vtrn1{neon_type[0].no}" + doc: Transpose vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [trn1]]}]] + safety: safe + types: + - [float16x4_t, '[0, 4, 2, 6]'] + - [float16x8_t, '[0, 8, 2, 10, 4, 12, 6, 14]'] + compose: + - FnCall: ["simd_shuffle!", [a, b, "{type[1]}"]] + + - name: "vtrn1{neon_type[0].no}" + doc: Transpose vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-stable + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [zip1]]}]] + safety: safe + types: + - [int32x2_t, '[0, 2]'] + - [int64x2_t, '[0, 2]'] + - [uint32x2_t, '[0, 2]'] + - [uint64x2_t, '[0, 2]'] + - [poly64x2_t, '[0, 2]'] + - [float32x2_t, '[0, 2]'] + - [float64x2_t, '[0, 2]'] + compose: + - FnCall: ["simd_shuffle!", [a, b, "{type[1]}"]] + + - name: "vtrn2{neon_type[0].no}" + doc: Transpose vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-stable + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [trn2]]}]] + safety: safe + types: + - [int8x8_t, '[1, 9, 3, 11, 5, 13, 7, 15]'] + - [int8x16_t, '[1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31]'] + - [int16x4_t, '[1, 5, 3, 7]'] + - [int16x8_t, '[1, 9, 3, 11, 5, 13, 7, 15]'] + - [int32x4_t, '[1, 5, 3, 7]'] + - [uint8x8_t, '[1, 9, 3, 11, 5, 13, 7, 15]'] + - [uint8x16_t, '[1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31]'] + - [uint16x4_t, '[1, 5, 3, 7]'] + - [uint16x8_t, '[1, 9, 3, 11, 5, 13, 7, 15]'] + - [uint32x4_t, '[1, 5, 3, 7]'] + - [poly8x8_t, '[1, 9, 3, 11, 5, 13, 7, 15]'] + - [poly8x16_t, '[1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31]'] + - [poly16x4_t, '[1, 5, 3, 7]'] + - [poly16x8_t, '[1, 9, 3, 11, 5, 13, 7, 15]'] + - [float32x4_t, '[1, 5, 3, 7]'] + compose: + - FnCall: ["simd_shuffle!", [a, b, "{type[1]}"]] + + - name: "vtrn2{neon_type[0].no}" + doc: Transpose vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [trn2]]}]] + safety: safe + types: + - [float16x4_t, '[1, 5, 3, 7]'] + - [float16x8_t, '[1, 9, 3, 11, 5, 13, 7, 15]'] + compose: + - FnCall: ["simd_shuffle!", [a, b, "{type[1]}"]] + + - name: "vtrn2{neon_type[0].no}" + doc: Transpose vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-stable + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [zip2]]}]] + safety: safe + types: + - [int32x2_t, '[1, 3]'] + - [int64x2_t, '[1, 3]'] + - [uint32x2_t, '[1, 3]'] + - [uint64x2_t, '[1, 3]'] + - [poly64x2_t, '[1, 3]'] + - [float32x2_t, '[1, 3]'] + - [float64x2_t, '[1, 3]'] + compose: + - FnCall: ["simd_shuffle!", [a, b, "{type[1]}"]] + + - name: "vzip2{neon_type[0].no}" + doc: Zip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-stable + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [zip2]]}]] + safety: safe + types: + - 
[int8x8_t, '[4, 12, 5, 13, 6, 14, 7, 15]'] + - [int8x16_t, '[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]'] + - [int16x4_t, '[2, 6, 3, 7]'] + - [int16x8_t, '[4, 12, 5, 13, 6, 14, 7, 15]'] + - [int32x2_t, '[1, 3]'] + - [int32x4_t, '[2, 6, 3, 7]'] + - [int64x2_t, '[1, 3]'] + - [uint8x8_t, '[4, 12, 5, 13, 6, 14, 7, 15]'] + - [uint8x16_t, '[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]'] + - [uint16x4_t, '[2, 6, 3, 7]'] + - [uint16x8_t, '[4, 12, 5, 13, 6, 14, 7, 15]'] + - [uint32x2_t, '[1, 3]'] + - [uint32x4_t, '[2, 6, 3, 7]'] + - [uint64x2_t, '[1, 3]'] + - [poly8x8_t, '[4, 12, 5, 13, 6, 14, 7, 15]'] + - [poly8x16_t, '[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]'] + - [poly16x4_t, '[2, 6, 3, 7]'] + - [poly16x8_t, '[4, 12, 5, 13, 6, 14, 7, 15]'] + - [poly64x2_t, '[1, 3]'] + - [float32x2_t, '[1, 3]'] + - [float32x4_t, '[2, 6, 3, 7]'] + - [float64x2_t, '[1, 3]'] + compose: + - FnCall: ["simd_shuffle!", [a, b, "{type[1]}"]] + + - name: "vzip2{neon_type[0].no}" + doc: Zip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [zip2]]}]] + safety: safe + types: + - [float16x4_t, '[2, 6, 3, 7]'] + - [float16x8_t, '[4, 12, 5, 13, 6, 14, 7, 15]'] + compose: + - FnCall: ["simd_shuffle!", [a, b, "{type[1]}"]] + + - name: "vzip1{neon_type[0].no}" + doc: Zip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-stable + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [zip1]]}]] + safety: safe + types: + - [int8x8_t, '[0, 8, 1, 9, 2, 10, 3, 11]'] + - [int8x16_t, '[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]'] + - [int16x4_t, '[0, 4, 1, 5]'] + - [int16x8_t, '[0, 8, 1, 9, 2, 10, 3, 11]'] + - [int32x2_t, '[0, 2]'] + - [int32x4_t, '[0, 4, 1, 5]'] + - [int64x2_t, '[0, 2]'] + - [uint8x8_t, '[0, 8, 1, 9, 2, 10, 3, 11]'] + - [uint8x16_t, '[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]'] + - [uint16x4_t, '[0, 4, 1, 5]'] + - [uint16x8_t, '[0, 8, 1, 9, 2, 10, 3, 11]'] + - [uint32x2_t, '[0, 2]'] + - [uint32x4_t, '[0, 4, 1, 5]'] + - [uint64x2_t, '[0, 2]'] + - [poly8x8_t, '[0, 8, 1, 9, 2, 10, 3, 11]'] + - [poly8x16_t, '[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]'] + - [poly16x4_t, '[0, 4, 1, 5]'] + - [poly16x8_t, '[0, 8, 1, 9, 2, 10, 3, 11]'] + - [poly64x2_t, '[0, 2]'] + - [float32x2_t, '[0, 2]'] + - [float32x4_t, '[0, 4, 1, 5]'] + - [float64x2_t, '[0, 2]'] + compose: + - FnCall: ["simd_shuffle!", [a, b, "{type[1]}"]] + + + - name: "vzip1{neon_type[0].no}" + doc: Zip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [zip1]]}]] + safety: safe + types: + - [float16x4_t, '[0, 4, 1, 5]'] + - [float16x8_t, '[0, 8, 1, 9, 2, 10, 3, 11]'] + compose: + - FnCall: ["simd_shuffle!", [a, b, "{type[1]}"]] + + - name: "vuzp1{neon_type[0].no}" + doc: Unzip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-stable + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [zip1]]}]] + safety: safe + types: + - 
[int32x2_t, '[0, 2]'] + - [int64x2_t, '[0, 2]'] + - [uint32x2_t, '[0, 2]'] + - [uint64x2_t, '[0, 2]'] + - [poly64x2_t, '[0, 2]'] + - [float32x2_t, '[0, 2]'] + - [float64x2_t, '[0, 2]'] + compose: + - FnCall: ["simd_shuffle!", [a, b, "{type[1]}"]] + + - name: "vuzp1{neon_type[0].no}" + doc: Unzip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-stable + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [uzp1]]}]] + safety: safe + types: + - [int8x8_t, '[0, 2, 4, 6, 8, 10, 12, 14]'] + - [int8x16_t, '[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]'] + - [int16x4_t, '[0, 2, 4, 6]'] + - [int16x8_t, '[0, 2, 4, 6, 8, 10, 12, 14]'] + - [int32x4_t, '[0, 2, 4, 6]'] + - [uint8x8_t, '[0, 2, 4, 6, 8, 10, 12, 14]'] + - [uint8x16_t, '[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]'] + - [uint16x4_t, '[0, 2, 4, 6]'] + - [uint16x8_t, '[0, 2, 4, 6, 8, 10, 12, 14]'] + - [uint32x4_t, '[0, 2, 4, 6] '] + - [poly8x8_t, '[0, 2, 4, 6, 8, 10, 12, 14]'] + - [poly8x16_t, '[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]'] + - [poly16x4_t, '[0, 2, 4, 6]'] + - [poly16x8_t, '[0, 2, 4, 6, 8, 10, 12, 14]'] + - [float32x4_t, '[0, 2, 4, 6]'] + compose: + - FnCall: ["simd_shuffle!", [a, b, "{type[1]}"]] + + - name: "vuzp1{neon_type[0].no}" + doc: Unzip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [uzp1]]}]] + safety: safe + types: + - [float16x4_t, '[0, 2, 4, 6]'] + - [float16x8_t, '[0, 2, 4, 6, 8, 10, 12, 14]'] + compose: + - FnCall: ["simd_shuffle!", [a, b, "{type[1]}"]] + + - name: "vuzp2{neon_type[0].no}" + doc: Unzip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-stable + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [zip2]]}]] + safety: safe + types: + - [int32x2_t, '[1, 3]'] + - [int64x2_t, '[1, 3]'] + - [uint32x2_t, '[1, 3]'] + - [uint64x2_t, '[1, 3]'] + - [poly64x2_t, '[1, 3]'] + - [float32x2_t, '[1, 3]'] + - [float64x2_t, '[1, 3]'] + compose: + - FnCall: ["simd_shuffle!", [a, b, "{type[1]}"]] + + - name: "vuzp2{neon_type[0].no}" + doc: Unzip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-stable + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [uzp2]]}]] + safety: safe + types: + - [int8x8_t, '[1, 3, 5, 7, 9, 11, 13, 15]'] + - [int8x16_t, '[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]'] + - [int16x4_t, '[1, 3, 5, 7]'] + - [int16x8_t, '[1, 3, 5, 7, 9, 11, 13, 15]'] + - [int32x4_t, '[1, 3, 5, 7]'] + - [uint8x8_t, '[1, 3, 5, 7, 9, 11, 13, 15]'] + - [uint8x16_t, '[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]'] + - [uint16x4_t, '[1, 3, 5, 7]'] + - [uint16x8_t, '[1, 3, 5, 7, 9, 11, 13, 15]'] + - [uint32x4_t, '[1, 3, 5, 7]'] + - [poly8x8_t, '[1, 3, 5, 7, 9, 11, 13, 15]'] + - [poly8x16_t, '[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]'] + - [poly16x4_t, '[1, 3, 5, 7]'] + - [poly16x8_t, '[1, 3, 5, 7, 9, 11, 13, 15]'] + - [float32x4_t, '[1, 3, 5, 7]'] + compose: + - FnCall: + - "simd_shuffle!" 
+ - - a + - b + - "{type[1]}" + + - name: "vuzp2{neon_type[0].no}" + doc: Unzip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [uzp2]]}]] + safety: safe + types: + - [float16x4_t, '[1, 3, 5, 7]'] + - [float16x8_t, '[1, 3, 5, 7, 9, 11, 13, 15]'] + compose: + - FnCall: + - "simd_shuffle!" + - - a + - b + - "{type[1]}" + + - name: "vabal_high_{neon_type[1]}" + doc: "Unsigned Absolute difference and Accumulate Long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [uabal]]}]] + safety: safe + types: + - [uint16x8_t, uint8x16_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', '[8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint32x4_t, uint16x8_t, uint16x4_t, '[4, 5, 6, 7]', '[4, 5, 6, 7]'] + - [uint64x2_t, uint32x4_t, uint32x2_t, '[2, 3]', '[2, 3]'] + compose: + - Let: + - d + - "{neon_type[2]}" + - FnCall: [simd_shuffle!, [b, b, "{type[3]}"]] + - Let: + - e + - "{neon_type[2]}" + - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + - Let: [f, "{neon_type[2]}", {FnCall: ["vabd_{neon_type[2]}", [d, e]]}] + - FnCall: + - simd_add + - - a + - FnCall: [simd_cast, [f]] + + - name: "vabal_high{neon_type[1].noq}" + doc: Signed Absolute difference and Accumulate Long + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-stable + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [sabal]]}]] + safety: safe + types: + - [int16x8_t, int8x16_t, int8x16_t, '[8, 9, 10, 11, 12, 13, 14, 15]', int8x8_t, uint8x8_t] + - [int32x4_t, int16x8_t, int16x8_t, '[4, 5, 6, 7]', int16x4_t, uint16x4_t] + - [int64x2_t, int32x4_t, int32x4_t, '[2, 3]', int32x2_t, uint32x2_t] + compose: + - Let: + - d + - "{neon_type[4]}" + - FnCall: + - simd_shuffle! + - - b + - b + - "{type[3]}" + - Let: + - e + - "{neon_type[4]}" + - FnCall: + - simd_shuffle! + - - c + - c + - "{type[3]}" + - Let: + - f + - "{neon_type[4]}" + - FnCall: + - "vabd{neon_type[4].no}" + - - d + - e + - Let: + - f + - "{neon_type[5]}" + - FnCall: + - simd_cast + - - f + - FnCall: + - simd_add + - - a + - FnCall: + - simd_cast + - - f + + - name: "vqabs{neon_type.no}" + doc: Signed saturating Absolute value + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-stable + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [sqabs]]}]] + safety: safe + types: + - int64x1_t + - int64x2_t + compose: + - LLVMLink: + name: "sqabs.{neon_type}" + links: + - link: "llvm.aarch64.neon.sqabs.{neon_type}" + arch: aarch64,arm64ec + + - name: "vslid_n_{type}" + doc: Shift left and insert + arguments: ["a: {type}", "b: {type}"] + return_type: "{type}" + static_defs: + - "const N: i32" + attr: + - *neon-stable + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [sli, 'N = 2']]}]] + safety: safe + types: + - i64 + - u64 + compose: + - FnCall: + - "static_assert!" 
+ - - 'N >= 0 && N <= 63' + - FnCall: + - transmute + - - FnCall: + - "vsli_n_{type}::" + - - FnCall: + - transmute + - - a + - FnCall: + - transmute + - - b + + - name: "vsrid_n_{type}" + doc: Shift right and insert + arguments: ["a: {type}", "b: {type}"] + return_type: "{type}" + static_defs: + - "const N: i32" + attr: + - *neon-stable + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [sri, 'N = 2']]}]] + safety: safe + types: + - i64 + - u64 + compose: + - FnCall: + - "static_assert!" + - - 'N >= 1 && N <= 64' + - FnCall: + - transmute + - - FnCall: + - "vsri_n_{type}::" + - - FnCall: + - transmute + - - a + - FnCall: + - transmute + - - b + + - name: "vpmaxnm{neon_type.no}" + doc: "Floating-point Maximum Number Pairwise (vector)." + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmaxnmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - float32x2_t + - float64x2_t + - float32x4_t + compose: + - LLVMLink: + name: "vpmaxnm{neon_type}" + links: + - link: "llvm.aarch64.neon.fmaxnmp.{neon_type}" + arch: aarch64,arm64ec + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st1]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: + unsafe: [neon] + types: + - ['*mut f64', float64x1x2_t, float64x1_t] + - ['*mut f64', float64x2x2_t, float64x2_t] + compose: + - LLVMLink: + name: "vst1{neon_type[1].no}" + arguments: + - "a: {neon_type[2]}" + - "b: {neon_type[2]}" + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.st1x{neon_type[1].tuple}.{neon_type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: ["_vst1{neon_type[1].no}", ['b.0', 'b.1', 'a']] + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st1]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: + unsafe: [neon] + types: + - ['*mut f64', float64x1x3_t, float64x1_t] + - ['*mut f64', float64x2x3_t, float64x2_t] + compose: + - LLVMLink: + name: "vst1{neon_type[1].no}" + arguments: + - "a: {neon_type[2]}" + - "b: {neon_type[2]}" + - "c: {neon_type[2]}" + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.st1x{neon_type[1].tuple}.{neon_type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: ["_vst1{neon_type[1].no}", ['b.0', 'b.1', 'b.2', 'a']] + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st1]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: + unsafe: [neon] + types: + - ['*mut f64', float64x1x4_t, float64x1_t] + - ['*mut f64', float64x2x4_t, float64x2_t] + compose: + - LLVMLink: + name: "vst1{neon_type[1].no}" + arguments: + - "a: {neon_type[2]}" + - "b: {neon_type[2]}" + - "c: {neon_type[2]}" + - "d: {neon_type[2]}" + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.st1x{neon_type[1].tuple}.{neon_type[2]}.p0" + 
arch: aarch64,arm64ec + - FnCall: ["_vst1{neon_type[1].no}", ['b.0', 'b.1', 'b.2', 'b.3', 'a']] + + - name: "vfma{type[3]}" + doc: "Floating-point fused multiply-add to accumulator" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float32x2_t, float32x2_t, '1', '_lane_f32'] + - [float32x2_t, float32x4_t, '2', '_laneq_f32'] + - [float32x4_t, float32x2_t, '1', 'q_lane_f32'] + - [float32x4_t, float32x4_t, '2', 'q_laneq_f32'] + - [float64x2_t, float64x2_t, '1', 'q_laneq_f64'] + compose: + - FnCall: ["static_assert_uimm_bits!", [LANE, "{type[2]}"]] + - FnCall: + - "vfma{neon_type[0].no}" + - - a + - b + - FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + + + - name: "vfma{type[3]}" + doc: "Floating-point fused multiply-add to accumulator" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float16x4_t, float16x4_t, '2', '_lane_f16'] + - [float16x4_t, float16x8_t, '3', '_laneq_f16'] + - [float16x8_t, float16x4_t, '2', 'q_lane_f16'] + - [float16x8_t, float16x8_t, '3', 'q_laneq_f16'] + compose: + - FnCall: ["static_assert_uimm_bits!", [LANE, "{type[2]}"]] + - FnCall: + - "vfma{neon_type[0].no}" + - - a + - b + - FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + + + # vfms lane f16 + - name: "vfms{type[3]}" + doc: "Floating-point fused multiply-subtract from accumulator" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmls, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float16x4_t, float16x4_t, '2', '_lane_f16'] + - [float16x4_t, float16x8_t, '3', '_laneq_f16'] + - [float16x8_t, float16x4_t, '2', 'q_lane_f16'] + - [float16x8_t, float16x8_t, '3', 'q_laneq_f16'] + compose: + - FnCall: ["static_assert_uimm_bits!", [LANE, "{type[2]}"]] + - FnCall: + - "vfms{neon_type[0].no}" + - - a + - b + - FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + + + - name: "vfms{type[1]}" + doc: "Floating-point fused multiply-subtract from accumulator" + arguments: ["a: {type[0]}", "b: {type[0]}", "c: {type[0]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmsub]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "h_f16"] + compose: + - FnCall: ["vfma{type[1]}", [a, -b, c]] + + + - name: "vfma_lane_f64" + doc: "Floating-point fused multiply-add to accumulator" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmadd, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + 
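The `vfma*_lane*` entries above all compose the same way: assert the lane index fits, extract that lane from `c`, broadcast it with `vdup`, and forward to the plain fused multiply-add. A minimal sketch of that pattern for the concrete case `vfma_lane_f32::<1>`, written against the public `core::arch::aarch64` intrinsics rather than the generated source; the helper name `fma_by_lane1` is made up for illustration.

```rust
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::{float32x2_t, vdup_n_f32, vfma_f32, vget_lane_f32};

// Hypothetical helper mirroring what the spec composes for vfma_lane_f32::<1>.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn fma_by_lane1(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
    // a + b * c[1], computed with a single rounding (fused multiply-add):
    // extract lane 1 of c, splat it across the vector, then reuse vfma_f32.
    vfma_f32(a, b, vdup_n_f32(vget_lane_f32::<1>(c)))
}
```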
safety: safe + types: + - float64x1_t + compose: + - FnCall: ["static_assert!", ["LANE == 0"]] + - FnCall: + - "vfma{neon_type.no}" + - - a + - b + - FnCall: ["vdup{neon_type.N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + + - name: "vfma_laneq_f64" + doc: "Floating-point fused multiply-add to accumulator" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmadd, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float64x1_t, float64x2_t] + compose: + - FnCall: ["static_assert_uimm_bits!", ["LANE", "1"]] + - FnCall: + - "vfma{neon_type[0].no}" + - - a + - b + - FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + + - name: "vfmaq_lane_f64" + doc: "Floating-point fused multiply-add to accumulator" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmla, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float64x2_t, float64x1_t] + compose: + - FnCall: ["static_assert!", ["LANE == 0"]] + - FnCall: + - "vfma{neon_type[0].no}" + - - a + - b + - FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + + - name: "vfma{type[2]}" + doc: "Floating-point fused multiply-add to accumulator" + arguments: ["a: {type[0]}", "b: {type[0]}", "c: {neon_type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmadd, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["f32", float32x2_t, "s_lane_f32", '1'] + - ["f32", float32x4_t, "s_laneq_f32", '2'] + - ["f64", float64x2_t, "d_laneq_f64", '1'] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', "{type[3]}"]] + - Let: [c, "{type[0]}", {FnCall: [simd_extract!, [c, 'LANE as u32']]}] + - FnCall: ["fma{type[0]}", [b, c, a]] + + - name: "vfmad_lane_f64" + doc: "Floating-point fused multiply-add to accumulator" + arguments: ["a: {type[0]}", "b: {type[0]}", "c: {neon_type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmadd, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["f64", float64x1_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - Let: [c, "{type[0]}", {FnCall: [simd_extract!, [c, 'LANE as u32']]}] + - FnCall: [fmaf64, [b, c, a]] + + + - name: "vfma{type[1]}" + doc: "Floating-point fused multiply-add to accumulator" + arguments: ["a: {type[0]}", "b: {type[0]}", "c: {type[0]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmadd]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "h_f16"] + compose: + - FnCall: [fmaf16, [b, c, a], [], true] + + + - name: "vfmah_lane{type[2]}" + doc: "Floating-point fused multiply-add to accumulator" + arguments: ["a: {type[0]}", "b: {type[0]}", "v: 
{neon_type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmadd, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["f16", float16x4_t, '_f16', '2'] + - ["f16", float16x8_t, 'q_f16', '3'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - Let: [c, "{type[0]}", {FnCall: [simd_extract!, [v, 'LANE as u32']]}] + - FnCall: ["vfmah_{type[0]}", [a, b, c]] + + - name: "vfmsh_lane{type[2]}" + doc: "Floating-point fused multiply-subtract from accumulator" + arguments: ["a: {type[0]}", "b: {type[0]}", "v: {neon_type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmsub, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["f16", float16x4_t, '_f16', '2'] + - ["f16", float16x8_t, 'q_f16', '3'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - Let: [c, "{type[0]}", {FnCall: [simd_extract!, [v, 'LANE as u32']]}] + - FnCall: ["vfmsh_{type[0]}", [a, b, c]] + + - name: "vfms_f64" + doc: "Floating-point fused multiply-subtract from accumulator" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmsub]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - float64x1_t + compose: + - Let: [b, "{neon_type}", {FnCall: [simd_neg, [b]]}] + - FnCall: [vfma_f64, [a, b, c]] + + - name: "vfms{neon_type.no}" + doc: "Floating-point fused multiply-subtract from accumulator" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmls]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - float64x2_t + compose: + - Let: [b, "{neon_type}", {FnCall: [simd_neg, [b]]}] + - FnCall: [vfmaq_f64, [a, b, c]] + + - name: "vmls{neon_type.no}" + doc: "Floating-point multiply-subtract from accumulator" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmul]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - float64x1_t + - float64x2_t + compose: + - FnCall: [simd_sub, [a, {FnCall: [simd_mul, [b, c]]}]] + + - name: "vfms{type[3]}" + doc: "Floating-point fused multiply-subtract to accumulator" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmls, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float32x2_t, float32x2_t, '1', _lane_f32] + - [float32x2_t, float32x4_t, '2', _laneq_f32] + - [float32x4_t, float32x2_t, '1', q_lane_f32] + - [float32x4_t, float32x4_t, '2', q_laneq_f32] + - [float64x2_t, float64x2_t, '1', q_laneq_f64] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[2]}']] + - FnCall: ["vfms{neon_type[0].no}", [a, b, {FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as 
u32']]}]]}]] + + - name: "vfms_lane_f64" + doc: "Floating-point fused multiply-subtract to accumulator" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmsub, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - float64x1_t + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: ["vfms{neon_type.no}", [a, b, {FnCall: ["vdup{neon_type.N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]]}]] + + - name: "vfms_laneq_f64" + doc: "Floating-point fused multiply-subtract to accumulator" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmsub, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float64x1_t, float64x2_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '1']] + - FnCall: ["vfms{neon_type[0].no}", [a, b, {FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]]}]] + + - name: "vfmsq_lane_f64" + doc: "Floating-point fused multiply-subtract to accumulator" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmls, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float64x2_t, float64x1_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: ["vfms{neon_type[0].no}", [a, b, {FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]]}]] + + - name: "vfms{type[2]}" + doc: "Floating-point fused multiply-subtract to accumulator" + arguments: ["a: {type[0]}", "b: {type[0]}", "c: {neon_type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmsub, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["f32", float32x2_t, "s_lane_f32"] + - ["f32", float32x4_t, "s_laneq_f32"] + - ["f64", float64x1_t, "d_lane_f64"] + - ["f64", float64x2_t, "d_laneq_f64"] + compose: + - FnCall: ["vfma{type[2]}::", ['a', '-b', 'c']] + + + - name: "vceqz{neon_type[0].no}" + doc: "Floating-point compare bitwise equal to zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmeq]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, uint32x2_t, 'f32x2', 'f32x2::new(0.0, 0.0)'] + - [float32x4_t, uint32x4_t, 'f32x4', 'f32x4::new(0.0, 0.0, 0.0, 0.0)'] + - [float64x1_t, uint64x1_t, 'f64', '0.0'] + - [float64x2_t, uint64x2_t, 'f64x2', 'f64x2::new(0.0, 0.0)'] + compose: + - Let: [b, '{type[2]}', '{type[3]}'] + - FnCall: [simd_eq, [a, {FnCall: [transmute, [b]]}]] + + - name: "vceqz{neon_type[0].no}" + doc: "Floating-point compare bitwise equal to zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" 
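+ # Sketch of the expected expansion for the float16x4_t row below (assuming the
+ # generator's usual Let / simd_eq lowering; attributes omitted):
+ #   pub fn vceqz_f16(a: float16x4_t) -> uint16x4_t {
+ #       let b: f16x4 = f16x4::new(0.0, 0.0, 0.0, 0.0);
+ #       unsafe { simd_eq(a, transmute(b)) }
+ #   }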
+ attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmeq]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t, 'f16x4', 'f16x4::new(0.0, 0.0, 0.0, 0.0)'] + - [float16x8_t, uint16x8_t, 'f16x8', 'f16x8::new(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)'] + compose: + - Let: [b, '{type[2]}', '{type[3]}'] + - FnCall: [simd_eq, [a, {FnCall: [transmute, [b]]}]] + + - name: "vceqz{type[2]}" + doc: "Floating-point compare bitwise equal to zero" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["f32", "u32", "s_f32"] + - ["f64", "u64", "d_f64"] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - "vceqz_{type[0]}" + - - FnCall: ["vdup_n_{type[0]}", [a]] + - '0' + + - name: "vceqz{type[2]}" + doc: "Floating-point compare bitwise equal to zero" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "u16", "h_f16"] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - "vceqz_{type[0]}" + - - FnCall: ["vdup_n_{type[0]}", [a]] + - '0' + + - name: "vceqzd_{type[2]}" + doc: "Compare bitwise equal to zero" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i64", "u64", "s64"] + - ["u64", "u64", "u64"] + compose: + - FnCall: + - transmute + - - FnCall: + - "vceqz_{type[2]}" + - - FnCall: [transmute, [a]] + + - name: "vceqz{neon_type[0].no}" + doc: "Signed compare bitwise equal to zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmeq]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, uint8x8_t, i8x8, 'i8x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [int8x16_t, uint8x16_t, i8x16, 'i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)'] + - [int16x4_t, uint16x4_t, i16x4, 'i16x4::new(0, 0, 0, 0)'] + - [int16x8_t, uint16x8_t, i16x8, 'i16x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [int32x2_t, uint32x2_t, i32x2, 'i32x2::new(0, 0)'] + - [int32x4_t, uint32x4_t, i32x4, 'i32x4::new(0, 0, 0, 0)'] + - [int64x1_t, uint64x1_t, i64x1, 'i64x1::new(0)'] + - [int64x2_t, uint64x2_t, i64x2, 'i64x2::new(0, 0)'] + - [poly8x8_t, uint8x8_t, i8x8, 'i8x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [poly8x16_t, uint8x16_t, i8x16, 'i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)'] + - [poly64x1_t, uint64x1_t, i64x1, 'i64x1::new(0)'] + - [poly64x2_t, uint64x2_t, i64x2, 'i64x2::new(0, 0)'] + compose: + - Let: [b, "{type[2]}", "{type[3]}"] + - FnCall: + - simd_eq + - - a + - FnCall: [transmute, [b]] + + - name: "vceqz{neon_type[0].no}" + doc: "Unsigned compare bitwise equal to zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmeq]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint8x8_t, uint8x8_t, u8x8, 'u8x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [uint8x16_t, uint8x16_t, u8x16, 'u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)'] + - [uint16x4_t, uint16x4_t, u16x4, 'u16x4::new(0, 
0, 0, 0)'] + - [uint16x8_t, uint16x8_t, u16x8, 'u16x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [uint32x2_t, uint32x2_t, u32x2, 'u32x2::new(0, 0)'] + - [uint32x4_t, uint32x4_t, u32x4, 'u32x4::new(0, 0, 0, 0)'] + - [uint64x1_t, uint64x1_t, u64x1, 'u64x1::new(0)'] + - [uint64x2_t, uint64x2_t, u64x2, 'u64x2::new(0, 0)'] + compose: + - Let: [b, "{type[2]}", "{type[3]}"] + - FnCall: + - simd_eq + - - a + - FnCall: [transmute, [b]] + + - name: "vcge{neon_type.no}" + doc: "Compare unsigned greater than or equal" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmhs]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - uint64x1_t + - uint64x2_t + compose: + - FnCall: [simd_ge, [a, b]] + + - name: "vcge{type[0]}" + doc: "Floating-point compare greater than or equal" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s_f32", "f32", "u32"] + - ["d_f64", "f64", "u64"] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - "vcge_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - FnCall: ["vdup_n_{type[1]}", [b]] + - '0' + + + - name: "vcge{type[0]}" + doc: "Floating-point compare greater than or equal" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["h_f16", "f16", "u16"] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - "vcge_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - FnCall: ["vdup_n_{type[1]}", [b]] + - '0' + + - name: "vcge{neon_type[0].no}" + doc: "Floating-point compare greater than or equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmge]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x1_t, uint64x1_t] + - [float64x2_t, uint64x2_t] + compose: + - FnCall: [simd_ge, [a, b]] + + - name: "vcge{type[0]}" + doc: "Compare greater than or equal" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["d_s64", "i64", "u64", s64] + - ["d_u64", "u64", "u64", u64] + compose: + - FnCall: + - transmute + - - FnCall: + - "vcge_{type[3]}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vclt{neon_type.no}" + doc: "Compare unsigned less than" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmhi]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - uint64x1_t + - uint64x2_t + compose: + - FnCall: [simd_lt, [a, b]] + + - name: "vcltd_{type[0]}" + doc: "Compare less than" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s64", "i64", "u64"] + - ["u64", "u64", "u64"] + 
compose: + - FnCall: + - transmute + - - FnCall: + - "vclt_{type[0]}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vtst{neon_type[0].no}" + doc: "Unsigned compare bitwise Test bits nonzero" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmtst]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint64x1_t, u64x1, 'u64x1::new(0)'] + - [uint64x2_t, u64x2, 'u64x2::new(0, 0)'] + compose: + - Let: [c, "{neon_type[0]}", {FnCall: [simd_and, [a, b]]}] + - Let: [d, "{type[1]}", "{type[2]}"] + - FnCall: [simd_ne, [c, {FnCall: [transmute, [d]]}]] + + - name: "vcgez{neon_type[0].no}" + doc: "Floating-point compare greater than or equal to zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmge]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, uint32x2_t, f32x2, 'f32x2::new(0.0, 0.0)'] + - [float32x4_t, uint32x4_t, f32x4, 'f32x4::new(0.0, 0.0, 0.0, 0.0)'] + - [float64x1_t, uint64x1_t, f64, '0.0'] + - [float64x2_t, uint64x2_t, f64x2, 'f64x2::new(0.0, 0.0)'] + compose: + - Let: [b, "{type[2]}", "{type[3]}"] + - FnCall: + - simd_ge + - - a + - FnCall: [transmute, [b]] + + - name: "vcgez{type[0]}" + doc: "Floating-point compare greater than or equal to zero" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s_f32", "f32", "u32"] + - ["d_f64", "f64", "u64"] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - "vcgez_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - '0' + + + - name: "vcgez{type[0]}" + doc: "Floating-point compare greater than or equal to zero" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["h_f16", "f16", "u16"] + compose: + - FnCall: + - simd_extract! 
+ - - FnCall: + - "vcgez_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - '0' + + - name: "vclezd_s64" + doc: "Compare less than or equal to zero" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i64", "u64"] + compose: + - FnCall: + - transmute + - - FnCall: [vclez_s64, [{FnCall: [transmute, [a]]}]] + + - name: "vcgtd_{type[2]}" + doc: "Compare greater than" + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i64", "u64", 's64'] + - ["u64", "u64", 'u64'] + compose: + - FnCall: + - transmute + - - FnCall: + - "vcgt_{type[2]}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vcgtz{neon_type[0].no}" + doc: "Compare signed greater than zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmgt]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, uint8x8_t, i8x8, 'i8x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [int8x16_t, uint8x16_t, i8x16, 'i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)'] + - [int16x4_t, uint16x4_t, i16x4, 'i16x4::new(0, 0, 0, 0)'] + - [int16x8_t, uint16x8_t, i16x8, 'i16x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [int32x2_t, uint32x2_t, i32x2, 'i32x2::new(0, 0)'] + - [int32x4_t, uint32x4_t, i32x4, 'i32x4::new(0, 0, 0, 0)'] + - [int64x1_t, uint64x1_t, i64x1, 'i64x1::new(0)'] + - [int64x2_t, uint64x2_t, i64x2, 'i64x2::new(0, 0)'] + compose: + - Let: [b, "{type[2]}", "{type[3]}"] + - FnCall: + - simd_gt + - - a + - FnCall: [transmute, [b]] + + - name: "vcgtzd_s64" + doc: "Compare signed greater than zero" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["i64", "u64"] + compose: + - FnCall: + - transmute + - - FnCall: + - vcgtz_s64 + - - FnCall: [transmute, [a]] + + - name: "vcgtz{neon_type[0].no}" + doc: "Floating-point compare greater than zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmgt]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, uint32x2_t, f32x2, 'f32x2::new(0.0, 0.0)'] + - [float32x4_t, uint32x4_t, f32x4, 'f32x4::new(0.0, 0.0, 0.0, 0.0)'] + - [float64x1_t, uint64x1_t, f64, '0.0'] + - [float64x2_t, uint64x2_t, f64x2, 'f64x2::new(0.0, 0.0)'] + compose: + - Let: [b, "{type[2]}", "{type[3]}"] + - FnCall: [simd_gt, [a, {FnCall: [transmute, [b]]}]] + + - name: "vcgtz{type[0]}" + doc: "Floating-point compare greater than zero" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s_f32", "f32", "u32"] + - ["d_f64", "f64", "u64"] + compose: + - FnCall: + - "simd_extract!" 
+ - - FnCall: + - "vcgtz_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - '0' + + - name: "vcgtz{type[0]}" + doc: "Floating-point compare greater than zero" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["h_f16", "f16", "u16"] + compose: + - FnCall: + - "simd_extract!" + - - FnCall: + - "vcgtz_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - '0' + + - name: "vcvt{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to unsigned fixed-point, rounding toward zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzu]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x1_t, uint64x1_t] + - [float64x2_t, uint64x2_t] + compose: + - LLVMLink: + name: "vcvt{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.fptoui.sat.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vmul{neon_type[0].N}" + doc: "Vector multiply by scalar" + arguments: ["a: {neon_type[0]}", "b: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmul]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x1_t, "f64"] + - [float64x2_t, "f64"] + compose: + - FnCall: + - simd_mul + - - a + - FnCall: ["vdup{neon_type[0].N}", [b]] + + - name: "vmul_lane_f64" + doc: "Floating-point multiply" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmul, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - float64x1_t + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: + - simd_mul + - - a + - FnCall: + - "transmute::" + - - FnCall: [simd_extract!, [b, 'LANE as u32']] + + - name: "vmulq_lane_f64" + doc: "Floating-point multiply" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmul, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float64x2_t, float64x1_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: + - simd_mul + - - a + - FnCall: ["simd_shuffle!", [b, b, '[LANE as u32, LANE as u32]']] + + - name: "vmuld_lane_f64" + doc: "Floating-point multiply" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmul, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["f64", float64x1_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - Let: [b, '{type[0]}', {FnCall: [simd_extract!, [b, 'LANE as u32']]}] + - Identifier: ['a * b', Symbol] + + - name: "vmul_laneq_f64" + doc: "Floating-point multiply" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: 
[assert_instr, [fmul, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float64x1_t, float64x2_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '1']] + - FnCall: + - simd_mul + - - a + - FnCall: + - "transmute::" + - - FnCall: [simd_extract!, [b, 'LANE as u32']] + + - name: "vmulq_laneq_f64" + doc: "Floating-point multiply" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmul, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float64x2_t, float64x2_t, float64x2_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '1']] + - FnCall: + - simd_mul + - - a + - FnCall: [simd_shuffle!, [b, b, '[LANE as u32, LANE as u32]']] + + + # vmulq_laneq_f16 + - name: "vmul{type[2]}{neon_type[1].no}" + doc: "Floating-point multiply" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmul, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float16x4_t, float16x8_t, '_lane', "[LANE as u32, LANE as u32, LANE as u32, LANE as u32]"] + - [float16x8_t, float16x8_t, 'q_lane', "[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]"] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '3']] + - FnCall: + - simd_mul + - - a + - FnCall: [simd_shuffle!, [b, b, "{type[3]}"]] + + + - name: "vmul{type[1]}_{type[0]}" + doc: Multiply + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [nop] + safety: safe + types: + - [f16, 'h'] + compose: + - 'a * b' + + + - name: "vmul{type[2]}" + doc: "Floating-point multiply" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmul, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["f32", float32x2_t, "s_lane_f32", '1'] + - ["f32", float32x4_t, "s_laneq_f32", '2'] + - ["f64", float64x2_t, "d_laneq_f64", '1'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - Let: [b, '{type[0]}', {FnCall: [simd_extract!, [b, 'LANE as u32']]}] + - Identifier: ['a * b', Symbol] + + + - name: "vmul{type[2]}" + doc: "Floating-point multiply" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmul, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["f16", float16x4_t, "h_lane_f16", '2'] + - ["f16", float16x8_t, "h_laneq_f16", '3'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - Let: [b, '{type[0]}', {FnCall: [simd_extract!, [b, 'LANE as u32']]}] + - Identifier: ['a * b', Symbol] + + + - name: "vrsrad_n_s64" + doc: "Signed rounding shift right and accumulate."
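+ # Sketch of the expected expansion of this entry (assuming the generator's
+ # usual handling of static_defs / Let / Identifier; attributes omitted):
+ #   pub fn vrsrad_n_s64<const N: i32>(a: i64, b: i64) -> i64 {
+ #       static_assert!(N >= 1 && N <= 64);
+ #       let b: i64 = vrshrd_n_s64::<N>(b);
+ #       a.wrapping_add(b)
+ #   }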
+ arguments: ["a: {type}", "b: {type}"] + return_type: "{type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [srshr, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - "i64" + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 64']] + - Let: [b, "{type}", {FnCall: ["vrshrd_n_s64::", [b]]}] + - Identifier: ['a.wrapping_add(b)', Symbol] + + - name: "vmlsl_high_n_{neon_type[1]}" + doc: "Multiply-subtract long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [smlsl2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int32x4_t, int16x8_t, "i16"] + - [int64x2_t, int32x4_t, "i32"] + compose: + - FnCall: ["vmlsl_high_{neon_type[1]}", [a, b, {FnCall: ["vdupq_n_{neon_type[1]}", [c]]}]] + + - name: "vmlsl_high_n_{neon_type[1]}" + doc: "Multiply-subtract long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [umlsl2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint32x4_t, uint16x8_t, "u16"] + - [uint64x2_t, uint32x4_t, "u32"] + compose: + - FnCall: ["vmlsl_high_{neon_type[1]}", [a, b, {FnCall: ["vdupq_n_{neon_type[1]}", [c]]}]] + + - name: "vmlsl_high_lane{neon_type[2].no}" + doc: "Multiply-subtract long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [smlsl2, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [int32x4_t, int16x8_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int32x4_t, int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int64x2_t, int32x4_t, int32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int64x2_t, int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - FnCall: + - "vmlsl_high_{neon_type[1]}" + - - a + - b + - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + + - name: "vmlsl_high_lane{neon_type[2].no}" + doc: "Multiply-subtract long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [umlsl2, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [uint32x4_t, uint16x8_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint32x4_t, uint16x8_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint64x2_t, uint32x4_t, uint32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] 
+ - [uint64x2_t, uint32x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - FnCall: + - "vmlsl_high_{neon_type[1]}" + - - a + - b + - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + + - name: "vclt{neon_type[0].no}" + doc: "Floating-point compare less than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmgt]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x1_t, uint64x1_t] + - [float64x2_t, uint64x2_t] + compose: + - FnCall: [simd_lt, [a, b]] + + - name: "vclt{type[2]}" + doc: "Floating-point compare less than" + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["f32", "u32", 's_f32'] + - ["f64", "u64", 'd_f64'] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - "vclt_{type[0]}" + - - FnCall: ["vdup_n_{type[0]}", [a]] + - FnCall: ["vdup_n_{type[0]}", [b]] + - '0' + + + - name: "vclt{type[2]}" + doc: "Floating-point compare less than" + arguments: ["a: {type[0]}", "b: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcmp]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "u16", 'h_f16'] + compose: + - FnCall: + - simd_extract! + - - FnCall: + - "vclt_{type[0]}" + - - FnCall: ["vdup_n_{type[0]}", [a]] + - FnCall: ["vdup_n_{type[0]}", [b]] + - '0' + + - name: "vabdl_high_{neon_type[0]}" + doc: "Unsigned Absolute difference Long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uabdl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint8x16_t, uint16x8_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint16x8_t, uint32x4_t, uint16x4_t, '[4, 5, 6, 7]'] + - [uint32x4_t, uint64x2_t, uint32x2_t, '[2, 3]'] + compose: + - Let: [c, "{neon_type[2]}", {FnCall: [simd_shuffle!, [a, a, "{type[3]}"]]}] + - Let: [d, "{neon_type[2]}", {FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]}] + - FnCall: [simd_cast, [{FnCall: ["vabd_{neon_type[0]}", [c, d]]}]] + + - name: "vfms_n_f64" + doc: "Floating-point fused Multiply-subtract to accumulator(vector)" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmsub]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x1_t, "f64"] + compose: + - FnCall: + - "vfms{neon_type[0].no}" + - - a + - b + - FnCall: ["vdup{neon_type[0].N}", [c]] + + - name: "vfmsq_n_f64" + doc: "Floating-point fused Multiply-subtract to accumulator(vector)" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmls]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x2_t, "f64"] + compose: + - FnCall: + - "vfms{neon_type[1].no}" + - - a + - b + - FnCall: ["vdup{neon_type[1].N}", [c]] + + + - name: "vfms{neon_type[0].N}" + doc: Floating-point 
fused Multiply-Subtract from accumulator. + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [fmls] + safety: safe + types: + - [float16x4_t, f16] + - [float16x8_t, f16] + compose: + - FnCall: + - "vfms{neon_type[0].no}" + - - a + - b + - FnCall: + - "vdup{neon_type[0].N}" + - - c + + + - name: "vpminnm{type[0]}" + doc: "Floating-point minimum number pairwise" + arguments: ["a: {neon_type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fminnmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ['s_f32', float32x2_t, "f32"] + - ['qd_f64', float64x2_t, "f64"] + compose: + - LLVMLink: + name: "vpminnm{type[0]}" + links: + - link: "llvm.aarch64.neon.fminnmv.{type[2]}.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vpmaxnm{type[0]}" + doc: "Floating-point maximum number pairwise" + arguments: ["a: {neon_type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmaxnmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ['s_f32', float32x2_t, "f32"] + - ['qd_f64', float64x2_t, "f64"] + compose: + - LLVMLink: + name: "vpmaxnm{type[0]}" + links: + - link: "llvm.aarch64.neon.fmaxnmv.{type[2]}.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vcled_{type[0]}" + doc: "Compare less than or equal" + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s64", "i64", "u64"] + - ["u64", "u64", "u64"] + compose: + - FnCall: + - transmute + - - FnCall: + - "vcle_{type[0]}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vqdmulh{neon_type[0].lane_nox}" + doc: "Vector saturating doubling multiply high by scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqdmulh, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [int16x4_t, int16x4_t, '2'] + - [int16x8_t, int16x4_t, '2'] + - [int32x2_t, int32x2_t, '1'] + - [int32x4_t, int32x2_t, '1'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - FnCall: + - "vqdmulh{neon_type[0].no}" + - - a + - FnCall: + - "vdup{neon_type[0].N}" + - - FnCall: [simd_extract!, [b, 'LANE as u32']] + + - name: "vqabs{type[2]}" + doc: "Signed saturating absolute value" + arguments: ["a: {type[0]}"] + return_type: "{type[0]}" + attr: + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [sqabs]]}]] + safety: safe + types: + - ["i8", "s8", 'b_s8'] + - ["i16", "s16", 'h_s16'] + compose: + - FnCall: + - "simd_extract!" 
+ - - FnCall: ["vqabs_{type[1]}", [{FnCall: ["vdup_n_{type[1]}", [a]]}]] + - '0' + + - name: "vqabs{type[1]}" + doc: "Signed saturating absolute value" + arguments: ["a: {type[0]}"] + return_type: "{type[0]}" + attr: + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [sqabs]]}]] + safety: safe + types: + - ["i32", "s_s32"] + - ["i64", "d_s64"] + compose: + - LLVMLink: + name: "vqabs{type[1]}" + links: + - link: "llvm.aarch64.neon.sqabs.{type[0]}" + arch: aarch64,arm64ec + + - name: "vmull_high_n_{neon_type[0]}" + doc: "Multiply long" + arguments: ["a: {neon_type[0]}", "b: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [smull2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int16x8_t, "i16", int32x4_t] + - [int32x4_t, "i32", int64x2_t] + compose: + - FnCall: + - "vmull_high_{neon_type[0]}" + - - a + - FnCall: ["vdupq_n_{neon_type[0]}", [b]] + + - name: "vmull_high_n_{neon_type[0]}" + doc: "Multiply long" + arguments: ["a: {neon_type[0]}", "b: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [umull2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint16x8_t, "u16", uint32x4_t] + - [uint32x4_t, "u32", uint64x2_t] + compose: + - FnCall: + - "vmull_high_{neon_type[0]}" + - - a + - FnCall: ["vdupq_n_{neon_type[0]}", [b]] + + - name: "vmull_high_lane{neon_type[1].no}" + doc: "Multiply long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [smull2, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [int16x8_t, int16x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int16x8_t, int16x8_t, int32x4_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int32x4_t, int32x2_t, int64x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int32x4_t, int32x4_t, int64x2_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] + - FnCall: + - "vmull_high_{neon_type[0]}" + - - a + - FnCall: [simd_shuffle!, [b, b, '{type[4]}']] + + - name: "vmull_high_lane{neon_type[1].no}" + doc: "Multiply long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [umull2, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [uint16x8_t, uint16x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint16x8_t, uint16x8_t, uint32x4_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint32x4_t, uint32x2_t, uint64x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - 
[uint32x4_t, uint32x4_t, uint64x2_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] + - FnCall: + - "vmull_high_{neon_type[0]}" + - - a + - FnCall: [simd_shuffle!, [b, b, '{type[4]}']] + + - name: "vrsqrte{neon_type.no}" + doc: "Reciprocal square-root estimate." + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [frsqrte]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - float64x1_t + - float64x2_t + compose: + - LLVMLink: + name: "vrsqrte{neon_type.no}" + links: + - link: "llvm.aarch64.neon.frsqrte.{neon_type}" + arch: aarch64,arm64ec + + - name: "vrsqrte{type[0]}" + doc: "Reciprocal square-root estimate." + arguments: ["a: {type[1]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [frsqrte]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["s_f32", "f32"] + - ["d_f64", "f64"] + compose: + - LLVMLink: + name: "vrsqrte{neon_type[1].no}" + links: + - link: "llvm.aarch64.neon.frsqrte.{type[1]}" + arch: aarch64,arm64ec + + + - name: "vrsqrte{type[0]}" + doc: "Reciprocal square-root estimate." + arguments: ["a: {type[1]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [frsqrte]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["h_f16", "f16"] + compose: + - LLVMLink: + name: "vrsqrte{neon_type[1].no}" + links: + - link: "llvm.aarch64.neon.frsqrte.{type[1]}" + arch: aarch64,arm64ec + + + - name: "vpminnm{neon_type.no}" + doc: "Floating-point Minimum Number Pairwise (vector)." + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fminnmp]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - float32x2_t + - float64x2_t + - float32x4_t + compose: + - LLVMLink: + name: "vpminnm{neon_type.no}" + links: + - link: "llvm.aarch64.neon.fminnmp.{neon_type}" + arch: aarch64,arm64ec + + - name: "vqshlu{type[0]}" + doc: "Signed saturating shift left unsigned" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqshlu, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - [b_n_s8, i8, u8, '3', s8] + - [h_n_s16, i16, u16, '4', s16] + - [s_n_s32, i32, u32, '5', s32] + - [d_n_s64, i64, u64, '6', s64] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]] + - FnCall: + - simd_extract!
+ - - FnCall: + - "vqshlu_n_{type[4]}::" + - - FnCall: ["vdup_n_{type[4]}", [a]] + - '0' + + - name: "vcvta{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to unsigned integer, rounding to nearest with ties to away" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtau]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float32x2_t, uint32x2_t] + - [float32x4_t, uint32x4_t] + - [float64x1_t, uint64x1_t] + - [float64x2_t, uint64x2_t] + compose: + - LLVMLink: + name: "vcvta{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fcvtau.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvta{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to unsigned integer, rounding to nearest with ties to away" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtau]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t] + - [float16x8_t, uint16x8_t] + compose: + - LLVMLink: + name: "vcvta{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fcvtau.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvt{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to signed fixed-point, rounding toward zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzs]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [float64x1_t, int64x1_t] + - [float64x2_t, int64x2_t] + compose: + - LLVMLink: + name: "vcvt{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.fptosi.sat.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtm{type[2]}_{type[1]}_{type[0]}" + doc: "Floating-point convert to integer, rounding towards minus infinity" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtms]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "i32", 'h'] + - ["f16", "i64", 'h'] + compose: + - LLVMLink: + name: "vcvtm{type[2]}_{type[1]}_{type[0]}" + return_type: "{type[1]}" + links: + - link: "llvm.aarch64.neon.fcvtms.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtm{type[2]}_{type[1]}_{type[0]}" + doc: "Floating-point convert to integer, rounding towards minus infinity" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtms]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "i16", 'h', 'i32'] + compose: + - 'vcvtmh_{type[3]}_f16(a) as i16' + + + - name: "vcvtm{type[2]}_{type[1]}_{type[0]}" + doc: "Floating-point convert to unsigned integer, rounding towards minus infinity" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtmu]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "u32", 'h'] + - ["f16", "u64", 'h'] + compose: + - LLVMLink: + name: "vcvtm{type[2]}_{type[1]}_{type[0]}" + return_type: "{type[1]}" + links: + - link: "llvm.aarch64.neon.fcvtmu.{type[1]}.{type[0]}" + arch: aarch64,arm64ec + + - name: "vcvtm{type[2]}_{type[1]}_{type[0]}" + doc: "Floating-point convert to unsigned integer, rounding 
towards minus infinity" + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtmu]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["f16", "u16", 'h', 'u32'] + compose: + - 'vcvtmh_{type[3]}_f16(a) as u16' + + - name: "vmlal_high_n_{neon_type[1]}" + doc: "Multiply-add long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [smlal2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int32x4_t, int16x8_t, "i16"] + - [int64x2_t, int32x4_t, "i32"] + compose: + - FnCall: + - "vmlal_high_{neon_type[1]}" + - - a + - b + - FnCall: ["vdupq_n_{neon_type[1]}", [c]] + + - name: "vmlal_high_n_{neon_type[1]}" + doc: "Multiply-add long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [umlal2]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint32x4_t, uint16x8_t, "u16"] + - [uint64x2_t, uint32x4_t, "u32"] + compose: + - FnCall: + - "vmlal_high_{neon_type[1]}" + - - a + - b + - FnCall: ["vdupq_n_{neon_type[1]}", [c]] + + - name: "vmlal_high_lane{neon_type[2].no}" + doc: "Multiply-add long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [smlal2, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [int32x4_t, int16x8_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int32x4_t, int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int64x2_t, int32x4_t, int32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int64x2_t, int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - FnCall: ['vmlal_high_{neon_type[2]}', [a, b, {FnCall: [simd_shuffle!, [c, c, '{type[4]}']]}]] + + - name: "vmlal_high_lane{neon_type[2].no}" + doc: "Multiply-add long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [umlal2, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const LANE: i32'] + safety: safe + types: + - [uint32x4_t, uint16x8_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint32x4_t, uint16x8_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint64x2_t, uint32x4_t, uint32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint64x2_t, uint32x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - FnCall: ['vmlal_high_{neon_type[2]}', 
[a, b, {FnCall: [simd_shuffle!, [c, c, '{type[4]}']]}]] + + - name: "vrsrad_n_u64" + doc: "Unsigned rounding shift right and accumulate." + arguments: ["a: {type}", "b: {type}"] + return_type: "{type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [urshr, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + static_defs: ['const N: i32'] + safety: safe + types: + - "u64" + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 64']] + - Let: [b, u64, {FnCall: ["vrshrd_n_u64::", [b]]}] + - Identifier: ['a.wrapping_add(b)', Symbol] + + - name: "vcle{neon_type.no}" + doc: "Compare unsigned less than or equal" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [cmhs]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - uint64x1_t + - uint64x2_t + compose: + - FnCall: [simd_le, [a, b]] + + - name: "vld4{neon_type[1].dup_nox}" + doc: "Load single 4-element structure and replicate to all lanes of four registers" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld4r]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: + unsafe: [neon] + types: + - ["*const i64", int64x2x4_t, "v2i64"] + - ["*const f64", float64x1x4_t, "v1f64"] + - ["*const f64", float64x2x4_t, "v2f64"] + compose: + - LLVMLink: + name: "vld4{neon_type[1].dup_nox}" + arguments: + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.ld4r.{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: ["_vld4{neon_type[1].dup_nox}", ['a as _']] + + - name: "vld4{neon_type[1].dup_nox}" + doc: "Load single 4-element structure and replicate to all lanes of four registers" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld4r]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: + unsafe: [neon] + types: + - ["*const u64", uint64x2x4_t, "q_dup_s64"] + compose: + - FnCall: + - transmute + - - FnCall: ["vld4{type[2]}", [{FnCall: [transmute, [a]]}]] + + - name: "vld4{neon_type[1].dup_nox}" + doc: "Load single 4-element structure and replicate to all lanes of four registers" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [target_feature, ['enable = "neon,aes"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld4r]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: + unsafe: [neon] + types: + - ["*const p64", poly64x2x4_t, "q_dup_s64"] + compose: + - FnCall: + - transmute + - - FnCall: ["vld4{type[2]}", [{FnCall: [transmute, [a]]}]] + + - name: "vtbx4{neon_type[0].no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, int8x8x4_t] + compose: + - FnCall: + - "vqtbx2" + - - FnCall: [transmute, [a]] + - FnCall: + - transmute + - - FnCall: ["vcombine{neon_type[0].noq}", ["b.0", "b.1"]] + - FnCall: + - transmute + - - FnCall: ["vcombine{neon_type[0].noq}", ["b.2", "b.3"]] + - FnCall: [transmute, [c]] + + - name: "vtbx4{neon_type[0].no}" + doc: "Extended 
table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint8x8_t, uint8x8x4_t, uint8x8_t] + - [poly8x8_t, poly8x8x4_t, uint8x8_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vqtbx2" + - - FnCall: [transmute, [a]] + - FnCall: + - transmute + - - FnCall: ["vcombine{neon_type[0].noq}", ["b.0", "b.1"]] + - FnCall: + - transmute + - - FnCall: ["vcombine{neon_type[0].noq}", ["b.2", "b.3"]] + - c + + - name: "vtbl1{neon_type[0].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, 'int8x8_t', 'unsafe {{ transmute(b) }}'] + - [uint8x8_t, 'uint8x8_t', 'b'] + - [poly8x8_t, 'uint8x8_t', 'b'] + compose: + - FnCall: + - 'vqtbl1{neon_type[0].no}' + - - FnCall: + - 'vcombine{neon_type[0].no}' + - - a + - 'unsafe {{ crate::mem::zeroed() }}' + - Identifier: ['{type[2]}', Symbol] + + - name: "vtbl2{neon_type[1].noq}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8x2_t, 'int8x8_t'] + compose: + - FnCall: + - vqtbl1 + - - FnCall: + - transmute + - - FnCall: + - 'vcombine{neon_type[1].noq}' + - - 'a.0' + - 'a.1' + - FnCall: [transmute, [b]] + + - name: "vtbl2{neon_type[2].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint8x8x2_t, 'uint8x8_t', 'uint8x8_t'] + - [poly8x8x2_t, 'uint8x8_t', 'poly8x8_t'] + compose: + - FnCall: + - transmute + - - FnCall: + - vqtbl1 + - - FnCall: + - transmute + - - FnCall: + - 'vcombine{neon_type[2].noq}' + - - 'a.0' + - 'a.1' + - b + + - name: "vtbl3{neon_type[1].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8x3_t, 'int8x8_t', 'int8x16x2'] + compose: + - Let: + - x + - FnCall: + - '{type[2]}_t' + - - FnCall: ['vcombine{neon_type[1].no}', ['a.0', 'a.1']] + - FnCall: ['vcombine{neon_type[1].no}', ['a.2', 'unsafe {{ crate::mem::zeroed() }}']] + - FnCall: + - transmute + - - FnCall: + - vqtbl2 + - - FnCall: [transmute, ['x.0']] + - FnCall: [transmute, ['x.1']] + - FnCall: [transmute, [b]] + + - name: "vtbl3{neon_type[3].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[3]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint8x8x3_t, 'uint8x8_t', 'uint8x16x2', 'uint8x8_t'] + - [poly8x8x3_t, 'uint8x8_t', 'poly8x16x2', 'poly8x8_t'] + big_endian_inverse: true + compose: + - Let: + - x 
+ - FnCall: + - '{type[2]}_t' + - - FnCall: ['vcombine{neon_type[3].no}', ['a.0', 'a.1']] + - FnCall: ['vcombine{neon_type[3].no}', ['a.2', 'unsafe {{ crate::mem::zeroed() }}']] + - FnCall: + - transmute + - - FnCall: + - vqtbl2 + - - FnCall: [transmute, ['x.0']] + - FnCall: [transmute, ['x.1']] + - b + + - name: "vtbl4{neon_type[1].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8x4_t, 'int8x8_t', 'int8x16x2'] + compose: + - Let: + - x + - FnCall: + - '{type[2]}_t' + - - FnCall: ['vcombine{neon_type[1].no}', ['a.0', 'a.1']] + - FnCall: ['vcombine{neon_type[1].no}', ['a.2', 'a.3']] + - FnCall: + - transmute + - - FnCall: + - 'vqtbl2' + - - FnCall: [transmute, ['x.0']] + - FnCall: [transmute, ['x.1']] + - FnCall: [transmute, [b]] + + - name: "vtbl4{neon_type[3].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[3]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint8x8x4_t, 'uint8x8_t', 'uint8x16x2', 'uint8x8_t'] + - [poly8x8x4_t, 'uint8x8_t', 'poly8x16x2', 'poly8x8_t'] + big_endian_inverse: true + compose: + - Let: + - x + - FnCall: + - '{type[2]}_t' + - - FnCall: ['vcombine{neon_type[3].no}', ['a.0', 'a.1']] + - FnCall: ['vcombine{neon_type[3].no}', ['a.2', 'a.3']] + - FnCall: + - transmute + - - FnCall: + - 'vqtbl2' + - - FnCall: [transmute, ['x.0']] + - FnCall: [transmute, ['x.1']] + - b + + - name: "vqtbx1{neon_type[0].no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, int8x16_t, uint8x8_t, vqtbx1] + - [int8x16_t, int8x16_t, uint8x16_t, vqtbx1q] + compose: + - FnCall: ['{type[3]}', [a, b, c]] + + - name: "vqtbx1{type[4]}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint8x8_t, "uint8x16_t", uint8x8_t, "vqtbx1", "_u8"] + - [poly8x8_t, "poly8x16_t", uint8x8_t, "vqtbx1", "_p8"] + - [uint8x16_t, "uint8x16_t", uint8x16_t, "vqtbx1q", "q_u8"] + - [poly8x16_t, "poly8x16_t", uint8x16_t, "vqtbx1q", "q_p8"] + compose: + - Let: + - x + - FnCall: + - transmute + - - FnCall: + - "{type[3]}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + - c + - Identifier: [x, Symbol] + + - name: "vtbx1{neon_type[0].no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, "int8x8_t", "transmute(c)", "i8x8::splat(8)", "int8x8"] + - [uint8x8_t, "uint8x8_t", "c", "u8x8::splat(8)", "uint8x8"] + - [poly8x8_t, "uint8x8_t", "c", "u8x8::splat(8)", "uint8x8"] + compose: + 
- FnCall: + - simd_select + - - FnCall: + - "simd_lt::<{type[4]}_t, int8x8_t>" + - - c + - FnCall: [transmute, ["{type[3]}"]] + - FnCall: + - transmute + - - FnCall: + - "vqtbx1" + - - "transmute(a)" + - FnCall: + - transmute + - - FnCall: ["vcombine{neon_type[0].no}", [b, "crate::mem::zeroed()"]] + - "{type[2]}" + - a + + - name: "vtbx2{neon_type[0].no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, 'int8x8x2_t'] + compose: + - FnCall: + - vqtbx1 + - - FnCall: [transmute, [a]] + - FnCall: + - transmute + - - FnCall: ["vcombine{neon_type[0].no}", ['b.0', 'b.1']] + - FnCall: [transmute, [c]] + + - name: "vtbx2{neon_type[0].no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint8x8_t, 'uint8x8x2_t', uint8x8_t] + - [poly8x8_t, 'poly8x8x2_t', uint8x8_t] + compose: + - FnCall: + - transmute + - - FnCall: + - vqtbx1 + - - FnCall: [transmute, [a]] + - FnCall: + - transmute + - - FnCall: ["vcombine{neon_type[0].no}", ['b.0', 'b.1']] + - c + + - name: "vtbx3{neon_type[0].no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, 'int8x8x3_t', 'int8x16x2', 'i8x8::splat(24)', 'int8x8'] + compose: + - Let: + - x + - FnCall: + - '{type[2]}_t' + - - FnCall: ['vcombine{neon_type[0].no}', ['b.0', 'b.1']] + - FnCall: ['vcombine{neon_type[0].no}', ['b.2', 'unsafe {{ crate::mem::zeroed() }}']] + - FnCall: + - transmute + - - FnCall: + - simd_select + - - FnCall: + - 'simd_lt::<{type[4]}_t, int8x8_t>' + - - FnCall: [transmute, [c]] + - FnCall: [transmute, ['{type[3]}']] + - FnCall: + - transmute + - - FnCall: + - 'vqtbx2' + - - FnCall: [transmute, [a]] + - FnCall: [transmute, ['x.0']] + - FnCall: [transmute, ['x.1']] + - FnCall: [transmute, [c]] + - a + + - name: "vtbx3{neon_type[0].no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: uint8x8_t"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint8x8_t, 'uint8x8x3_t', 'uint8x16x2', 'u8x8::splat(24)', 'uint8x8'] + - [poly8x8_t, 'poly8x8x3_t', 'poly8x16x2', 'u8x8::splat(24)', 'poly8x8'] + big_endian_inverse: true + compose: + - Let: + - x + - FnCall: + - '{type[2]}_t' + - - FnCall: ['vcombine{neon_type[0].no}', ['b.0', 'b.1']] + - FnCall: ['vcombine{neon_type[0].no}', ['b.2', 'unsafe {{ crate::mem::zeroed() }}']] + - FnCall: + - transmute + - - FnCall: + - simd_select + - - FnCall: + - 'simd_lt::<{type[4]}_t, int8x8_t>' + - - FnCall: [transmute, [c]] + - FnCall: [transmute, ['{type[3]}']] + - FnCall: + - transmute + - - FnCall: + - 'vqtbx2' + - - FnCall: [transmute, [a]] + - FnCall: [transmute, ['x.0']] + - FnCall: [transmute, ['x.1']] + - c + - a + + - 
name: "vqtbl1{neon_type[3].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[3]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ['int8x16_t', uint8x8_t, 'vqtbl1', 'int8x8_t'] + - ['int8x16_t', uint8x16_t, 'vqtbl1q', 'int8x16_t'] + compose: + - FnCall: ['{type[2]}', ['a', b]] + + - name: "vqtbl1{neon_type[3].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[3]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ['uint8x16_t', uint8x8_t, 'vqtbl1', 'uint8x8_t'] + - ['poly8x16_t', uint8x8_t, 'vqtbl1', 'poly8x8_t'] + - ['uint8x16_t', uint8x16_t, 'vqtbl1q', 'uint8x16_t'] + - ['poly8x16_t', uint8x16_t, 'vqtbl1q', 'poly8x16_t'] + compose: + - Let: + - x + - FnCall: + - transmute + - - FnCall: + - '{type[2]}' + - - FnCall: [transmute, ['a']] + - b + - Identifier: [x, Symbol] + + - name: "vqtbl2{neon_type[3].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[3]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ['int8x16x2_t', uint8x8_t, 'vqtbl2', 'int8x8_t'] + - ['int8x16x2_t', uint8x16_t, 'vqtbl2q', 'int8x16_t'] + compose: + - FnCall: ['{type[2]}', ['a.0', 'a.1', b]] + + - name: "vqtbl2{neon_type[3].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[3]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ['uint8x16x2_t', uint8x8_t, 'vqtbl2', 'uint8x8_t'] + - ['uint8x16x2_t', uint8x16_t, 'vqtbl2q', 'uint8x16_t'] + - ['poly8x16x2_t', uint8x8_t, 'vqtbl2', 'poly8x8_t'] + - ['poly8x16x2_t', uint8x16_t, 'vqtbl2q', 'poly8x16_t'] + compose: + - FnCall: + - transmute + - - FnCall: + - '{type[2]}' + - - FnCall: [transmute, ['a.0']] + - FnCall: [transmute, ['a.1']] + - b + + - name: "vqtbx2{neon_type[0].no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, 'int8x16x2_t', uint8x8_t, 'vqtbx2'] + - [int8x16_t, 'int8x16x2_t', uint8x16_t, 'vqtbx2q'] + compose: + - FnCall: ['{type[3]}', [a, 'b.0', 'b.1', c]] + + - name: "vqtbx2{neon_type[0].no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint8x8_t, 'uint8x16x2_t', uint8x8_t, 'vqtbx2'] + - [uint8x16_t, 'uint8x16x2_t', uint8x16_t, 'vqtbx2q'] + - [poly8x8_t, 'poly8x16x2_t', uint8x8_t, 'vqtbx2'] + - [poly8x16_t, 'poly8x16x2_t', uint8x16_t, 'vqtbx2q'] + compose: + - FnCall: + - transmute + - - FnCall: + - '{type[3]}' + - - FnCall: [transmute, [a]] + - FnCall: [transmute, ['b.0']] + - FnCall: 
[transmute, ['b.1']] + - c + + - name: "vqtbl3{neon_type[0].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ['int8x8_t', 'int8x16x3_t', uint8x8_t, 'vqtbl3'] + - ['int8x16_t', 'int8x16x3_t', uint8x16_t, 'vqtbl3q'] + compose: + - FnCall: ['{type[3]}', ['a.0', 'a.1', 'a.2', b]] + + - name: "vqtbl3{neon_type[0].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ['uint8x8_t', 'uint8x16x3_t', uint8x8_t, 'vqtbl3'] + - ['uint8x16_t','uint8x16x3_t', uint8x16_t, 'vqtbl3q'] + - ['poly8x8_t', 'poly8x16x3_t', uint8x8_t, 'vqtbl3'] + - ['poly8x16_t','poly8x16x3_t', uint8x16_t, 'vqtbl3q'] + compose: + - FnCall: + - transmute + - - FnCall: + - '{type[3]}' + - - FnCall: [transmute, ['a.0']] + - FnCall: [transmute, ['a.1']] + - FnCall: [transmute, ['a.2']] + - b + + - name: "vqtbx3{neon_type[0].no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, 'int8x16x3_t', uint8x8_t, 'vqtbx3'] + - [int8x16_t, 'int8x16x3_t', uint8x16_t, 'vqtbx3q'] + compose: + - FnCall: ['{type[3]}', [a, 'b.0', 'b.1', 'b.2', c]] + + - name: "vqtbx3{neon_type[0].no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint8x8_t, 'uint8x16x3_t', uint8x8_t, 'vqtbx3'] + - [uint8x16_t, 'uint8x16x3_t', uint8x16_t, 'vqtbx3q'] + - [poly8x8_t, 'poly8x16x3_t', uint8x8_t, 'vqtbx3'] + - [poly8x16_t, 'poly8x16x3_t', uint8x16_t, 'vqtbx3q'] + compose: + - FnCall: + - transmute + - - FnCall: + - '{type[3]}' + - - FnCall: [transmute, [a]] + - FnCall: [transmute, ['b.0']] + - FnCall: [transmute, ['b.1']] + - FnCall: [transmute, ['b.2']] + - c + + - name: "vqtbl4{neon_type[3].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[3]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ['int8x16x4_t', uint8x8_t, 'vqtbl4', 'int8x8_t'] + - ['int8x16x4_t', uint8x16_t, 'vqtbl4q', 'int8x16_t'] + compose: + - FnCall: ['{type[2]}', ['a.0', 'a.1', 'a.2', 'a.3', b]] + + - name: "vqtbl4{neon_type[3].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[3]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ['uint8x16x4_t', uint8x8_t, 'vqtbl4', 'uint8x8_t'] + - ['uint8x16x4_t', uint8x16_t, 'vqtbl4q', 'uint8x16_t'] + - ['poly8x16x4_t', uint8x8_t, 'vqtbl4', 'poly8x8_t'] + - ['poly8x16x4_t', uint8x16_t, 'vqtbl4q', 
'poly8x16_t'] + compose: + - FnCall: + - transmute + - - FnCall: + - '{type[2]}' + - - FnCall: [transmute, ['a.0']] + - FnCall: [transmute, ['a.1']] + - FnCall: [transmute, ['a.2']] + - FnCall: [transmute, ['a.3']] + - b + + - name: "vqtbx4{neon_type[0].no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [int8x8_t, 'int8x16x4_t', uint8x8_t, 'vqtbx4'] + - [int8x16_t, 'int8x16x4_t', uint8x16_t, 'vqtbx4q'] + compose: + - FnCall: ['{type[3]}', [a, 'b.0', 'b.1', 'b.2', 'b.3', c]] + + - name: "vqtbx4{neon_type[0].no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [uint8x8_t, 'uint8x16x4_t', uint8x8_t, 'vqtbx4'] + - [uint8x16_t, 'uint8x16x4_t', uint8x16_t, 'vqtbx4q'] + - [poly8x8_t, 'poly8x16x4_t', uint8x8_t, 'vqtbx4'] + - [poly8x16_t, 'poly8x16x4_t', uint8x16_t, 'vqtbx4q'] + compose: + - FnCall: + - transmute + - - FnCall: + - '{type[3]}' + - - FnCall: [transmute, [a]] + - FnCall: [transmute, ['b.0']] + - FnCall: [transmute, ['b.1']] + - FnCall: [transmute, ['b.2']] + - FnCall: [transmute, ['b.3']] + - c + + - name: "{type[0]}" + visibility: private + doc: "Table look-up" + arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}"] + return_type: "{neon_type[3]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["vqtbl1", "int8x16_t", "uint8x8_t", "int8x8_t"] + - ["vqtbl1q", "int8x16_t", "uint8x16_t", "int8x16_t"] + compose: + - LLVMLink: + name: "_{type[0]}" + links: + - link: "llvm.aarch64.neon.tbl1.{neon_type[3]}" + arch: aarch64,arm64ec + + - name: "{type[0]}" + visibility: private + doc: "Table look-up" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[3]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["vqtbl2", "int8x16_t", "uint8x8_t", "int8x8_t"] + - ["vqtbl2q", "int8x16_t", "uint8x16_t", "int8x16_t"] + compose: + - LLVMLink: + name: "_{type[0]}" + links: + - link: "llvm.aarch64.neon.tbl2.{neon_type[3]}" + arch: aarch64,arm64ec + + - name: "{type[0]}" + visibility: private + doc: "Table look-up" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}", "c: {neon_type[1]}", "d: {neon_type[2]}"] + return_type: "{neon_type[3]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["vqtbl3", int8x16_t, uint8x8_t, int8x8_t] + - ["vqtbl3q", int8x16_t, uint8x16_t, int8x16_t] + compose: + - LLVMLink: + name: "_{type[0]}" + links: + - link: "llvm.aarch64.neon.tbl3.{neon_type[3]}" + arch: aarch64,arm64ec + + - name: "{type[0]}" + visibility: private + doc: "Table look-up" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}", "c: {neon_type[1]}", "d: {neon_type[1]}", "e: {neon_type[2]}"] + return_type: "{neon_type[3]}" + attr: + - FnCall: [cfg_attr, [test, 
{FnCall: [assert_instr, [tbl]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - ["vqtbl4", int8x16_t, uint8x8_t, int8x8_t] + - ["vqtbl4q", int8x16_t, uint8x16_t, int8x16_t] + compose: + - LLVMLink: + name: "_{type[0]}" + links: + - link: "llvm.aarch64.neon.tbl4.{neon_type[3]}" + arch: aarch64,arm64ec + + - name: "{type[0]}" + visibility: private + doc: "Extended table look-up" + arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}", "c: {neon_type[3]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [vqtbx1, "int8x8_t", "int8x16_t", "uint8x8_t"] + - [vqtbx1q, "int8x16_t", "int8x16_t", "uint8x16_t"] + compose: + - LLVMLink: + name: "_{type[0]}" + links: + - link: "llvm.aarch64.neon.tbx1.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "{type[0]}" + visibility: private + doc: "Extended table look-up" + arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}", "c: {neon_type[2]}", "d: {neon_type[3]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [vqtbx2, "int8x8_t", "int8x16_t", "uint8x8_t"] + - [vqtbx2q, "int8x16_t", "int8x16_t", "uint8x16_t"] + compose: + - LLVMLink: + name: "_{type[0]}" + links: + - link: "llvm.aarch64.neon.tbx2.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "{type[0]}" + visibility: private + doc: "Extended table look-up" + arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}", "c: {neon_type[2]}", "d: {neon_type[2]}", "e: {neon_type[3]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [vqtbx3, "int8x8_t", "int8x16_t", "uint8x8_t"] + - [vqtbx3q, "int8x16_t", "int8x16_t", "uint8x16_t"] + compose: + - LLVMLink: + name: "_{type[0]}" + links: + - link: "llvm.aarch64.neon.tbx3.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "{type[0]}" + visibility: private + doc: "Extended table look-up" + arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}", "c: {neon_type[2]}", "d: {neon_type[2]}", "e: {neon_type[2]}", "f: {neon_type[3]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [tbx]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: safe + types: + - [vqtbx4, "int8x8_t", "int8x16_t", "uint8x8_t"] + - [vqtbx4q, "int8x16_t", "int8x16_t", "uint8x16_t"] + compose: + - LLVMLink: + name: "_{type[0]}" + links: + - link: "llvm.aarch64.neon.tbx4.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vld1{neon_type[1].no}" + doc: "Load multiple single-element structures to one, two, three, or four registers" + arguments: ["ptr: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [target_feature, ['enable = "{type[2]}"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ldr]]}]] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: + unsafe: [neon] + types: + - ['*const i8', int8x8_t, "neon"] + - ['*const i8', int8x16_t, "neon"] + - ['*const i16', int16x4_t, "neon"] + - ['*const i16', int16x8_t, "neon"] + - ['*const i32', int32x2_t, "neon"] + - ['*const i32', int32x4_t, "neon"] + - ['*const i64', int64x1_t, "neon"] + - ['*const i64', 
int64x2_t, "neon"] + - ['*const u8', uint8x8_t, "neon"] + - ['*const u8', uint8x16_t, "neon"] + - ['*const u16', uint16x4_t, "neon"] + - ['*const u16', uint16x8_t, "neon"] + - ['*const u32', uint32x2_t, "neon"] + - ['*const u32', uint32x4_t, "neon"] + - ['*const u64', uint64x1_t, "neon"] + - ['*const u64', uint64x2_t, "neon"] + - ['*const p8', poly8x8_t, "neon"] + - ['*const p8', poly8x16_t, "neon"] + - ['*const p16', poly16x4_t, "neon"] + - ['*const p16', poly16x8_t, "neon"] + - ['*const p64', poly64x1_t, "neon,aes"] + - ['*const p64', poly64x2_t, "neon,aes"] + - ['*const f32', float32x2_t, "neon"] + - ['*const f32', float32x4_t, "neon"] + - ['*const f64', float64x1_t, "neon"] + - ['*const f64', float64x2_t, "neon"] + compose: + - FnCall: + - 'crate::ptr::read_unaligned' + - - MethodCall: + - ptr + - cast + - [] + + - name: "vld1{neon_type[1].no}" + doc: "Load multiple single-element structures to one, two, three, or four registers" + arguments: ["ptr: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [target_feature, ['enable = "{type[2]}"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ldr]]}]] + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ['*const f16', float16x4_t, "neon,fp16"] + - ['*const f16', float16x8_t, "neon,fp16"] + compose: + - FnCall: + - 'crate::ptr::read_unaligned' + - - MethodCall: + - ptr + - cast + - [] + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures from one, two, three, or four registers." + arguments: ["ptr: {type[0]}", "a: {neon_type[1]}"] + attr: + - FnCall: [target_feature, ['enable = "{type[2]}"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [str]]}]] + - FnCall: [allow, ['clippy::cast_ptr_alignment']] + - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + safety: + unsafe: [neon] + types: + - ['*mut i8', int8x8_t, "neon"] + - ['*mut i8', int8x16_t, "neon"] + - ['*mut i16', int16x4_t, "neon"] + - ['*mut i16', int16x8_t, "neon"] + - ['*mut i32', int32x2_t, "neon"] + - ['*mut i32', int32x4_t, "neon"] + - ['*mut i64', int64x1_t, "neon"] + - ['*mut i64', int64x2_t, "neon"] + - ['*mut u8', uint8x8_t, "neon"] + - ['*mut u8', uint8x16_t, "neon"] + - ['*mut u16', uint16x4_t, "neon"] + - ['*mut u16', uint16x8_t, "neon"] + - ['*mut u32', uint32x2_t, "neon"] + - ['*mut u32', uint32x4_t, "neon"] + - ['*mut u64', uint64x1_t, "neon"] + - ['*mut u64', uint64x2_t, "neon"] + - ['*mut p8', poly8x8_t, "neon"] + - ['*mut p8', poly8x16_t, "neon"] + - ['*mut p16', poly16x4_t, "neon"] + - ['*mut p16', poly16x8_t, "neon"] + - ['*mut p64', poly64x1_t, "neon,aes"] + - ['*mut p64', poly64x2_t, "neon,aes"] + - ['*mut f32', float32x2_t, "neon"] + - ['*mut f32', float32x4_t, "neon"] + - ['*mut f64', float64x1_t, "neon"] + - ['*mut f64', float64x2_t, "neon"] + compose: + - FnCall: + - 'crate::ptr::write_unaligned' + - - MethodCall: + - ptr + - cast + - [] + - a + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures from one, two, three, or four registers." 
+ arguments: ["ptr: {type[0]}", "a: {neon_type[1]}"] + attr: + - FnCall: [target_feature, ['enable = "{type[2]}"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [str]]}]] + - FnCall: [allow, ['clippy::cast_ptr_alignment']] + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ['*mut f16', float16x4_t, "neon,fp16"] + - ['*mut f16', float16x8_t, "neon,fp16"] + compose: + - FnCall: + - 'crate::ptr::write_unaligned' + - - MethodCall: + - ptr + - cast + - [] + - a + + - name: "__crc32d" + doc: "CRC32 single round checksum for quad words (64 bits)." + arguments: ["crc: {type[0]}", "data: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [target_feature, ['enable = "crc"']] + - *target-not-arm + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["crc32x"]] }]] + - *aarch64-crc-stable + safety: safe + types: + - [u32, u64] + compose: + - LLVMLink: + name: "crc32x" + arguments: + - "crc: u32" + - "data: u64" + links: + - link: "llvm.aarch64.crc32x" + arch: aarch64,arm64ec + + - name: "__crc32cd" + doc: "CRC32-C single round checksum for quad words (64 bits)." + arguments: ["crc: {type[0]}", "data: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [target_feature, ['enable = "crc"']] + - *target-not-arm + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["crc32cx"]] }]] + - *aarch64-crc-stable + safety: safe + types: + - [u32, u64] + compose: + - LLVMLink: + name: "crc32cx" + arguments: + - "crc: u32" + - "data: u64" + links: + - link: "llvm.aarch64.crc32cx" + arch: aarch64,arm64ec + + - name: "{type[0]}" + doc: "Absolute Value (wrapping)." + arguments: ["a: {type[1]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: [abs] + safety: safe + types: + - ['vabsd_s64', i64, i64] + - ['vabs_s64', int64x1_t, v1i64] + - ['vabsq_s64', int64x2_t, v2i64] + compose: + - LLVMLink: + name: "{type[0]}" + links: + - link: "llvm.aarch64.neon.abs.{type[2]}" + arch: aarch64,arm64ec + + - name: "vuqadd{neon_type[0].no}" + doc: "Signed saturating Accumulate of Unsigned value." + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-stable + assert_instr: [suqadd] + safety: safe + types: + - [int8x8_t, uint8x8_t] + - [int8x16_t, uint8x16_t] + - [int16x4_t, uint16x4_t] + - [int16x8_t, uint16x8_t] + - [int32x2_t, uint32x2_t] + - [int32x4_t, uint32x4_t] + - [int64x1_t, uint64x1_t] + - [int64x2_t, uint64x2_t] + compose: + - LLVMLink: + name: "vuqadd{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.suqadd.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vsqadd{neon_type[0].no}" + doc: "Unsigned saturating Accumulate of Signed value." 
+ arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-stable + assert_instr: [usqadd] + safety: safe + types: + - [uint8x8_t, int8x8_t] + - [uint8x16_t, int8x16_t] + - [uint16x4_t, int16x4_t] + - [uint16x8_t, int16x8_t] + - [uint32x2_t, int32x2_t] + - [uint32x4_t, int32x4_t] + - [uint64x1_t, int64x1_t] + - [uint64x2_t, int64x2_t] + compose: + - LLVMLink: + name: "vsqadd{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.usqadd.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vpadd{neon_type.no}" + doc: "Add Pairwise" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-stable + assert_instr: [addp] + safety: safe + types: + - int8x16_t + - int16x8_t + - int32x4_t + - int64x2_t + compose: + - LLVMLink: + name: "vpadd{neon_type.no}" + links: + - link: "llvm.aarch64.neon.addp.{neon_type}" + arch: aarch64,arm64ec + + - name: "vpadd{neon_type[0].no}" + doc: "Add Pairwise" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-stable + assert_instr: [addp] + safety: safe + types: + - [uint8x16_t, int8x16_t] + - [uint16x8_t, int16x8_t] + - [uint32x4_t, int32x4_t] + - [uint64x2_t, int64x2_t] + compose: + - FnCall: + - transmute + - - FnCall: + - 'vpadd{neon_type[1].no}' + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vpaddd_s64" + doc: "Add pairwise" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: [addp] + safety: safe + types: + - [int64x2_t, i64] + compose: + - FnCall: + - transmute + - - FnCall: + - "vaddvq_u64" + - - FnCall: [transmute, [a]] + + - name: "vpaddd_u64" + doc: "Add pairwise" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: [addp] + safety: safe + types: + - [uint64x2_t, u64] + compose: + - FnCall: [vaddvq_u64, [a]] + + - name: "vaddv{neon_type[0].no}" + doc: "Add across vector" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: [addv] + safety: safe + types: + - [int8x8_t, i8] + - [int16x4_t, i16] + - [int8x16_t, i8] + - [int16x8_t, i16] + - [int32x4_t, i32] + compose: + - LLVMLink: + name: "vaddv{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.saddv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vaddv{neon_type[0].no}" + doc: "Add across vector" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: [addp] + safety: safe + types: + - [int32x2_t, i32] + compose: + - LLVMLink: + name: "vaddv{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.saddv.i32.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vaddv{neon_type[0].no}" + doc: "Add across vector" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: [addp] + safety: safe + types: + - [int64x2_t, i64] + compose: + - LLVMLink: + name: "vaddv{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.saddv.i64.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vaddv{neon_type[0].no}" + doc: "Add across vector" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: [addv] + safety: safe + types: + - [uint8x8_t, u8] + - [uint16x4_t, u16] + - [uint8x16_t, u8] + - [uint16x8_t, u16] + - [uint32x4_t, u32] + compose: + - LLVMLink: + name: "vaddv{neon_type[0].no}" + links: + - link: 
"llvm.aarch64.neon.uaddv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vaddv{neon_type[0].no}" + doc: "Add across vector" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: [addp] + safety: safe + types: + - [uint32x2_t, u32, i32] + compose: + - LLVMLink: + name: "vaddv{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.uaddv.{type[2]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vaddv{neon_type[0].no}" + doc: "Add across vector" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: [addp] + safety: safe + types: + - [uint64x2_t, u64, i64] + compose: + - LLVMLink: + name: "vaddv{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.uaddv.{type[2]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vaddlv{neon_type[0].no}" + doc: "Signed Add Long across Vector" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: [saddlv] + safety: safe + types: + - [int8x8_t, i16] + - [int8x16_t, i16] + compose: + - LLVMLink: + name: "vaddlv{neon_type[0].no}" + return_type: "i32" + links: + - link: "llvm.aarch64.neon.saddlv.i32.{neon_type[0]}" + arch: aarch64,arm64ec + - Identifier: ["unsafe {{ _vaddlv{neon_type[0].no}(a) as i16 }}", Symbol] + + - name: "vaddlv{neon_type[0].no}" + doc: "Unsigned Add Long across Vector" + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: [uaddlv] + safety: safe + types: + - [uint8x8_t, u16] + - [uint8x16_t, u16] + compose: + - LLVMLink: + name: "vaddlv{neon_type[0].no}" + return_type: "i32" + links: + - link: "llvm.aarch64.neon.uaddlv.i32.{neon_type[0]}" + arch: aarch64,arm64ec + - Identifier: ["unsafe {{ _vaddlv{neon_type[0].no}(a) as u16 }}", Symbol] + + - name: "vmaxv{neon_type[0].no}" + doc: "Horizontal vector max." + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: ['{type[2]}'] + safety: safe + types: + - [int8x8_t, i8, 'smaxv'] + - [int16x4_t, i16, 'smaxv'] + - [int32x2_t, i32, 'smaxp'] + - [int8x16_t, i8, 'smaxv'] + - [int16x8_t, i16, 'smaxv'] + - [int32x4_t, i32, 'smaxv'] + compose: + - LLVMLink: + name: "vmaxv{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.smaxv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vmaxv{neon_type[0].no}" + doc: "Horizontal vector max." + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: ['{type[2]}'] + safety: safe + types: + - [uint8x8_t, u8, 'umaxv'] + - [uint16x4_t, u16, 'umaxv'] + - [uint32x2_t, u32, 'umaxp'] + - [uint8x16_t, u8, 'umaxv'] + - [uint16x8_t, u16, 'umaxv'] + - [uint32x4_t, u32, 'umaxv'] + compose: + - LLVMLink: + name: "vmaxv{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.umaxv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vmaxv{neon_type[0].no}" + doc: "Horizontal vector max." + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: ['{type[2]}'] + safety: safe + types: + - [float32x2_t, f32, 'fmaxp'] + - [float32x4_t, f32, 'fmaxv'] + - [float64x2_t, f64, 'fmaxp'] + compose: + - LLVMLink: + name: "vmaxv{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.fmaxv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vminv{neon_type[0].no}" + doc: "Horizontal vector min." 
+ arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: ['{type[2]}'] + safety: safe + types: + - [int8x8_t, i8, 'sminv'] + - [int16x4_t, i16, 'sminv'] + - [int32x2_t, i32, 'sminp'] + - [int8x16_t, i8, 'sminv'] + - [int16x8_t, i16, 'sminv'] + - [int32x4_t, i32, 'sminv'] + compose: + - LLVMLink: + name: "vminv{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.sminv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vminv{neon_type[0].no}" + doc: "Horizontal vector min." + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: ['{type[2]}'] + safety: safe + types: + - [uint8x8_t, u8, 'uminv'] + - [uint16x4_t, u16, 'uminv'] + - [uint32x2_t, u32, 'uminp'] + - [uint8x16_t, u8, 'uminv'] + - [uint16x8_t, u16, 'uminv'] + - [uint32x4_t, u32, 'uminv'] + compose: + - LLVMLink: + name: "vminv{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.uminv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vminv{neon_type[0].no}" + doc: "Horizontal vector min." + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-stable + assert_instr: ['{type[2]}'] + safety: safe + types: + - [float32x2_t, f32, 'fminp'] + - [float32x4_t, f32, 'fminv'] + - [float64x2_t, f64, 'fminp'] + compose: + - LLVMLink: + name: "vminv{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.fminv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vpmin{neon_type.no}" + doc: "Folding minimum of adjacent pairs" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-stable + assert_instr: ['sminp'] + safety: safe + types: + - int8x16_t + - int16x8_t + - int32x4_t + compose: + - LLVMLink: + name: "vpmin{neon_type.no}" + links: + - link: "llvm.aarch64.neon.sminp.{neon_type}" + arch: aarch64,arm64ec + + - name: "vpmin{neon_type.no}" + doc: "Folding minimum of adjacent pairs" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-stable + assert_instr: ['uminp'] + safety: safe + types: + - uint8x16_t + - uint16x8_t + - uint32x4_t + compose: + - LLVMLink: + name: "vpmin{neon_type.no}" + links: + - link: "llvm.aarch64.neon.uminp.{neon_type}" + arch: aarch64,arm64ec + + - name: "vpmin{neon_type.no}" + doc: "Folding minimum of adjacent pairs" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-stable + assert_instr: ['fminp'] + safety: safe + types: + - float32x4_t + - float64x2_t + compose: + - LLVMLink: + name: "vpmin{neon_type.no}" + links: + - link: "llvm.aarch64.neon.fminp.{neon_type}" + arch: aarch64,arm64ec + + - name: "vpmax{neon_type.no}" + doc: "Folding maximum of adjacent pairs" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-stable + assert_instr: ['smaxp'] + safety: safe + types: + - int8x16_t + - int16x8_t + - int32x4_t + compose: + - LLVMLink: + name: "vpmax{neon_type.no}" + links: + - link: "llvm.aarch64.neon.smaxp.{neon_type}" + arch: aarch64,arm64ec + + - name: "vpmax{neon_type.no}" + doc: "Folding maximum of adjacent pairs" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-stable + assert_instr: ['umaxp'] + safety: safe + types: + - uint8x16_t + - uint16x8_t + - uint32x4_t + compose: + - LLVMLink: + name: "vpmax{neon_type.no}" + links: + - link: "llvm.aarch64.neon.umaxp.{neon_type}" + arch: aarch64,arm64ec + + - name: "vpmax{neon_type.no}" + doc: 
"Folding maximum of adjacent pairs" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-stable + assert_instr: ['fmaxp'] + safety: safe + types: + - float32x4_t + - float64x2_t + compose: + - LLVMLink: + name: "vpmax{neon_type.no}" + links: + - link: "llvm.aarch64.neon.fmaxp.{neon_type}" + arch: aarch64,arm64ec + + - name: "vsli{neon_type[0].N}" + doc: "Shift Left and Insert (immediate)" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sli, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x8_t, 'static_assert_uimm_bits!', 'N, 3'] + - [int8x16_t, 'static_assert_uimm_bits!', 'N, 3'] + - [int16x4_t, 'static_assert_uimm_bits!', 'N, 4'] + - [int16x8_t, 'static_assert_uimm_bits!', 'N, 4'] + - [int32x2_t, 'static_assert!', 'N >= 0 && N <= 31'] + - [int32x4_t, 'static_assert!', 'N >= 0 && N <= 31'] + - [int64x1_t, 'static_assert!', 'N >= 0 && N <= 63'] + - [int64x2_t, 'static_assert!', 'N >= 0 && N <= 63'] + compose: + - FnCall: ['{type[1]}', ['{type[2]}']] + - LLVMLink: + name: "vsli{neon_type[0].N}" + arguments: + - "a: {neon_type[0]}" + - "b: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.vsli.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vsli{neon_type[0].N}", [a, b, N], [], true] + + - name: "vsli{neon_type[0].N}" + doc: "Shift Left and Insert (immediate)" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "{type[4]}"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sli, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint8x8_t, int8x8_t, 'static_assert_uimm_bits!', 'N, 3', "neon"] + - [uint8x16_t, int8x16_t, 'static_assert_uimm_bits!', 'N, 3', "neon"] + - [uint16x4_t, int16x4_t, 'static_assert_uimm_bits!', 'N, 4', "neon"] + - [uint16x8_t, int16x8_t, 'static_assert_uimm_bits!', 'N, 4', "neon"] + - [uint32x2_t, int32x2_t, 'static_assert!', 'N >= 0 && N <= 31', "neon"] + - [uint32x4_t, int32x4_t, 'static_assert!', 'N >= 0 && N <= 31', "neon"] + - [uint64x1_t, int64x1_t, 'static_assert!', 'N >= 0 && N <= 63', "neon"] + - [uint64x2_t, int64x2_t, 'static_assert!', 'N >= 0 && N <= 63', "neon"] + - [poly8x8_t, int8x8_t, 'static_assert_uimm_bits!', 'N, 3', "neon"] + - [poly8x16_t, int8x16_t, 'static_assert_uimm_bits!', 'N, 3', "neon"] + - [poly16x4_t, int16x4_t, 'static_assert_uimm_bits!', 'N, 4', "neon"] + - [poly16x8_t, int16x8_t, 'static_assert_uimm_bits!', 'N, 4', "neon"] + - [poly64x1_t, int64x1_t, 'static_assert!', 'N >= 0 && N <= 63', "neon,aes"] + - [poly64x2_t, int64x2_t, 'static_assert!', 'N >= 0 && N <= 63', "neon,aes"] + compose: + - FnCall: ['{type[2]}', ['{type[3]}']] + - FnCall: + - transmute + - - FnCall: + - 'vsli{neon_type[1].N}::' + - - FnCall: + - transmute + - - a + - FnCall: + - transmute + - - b + + - name: "vsri{neon_type[0].N}" + doc: "Shift Right and Insert (immediate)" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sri, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x8_t, 'N >= 1 && N <= 8'] + - [int8x16_t, 'N >= 1 && N <= 8'] + 
- [int16x4_t, 'N >= 1 && N <= 16'] + - [int16x8_t, 'N >= 1 && N <= 16'] + - [int32x2_t, 'N >= 1 && N <= 32'] + - [int32x4_t, 'N >= 1 && N <= 32'] + - [int64x1_t, 'N >= 1 && N <= 64'] + - [int64x2_t, 'N >= 1 && N <= 64'] + compose: + - FnCall: ['static_assert!', ['{type[1]}']] + - LLVMLink: + name: "vsri{neon_type[0].N}" + arguments: + - "a: {neon_type[0]}" + - "b: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.vsri.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vsri{neon_type[0].N}", [a, b, N], [], true] + + - name: "vsri{neon_type[0].N}" + doc: "Shift Right and Insert (immediate)" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "{type[3]}"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sri, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint8x8_t, int8x8_t, 'N >= 1 && N <= 8', "neon"] + - [uint8x16_t, int8x16_t, 'N >= 1 && N <= 8', "neon"] + - [uint16x4_t, int16x4_t, 'N >= 1 && N <= 16', "neon"] + - [uint16x8_t, int16x8_t, 'N >= 1 && N <= 16', "neon"] + - [uint32x2_t, int32x2_t, 'N >= 1 && N <= 32', "neon"] + - [uint32x4_t, int32x4_t, 'N >= 1 && N <= 32', "neon"] + - [uint64x1_t, int64x1_t, 'N >= 1 && N <= 64', "neon"] + - [uint64x2_t, int64x2_t, 'N >= 1 && N <= 64', "neon"] + - [poly8x8_t, int8x8_t, 'N >= 1 && N <= 8', "neon"] + - [poly8x16_t, int8x16_t, 'N >= 1 && N <= 8', "neon"] + - [poly16x4_t, int16x4_t, 'N >= 1 && N <= 16', "neon"] + - [poly16x8_t, int16x8_t, 'N >= 1 && N <= 16', "neon"] + - [poly64x1_t, int64x1_t, 'N >= 1 && N <= 64', "neon,aes"] + - [poly64x2_t, int64x2_t, 'N >= 1 && N <= 64', "neon,aes"] + compose: + - FnCall: ['static_assert!', ['{type[2]}']] + - FnCall: + - transmute + - - FnCall: + - 'vsri{neon_type[1].N}::' + - - FnCall: + - transmute + - - a + - FnCall: + - transmute + - - b + + - name: "vfmlal{type[2]}{neon_type[1]}" + doc: "Floating-point fused Multiply-Add Long to accumulator (vector)." + arguments: ["r: {neon_type[0]}", "a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-fp16 + - *enable-fhm + - *neon-unstable-f16 + assert_instr: [fmlal2] + safety: safe + types: + - [float32x2_t, float16x4_t, '_high_'] + - [float32x4_t, float16x8_t, 'q_high_'] + compose: + - LLVMLink: + name: "vfmlal{type[2]}.{neon_type[0]}.{neon_type[1]}" + links: + - link: "llvm.aarch64.neon.fmlal2.{neon_type[0]}.{neon_type[1]}" + arch: aarch64,arm64ec + + + - name: "vfmlal{type[3]}{neon_type[1]}" + doc: "Floating-point fused Multiply-Add Long to accumulator (by element)." 
+ arguments: ["r: {neon_type[0]}", "a: {neon_type[1]}", "b: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmlal2, 'LANE = 0']]}]] + - *neon-fp16 + - *enable-fhm + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float32x2_t, float16x4_t, float16x4_t, '_lane_high_', '_high_', '2'] + - [float32x2_t, float16x4_t, float16x8_t, '_laneq_high_', '_high_', '3'] + - [float32x4_t, float16x8_t, float16x4_t, 'q_lane_high_', 'q_high_', '2'] + - [float32x4_t, float16x8_t, float16x8_t, 'q_laneq_high_', 'q_high_', '3'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[5]}"]] + - FnCall: + - "vfmlal{type[4]}{neon_type[1]}" + - - r + - a + - FnCall: ["vdup{neon_type[1].N}", [{FnCall: [simd_extract!, [b, 'LANE as u32']]}]] + + + - name: "vfmlal{type[2]}{neon_type[1]}" + doc: "Floating-point fused Multiply-Add Long to accumulator (vector)." + arguments: ["r: {neon_type[0]}", "a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-fp16 + - *enable-fhm + - *neon-unstable-f16 + assert_instr: [fmlal] + safety: safe + types: + - [float32x2_t, float16x4_t, '_low_'] + - [float32x4_t, float16x8_t, 'q_low_'] + compose: + - LLVMLink: + name: "vfmlal{type[2]}.{neon_type[0]}.{neon_type[1]}" + links: + - link: "llvm.aarch64.neon.fmlal.{neon_type[0]}.{neon_type[1]}" + arch: aarch64,arm64ec + + + - name: "vfmlal{type[3]}{neon_type[1]}" + doc: "Floating-point fused Multiply-Add Long to accumulator (by element)." + arguments: ["r: {neon_type[0]}", "a: {neon_type[1]}", "b: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmlal, 'LANE = 0']]}]] + - *neon-fp16 + - *enable-fhm + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float32x2_t, float16x4_t, float16x4_t, '_lane_low_', '_low_', '2'] + - [float32x2_t, float16x4_t, float16x8_t, '_laneq_low_', '_low_', '3'] + - [float32x4_t, float16x8_t, float16x4_t, 'q_lane_low_', 'q_low_', '2'] + - [float32x4_t, float16x8_t, float16x8_t, 'q_laneq_low_', 'q_low_', '3'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[5]}"]] + - FnCall: + - "vfmlal{type[4]}{neon_type[1]}" + - - r + - a + - FnCall: ["vdup{neon_type[1].N}", [{FnCall: [simd_extract!, [b, 'LANE as u32']]}]] + + + - name: "vfmlsl{type[2]}{neon_type[1]}" + doc: "Floating-point fused Multiply-Subtract Long from accumulator (vector)." + arguments: ["r: {neon_type[0]}", "a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-fp16 + - *enable-fhm + - *neon-unstable-f16 + assert_instr: [fmlsl2] + safety: safe + types: + - [float32x2_t, float16x4_t, '_high_'] + - [float32x4_t, float16x8_t, 'q_high_'] + compose: + - LLVMLink: + name: "vfmlsl{type[2]}.{neon_type[0]}.{neon_type[1]}" + links: + - link: "llvm.aarch64.neon.fmlsl2.{neon_type[0]}.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vfmlsl{type[3]}{neon_type[1]}" + doc: "Floating-point fused Multiply-Subtract Long from accumulator (by element)." 
+ arguments: ["r: {neon_type[0]}", "a: {neon_type[1]}", "b: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmlsl2, 'LANE = 0']]}]] + - *neon-fp16 + - *enable-fhm + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float32x2_t, float16x4_t, float16x4_t, '_lane_high_', '_high_', '2'] + - [float32x2_t, float16x4_t, float16x8_t, '_laneq_high_', '_high_', '3'] + - [float32x4_t, float16x8_t, float16x4_t, 'q_lane_high_', 'q_high_', '2'] + - [float32x4_t, float16x8_t, float16x8_t, 'q_laneq_high_', 'q_high_', '3'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[5]}"]] + - FnCall: + - "vfmlsl{type[4]}{neon_type[1]}" + - - r + - a + - FnCall: ["vdup{neon_type[1].N}", [{FnCall: [simd_extract!, [b, 'LANE as u32']]}]] + + + - name: "vfmlsl{type[2]}{neon_type[1]}" + doc: "Floating-point fused Multiply-Subtract Long from accumulator (vector)." + arguments: ["r: {neon_type[0]}", "a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-fp16 + - *enable-fhm + - *neon-unstable-f16 + assert_instr: [fmlsl] + safety: safe + types: + - [float32x2_t, float16x4_t, '_low_'] + - [float32x4_t, float16x8_t, 'q_low_'] + compose: + - LLVMLink: + name: "vfmlsl{type[2]}.{neon_type[0]}.{neon_type[1]}" + links: + - link: "llvm.aarch64.neon.fmlsl.{neon_type[0]}.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vfmlsl{type[3]}{neon_type[1]}" + doc: "Floating-point fused Multiply-Subtract Long from accumulator (by element)." + arguments: ["r: {neon_type[0]}", "a: {neon_type[1]}", "b: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fmlsl, 'LANE = 0']]}]] + - *neon-fp16 + - *enable-fhm + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float32x2_t, float16x4_t, float16x4_t, '_lane_low_', '_low_', '2'] + - [float32x2_t, float16x4_t, float16x8_t, '_laneq_low_', '_low_', '3'] + - [float32x4_t, float16x8_t, float16x4_t, 'q_lane_low_', 'q_low_', '2'] + - [float32x4_t, float16x8_t, float16x8_t, 'q_laneq_low_', 'q_low_', '3'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[5]}"]] + - FnCall: + - "vfmlsl{type[4]}{neon_type[1]}" + - - r + - a + - FnCall: ["vdup{neon_type[1].N}", [{FnCall: [simd_extract!, [b, 'LANE as u32']]}]] + + - name: "vamax{neon_type.no}" + doc: "Multi-vector floating-point absolute maximum" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,faminmax"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [unstable, ['feature = "faminmax"', 'issue = "137933"']] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x2_t + compose: + - LLVMLink: + name: "_vamax{neon_type.no}" + links: + - link: "llvm.aarch64.neon.famax.{neon_type}" + arch: aarch64,arm64ec + + - name: "vamin{neon_type.no}" + doc: "Multi-vector floating-point absolute minimum" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "neon,faminmax"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [unstable, ['feature = "faminmax"', 'issue = "137933"']] + safety: safe + types: + - float32x2_t + - float32x4_t + - float64x2_t + compose: + - LLVMLink: + name: "_vamin{neon_type.no}" 
+ links: + - link: "llvm.aarch64.neon.famin.{neon_type}" + arch: aarch64,arm64ec + + - name: "vluti2{neon_type[0].lane_nox}" + doc: "Lookup table read with 2-bit indices" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [target_feature, ['enable = {type[4]}']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'LANE = 1']]}]] + - *neon-unstable-feat-lut + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [int8x8_t, uint8x8_t, int8x16_t, 'LANE >= 0 && LANE <= 1', '"neon,lut"'] + - [int8x16_t, uint8x8_t, int8x16_t, 'LANE >= 0 && LANE <= 1', '"neon,lut"'] + - [int16x4_t, uint8x8_t, int16x8_t, 'LANE >= 0 && LANE <= 3', '"neon,lut"'] + - [int16x8_t, uint8x8_t, int16x8_t, 'LANE >= 0 && LANE <= 3', '"neon,lut"'] + compose: + - FnCall: ['static_assert!', ['{type[3]}']] + - LLVMLink: + name: "vluti2{neon_type[0].lane_nox}" + arguments: + - 'a: {neon_type[0]}' + - 'b: {neon_type[1]}' + - 'n: i32' + links: + - link: "llvm.aarch64.neon.vluti2.lane.{neon_type[2]}.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ['_vluti2{neon_type[0].lane_nox}', [a, b, LANE]] + + - name: "vluti2{neon_type[0].lane_nox}" + doc: "Lookup table read with 2-bit indices" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [target_feature, ['enable = "neon,lut"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'LANE = 1']]}]] + - *neon-unstable-feat-lut + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [uint8x8_t, uint8x8_t, uint8x16_t, 'LANE >= 0 && LANE <= 1', 'int8x8_t'] + - [uint8x16_t, uint8x8_t, uint8x16_t, 'LANE >= 0 && LANE <= 1', 'int8x16_t'] + - [poly8x8_t, uint8x8_t, poly8x16_t, 'LANE >= 0 && LANE <= 1', 'int8x8_t'] + - [poly8x16_t, uint8x8_t, poly8x16_t, 'LANE >= 0 && LANE <= 1', 'int8x16_t'] + - [uint16x4_t, uint8x8_t, uint16x8_t, 'LANE >= 0 && LANE <= 3', 'int16x4_t'] + - [uint16x8_t, uint8x8_t, uint16x8_t, 'LANE >= 0 && LANE <= 3', 'int16x8_t'] + - [poly16x4_t, uint8x8_t, poly16x8_t, 'LANE >= 0 && LANE <= 3', 'int16x4_t'] + - [poly16x8_t, uint8x8_t, poly16x8_t, 'LANE >= 0 && LANE <= 3', 'int16x8_t'] + compose: + - FnCall: ['static_assert!', ['{type[3]}']] + - FnCall: + - transmute + - - FnCall: + - 'vluti2{neon_type[4].lane_nox}::' + - - FnCall: [transmute, [a]] + - b + + - name: "vluti4{neon_type[0].lane_nox}" + doc: "Lookup table read with 4-bit indices" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = {type[3]}']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - *neon-unstable-feat-lut + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [int8x16_t, uint8x8_t, 'LANE == 0', '"neon,lut"'] + compose: + - FnCall: ['static_assert!', ['{type[2]}']] + - LLVMLink: + name: "vluti4{neon_type[0].lane_nox}" + arguments: + - 'a: {neon_type[0]}' + - 'b: {neon_type[1]}' + - 'n: i32' + links: + - link: "llvm.aarch64.neon.vluti4q.lane.{neon_type[1]}" + arch: aarch64,arm64ec + - FnCall: ['_vluti4{neon_type[0].lane_nox}', [a, b, LANE]] + + - name: "vluti4{neon_type[0].lane_nox}" + doc: "Lookup table read with 4-bit indices" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = 
"neon,lut"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - *neon-unstable-feat-lut + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [uint8x16_t, uint8x8_t, 'LANE == 0', int8x16_t] + - [poly8x16_t, uint8x8_t, 'LANE == 0', int8x16_t] + compose: + - FnCall: ['static_assert!', ['{type[2]}']] + - FnCall: + - transmute + - - FnCall: + - 'vluti4{neon_type[3].lane_nox}::' + - - FnCall: [transmute, [a]] + - b + + - name: "vluti4{neon_type[0].laneq_nox}" + doc: "Lookup table read with 4-bit indices" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,lut"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - *neon-unstable-feat-lut + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [int8x16_t, uint8x16_t, 'LANE >= 0 && LANE <= 1'] + compose: + - FnCall: ['static_assert!', ['{type[2]}']] + - LLVMLink: + name: "vluti4{neon_type[0].laneq_nox}" + arguments: + - 'a: {neon_type[0]}' + - 'b: {neon_type[1]}' + - 'n: i32' + links: + - link: "llvm.aarch64.neon.vluti4q.laneq.{neon_type[1]}" + arch: aarch64,arm64ec + - FnCall: ['_vluti4{neon_type[0].laneq_nox}', [a, b, LANE]] + + - name: "vluti4{neon_type[0].laneq_nox}" + doc: "Lookup table read with 4-bit indices" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "neon,lut"']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - *neon-unstable-feat-lut + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [uint8x16_t, uint8x16_t, 'LANE >= 0 && LANE <= 1', int8x16_t] + - [poly8x16_t, uint8x16_t, 'LANE >= 0 && LANE <= 1', int8x16_t] + compose: + - FnCall: ['static_assert!', ['{type[2]}']] + - FnCall: + - transmute + - - FnCall: + - 'vluti4{neon_type[3].laneq_nox}::' + - - FnCall: [transmute, [a]] + - b + + - name: "vluti4q_lane_{neon_type[0]}_x2" + doc: "Lookup table read with 4-bit indices" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [target_feature, ['enable = {type[4]}']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - *neon-unstable-feat-lut + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [int16x8x2_t, uint8x8_t, int16x8_t, 'LANE >= 0 && LANE <= 1', '"neon,lut"'] + compose: + - FnCall: ['static_assert!', ['{type[3]}']] + - LLVMLink: + name: "vluti4q_lane_{neon_type[0]}_x2" + arguments: + - 'a: {neon_type[2]}' + - 'a: {neon_type[2]}' + - 'b: {neon_type[1]}' + - 'n: i32' + links: + - link: "llvm.aarch64.neon.vluti4q.lane.x2.{neon_type[2]}" + arch: aarch64,arm64ec + - FnCall: ['_vluti4q_lane_{neon_type[0]}_x2', ['a.0', 'a.1', b, LANE]] + + - name: "vluti4q_lane_{neon_type[0]}_x2" + doc: "Lookup table read with 4-bit indices" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [target_feature, ['enable = {type[4]}']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - *neon-unstable-feat-lut + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [uint16x8x2_t, 
uint8x8_t, uint16x8_t, 'LANE >= 0 && LANE <= 1', '"neon,lut"', int16x8x2_t] + - [poly16x8x2_t, uint8x8_t, poly16x8_t, 'LANE >= 0 && LANE <= 1', '"neon,lut"', int16x8x2_t] + - [float16x8x2_t, uint8x8_t, float16x8_t, 'LANE >= 0 && LANE <= 1', '"neon,lut,fp16"', int16x8x2_t] + compose: + - FnCall: ['static_assert!', ['{type[3]}']] + - FnCall: + - transmute + - - FnCall: + - 'vluti4q_lane_{neon_type[5]}_x2::' + - - FnCall: [transmute, [a]] + - b + + - name: "vluti4q_laneq_{neon_type[0]}_x2" + doc: "Lookup table read with 4-bit indices" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [target_feature, ['enable = {type[4]}']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'LANE = 3']]}]] + - *neon-unstable-feat-lut + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [int16x8x2_t, uint8x16_t, int16x8_t, 'LANE >= 0 && LANE <= 3', '"neon,lut"'] + compose: + - FnCall: ['static_assert!', ['{type[3]}']] + - LLVMLink: + name: "vluti4{neon_type[0].lane_nox}" + arguments: + - 'a: {neon_type[2]}' + - 'b: {neon_type[2]}' + - 'c: {neon_type[1]}' + - 'n: i32' + links: + - link: "llvm.aarch64.neon.vluti4q.laneq.x2.{neon_type[2]}" + arch: aarch64,arm64ec + - FnCall: ['_vluti4q_laneq_{neon_type[0]}_x2', ['a.0', 'a.1', b, LANE]] + + - name: "vluti4q_laneq_{neon_type[0]}_x2" + doc: "Lookup table read with 4-bit indices" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [target_feature, ['enable = {type[4]}']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'LANE = 3']]}]] + - *neon-unstable-feat-lut + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [uint16x8x2_t, uint8x16_t, uint16x8_t, 'LANE >= 0 && LANE <= 3', '"neon,lut"', int16x8x2_t] + - [poly16x8x2_t, uint8x16_t, poly16x8_t, 'LANE >= 0 && LANE <= 3', '"neon,lut"', int16x8x2_t] + - [float16x8x2_t, uint8x16_t, float16x8_t, 'LANE >= 0 && LANE <= 3', '"neon,lut,fp16"', int16x8x2_t] + compose: + - FnCall: ['static_assert!', ['{type[3]}']] + - FnCall: + - transmute + - - FnCall: + - 'vluti4q_laneq_{neon_type[5]}_x2::' + - - FnCall: [transmute, [a]] + - b diff --git a/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml new file mode 100644 index 0000000000000..118f5808f758f --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -0,0 +1,15071 @@ +arch_cfgs: + - arch_name: aarch64 + target_feature: [neon] + llvm_prefix: llvm.aarch64.neon +# Generate big endian shuffles +auto_big_endian: true + +# Repeatedly used anchors +# #[stable(feature = "neon_intrinsics", since = "1.59.0")] +neon-stable: &neon-stable + FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']] + +# #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] +neon-cfg-arm-unstable: &neon-cfg-arm-unstable + FnCall: ['cfg_attr', ['target_arch = "arm"', {FnCall: ['unstable', ['feature = "stdarch_arm_neon_intrinsics"', 'issue = "111800"']]}]] + +# #[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")] +neon-arm-unstable: &neon-arm-unstable + FnCall: ['unstable', ['feature = "stdarch_arm_neon_intrinsics"', 'issue = "111800"']] + +# #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +neon-v7: &neon-v7 + FnCall: 
[cfg_attr, ['target_arch = "arm"', { FnCall: [target_feature, [ 'enable = "v7"']]} ]] + +# #[target_feature(enable = "neon,v7")] +enable-v7: &enable-v7 + FnCall: [target_feature, ['enable = "neon,v7"']] + +# #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +neon-v8: &neon-v8 + FnCall: [cfg_attr, ['target_arch = "arm"', { FnCall: [target_feature, [ 'enable = "v8"']]} ]] + +target-is-arm: &target-is-arm + FnCall: [cfg, ['target_arch = "arm"']] + +# #[cfg(not(target_arch = "arm"))] +target-not-arm: &target-not-arm + FnCall: [cfg, [{ FnCall: [not, ['target_arch = "arm"']]}]] + +not-arm: ¬-arm + FnCall: [not, ['target_arch = "arm"']] + +neon-target-aarch64-arm64ec: &neon-target-aarch64-arm64ec + FnCall: [all, [test, {FnCall: [any, ['target_arch = "aarch64"', 'target_arch = "arm64ec"']]}]] + +# #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] +neon-not-arm-stable: &neon-not-arm-stable + FnCall: [cfg_attr, [{ FnCall: [not, ['target_arch = "arm"']]}, {FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']]}]] + +# #[cfg_attr(all(test, not(target_env = "msvc"))] +msvc-disabled: &msvc-disabled + FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]] + +# all(test, target_arch = "arm") +test-is-arm: &test-is-arm + FnCall: [all, [test, 'target_arch = "arm"']] + +# #[target_feature(enable = "neon,aes")] +neon-aes: &neon-aes + FnCall: [target_feature, ['enable = "neon,aes"']] + +# #[target_feature(enable = "neon,i8mm")] +neon-i8mm: &neon-i8mm + FnCall: [target_feature, ['enable = "neon,i8mm"']] + +# #[target_feature(enable = "neon,fp16")] +neon-fp16: &neon-fp16 + FnCall: [target_feature, ['enable = "neon,fp16"']] + +enable-fcma: &enable-fcma + FnCall: [cfg_attr, [{ FnCall: [not, ['target_arch = "arm"']]}, { FnCall: [target_feature, ['enable = "fcma"']] }]] + +#[cfg_attr(not(target_arch = "arm"), unstable(feature = "stdarch_neon_i8mm", issue = "117223"))] +neon-unstable-i8mm: &neon-unstable-i8mm + FnCall: [cfg_attr, [{ FnCall: [not, ['target_arch = "arm"']] }, { FnCall: [unstable, ['feature = "stdarch_neon_i8mm"', 'issue = "117223"']] } ]] + +# #[unstable(feature = "stdarch_neon_fcma", issue = "117222")] +neon-unstable-fcma: &neon-unstable-fcma + FnCall: [unstable, ['feature = "stdarch_neon_fcma"', 'issue = "117222"']] + +arm-crc-unstable: &arm-crc-unstable + FnCall: [cfg_attr, ['target_arch = "arm"', {FnCall: [unstable, ['feature = "stdarch_aarch32_crc32"', 'issue = "125085"']]}]] + +aarch64-crc-stable: &aarch64-crc-stable + FnCall: [cfg_attr, [{FnCall: [not, ['target_arch = "arm"']]}, {FnCall: [stable, ['feature = "stdarch_aarch64_crc32"', 'since = "1.80.0"']]}]] + +# #[unstable(feature = "stdarch_neon_f16", issue = "136306")] +neon-unstable-f16: &neon-unstable-f16 + FnCall: [unstable, ['feature = "stdarch_neon_f16"', 'issue = "136306"']] + +intrinsics: + - name: "vand{neon_type.no}" + doc: Vector bitwise and + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vand]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [and]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int8x16_t + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + - uint8x8_t + - uint8x16_t + - uint16x4_t + - uint16x8_t + - uint32x2_t + - uint32x4_t + - int64x1_t + - int64x2_t + - uint64x1_t + - uint64x2_t + compose: + - FnCall: + - simd_and + - - a + - b + + - 
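The vand/vorr/veor entries in this file compose directly to the portable simd_and/simd_or/simd_xor operations, so the generated functions are plain lane-wise bitwise ops. As a quick orientation, a small usage sketch of the generated vand_s8 intrinsic follows; it is not part of the vendored file and assumes an aarch64 target (where NEON is enabled by default), using the standard vld1_s8/vst1_s8 NEON load/store intrinsics to move lanes in and out of arrays.

#[cfg(target_arch = "aarch64")]
fn demo_vand_s8() {
    use core::arch::aarch64::*;
    let a: [i8; 8] = [0b1100; 8];
    let b: [i8; 8] = [0b1010; 8];
    let mut out = [0i8; 8];
    unsafe {
        let va = vld1_s8(a.as_ptr());
        let vb = vld1_s8(b.as_ptr());
        // vand_s8 is the lane-wise `a & b` described by the spec entry above.
        vst1_s8(out.as_mut_ptr(), vand_s8(va, vb));
    }
    assert!(out.iter().all(|&x| x == 0b1000));
}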
name: "vorr{neon_type.no}" + doc: "Vector bitwise or (immediate, inclusive)" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vorr]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [orr]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int8x16_t + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + - uint8x8_t + - uint8x16_t + - uint16x4_t + - uint16x8_t + - uint32x2_t + - uint32x4_t + - int64x1_t + - int64x2_t + - uint64x1_t + - uint64x2_t + compose: + - FnCall: + - simd_or + - - a + - b + + - name: "veor{neon_type.no}" + doc: Vector bitwise exclusive or (vector) + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [veor]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [eor]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int8x16_t + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + - uint8x8_t + - uint8x16_t + - uint16x4_t + - uint16x8_t + - uint32x2_t + - uint32x4_t + - int64x1_t + - int64x2_t + - uint64x1_t + - uint64x2_t + compose: + - FnCall: + - simd_xor + - - a + - b + + - name: "vabd{neon_type.no}" + doc: Absolute difference between the arguments + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vabd.{neon_type}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sabd]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int8x16_t + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + compose: + - LLVMLink: + name: "sabd.{neon_type}" + links: + - link: "llvm.aarch64.neon.sabd.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vabds.{neon_type}" + arch: arm + + - name: "vabd{neon_type.no}" + doc: Absolute difference between the arguments + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vabd.{neon_type}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uabd]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - uint8x8_t + - uint8x16_t + - uint16x4_t + - uint16x8_t + - uint32x2_t + - uint32x4_t + compose: + - LLVMLink: + name: "uabd.{neon_type}" + links: + - link: "llvm.aarch64.neon.uabd.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vabdu.{neon_type}" + arch: arm + + - name: "vabd{neon_type.no}" + doc: Absolute difference between the arguments of Floating + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vabd.f32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fabd]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + - float32x4_t + compose: + - LLVMLink: + name: "fabd.{neon_type}" + links: + - link: "llvm.arm.neon.vabds.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.fabd.{neon_type}" + arch: aarch64,arm64ec + + - name: "vabd{neon_type.no}" + doc: Absolute difference between the arguments of 
Floating + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vabd.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fabd]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "fabd.{neon_type}" + links: + - link: "llvm.arm.neon.vabds.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.fabd.{neon_type}" + arch: aarch64,arm64ec + + - name: "vabdl{neon_type[0].noq}" + doc: Signed Absolute difference Long + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: + - cfg_attr + - - FnCall: + - all + - - test + - 'target_arch = "arm"' + - FnCall: + - assert_instr + - - '"vabdl.{neon_type[0]}"' + - FnCall: + - cfg_attr + - - FnCall: + - all + - - test + - FnCall: + - any + - - 'target_arch = "aarch64"' + - 'target_arch = "arm64ec"' + - FnCall: + - assert_instr + - - sabdl + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, int16x8_t, uint8x8_t] + - [int16x4_t, int32x4_t, uint16x4_t] + - [int32x2_t, int64x2_t, uint32x2_t] + compose: + - Let: + - c + - "{neon_type[2]}" + - FnCall: + - simd_cast + - - FnCall: + - "vabd_{neon_type[0]}" + - - a + - b + - FnCall: + - simd_cast + - - c + + - name: "vceq{neon_type[0].no}" + doc: "Compare bitwise Equal (vector)" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vceq{type[2]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [cmeq]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint8x8_t, uint8x8_t, ".i8"] + - [uint8x16_t, uint8x16_t, ".i8"] + - [int8x8_t, uint8x8_t, ".i8"] + - [int8x16_t, uint8x16_t, ".i8"] + - [poly8x8_t, uint8x8_t, ".i8"] + - [poly8x16_t, uint8x16_t, ".i8"] + - [uint16x4_t, uint16x4_t, ".i16"] + - [uint16x8_t, uint16x8_t, ".i16"] + - [int16x4_t, uint16x4_t, ".i16"] + - [int16x8_t, uint16x8_t, ".i16"] + - [uint32x2_t, uint32x2_t, ".i32"] + - [uint32x4_t, uint32x4_t, ".i32"] + - [int32x2_t, uint32x2_t, ".i32"] + - [int32x4_t, uint32x4_t, ".i32"] + compose: + - FnCall: [simd_eq, [a, b]] + + - name: "vceq{neon_type[0].no}" + doc: "Floating-point compare equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vceq.f32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcmeq]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, uint32x2_t] + - [float32x4_t, uint32x4_t] + compose: + - FnCall: [simd_eq, [a, b]] + + + - name: "vceq{neon_type[0].no}" + doc: "Floating-point compare equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vceq.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcmeq]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t] + - [float16x8_t, uint16x8_t] + compose: + - FnCall: [simd_eq, [a, b]] + + - name: "vtst{neon_type[0].no}" + doc: "Signed compare bitwise Test bits nonzero" + arguments: ["a: 
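The vabdl entries compose the widening absolute difference by taking vabd at the narrow width, reinterpreting the result as unsigned via simd_cast, and only then widening; that keeps differences such as |100 - (-100)| = 200 representable even though they overflow the signed element type. A small usage sketch, not part of the vendored file and assuming an aarch64 target with NEON enabled by default:

#[cfg(target_arch = "aarch64")]
fn demo_vabdl_s8() {
    use core::arch::aarch64::*;
    let a: [i8; 8] = [100, -100, 0, 1, 2, 3, 4, 5];
    let b: [i8; 8] = [-100, 100, 0, 0, 0, 0, 0, 0];
    let mut wide = [0i16; 8];
    unsafe {
        // Absolute difference of i8 lanes, widened to i16 lanes.
        let r = vabdl_s8(vld1_s8(a.as_ptr()), vld1_s8(b.as_ptr()));
        vst1q_s16(wide.as_mut_ptr(), r);
    }
    // 200 does not fit in an i8 lane, but survives in the widened result.
    assert_eq!(&wide[..2], &[200, 200]);
}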
{neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vtst]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [cmtst]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, uint8x8_t, i8x8, 'i8x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [int8x16_t, uint8x16_t, i8x16, 'i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)'] + - [int16x4_t, uint16x4_t, i16x4, 'i16x4::new(0, 0, 0, 0)'] + - [int16x8_t, uint16x8_t, i16x8, 'i16x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [int32x2_t, uint32x2_t, i32x2, 'i32x2::new(0, 0)'] + - [int32x4_t, uint32x4_t, i32x4, 'i32x4::new(0, 0, 0, 0)'] + - [poly8x8_t, uint8x8_t, i8x8, 'i8x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [poly8x16_t, uint8x16_t, i8x16, 'i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)'] + - [poly16x4_t, uint16x4_t, i16x4, 'i16x4::new(0, 0, 0, 0)'] + - [poly16x8_t, uint16x8_t, i16x8, 'i16x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + compose: + - Let: [c, "{neon_type[0]}", {FnCall: [simd_and, [a, b]]}] + - Let: [d, "{type[2]}", "{type[3]}"] + - FnCall: [simd_ne, [c, {FnCall: [transmute, [d]]}]] + + - name: "vabs{neon_type.no}" + doc: "Floating-point absolute value" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vabs]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fabs]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + - float32x4_t + compose: + - FnCall: [simd_fabs, [a]] + + - name: "vabs{neon_type.no}" + doc: "Floating-point absolute value" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vabs]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fabs]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - FnCall: [simd_fabs, [a]] + + - name: "vabs{type[0]}" + doc: "Floating-point absolute value" + arguments: ["a: {type[1]}"] + return_type: "{type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vabs]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fabs]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ['h_f16', 'f16'] + compose: + - FnCall: + - simd_extract! 
+ - - FnCall: + - "vabs_{type[1]}" + - - FnCall: ["vdup_n_{type[1]}", [a]] + - 0 + + - name: "vcgt{neon_type[0].no}" + doc: "Compare signed greater than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcgt.{type[2]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [cmgt]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, uint8x8_t, "s8"] + - [int8x16_t, uint8x16_t, "s8"] + - [int16x4_t, uint16x4_t, s16] + - [int16x8_t, uint16x8_t, s16] + - [int32x2_t, uint32x2_t, "s32"] + - [int32x4_t, uint32x4_t, "s32"] + compose: + - FnCall: [simd_gt, [a, b]] + + - name: "vcgt{neon_type.no}" + doc: "Compare unsigned greater than" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcgt.{neon_type}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [cmhi]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - uint8x8_t + - uint8x16_t + - uint16x4_t + - uint16x8_t + - uint32x2_t + - uint32x4_t + compose: + - FnCall: [simd_gt, [a, b]] + + - name: "vcgt{neon_type[0].no}" + doc: "Floating-point compare greater than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcgt.f32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcmgt]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, uint32x2_t] + - [float32x4_t, uint32x4_t] + compose: + - FnCall: [simd_gt, [a, b]] + + + - name: "vcgt{neon_type[0].no}" + doc: "Floating-point compare greater than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcgt.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcmgt]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t] + - [float16x8_t, uint16x8_t] + compose: + - FnCall: [simd_gt, [a, b]] + + + - name: "vcgtz{neon_type[0].no}" + doc: "Floating-point compare greater than zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcgt.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcmgt]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t, f16x4, 'f16x4::new(0.0, 0.0, 0.0, 0.0)'] + - [float16x8_t, uint16x8_t, f16x8, 'f16x8::new(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)'] + compose: + - Let: [b, "{type[2]}", "{type[3]}"] + - FnCall: [simd_gt, [a, {FnCall: [transmute, [b]]}]] + + - name: "vclt{neon_type[0].no}" + doc: "Compare signed less than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcgt.{neon_type[0]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [cmgt]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, uint8x8_t] + - [int8x16_t, uint8x16_t] + - 
[int16x4_t, uint16x4_t] + - [int16x8_t, uint16x8_t] + - [int32x2_t, uint32x2_t] + - [int32x4_t, uint32x4_t] + compose: + - FnCall: [simd_lt, [a, b]] + + - name: "vcle{neon_type[0].no}" + doc: "Compare signed less than or equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcge.{neon_type[0]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [cmge]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, uint8x8_t] + - [int8x16_t, uint8x16_t] + - [int16x4_t, uint16x4_t] + - [int16x8_t, uint16x8_t] + - [int32x2_t, uint32x2_t] + - [int32x4_t, uint32x4_t] + compose: + - FnCall: [simd_le, [a, b]] + + - name: "vcle{neon_type[0].no}" + doc: "Floating-point compare less than or equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcge.f32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcmge]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, uint32x2_t] + - [float32x4_t, uint32x4_t] + compose: + - FnCall: [simd_le, [a, b]] + + + - name: "vcle{neon_type[0].no}" + doc: "Floating-point compare less than or equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcge.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcmge]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t] + - [float16x8_t, uint16x8_t] + compose: + - FnCall: [simd_le, [a, b]] + + - name: "vclez{neon_type[0].no}" + doc: "Floating-point compare less than or equal to zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcle.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcmle]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t, f16x4, 'f16x4::new(0.0, 0.0, 0.0, 0.0)'] + - [float16x8_t, uint16x8_t, f16x8, 'f16x8::new(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)'] + compose: + - Let: [b, "{type[2]}", "{type[3]}"] + - FnCall: + - simd_le + - - a + - FnCall: [transmute, [b]] + + - name: "vcge{neon_type[0].no}" + doc: "Compare signed greater than or equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcge.{neon_type[0]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [cmge]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, uint8x8_t] + - [int8x16_t, uint8x16_t] + - [int16x4_t, uint16x4_t] + - [int16x8_t, uint16x8_t] + - [int32x2_t, uint32x2_t] + - [int32x4_t, uint32x4_t] + compose: + - FnCall: [simd_ge, [a, b]] + + - name: "vcls{neon_type.no}" + doc: "Count leading sign bits" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcls.{neon_type}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [cls]]}]] + - 
*neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int8x16_t + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + compose: + - LLVMLink: + name: "vcls{neon_type.no}" + links: + - link: "llvm.arm.neon.vcls.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.cls.{neon_type}" + arch: aarch64,arm64ec + + - name: "vcls{neon_type[0].no}" + doc: "Count leading sign bits" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vcls]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [cls]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint8x8_t, int8x8_t] + - [uint8x16_t, int8x16_t] + - [uint16x4_t, int16x4_t] + - [uint16x8_t, int16x8_t] + - [uint32x2_t, int32x2_t] + - [uint32x4_t, int32x4_t] + compose: + - FnCall: + - "vcls{neon_type[1].no}" + - - FnCall: [transmute, [a]] + + - name: "vclz{neon_type[0].no}" + doc: "Count leading zero bits" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vclz.i8"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [clz]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint8x8_t, int8x8_t] + - [uint8x16_t, int8x16_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vclz{neon_type[1].no}" + - - FnCall: [transmute, [a]] + + - name: "vclz{neon_type[0].no}" + doc: "Count leading zero bits" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vclz{type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [clz]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, '.i8'] + - [int8x16_t, '.i8'] + - [int16x4_t, '.i16'] + - [int16x8_t, '.i16'] + - [int32x2_t, '.i32'] + - [int32x4_t, '.i32'] + compose: + - FnCall: [simd_ctlz, [a]] + + - name: "vclz{neon_type[0].no}" + doc: "Count leading zero bits" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vclz{type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [clz]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint32x2_t, '.i32', int32x2_t] + - [uint32x4_t, '.i32', int32x4_t] + - [uint16x4_t, '.i16', int16x4_t] + - [uint16x8_t, '.i16', int16x8_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vclz{neon_type[2].no}" + - - FnCall: [transmute, [a]] + + - name: "vcagt{neon_type[0].no}" + doc: "Floating-point absolute compare greater than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vacgt.f32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [facgt]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, uint32x2_t] + - [float32x4_t, uint32x4_t] + compose: + - LLVMLink: + name: "vcagt{neon_type[0].no}" + links: + - link: "llvm.arm.neon.vacgt.{neon_type[1]}.{neon_type[0]}" + arch: arm + - link: "llvm.aarch64.neon.facgt.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + + - name: 
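The absolute-compare entries (vcagt here, and the vcage/vcalt/vcale variants that follow) lower to LLVM's facgt/facge (vacgt/vacge on Arm) intrinsics rather than portable simd_* operations; like the other comparison intrinsics they produce an all-ones or all-zero mask per lane. A small usage sketch, not part of the vendored file and assuming an aarch64 target with NEON enabled by default:

#[cfg(target_arch = "aarch64")]
fn demo_vcagt_f32() {
    use core::arch::aarch64::*;
    let a = [-3.0f32, 1.0];
    let b = [2.0f32, -4.0];
    let mut mask = [0u32; 2];
    unsafe {
        // Compares |a| > |b| lane-wise and yields a per-lane mask.
        let r = vcagt_f32(vld1_f32(a.as_ptr()), vld1_f32(b.as_ptr()));
        vst1_u32(mask.as_mut_ptr(), r);
    }
    // |-3.0| > |2.0| holds, |1.0| > |-4.0| does not.
    assert_eq!(mask, [u32::MAX, 0]);
}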
"vcagt{neon_type[0].no}" + doc: "Floating-point absolute compare greater than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vacgt.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [facgt]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t] + - [float16x8_t, uint16x8_t] + compose: + - LLVMLink: + name: "vcagt{neon_type[0].no}" + links: + - link: "llvm.arm.neon.vacgt.{neon_type[1]}.{neon_type[0]}" + arch: arm + - link: "llvm.aarch64.neon.facgt.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vcage{neon_type[0].no}" + doc: "Floating-point absolute compare greater than or equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vacge.f32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [facge]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, uint32x2_t] + - [float32x4_t, uint32x4_t] + compose: + - LLVMLink: + name: "vcage{neon_type[0].no}" + links: + - link: "llvm.arm.neon.vacge.{neon_type[1]}.{neon_type[0]}" + arch: arm + - link: "llvm.aarch64.neon.facge.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vcage{neon_type[0].no}" + doc: "Floating-point absolute compare greater than or equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vacge.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [facge]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t] + - [float16x8_t, uint16x8_t] + compose: + - LLVMLink: + name: "vcage{neon_type[0].no}" + links: + - link: "llvm.arm.neon.vacge.{neon_type[1]}.{neon_type[0]}" + arch: arm + - link: "llvm.aarch64.neon.facge.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vcalt{neon_type[0].no}" + doc: "Floating-point absolute compare less than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vacgt.f32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [facgt]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, uint32x2_t] + - [float32x4_t, uint32x4_t] + compose: + - FnCall: ["vcagt{neon_type[0].no}", [b, a]] + + - name: "vcalt{neon_type[0].no}" + doc: "Floating-point absolute compare less than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vacgt.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [facgt]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t] + - [float16x8_t, uint16x8_t] + compose: + - FnCall: ["vcagt{neon_type[0].no}", [b, a]] + + - name: "vcale{neon_type[0].no}" + doc: "Floating-point absolute compare less than or equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, 
{FnCall: [assert_instr, ['"vacge.f32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [facge]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, uint32x2_t] + - [float32x4_t, uint32x4_t] + compose: + - FnCall: ["vcage{neon_type[0].no}", [b, a]] + + + - name: "vcale{neon_type[0].no}" + doc: "Floating-point absolute compare less than or equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vacge.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [facge]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t] + - [float16x8_t, uint16x8_t] + compose: + - FnCall: ["vcage{neon_type[0].no}", [b, a]] + + - name: "vcvt{neon_type[1].no}_{neon_type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vcvt]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [scvtf]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int32x2_t, float32x2_t] + - [int32x4_t, float32x4_t] + compose: + - FnCall: [simd_cast, [a]] + + - name: "vcvt{neon_type[1].no}_{neon_type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vcvt]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [scvtf]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [int16x4_t, float16x4_t] + - [int16x8_t, float16x8_t] + compose: + - FnCall: [simd_cast, [a]] + + - name: "vcvt{neon_type[1].no}_{neon_type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vcvt]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ucvtf]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint32x2_t, float32x2_t] + - [uint32x4_t, float32x4_t] + compose: + - FnCall: [simd_cast, [a]] + + - name: "vcvt{neon_type[1].no}_{neon_type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vcvt]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ucvtf]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [uint16x4_t, float16x4_t] + - [uint16x8_t, float16x8_t] + compose: + - FnCall: [simd_cast, [a]] + + - name: "vcvt{neon_type[1].N}_{neon_type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vcvt, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint32x2_t, float32x2_t] + - [uint32x4_t, float32x4_t] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 32']] + - LLVMLink: + name: "vcvt{neon_type[1].N}_{neon_type[0]}" + 
arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.arm.neon.vcvtfxu2fp.{neon_type[1]}.{neon_type[0]}" + arch: arm + - FnCall: ["_vcvt{neon_type[1].N}_{neon_type[0]}", ["a", N], [], true] + + - name: "vcvt{neon_type[1].N}_{neon_type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ucvtf, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint32x2_t, float32x2_t] + - [uint32x4_t, float32x4_t] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 32']] + - LLVMLink: + name: "vcvt{neon_type[1].N}_{neon_type[0]}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.vcvtfxu2fp.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{neon_type[1].N}_{neon_type[0]}", ["a", N], [], true] + + - name: "vcvt{neon_type[1].N}_{neon_type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcvt"', 'N = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ucvtf, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - [uint16x4_t, float16x4_t] + - [uint16x8_t, float16x8_t] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 16']] + - LLVMLink: + name: "vcvt{neon_type[1].N}_{neon_type[0]}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.arm.neon.vcvtfxu2fp.{neon_type[1]}.{neon_type[0]}" + arch: arm + - link: "llvm.aarch64.neon.vcvtfxu2fp.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{neon_type[1].N}_{neon_type[0]}", ["a", N], [], true] + + + - name: "vcvt{neon_type[1].N}_{neon_type[0]}" + doc: "Floating-point convert to signed fixed-point" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcvt"', 'N = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcvtzs, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - [float16x4_t, int16x4_t] + - [float16x8_t, int16x8_t] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 16']] + - LLVMLink: + name: "vcvt{neon_type[1].N}_{neon_type[0]}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.arm.neon.vcvtfp2fxs.{neon_type[1]}.{neon_type[0]}" + arch: arm + - link: "llvm.aarch64.neon.vcvtfp2fxs.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{neon_type[1].N}_{neon_type[0]}", [a, N], [], true] + + + - name: "vcvt{neon_type[1].N}_{neon_type[0]}" + doc: "Fixed-point convert to unsigned fixed-point, rounding toward zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcvt"', 'N = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcvtzu, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + 
static_defs: ['const N: i32'] + safety: safe + types: + - [float16x4_t, uint16x4_t] + - [float16x8_t, uint16x8_t] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 16']] + - LLVMLink: + name: "vcvt{neon_type[1].N}_{neon_type[0]}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.arm.neon.vcvtfp2fxu.{neon_type[1]}.{neon_type[0]}" + arch: arm + - link: "llvm.aarch64.neon.vcvtfp2fxu.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{neon_type[1].N}_{neon_type[0]}", ["a", N], [], true] + + - name: "vcvt{neon_type[1].N}_{neon_type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vcvt, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int32x2_t, float32x2_t] + - [int32x4_t, float32x4_t] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 32']] + - LLVMLink: + name: "vcvt{neon_type[1].N}_{neon_type[0]}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.arm.neon.vcvtfxs2fp.{neon_type[1]}.{neon_type[0]}" + arch: arm + - FnCall: ["_vcvt{neon_type[1].N}_{neon_type[0]}", [a, N], [], true] + + + - name: "vcvt{neon_type[1].N}_{neon_type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcvt"', 'N = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [scvtf, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x4_t, float16x4_t] + - [int16x8_t, float16x8_t] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 16']] + - LLVMLink: + name: "vcvt{neon_type[1].N}_{neon_type[0]}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.arm.neon.vcvtfxs2fp.{neon_type[1]}.{neon_type[0]}" + arch: arm + - link: "llvm.aarch64.neon.vcvtfxs2fp.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{neon_type[1].N}_{neon_type[0]}", [a, N], [], true] + + - name: "vcvt{neon_type[1].N}_{neon_type[0]}" + doc: "Fixed-point convert to floating-point" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [scvtf, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [int32x2_t, float32x2_t] + - [int32x4_t, float32x4_t] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 32']] + - LLVMLink: + name: "vcvt{neon_type[1].N}_{neon_type[0]}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.vcvtfxs2fp.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{neon_type[1].N}_{neon_type[0]}", [a, N], [], true] + + - name: "vcvt{type[2]}" + doc: "Floating-point convert to fixed-point, rounding toward zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vcvt, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-arm-unstable + static_defs: ['const 
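In the vcvt_n entries, the const generic N is the number of fractional bits of the fixed-point operand; rustc_legacy_const_generics keeps the historical positional-argument call form working, and static_assert! rejects an out-of-range N at compile time. A small usage sketch of one generated conversion, not part of the vendored file and assuming an aarch64 target with NEON enabled by default:

#[cfg(target_arch = "aarch64")]
fn demo_vcvt_n_f32_s32() {
    use core::arch::aarch64::*;
    let fixed: [i32; 2] = [256, -512]; // fixed-point values with 8 fractional bits
    let mut vals = [0.0f32; 2];
    unsafe {
        // N = 8: each lane is scaled by 2^-8 while converting to floating point.
        let r = vcvt_n_f32_s32::<8>(vld1_s32(fixed.as_ptr()));
        vst1_f32(vals.as_mut_ptr(), r);
    }
    assert_eq!(vals, [1.0, -2.0]);
}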
N: i32'] + safety: safe + types: + - [float32x2_t, int32x2_t, _n_s32_f32] + - [float32x4_t, int32x4_t, q_n_s32_f32] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 32']] + - LLVMLink: + name: "vcvt{type[2]}" + arguments: ["a: {type[0]}", "n: i32"] + links: + - link: "llvm.arm.neon.vcvtfp2fxs.{neon_type[1]}.{neon_type[0]}" + arch: arm + - FnCall: ["_vcvt{type[2]}", [a, N], [], true] + + - name: "vcvt{type[2]}" + doc: "Floating-point convert to fixed-point, rounding toward zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vcvt, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [float32x2_t, uint32x2_t, _n_u32_f32] + - [float32x4_t, uint32x4_t, q_n_u32_f32] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 32']] + - LLVMLink: + name: "vcvt{type[2]}" + arguments: ["a: {type[0]}", "n: i32"] + links: + - link: "llvm.arm.neon.vcvtfp2fxu.{neon_type[1]}.{neon_type[0]}" + arch: arm + - FnCall: ["_vcvt{type[2]}", [a, N], [], true] + + - name: "vcvt{type[2]}" + doc: "Floating-point convert to fixed-point, rounding toward zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzs, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [float32x2_t, int32x2_t, _n_s32_f32] + - [float32x4_t, int32x4_t, q_n_s32_f32] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 32']] + - LLVMLink: + name: "vcvt{type[2]}" + arguments: ["a: {type[0]}", "n: i32"] + links: + - link: "llvm.aarch64.neon.vcvtfp2fxs.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{type[2]}", [a, N], [], true] + + - name: "vcvt{type[2]}" + doc: "Floating-point convert to fixed-point, rounding toward zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtzu, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [float32x2_t, uint32x2_t, _n_u32_f32] + - [float32x4_t, uint32x4_t, q_n_u32_f32] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= 32']] + - LLVMLink: + name: "vcvt{type[2]}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.vcvtfp2fxu.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vcvt{type[2]}", [a, N], [], true] + + - name: "vdup{type[0]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vdup.8"', 'N = 4']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [dup, 'N = 4']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [_lane_s8, int8x8_t, int8x8_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [q_lane_s8, int8x8_t, int8x16_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as 
u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [_lane_u8, uint8x8_t, uint8x8_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [q_lane_u8, uint8x8_t, uint8x16_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [_lane_p8, poly8x8_t, poly8x8_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [q_lane_p8, poly8x8_t, poly8x16_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]] + - FnCall: [simd_shuffle!, [a, a, "{type[4]}"]] + + - name: "vdup{type[0]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vdup.8"', 'N = 8']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [dup, 'N = 8']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [q_laneq_s8, int8x16_t, int8x16_t, '4', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [_laneq_s8, int8x16_t, int8x8_t, '4', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [q_laneq_u8, uint8x16_t, uint8x16_t, '4', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [_laneq_u8, uint8x16_t, uint8x8_t, '4', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [q_laneq_p8, poly8x16_t, poly8x16_t, '4', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [_laneq_p8, poly8x16_t, poly8x8_t, '4', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]] + - FnCall: [simd_shuffle!, [a, a, "{type[4]}"]] + + - name: "vdup{type[0]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vdup.16"', 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [dup, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [_lane_s16, int16x4_t, int16x4_t, '2', '[N as u32, N as u32, N as u32, N as u32]'] + - [q_lane_s16, int16x4_t, int16x8_t, '2', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [_lane_u16, uint16x4_t, uint16x4_t, '2', '[N as u32, N as u32, N as u32, N as u32]'] + - [q_lane_u16, uint16x4_t, uint16x8_t, '2', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [_lane_p16, poly16x4_t, poly16x4_t, '2', '[N as u32, N as u32, N as u32, N as u32]'] + - [q_lane_p16, poly16x4_t, 
poly16x8_t, '2', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]] + - FnCall: [simd_shuffle!, [a, a, "{type[4]}"]] + + - name: "vdup{type[0]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vdup.16"', 'N = 4']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [dup, 'N = 4']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [q_laneq_s16, int16x8_t, int16x8_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [_laneq_s16, int16x8_t, int16x4_t, '3', '[N as u32, N as u32, N as u32, N as u32]'] + - [q_laneq_u16, uint16x8_t, uint16x8_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [_laneq_u16, uint16x8_t, uint16x4_t, '3', '[N as u32, N as u32, N as u32, N as u32]'] + - [q_laneq_p16, poly16x8_t, poly16x8_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [_laneq_p16, poly16x8_t, poly16x4_t, '3', '[N as u32, N as u32, N as u32, N as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]] + - FnCall: [simd_shuffle!, [a, a, "{type[4]}"]] + + + - name: "vdup{type[0]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vdup.16"', 'N = 4']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [dup, 'N = 4']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - [q_laneq_f16, float16x8_t, float16x8_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]'] + - [_laneq_f16, float16x8_t, float16x4_t, '3', '[N as u32, N as u32, N as u32, N as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]] + - FnCall: [simd_shuffle!, [a, a, "{type[4]}"]] + + - name: "vdup{type[3]}{neon_type[0]}" + doc: "Create a new vector with all lanes set to a value" + arguments: ["a: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vdup.16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [dup]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, f16, 'float16x4', '_n_'] + - [float16x8_t, f16, 'float16x8', 'q_n_'] + compose: + - "{type[2]}_t::splat(a)" + + - name: "vdup{type[0]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vdup.16"', 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [dup, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - [_lane_f16, float16x4_t, float16x4_t, '2', '[N as u32, N as u32, N as u32, N as u32]'] + - [q_lane_f16, float16x4_t, float16x8_t, '2', '[N as u32, N as u32, N as u32, N as u32, N 
as u32, N as u32, N as u32, N as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]] + - FnCall: [simd_shuffle!, [a, a, "{type[4]}"]] + + + - name: "vdup{type[0]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vdup.32"', 'N = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [dup, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [_lane_s32, int32x2_t, int32x2_t, '1', '[N as u32, N as u32]'] + - [q_lane_s32, int32x2_t, int32x4_t, '1', '[N as u32, N as u32, N as u32, N as u32]'] + - [_lane_u32, uint32x2_t, uint32x2_t, '1', '[N as u32, N as u32]'] + - [q_lane_u32, uint32x2_t, uint32x4_t, '1', '[N as u32, N as u32, N as u32, N as u32]'] + - [_lane_f32, float32x2_t, float32x2_t, '1', '[N as u32, N as u32]'] + - [q_lane_f32, float32x2_t, float32x4_t, '1', '[N as u32, N as u32, N as u32, N as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]] + - FnCall: [simd_shuffle!, [a, a, "{type[4]}"]] + + - name: "vdup{type[0]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vdup.32"', 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [dup, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [q_laneq_s32, int32x4_t, int32x4_t, '2', '[N as u32, N as u32, N as u32, N as u32]'] + - [_laneq_s32, int32x4_t, int32x2_t, '2', '[N as u32, N as u32]'] + - [q_laneq_u32, uint32x4_t, uint32x4_t, '2', '[N as u32, N as u32, N as u32, N as u32]'] + - [_laneq_u32, uint32x4_t, uint32x2_t, '2', '[N as u32, N as u32]'] + - [q_laneq_f32, float32x4_t, float32x4_t, '2', '[N as u32, N as u32, N as u32, N as u32]'] + - [_laneq_f32, float32x4_t, float32x2_t, '2', '[N as u32, N as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]] + - FnCall: [simd_shuffle!, [a, a, "{type[4]}"]] + + - name: "vdup{type[0]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmov, 'N = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [dup, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [q_laneq_s64, int64x2_t, '1', '[N as u32, N as u32]'] + - [q_laneq_u64, uint64x2_t, '1', '[N as u32, N as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[2]}"]] + - FnCall: [simd_shuffle!, [a, a, "{type[3]}"]] + + - name: "vdup{type[0]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmov, 'N = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [dup, 'N = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + 
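The vdup lane entries are composed as simd_shuffle! calls whose index list repeats N, i.e. broadcasting a single source lane into every destination lane, with static_assert_uimm_bits! bounding the lane index to the source vector's width. A small usage sketch, not part of the vendored file and assuming an aarch64 target with NEON enabled by default:

#[cfg(target_arch = "aarch64")]
fn demo_vdupq_lane_s32() {
    use core::arch::aarch64::*;
    let src: [i32; 2] = [7, 42];
    let mut out = [0i32; 4];
    unsafe {
        // Broadcast lane 1 of a 64-bit vector into all four lanes of a 128-bit vector.
        let r = vdupq_lane_s32::<1>(vld1_s32(src.as_ptr()));
        vst1q_s32(out.as_mut_ptr(), r);
    }
    assert_eq!(out, [42; 4]);
}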
static_defs: ['const N: i32'] + safety: safe + types: + - [q_lane_s64, int64x1_t, int64x2_t] + - [q_lane_u64, uint64x1_t, uint64x2_t] + compose: + - FnCall: [static_assert!, ['N == 0']] + - FnCall: [simd_shuffle!, [a, a, '[N as u32, N as u32]']] + + - name: "vdup{type[0]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop, 'N = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop, 'N = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [_lane_s64, int64x1_t] + - [_lane_u64, uint64x1_t] + compose: + - FnCall: [static_assert!, ['N == 0']] + - Identifier: [a, Symbol] + + - name: "vdup{type[0]}" + doc: "Set all vector lanes to the same value" + arguments: ["a: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmov, 'N = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [_laneq_s64, int64x2_t, int64x1_t, '::'] + - [_laneq_u64, uint64x2_t, uint64x1_t, '::'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, 1]] + - FnCall: + - "transmute{type[3]}" + - - FnCall: [simd_extract!, [a, 'N as u32']] + + - name: "vext{neon_type[0].no}" + doc: "Extract vector from pair of vectors" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vext.8"', 'N = 7']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ext, 'N = 7']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x8_t, ' static_assert_uimm_bits!(N, 3);', 'unsafe { match N & 0b111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), _ => unreachable_unchecked(), } }'] + - [int16x8_t, ' static_assert_uimm_bits!(N, 3);', 'unsafe { match N & 0b111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), _ => unreachable_unchecked(), } }'] + - [uint8x8_t, ' static_assert_uimm_bits!(N, 3);', 'unsafe { match N & 0b111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), 4 => simd_shuffle!(a, b, 
[4, 5, 6, 7, 8, 9, 10, 11]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), _ => unreachable_unchecked(), } }'] + - [uint16x8_t, ' static_assert_uimm_bits!(N, 3);', 'unsafe { match N & 0b111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), _ => unreachable_unchecked(), } }'] + - [poly8x8_t, ' static_assert_uimm_bits!(N, 3);', 'unsafe { match N & 0b111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), _ => unreachable_unchecked(), } }'] + - [poly16x8_t, ' static_assert_uimm_bits!(N, 3);', 'unsafe { match N & 0b111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), _ => unreachable_unchecked(), } }'] + compose: + - Identifier: ["{type[1]}", Symbol] + - Identifier: ["{type[2]}", Symbol] + + - name: "vext{neon_type[0].no}" + doc: "Extract vector from pair of vectors" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vext.8"', 'N = 15']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ext, 'N = 15']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x16_t, ' static_assert_uimm_bits!(N, 4);', 'unsafe { match N & 0b1111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]), 8 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]), 9 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]), 10 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]), 11 => 
simd_shuffle!(a, b, [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]), 12 => simd_shuffle!(a, b, [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]), 13 => simd_shuffle!(a, b, [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]), 14 => simd_shuffle!(a, b, [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]), 15 => simd_shuffle!(a, b, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]), _ => unreachable_unchecked(), } }'] + - [uint8x16_t, ' static_assert_uimm_bits!(N, 4);', 'unsafe { match N & 0b1111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]), 8 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]), 9 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]), 10 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]), 11 => simd_shuffle!(a, b, [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]), 12 => simd_shuffle!(a, b, [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]), 13 => simd_shuffle!(a, b, [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]), 14 => simd_shuffle!(a, b, [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]), 15 => simd_shuffle!(a, b, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]), _ => unreachable_unchecked(), } }'] + - [poly8x16_t, ' static_assert_uimm_bits!(N, 4);', 'unsafe { match N & 0b1111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]), 8 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]), 9 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]), 10 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]), 11 => simd_shuffle!(a, b, [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]), 12 => simd_shuffle!(a, b, [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]), 13 => simd_shuffle!(a, b, [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]), 14 => simd_shuffle!(a, b, [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]), 15 => simd_shuffle!(a, b, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]), _ => 
unreachable_unchecked(), } }'] + compose: + - Identifier: ["{type[1]}", Symbol] + - Identifier: ["{type[2]}", Symbol] + + - name: "vext{neon_type[0].no}" + doc: "Extract vector from pair of vectors" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vext.8"', 'N = 3']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ext, 'N = 3']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x4_t, 'static_assert_uimm_bits!(N, 2);', 'unsafe { match N & 0b11 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), _ => unreachable_unchecked(), } }'] + - [int32x4_t, ' static_assert_uimm_bits!(N, 2);', 'unsafe { match N & 0b11 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), _ => unreachable_unchecked(), } }'] + - [uint16x4_t, ' static_assert_uimm_bits!(N, 2);', 'unsafe { match N & 0b11 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), _ => unreachable_unchecked(), } }'] + - [uint32x4_t, ' static_assert_uimm_bits!(N, 2);', 'unsafe { match N & 0b11 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), _ => unreachable_unchecked(), } }'] + - [poly16x4_t, ' static_assert_uimm_bits!(N, 2);', 'unsafe { match N & 0b11 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), _ => unreachable_unchecked(), } }'] + - [float32x4_t, ' static_assert_uimm_bits!(N, 2);', 'unsafe { match N & 0b11 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), _ => unreachable_unchecked(), } }'] + compose: + - Identifier: ["{type[1]}", Symbol] + - Identifier: ["{type[2]}", Symbol] + + + - name: "vext{neon_type[0].no}" + doc: "Extract vector from pair of vectors" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vext.8"', 'N = 3']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ext, 'N = 3']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - [float16x4_t, ' static_assert_uimm_bits!(N, 2); unsafe { match N & 0b11 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), _ => unreachable_unchecked(), } }'] + compose: + - Identifier: ["{type[1]}", Symbol] + + - name: "vext{neon_type[0].no}" + doc: "Extract vector from pair of vectors" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vext.8"', 'N = 7']]}]] + - FnCall: [cfg_attr, 
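Each `vext*` row spells out the full `match N` over constant shuffles because `simd_shuffle!` needs literal index arrays; the `N & 0b111` / `N & 0b1111` mask only exists to make the match exhaustive, since `static_assert_uimm_bits!` has already rejected out-of-range `N`. In effect the intrinsic returns an N-lane-offset window of the concatenation `a:b`, as in this usage sketch (AArch64 assumed):

```rust
#[cfg(target_arch = "aarch64")]
fn vext_demo() {
    use core::arch::aarch64::*;
    unsafe {
        let a = vld1_u8([0u8, 1, 2, 3, 4, 5, 6, 7].as_ptr());
        let b = vld1_u8([8u8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
        // Lanes 3..8 of `a` followed by lanes 0..3 of `b`,
        // i.e. the shuffle indices [3, 4, 5, 6, 7, 8, 9, 10] from the spec.
        let window = vext_u8::<3>(a, b);
        let mut out = [0u8; 8];
        vst1_u8(out.as_mut_ptr(), window);
        assert_eq!(out, [3, 4, 5, 6, 7, 8, 9, 10]);
    }
}
```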
[*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ext, 'N = 7']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const N: i32'] + safety: safe + types: + - [float16x8_t, ' static_assert_uimm_bits!(N, 3); unsafe { match N & 0b111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), _ => unreachable_unchecked(), } }'] + compose: + - Identifier: ["{type[1]}", Symbol] + + + + - name: "vext{neon_type[0].no}" + doc: "Extract vector from pair of vectors" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vext.8"', 'N = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ext, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int32x2_t, ' static_assert_uimm_bits!(N, 1);', 'unsafe { match N & 0b1 { 0 => simd_shuffle!(a, b, [0, 1]), 1 => simd_shuffle!(a, b, [1, 2]), _ => unreachable_unchecked(), } }'] + - [uint32x2_t, ' static_assert_uimm_bits!(N, 1);', 'unsafe { match N & 0b1 { 0 => simd_shuffle!(a, b, [0, 1]), 1 => simd_shuffle!(a, b, [1, 2]), _ => unreachable_unchecked(), } }'] + - [float32x2_t, ' static_assert_uimm_bits!(N, 1);', 'unsafe { match N & 0b1 { 0 => simd_shuffle!(a, b, [0, 1]), 1 => simd_shuffle!(a, b, [1, 2]), _ => unreachable_unchecked(), } }'] + compose: + - Identifier: ["{type[1]}", Symbol] + - Identifier: ["{type[2]}", Symbol] + + - name: "vext{neon_type[0].no}" + doc: "Extract vector from pair of vectors" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmov, 'N = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ext, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int64x2_t, 'static_assert_uimm_bits!(N, 1);', 'unsafe { match N & 0b1 { 0 => simd_shuffle!(a, b, [0, 1]), 1 => simd_shuffle!(a, b, [1, 2]), _ => unreachable_unchecked(), } }'] + - [uint64x2_t, 'static_assert_uimm_bits!(N, 1);', 'unsafe { match N & 0b1 { 0 => simd_shuffle!(a, b, [0, 1]), 1 => simd_shuffle!(a, b, [1, 2]), _ => unreachable_unchecked(), } }'] + compose: + - Identifier: ["{type[1]}", Symbol] + - Identifier: ["{type[2]}", Symbol] + + - name: "vmla{neon_type[0].no}" + doc: "Multiply-add to accumulator" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmla{type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [mla]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, ".i8"] + - [int8x16_t, ".i8"] + - [uint8x8_t, ".i8"] + - [uint8x16_t, ".i8"] + - [int16x4_t, ".i16"] + - [int16x8_t, ".i16"] + 
- [uint16x4_t, ".i16"] + - [uint16x8_t, ".i16"] + - [int32x2_t, ".i32"] + - [int32x4_t, ".i32"] + - [uint32x2_t, ".i32"] + - [uint32x4_t, ".i32"] + compose: + - FnCall: [simd_add, [a, {FnCall: [simd_mul, [b, c]]}]] + + - name: "vmla{neon_type.no}" + doc: "Floating-point multiply-add to accumulator" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmla.f32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmul]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + - float32x4_t + compose: + - FnCall: [simd_add, [a, {FnCall: [simd_mul, [b, c]]}]] + + - name: "vmlal{neon_type[1].no}" + doc: "Signed multiply-add long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmlal.{type[2]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [smlal]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x8_t, int8x8_t, "s8"] + - [int32x4_t, int16x4_t, "s16"] + - [int64x2_t, int32x2_t, "s32"] + compose: + - FnCall: [simd_add, [a, {FnCall: ["vmull_{type[2]}", [b, c]]}]] + + - name: "vmlal_n_{type[4]}" + doc: "Vector widening multiply accumulate with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmlal.{type[4]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [smlal]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int32x4_t, int16x4_t, "i16", int32x4_t, 's16'] + - [int64x2_t, int32x2_t, "i32", int64x2_t, 's32'] + compose: + - FnCall: + - "vmlal{neon_type[1].noq}" + - - a + - b + - FnCall: ["vdup_n_{neon_type[1]}", [c]] + + - name: "vmlal_n_{type[2]}" + doc: "Vector widening multiply accumulate with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmlal.{type[2]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [umlal]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint32x4_t, uint16x4_t, "u16", uint32x4_t] + - [uint64x2_t, uint32x2_t, "u32", uint64x2_t] + compose: + - FnCall: + - "vmlal{neon_type[1].noq}" + - - a + - b + - FnCall: ["vdup_n_{neon_type[1]}", [c]] + + - name: "vmlal_lane{neon_type[2].no}" + doc: "Vector widening multiply accumulate with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmlal.{neon_type[1]}"', 'LANE = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [smlal, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [int32x4_t, int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int32x4_t, int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - 
[int64x2_t, int32x2_t, int32x2_t, '1', '[LANE as u32, LANE as u32]'] + - [int64x2_t, int32x2_t, int32x4_t, '2', '[LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] + - FnCall: + - "vmlal_{neon_type[1]}" + - - a + - b + - FnCall: [simd_shuffle!, [c, c, '{type[4]}']] + + - name: "vmlal_lane{neon_type[2].no}" + doc: "Vector widening multiply accumulate with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmlal.{neon_type[1]}"', 'LANE = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [umlal, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [uint32x4_t, uint16x4_t, uint16x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint32x4_t, uint16x4_t, uint16x8_t, uint32x4_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint64x2_t, uint32x2_t, uint32x2_t, uint64x2_t, '1', '[LANE as u32, LANE as u32]'] + - [uint64x2_t, uint32x2_t, uint32x4_t, uint64x2_t, '2', '[LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[4]}"]] + - FnCall: + - "vmlal_{neon_type[1]}" + - - a + - b + - FnCall: [simd_shuffle!, [c, c, '{type[5]}']] + + - name: "vmlal_{neon_type[1]}" + doc: "Unsigned multiply-add long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmlal.{neon_type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [umlal]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint16x8_t, uint8x8_t] + - [uint32x4_t, uint16x4_t] + - [uint64x2_t, uint32x2_t] + compose: + - FnCall: + - simd_add + - - a + - FnCall: ["vmull_{neon_type[1]}", [b, c]] + + - name: "vmls{neon_type[0].no}" + doc: "Multiply-subtract from accumulator" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmls{type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [mls]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, '.i8'] + - [int8x16_t, '.i8'] + - [uint8x8_t, '.i8'] + - [uint8x16_t, '.i8'] + - [int16x4_t, ".i16"] + - [int16x8_t, ".i16"] + - [uint16x4_t, ".i16"] + - [uint16x8_t, ".i16"] + - [int32x2_t, ".i32"] + - [int32x4_t, ".i32"] + - [uint32x2_t, ".i32"] + - [uint32x4_t, ".i32"] + compose: + - FnCall: + - simd_sub + - - a + - FnCall: [simd_mul, [b, c]] + + - name: "vmlsl_{neon_type[1]}" + doc: "Signed multiply-subtract long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmlsl.{neon_type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [smlsl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x8_t, int8x8_t] + - [int32x4_t, int16x4_t] + - [int64x2_t, int32x2_t] + compose: + - FnCall: [simd_sub, [a, {FnCall: 
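The `_lane`/`_laneq` variants add one step in front of the same composition: `simd_shuffle!(c, c, [LANE; …])` broadcasts the selected lane of `c`, and the result is fed to the plain `vmlal_*`. Usage sketch (AArch64 assumed):

```rust
#[cfg(target_arch = "aarch64")]
fn vmlal_lane_demo() {
    use core::arch::aarch64::*;
    unsafe {
        let acc = vdupq_n_s32(0);
        let b = vdup_n_s16(2);
        let c = vld1_s16([10i16, 20, 30, 40].as_ptr());
        // LANE = 2 splats c[2] = 30 before the widening multiply-add: 0 + 2 * 30.
        let r = vmlal_lane_s16::<2>(acc, b, c);
        assert_eq!(vgetq_lane_s32::<0>(r), 60);
        assert_eq!(vgetq_lane_s32::<3>(r), 60);
    }
}
```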
["vmull_{neon_type[1]}", [b, c]]}]] + + - name: "vmlsl_n_{neon_type[1]}" + doc: "Vector widening multiply subtract with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmlsl.{neon_type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [smlsl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int32x4_t, int16x4_t, "i16"] + - [int64x2_t, int32x2_t, "i32"] + compose: + - FnCall: ["vmlsl_{neon_type[1]}", [a, b, {FnCall: ["vdup_n_{neon_type[1]}", [c]]}]] + + - name: "vmlsl_n_{neon_type[1]}" + doc: "Vector widening multiply subtract with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmlsl.{neon_type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [umlsl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint32x4_t, uint16x4_t, "u16"] + - [uint64x2_t, uint32x2_t, "u32"] + compose: + - FnCall: ["vmlsl_{neon_type[1]}", [a, b, {FnCall: ["vdup_n_{neon_type[1]}", [c]]}]] + + - name: "vmlsl_lane{neon_type[2].no}" + doc: "Vector widening multiply subtract with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmlsl.{neon_type[1]}"', 'LANE = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [smlsl, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [int32x4_t, int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int32x4_t, int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - FnCall: + - "vmlsl_{neon_type[1]}" + - - a + - b + - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + + - name: "vmlsl_lane{neon_type[2].no}" + doc: "Vector widening multiply subtract with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmlsl.{neon_type[1]}"', 'LANE = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [smlsl, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [int64x2_t, int32x2_t, int32x2_t, '[LANE as u32, LANE as u32]', '1'] + - [int64x2_t, int32x2_t, int32x4_t, '[LANE as u32, LANE as u32]', '2'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[4]}"]] + - FnCall: + - "vmlsl_{neon_type[1]}" + - - a + - b + - FnCall: [simd_shuffle!, [c, c, "{type[3]}"]] + + - name: "vmlsl_lane{neon_type[2].no}" + doc: "Vector widening multiply subtract with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmlsl.{neon_type[1]}"', 'LANE = 1']]}]] + - FnCall: 
[cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [umlsl, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [uint32x4_t, uint16x4_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint32x4_t, uint16x4_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint64x2_t, uint32x2_t, uint32x2_t, '1', '[LANE as u32, LANE as u32]'] + - [uint64x2_t, uint32x2_t, uint32x4_t, '2', '[LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] + - FnCall: + - "vmlsl_{neon_type[1]}" + - - a + - b + - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + + - name: "vmlsl_{neon_type[1]}" + doc: "Unsigned multiply-subtract long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmlsl.{neon_type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [umlsl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint16x8_t, uint8x8_t] + - [uint32x4_t, uint16x4_t] + - [uint64x2_t, uint32x2_t] + compose: + - FnCall: [simd_sub, [a, {FnCall: ["vmull_{neon_type[1]}", [b, c]]}]] + + - name: "vneg{neon_type[0].no}" + doc: Negate + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vneg.{type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [neg]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, 's8'] + - [int8x16_t, 's8'] + - [int16x4_t, 's16'] + - [int16x8_t, 's16'] + - [int32x2_t, 's32'] + - [int32x4_t, 's32'] + compose: + - FnCall: [simd_neg, [a]] + + - name: "vneg{neon_type[0].no}" + doc: Negate + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vneg.{type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fneg]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, 'f32'] + - [float32x4_t, 'f32'] + compose: + - FnCall: [simd_neg, [a]] + + - name: "vneg{neon_type[0].no}" + doc: Negate + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vneg.{type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fneg]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, 'f16'] + - [float16x8_t, 'f16'] + compose: + - FnCall: [simd_neg, [a]] + + - name: "vqneg{neon_type[0].no}" + doc: Signed saturating negate + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vqneg.{type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqneg]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, 's8', 'i8'] + - [int8x16_t, 's8', 'i8'] + - [int16x4_t, 's16', 'i16'] + - [int16x8_t, 's16', 'i16'] + - [int32x2_t, 's32', 'i32'] + - [int32x4_t, 's32', 'i32'] + compose: + - LLVMLink: + name: 
"sqneg.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.sqneg.v{neon_type[0].lane}{type[2]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vqneg.v{neon_type[0].lane}{type[2]}" + arch: arm + + - name: "vqsub{neon_type[0].no}" + doc: Saturating subtract + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vqsub.{type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uqsub]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint8x8_t, u8, i8] + - [uint8x16_t, u8, i8] + - [uint16x4_t, u16, i16] + - [uint16x8_t, u16, i16] + - [uint32x2_t, u32, i32] + - [uint32x4_t, u32, i32] + - [uint64x1_t, u64, i64] + - [uint64x2_t, u64, i64] + compose: + - LLVMLink: + name: "uqsub.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.uqsub.v{neon_type[0].lane}{type[2]}" + arch: aarch64,arm64ec + - link: "llvm.usub.sat.v{neon_type[0].lane}{type[2]}" + arch: arm + + - name: "vqsub{neon_type[0].no}" + doc: Saturating subtract + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vqsub.{type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqsub]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, s8, i8] + - [int8x16_t, s8, i8] + - [int16x4_t, s16, i16] + - [int16x8_t, s16, i16] + - [int32x2_t, s32, i32] + - [int32x4_t, s32, i32] + - [int64x1_t, s64, i64] + - [int64x2_t, s64, i64] + compose: + - LLVMLink: + name: "sqsub.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.sqsub.v{neon_type[0].lane}{type[2]}" + arch: aarch64,arm64ec + - link: "llvm.ssub.sat.v{neon_type[0].lane}{type[2]}" + arch: arm + + - name: "vhadd{neon_type.no}" + doc: Halving add + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: + - cfg_attr + - - FnCall: + - all + - - test + - 'target_arch = "arm"' + - FnCall: + - assert_instr + - - '"vhadd.{neon_type}"' + - FnCall: + - cfg_attr + - - FnCall: + - all + - - test + - FnCall: + - any + - - 'target_arch = "aarch64"' + - 'target_arch = "arm64ec"' + - FnCall: + - assert_instr + - - uhadd + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - uint8x8_t + - uint8x16_t + - uint16x4_t + - uint16x8_t + - uint32x2_t + - uint32x4_t + compose: + - LLVMLink: + name: "uhadd.{neon_type}" + links: + - link: "llvm.aarch64.neon.uhadd.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vhaddu.{neon_type}" + arch: arm + + - name: "vhadd{neon_type.no}" + doc: Halving add + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: + - cfg_attr + - - FnCall: + - all + - - test + - 'target_arch = "arm"' + - FnCall: + - assert_instr + - - '"vhadd.{neon_type}"' + - FnCall: + - cfg_attr + - - FnCall: + - all + - - test + - FnCall: + - any + - - 'target_arch = "aarch64"' + - 'target_arch = "arm64ec"' + - FnCall: + - assert_instr + - - shadd + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int8x16_t + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + compose: + - LLVMLink: + name: "shadd.{neon_type}" + links: + - link: "llvm.aarch64.neon.shadd.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vhadds.{neon_type}" 
+ arch: arm + + - name: "vrhadd{neon_type.no}" + doc: Rounding halving add + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vrhadd.{neon_type}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [srhadd]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int8x16_t + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + compose: + - LLVMLink: + name: "vrhadd.{neon_type}" + links: + - link: "llvm.aarch64.neon.srhadd.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vrhadds.{neon_type}" + arch: arm + + - name: "vrhadd{neon_type.no}" + doc: Rounding halving add + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vrhadd.{neon_type}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [urhadd]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - uint8x8_t + - uint8x16_t + - uint16x4_t + - uint16x8_t + - uint32x2_t + - uint32x4_t + compose: + - LLVMLink: + name: "vrhaddu.{neon_type}" + links: + - link: "llvm.aarch64.neon.urhadd.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vrhaddu.{neon_type}" + arch: arm + + - name: "vrndn{neon_type.no}" + doc: "Floating-point round to integral, to nearest with ties to even" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, ['target_arch = "arm"', {FnCall: [target_feature, ['enable = "fp-armv8,v8"']]}]] + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrintn]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [frintn]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + - float32x4_t + compose: + - LLVMLink: + name: "llvm.frinn.{neon_type}" + links: + - link: "llvm.roundeven.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vrintn.{neon_type}" + arch: arm + + - name: "vrndn{neon_type.no}" + doc: "Floating-point round to integral, to nearest with ties to even" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, ['target_arch = "arm"', {FnCall: [target_feature, ['enable = "fp-armv8,v8"']]}]] + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrintn]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [frintn]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "llvm.frinn.{neon_type}" + links: + - link: "llvm.roundeven.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vrintn.{neon_type}" + arch: arm + + - name: "vqadd{neon_type.no}" + doc: Saturating add + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vqadd.{neon_type}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uqadd]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - uint8x8_t + - uint8x16_t + - uint16x4_t + - uint16x8_t + - uint32x2_t + - uint32x4_t + - uint64x1_t + - uint64x2_t + compose: + - LLVMLink: + name: "uqadd.{neon_type}" + links: + - link: "llvm.aarch64.neon.uqadd.{neon_type}" + arch: aarch64,arm64ec + - 
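Behaviourally, these add families differ in how they treat overflow: `vqadd*` saturates, while `vhadd*`/`vrhadd*` return the (optionally rounded) half of the full-width sum, so neither can wrap. A quick illustration with `u8` lanes (AArch64 assumed):

```rust
#[cfg(target_arch = "aarch64")]
fn add_flavours_demo() {
    use core::arch::aarch64::*;
    // `unsafe` kept for toolchains where these intrinsics are still `unsafe fn`.
    unsafe {
        let a = vdup_n_u8(200);
        let b = vdup_n_u8(100);
        // Saturating add clamps 300 to u8::MAX instead of wrapping to 44.
        assert_eq!(vget_lane_u8::<0>(vqadd_u8(a, b)), 255);
        // Halving add computes (200 + 100) >> 1 without intermediate overflow.
        assert_eq!(vget_lane_u8::<0>(vhadd_u8(a, b)), 150);
        // Rounding halving add adds 1 before the shift: (200 + 101 + 1) >> 1 = 151.
        assert_eq!(vget_lane_u8::<0>(vrhadd_u8(a, vdup_n_u8(101))), 151);
    }
}
```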
link: "llvm.uadd.sat.{neon_type}" + arch: arm + + - name: "vqadd{neon_type.no}" + doc: Saturating add + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vqadd.{neon_type}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqadd]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int8x16_t + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + - int64x1_t + - int64x2_t + compose: + - LLVMLink: + name: "sqadd.{neon_type}" + links: + - link: "llvm.aarch64.neon.sqadd.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.sadd.sat.{neon_type}" + arch: arm + + - name: "vld1{neon_type[1].no}" + doc: "Load multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld1]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld1]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const f32", float32x2x2_t] + - ["*const f32", float32x4x2_t] + - ["*const f32", float32x2x3_t] + - ["*const f32", float32x4x3_t] + - ["*const f32", float32x2x4_t] + - ["*const f32", float32x4x4_t] + compose: + - LLVMLink: + name: "vld1x{neon_type[1].tuple}.{neon_type[1]}" + links: + - link: "llvm.aarch64.neon.ld1x{neon_type[1].tuple}.v{neon_type[1].lane}f{neon_type[1].base}.p0" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vld1x{neon_type[1].tuple}.v{neon_type[1].lane}f{neon_type[1].base}.p0" + arch: arm + + - name: "vld1{neon_type[1].no}" + doc: "Load multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld1]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld1]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const i8", int8x8x2_t] + - ["*const i8", int8x16x2_t] + - ["*const i8", int8x8x3_t] + - ["*const i8", int8x16x3_t] + - ["*const i8", int8x8x4_t] + - ["*const i8", int8x16x4_t] + - ["*const i16", int16x4x2_t] + - ["*const i16", int16x8x2_t] + - ["*const i16", int16x4x3_t] + - ["*const i16", int16x8x3_t] + - ["*const i16", int16x4x4_t] + - ["*const i16", int16x8x4_t] + - ["*const i32", int32x2x2_t] + - ["*const i32", int32x4x2_t] + - ["*const i32", int32x2x3_t] + - ["*const i32", int32x4x3_t] + - ["*const i32", int32x2x4_t] + - ["*const i32", int32x4x4_t] + - ["*const i64", int64x1x2_t] + - ["*const i64", int64x1x3_t] + - ["*const i64", int64x1x4_t] + - ["*const i64", int64x2x2_t] + - ["*const i64", int64x2x3_t] + - ["*const i64", int64x2x4_t] + compose: + - LLVMLink: + name: "ld1x{neon_type[1].tuple}.{neon_type[1]}" + links: + - link: "llvm.aarch64.neon.ld1x{neon_type[1].tuple}.v{neon_type[1].lane}i{neon_type[1].base}.p0" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vld1x{neon_type[1].tuple}.v{neon_type[1].lane}i{neon_type[1].base}.p0" + arch: arm + + - name: "vld1{neon_type[1].no}" + doc: "Load multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld1]]}]] + - FnCall: 
[cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld1]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const u8", uint8x8x2_t, int8x8x2_t] + - ["*const u8", uint8x16x2_t, int8x16x2_t] + - ["*const u8", uint8x8x3_t, int8x8x3_t] + - ["*const u8", uint8x16x3_t, int8x16x3_t] + - ["*const u8", uint8x8x4_t, int8x8x4_t] + - ["*const u8", uint8x16x4_t, int8x16x4_t] + - ["*const u16", uint16x4x2_t, int16x4x2_t] + - ["*const u16", uint16x8x2_t, int16x8x2_t] + - ["*const u16", uint16x4x3_t, int16x4x3_t] + - ["*const u16", uint16x8x3_t, int16x8x3_t] + - ["*const u16", uint16x4x4_t, int16x4x4_t] + - ["*const u16", uint16x8x4_t, int16x8x4_t] + - ["*const u32", uint32x2x2_t, int32x2x2_t] + - ["*const u32", uint32x4x2_t, int32x4x2_t] + - ["*const u32", uint32x2x3_t, int32x2x3_t] + - ["*const u32", uint32x4x3_t, int32x4x3_t] + - ["*const u32", uint32x2x4_t, int32x2x4_t] + - ["*const u32", uint32x4x4_t, int32x4x4_t] + - ["*const u64", uint64x1x2_t, int64x1x2_t] + - ["*const u64", uint64x1x3_t, int64x1x3_t] + - ["*const u64", uint64x1x4_t, int64x1x4_t] + - ["*const u64", uint64x2x2_t, int64x2x2_t] + - ["*const u64", uint64x2x3_t, int64x2x3_t] + - ["*const u64", uint64x2x4_t, int64x2x4_t] + - ["*const p8", poly8x8x2_t, int8x8x2_t] + - ["*const p8", poly8x8x3_t, int8x8x3_t] + - ["*const p8", poly8x8x4_t, int8x8x4_t] + - ["*const p8", poly8x16x2_t, int8x16x2_t] + - ["*const p8", poly8x16x3_t, int8x16x3_t] + - ["*const p8", poly8x16x4_t, int8x16x4_t] + - ["*const p16", poly16x4x2_t, int16x4x2_t] + - ["*const p16", poly16x4x3_t, int16x4x3_t] + - ["*const p16", poly16x4x4_t, int16x4x4_t] + - ["*const p16", poly16x8x2_t, int16x8x2_t] + - ["*const p16", poly16x8x3_t, int16x8x3_t] + - ["*const p16", poly16x8x4_t, int16x8x4_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld1{neon_type[2].no}" + - - FnCall: + - transmute + - - a + + - name: "vld1{neon_type[1].no}" + doc: "Load multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld1]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const p64", poly64x1x3_t, int64x1x3_t] + - ["*const p64", poly64x1x4_t, int64x1x4_t] + - ["*const p64", poly64x2x2_t, int64x2x2_t] + - ["*const p64", poly64x2x3_t, int64x2x3_t] + - ["*const p64", poly64x2x4_t, int64x2x4_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld1{neon_type[2].no}" + - - FnCall: + - transmute + - - a + + - name: "vld1{neon_type[1].no}" + doc: "Load multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld1]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld1]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const p64", poly64x1x2_t, int64x1x2_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld1{neon_type[2].no}" + - - FnCall: + - transmute + - - a + + - name: "vld1{neon_type[1].no}" + doc: "Load multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - 
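Note the reuse pattern in the unsigned and polynomial `vld1*_x2/_x3/_x4` rows: rather than adding new LLVM links, they transmute the pointer to the signed element type, call the signed load, and transmute the aggregate back, which is sound because the layouts are identical. Usage sketch of one of the generated intrinsics (AArch64 assumed):

```rust
#[cfg(target_arch = "aarch64")]
fn vld1_x2_demo() {
    use core::arch::aarch64::*;
    let bytes: [u8; 32] = core::array::from_fn(|i| i as u8);
    unsafe {
        // Loads 32 contiguous bytes into a pair of q registers (uint8x16x2_t).
        let pair = vld1q_u8_x2(bytes.as_ptr());
        assert_eq!(vgetq_lane_u8::<0>(pair.0), 0);
        assert_eq!(vgetq_lane_u8::<15>(pair.1), 31);
    }
}
```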
*neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld1]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld1]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x2_t] + - ["*const f16", float16x8x2_t] + - ["*const f16", float16x4x3_t] + - ["*const f16", float16x8x3_t] + - ["*const f16", float16x4x4_t] + - ["*const f16", float16x8x4_t] + compose: + - LLVMLink: + name: "vld1x{neon_type[1].tuple}.{neon_type[1]}" + links: + - link: "llvm.aarch64.neon.ld1x{neon_type[1].tuple}.v{neon_type[1].lane}f{neon_type[1].base}.p0" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vld1x{neon_type[1].tuple}.v{neon_type[1].lane}f{neon_type[1].base}.p0" + arch: arm + + - name: "vld1{type[2]}_{neon_type[1]}" + doc: "Load one single-element structure to one lane of one register" + arguments: ["ptr: {type[0]}", "src: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld1, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld1, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4_t, '_lane', '2'] + - ["*const f16", float16x8_t, 'q_lane', '3'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - FnCall: [simd_insert!, [src, "LANE as u32", "*ptr"]] + + - name: "vld1{type[2]}_{neon_type[1]}" + doc: "Load one single-element structure and replicate to all lanes of one register" + arguments: ["ptr: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ["vld1"]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld1r]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4_t, '_dup', 'f16x4', "[0, 0, 0, 0]"] + - ["*const f16", float16x8_t, 'q_dup', 'f16x8', "[0, 0, 0, 0, 0, 0, 0, 0]"] + compose: + - Let: [x, "{neon_type[1]}", "vld1{neon_type[1].lane_nox}::<0>(ptr, transmute({type[3]}::splat(0.0)))"] + - FnCall: [simd_shuffle!, [x, x, "{type[4]}"]] + + + - name: "vld2{neon_type[1].nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vld2] + safety: + unsafe: [neon] + types: + - ["*const i8", int8x8x2_t, i8] + - ["*const i16", int16x4x2_t, i16] + - ["*const i32", int32x2x2_t, i32] + - ["*const i8", int8x16x2_t, i8] + - ["*const i16", int16x8x2_t, i16] + - ["*const i32", int32x4x2_t, i32] + - ["*const f32", float32x2x2_t, f32] + - ["*const f32", float32x4x2_t, f32] + compose: + - LLVMLink: + name: "vld2.{neon_type[1]}" + arguments: + - "ptr: *const i8" + - "size: i32" + links: + - link: "llvm.arm.neon.vld2.v{neon_type[1].lane}{type[2]}" + arch: arm + - FnCall: + - "_vld2{neon_type[1].nox}" + - - "a as *const i8" + - "{neon_type[1].base_byte_size}" + + - name: "vld2{neon_type[1].nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - ["*const i64", int64x1x2_t, i64] + compose: + - LLVMLink: + name: "vld2.{neon_type[1]}" + 
arguments: + - "ptr: *const i8" + - "size: i32" + links: + - link: "llvm.arm.neon.vld2.v{neon_type[1].lane}{type[2]}" + arch: arm + - FnCall: + - "_vld2{neon_type[1].nox}" + - - "a as *const i8" + - "{neon_type[1].base_byte_size}" + + - name: "vld2{neon_type[1].nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-not-arm + - *neon-stable + assert_instr: [ld2] + safety: + unsafe: [neon] + types: + - ["*const i8", int8x8x2_t, i8, int8x8_t] + - ["*const i16", int16x4x2_t, i16, int16x4_t] + - ["*const i32", int32x2x2_t, i32, int32x2_t] + - ["*const i8", int8x16x2_t, i8, int8x16_t] + - ["*const i16", int16x8x2_t, i16, int16x8_t] + - ["*const i32", int32x4x2_t, i32, int32x4_t] + - ["*const f32", float32x2x2_t, f32, float32x2_t] + - ["*const f32", float32x4x2_t, f32, float32x4_t] + compose: + - LLVMLink: + name: "vld2.{neon_type[1]}" + arguments: + - "ptr: *const {neon_type[3]}" + links: + - link: "llvm.aarch64.neon.ld2.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld2{neon_type[1].nox}" + - - "a as _" + + - name: "vld2{neon_type[1].nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-not-arm + - *neon-stable + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - ["*const i64", int64x1x2_t, i64, int64x1_t] + compose: + - LLVMLink: + name: "vld2.{neon_type[1]}" + arguments: + - "ptr: *const {neon_type[3]}" + links: + - link: "llvm.aarch64.neon.ld2.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld2{neon_type[1].nox}" + - - "a as _" + + - name: "vld2{neon_type[1].nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld2]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld2]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const u8", uint8x8x2_t, int8x8x2_t] + - ["*const u16", uint16x4x2_t, int16x4x2_t] + - ["*const u32", uint32x2x2_t, int32x2x2_t] + - ["*const u8", uint8x16x2_t, int8x16x2_t] + - ["*const u16", uint16x8x2_t, int16x8x2_t] + - ["*const u32", uint32x4x2_t, int32x4x2_t] + - ["*const p8", poly8x8x2_t, int8x8x2_t] + - ["*const p16", poly16x4x2_t, int16x4x2_t] + - ["*const p8", poly8x16x2_t, int8x16x2_t] + - ["*const p16", poly16x8x2_t, int16x8x2_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld2{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + + - name: "vld2{neon_type[1].nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const u64", uint64x1x2_t, int64x1x2_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld2{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + + - name: "vld2{neon_type[1].nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: 
[cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const p64", poly64x1x2_t, int64x1x2_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld2{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + + - name: "vld2{neon_type[1].lane_nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *enable-v7 + - *target-is-arm + - FnCall: + - cfg_attr + - - test + - FnCall: + - assert_instr + - - vld2 + - "LANE = 0" + - FnCall: + - rustc_legacy_const_generics + - - "2" + - *neon-arm-unstable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const i8", int8x8x2_t, i8, int8x8_t, "3"] + - ["*const i16", int16x4x2_t, i16, int16x4_t, "2"] + - ["*const i32", int32x2x2_t, i32, int32x2_t, "1"] + compose: + - FnCall: + - "static_assert_uimm_bits!" + - - LANE + - "{type[4]}" + - LLVMLink: + name: "vld2.{neon_type[1]}" + arguments: + - "ptr: *const i8" + - "a: {neon_type[3]}" + - "b: {neon_type[3]}" + - "n: i32" + - "size: i32" + links: + - link: "llvm.arm.neon.vld2lane.v{neon_type[1].lane}{type[2]}.p0" + arch: arm + - FnCall: + - "_vld2_lane{neon_type[1].nox}" + - - "a as _" + - "b.0" + - "b.1" + - "LANE" + - "{neon_type[1].base_byte_size}" + + - name: "vld2{neon_type[1].lane_nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld2, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const u8", uint8x8x2_t, int8x8_t, "3"] + - ["*const u16", uint16x4x2_t, int16x4_t, "2"] + - ["*const u32", uint32x2x2_t, int32x2_t, "1"] + - ["*const u16", uint16x8x2_t, int16x8_t, "3"] + - ["*const u32", uint32x4x2_t, int32x4_t, "2"] + - ["*const p8", poly8x8x2_t, int8x8_t, "3"] + - ["*const p16", poly16x4x2_t, int16x4_t, "2"] + - ["*const p16", poly16x8x2_t, int16x8_t, "3"] + compose: + - FnCall: + - "static_assert_uimm_bits!" + - - LANE + - "{type[3]}" + - FnCall: + - transmute + - - FnCall: + - "vld2{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vld2{neon_type[1].lane_nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *target-not-arm + - FnCall: + - cfg_attr + - - test + - FnCall: + - assert_instr + - - ld2 + - "LANE = 0" + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const i8", int8x8x2_t, i8, int8x8_t, "3"] + - ["*const i16", int16x4x2_t, i16, int16x4_t, "2"] + - ["*const i32", int32x2x2_t, i32, int32x2_t, "1"] + - ["*const i16", int16x8x2_t, i16, int16x8_t, "3"] + - ["*const i32", int32x4x2_t, i32, int32x4_t, "2"] + - ["*const f32", float32x2x2_t, f32, float32x2_t, "2"] + - ["*const f32", float32x4x2_t, f32, float32x4_t, "2"] + compose: + - FnCall: + - "static_assert_uimm_bits!" 
+ - - LANE + - "{type[4]}" + - LLVMLink: + name: "vld2.{neon_type[1]}" + arguments: + - "a: {neon_type[3]}" + - "b: {neon_type[3]}" + - "n: i64" + - "ptr: *const i8" + links: + - link: "llvm.aarch64.neon.ld2lane.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld2{neon_type[1].lane_nox}" + - - "b.0" + - "b.1" + - "LANE as i64" + - "a as _" + + - name: "vld2{neon_type[1].lane_nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *enable-v7 + - *target-is-arm + - FnCall: + - cfg_attr + - - test + - FnCall: + - assert_instr + - - vld2 + - "LANE = 0" + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-arm-unstable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const i16", int16x8x2_t, i16, int16x8_t, "3"] + - ["*const i32", int32x4x2_t, i32, int32x4_t, "2"] + - ["*const f32", float32x2x2_t, f32, float32x2_t, "1"] + - ["*const f32", float32x4x2_t, f32, float32x4_t, "2"] + compose: + - FnCall: + - "static_assert_uimm_bits!" + - - LANE + - "{type[4]}" + - LLVMLink: + name: "vld2.{neon_type[1]}" + arguments: + - "ptr: *const i8" + - "a: {neon_type[3]}" + - "b: {neon_type[3]}" + - "n: i32" + - "size: i32" + links: + - link: "llvm.arm.neon.vld2lane.v{neon_type[1].lane}{type[2]}.p0" + arch: arm + - FnCall: + - "_vld2{neon_type[1].lane_nox}" + - - "a as _" + - "b.0" + - "b.1" + - "LANE" + - "{neon_type[1].base_byte_size}" + + - name: "vld2{neon_type[1].dup_nox}" + doc: Load single 2-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - ["*const i64", int64x1x2_t, i64] + compose: + - LLVMLink: + name: "vld2dup.{neon_type[1]}" + arguments: + - "ptr: *const i8" + - "size: i32" + links: + - link: "llvm.arm.neon.vld2dup.v{neon_type[1].lane}{type[2]}.p0" + arch: arm + - FnCall: + - "_vld2{neon_type[1].dup_nox}" + - - "a as *const i8" + - "{neon_type[1].base_byte_size}" + + - name: "vld2{neon_type[1].dup_nox}" + doc: Load single 2-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-not-arm + - *neon-stable + assert_instr: [ld2r] + safety: + unsafe: [neon] + types: + - ["*const i64", int64x1x2_t, i64] + compose: + - LLVMLink: + name: "vld2dup.{neon_type[1]}" + arguments: + - "ptr: *const i64" + links: + - link: "llvm.aarch64.neon.ld2r.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld2{neon_type[1].dup_nox}" + - - "a as _" + + - name: "vld2{neon_type[1].dup_nox}" + doc: Load single 2-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vld2] + safety: + unsafe: [neon] + types: + - ["*const i8", int8x8x2_t, i8] + - ["*const i16", int16x4x2_t, i16] + - ["*const i32", int32x2x2_t, i32] + - ["*const i8", int8x16x2_t, i8] + - ["*const i16", int16x8x2_t, i16] + - ["*const i32", int32x4x2_t, i32] + - ["*const f32", float32x2x2_t, f32] + - ["*const f32", float32x4x2_t, f32] + compose: + - LLVMLink: + name: "vld2dup.{neon_type[1]}" + arguments: + - "ptr: *const i8" + - "size: i32" + links: + - link: "llvm.arm.neon.vld2dup.v{neon_type[1].lane}{type[2]}.p0" + arch: arm + - FnCall: + - 
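The `_lane` loads have a different contract again: they read exactly one interleaved group from memory and insert it into lane `LANE` of each register of an existing tuple, leaving the other lanes of `b` untouched, which is why they take `b: {neon_type[1]}` as a second argument. Usage sketch (AArch64 assumed):

```rust
#[cfg(target_arch = "aarch64")]
fn vld2_lane_demo() {
    use core::arch::aarch64::*;
    let pair: [u32; 2] = [111, 222];
    unsafe {
        let init = uint32x2x2_t(vdup_n_u32(0), vdup_n_u32(0));
        // Only lane 1 of each destination register is overwritten.
        let r = vld2_lane_u32::<1>(pair.as_ptr(), init);
        assert_eq!(vget_lane_u32::<1>(r.0), 111);
        assert_eq!(vget_lane_u32::<1>(r.1), 222);
        assert_eq!(vget_lane_u32::<0>(r.0), 0);
    }
}
```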
"_vld2{neon_type[1].dup_nox}" + - - "a as *const i8" + - "{neon_type[1].base_byte_size}" + + - name: "vld2{neon_type[1].dup_nox}" + doc: Load single 2-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld2]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld2r]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const u8", uint8x8x2_t, int8x8x2_t] + - ["*const u16", uint16x4x2_t, int16x4x2_t] + - ["*const u32", uint32x2x2_t, int32x2x2_t] + - ["*const u8", uint8x16x2_t, int8x16x2_t] + - ["*const u16", uint16x8x2_t, int16x8x2_t] + - ["*const u32", uint32x4x2_t, int32x4x2_t] + - ["*const p8", poly8x8x2_t, int8x8x2_t] + - ["*const p16", poly16x4x2_t, int16x4x2_t] + - ["*const p8", poly8x16x2_t, int8x16x2_t] + - ["*const p16", poly16x8x2_t, int16x8x2_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld2{neon_type[2].dup_nox}" + - - FnCall: + - transmute + - - a + + - name: "vld2{neon_type[1].dup_nox}" + doc: Load single 2-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld2r]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const u64", uint64x1x2_t, int64x1x2_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld2{neon_type[2].dup_nox}" + - - FnCall: + - transmute + - - a + + - name: "vld2{neon_type[1].dup_nox}" + doc: Load single 2-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld2r]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const p64", poly64x1x2_t, int64x1x2_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld2{neon_type[2].dup_nox}" + - - FnCall: + - transmute + - - a + + - name: "vld2{neon_type[1].dup_nox}" + doc: Load single 2-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - FnCall: + - cfg + - - FnCall: + - not + - - 'target_arch = "arm"' + - *neon-stable + assert_instr: [ld2r] + safety: + unsafe: [neon] + types: + - ["*const i8", int8x8x2_t, i8] + - ["*const i16", int16x4x2_t, i16] + - ["*const i32", int32x2x2_t, i32] + - ["*const i8", int8x16x2_t, i8] + - ["*const i16", int16x8x2_t, i16] + - ["*const i32", int32x4x2_t, i32] + - ["*const f32", float32x2x2_t, f32] + - ["*const f32", float32x4x2_t, f32] + compose: + - LLVMLink: + name: "vld2dup.{neon_type[1]}" + arguments: + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.ld2r.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld2{neon_type[1].dup_nox}" + - - "a as _" + + - name: "vld2{neon_type[1].nox}" + doc: Load single 2-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-v7 + - *target-is-arm + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld2]]}]] + - 
*neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x2_t, f16] + - ["*const f16", float16x8x2_t, f16] + compose: + - LLVMLink: + name: "vld2.{neon_type[1]}" + arguments: + - "ptr: {type[0]}" + - "size: i32" + links: + - link: "llvm.arm.neon.vld2.v{neon_type[1].lane}{type[2]}.p0" + arch: arm + - FnCall: + - "_vld2{neon_type[1].nox}" + - - "a as _" + - "2" + + - name: "vld2{neon_type[1].nox}" + doc: Load single 2-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *target-not-arm + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld2]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x2_t, f16] + - ["*const f16", float16x8x2_t, f16] + compose: + - LLVMLink: + name: "vld2.{neon_type[1]}" + arguments: + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.ld2.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld2{neon_type[1].nox}" + - - "a as _" + + - name: "vld2{neon_type[1].dup_nox}" + doc: Load single 2-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-v7 + - *target-is-arm + - *neon-fp16 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld2]]}]] + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x2_t, f16] + - ["*const f16", float16x8x2_t, f16] + compose: + - LLVMLink: + name: "vld2dup.{neon_type[1]}" + arguments: + - "ptr: {type[0]}" + - "size: i32" + links: + - link: "llvm.arm.neon.vld2dup.v{neon_type[1].lane}{type[2]}.p0" + arch: arm + - FnCall: + - "_vld2{neon_type[1].dup_nox}" + - - "a as _" + - "2" + + + - name: "vld2{neon_type[1].dup_nox}" + doc: Load single 2-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *target-not-arm + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld2r]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x2_t, f16] + - ["*const f16", float16x8x2_t, f16] + compose: + - LLVMLink: + name: "vld2dup.{neon_type[1]}" + arguments: + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.ld2r.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld2{neon_type[1].dup_nox}" + - - "a as _" + + + - name: "vld2{neon_type[1].lane_nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *enable-v7 + - *target-is-arm + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['vld2', 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x2_t, f16, float16x4_t, "2"] + - ["*const f16", float16x8x2_t, f16, float16x8_t, "3"] + compose: + - FnCall: + - "static_assert_uimm_bits!" 
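Each `LLVMLink` block above is lowered to a private extern declaration that the public intrinsic forwards to. Note that the Arm links take a trailing `size: i32` argument (alignment/element size, passed as the literal `"2"` for f16 in the `compose` step), while the AArch64 `ld2`/`ld2r` links take only the pointer. A rough, hypothetical sketch of the Arm-side binding for the `float16x4x2_t` row, with the link name copied from the YAML (the ABI string, feature gates, and attribute spelling of the real generated code may differ, and `f16` itself is still nightly-only):

```rust
// Illustration of how the LLVMLink fields map onto an extern declaration plus a
// forwarding call; not the literal generator output.
unsafe extern "unadjusted" {
    #[link_name = "llvm.arm.neon.vld2.v4f16.p0"]
    fn _vld2_f16(ptr: *const f16, size: i32) -> float16x4x2_t;
}

#[inline]
#[target_feature(enable = "neon,fp16")] // feature names approximate
pub unsafe fn vld2_f16(a: *const f16) -> float16x4x2_t {
    _vld2_f16(a, 2) // 2 = element size in bytes for f16
}
```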
+ - - LANE + - "{type[4]}" + - LLVMLink: + name: "vld2.{neon_type[1]}" + arguments: + - "ptr: *const f16" + - "a: {neon_type[3]}" + - "b: {neon_type[3]}" + - "n: i32" + - "size: i32" + links: + - link: "llvm.arm.neon.vld2lane.v{neon_type[1].lane}{type[2]}.p0" + arch: arm + - FnCall: + - "_vld2{neon_type[1].lane_nox}" + - - "a as _" + - "b.0" + - "b.1" + - "LANE" + - "2" + + + - name: "vld2{neon_type[1].lane_nox}" + doc: Load multiple 2-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *target-not-arm + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x2_t, f16, float16x4_t, "2"] + - ["*const f16", float16x8x2_t, f16, float16x8_t, "3"] + compose: + - FnCall: + - "static_assert_uimm_bits!" + - - LANE + - "{type[4]}" + - LLVMLink: + name: "vld2.{neon_type[1]}" + arguments: + - "a: {neon_type[3]}" + - "b: {neon_type[3]}" + - "n: i64" + - "ptr: *const f16" + links: + - link: "llvm.aarch64.neon.ld2lane.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld2{neon_type[1].lane_nox}" + - - "b.0" + - "b.1" + - "LANE as i64" + - "a as _" + + + - name: "vld3{neon_type[1].nox}" + doc: Load single 3-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-v7 + - *target-is-arm + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld3]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x3_t, f16] + - ["*const f16", float16x8x3_t, f16] + compose: + - LLVMLink: + name: "vld3.{neon_type[1]}" + arguments: + - "ptr: {type[0]}" + - "size: i32" + links: + - link: "llvm.arm.neon.vld3.v{neon_type[1].lane}{type[2]}.p0" + arch: arm + - FnCall: + - "_vld3{neon_type[1].nox}" + - - "a as _" + - "2" + + - name: "vld3{neon_type[1].nox}" + doc: Load single 3-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *target-not-arm + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld3]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x3_t, f16] + - ["*const f16", float16x8x3_t, f16] + compose: + - LLVMLink: + name: "vld3.{neon_type[1]}" + arguments: + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld3{neon_type[1].nox}" + - - "a as _" + + - name: "vld3{neon_type[1].dup_nox}" + doc: Load single 3-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-v7 + - *target-is-arm + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld3]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x3_t, f16] + - ["*const f16", float16x8x3_t, f16] + compose: + - LLVMLink: + name: "vld3dup.{neon_type[1]}" + arguments: + - "ptr: {type[0]}" + - "size: i32" + links: + - link: "llvm.arm.neon.vld3dup.v{neon_type[1].lane}{type[2]}.p0" + arch: arm + - FnCall: + - "_vld3{neon_type[1].dup_nox}" + - - "a as _" + - "2" + + + - name: "vld3{neon_type[1].dup_nox}" + doc: Load single 
3-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *target-not-arm + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld3r]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x3_t, f16] + - ["*const f16", float16x8x3_t, f16] + compose: + - LLVMLink: + name: "vld3dup.{neon_type[1]}" + arguments: + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.ld3r.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld3{neon_type[1].dup_nox}" + - - "a as _" + + + - name: "vld3{neon_type[1].lane_nox}" + doc: Load multiple 3-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *enable-v7 + - *target-is-arm + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['vld3', 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x3_t, f16, float16x4_t, "2"] + - ["*const f16", float16x8x3_t, f16, float16x8_t, "3"] + compose: + - FnCall: + - "static_assert_uimm_bits!" + - - LANE + - "{type[4]}" + - LLVMLink: + name: "vld3.{neon_type[1]}" + arguments: + - "ptr: *const f16" + - "a: {neon_type[3]}" + - "b: {neon_type[3]}" + - "c: {neon_type[3]}" + - "n: i32" + - "size: i32" + links: + - link: "llvm.arm.neon.vld3lane.v{neon_type[1].lane}{type[2]}.p0" + arch: arm + - FnCall: + - "_vld3{neon_type[1].lane_nox}" + - - "a as _" + - "b.0" + - "b.1" + - "b.2" + - "LANE" + - "2" + + + - name: "vld3{neon_type[1].lane_nox}" + doc: Load multiple 3-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *target-not-arm + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x3_t, f16, float16x4_t, "2"] + - ["*const f16", float16x8x3_t, f16, float16x8_t, "3"] + compose: + - FnCall: + - "static_assert_uimm_bits!" 
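All of the `.lane_nox` entries, the f16 ones above and the integer/float ones that follow, take the lane index as a `const LANE: i32` generic, validate it with `static_assert_uimm_bits!`, and carry `rustc_legacy_const_generics(2)` so the index can also be written as a third value argument by pre-const-generics callers. A usage sketch, using the stable s16 variant for brevity (the f16 rows follow the same pattern behind the unstable f16 feature; AArch64 names assumed):

```rust
use core::arch::aarch64::*;

// Reload lane 2 of each of the three registers from the 3-element structure at
// `p`. The generated `static_assert_uimm_bits!(LANE, 2)` limits the index to
// 0..=3 for the 4-lane int16x4x3_t case; an out-of-range LANE fails to compile.
unsafe fn reload_lane2(p: *const i16, acc: int16x4x3_t) -> int16x4x3_t {
    vld3_lane_s16::<2>(p, acc)
}
```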
+ - - LANE + - "{type[4]}" + - LLVMLink: + name: "vld3.{neon_type[1]}" + arguments: + - "a: {neon_type[3]}" + - "b: {neon_type[3]}" + - "c: {neon_type[3]}" + - "n: i64" + - "ptr: *const f16" + links: + - link: "llvm.aarch64.neon.ld3lane.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld3{neon_type[1].lane_nox}" + - - "b.0" + - "b.1" + - "b.2" + - "LANE as i64" + - "a as _" + + - name: "vld3{neon_type[1].lane_nox}" + doc: "Load multiple 3-element structures to two registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *target-not-arm + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ['*const i8', int8x8x3_t, int8x8_t, i8, '3'] + - ['*const i16', int16x8x3_t, int16x8_t, i16, '4'] + - ['*const i32', int32x4x3_t, int32x4_t, i32, '2'] + - ['*const i16', int16x4x3_t, int16x4_t, i16, '2'] + - ['*const i32', int32x2x3_t, int32x2_t, i32, '1'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[4]}']] + - LLVMLink: + name: 'ld3lane.{neon_type[2]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'n: i64' + - 'ptr: *const i8' + links: + - link: 'llvm.aarch64.neon.ld3lane.v{neon_type[1].lane}{type[3]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld3{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'LANE as i64', 'a as _']] + + - name: "vld3{neon_type[1].nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-stable + - *target-not-arm + assert_instr: [ld3] + safety: + unsafe: [neon] + types: + - ['*const i8', int8x8x3_t, '*const int8x8_t', i8] + - ['*const i16', int16x4x3_t, '*const int16x4_t', i16] + - ['*const i32', int32x2x3_t, '*const int32x2_t', i32] + - ['*const i8', int8x16x3_t, '*const int8x16_t', i8] + - ['*const i16', int16x8x3_t, '*const int16x8_t', i16] + - ['*const i32', int32x4x3_t, '*const int32x4_t', i32] + - ['*const f32', float32x2x3_t, '*const float32x2_t', f32] + - ['*const f32', float32x4x3_t, '*const float32x4_t', f32] + compose: + - LLVMLink: + name: 'vld3{neon_type[1].nox}' + arguments: + - 'ptr: {type[2]}' + links: + - link: 'llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[3]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld3{neon_type[1].nox}', ['a as _']] + + - name: "vld3{neon_type[1].nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-stable + - *target-not-arm + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - ['*const i64', int64x1x3_t, '*const int64x1_t', i64] + compose: + - LLVMLink: + name: "vld3{neon_type[1].nox}" + arguments: + - 'ptr: {type[2]}' + links: + - link: 'llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[3]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld3{neon_type[1].nox}', ['a as _']] + + - name: "vld3{neon_type[1].nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - *neon-arm-unstable + assert_instr: [vld3] + safety: + unsafe: [neon] + types: + - ['*const i8', int8x8x3_t, i8] + - ['*const i16', int16x4x3_t, i16] + - ['*const i32', int32x2x3_t, i32] + - ['*const i8', int8x16x3_t, i8] + - ['*const i16', int16x8x3_t, i16] + - ['*const i32', int32x4x3_t, i32] + - 
['*const f32', float32x2x3_t, f32] + - ['*const f32', float32x4x3_t, f32] + compose: + - LLVMLink: + name: 'vld3{neon_type[1].nox}' + arguments: + - 'ptr: *const i8' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vld3.v{neon_type[1].lane}{type[2]}.p0' + arch: arm + - FnCall: ['_vld3{neon_type[1].nox}', ['a as *const i8', '{neon_type[1].base_byte_size}']] + + - name: "vld3{neon_type[1].nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - *neon-arm-unstable + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - ['*const i64', int64x1x3_t, i64] + compose: + - LLVMLink: + name: 'vld3{neon_type[1].nox}' + arguments: + - 'ptr: *const i8' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vld3.v{neon_type[1].lane}{type[2]}.p0' + arch: arm + - FnCall: ['_vld3{neon_type[1].nox}', ['a as *const i8', '{neon_type[1].base_byte_size}']] + + - name: "vld3{neon_type[1].lane_nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *target-not-arm + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - ['*const f32', float32x4x3_t, float32x4_t, f32, '2'] + - ['*const f32', float32x2x3_t, float32x2_t, f32, '1'] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', '{type[4]}']] + - LLVMLink: + name: 'vld3{neon_type[1].lane_nox}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'n: i64' + - 'ptr: *const i8' + links: + - link: 'llvm.aarch64.neon.ld3lane.v{neon_type[1].lane}{type[3]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld3{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'LANE as i64', 'a as _']] + + - name: "vld3{neon_type[2].lane_nox}" + doc: "Load multiple 3-element structures to three registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vld3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-arm-unstable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - ['*const f32', float32x2x3_t, float32x2_t, f32, '1', '4'] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', '{type[4]}']] + - LLVMLink: + name: 'vld3{neon_type[1].lane_nox}' + arguments: + - 'ptr: *const i8' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'n: i32' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vld3lane.v{neon_type[1].lane}{type[3]}.p0' + arch: arm + - FnCall: ['_vld3{neon_type[1].lane_nox}', ['a as _', 'b.0', 'b.1', 'b.2', 'LANE', '{type[5]}']] + + - name: "vld3{neon_type[2].lane_nox}" + doc: "Load multiple 3-element structures to two registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vld3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-arm-unstable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - ['*const i8', int8x8x3_t, int8x8_t, i8, '3', '1'] + - ['*const i16', int16x4x3_t, int16x4_t, i16, '2', '2'] + - ['*const i32', int32x2x3_t, int32x2_t, i32, '1', '4'] + - ['*const i16', int16x8x3_t, int16x8_t, i16, '3', '2'] 
+ - ['*const i32', int32x4x3_t, int32x4_t, i32, '2', '4'] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', '{type[4]}']] + - LLVMLink: + name: 'vld3{neon_type[1].lane_nox}' + arguments: + - 'ptr: *const i8' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'n: i32' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vld3lane.v{neon_type[1].lane}{type[3]}.p0' + arch: arm + - FnCall: ['_vld3{neon_type[1].lane_nox}', ['a as _', 'b.0', 'b.1', 'b.2', 'LANE', '{type[5]}']] + + - name: "vld3{neon_type[2].lane_nox}" + doc: "Load multiple 3-element structures to three registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vld3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-arm-unstable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - ['*const f32', float32x4x3_t, float32x4_t, f32, '2', '4'] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', '{type[4]}']] + - LLVMLink: + name: 'vld3{neon_type[1].lane_nox}' + arguments: + - 'ptr: *const i8' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'n: i32' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vld3lane.v{neon_type[1].lane}{type[3]}.p0' + arch: arm + - FnCall: ['_vld3{neon_type[1].lane_nox}', ['a as _', 'b.0', 'b.1', 'b.2', 'LANE', '{type[5]}']] + + - name: "vld3{neon_type[1].lane_nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld3, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - ['*const u8', uint8x8x3_t, int8x8x3_t, '3'] + - ['*const u16', uint16x4x3_t, int16x4x3_t, '2'] + - ['*const u32', uint32x2x3_t, int32x2x3_t, '1'] + - ['*const p8', poly8x8x3_t, int8x8x3_t, '3'] + - ['*const u16', uint16x8x3_t, int16x8x3_t, '3'] + - ['*const p16', poly16x4x3_t, int16x4x3_t, '2'] + - ['*const p16', poly16x8x3_t, int16x8x3_t, '3'] + - ['*const u32', uint32x4x3_t, int32x4x3_t, '2'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - FnCall: + - transmute + - - FnCall: + - 'vld3{neon_type[2].lane_nox}::' + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vld3{neon_type[1].nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld3]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld3]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ['*const u8', uint8x8x3_t, int8x8x3_t] + - ['*const u8', uint8x16x3_t, int8x16x3_t] + - ['*const u16', uint16x4x3_t, int16x4x3_t] + - ['*const u32', uint32x2x3_t, int32x2x3_t] + - ['*const u16', uint16x8x3_t, int16x8x3_t] + - ['*const u32', uint32x4x3_t, int32x4x3_t] + - ['*const p8', poly8x8x3_t, int8x8x3_t] + - ['*const p8', poly8x16x3_t, int8x16x3_t] + - ['*const p16', poly16x4x3_t, int16x4x3_t] + - ['*const p16', poly16x8x3_t, int16x8x3_t] + compose: + - FnCall: + - transmute + - - FnCall: + - 
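To make the row-to-template substitution concrete: in the lane entry above, for the row `['*const u16', uint16x4x3_t, int16x4x3_t, '2']`, `{type[0]}` is the pointer type, `{neon_type[1]}` the public tuple type, `{neon_type[2]}` the signed type it delegates to, and `{type[3]}` the bit width handed to `static_assert_uimm_bits!`. The generated item therefore looks roughly like this sketch (`static_assert_uimm_bits!` and `transmute` are the crate-internal helpers named in the spec; other attributes omitted):

```rust
// Sketch of the expansion for ['*const u16', uint16x4x3_t, int16x4x3_t, '2'].
#[inline]
#[target_feature(enable = "neon")]
pub unsafe fn vld3_lane_u16<const LANE: i32>(a: *const u16, b: uint16x4x3_t) -> uint16x4x3_t {
    static_assert_uimm_bits!(LANE, 2); // LANE must fit in 2 bits, i.e. 0..=3
    transmute(vld3_lane_s16::<LANE>(transmute(a), transmute(b)))
}
```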
'vld3{neon_type[2].nox}' + - - FnCall: [transmute, [a]] + + - name: "vld3{neon_type[1].nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ['*const u64', uint64x1x3_t, int64x1x3_t] + compose: + - FnCall: + - transmute + - - FnCall: + - 'vld3{neon_type[2].nox}' + - - FnCall: [transmute, [a]] + + - name: "vld3{neon_type[1].nox}" + doc: Load multiple 3-element structures to three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ['*const p64', poly64x1x3_t, int64x1x3_t] + compose: + - FnCall: + - transmute + - - FnCall: + - 'vld3{neon_type[2].nox}' + - - FnCall: + - transmute + - - a + + - name: "vld3{neon_type[1].dup_nox}" + doc: Load single 3-element structure and replicate to all lanes of three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: [*target-not-arm, *neon-stable] + assert_instr: [ld3r] + safety: + unsafe: [neon] + types: + - ["*const i8", int8x8x3_t, i8] + - ["*const i16", int16x4x3_t, i16] + - ["*const i32", int32x2x3_t, i32] + - ["*const i32", int32x4x3_t, i32] + - ["*const i16", int16x8x3_t, i16] + - ["*const i8", int8x16x3_t, i8] + - ["*const i64", int64x1x3_t, i64] + - ["*const f32", float32x4x3_t, f32] + - ["*const f32", float32x2x3_t, f32] + compose: + - LLVMLink: + name: 'ld3r{neon_type[1].dup_nox}' + arguments: + - 'ptr: {type[0]}' + links: + - link: 'llvm.aarch64.neon.ld3r.v{neon_type[1].lane}{type[2]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld3{neon_type[1].dup_nox}', ['a as _']] + + - name: "vld3{neon_type[1].dup_nox}" + doc: Load single 3-element structure and replicate to all lanes of three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: [*enable-v7, *target-is-arm, *neon-arm-unstable] + assert_instr: [vld3] + safety: + unsafe: [neon] + types: + - ["*const i8", int8x8x3_t, i8, '1'] + - ["*const i16", int16x4x3_t, i16, '2'] + - ["*const i32", int32x2x3_t, i32, '4'] + - ["*const i8", int8x16x3_t, i8, '1'] + - ["*const i16", int16x8x3_t, i16, '2'] + - ["*const i32", int32x4x3_t, i32, '4'] + - ["*const f32", float32x4x3_t, f32, '4'] + - ["*const f32", float32x2x3_t, f32, '4'] + compose: + - LLVMLink: + name: 'vld3{neon_type[1].dup_nox}' + arguments: + - 'ptr: *const i8' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vld3dup.v{neon_type[1].lane}{type[2]}.p0' + arch: arm + - FnCall: ['_vld3{neon_type[1].dup_nox}', ['a as *const i8', '{type[3]}']] + + - name: "vld3{neon_type[1].dup_nox}" + doc: Load single 3-element structure and replicate to all lanes of three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld3]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld3r]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ['*const u8', uint8x8x3_t, int8x8x3_t] + - ['*const 
u16', uint16x4x3_t, int16x4x3_t] + - ['*const u32', uint32x2x3_t, int32x2x3_t] + - ['*const u8', uint8x16x3_t, int8x16x3_t] + - ['*const u16', uint16x8x3_t, int16x8x3_t] + - ['*const u32', uint32x4x3_t, int32x4x3_t] + - ['*const p8', poly8x8x3_t, int8x8x3_t] + - ['*const p16', poly16x4x3_t, int16x4x3_t] + - ['*const p8', poly8x16x3_t, int8x16x3_t] + - ['*const p16', poly16x8x3_t, int16x8x3_t] + compose: + - FnCall: + - transmute + - - FnCall: + - 'vld3{neon_type[2].dup_nox}' + - - FnCall: + - transmute + - - a + + - name: "vld3{neon_type[1].dup_nox}" + doc: Load single 3-element structure and replicate to all lanes of three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: [*target-is-arm, *enable-v7, *neon-arm-unstable] + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - ["*const i64", int64x1x3_t, i64, '8'] + compose: + - LLVMLink: + name: 'vld3{neon_type[1].dup_nox}' + arguments: + - 'ptr: *const i8' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vld3dup.v{neon_type[1].lane}{type[2]}.p0' + arch: arm + - FnCall: ['_vld3{neon_type[1].dup_nox}', ['a as *const i8', '{type[3]}']] + + - name: "vld3{neon_type[1].dup_nox}" + doc: Load single 3-element structure and replicate to all lanes of three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld3r]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const u64", uint64x1x3_t, int64x1x3_t] + compose: + - FnCall: + - transmute + - - FnCall: + - 'vld3{neon_type[2].dup_nox}' + - - FnCall: + - transmute + - - a + + - name: "vld3{neon_type[1].dup_nox}" + doc: Load single 3-element structure and replicate to all lanes of three registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld3r]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const p64", poly64x1x3_t, int64x1x3_t] + compose: + - FnCall: + - transmute + - - FnCall: + - 'vld3{neon_type[2].dup_nox}' + - - FnCall: + - transmute + - - a + + - name: "vld4{neon_type[1].nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-not-arm + - *neon-stable + assert_instr: [ld4] + safety: + unsafe: [neon] + types: + - ['*const i8', int8x8x4_t, i8, '*const int8x8_t'] + - ['*const i32', int32x4x4_t, i32, '*const int32x4_t'] + - ['*const i16', int16x4x4_t, i16, '*const int16x4_t'] + - ['*const i32', int32x2x4_t, i32, '*const int32x2_t'] + - ['*const i8', int8x16x4_t, i8, '*const int8x16_t'] + - ['*const i16', int16x8x4_t, i16, '*const int16x8_t'] + - ['*const f32', float32x2x4_t, f32, '*const float32x2_t'] + - ['*const f32', float32x4x4_t, f32, '*const float32x4_t'] + compose: + - LLVMLink: + name: 'vld4{neon_type[1].nox}' + arguments: + - 'ptr: {type[3]}' + links: + - link: 'llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld4{neon_type[1].nox}', ['a as _']] + + - name: "vld4{neon_type[1].nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: [*target-not-arm, 
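For reference, the `dup` loads bound above have broadcast semantics: a single 3-element structure is read from memory and each of its elements is replicated across every lane of one result register. Usage sketch (AArch64 names assumed):

```rust
use core::arch::aarch64::*;

// Reads the triple [r, g, b] at `p` and broadcasts it, so that
// res.0 == [r; 8], res.1 == [g; 8] and res.2 == [b; 8].
unsafe fn splat_rgb(p: *const u8) -> uint8x8x3_t {
    vld3_dup_u8(p)
}
```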
*neon-stable] + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - ['*const i64', int64x1x4_t, i64, '*const int64x1_t'] + compose: + - LLVMLink: + name: 'vld4{neon_type[1].nox}' + arguments: + - 'ptr: {type[3]}' + links: + - link: 'llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld4{neon_type[1].nox}', ['a as _']] + + - name: "vld4{neon_type[1].lane_nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *target-not-arm + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-stable + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ['*const i8', int8x8x4_t, int8x8_t, i8, '3'] + - ['*const i16', int16x4x4_t, int16x4_t, i16, '2'] + - ['*const i16', int16x8x4_t, int16x8_t, i16, '3'] + - ['*const i32', int32x2x4_t, int32x2_t, i32, '1'] + - ['*const i32', int32x4x4_t, int32x4_t, i32, '2'] + - ['*const f32', float32x2x4_t, float32x2_t, f32, '1'] + - ['*const f32', float32x4x4_t, float32x4_t, f32, '2'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[4]}']] + - LLVMLink: + name: 'ld4lane.{neon_type[2]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + - 'n: i64' + - 'ptr: *const i8' + links: + - link: 'llvm.aarch64.neon.ld4lane.v{neon_type[1].lane}{type[3]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vld4{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'LANE as i64', 'a as _']] + + - name: "vld4{neon_type[1].nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vld4] + safety: + unsafe: [neon] + types: + - ['*const i8', int8x8x4_t, i8, '1'] + - ['*const i16', int16x4x4_t, i16, '2'] + - ['*const i32', int32x2x4_t, i32, '4'] + - ['*const i8', int8x16x4_t, i8, '1'] + - ['*const i16', int16x8x4_t, i16, '2'] + - ['*const i32', int32x4x4_t, i32, '4'] + - ['*const f32', float32x4x4_t, f32, '4'] + - ['*const f32', float32x2x4_t, f32, '4'] + compose: + - LLVMLink: + name: 'vld4{neon_type[1].nox}' + arguments: + - 'ptr: *const i8' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vld4.v{neon_type[1].lane}{type[2]}.p0' + arch: arm + - FnCall: ['_vld4{neon_type[1].nox}', ['a as *const i8', '{type[3]}']] + + - name: "vld4{neon_type[1].nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - ['*const i64', int64x1x4_t, i64, '8'] + compose: + - LLVMLink: + name: 'vld4{neon_type[1].nox}' + arguments: + - 'ptr: *const i8' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vld4.v{neon_type[1].lane}{type[2]}.p0' + arch: arm + - FnCall: ['_vld4{neon_type[1].nox}', ['a as *const i8', '{type[3]}']] + + - name: "vld4{neon_type[1].nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld4]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld4]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ['*const u8', uint8x8x4_t, 
int8x8x4_t] + - ['*const u16', uint16x4x4_t, int16x4x4_t] + - ['*const u32', uint32x2x4_t, int32x2x4_t] + - ['*const u8', uint8x16x4_t, int8x16x4_t] + - ['*const u16', uint16x8x4_t, int16x8x4_t] + - ['*const u32', uint32x4x4_t, int32x4x4_t] + - ['*const p8', poly8x8x4_t, int8x8x4_t] + - ['*const p16', poly16x4x4_t, int16x4x4_t] + - ['*const p8', poly8x16x4_t, int8x16x4_t] + - ['*const p16', poly16x8x4_t, int16x8x4_t] + compose: + - FnCall: + - transmute + - - FnCall: + - 'vld4{neon_type[2].nox}' + - - FnCall: + - transmute + - - a + + - name: "vld4{neon_type[1].nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ['*const u64', uint64x1x4_t, int64x1x4_t] + compose: + - FnCall: + - transmute + - - FnCall: + - 'vld4{neon_type[2].nox}' + - - FnCall: + - transmute + - - a + + - name: "vld4{neon_type[1].nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: + - target_feature + - - 'enable = "neon,aes"' + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ['*const p64', poly64x1x4_t, int64x1x4_t] + compose: + - FnCall: + - transmute + - - FnCall: + - 'vld4{neon_type[2].nox}' + - - FnCall: + - transmute + - - a + + - name: "vld4{neon_type[1].lane_nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *enable-v7 + - *target-is-arm + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vld4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-arm-unstable + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - ['*const i8', int8x8x4_t, int8x8_t, i8, '1', '3'] + - ['*const i16', int16x4x4_t, int16x4_t, i16, '2', '2'] + - ['*const i32', int32x2x4_t, int32x2_t, i32, '4', '1'] + - ['*const i16', int16x8x4_t, int16x8_t, i16, '2', '3'] + - ['*const i32', int32x4x4_t, int32x4_t, i32, '4', '2'] + - ['*const f32', float32x2x4_t, float32x2_t, f32, '4', '1'] + - ['*const f32', float32x4x4_t, float32x4_t, f32, '4', '2'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[5]}']] + - LLVMLink: + name: 'ld4lane.{neon_type[2]}' + arguments: + - 'ptr: *const i8' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + - 'n: i32' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vld4lane.v{neon_type[1].lane}{type[3]}.p0' + arch: arm + - FnCall: ['_vld4{neon_type[1].lane_nox}', ['a as _', 'b.0', 'b.1', 'b.2', 'b.3', LANE, '{type[4]}']] + + - name: "vld4{neon_type[1].lane_nox}" + doc: Load multiple 4-element structures to four registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld4, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-not-arm-stable + - 
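The vld4 family defined above performs a de-interleaving load, the usual way to split packed pixel data into per-channel registers. Usage sketch (AArch64 names assumed):

```rust
use core::arch::aarch64::*;

// Loads 16 packed RGBA pixels and de-interleaves them: px.0 holds the 16 R
// bytes, px.1 the G bytes, px.2 the B bytes and px.3 the A bytes.
unsafe fn load_rgba16(p: *const u8) -> uint8x16x4_t {
    vld4q_u8(p)
}
```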
*neon-cfg-arm-unstable + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - ['*const u8', uint8x8x4_t, int8x8x4_t, '3'] + - ['*const u16', uint16x4x4_t, int16x4x4_t, '2'] + - ['*const u32', uint32x2x4_t, int32x2x4_t, '1'] + - ['*const u16', uint16x8x4_t, int16x8x4_t, '3'] + - ['*const u32', uint32x4x4_t, int32x4x4_t, '2'] + - ['*const p8', poly8x8x4_t, int8x8x4_t, '3'] + - ['*const p16', poly16x4x4_t, int16x4x4_t, '2'] + - ['*const p16', poly16x8x4_t, int16x8x4_t, '3'] + compose: + - FnCall: [static_assert_uimm_bits!, ['LANE', '{type[3]}']] + - FnCall: + - transmute + - - FnCall: + - 'vld4{neon_type[2].lane_nox}::' + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst1{neon_type[1].lane_nox}" + doc: "Store multiple single-element structures from one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + types: + - ['*mut i64', int64x1_t] + - ['*mut u64', uint64x1_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - Assign: + - "*a" + - FnCall: [simd_extract!, [b, 'LANE as u32']] + - Identifier: [';', Symbol] + + - name: "vst1{neon_type[1].lane_nox}" + doc: "Store multiple single-element structures from one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + attr: + - *neon-v8 + - FnCall: + - target_feature + - - 'enable = "neon,aes"' + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + types: + - ['*mut p64', poly64x1_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - Assign: + - "*a" + - FnCall: [simd_extract!, [b, 'LANE as u32']] + - Identifier: [';', Symbol] + + - name: "vst1{neon_type[1].lane_nox}" + doc: "Store multiple single-element structures from one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + attr: + - *neon-v8 + - FnCall: + - target_feature + - - 'enable = "neon,aes"' + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + types: + - ['*mut p64', poly64x2_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '1']] + - Assign: + - "*a" + - FnCall: [simd_extract!, [b, 'LANE as u32']] + - Identifier: [';', Symbol] + + - name: "vst1{neon_type[1].lane_nox}" + doc: "Store multiple single-element structures from one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: 
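Unlike the structure loads, the `vst1` lane entries above do not go through an `LLVMLink` at all: the `compose` is a lane extract followed by an ordinary scalar store. In generated form that is roughly the following sketch (`simd_extract!` and `static_assert!` are crate-internal macros; other attributes omitted):

```rust
// Sketch of the ['*mut i64', int64x1_t] row: a one-lane vector only admits
// LANE == 0, and the store is a plain scalar write of the extracted lane.
#[inline]
#[target_feature(enable = "neon")]
pub unsafe fn vst1_lane_s64<const LANE: i32>(a: *mut i64, b: int64x1_t) {
    static_assert!(LANE == 0);
    *a = simd_extract!(b, LANE as u32);
}
```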
[rustc_legacy_const_generics, ["2"]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + types: + - ['*mut i8', int8x8_t, '3'] + - ['*mut i16', int16x4_t, '2'] + - ['*mut i32', int32x2_t, '1'] + - ['*mut i8', int8x16_t, '4'] + - ['*mut i16', int16x8_t, '3'] + - ['*mut i32', int32x4_t, '2'] + - ['*mut i64', int64x2_t, '1'] + - ['*mut u8', uint8x8_t, '3'] + - ['*mut u16', uint16x4_t, '2'] + - ['*mut u32', uint32x2_t, '1'] + - ['*mut u8', uint8x16_t, '4'] + - ['*mut u16', uint16x8_t, '3'] + - ['*mut u32', uint32x4_t, '2'] + - ['*mut u64', uint64x2_t, '1'] + - ['*mut p8', poly8x8_t, '3'] + - ['*mut p16', poly16x4_t, '2'] + - ['*mut p8', poly8x16_t, '4'] + - ['*mut p16', poly16x8_t, '3'] + - ['*mut f32', float32x2_t, '1'] + - ['*mut f32', float32x4_t, '2'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - Assign: + - "*a" + - FnCall: [simd_extract!, [b, 'LANE as u32']] + - Identifier: [';', Symbol] + + + - name: "vst1{neon_type[1].lane_nox}" + doc: "Store multiple single-element structures from one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-fp16 + - *neon-unstable-f16 + types: + - ['*mut f16', float16x4_t, '2'] + - ['*mut f16', float16x8_t, '3'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - Assign: + - "*a" + - FnCall: [simd_extract!, [b, 'LANE as u32']] + - Identifier: [';', Symbol] + + + - name: 'vst1{neon_type[1].no}' + doc: "Store multiple single-element structures from one, two, three, or four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + safety: + unsafe: [neon] + attr: + - *target-not-arm + - *neon-stable + assert_instr: [st1] + types: + - [i8, int8x8x2_t, int8x8_t] + - [i16, int16x4x2_t, int16x4_t] + - [i32, int32x2x2_t, int32x2_t] + - [i64, int64x1x2_t, int64x1_t] + - [i8, int8x16x2_t, int8x16_t] + - [i16, int16x8x2_t, int16x8_t] + - [i32, int32x4x2_t, int32x4_t] + - [i64, int64x2x2_t, int64x2_t] + compose: + - LLVMLink: + name: 'st1x2.{neon_type[1]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'ptr: *mut {type[0]}' + links: + - link: 'llvm.aarch64.neon.st1x2.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst1{neon_type[1].no}', ['b.0', 'b.1', 'a']] + + - name: 'vst1{neon_type[1].no}' + doc: "Store multiple single-element structures from one, two, three, or four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + safety: + unsafe: [neon] + attr: + - *target-not-arm + - *neon-stable + assert_instr: [st1] + types: + - [i8, int8x8x3_t, int8x8_t] + - [i16, int16x4x3_t, int16x4_t] + - [i32, int32x2x3_t, int32x2_t] + - [i64, int64x1x3_t, int64x1_t] + - [i8, int8x16x3_t, int8x16_t] + - [i16, int16x8x3_t, int16x8_t] + - [i32, int32x4x3_t, int32x4_t] + - [i64, int64x2x3_t, int64x2_t] + compose: + - LLVMLink: + name: 'st1x3.{neon_type[1]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'ptr: *mut {type[0]}' + links: + - link: 'llvm.aarch64.neon.st1x3.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst1{neon_type[1].no}', ['b.0', 'b.1', 'b.2', 'a']] + + - name: 'vst1{neon_type[1].no}' + doc: "Store multiple single-element structures from one, two, three, 
or four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + safety: + unsafe: [neon] + attr: + - *target-not-arm + - *neon-stable + assert_instr: [st1] + types: + - [i8, int8x8x4_t, int8x8_t] + - [i16, int16x4x4_t, int16x4_t] + - [i32, int32x2x4_t, int32x2_t] + - [i64, int64x1x4_t, int64x1_t] + - [i8, int8x16x4_t, int8x16_t] + - [i16, int16x8x4_t, int16x8_t] + - [i32, int32x4x4_t, int32x4_t] + - [i64, int64x2x4_t, int64x2_t] + compose: + - LLVMLink: + name: 'st1x4.{neon_type[1]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + - 'ptr: *mut {type[0]}' + links: + - link: 'llvm.aarch64.neon.st1x4.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst1{neon_type[1].no}', ['b.0', 'b.1', 'b.2', 'b.3', 'a']] + + - name: 'vst1{neon_type[1].no}' + doc: "Store multiple single-element structures from one, two, three, or four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + safety: + unsafe: [neon] + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vst1] + types: + - [i8, int8x8x2_t, int8x8_t] + - [i16, int16x4x2_t, int16x4_t] + - [i32, int32x2x2_t, int32x2_t] + - [i64, int64x1x2_t, int64x1_t] + - [i8, int8x16x2_t, int8x16_t] + - [i16, int16x8x2_t, int16x8_t] + - [i32, int32x4x2_t, int32x4_t] + - [i64, int64x2x2_t, int64x2_t] + compose: + - LLVMLink: + name: 'st1x2.{neon_type[1]}' + arguments: + - 'ptr: *mut {type[0]}' + - 'a: {type[2]}' + - 'b: {type[2]}' + links: + - link: 'llvm.arm.neon.vst1x2.v{neon_type[1].lane}{type[0]}.p0' + arch: arm + - FnCall: ['_vst1{neon_type[1].no}', ['a', 'b.0', 'b.1']] + + - name: 'vst1{neon_type[1].no}' + doc: "Store multiple single-element structures from one, two, three, or four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + safety: + unsafe: [neon] + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vst1] + types: + - [i8, int8x8x3_t, int8x8_t] + - [i16, int16x4x3_t, int16x4_t] + - [i32, int32x2x3_t, int32x2_t] + - [i64, int64x1x3_t, int64x1_t] + - [i8, int8x16x3_t, int8x16_t] + - [i16, int16x8x3_t, int16x8_t] + - [i32, int32x4x3_t, int32x4_t] + - [i64, int64x2x3_t, int64x2_t] + compose: + - LLVMLink: + name: 'st1x3.{neon_type[1]}' + arguments: + - 'ptr: *mut {type[0]}' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + links: + - link: 'llvm.arm.neon.vst1x3.p0.v{neon_type[1].lane}{type[0]}.p0' + arch: arm + - FnCall: ['_vst1{neon_type[1].no}', ['a', 'b.0', 'b.1', 'b.2']] + + - name: 'vst1{neon_type[1].no}' + doc: "Store multiple single-element structures from one, two, three, or four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + safety: + unsafe: [neon] + attr: + - *target-is-arm + - *enable-v7 + - *neon-arm-unstable + assert_instr: [vst1] + types: + - [i8, int8x8x4_t, int8x8_t] + - [i16, int16x4x4_t, int16x4_t] + - [i32, int32x2x4_t, int32x2_t] + - [i64, int64x1x4_t, int64x1_t] + - [i8, int8x16x4_t, int8x16_t] + - [i16, int16x8x4_t, int16x8_t] + - [i32, int32x4x4_t, int32x4_t] + - [i64, int64x2x4_t, int64x2_t] + compose: + - LLVMLink: + name: 'st1x4.{neon_type[1]}' + arguments: + - 'ptr: *mut {type[0]}' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + links: + - link: 'llvm.arm.neon.vst1x4.p0.v{neon_type[1].lane}{type[0]}.p0' + arch: arm + - FnCall: ['_vst1{neon_type[1].no}', ['a', 'b.0', 'b.1', 'b.2', 'b.3']] + + - name: 'vst1{neon_type[1].no}' + doc: "Store multiple single-element structures to one, two, three, or four 
registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + safety: + unsafe: [neon] + attr: + - *target-is-arm + - *enable-v7 + - *neon-arm-unstable + assert_instr: [vst1] + types: + - [f32, float32x2x4_t, float32x2_t] + - [f32, float32x4x4_t, float32x4_t] + compose: + - LLVMLink: + name: 'st1x4.{neon_type[1]}' + arguments: + - 'ptr: *mut {type[0]}' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + links: + - link: 'llvm.arm.neon.vst1x4.p0.v{neon_type[1].lane}{type[0]}.p0' + arch: arm + - FnCall: ['_vst1{neon_type[1].no}', ['a', 'b.0', 'b.1', 'b.2', 'b.3']] + + - name: 'vst1{neon_type[1].no}' + doc: "Store multiple single-element structures to one, two, three, or four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + safety: + unsafe: [neon] + attr: + - *target-is-arm + - *neon-v7 + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [vst1] + types: + - [f16, float16x4x4_t, float16x4_t] + - [f16, float16x8x4_t, float16x8_t] + compose: + - LLVMLink: + name: 'st1x4.{neon_type[1]}' + arguments: + - 'ptr: *mut {type[0]}' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + links: + - link: 'llvm.arm.neon.vst1x4.p0.v{neon_type[1].lane}{type[0]}' + arch: arm + - FnCall: ['_vst1{neon_type[1].no}', ['a', 'b.0', 'b.1', 'b.2', 'b.3']] + + - name: "vst2{neon_type[1].nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-v8 + - *neon-aes + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - [p64, poly64x1x2_t, int64x1x2_t] + compose: + - FnCall: + - "vst2{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst2{neon_type[1].nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *enable-v7 + - *neon-arm-unstable + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - [i64, int64x1x2_t, int64x1_t] + compose: + - LLVMLink: + name: 'vst2.{neon_type[1]}' + arguments: + - 'ptr: *mut i8' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vst2.v{neon_type[1].lane}{type[0]}.p0' + arch: arm + - FnCall: ['_vst2{neon_type[1].nox}', ['a as _', 'b.0', 'b.1', '8']] + + - name: "vst2{neon_type[1].nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - [u64, uint64x1x2_t, int64x1x2_t] + compose: + - FnCall: + - "vst2{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst2{neon_type[1].nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-not-arm + - *neon-stable + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - [i64, int64x1x2_t, int64x1_t] + compose: + - LLVMLink: + name: 'st2.{neon_type[1]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'ptr: *mut i8' + links: + - link: 
'llvm.aarch64.neon.st2.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst2{neon_type[1].nox}', ['b.0', 'b.1', 'a as _']] + + - name: "vst2{neon_type[1].nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-not-arm + - *neon-stable + assert_instr: [st2] + safety: + unsafe: [neon] + types: + - [i8, int8x8x2_t, int8x8_t] + - [i16, int16x4x2_t, int16x4_t] + - [i32, int32x2x2_t, int32x2_t] + - [i8, int8x16x2_t, int8x16_t] + - [i16, int16x8x2_t, int16x8_t] + - [i32, int32x4x2_t, int32x4_t] + - [f32, float32x2x2_t, float32x2_t] + - [f32, float32x4x2_t, float32x4_t] + compose: + - LLVMLink: + name: 'st2.{neon_type[1]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st2.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst2{neon_type[1].nox}', ['b.0', 'b.1', 'a as _']] + + + - name: "vst2{neon_type[1].nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-not-arm + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [st2] + safety: + unsafe: [neon] + types: + - [f16, float16x4x2_t, float16x4_t] + - [f16, float16x8x2_t, float16x8_t] + compose: + - LLVMLink: + name: 'st2.{neon_type[1]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st2.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst2{neon_type[1].nox}', ['b.0', 'b.1', 'a as _']] + + + - name: "vst2{neon_type[1].nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vst2]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [st2]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - [u8, uint8x8x2_t, int8x8x2_t] + - [u16, uint16x4x2_t, int16x4x2_t] + - [u32, uint32x2x2_t, int32x2x2_t] + - [u8, uint8x16x2_t, int8x16x2_t] + - [u16, uint16x8x2_t, int16x8x2_t] + - [u32, uint32x4x2_t, int32x4x2_t] + - [p8, poly8x8x2_t, int8x8x2_t] + - [p16, poly16x4x2_t, int16x4x2_t] + - [p8, poly8x16x2_t, int8x16x2_t] + - [p16, poly16x8x2_t, int16x8x2_t] + compose: + - FnCall: + - "vst2{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst2{neon_type[1].lane_nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-not-arm + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st2, 'LANE = 0']]}]] + - *neon-stable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [i8, int8x8x2_t, '3', int8x8_t] + - [i16, int16x4x2_t, '2', int16x4_t] + - [i32, int32x2x2_t, '1', int32x2_t] + - [i16, int16x8x2_t, '3', int16x8_t] + - [i32, int32x4x2_t, '2', int32x4_t] + - [f32, float32x2x2_t, '1', float32x2_t] + - [f32, float32x4x2_t, '2', float32x4_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - LLVMLink: + name: 'vst2.{neon_type[1].lane_nox}' + arguments: + - 'a: {type[3]}' + - 'b: {type[3]}' + - 'n: i64' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st2lane.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: 
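vst2 is the store-side counterpart of vld2: the two registers are interleaved on the way out, so element `i` of `b.0` lands at `p[2*i]` and element `i` of `b.1` at `p[2*i + 1]`. Usage sketch (AArch64 names assumed):

```rust
use core::arch::aarch64::*;

// Writes re0, im0, re1, im1, ... starting at `p` (8 floats in total).
unsafe fn store_complex(p: *mut f32, re: float32x4_t, im: float32x4_t) {
    vst2q_f32(p, float32x4x2_t(re, im));
}
```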
['_vst2{neon_type[1].lane_nox}', ['b.0', 'b.1', 'LANE as i64', 'a as _']] + + + - name: "vst2{neon_type[1].lane_nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-not-arm + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st2, 'LANE = 0']]}]] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [f16, float16x4x2_t, '2', float16x4_t] + - [f16, float16x8x2_t, '3', float16x8_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - LLVMLink: + name: 'vst2.{neon_type[1].lane_nox}' + arguments: + - 'a: {type[3]}' + - 'b: {type[3]}' + - 'n: i64' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st2lane.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst2{neon_type[1].lane_nox}', ['b.0', 'b.1', 'LANE as i64', 'a as _']] + + + - name: "vst2{neon_type[1].lane_nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vst2, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [st2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [u8, uint8x8x2_t, int8x8x2_t, '3'] + - [u16, uint16x4x2_t, int16x4x2_t, '2'] + - [u32, uint32x2x2_t, int32x2x2_t, '1'] + - [u16, uint16x8x2_t, int16x8x2_t, '3'] + - [u32, uint32x4x2_t, int32x4x2_t, '2'] + - [p8, poly8x8x2_t, int8x8x2_t, '3'] + - [p16, poly16x4x2_t, int16x4x2_t, '2'] + - [p16, poly16x8x2_t, int16x8x2_t, '3'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] + - FnCall: + - "vst2{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst2{neon_type[1].nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *enable-v7 + - *neon-arm-unstable + assert_instr: [vst2] + safety: + unsafe: [neon] + types: + - [i8, int8x8x2_t, int8x8_t, '1'] + - [i16, int16x4x2_t, int16x4_t, '2'] + - [i32, int32x2x2_t, int32x2_t, '4'] + - [i8, int8x16x2_t, int8x16_t, '1'] + - [i16, int16x8x2_t, int16x8_t, '2'] + - [i32, int32x4x2_t, int32x4_t, '4'] + - [f32, float32x2x2_t, float32x2_t, '4'] + - [f32, float32x4x2_t, float32x4_t, '4'] + compose: + - LLVMLink: + name: 'vst2.{neon_type[1]}' + arguments: + - 'ptr: *mut i8' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vst2.v{neon_type[1].lane}{type[0]}.p0' + arch: arm + - FnCall: ['_vst2{neon_type[1].nox}', ['a as _', 'b.0', 'b.1', "{type[3]}"]] + + + - name: "vst2{neon_type[1].nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *neon-v7 + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [vst2] + safety: + unsafe: [neon] + types: + - [f16, float16x4x2_t, float16x4_t, '2'] + - [f16, float16x8x2_t, float16x8_t, '2'] + compose: + - LLVMLink: + name: 'vst2.{neon_type[1]}' + arguments: + - 'ptr: *mut i8' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vst2.p0.v{neon_type[1].lane}{type[0]}' + arch: arm + - 
FnCall: ['_vst2{neon_type[1].nox}', ['a as _', 'b.0', 'b.1', "{type[3]}"]] + + + - name: "vst2{neon_type[1].lane_nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vst2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-arm-unstable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [i8, int8x8x2_t, '3', int8x8_t, '1'] + - [i16, int16x4x2_t, '2', int16x4_t, '2'] + - [i32, int32x2x2_t, '1', int32x2_t, '4'] + - [i16, int16x8x2_t, '3', int16x8_t, '2'] + - [i32, int32x4x2_t, '2', int32x4_t, '4'] + - [f32, float32x4x2_t, '2', float32x4_t, '4'] + - [f32, float32x2x2_t, '1', float32x2_t, '4'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - LLVMLink: + name: 'vst2lane.{neon_type[1]}' + arguments: + - 'ptr: *mut i8' + - 'a: {type[3]}' + - 'b: {type[3]}' + - 'n: i32' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vst2lane.v{neon_type[1].lane}{type[0]}.p0' + arch: arm + - FnCall: ['_vst2{neon_type[1].lane_nox}', ['a as _', 'b.0', 'b.1', 'LANE', "{type[4]}"]] + + + - name: "vst2{neon_type[1].lane_nox}" + doc: "Store multiple 2-element structures from two registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *neon-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vst2, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [f16, float16x4x2_t, '2', float16x4_t, '2'] + - [f16, float16x8x2_t, '1', float16x8_t, '2'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - LLVMLink: + name: 'vst2lane.{neon_type[1]}' + arguments: + - 'ptr: *mut i8' + - 'a: {type[3]}' + - 'b: {type[3]}' + - 'n: i32' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vst2lane.p0.v{neon_type[1].lane}{type[0]}' + arch: arm + - FnCall: ['_vst2{neon_type[1].lane_nox}', ['a as _', 'b.0', 'b.1', 'LANE', "{type[4]}"]] + + + - name: "vst3{neon_type[1].nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-not-arm + - *neon-stable + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - [i64, int64x1x3_t, int64x1_t] + compose: + - LLVMLink: + name: 'st3.{neon_type[1].nox}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st3.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst3{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'a as _']] + + - name: "vst3{neon_type[1].nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-v8 + - *neon-aes + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - [p64, poly64x1x3_t, int64x1x3_t] + compose: + - FnCall: + - "vst3{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst3{neon_type[1].nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - 
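The vst3 entries likewise interleave three registers on store, which is how separate R/G/B planes get written back out as packed RGB. Usage sketch (AArch64 names assumed):

```rust
use core::arch::aarch64::*;

// Writes r0, g0, b0, r1, g1, b1, ... starting at `p` (48 bytes in total).
unsafe fn store_rgb16(p: *mut u8, r: uint8x16_t, g: uint8x16_t, b: uint8x16_t) {
    vst3q_u8(p, uint8x16x3_t(r, g, b));
}
```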
*enable-v7 + - *neon-arm-unstable + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - [i64, int64x1x3_t, int64x1_t] + compose: + - LLVMLink: + name: 'vst3.{neon_type[1]}' + arguments: + - 'ptr: *mut i8' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vst3.p0.v{neon_type[1].lane}{type[0]}' + arch: arm + - FnCall: ['_vst3{neon_type[1].nox}', ['a as _', 'b.0', 'b.1', 'b.2', '8']] + + - name: "vst3{neon_type[1].nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - [u64, uint64x1x3_t, int64x1x3_t] + compose: + - FnCall: + - "vst3{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst3{neon_type[1].lane_nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vst3, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [st3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [u8, uint8x8x3_t, int8x8x3_t, '3'] + - [u16, uint16x4x3_t, int16x4x3_t, '2'] + - [u32, uint32x2x3_t, int32x2x3_t, '1'] + - [u16, uint16x8x3_t, int16x8x3_t, '3'] + - [u32, uint32x4x3_t, int32x4x3_t, '2'] + - [p8, poly8x8x3_t, int8x8x3_t, '3'] + - [p16, poly16x4x3_t, int16x4x3_t, '2'] + - [p16, poly16x8x3_t, int16x8x3_t, '3'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] + - FnCall: + - "vst3{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst3{neon_type[1].nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vst3]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [st3]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - [u8, uint8x8x3_t, int8x8x3_t] + - [u16, uint16x4x3_t, int16x4x3_t] + - [u32, uint32x2x3_t, int32x2x3_t] + - [u8, uint8x16x3_t, int8x16x3_t] + - [u16, uint16x8x3_t, int16x8x3_t] + - [u32, uint32x4x3_t, int32x4x3_t] + - [p8, poly8x8x3_t, int8x8x3_t] + - [p16, poly16x4x3_t, int16x4x3_t] + - [p8, poly8x16x3_t, int8x16x3_t] + - [p16, poly16x8x3_t, int16x8x3_t] + compose: + - FnCall: + - "vst3{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst3{neon_type[1].nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *enable-v7 + - *neon-arm-unstable + assert_instr: [vst3] + safety: + unsafe: [neon] + types: + - [i8, int8x8x3_t, int8x8_t, '1'] + - [i16, int16x4x3_t, int16x4_t, '2'] + - [i32, int32x2x3_t, int32x2_t, '4'] + - [i8, int8x16x3_t, int8x16_t, '1'] + - [i16, int16x8x3_t, int16x8_t, '2'] + - [i32, int32x4x3_t, int32x4_t, '4'] + - [f32, float32x2x3_t, float32x2_t, '4'] + - [f32, 
float32x4x3_t, float32x4_t, '4'] + compose: + - LLVMLink: + name: 'vst3.{neon_type[1]}' + arguments: + - 'ptr: *mut i8' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vst3.p0.v{neon_type[1].lane}{type[0]}' + arch: arm + - FnCall: ['_vst3{neon_type[1].nox}', ['a as _', 'b.0', 'b.1', 'b.2', "{type[3]}"]] + + + - name: "vst3{neon_type[1].nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *neon-v7 + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [vst3] + safety: + unsafe: [neon] + types: + - [f16, float16x4x3_t, float16x4_t, '2'] + - [f16, float16x8x3_t, float16x8_t, '2'] + compose: + - LLVMLink: + name: 'vst3.{neon_type[1]}' + arguments: + - 'ptr: *mut i8' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vst3.p0.v{neon_type[1].lane}{type[0]}' + arch: arm + - FnCall: ['_vst3{neon_type[1].nox}', ['a as _', 'b.0', 'b.1', 'b.2', "{type[3]}"]] + + + - name: "vst3{neon_type[1].lane_nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vst3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-arm-unstable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [i8, int8x8x3_t, '3', int8x8_t, '1'] + - [i16, int16x4x3_t, '2', int16x4_t, '2'] + - [i32, int32x2x3_t, '1', int32x2_t, '4'] + - [i16, int16x8x3_t, '3', int16x8_t, '2'] + - [i32, int32x4x3_t, '2', int32x4_t, '4'] + - [f32, float32x2x3_t, '1', float32x2_t, '4'] + - [f32, float32x4x3_t, '2', float32x4_t, '4'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - LLVMLink: + name: 'vst3lane.{neon_type[1]}' + arguments: + - 'ptr: *mut i8' + - 'a: {type[3]}' + - 'b: {type[3]}' + - 'c: {type[3]}' + - 'n: i32' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vst3lane.p0.v{neon_type[1].lane}{type[0]}' + arch: arm + - FnCall: ['_vst3{neon_type[1].lane_nox}', ['a as _', 'b.0', 'b.1', 'b.2', 'LANE', "{type[4]}"]] + + + - name: "vst3{neon_type[1].lane_nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *neon-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vst3, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [f16, float16x4x3_t, '2', float16x4_t, '4'] + - [f16, float16x8x3_t, '3', float16x8_t, '4'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - LLVMLink: + name: 'vst3lane.{neon_type[1]}' + arguments: + - 'ptr: *mut i8' + - 'a: {type[3]}' + - 'b: {type[3]}' + - 'c: {type[3]}' + - 'n: i32' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vst3lane.p0.v{neon_type[1].lane}{type[0]}' + arch: arm + - FnCall: ['_vst3{neon_type[1].lane_nox}', ['a as _', 'b.0', 'b.1', 'b.2', 'LANE', "{type[4]}"]] + + + - name: "vst3{neon_type[1].nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: [*target-not-arm, *neon-stable] + assert_instr: [st3] + safety: + unsafe: [neon] + types: + - [i8, int8x8x3_t, int8x8_t] + - [i16, int16x4x3_t, int16x4_t] + - [i32, int32x2x3_t, 
int32x2_t] + - [i8, int8x16x3_t, int8x16_t] + - [i16, int16x8x3_t, int16x8_t] + - [i32, int32x4x3_t, int32x4_t] + - [f32, float32x2x3_t, float32x2_t] + - [f32, float32x4x3_t, float32x4_t] + compose: + - LLVMLink: + name: 'vst3.{neon_type[1]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st3.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst3{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'a as _']] + + + - name: "vst3{neon_type[1].nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-not-arm + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [st3] + safety: + unsafe: [neon] + types: + - [f16, float16x4x3_t, float16x4_t] + - [f16, float16x8x3_t, float16x8_t] + compose: + - LLVMLink: + name: 'vst3.{neon_type[1]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st3.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst3{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'a as _']] + + + - name: "vst3{neon_type[1].lane_nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-not-arm + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st3, 'LANE = 0']]}]] + - *neon-stable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [i8, int8x8x3_t, '3', int8x8_t] + - [i16, int16x4x3_t, '2', int16x4_t] + - [i32, int32x2x3_t, '1', int32x2_t] + - [i16, int16x8x3_t, '3', int16x8_t] + - [i32, int32x4x3_t, '2', int32x4_t] + - [f32, float32x2x3_t, '1', float32x2_t] + - [f32, float32x4x3_t, '2', float32x4_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - LLVMLink: + name: 'vst3.{neon_type[1].lane_nox}' + arguments: + - 'a: {type[3]}' + - 'b: {type[3]}' + - 'c: {type[3]}' + - 'n: i64' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st3lane.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst3{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'LANE as i64', 'a as _']] + + + - name: "vst3{neon_type[1].lane_nox}" + doc: "Store multiple 3-element structures from three registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-not-arm + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st3, 'LANE = 0']]}]] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [f16, float16x4x3_t, '2', float16x4_t] + - [f16, float16x8x3_t, '3', float16x8_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - LLVMLink: + name: 'vst3.{neon_type[1].lane_nox}' + arguments: + - 'a: {type[3]}' + - 'b: {type[3]}' + - 'c: {type[3]}' + - 'n: i64' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st3lane.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst3{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'LANE as i64', 'a as _']] + + + - name: "vst4{neon_type[1].nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-v8 + - *neon-aes + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, 
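A minimal usage sketch for the plain st3 stores described by the vst3 entries above, assuming aarch64 with `neon`; the helper name is illustrative:

    #[cfg(target_arch = "aarch64")]
    use core::arch::aarch64::*;

    /// Interleaves three f32x4 registers into 12 consecutive f32 values at dst.
    #[cfg(target_arch = "aarch64")]
    #[target_feature(enable = "neon")]
    unsafe fn store_xyz(dst: *mut f32, v: float32x4x3_t) {
        vst3q_f32(dst, v);
    }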
{FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - [p64, poly64x1x4_t, int64x1x4_t] + compose: + - FnCall: + - "vst4{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst4{neon_type[1].nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *enable-v7 + - *neon-arm-unstable + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - [i64, int64x1x4_t, int64x1_t] + compose: + - LLVMLink: + name: 'vst4.{neon_type[1]}' + arguments: + - 'ptr: *mut i8' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vst4.p0.v{neon_type[1].lane}{type[0]}' + arch: arm + - FnCall: ['_vst4{neon_type[1].nox}', ['a as _', 'b.0', 'b.1', 'b.2', 'b.3', '8']] + + - name: "vst4{neon_type[1].nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-not-arm + - *neon-stable + assert_instr: [nop] + safety: + unsafe: [neon] + types: + - [i64, int64x1x4_t, int64x1_t] + compose: + - LLVMLink: + name: 'vst4.{neon_type[1]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st4.{neon_type[2]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst4{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'a as _']] + + - name: "vst4{neon_type[1].nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - [u64, uint64x1x4_t, int64x1x3_t] + compose: + - FnCall: + - "vst4{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst4{neon_type[1].lane_nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vst4, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [st4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ["const LANE: i32"] + safety: + unsafe: [neon] + types: + - [u8, uint8x8x4_t, int8x8x4_t, '3'] + - [u16, uint16x4x4_t, int16x4x4_t, '2'] + - [u32, uint32x2x4_t, int32x2x4_t, '1'] + - [u16, uint16x8x4_t, int16x8x4_t, '3'] + - [u32, uint32x4x4_t, int32x4x4_t, '2'] + - [p8, poly8x8x4_t, int8x8x4_t, '3'] + - [p16, poly16x4x4_t, int16x4x4_t, '2'] + - [p16, poly16x8x4_t, int16x8x4_t, '3'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] + - FnCall: + - "vst4{neon_type[2].lane_nox}::" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst4{neon_type[1].nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vst4]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [st4]]}]] + - *neon-not-arm-stable + - 
*neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - [u8, uint8x8x4_t, int8x8x4_t] + - [u16, uint16x4x4_t, int16x4x4_t] + - [u32, uint32x2x4_t, int32x2x4_t] + - [u8, uint8x16x4_t, int8x16x4_t] + - [u16, uint16x8x4_t, int16x8x4_t] + - [u32, uint32x4x4_t, int32x4x4_t] + - [p8, poly8x8x4_t, int8x8x4_t] + - [p16, poly16x4x4_t, int16x4x4_t] + - [p8, poly8x16x4_t, int8x16x4_t] + - [p16, poly16x8x4_t, int16x8x4_t] + compose: + - FnCall: + - "vst4{neon_type[2].nox}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst4{neon_type[1].nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *enable-v7 + - *neon-arm-unstable + assert_instr: [vst4] + safety: + unsafe: [neon] + types: + - [i8, int8x8x4_t, int8x8_t, '1'] + - [i16, int16x4x4_t, int16x4_t, '2'] + - [i32, int32x2x4_t, int32x2_t, '4'] + - [i8, int8x16x4_t, int8x16_t, '1'] + - [i16, int16x8x4_t, int16x8_t, '2'] + - [i32, int32x4x4_t, int32x4_t, '4'] + - [f32, float32x2x4_t, float32x2_t, '4'] + - [f32, float32x4x4_t, float32x4_t, '4'] + compose: + - LLVMLink: + name: 'vst4.{neon_type[1]}' + arguments: + - 'ptr: *mut i8' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vst4.p0.v{neon_type[1].lane}{type[0]}' + arch: arm + - FnCall: ['_vst4{neon_type[1].nox}', ['a as _', 'b.0', 'b.1', 'b.2', 'b.3', "{type[3]}"]] + + + - name: "vst4{neon_type[1].nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *neon-v7 + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [vst4] + safety: + unsafe: [neon] + types: + - [f16, float16x4x4_t, float16x4_t, '2'] + - [f16, float16x8x4_t, float16x8_t, '2'] + compose: + - LLVMLink: + name: 'vst4.{neon_type[1]}' + arguments: + - 'ptr: *mut i8' + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vst4.p0.v{neon_type[1].lane}{type[0]}' + arch: arm + - FnCall: ['_vst4{neon_type[1].nox}', ['a as _', 'b.0', 'b.1', 'b.2', 'b.3', "{type[3]}"]] + + + - name: "vst4{neon_type[1].lane_nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vst4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-arm-unstable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [i8, int8x8x4_t, '3', int8x8_t, '1'] + - [i16, int16x4x4_t, '2', int16x4_t, '2'] + - [i32, int32x2x4_t, '1', int32x2_t, '4'] + - [i16, int16x8x4_t, '3', int16x8_t, '2'] + - [i32, int32x4x4_t, '2', int32x4_t, '4'] + - [f32, float32x2x4_t, '1', float32x2_t, '4'] + - [f32, float32x4x4_t, '2', float32x4_t, '4'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - LLVMLink: + name: 'vst4lane.{neon_type[1]}' + arguments: + - 'ptr: *mut i8' + - 'a: {type[3]}' + - 'b: {type[3]}' + - 'c: {type[3]}' + - 'd: {type[3]}' + - 'n: i32' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vst4lane.p0.v{neon_type[1].lane}{type[0]}' + arch: arm + - FnCall: ['_vst4{neon_type[1].lane_nox}', ['a as _', 'b.0', 'b.1', 'b.2', 'b.3', 'LANE', "{type[4]}"]] + + - name: "vst4{neon_type[1].lane_nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut 
{type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *neon-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vst4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [f16, float16x4x4_t, '2', float16x4_t, '2'] + - [f16, float16x8x4_t, '3', float16x8_t, '2'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - LLVMLink: + name: 'vst4lane.{neon_type[1]}' + arguments: + - 'ptr: *mut i8' + - 'a: {type[3]}' + - 'b: {type[3]}' + - 'c: {type[3]}' + - 'd: {type[3]}' + - 'n: i32' + - 'size: i32' + links: + - link: 'llvm.arm.neon.vst4lane.p0.v{neon_type[1].lane}{type[0]}' + arch: arm + - FnCall: ['_vst4{neon_type[1].lane_nox}', ['a as _', 'b.0', 'b.1', 'b.2', 'b.3', 'LANE', "{type[4]}"]] + + + - name: "vst4{neon_type[1].nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: [*target-not-arm, *neon-stable] + assert_instr: [st4] + safety: + unsafe: [neon] + types: + - [i8, int8x8x4_t, int8x8_t] + - [i16, int16x4x4_t, int16x4_t] + - [i32, int32x2x4_t, int32x2_t] + - [i8, int8x16x4_t, int8x16_t] + - [i16, int16x8x4_t, int16x8_t] + - [i32, int32x4x4_t, int32x4_t] + - [f32, float32x2x4_t, float32x2_t] + - [f32, float32x4x4_t, float32x4_t] + compose: + - LLVMLink: + name: 'vst4.{neon_type[1]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st4.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst4{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'a as _']] + + + - name: "vst4{neon_type[1].nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-not-arm + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [st4] + safety: + unsafe: [neon] + types: + - [f16, float16x4x4_t, float16x4_t] + - [f16, float16x8x4_t, float16x8_t] + compose: + - LLVMLink: + name: 'vst4.{neon_type[1]}' + arguments: + - 'a: {type[2]}' + - 'b: {type[2]}' + - 'c: {type[2]}' + - 'd: {type[2]}' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st4.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst4{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'a as _']] + + + - name: "vst4{neon_type[1].lane_nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-not-arm + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st4, 'LANE = 0']]}]] + - *neon-stable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [i8, int8x8x4_t, '3', int8x8_t] + - [i16, int16x4x4_t, '2', int16x4_t] + - [i32, int32x2x4_t, '1', int32x2_t] + - [i16, int16x8x4_t, '3', int16x8_t] + - [i32, int32x4x4_t, '2', int32x4_t] + - [f32, float32x2x4_t, '1', float32x2_t] + - [f32, float32x4x4_t, '2', float32x4_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - LLVMLink: + name: 'vst4.{neon_type[1].lane_nox}' + arguments: + - 'a: {type[3]}' + - 'b: {type[3]}' + - 'c: {type[3]}' + - 'd: {type[3]}' + - 'n: i64' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st4lane.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst4{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'LANE as 
i64', 'a as _']] + + + - name: "vst4{neon_type[1].lane_nox}" + doc: "Store multiple 4-element structures from four registers" + arguments: ["a: *mut {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-not-arm + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st4, 'LANE = 0']]}]] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - [f16, float16x4x4_t, '2', float16x4_t] + - [f16, float16x8x4_t, '3', float16x8_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - LLVMLink: + name: 'vst4.{neon_type[1].lane_nox}' + arguments: + - 'a: {type[3]}' + - 'b: {type[3]}' + - 'c: {type[3]}' + - 'd: {type[3]}' + - 'n: i64' + - 'ptr: *mut i8' + links: + - link: 'llvm.aarch64.neon.st4lane.v{neon_type[1].lane}{type[0]}.p0' + arch: aarch64,arm64ec + - FnCall: ['_vst4{neon_type[1].lane_nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'LANE as i64', 'a as _']] + + + - name: "vusdot{neon_type[0].no}" + doc: "Dot product vector form with unsigned and signed integers" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-i8mm + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vusdot]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [usdot]]}]] + - *neon-unstable-i8mm + - *neon-cfg-arm-unstable + safety: safe + types: + - [int32x2_t, uint8x8_t, int8x8_t] + - [int32x4_t, uint8x16_t, int8x16_t] + compose: + - LLVMLink: + name: "usdot.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.usdot.v{neon_type[0].lane}i32.v{neon_type[1].lane}i8" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.usdot.v{neon_type[0].lane}i32.v{neon_type[1].lane}i8" + arch: arm + + - name: "vusdot{type[0]}" + doc: "Dot product index form with unsigned and signed integers" + arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}", "c: int8x8_t"] + return_type: "{neon_type[1]}" + attr: + - *neon-i8mm + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vusdot, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [usdot, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-unstable-i8mm + - *neon-cfg-arm-unstable + static_defs: ["const LANE: i32"] + safety: safe + types: + - ['_lane_s32', int32x2_t, uint8x8_t, '[LANE as u32, LANE as u32]'] + - ['q_lane_s32', int32x4_t, uint8x16_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '1']] + - Let: + - c + - int32x2_t + - FnCall: [transmute, [c]] + - Let: + - c + - "{type[1]}" + - FnCall: [simd_shuffle!, [c, c, "{type[3]}"]] + - FnCall: ["vusdot{neon_type[1].no}", [a, b, {FnCall: [transmute, [c]]}]] + + - name: "vsudot{neon_type[0].lane_nox}" + doc: "Dot product index form with signed and unsigned integers" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-i8mm + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vsudot, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sudot, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-unstable-i8mm + - *neon-cfg-arm-unstable + static_defs: ["const LANE: i32"] + safety: safe + types: + - [int32x2_t, int8x8_t, uint8x8_t, '[LANE as u32, LANE as u32]', uint32x2_t] + - [int32x4_t, int8x16_t, 
uint8x8_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]', uint32x4_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '1']] + - Let: + - c + - uint32x2_t + - FnCall: [transmute, [c]] + - Let: + - c + - "{type[4]}" + - FnCall: [simd_shuffle!, [c, c, "{type[3]}"]] + - FnCall: ["vusdot{neon_type[0].no}", [a, {FnCall: [transmute, [c]]}, b]] + + - name: "vmul{neon_type[1].no}" + doc: Multiply + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmul{type[0]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [mul]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['.i8', int8x8_t] + - ['.i8', int8x16_t] + - ['.i16', int16x4_t] + - ['.i16', int16x8_t] + - ['.i32', int32x2_t] + - ['.i32', int32x4_t] + - ['.i8', uint8x8_t] + - ['.i8', uint8x16_t] + - ['.i16', uint16x4_t] + - ['.i16', uint16x8_t] + - ['.i32', uint32x2_t] + - ['.i32', uint32x4_t] + compose: + - FnCall: [simd_mul, [a, b]] + + - name: "vmul{neon_type[1].no}" + doc: Multiply + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmul.{type[0]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmul]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [f32, float32x2_t] + - [f32, float32x4_t] + compose: + - FnCall: [simd_mul, [a, b]] + + + - name: "vmul{neon_type[1].no}" + doc: Multiply + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmul.{type[0]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmul]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [f16, float16x4_t] + - [f16, float16x8_t] + compose: + - FnCall: [simd_mul, [a, b]] + + + - name: "vmul{neon_type[0].lane_nox}" + doc: Multiply + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmul, 'LANE = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [mul, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ["const LANE: i32"] + safety: safe + types: + - [int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int16x8_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int32x2_t, int32x2_t, '1', '[LANE as u32, LANE as u32]'] + - [int32x4_t, int32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint16x4_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint16x8_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint32x2_t, uint32x2_t, '1', '[LANE as u32, LANE as u32]'] + - [uint32x4_t, uint32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: ["static_assert_uimm_bits!", [LANE, "{type[2]}"]] + - FnCall: + - simd_mul + - - a + - FnCall: ["simd_shuffle!", [b, b, "{type[3]}"]] + + + - name: 
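The vusdot/vsudot entries above describe the mixed-sign dot products from the i8mm extension; per those entries they sit behind an unstable feature gate outside ARM. A hedged sketch, assuming a nightly compiler with that gate enabled and an aarch64 target (helper name illustrative):

    // Assumes the crate enables the unstable gate named in the entries above,
    // e.g. #![feature(stdarch_neon_i8mm)] on nightly.
    #[cfg(target_arch = "aarch64")]
    use core::arch::aarch64::*;

    /// Each i32 lane of acc accumulates a dot product of four u8 * i8 pairs.
    #[cfg(target_arch = "aarch64")]
    #[target_feature(enable = "neon,i8mm")]
    unsafe fn mixed_sign_dot(acc: int32x2_t, u: uint8x8_t, s: int8x8_t) -> int32x2_t {
        vusdot_s32(acc, u, s)
    }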
"vmul{neon_type[0].lane_nox}" + doc: Multiply + arguments: ["a: {neon_type[0]}", "v: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmul, 'LANE = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmul, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ["const LANE: i32"] + safety: safe + types: + - [float16x4_t, float16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [float16x8_t, float16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: ["static_assert_uimm_bits!", [LANE, "{type[2]}"]] + - FnCall: + - simd_mul + - - a + - FnCall: ["simd_shuffle!", [v, v, "{type[3]}"]] + + + - name: "vmul{neon_type[0].laneq_nox}" + doc: Multiply + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmul, 'LANE = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [mul, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ["const LANE: i32"] + safety: safe + types: + - [int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int32x2_t, int32x4_t, '2', '[LANE as u32, LANE as u32]'] + - [int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint16x4_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint16x8_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint32x2_t, uint32x4_t, '2', '[LANE as u32, LANE as u32]'] + - [uint32x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: ["static_assert_uimm_bits!", [LANE, "{type[2]}"]] + - FnCall: + - simd_mul + - - a + - FnCall: ["simd_shuffle!", [b, b, "{type[3]}"]] + + - name: "vmull{neon_type[1].no}" + doc: Signed multiply long + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmull.{type[0]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [smull]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ["s8", int8x8_t, int16x8_t] + - ["s16", int16x4_t, int32x4_t] + - ["s32", int32x2_t, int64x2_t] + compose: + - LLVMLink: + name: "smull.{neon_type[1]}" + links: + - link: "llvm.aarch64.neon.smull.{neon_type[2]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vmulls.{neon_type[2]}" + arch: arm + + - name: "vmull{neon_type[1].no}" + doc: "Unsigned multiply long" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmull.{type[0]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [umull]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ["u8", uint8x8_t, uint16x8_t] + - ["u16", uint16x4_t, uint32x4_t] + - 
["u32", uint32x2_t, uint64x2_t] + compose: + - LLVMLink: + name: "smull.{neon_type[1]}" + links: + - link: "llvm.aarch64.neon.umull.{neon_type[2]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vmullu.{neon_type[2]}" + arch: arm + + - name: "vmull{neon_type[1].no}" + doc: "Polynomial multiply long" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmull.{type[0]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [pmull]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ["p8", poly8x8_t, poly16x8_t] + compose: + - LLVMLink: + name: "pmull.{neon_type[1].no}" + links: + - link: "llvm.aarch64.neon.pmull.v8i16" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vmullp.v8i16" + arch: arm + + - name: "vmull_n{neon_type[0].no}" + doc: Vector long multiply with scalar + arguments: ["a: {neon_type[0]}", "b: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ["vmull"]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [smull]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x4_t, "i16", int32x4_t] + - [int32x2_t, "i32", int64x2_t] + compose: + - FnCall: + - "vmull{neon_type[0].no}" + - - a + - FnCall: + - "vdup_n{neon_type[0].no}" + - - b + + - name: "vmull_n{neon_type[0].no}" + doc: Vector long multiply with scalar + arguments: ["a: {neon_type[0]}", "b: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ["vmull"]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [umull]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint16x4_t, "u16", uint32x4_t] + - [uint32x2_t, "u32", uint64x2_t] + compose: + - FnCall: + - "vmull{neon_type[0].no}" + - - a + - FnCall: + - "vdup_n{neon_type[0].no}" + - - b + + - name: "vfma{neon_type.no}" + doc: Floating-point fused Multiply-Add to accumulator(vector) + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [target_arch = "arm", {FnCall: [target_feature, ['enable = "vfp4"']]}]] + - FnCall: + - cfg_attr + - - FnCall: + - all + - - test + - 'target_arch = "arm"' + - FnCall: + - assert_instr + - - vfma + - FnCall: + - cfg_attr + - - FnCall: + - all + - - test + - FnCall: + - any + - - 'target_arch = "aarch64"' + - 'target_arch = "arm64ec"' + - FnCall: + - assert_instr + - - fmla + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + - float32x4_t + compose: + - FnCall: [simd_fma, [b, c, a]] + + + - name: "vfma{neon_type.no}" + doc: Floating-point fused Multiply-Add to accumulator (vector) + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [target_arch = "arm", {FnCall: [target_feature, ['enable = "vfp4"']]}]] + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vfma]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmla]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - FnCall: [simd_fma, [b, c, a]] + + + - name: "vfma{neon_type[0].N}" + doc: Floating-point fused Multiply-Add to accumulator(vector) + 
arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [target_arch = "arm", {FnCall: [target_feature, ['enable = "vfp4"']]}]] + - FnCall: + - cfg_attr + - - FnCall: + - all + - - test + - 'target_arch = "arm"' + - FnCall: + - assert_instr + - - vfma + - FnCall: + - cfg_attr + - - FnCall: + - all + - - test + - FnCall: + - any + - - 'target_arch = "aarch64"' + - 'target_arch = "arm64ec"' + - FnCall: + - assert_instr + - - fmla + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, f32] + - [float32x4_t, f32] + compose: + - FnCall: + - "vfma{neon_type[0].no}" + - - a + - b + - FnCall: + - "vdup{neon_type[0].N}_vfp4" + - - c + + - name: "vsub{neon_type[1].no}" + doc: "Subtract" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vsub{type[0]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sub]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['.i8', int8x8_t] + - ['.i8', int8x16_t] + - ['.i16', int16x4_t] + - ['.i16', int16x8_t] + - ['.i32', int32x2_t] + - ['.i32', int32x4_t] + - ['.i8', uint8x8_t] + - ['.i8', uint8x16_t] + - ['.i16', uint16x4_t] + - ['.i16', uint16x8_t] + - ['.i32', uint32x2_t] + - ['.i32', uint32x4_t] + - ['.i64', int64x1_t] + - ['.i64', int64x2_t] + - ['.i64', uint64x1_t] + - ['.i64', uint64x2_t] + compose: + - FnCall: [simd_sub, [a, b]] + + - name: "vsub{neon_type[1].no}" + doc: "Subtract" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vsub.{type[0]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fsub]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['f32', float32x2_t] + - ['f32', float32x4_t] + compose: + - FnCall: [simd_sub, [a, b]] + + + - name: "vsub{neon_type[1].no}" + doc: "Subtract" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vsub.{type[0]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fsub]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ['f16', float16x4_t] + - ['f16', float16x8_t] + compose: + - FnCall: [simd_sub, [a, b]] + + + - name: "vadd{neon_type.no}" + doc: Floating-point Add (vector). 
+ arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vadd.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fadd]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - FnCall: + - simd_add + - - a + - b + + - name: "vadd{type[0]}" + doc: Add + arguments: ["a: {type[1]}", "b: {type[1]}"] + return_type: "{type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vadd.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fadd]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ['h_f16', 'f16'] + compose: + - 'a + b' + + - name: "vadd{neon_type.no}" + doc: Bitwise exclusive OR + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: + - cfg_attr + - - FnCall: + - all + - - test + - 'target_arch = "arm"' + - FnCall: + - assert_instr + - - nop + - FnCall: + - cfg_attr + - - FnCall: + - all + - - test + - FnCall: + - any + - - 'target_arch = "aarch64"' + - 'target_arch = "arm64ec"' + - FnCall: + - assert_instr + - - nop + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - poly8x8_t + - poly16x4_t + - poly8x16_t + - poly16x8_t + - poly64x1_t + - poly64x2_t + compose: + - FnCall: + - simd_xor + - - a + - b + + - name: "vaddq_{type}" + doc: Bitwise exclusive OR + arguments: ["a: {type}", "b: {type}"] + return_type: "{type}" + attr: + - *neon-v7 + - FnCall: + - cfg_attr + - - FnCall: + - all + - - test + - 'target_arch = "arm"' + - FnCall: + - assert_instr + - - nop + - FnCall: + - cfg_attr + - - FnCall: + - all + - - test + - FnCall: + - any + - - 'target_arch = "aarch64"' + - 'target_arch = "arm64ec"' + - FnCall: + - assert_instr + - - nop + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - p128 + compose: + - Xor: + - a + - b + + - name: "vsubhn{neon_type[0].noq}" + doc: Subtract returning high narrow + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ["vsubhn"]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [subhn]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x8_t, int8x8_t, 'i16x8', 'i16x8::new(8, 8, 8, 8, 8, 8, 8, 8)'] + - [int32x4_t, int16x4_t, 'i32x4', 'i32x4::new(16, 16, 16, 16)'] + - [int64x2_t, int32x2_t, 'i64x2', 'i64x2::new(32, 32)'] + - [uint16x8_t, uint8x8_t, 'u16x8', 'u16x8::new(8, 8, 8, 8, 8, 8, 8, 8)'] + - [uint32x4_t, uint16x4_t, 'u32x4', 'u32x4::new(16, 16, 16, 16)'] + - [uint64x2_t, uint32x2_t, 'u64x2', 'u64x2::new(32, 32)'] + compose: + - Let: [c, "{type[2]}", "{type[3]}"] + - FnCall: + - simd_cast + - - FnCall: + - simd_shr + - - FnCall: [simd_sub, [a, b]] + - FnCall: [transmute, [c]] + + - name: "vsubhn_high{neon_type[1].noq}" + doc: Subtract returning high narrow + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ["vsubhn"]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [subhn2]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, int16x8_t, int8x16_t, '[0, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [int16x4_t, int32x4_t, int16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [int32x2_t, int64x2_t, int32x4_t, '[0, 1, 2, 3]'] + - [uint8x8_t, uint16x8_t, uint8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint16x4_t, uint32x4_t, uint16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [uint32x2_t, uint64x2_t, uint32x4_t, '[0, 1, 2, 3]'] + compose: + - Let: + - d + - "{neon_type[0]}" + - FnCall: ["vsubhn{neon_type[1].noq}", [b, c]] + - FnCall: [simd_shuffle!, [a, d, "{type[3]}"]] + + - name: "vhsub{neon_type[1].no}" + doc: "Signed halving subtract" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vhsub.{type[0]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uhsub]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['u8', uint8x8_t] + - ['u8', uint8x16_t] + - ['u16', uint16x4_t] + - ['u16', uint16x8_t] + - ['u32', uint32x2_t] + - ['u32', uint32x4_t] + compose: + - LLVMLink: + name: "uhsub.{neon_type[1].no}" + links: + - link: "llvm.aarch64.neon.uhsub.{neon_type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vhsubu.{neon_type[1]}" + arch: arm + + - name: "vhsub{neon_type[1].no}" + doc: "Signed halving subtract" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vhsub.{type[0]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [shsub]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['s8', int8x8_t] + - ['s8', int8x16_t] + - ['s16', int16x4_t] + - ['s16', int16x8_t] + - ['s32', int32x2_t] + - ['s32', int32x4_t] + compose: + - LLVMLink: + name: "shsub.{neon_type[1].no}" + links: + - link: "llvm.aarch64.neon.shsub.{neon_type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vhsubs.{neon_type[1]}" + arch: arm + + - name: "vsubw{neon_type[1].noq}" + doc: Signed Subtract Wide + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vsubw]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ssubw]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x8_t, int8x8_t] + - [int32x4_t, int16x4_t] + - [int64x2_t, int32x2_t] + compose: + - FnCall: + - simd_sub + - - a + - FnCall: [simd_cast, [b]] + + - name: "vsubw{neon_type[1].noq}" + doc: Unsigned Subtract Wide + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vsubw]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [usubw]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint16x8_t, uint8x8_t] + - [uint32x4_t, uint16x4_t] + - [uint64x2_t, uint32x2_t] + compose: + - FnCall: + - simd_sub + - - a + - FnCall: [simd_cast, [b]] + + - name: "vsubl{neon_type[0].noq}" + doc: "Signed Subtract Long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vsubl]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: 
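A minimal sketch of the narrowing subtract described by the vsubhn entries above (the compose step shifts the double-width difference right by half its width before narrowing), assuming aarch64 with `neon`:

    #[cfg(target_arch = "aarch64")]
    use core::arch::aarch64::*;

    /// Computes (a - b) >> 8 per lane and narrows each u16 result to u8.
    #[cfg(target_arch = "aarch64")]
    #[target_feature(enable = "neon")]
    unsafe fn high_half_diff(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t {
        vsubhn_u16(a, b)
    }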
[assert_instr, [ssubl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, int16x8_t] + - [int16x4_t, int32x4_t] + - [int32x2_t, int64x2_t] + compose: + - Let: + - c + - "{neon_type[1]}" + - FnCall: [simd_cast, [a]] + - Let: + - d + - "{neon_type[1]}" + - FnCall: [simd_cast, [b]] + - FnCall: [simd_sub, [c, d]] + + - name: "vsubl{neon_type[0].noq}" + doc: "Unsigned Subtract Long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vsubl]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [usubl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint8x8_t, uint16x8_t] + - [uint16x4_t, uint32x4_t] + - [uint32x2_t, uint64x2_t] + compose: + - Let: + - c + - "{neon_type[1]}" + - FnCall: [simd_cast, [a]] + - Let: + - d + - "{neon_type[1]}" + - FnCall: [simd_cast, [b]] + - FnCall: [simd_sub, [c, d]] + + - name: "vdot{neon_type[0].no}" + doc: Dot product arithmetic (vector) + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v8 + - FnCall: [target_feature, ['enable = "neon,dotprod"']] + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vsdot]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sdot]]}]] + - FnCall: [cfg_attr, [{FnCall: [not, ['target_arch = "arm"']]}, {FnCall: [unstable, ['feature = "stdarch_neon_dotprod"', 'issue = "117224"']]}]] + - *neon-cfg-arm-unstable + safety: safe + types: + - [int32x2_t, int8x8_t] + - [int32x4_t, int8x16_t] + compose: + - LLVMLink: + name: "sdot.{neon_type[0]}.{neon_type[1]}" + links: + - link: "llvm.arm.neon.sdot.{neon_type[0]}.{neon_type[1]}" + arch: arm + - link: "llvm.aarch64.neon.sdot.{neon_type[0]}.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vdot{neon_type[0].no}" + doc: Dot product arithmetic (vector) + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v8 + - FnCall: [target_feature, ['enable = "neon,dotprod"']] + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vudot]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [udot]]}]] + - FnCall: [cfg_attr, [{FnCall: [not, ['target_arch = "arm"']]}, {FnCall: [unstable, ['feature = "stdarch_neon_dotprod"', 'issue = "117224"']]}]] + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint32x2_t, uint8x8_t] + - [uint32x4_t, uint8x16_t] + compose: + - LLVMLink: + name: "udot.{neon_type[0]}.{neon_type[1]}" + links: + - link: "llvm.arm.neon.udot.{neon_type[0]}.{neon_type[1]}" + arch: arm + - link: "llvm.aarch64.neon.udot.{neon_type[0]}.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vdot{neon_type[0].lane_nox}" + doc: Dot product arithmetic (indexed) + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + static_defs: ["const LANE: i32"] + attr: + - *neon-v8 + - FnCall: [target_feature, ['enable = "neon,dotprod"']] + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vsdot, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sdot, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [cfg_attr, [{FnCall: [not, ['target_arch = "arm"']]}, {FnCall: [unstable, ['feature = "stdarch_neon_dotprod"', 'issue = 
"117224"']]}]] + - *neon-cfg-arm-unstable + safety: safe + types: + - [int32x2_t, int8x8_t, int8x8_t, int32x2_t, '[LANE as u32, LANE as u32]'] + - [int32x4_t, int8x16_t, int8x8_t, int32x2_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '1']] + - Let: + - c + - "{neon_type[3]}" + - FnCall: [transmute, [c]] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, '{type[4]}']] + - FnCall: + - "vdot{neon_type[0].no}" + - - a + - b + - FnCall: [transmute, [c]] + + - name: "vdot{neon_type[0].lane_nox}" + doc: Dot product arithmetic (indexed) + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + static_defs: ["const LANE: i32"] + attr: + - *neon-v8 + - FnCall: [target_feature, ['enable = "neon,dotprod"']] + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vudot, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [udot, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - FnCall: [cfg_attr, [{FnCall: [not, ['target_arch = "arm"']]}, {FnCall: [unstable, ['feature = "stdarch_neon_dotprod"', 'issue = "117224"']]}]] + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint32x2_t, uint8x8_t, uint8x8_t, uint32x2_t, '[LANE as u32, LANE as u32]'] + - [uint32x4_t, uint8x16_t, uint8x8_t, uint32x2_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '1']] + - Let: + - c + - "{neon_type[3]}" + - FnCall: [transmute, [c]] + - Let: + - c + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [c, c, '{type[4]}']] + - FnCall: + - "vdot{neon_type[0].no}" + - - a + - b + - FnCall: [transmute, [c]] + + - name: "vmax{neon_type.no}" + doc: Maximum (vector) + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmax]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [smax]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int8x16_t + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + compose: + - LLVMLink: + name: "smax.{neon_type}" + links: + - link: "llvm.arm.neon.vmaxs.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.smax.{neon_type}" + arch: aarch64,arm64ec + + - name: "vmax{neon_type.no}" + doc: Maximum (vector) + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmax]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [umax]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - uint8x8_t + - uint8x16_t + - uint16x4_t + - uint16x8_t + - uint32x2_t + - uint32x4_t + compose: + - LLVMLink: + name: "smax.{neon_type}" + links: + - link: "llvm.arm.neon.vmaxu.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.umax.{neon_type}" + arch: aarch64,arm64ec + + - name: "vmax{neon_type.no}" + doc: Maximum (vector) + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmax]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmax]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + - float32x4_t + compose: + - 
LLVMLink: + name: "smax.{neon_type}" + links: + - link: "llvm.arm.neon.vmaxs.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.fmax.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vmax{neon_type.no}" + doc: Maximum (vector) + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmax]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmax]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "vmax.{neon_type}" + links: + - link: "llvm.arm.neon.vmaxs.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.fmax.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vmaxnm{neon_type.no}" + doc: Floating-point Maximum Number (vector) + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, ['target_arch = "arm"', {FnCall: [target_feature, ['enable = "fp-armv8,v8"']]}]] + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmaxnm]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmaxnm]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + - float32x4_t + compose: + - LLVMLink: + name: "fmaxnm.{neon_type}" + links: + - link: "llvm.arm.neon.vmaxnm.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.fmaxnm.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vmaxnm{neon_type.no}" + doc: Floating-point Maximum Number (vector) + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, ['target_arch = "arm"', {FnCall: [target_feature, ['enable = "fp-armv8,v8"']]}]] + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmaxnm]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmaxnm]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "fmaxnm.{neon_type}" + links: + - link: "llvm.arm.neon.vmaxnm.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.fmaxnm.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vminnm{neon_type.no}" + doc: Floating-point Minimum Number (vector) + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, ['target_arch = "arm"', {FnCall: [target_feature, ['enable = "fp-armv8,v8"']]}]] + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vminnm]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fminnm]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "fminnm.{neon_type}" + links: + - link: "llvm.arm.neon.vminnm.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.fminnm.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vmin{neon_type.no}" + doc: "Minimum (vector)" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmin]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [smin]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int8x16_t + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + compose: + - LLVMLink: + name: "smin.{neon_type}" + links: + - link: 
"llvm.arm.neon.vmins.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.smin.{neon_type}" + arch: aarch64,arm64ec + + - name: "vmin{neon_type.no}" + doc: "Minimum (vector)" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmin]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [umin]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - uint8x8_t + - uint8x16_t + - uint16x4_t + - uint16x8_t + - uint32x2_t + - uint32x4_t + compose: + - LLVMLink: + name: "umin.{neon_type}" + links: + - link: "llvm.arm.neon.vminu.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.umin.{neon_type}" + arch: aarch64,arm64ec + + - name: "vmin{neon_type.no}" + doc: "Minimum (vector)" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmin]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmin]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + - float32x4_t + compose: + - LLVMLink: + name: "fmin.{neon_type}" + links: + - link: "llvm.arm.neon.vmins.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.fmin.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vmin{neon_type.no}" + doc: Minimum (vector) + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmin]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmin]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "vmin.{neon_type}" + links: + - link: "llvm.arm.neon.vmins.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.fmin.{neon_type}" + arch: aarch64,arm64ec + + + + - name: "vminnm{neon_type.no}" + doc: "Floating-point Minimum Number (vector)" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, ['target_arch = "arm"', {FnCall: [target_feature, ['enable = "fp-armv8,v8"']]}]] + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vminnm]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fminnm]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + - float32x4_t + compose: + - LLVMLink: + name: "fminnm.{neon_type}" + links: + - link: "llvm.arm.neon.vminnm.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.fminnm.{neon_type}" + arch: aarch64,arm64ec + + - name: "vpadd{neon_type.no}" + doc: Floating-point add pairwise + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vpadd]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [faddp]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + compose: + - LLVMLink: + name: "faddp.{neon_type}" + links: + - link: "llvm.arm.neon.vpadd.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.faddp.{neon_type}" + arch: aarch64,arm64ec + + - name: "vpadd{neon_type.no}" + doc: Floating-point add pairwise + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v8 + - FnCall: 
[cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vpadd]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [faddp]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + compose: + - LLVMLink: + name: "faddp.{neon_type}" + links: + - link: "llvm.arm.neon.vpadd.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.faddp.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vqdmull{neon_type[0].noq}" + doc: "Signed saturating doubling multiply long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqdmull]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqdmull]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x4_t, int32x4_t] + - [int32x2_t, int64x2_t] + compose: + - LLVMLink: + name: "vqdmull{neon_type[0].no}" + links: + - link: "llvm.arm.neon.vqdmull.{neon_type[1]}" + arch: arm + - link: "llvm.aarch64.neon.sqdmull.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vqdmull_n{neon_type[0].no}" + doc: "Vector saturating doubling long multiply with scalar" + arguments: ["a: {neon_type[0]}", "b: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqdmull]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqdmull]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x4_t, "i16", int32x4_t] + - [int32x2_t, "i32", int64x2_t] + compose: + - FnCall: ["vqdmull{neon_type[0].noq}", [a, {FnCall: ["vdup_n{neon_type[0].noq}", [b]]}]] + + - name: "vqdmull_lane_s16" + doc: "Vector saturating doubling long multiply by scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqdmull, 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqdmull, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x4_t, int16x4_t, int32x4_t, '[N as u32, N as u32, N as u32, N as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, '2']] + - Let: [b, "{neon_type[0]}", {FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]}] + - FnCall: [vqdmull_s16, [a, b]] + + - name: "vqdmull_lane_s32" + doc: "Vector saturating doubling long multiply by scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqdmull, 'N = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqdmull, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int32x2_t, int32x2_t, int64x2_t, '[N as u32, N as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, '1']] + - Let: [b, "{neon_type[0]}", {FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]}] + - FnCall: [vqdmull_s32, [a, b]] + + - name: "vqdmlal{neon_type[1].noq}" + doc: "Signed saturating doubling multiply-add long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - 
FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqdmlal]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqdmlal]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int32x4_t, int16x4_t, int16x4_t, int32x4_t] + - [int64x2_t, int32x2_t, int32x2_t, int64x2_t] + compose: + - FnCall: ["vqadd{neon_type[0].no}", [a, {FnCall: ["vqdmull{neon_type[2].noq}", [b, c]]}]] + + - name: "vqdmlal_n{neon_type[1].noq}" + doc: "Vector widening saturating doubling multiply accumulate with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqdmlal]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqdmlal]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int32x4_t, int16x4_t, "i16", int32x4_t] + - [int64x2_t, int32x2_t, "i32", int64x2_t] + compose: + - FnCall: ["vqadd{neon_type[0].no}", [a, {FnCall: ["vqdmull_n{neon_type[1].noq}", [b, c]]}]] + + - name: "vqdmlal_lane_s16" + doc: "Vector widening saturating doubling multiply accumulate with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqdmlal, N = 2]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqdmlal, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int32x4_t, int16x4_t, int16x4_t, int32x4_t] + compose: + - FnCall: [static_assert_uimm_bits!, [N, '2']] + - FnCall: [vqaddq_s32, [a, {FnCall: ["vqdmull_lane_s16::<N>", [b, c]]}]] + + - name: "vqdmlal_lane_s32" + doc: "Vector widening saturating doubling multiply accumulate with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqdmlal, N = 1]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqdmlal, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int64x2_t, int32x2_t, int32x2_t, int64x2_t] + compose: + - FnCall: [static_assert_uimm_bits!, [N, '1']] + - FnCall: [vqaddq_s64, [a, {FnCall: ["vqdmull_lane_s32::<N>", [b, c]]}]] + + - name: "vqdmlsl{neon_type[1].noq}" + doc: "Signed saturating doubling multiply-subtract long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqdmlsl]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqdmlsl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int32x4_t, int16x4_t, int16x4_t, int32x4_t] + - [int64x2_t, int32x2_t, int32x2_t, int64x2_t] + compose: + - FnCall: ["vqsub{neon_type[0].no}", [a, {FnCall: ["vqdmull{neon_type[1].noq}", [b, c]]}]] + + - name: "vqdmlsl{type[4]}" + doc: "Vector widening saturating doubling multiply subtract with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + 
- FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqdmlsl]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqdmlsl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int32x4_t, int16x4_t, "i16", int32x4_t, '_n_s16'] + - [int64x2_t, int32x2_t, "i32", int64x2_t, '_n_s32'] + compose: + - FnCall: ["vqsub{neon_type[0].no}", [a, {FnCall: ["vqdmull{type[4]}", [b, c]]}]] + + - name: "vqdmlsl_lane_s16" + doc: "Vector widening saturating doubling multiply subtract with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqdmlsl, N = 2]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqdmlsl, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int32x4_t, int16x4_t, int16x4_t, int32x4_t] + compose: + - FnCall: [static_assert_uimm_bits!, [N, '2']] + - FnCall: [vqsubq_s32, [a, {FnCall: ["vqdmull_lane_s16::<N>", [b, c]]}]] + + - name: "vqdmlsl_lane_s32" + doc: "Vector widening saturating doubling multiply subtract with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqdmlsl, N = 1]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqdmlsl, 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int64x2_t, int32x2_t, int32x2_t, int64x2_t] + compose: + - FnCall: [static_assert_uimm_bits!, [N, '1']] + - FnCall: [vqsubq_s64, [a, {FnCall: ["vqdmull_lane_s32::<N>", [b, c]]}]] + + - name: "vqdmulh{neon_type[0].no}" + doc: "Signed saturating doubling multiply returning high half" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqdmulh]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqdmulh]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x4_t, int16x4_t, int16x4_t] + - [int16x8_t, int16x8_t, int16x8_t] + - [int32x2_t, int32x2_t, int32x2_t] + - [int32x4_t, int32x4_t, int32x4_t] + compose: + - LLVMLink: + name: "vqdmulh{neon_type[0].no}" + links: + - link: "llvm.arm.neon.vqdmulh.{neon_type[0]}" + arch: arm + - link: "llvm.aarch64.neon.sqdmulh.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vqdmulh{type[3]}" + doc: "Vector saturating doubling multiply high with scalar" + arguments: ["a: {neon_type[0]}", "b: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqdmulh]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqdmulh]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x4_t, "i16", int16x4_t, '_n_s16'] + - [int32x2_t, "i32", int32x2_t, '_n_s32'] + - [int16x8_t, "i16", int16x8_t, 'q_n_s16'] + - [int32x4_t, "i32", int32x4_t, 'q_n_s32'] + compose: + - Let: [b, "{neon_type[0]}", {FnCall: ["vdup{type[3]}", [b]]}] + - FnCall: ["vqdmulh{neon_type[0].no}", [a, b]] + + - name: 
"vqmovn{neon_type[0].noq}" + doc: "Signed saturating extract narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqmovn]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqxtn]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x8_t, int8x8_t] + - [int32x4_t, int16x4_t] + - [int64x2_t, int32x2_t] + compose: + - LLVMLink: + name: "vqmovn{neon_type[0].noq}" + links: + - link: "llvm.arm.neon.vqmovns.{neon_type[1]}" + arch: arm + - link: "llvm.aarch64.neon.sqxtn.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vqmovun{neon_type[0].noq}" + doc: "Signed saturating extract unsigned narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqmovun]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqxtun]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x8_t, uint8x8_t] + - [int32x4_t, uint16x4_t] + - [int64x2_t, uint32x2_t] + compose: + - LLVMLink: + name: "vqmovun{neon_type[0].noq}" + links: + - link: "llvm.arm.neon.vqmovnsu.{neon_type[1]}" + arch: arm + - link: "llvm.aarch64.neon.sqxtun.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vqrdmulh{neon_type[0].no}" + doc: "Signed saturating rounding doubling multiply returning high half" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqrdmulh]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqrdmulh]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x4_t, int16x4_t, int16x4_t] + - [int16x8_t, int16x8_t, int16x8_t] + - [int32x2_t, int32x2_t, int32x2_t] + - [int32x4_t, int32x4_t, int32x4_t] + compose: + - LLVMLink: + name: "vqrdmulh{neon_type[0].no}" + links: + - link: "llvm.arm.neon.vqrdmulh.{neon_type[0]}" + arch: arm + - link: "llvm.aarch64.neon.sqrdmulh.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vqrshl{neon_type.no}" + doc: "Signed saturating rounding shift left" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqrshl]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqrshl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int8x16_t + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + - int64x1_t + - int64x2_t + compose: + - LLVMLink: + name: "vqrshl{neon_type}" + links: + - link: "llvm.arm.neon.vqrshifts.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.sqrshl.{neon_type}" + arch: aarch64,arm64ec + + - name: "vqrshl{neon_type[0].no}" + doc: "Unsigned signed saturating rounding shift left" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqrshl]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uqrshl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint8x8_t, int8x8_t] + - [uint8x16_t, int8x16_t] + - [uint16x4_t, int16x4_t] + - [uint16x8_t, int16x8_t] + - [uint32x2_t, int32x2_t] + - 
[uint32x4_t, int32x4_t] + - [uint64x1_t, int64x1_t] + - [uint64x2_t, int64x2_t] + compose: + - LLVMLink: + name: "vqrshl{neon_type[0].no}" + links: + - link: "llvm.arm.neon.vqrshiftu.{neon_type[1]}" + arch: arm + - link: "llvm.aarch64.neon.uqrshl.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vqrshrn_n{neon_type[0].noq}" + doc: "Signed saturating rounded shift right narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vqrshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x8_t, int8x8_t, 'N >= 1 && N <= 8', 'const { int16x8_t([-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16]) }'] + - [int32x4_t, int16x4_t, 'N >= 1 && N <= 16', 'const { int32x4_t([-N as i32, -N as i32, -N as i32, -N as i32]) }'] + - [int64x2_t, int32x2_t, 'N >= 1 && N <= 32', 'const { int64x2_t([-N as i64, -N as i64]) }'] + compose: + - FnCall: [static_assert!, ["{type[2]}"]] + - LLVMLink: + name: "vqrshrn{neon_type[0].noq}" + arguments: + - "a: {neon_type[0]}" + - "n: {neon_type[0]}" + links: + - link: "llvm.arm.neon.vqrshiftns.{neon_type[1]}" + arch: arm + - FnCall: ["_vqrshrn_n{neon_type[0].noq}", [a, '{type[3]}'], [], true] + + - name: "vqrshrn_n{neon_type[0].noq}" + doc: "Signed saturating rounded shift right narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x8_t, int8x8_t, 'N >= 1 && N <= 8'] + - [int32x4_t, int16x4_t, 'N >= 1 && N <= 16'] + - [int64x2_t, int32x2_t, 'N >= 1 && N <= 32'] + compose: + - FnCall: [static_assert!, ["{type[2]}"]] + - LLVMLink: + name: "vqrshrn{neon_type[0].no}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.sqrshrn.{neon_type[1]}" + arch: aarch64,arm64ec + - FnCall: ["_vqrshrn_n{neon_type[0].noq}", [a, N], [], true] + + - name: "vqrshrun_n{neon_type[0].noq}" + doc: "Signed saturating rounded shift right unsigned narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vqrshrun, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x8_t, uint8x8_t, 'N >= 1 && N <= 8', 'const { int16x8_t([-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16]) }'] + - [int32x4_t, uint16x4_t, 'N >= 1 && N <= 16', 'const { int32x4_t([-N as i32, -N as i32, -N as i32, -N as i32]) }'] + - [int64x2_t, uint32x2_t, 'N >= 1 && N <= 32', 'const { int64x2_t([-N as i64, -N as i64]) }'] + compose: + - FnCall: [static_assert!, ["{type[2]}"]] + - LLVMLink: + name: "vqrshrun_n{neon_type[0].noq}" + arguments: + - 'a: {neon_type[0]}' + - 'n: {neon_type[0]}' + links: + - link: "llvm.arm.neon.vqrshiftnsu.{neon_type[1]}" + arch: arm + - FnCall: + - "_vqrshrun_n{neon_type[0].noq}" + - - a + - "{type[3]}" + - [] + - true + + - name: "vqrshrun_n{neon_type[0].noq}" + doc: "Signed saturating rounded shift right unsigned narrow" + arguments: ["a: {neon_type[0]}"] + return_type: 
"{neon_type[1]}" + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqrshrun, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x8_t, uint8x8_t, 'N >= 1 && N <= 8'] + - [int32x4_t, uint16x4_t, 'N >= 1 && N <= 16'] + - [int64x2_t, uint32x2_t, 'N >= 1 && N <= 32'] + compose: + - FnCall: [static_assert!, ["{type[2]}"]] + - LLVMLink: + name: "vqrshrun_n{neon_type[0].noq}" + arguments: + - 'a: {neon_type[0]}' + - 'n: i32' + links: + - link: "llvm.aarch64.neon.sqrshrun.{neon_type[1]}" + arch: aarch64,arm64ec + - FnCall: ["_vqrshrun_n{neon_type[0].noq}", [a, N], [], true] + + - name: "vqshl{neon_type.no}" + doc: "Signed saturating shift left" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqshl]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqshl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int8x16_t + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + - int64x1_t + - int64x2_t + compose: + - LLVMLink: + name: "vqshl{neon_type}" + links: + - link: "llvm.arm.neon.vqshifts.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.sqshl.{neon_type}" + arch: aarch64,arm64ec + + - name: "vqshl{neon_type[0].N}" + doc: "Signed saturating shift left" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqshl, 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqshl, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x8_t, '3'] + - [int8x16_t, '3'] + - [int16x4_t, '4'] + - [int16x8_t, '4'] + - [int32x2_t, '5'] + - [int32x4_t, '5'] + - [int64x1_t, '6'] + - [int64x2_t, '6'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[1]}"]] + - FnCall: + - "vqshl{neon_type[0].no}" + - - a + - FnCall: ["vdup{neon_type[0].N}", ['N as _']] + + - name: "vqshl{neon_type[0].no}" + doc: "Unsigned saturating shift left" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqshl]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uqshl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint8x8_t, int8x8_t] + - [uint8x16_t, int8x16_t] + - [uint16x4_t, int16x4_t] + - [uint16x8_t, int16x8_t] + - [uint32x2_t, int32x2_t] + - [uint32x4_t, int32x4_t] + - [uint64x1_t, int64x1_t] + - [uint64x2_t, int64x2_t] + compose: + - LLVMLink: + name: "vqshl{neon_type[0].no}" + links: + - link: "llvm.arm.neon.vqshiftu.{neon_type[1]}" + arch: arm + - link: "llvm.aarch64.neon.uqshl.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vqshl{neon_type[0].N}" + doc: "Unsigned saturating shift left" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqshl, N = 2]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uqshl, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - 
*neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint8x8_t, '3', int8x8_t] + - [uint8x16_t, '3', int8x16_t] + - [uint16x4_t, '4', int16x4_t] + - [uint16x8_t, '4', int16x8_t] + - [uint32x2_t, '5', int32x2_t] + - [uint32x4_t, '5', int32x4_t] + - [uint64x1_t, '6', int64x1_t] + - [uint64x2_t, '6', int64x2_t] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[1]}"]] + - FnCall: + - "vqshl{neon_type[0].no}" + - - a + - FnCall: ["vdup{neon_type[2].N}", ['N as _']] + + - name: "vqshrn_n{neon_type[0].noq}" + doc: "Signed saturating shift right narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vqshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x8_t, int8x8_t, 'N >= 1 && N <= 8', 'const { int16x8_t([-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16]) }'] + - [int32x4_t, int16x4_t, 'N >= 1 && N <= 16', 'const { int32x4_t([-N as i32, -N as i32, -N as i32, -N as i32]) }'] + - [int64x2_t, int32x2_t, 'N >= 1 && N <= 32', 'const { int64x2_t([-N as i64, -N as i64]) }'] + compose: + - FnCall: [static_assert!, ["{type[2]}"]] + - LLVMLink: + name: "vqshrn{neon_type[0].no}" + arguments: + - "a: {neon_type[0]}" + - "n: {neon_type[0]}" + links: + - link: "llvm.arm.neon.vqshiftns.{neon_type[1]}" + arch: arm + - FnCall: ["_vqshrn_n{neon_type[0].noq}", [a, "{type[3]}"], [], true] + + - name: "vqshrn_n{neon_type[0].noq}" + doc: "Signed saturating shift right narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x8_t, int8x8_t, 'N >= 1 && N <= 8'] + - [int32x4_t, int16x4_t, 'N >= 1 && N <= 16'] + - [int64x2_t, int32x2_t, 'N >= 1 && N <= 32'] + compose: + - FnCall: [static_assert!, ["{type[2]}"]] + - LLVMLink: + name: "vqshrn_n{neon_type[0].noq}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.sqshrn.{neon_type[1]}" + arch: aarch64,arm64ec + - FnCall: ["_vqshrn_n{neon_type[0].noq}", [a, N], [], true] + + - name: "vqshrn_n_{neon_type[0]}" + doc: "Unsigned saturating shift right narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vqshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint16x8_t, uint8x8_t, 'N >= 1 && N <= 8', 'const { uint16x8_t([-N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16]) }'] + - [uint32x4_t, uint16x4_t, 'N >= 1 && N <= 16', 'const { uint32x4_t([-N as u32, -N as u32, -N as u32, -N as u32]) }'] + - [uint64x2_t, uint32x2_t, 'N >= 1 && N <= 32', 'const { uint64x2_t([-N as u64, -N as u64]) }'] + compose: + - FnCall: [static_assert!, ["{type[2]}"]] + - LLVMLink: + name: "vqshrn_n_{neon_type[0]}" + arguments: + - "a: {neon_type[0]}" + - "n: {neon_type[0]}" + links: + - link: "llvm.arm.neon.vqshiftnu.{neon_type[1]}" + arch: arm + - FnCall: ["_vqshrn_n_{neon_type[0]}", ["a", "{type[3]}"], 
[], true] + + - name: "vqshrn_n_{neon_type[0]}" + doc: "Unsigned saturating shift right narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uqshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint16x8_t, uint8x8_t, 'N >= 1 && N <= 8'] + - [uint32x4_t, uint16x4_t, 'N >= 1 && N <= 16'] + - [uint64x2_t, uint32x2_t, 'N >= 1 && N <= 32'] + compose: + - FnCall: [static_assert!, ["{type[2]}"]] + - LLVMLink: + name: "vqshrn{neon_type[1].no}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.uqshrn.{neon_type[1]}" + arch: aarch64,arm64ec + - FnCall: ["_vqshrn_n_{neon_type[0]}", ["a", N], [], true] + + - name: "vqshrun_n_{neon_type[0]}" + doc: "Signed saturating shift right unsigned narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vqshrun, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x8_t, uint8x8_t, 'N >= 1 && N <= 8', 'const { int16x8_t([-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16]) }'] + - [int32x4_t, uint16x4_t, 'N >= 1 && N <= 16', 'const { int32x4_t([-N as i32, -N as i32, -N as i32, -N as i32]) }'] + - [int64x2_t, uint32x2_t, 'N >= 1 && N <= 32', 'const { int64x2_t([-N as i64, -N as i64]) }'] + compose: + - FnCall: [static_assert!, ["{type[2]}"]] + - LLVMLink: + name: "vqshrun_n_{neon_type[1]}" + arguments: + - "a: {neon_type[0]}" + - "n: {neon_type[0]}" + links: + - link: "llvm.arm.neon.vqshiftnsu.{neon_type[1]}" + arch: arm + - FnCall: ["_vqshrun_n_{neon_type[0]}", [a, "{type[3]}"], [], true] + + - name: "vqshrun_n_{neon_type[0]}" + doc: "Signed saturating shift right unsigned narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqshrun, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x8_t, uint8x8_t, 'N >= 1 && N <= 8'] + - [int32x4_t, uint16x4_t, 'N >= 1 && N <= 16'] + - [int64x2_t, uint32x2_t, 'N >= 1 && N <= 32'] + compose: + - FnCall: [static_assert!, ["{type[2]}"]] + - LLVMLink: + name: "vqshrun_n_{neon_type[0]}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.sqshrun.{neon_type[1]}" + arch: aarch64,arm64ec + - FnCall: ["_vqshrun_n_{neon_type[0]}", [a, N], [], true] + + - name: "vrsqrts{neon_type.no}" + doc: "Floating-point reciprocal square root step" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrsqrts]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [frsqrts]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + - float32x4_t + compose: + - LLVMLink: + name: "vrsqrts{neon_type.no}" + links: + - link: "llvm.arm.neon.vrsqrts.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.frsqrts.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vrsqrts{neon_type.no}" + doc: 
"Floating-point reciprocal square root step" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v8 + - *neon-fp16 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrsqrts]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [frsqrts]]}]] + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "vrsqrts{neon_type.no}" + links: + - link: "llvm.arm.neon.vrsqrts.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.frsqrts.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vrecpe{neon_type.no}" + doc: "Reciprocal estimate." + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrecpe]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [frecpe]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + - float32x4_t + compose: + - LLVMLink: + name: "vrecpe{neon_type.no}" + links: + - link: "llvm.arm.neon.vrecpe.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.frecpe.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vrecpe{neon_type.no}" + doc: "Reciprocal estimate." + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrecpe]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [frecpe]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "vrecpe{neon_type.no}" + links: + - link: "llvm.arm.neon.vrecpe.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.frecpe.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vrecps{neon_type.no}" + doc: "Floating-point reciprocal step" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrecps]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [frecps]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + - float32x4_t + compose: + - LLVMLink: + name: "vrecps{neon_type.no}" + links: + - link: "llvm.arm.neon.vrecps.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.frecps.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vrecps{neon_type.no}" + doc: "Floating-point reciprocal step" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrecps]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [frecps]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "vrecps{neon_type.no}" + links: + - link: "llvm.arm.neon.vrecps.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.frecps.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vreinterpret{neon_type[1].no}{neon_type[0].noq}" + doc: Vector reinterpret cast operation + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-aes + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [poly64x1_t, 
int32x2_t] + - [poly64x1_t, uint32x2_t] + - [poly64x2_t, int32x4_t] + - [poly64x2_t, uint32x4_t] + - [p128, int64x2_t] + - [p128, uint64x2_t] + - [p128, poly64x2_t] + - [poly8x16_t, p128] + - [p128, int8x16_t] + - [p128, uint8x16_t] + - [p128, poly8x16_t] + - [int32x2_t, poly64x1_t] + - [uint32x2_t, poly64x1_t] + - [int32x4_t, poly64x2_t] + - [uint32x4_t, poly64x2_t] + - [int64x2_t, p128] + - [uint64x2_t, p128] + - [poly64x2_t, p128] + - [poly64x1_t, int16x4_t] + - [poly64x1_t, uint16x4_t] + - [poly64x1_t, poly16x4_t] + - [poly64x2_t, int16x8_t] + - [poly64x2_t, uint16x8_t] + - [poly64x2_t, poly16x8_t] + - [p128, int32x4_t] + - [p128, uint32x4_t] + - [poly16x4_t, poly64x1_t] + - [int16x4_t, poly64x1_t] + - [uint16x4_t, poly64x1_t] + - [poly16x8_t, poly64x2_t] + - [int16x8_t, poly64x2_t] + - [uint16x8_t, poly64x2_t] + - [int32x4_t, p128] + - [uint32x4_t, p128] + - [poly64x1_t, int8x8_t] + - [poly64x1_t, uint8x8_t] + - [poly64x1_t, poly8x8_t] + - [poly64x2_t, int8x16_t] + - [poly64x2_t, uint8x16_t] + - [poly64x2_t, poly8x16_t] + - [p128, int16x8_t] + - [p128, uint16x8_t] + - [p128, poly16x8_t] + - [poly8x8_t, poly64x1_t] + - [int8x8_t, poly64x1_t] + - [uint8x8_t, poly64x1_t] + - [poly8x16_t, poly64x2_t] + - [int8x16_t, poly64x2_t] + - [uint8x16_t, poly64x2_t] + - [int16x8_t, p128] + - [uint16x8_t, p128] + - [poly16x8_t, p128] + - [int8x16_t, p128] + - [uint8x16_t, p128] + compose: + - FnCall: [transmute, [a]] + + - name: "vreinterpret{neon_type[1].no}{neon_type[0].noq}" + doc: Vector reinterpret cast operation + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint8x8_t, int8x8_t] + - [poly8x8_t, int8x8_t] + - [poly16x4_t, int16x4_t] + - [uint16x4_t, int16x4_t] + - [uint32x2_t, int32x2_t] + - [uint64x1_t, int64x1_t] + - [uint8x16_t, int8x16_t] + - [poly8x16_t, int8x16_t] + - [poly16x8_t, int16x8_t] + - [uint16x8_t, int16x8_t] + - [uint32x4_t, int32x4_t] + - [uint64x2_t, int64x2_t] + - [poly8x8_t, uint8x8_t] + - [int8x8_t, uint8x8_t] + - [poly16x4_t, uint16x4_t] + - [int16x4_t, uint16x4_t] + - [int32x2_t, uint32x2_t] + - [int64x1_t, uint64x1_t] + - [poly8x16_t, uint8x16_t] + - [int8x16_t, uint8x16_t] + - [poly16x8_t, uint16x8_t] + - [int16x8_t, uint16x8_t] + - [int32x4_t, uint32x4_t] + - [int64x2_t, uint64x2_t] + - [int8x8_t, poly8x8_t] + - [uint8x8_t, poly8x8_t] + - [int16x4_t, poly16x4_t] + - [uint16x4_t, poly16x4_t] + - [int8x16_t, poly8x16_t] + - [uint8x16_t, poly8x16_t] + - [int16x8_t, poly16x8_t] + - [uint16x8_t, poly16x8_t] + - [int16x4_t, int8x8_t] + - [uint16x4_t, int8x8_t] + - [poly16x4_t, int8x8_t] + - [int32x2_t, int16x4_t] + - [uint32x2_t, int16x4_t] + - [int64x1_t, int32x2_t] + - [uint64x1_t, int32x2_t] + - [int16x8_t, int8x16_t] + - [uint16x8_t, int8x16_t] + - [poly16x8_t, int8x16_t] + - [int32x4_t, int16x8_t] + - [uint32x4_t, int16x8_t] + - [int64x2_t, int32x4_t] + - [uint64x2_t, int32x4_t] + - [poly16x4_t, uint8x8_t] + - [int16x4_t, uint8x8_t] + - [uint16x4_t, uint8x8_t] + - [int32x2_t, uint16x4_t] + - [uint32x2_t, uint16x4_t] + - [int64x1_t, uint32x2_t] + - [uint64x1_t, uint32x2_t] + - [poly16x8_t, uint8x16_t] + - [int16x8_t, uint8x16_t] + - [uint16x8_t, uint8x16_t] + - [int32x4_t, uint16x8_t] + - [uint32x4_t, uint16x8_t] + - [int64x2_t, uint32x4_t] + - [uint64x2_t, uint32x4_t] + - [poly16x4_t, poly8x8_t] + - 
[int16x4_t, poly8x8_t] + - [uint16x4_t, poly8x8_t] + - [int32x2_t, poly16x4_t] + - [uint32x2_t, poly16x4_t] + - [poly16x8_t, poly8x16_t] + - [int16x8_t, poly8x16_t] + - [uint16x8_t, poly8x16_t] + - [int32x4_t, poly16x8_t] + - [uint32x4_t, poly16x8_t] + - [poly8x8_t, int16x4_t] + - [int8x8_t, int16x4_t] + - [uint8x8_t, int16x4_t] + - [poly16x4_t, int32x2_t] + - [int16x4_t, int32x2_t] + - [uint16x4_t, int32x2_t] + - [int32x2_t, int64x1_t] + - [uint32x2_t, int64x1_t] + - [poly8x16_t, int16x8_t] + - [int8x16_t, int16x8_t] + - [uint8x16_t, int16x8_t] + - [poly16x8_t, int32x4_t] + - [int16x8_t, int32x4_t] + - [uint16x8_t, int32x4_t] + - [int32x4_t, int64x2_t] + - [uint32x4_t, int64x2_t] + - [poly8x8_t, uint16x4_t] + - [int8x8_t, uint16x4_t] + - [uint8x8_t, uint16x4_t] + - [poly16x4_t, uint32x2_t] + - [int16x4_t, uint32x2_t] + - [uint16x4_t, uint32x2_t] + - [int32x2_t, uint64x1_t] + - [uint32x2_t, uint64x1_t] + - [poly8x16_t, uint16x8_t] + - [int8x16_t, uint16x8_t] + - [uint8x16_t, uint16x8_t] + - [poly16x8_t, uint32x4_t] + - [int16x8_t, uint32x4_t] + - [uint16x8_t, uint32x4_t] + - [int32x4_t, uint64x2_t] + - [uint32x4_t, uint64x2_t] + - [poly8x8_t, poly16x4_t] + - [int8x8_t, poly16x4_t] + - [uint8x8_t, poly16x4_t] + - [poly8x16_t, poly16x8_t] + - [int8x16_t, poly16x8_t] + - [uint8x16_t, poly16x8_t] + - [int32x2_t, int8x8_t] + - [uint32x2_t, int8x8_t] + - [int64x1_t, int16x4_t] + - [uint64x1_t, int16x4_t] + - [int32x4_t, int8x16_t] + - [uint32x4_t, int8x16_t] + - [int64x2_t, int16x8_t] + - [uint64x2_t, int16x8_t] + - [int32x2_t, uint8x8_t] + - [uint32x2_t, uint8x8_t] + - [int64x1_t, uint16x4_t] + - [uint64x1_t, uint16x4_t] + - [int32x4_t, uint8x16_t] + - [uint32x4_t, uint8x16_t] + - [int64x2_t, uint16x8_t] + - [uint64x2_t, uint16x8_t] + - [int32x2_t, poly8x8_t] + - [uint32x2_t, poly8x8_t] + - [int64x1_t, poly16x4_t] + - [uint64x1_t, poly16x4_t] + - [int32x4_t, poly8x16_t] + - [uint32x4_t, poly8x16_t] + - [int64x2_t, poly16x8_t] + - [uint64x2_t, poly16x8_t] + - [poly8x8_t, int32x2_t] + - [int8x8_t, int32x2_t] + - [uint8x8_t, int32x2_t] + - [poly16x4_t, int64x1_t] + - [int16x4_t, int64x1_t] + - [uint16x4_t, int64x1_t] + - [poly8x16_t, int32x4_t] + - [int8x16_t, int32x4_t] + - [uint8x16_t, int32x4_t] + - [poly16x8_t, int64x2_t] + - [int16x8_t, int64x2_t] + - [uint16x8_t, int64x2_t] + - [poly8x8_t, uint32x2_t] + - [int8x8_t, uint32x2_t] + - [uint8x8_t, uint32x2_t] + - [poly16x4_t, uint64x1_t] + - [int16x4_t, uint64x1_t] + - [uint16x4_t, uint64x1_t] + - [poly8x16_t, uint32x4_t] + - [int8x16_t, uint32x4_t] + - [uint8x16_t, uint32x4_t] + - [poly16x8_t, uint64x2_t] + - [int16x8_t, uint64x2_t] + - [uint16x8_t, uint64x2_t] + - [int64x1_t, int8x8_t] + - [uint64x1_t, int8x8_t] + - [int64x1_t, uint8x8_t] + - [uint64x1_t, uint8x8_t] + - [int64x1_t, poly8x8_t] + - [uint64x1_t, poly8x8_t] + - [int64x2_t, int8x16_t] + - [uint64x2_t, int8x16_t] + - [int64x2_t, uint8x16_t] + - [uint64x2_t, uint8x16_t] + - [int64x2_t, poly8x16_t] + - [uint64x2_t, poly8x16_t] + - [poly8x8_t, int64x1_t] + - [int8x8_t, int64x1_t] + - [uint8x8_t, int64x1_t] + - [poly8x8_t, uint64x1_t] + - [int8x8_t, uint64x1_t] + - [uint8x8_t, uint64x1_t] + - [poly8x16_t, int64x2_t] + - [int8x16_t, int64x2_t] + - [uint8x16_t, int64x2_t] + - [poly8x16_t, uint64x2_t] + - [int8x16_t, uint64x2_t] + - [uint8x16_t, uint64x2_t] + - [float32x2_t, int8x8_t] + - [float32x2_t, int16x4_t] + - [float32x2_t, int32x2_t] + - [float32x2_t, int64x1_t] + - [float32x4_t, int8x16_t] + - [float32x4_t, int16x8_t] + - [float32x4_t, int32x4_t] + - [float32x4_t, int64x2_t] + - 
[float32x2_t, uint8x8_t] + - [float32x2_t, uint16x4_t] + - [float32x2_t, uint32x2_t] + - [float32x2_t, uint64x1_t] + - [float32x4_t, uint8x16_t] + - [float32x4_t, uint16x8_t] + - [float32x4_t, uint32x4_t] + - [float32x4_t, uint64x2_t] + - [float32x2_t, poly8x8_t] + - [float32x2_t, poly16x4_t] + - [float32x4_t, poly8x16_t] + - [float32x4_t, poly16x8_t] + - [float32x4_t, p128] + - [int8x8_t, float32x2_t] + - [int16x4_t, float32x2_t] + - [int32x2_t, float32x2_t] + - [int64x1_t, float32x2_t] + - [int8x16_t, float32x4_t] + - [int16x8_t, float32x4_t] + - [int32x4_t, float32x4_t] + - [int64x2_t, float32x4_t] + - [uint8x8_t, float32x2_t] + - [uint16x4_t, float32x2_t] + - [uint32x2_t, float32x2_t] + - [uint64x1_t, float32x2_t] + - [uint8x16_t, float32x4_t] + - [uint16x8_t, float32x4_t] + - [uint32x4_t, float32x4_t] + - [uint64x2_t, float32x4_t] + - [poly8x8_t, float32x2_t] + - [poly16x4_t, float32x2_t] + - [poly8x16_t, float32x4_t] + - [poly16x8_t, float32x4_t] + - [p128, float32x4_t] + compose: + - FnCall: [transmute, [a]] + + + - name: "vreinterpret{neon_type[1].no}{neon_type[0].noq}" + doc: Vector reinterpret cast operation + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + # non-q + - [float32x2_t, float16x4_t] + - [poly16x4_t, float16x4_t] + - [poly8x8_t, float16x4_t] + - [int8x8_t, float16x4_t] + - [int16x4_t, float16x4_t] + - [int32x2_t, float16x4_t] + - [int64x1_t, float16x4_t] + - [uint8x8_t, float16x4_t] + - [uint16x4_t, float16x4_t] + - [uint32x2_t, float16x4_t] + - [uint64x1_t, float16x4_t] + - [float16x4_t, float32x2_t] + - [float16x4_t, poly16x4_t] + - [float16x4_t, poly8x8_t] + - [float16x4_t, int8x8_t] + - [float16x4_t, int16x4_t] + - [float16x4_t, int32x2_t] + - [float16x4_t, int64x1_t] + - [float16x4_t, uint8x8_t] + - [float16x4_t, uint16x4_t] + - [float16x4_t, uint32x2_t] + - [float16x4_t, uint64x1_t] + # q + - [float32x4_t, float16x8_t] + - [poly16x8_t, float16x8_t] + - [poly8x16_t, float16x8_t] + - [int8x16_t, float16x8_t] + - [int16x8_t, float16x8_t] + - [int32x4_t, float16x8_t] + - [int64x2_t, float16x8_t] + - [uint8x16_t, float16x8_t] + - [uint16x8_t, float16x8_t] + - [uint32x4_t, float16x8_t] + - [uint64x2_t, float16x8_t] + - [float16x8_t, float32x4_t] + - [float16x8_t, poly16x8_t] + - [float16x8_t, poly8x16_t] + - [float16x8_t, int8x16_t] + - [float16x8_t, int16x8_t] + - [float16x8_t, int32x4_t] + - [float16x8_t, int64x2_t] + - [float16x8_t, uint8x16_t] + - [float16x8_t, uint16x8_t] + - [float16x8_t, uint32x4_t] + - [float16x8_t, uint64x2_t] + compose: + - FnCall: [transmute, [a]] + + + - name: "vreinterpret{neon_type[1].no}{neon_type[0].noq}" + doc: Vector reinterpret cast operation + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [poly64x1_t, float16x4_t] + - [float16x4_t, poly64x1_t] + # q + - [poly64x2_t, float16x8_t] + - [poly128_t, float16x8_t] + - [float16x8_t, poly128_t] + - [float16x8_t, poly64x2_t] + compose: + - FnCall: [transmute, [a]] + + - name: "vrev64{neon_type[0].no}" + doc: Reverse elements in 64-bit doublewords + arguments: ["a: {neon_type[0]}"] + 
return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrev64]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [rev64]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, "[3, 2, 1, 0]"] + - [float16x8_t, "[3, 2, 1, 0, 7, 6, 5, 4]"] + compose: + - FnCall: [simd_shuffle!, [a, a, "{type[1]}"]] + + - name: "vrshl{neon_type.no}" + doc: "Signed rounding shift left" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrshl]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [srshl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int8x16_t + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + - int64x1_t + - int64x2_t + compose: + - LLVMLink: + name: "vrshl{neon_type.no}" + links: + - link: "llvm.arm.neon.vrshifts.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.srshl.{neon_type}" + arch: aarch64,arm64ec + + - name: "vrshl{neon_type[0].no}" + doc: "Unsigned rounding shift left" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrshl]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [urshl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint8x8_t, int8x8_t] + - [uint8x16_t, int8x16_t] + - [uint16x4_t, int16x4_t] + - [uint16x8_t, int16x8_t] + - [uint32x2_t, int32x2_t] + - [uint32x4_t, int32x4_t] + - [uint64x1_t, int64x1_t] + - [uint64x2_t, int64x2_t] + compose: + - LLVMLink: + name: "vrshl{neon_type[0].no}" + links: + - link: "llvm.arm.neon.vrshiftu.{neon_type[0]}" + arch: arm + - link: "llvm.aarch64.neon.urshl.{neon_type[0]}" + arch: aarch64,arm64ec + + - name: "vrshr{neon_type[0].N}" + doc: "Signed rounding shift right" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrshr, 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [srshr, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x8_t, 'N >= 1 && N <= 8'] + - [int8x16_t, 'N >= 1 && N <= 8'] + - [int16x4_t, 'N >= 1 && N <= 16'] + - [int16x8_t, 'N >= 1 && N <= 16'] + - [int32x2_t, 'N >= 1 && N <= 32'] + - [int32x4_t, 'N >= 1 && N <= 32'] + - [int64x1_t, 'N >= 1 && N <= 64'] + - [int64x2_t, 'N >= 1 && N <= 64'] + compose: + - FnCall: [static_assert!, ["{type[1]}"]] + - FnCall: + - "vrshl{neon_type[0].no}" + - - a + - FnCall: ["vdup{neon_type[0].N}", ['-N as _']] + + - name: "vrshr{neon_type[0].N}" + doc: "Unsigned rounding shift right" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrshr, N = 2]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [urshr, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint8x8_t, int8x8_t, 'N >= 1 && N <= 8'] + - [uint8x16_t, int8x16_t, 'N >= 1 && N <= 8'] + - [uint16x4_t, int16x4_t, 
'N >= 1 && N <= 16'] + - [uint16x8_t, int16x8_t, 'N >= 1 && N <= 16'] + - [uint32x2_t, int32x2_t, 'N >= 1 && N <= 32'] + - [uint32x4_t, int32x4_t, 'N >= 1 && N <= 32'] + - [uint64x1_t, int64x1_t, 'N >= 1 && N <= 64'] + - [uint64x2_t, int64x2_t, 'N >= 1 && N <= 64'] + compose: + - FnCall: [static_assert!, ["{type[2]}"]] + - FnCall: + - "vrshl{neon_type[0].no}" + - - a + - FnCall: ["vdup{neon_type[1].N}", ['-N as _']] + + - name: "vrshrn_n_{neon_type[0]}" + doc: "Rounding shift right narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vrshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x8_t, int8x8_t, 'N >= 1 && N <= 8', 'const { int16x8_t([-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16]) }'] + - [int32x4_t, int16x4_t, 'N >= 1 && N <= 16', 'const { int32x4_t([-N as i32, -N as i32, -N as i32, -N as i32]) }'] + - [int64x2_t, int32x2_t, 'N >= 1 && N <= 32', 'const { int64x2_t([-N as i64, -N as i64]) }'] + compose: + - FnCall: [static_assert!, ["{type[2]}"]] + - LLVMLink: + name: "vrshrn_n_{neon_type[0]}" + arguments: + - "a: {neon_type[0]}" + - "n: {neon_type[0]}" + links: + - link: "llvm.arm.neon.vrshiftn.{neon_type[1]}" + arch: arm + - FnCall: ["_vrshrn_n_{neon_type[0]}", [a, "{type[3]}"], [], true] + + - name: "vrshrn_n_{neon_type[0]}" + doc: "Rounding shift right narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [rshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x8_t, int8x8_t, 'N >= 1 && N <= 8'] + - [int32x4_t, int16x4_t, 'N >= 1 && N <= 16'] + - [int64x2_t, int32x2_t, 'N >= 1 && N <= 32'] + compose: + - FnCall: [static_assert!, ["{type[2]}"]] + - LLVMLink: + name: "vrshrn_n_{neon_type[0]}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link: "llvm.aarch64.neon.rshrn.{neon_type[1]}" + arch: aarch64,arm64ec + - FnCall: ["_vrshrn_n_{neon_type[0]}", [a, N], [], true] + + - name: "vrshrn_n_{neon_type[0]}" + doc: "Rounding shift right narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrshrn, N = 2]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [rshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint16x8_t, uint8x8_t, 'N >= 1 && N <= 8', s16] + - [uint32x4_t, uint16x4_t, 'N >= 1 && N <= 16', s32] + - [uint64x2_t, uint32x2_t, 'N >= 1 && N <= 32', s64] + compose: + - FnCall: [static_assert!, ["{type[2]}"]] + - FnCall: + - transmute + - - FnCall: + - "vrshrn_n_{type[3]}::<N>" + - - FnCall: [transmute, [a]] + + - name: "vrsra{neon_type[0].N}" + doc: "Signed rounding shift right and accumulate" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrsra, 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [srsra, 'N = 2']]}]] + - FnCall: 
[rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x8_t, 'N >= 1 && N <= 8'] + - [int8x16_t, 'N >= 1 && N <= 8'] + - [int16x4_t, 'N >= 1 && N <= 16'] + - [int16x8_t, 'N >= 1 && N <= 16'] + - [int32x2_t, 'N >= 1 && N <= 32'] + - [int32x4_t, 'N >= 1 && N <= 32'] + - [int64x1_t, 'N >= 1 && N <= 64'] + - [int64x2_t, 'N >= 1 && N <= 64'] + compose: + - FnCall: [static_assert!, ["{type[1]}"]] + - FnCall: + - simd_add + - - a + - FnCall: ["vrshr{neon_type[0].N}::<N>", [b]] + + - name: "vrsubhn_{neon_type[0]}" + doc: "Rounding subtract returning high narrow" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrsubhn]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [rsubhn]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x8_t, int16x8_t, int8x8_t] + - [int32x4_t, int32x4_t, int16x4_t] + - [int64x2_t, int64x2_t, int32x2_t] + compose: + - LLVMLink: + name: "vrsubhn_{neon_type[0]}" + links: + - link: "llvm.arm.neon.vrsubhn.{neon_type[2]}" + arch: arm + - link: "llvm.aarch64.neon.rsubhn.{neon_type[2]}" + arch: aarch64,arm64ec + + - name: "vrsubhn_{neon_type[0]}" + doc: "Rounding subtract returning high narrow" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrsubhn]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [rsubhn]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint16x8_t, uint16x8_t, uint8x8_t, s16] + - [uint32x4_t, uint32x4_t, uint16x4_t, s32] + - [uint64x2_t, uint64x2_t, uint32x2_t, s64] + compose: + - FnCall: + - transmute + - - FnCall: + - "vrsubhn_{type[3]}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vcreate_{neon_type[1]}" + doc: "Create a vector from a 64-bit pattern" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ["u64", int8x8_t] + - ["u64", int16x4_t] + - ["u64", int32x2_t] + - ["u64", int64x1_t] + - ["u64", uint8x8_t] + - ["u64", uint16x4_t] + - ["u64", uint32x2_t] + - ["u64", uint64x1_t] + - ["u64", poly8x8_t] + - ["u64", poly16x4_t] + - ["u64", float32x2_t] + compose: + - FnCall: [transmute, [a]] + + - name: "vcreate_{neon_type[1]}" + doc: "Create a vector from a 64-bit pattern" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - ["u64", float16x4_t] + compose: + - FnCall: [transmute, [a]] + + - name: "vcreate_p64" + doc: "Create a vector from a 64-bit pattern" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, 
[nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ["u64", poly64x1_t] + compose: + - FnCall: [transmute, [a]] + + - name: "vset{neon_type[1].lane_nox}" + doc: "Insert vector element from another vector element" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop, LANE = 0]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["i8", int8x8_t, '3'] + - ["i16", int16x4_t, '2'] + - ["i32", int32x2_t, '1'] + - ["u8", uint8x8_t, '3'] + - ["u16", uint16x4_t, '2'] + - ["u32", uint32x2_t, '1'] + - ["p8", poly8x8_t, '3'] + - ["p16", poly16x4_t, '2'] + - ["i8", int8x16_t, '4'] + - ["i16", int16x8_t, '3'] + - ["i32", int32x4_t, '2'] + - ["i64", int64x2_t, '1'] + - ["u8", uint8x16_t, '4'] + - ["u16", uint16x8_t, '3'] + - ["u32", uint32x4_t, '2'] + - ["u64", uint64x2_t, '1'] + - ["p8", poly8x16_t, '4'] + - ["p16", poly16x8_t, '3'] + - ["f32", float32x2_t, '1'] + - ["f32", float32x4_t, '2'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - FnCall: [simd_insert!, [b, 'LANE as u32', a]] + + + - name: "vset{neon_type[1].lane_nox}" + doc: "Insert vector element from another vector element" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop, LANE = 0]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["f16", float16x4_t, '2'] + - ["f16", float16x8_t, '3'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - FnCall: [simd_insert!, [b, 'LANE as u32', a]] + + + - name: "vset_lane_{neon_type[0]}" + doc: "Insert vector element from another vector element" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["i64", int64x1_t, int64x1_t] + - ["u64", uint64x1_t, uint64x1_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: [simd_insert!, [b, 'LANE as u32', a]] + + - name: "vset_lane_{neon_type[0]}" + doc: "Insert vector element from another vector element" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["p64", poly64x1_t, poly64x1_t] + compose: + - FnCall: [static_assert!, ['LANE == 0']] + - FnCall: [simd_insert!, [b, 'LANE as u32', a]] + + - name: "vsetq_lane_p64" + 
doc: "Insert vector element from another vector element" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - ["p64", poly64x2_t, poly64x2_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '1']] + - FnCall: [simd_insert!, [b, 'LANE as u32', a]] + + - name: "vshl{neon_type.no}" + doc: "Signed Shift left" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vshl]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sshl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int8x16_t + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + - int64x1_t + - int64x2_t + compose: + - LLVMLink: + name: "vshl{neon_type.no}" + links: + - link: "llvm.arm.neon.vshifts.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.sshl.{neon_type}" + arch: aarch64,arm64ec + + - name: "vshl{neon_type[0].no}" + doc: "Unsigned Shift left" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vshl]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ushl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint8x8_t, int8x8_t] + - [uint8x16_t, int8x16_t] + - [uint16x4_t, int16x4_t] + - [uint16x8_t, int16x8_t] + - [uint32x2_t, int32x2_t] + - [uint32x4_t, int32x4_t] + - [uint64x1_t, int64x1_t] + - [uint64x2_t, int64x2_t] + compose: + - LLVMLink: + name: "vshl{neon_type[0].no}" + links: + - link: "llvm.arm.neon.vshiftu.{neon_type[1]}" + arch: arm + - link: "llvm.aarch64.neon.ushl.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vshll_n_s8" + doc: "Signed shift left long" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vshll.s8"', 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sshll, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x8_t, int16x8_t, 'N >= 0 && N <= 8'] + compose: + - FnCall: [static_assert!, ["{type[2]}"]] + - FnCall: + - simd_shl + - - FnCall: [simd_cast, [a]] + - FnCall: [vdupq_n_s16, ['N as _']] + + - name: "vshll_n_s16" + doc: "Signed shift left long" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vshll.s16"', 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sshll, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x4_t, int32x4_t] + compose: + - FnCall: [static_assert!, ["N >= 0 && N <= 16"]] + - FnCall: + - simd_shl + - - FnCall: [simd_cast, [a]] + - FnCall: [vdupq_n_s32, 
['N as _']] + + - name: "vshll_n_s32" + doc: "Signed shift left long" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vshll.s32"', 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sshll, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int32x2_t, int64x2_t] + compose: + - FnCall: [static_assert!, ["N >= 0 && N <= 32"]] + - FnCall: + - simd_shl + - - FnCall: [simd_cast, [a]] + - FnCall: [vdupq_n_s64, ['N as _']] + + - name: "vshll_n_u8" + doc: "Unsigned shift left long" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vshll.u8"', 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ushll, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint8x8_t, uint16x8_t] + compose: + - FnCall: [static_assert!, ["N >= 0 && N <= 8"]] + - FnCall: + - simd_shl + - - FnCall: [simd_cast, [a]] + - FnCall: [vdupq_n_u16, ['N as _']] + + - name: "vshll_n_u16" + doc: "Unsigned shift left long" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vshll.u16"', 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ushll, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint16x4_t, uint32x4_t] + compose: + - FnCall: [static_assert!, ["N >= 0 && N <= 16"]] + - FnCall: + - simd_shl + - - FnCall: [simd_cast, [a]] + - FnCall: [vdupq_n_u32, ['N as _']] + + - name: "vshll_n_u32" + doc: "Unsigned shift left long" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vshll.u32"', 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ushll, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint32x2_t, uint64x2_t] + compose: + - FnCall: [static_assert!, ["N >= 0 && N <= 32"]] + - FnCall: + - simd_shl + - - FnCall: [simd_cast, [a]] + - FnCall: [vdupq_n_u64, ['N as _']] + + - name: "vshr{neon_type[0].N}" + doc: "Shift right" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vshr.{neon_type[0]}"', 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sshr, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x8_t, 'N >= 1 && N <= 8', 'let n: i32 = if N == 8 { 7 }', 'else { N };'] + - [int8x16_t, 'N >= 1 && N <= 8', 'let n: i32 = if N == 8 { 7 }', 'else { N };'] + - [int16x4_t, 'N >= 1 && N <= 16', 'let n: i32 = if N == 16 { 15 }', 'else { N };'] + - [int16x8_t, 'N >= 1 && N <= 16', 'let n: i32 = if N == 16 { 15 }', 'else { N
};'] + - [int32x2_t, 'N >= 1 && N <= 32', 'let n: i32 = if N == 32 { 31 }', 'else { N };'] + - [int32x4_t, 'N >= 1 && N <= 32', 'let n: i32 = if N == 32 { 31 }', 'else { N };'] + - [int64x1_t, 'N >= 1 && N <= 64', 'let n: i32 = if N == 64 { 63 }', 'else { N };'] + - [int64x2_t, 'N >= 1 && N <= 64', 'let n: i32 = if N == 64 { 63 }', 'else { N };'] + compose: + - FnCall: [static_assert!, ["{type[1]}"]] + - Identifier: ["{type[2]}{type[3]}", Symbol] + - FnCall: + - simd_shr + - - a + - FnCall: ["vdup{neon_type[0].N}", ['n as _']] + + - name: "vshr{neon_type[0].N}" + doc: "Shift right" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vshr.{neon_type[0]}"', 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ushr, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint8x8_t, 'N >= 1 && N <= 8', 'let n: i32 = if N == 8 { return vdup_n_u8(0); }', 'else { N };'] + - [uint8x16_t, 'N >= 1 && N <= 8', 'let n: i32 = if N == 8 { return vdupq_n_u8(0); }', 'else { N };'] + - [uint16x4_t, 'N >= 1 && N <= 16', 'let n: i32 = if N == 16 { return vdup_n_u16(0); }', 'else { N };'] + - [uint16x8_t, 'N >= 1 && N <= 16', 'let n: i32 = if N == 16 { return vdupq_n_u16(0); }', 'else { N };'] + - [uint32x2_t, 'N >= 1 && N <= 32', 'let n: i32 = if N == 32 { return vdup_n_u32(0); }', 'else { N };'] + - [uint32x4_t, 'N >= 1 && N <= 32', 'let n: i32 = if N == 32 { return vdupq_n_u32(0); }', 'else { N };'] + - [uint64x1_t, 'N >= 1 && N <= 64', 'let n: i32 = if N == 64 { return vdup_n_u64(0); }', 'else { N };'] + - [uint64x2_t, 'N >= 1 && N <= 64', 'let n: i32 = if N == 64 { return vdupq_n_u64(0); }', 'else { N };'] + compose: + - FnCall: [static_assert!, ["{type[1]}"]] + - Identifier: ['{type[2]}{type[3]}', Symbol] + - FnCall: + - simd_shr + - - a + - FnCall: ["vdup{neon_type[0].N}", ['n as _']] + + - name: "vshrn_n_{neon_type[0]}" + doc: "Shift right narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vshrn{type[2]}"', 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [shrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int16x8_t, int8x8_t, '.i16', 'N >= 1 && N <= 8'] + - [uint16x8_t, uint8x8_t, '.i16', 'N >= 1 && N <= 8'] + - [int32x4_t, int16x4_t, '.i32', 'N >= 1 && N <= 16'] + - [uint32x4_t, uint16x4_t, '.i32', 'N >= 1 && N <= 16'] + - [int64x2_t, int32x2_t, '.i64', 'N >= 1 && N <= 32'] + - [uint64x2_t, uint32x2_t, '.i64', 'N >= 1 && N <= 32'] + compose: + - FnCall: [static_assert!, ["{type[3]}"]] + - FnCall: + - simd_cast + - - FnCall: + - simd_shr + - - a + - FnCall: ["vdupq_n_{neon_type[0]}", ['N as _']] + + - name: "vsra{neon_type[0].N}" + doc: "Signed shift right and accumulate" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vsra, 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ssra, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: 
['const N: i32'] + safety: safe + types: + - [int8x8_t, 'N >= 1 && N <= 8'] + - [int8x16_t, 'N >= 1 && N <= 8'] + - [int16x4_t, 'N >= 1 && N <= 16'] + - [int16x8_t, 'N >= 1 && N <= 16'] + - [int32x2_t, 'N >= 1 && N <= 32'] + - [int32x4_t, 'N >= 1 && N <= 32'] + - [int64x1_t, 'N >= 1 && N <= 64'] + - [int64x2_t, 'N >= 1 && N <= 64'] + compose: + - FnCall: [static_assert!, ["{type[1]}"]] + - FnCall: + - simd_add + - - a + - FnCall: ["vshr{neon_type[0].N}::", [b]] + + - name: "vtrn{neon_type[0].no}" + doc: "Transpose elements" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vtrn]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [trn]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, int8x8x2_t, '[0, 8, 2, 10, 4, 12, 6, 14]', '[1, 9, 3, 11, 5, 13, 7, 15]'] + - [int16x4_t, int16x4x2_t, '[0, 4, 2, 6]', '[1, 5, 3, 7]'] + - [int8x16_t, int8x16x2_t, '[0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]', '[1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31]'] + - [int16x8_t, int16x8x2_t, '[0, 8, 2, 10, 4, 12, 6, 14]', '[1, 9, 3, 11, 5, 13, 7, 15]'] + - [int32x4_t, int32x4x2_t, '[0, 4, 2, 6]', '[1, 5, 3, 7]'] + - [uint8x8_t, uint8x8x2_t, '[0, 8, 2, 10, 4, 12, 6, 14]', '[1, 9, 3, 11, 5, 13, 7, 15]'] + - [uint16x4_t, uint16x4x2_t, '[0, 4, 2, 6]', '[1, 5, 3, 7]'] + - [uint8x16_t, uint8x16x2_t, '[0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]', '[1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31]'] + - [uint16x8_t, uint16x8x2_t, '[0, 8, 2, 10, 4, 12, 6, 14]', '[1, 9, 3, 11, 5, 13, 7, 15]'] + - [uint32x4_t, uint32x4x2_t, '[0, 4, 2, 6]', '[1, 5, 3, 7]'] + - [poly8x8_t, poly8x8x2_t, '[0, 8, 2, 10, 4, 12, 6, 14]', '[1, 9, 3, 11, 5, 13, 7, 15]'] + - [poly16x4_t, poly16x4x2_t, '[0, 4, 2, 6]', '[1, 5, 3, 7]'] + - [poly8x16_t, poly8x16x2_t, '[0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]', '[1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31]'] + - [poly16x8_t, poly16x8x2_t, '[0, 8, 2, 10, 4, 12, 6, 14]', '[1, 9, 3, 11, 5, 13, 7, 15]'] + - [float32x4_t, float32x4x2_t, '[0, 4, 2, 6]', '[1, 5, 3, 7]'] + compose: + - Let: + - a1 + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [a, b, "{type[2]}"]] + - Let: + - b1 + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [a, b, "{type[3]}"]] + - FnCall: + - transmute + - - Identifier: ['(a1, b1)', Symbol] + + + - name: "vtrn{neon_type[0].no}" + doc: "Transpose elements" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vtrn]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [trn]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, float16x4x2_t, '[0, 4, 2, 6]', '[1, 5, 3, 7]'] + - [float16x8_t, float16x8x2_t, '[0, 8, 2, 10, 4, 12, 6, 14]', '[1, 9, 3, 11, 5, 13, 7, 15]'] + compose: + - Let: + - a1 + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [a, b, "{type[2]}"]] + - Let: + - b1 + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [a, b, "{type[3]}"]] + - FnCall: + - transmute + - - Identifier: ['(a1, b1)', Symbol] + + + - name: "vtrn{neon_type[0].no}" + doc: "Transpose elements" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: 
[assert_instr, [vtrn]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int32x2_t, int32x2x2_t, '[0, 2]', '[1, 3]'] + - [uint32x2_t, uint32x2x2_t, '[0, 2]', '[1, 3]'] + - [float32x2_t, float32x2x2_t, '[0, 2]', '[1, 3]'] + compose: + - Let: + - a1 + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [a, b, "{type[2]}"]] + - Let: + - b1 + - "{neon_type[0]}" + - FnCall: [simd_shuffle!, [a, b, "{type[3]}"]] + - FnCall: + - transmute + - - Identifier: ['(a1, b1)', Symbol] + + - name: "vzip{neon_type[0].no}" + doc: Zip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vorr]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x16_t, int8x16x2_t, '[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]', '[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]'] + - [int16x8_t, int16x8x2_t, '[0, 8, 1, 9, 2, 10, 3, 11]', '[4, 12, 5, 13, 6, 14, 7, 15]'] + - [int32x4_t, int32x4x2_t, '[0, 4, 1, 5]', '[2, 6, 3, 7]'] + - [uint8x16_t, uint8x16x2_t, '[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]', '[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]'] + - [uint16x8_t, uint16x8x2_t, '[0, 8, 1, 9, 2, 10, 3, 11]', '[4, 12, 5, 13, 6, 14, 7, 15]'] + - [uint32x4_t, uint32x4x2_t, '[0, 4, 1, 5]', '[2, 6, 3, 7]'] + - [poly8x16_t, poly8x16x2_t, '[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]', '[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]'] + - [poly16x8_t, poly16x8x2_t, '[0, 8, 1, 9, 2, 10, 3, 11]', '[4, 12, 5, 13, 6, 14, 7, 15]'] + - [float32x4_t, float32x4x2_t, '[0, 4, 1, 5]', '[2, 6, 3, 7]'] + compose: + - Let: + - a0 + - "{neon_type[0]}" + - FnCall: ["simd_shuffle!", [a, b, "{type[2]}"]] + - Let: + - b0 + - "{neon_type[0]}" + - FnCall: ["simd_shuffle!", [a, b, "{type[3]}"]] + - FnCall: + - transmute + - - '(a0, b0)' + + - name: "vzip{neon_type[0].no}" + doc: Zip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vtrn]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int32x2_t, int32x2x2_t, '[0, 2]', '[1, 3]'] + - [uint32x2_t, uint32x2x2_t, '[0, 2]', '[1, 3]'] + - [float32x2_t, float32x2x2_t, '[0, 2]', '[1, 3]'] + compose: + - Let: + - a0 + - "{neon_type[0]}" + - FnCall: ["simd_shuffle!", [a, b, "{type[2]}"]] + - Let: + - b0 + - "{neon_type[0]}" + - FnCall: ["simd_shuffle!", [a, b, "{type[3]}"]] + - FnCall: + - transmute + - - '(a0, b0)' + + - name: "vzip{neon_type[0].no}" + doc: Zip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vzip]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, int8x8x2_t, '[0, 8, 1, 9, 2, 10, 3, 11]', '[4, 12, 5, 13, 6, 14, 7, 15]'] + - [int16x4_t, int16x4x2_t, '[0, 4, 1, 5]', '[2, 6, 3, 7]'] + - [uint8x8_t, uint8x8x2_t, '[0, 8, 1, 9, 2, 10, 3, 11]', 
'[4, 12, 5, 13, 6, 14, 7, 15]'] + - [uint16x4_t, uint16x4x2_t, '[0, 4, 1, 5]', '[2, 6, 3, 7]'] + - [poly8x8_t, poly8x8x2_t, '[0, 8, 1, 9, 2, 10, 3, 11]', '[4, 12, 5, 13, 6, 14, 7, 15]'] + - [poly16x4_t, poly16x4x2_t, '[0, 4, 1, 5]', '[2, 6, 3, 7]'] + compose: + - Let: + - a0 + - "{neon_type[0]}" + - FnCall: ["simd_shuffle!", [a, b, "{type[2]}"]] + - Let: + - b0 + - "{neon_type[0]}" + - FnCall: ["simd_shuffle!", [a, b, "{type[3]}"]] + - FnCall: + - transmute + - - '(a0, b0)' + + + - name: "vzip{neon_type[0].no}" + doc: Zip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vzip.16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, float16x4x2_t, '[0, 4, 1, 5]', '[2, 6, 3, 7]'] + - [float16x8_t, float16x8x2_t, '[0, 8, 1, 9, 2, 10, 3, 11]', '[4, 12, 5, 13, 6, 14, 7, 15]'] + compose: + - Let: + - a0 + - "{neon_type[0]}" + - FnCall: ["simd_shuffle!", [a, b, "{type[2]}"]] + - Let: + - b0 + - "{neon_type[0]}" + - FnCall: ["simd_shuffle!", [a, b, "{type[3]}"]] + - FnCall: + - transmute + - - '(a0, b0)' + + - name: "vuzp{neon_type[0].no}" + doc: Unzip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vuzp]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uzp]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t, int8x8x2_t, '[0, 2, 4, 6, 8, 10, 12, 14]', '[1, 3, 5, 7, 9, 11, 13, 15]'] + - [int16x4_t, int16x4x2_t, '[0, 2, 4, 6]', '[1, 3, 5, 7]'] + - [int8x16_t, int8x16x2_t, '[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]', '[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]'] + - [int16x8_t, int16x8x2_t, '[0, 2, 4, 6, 8, 10, 12, 14]', '[1, 3, 5, 7, 9, 11, 13, 15]'] + - [int32x4_t, int32x4x2_t, '[0, 2, 4, 6]', '[1, 3, 5, 7]'] + - [uint8x8_t, uint8x8x2_t, '[0, 2, 4, 6, 8, 10, 12, 14]', '[1, 3, 5, 7, 9, 11, 13, 15]'] + - [uint16x4_t, uint16x4x2_t, '[0, 2, 4, 6]', '[1, 3, 5, 7]'] + - [uint8x16_t, uint8x16x2_t, '[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]', '[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]'] + - [uint16x8_t, uint16x8x2_t, '[0, 2, 4, 6, 8, 10, 12, 14]', '[1, 3, 5, 7, 9, 11, 13, 15]'] + - [uint32x4_t, uint32x4x2_t, '[0, 2, 4, 6]', '[1, 3, 5, 7]'] + - [poly8x8_t, poly8x8x2_t, '[0, 2, 4, 6, 8, 10, 12, 14]', '[1, 3, 5, 7, 9, 11, 13, 15]'] + - [poly16x4_t, poly16x4x2_t, '[0, 2, 4, 6]', '[1, 3, 5, 7]'] + - [poly8x16_t, poly8x16x2_t, '[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]', '[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]'] + - [poly16x8_t, poly16x8x2_t, '[0, 2, 4, 6, 8, 10, 12, 14]', '[1, 3, 5, 7, 9, 11, 13, 15]'] + - [float32x4_t, float32x4x2_t, '[0, 2, 4, 6]', '[1, 3, 5, 7]'] + compose: + - Let: + - a0 + - "{neon_type[0]}" + - FnCall: ["simd_shuffle!", [a, b, "{type[2]}"]] + - Let: + - b0 + - "{neon_type[0]}" + - FnCall: ["simd_shuffle!", [a, b, "{type[3]}"]] + - FnCall: + - transmute + - - '(a0, b0)' + + + - name: "vuzp{neon_type[0].no}" + doc: Unzip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vuzp]]}]] + - FnCall: [cfg_attr, 
[*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uzp]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, float16x4x2_t, '[0, 2, 4, 6]', '[1, 3, 5, 7]'] + - [float16x8_t, float16x8x2_t, '[0, 2, 4, 6, 8, 10, 12, 14]', '[1, 3, 5, 7, 9, 11, 13, 15]'] + compose: + - Let: + - a0 + - "{neon_type[0]}" + - FnCall: ["simd_shuffle!", [a, b, "{type[2]}"]] + - Let: + - b0 + - "{neon_type[0]}" + - FnCall: ["simd_shuffle!", [a, b, "{type[3]}"]] + - FnCall: + - transmute + - - '(a0, b0)' + + + - name: "vuzp{neon_type[0].no}" + doc: Unzip vectors + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vtrn]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, float32x2x2_t, '[0, 2]', '[1, 3]'] + - [int32x2_t, int32x2x2_t, '[0, 2]', '[1, 3]'] + - [uint32x2_t, uint32x2x2_t, '[0, 2]', '[1, 3]'] + compose: + - Let: + - a0 + - "{neon_type[0]}" + - FnCall: ["simd_shuffle!", [a, b, "{type[2]}"]] + - Let: + - b0 + - "{neon_type[0]}" + - FnCall: ["simd_shuffle!", [a, b, "{type[3]}"]] + - FnCall: + - transmute + - - '(a0, b0)' + + - name: "vabal_{neon_type[1]}" + doc: "Unsigned Absolute difference and Accumulate Long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vabal.{type[2]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uabal]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint16x8_t, uint8x8_t, "u8"] + - [uint32x4_t, uint16x4_t, "u16"] + - [uint64x2_t, uint32x2_t, "u32"] + compose: + - Let: [d, "{neon_type[1]}", {FnCall: ["vabd_{type[2]}", [b, c]]}] + - FnCall: [simd_add, [a, {FnCall: [simd_cast, [d]]}]] + + - name: "vabal_{neon_type[1]}" + doc: "Signed Absolute difference and Accumulate Long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vabal.{neon_type[1]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sabal]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x8_t, int8x8_t, uint8x8_t] + - [int32x4_t, int16x4_t, uint16x4_t] + - [int64x2_t, int32x2_t, uint32x2_t] + compose: + - Let: [d, "{type[1]}", {FnCall: ["vabd_{neon_type[1]}", [b, c]]}] + - Let: [e, "{type[2]}", {FnCall: ["simd_cast", [d]]}] + - FnCall: [simd_add, [a, {FnCall: [simd_cast, [e]]}]] + + - name: "vqabs{neon_type.no}" + doc: Signed saturating Absolute value + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vqabs.{neon_type}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqabs]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int8x16_t + - int16x4_t + - int16x8_t + - int32x2_t + - int32x4_t + compose: + - LLVMLink: + name: "sqabs.{neon_type}" + links: + - link: "llvm.aarch64.neon.sqabs.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vqabs.{neon_type}" + arch: arm + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple 
single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vst1]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [st1]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*mut u8", uint8x8x2_t, int8x8x2_t] + - ["*mut u16", uint16x4x2_t, int16x4x2_t] + - ["*mut u32", uint32x2x2_t, int32x2x2_t] + - ["*mut u64", uint64x1x2_t, int64x1x2_t] + - ["*mut u8", uint8x16x2_t, int8x16x2_t] + - ["*mut u16", uint16x8x2_t, int16x8x2_t] + - ["*mut u32", uint32x4x2_t, int32x4x2_t] + - ["*mut u64", uint64x2x2_t, int64x2x2_t] + - ["*mut u8", uint8x8x3_t, int8x8x3_t] + - ["*mut u16", uint16x4x3_t, int16x4x3_t] + - ["*mut u32", uint32x2x3_t, int32x2x3_t] + - ["*mut u64", uint64x1x3_t, int64x1x3_t] + - ["*mut u8", uint8x16x3_t, int8x16x3_t] + - ["*mut u16", uint16x8x3_t, int16x8x3_t] + - ["*mut u32", uint32x4x3_t, int32x4x3_t] + - ["*mut u64", uint64x2x3_t, int64x2x3_t] + - ["*mut u8", uint8x8x4_t, int8x8x4_t] + - ["*mut u16", uint16x4x4_t, int16x4x4_t] + - ["*mut u32", uint32x2x4_t, int32x2x4_t] + - ["*mut u64", uint64x1x4_t, int64x1x4_t] + - ["*mut u8", uint8x16x4_t, int8x16x4_t] + - ["*mut u16", uint16x8x4_t, int16x8x4_t] + - ["*mut u32", uint32x4x4_t, int32x4x4_t] + - ["*mut u64", uint64x2x4_t, int64x2x4_t] + - ["*mut p8", poly8x8x2_t, int8x8x2_t] + - ["*mut p8", poly8x8x3_t, int8x8x3_t] + - ["*mut p8", poly8x8x4_t, int8x8x4_t] + - ["*mut p8", poly8x16x2_t, int8x16x2_t] + - ["*mut p8", poly8x16x3_t, int8x16x3_t] + - ["*mut p8", poly8x16x4_t, int8x16x4_t] + - ["*mut p16", poly16x4x2_t, int16x4x2_t] + - ["*mut p16", poly16x4x3_t, int16x4x3_t] + - ["*mut p16", poly16x4x4_t, int16x4x4_t] + - ["*mut p16", poly16x8x2_t, int16x8x2_t] + - ["*mut p16", poly16x8x3_t, int16x8x3_t] + - ["*mut p16", poly16x8x4_t, int16x8x4_t] + compose: + - FnCall: + - "vst1{neon_type[2].no}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-aes + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vst1]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [st1]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*mut p64", poly64x1x2_t, int64x1x2_t] + compose: + - FnCall: + - "vst1{neon_type[2].no}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + attr: + - *neon-aes + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [st1]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*mut p64", poly64x1x3_t, int64x1x3_t] + - ["*mut p64", poly64x1x4_t, int64x1x4_t] + - ["*mut p64", poly64x2x2_t, int64x2x2_t] + - ["*mut p64", poly64x2x3_t, int64x2x3_t] + - ["*mut p64", poly64x2x4_t, int64x2x4_t] + compose: + - FnCall: + - "vst1{neon_type[2].no}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures to one, 
two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vst1]]}]] + - *neon-arm-unstable + safety: + unsafe: [neon] + types: + - ['*mut f32', float32x2x2_t, float32x2_t] + - ['*mut f32', float32x4x2_t, float32x4_t] + compose: + - LLVMLink: + name: "vst1{neon_type[1].no}" + arguments: + - "ptr: {type[0]}" + - "a: {neon_type[2]}" + - "b: {neon_type[2]}" + links: + - link: "llvm.arm.neon.vst1x{neon_type[1].tuple}.{neon_type[2]}.p0" + arch: arm + - FnCall: ["_vst1{neon_type[1].no}", ['a', 'b.0', 'b.1']] + + + # vst1_f16_x2 - arm + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *neon-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vst1]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ['*mut f16', float16x4x2_t, float16x4_t] + - ['*mut f16', float16x8x2_t, float16x8_t] + compose: + - LLVMLink: + name: "vst1{neon_type[1].no}" + arguments: + - "ptr: {type[0]}" + - "a: {neon_type[2]}" + - "b: {neon_type[2]}" + links: + - link: "llvm.arm.neon.vst1x{neon_type[1].tuple}.p0.{neon_type[2]}" + arch: arm + - FnCall: ["_vst1{neon_type[1].no}", ['a', 'b.0', 'b.1']] + + + # vst1_f16_x2 - aarch64 + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st1]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*mut f16", float16x4x2_t, float16x4_t] + - ["*mut f16", float16x8x2_t, float16x8_t] + compose: + - LLVMLink: + name: "vst1{neon_type[1].no}" + arguments: + - "a: {neon_type[2]}" + - "b: {neon_type[2]}" + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.st1x2.{neon_type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: ["_vst1{neon_type[1].no}", ['b.0', 'b.1', a]] + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + attr: + - *target-is-arm + - *neon-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vst1]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ['*mut f16', float16x4x3_t, float16x4_t] + - ['*mut f16', float16x8x3_t, float16x8_t] + compose: + - LLVMLink: + name: "vst1{neon_type[1].no}" + arguments: + - "ptr: {type[0]}" + - "a: {neon_type[2]}" + - "b: {neon_type[2]}" + - "c: {neon_type[2]}" + links: + - link: "llvm.arm.neon.vst1x{neon_type[1].tuple}.p0.{neon_type[2]}" + arch: arm + - FnCall: ["_vst1{neon_type[1].no}", ['a', 'b.0', 'b.1', 'b.2']] + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st1]]}]] + - *neon-stable + safety: + unsafe: [neon] + types: + - ["*mut f32", float32x2x2_t, float32x2_t] + - ["*mut f32", float32x4x2_t, float32x4_t] + compose: + - LLVMLink: + name: "vst1{neon_type[1].no}" + arguments: + - "a: {neon_type[2]}" + - "b: {neon_type[2]}" + - "ptr: {type[0]}" + links: + - link: 
"llvm.aarch64.neon.st1x2.{neon_type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: ["_vst1{neon_type[1].no}", ['b.0', 'b.1', a]] + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st1]]}]] + - *neon-stable + safety: + unsafe: [neon] + types: + - ["*mut f32", float32x2x3_t, float32x2_t] + - ["*mut f32", float32x4x3_t, float32x4_t] + compose: + - LLVMLink: + name: "vst1{neon_type[1].no}" + arguments: + - "a: {neon_type[2]}" + - "b: {neon_type[2]}" + - "c: {neon_type[2]}" + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.st1x3.{neon_type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: ["_vst1{neon_type[1].no}", ['b.0', 'b.1', 'b.2', a]] + + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st1]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*mut f16", float16x4x3_t, float16x4_t] + - ["*mut f16", float16x8x3_t, float16x8_t] + compose: + - LLVMLink: + name: "vst1{neon_type[1].no}" + arguments: + - "a: {neon_type[2]}" + - "b: {neon_type[2]}" + - "c: {neon_type[2]}" + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.st1x3.{neon_type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: ["_vst1{neon_type[1].no}", ['b.0', 'b.1', 'b.2', a]] + + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st1]]}]] + - *neon-stable + safety: + unsafe: [neon] + types: + - ["*mut f32", float32x2x4_t, float32x2_t] + - ["*mut f32", float32x4x4_t, float32x4_t] + compose: + - LLVMLink: + name: "vst1{neon_type[1].no}" + arguments: + - "a: {neon_type[2]}" + - "b: {neon_type[2]}" + - "c: {neon_type[2]}" + - "d: {neon_type[2]}" + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.st1x4.{neon_type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: ["_vst1{neon_type[1].no}", ['b.0', 'b.1', 'b.2', 'b.3', a]] + + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [st1]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*mut f16", float16x4x4_t, float16x4_t] + - ["*mut f16", float16x8x4_t, float16x8_t] + compose: + - LLVMLink: + name: "vst1{neon_type[1].no}" + arguments: + - "a: {neon_type[2]}" + - "b: {neon_type[2]}" + - "c: {neon_type[2]}" + - "d: {neon_type[2]}" + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.st1x4.{neon_type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: ["_vst1{neon_type[1].no}", ['b.0', 'b.1', 'b.2', 'b.3', a]] + + +# - name: "vst1{neon_type[1].no}" +# doc: "Store a single-element structures to one register." 
+# arguments: ["ptr: {type[0]}", "a: {neon_type[1]}"] +# attr: +# - *neon-v7 +# - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vst1]]}]] +# - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [st1]]}]] +# - *neon-fp16 +# - *neon-unstable-f16 +# safety: +# unsafe: [neon] +# types: +# - ["*mut f16", float16x4_t] +# - ["*mut f16", float16x8_t] +# compose: +# - FnCall: [core::ptr::write_unaligned, ['ptr.cast()', a]] + + - name: "vfms{neon_type.no}" + doc: "Floating-point fused multiply-subtract from accumulator" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [cfg_attr, [target_arch = "arm", {FnCall: [target_feature, ['enable = "vfp4"']]}]] + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vfms]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmls]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + - float32x4_t + compose: + - Let: [b, "{neon_type}", {FnCall: [simd_neg, [b]]}] + - FnCall: ["vfma{neon_type.no}", [a, b, c]] + + - name: "vmul{neon_type[0].no}" + doc: "Polynomial multiply" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmul]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [pmul]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [poly8x8_t, int8x8_t] + - [poly8x16_t, int8x16_t] + compose: + - LLVMLink: + name: "vmul{neon_type[0].no}" + links: + - link: "llvm.arm.neon.vmulp.{neon_type[1]}" + arch: arm + - link: "llvm.aarch64.neon.pmul.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vmls{neon_type.no}" + doc: "Floating-point multiply-subtract from accumulator" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmls.f32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmul]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + - float32x4_t + compose: + - FnCall: [simd_sub, [a, {FnCall: [simd_mul, [b, c]]}]] + + - name: "vcge{neon_type.no}" + doc: "Compare unsigned greater than or equal" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcge.{neon_type}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [cmhs]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - uint8x8_t + - uint8x16_t + - uint16x4_t + - uint16x8_t + - uint32x2_t + - uint32x4_t + compose: + - FnCall: [simd_ge, [a, b]] + + - name: "vcge{neon_type[0].no}" + doc: "Floating-point compare greater than or equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcge.f32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcmge]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, uint32x2_t] + - [float32x4_t, uint32x4_t] + compose: + - FnCall: [simd_ge, [a, b]] + + - name: "vcge{neon_type[0].no}" + doc: "Floating-point compare greater than or 
equal" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcge.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcmge]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t] + - [float16x8_t, uint16x8_t] + compose: + - FnCall: [simd_ge, [a, b]] + + + - name: "vcgez{neon_type[0].no}" + doc: "Floating-point compare greater than or equal to zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcge.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcmge]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t, f16x4, 'f16x4::new(0.0, 0.0, 0.0, 0.0)'] + - [float16x8_t, uint16x8_t, f16x8, 'f16x8::new(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)'] + compose: + - Let: [b, "{type[2]}", "{type[3]}"] + - FnCall: + - simd_ge + - - a + - FnCall: [transmute, [b]] + + - name: "vclt{neon_type.no}" + doc: "Compare unsigned less than" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcgt.{neon_type}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [cmhi]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - uint8x8_t + - uint8x16_t + - uint16x4_t + - uint16x8_t + - uint32x2_t + - uint32x4_t + compose: + - FnCall: [simd_lt, [a, b]] + + - name: "vtst{neon_type[0].no}" + doc: "Unsigned compare bitwise Test bits nonzero" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vtst]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [cmtst]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint8x8_t, u8x8, 'u8x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [uint8x16_t, u8x16, 'u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)'] + - [uint16x4_t, u16x4, 'u16x4::new(0, 0, 0, 0)'] + - [uint16x8_t, u16x8, 'u16x8::new(0, 0, 0, 0, 0, 0, 0, 0)'] + - [uint32x2_t, u32x2, 'u32x2::new(0, 0)'] + - [uint32x4_t, u32x4, 'u32x4::new(0, 0, 0, 0)'] + compose: + - Let: [c, "{neon_type[0]}", {FnCall: [simd_and, [a, b]]}] + - Let: [d, "{type[1]}", "{type[2]}"] + - FnCall: [simd_ne, [c, {FnCall: [transmute, [d]]}]] + + - name: "vshl{neon_type[0].N}" + doc: "Shift left" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vshl, 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [shl, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x8_t, '3'] + - [int8x16_t, '3'] + - [int16x4_t, '4'] + - [int16x8_t, '4'] + - [int32x2_t, '5'] + - [int32x4_t, '5'] + - [uint8x8_t, '3'] + - [uint8x16_t, '3'] + - [uint16x4_t, '4'] + - [uint16x8_t, '4'] + - [uint32x2_t, '5'] + - [uint32x4_t, '5'] + - [int64x1_t, '6'] + - [int64x2_t, '6'] + - [uint64x1_t, '6'] + - [uint64x2_t, '6'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[1]}"]] + - FnCall: + 
- simd_shl + - - a + - FnCall: ["vdup{neon_type[0].N}", ['N as _']] + + - name: "vsra{neon_type[0].N}" + doc: "Unsigned shift right and accumulate" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vsra, 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [usra, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint8x8_t, '8'] + - [uint8x16_t, '8'] + - [uint16x4_t, '16'] + - [uint16x8_t, '16'] + - [uint32x2_t, '32'] + - [uint32x4_t, '32'] + - [uint64x1_t, '64'] + - [uint64x2_t, '64'] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= {type[1]}']] + - FnCall: + - simd_add + - - a + - FnCall: ["vshr{neon_type[0].N}::", [b]] + + - name: "vrsra{neon_type[0].N}" + doc: "Unsigned rounding shift right and accumulate" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrsra, 'N = 2']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ursra, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint8x8_t, '8'] + - [uint8x16_t, '8'] + - [uint16x4_t, '16'] + - [uint16x8_t, '16'] + - [uint32x2_t, '32'] + - [uint32x4_t, '32'] + - [uint64x1_t, '64'] + - [uint64x2_t, '64'] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= {type[1]}']] + - FnCall: + - simd_add + - - a + - FnCall: ["vrshr{neon_type[0].N}::", [b]] + + - name: "vqrshrn_n_{neon_type[0]}" + doc: "Unsigned saturating rounded shift right narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vqrshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint16x8_t, uint8x8_t, '8', 'const { uint16x8_t([-N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16]) }'] + - [uint32x4_t, uint16x4_t, '16', 'const { uint32x4_t([-N as u32, -N as u32, -N as u32, -N as u32]) }'] + - [uint64x2_t, uint32x2_t, '32', 'const { uint64x2_t([-N as u64, -N as u64]) }'] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= {type[2]}']] + - LLVMLink: + name: "vqrshrn{neon_type[0].N}" + arguments: + - "a: {neon_type[0]}" + - "n: {neon_type[0]}" + links: + - link: "llvm.arm.neon.vqrshiftnu.{neon_type[1]}" + arch: arm + - FnCall: ["_vqrshrn_n{neon_type[0].noq}", ["a", "{type[3]}"], [], true] + + - name: "vqrshrn_n_{neon_type[0]}" + doc: "Unsigned saturating rounded shift right narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-not-arm + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uqrshrn, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [uint16x8_t, uint8x8_t, '8'] + - [uint32x4_t, uint16x4_t, '16'] + - [uint64x2_t, uint32x2_t, '32'] + compose: + - FnCall: [static_assert!, ['N >= 1 && N <= {type[2]}']] + - LLVMLink: + name: "vqrshrn_n_{neon_type[0]}" + arguments: + - "a: {neon_type[0]}" + - "n: i32" + links: + - link:
"llvm.aarch64.neon.uqrshrn.{neon_type[1]}" + arch: aarch64,arm64ec + - FnCall: ["_vqrshrn_n_{neon_type[0]}", ["a", N], [], true] + + - name: "vcvt{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to unsigned fixed-point, rounding toward zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vcvt]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcvtzu]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, uint32x2_t] + - [float32x4_t, uint32x4_t] + compose: + - LLVMLink: + name: "vcvt{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.fptoui.sat.{neon_type[1]}.{neon_type[0]}" + arch: arm + - link: "llvm.fptoui.sat.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvt{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to unsigned fixed-point, rounding toward zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vcvt]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcvtzu]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t] + - [float16x8_t, uint16x8_t] + compose: + - FnCall: + - simd_cast + - - a + + - name: "vcvt_f16_{neon_type[0]}" + doc: "Floating-point convert to lower precision narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vcvt.f16.f32]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcvtn]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float32x4_t, float16x4_t] + compose: + - FnCall: [simd_cast, [a]] + + - name: "vcvt_f32_f16" + doc: "Floating-point convert to higher precision long" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vcvt]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcvtl]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, float32x4_t] + compose: + - FnCall: [simd_cast, [a]] + + - name: "vmla{neon_type[0].N}" + doc: "Vector multiply accumulate with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmla.i16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [mla]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x4_t, "i16", int16x4_t] + - [int16x8_t, "i16", int16x8_t] + - [uint16x4_t, "u16", uint16x4_t] + - [uint16x8_t, "u16", uint16x8_t] + compose: + - FnCall: + - "vmla{neon_type[0].no}" + - - a + - b + - FnCall: ["vdup{neon_type[0].N}", [c]] + + - name: "vmla{neon_type[0].N}" + doc: "Vector multiply accumulate with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmla.i32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [mla]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int32x2_t, 
"i32", int32x2_t] + - [int32x4_t, "i32", int32x4_t] + - [uint32x2_t, "u32", uint32x2_t] + - [uint32x4_t, "u32", uint32x4_t] + compose: + - FnCall: + - "vmla{neon_type[0].no}" + - - a + - b + - FnCall: ["vdup{neon_type[0].N}", [c]] + + - name: "vmla{neon_type[0].N}" + doc: "Vector multiply accumulate with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmla.f32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmul]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, "f32", float32x2_t] + - [float32x4_t, "f32", float32x4_t] + compose: + - FnCall: ["vmla{neon_type[0].no}", [a, b, {FnCall: ["vdup{neon_type[0].N}", [c]]}]] + + - name: "vmla{type[0]}" + doc: "Vector multiply accumulate with scalar" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmla.i16"', 'LANE = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [mla, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [_lane_s16, int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_laneq_s16, int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_lane_s16, int16x8_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_s16, int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_lane_u16, uint16x4_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_laneq_u16, uint16x4_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_lane_u16, uint16x8_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_u16, uint16x8_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - FnCall: + - "vmla{neon_type[1].no}" + - - a + - b + - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + + - name: "vmla{type[0]}" + doc: "Vector multiply accumulate with scalar" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmla.i32"', 'LANE = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [mla, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [_lane_s32, int32x2_t, int32x2_t, '1', '[LANE as u32, LANE as u32]'] + - [_laneq_s32, int32x2_t, int32x4_t, '2', '[LANE as u32, LANE as u32]'] + - [q_lane_s32, int32x4_t, int32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_s32, int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_lane_u32, uint32x2_t, uint32x2_t, 
'1', '[LANE as u32, LANE as u32]'] + - [_laneq_u32, uint32x2_t, uint32x4_t, '2', '[LANE as u32, LANE as u32]'] + - [q_lane_u32, uint32x4_t, uint32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_u32, uint32x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - FnCall: + - "vmla{neon_type[1].no}" + - - a + - b + - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + + - name: "vmla{type[0]}" + doc: "Vector multiply accumulate with scalar" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmla.f32"', 'LANE = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmul, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [_lane_f32, float32x2_t, float32x2_t, '1', '[LANE as u32, LANE as u32]'] + - [_laneq_f32, float32x2_t, float32x4_t, '2', '[LANE as u32, LANE as u32]'] + - [q_lane_f32, float32x4_t, float32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_f32, float32x4_t, float32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - FnCall: + - "vmla{neon_type[1].no}" + - - a + - b + - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + + - name: "vmls{neon_type[0].N}" + doc: "Vector multiply subtract with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmls.i16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [mls]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x4_t, "i16", int16x4_t] + - [int16x8_t, "i16", int16x8_t] + - [uint16x4_t, "u16", uint16x4_t] + - [uint16x8_t, "u16", uint16x8_t] + compose: + - FnCall: + - "vmls{neon_type[0].no}" + - - a + - b + - FnCall: ["vdup{neon_type[0].N}", [c]] + + - name: "vmls{neon_type[0].N}" + doc: "Vector multiply subtract with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmls.i32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [mls]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int32x2_t, "i32", int32x2_t] + - [int32x4_t, "i32", int32x4_t] + - [uint32x2_t, "u32", uint32x2_t] + - [uint32x4_t, "u32", uint32x4_t] + compose: + - FnCall: + - "vmls{neon_type[0].no}" + - - a + - b + - FnCall: ["vdup{neon_type[0].N}", [c]] + + - name: "vmls{neon_type[0].N}" + doc: "Vector multiply subtract with scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmls.f32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmul]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, "f32", float32x2_t] + - [float32x4_t, "f32", float32x4_t] + compose: + - FnCall: 
["vmls{neon_type[0].no}", [a, b, {FnCall: ["vdup{neon_type[0].N}", [c]]}]] + + - name: "vmls{type[0]}" + doc: "Vector multiply subtract with scalar" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmls.i16"', 'LANE = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [mls, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [_lane_s16, int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_laneq_s16, int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_lane_s16, int16x8_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_s16, int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_lane_u16, uint16x4_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_laneq_u16, uint16x4_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_lane_u16, uint16x8_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_u16, uint16x8_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - FnCall: + - "vmls{neon_type[1].no}" + - - a + - b + - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + + - name: "vmls{type[0]}" + doc: "Vector multiply subtract with scalar" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmls.i32"', 'LANE = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [mls, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [_lane_s32, int32x2_t, int32x2_t, '1', '[LANE as u32, LANE as u32]'] + - [_laneq_s32, int32x2_t, int32x4_t, '2', '[LANE as u32, LANE as u32]'] + - [q_lane_s32, int32x4_t, int32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_s32, int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_lane_u32, uint32x2_t, uint32x2_t, '1', '[LANE as u32, LANE as u32]'] + - [_laneq_u32, uint32x2_t, uint32x4_t, '2', '[LANE as u32, LANE as u32]'] + - [q_lane_u32, uint32x4_t, uint32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_u32, uint32x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - FnCall: + - "vmls{neon_type[1].no}" + - - a + - b + - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + + - name: "vmls{type[0]}" + doc: "Vector multiply subtract with scalar" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vmls.f32"', 'LANE = 1']]}]] 
+ - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmul, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['3']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [_lane_f32, float32x2_t, float32x2_t, '1', '[LANE as u32, LANE as u32]'] + - [_laneq_f32, float32x2_t, float32x4_t, '2', '[LANE as u32, LANE as u32]'] + - [q_lane_f32, float32x4_t, float32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_f32, float32x4_t, float32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - FnCall: + - "vmls{neon_type[1].no}" + - - a + - b + - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + + - name: "vmul{neon_type[0].N}" + doc: "Vector multiply by scalar" + arguments: ["a: {neon_type[0]}", "b: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmul]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [mul]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x4_t, "i16"] + - [int16x8_t, "i16"] + - [int32x2_t, "i32"] + - [int32x4_t, "i32"] + - [uint16x4_t, "u16"] + - [uint16x8_t, "u16"] + - [uint32x2_t, "u32"] + - [uint32x4_t, "u32"] + compose: + - FnCall: + - simd_mul + - - a + - FnCall: ["vdup{neon_type[0].N}", [b]] + + - name: "vmul{neon_type[0].N}" + doc: "Vector multiply by scalar" + arguments: ["a: {neon_type[0]}", "b: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmul]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmul]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, "f32"] + - [float32x4_t, "f32"] + compose: + - FnCall: + - simd_mul + - - a + - FnCall: ["vdup{neon_type[0].N}", [b]] + + + - name: "vmul{neon_type[0].N}" + doc: "Vector multiply by scalar" + arguments: ["a: {neon_type[0]}", "b: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmul]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmul]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, "f16"] + - [float16x8_t, "f16"] + compose: + - FnCall: + - simd_mul + - - a + - FnCall: ["vdup{neon_type[0].N}", [b]] + + + - name: "vmul{type[2]}" + doc: "Floating-point multiply" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmul, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmul, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float32x2_t, float32x2_t, '_lane_f32', '1', '[LANE as u32, LANE as u32]'] + - [float32x2_t, float32x4_t, '_laneq_f32', '2', '[LANE as u32, LANE as u32]'] + - [float32x4_t, float32x2_t, 'q_lane_f32', '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [float32x4_t, float32x4_t, 'q_laneq_f32', '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - FnCall: + - simd_mul 
+ - - a + - FnCall: [simd_shuffle!, [b, b, "{type[4]}"]] + + - name: "vqrdmulh{type[0]}" + doc: "Vector rounding saturating doubling multiply high by scalar" + arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqrdmulh, 'LANE = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqrdmulh, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [_lane_s16, int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_laneq_s16, int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_lane_s16, int16x8_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_s16, int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_lane_s32, int32x2_t, int32x2_t, '1', '[LANE as u32, LANE as u32]'] + - [_laneq_s32, int32x2_t, int32x4_t, '2', '[LANE as u32, LANE as u32]'] + - [q_lane_s32, int32x4_t, int32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [q_laneq_s32, int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] + - Let: [b, "{neon_type[1]}", {FnCall: [simd_shuffle!, [b, b, '{type[4]}']]}] + - FnCall: ["vqrdmulh{neon_type[1].no}", [a, b]] + + - name: "vqrdmulh{neon_type[0].N}" + doc: "Vector saturating rounding doubling multiply high with scalar" + arguments: ["a: {neon_type[0]}", "b: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqrdmulh]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqrdmulh]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x4_t, "i16"] + - [int16x8_t, "i16"] + - [int32x2_t, "i32"] + - [int32x4_t, "i32"] + compose: + - FnCall: + - "vqrdmulh{neon_type[0].no}" + - - a + - FnCall: ["vdup{neon_type[0].N}", [b]] + + - name: "vclt{neon_type[0].no}" + doc: "Floating-point compare less than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcgt.f32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcmgt]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, uint32x2_t] + - [float32x4_t, uint32x4_t] + compose: + - FnCall: [simd_lt, [a, b]] + + - name: "vclt{neon_type[0].no}" + doc: "Floating-point compare less than" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcgt.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcmgt]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t] + - [float16x8_t, uint16x8_t] + compose: + - FnCall: [simd_lt, [a, b]] + + + - name: "vcltz{neon_type[0].no}" + doc: "Floating-point compare less than" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v8 
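Per-lane model of the vqrdmulh/sqrdmulh entries above, a sketch under the usual Q15 definition rather than the generated code: double the product, round, keep the high 16 bits, and saturate.

    fn qrdmulh_i16(a: i16, b: i16) -> i16 {
        let product = 2 * (a as i64) * (b as i64) + (1 << 15); // double, then round
        (product >> 16).clamp(i16::MIN as i64, i16::MAX as i64) as i16
    }

    fn main() {
        assert_eq!(qrdmulh_i16(16384, 16384), 8192); // 0.5 * 0.5 in Q15
        assert_eq!(qrdmulh_i16(i16::MIN, i16::MIN), i16::MAX); // the only saturating case
    }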
+ - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vclt.f16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcmlt]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, uint16x4_t, f16x4, 'f16x4::new(0.0, 0.0, 0.0, 0.0)'] + - [float16x8_t, uint16x8_t, f16x8, 'f16x8::new(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)'] + compose: + - Let: [b, "{type[2]}", "{type[3]}"] + - FnCall: + - simd_lt + - - a + - FnCall: [transmute, [b]] + + - name: "vabdl_{neon_type[0]}" + doc: "Unsigned Absolute difference Long" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vabdl.{neon_type[0]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uabdl]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint8x8_t, uint16x8_t] + - [uint16x4_t, uint32x4_t] + - [uint32x2_t, uint64x2_t] + compose: + - FnCall: [simd_cast, [{FnCall: ["vabd_{neon_type[0]}", [a, b]]}]] + + - name: "vmull_lane{neon_type[1].no}" + doc: "Vector long multiply by scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmull, 'LANE = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [smull, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [int16x4_t, int16x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int16x4_t, int16x8_t, int32x4_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int32x2_t, int32x2_t, int64x2_t, '1', '[LANE as u32, LANE as u32]'] + - [int32x2_t, int32x4_t, int64x2_t, '2', '[LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] + - FnCall: + - "vmull_{neon_type[0]}" + - - a + - FnCall: [simd_shuffle!, [b, b, "{type[4]}"]] + + - name: "vmull_lane{neon_type[1].no}" + doc: "Vector long multiply by scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmull, 'LANE = 1']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [umull, 'LANE = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [uint16x4_t, uint16x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint16x4_t, uint16x8_t, uint32x4_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [uint32x2_t, uint32x2_t, uint64x2_t, '1', '[LANE as u32, LANE as u32]'] + - [uint32x2_t, uint32x4_t, uint64x2_t, '2', '[LANE as u32, LANE as u32]'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] + - FnCall: + - "vmull_{neon_type[0]}" + - - a + - FnCall: [simd_shuffle!, [b, b, "{type[4]}"]] + + - name: "vfms{neon_type[0].N}" + doc: "Floating-point fused Multiply-subtract to accumulator(vector)" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [target_arch = "arm", {FnCall: [target_feature, ['enable = "vfp4"']]}]] + - FnCall: 
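The vabdl entries above are expressed as a widening cast of the plain absolute difference; a scalar sketch over one 8-lane vector (the helper name is illustrative, not the intrinsic itself):

    fn vabdl_u8_model(a: [u8; 8], b: [u8; 8]) -> [u16; 8] {
        let mut out = [0u16; 8];
        for i in 0..8 {
            out[i] = a[i].abs_diff(b[i]) as u16; // vabd_u8 per lane, then simd_cast
        }
        out
    }

    fn main() {
        let r = vabdl_u8_model([0, 10, 255, 3, 0, 0, 0, 0], [255, 4, 0, 3, 0, 0, 0, 0]);
        assert_eq!(r, [255u16, 6, 255, 0, 0, 0, 0, 0]);
    }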
[cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vfms]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmls]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, "f32"] + - [float32x4_t, "f32"] + compose: + - FnCall: + - "vfms{neon_type[0].no}" + - - a + - b + - FnCall: ["vdup{neon_type[0].N}_vfp4", [c]] + + + - name: "vfms{neon_type.no}" + doc: "Floating-point fused multiply-subtract from accumulator" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v8 + - FnCall: [cfg_attr, [target_arch = "arm", {FnCall: [target_feature, ['enable = "vfp4"']]}]] + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmls]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - Let: [b, "{neon_type}", {FnCall: [simd_neg, [b]]}] + - FnCall: ["vfma{neon_type.no}", [a, b, c]] + + - name: "vqdmulh{neon_type[0].laneq_nox}" + doc: "Vector saturating doubling multiply high by scalar" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqdmulh, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sqdmulh, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: safe + types: + - [int16x8_t, int16x8_t, '3'] + - [int16x4_t, int16x8_t, '3'] + - [int32x4_t, int32x4_t, '2'] + - [int32x2_t, int32x4_t, '2'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] + - FnCall: + - "vqdmulh{neon_type[0].no}" + - - a + - FnCall: + - "vdup{neon_type[0].N}" + - - FnCall: [simd_extract!, [b, 'LANE as u32']] + + - name: "vrecpe{neon_type.no}" + doc: "Unsigned reciprocal estimate" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrecpe]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [urecpe]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - uint32x2_t + - uint32x4_t + compose: + - LLVMLink: + name: "vrecpe{neon_type.no}" + links: + - link: "llvm.arm.neon.vrecpe.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.urecpe.{neon_type}" + arch: aarch64,arm64ec + + - name: "vrsqrte{neon_type.no}" + doc: "Unsigned reciprocal square root estimate" + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrsqrte]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ursqrte]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - uint32x2_t + - uint32x4_t + compose: + - LLVMLink: + name: "vrsqrte{neon_type.no}" + links: + - link: "llvm.arm.neon.vrsqrte.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.ursqrte.{neon_type}" + arch: aarch64,arm64ec + + - name: "vrsqrte{neon_type.no}" + doc: "Reciprocal square-root estimate." 
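The vfms entries above lower to a fused multiply-add with a negated multiplicand (a - b * c). A scalar sketch of that identity using f32::mul_add, which is likewise a single fused operation:

    fn fms(a: f32, b: f32, c: f32) -> f32 {
        (-b).mul_add(c, a) // a + (-b) * c == a - b * c, with one rounding
    }

    fn main() {
        assert_eq!(fms(10.0, 2.0, 3.0), 4.0);
        assert_eq!(fms(1.0, 0.5, 4.0), -1.0);
    }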
+ arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrsqrte]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [frsqrte]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + - float32x4_t + compose: + - LLVMLink: + name: "vrsqrte{neon_type.no}" + links: + - link: "llvm.arm.neon.vrsqrte.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.frsqrte.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vrsqrte{neon_type.no}" + doc: "Reciprocal square-root estimate." + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v8 + - *neon-fp16 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vrsqrte]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [frsqrte]]}]] + - *neon-unstable-f16 + safety: safe + types: + - float16x4_t + - float16x8_t + compose: + - LLVMLink: + name: "vrsqrte{neon_type.no}" + links: + - link: "llvm.arm.neon.vrsqrte.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.frsqrte.{neon_type}" + arch: aarch64,arm64ec + + + - name: "vqshlu{neon_type[0].N}" + doc: "Signed saturating shift left unsigned" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vqshlu, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-arm-unstable + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x8_t, uint8x8_t, '3', 'const { int8x8_t([N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8]) }'] + - [int16x4_t, uint16x4_t, '4', 'const { int16x4_t([N as i16, N as i16, N as i16, N as i16]) }'] + - [int32x2_t, uint32x2_t, '5', 'const { int32x2_t([N as i32, N as i32]) }'] + - [int64x1_t, uint64x1_t, '6', 'const { int64x1_t([N as i64]) }'] + - [int8x16_t, uint8x16_t, '3', 'const { int8x16_t([N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8]) }'] + - [int16x8_t, uint16x8_t, '4', 'const { int16x8_t([N as i16, N as i16, N as i16, N as i16, N as i16, N as i16, N as i16, N as i16]) }'] + - [int32x4_t, uint32x4_t, '5', 'const { int32x4_t([N as i32, N as i32, N as i32, N as i32]) }'] + - [int64x2_t, uint64x2_t, '6', 'const { int64x2_t([N as i64, N as i64]) }'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[2]}"]] + - LLVMLink: + name: "vqshlu{neon_type[0].N}" + arguments: + - "a: {neon_type[0]}" + - "n: {neon_type[0]}" + links: + - link: "llvm.arm.neon.vqshiftsu.{neon_type[0]}" + arch: arm + - FnCall: ["_vqshlu{neon_type[0].N}", [a, "{type[3]}"], [], true] + + - name: "vqshlu{neon_type[0].N}" + doc: "Signed saturating shift left unsigned" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [sqshlu, 'N = 2']]}]] + - FnCall: [rustc_legacy_const_generics, ['1']] + - *neon-stable + static_defs: ['const N: i32'] + safety: safe + types: + - [int8x8_t, uint8x8_t, '3', 'const { int8x8_t([N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8]) }'] + - [int16x4_t, uint16x4_t, '4', 'const { int16x4_t([N as i16, N as i16, N as i16, N as i16]) }'] + - [int32x2_t, uint32x2_t, '5', 'const { int32x2_t([N as i32, N as i32]) }'] + - [int64x1_t, uint64x1_t, '6', 'const { 
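One-lane model of the saturating "shift left unsigned" (vqshlu/sqshlu) defined above, as a sketch rather than the generated binding: a signed input is shifted left by the immediate N and the result is saturated into the unsigned range of the same width.

    fn qshlu_i8<const N: u32>(a: i8) -> u8 {
        let shifted = (a as i32) << N;
        shifted.clamp(0, u8::MAX as i32) as u8
    }

    fn main() {
        assert_eq!(qshlu_i8::<2>(3), 12);
        assert_eq!(qshlu_i8::<2>(100), 255); // overflow saturates to the top
        assert_eq!(qshlu_i8::<2>(-5), 0);    // negative inputs saturate to zero
    }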
int64x1_t([N as i64]) }'] + - [int8x16_t, uint8x16_t, '3', 'const { int8x16_t([N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8]) }'] + - [int16x8_t, uint16x8_t, '4', 'const { int16x8_t([N as i16, N as i16, N as i16, N as i16, N as i16, N as i16, N as i16, N as i16]) }'] + - [int32x4_t, uint32x4_t, '5', 'const { int32x4_t([N as i32, N as i32, N as i32, N as i32]) }'] + - [int64x2_t, uint64x2_t, '6', 'const { int64x2_t([N as i64, N as i64]) }'] + compose: + - FnCall: [static_assert_uimm_bits!, [N, "{type[2]}"]] + - LLVMLink: + name: "vqshlu{neon_type[0].N}" + arguments: + - "a: {neon_type[0]}" + - "n: {neon_type[0]}" + links: + - link: "llvm.aarch64.neon.sqshlu.{neon_type[0]}" + arch: aarch64,arm64ec + - FnCall: ["_vqshlu{neon_type[0].N}", [a, "{type[3]}"], [], true] + + - name: "vcvt{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to signed fixed-point, rounding toward zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vcvt]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcvtzs]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, int32x2_t] + - [float32x4_t, int32x4_t] + compose: + - LLVMLink: + name: "vcvt{neon_type[1].no}_{neon_type[0]}" + links: + - link: "llvm.fptosi.sat.{neon_type[1]}.{neon_type[0]}" + arch: arm + - link: "llvm.fptosi.sat.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + + + - name: "vcvt{neon_type[1].no}_{neon_type[0]}" + doc: "Floating-point convert to signed fixed-point, rounding toward zero" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vcvt]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fcvtzs]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, int16x4_t] + - [float16x8_t, int16x8_t] + compose: + - FnCall: + - simd_cast + - - a + + - name: "vqmovn_{neon_type[0]}" + doc: "Unsigned saturating extract narrow" + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vqmovn]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uqxtn]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint16x8_t, uint8x8_t] + - [uint32x4_t, uint16x4_t] + - [uint64x2_t, uint32x2_t] + compose: + - LLVMLink: + name: "vqmovn_{neon_type[1]}" + links: + - link: "llvm.arm.neon.vqmovnu.{neon_type[1]}" + arch: arm + - link: "llvm.aarch64.neon.uqxtn.{neon_type[1]}" + arch: aarch64,arm64ec + + - name: "vcle{neon_type.no}" + doc: "Compare unsigned less than or equal" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vcge.{neon_type}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [cmhs]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - uint8x8_t + - uint8x16_t + - uint16x4_t + - uint16x8_t + - uint32x2_t + - uint32x4_t + compose: + - FnCall: [simd_le, [a, b]] + + - name: "vld4{neon_type[1].dup_nox}" + doc: "Load single 4-element structure and replicate to all lanes of four registers" + 
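The uqxtn/vqmovn narrowing declared above clamps each wide lane into the narrow unsigned range before truncating; a per-vector sketch in plain Rust (illustrative helper, not the intrinsic):

    fn qmovn_u16_model(a: [u16; 8]) -> [u8; 8] {
        a.map(|x| x.min(u8::MAX as u16) as u8)
    }

    fn main() {
        let r = qmovn_u16_model([0, 255, 256, 1000, 65535, 1, 2, 3]);
        assert_eq!(r, [0, 255, 255, 255, 255, 1, 2, 3]);
    }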
arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [vld4]]}]] + - *neon-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const i8", int8x8x4_t, int8x8_t, '1'] + - ["*const i16", int16x4x4_t, int16x4_t, '2'] + - ["*const i32", int32x2x4_t, int32x2_t, '4'] + - ["*const i8", int8x16x4_t, int8x16_t, '1'] + - ["*const i16", int16x8x4_t, int16x8_t, '2'] + - ["*const i32", int32x4x4_t, int32x4_t, '4'] + - ["*const f32", float32x2x4_t, float32x2_t, '4'] + - ["*const f32", float32x4x4_t, float32x4_t, '4'] + compose: + - LLVMLink: + name: "vld4{neon_type[1].dup_nox}" + arguments: + - "ptr: *const i8" + - "size: i32" + links: + - link: "llvm.arm.neon.vld4dup.{neon_type[2]}.p0" + arch: arm + - FnCall: ["_vld4{neon_type[1].dup_nox}", ['a as *const i8', "{type[3]}"]] + + - name: "vld4{neon_type[1].dup_nox}" + doc: "Load single 4-element structure and replicate to all lanes of four registers" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - FnCall: [cfg, [{FnCall: [not, ['target_arch = "arm"']]}]] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ld4r]]}]] + - *neon-stable + safety: + unsafe: [neon] + types: + - ["*const i8", int8x8x4_t, int8x8_t] + - ["*const i16", int16x4x4_t, int16x4_t] + - ["*const i32", int32x2x4_t, int32x2_t] + - ["*const i8", int8x16x4_t, int8x16_t] + - ["*const i16", int16x8x4_t, int16x8_t] + - ["*const i32", int32x4x4_t, int32x4_t] + - ["*const i64", int64x1x4_t, int64x1_t] + - ["*const f32", float32x2x4_t, float32x2_t] + - ["*const f32", float32x4x4_t, float32x4_t] + compose: + - LLVMLink: + name: "vld4{neon_type[1].dup_nox}" + arguments: + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.ld4r.{neon_type[2]}.p0.p0" + arch: aarch64,arm64ec + - FnCall: ["_vld4{neon_type[1].dup_nox}", ['a as _']] + + - name: "vld4{neon_type[1].dup_nox}" + doc: "Load single 4-element structure and replicate to all lanes of four registers" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop]]}]] + - *neon-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const i64", int64x1x4_t] + compose: + - LLVMLink: + name: "vld4{neon_type[1].dup_nox}" + arguments: + - "ptr: *const i8" + - "size: i32" + links: + - link: "llvm.arm.neon.vld4dup.v1i64.p0" + arch: arm + - FnCall: ["_vld4{neon_type[1].dup_nox}", ['a as *const i8', '8']] + + - name: "vld4{neon_type[1].dup_nox}" + doc: "Load single 4-element structure and replicate to all lanes of four registers" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld4]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld4r]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const u8", uint8x8x4_t, int8x8x4_t] + - ["*const u16", uint16x4x4_t, int16x4x4_t] + - ["*const u32", uint32x2x4_t, int32x2x4_t] + - ["*const u8", uint8x16x4_t, int8x16x4_t] + - ["*const u16", uint16x8x4_t, int16x8x4_t] + - ["*const u32", uint32x4x4_t, int32x4x4_t] + - ["*const p8", poly8x8x4_t, int8x8x4_t] + - ["*const p16", poly16x4x4_t, int16x4x4_t] + - ["*const p8", poly8x16x4_t, int8x16x4_t] + - ["*const p16", poly16x8x4_t, int16x8x4_t] + compose: + - FnCall: + - "transmute" + - - FnCall: ["vld4{neon_type[2].dup_nox}", [{FnCall: [transmute, [a]]}]] + + - name: 
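Behavioural sketch of the "load single 4-element structure and replicate" loads above (a safe model over a fixed-size reference; the real intrinsics read raw pointers and are unsafe): the four consecutive source elements are each broadcast across one of the four result vectors.

    fn vld4_dup_model(src: &[i8; 4]) -> [[i8; 8]; 4] {
        [[src[0]; 8], [src[1]; 8], [src[2]; 8], [src[3]; 8]]
    }

    fn main() {
        let r = vld4_dup_model(&[1, 2, 3, 4]);
        assert_eq!(r[0], [1; 8]);
        assert_eq!(r[3], [4; 8]);
    }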
"vld4{neon_type[1].dup_nox}" + doc: "Load single 4-element structure and replicate to all lanes of four registers" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld4r]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const u64", uint64x1x4_t, int64x1x4_t] + compose: + - FnCall: + - "transmute" + - - FnCall: ["vld4{neon_type[2].dup_nox}", [{FnCall: [transmute, [a]]}]] + + - name: "vld4{neon_type[1].dup_nox}" + doc: "Load single 4-element structure and replicate to all lanes of four registers" + arguments: ["a: {type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-aes + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld4r]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ["*const p64", poly64x1x4_t, int64x1x4_t] + compose: + - FnCall: + - "transmute" + - - FnCall: ["vld4{neon_type[2].dup_nox}", [{FnCall: [transmute, [a]]}]] + + - name: "vld1{type[0]}" + visibility: private + doc: "Load multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[1]}", "b: {type[2]}"] + return_type: "{neon_type[3]}" + attr: + - *target-is-arm + - *enable-v7 + # - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld1]]}]] + - *neon-arm-unstable + safety: + unsafe: [neon] + types: + - ["_v8i8", "*const i8", "i32", "int8x8_t"] + - ["q_v16i8", "*const i8", "i32", "int8x16_t"] + - ["_v4i16", "*const i8", "i32", "int16x4_t"] + - ["q_v8i16", "*const i8", "i32", "int16x8_t"] + - ["_v2i32", "*const i8", "i32", "int32x2_t"] + - ["q_v4i32", "*const i8", "i32", "int32x4_t"] + - ["_v1i64", "*const i8", "i32", "int64x1_t"] + - ["q_v2i64", "*const i8", "i32", "int64x2_t"] + - ["_v2f32", "*const i8", "i32", "float32x2_t"] + - ["q_v4f32", "*const i8", "i32", "float32x4_t"] + compose: + - LLVMLink: + name: "vld1.{type[0]}" + links: + - link: "llvm.arm.neon.vld1.{neon_type[3]}" + arch: arm + - FnCall: ["_vld1{type[0]}", [a, b]] + + + - name: "vld1{type[0]}" + visibility: private + doc: "Load multiple single-element structures to one, two, three, or four registers" + arguments: ["a: {type[1]}", "b: {type[2]}"] + return_type: "{neon_type[3]}" + attr: + - *target-is-arm + - *enable-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["_v4f16", "*const i8", "i32", "float16x4_t"] + - ["q_v8f16", "*const i8", "i32", "float16x8_t"] + compose: + - LLVMLink: + name: "vld1.{type[0]}" + links: + - link: "llvm.arm.neon.vld1.{neon_type[3]}" + arch: arm + - FnCall: ["_vld1{type[0]}", [a, b]] + + + - name: "vld1{neon_type[1].no}" + doc: "Load multiple single-element structures to one, two, three, or four registers." 
+ arguments: ["ptr: {type[0]}"] + return_type: "{neon_type[1]}" + safety: + unsafe: [neon] + attr: + - *target-is-arm + - *enable-v7 + - *neon-arm-unstable + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['{type[2]}']]}]] + types: + - ['*const i8', int8x8_t, '"vld1.8"', 'crate::mem::align_of::() as i32', '_v8i8'] + - ['*const i8', int8x16_t, '"vld1.8"', 'crate::mem::align_of::() as i32', 'q_v16i8'] + - ['*const i16', int16x4_t, '"vld1.16"', 'crate::mem::align_of::() as i32', '_v4i16'] + - ['*const i16', int16x8_t, '"vld1.16"', 'crate::mem::align_of::() as i32', 'q_v8i16'] + - ['*const i32', int32x2_t, 'vldr', 'crate::mem::align_of::() as i32', '_v2i32'] + - ['*const i32', int32x4_t, '"vld1.32"', 'crate::mem::align_of::() as i32', 'q_v4i32'] + - ['*const i64', int64x1_t, 'vldr', 'crate::mem::align_of::() as i32', '_v1i64'] + - ['*const i64', int64x2_t, '"vld1.64"', 'crate::mem::align_of::() as i32', 'q_v2i64'] + compose: + - FnCall: + - "vld1{type[4]}" + - - 'ptr as *const i8' + - '{type[3]}' + + - name: "vld1{neon_type[1].no}" + doc: "Load multiple single-element structures to one, two, three, or four registers." + arguments: ["ptr: {type[0]}"] + return_type: "{neon_type[1]}" + safety: + unsafe: [neon] + attr: + - *target-is-arm + - FnCall: [target_feature, ['enable = "{type[3]}"']] + - *neon-arm-unstable + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['{type[2]}']]}]] + types: + - ['*const u8', uint8x8_t, '"vld1.8"', 'neon,v7', 'crate::mem::align_of::() as i32', '_v8i8'] + - ['*const u8', uint8x16_t, '"vld1.8"', 'neon,v7', 'crate::mem::align_of::() as i32', 'q_v16i8'] + - ['*const u16', uint16x4_t, '"vld1.16"', 'neon,v7', 'crate::mem::align_of::() as i32', '_v4i16'] + - ['*const u16', uint16x8_t, '"vld1.16"', 'neon,v7', 'crate::mem::align_of::() as i32', 'q_v8i16'] + - ['*const u32', uint32x2_t, 'vldr', 'neon,v7', 'crate::mem::align_of::() as i32', '_v2i32'] + - ['*const u32', uint32x4_t, '"vld1.32"', 'neon,v7', 'crate::mem::align_of::() as i32', 'q_v4i32'] + - ['*const u64', uint64x1_t, 'vldr', 'neon,v7', 'crate::mem::align_of::() as i32', '_v1i64'] + - ['*const u64', uint64x2_t, '"vld1.64"', 'neon,v7', 'crate::mem::align_of::() as i32', 'q_v2i64'] + - ['*const p8', poly8x8_t, '"vld1.8"', 'neon,v7', 'crate::mem::align_of::() as i32', '_v8i8'] + - ['*const p8', poly8x16_t, '"vld1.8"', 'neon,v7', 'crate::mem::align_of::() as i32', 'q_v16i8'] + - ['*const p16', poly16x4_t, '"vld1.16"', 'neon,v7', 'crate::mem::align_of::() as i32', '_v4i16'] + - ['*const p16', poly16x8_t, '"vld1.16"', 'neon,v7', 'crate::mem::align_of::() as i32', 'q_v8i16'] + - ['*const p64', poly64x2_t, '"vld1.64"', 'neon,aes', 'crate::mem::align_of::() as i32', 'q_v2i64'] + - ['*const f32', float32x2_t, 'vldr', 'neon,v7', 'crate::mem::align_of::() as i32', '_v2f32'] + - ['*const f32', float32x4_t, '"vld1.32"', 'neon,v7', 'crate::mem::align_of::() as i32', 'q_v4f32'] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld1{type[5]}" + - - 'ptr as *const i8' + - '{type[4]}' + + - name: "vld1{neon_type[1].no}" + doc: "Load multiple single-element structures to one, two, three, or four registers." 
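The second argument handed to the llvm.arm.neon.vld1 bindings above is the element alignment in bytes, written in the spec as a `crate::mem::align_of` turbofish over the element type (the `<...>` part appears to have been lost in this rendering). The values that expression produces are just the usual primitive alignments:

    fn main() {
        assert_eq!(core::mem::align_of::<i8>() as i32, 1);
        assert_eq!(core::mem::align_of::<i16>() as i32, 2);
        assert_eq!(core::mem::align_of::<i32>() as i32, 4);
        assert_eq!(core::mem::align_of::<f32>() as i32, 4);
    }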
+ arguments: ["ptr: {type[0]}"] + return_type: "{neon_type[1]}" + safety: + unsafe: [neon] + attr: + - *target-is-arm + - FnCall: [target_feature, ['enable = "{type[3]}"']] + - *neon-fp16 + - *neon-unstable-f16 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['{type[2]}']]}]] + types: + - ['*const f16', float16x4_t, '"vld1.16"', 'neon,v7', 'crate::mem::align_of::() as i32', '_v4f16'] + - ['*const f16', float16x8_t, '"vld1.16"', 'neon,v7', 'crate::mem::align_of::() as i32', 'q_v8f16'] + compose: + - FnCall: + - transmute + - - FnCall: + - "vld1{type[5]}" + - - 'ptr as *const i8' + - '{type[4]}' + + - name: "vld1{neon_type[1].no}" + doc: "Load multiple single-element structures to one, two, three, or four registers." + arguments: ["ptr: {type[0]}"] + return_type: "{neon_type[1]}" + safety: + unsafe: [neon] + attr: + - *target-is-arm + - *neon-aes + - *neon-arm-unstable + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['vldr']]}]] + types: + - ['*const p64', poly64x1_t] + compose: + # Inlining seems broken for 'fn vld1_v1i64', this "fixes" it + - Let: [a, '*const i8', 'ptr as *const i8'] + - Let: [b, i32, 'crate::mem::align_of::() as i32'] + - 'unsafe extern "unadjusted" {{ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v1i64")] fn _vld1_v1i64(a: *const i8, b: i32) -> int64x1_t; }} transmute(_vld1_v1i64(a, b))' + + - name: "vtbx1" + visibility: private + doc: "Extended table look-up" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vtbx] + safety: safe + types: + - "int8x8_t" + compose: + - LLVMLink: + name: "vtbx1" + links: + - link: "llvm.arm.neon.vtbx1" + arch: arm + + - name: "vtbx1_s8" + doc: "Extended table look-up" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vtbx] + safety: safe + types: + - int8x8_t + compose: + - FnCall: [vtbx1, [a, b, c]] + + - name: "vtbx1{neon_type.no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: uint8x8_t"] + return_type: "{neon_type}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vtbx] + safety: safe + types: + - uint8x8_t + - poly8x8_t + compose: + - FnCall: + - transmute + - - FnCall: + - vtbx1 + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + - FnCall: [transmute, [c]] + + - name: "vtbx2" + visibility: private + doc: "Extended table look-up" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}", "d: {neon_type}"] + return_type: "{neon_type}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vtbx] + safety: safe + types: + - "int8x8_t" + compose: + - LLVMLink: + name: "vtbx2" + links: + - link: "llvm.arm.neon.vtbx2" + arch: arm + + - name: "vtbx2_s8" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vtbx] + safety: safe + types: + - [int8x8_t, int8x8x2_t] + compose: + - FnCall: [vtbx2, [a, 'b.0', 'b.1', c]] + + - name: "vtbx2{neon_type[0].no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + 
assert_instr: [vtbx] + safety: safe + types: + - [uint8x8_t, uint8x8x2_t, uint8x8_t] + - [poly8x8_t, poly8x8x2_t, uint8x8_t] + compose: + - FnCall: + - transmute + - - FnCall: + - vtbx2 + - - FnCall: [transmute, [a]] + - FnCall: [transmute, ['b.0']] + - FnCall: [transmute, ['b.1']] + - FnCall: [transmute, [c]] + + - name: "vtbx3" + visibility: private + doc: "Extended table look-up" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}", "d: {neon_type}", "e: {neon_type}"] + return_type: "{neon_type}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vtbx] + safety: safe + types: + - "int8x8_t" + compose: + - LLVMLink: + name: "vtbx3" + links: + - link: "llvm.arm.neon.vtbx3" + arch: arm + + - name: "vtbx3_s8" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vtbx] + safety: safe + types: + - [int8x8_t, int8x8x3_t] + compose: + - FnCall: [vtbx3, [a, 'b.0', 'b.1', 'b.2', c]] + + - name: "vtbx3{neon_type[0].no}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vtbx] + safety: safe + types: + - [uint8x8_t, uint8x8x3_t, uint8x8_t] + - [poly8x8_t, poly8x8x3_t, uint8x8_t] + compose: + - FnCall: + - transmute + - - FnCall: + - vtbx3 + - - FnCall: [transmute, [a]] + - FnCall: [transmute, ['b.0']] + - FnCall: [transmute, ['b.1']] + - FnCall: [transmute, ['b.2']] + - FnCall: [transmute, [c]] + + - name: "vtbx4" + visibility: private + doc: "Extended table look-up" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}", "d: {neon_type}", "e: {neon_type}", "f: {neon_type}"] + return_type: "{neon_type}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vtbx] + safety: safe + types: + - "int8x8_t" + compose: + - LLVMLink: + name: "vtbx4" + links: + - link: "llvm.arm.neon.vtbx4" + arch: arm + + - name: "vtbx4{neon_type[0].noq}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vtbx] + safety: safe + types: + - ["uint8x8_t", "uint8x8x4_t", "uint8x8_t"] + - ["poly8x8_t", "poly8x8x4_t", "uint8x8_t"] + compose: + - FnCall: + - "transmute" + - - FnCall: + - vtbx4 + - - FnCall: [transmute, [a]] + - FnCall: [transmute, ["b.0"]] + - FnCall: [transmute, ["b.1"]] + - FnCall: [transmute, ["b.2"]] + - FnCall: [transmute, ["b.3"]] + - FnCall: [transmute, [c]] + + - name: "vtbx4{neon_type[0].noq}" + doc: "Extended table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *enable-v7 + - *target-is-arm + - *neon-arm-unstable + assert_instr: [vtbx] + safety: safe + types: + - ["int8x8_t", "int8x8x4_t"] + big_endian_inverse: true + compose: + - FnCall: + - vtbx4 + - - a + - FnCall: [transmute, ["b.0"]] + - FnCall: [transmute, ["b.1"]] + - FnCall: [transmute, ["b.2"]] + - FnCall: [transmute, ["b.3"]] + - c + + - name: "vld4{neon_type[1].nox}" + doc: Load single 4-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-v7 + - *target-is-arm + - FnCall: [cfg_attr, 
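Behavioural model of the vtbx extended table lookups above (one 8-byte table; the vtbx2/3/4 forms just widen the table). In-range indexes in `c` select a table byte, while out-of-range indexes leave the corresponding byte of the accumulator `a` unchanged, which is what distinguishes vtbx from vtbl. The helper name is illustrative only.

    fn vtbx1_model(a: [u8; 8], table: [u8; 8], c: [u8; 8]) -> [u8; 8] {
        let mut out = a;
        for i in 0..8 {
            if (c[i] as usize) < table.len() {
                out[i] = table[c[i] as usize];
            }
        }
        out
    }

    fn main() {
        let r = vtbx1_model([9; 8], [10, 11, 12, 13, 14, 15, 16, 17], [0, 7, 8, 255, 1, 2, 3, 4]);
        assert_eq!(r, [10, 17, 9, 9, 11, 12, 13, 14]);
    }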
[*test-is-arm, {FnCall: [assert_instr, [vld4]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x4_t, f16] + - ["*const f16", float16x8x4_t, f16] + compose: + - LLVMLink: + name: "vld4.{neon_type[1]}" + arguments: + - "ptr: {type[0]}" + - "size: i32" + links: + - link: "llvm.arm.neon.vld4.v{neon_type[1].lane}{type[2]}.p0" + arch: arm + - FnCall: + - "_vld4{neon_type[1].nox}" + - - "a as _" + - "2" + + - name: "vld4{neon_type[1].nox}" + doc: Load single 4-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *target-not-arm + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld4]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x4_t, f16] + - ["*const f16", float16x8x4_t, f16] + compose: + - LLVMLink: + name: "vld4.{neon_type[1]}" + arguments: + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld4{neon_type[1].nox}" + - - "a as _" + + - name: "vld4{neon_type[1].dup_nox}" + doc: Load single 4-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-v7 + - *target-is-arm + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vld4]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x4_t, f16] + - ["*const f16", float16x8x4_t, f16] + compose: + - LLVMLink: + name: "vld4dup.{neon_type[1]}" + arguments: + - "ptr: {type[0]}" + - "size: i32" + links: + - link: "llvm.arm.neon.vld4dup.v{neon_type[1].lane}{type[2]}.p0" + arch: arm + - FnCall: + - "_vld4{neon_type[1].dup_nox}" + - - "a as _" + - "2" + + + - name: "vld4{neon_type[1].dup_nox}" + doc: Load single 4-element structure and replicate to all lanes of two registers + arguments: ["a: {type[0]}"] + return_type: "{type[1]}" + attr: + - *target-not-arm + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld4r]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x4_t, f16] + - ["*const f16", float16x8x4_t, f16] + compose: + - LLVMLink: + name: "vld4dup.{neon_type[1]}" + arguments: + - "ptr: {type[0]}" + links: + - link: "llvm.aarch64.neon.ld4r.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld4{neon_type[1].dup_nox}" + - - "a as _" + + + - name: "vld4{neon_type[1].lane_nox}" + doc: Load multiple 4-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *enable-v7 + - *target-is-arm + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['vld4', 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x4_t, f16, float16x4_t, "2"] + - ["*const f16", float16x8x4_t, f16, float16x8_t, "3"] + compose: + - FnCall: + - "static_assert_uimm_bits!" 
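Sketch of the lane-wise ld4 loads defined here: four consecutive source elements replace lane LANE of each of the four input vectors, and every other lane passes through unchanged. Modelled with f32 lanes because `f16` is not a stable Rust type; the shape is the same.

    fn vld4_lane_model<const LANE: usize>(src: &[f32; 4], mut b: [[f32; 4]; 4]) -> [[f32; 4]; 4] {
        for reg in 0..4 {
            b[reg][LANE] = src[reg];
        }
        b
    }

    fn main() {
        let r = vld4_lane_model::<2>(&[9.0, 8.0, 7.0, 6.0], [[0.0; 4]; 4]);
        assert_eq!(r[0], [0.0, 0.0, 9.0, 0.0]);
        assert_eq!(r[3], [0.0, 0.0, 6.0, 0.0]);
    }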
+ - - LANE + - "{type[4]}" + - LLVMLink: + name: "vld4.{neon_type[1]}" + arguments: + - "ptr: *const f16" + - "a: {neon_type[3]}" + - "b: {neon_type[3]}" + - "c: {neon_type[3]}" + - "d: {neon_type[3]}" + - "n: i32" + - "size: i32" + links: + - link: "llvm.arm.neon.vld4lane.v{neon_type[1].lane}{type[2]}.p0" + arch: arm + - FnCall: + - "_vld4{neon_type[1].lane_nox}" + - - "a as _" + - "b.0" + - "b.1" + - "b.2" + - "b.3" + - "LANE" + - "2" + + + - name: "vld4{neon_type[1].lane_nox}" + doc: Load multiple 4-element structures to two registers + arguments: ["a: {type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *target-not-arm + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld4, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["2"]] + - *neon-fp16 + - *neon-unstable-f16 + static_defs: + - "const LANE: i32" + safety: + unsafe: [neon] + types: + - ["*const f16", float16x4x4_t, f16, float16x4_t, "2"] + - ["*const f16", float16x8x4_t, f16, float16x8_t, "3"] + compose: + - FnCall: + - "static_assert_uimm_bits!" + - - LANE + - "{type[4]}" + - LLVMLink: + name: "vld4.{neon_type[1]}" + arguments: + - "a: {neon_type[3]}" + - "b: {neon_type[3]}" + - "c: {neon_type[3]}" + - "d: {neon_type[3]}" + - "n: i64" + - "ptr: *const f16" + links: + - link: "llvm.aarch64.neon.ld4lane.v{neon_type[1].lane}{type[2]}.p0" + arch: aarch64,arm64ec + - FnCall: + - "_vld4{neon_type[1].lane_nox}" + - - "b.0" + - "b.1" + - "b.2" + - "b.3" + - "LANE as i64" + - "a as _" + + - name: "vcombine{neon_type[0].noq}" + doc: Join two smaller vectors into a single larger vector + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [float32x2_t, float32x4_t, '[0, 1, 2, 3]'] + - [poly8x8_t, poly8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [poly16x4_t, poly16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [int8x8_t, int8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [int16x4_t, int16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [int32x2_t, int32x4_t, '[0, 1, 2, 3]'] + - [int64x1_t, int64x2_t, '[0, 1]'] + - [uint8x8_t, uint8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint16x4_t, uint16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [uint32x2_t, uint32x4_t, '[0, 1, 2, 3]'] + - [uint64x1_t, uint64x2_t, '[0, 1]'] + - [poly64x1_t, poly64x2_t, '[0, 1]'] + compose: + - FnCall: [simd_shuffle!, [a, b, '{type[2]}']] + + - name: "vaeseq_u8" + doc: "AES single round encryption." + arguments: ["data: {neon_type}", "key: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "aes"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, [aese]] }]] + - *neon-cfg-arm-unstable + - FnCall: [cfg_attr, [*not-arm, { FnCall: [stable, ['feature = "aarch64_neon_crypto_intrinsics"', 'since = "1.72.0"']] }]] + safety: safe + types: + - uint8x16_t + compose: + - LLVMLink: + name: "vaeseq_u8" + links: + - link: "llvm.aarch64.crypto.aese" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.aese" + arch: arm + + - name: "vaesdq_u8" + doc: "AES single round encryption." 
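vcombine above is nothing more than a concatenating shuffle: the index list [0, 1, ..., 2N-1] takes the low half from `a` and the high half from `b`. A plain-Rust equivalent for the 2-lane case (a sketch, not the generated intrinsic):

    fn vcombine_model(a: [f32; 2], b: [f32; 2]) -> [f32; 4] {
        [a[0], a[1], b[0], b[1]] // simd_shuffle!(a, b, [0, 1, 2, 3])
    }

    fn main() {
        assert_eq!(vcombine_model([1.0, 2.0], [3.0, 4.0]), [1.0, 2.0, 3.0, 4.0]);
    }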
+ arguments: ["data: {neon_type}", "key: {neon_type}"] + return_type: "{neon_type}" + attr: + - FnCall: [target_feature, ['enable = "aes"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, [aesd]] }]] + - *neon-cfg-arm-unstable + - FnCall: [cfg_attr, [*not-arm, { FnCall: [stable, ['feature = "aarch64_neon_crypto_intrinsics"', 'since = "1.72.0"']] }]] + safety: safe + types: + - uint8x16_t + compose: + - LLVMLink: + name: "vaesdq_u8" + links: + - link: "llvm.aarch64.crypto.aesd" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.aesd" + arch: arm + + - name: "vaesmcq_u8" + doc: "AES mix columns." + arguments: ["data: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "aes"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["{type[1]}"]] }]] + - *neon-cfg-arm-unstable + - FnCall: [cfg_attr, [*not-arm, { FnCall: [stable, ['feature = "aarch64_neon_crypto_intrinsics"', 'since = "1.72.0"']] }]] + safety: safe + types: + - [uint8x16_t, "aesmc"] + compose: + - LLVMLink: + name: "vaesmcq_u8" + links: + - link: "llvm.aarch64.crypto.{type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.{type[1]}" + arch: arm + + - name: "vaesimcq_u8" + doc: "AES inverse mix columns." + arguments: ["data: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "aes"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["{type[1]}"]] }]] + - *neon-cfg-arm-unstable + - FnCall: [cfg_attr, [*not-arm, { FnCall: [stable, ['feature = "aarch64_neon_crypto_intrinsics"', 'since = "1.72.0"']] }]] + safety: safe + types: + - [uint8x16_t, "aesimc"] + compose: + - LLVMLink: + name: "vaesimcq_u8" + links: + - link: "llvm.aarch64.crypto.{type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.{type[1]}" + arch: arm + + - name: "vsha1h_u32" + doc: "SHA1 fixed rotate." + arguments: ["hash_e: {type[0]}"] + return_type: "{type[0]}" + attr: + - FnCall: [target_feature, ['enable = "sha2"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["{type[1]}"]] }]] + - *neon-cfg-arm-unstable + - FnCall: [cfg_attr, [*not-arm, { FnCall: [stable, ['feature = "aarch64_neon_crypto_intrinsics"', 'since = "1.72.0"']] }]] + safety: safe + types: + - [u32, "sha1h"] + compose: + - LLVMLink: + name: "vsha1h_u32" + links: + - link: "llvm.aarch64.crypto.{type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.{type[1]}" + arch: arm + + - name: "vsha1cq_u32" + doc: "SHA1 hash update accelerator, choose." 
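Of the SHA-1 helpers declared here, vsha1h_u32 is the simplest: the fixed rotate used between SHA-1 rounds. A plain-Rust model of the operation (not a call into the intrinsic), assuming the usual rotate-left-by-30 definition:

    fn sha1h_model(hash_e: u32) -> u32 {
        hash_e.rotate_left(30)
    }

    fn main() {
        assert_eq!(sha1h_model(0x8000_0000), 0x2000_0000);
        assert_eq!(sha1h_model(1), 1u32.rotate_left(30));
    }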
+ arguments: ["hash_abcd: {neon_type[2]}", "hash_e: {type[0]}", "wk: {neon_type[2]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [target_feature, ['enable = "sha2"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["{type[1]}"]] }]] + - *neon-cfg-arm-unstable + - FnCall: [cfg_attr, [*not-arm, { FnCall: [stable, ['feature = "aarch64_neon_crypto_intrinsics"', 'since = "1.72.0"']] }]] + safety: safe + types: + - [u32, "sha1c", "uint32x4_t"] + compose: + - LLVMLink: + name: "vsha1cq_u32" + links: + - link: "llvm.aarch64.crypto.{type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.{type[1]}" + arch: arm + + - name: "vsha1mq_u32" + doc: "SHA1 hash update accelerator, majority" + arguments: ["hash_abcd: {neon_type[2]}", "hash_e: {type[0]}", "wk: {neon_type[2]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [target_feature, ['enable = "sha2"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["{type[1]}"]] }]] + - *neon-cfg-arm-unstable + - FnCall: [cfg_attr, [*not-arm, { FnCall: [stable, ['feature = "aarch64_neon_crypto_intrinsics"', 'since = "1.72.0"']] }]] + safety: safe + types: + - [u32, "sha1m", "uint32x4_t"] + compose: + - LLVMLink: + name: "vsha1mq_u32" + links: + - link: "llvm.aarch64.crypto.{type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.{type[1]}" + arch: arm + + - name: "vsha1pq_u32" + doc: "SHA1 hash update accelerator, parity" + arguments: ["hash_abcd: {neon_type[2]}", "hash_e: {type[0]}", "wk: {neon_type[2]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [target_feature, ['enable = "sha2"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["{type[1]}"]] }]] + - *neon-cfg-arm-unstable + - FnCall: [cfg_attr, [*not-arm, { FnCall: [stable, ['feature = "aarch64_neon_crypto_intrinsics"', 'since = "1.72.0"']] }]] + safety: safe + types: + - [u32, "sha1p", "uint32x4_t"] + compose: + - LLVMLink: + name: "vsha1pq_u32" + links: + - link: "llvm.aarch64.crypto.{type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.{type[1]}" + arch: arm + + - name: "vsha1su0q_u32" + doc: "SHA1 schedule update accelerator, first part." + arguments: ["w0_3: {neon_type[0]}", "w4_7: {neon_type[0]}", "w8_11: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "sha2"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["{type[1]}"]] }]] + - *neon-cfg-arm-unstable + - FnCall: [cfg_attr, [*not-arm, { FnCall: [stable, ['feature = "aarch64_neon_crypto_intrinsics"', 'since = "1.72.0"']] }]] + safety: safe + types: + - [uint32x4_t, "sha1su0"] + compose: + - LLVMLink: + name: "vsha1su0q_u32" + links: + - link: "llvm.aarch64.crypto.{type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.{type[1]}" + arch: arm + + - name: "vsha1su1q_u32" + doc: "SHA1 schedule update accelerator, second part." 
+ arguments: ["tw0_3: {neon_type[0]}", "w12_15: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "sha2"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["{type[1]}"]] }]] + - *neon-cfg-arm-unstable + - FnCall: [cfg_attr, [*not-arm, { FnCall: [stable, ['feature = "aarch64_neon_crypto_intrinsics"', 'since = "1.72.0"']] }]] + safety: safe + types: + - [uint32x4_t, "sha1su1"] + compose: + - LLVMLink: + name: "vsha1su0q_u32" + links: + - link: "llvm.aarch64.crypto.{type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.{type[1]}" + arch: arm + + - name: "vsha256hq_u32" + doc: "SHA1 schedule update accelerator, first part." + arguments: ["hash_abcd: {neon_type[0]}", "hash_efgh: {neon_type[0]}", "wk: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "sha2"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["{type[1]}"]] }]] + - *neon-cfg-arm-unstable + - FnCall: [cfg_attr, [*not-arm, { FnCall: [stable, ['feature = "aarch64_neon_crypto_intrinsics"', 'since = "1.72.0"']] }]] + safety: safe + types: + - [uint32x4_t, "sha256h"] + compose: + - LLVMLink: + name: "vsha256hq_u32" + links: + - link: "llvm.aarch64.crypto.{type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.{type[1]}" + arch: arm + + - name: "vsha256h2q_u32" + doc: "SHA1 schedule update accelerator, upper part." + arguments: ["hash_abcd: {neon_type[0]}", "hash_efgh: {neon_type[0]}", "wk: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "sha2"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["{type[1]}"]] }]] + - *neon-cfg-arm-unstable + - FnCall: [cfg_attr, [*not-arm, { FnCall: [stable, ['feature = "aarch64_neon_crypto_intrinsics"', 'since = "1.72.0"']] }]] + safety: safe + types: + - [uint32x4_t, "sha256h2"] + compose: + - LLVMLink: + name: "vsha256h2q_u32" + links: + - link: "llvm.aarch64.crypto.{type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.{type[1]}" + arch: arm + + - name: "vsha256su0q_u32" + doc: "SHA256 schedule update accelerator, first part." + arguments: ["w0_3: {neon_type[0]}", "w4_7: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "sha2"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["{type[1]}"]] }]] + - *neon-cfg-arm-unstable + - FnCall: [cfg_attr, [*not-arm, { FnCall: [stable, ['feature = "aarch64_neon_crypto_intrinsics"', 'since = "1.72.0"']] }]] + safety: safe + types: + - [uint32x4_t, "sha256su0"] + compose: + - LLVMLink: + name: "vsha256su0q_u32" + links: + - link: "llvm.aarch64.crypto.{type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.{type[1]}" + arch: arm + + - name: "vsha256su1q_u32" + doc: "SHA256 schedule update accelerator, second part." 
+ arguments: ["tw0_3: {neon_type[0]}", "w8_11: {neon_type[0]}", "w12_15: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [target_feature, ['enable = "sha2"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["{type[1]}"]] }]] + - *neon-cfg-arm-unstable + - FnCall: [cfg_attr, [*not-arm, { FnCall: [stable, ['feature = "aarch64_neon_crypto_intrinsics"', 'since = "1.72.0"']] }]] + safety: safe + types: + - [uint32x4_t, "sha256su1"] + compose: + - LLVMLink: + name: "vsha256su1q_u32" + links: + - link: "llvm.aarch64.crypto.{type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.{type[1]}" + arch: arm + + - name: "__crc32b" + doc: "CRC32 single round checksum for bytes (8 bits)." + arguments: ["crc: {type[0]}", "data: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [target_feature, ['enable = "crc"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["crc32b"]] }]] + - *arm-crc-unstable + - *aarch64-crc-stable + safety: safe + types: + - [u32, u8] + compose: + - LLVMLink: + name: "crc32b" + arguments: + - "crc: u32" + - "data: u32" + links: + - link: "llvm.aarch64.crc32b" + arch: aarch64,arm64ec + - link: "llvm.arm.crc32b" + arch: arm + - FnCall: ["___crc32b", ["crc", "data as u32"], [], true] + + - name: "__crc32h" + doc: "CRC32 single round checksum for bytes (16 bits)." + arguments: ["crc: {type[0]}", "data: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [target_feature, ['enable = "crc"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["crc32h"]] }]] + - *arm-crc-unstable + - *aarch64-crc-stable + safety: safe + types: + - [u32, u16] + compose: + - LLVMLink: + name: "crc32h" + arguments: + - "crc: u32" + - "data: u32" + links: + - link: "llvm.aarch64.crc32h" + arch: aarch64,arm64ec + - link: "llvm.arm.crc32h" + arch: arm + - FnCall: ["___crc32h", ["crc", "data as u32"], [], true] + + - name: "__crc32w" + doc: "CRC32 single round checksum for bytes (32 bits)." + arguments: ["crc: {type}", "data: {type}"] + return_type: "{type}" + attr: + - FnCall: [target_feature, ['enable = "crc"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["crc32w"]] }]] + - *arm-crc-unstable + - *aarch64-crc-stable + safety: safe + types: + - u32 + compose: + - LLVMLink: + name: "crc32w" + links: + - link: "llvm.aarch64.crc32w" + arch: aarch64,arm64ec + - link: "llvm.arm.crc32w" + arch: arm + + - name: "__crc32cb" + doc: "CRC32-C single round checksum for bytes (8 bits)." + arguments: ["crc: {type[0]}", "data: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [target_feature, ['enable = "crc"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["crc32cb"]] }]] + - *arm-crc-unstable + - *aarch64-crc-stable + safety: safe + types: + - [u32, u8] + compose: + - LLVMLink: + name: "crc32cb" + arguments: + - "crc: u32" + - "data: u32" + links: + - link: "llvm.aarch64.crc32cb" + arch: aarch64,arm64ec + - link: "llvm.arm.crc32cb" + arch: arm + - FnCall: ["___crc32cb", ["crc", "data as u32"], [], true] + + - name: "__crc32ch" + doc: "CRC32-C single round checksum for bytes (16 bits)." 
+ arguments: ["crc: {type[0]}", "data: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [target_feature, ['enable = "crc"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["crc32ch"]] }]] + - *arm-crc-unstable + - *aarch64-crc-stable + safety: safe + types: + - [u32, u16] + compose: + - LLVMLink: + name: "crc32ch" + arguments: + - "crc: u32" + - "data: u32" + links: + - link: "llvm.aarch64.crc32ch" + arch: aarch64,arm64ec + - link: "llvm.arm.crc32ch" + arch: arm + - FnCall: ["___crc32ch", ["crc", "data as u32"], [], true] + + - name: "__crc32cw" + doc: "CRC32-C single round checksum for bytes (32 bits)." + arguments: ["crc: {type}", "data: {type}"] + return_type: "{type}" + attr: + - FnCall: [target_feature, ['enable = "crc"']] + - *neon-v8 + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["crc32cw"]] }]] + - *arm-crc-unstable + - *aarch64-crc-stable + safety: safe + types: + - u32 + compose: + - LLVMLink: + name: "crc32cw" + links: + - link: "llvm.aarch64.crc32cw" + arch: aarch64,arm64ec + - link: "llvm.arm.crc32cw" + arch: arm + + - name: "__crc32d" + doc: "CRC32 single round checksum for quad words (64 bits)." + arguments: ["crc: {type[0]}", "data: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [target_feature, ['enable = "crc"']] + - *target-is-arm + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["crc32w"]] }]] + - *arm-crc-unstable + safety: safe + types: + - [u32, u64] + compose: + # As the call to `__crc32` does not get inlined, we define an LLVM binding + # here, which is the same as above, and call it directly which results + # in the correct instructions being generated + - Let: [b, u32, '(data & 0xFFFFFFFF) as u32'] + - Let: [c, u32, '(data >> 32) as u32'] + - 'unsafe extern "unadjusted" {{ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32w")] fn ___crc32w(crc: u32, data: u32) -> u32;}} unsafe {{ ___crc32w(___crc32w(crc, b), c) }}' + + - name: "__crc32cd" + doc: "CRC32-C single round checksum for quad words (64 bits)." + arguments: ["crc: {type[0]}", "data: {type[1]}"] + return_type: "{type[0]}" + attr: + - FnCall: [target_feature, ['enable = "crc"']] + - *target-is-arm + - FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["crc32cw"]] }]] + - *arm-crc-unstable + safety: safe + types: + - [u32, u64] + compose: + - Let: [b, u32, '(data & 0xFFFFFFFF) as u32'] + - Let: [c, u32, '(data >> 32) as u32'] + - 'unsafe extern "unadjusted" {{ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32cw")] fn ___crc32cw(crc: u32, data: u32) -> u32;}} unsafe {{ ___crc32cw(___crc32cw(crc, b), c) }}' + + - name: "vabs{neon_type.no}" + doc: "Absolute value (wrapping)." 
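__crc32d and __crc32cd above have no 64-bit AArch32 instruction, so the spec splits the doubleword into its low and high words and chains two 32-bit updates. A software sketch of why that works, using a bitwise CRC-32 with the reflected polynomial 0xEDB88320 as a stand-in for the hardware instruction: a CRC update is a byte-stream fold, and the little-endian bytes of the doubleword are exactly the low word's bytes followed by the high word's bytes.

    fn crc32_update(mut crc: u32, data: &[u8]) -> u32 {
        for &byte in data {
            crc ^= byte as u32;
            for _ in 0..8 {
                crc = if crc & 1 != 0 { (crc >> 1) ^ 0xEDB8_8320 } else { crc >> 1 };
            }
        }
        crc
    }

    fn crc32w(crc: u32, data: u32) -> u32 {
        crc32_update(crc, &data.to_le_bytes())
    }

    fn main() {
        let (crc, data) = (0xFFFF_FFFFu32, 0x0123_4567_89AB_CDEFu64);
        let b = (data & 0xFFFF_FFFF) as u32; // low word, as in the spec
        let c = (data >> 32) as u32;         // high word
        assert_eq!(crc32_update(crc, &data.to_le_bytes()), crc32w(crc32w(crc, b), c));
    }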
+ arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vabs]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [abs]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int16x4_t + - int32x2_t + - int8x16_t + - int16x8_t + - int32x4_t + compose: + - LLVMLink: + name: "vabs{neon_type.no}" + links: + - link: "llvm.aarch64.neon.abs.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vabs.{neon_type}" + arch: arm + + - name: "vpmin{neon_type.no}" + doc: "Folding minimum of adjacent pairs" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vpmin]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sminp]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int16x4_t + - int32x2_t + compose: + - LLVMLink: + name: "vabs{neon_type.no}" + links: + - link: "llvm.aarch64.neon.sminp.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vpmins.{neon_type}" + arch: arm + + - name: "vpmin{neon_type.no}" + doc: "Folding minimum of adjacent pairs" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vpmin]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uminp]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - uint8x8_t + - uint16x4_t + - uint32x2_t + compose: + - LLVMLink: + name: "vabs{neon_type.no}" + links: + - link: "llvm.aarch64.neon.uminp.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vpminu.{neon_type}" + arch: arm + + - name: "vpmin{neon_type.no}" + doc: "Folding minimum of adjacent pairs" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vpmin]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fminp]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + compose: + - LLVMLink: + name: "vabs{neon_type.no}" + links: + - link: "llvm.aarch64.neon.fminp.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vpmins.{neon_type}" + arch: arm + + - name: "vpmax{neon_type.no}" + doc: "Folding maximum of adjacent pairs" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vpmax]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [smaxp]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int16x4_t + - int32x2_t + compose: + - LLVMLink: + name: "vabs{neon_type.no}" + links: + - link: "llvm.aarch64.neon.smaxp.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vpmaxs.{neon_type}" + arch: arm + + - name: "vpmax{neon_type.no}" + doc: "Folding maximum of adjacent pairs" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vpmax]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [umaxp]]}]] + - *neon-not-arm-stable + - 
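Lane model of the folding (pairwise) minimum above: result lane i is the minimum of adjacent pair i, with the first four pairs taken from `a` and the last four from `b`; vpmax has the same shape with `max`. The helper name is illustrative only.

    fn vpmin_s8_model(a: [i8; 8], b: [i8; 8]) -> [i8; 8] {
        let mut out = [0i8; 8];
        for i in 0..4 {
            out[i] = a[2 * i].min(a[2 * i + 1]);
            out[i + 4] = b[2 * i].min(b[2 * i + 1]);
        }
        out
    }

    fn main() {
        let r = vpmin_s8_model([1, 2, -3, 4, 5, 0, 7, 8], [9; 8]);
        assert_eq!(r, [1, -3, 0, 7, 9, 9, 9, 9]);
    }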
*neon-cfg-arm-unstable + safety: safe + types: + - uint8x8_t + - uint16x4_t + - uint32x2_t + compose: + - LLVMLink: + name: "vabs{neon_type.no}" + links: + - link: "llvm.aarch64.neon.umaxp.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vpmaxu.{neon_type}" + arch: arm + + - name: "vpmax{neon_type.no}" + doc: "Folding maximum of adjacent pairs" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vpmax]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [fmaxp]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - float32x2_t + compose: + - LLVMLink: + name: "vabs{neon_type.no}" + links: + - link: "llvm.aarch64.neon.fmaxp.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vpmaxs.{neon_type}" + arch: arm + + - name: "vraddhn{neon_type[0].noq}" + doc: "Rounding Add returning High Narrow." + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"{type[2]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [raddhn]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int16x8_t, int8x8_t, 'vraddhn.i16'] + - [int32x4_t, int16x4_t, 'vraddhn.i32'] + - [int64x2_t, int32x2_t, 'vraddhn.i64'] + compose: + - LLVMLink: + name: "vraddhn{neon_type[0].noq}" + links: + - link: "llvm.aarch64.neon.raddhn.{neon_type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vraddhn.{neon_type[1]}" + arch: arm + + - name: "vraddhn{neon_type[0].noq}" + doc: "Rounding Add returning High Narrow." + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"{type[2]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [raddhn]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint16x8_t, uint8x8_t, 'vraddhn.i16', int16x8_t] + - [uint32x4_t, uint16x4_t, 'vraddhn.i32', int32x4_t] + - [uint64x2_t, uint32x2_t, 'vraddhn.i64', int64x2_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vraddhn{neon_type[3].noq}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vraddhn_high{neon_type[1].noq}" + doc: "Rounding Add returning High Narrow (high half)." + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"{type[3]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [raddhn2]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint8x8_t , uint16x8_t, uint8x16_t, 'vraddhn.i16', int16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [uint16x4_t, uint32x4_t, uint16x8_t, 'vraddhn.i32', int32x4_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [uint32x2_t, uint64x2_t, uint32x4_t, 'vraddhn.i64', int64x2_t, '[0, 1, 2, 3]'] + compose: + - Let: + - x + - "{neon_type[0]}" + - FnCall: + - transmute + - - FnCall: + - "vraddhn{neon_type[4].noq}" + - - FnCall: [transmute, [b]] + - FnCall: [transmute, [c]] + - FnCall: ["simd_shuffle!", [a, x, '{type[5]}']] + + - name: "vraddhn_high{neon_type[1].noq}" + doc: "Rounding Add returning High Narrow (high half)." 
+ arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"{type[3]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [raddhn2]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [int8x8_t , int16x8_t, int8x16_t, 'vraddhn.i16', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - [int16x4_t, int32x4_t, int16x8_t, 'vraddhn.i32', '[0, 1, 2, 3, 4, 5, 6, 7]'] + - [int32x2_t, int64x2_t, int32x4_t, 'vraddhn.i64', '[0, 1, 2, 3]'] + compose: + - Let: + - x + - FnCall: + - "vraddhn{neon_type[1].noq}" + - - b + - c + - FnCall: ["simd_shuffle!", [a, x, '{type[4]}']] + + - name: "vpadd{neon_type.no}" + doc: "Add pairwise." + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vpadd]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [addp]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - int8x8_t + - int16x4_t + - int32x2_t + compose: + - LLVMLink: + name: "vpadd{neon_type.no}" + links: + - link: "llvm.aarch64.neon.addp.{neon_type}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vpadd.{neon_type}" + arch: arm + + - name: "vpadd{neon_type[0].no}" + doc: "Add pairwise." + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vpadd]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [addp]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - [uint8x8_t, int8x8_t] + - [uint16x4_t, int16x4_t] + - [uint32x2_t, int32x2_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vpadd{neon_type[1].no}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + # This was not publically exposed + - name: "priv_vpadal{neon_type[1].no}" + visibility: private + doc: "Signed Add and Accumulate Long Pairwise." + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + safety: safe + attr: + - *target-is-arm + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['{type[2]}']]}]] + - *neon-cfg-arm-unstable + types: + - [int16x4_t, int8x8_t, '"vpadal.s8"'] + - [int32x2_t, int16x4_t, '"vpadal.s16"'] + - [int64x1_t, int32x2_t, '"vpadal.s32"'] + - [int16x8_t, int8x16_t, '"vpadal.s8"'] + - [int32x4_t, int16x8_t, '"vpadal.s16"'] + - [int64x2_t, int32x4_t, '"vpadal.s32"'] + compose: + - LLVMLink: + name: "vpadal{neon_type[1].no}" + links: + - link: "llvm.arm.neon.vpadals.{neon_type[0]}.{neon_type[1]}" + arch: arm + + # This was not publically exposed + - name: "priv_vpadal{neon_type[1].no}" + visibility: private + doc: "Signed Add and Accumulate Long Pairwise." 
+ arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + safety: safe + attr: + - *target-is-arm + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['{type[2]}']]}]] + - *neon-cfg-arm-unstable + types: + - [uint16x4_t, uint8x8_t , '"vpadal.u8"'] + - [uint32x2_t, uint16x4_t, '"vpadal.u16"'] + - [uint64x1_t, uint32x2_t, '"vpadal.u32"'] + - [uint16x8_t, uint8x16_t, '"vpadal.u8"'] + - [uint32x4_t, uint16x8_t, '"vpadal.u16"'] + - [uint64x2_t, uint32x4_t, '"vpadal.u32"'] + compose: + - LLVMLink: + name: "vpadal{neon_type[1].no}" + links: + - link: "llvm.arm.neon.vpadalu.{neon_type[0]}.{neon_type[1]}" + arch: arm + + - name: "vpaddl{neon_type[0].no}" + doc: "Signed Add and Accumulate Long Pairwise." + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + safety: safe + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['{type[2]}']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [saddlp]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + types: + - [int8x8_t, int16x4_t , '"vpaddl.s8"'] + - [int16x4_t, int32x2_t, '"vpaddl.s16"'] + - [int32x2_t, int64x1_t, '"vpaddl.s32"'] + - [int8x16_t, int16x8_t, '"vpaddl.s8"'] + - [int16x8_t, int32x4_t, '"vpaddl.s16"'] + - [int32x4_t, int64x2_t, '"vpaddl.s32"'] + compose: + - LLVMLink: + name: "vpaddl{neon_type[1].no}" + links: + - link: "llvm.aarch64.neon.saddlp.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vpaddls.{neon_type[1]}.{neon_type[0]}" + arch: arm + + - name: "vpaddl{neon_type[0].no}" + doc: "Unsigned Add and Accumulate Long Pairwise." + arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[1]}" + safety: safe + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['{type[2]}']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uaddlp]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + types: + - [uint8x8_t, uint16x4_t , '"vpaddl.u8"'] + - [uint16x4_t, uint32x2_t, '"vpaddl.u16"'] + - [uint32x2_t, uint64x1_t, '"vpaddl.u32"'] + - [uint8x16_t, uint16x8_t, '"vpaddl.u8"'] + - [uint16x8_t, uint32x4_t, '"vpaddl.u16"'] + - [uint32x4_t, uint64x2_t, '"vpaddl.u32"'] + compose: + - LLVMLink: + name: "vpaddl{neon_type[1].no}" + links: + - link: "llvm.aarch64.neon.uaddlp.{neon_type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.vpaddlu.{neon_type[1]}.{neon_type[0]}" + arch: arm + + - name: "vpadal{neon_type[1].no}" + doc: "Signed Add and Accumulate Long Pairwise." 
+ arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + safety: safe + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"{type[2]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [sadalp]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + types: + - [int16x4_t, int8x8_t, 'vpadal.s8', 'let x: int16x4_t; #[cfg(target_arch = "arm")] { x = priv_vpadal_s8(a, b); } #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] unsafe { x = simd_add(vpaddl_s8(b), a);}'] + - [int32x2_t, int16x4_t, 'vpadal.s16', 'let x: int32x2_t; #[cfg(target_arch = "arm")] { x = priv_vpadal_s16(a, b); } #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] unsafe { x = simd_add(vpaddl_s16(b), a);}'] + - [int64x1_t, int32x2_t, 'vpadal.s32', 'let x: int64x1_t; #[cfg(target_arch = "arm")] { x = priv_vpadal_s32(a, b); } #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] unsafe { x = simd_add(vpaddl_s32(b), a);}'] + - [int16x8_t, int8x16_t, 'vpadal.s8', 'let x: int16x8_t; #[cfg(target_arch = "arm")] { x = priv_vpadalq_s8(a, b); } #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] unsafe { x = simd_add(vpaddlq_s8(b), a);}'] + - [int32x4_t, int16x8_t, 'vpadal.s16', 'let x: int32x4_t; #[cfg(target_arch = "arm")] { x = priv_vpadalq_s16(a, b); } #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] unsafe { x = simd_add(vpaddlq_s16(b), a);}'] + - [int64x2_t, int32x4_t, 'vpadal.s32', 'let x: int64x2_t; #[cfg(target_arch = "arm")] { x = priv_vpadalq_s32(a, b); } #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] unsafe { x = simd_add(vpaddlq_s32(b), a);}'] + compose: + - Identifier: ['{type[3]}', Symbol] + - Identifier: [x, Symbol] + + - name: "vpadal{neon_type[1].no}" + doc: "Unsigned Add and Accumulate Long Pairwise." 
+ arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + safety: safe + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"{type[2]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uadalp]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + types: + - [uint16x4_t, uint8x8_t, 'vpadal.u8', 'let x: uint16x4_t; #[cfg(target_arch = "arm")] { x = priv_vpadal_u8(a, b); } #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] unsafe { x = simd_add(vpaddl_u8(b), a);}'] + - [uint32x2_t, uint16x4_t, 'vpadal.u16', 'let x: uint32x2_t; #[cfg(target_arch = "arm")] { x = priv_vpadal_u16(a, b); } #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] unsafe { x = simd_add(vpaddl_u16(b), a);}'] + - [uint64x1_t, uint32x2_t, 'vpadal.u32', 'let x: uint64x1_t; #[cfg(target_arch = "arm")] { x = priv_vpadal_u32(a, b); } #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] unsafe { x = simd_add(vpaddl_u32(b), a);}'] + - [uint16x8_t, uint8x16_t, 'vpadal.u8', 'let x: uint16x8_t; #[cfg(target_arch = "arm")] { x = priv_vpadalq_u8(a, b); } #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] unsafe { x = simd_add(vpaddlq_u8(b), a);}'] + - [uint32x4_t, uint16x8_t, 'vpadal.u16', 'let x: uint32x4_t; #[cfg(target_arch = "arm")] { x = priv_vpadalq_u16(a, b); } #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] unsafe { x = simd_add(vpaddlq_u16(b), a);}'] + - [uint64x2_t, uint32x4_t, 'vpadal.u32', 'let x: uint64x2_t; #[cfg(target_arch = "arm")] { x = priv_vpadalq_u32(a, b); } #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] unsafe { x = simd_add(vpaddlq_u32(b), a);}'] + compose: + - Identifier: ['{type[3]}', Symbol] + - Identifier: [x, Symbol] + + - name: "vcnt{neon_type.no}" + doc: "Population count per byte." + arguments: ["a: {neon_type}"] + return_type: "{neon_type}" + safety: safe + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vcnt]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [cnt]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + types: + - int8x8_t + - int8x16_t + compose: + - FnCall: [simd_ctpop, [a]] + + - name: "vcnt{neon_type[0].no}" + doc: "Population count per byte." 
+ arguments: ["a: {neon_type[0]}"] + return_type: "{neon_type[0]}" + safety: safe + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vcnt]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [cnt]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + types: + - [uint8x8_t, int8x8_t] + - [uint8x16_t, int8x16_t] + - [poly8x8_t, int8x8_t] + - [poly8x16_t, int8x16_t] + compose: + - FnCall: + - transmute + - - FnCall: + - "vcnt{neon_type[1].no}" + - - FnCall: + - transmute + - - a + + - name: "vmmla{neon_type[0].no}" + doc: "8-bit integer matrix multiply-accumulate" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + safety: safe + attr: + - *neon-i8mm + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [smmla]]}]] + - *neon-unstable-i8mm + - *neon-cfg-arm-unstable + types: + - [int32x4_t, int8x16_t] + compose: + - LLVMLink: + name: "vmmla{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.smmla.{neon_type[0]}.{neon_type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.smmla.{neon_type[0]}.{neon_type[1]}" + arch: arm + + - name: "vmmla{neon_type[0].no}" + doc: "8-bit integer matrix multiply-accumulate" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[0]}" + safety: safe + attr: + - *neon-i8mm + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ummla]]}]] + - *neon-unstable-i8mm + - *neon-cfg-arm-unstable + types: + - [uint32x4_t, uint8x16_t] + compose: + - LLVMLink: + name: "vmmla{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.ummla.{neon_type[0]}.{neon_type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.ummla.{neon_type[0]}.{neon_type[1]}" + arch: arm + + - name: "vusmmla{neon_type[0].no}" + doc: "Unsigned and signed 8-bit integer matrix multiply-accumulate" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[2]}"] + return_type: "{neon_type[0]}" + safety: safe + attr: + - *neon-i8mm + - *neon-v8 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop]]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [usmmla]]}]] + - *neon-unstable-i8mm + - *neon-cfg-arm-unstable + types: + - [int32x4_t, uint8x16_t, int8x16_t] + compose: + - LLVMLink: + name: "vmmla{neon_type[0].no}" + links: + - link: "llvm.aarch64.neon.usmmla.{neon_type[0]}.{neon_type[1]}" + arch: aarch64,arm64ec + - link: "llvm.arm.neon.usmmla.{neon_type[0]}.{neon_type[1]}" + arch: arm + + - name: "vtbl1" + visibility: private + doc: "Table look-up" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + safety: safe + attr: + - *target-is-arm + - *neon-v7 + - *neon-arm-unstable + assert_instr: [vtbl] + types: + - int8x8_t + compose: + - LLVMLink: + name: "vtbl1" + links: + - link: "llvm.arm.neon.vtbl1" + arch: arm + + - name: "vtbl1_s8" + doc: "Table look-up" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" + safety: safe + attr: + - *target-is-arm + - *neon-v7 + - *neon-arm-unstable + assert_instr: [vtbl] + types: + - int8x8_t + compose: + - FnCall: [vtbl1, [a, b]] + + - name: "vtbl1{neon_type[0].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: uint8x8_t"] + return_type: 
"{neon_type[1]}" + safety: safe + attr: + - *target-is-arm + - *neon-v7 + - *neon-arm-unstable + assert_instr: [vtbl] + types: + - [uint8x8_t, uint8x8_t] + - [poly8x8_t, poly8x8_t] + compose: + - FnCall: + - transmute + - - FnCall: + - vtbl1 + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + + - name: "vtbl2" + visibility: private + doc: "Table look-up" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}"] + return_type: "{neon_type}" + safety: safe + attr: + - *target-is-arm + - *neon-v7 + - *neon-arm-unstable + assert_instr: [vtbl] + types: + - int8x8_t + compose: + - LLVMLink: + name: "vtbl2" + links: + - link: "llvm.arm.neon.vtbl2" + arch: arm + + - name: "vtbl2_s8" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + safety: safe + attr: + - *target-is-arm + - *neon-v7 + - *neon-arm-unstable + assert_instr: [vtbl] + types: + - [int8x8x2_t, int8x8_t] + compose: + - FnCall: [vtbl2, ['a.0', 'a.1', b]] + + - name: "vtbl2{neon_type[1].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: uint8x8_t"] + return_type: "{neon_type[1]}" + safety: safe + attr: + - *target-is-arm + - *neon-v7 + - *neon-arm-unstable + assert_instr: [vtbl] + types: + - [uint8x8x2_t, uint8x8_t] + - [poly8x8x2_t, poly8x8_t] + compose: + - FnCall: + - transmute + - - FnCall: + - vtbl2 + - - FnCall: [transmute, ['a.0']] + - FnCall: [transmute, ['a.1']] + - FnCall: [transmute, [b]] + + - name: "vtbl3" + visibility: private + doc: "Table look-up" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}", "d: {neon_type}"] + return_type: "{neon_type}" + safety: safe + attr: + - *target-is-arm + - *neon-v7 + - *neon-arm-unstable + assert_instr: [vtbl] + types: + - int8x8_t + compose: + - LLVMLink: + name: "vtbl3" + links: + - link: "llvm.arm.neon.vtbl3" + arch: arm + + - name: "vtbl3_s8" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + safety: safe + attr: + - *target-is-arm + - *neon-v7 + - *neon-arm-unstable + assert_instr: [vtbl] + types: + - [int8x8x3_t, int8x8_t] + compose: + - FnCall: [vtbl3, ['a.0', 'a.1', 'a.2', b]] + + - name: "vtbl3{neon_type[1].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: uint8x8_t"] + return_type: "{neon_type[1]}" + safety: safe + attr: + - *target-is-arm + - *neon-v7 + - *neon-arm-unstable + assert_instr: [vtbl] + types: + - [uint8x8x3_t, uint8x8_t] + - [poly8x8x3_t, poly8x8_t] + compose: + - FnCall: + - transmute + - - FnCall: + - vtbl3 + - - FnCall: [transmute, ['a.0']] + - FnCall: [transmute, ['a.1']] + - FnCall: [transmute, ['a.2']] + - FnCall: [transmute, [b]] + + - name: "vtbl4" + visibility: private + doc: "Table look-up" + arguments: ["a: {neon_type}", "b: {neon_type}", "c: {neon_type}", "d: {neon_type}", "e: {neon_type}"] + return_type: "{neon_type}" + safety: safe + attr: + - *target-is-arm + - *neon-v7 + - *neon-arm-unstable + assert_instr: [vtbl] + types: + - int8x8_t + compose: + - LLVMLink: + name: "vtbl4" + links: + - link: "llvm.arm.neon.vtbl4" + arch: arm + + - name: "vtbl4_s8" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + safety: safe + attr: + - *target-is-arm + - *neon-v7 + - *neon-arm-unstable + assert_instr: [vtbl] + types: + - [int8x8x4_t, int8x8_t] + compose: + - FnCall: [vtbl4, ['a.0', 'a.1', 'a.2', 'a.3', b]] + + - name: "vtbl4{neon_type[1].no}" + doc: "Table look-up" + arguments: ["a: {neon_type[0]}", "b: 
uint8x8_t"] + return_type: "{neon_type[1]}" + safety: safe + attr: + - *target-is-arm + - *neon-v7 + - *neon-arm-unstable + assert_instr: [vtbl] + types: + - [uint8x8x4_t, uint8x8_t] + - [poly8x8x4_t, poly8x8_t] + compose: + - FnCall: + - transmute + - - FnCall: + - vtbl4 + - - FnCall: [transmute, ['a.0']] + - FnCall: [transmute, ['a.1']] + - FnCall: [transmute, ['a.2']] + - FnCall: [transmute, ['a.3']] + - FnCall: [transmute, [b]] + + - name: "vst1{type[0]}" + visibility: private + doc: "Store multiple single-element structures from one, two, three, or four registers." + arguments: ["addr: {type[1]}", "val: {neon_type[2]}", "align: {type[3]}"] + safety: + unsafe: [neon] + attr: + - *target-is-arm + - *neon-v7 + - *neon-arm-unstable + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vst1.{type[4]}"']]}]] + types: + - ['_v8i8', '* const i8', int8x8_t, i32, '8'] + - ['q_v16i8', '* const i8', int8x16_t, i32, '8'] + - ['_v4i16', '* const i8', int16x4_t, i32, '16'] + - ['q_v8i16', '* const i8', int16x8_t, i32, '16'] + - ['_v2i32', '* const i8', int32x2_t, i32, '32'] + - ['q_v4i32', '* const i8', int32x4_t, i32, '32'] + - ['_v1i64', '* const i8', int64x1_t, i32, '64'] + - ['q_v2i64', '* const i8', int64x2_t, i32, '64'] + - ['_v2f32', '* const i8', float32x2_t, i32, '32'] + - ['q_v4f32', '* const i8', float32x4_t, i32, '32'] + compose: + - LLVMLink: + name: "_vst1{type[0]}" + links: + - link: "llvm.arm.neon.vst1.{neon_type[2]}.p0" + arch: arm + + - name: "vst1{type[0]}" + visibility: private + doc: "Store multiple single-element structures from one, two, three, or four registers." + arguments: ["addr: {type[1]}", "val: {neon_type[2]}", "align: {type[3]}"] + safety: + unsafe: [neon] + attr: + - *target-is-arm + - *neon-v7 + - *neon-fp16 + - *neon-unstable-f16 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vst1.{type[4]}"']]}]] + types: + - ['_v4f16', '* const i8', float16x4_t, i32, '16'] + - ['q_v8f16', '* const i8', float16x8_t, i32, '16'] + compose: + - LLVMLink: + name: "_vst1{type[0]}" + links: + - link: "llvm.arm.neon.vst1.{neon_type[2]}.p0" + arch: arm + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures from one, two, three, or four registers." 
+ arguments: ["ptr: {type[0]}", "a: {neon_type[1]}"] + safety: + unsafe: [neon] + attr: + - *target-is-arm + - *neon-v7 + - *neon-arm-unstable + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vst1.{type[2]}"']]}]] + types: + - ['*mut i8', int8x8_t, '8', 'a', 'crate::mem::align_of::() as i32', '_v8i8'] + - ['*mut i8', int8x16_t, '8', 'a', 'crate::mem::align_of::() as i32', 'q_v16i8'] + - ['*mut i16', int16x4_t, '16', 'a', 'crate::mem::align_of::() as i32', '_v4i16'] + - ['*mut i16', int16x8_t, '16', 'a', 'crate::mem::align_of::() as i32', 'q_v8i16'] + - ['*mut i32', int32x2_t, '32', 'a', 'crate::mem::align_of::() as i32', '_v2i32'] + - ['*mut i32', int32x4_t, '32', 'a', 'crate::mem::align_of::() as i32', 'q_v4i32'] + - ['*mut i64', int64x1_t, '64', 'a', 'crate::mem::align_of::() as i32', '_v1i64'] + - ['*mut i64', int64x2_t, '64', 'a', 'crate::mem::align_of::() as i32', 'q_v2i64'] + - ['*mut u8', uint8x8_t, '8', 'transmute(a)', 'crate::mem::align_of::() as i32', '_v8i8'] + - ['*mut u8', uint8x16_t, '8', 'transmute(a)', 'crate::mem::align_of::() as i32', 'q_v16i8'] + - ['*mut u16', uint16x4_t, '16', 'transmute(a)', 'crate::mem::align_of::() as i32', '_v4i16'] + - ['*mut u16', uint16x8_t, '16', 'transmute(a)', 'crate::mem::align_of::() as i32', 'q_v8i16'] + - ['*mut u32', uint32x2_t, '32', 'transmute(a)', 'crate::mem::align_of::() as i32', '_v2i32'] + - ['*mut u32', uint32x4_t, '32', 'transmute(a)', 'crate::mem::align_of::() as i32', 'q_v4i32'] + - ['*mut u64', uint64x1_t, '64', 'transmute(a)', 'crate::mem::align_of::() as i32', '_v1i64'] + - ['*mut u64', uint64x2_t, '64', 'transmute(a)', 'crate::mem::align_of::() as i32', 'q_v2i64'] + - ['*mut p8', poly8x8_t, '8', 'transmute(a)', 'crate::mem::align_of::() as i32', '_v8i8'] + - ['*mut p8', poly8x16_t, '8', 'transmute(a)', 'crate::mem::align_of::() as i32', 'q_v16i8'] + - ['*mut p16', poly16x4_t, '16', 'transmute(a)', 'crate::mem::align_of::() as i32', '_v4i16'] + - ['*mut p16', poly16x8_t, '16', 'transmute(a)', 'crate::mem::align_of::() as i32', 'q_v8i16'] + - ['*mut p64', poly64x1_t, '64', 'transmute(a)', 'crate::mem::align_of::() as i32', '_v1i64'] + - ['*mut p64', poly64x2_t, '64', 'transmute(a)', 'crate::mem::align_of::() as i32', 'q_v2i64'] + - ['*mut f32', float32x2_t, '32', 'transmute(a)', 'crate::mem::align_of::() as i32', '_v2f32'] + - ['*mut f32', float32x4_t, '32', 'transmute(a)', 'crate::mem::align_of::() as i32', 'q_v4f32'] + compose: + - FnCall: + - "vst1{type[5]}" + - - 'ptr as *const i8' + - '{type[3]}' + - '{type[4]}' + + + - name: "vst1{neon_type[1].no}" + doc: "Store multiple single-element structures from one, two, three, or four registers." 
+ arguments: ["ptr: {type[0]}", "a: {neon_type[1]}"] + safety: + unsafe: [neon] + attr: + - *target-is-arm + - *neon-v7 + - *neon-fp16 + - *neon-unstable-f16 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vst1.{type[2]}"']]}]] + types: + - ['*mut f16', float16x4_t, '16', 'transmute(a)', 'crate::mem::align_of::() as i32', '_v4f16'] + - ['*mut f16', float16x8_t, '16', 'transmute(a)', 'crate::mem::align_of::() as i32', 'q_v8f16'] + compose: + - FnCall: + - "vst1{type[5]}" + - - 'ptr as *const i8' + - '{type[3]}' + - '{type[4]}' + + + - name: "vshiftins{type[0]}" + visibility: private + doc: "Shift Right and Insert (immediate)" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[1]}" + safety: safe + attr: + - *target-is-arm + - *neon-v7 + - *neon-arm-unstable + types: + - ['_v8i8', "int8x8_t", '8'] + - ['_v16i8', 'int8x16_t', '8'] + - ['_v4i16', 'int16x4_t', '16'] + - ['_v8i16', 'int16x8_t', '16'] + - ['_v2i32', 'int32x2_t', '32'] + - ['_v4i32', 'int32x4_t', '32'] + - ['_v1i64', 'int64x1_t', '64'] + - ['_v2i64', 'int64x2_t', '64'] + compose: + - LLVMLink: + name: "_vshiftins{type[0]}" + links: + - link: "llvm.arm.neon.vshiftins.{neon_type[1]}" + arch: arm + + - name: "vsri{neon_type[0].N}" + doc: "Shift Right and Insert (immediate)" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + safety: safe + attr: + - *target-is-arm + - FnCall: [target_feature, ['enable = "{type[1]}"']] + - *neon-arm-unstable + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vsri.{type[2]}"', 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ['const N: i32'] + types: + - [uint8x8_t, "neon,v7", '8', '1 <= N && N <= 8', 'v8i8', 'int8x8_t::splat', '-N as i8'] + - [uint8x16_t, "neon,v7", '8', '1 <= N && N <= 8', 'v16i8', 'int8x16_t::splat', '-N as i8'] + - [uint16x4_t, "neon,v7", '16', '1 <= N && N <= 16', 'v4i16', 'int16x4_t::splat', '-N as i16'] + - [uint16x8_t, "neon,v7", '16', '1 <= N && N <= 16', 'v8i16', 'int16x8_t::splat', '-N as i16'] + - [uint32x2_t, "neon,v7", '32', '1 <= N && N <= 32', 'v2i32', 'int32x2_t::splat', '-N'] + - [uint32x4_t, "neon,v7", '32', '1 <= N && N <= 32', 'v4i32', 'int32x4_t::splat', '-N'] + - [uint64x1_t, "neon,v7", '64', '1 <= N && N <= 64', 'v1i64', 'int64x1_t::splat', '-N as i64'] + - [uint64x2_t, "neon,v7", '64', '1 <= N && N <= 64', 'v2i64', 'int64x2_t::splat', '-N as i64'] + - [poly8x8_t, "neon,v7", '8', '1 <= N && N <= 8', 'v8i8', 'int8x8_t::splat', '-N as i8'] + - [poly8x16_t, "neon,v7", '8', '1 <= N && N <= 8', 'v16i8', 'int8x16_t::splat', '-N as i8'] + - [poly16x4_t, "neon,v7", '16', '1 <= N && N <= 16', 'v4i16', 'int16x4_t::splat', '-N as i16'] + - [poly16x8_t, "neon,v7", '16', '1 <= N && N <= 16', 'v8i16', 'int16x8_t::splat', '-N as i16'] + ## These live in ./crates/core_arch/src/arm/neon.rs + #- [poly64x1_t, "neon,v7,aes", '64', '1 <= N && N <= 64', 'v1i64', 'int64x1_t::splat', '-N as i64'] + #- [poly64x2_t, "neon,v7,aes", '64', '1 <= N && N <= 64', 'v2i64', 'int64x2_t::splat', '-N as i64'] + compose: + - FnCall: ["static_assert!", ['{type[3]}']] + - FnCall: + - 'transmute' + - - FnCall: + - "vshiftins_{type[4]}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + - FnCall: ["{type[5]}", ["{type[6]}"]] + + - name: "vsri{neon_type[0].N}" + doc: "Shift Right and Insert (immediate)" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + static_defs: ['const N: i32'] + attr: + - *enable-v7 + 
- *target-is-arm + - *neon-arm-unstable + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vsri.{type[1]}"', 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + safety: safe + types: + - [int8x8_t, '8', '1 <= N && N <= 8', 'v8i8', 'int8x8_t::splat', '-N as i8'] + - [int8x16_t, '8', '1 <= N && N <= 8', 'v16i8', 'int8x16_t::splat', '-N as i8'] + - [int16x4_t, '16', '1 <= N && N <= 16', 'v4i16', 'int16x4_t::splat', '-N as i16'] + - [int16x8_t, '16', '1 <= N && N <= 16', 'v8i16', 'int16x8_t::splat', '-N as i16'] + - [int32x2_t, '32', '1 <= N && N <= 32', 'v2i32', 'int32x2_t::splat', '-N as i32'] + - [int32x4_t, '32', '1 <= N && N <= 32', 'v4i32', 'int32x4_t::splat', '-N as i32'] + - [int64x1_t, '64', '1 <= N && N <= 64', 'v1i64', 'int64x1_t::splat', '-N as i64'] + - [int64x2_t, '64', '1 <= N && N <= 64', 'v2i64', 'int64x2_t::splat', '-N as i64'] + compose: + - FnCall: ["static_assert!", ['{type[2]}']] + - FnCall: + - "vshiftins_{type[3]}" + - - a + - b + - FnCall: ["{type[4]}", ["{type[5]}"]] + + - name: "vsli{neon_type[0].N}" + doc: "Shift Left and Insert (immediate)" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + safety: safe + attr: + - *target-is-arm + - FnCall: [target_feature, ['enable = "{type[1]}"']] + - *neon-arm-unstable + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vsli.{type[2]}"', 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ['const N: i32'] + types: + - [uint8x8_t, "neon,v7", '8', 'static_assert_uimm_bits!', 'N, 3', 'v8i8', 'int8x8_t::splat', 'N as i8'] + - [uint8x16_t, "neon,v7", '8', 'static_assert_uimm_bits!', 'N, 3', 'v16i8', 'int8x16_t::splat', 'N as i8'] + - [uint16x4_t, "neon,v7", '16', 'static_assert_uimm_bits!', 'N, 4', 'v4i16', 'int16x4_t::splat', 'N as i16'] + - [uint16x8_t, "neon,v7", '16', 'static_assert_uimm_bits!', 'N, 4', 'v8i16', 'int16x8_t::splat', 'N as i16'] + - [uint32x2_t, "neon,v7", '32', 'static_assert!', 'N >= 0 && N <= 31', 'v2i32', 'int32x2_t::splat', 'N as i32'] + - [uint32x4_t, "neon,v7", '32', 'static_assert!', 'N >= 0 && N <= 31', 'v4i32', 'int32x4_t::splat', 'N as i32'] + - [uint64x1_t, "neon,v7", '64', 'static_assert!', 'N >= 0 && N <= 63', 'v1i64', 'int64x1_t::splat', 'N as i64'] + - [uint64x2_t, "neon,v7", '64', 'static_assert!', 'N >= 0 && N <= 63', 'v2i64', 'int64x2_t::splat', 'N as i64'] + - [poly8x8_t, "neon,v7", '8', 'static_assert_uimm_bits!', 'N, 3', 'v8i8', 'int8x8_t::splat', 'N as i8'] + - [poly8x16_t, "neon,v7", '8', 'static_assert_uimm_bits!', 'N, 3', 'v16i8', 'int8x16_t::splat', 'N as i8'] + - [poly16x4_t, "neon,v7", '16', 'static_assert_uimm_bits!', 'N, 4', 'v4i16', 'int16x4_t::splat', 'N as i16'] + - [poly16x8_t, "neon,v7", '16', 'static_assert_uimm_bits!', 'N, 4', 'v8i16', 'int16x8_t::splat', 'N as i16'] + ## These live in ./crates/core_arch/src/arm/neon.rs + #- [poly64x1_t, "neon,v7,aes", '"vsli.64"', 'static_assert!', '0 <= N && N <= 63', 'v1i64', 'int64x1_t::splat', 'N as i64'] + #- [poly64x2_t, "neon,v7,aes", '"vsli.64"', 'static_assert!', '0 <= N && N <= 63', 'v2i64', 'int64x2_t::splat', 'N as i64'] + compose: + - FnCall: ["{type[3]}", ['{type[4]}']] + - FnCall: + - 'transmute' + - - FnCall: + - "vshiftins_{type[5]}" + - - FnCall: [transmute, [a]] + - FnCall: [transmute, [b]] + - FnCall: ["{type[6]}", ["{type[7]}"]] + + - name: "vsli{neon_type[0].N}" + doc: "Shift Left and Insert (immediate)" + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[0]}" + safety: safe + attr: + - 
*target-is-arm + - *enable-v7 + - *neon-arm-unstable + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vsli.{type[1]}"', 'N = 1']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + static_defs: ['const N: i32'] + types: + - [int8x8_t, '8', 'static_assert_uimm_bits!', 'N, 3', 'v8i8', 'int8x8_t::splat', 'N as i8'] + - [int8x16_t, '8', 'static_assert_uimm_bits!', 'N, 3', 'v16i8', 'int8x16_t::splat', 'N as i8'] + - [int16x4_t, '16', 'static_assert_uimm_bits!', 'N, 4', 'v4i16', 'int16x4_t::splat', 'N as i16'] + - [int16x8_t, '16', 'static_assert_uimm_bits!', 'N, 4', 'v8i16', 'int16x8_t::splat', 'N as i16'] + - [int32x2_t, '32', 'static_assert!', 'N >= 0 && N <= 31', 'v2i32', 'int32x2_t::splat', 'N'] + - [int32x4_t, '32', 'static_assert!', 'N >= 0 && N <= 31', 'v4i32', 'int32x4_t::splat', 'N'] + - [int64x1_t, '64', 'static_assert!', 'N >= 0 && N <= 63', 'v1i64', 'int64x1_t::splat', 'N as i64'] + - [int64x2_t, '64', 'static_assert!', 'N >= 0 && N <= 63', 'v2i64', 'int64x2_t::splat', 'N as i64'] + compose: + - FnCall: ["{type[2]}", ['{type[3]}']] + - FnCall: + - "vshiftins_{type[4]}" + - - a + - b + - FnCall: ["{type[5]}", ["{type[6]}"]] + + - name: "vcombine{neon_type[0].no}" + doc: Join two smaller vectors into a single larger vector + arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [nop] + safety: safe + types: + - [float16x4_t, float16x8_t] + compose: + - FnCall: [simd_shuffle!, [a, b, '[0, 1, 2, 3, 4, 5, 6, 7]']] + + - name: "vget_{type[2]}_{neon_type[0]}" + doc: Duplicate vector element to vector + arguments: ["a: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - *neon-fp16 + - *neon-unstable-f16 + assert_instr: [nop] + safety: safe + types: + - [float16x4_t, float16x8_t, 'low', "[0, 1, 2, 3]"] + - [float16x4_t, float16x8_t, 'high', "[4, 5, 6, 7]"] + compose: + - FnCall: [simd_shuffle!, [a, a, "{type[3]}"]] + + - name: "vget{type[2]}" + doc: Duplicate vector element to scalar + arguments: ["a: {neon_type[0]}"] + return_type: "{type[1]}" + attr: + - *neon-v7 + - *neon-fp16 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [nop, 'LANE = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ["1"]] + - *neon-unstable-f16 + static_defs: ['const LANE: i32'] + safety: safe + types: + - [float16x4_t, f16, '_lane_f16', '2'] + - [float16x8_t, f16, 'q_lane_f16', '3'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] + - FnCall: [simd_extract!, [a, "LANE as u32"]] + + - name: "vmov{neon_type[0].N}" + doc: "Duplicate element to vector" + arguments: ["a: {type[1]}"] + return_type: "{neon_type[0]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vdup.16"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [dup]]}]] + - *neon-fp16 + - *neon-unstable-f16 + safety: safe + types: + - [float16x4_t, f16] + - [float16x8_t, f16] + compose: + - FnCall: ["vdup{neon_type[0].N}", [a]] + + - name: "{type[0]}" + doc: "Load one single-element structure to one lane of one register." 
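A minimal sketch of one 8-bit lane of the `vsri` composition above, assuming the architectural shift-and-insert behaviour and a hypothetical helper rather than the generated intrinsic: `b` is shifted right by N and inserted into `a`, which contributes only its top N bits. The `vsli` entries are the mirror image, driving the same `vshiftins` binding with a splatted +N instead of -N.

// Hypothetical scalar model of one vsri lane (element width 8, 1 <= N <= 8).
fn vsri_n_u8_lane<const N: u32>(a: u8, b: u8) -> u8 {
    assert!(N >= 1 && N <= 8);
    let inserted = ((b as u16) >> N) as u8;  // low bits come from b >> N
    let keep_mask = (!(0xFFu16 >> N)) as u8; // top N bits are kept from a
    (a & keep_mask) | inserted
}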
+ arguments: ["ptr: {type[1]}", "src: {neon_type[2]}"] + return_type: "{neon_type[2]}" + static_defs: ['const LANE: i32'] + attr: + - *neon-v7 + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ["{type[3]}", 'LANE = {type[4]}']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['{type[5]}', 'LANE = {type[4]}']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ['vld1_lane_s8', '*const i8', 'int8x8_t', '"vld1.8"', '7', 'ld1', 'static_assert_uimm_bits!', 'LANE, 3'] + - ['vld1_lane_u8', '*const u8', 'uint8x8_t', '"vld1.8"', '7', 'ld1', 'static_assert_uimm_bits!', 'LANE, 3'] + - ['vld1_lane_p8', '*const p8', 'poly8x8_t', '"vld1.8"', '7', 'ld1', 'static_assert_uimm_bits!', 'LANE, 3'] + - ['vld1q_lane_s8', '*const i8', 'int8x16_t', '"vld1.8"', '15', 'ld1', 'static_assert_uimm_bits!', 'LANE, 4'] + - ['vld1q_lane_u8', '*const u8', 'uint8x16_t', '"vld1.8"', '15', 'ld1', 'static_assert_uimm_bits!', 'LANE, 4'] + - ['vld1q_lane_p8', '*const p8', 'poly8x16_t', '"vld1.8"', '15', 'ld1', 'static_assert_uimm_bits!', 'LANE, 4'] + - ['vld1_lane_s16', '*const i16', 'int16x4_t', '"vld1.16"', '3', 'ld1', 'static_assert_uimm_bits!', 'LANE, 2'] + - ['vld1_lane_u16', '*const u16', 'uint16x4_t', '"vld1.16"', '3', 'ld1', 'static_assert_uimm_bits!', 'LANE, 2'] + - ['vld1_lane_p16', '*const p16', 'poly16x4_t', '"vld1.16"', '3', 'ld1', 'static_assert_uimm_bits!', 'LANE, 2'] + - ['vld1q_lane_s16', '*const i16', 'int16x8_t', '"vld1.16"', '7', 'ld1', 'static_assert_uimm_bits!', 'LANE, 3'] + - ['vld1q_lane_u16', '*const u16', 'uint16x8_t', '"vld1.16"', '7', 'ld1', 'static_assert_uimm_bits!', 'LANE, 3'] + - ['vld1q_lane_p16', '*const p16', 'poly16x8_t', '"vld1.16"', '7', 'ld1', 'static_assert_uimm_bits!', 'LANE, 3'] + - ['vld1_lane_s32', '*const i32', 'int32x2_t', '"vld1.32"', '1', 'ld1', 'static_assert_uimm_bits!', 'LANE, 1'] + - ['vld1_lane_u32', '*const u32', 'uint32x2_t', '"vld1.32"', '1', 'ld1', 'static_assert_uimm_bits!', 'LANE, 1'] + - ['vld1_lane_f32', '*const f32', 'float32x2_t', '"vld1.32"', '1', 'ld1', 'static_assert_uimm_bits!', 'LANE, 1'] + - ['vld1q_lane_s32', '*const i32', 'int32x4_t', '"vld1.32"', '3', 'ld1', 'static_assert_uimm_bits!', 'LANE, 2'] + - ['vld1q_lane_u32', '*const u32', 'uint32x4_t', '"vld1.32"', '3', 'ld1', 'static_assert_uimm_bits!', 'LANE, 2'] + - ['vld1q_lane_f32', '*const f32', 'float32x4_t', '"vld1.32"', '3', 'ld1', 'static_assert_uimm_bits!', 'LANE, 2'] + - ['vld1_lane_s64', '*const i64', 'int64x1_t', 'vldr', '0', 'ldr', 'static_assert!', 'LANE == 0'] + - ['vld1_lane_u64', '*const u64', 'uint64x1_t', 'vldr', '0', 'ldr', 'static_assert!', 'LANE == 0'] + - ['vld1q_lane_s64', '*const i64', 'int64x2_t', 'vldr', '1', 'ld1', 'static_assert_uimm_bits!', 'LANE, 1'] + - ['vld1q_lane_u64', '*const u64', 'uint64x2_t', 'vldr', '1', 'ld1', 'static_assert_uimm_bits!', 'LANE, 1'] + compose: + - FnCall: ["{type[6]}", ["{type[7]}"]] + - FnCall: [simd_insert!, [src, 'LANE as u32', '*ptr']] + + - name: "{type[0]}" + doc: "Load one single-element structure to one lane of one register." 
+ arguments: ["ptr: {type[1]}", "src: {neon_type[2]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-aes + - *neon-v7 + - FnCall: [rustc_legacy_const_generics, ['2']] + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ["{type[3]}", 'LANE = {type[4]}']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['{type[5]}', 'LANE = {type[4]}']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const LANE: i32'] + safety: + unsafe: [neon] + types: + - ['vld1_lane_p64', '*const p64', 'poly64x1_t', 'vldr', '0', 'ldr', 'static_assert!', 'LANE == 0'] + - ['vld1q_lane_p64', '*const p64', 'poly64x2_t', 'vldr', '1', 'ld1', 'static_assert_uimm_bits!', 'LANE, 1'] + compose: + - FnCall: ["{type[6]}", ["{type[7]}"]] + - FnCall: [simd_insert!, [src, 'LANE as u32', '*ptr']] + + - name: "{type[0]}" + doc: "Load one single-element structure and Replicate to all lanes (of one register)." + arguments: ["ptr: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ["{type[3]}"]] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['{type[4]}']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ['vld1_dup_s64', '*const i64', 'int64x1_t', 'vldr', 'ldr', 'let x: int64x1_t; #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] { x = crate::core_arch::aarch64::vld1_s64(ptr); } #[cfg(target_arch = "arm")] { x = crate::core_arch::arm::vld1_s64(ptr); }'] + - ['vld1_dup_u64', '*const u64', 'uint64x1_t', 'vldr', 'ldr', 'let x: uint64x1_t; #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] { x = crate::core_arch::aarch64::vld1_u64(ptr); } #[cfg(target_arch = "arm")] { x = crate::core_arch::arm::vld1_u64(ptr); }'] + compose: + - Identifier: ['{type[5]}', Symbol] + - Identifier: [x, Symbol] + + - name: "{type[0]}" + doc: "Load one single-element structure and Replicate to all lanes (of one register)." + arguments: ["ptr: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-aes + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ["{type[3]}"]] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['{type[4]}']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ['vld1_dup_p64', '*const p64', 'poly64x1_t', 'vldr', 'ldr', 'let x: poly64x1_t; #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] { x = crate::core_arch::aarch64::vld1_p64(ptr); } #[cfg(target_arch = "arm")] { x = crate::core_arch::arm::vld1_p64(ptr); }'] + compose: + - Identifier: ['{type[5]}', Symbol] + - Identifier: [x, Symbol] + + - name: "{type[0]}" + doc: "Load one single-element structure and Replicate to all lanes (of one register)." 
+ arguments: ["ptr: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-aes + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ["{type[3]}"]] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['{type[4]}']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ['vld1q_dup_p64', '*const p64', 'poly64x2_t', 'vldr', 'ld1r', 'vld1q_lane_p64::<0>', 'u64x2::splat(0)', '[0, 0]'] + compose: + - Let: + - x + - FnCall: + - '{type[5]}' + - - ptr + - FnCall: [transmute, ['{type[6]}']] + - FnCall: ['simd_shuffle!', [x, x, '{type[7]}']] + + - name: "{type[0]}" + doc: "Load one single-element structure and Replicate to all lanes (of one register)." + arguments: ["ptr: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['"{type[3]}"']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['{type[4]}']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ['vld1_dup_s8', '*const i8', 'int8x8_t', 'vld1.8', 'ld1r', 'vld1_lane_s8::<0>', 'i8x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]'] + - ['vld1_dup_u8', '*const u8', 'uint8x8_t', 'vld1.8', 'ld1r', 'vld1_lane_u8::<0>', 'u8x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]'] + - ['vld1_dup_p8', '*const p8', 'poly8x8_t', 'vld1.8', 'ld1r', 'vld1_lane_p8::<0>', 'u8x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]'] + + - ['vld1q_dup_s8', '*const i8', 'int8x16_t', 'vld1.8', 'ld1r', 'vld1q_lane_s8::<0>', 'i8x16::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]'] + - ['vld1q_dup_u8', '*const u8', 'uint8x16_t', 'vld1.8', 'ld1r', 'vld1q_lane_u8::<0>', 'u8x16::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]'] + - ['vld1q_dup_p8', '*const p8', 'poly8x16_t', 'vld1.8', 'ld1r', 'vld1q_lane_p8::<0>', 'u8x16::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]'] + + - ['vld1_dup_s16', '*const i16', 'int16x4_t', 'vld1.16', 'ld1r', 'vld1_lane_s16::<0>', 'i16x4::splat(0)', '[0, 0, 0, 0]'] + - ['vld1_dup_u16', '*const u16', 'uint16x4_t', 'vld1.16', 'ld1r', 'vld1_lane_u16::<0>', 'u16x4::splat(0)', '[0, 0, 0, 0]'] + - ['vld1_dup_p16', '*const p16', 'poly16x4_t', 'vld1.16', 'ld1r', 'vld1_lane_p16::<0>', 'u16x4::splat(0)', '[0, 0, 0, 0]'] + + - ['vld1q_dup_s16', '*const i16', 'int16x8_t', 'vld1.16', 'ld1r', 'vld1q_lane_s16::<0>', 'i16x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]'] + - ['vld1q_dup_u16', '*const u16', 'uint16x8_t', 'vld1.16', 'ld1r', 'vld1q_lane_u16::<0>', 'u16x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]'] + - ['vld1q_dup_p16', '*const p16', 'poly16x8_t', 'vld1.16', 'ld1r', 'vld1q_lane_p16::<0>', 'u16x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]'] + + - ['vld1_dup_s32', '*const i32', 'int32x2_t', 'vld1.32', 'ld1r', 'vld1_lane_s32::<0>', 'i32x2::splat(0)', '[0, 0]'] + - ['vld1_dup_u32', '*const u32', 'uint32x2_t', 'vld1.32', 'ld1r', 'vld1_lane_u32::<0>', 'u32x2::splat(0)', '[0, 0]'] + - ['vld1_dup_f32', '*const f32', 'float32x2_t', 'vld1.32', 'ld1r', 'vld1_lane_f32::<0>', 'f32x2::splat(0.0)', '[0, 0]'] + + - ['vld1q_dup_s32', '*const i32', 'int32x4_t', 'vld1.32', 'ld1r', 'vld1q_lane_s32::<0>', 'i32x4::splat(0)', '[0, 0, 0, 0]'] + - ['vld1q_dup_u32', '*const u32', 'uint32x4_t', 'vld1.32', 'ld1r', 'vld1q_lane_u32::<0>', 'u32x4::splat(0)', '[0, 0, 0, 0]'] + - ['vld1q_dup_f32', '*const f32', 'float32x4_t', 'vld1.32', 'ld1r', 'vld1q_lane_f32::<0>', 'f32x4::splat(0.0)', '[0, 0, 0, 0]'] + + - ['vld1q_dup_s64', '*const 
i64', 'int64x2_t', 'vldr', 'ld1', 'vld1q_lane_s64::<0>', 'i64x2::splat(0)', '[0, 0]'] + - ['vld1q_dup_u64', '*const u64', 'uint64x2_t', 'vldr', 'ld1', 'vld1q_lane_u64::<0>', 'u64x2::splat(0)', '[0, 0]'] + compose: + - Let: + - x + - FnCall: + - '{type[5]}' + - - ptr + - FnCall: [transmute, ['{type[6]}']] + - FnCall: ['simd_shuffle!', [x, x, '{type[7]}']] + + - name: "{type[0]}" + doc: "Absolute difference and accumulate (64-bit)" + arguments: ['a: {neon_type[1]}', 'b: {neon_type[1]}', 'c: {neon_type[1]}'] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['"{type[2]}"']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['{type[3]}']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vaba_s8', 'int8x8_t', 'vaba.s8', 'saba', 'vabd_s8'] + - ['vaba_u8', 'uint8x8_t', 'vaba.u8', 'uaba', 'vabd_u8'] + - ['vaba_s16', 'int16x4_t', 'vaba.s16', 'saba', 'vabd_s16'] + - ['vaba_u16', 'uint16x4_t', 'vaba.u16', 'uaba', 'vabd_u16'] + - ['vaba_s32', 'int32x2_t', 'vaba.s32', 'saba', 'vabd_s32'] + - ['vaba_u32', 'uint32x2_t', 'vaba.u32', 'uaba', 'vabd_u32'] + compose: + - FnCall: + - 'simd_add' + - - a + - FnCall: ['{type[4]}', [b, c]] + + - name: "{type[0]}" + doc: "Absolute difference and accumulate (128-bit)" + arguments: ['a: {neon_type[1]}', 'b: {neon_type[1]}', 'c: {neon_type[1]}'] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['"{type[2]}"']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['{type[3]}']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vabaq_s8', 'int8x16_t', 'vaba.s8', 'saba', 'vabdq_s8'] + - ['vabaq_u8', 'uint8x16_t', 'vaba.u8', 'uaba', 'vabdq_u8'] + - ['vabaq_s16', 'int16x8_t', 'vaba.s16', 'saba', 'vabdq_s16'] + - ['vabaq_u16', 'uint16x8_t', 'vaba.u16', 'uaba', 'vabdq_u16'] + - ['vabaq_s32', 'int32x4_t', 'vaba.s32', 'saba', 'vabdq_s32'] + - ['vabaq_u32', 'uint32x4_t', 'vaba.u32', 'uaba', 'vabdq_u32'] + compose: + - FnCall: + - 'simd_add' + - - a + - FnCall: ['{type[4]}', [b, c]] + + - name: "{type[0]}" + doc: "Vector add." + arguments: ['a: {neon_type[1]}', 'b: {neon_type[1]}'] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['{type[2]}']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['{type[3]}']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vadd_s8', 'int8x8_t', 'vadd', 'add'] + - ['vaddq_s8', 'int8x16_t', 'vadd', 'add'] + - ['vadd_s16', 'int16x4_t', 'vadd', 'add'] + - ['vaddq_s16', 'int16x8_t', 'vadd', 'add'] + - ['vadd_s32', 'int32x2_t', 'vadd', 'add'] + - ['vaddq_s32', 'int32x4_t', 'vadd', 'add'] + - ['vaddq_s64', 'int64x2_t', 'vadd', 'add'] + - ['vadd_f32', 'float32x2_t', 'vadd', 'fadd'] + - ['vaddq_f32', 'float32x4_t', 'vadd', 'fadd'] + - ['vadd_u8', 'uint8x8_t', 'vadd', 'add'] + - ['vaddq_u8', 'uint8x16_t', 'vadd', 'add'] + - ['vadd_u16', 'uint16x4_t', 'vadd', 'add'] + - ['vaddq_u16', 'uint16x8_t', 'vadd', 'add'] + - ['vadd_u32', 'uint32x2_t', 'vadd', 'add'] + - ['vaddq_u32', 'uint32x4_t', 'vadd', 'add'] + - ['vaddq_u64', 'uint64x2_t', 'vadd', 'add'] + compose: + - FnCall: ['simd_add', [a, b]] + + - name: "{type[0]}" + doc: "Add Long (vector)." 
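A minimal sketch of the `vld1q_dup` composition above, with a hypothetical helper instead of the generated intrinsic: the element is loaded into lane 0 of a zeroed vector via the matching `vld1q_lane::<0>`, then broadcast to every lane by a shuffle whose index list is all zeros.

// Hypothetical scalar model of vld1q_dup_u32 (load one lane, then broadcast).
fn vld1q_dup_u32_model(ptr: &u32) -> [u32; 4] {
    let x = [*ptr, 0, 0, 0];     // vld1q_lane_u32::<0> into a zeroed vector
    [x[0], x[0], x[0], x[0]]     // simd_shuffle!(x, x, [0, 0, 0, 0])
}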
+ arguments: ['a: {neon_type[1]}', 'b: {neon_type[1]}'] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['{type[3]}']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['{type[4]}']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vaddl_s8', 'int8x8_t', 'int16x8_t', 'vaddl', 'saddl'] + - ['vaddl_s16', 'int16x4_t', 'int32x4_t', 'vaddl', 'saddl'] + - ['vaddl_s32', 'int32x2_t', 'int64x2_t', 'vaddl', 'saddl'] + - ['vaddl_u8', 'uint8x8_t', 'uint16x8_t', 'vaddl', 'uaddl'] + - ['vaddl_u16', 'uint16x4_t', 'uint32x4_t', 'vaddl', 'uaddl'] + - ['vaddl_u32', 'uint32x2_t', 'uint64x2_t', 'vaddl', 'uaddl'] + compose: + - Let: + - a + - '{neon_type[2]}' + - FnCall: [simd_cast, [a]] + - Let: + - b + - '{neon_type[2]}' + - FnCall: [simd_cast, [b]] + - FnCall: ['simd_add', [a, b]] + + - name: "{type[0]}" + doc: "Signed Add Long (vector, high half)." + arguments: ['a: {neon_type[1]}', 'b: {neon_type[1]}'] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['{type[3]}']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['{type[4]}']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vaddl_high_s8', 'int8x16_t', 'int16x8_t', 'vaddl', 'saddl2', 'int8x8_t', '[8, 9, 10, 11, 12, 13, 14, 15]'] + - ['vaddl_high_s16', 'int16x8_t', 'int32x4_t', 'vaddl', 'saddl2', 'int16x4_t', '[4, 5, 6, 7]'] + - ['vaddl_high_s32', 'int32x4_t', 'int64x2_t', 'vaddl', 'saddl2', 'int32x2_t', '[2, 3]'] + - ['vaddl_high_u8', 'uint8x16_t', 'uint16x8_t', 'vaddl', 'uaddl2', 'uint8x8_t', '[8, 9, 10, 11, 12, 13, 14, 15]'] + - ['vaddl_high_u16', 'uint16x8_t', 'uint32x4_t', 'vaddl', 'uaddl2', 'uint16x4_t', '[4, 5, 6, 7]'] + - ['vaddl_high_u32', 'uint32x4_t', 'uint64x2_t', 'vaddl', 'uaddl2', 'uint32x2_t', '[2, 3]'] + compose: + - Let: + - a + - '{neon_type[5]}' + - FnCall: ['simd_shuffle!', [a, a, '{type[6]}']] + - Let: + - b + - '{neon_type[5]}' + - FnCall: ['simd_shuffle!', [b, b, '{type[6]}']] + - Let: [a, '{neon_type[2]}', {FnCall: [simd_cast, [a]]}] + - Let: [b, '{neon_type[2]}', {FnCall: [simd_cast, [b]]}] + - FnCall: [simd_add, [a, b]] + + - name: "{type[0]}" + doc: "Add Wide" + arguments: ['a: {neon_type[1]}', 'b: {neon_type[2]}'] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['{type[3]}']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['{type[4]}']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vaddw_s8', 'int16x8_t', 'int8x8_t', 'vaddw', 'saddw'] + - ['vaddw_s16', 'int32x4_t', 'int16x4_t', 'vaddw', 'saddw'] + - ['vaddw_s32', 'int64x2_t', 'int32x2_t', 'vaddw', 'saddw'] + - ['vaddw_u8', 'uint16x8_t', 'uint8x8_t', 'vaddw', 'uaddw'] + - ['vaddw_u16', 'uint32x4_t', 'uint16x4_t', 'vaddw', 'uaddw'] + - ['vaddw_u32', 'uint64x2_t', 'uint32x2_t', 'vaddw', 'uaddw'] + compose: + - Let: + - b + - '{neon_type[1]}' + - FnCall: ['simd_cast', [b]] + - FnCall: [simd_add, [a, b]] + + - name: "{type[0]}" + doc: "Add Wide (high half)." 
+ arguments: ['a: {neon_type[1]}', 'b: {neon_type[2]}'] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['{type[3]}']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['{type[4]}']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vaddw_high_s8', 'int16x8_t', 'int8x16_t', 'vaddw', 'saddw2', 'int8x8_t', '[8, 9, 10, 11, 12, 13, 14, 15]'] + - ['vaddw_high_s16', 'int32x4_t', 'int16x8_t', 'vaddw', 'saddw2', 'int16x4_t', '[4, 5, 6, 7]'] + - ['vaddw_high_s32', 'int64x2_t', 'int32x4_t', 'vaddw', 'saddw2', 'int32x2_t', '[2, 3]'] + - ['vaddw_high_u8', 'uint16x8_t', 'uint8x16_t', 'vaddw', 'uaddw2', 'uint8x8_t', '[8, 9, 10, 11, 12, 13, 14, 15]'] + - ['vaddw_high_u16', 'uint32x4_t', 'uint16x8_t', 'vaddw', 'uaddw2', 'uint16x4_t', '[4, 5, 6, 7]'] + - ['vaddw_high_u32', 'uint64x2_t', 'uint32x4_t', 'vaddw', 'uaddw2', 'uint32x2_t', '[2, 3]'] + compose: + - Let: + - b + - '{neon_type[5]}' + - FnCall: ['simd_shuffle!', [b, b, '{type[6]}']] + - Let: + - b + - '{neon_type[1]}' + - FnCall: ['simd_cast', [b]] + - FnCall: [simd_add, [a, b]] + + - name: "{type[0]}" + doc: "Add returning High Narrow." + arguments: ['a: {neon_type[1]}', 'b: {neon_type[1]}'] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['vaddhn']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['addhn']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vaddhn_s16', 'int16x8_t', 'int8x8_t', 'int16x8_t::splat(8)'] + - ['vaddhn_s32', 'int32x4_t', 'int16x4_t', 'int32x4_t::splat(16)'] + - ['vaddhn_s64', 'int64x2_t', 'int32x2_t', 'int64x2_t::splat(32)'] + - ['vaddhn_u16', 'uint16x8_t', 'uint8x8_t', 'uint16x8_t::splat(8)'] + - ['vaddhn_u32', 'uint32x4_t', 'uint16x4_t', 'uint32x4_t::splat(16)'] + - ['vaddhn_u64', 'uint64x2_t', 'uint32x2_t', 'uint64x2_t::splat(32)'] + compose: + - FnCall: + - simd_cast + - - FnCall: + - simd_shr + - - FnCall: + - simd_add + - - a + - b + - '{type[3]}' + + - name: "{type[0]}" + doc: "Add returning High Narrow (high half)." 
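A minimal sketch of one lane of the `vaddhn_s16` composition above, with a hypothetical helper instead of the generated intrinsic: the lanes are added with wrapping, shifted right by half the element width, and narrowed, so the high half of each sum is returned; the `_high` variants then shuffle the narrowed lanes into the upper half of `r`.

// Hypothetical scalar model of one vaddhn_s16 lane (high half of the sum).
fn vaddhn_s16_lane(a: i16, b: i16) -> i8 {
    (a.wrapping_add(b) >> 8) as i8 // simd_shr(simd_add(a, b), splat(8)), then narrow
}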
+ arguments: ['r: {neon_type[1]}', 'a: {neon_type[2]}', 'b: {neon_type[2]}'] + return_type: "{neon_type[3]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['vaddhn']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['addhn2']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vaddhn_high_s16', 'int8x8_t', 'int16x8_t', 'int8x16_t', 'int16x8_t::splat(8)', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - ['vaddhn_high_s32', 'int16x4_t', 'int32x4_t', 'int16x8_t', 'int32x4_t::splat(16)', '[0, 1, 2, 3, 4, 5, 6, 7]'] + - ['vaddhn_high_s64', 'int32x2_t', 'int64x2_t', 'int32x4_t', 'int64x2_t::splat(32)', '[0, 1, 2, 3]'] + - ['vaddhn_high_u16', 'uint8x8_t', 'uint16x8_t', 'uint8x16_t', 'uint16x8_t::splat(8)', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] + - ['vaddhn_high_u32', 'uint16x4_t', 'uint32x4_t', 'uint16x8_t', 'uint32x4_t::splat(16)', '[0, 1, 2, 3, 4, 5, 6, 7]'] + - ['vaddhn_high_u64', 'uint32x2_t', 'uint64x2_t', 'uint32x4_t', 'uint64x2_t::splat(32)', '[0, 1, 2, 3]'] + compose: + - Let: + - x + - FnCall: + - simd_cast + - - FnCall: + - simd_shr + - - FnCall: + - simd_add + - - a + - b + - '{type[4]}' + - FnCall: ['simd_shuffle!', [r, x, '{type[5]}']] + + - name: "{type[0]}" + doc: "Vector narrow integer." + arguments: ['a: {neon_type[1]}'] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['vmovn']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['xtn']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vmovn_s16', 'int16x8_t', 'int8x8_t'] + - ['vmovn_s32', 'int32x4_t', 'int16x4_t'] + - ['vmovn_s64', 'int64x2_t', 'int32x2_t'] + - ['vmovn_u16', 'uint16x8_t', 'uint8x8_t'] + - ['vmovn_u32', 'uint32x4_t', 'uint16x4_t'] + - ['vmovn_u64', 'uint64x2_t', 'uint32x2_t'] + compose: + - FnCall: [simd_cast, [a]] + + - name: "{type[0]}" + doc: "Vector long move." + arguments: ['a: {neon_type[1]}'] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['vmovl']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['{type[3]}']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vmovl_s8', 'int8x8_t', 'int16x8_t', 'sxtl'] + - ['vmovl_s16', 'int16x4_t', 'int32x4_t', 'sxtl'] + - ['vmovl_s32', 'int32x2_t', 'int64x2_t', 'sxtl'] + - ['vmovl_u8', 'uint8x8_t', 'uint16x8_t', 'uxtl'] + - ['vmovl_u16', 'uint16x4_t', 'uint32x4_t', 'uxtl'] + - ['vmovl_u32', 'uint32x2_t', 'uint64x2_t', 'uxtl'] + compose: + - FnCall: [simd_cast, [a]] + + - name: "{type[0]}" + doc: "Vector bitwise not." 
+ arguments: ['a: {neon_type[1]}'] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['vmvn']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['mvn']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vmvn_s8', 'int8x8_t', 'int8x8_t::splat(-1)'] + - ['vmvnq_s8', 'int8x16_t', 'int8x16_t::splat(-1)'] + - ['vmvn_s16', 'int16x4_t', 'int16x4_t::splat(-1)'] + - ['vmvnq_s16', 'int16x8_t', 'int16x8_t::splat(-1)'] + - ['vmvn_s32', 'int32x2_t', 'int32x2_t::splat(-1)'] + - ['vmvnq_s32', 'int32x4_t', 'int32x4_t::splat(-1)'] + - ['vmvn_u8', 'uint8x8_t', 'uint8x8_t::splat(255)'] + - ['vmvnq_u8', 'uint8x16_t', 'uint8x16_t::splat(255)'] + - ['vmvn_u16', 'uint16x4_t', 'uint16x4_t::splat(65_535)'] + - ['vmvnq_u16', 'uint16x8_t', 'uint16x8_t::splat(65_535)'] + - ['vmvn_u32', 'uint32x2_t', 'uint32x2_t::splat(4_294_967_295)'] + - ['vmvnq_u32', 'uint32x4_t', 'uint32x4_t::splat(4_294_967_295)'] + - ['vmvn_p8', 'poly8x8_t', 'poly8x8_t::splat(255)'] + - ['vmvnq_p8', 'poly8x16_t', 'poly8x16_t::splat(255)'] + compose: + - Let: [b, '{type[2]}'] + - FnCall: [simd_xor, [a, b]] + + - name: "{type[0]}" + doc: "Vector bitwise bit clear." + arguments: ['a: {neon_type[1]}', 'b: {neon_type[1]}'] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['vbic']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['bic']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vbic_s8', 'int8x8_t', 'int8x8_t::splat(-1)'] + - ['vbic_s16', 'int16x4_t', 'int16x4_t::splat(-1)'] + - ['vbic_s32', 'int32x2_t', 'int32x2_t::splat(-1)'] + - ['vbic_s64', 'int64x1_t', 'int64x1_t::splat(-1)'] + - ['vbicq_s8', 'int8x16_t', 'int8x16_t::splat(-1)'] + - ['vbicq_s16', 'int16x8_t', 'int16x8_t::splat(-1)'] + - ['vbicq_s32', 'int32x4_t', 'int32x4_t::splat(-1)'] + - ['vbicq_s64', 'int64x2_t', 'int64x2_t::splat(-1)'] + compose: + - Let: [c, '{type[2]}'] + - FnCall: + - simd_and + - - FnCall: [simd_xor, [b, c]] + - a + + - name: "{type[0]}" + doc: "Vector bitwise bit clear." + arguments: ['a: {neon_type[1]}', 'b: {neon_type[1]}'] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['vbic']] } ]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, { FnCall: [assert_instr, ['bic']]}] ] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vbic_u8', 'uint8x8_t', 'int8x8_t::splat(-1)'] + - ['vbic_u16', 'uint16x4_t', 'int16x4_t::splat(-1)'] + - ['vbic_u32', 'uint32x2_t', 'int32x2_t::splat(-1)'] + - ['vbic_u64', 'uint64x1_t', 'int64x1_t::splat(-1)'] + - ['vbicq_u8', 'uint8x16_t', 'int8x16_t::splat(-1)'] + - ['vbicq_u16', 'uint16x8_t', 'int16x8_t::splat(-1)'] + - ['vbicq_u32', 'uint32x4_t', 'int32x4_t::splat(-1)'] + - ['vbicq_u64', 'uint64x2_t', 'int64x2_t::splat(-1)'] + compose: + - Let: [c, '{type[2]}'] + - FnCall: + - simd_and + - - FnCall: + - simd_xor + - - b + - FnCall: [transmute, [c]] + - a + + - name: "{type[0]}" + doc: "Bitwise Select." 
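+ # Rough illustration: the `compose` below is the bit-twiddling form of
+ # `(a & b) | (!a & c)`: bits are taken from `b` where the mask `a` is set and
+ # from `c` where it is clear, with `!a` computed as `a ^ splat(-1)`.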
+ arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}", "c: {neon_type[2]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['vbsl']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, ['bsl']]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vbsl_s8', 'uint8x8_t', 'int8x8_t', 'int8x8_t::splat(-1)'] + - ['vbsl_s16', 'uint16x4_t', 'int16x4_t', 'int16x4_t::splat(-1)'] + - ['vbsl_s32', 'uint32x2_t', 'int32x2_t', 'int32x2_t::splat(-1)'] + - ['vbsl_s64', 'uint64x1_t', 'int64x1_t', 'int64x1_t::splat(-1)'] + - ['vbsl_f32', 'uint32x2_t', 'float32x2_t', 'int32x2_t::splat(-1)'] + - ['vbslq_f32', 'uint32x4_t', 'float32x4_t', 'int32x4_t::splat(-1)'] + - ['vbsl_p8', 'uint8x8_t', 'poly8x8_t', 'int8x8_t::splat(-1)'] + - ['vbsl_p16', 'uint16x4_t', 'poly16x4_t', 'int16x4_t::splat(-1)'] + - ['vbslq_s8', 'uint8x16_t', 'int8x16_t', 'int8x16_t::splat(-1)'] + - ['vbslq_s16', 'uint16x8_t', 'int16x8_t', 'int16x8_t::splat(-1)'] + - ['vbslq_s32', 'uint32x4_t', 'int32x4_t', 'int32x4_t::splat(-1)'] + - ['vbslq_s64', 'uint64x2_t', 'int64x2_t', 'int64x2_t::splat(-1)'] + - ['vbslq_p8', 'uint8x16_t', 'poly8x16_t', 'int8x16_t::splat(-1)'] + - ['vbslq_p16', 'uint16x8_t', 'poly16x8_t', 'int16x8_t::splat(-1)'] + compose: + - Let: [not, '{type[3]}'] + - FnCall: + - transmute + - - FnCall: + - simd_or + - - FnCall: + - simd_and + - - a + - FnCall: [transmute, [b]] + - FnCall: + - simd_and + - - FnCall: + - simd_xor + - - a + - FnCall: [transmute, [not]] + - FnCall: [transmute, [c]] + + - name: "{type[0]}" + doc: "Bitwise Select." + arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}", "c: {neon_type[2]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-fp16 + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['vbsl']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, ['bsl']]}]] + - *neon-unstable-f16 + safety: safe + types: + - ['vbslq_f16', 'uint16x8_t', 'float16x8_t', 'int16x8_t::splat(-1)'] + - ['vbsl_f16', 'uint16x4_t', 'float16x4_t', 'int16x4_t::splat(-1)'] + compose: + - Let: [not, '{type[3]}'] + - FnCall: + - transmute + - - FnCall: + - simd_or + - - FnCall: + - simd_and + - - a + - FnCall: [transmute, [b]] + - FnCall: + - simd_and + - - FnCall: + - simd_xor + - - a + - FnCall: [transmute, [not]] + - FnCall: [transmute, [c]] + + - name: "{type[0]}" + doc: "Bitwise Select." 
+ arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}", "c: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['vbsl']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, ['bsl']]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vbslq_u8', 'uint8x16_t', 'int8x16_t::splat(-1)'] + - ['vbslq_u16', 'uint16x8_t', 'int16x8_t::splat(-1)'] + - ['vbslq_u32', 'uint32x4_t', 'int32x4_t::splat(-1)'] + - ['vbslq_u64', 'uint64x2_t', 'int64x2_t::splat(-1)'] + - ['vbsl_u8', 'uint8x8_t', 'int8x8_t::splat(-1)'] + - ['vbsl_u16', 'uint16x4_t', 'int16x4_t::splat(-1)'] + - ['vbsl_u32', 'uint32x2_t', 'int32x2_t::splat(-1)'] + - ['vbsl_u64', 'uint64x1_t', 'int64x1_t::splat(-1)'] + compose: + - Let: [not, '{type[2]}'] + - FnCall: + - transmute + - - FnCall: + - simd_or + - - FnCall: [simd_and, [a, b]] + - FnCall: + - simd_and + - - FnCall: + - simd_xor + - - a + - FnCall: [transmute, [not]] + - c + + - name: "{type[0]}" + doc: "Vector bitwise inclusive OR NOT" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['vorn']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, ['orn']]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vorn_s8', 'int8x8_t', 'int8x8_t::splat(-1)'] + - ['vornq_s8', 'int8x16_t', 'int8x16_t::splat(-1)'] + - ['vorn_s16', 'int16x4_t', 'int16x4_t::splat(-1)'] + - ['vornq_s16', 'int16x8_t', 'int16x8_t::splat(-1)'] + - ['vorn_s32', 'int32x2_t', 'int32x2_t::splat(-1)'] + - ['vornq_s32', 'int32x4_t', 'int32x4_t::splat(-1)'] + - ['vorn_s64', 'int64x1_t', 'int64x1_t::splat(-1)'] + - ['vornq_s64', 'int64x2_t', 'int64x2_t::splat(-1)'] + compose: + - Let: [c, '{type[2]}'] + - FnCall: + - simd_or + - - FnCall: [simd_xor, [b, c]] + - a + + - name: "{type[0]}" + doc: "Vector bitwise inclusive OR NOT" + arguments: ["a: {neon_type[1]}", "b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['vorn']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, ['orn']]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vorn_u8', 'uint8x8_t', 'int8x8_t::splat(-1)'] + - ['vornq_u8', 'uint8x16_t', 'int8x16_t::splat(-1)'] + - ['vorn_u16', 'uint16x4_t', 'int16x4_t::splat(-1)'] + - ['vornq_u16', 'uint16x8_t', 'int16x8_t::splat(-1)'] + - ['vorn_u32', 'uint32x2_t', 'int32x2_t::splat(-1)'] + - ['vornq_u32', 'uint32x4_t', 'int32x4_t::splat(-1)'] + - ['vorn_u64', 'uint64x1_t', 'int64x1_t::splat(-1)'] + - ['vornq_u64', 'uint64x2_t', 'int64x2_t::splat(-1)'] + compose: + - Let: [c, '{type[2]}'] + - FnCall: + - simd_or + - - FnCall: + - simd_xor + - - b + - FnCall: [transmute, [c]] + - a + + - name: "{type[0]}" + doc: "Move vector element to general-purpose register" + arguments: ["v: {neon_type[1]}"] + return_type: "{type[2]}" + safety: safe + static_defs: ['const IMM5: i32'] + attr: + - *neon-v7 + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'IMM5 = {type[3]}']]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + types: + - ['vget_lane_s8', 'int8x8_t', 'i8', '2', 'IMM5, 3', 'IMM5 as u32'] + - ['vget_lane_u8', 'uint8x8_t', 'u8', '2', 'IMM5, 3', 'IMM5 as u32'] + - ['vget_lane_p8', 'poly8x8_t', 
'p8', '2', 'IMM5, 3', 'IMM5 as u32'] + - ['vgetq_lane_s8', 'int8x16_t', 'i8', '2', 'IMM5, 4', 'IMM5 as u32'] + - ['vgetq_lane_u8', 'uint8x16_t', 'u8', '2', 'IMM5, 4', 'IMM5 as u32'] + - ['vgetq_lane_p8', 'poly8x16_t', 'p8', '2', 'IMM5, 4', 'IMM5 as u32'] + - ['vget_lane_u16', 'uint16x4_t', 'u16', '2', 'IMM5, 2', 'IMM5 as u32'] + - ['vget_lane_s16', 'int16x4_t', 'i16', '2', 'IMM5, 2', 'IMM5 as u32'] + - ['vget_lane_p16', 'poly16x4_t', 'p16', '2', 'IMM5, 2', 'IMM5 as u32'] + - ['vgetq_lane_u16', 'uint16x8_t', 'u16', '2', 'IMM5, 3', 'IMM5 as u32'] + - ['vgetq_lane_s16', 'int16x8_t', 'i16', '2', 'IMM5, 3', 'IMM5 as u32'] + - ['vgetq_lane_p16', 'poly16x8_t', 'p16', '2', 'IMM5, 3', 'IMM5 as u32'] + - ['vget_lane_u32', 'uint32x2_t', 'u32', '1', 'IMM5, 1', 'IMM5 as u32'] + - ['vget_lane_s32', 'int32x2_t', 'i32', '1', 'IMM5, 1', 'IMM5 as u32'] + - ['vgetq_lane_u32', 'uint32x4_t', 'u32', '2', 'IMM5, 2', 'IMM5 as u32'] + - ['vgetq_lane_s32', 'int32x4_t', 'i32', '2', 'IMM5, 2', 'IMM5 as u32'] + - ['vget_lane_f32', 'float32x2_t', 'f32', '1', 'IMM5, 1', 'IMM5 as u32'] + - ['vgetq_lane_f32', 'float32x4_t', 'f32', '1', 'IMM5, 2', 'IMM5 as u32'] + - ['vgetq_lane_p64', 'poly64x2_t', 'p64', '1', 'IMM5, 1', 'IMM5 as u32'] + - ['vgetq_lane_s64', 'int64x2_t', 'i64', '1', 'IMM5, 1', 'IMM5 as u32'] + - ['vgetq_lane_u64', 'uint64x2_t', 'u64', '1', 'IMM5, 2', 'IMM5 as u32'] + compose: + - FnCall: ['static_assert_uimm_bits!', ['{type[4]}']] + - FnCall: ['simd_extract!', [v, '{type[5]}']] + + - name: "{type[0]}" + doc: "Move vector element to general-purpose register" + arguments: ["v: {neon_type[1]}"] + return_type: "{type[2]}" + safety: safe + static_defs: ['const IMM5: i32'] + attr: + - *neon-v7 + - FnCall: [rustc_legacy_const_generics, ['1']] + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop, 'IMM5 = 0']]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + types: + - ['vget_lane_u64', 'uint64x1_t', 'u64', '0'] + - ['vget_lane_p64', 'poly64x1_t', 'p64', 'IMM5 as u32'] + - ['vget_lane_s64', 'int64x1_t', 'i64', 'IMM5 as u32'] + compose: + - FnCall: ['static_assert!', ['IMM5 == 0']] + - FnCall: ['simd_extract!', [v, '{type[3]}']] + + # Private vfp4 version used by FMA intrinsics because LLVM does + # not inline the non-vfp4 version in vfp4 functions.
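+ # Rough illustration (attributes omitted): the private helper below expands to
+ # approximately
+ #     fn vdup_n_f32_vfp4(value: f32) -> float32x2_t {
+ #         float32x2_t::splat(value)
+ #     }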
+ - name: "{type[0]}" + visibility: private + doc: "Duplicate vector element to vector or scalar" + arguments: ["value: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [target_arch = "arm", {FnCall: [target_feature, ['enable = "vfp4"']]}]] + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['"vdup.32"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, ['dup']]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vdup_n_f32_vfp4', 'f32', 'float32x2_t', 'float32x2_t::splat(value)'] + - ['vdupq_n_f32_vfp4', 'f32', 'float32x4_t', 'float32x4_t::splat(value)'] + compose: + - Identifier: ['{type[3]}', Symbol] + + - name: "{type[0]}" + doc: "Duplicate vector element to vector or scalar" + arguments: ["a: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['"{type[3]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, ['{type[4]}']]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vget_high_s64', 'int64x2_t', 'int64x1_t', 'vmov', 'ext', 'unsafe { int64x1_t([simd_extract!(a, 1)]) }'] + - ['vget_high_u64', 'uint64x2_t', 'uint64x1_t', 'vmov', 'ext', 'unsafe { uint64x1_t([simd_extract!(a, 1)]) }'] + compose: + - Identifier: ['{type[5]}', Symbol] + + - name: "{type[0]}" + doc: "Duplicate vector element to vector or scalar" + arguments: ["a: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vget_low_s64', 'int64x2_t', 'int64x1_t', 'unsafe { int64x1_t([simd_extract!(a, 0)]) }'] + - ['vget_low_u64', 'uint64x2_t', 'uint64x1_t', 'unsafe { uint64x1_t([simd_extract!(a, 0)]) }'] + compose: + - Identifier: ['{type[3]}', Symbol] + + - name: "{type[0]}" + doc: "Duplicate vector element to vector or scalar" + arguments: ["a: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['"{type[3]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, ['{type[4]}']]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vget_high_s8', 'int8x16_t', 'int8x8_t', 'vmov', 'ext', '[8, 9, 10, 11, 12, 13, 14, 15]'] + - ['vget_high_u8', 'uint8x16_t', 'uint8x8_t', 'vmov', 'ext', '[8, 9, 10, 11, 12, 13, 14, 15]'] + - ['vget_high_p8', 'poly8x16_t', 'poly8x8_t', 'vmov', 'ext', '[8, 9, 10, 11, 12, 13, 14, 15]'] + - ['vget_high_s16', 'int16x8_t', 'int16x4_t', 'vmov', 'ext', '[4, 5, 6, 7]'] + - ['vget_high_u16', 'uint16x8_t', 'uint16x4_t', 'vmov', 'ext', '[4, 5, 6, 7]'] + - ['vget_high_p16', 'poly16x8_t', 'poly16x4_t', 'vmov', 'ext', '[4, 5, 6, 7]'] + - ['vget_high_s32', 'int32x4_t', 'int32x2_t', 'vmov', 'ext', '[2, 3]'] + - ['vget_high_u32', 'uint32x4_t', 'uint32x2_t', 'vmov', 'ext', '[2, 3]'] + - ['vget_high_f32', 'float32x4_t', 'float32x2_t', 'vmov', 'ext', '[2, 3]'] + compose: + - FnCall: ['simd_shuffle!', [a, a, '{type[5]}']] + + - name: "{type[0]}" + doc: "Duplicate vector element to vector or scalar" + arguments: ["a: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [nop]]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vget_low_s8', 'int8x16_t', 'int8x8_t', '[0, 1, 2, 3, 4, 5, 6, 7]'] + - 
['vget_low_u8', 'uint8x16_t', 'uint8x8_t','[0, 1, 2, 3, 4, 5, 6, 7]'] + - ['vget_low_p8', 'poly8x16_t', 'poly8x8_t','[0, 1, 2, 3, 4, 5, 6, 7]'] + - ['vget_low_s16', 'int16x8_t', 'int16x4_t', '[0, 1, 2, 3]'] + - ['vget_low_u16', 'uint16x8_t', 'uint16x4_t', '[0, 1, 2, 3]'] + - ['vget_low_p16', 'poly16x8_t', 'poly16x4_t', '[0, 1, 2, 3]'] + - ['vget_low_s32', 'int32x4_t', 'int32x2_t', '[0, 1]'] + - ['vget_low_f32', 'float32x4_t', 'float32x2_t', '[0, 1]'] + - ['vget_low_u32', 'uint32x4_t', 'uint32x2_t', '[0, 1]'] + compose: + - FnCall: ['simd_shuffle!', [a, a, '{type[3]}']] + + - name: "{type[0]}" + doc: "Duplicate vector element to vector or scalar" + arguments: ["value: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['"{type[3]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, ['{type[4]}']]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vdupq_n_s8', 'i8', 'int8x16_t', 'vdup.8', 'dup', 'int8x16_t::splat(value)'] + - ['vdupq_n_s16', 'i16', 'int16x8_t', 'vdup.16', 'dup', 'int16x8_t::splat(value)'] + - ['vdupq_n_s32', 'i32', 'int32x4_t', 'vdup.32', 'dup', 'int32x4_t::splat(value)'] + - ['vdupq_n_s64', 'i64', 'int64x2_t', 'vmov', 'dup', 'int64x2_t::splat(value)'] + - ['vdupq_n_u8', 'u8', 'uint8x16_t', 'vdup.8', 'dup', 'uint8x16_t::splat(value)'] + - ['vdupq_n_u16', 'u16', 'uint16x8_t', 'vdup.16', 'dup', 'uint16x8_t::splat(value)'] + - ['vdupq_n_u32', 'u32', 'uint32x4_t', 'vdup.32', 'dup', 'uint32x4_t::splat(value)'] + - ['vdupq_n_f32', 'f32', 'float32x4_t', 'vdup.32', 'dup', 'float32x4_t::splat(value)'] + - ['vdupq_n_u64', 'u64', 'uint64x2_t', 'vmov', 'dup', 'uint64x2_t::splat(value)'] + - ['vdupq_n_p8', 'p8', 'poly8x16_t', 'vdup.8', 'dup', 'poly8x16_t::splat(value)'] + - ['vdupq_n_p16', 'p16', 'poly16x8_t', 'vdup.16', 'dup', 'poly16x8_t::splat(value)'] + - ['vdup_n_s8', 'i8', 'int8x8_t', 'vdup.8', 'dup', 'int8x8_t::splat(value)'] + - ['vdup_n_s16', 'i16', 'int16x4_t', 'vdup.16', 'dup', 'int16x4_t::splat(value)'] + - ['vdup_n_s32', 'i32', 'int32x2_t', 'vdup.32', 'dup', 'int32x2_t::splat(value)'] + - ['vdup_n_s64', 'i64', 'int64x1_t', 'vmov', 'fmov', 'int64x1_t::splat(value)'] + - ['vdup_n_u8', 'u8', 'uint8x8_t', 'vdup.8', 'dup', 'uint8x8_t::splat(value)'] + - ['vdup_n_u16', 'u16', 'uint16x4_t', 'vdup.16', 'dup', 'uint16x4_t::splat(value)'] + - ['vdup_n_u32', 'u32', 'uint32x2_t', 'vdup.32', 'dup', 'uint32x2_t::splat(value)'] + - ['vdup_n_f32', 'f32', 'float32x2_t', 'vdup.32', 'dup', 'float32x2_t::splat(value)'] + - ['vdup_n_u64', 'u64', 'uint64x1_t', 'vmov', 'fmov', 'uint64x1_t::splat(value)'] + - ['vdup_n_p8', 'p8', 'poly8x8_t', 'vdup.8', 'dup', 'poly8x8_t::splat(value)'] + - ['vdup_n_p16', 'p16', 'poly16x4_t', 'vdup.16', 'dup', 'poly16x4_t::splat(value)'] + compose: + - Identifier: ['{type[5]}', Symbol] + + - name: "{type[0]}" + doc: "Duplicate vector element to vector or scalar" + arguments: ["value: {type[1]}"] + return_type: "{neon_type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['"{type[3]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, ['{type[4]}']]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vmov_n_s8', 'i8', 'int8x8_t', 'vdup.8', 'dup', 'vdup_n_s8'] + - ['vmov_n_s16', 'i16', 'int16x4_t', 'vdup.16', 'dup', 'vdup_n_s16'] + - ['vmov_n_s32', 'i32', 'int32x2_t', 'vdup.32', 'dup', 'vdup_n_s32'] + - ['vmov_n_s64', 
'i64', 'int64x1_t', 'vmov', 'fmov', 'vdup_n_s64'] + - ['vmov_n_u8', 'u8', 'uint8x8_t', 'vdup.8', 'dup', 'vdup_n_u8'] + - ['vmov_n_u16', 'u16', 'uint16x4_t', 'vdup.16', 'dup', 'vdup_n_u16'] + - ['vmov_n_u32', 'u32', 'uint32x2_t', 'vdup.32', 'dup', 'vdup_n_u32'] + - ['vmov_n_u64', 'u64', 'uint64x1_t', 'vmov', 'fmov', 'vdup_n_u64'] + - ['vmov_n_p8', 'p8', 'poly8x8_t', 'vdup.8', 'dup', 'vdup_n_p8'] + - ['vmov_n_p16', 'p16', 'poly16x4_t', 'vdup.16', 'dup', 'vdup_n_p16'] + - ['vmov_n_f32', 'f32', 'float32x2_t', 'vdup.32', 'dup', 'vdup_n_f32'] + - ['vmovq_n_s8', 'i8', 'int8x16_t', 'vdup.8', 'dup', 'vdupq_n_s8'] + - ['vmovq_n_s16', 'i16', 'int16x8_t', 'vdup.16', 'dup', 'vdupq_n_s16'] + - ['vmovq_n_s32', 'i32', 'int32x4_t', 'vdup.32', 'dup', 'vdupq_n_s32'] + - ['vmovq_n_s64', 'i64', 'int64x2_t', 'vmov', 'dup', 'vdupq_n_s64'] + - ['vmovq_n_u8', 'u8', 'uint8x16_t', 'vdup.8', 'dup', 'vdupq_n_u8'] + - ['vmovq_n_u16', 'u16', 'uint16x8_t', 'vdup.16', 'dup', 'vdupq_n_u16'] + - ['vmovq_n_u32', 'u32', 'uint32x4_t', 'vdup.32', 'dup', 'vdupq_n_u32'] + - ['vmovq_n_u64', 'u64', 'uint64x2_t', 'vmov', 'dup', 'vdupq_n_u64'] + - ['vmovq_n_p8', 'p8', 'poly8x16_t', 'vdup.8', 'dup', 'vdupq_n_p8'] + - ['vmovq_n_p16', 'p16', 'poly16x8_t', 'vdup.16', 'dup', 'vdupq_n_p16'] + - ['vmovq_n_f32', 'f32', 'float32x4_t', 'vdup.32', 'dup', 'vdupq_n_f32'] + compose: + - FnCall: ['{type[5]}', [value]] + + - name: "{type[0]}" + doc: "Store SIMD&FP register (immediate offset)" + arguments: ["a: {type[1]}"] + return_type: "{type[2]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['nop']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, ['nop']]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ['vldrq_p128', '* const p128', 'p128'] + compose: + - Identifier: ['*a', Symbol] + + - name: "{type[0]}" + doc: "Store SIMD&FP register (immediate offset)" + arguments: ["a: {type[1]}", "b: {type[2]}"] + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['nop']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, ['nop']]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: + unsafe: [neon] + types: + - ['vstrq_p128', '* mut p128', 'p128'] + compose: + - Identifier: ['*a = b', Symbol] + + - name: "{type[0]}" + doc: "Extract vector from pair of vectors" + arguments: ["a: {neon_type[1]}", "_b: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['nop', 'N = 0']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, ['nop', 'N = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['2']] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + static_defs: ['const N: i32'] + safety: + unsafe: [neon] + types: + - ['vext_s64', 'int64x1_t'] + - ['vext_u64', 'uint64x1_t'] + compose: + - FnCall: ['static_assert!', ['N == 0']] + - Identifier: ['a', Symbol] + + - name: "{type[0]}" + doc: "Reversing vector elements (swap endianness)" + arguments: ["a: {neon_type[1]}"] + return_type: "{neon_type[1]}" + attr: + - *neon-v7 + - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"{type[2]}"']]}]] + - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, ['{type[3]}']]}]] + - *neon-not-arm-stable + - *neon-cfg-arm-unstable + safety: safe + types: + - ['vrev16_s8', 'int8x8_t', 'vrev16.8', 'rev16', '[1, 0, 3, 2, 5, 4, 7, 6]'] + - ['vrev16q_s8', 
'int8x16_t', 'vrev16.8', 'rev16', '[1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]'] + - ['vrev16_u8', 'uint8x8_t', 'vrev16.8', 'rev16', '[1, 0, 3, 2, 5, 4, 7, 6]'] + - ['vrev16q_u8', 'uint8x16_t', 'vrev16.8', 'rev16', '[1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]'] + - ['vrev16_p8', 'poly8x8_t', 'vrev16.8', 'rev16', '[1, 0, 3, 2, 5, 4, 7, 6]'] + - ['vrev16q_p8', 'poly8x16_t', 'vrev16.8', 'rev16', '[1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]'] + - ['vrev32_s8', 'int8x8_t', 'vrev32.8', 'rev32', '[3, 2, 1, 0, 7, 6, 5, 4]'] + - ['vrev32q_s8', 'int8x16_t', 'vrev32.8', 'rev32', '[3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]'] + - ['vrev32_u8', 'uint8x8_t', 'vrev32.8', 'rev32', '[3, 2, 1, 0, 7, 6, 5, 4]'] + - ['vrev32q_u8', 'uint8x16_t', 'vrev32.8', 'rev32', '[3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]'] + - ['vrev32_p8', 'poly8x8_t', 'vrev32.8', 'rev32', '[3, 2, 1, 0, 7, 6, 5, 4]'] + - ['vrev32q_p8', 'poly8x16_t', 'vrev32.8', 'rev32', '[3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]'] + - ['vrev32_s16', 'int16x4_t', 'vrev32.16', 'rev32', '[1, 0, 3, 2]'] + - ['vrev32q_s16', 'int16x8_t', 'vrev32.16', 'rev32', '[1, 0, 3, 2, 5, 4, 7, 6]'] + - ['vrev32_u16', 'uint16x4_t', 'vrev32.16', 'rev32', '[1, 0, 3, 2]'] + - ['vrev32q_u16', 'uint16x8_t', 'vrev32.16', 'rev32', '[1, 0, 3, 2, 5, 4, 7, 6]'] + - ['vrev32_p16', 'poly16x4_t', 'vrev32.16', 'rev32', '[1, 0, 3, 2]'] + - ['vrev32q_p16', 'poly16x8_t', 'vrev32.16', 'rev32', '[1, 0, 3, 2, 5, 4, 7, 6]'] + - ['vrev64_s8', 'int8x8_t', 'vrev64.8', 'rev64', '[7, 6, 5, 4, 3, 2, 1, 0]'] + - ['vrev64q_s8', 'int8x16_t', 'vrev64.8', 'rev64', '[7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8]'] + - ['vrev64_u8', 'uint8x8_t', 'vrev64.8', 'rev64', '[7, 6, 5, 4, 3, 2, 1, 0]'] + - ['vrev64q_u8', 'uint8x16_t', 'vrev64.8', 'rev64', '[7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8]'] + - ['vrev64_p8', 'poly8x8_t', 'vrev64.8', 'rev64', '[7, 6, 5, 4, 3, 2, 1, 0]'] + - ['vrev64q_p8', 'poly8x16_t', 'vrev64.8', 'rev64', '[7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8]'] + - ['vrev64_s16', 'int16x4_t', 'vrev64.16', 'rev64', '[3, 2, 1, 0]'] + - ['vrev64q_s16', 'int16x8_t', 'vrev64.16', 'rev64', '[3, 2, 1, 0, 7, 6, 5, 4]'] + - ['vrev64_u16', 'uint16x4_t', 'vrev64.16', 'rev64', '[3, 2, 1, 0]'] + - ['vrev64q_u16', 'uint16x8_t', 'vrev64.16', 'rev64', '[3, 2, 1, 0, 7, 6, 5, 4]'] + - ['vrev64_p16', 'poly16x4_t', 'vrev64.16', 'rev64', '[3, 2, 1, 0]'] + - ['vrev64q_p16', 'poly16x8_t', 'vrev64.16', 'rev64', '[3, 2, 1, 0, 7, 6, 5, 4]'] + - ['vrev64_s32', 'int32x2_t', 'vrev64.32', 'rev64', '[1, 0]'] + - ['vrev64q_s32', 'int32x4_t', 'vrev64.32', 'rev64', '[1, 0, 3, 2]'] + - ['vrev64_u32', 'uint32x2_t', 'vrev64.32', 'rev64', '[1, 0]'] + - ['vrev64q_u32', 'uint32x4_t', 'vrev64.32', 'rev64', '[1, 0, 3, 2]'] + - ['vrev64_f32', 'float32x2_t', 'vrev64.32', 'rev64', '[1, 0]'] + - ['vrev64q_f32', 'float32x4_t', 'vrev64.32', 'rev64', '[1, 0, 3, 2]'] + compose: + - FnCall: ['simd_shuffle!', [a, a, '{type[4]}']] diff --git a/library/stdarch/crates/stdarch-gen-arm/src/assert_instr.rs b/library/stdarch/crates/stdarch-gen-arm/src/assert_instr.rs new file mode 100644 index 0000000000000..799b3379a851c --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/src/assert_instr.rs @@ -0,0 +1,372 @@ +use proc_macro2::TokenStream; +use quote::{ToTokens, TokenStreamExt, format_ident, quote}; +use serde::de::{self, MapAccess, Visitor}; +use serde::{Deserialize, Deserializer, Serialize, ser::SerializeSeq}; +use std::fmt; + +use crate::{ + 
context::{self, Context}, + typekinds::{BaseType, BaseTypeKind}, + wildstring::WildString, +}; + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum InstructionAssertion { + Basic(WildString), + WithArgs(WildString, WildString), +} + +impl InstructionAssertion { + fn build(&mut self, ctx: &Context) -> context::Result { + match self { + InstructionAssertion::Basic(ws) => ws.build_acle(ctx.local), + InstructionAssertion::WithArgs(ws, args_ws) => [ws, args_ws] + .into_iter() + .try_for_each(|ws| ws.build_acle(ctx.local)), + } + } +} + +impl ToTokens for InstructionAssertion { + fn to_tokens(&self, tokens: &mut TokenStream) { + let instr = format_ident!( + "{}", + match self { + Self::Basic(instr) => instr, + Self::WithArgs(instr, _) => instr, + } + .to_string() + ); + tokens.append_all(quote! { #instr }); + + if let Self::WithArgs(_, args) = self { + let ex: TokenStream = args + .to_string() + .parse() + .expect("invalid instruction assertion arguments expression given"); + tokens.append_all(quote! {, #ex}) + } + } +} + +// Asserts that the given instruction is present for the intrinsic of the associated type bitsize. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(remote = "Self")] +pub struct InstructionAssertionMethodForBitsize { + pub default: InstructionAssertion, + pub byte: Option, + pub halfword: Option, + pub word: Option, + pub doubleword: Option, +} + +impl InstructionAssertionMethodForBitsize { + fn build(&mut self, ctx: &Context) -> context::Result { + if let Some(ref mut byte) = self.byte { + byte.build(ctx)? + } + if let Some(ref mut halfword) = self.halfword { + halfword.build(ctx)? + } + if let Some(ref mut word) = self.word { + word.build(ctx)? + } + if let Some(ref mut doubleword) = self.doubleword { + doubleword.build(ctx)? + } + self.default.build(ctx) + } +} + +impl Serialize for InstructionAssertionMethodForBitsize { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self { + InstructionAssertionMethodForBitsize { + default: InstructionAssertion::Basic(instr), + byte: None, + halfword: None, + word: None, + doubleword: None, + } => serializer.serialize_str(&instr.to_string()), + InstructionAssertionMethodForBitsize { + default: InstructionAssertion::WithArgs(instr, args), + byte: None, + halfword: None, + word: None, + doubleword: None, + } => { + let mut seq = serializer.serialize_seq(Some(2))?; + seq.serialize_element(&instr.to_string())?; + seq.serialize_element(&args.to_string())?; + seq.end() + } + _ => InstructionAssertionMethodForBitsize::serialize(self, serializer), + } + } +} + +impl<'de> Deserialize<'de> for InstructionAssertionMethodForBitsize { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct IAMVisitor; + + impl<'de> Visitor<'de> for IAMVisitor { + type Value = InstructionAssertionMethodForBitsize; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("array, string or map") + } + + fn visit_str(self, value: &str) -> Result + where + E: de::Error, + { + Ok(InstructionAssertionMethodForBitsize { + default: InstructionAssertion::Basic(value.parse().map_err(E::custom)?), + byte: None, + halfword: None, + word: None, + doubleword: None, + }) + } + + fn visit_seq
(self, mut seq: A) -> Result + where + A: de::SeqAccess<'de>, + { + use serde::de::Error; + let make_err = + || Error::custom("invalid number of arguments passed to assert_instruction"); + let instruction = seq.next_element()?.ok_or_else(make_err)?; + let args = seq.next_element()?.ok_or_else(make_err)?; + + if let Some(true) = seq.size_hint().map(|len| len > 0) { + Err(make_err()) + } else { + Ok(InstructionAssertionMethodForBitsize { + default: InstructionAssertion::WithArgs(instruction, args), + byte: None, + halfword: None, + word: None, + doubleword: None, + }) + } + } + + fn visit_map(self, map: M) -> Result + where + M: MapAccess<'de>, + { + InstructionAssertionMethodForBitsize::deserialize( + de::value::MapAccessDeserializer::new(map), + ) + } + } + + deserializer.deserialize_any(IAMVisitor) + } +} + +/// Asserts that the given instruction is present for the intrinsic of the associated type. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(remote = "Self")] +pub struct InstructionAssertionMethod { + /// Instruction for integer intrinsics + pub default: InstructionAssertionMethodForBitsize, + /// Instruction for floating-point intrinsics (optional) + #[serde(default)] + pub float: Option, + /// Instruction for unsigned integer intrinsics (optional) + #[serde(default)] + pub unsigned: Option, +} + +impl InstructionAssertionMethod { + pub(crate) fn build(&mut self, ctx: &Context) -> context::Result { + if let Some(ref mut float) = self.float { + float.build(ctx)? + } + if let Some(ref mut unsigned) = self.unsigned { + unsigned.build(ctx)? + } + self.default.build(ctx) + } +} + +impl Serialize for InstructionAssertionMethod { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self { + InstructionAssertionMethod { + default: + InstructionAssertionMethodForBitsize { + default: InstructionAssertion::Basic(instr), + byte: None, + halfword: None, + word: None, + doubleword: None, + }, + float: None, + unsigned: None, + } => serializer.serialize_str(&instr.to_string()), + InstructionAssertionMethod { + default: + InstructionAssertionMethodForBitsize { + default: InstructionAssertion::WithArgs(instr, args), + byte: None, + halfword: None, + word: None, + doubleword: None, + }, + float: None, + unsigned: None, + } => { + let mut seq = serializer.serialize_seq(Some(2))?; + seq.serialize_element(&instr.to_string())?; + seq.serialize_element(&args.to_string())?; + seq.end() + } + _ => InstructionAssertionMethod::serialize(self, serializer), + } + } +} + +impl<'de> Deserialize<'de> for InstructionAssertionMethod { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct IAMVisitor; + + impl<'de> Visitor<'de> for IAMVisitor { + type Value = InstructionAssertionMethod; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("array, string or map") + } + + fn visit_str(self, value: &str) -> Result + where + E: de::Error, + { + Ok(InstructionAssertionMethod { + default: InstructionAssertionMethodForBitsize { + default: InstructionAssertion::Basic(value.parse().map_err(E::custom)?), + byte: None, + halfword: None, + word: None, + doubleword: None, + }, + float: None, + unsigned: None, + }) + } + + fn visit_seq(self, mut seq: A) -> Result + where + A: de::SeqAccess<'de>, + { + use serde::de::Error; + let make_err = + || Error::custom("invalid number of arguments passed to assert_instruction"); + let instruction = seq.next_element()?.ok_or_else(make_err)?; + let args = 
seq.next_element()?.ok_or_else(make_err)?; + + if let Some(true) = seq.size_hint().map(|len| len > 0) { + Err(make_err()) + } else { + Ok(InstructionAssertionMethod { + default: InstructionAssertionMethodForBitsize { + default: InstructionAssertion::WithArgs(instruction, args), + byte: None, + halfword: None, + word: None, + doubleword: None, + }, + float: None, + unsigned: None, + }) + } + } + + fn visit_map(self, map: M) -> Result + where + M: MapAccess<'de>, + { + InstructionAssertionMethod::deserialize(de::value::MapAccessDeserializer::new(map)) + } + } + + deserializer.deserialize_any(IAMVisitor) + } +} + +#[derive(Debug)] +pub struct InstructionAssertionsForBaseType<'a>( + pub &'a Vec, + pub &'a Option<&'a BaseType>, +); + +impl<'a> ToTokens for InstructionAssertionsForBaseType<'a> { + fn to_tokens(&self, tokens: &mut TokenStream) { + self.0.iter().for_each( + |InstructionAssertionMethod { + default, + float, + unsigned, + }| { + let kind = self.1.map(|ty| ty.kind()); + let instruction = match (kind, float, unsigned) { + (None, float, unsigned) if float.is_some() || unsigned.is_some() => { + unreachable!( + "cannot determine the base type kind for instruction assertion: {self:#?}") + } + (Some(BaseTypeKind::Float), Some(float), _) => float, + (Some(BaseTypeKind::UInt), _, Some(unsigned)) => unsigned, + _ => default, + }; + + let bitsize = self.1.and_then(|ty| ty.get_size().ok()); + let instruction = match (bitsize, instruction) { + ( + Some(8), + InstructionAssertionMethodForBitsize { + byte: Some(byte), .. + }, + ) => byte, + ( + Some(16), + InstructionAssertionMethodForBitsize { + halfword: Some(halfword), + .. + }, + ) => halfword, + ( + Some(32), + InstructionAssertionMethodForBitsize { + word: Some(word), .. + }, + ) => word, + ( + Some(64), + InstructionAssertionMethodForBitsize { + doubleword: Some(doubleword), + .. + }, + ) => doubleword, + (_, InstructionAssertionMethodForBitsize { default, .. }) => default, + }; + + tokens.append_all(quote! { #[cfg_attr(test, assert_instr(#instruction))]}) + }, + ); + } +} diff --git a/library/stdarch/crates/stdarch-gen-arm/src/big_endian.rs b/library/stdarch/crates/stdarch-gen-arm/src/big_endian.rs new file mode 100644 index 0000000000000..b982ff53ec3d2 --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/src/big_endian.rs @@ -0,0 +1,184 @@ +use crate::expression::LetVariant; +use crate::wildstring::WildStringPart; +use crate::{ + expression::{Expression, IdentifierType}, + typekinds::*, + wildstring::WildString, +}; + +/// Simplifies creating a string that can be used in an Expression, as Expression +/// expects all strings to be `WildString` +fn create_single_wild_string(name: &str) -> WildString { + WildString(vec![WildStringPart::String(name.to_string())]) +} + +/// Creates an Identifier with name `name` with no wildcards. This, for example, +/// can be used to create variables, function names or arbitrary input. Is is +/// extremely flexible. +pub fn create_symbol_identifier(arbitrary_string: &str) -> Expression { + let identifier_name = create_single_wild_string(arbitrary_string); + Expression::Identifier(identifier_name, IdentifierType::Symbol) +} + +/// To compose the simd_shuffle! 
call we need: +/// - simd_shuffle!(, , ) +/// +/// Here we are creating a string version of the `` that can be used as an +/// Expression Identifier +/// +/// In textual form `a: int32x4_t` which has 4 lanes would generate: +/// ``` +/// [0, 1, 2, 3] +/// ``` +fn create_array(lanes: u32) -> Option { + match lanes { + 1 => None, /* Makes no sense to shuffle an array of size 1 */ + 2 => Some("[1, 0]".to_string()), + 3 => Some("[2, 1, 0]".to_string()), + 4 => Some("[3, 2, 1, 0]".to_string()), + 8 => Some("[7, 6, 5, 4, 3, 2, 1, 0]".to_string()), + 16 => Some("[15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]".to_string()), + _ => panic!("Incorrect vector number of vector lanes: {lanes}"), + } +} + +/// Creates: `let : = ` +pub fn create_let_variable( + variable_name: &str, + type_kind: &TypeKind, + expression: Expression, +) -> Expression { + let identifier_name = create_single_wild_string(variable_name); + Expression::Let(LetVariant::WithType( + identifier_name, + type_kind.clone(), + Box::new(expression), + )) +} + +pub fn create_mut_let_variable( + variable_name: &str, + type_kind: &TypeKind, + expression: Expression, +) -> Expression { + let identifier_name = create_single_wild_string(variable_name); + Expression::Let(LetVariant::MutWithType( + identifier_name, + type_kind.clone(), + Box::new(expression), + )) +} + +pub fn type_has_tuple(type_kind: &TypeKind) -> bool { + if let TypeKind::Vector(vector_type) = type_kind { + vector_type.tuple_size().is_some() + } else { + false + } +} + +pub fn make_variable_mutable(variable_name: &str, type_kind: &TypeKind) -> Expression { + let mut_variable = format!("let mut {variable_name}: {type_kind} = {variable_name}"); + let identifier_name = create_single_wild_string(&mut_variable); + Expression::Identifier(identifier_name, IdentifierType::Symbol) +} + +/// For creating shuffle calls, accepts function pointers for formatting for tuple +/// types and types without a tuple +/// +/// Example: +/// +/// `a: int32x4_t` with formatting function `create_shuffle_call_fmt` creates: +/// ``` +/// simd_shuffle!(a, a, [0, 1, 2, 3]) +/// ``` +/// +/// `a: int32x4x2_t` creates: +/// ``` +/// a.0 = simd_shuffle!(a.0, a.0, [0, 1, 2, 3]) +/// a.1 = simd_shuffle!(a.1, a.1, [0, 1, 2, 3]) +/// ``` +fn create_shuffle_internal( + variable_name: &String, + type_kind: &TypeKind, + fmt_tuple: fn(variable_name: &String, idx: u32, array_lanes: &String) -> String, + fmt: fn(variable_name: &String, type_kind: &TypeKind, array_lanes: &String) -> String, +) -> Option { + let TypeKind::Vector(vector_type) = type_kind else { + return None; + }; + + let lane_count = vector_type.lanes(); + let array_lanes = create_array(lane_count)?; + + let tuple_count = vector_type.tuple_size().map_or_else(|| 0, |t| t.to_int()); + + if tuple_count > 0 { + let capacity_estimate: usize = + tuple_count as usize * (lane_count as usize + ((variable_name.len() + 2) * 3)); + let mut string_builder = String::with_capacity(capacity_estimate); + + /* .idx = simd_shuffle!(.idx, .idx, []) */ + for idx in 0..tuple_count { + let formatted = fmt_tuple(variable_name, idx, &array_lanes); + string_builder += formatted.as_str(); + } + Some(create_symbol_identifier(&string_builder)) + } else { + /* Generate a list of shuffles for each tuple */ + let expression = fmt(variable_name, type_kind, &array_lanes); + Some(create_symbol_identifier(&expression)) + } +} + +fn create_assigned_tuple_shuffle_call_fmt( + variable_name: &String, + idx: u32, + array_lanes: &String, +) -> String { + format!( + 
"{variable_name}.{idx} = unsafe {{ simd_shuffle!({variable_name}.{idx}, {variable_name}.{idx}, {array_lanes}) }};\n" + ) +} + +fn create_assigned_shuffle_call_fmt( + variable_name: &String, + type_kind: &TypeKind, + array_lanes: &String, +) -> String { + format!( + "let {variable_name}: {type_kind} = unsafe {{ simd_shuffle!({variable_name}, {variable_name}, {array_lanes}) }}" + ) +} + +fn create_shuffle_call_fmt( + variable_name: &String, + _type_kind: &TypeKind, + array_lanes: &String, +) -> String { + format!("simd_shuffle!({variable_name}, {variable_name}, {array_lanes})") +} + +/// Create a `simd_shuffle!(<...>, [...])` call, where the output is stored +/// in a variable named `variable_name` +pub fn create_assigned_shuffle_call( + variable_name: &String, + type_kind: &TypeKind, +) -> Option { + create_shuffle_internal( + variable_name, + type_kind, + create_assigned_tuple_shuffle_call_fmt, + create_assigned_shuffle_call_fmt, + ) +} + +/// Create a `simd_shuffle!(<...>, [...])` call +pub fn create_shuffle_call(variable_name: &String, type_kind: &TypeKind) -> Option { + create_shuffle_internal( + variable_name, + type_kind, + create_assigned_tuple_shuffle_call_fmt, + create_shuffle_call_fmt, + ) +} diff --git a/library/stdarch/crates/stdarch-gen-arm/src/context.rs b/library/stdarch/crates/stdarch-gen-arm/src/context.rs new file mode 100644 index 0000000000000..9b8eb8e8b9bfe --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/src/context.rs @@ -0,0 +1,274 @@ +use itertools::Itertools; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +use crate::{ + expression::Expression, + input::{InputSet, InputType}, + intrinsic::{Constraint, Intrinsic, Signature}, + matching::SizeMatchable, + predicate_forms::PredicateForm, + typekinds::{ToRepr, TypeKind}, + wildcards::Wildcard, + wildstring::WildString, +}; + +/// Maximum SVE vector size +const SVE_VECTOR_MAX_SIZE: u32 = 2048; +/// Vector register size +const VECTOR_REG_SIZE: u32 = 128; + +/// Generator result +pub type Result = std::result::Result; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ArchitectureSettings { + #[serde(alias = "arch")] + pub arch_name: String, + pub target_feature: Vec, + #[serde(alias = "llvm_prefix")] + pub llvm_link_prefix: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GlobalContext { + pub arch_cfgs: Vec, + #[serde(default)] + pub uses_neon_types: bool, + + /// Should the yaml file automagically generate big endian shuffling + #[serde(default)] + pub auto_big_endian: Option, + + /// Should all LLVM wrappers convert their arguments to a signed type + #[serde(default)] + pub auto_llvm_sign_conversion: bool, +} + +/// Context of an intrinsic group +#[derive(Debug, Clone, Default)] +pub struct GroupContext { + /// LLVM links to target input sets + pub links: HashMap, +} + +#[derive(Debug, Clone, Copy)] +pub enum VariableType { + Argument, + Internal, +} + +#[derive(Debug, Clone)] +pub struct LocalContext { + pub signature: Signature, + + pub input: InputSet, + + pub substitutions: HashMap, + pub variables: HashMap, +} + +impl LocalContext { + pub fn new(input: InputSet, original: &Intrinsic) -> LocalContext { + LocalContext { + signature: original.signature.clone(), + input, + substitutions: HashMap::new(), + variables: HashMap::new(), + } + } + + pub fn provide_type_wildcard(&self, wildcard: &Wildcard) -> Result { + let err = || { + format!( + "provide_type_wildcard() wildcard {{{wildcard}}} not found for {}", + &self.signature.name.to_string() + ) + 
}; + + /* If the type is already a vector then we can just return the vector */ + let make_neon = |tuple_size| { + move |ty| match ty { + TypeKind::Vector(_) => Ok(ty), + _ => TypeKind::make_vector(ty, false, tuple_size), + } + }; + let make_sve = |tuple_size| move |ty| TypeKind::make_vector(ty, true, tuple_size); + + match wildcard { + Wildcard::Type(idx) => self.input.typekind(*idx).ok_or_else(err), + Wildcard::NEONType(idx, tuple_size, _) => self + .input + .typekind(*idx) + .ok_or_else(|| { + dbg!("{:?}", &self); + err() + }) + .and_then(make_neon(*tuple_size)), + Wildcard::SVEType(idx, tuple_size) => self + .input + .typekind(*idx) + .ok_or_else(err) + .and_then(make_sve(*tuple_size)), + Wildcard::Predicate(idx) => self.input.typekind(*idx).map_or_else( + || { + if idx.is_none() && self.input.types_len() == 1 { + Err(err()) + } else { + Err(format!( + "there is no type at index {} to infer the predicate from", + idx.unwrap_or(0) + )) + } + }, + |ref ty| TypeKind::make_predicate_from(ty), + ), + Wildcard::MaxPredicate => self + .input + .iter() + .filter_map(|arg| arg.typekind()) + .max_by(|x, y| { + x.base_type() + .and_then(|bt| bt.get_size().ok()) + .unwrap_or(0) + .cmp(&y.base_type().and_then(|bt| bt.get_size().ok()).unwrap_or(0)) + }) + .map_or_else( + || Err("there are no types available to infer the predicate from".to_string()), + TypeKind::make_predicate_from, + ), + Wildcard::Scale(w, as_ty) => { + let mut ty = self.provide_type_wildcard(w)?; + if let Some(vty) = ty.vector_mut() { + let base_ty = if let Some(w) = as_ty.wildcard() { + *self.provide_type_wildcard(w)?.base_type().unwrap() + } else { + *as_ty.base_type().unwrap() + }; + vty.cast_base_type_as(base_ty) + } + Ok(ty) + } + _ => Err(err()), + } + } + + pub fn provide_substitution_wildcard(&self, wildcard: &Wildcard) -> Result { + let err = || Err(format!("wildcard {{{wildcard}}} not found")); + + match wildcard { + Wildcard::SizeLiteral(idx) => self.input.typekind(*idx) + .map_or_else(err, |ty| Ok(ty.size_literal())), + Wildcard::Size(idx) => self.input.typekind(*idx) + .map_or_else(err, |ty| Ok(ty.size())), + Wildcard::SizeMinusOne(idx) => self.input.typekind(*idx) + .map_or_else(err, |ty| Ok((ty.size().parse::().unwrap()-1).to_string())), + Wildcard::SizeInBytesLog2(idx) => self.input.typekind(*idx) + .map_or_else(err, |ty| Ok(ty.size_in_bytes_log2())), + Wildcard::NVariant if !self.substitutions.contains_key(wildcard) => Ok(String::new()), + Wildcard::TypeKind(idx, opts) => { + self.input.typekind(*idx) + .map_or_else(err, |ty| { + let literal = if let Some(opts) = opts { + #[allow(clippy::obfuscated_if_else)] + opts.contains(ty.base_type().map(|bt| *bt.kind()).ok_or_else(|| { + format!("cannot retrieve a type literal out of {ty}") + })?) 
+ .then(|| ty.type_kind()) + .unwrap_or_default() + } else { + ty.type_kind() + }; + Ok(literal) + }) + } + Wildcard::PredicateForms(_) => self + .input + .iter() + .find_map(|arg| { + if let InputType::PredicateForm(pf) = arg { + Some(pf.get_suffix().to_string()) + } else { + None + } + }) + .ok_or_else(|| unreachable!("attempting to render a predicate form wildcard, but no predicate form was compiled for it")), + _ => self + .substitutions + .get(wildcard) + .map_or_else(err, |s| Ok(s.clone())), + } + } + + pub fn make_assertion_from_constraint(&self, constraint: &Constraint) -> Result { + match constraint { + Constraint::AnyI32 { + variable, + any_values, + } => { + let where_ex = any_values + .iter() + .map(|value| format!("{variable} == {value}")) + .join(" || "); + Ok(Expression::MacroCall("static_assert".to_string(), where_ex)) + } + Constraint::RangeI32 { + variable, + range: SizeMatchable::Matched(range), + } => Ok(Expression::MacroCall( + "static_assert_range".to_string(), + format!( + "{variable}, {min}, {max}", + min = range.start(), + max = range.end() + ), + )), + Constraint::SVEMaxElems { + variable, + sve_max_elems_type: ty, + } + | Constraint::VecMaxElems { + variable, + vec_max_elems_type: ty, + } => { + if !self.input.is_empty() { + let higher_limit = match constraint { + Constraint::SVEMaxElems { .. } => SVE_VECTOR_MAX_SIZE, + Constraint::VecMaxElems { .. } => VECTOR_REG_SIZE, + _ => unreachable!(), + }; + + let max = ty.base_type() + .map(|ty| ty.get_size()) + .transpose()? + .map_or_else( + || Err(format!("can't make an assertion out of constraint {self:?}: no valid type is present")), + |bitsize| Ok(higher_limit / bitsize - 1))?; + Ok(Expression::MacroCall( + "static_assert_range".to_string(), + format!("{variable}, 0, {max}"), + )) + } else { + Err(format!( + "can't make an assertion out of constraint {self:?}: no types are being used" + )) + } + } + _ => unreachable!("constraints were not built successfully!"), + } + } + + pub fn predicate_form(&self) -> Option<&PredicateForm> { + self.input.iter().find_map(|arg| arg.predicate_form()) + } + + pub fn n_variant_op(&self) -> Option<&WildString> { + self.input.iter().find_map(|arg| arg.n_variant_op()) + } +} + +pub struct Context<'ctx> { + pub local: &'ctx mut LocalContext, + pub group: &'ctx mut GroupContext, + pub global: &'ctx GlobalContext, +} diff --git a/library/stdarch/crates/stdarch-gen-arm/src/expression.rs b/library/stdarch/crates/stdarch-gen-arm/src/expression.rs new file mode 100644 index 0000000000000..56c94602fff94 --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/src/expression.rs @@ -0,0 +1,618 @@ +use itertools::Itertools; +use lazy_static::lazy_static; +use proc_macro2::{Literal, Punct, Spacing, TokenStream}; +use quote::{ToTokens, TokenStreamExt, format_ident, quote}; +use regex::Regex; +use serde::de::{self, MapAccess, Visitor}; +use serde::{Deserialize, Deserializer, Serialize}; +use std::fmt; +use std::str::FromStr; + +use crate::intrinsic::Intrinsic; +use crate::wildstring::WildStringPart; +use crate::{ + context::{self, Context, VariableType}, + intrinsic::{Argument, LLVMLink, StaticDefinition}, + matching::{MatchKindValues, MatchSizeValues}, + typekinds::{BaseType, BaseTypeKind, TypeKind}, + wildcards::Wildcard, + wildstring::WildString, +}; + +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub enum IdentifierType { + Variable, + Symbol, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum LetVariant { + Basic(WildString, Box), + 
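+     // Note: this enum is deserialized untagged, so in the spec YAML a
+     // two-element `Let: [name, expr]` becomes `Basic` and a three-element
+     // `Let: [name, type, expr]` becomes `WithType`; `MutWithType` is normally
+     // constructed programmatically (e.g. by the big-endian shuffle helpers).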
WithType(WildString, TypeKind, Box), + MutWithType(WildString, TypeKind, Box), +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FnCall( + /// Function pointer + pub Box, + /// Function arguments + pub Vec, + /// Function turbofish arguments + #[serde(default)] + pub Vec, + /// Function requires unsafe wrapper + #[serde(default)] + pub bool, +); + +impl FnCall { + pub fn new_expression(fn_ptr: Expression, arguments: Vec) -> Expression { + FnCall(Box::new(fn_ptr), arguments, Vec::new(), false).into() + } + + pub fn new_unsafe_expression(fn_ptr: Expression, arguments: Vec) -> Expression { + FnCall(Box::new(fn_ptr), arguments, Vec::new(), true).into() + } + + pub fn is_llvm_link_call(&self, llvm_link_name: &str) -> bool { + self.is_expected_call(llvm_link_name) + } + + pub fn is_target_feature_call(&self) -> bool { + self.is_expected_call("target_feature") + } + + pub fn is_expected_call(&self, fn_call_name: &str) -> bool { + if let Expression::Identifier(fn_name, IdentifierType::Symbol) = self.0.as_ref() { + fn_name.to_string() == fn_call_name + } else { + false + } + } + + pub fn pre_build(&mut self, ctx: &mut Context) -> context::Result { + self.0.pre_build(ctx)?; + self.1 + .iter_mut() + .chain(self.2.iter_mut()) + .try_for_each(|ex| ex.pre_build(ctx)) + } + + pub fn build(&mut self, intrinsic: &Intrinsic, ctx: &mut Context) -> context::Result { + self.0.build(intrinsic, ctx)?; + self.1 + .iter_mut() + .chain(self.2.iter_mut()) + .try_for_each(|ex| ex.build(intrinsic, ctx)) + } +} + +impl ToTokens for FnCall { + fn to_tokens(&self, tokens: &mut TokenStream) { + let FnCall(fn_ptr, arguments, turbofish, _requires_unsafe_wrapper) = self; + + fn_ptr.to_tokens(tokens); + + if !turbofish.is_empty() { + tokens.append_all(quote! {::<#(#turbofish),*>}); + } + + tokens.append_all(quote! { (#(#arguments),*) }) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(remote = "Self", deny_unknown_fields)] +pub enum Expression { + /// (Re)Defines a variable + Let(LetVariant), + /// Performs a variable assignment operation + Assign(String, Box), + /// Performs a macro call + MacroCall(String, String), + /// Performs a function call + FnCall(FnCall), + /// Performs a method call. The following: + /// `MethodCall: ["$object", "to_string", []]` + /// is tokenized as: + /// `object.to_string()`. + MethodCall(Box, String, Vec), + /// Symbol identifier name, prepend with a `$` to treat it as a scope variable + /// which engages variable tracking and enables inference. + /// E.g. `my_function_name` for a generic symbol or `$my_variable` for + /// a variable. + Identifier(WildString, IdentifierType), + /// Constant signed integer number expression + IntConstant(i32), + /// Constant floating point number expression + FloatConstant(f32), + /// Constant boolean expression, either `true` or `false` + BoolConstant(bool), + /// Array expression + Array(Vec), + + // complex expressions + /// Makes an LLVM link. + /// + /// It stores the link's function name in the wildcard `{llvm_link}`, for use in + /// subsequent expressions. 
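+     /// As an illustrative (not verbatim) example, a later `compose` entry such
+     /// as `FnCall: ['{llvm_link}', [a, b]]` resolves to a call to the declared
+     /// link function once the wildcard has been substituted.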
+ LLVMLink(LLVMLink), + /// Casts the given expression to the specified (unchecked) type + CastAs(Box, String), + /// Returns the LLVM `undef` symbol + SvUndef, + /// Multiplication + Multiply(Box, Box), + /// Xor + Xor(Box, Box), + /// Converts the specified constant to the specified type's kind + ConvertConst(TypeKind, i32), + /// Yields the given type in the Rust representation + Type(TypeKind), + + MatchSize(TypeKind, MatchSizeValues>), + MatchKind(TypeKind, MatchKindValues>), +} + +impl Expression { + pub fn pre_build(&mut self, ctx: &mut Context) -> context::Result { + match self { + Self::FnCall(fn_call) => fn_call.pre_build(ctx), + Self::MethodCall(cl_ptr_ex, _, arg_exs) => { + cl_ptr_ex.pre_build(ctx)?; + arg_exs.iter_mut().try_for_each(|ex| ex.pre_build(ctx)) + } + Self::Let( + LetVariant::Basic(_, ex) + | LetVariant::WithType(_, _, ex) + | LetVariant::MutWithType(_, _, ex), + ) => ex.pre_build(ctx), + Self::CastAs(ex, _) => ex.pre_build(ctx), + Self::Multiply(lhs, rhs) | Self::Xor(lhs, rhs) => { + lhs.pre_build(ctx)?; + rhs.pre_build(ctx) + } + Self::MatchSize(match_ty, values) => { + *self = *values.get(match_ty, ctx.local)?.to_owned(); + self.pre_build(ctx) + } + Self::MatchKind(match_ty, values) => { + *self = *values.get(match_ty, ctx.local)?.to_owned(); + self.pre_build(ctx) + } + _ => Ok(()), + } + } + + pub fn build(&mut self, intrinsic: &Intrinsic, ctx: &mut Context) -> context::Result { + match self { + Self::LLVMLink(link) => link.build_and_save(ctx), + Self::Identifier(identifier, id_type) => { + identifier.build_acle(ctx.local)?; + + if let IdentifierType::Variable = id_type { + ctx.local + .variables + .get(&identifier.to_string()) + .map(|_| ()) + .ok_or_else(|| format!("invalid variable {identifier} being referenced")) + } else { + Ok(()) + } + } + Self::FnCall(fn_call) => { + fn_call.build(intrinsic, ctx)?; + + #[allow(clippy::collapsible_if)] + if let Some(llvm_link_name) = ctx.local.substitutions.get(&Wildcard::LLVMLink) { + if fn_call.is_llvm_link_call(llvm_link_name) { + *self = intrinsic + .llvm_link() + .expect("got LLVMLink wildcard without a LLVM link in `compose`") + .apply_conversions_to_call(fn_call.clone(), ctx)? + } + } + + Ok(()) + } + Self::MethodCall(cl_ptr_ex, _, arg_exs) => { + cl_ptr_ex.build(intrinsic, ctx)?; + arg_exs + .iter_mut() + .try_for_each(|ex| ex.build(intrinsic, ctx)) + } + Self::Let(variant) => { + let (var_name, ex, ty) = match variant { + LetVariant::Basic(var_name, ex) => (var_name, ex, None), + LetVariant::WithType(var_name, ty, ex) + | LetVariant::MutWithType(var_name, ty, ex) => { + if let Some(w) = ty.wildcard() { + ty.populate_wildcard(ctx.local.provide_type_wildcard(w)?)?; + } + (var_name, ex, Some(ty.to_owned())) + } + }; + + var_name.build_acle(ctx.local)?; + ctx.local.variables.insert( + var_name.to_string(), + ( + ty.unwrap_or_else(|| TypeKind::Custom("unknown".to_string())), + VariableType::Internal, + ), + ); + ex.build(intrinsic, ctx) + } + Self::CastAs(ex, _) => ex.build(intrinsic, ctx), + Self::Multiply(lhs, rhs) | Self::Xor(lhs, rhs) => { + lhs.build(intrinsic, ctx)?; + rhs.build(intrinsic, ctx) + } + Self::ConvertConst(ty, num) => { + if let Some(w) = ty.wildcard() { + *ty = ctx.local.provide_type_wildcard(w)? + } + + if let Some(BaseType::Sized(BaseTypeKind::Float, _)) = ty.base() { + *self = Expression::FloatConstant(*num as f32) + } else { + *self = Expression::IntConstant(*num) + } + Ok(()) + } + Self::Type(ty) => { + if let Some(w) = ty.wildcard() { + *ty = ctx.local.provide_type_wildcard(w)? 
+ } + + Ok(()) + } + _ => Ok(()), + } + } + + /// True if the expression requires an `unsafe` context in a safe function. + /// + /// The classification is somewhat fuzzy, based on actual usage (e.g. empirical function names) + /// rather than a full parse. This is a reasonable approach because mistakes here will usually + /// be caught at build time: + /// + /// - Missing an `unsafe` is a build error. + /// - An unnecessary `unsafe` is a warning, made into an error by the CI's `-D warnings`. + /// + /// This **panics** if it encounters an expression that shouldn't appear in a safe function at + /// all (such as `SvUndef`). + pub fn requires_unsafe_wrapper(&self, ctx_fn: &str) -> bool { + match self { + // The call will need to be unsafe, but the declaration does not. + Self::LLVMLink(..) => false, + // Identifiers, literals and type names are never unsafe. + Self::Identifier(..) => false, + Self::IntConstant(..) => false, + Self::FloatConstant(..) => false, + Self::BoolConstant(..) => false, + Self::Type(..) => false, + Self::ConvertConst(..) => false, + // Nested structures that aren't inherently unsafe, but could contain other expressions + // that might be. + Self::Assign(_var, exp) => exp.requires_unsafe_wrapper(ctx_fn), + Self::Let( + LetVariant::Basic(_, exp) + | LetVariant::WithType(_, _, exp) + | LetVariant::MutWithType(_, _, exp), + ) => exp.requires_unsafe_wrapper(ctx_fn), + Self::Array(exps) => exps.iter().any(|exp| exp.requires_unsafe_wrapper(ctx_fn)), + Self::Multiply(lhs, rhs) | Self::Xor(lhs, rhs) => { + lhs.requires_unsafe_wrapper(ctx_fn) || rhs.requires_unsafe_wrapper(ctx_fn) + } + Self::CastAs(exp, _ty) => exp.requires_unsafe_wrapper(ctx_fn), + // Functions and macros can be unsafe, but can also contain other expressions. + Self::FnCall(FnCall(fn_exp, args, turbo_args, requires_unsafe_wrapper)) => { + let fn_name = fn_exp.to_string(); + fn_exp.requires_unsafe_wrapper(ctx_fn) + || fn_name.starts_with("_sv") + || fn_name.starts_with("simd_") + || fn_name.ends_with("transmute") + || args.iter().any(|exp| exp.requires_unsafe_wrapper(ctx_fn)) + || turbo_args + .iter() + .any(|exp| exp.requires_unsafe_wrapper(ctx_fn)) + || *requires_unsafe_wrapper + } + Self::MethodCall(exp, fn_name, args) => match fn_name.as_str() { + // `as_signed` and `as_unsigned` are unsafe because they're trait methods with + // target features to allow use on feature-dependent types (such as SVE vectors). + // We can safely wrap them here. + "as_signed" => true, + "as_unsigned" => true, + _ => { + exp.requires_unsafe_wrapper(ctx_fn) + || args.iter().any(|exp| exp.requires_unsafe_wrapper(ctx_fn)) + } + }, + // We only use macros to check const generics (using static assertions). + Self::MacroCall(_name, _args) => false, + // Materialising uninitialised values is always unsafe, and we avoid it in safe + // functions. + Self::SvUndef => panic!("Refusing to wrap unsafe SvUndef in safe function '{ctx_fn}'."), + // Variants that aren't tokenised. We shouldn't encounter these here. + Self::MatchKind(..) => { + unimplemented!("The unsafety of {self:?} cannot be determined in '{ctx_fn}'.") + } + Self::MatchSize(..) => { + unimplemented!("The unsafety of {self:?} cannot be determined in '{ctx_fn}'.") + } + } + } + + /// Determine if an expression is a `static_assert<...>` function call. 
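+     /// For example, a spec entry such as `FnCall: ['static_assert!', ['IMM5 == 0']]`
+     /// parses into an expression for which this returns `true`.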
+    pub fn is_static_assert(&self) -> bool {
+        match self {
+            Expression::FnCall(fn_call) => match fn_call.0.as_ref() {
+                Expression::Identifier(wild_string, _) => {
+                    if let WildStringPart::String(function_name) = &wild_string.0[0] {
+                        function_name.starts_with("static_assert")
+                    } else {
+                        false
+                    }
+                }
+                _ => panic!("Badly defined function call: {fn_call:?}"),
+            },
+            _ => false,
+        }
+    }
+
+    /// Determine if an expression is an LLVM binding
+    pub fn is_llvm_link(&self) -> bool {
+        matches!(self, Expression::LLVMLink(_))
+    }
+}
+
+impl FromStr for Expression {
+    type Err = String;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        lazy_static! {
+            static ref MACRO_RE: Regex =
+                Regex::new(r"^(?P<name>[\w\d_]+)!\((?P<ex>.*?)\);?$").unwrap();
+        }
+
+        if s == "SvUndef" {
+            Ok(Expression::SvUndef)
+        } else if MACRO_RE.is_match(s) {
+            let c = MACRO_RE.captures(s).unwrap();
+            let ex = c["ex"].to_string();
+            let _: TokenStream = ex
+                .parse()
+                .map_err(|e| format!("could not parse macro call expression: {e:#?}"))?;
+            Ok(Expression::MacroCall(c["name"].to_string(), ex))
+        } else {
+            let (s, id_type) = if let Some(varname) = s.strip_prefix('$') {
+                (varname, IdentifierType::Variable)
+            } else {
+                (s, IdentifierType::Symbol)
+            };
+            let identifier = s.trim().parse()?;
+            Ok(Expression::Identifier(identifier, id_type))
+        }
+    }
+}
+
+impl From<FnCall> for Expression {
+    fn from(fn_call: FnCall) -> Self {
+        Expression::FnCall(fn_call)
+    }
+}
+
+impl From<WildString> for Expression {
+    fn from(ws: WildString) -> Self {
+        Expression::Identifier(ws, IdentifierType::Symbol)
+    }
+}
+
+impl From<&Argument> for Expression {
+    fn from(a: &Argument) -> Self {
+        Expression::Identifier(a.name.to_owned(), IdentifierType::Variable)
+    }
+}
+
+impl TryFrom<&StaticDefinition> for Expression {
+    type Error = String;
+
+    fn try_from(sd: &StaticDefinition) -> Result<Self, Self::Error> {
+        match sd {
+            StaticDefinition::Constant(imm) => Ok(imm.into()),
+            StaticDefinition::Generic(t) => t.parse(),
+        }
+    }
+}
+
+impl fmt::Display for Expression {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::Identifier(identifier, kind) => {
+                write!(
+                    f,
+                    "{}{identifier}",
+                    matches!(kind, IdentifierType::Variable)
+                        .then_some("$")
+                        .unwrap_or_default()
+                )
+            }
+            Self::MacroCall(name, expression) => {
+                write!(f, "{name}!({expression})")
+            }
+            _ => Err(fmt::Error),
+        }
+    }
+}
+
+impl ToTokens for Expression {
+    fn to_tokens(&self, tokens: &mut TokenStream) {
+        match self {
+            Self::Let(LetVariant::Basic(var_name, exp)) => {
+                let var_ident = format_ident!("{}", var_name.to_string());
+                tokens.append_all(quote! { let #var_ident = #exp })
+            }
+            Self::Let(LetVariant::WithType(var_name, ty, exp)) => {
+                let var_ident = format_ident!("{}", var_name.to_string());
+                tokens.append_all(quote! { let #var_ident: #ty = #exp })
+            }
+            Self::Let(LetVariant::MutWithType(var_name, ty, exp)) => {
+                let var_ident = format_ident!("{}", var_name.to_string());
+                tokens.append_all(quote! { let mut #var_ident: #ty = #exp })
+            }
+            Self::Assign(var_name, exp) => {
+                /* If we are dereferencing a variable to assign a value,
+                 * the `format_ident!` macro does not accept the asterisk. */
+                let var_name_str: &str;
+
+                if let Some(ch) = var_name.chars().nth(0) {
+                    /* Manually append the asterisk and split out the rest of
+                     * the variable name */
+                    if ch == '*' {
+                        tokens.append(Punct::new('*', Spacing::Alone));
+                        var_name_str = &var_name[1..var_name.len()];
+                    } else {
+                        var_name_str = var_name.as_str();
+                    }
+                } else {
+                    /* Should not be reached, as a variable cannot exist
+                     * without a name */
+                    panic!("Invalid variable name, must be at least one character")
+                }
+
+                let var_ident = format_ident!("{}", var_name_str);
+                tokens.append_all(quote! { #var_ident = #exp })
+            }
+            Self::MacroCall(name, ex) => {
+                let name = format_ident!("{name}");
+                let ex: TokenStream = ex.parse().unwrap();
+                tokens.append_all(quote! { #name!(#ex) })
+            }
+            Self::FnCall(fn_call) => fn_call.to_tokens(tokens),
+            Self::MethodCall(exp, fn_name, args) => {
+                let fn_ident = format_ident!("{}", fn_name);
+                tokens.append_all(quote! { #exp.#fn_ident(#(#args),*) })
+            }
+            Self::Identifier(identifier, _) => {
+                assert!(
+                    !identifier.has_wildcards(),
+                    "expression {self:#?} was not built before calling to_tokens"
+                );
+                identifier
+                    .to_string()
+                    .parse::<TokenStream>()
+                    .unwrap_or_else(|_| panic!("invalid syntax: {self:?}"))
+                    .to_tokens(tokens);
+            }
+            Self::IntConstant(n) => tokens.append(Literal::i32_unsuffixed(*n)),
+            Self::FloatConstant(n) => tokens.append(Literal::f32_unsuffixed(*n)),
+            Self::BoolConstant(true) => tokens.append(format_ident!("true")),
+            Self::BoolConstant(false) => tokens.append(format_ident!("false")),
+            Self::Array(vec) => tokens.append_all(quote! { [ #(#vec),* ] }),
+            Self::LLVMLink(link) => link.to_tokens(tokens),
+            Self::CastAs(ex, ty) => {
+                let ty: TokenStream = ty.parse().expect("invalid syntax");
+                tokens.append_all(quote! { #ex as #ty })
+            }
+            Self::SvUndef => tokens.append_all(quote! { simd_reinterpret(()) }),
+            Self::Multiply(lhs, rhs) => tokens.append_all(quote! { #lhs * #rhs }),
+            Self::Xor(lhs, rhs) => tokens.append_all(quote! { #lhs ^ #rhs }),
+            Self::Type(ty) => ty.to_tokens(tokens),
+            _ => unreachable!("{self:?} cannot be converted to tokens."),
+        }
+    }
+}
+
+impl Serialize for Expression {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        match self {
+            Self::IntConstant(v) => serializer.serialize_i32(*v),
+            Self::FloatConstant(v) => serializer.serialize_f32(*v),
+            Self::BoolConstant(v) => serializer.serialize_bool(*v),
+            Self::Identifier(..) => serializer.serialize_str(&self.to_string()),
+            Self::MacroCall(..)
=> serializer.serialize_str(&self.to_string()), + _ => Expression::serialize(self, serializer), + } + } +} + +impl<'de> Deserialize<'de> for Expression { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct CustomExpressionVisitor; + + impl<'de> Visitor<'de> for CustomExpressionVisitor { + type Value = Expression; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("integer, float, boolean, string or map") + } + + fn visit_bool(self, v: bool) -> Result + where + E: de::Error, + { + Ok(Expression::BoolConstant(v)) + } + + fn visit_i64(self, v: i64) -> Result + where + E: de::Error, + { + Ok(Expression::IntConstant(v as i32)) + } + + fn visit_u64(self, v: u64) -> Result + where + E: de::Error, + { + Ok(Expression::IntConstant(v as i32)) + } + + fn visit_f64(self, v: f64) -> Result + where + E: de::Error, + { + Ok(Expression::FloatConstant(v as f32)) + } + + fn visit_str(self, value: &str) -> Result + where + E: de::Error, + { + FromStr::from_str(value).map_err(de::Error::custom) + } + + fn visit_seq(self, mut seq: A) -> Result + where + A: de::SeqAccess<'de>, + { + let arr = std::iter::from_fn(|| seq.next_element::().transpose()) + .try_collect()?; + Ok(Expression::Array(arr)) + } + + fn visit_map(self, map: M) -> Result + where + M: MapAccess<'de>, + { + // `MapAccessDeserializer` is a wrapper that turns a `MapAccess` + // into a `Deserializer`, allowing it to be used as the input to T's + // `Deserialize` implementation. T then deserializes itself using + // the entries from the map visitor. + Expression::deserialize(de::value::MapAccessDeserializer::new(map)) + } + } + + deserializer.deserialize_any(CustomExpressionVisitor) + } +} diff --git a/library/stdarch/crates/stdarch-gen-arm/src/fn_suffix.rs b/library/stdarch/crates/stdarch-gen-arm/src/fn_suffix.rs new file mode 100644 index 0000000000000..26c156ae178aa --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/src/fn_suffix.rs @@ -0,0 +1,231 @@ +use std::fmt::{self}; + +/* This file is acting as a bridge between the old neon types and how they + * have a fairly complex way of picking suffixes and the new world. If possible + * it would be good to clean this up. 
At least it is self contained and the + * logic simple */ +use crate::typekinds::{BaseType, BaseTypeKind, TypeKind, VectorType}; +use serde::{Deserialize, Serialize}; + +use std::str::FromStr; + +#[allow(clippy::enum_variant_names)] +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Deserialize, Serialize)] +pub enum SuffixKind { + Normal, + Base, + NoQ, + NSuffix, + NoQNSuffix, + DupNox, + Dup, + /* Get the number of lanes or panic if there are not any Lanes */ + Lane, + Rot270, + Rot270Lane, + Rot270LaneQ, + Rot180, + Rot180Lane, + Rot180LaneQ, + Rot90, + Rot90Lane, + Rot90LaneQ, + /* Force the type to be unsigned */ + Unsigned, + Tuple, + NoX, + BaseByteSize, + LaneNoX, + LaneQNoX, +} + +pub fn type_to_size(str_type: &str) -> i32 { + match str_type { + "int8x8_t" | "int8x16_t" | "i8" | "s8" | "uint8x8_t" | "uint8x16_t" | "u8" + | "poly8x8_t" | "poly8x16_t" => 8, + "int16x4_t" | "int16x8_t" | "i16" | "s16" | "uint16x4_t" | "uint16x8_t" | "u16" + | "float16x4_t" | "float16x8_t" | "_f16" | "poly16x4_t" | "poly16x8_t" => 16, + "int32x2_t" | "int32x4_t" | "i32" | "s32" | "uint32x2_t" | "uint32x4_t" | "u32" + | "float32x2_t" | "float32x4_t" | "f32" => 32, + "int64x1_t" | "int64x2_t" | "i64" | "s64" | "uint64x1_t" | "uint64x2_t" | "u64" + | "float64x1_t" | "float64x2_t" | "f64" | "poly64x1_t" | "poly64x2_t" | "p64" => 64, + "p128" => 128, + _ => panic!("unknown type: {str_type}"), + } +} + +fn neon_get_base_and_char(ty: &VectorType) -> (u32, char, bool) { + let lanes = ty.lanes(); + match ty.base_type() { + BaseType::Sized(BaseTypeKind::Float, size) => (*size, 'f', *size * lanes == 128), + BaseType::Sized(BaseTypeKind::Int, size) => (*size, 's', *size * lanes == 128), + BaseType::Sized(BaseTypeKind::UInt, size) => (*size, 'u', *size * lanes == 128), + BaseType::Sized(BaseTypeKind::Poly, size) => (*size, 'p', *size * lanes == 128), + _ => panic!("Unhandled {ty:?}"), + } +} + +/* @TODO + * for the chained enum types we can safely delete them as we can index the + * types array */ +pub fn make_neon_suffix(type_kind: TypeKind, suffix_kind: SuffixKind) -> String { + match type_kind { + TypeKind::Vector(ty) => { + let tuple_size = ty.tuple_size().map_or(0, |t| t.to_int()); + let (base_size, prefix_char, requires_q) = neon_get_base_and_char(&ty); + let prefix_q = if requires_q { "q" } else { "" }; + let lanes = ty.lanes(); + match suffix_kind { + SuffixKind::Normal => { + let mut str_suffix: String = format!("{prefix_q}_{prefix_char}{base_size}"); + if tuple_size > 0 { + str_suffix.push_str("_x"); + str_suffix.push_str(tuple_size.to_string().as_str()); + } + str_suffix + } + SuffixKind::NSuffix => { + format!("{prefix_q}_n_{prefix_char}{base_size}") + } + + SuffixKind::NoQ => format!("_{prefix_char}{base_size}"), + SuffixKind::NoQNSuffix => format!("_n{prefix_char}{base_size}"), + + SuffixKind::Unsigned => { + let t = type_kind.to_string(); + if t.starts_with("u") { + return t; + } + format!("u{t}") + } + SuffixKind::Lane => { + if lanes == 0 { + panic!("type {type_kind} has no lanes!") + } else { + format!("{lanes}") + } + } + SuffixKind::Tuple => { + if tuple_size == 0 { + panic!("type {type_kind} has no lanes!") + } else { + format!("{tuple_size}") + } + } + SuffixKind::Base => base_size.to_string(), + SuffixKind::NoX => { + format!("{prefix_q}_{prefix_char}{base_size}") + } + SuffixKind::Dup => { + let mut str_suffix: String = format!("{prefix_q}_dup_{prefix_char}{base_size}"); + if tuple_size > 0 { + str_suffix.push_str("_x"); + str_suffix.push_str(tuple_size.to_string().as_str()); + } + 
str_suffix + } + SuffixKind::DupNox => { + format!("{prefix_q}_dup_{prefix_char}{base_size}") + } + SuffixKind::LaneNoX => { + format!("{prefix_q}_lane_{prefix_char}{base_size}") + } + SuffixKind::LaneQNoX => { + format!("{prefix_q}_laneq_{prefix_char}{base_size}") + } + SuffixKind::Rot270 => { + format!("{prefix_q}_rot270_{prefix_char}{base_size}") + } + SuffixKind::Rot270Lane => { + format!("{prefix_q}_rot270_lane_{prefix_char}{base_size}") + } + SuffixKind::Rot270LaneQ => { + format!("{prefix_q}_rot270_laneq_{prefix_char}{base_size}") + } + SuffixKind::Rot180 => { + format!("{prefix_q}_rot180_{prefix_char}{base_size}") + } + SuffixKind::Rot180Lane => { + format!("{prefix_q}_rot180_lane_{prefix_char}{base_size}") + } + SuffixKind::Rot180LaneQ => { + format!("{prefix_q}_rot180_laneq_{prefix_char}{base_size}") + } + SuffixKind::Rot90 => { + format!("{prefix_q}_rot90_{prefix_char}{base_size}") + } + SuffixKind::Rot90Lane => { + format!("{prefix_q}_rot90_lane_{prefix_char}{base_size}") + } + SuffixKind::Rot90LaneQ => { + format!("{prefix_q}_rot90_laneq_{prefix_char}{base_size}") + } + SuffixKind::BaseByteSize => format!("{}", base_size / 8), + } + } + _ => panic!("Cannot only make neon vector types suffixed"), + } +} + +impl FromStr for SuffixKind { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "no" => Ok(SuffixKind::Normal), + "noq" => Ok(SuffixKind::NoQ), + "N" => Ok(SuffixKind::NSuffix), + "noq_N" => Ok(SuffixKind::NoQNSuffix), + "dup_nox" => Ok(SuffixKind::DupNox), + "dup" => Ok(SuffixKind::Dup), + "lane" => Ok(SuffixKind::Lane), + "base" => Ok(SuffixKind::Base), + "tuple" => Ok(SuffixKind::Tuple), + "rot270" => Ok(SuffixKind::Rot270), + "rot270_lane" => Ok(SuffixKind::Rot270Lane), + "rot270_laneq" => Ok(SuffixKind::Rot270LaneQ), + "rot90" => Ok(SuffixKind::Rot90), + "rot90_lane" => Ok(SuffixKind::Rot90Lane), + "rot90_laneq" => Ok(SuffixKind::Rot90LaneQ), + "rot180" => Ok(SuffixKind::Rot180), + "rot180_lane" => Ok(SuffixKind::Rot180LaneQ), + "rot180_laneq" => Ok(SuffixKind::Rot180LaneQ), + "u" => Ok(SuffixKind::Unsigned), + "nox" => Ok(SuffixKind::NoX), + "base_byte_size" => Ok(SuffixKind::BaseByteSize), + "lane_nox" => Ok(SuffixKind::LaneNoX), + "laneq_nox" => Ok(SuffixKind::LaneQNoX), + _ => Err(format!("unknown suffix type: {s}")), + } + } +} + +impl fmt::Display for SuffixKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SuffixKind::Normal => write!(f, "normal"), + SuffixKind::NoQ => write!(f, "NoQ"), + SuffixKind::NSuffix => write!(f, "NSuffix"), + SuffixKind::NoQNSuffix => write!(f, "NoQNSuffix"), + SuffixKind::DupNox => write!(f, "DupNox"), + SuffixKind::Dup => write!(f, "Dup",), + SuffixKind::Lane => write!(f, "Lane"), + SuffixKind::LaneNoX => write!(f, "LaneNoX"), + SuffixKind::LaneQNoX => write!(f, "LaneQNoX"), + SuffixKind::Base => write!(f, "Base"), + SuffixKind::Rot270 => write!(f, "Rot270",), + SuffixKind::Rot270Lane => write!(f, "Rot270Lane"), + SuffixKind::Rot270LaneQ => write!(f, "Rot270LaneQ"), + SuffixKind::Rot90 => write!(f, "Rot90",), + SuffixKind::Rot90Lane => write!(f, "Rot90Lane"), + SuffixKind::Rot90LaneQ => write!(f, "Rot90LaneQ"), + SuffixKind::Rot180 => write!(f, "Rot180",), + SuffixKind::Rot180Lane => write!(f, "Rot180Lane"), + SuffixKind::Rot180LaneQ => write!(f, "Rot180LaneQ"), + SuffixKind::Unsigned => write!(f, "Unsigned"), + SuffixKind::Tuple => write!(f, "Tuple"), + SuffixKind::NoX => write!(f, "NoX"), + SuffixKind::BaseByteSize => write!(f, "BaseByteSize"), + } + } +} diff --git 
a/library/stdarch/crates/stdarch-gen-arm/src/input.rs b/library/stdarch/crates/stdarch-gen-arm/src/input.rs new file mode 100644 index 0000000000000..adefbf3215b5a --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/src/input.rs @@ -0,0 +1,433 @@ +use itertools::Itertools; +use serde::{Deserialize, Deserializer, Serialize, de}; + +use crate::{ + context::{self, GlobalContext}, + intrinsic::Intrinsic, + predicate_forms::{PredicateForm, PredicationMask, PredicationMethods}, + typekinds::TypeKind, + wildstring::WildString, +}; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(untagged)] +pub enum InputType { + /// PredicateForm variant argument + #[serde(skip)] // Predicate forms have their own dedicated deserialization field. Skip. + PredicateForm(PredicateForm), + /// Operand from which to generate an N variant + #[serde(skip)] + NVariantOp(Option), + /// TypeKind variant argument + Type(TypeKind), +} + +impl InputType { + /// Optionally unwraps as a PredicateForm. + pub fn predicate_form(&self) -> Option<&PredicateForm> { + match self { + InputType::PredicateForm(pf) => Some(pf), + _ => None, + } + } + + /// Optionally unwraps as a mutable PredicateForm + pub fn predicate_form_mut(&mut self) -> Option<&mut PredicateForm> { + match self { + InputType::PredicateForm(pf) => Some(pf), + _ => None, + } + } + + /// Optionally unwraps as a TypeKind. + pub fn typekind(&self) -> Option<&TypeKind> { + match self { + InputType::Type(ty) => Some(ty), + _ => None, + } + } + + /// Optionally unwraps as a NVariantOp + pub fn n_variant_op(&self) -> Option<&WildString> { + match self { + InputType::NVariantOp(Some(op)) => Some(op), + _ => None, + } + } +} + +impl PartialOrd for InputType { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for InputType { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use std::cmp::Ordering::*; + + match (self, other) { + (InputType::PredicateForm(pf1), InputType::PredicateForm(pf2)) => pf1.cmp(pf2), + (InputType::Type(ty1), InputType::Type(ty2)) => ty1.cmp(ty2), + + (InputType::NVariantOp(None), InputType::NVariantOp(Some(..))) => Less, + (InputType::NVariantOp(Some(..)), InputType::NVariantOp(None)) => Greater, + (InputType::NVariantOp(_), InputType::NVariantOp(_)) => Equal, + + (InputType::Type(..), InputType::PredicateForm(..)) => Less, + (InputType::PredicateForm(..), InputType::Type(..)) => Greater, + + (InputType::Type(..), InputType::NVariantOp(..)) => Less, + (InputType::NVariantOp(..), InputType::Type(..)) => Greater, + + (InputType::PredicateForm(..), InputType::NVariantOp(..)) => Less, + (InputType::NVariantOp(..), InputType::PredicateForm(..)) => Greater, + } + } +} + +mod many_or_one { + use serde::{Deserialize, Serialize, de::Deserializer, ser::Serializer}; + + pub fn serialize(vec: &Vec, serializer: S) -> Result + where + T: Serialize, + S: Serializer, + { + if vec.len() == 1 { + vec.first().unwrap().serialize(serializer) + } else { + vec.serialize(serializer) + } + } + + pub fn deserialize<'de, T, D>(deserializer: D) -> Result, D::Error> + where + T: Deserialize<'de>, + D: Deserializer<'de>, + { + #[derive(Debug, Clone, Serialize, Deserialize)] + #[serde(untagged)] + enum ManyOrOne { + Many(Vec), + One(T), + } + + match ManyOrOne::deserialize(deserializer)? 
{ + ManyOrOne::Many(vec) => Ok(vec), + ManyOrOne::One(val) => Ok(vec![val]), + } + } +} + +#[derive(Debug, Clone, Default, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +pub struct InputSet(#[serde(with = "many_or_one")] Vec); + +impl InputSet { + pub fn get(&self, idx: usize) -> Option<&InputType> { + self.0.get(idx) + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn iter(&self) -> impl Iterator + '_ { + self.0.iter() + } + + pub fn iter_mut(&mut self) -> impl Iterator + '_ { + self.0.iter_mut() + } + + pub fn into_iter(self) -> impl Iterator + Clone { + self.0.into_iter() + } + + pub fn types_len(&self) -> usize { + self.iter().filter_map(|arg| arg.typekind()).count() + } + + pub fn typekind(&self, idx: Option) -> Option { + let types_len = self.types_len(); + self.get(idx.unwrap_or(0)).and_then(move |arg: &InputType| { + if (idx.is_none() && types_len != 1) || (idx.is_some() && types_len == 1) { + None + } else { + arg.typekind().cloned() + } + }) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct InputSetEntry(#[serde(with = "many_or_one")] Vec); + +impl InputSetEntry { + pub fn new(input: Vec) -> Self { + Self(input) + } + + pub fn get(&self, idx: usize) -> Option<&InputSet> { + self.0.get(idx) + } +} + +fn validate_types<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + let v: Vec = Vec::deserialize(deserializer)?; + + let mut it = v.iter(); + if let Some(first) = it.next() { + it.try_fold(first, |last, cur| { + if last.0.len() == cur.0.len() { + Ok(cur) + } else { + Err("the length of the InputSets and the product lists must match".to_string()) + } + }) + .map_err(de::Error::custom)?; + } + + Ok(v) +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct IntrinsicInput { + #[serde(default)] + #[serde(deserialize_with = "validate_types")] + pub types: Vec, + + #[serde(flatten)] + pub predication_methods: PredicationMethods, + + /// Generates a _n variant where the specified operand is a primitive type + /// that requires conversion to an SVE one. The `{_n}` wildcard is required + /// in the intrinsic's name, otherwise an error will be thrown. + #[serde(default)] + pub n_variant_op: WildString, +} + +impl IntrinsicInput { + /// Extracts all the possible variants as an iterator. + pub fn variants( + &self, + intrinsic: &Intrinsic, + ) -> context::Result + '_> { + let mut top_product = vec![]; + + if !self.types.is_empty() { + top_product.push( + self.types + .iter() + .flat_map(|ty_in| { + ty_in + .0 + .iter() + .map(|v| v.clone().into_iter()) + .multi_cartesian_product() + }) + .collect_vec(), + ) + } + + if let Ok(mask) = PredicationMask::try_from(&intrinsic.signature.name) { + top_product.push( + PredicateForm::compile_list(&mask, &self.predication_methods)? 
+ .into_iter() + .map(|pf| vec![InputType::PredicateForm(pf)]) + .collect_vec(), + ) + } + + if !self.n_variant_op.is_empty() { + top_product.push(vec![ + vec![InputType::NVariantOp(None)], + vec![InputType::NVariantOp(Some(self.n_variant_op.to_owned()))], + ]) + } + + let it = top_product + .into_iter() + .map(|v| v.into_iter()) + .multi_cartesian_product() + .filter(|set| !set.is_empty()) + .map(|set| InputSet(set.into_iter().flatten().collect_vec())); + Ok(it) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GeneratorInput { + #[serde(flatten)] + pub ctx: GlobalContext, + pub intrinsics: Vec, +} + +#[cfg(test)] +mod tests { + use crate::{ + input::*, + predicate_forms::{DontCareMethod, ZeroingMethod}, + }; + + #[test] + fn test_empty() { + let str = r#"types: []"#; + let input: IntrinsicInput = serde_yaml::from_str(str).expect("failed to parse"); + let mut variants = input.variants(&Intrinsic::default()).unwrap().into_iter(); + assert_eq!(variants.next(), None); + } + + #[test] + fn test_product() { + let str = r#"types: +- [f64, f32] +- [i64, [f64, f32]] +"#; + let input: IntrinsicInput = serde_yaml::from_str(str).expect("failed to parse"); + let mut intrinsic = Intrinsic::default(); + intrinsic.signature.name = "test_intrinsic{_mx}".parse().unwrap(); + let mut variants = input.variants(&intrinsic).unwrap().into_iter(); + assert_eq!( + variants.next(), + Some(InputSet(vec![ + InputType::Type("f64".parse().unwrap()), + InputType::Type("f32".parse().unwrap()), + InputType::PredicateForm(PredicateForm::Merging), + ])) + ); + assert_eq!( + variants.next(), + Some(InputSet(vec![ + InputType::Type("f64".parse().unwrap()), + InputType::Type("f32".parse().unwrap()), + InputType::PredicateForm(PredicateForm::DontCare(DontCareMethod::AsMerging)), + ])) + ); + assert_eq!( + variants.next(), + Some(InputSet(vec![ + InputType::Type("i64".parse().unwrap()), + InputType::Type("f64".parse().unwrap()), + InputType::PredicateForm(PredicateForm::Merging), + ])) + ); + assert_eq!( + variants.next(), + Some(InputSet(vec![ + InputType::Type("i64".parse().unwrap()), + InputType::Type("f64".parse().unwrap()), + InputType::PredicateForm(PredicateForm::DontCare(DontCareMethod::AsMerging)), + ])) + ); + assert_eq!( + variants.next(), + Some(InputSet(vec![ + InputType::Type("i64".parse().unwrap()), + InputType::Type("f32".parse().unwrap()), + InputType::PredicateForm(PredicateForm::Merging), + ])) + ); + assert_eq!( + variants.next(), + Some(InputSet(vec![ + InputType::Type("i64".parse().unwrap()), + InputType::Type("f32".parse().unwrap()), + InputType::PredicateForm(PredicateForm::DontCare(DontCareMethod::AsMerging)), + ])), + ); + assert_eq!(variants.next(), None); + } + + #[test] + fn test_n_variant() { + let str = r#"types: +- [f64, f32] +n_variant_op: op2 +"#; + let input: IntrinsicInput = serde_yaml::from_str(str).expect("failed to parse"); + let mut variants = input.variants(&Intrinsic::default()).unwrap().into_iter(); + assert_eq!( + variants.next(), + Some(InputSet(vec![ + InputType::Type("f64".parse().unwrap()), + InputType::Type("f32".parse().unwrap()), + InputType::NVariantOp(None), + ])) + ); + assert_eq!( + variants.next(), + Some(InputSet(vec![ + InputType::Type("f64".parse().unwrap()), + InputType::Type("f32".parse().unwrap()), + InputType::NVariantOp(Some("op2".parse().unwrap())), + ])) + ); + assert_eq!(variants.next(), None) + } + + #[test] + fn test_invalid_length() { + let str = r#"types: [i32, [[u64], [u32]]]"#; + serde_yaml::from_str::(str).expect_err("failure 
expected"); + } + + #[test] + fn test_invalid_predication() { + let str = "types: []"; + let input: IntrinsicInput = serde_yaml::from_str(str).expect("failed to parse"); + let mut intrinsic = Intrinsic::default(); + intrinsic.signature.name = "test_intrinsic{_mxz}".parse().unwrap(); + input + .variants(&intrinsic) + .map(|v| v.collect_vec()) + .expect_err("failure expected"); + } + + #[test] + fn test_invalid_predication_mask() { + "test_intrinsic{_mxy}" + .parse::() + .expect_err("failure expected"); + "test_intrinsic{_}" + .parse::() + .expect_err("failure expected"); + } + + #[test] + fn test_zeroing_predication() { + let str = r#"types: [i64] +zeroing_method: { drop: inactive }"#; + let input: IntrinsicInput = serde_yaml::from_str(str).expect("failed to parse"); + let mut intrinsic = Intrinsic::default(); + intrinsic.signature.name = "test_intrinsic{_mxz}".parse().unwrap(); + let mut variants = input.variants(&intrinsic).unwrap(); + assert_eq!( + variants.next(), + Some(InputSet(vec![ + InputType::Type("i64".parse().unwrap()), + InputType::PredicateForm(PredicateForm::Merging), + ])) + ); + assert_eq!( + variants.next(), + Some(InputSet(vec![ + InputType::Type("i64".parse().unwrap()), + InputType::PredicateForm(PredicateForm::DontCare(DontCareMethod::AsZeroing)), + ])) + ); + assert_eq!( + variants.next(), + Some(InputSet(vec![ + InputType::Type("i64".parse().unwrap()), + InputType::PredicateForm(PredicateForm::Zeroing(ZeroingMethod::Drop { + drop: "inactive".parse().unwrap() + })), + ])) + ); + assert_eq!(variants.next(), None) + } +} diff --git a/library/stdarch/crates/stdarch-gen-arm/src/intrinsic.rs b/library/stdarch/crates/stdarch-gen-arm/src/intrinsic.rs new file mode 100644 index 0000000000000..efaa9e1418899 --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/src/intrinsic.rs @@ -0,0 +1,1851 @@ +use itertools::Itertools; +use proc_macro2::{Delimiter, Group, Punct, Spacing, TokenStream}; +use quote::{ToTokens, TokenStreamExt, format_ident, quote}; +use serde::{Deserialize, Serialize}; +use serde_with::{DeserializeFromStr, SerializeDisplay}; +use std::collections::{HashMap, HashSet}; +use std::fmt::{self}; +use std::num::ParseIntError; +use std::ops::RangeInclusive; +use std::str::FromStr; + +use crate::assert_instr::InstructionAssertionsForBaseType; +use crate::big_endian::{ + create_assigned_shuffle_call, create_let_variable, create_mut_let_variable, + create_shuffle_call, create_symbol_identifier, make_variable_mutable, type_has_tuple, +}; +use crate::context::{GlobalContext, GroupContext}; +use crate::input::{InputSet, InputSetEntry}; +use crate::predicate_forms::{DontCareMethod, PredicateForm, PredicationMask, ZeroingMethod}; +use crate::{ + assert_instr::InstructionAssertionMethod, + context::{self, ArchitectureSettings, Context, LocalContext, VariableType}, + expression::{Expression, FnCall, IdentifierType}, + fn_suffix::{SuffixKind, type_to_size}, + input::IntrinsicInput, + matching::{KindMatchable, SizeMatchable}, + typekinds::*, + wildcards::Wildcard, + wildstring::WildString, +}; + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum SubstitutionType { + MatchSize(SizeMatchable), + MatchKind(KindMatchable), +} + +impl SubstitutionType { + pub fn get(&mut self, ctx: &LocalContext) -> context::Result { + match self { + Self::MatchSize(smws) => { + smws.perform_match(ctx)?; + Ok(smws.as_ref().clone()) + } + Self::MatchKind(kmws) => { + kmws.perform_match(ctx)?; + Ok(kmws.as_ref().clone()) + } + } + } +} + +/// Mutability level 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum AccessLevel { + /// Immutable + R, + /// Mutable + RW, +} + +/// Function signature argument. +/// +/// Prepend the `mut` keyword for a mutable argument. Separate argument name +/// and type with a semicolon `:`. Usage examples: +/// - Mutable argument: `mut arg1: *u64` +/// - Immutable argument: `arg2: u32` +#[derive(Debug, Clone, SerializeDisplay, DeserializeFromStr)] +pub struct Argument { + /// Argument name + pub name: WildString, + /// Mutability level + pub rw: AccessLevel, + /// Argument type + pub kind: TypeKind, +} + +impl Argument { + pub fn populate_variables(&self, vars: &mut HashMap) { + vars.insert( + self.name.to_string(), + (self.kind.clone(), VariableType::Argument), + ); + } +} + +impl FromStr for Argument { + type Err = String; + + fn from_str(s: &str) -> Result { + let mut it = s.splitn(2, ':').map(::trim); + if let Some(mut lhs) = it.next().map(|s| s.split_whitespace()) { + let lhs_len = lhs.clone().count(); + match (lhs_len, lhs.next(), it.next()) { + (2, Some("mut"), Some(kind)) => Ok(Argument { + name: lhs.next().unwrap().parse()?, + rw: AccessLevel::RW, + kind: kind.parse()?, + }), + (2, Some(ident), _) => Err(format!("invalid {ident:#?} keyword")), + (1, Some(name), Some(kind)) => Ok(Argument { + name: name.parse()?, + rw: AccessLevel::R, + kind: kind.parse()?, + }), + _ => Err(format!("invalid argument `{s}` provided")), + } + } else { + Err(format!("invalid argument `{s}` provided")) + } + } +} + +impl fmt::Display for Argument { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let AccessLevel::RW = &self.rw { + write!(f, "mut ")?; + } + + write!(f, "{}: {}", self.name, self.kind) + } +} + +impl ToTokens for Argument { + fn to_tokens(&self, tokens: &mut TokenStream) { + if let AccessLevel::RW = &self.rw { + tokens.append(format_ident!("mut")) + } + + let (name, kind) = (format_ident!("{}", self.name.to_string()), &self.kind); + tokens.append_all(quote! { #name: #kind }) + } +} + +/// Static definition part of the signature. It may evaluate to a constant +/// expression with e.g. `const imm: u64`, or a generic `T: Into`. +#[derive(Debug, Clone, SerializeDisplay, DeserializeFromStr)] +pub enum StaticDefinition { + /// Constant expression + Constant(Argument), + /// Generic type + Generic(String), +} + +impl StaticDefinition { + pub fn as_variable(&self) -> Option<(String, (TypeKind, VariableType))> { + match self { + StaticDefinition::Constant(arg) => Some(( + arg.name.to_string(), + (arg.kind.clone(), VariableType::Argument), + )), + StaticDefinition::Generic(..) => None, + } + } +} + +impl FromStr for StaticDefinition { + type Err = String; + + fn from_str(s: &str) -> Result { + match s.trim() { + s if s.starts_with("const ") => Ok(StaticDefinition::Constant(s[6..].trim().parse()?)), + s => Ok(StaticDefinition::Generic(s.to_string())), + } + } +} + +impl fmt::Display for StaticDefinition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + StaticDefinition::Constant(arg) => write!(f, "const {arg}"), + StaticDefinition::Generic(generic) => write!(f, "{generic}"), + } + } +} + +impl ToTokens for StaticDefinition { + fn to_tokens(&self, tokens: &mut TokenStream) { + tokens.append_all(match self { + StaticDefinition::Constant(arg) => quote! { const #arg }, + StaticDefinition::Generic(generic) => { + let generic: TokenStream = generic.parse().expect("invalid Rust code"); + quote! 
{ #generic } + } + }) + } +} + +/// Function constraints +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum Constraint { + /// Asserts that the given variable equals to any of the given integer values + AnyI32 { + variable: String, + any_values: Vec, + }, + /// WildString version of RangeI32. If the string values given for the range + /// are valid, this gets built into a RangeI32. + RangeWildstring { + variable: String, + range: (WildString, WildString), + }, + /// Asserts that the given variable's value falls in the specified range + RangeI32 { + variable: String, + range: SizeMatchable>, + }, + /// Asserts that the number of elements/lanes does not exceed the 2048-bit SVE constraint + SVEMaxElems { + variable: String, + sve_max_elems_type: TypeKind, + }, + /// Asserts that the number of elements/lanes does not exceed the 128-bit register constraint + VecMaxElems { + variable: String, + vec_max_elems_type: TypeKind, + }, +} + +impl Constraint { + fn variable(&self) -> &str { + match self { + Constraint::AnyI32 { variable, .. } + | Constraint::RangeWildstring { variable, .. } + | Constraint::RangeI32 { variable, .. } + | Constraint::SVEMaxElems { variable, .. } + | Constraint::VecMaxElems { variable, .. } => variable, + } + } + pub fn build(&mut self, ctx: &Context) -> context::Result { + if let Self::RangeWildstring { + variable, + range: (min, max), + } = self + { + min.build_acle(ctx.local)?; + max.build_acle(ctx.local)?; + let min = min.to_string(); + let max = max.to_string(); + let min: i32 = min + .parse() + .map_err(|_| format!("the minimum value `{min}` is not a valid number"))?; + let max: i32 = max + .parse() + .or_else(|_| Ok(type_to_size(max.as_str()))) + .map_err(|_: ParseIntError| { + format!("the maximum value `{max}` is not a valid number") + })?; + *self = Self::RangeI32 { + variable: variable.to_owned(), + range: SizeMatchable::Matched(RangeInclusive::new(min, max)), + } + } + + #[allow(clippy::collapsible_if)] + if let Self::SVEMaxElems { + sve_max_elems_type: ty, + .. + } + | Self::VecMaxElems { + vec_max_elems_type: ty, + .. + } = self + { + if let Some(w) = ty.wildcard() { + ty.populate_wildcard(ctx.local.provide_type_wildcard(w)?)?; + } + } + + if let Self::RangeI32 { range, .. } = self { + range.perform_match(ctx.local)?; + } + + let variable = self.variable(); + ctx.local + .variables + .contains_key(variable) + .then_some(()) + .ok_or_else(|| format!("cannot build constraint, could not find variable {variable}")) + } +} + +/// Function signature +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct Signature { + /// Function name + pub name: WildString, + /// List of function arguments, leave unset or empty for no arguments + pub arguments: Vec, + + /// Function return type, leave unset for void + pub return_type: Option, + + /// For some neon intrinsics we want to modify the suffix of the function name + pub suffix_type: Option, + + /// List of static definitions, leave unset of empty if not required + #[serde(default)] + pub static_defs: Vec, + + /// **Internal use only.** + /// Condition for which the ultimate function is specific to predicates. + #[serde(skip)] + pub is_predicate_specific: bool, + + /// **Internal use only.** + /// Setting this property will trigger the signature builder to convert any `svbool*_t` to `svbool_t` in the input and output. 
+ #[serde(skip)] + pub predicate_needs_conversion: bool, +} + +impl Signature { + pub fn drop_argument(&mut self, arg_name: &WildString) -> Result<(), String> { + if let Some(idx) = self + .arguments + .iter() + .position(|arg| arg.name.to_string() == arg_name.to_string()) + { + self.arguments.remove(idx); + Ok(()) + } else { + Err(format!("no argument {arg_name} found to drop")) + } + } + + pub fn build(&mut self, ctx: &LocalContext) -> context::Result { + if self.name_has_neon_suffix() { + self.name.build_neon_intrinsic_signature(ctx)?; + } else { + self.name.build_acle(ctx)?; + } + + #[allow(clippy::collapsible_if)] + if let Some(ref mut return_type) = self.return_type { + if let Some(w) = return_type.clone().wildcard() { + return_type.populate_wildcard(ctx.provide_type_wildcard(w)?)?; + } + } + + self.arguments + .iter_mut() + .try_for_each(|arg| arg.name.build_acle(ctx))?; + + self.arguments + .iter_mut() + .filter_map(|arg| { + arg.kind + .clone() + .wildcard() + .map(|w| (&mut arg.kind, w.clone())) + }) + .try_for_each(|(ty, w)| ty.populate_wildcard(ctx.provide_type_wildcard(&w)?)) + } + + pub fn fn_name(&self) -> WildString { + self.name.replace(['[', ']'], "") + } + + pub fn doc_name(&self) -> String { + self.name.to_string() + } + + fn name_has_neon_suffix(&self) -> bool { + for part in self.name.wildcards() { + let has_suffix = match part { + Wildcard::NEONType(_, _, suffix_type) => suffix_type.is_some(), + _ => false, + }; + + if has_suffix { + return true; + } + } + false + } +} + +impl ToTokens for Signature { + fn to_tokens(&self, tokens: &mut TokenStream) { + let name_ident = format_ident!("{}", self.fn_name().to_string()); + let arguments = self + .arguments + .clone() + .into_iter() + .map(|mut arg| { + if arg.kind.vector().is_some_and(|ty| ty.base_type().is_bool()) + && self.predicate_needs_conversion + { + arg.kind = TypeKind::Vector(VectorType::make_predicate_from_bitsize(8)) + } + arg + }) + .collect_vec(); + let static_defs = &self.static_defs; + tokens.append_all(quote! { fn #name_ident<#(#static_defs),*>(#(#arguments),*) }); + + if let Some(ref return_type) = self.return_type { + if return_type + .vector() + .is_some_and(|ty| ty.base_type().is_bool()) + && self.predicate_needs_conversion + { + tokens.append_all(quote! { -> svbool_t }) + } else { + tokens.append_all(quote! { -> #return_type }) + } + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LLVMLinkAttribute { + /// Either one architecture or a comma separated list of architectures with NO spaces + pub arch: String, + pub link: WildString, +} + +impl ToTokens for LLVMLinkAttribute { + fn to_tokens(&self, tokens: &mut TokenStream) { + let LLVMLinkAttribute { arch, link } = self; + let link = link.to_string(); + + // For example: + // + // #[cfg_attr(target_arch = "arm", link_name = "llvm.ctlz.v4i16")] + // + // #[cfg_attr( + // any(target_arch = "aarch64", target_arch = "arm64ec"), + // link_name = "llvm.aarch64.neon.suqadd.i32" + // )] + + let mut cfg_attr_cond = TokenStream::new(); + let mut single_arch = true; + for arch in arch.split(',') { + if !cfg_attr_cond.is_empty() { + single_arch = false; + cfg_attr_cond.append(Punct::new(',', Spacing::Alone)); + } + cfg_attr_cond.append_all(quote! { target_arch = #arch }); + } + assert!(!cfg_attr_cond.is_empty()); + if !single_arch { + cfg_attr_cond = quote! { any( #cfg_attr_cond ) }; + } + tokens.append_all(quote! 
{ + #[cfg_attr(#cfg_attr_cond, link_name = #link)] + }) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LLVMLink { + /// LLVM link function name without namespace and types, + /// e.g. `st1` in `llvm.aarch64.sve.st1.nxv4i32` + pub name: WildString, + + /// LLVM link signature arguments, leave unset if it inherits from intrinsic's signature + pub arguments: Option>, + /// LLVM link signature return type, leave unset if it inherits from intrinsic's signature + pub return_type: Option, + + /// **This will be set automatically if not set** + /// Attribute LLVM links for the function. First element is the architecture it targets, + /// second element is the LLVM link itself. + pub links: Option>, + + /// **Internal use only. Do not set.** + /// Generated signature from these `arguments` and/or `return_type` if set, and the intrinsic's signature. + #[serde(skip)] + pub signature: Option>, +} + +impl LLVMLink { + pub fn resolve(&self, cfg: &ArchitectureSettings) -> String { + if self.name.starts_with("llvm") { + self.name.to_string() + } else { + format!("{}.{}", cfg.llvm_link_prefix, self.name) + } + } + + pub fn build_and_save(&mut self, ctx: &mut Context) -> context::Result { + self.build(ctx)?; + + // Save LLVM link to the group context + ctx.global.arch_cfgs.iter().for_each(|cfg| { + ctx.group + .links + .insert(self.resolve(cfg), ctx.local.input.clone()); + }); + + Ok(()) + } + + pub fn build(&mut self, ctx: &mut Context) -> context::Result { + let mut sig_name = ctx.local.signature.name.clone(); + sig_name.prepend_str("_"); + + let argv = self + .arguments + .clone() + .unwrap_or_else(|| ctx.local.signature.arguments.clone()); + + let mut sig = Signature { + name: sig_name, + arguments: argv, + return_type: self + .return_type + .clone() + .or_else(|| ctx.local.signature.return_type.clone()), + suffix_type: None, + static_defs: vec![], + is_predicate_specific: ctx.local.signature.is_predicate_specific, + predicate_needs_conversion: false, + }; + + sig.build(ctx.local)?; + self.name.build(ctx.local, TypeRepr::LLVMMachine)?; + + // Add link function name to context + ctx.local + .substitutions + .insert(Wildcard::LLVMLink, sig.fn_name().to_string()); + + self.signature = Some(Box::new(sig)); + + if let Some(ref mut links) = self.links { + links.iter_mut().for_each(|ele| { + ele.link + .build(ctx.local, TypeRepr::LLVMMachine) + .expect("Failed to transform to LLVMMachine representation"); + }); + } else { + self.links = Some( + ctx.global + .arch_cfgs + .iter() + .map(|cfg| LLVMLinkAttribute { + arch: cfg.arch_name.to_owned(), + link: self.resolve(cfg).into(), + }) + .collect_vec(), + ); + } + + Ok(()) + } + + /// Alters all the unsigned types from the signature. This is required where + /// a signed and unsigned variant require the same binding to an exposed + /// LLVM instrinsic. 
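+    ///
+    /// Rough illustration of the effect (the link signature below is made up;
+    /// only the unsigned-to-signed rewrite is taken from this function):
+    ///
+    /// ```text
+    /// fn _svadd_u32(a: svuint32_t, b: svuint32_t) -> svuint32_t   // before
+    /// fn _svadd_u32(a: svint32_t, b: svint32_t) -> svint32_t      // after
+    /// ```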
+ pub fn sanitise_uints(&mut self) { + let transform = |tk: &mut TypeKind| { + if let Some(BaseType::Sized(BaseTypeKind::UInt, size)) = tk.base_type() { + *tk.base_type_mut().unwrap() = BaseType::Sized(BaseTypeKind::Int, *size) + } + }; + + if let Some(sig) = self.signature.as_mut() { + for arg in sig.arguments.iter_mut() { + transform(&mut arg.kind); + } + + sig.return_type.as_mut().map(transform); + } + } + + /// Make a function call to the LLVM link + pub fn make_fn_call(&self, intrinsic_sig: &Signature) -> context::Result { + let link_sig = self.signature.as_ref().ok_or_else(|| { + "cannot derive the LLVM link call, as it does not hold a valid function signature" + .to_string() + })?; + + if intrinsic_sig.arguments.len() != link_sig.arguments.len() { + return Err( + "cannot derive the LLVM link call, the number of arguments does not match" + .to_string(), + ); + } + + let call_args = intrinsic_sig + .arguments + .iter() + .zip(link_sig.arguments.iter()) + .map(|(intrinsic_arg, link_arg)| { + // Could also add a type check... + if intrinsic_arg.name == link_arg.name { + Ok(Expression::Identifier( + intrinsic_arg.name.to_owned(), + IdentifierType::Variable, + )) + } else { + Err("cannot derive the LLVM link call, the arguments do not match".to_string()) + } + }) + .try_collect()?; + + Ok(FnCall::new_unsafe_expression( + link_sig.fn_name().into(), + call_args, + )) + } + + /// Given a FnCall, apply all the predicate and unsigned conversions as required. + pub fn apply_conversions_to_call( + &self, + mut fn_call: FnCall, + ctx: &Context, + ) -> context::Result { + use BaseType::{Sized, Unsized}; + use BaseTypeKind::{Bool, UInt}; + use VariableType::Argument; + + let convert = + |method: &str, ex| Expression::MethodCall(Box::new(ex), method.to_string(), vec![]); + + fn_call.1 = fn_call + .1 + .into_iter() + .map(|arg| -> context::Result { + if let Expression::Identifier(ref var_name, IdentifierType::Variable) = arg { + let (kind, scope) = ctx + .local + .variables + .get(&var_name.to_string()) + .ok_or_else(|| format!("invalid variable {var_name:?} being referenced"))?; + + match (scope, kind.base_type()) { + (Argument, Some(Sized(Bool, bitsize))) if *bitsize != 8 => { + Ok(convert("into", arg)) + } + (Argument, Some(Sized(UInt, _) | Unsized(UInt))) => { + if ctx.global.auto_llvm_sign_conversion { + Ok(convert("as_signed", arg)) + } else { + Ok(arg) + } + } + _ => Ok(arg), + } + } else { + Ok(arg) + } + }) + .try_collect()?; + + let return_type_conversion = if !ctx.global.auto_llvm_sign_conversion { + None + } else { + self.signature + .as_ref() + .and_then(|sig| sig.return_type.as_ref()) + .and_then(|ty| { + if let Some(Sized(Bool, bitsize)) = ty.base_type() { + (*bitsize != 8).then_some(Bool) + } else if let Some(Sized(UInt, _) | Unsized(UInt)) = ty.base_type() { + Some(UInt) + } else { + None + } + }) + }; + + let fn_call = Expression::FnCall(fn_call); + match return_type_conversion { + Some(Bool) => Ok(convert("into", fn_call)), + Some(UInt) => Ok(convert("as_unsigned", fn_call)), + _ => Ok(fn_call), + } + } +} + +impl ToTokens for LLVMLink { + fn to_tokens(&self, tokens: &mut TokenStream) { + assert!( + self.signature.is_some() && self.links.is_some(), + "expression {self:#?} was not built before calling to_tokens" + ); + + let signature = self.signature.as_ref().unwrap(); + let links = self.links.as_ref().unwrap(); + tokens.append_all(quote! 
{ + unsafe extern "unadjusted" { + #(#links)* + #signature; + } + }) + } +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum FunctionVisibility { + #[default] + Public, + Private, +} + +/// Whether to generate a load/store test, and which typeset index +/// represents the data type of the load/store target address +#[derive(Clone, Debug, Default, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum Test { + #[default] + #[serde(skip)] + None, // Covered by `intrinsic-test` + Load(usize), + Store(usize), +} + +impl Test { + pub fn get_typeset_index(&self) -> Option { + match *self { + Test::Load(n) => Some(n), + Test::Store(n) => Some(n), + _ => None, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum Safety { + Safe, + Unsafe(Vec), +} + +impl Safety { + /// Return `Ok(Safety::Safe)` if safety appears reasonable for the given `intrinsic`'s name and + /// prototype. Otherwise, return `Err()` with a suitable diagnostic. + fn safe_checked(intrinsic: &Intrinsic) -> Result { + let name = intrinsic.signature.doc_name(); + if name.starts_with("sv") { + let handles_pointers = intrinsic + .signature + .arguments + .iter() + .any(|arg| matches!(arg.kind, TypeKind::Pointer(..))); + if name.starts_with("svld") + || name.starts_with("svst") + || name.starts_with("svprf") + || name.starts_with("svundef") + || handles_pointers + { + let doc = intrinsic.doc.as_ref().map(|s| s.to_string()); + let doc = doc.as_deref().unwrap_or("..."); + Err(format!( + "`{name}` has no safety specification, but it looks like it should be unsafe. \ + Consider specifying (un)safety explicitly: + + - name: {name} + doc: {doc} + safety: + unsafe: + - ... + ... +" + )) + } else { + Ok(Self::Safe) + } + } else { + Err(format!( + "Safety::safe_checked() for non-SVE intrinsic: {name}" + )) + } + } + + fn is_safe(&self) -> bool { + match self { + Self::Safe => true, + Self::Unsafe(..) 
=> false, + } + } + + fn is_unsafe(&self) -> bool { + !self.is_safe() + } + + fn has_doc_comments(&self) -> bool { + match self { + Self::Safe => false, + Self::Unsafe(v) => !v.is_empty(), + } + } + + fn doc_comments(&self) -> &[UnsafetyComment] { + match self { + Self::Safe => &[], + Self::Unsafe(v) => v.as_slice(), + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum UnsafetyComment { + Custom(String), + Uninitialized, + PointerOffset(GovernedBy), + PointerOffsetVnum(GovernedBy), + Dereference(GovernedBy), + UnpredictableOnFault, + NonTemporal, + Neon, + NoProvenance(String), +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum GovernedBy { + #[default] + Predicated, + PredicatedNonFaulting, + PredicatedFirstFaulting, +} + +impl fmt::Display for GovernedBy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Predicated => write!(f, " (governed by `pg`)"), + Self::PredicatedNonFaulting => write!( + f, + " (governed by `pg`, the first-fault register (`FFR`) \ + and non-faulting behaviour)" + ), + Self::PredicatedFirstFaulting => write!( + f, + " (governed by `pg`, the first-fault register (`FFR`) \ + and first-faulting behaviour)" + ), + } + } +} + +impl fmt::Display for UnsafetyComment { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Custom(s) => s.fmt(f), + Self::Neon => write!(f, "Neon instrinsic unsafe"), + Self::Uninitialized => write!( + f, + "This creates an uninitialized value, and may be unsound (like \ + [`core::mem::uninitialized`])." + ), + Self::PointerOffset(gov) => write!( + f, + "[`pointer::offset`](pointer#method.offset) safety constraints must \ + be met for the address calculation for each active element{gov}." + ), + Self::PointerOffsetVnum(gov) => write!( + f, + "[`pointer::offset`](pointer#method.offset) safety constraints must \ + be met for the address calculation for each active element{gov}. \ + In particular, note that `vnum` is scaled by the vector \ + length, `VL`, which is not known at compile time." + ), + Self::Dereference(gov) => write!( + f, + "This dereferences and accesses the calculated address for each \ + active element{gov}." + ), + Self::NonTemporal => write!( + f, + "Non-temporal accesses have special memory ordering rules, and \ + [explicit barriers may be required for some applications]\ + (https://developer.arm.com/documentation/den0024/a/Memory-Ordering/Barriers/Non-temporal-load-and-store-pair?lang=en)." + ), + Self::NoProvenance(arg) => write!( + f, + "Addresses passed in `{arg}` lack provenance, so this is similar to using a \ + `usize as ptr` cast (or [`core::ptr::from_exposed_addr`]) on each lane before \ + using it." + ), + Self::UnpredictableOnFault => write!( + f, + "Result lanes corresponding to inactive FFR lanes (either before or as a result \ + of this intrinsic) have \"CONSTRAINED UNPREDICTABLE\" values, irrespective of \ + predication. Refer to architectural documentation for details." + ), + } + } +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct Intrinsic { + #[serde(default)] + pub visibility: FunctionVisibility, + #[serde(default)] + pub doc: Option, + #[serde(flatten)] + pub signature: Signature, + /// Function sequential composition + pub compose: Vec, + /// Input to generate the intrinsic against. Leave empty if the intrinsic + /// does not have any variants. 
+ /// Specific variants contain one InputSet + #[serde(flatten, default)] + pub input: IntrinsicInput, + #[serde(default)] + pub constraints: Vec, + /// Additional target features to add to the global settings + #[serde(default)] + pub target_features: Vec, + /// Should the intrinsic be `unsafe`? By default, the generator will try to guess from the + /// prototype, but it errs on the side of `unsafe`, and prints a warning in that case. + #[serde(default)] + pub safety: Option, + #[serde(default)] + pub substitutions: HashMap, + /// List of the only indices in a typeset that require conversion to signed + /// when deferring unsigned intrinsics to signed. (optional, default + /// behaviour is all unsigned types are converted to signed) + #[serde(default)] + pub defer_to_signed_only_indices: HashSet, + pub assert_instr: Option>, + /// Whether we should generate a test for this intrinsic + #[serde(default)] + pub test: Test, + /// Primary base type, used for instruction assertion. + #[serde(skip)] + pub base_type: Option, + /// Attributes for the function + pub attr: Option>, + /// Big endian variant for composing, this gets populated internally + #[serde(skip)] + pub big_endian_compose: Vec, + /// Big endian sometimes needs the bits inverted in a way that cannot be + /// automatically detected + #[serde(default)] + pub big_endian_inverse: Option, +} + +impl Intrinsic { + pub fn llvm_link(&self) -> Option<&LLVMLink> { + self.compose.iter().find_map(|ex| { + if let Expression::LLVMLink(llvm_link) = ex { + Some(llvm_link) + } else { + None + } + }) + } + + pub fn llvm_link_mut(&mut self) -> Option<&mut LLVMLink> { + self.compose.iter_mut().find_map(|ex| { + if let Expression::LLVMLink(llvm_link) = ex { + Some(llvm_link) + } else { + None + } + }) + } + + pub fn generate_variants(&self, global_ctx: &GlobalContext) -> context::Result> { + let wrap_err = |err| format!("{}: {err}", self.signature.name); + + let mut group_ctx = GroupContext::default(); + self.input + .variants(self) + .map_err(wrap_err)? + .map(|input| { + self.generate_variant(input.clone(), &mut group_ctx, global_ctx) + .map_err(wrap_err) + .map(|variant| (variant, input)) + }) + .collect::>>() + .and_then(|mut variants| { + variants.sort_by_cached_key(|(_, input)| input.to_owned()); + + if variants.is_empty() { + let standalone_variant = self + .generate_variant(InputSet::default(), &mut group_ctx, global_ctx) + .map_err(wrap_err)?; + + Ok(vec![standalone_variant]) + } else { + Ok(variants + .into_iter() + .map(|(variant, _)| variant) + .collect_vec()) + } + }) + } + + pub fn generate_variant( + &self, + input: InputSet, + group_ctx: &mut GroupContext, + global_ctx: &GlobalContext, + ) -> context::Result { + let mut variant = self.clone(); + + variant.input.types = vec![InputSetEntry::new(vec![input.clone()])]; + + let mut local_ctx = LocalContext::new(input, self); + let mut ctx = Context { + local: &mut local_ctx, + group: group_ctx, + global: global_ctx, + }; + + variant.pre_build(&mut ctx)?; + + match ctx.local.predicate_form().cloned() { + Some(PredicateForm::DontCare(method)) => { + variant.compose = variant.generate_dont_care_pass_through(&mut ctx, method)? + } + Some(PredicateForm::Zeroing(method)) => { + variant.compose = variant.generate_zeroing_pass_through(&mut ctx, method)? 
+ } + _ => { + for idx in 0..variant.compose.len() { + let mut ex = variant.compose[idx].clone(); + ex.build(&variant, &mut ctx)?; + variant.compose[idx] = ex; + } + } + }; + + if variant.attr.is_none() && variant.assert_instr.is_none() { + panic!( + "Error: {} is missing both 'attr' and 'assert_instr' fields. You must either manually declare the attributes using the 'attr' field or use 'assert_instr'!", + variant.signature.name + ); + } + + if variant.attr.is_some() { + let attr: &Vec = &variant.attr.clone().unwrap(); + let mut expanded_attr: Vec = Vec::new(); + for mut ex in attr.iter().cloned() { + ex.build(&variant, &mut ctx)?; + expanded_attr.push(ex); + } + variant.attr = Some(expanded_attr); + } + + variant.post_build(&mut ctx)?; + + /* If we should generate big endian we shall do so. It's possible + * we may not want to in some instances */ + if ctx.global.auto_big_endian.unwrap_or(false) { + self.generate_big_endian(&mut variant); + } + + if let Some(n_variant_op) = ctx.local.n_variant_op().cloned() { + variant.generate_n_variant(n_variant_op, &mut ctx) + } else { + Ok(variant) + } + } + + /// Add a big endian implementation + fn generate_big_endian(&self, variant: &mut Intrinsic) { + /* We can't always blindly reverse the bits only in certain conditions + * do we need a different order - thus this allows us to have the + * ability to do so without having to play codegolf with the yaml AST */ + let should_reverse = { + if let Some(should_reverse) = variant.big_endian_inverse { + should_reverse + } else if variant.compose.len() == 1 { + match &variant.compose[0] { + Expression::FnCall(fn_call) => fn_call.0.to_string() == "transmute", + _ => false, + } + } else { + false + } + }; + + if !should_reverse { + return; + } + + let mut big_endian_expressions: Vec = Vec::new(); + + /* We cannot assign `a.0 = ` directly to a function parameter so + * need to make them mutable */ + for function_parameter in &variant.signature.arguments { + if type_has_tuple(&function_parameter.kind) { + /* We do not want to be creating a `mut` variant if the type + * has one lane. 
If it has one lane, there is nothing to shuffle. */
+                #[allow(clippy::collapsible_if)]
+                if let TypeKind::Vector(vector_type) = &function_parameter.kind {
+                    if vector_type.lanes() == 1 {
+                        continue;
+                    }
+                }
+
+                let mutable_variable = make_variable_mutable(
+                    &function_parameter.name.to_string(),
+                    &function_parameter.kind,
+                );
+                big_endian_expressions.push(mutable_variable);
+            }
+        }
+
+        /* Possibly shuffle the vectors */
+        for function_parameter in &variant.signature.arguments {
+            if let Some(shuffle_call) = create_assigned_shuffle_call(
+                &function_parameter.name.to_string(),
+                &function_parameter.kind,
+            ) {
+                big_endian_expressions.push(shuffle_call);
+            }
+        }
+
+        if !big_endian_expressions.is_empty() {
+            Vec::reserve(
+                &mut variant.big_endian_compose,
+                big_endian_expressions.len() + variant.compose.len(),
+            );
+            let mut expression = &variant.compose[0];
+            let needs_reordering = expression.is_static_assert() || expression.is_llvm_link();
+
+            /* We want to keep the asserts and llvm links at the start of
+             * the new big_endian_compose vector that we are creating */
+            if needs_reordering {
+                let mut expression_idx = 0;
+                while expression.is_static_assert() || expression.is_llvm_link() {
+                    /* Add static asserts and llvm links to the start of the
+                     * vector */
+                    variant.big_endian_compose.push(expression.clone());
+                    expression_idx += 1;
+                    expression = &variant.compose[expression_idx];
+                }
+
+                /* Add the big endian specific expressions */
+                variant.big_endian_compose.extend(big_endian_expressions);
+
+                /* Add the rest of the expressions */
+                for i in expression_idx..variant.compose.len() {
+                    variant.big_endian_compose.push(variant.compose[i].clone());
+                }
+            } else {
+                /* If nothing needs reordering, add the big endian specific
+                 * expressions first and then concatenate the original
+                 * compose vector */
+                variant.big_endian_compose.extend(big_endian_expressions);
+                variant
+                    .big_endian_compose
+                    .extend(variant.compose.iter().cloned());
+            }
+        }
+
+        /* If we have a return type, there is a possibility we want to generate
+         * a shuffle call */
+        if let Some(return_type) = &variant.signature.return_type {
+            let return_value = variant
+                .compose
+                .last()
+                .expect("Cannot define a return type with an empty function body");
+
+            /* If we do not create a shuffle call, we do not need to modify the
+             * return value or append to the big endian AST array. This is a
+             * little confusing: in code we make the final call before
+             * capturing the return value of the intrinsic that was called. */
+            let ret_val_name = "ret_val".to_string();
+            if let Some(simd_shuffle_call) = create_shuffle_call(&ret_val_name, return_type) {
+                /* The function arguments may not have required big endian
+                 * treatment, so we need to add the original function body
+                 * before appending the return value. */
+                if variant.big_endian_compose.is_empty() {
+                    variant
+                        .big_endian_compose
+                        .extend(variant.compose.iter().cloned());
+                }
+
+                /* Now we shuffle the return value - we are creating a new
+                 * return value for the intrinsic.
*/ + let return_value_variable = if type_has_tuple(return_type) { + create_mut_let_variable(&ret_val_name, return_type, return_value.clone()) + } else { + create_let_variable(&ret_val_name, return_type, return_value.clone()) + }; + + /* Remove the last item which will be the return value */ + variant.big_endian_compose.pop(); + variant.big_endian_compose.push(return_value_variable); + variant.big_endian_compose.push(simd_shuffle_call); + if type_has_tuple(return_type) { + /* We generated `tuple_count` number of calls to shuffle + * re-assigning each tuple however those generated calls do + * not make the parent function return. So we add the return + * value here */ + variant + .big_endian_compose + .push(create_symbol_identifier(&ret_val_name)); + } + } + } + } + + /// Implement a "zeroing" (_z) method by calling an existing "merging" (_m) method, as required. + fn generate_zeroing_pass_through( + &mut self, + ctx: &mut Context, + method: ZeroingMethod, + ) -> context::Result> { + PredicationMask::try_from(&ctx.local.signature.name) + .ok() + .filter(|mask| mask.has_merging()) + .ok_or_else(|| format!("cannot generate zeroing passthrough for {}, no merging predicate form is specified", self.signature.name))?; + + // Determine the function to pass through to. + let mut target_ctx = ctx.local.clone(); + // Change target function predicate form to merging + *target_ctx.input.iter_mut() + .find_map(|arg| arg.predicate_form_mut()) + .expect("failed to generate zeroing pass through, could not find predicate form in the InputSet") = PredicateForm::Merging; + + let mut sig = target_ctx.signature.clone(); + sig.build(&target_ctx)?; + + let args_as_expressions = |arg: &Argument| -> context::Result { + let arg_name = arg.name.to_string(); + match &method { + ZeroingMethod::Drop { drop } if arg_name == drop.to_string() => { + Ok(PredicateForm::make_zeroinitializer(&arg.kind)) + } + ZeroingMethod::Select { select } if arg_name == select.to_string() => { + let pg = sig + .arguments + .iter() + .find_map(|arg| match arg.kind.vector() { + Some(ty) if ty.base_type().is_bool() => Some(arg.name.clone()), + _ => None, + }) + .ok_or_else(|| { + format!("cannot generate zeroing passthrough for {}, no predicate found in the signature for zero selection", self.signature.name) + })?; + Ok(PredicateForm::make_zeroselector( + pg, + select.clone(), + &arg.kind, + )) + } + _ => Ok(arg.into()), + } + }; + + let name: Expression = sig.fn_name().into(); + let args: Vec = sig + .arguments + .iter() + .map(args_as_expressions) + .try_collect()?; + let statics: Vec = sig + .static_defs + .iter() + .map(|sd| sd.try_into()) + .try_collect()?; + let mut call: Expression = FnCall(Box::new(name), args, statics, false).into(); + call.build(self, ctx)?; + Ok(vec![call]) + } + + /// Implement a "don't care" (_x) method by calling an existing "merging" (_m). + fn generate_dont_care_pass_through( + &mut self, + ctx: &mut Context, + method: DontCareMethod, + ) -> context::Result> { + PredicationMask::try_from(&ctx.local.signature.name).and_then(|mask| match method { + DontCareMethod::AsMerging if mask.has_merging() => Ok(()), + DontCareMethod::AsZeroing if mask.has_zeroing() => Ok(()), + _ => Err(format!( + "cannot generate don't care passthrough for {}, no {} predicate form is specified", + self.signature.name, + match method { + DontCareMethod::AsMerging => "merging", + DontCareMethod::AsZeroing => "zeroing", + _ => unreachable!(), + } + )), + })?; + + // Determine the function to pass through to. 
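To make the pass-through generation above easier to follow, here is a hand-written sketch of the shape of function these methods aim to emit. It uses stand-in types and the illustrative name `svabs_s8` rather than real generator output, and it shows only the `ZeroingMethod::Drop` case, where the dropped `inactive` operand of the merging form is replaced by the zero vector built in `PredicateForm::make_zeroinitializer`.

```rust
// Stand-in types so the sketch is self-contained; the generated code uses the
// real svbool_t/svint8_t SVE types and carries #[target_feature] attributes.
#[derive(Clone, Copy)]
pub struct SvBool;
#[derive(Clone, Copy)]
pub struct SvInt8;

// Stub of the merging form: lanes where the predicate is false keep `inactive`.
pub fn svabs_s8_m(inactive: SvInt8, _pg: SvBool, _op: SvInt8) -> SvInt8 {
    inactive
}

// Stub of the zero vector that `make_zeroinitializer` expands to for `s8`.
pub fn svdup_n_s8(_value: i8) -> SvInt8 {
    SvInt8
}

// The zeroing pass-through: `ZeroingMethod::Drop` removes `inactive` from the
// `_z` signature and substitutes a zero vector in the forwarded call.
pub fn svabs_s8_z(pg: SvBool, op: SvInt8) -> SvInt8 {
    svabs_s8_m(svdup_n_s8(0), pg, op)
}
```

The don't-care `_x` forms are produced the same way, except that with `DontCareMethod::AsMerging` the dropped operand is reinterpreted from another argument where possible instead of being zeroed.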
+ let mut target_ctx = ctx.local.clone(); + // Change target function predicate form to merging + *target_ctx.input.iter_mut() + .find_map(|arg| arg.predicate_form_mut()) + .expect("failed to generate don't care passthrough, could not find predicate form in the InputSet") = PredicateForm::Merging; + + let mut sig = target_ctx.signature.clone(); + sig.build(&target_ctx)?; + + // We might need to drop an argument for a zeroing pass-through. + let drop = match (method, &self.input.predication_methods.zeroing_method) { + (DontCareMethod::AsZeroing, Some(ZeroingMethod::Drop { drop })) => Some(drop), + _ => None, + }; + + let name: Expression = sig.fn_name().into(); + let args: Vec = sig + .arguments + .iter() + .map(|arg| { + if Some(arg.name.to_string()) == drop.as_ref().map(|v| v.to_string()) { + // This argument is present in the _m form, but missing from the _x form. Clang + // typically replaces these with an uninitialised vector, but to avoid + // materialising uninitialised values in Rust, we instead merge with a known + // vector. This usually results in the same code generation. + // TODO: In many cases, it'll be better to use an unpredicated (or zeroing) form. + sig.arguments + .iter() + .filter(|&other| arg.name.to_string() != other.name.to_string()) + .find_map(|other| { + arg.kind.express_reinterpretation_from(&other.kind, other) + }) + .unwrap_or_else(|| PredicateForm::make_zeroinitializer(&arg.kind)) + } else { + arg.into() + } + }) + .collect(); + let statics: Vec = sig + .static_defs + .iter() + .map(|sd| sd.try_into()) + .try_collect()?; + let mut call: Expression = FnCall(Box::new(name), args, statics, false).into(); + call.build(self, ctx)?; + Ok(vec![call]) + } + + /// Implement a "_n" variant based on the given operand + fn generate_n_variant( + &self, + mut n_variant_op: WildString, + ctx: &mut Context, + ) -> context::Result { + let mut variant = self.clone(); + + n_variant_op.build_acle(ctx.local)?; + + let n_op_arg_idx = variant + .signature + .arguments + .iter_mut() + .position(|arg| arg.name.to_string() == n_variant_op.to_string()) + .ok_or_else(|| { + format!( + "cannot generate `_n` variant for {}, operand `{n_variant_op}` not found", + variant.signature.name + ) + })?; + + let has_n_wildcard = ctx + .local + .signature + .name + .wildcards() + .any(|w| matches!(w, Wildcard::NVariant)); + + if !has_n_wildcard { + return Err(format!( + "cannot generate `_n` variant for {}, no wildcard {{_n}} was specified in the intrinsic's name", + variant.signature.name + )); + } + + // Build signature + variant.signature = ctx.local.signature.clone(); + if let Some(pf) = ctx.local.predicate_form() { + // WARN: this may break in the future according to the underlying implementation + // Drops unwanted arguments if needed (required for the collection of arguments to pass to the function) + pf.post_build(&mut variant)?; + } + + let sig = &mut variant.signature; + + ctx.local + .substitutions + .insert(Wildcard::NVariant, "_n".to_owned()); + + let arg_kind = &mut sig.arguments.get_mut(n_op_arg_idx).unwrap().kind; + *arg_kind = match arg_kind { + TypeKind::Wildcard(Wildcard::SVEType(idx, None)) => { + TypeKind::Wildcard(Wildcard::Type(*idx)) + } + _ => { + return Err(format!( + "cannot generate `_n` variant for {}, the given operand is not a valid SVE type", + variant.signature.name + )); + } + }; + + sig.build(ctx.local)?; + + // Build compose + let name: Expression = self.signature.fn_name().into(); + let args: Vec = sig + .arguments + .iter() + .enumerate() + .map(|(idx, arg)| { + 
let ty = arg.kind.acle_notation_repr(); + if idx == n_op_arg_idx { + FnCall::new_expression( + WildString::from(format!("svdup_n_{ty}")).into(), + vec![arg.into()], + ) + } else { + arg.into() + } + }) + .collect(); + let statics: Vec = sig + .static_defs + .iter() + .map(|sd| sd.try_into()) + .try_collect()?; + let mut call: Expression = FnCall(Box::new(name), args, statics, false).into(); + call.build(self, ctx)?; + + variant.compose = vec![call]; + variant.signature.predicate_needs_conversion = true; + + Ok(variant) + } + + fn pre_build(&mut self, ctx: &mut Context) -> context::Result { + self.substitutions + .iter_mut() + .try_for_each(|(k, v)| -> context::Result { + let mut ws = v.get(ctx.local)?; + ws.build_acle(ctx.local)?; + ctx.local + .substitutions + .insert(Wildcard::Custom(k.to_owned()), ws.to_string()); + Ok(()) + })?; + + self.signature.build(ctx.local)?; + + if self.safety.is_none() { + self.safety = match Safety::safe_checked(self) { + Ok(safe) => Some(safe), + Err(err) => { + eprintln!("{err}"); + return Err(format!( + "Refusing to infer unsafety for {name}", + name = self.signature.doc_name() + )); + } + } + } + + if let Some(doc) = &mut self.doc { + doc.build_acle(ctx.local)? + } + + // Add arguments to variable tracking + self.signature + .arguments + .iter() + .for_each(|arg| arg.populate_variables(&mut ctx.local.variables)); + + // Add constant expressions to variable tracking + self.signature + .static_defs + .iter() + .filter_map(StaticDefinition::as_variable) + .for_each(|(var_name, var_properties)| { + ctx.local.variables.insert(var_name, var_properties); + }); + + // Pre-build compose expressions + for idx in 0..self.compose.len() { + let mut ex = self.compose[idx].clone(); + ex.pre_build(ctx)?; + self.compose[idx] = ex; + } + + if !ctx.local.input.is_empty() { + // We simplify the LLVM link transmute logic by deferring to a variant employing the same LLVM link where possible + if let Some(link) = self.compose.iter().find_map(|ex| match ex { + Expression::LLVMLink(link) => Some(link), + _ => None, + }) { + let mut link = link.clone(); + link.build(ctx)?; + + for cfg in ctx.global.arch_cfgs.iter() { + let expected_link = link.resolve(cfg); + if let Some(target_inputset) = ctx.group.links.get(&expected_link) { + self.defer_to_existing_llvm_link(ctx.local, target_inputset)?; + break; + } + } + } + } + + if let Some(ref mut assert_instr) = self.assert_instr { + assert_instr.iter_mut().try_for_each(|ai| ai.build(ctx))?; + } + + // Prepend constraint assertions + self.constraints.iter_mut().try_for_each(|c| c.build(ctx))?; + let assertions: Vec<_> = self + .constraints + .iter() + .map(|c| ctx.local.make_assertion_from_constraint(c)) + .try_collect()?; + self.compose.splice(0..0, assertions); + + Ok(()) + } + + fn post_build(&mut self, ctx: &mut Context) -> context::Result { + if let Some(Expression::LLVMLink(link)) = self.compose.last() { + let mut fn_call = link.make_fn_call(&self.signature)?; + // Required to inject conversions + fn_call.build(self, ctx)?; + self.compose.push(fn_call) + } + + if let Some(llvm_link) = self.llvm_link_mut() { + /* Turn all Rust unsigned types into signed if required */ + if ctx.global.auto_llvm_sign_conversion { + llvm_link.sanitise_uints(); + } + } + + if let Some(predicate_form) = ctx.local.predicate_form() { + predicate_form.post_build(self)? 
+ } + + // Set for ToTokens to display a generic svbool_t + self.signature.predicate_needs_conversion = true; + + // Set base type kind for instruction assertion + self.base_type = ctx + .local + .input + .get(0) + .and_then(|arg| arg.typekind()) + .and_then(|ty| ty.base_type()) + .cloned(); + + // Add global target features + self.target_features = ctx + .global + .arch_cfgs + .iter() + .flat_map(|cfg| cfg.target_feature.clone()) + .chain(self.target_features.clone()) + .collect_vec(); + + Ok(()) + } + + fn defer_to_existing_llvm_link( + &mut self, + ctx: &LocalContext, + target_inputset: &InputSet, + ) -> context::Result { + let mut target_ctx = ctx.clone(); + target_ctx.input = target_inputset.clone(); + + let mut target_signature = target_ctx.signature.clone(); + target_signature.build(&target_ctx)?; + + let drop_var = if let Some(pred) = ctx.predicate_form().cloned() { + match pred { + PredicateForm::Zeroing(ZeroingMethod::Drop { drop }) => Some(drop), + PredicateForm::DontCare(DontCareMethod::AsZeroing) => { + if let Some(ZeroingMethod::Drop { drop }) = + self.input.predication_methods.zeroing_method.to_owned() + { + Some(drop) + } else { + None + } + } + _ => None, + } + } else { + None + }; + + let call_method = + |ex, method: &str| Expression::MethodCall(Box::new(ex), method.to_string(), vec![]); + let as_unsigned = |ex| call_method(ex, "as_unsigned"); + let as_signed = |ex| call_method(ex, "as_signed"); + let convert_if_required = |w: Option<&Wildcard>, from: &InputSet, to: &InputSet, ex| { + if let Some(w) = w { + if let Some(dest_idx) = w.get_typeset_index() { + let from_type = from.get(dest_idx); + let to_type = to.get(dest_idx); + + if from_type != to_type { + let from_base_type = from_type + .and_then(|in_arg| in_arg.typekind()) + .and_then(|ty| ty.base_type()) + .map(|bt| bt.kind()); + let to_base_type = to_type + .and_then(|in_arg| in_arg.typekind()) + .and_then(|ty| ty.base_type()) + .map(|bt| bt.kind()); + + match (from_base_type, to_base_type) { + // Use AsSigned for uint -> int + (Some(BaseTypeKind::UInt), Some(BaseTypeKind::Int)) => as_signed(ex), + (Some(BaseTypeKind::Int), Some(BaseTypeKind::Int)) => ex, + // Use AsUnsigned for int -> uint + (Some(BaseTypeKind::Int), Some(BaseTypeKind::UInt)) => as_unsigned(ex), + (Some(BaseTypeKind::Float), Some(BaseTypeKind::Float)) => ex, + (Some(BaseTypeKind::UInt), Some(BaseTypeKind::UInt)) => ex, + (Some(BaseTypeKind::Poly), Some(BaseTypeKind::Poly)) => ex, + + (None, None) => ex, + _ => unreachable!( + "unsupported conversion case from {from_base_type:?} to {to_base_type:?} hit" + ), + } + } else { + ex + } + } else { + ex + } + } else { + ex + } + }; + + let args = ctx + .signature + .arguments + .iter() + .filter_map(|arg| { + let var = Expression::Identifier(arg.name.to_owned(), IdentifierType::Variable); + if drop_var.as_ref().map(|v| v.to_string()) != Some(arg.name.to_string()) { + Some(convert_if_required( + arg.kind.wildcard(), + &ctx.input, + target_inputset, + var, + )) + } else { + None + } + }) + .collect_vec(); + + let turbofish = self + .signature + .static_defs + .iter() + .map(|def| { + let name = match def { + StaticDefinition::Constant(Argument { name, .. 
}) => name.to_string(), + StaticDefinition::Generic(name) => name.to_string(), + }; + Expression::Identifier(name.into(), IdentifierType::Symbol) + }) + .collect_vec(); + + let ret_wildcard = ctx + .signature + .return_type + .as_ref() + .and_then(|t| t.wildcard()); + let call = FnCall( + Box::new(target_signature.fn_name().into()), + args, + turbofish, + false, + ) + .into(); + + self.compose = vec![convert_if_required( + ret_wildcard, + target_inputset, + &ctx.input, + call, + )]; + + Ok(()) + } +} + +/// Some intrinsics require a little endian and big endian implementation, others +/// do not +enum Endianness { + Little, + Big, + NA, +} + +/// Based on the endianess will create the appropriate intrinsic, or simply +/// create the desired intrinsic without any endianess +fn create_tokens(intrinsic: &Intrinsic, endianness: Endianness, tokens: &mut TokenStream) { + let signature = &intrinsic.signature; + let fn_name = signature.fn_name().to_string(); + let target_feature = intrinsic.target_features.join(","); + let safety = intrinsic + .safety + .as_ref() + .expect("safety should be determined during `pre_build`"); + + if let Some(doc) = &intrinsic.doc { + let mut doc = vec![doc.to_string()]; + + doc.push(format!("[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/{})", &signature.doc_name())); + + if safety.has_doc_comments() { + doc.push("## Safety".to_string()); + for comment in safety.doc_comments() { + doc.push(format!(" * {comment}")); + } + } else { + assert!( + safety.is_safe(), + "{fn_name} is both public and unsafe, and so needs safety documentation" + ); + } + + tokens.append_all(quote! { #(#[doc = #doc])* }); + } else { + assert!( + matches!(intrinsic.visibility, FunctionVisibility::Private), + "{fn_name} needs to be private, or to have documentation." + ); + assert!( + !safety.has_doc_comments(), + "{fn_name} needs a documentation section for its safety comments." + ); + } + + tokens.append_all(quote! { #[inline] }); + + match endianness { + Endianness::Little => tokens.append_all(quote! { #[cfg(target_endian = "little")] }), + Endianness::Big => tokens.append_all(quote! { #[cfg(target_endian = "big")] }), + Endianness::NA => {} + }; + + let expressions = match endianness { + Endianness::Little | Endianness::NA => &intrinsic.compose, + Endianness::Big => &intrinsic.big_endian_compose, + }; + + /* If we have manually defined attributes on the block of yaml with + * 'attr:' we want to add them */ + if let Some(attr) = &intrinsic.attr { + /* Scan to see if we have defined `FnCall: [target_feature, ['']]`*/ + if !has_target_feature_attr(attr) { + /* If not add the default one that is defined at the top of + * the yaml file. This does mean we scan the attributes vector + * twice, once to see if the `target_feature` exists and again + * to actually append the tokens. We could impose that the + * `target_feature` call has to be the first argument of the + * `attr` block */ + tokens.append_all(quote! { + #[target_feature(enable = #target_feature)] + }); + } + + /* Target feature will get added here */ + let attr_expressions = &mut attr.iter().peekable(); + for ex in attr_expressions { + let mut inner = TokenStream::new(); + ex.to_tokens(&mut inner); + tokens.append(Punct::new('#', Spacing::Alone)); + tokens.append(Group::new(Delimiter::Bracket, inner)); + } + } else { + tokens.append_all(quote! 
{ + #[target_feature(enable = #target_feature)] + }); + } + + #[allow(clippy::collapsible_if)] + if let Some(assert_instr) = &intrinsic.assert_instr { + if !assert_instr.is_empty() { + InstructionAssertionsForBaseType(assert_instr, &intrinsic.base_type.as_ref()) + .to_tokens(tokens) + } + } + + match &intrinsic.visibility { + FunctionVisibility::Public => tokens.append_all(quote! { pub }), + FunctionVisibility::Private => {} + } + if safety.is_unsafe() { + tokens.append_all(quote! { unsafe }); + } + tokens.append_all(quote! { #signature }); + + // If the intrinsic function is explicitly unsafe, we populate `body_default_safety` with + // the implementation. No explicit unsafe blocks are required. + // + // If the intrinsic is safe, we fill `body_default_safety` until we encounter an expression + // that requires an unsafe wrapper, then switch to `body_unsafe`. Since the unsafe + // operation (e.g. memory access) is typically the last step, this tends to minimises the + // amount of unsafe code required. + let mut body_default_safety = TokenStream::new(); + let mut body_unsafe = TokenStream::new(); + let mut body_current = &mut body_default_safety; + for (pos, ex) in expressions.iter().with_position() { + if safety.is_safe() && ex.requires_unsafe_wrapper(&fn_name) { + body_current = &mut body_unsafe; + } + ex.to_tokens(body_current); + let is_last = matches!(pos, itertools::Position::Last | itertools::Position::Only); + let is_llvm_link = matches!(ex, Expression::LLVMLink(_)); + if !is_last && !is_llvm_link { + body_current.append(Punct::new(';', Spacing::Alone)); + } + } + let mut body = body_default_safety; + if !body_unsafe.is_empty() { + body.append_all(quote! { unsafe { #body_unsafe } }); + } + + tokens.append(Group::new(Delimiter::Brace, body)); +} + +impl ToTokens for Intrinsic { + fn to_tokens(&self, tokens: &mut TokenStream) { + if !self.big_endian_compose.is_empty() { + for i in 0..2 { + match i { + 0 => create_tokens(self, Endianness::Little, tokens), + 1 => create_tokens(self, Endianness::Big, tokens), + _ => panic!("Currently only little and big endian exist"), + } + } + } else { + create_tokens(self, Endianness::NA, tokens); + } + } +} + +fn has_target_feature_attr(attrs: &[Expression]) -> bool { + attrs.iter().any(|attr| { + if let Expression::FnCall(fn_call) = attr { + fn_call.is_target_feature_call() + } else { + false + } + }) +} diff --git a/library/stdarch/crates/stdarch-gen-arm/src/load_store_tests.rs b/library/stdarch/crates/stdarch-gen-arm/src/load_store_tests.rs new file mode 100644 index 0000000000000..5cf39b2e11aed --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/src/load_store_tests.rs @@ -0,0 +1,822 @@ +use std::fs::File; +use std::io::Write; +use std::path::PathBuf; +use std::str::FromStr; + +use crate::format_code; +use crate::input::InputType; +use crate::intrinsic::Intrinsic; +use crate::typekinds::BaseType; +use crate::typekinds::{ToRepr, TypeKind}; + +use itertools::Itertools; +use lazy_static::lazy_static; +use proc_macro2::TokenStream; +use quote::{format_ident, quote}; + +// Number of vectors in our buffers - the maximum tuple size, 4, plus 1 as we set the vnum +// argument to 1. 
+const NUM_VECS: usize = 5; +// The maximum vector length (in bits) +const VL_MAX_BITS: usize = 2048; +// The maximum vector length (in bytes) +const VL_MAX_BYTES: usize = VL_MAX_BITS / 8; +// The maximum number of elements in each vector type +const LEN_F32: usize = VL_MAX_BYTES / core::mem::size_of::<f32>(); +const LEN_F64: usize = VL_MAX_BYTES / core::mem::size_of::<f64>(); +const LEN_I8: usize = VL_MAX_BYTES / core::mem::size_of::<i8>(); +const LEN_I16: usize = VL_MAX_BYTES / core::mem::size_of::<i16>(); +const LEN_I32: usize = VL_MAX_BYTES / core::mem::size_of::<i32>(); +const LEN_I64: usize = VL_MAX_BYTES / core::mem::size_of::<i64>(); +const LEN_U8: usize = VL_MAX_BYTES / core::mem::size_of::<u8>(); +const LEN_U16: usize = VL_MAX_BYTES / core::mem::size_of::<u16>(); +const LEN_U32: usize = VL_MAX_BYTES / core::mem::size_of::<u32>(); +const LEN_U64: usize = VL_MAX_BYTES / core::mem::size_of::<u64>(); + +/// `load_intrinsics` and `store_intrinsics` are vectors of intrinsic +/// variants, while `out_path` is a file to write to. +pub fn generate_load_store_tests( + load_intrinsics: Vec<Intrinsic>, + store_intrinsics: Vec<Intrinsic>, + out_path: Option<&PathBuf>, +) -> Result<(), String> { + let output = match out_path { + Some(out) => { + Box::new(File::create(out).map_err(|e| format!("couldn't create tests file: {e}"))?) + as Box<dyn std::io::Write> + } + None => Box::new(std::io::stdout()) as Box<dyn std::io::Write>, + }; + let mut used_stores = vec![false; store_intrinsics.len()]; + let tests: Vec<_> = load_intrinsics + .iter() + .map(|load| { + let store_candidate = load + .signature + .fn_name() + .to_string() + .replace("svld1s", "svst1") + .replace("svld1u", "svst1") + .replace("svldnt1s", "svstnt1") + .replace("svldnt1u", "svstnt1") + .replace("svld", "svst") + .replace("gather", "scatter"); + + let store_index = store_intrinsics + .iter() + .position(|i| i.signature.fn_name().to_string() == store_candidate); + if let Some(i) = store_index { + used_stores[i] = true; + } + + generate_single_test( + load.clone(), + store_index.map(|i| store_intrinsics[i].clone()), + ) + }) + .try_collect()?; + + assert!( + used_stores.into_iter().all(|b| b), + "Not all store tests have been paired with a load. Consider generating specific store-only tests" + ); + + let preamble = + TokenStream::from_str(&PREAMBLE).map_err(|e| format!("Preamble is invalid: {e}"))?; + // Only output manual tests for the SVE set + let manual_tests = match &load_intrinsics[0].target_features[..] { + [s] if s == "sve" => TokenStream::from_str(&MANUAL_TESTS) + .map_err(|e| format!("Manual tests are invalid: {e}"))?, + _ => quote!(), + }; + format_code( + output, + format!( + "// This code is automatically generated. DO NOT MODIFY. +// +// Instead, modify `crates/stdarch-gen-arm/spec/sve` and run the following command to re-generate +// this file: +// +// ``` +// cargo run --bin=stdarch-gen-arm -- crates/stdarch-gen-arm/spec +// ``` +{}", + quote! { #preamble #(#tests)* #manual_tests } + ), + ) + .map_err(|e| format!("couldn't write tests: {e}")) +} + +/// A test looks like this: +/// ``` +/// let data = [scalable vector]; +/// +/// let mut storage = [0; N]; +/// +/// store_intrinsic([true_predicate], storage.as_mut_ptr(), data); +/// [test contents of storage] +/// +/// let loaded = load_intrinsic([true_predicate], storage.as_ptr()); +/// assert!(loaded == data); +/// ``` +/// We initialise our data such that the value stored matches the index it's stored to. +/// By doing this we can validate scatters by checking that each value in the storage +/// array is either 0 or the same as its index.
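Continuing the schematic in the doc comment above, the following is a hand-written approximation of one generated test for a contiguous load paired with its store (no gather arguments, tuple length 1, no `vnum`). The real output of `generate_single_test` spells out the `try_into()` conversions and relies on the `PREAMBLE` helpers defined later in this file, so exact identifiers may differ.

```rust
// Hand-written approximation of a generated test, assuming svld1_s32 is
// paired with svst1_s32; integer conversions are simplified for brevity.
#[simd_test(enable = "sve")]
unsafe fn test_svld1_s32_with_svst1_s32() {
    // storage_len = NUM_VECS * VL_MAX_BITS / 32 = 5 * 2048 / 32
    let mut storage = [0 as i32; 320];
    // Element i of the expected data holds the value i (see get_expected_range).
    let data = svindex_s32(0, 1);

    svst1_s32(svptrue_b32(), storage.as_mut_ptr(), data);
    // Validate the store: each element is either untouched (0) or equals its index.
    for (i, &val) in storage.iter().enumerate() {
        assert!(val == 0 as i32 || val == i as i32);
    }

    // The FFR must be set because the assert helpers mask against it.
    svsetffr();
    let loaded = svld1_s32(svptrue_b32(), storage.as_ptr() as *const i32);
    assert_vector_matches_i32(loaded, data);
}
```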
+fn generate_single_test( + load: Intrinsic, + store: Option, +) -> Result { + let chars = LdIntrCharacteristics::new(&load)?; + let fn_name = load.signature.fn_name().to_string(); + + #[allow(clippy::collapsible_if)] + if let Some(ty) = &chars.gather_bases_type { + if ty.base_type().unwrap().get_size() == Ok(32) + && chars.gather_index_type.is_none() + && chars.gather_offset_type.is_none() + { + // We lack a way to ensure data is in the bottom 32 bits of the address space + println!("Skipping test for {fn_name}"); + return Ok(quote!()); + } + } + + if fn_name.starts_with("svldff1") && fn_name.contains("gather") { + // TODO: We can remove this check when first-faulting gathers are fixed in CI's QEMU + // https://gitlab.com/qemu-project/qemu/-/issues/1612 + println!("Skipping test for {fn_name}"); + return Ok(quote!()); + } + + let fn_ident = format_ident!("{fn_name}"); + let test_name = format_ident!( + "test_{fn_name}{}", + if let Some(ref store) = store { + format!("_with_{}", store.signature.fn_name()) + } else { + String::new() + } + ); + + let load_type = &chars.load_type; + let acle_type = load_type.acle_notation_repr(); + + // If there's no return type, fallback to the load type for things that depend on it + let ret_type = &load + .signature + .return_type + .as_ref() + .and_then(TypeKind::base_type) + .unwrap_or(load_type); + + let pred_fn = format_ident!("svptrue_b{}", load_type.size()); + + let load_type_caps = load_type.rust_repr().to_uppercase(); + let data_array = format_ident!("{load_type_caps}_DATA"); + + let size_fn = format_ident!("svcnt{}", ret_type.size_literal()); + + let rust_ret_type = ret_type.rust_repr(); + let assert_fn = format_ident!("assert_vector_matches_{rust_ret_type}"); + + // Use vnum=1, so adjust all values by one vector length + let (length_call, vnum_arg) = if chars.vnum { + if chars.is_prf { + (quote!(), quote!(, 1)) + } else { + (quote!(let len = #size_fn() as usize;), quote!(, 1)) + } + } else { + (quote!(), quote!()) + }; + + let (bases_load, bases_arg) = if let Some(ty) = &chars.gather_bases_type { + // Bases is a vector of (sometimes 32-bit) pointers + // When we combine bases with an offset/index argument, we load from the data arrays + // starting at 1 + let base_ty = ty.base_type().unwrap(); + let rust_type = format_ident!("{}", base_ty.rust_repr()); + let index_fn = format_ident!("svindex_{}", base_ty.acle_notation_repr()); + let size_in_bytes = chars.load_type.get_size().unwrap() / 8; + + if base_ty.get_size().unwrap() == 32 { + // Treat bases as a vector of offsets here - we don't test this without an offset or + // index argument + ( + Some(quote!( + let bases = #index_fn(0, #size_in_bytes.try_into().unwrap()); + )), + quote!(, bases), + ) + } else { + // Treat bases as a vector of pointers + let base_fn = format_ident!("svdup_n_{}", base_ty.acle_notation_repr()); + let data_array = if store.is_some() { + format_ident!("storage") + } else { + format_ident!("{}_DATA", chars.load_type.rust_repr().to_uppercase()) + }; + + let add_fn = format_ident!("svadd_{}_x", base_ty.acle_notation_repr()); + ( + Some(quote! 
{ + let bases = #base_fn(#data_array.as_ptr() as #rust_type); + let offsets = #index_fn(0, #size_in_bytes.try_into().unwrap()); + let bases = #add_fn(#pred_fn(), bases, offsets); + }), + quote!(, bases), + ) + } + } else { + (None, quote!()) + }; + + let index_arg = if let Some(ty) = &chars.gather_index_type { + let rust_type = format_ident!("{}", ty.rust_repr()); + if chars + .gather_bases_type + .as_ref() + .and_then(TypeKind::base_type) + .map_or(Err(String::new()), BaseType::get_size) + .unwrap() + == 32 + { + // Let index be the base of the data array + let data_array = if store.is_some() { + format_ident!("storage") + } else { + format_ident!("{}_DATA", chars.load_type.rust_repr().to_uppercase()) + }; + let size_in_bytes = chars.load_type.get_size().unwrap() / 8; + quote!(, #data_array.as_ptr() as #rust_type / (#size_in_bytes as #rust_type) + 1) + } else { + quote!(, 1.try_into().unwrap()) + } + } else { + quote!() + }; + + let offset_arg = if let Some(ty) = &chars.gather_offset_type { + let size_in_bytes = chars.load_type.get_size().unwrap() / 8; + if chars + .gather_bases_type + .as_ref() + .and_then(TypeKind::base_type) + .map_or(Err(String::new()), BaseType::get_size) + .unwrap() + == 32 + { + // Let offset be the base of the data array + let rust_type = format_ident!("{}", ty.rust_repr()); + let data_array = if store.is_some() { + format_ident!("storage") + } else { + format_ident!("{}_DATA", chars.load_type.rust_repr().to_uppercase()) + }; + quote!(, #data_array.as_ptr() as #rust_type + #size_in_bytes as #rust_type) + } else { + quote!(, #size_in_bytes.try_into().unwrap()) + } + } else { + quote!() + }; + + let (offsets_load, offsets_arg) = if let Some(ty) = &chars.gather_offsets_type { + // Offsets is a scalable vector of per-element offsets in bytes. We re-use the contiguous + // data for this, then multiply to get indices + let offsets_fn = format_ident!("svindex_{}", ty.base_type().unwrap().acle_notation_repr()); + let size_in_bytes = chars.load_type.get_size().unwrap() / 8; + ( + Some(quote! { + let offsets = #offsets_fn(0, #size_in_bytes.try_into().unwrap()); + }), + quote!(, offsets), + ) + } else { + (None, quote!()) + }; + + let (indices_load, indices_arg) = if let Some(ty) = &chars.gather_indices_type { + // There's no need to multiply indices by the load type width + let base_ty = ty.base_type().unwrap(); + let indices_fn = format_ident!("svindex_{}", base_ty.acle_notation_repr()); + ( + Some(quote! { + let indices = #indices_fn(0, 1); + }), + quote! {, indices}, + ) + } else { + (None, quote!()) + }; + + let ptr = if chars.gather_bases_type.is_some() { + quote!() + } else if chars.is_prf { + quote!(, I64_DATA.as_ptr()) + } else { + quote!(, #data_array.as_ptr()) + }; + + let tuple_len = &chars.tuple_len; + let expecteds = if chars.is_prf { + // No return value for prefetches + vec![] + } else { + (0..*tuple_len) + .map(|i| get_expected_range(i, &chars)) + .collect() + }; + let asserts: Vec<_> = + if *tuple_len > 1 { + let svget = format_ident!("svget{tuple_len}_{acle_type}"); + expecteds.iter().enumerate().map(|(i, expected)| { + quote! (#assert_fn(#svget::<{ #i as i32 }>(loaded), #expected);) + }).collect() + } else { + expecteds + .iter() + .map(|expected| quote! 
(#assert_fn(loaded, #expected);)) + .collect() + }; + + let function = if chars.is_prf { + if fn_name.contains("gather") && fn_name.contains("base") && !fn_name.starts_with("svprf_") + { + // svprf(b|h|w|d)_gather base intrinsics do not have a generic type parameter + quote!(#fn_ident::<{ svprfop::SV_PLDL1KEEP }>) + } else { + quote!(#fn_ident::<{ svprfop::SV_PLDL1KEEP }, i64>) + } + } else { + quote!(#fn_ident) + }; + + let octaword_guard = if chars.replicate_width == Some(256) { + let msg = format!("Skipping {test_name} due to SVE vector length"); + quote! { + if svcntb() < 32 { + println!(#msg); + return; + } + } + } else { + quote!() + }; + + let feats = load.target_features.join(","); + + if let Some(store) = store { + let data_init = if *tuple_len == 1 { + quote!(#(#expecteds)*) + } else { + let create = format_ident!("svcreate{tuple_len}_{acle_type}"); + quote!(#create(#(#expecteds),*)) + }; + let input = store.input.types.first().unwrap().get(0).unwrap(); + let store_type = input + .get(store.test.get_typeset_index().unwrap()) + .and_then(InputType::typekind) + .and_then(TypeKind::base_type) + .unwrap(); + + let store_type = format_ident!("{}", store_type.rust_repr()); + let storage_len = NUM_VECS * VL_MAX_BITS / chars.load_type.get_size()? as usize; + let store_fn = format_ident!("{}", store.signature.fn_name().to_string()); + let load_type = format_ident!("{}", chars.load_type.rust_repr()); + let (store_ptr, store_mut_ptr) = if chars.gather_bases_type.is_none() { + ( + quote!(, storage.as_ptr() as *const #load_type), + quote!(, storage.as_mut_ptr()), + ) + } else { + (quote!(), quote!()) + }; + let args = quote!(#pred_fn() #store_ptr #vnum_arg #bases_arg #offset_arg #index_arg #offsets_arg #indices_arg); + let call = if chars.uses_ffr { + // Doing a normal load first maximises the number of elements our ff/nf test loads + let non_ffr_fn_name = format_ident!( + "{}", + fn_name + .replace("svldff1", "svld1") + .replace("svldnf1", "svld1") + ); + quote! { + svsetffr(); + let _ = #non_ffr_fn_name(#args); + let loaded = #function(#args); + } + } else { + // Note that the FFR must be set for all tests as the assert functions mask against it + quote! { + svsetffr(); + let loaded = #function(#args); + } + }; + + Ok(quote! { + #[simd_test(enable = #feats)] + unsafe fn #test_name() { + #octaword_guard + #length_call + let mut storage = [0 as #store_type; #storage_len]; + let data = #data_init; + #bases_load + #offsets_load + #indices_load + + #store_fn(#pred_fn() #store_mut_ptr #vnum_arg #bases_arg #offset_arg #index_arg #offsets_arg #indices_arg, data); + for (i, &val) in storage.iter().enumerate() { + assert!(val == 0 as #store_type || val == i as #store_type); + } + + #call + #(#asserts)* + + } + }) + } else { + let args = quote!(#pred_fn() #ptr #vnum_arg #bases_arg #offset_arg #index_arg #offsets_arg #indices_arg); + let call = if chars.uses_ffr { + // Doing a normal load first maximises the number of elements our ff/nf test loads + let non_ffr_fn_name = format_ident!( + "{}", + fn_name + .replace("svldff1", "svld1") + .replace("svldnf1", "svld1") + ); + quote! { + svsetffr(); + let _ = #non_ffr_fn_name(#args); + let loaded = #function(#args); + } + } else { + // Note that the FFR must be set for all tests as the assert functions mask against it + quote! { + svsetffr(); + let loaded = #function(#args); + } + }; + Ok(quote! 
{ + #[simd_test(enable = #feats)] + unsafe fn #test_name() { + #octaword_guard + #bases_load + #offsets_load + #indices_load + #call + #length_call + + #(#asserts)* + } + }) + } +} + +/// Assumes chars.ret_type is not None +fn get_expected_range(tuple_idx: usize, chars: &LdIntrCharacteristics) -> proc_macro2::TokenStream { + // vnum=1 + let vnum_adjust = if chars.vnum { quote!(len+) } else { quote!() }; + + let bases_adjust = + (chars.gather_index_type.is_some() || chars.gather_offset_type.is_some()) as usize; + + let tuple_len = chars.tuple_len; + let size = chars + .ret_type + .as_ref() + .and_then(TypeKind::base_type) + .unwrap_or(&chars.load_type) + .get_size() + .unwrap() as usize; + + if chars.replicate_width == Some(128) { + // svld1rq + let ty_rust = format_ident!( + "{}", + chars + .ret_type + .as_ref() + .unwrap() + .base_type() + .unwrap() + .rust_repr() + ); + let args: Vec<_> = (0..(128 / size)).map(|i| quote!(#i as #ty_rust)).collect(); + let dup = format_ident!( + "svdupq_n_{}", + chars.ret_type.as_ref().unwrap().acle_notation_repr() + ); + quote!(#dup(#(#args,)*)) + } else if chars.replicate_width == Some(256) { + // svld1ro - we use two interleaved svdups to create a repeating 256-bit pattern + let ty_rust = format_ident!( + "{}", + chars + .ret_type + .as_ref() + .unwrap() + .base_type() + .unwrap() + .rust_repr() + ); + let ret_acle = chars.ret_type.as_ref().unwrap().acle_notation_repr(); + let args: Vec<_> = (0..(128 / size)).map(|i| quote!(#i as #ty_rust)).collect(); + let args2: Vec<_> = ((128 / size)..(256 / size)) + .map(|i| quote!(#i as #ty_rust)) + .collect(); + let dup = format_ident!("svdupq_n_{ret_acle}"); + let interleave = format_ident!("svtrn1q_{ret_acle}"); + quote!(#interleave(#dup(#(#args,)*), #dup(#(#args2,)*))) + } else { + let start = bases_adjust + tuple_idx; + if chars + .ret_type + .as_ref() + .unwrap() + .base_type() + .unwrap() + .is_float() + { + // Use svcvt to create a linear sequence of floats + let cvt_fn = format_ident!("svcvt_f{size}_s{size}_x"); + let pred_fn = format_ident!("svptrue_b{size}"); + let svindex_fn = format_ident!("svindex_s{size}"); + quote! { #cvt_fn(#pred_fn(), #svindex_fn((#vnum_adjust #start).try_into().unwrap(), #tuple_len.try_into().unwrap()))} + } else { + let ret_acle = chars.ret_type.as_ref().unwrap().acle_notation_repr(); + let svindex = format_ident!("svindex_{ret_acle}"); + quote!(#svindex((#vnum_adjust #start).try_into().unwrap(), #tuple_len.try_into().unwrap())) + } + } +} + +struct LdIntrCharacteristics { + // The data type to load from (not necessarily the data type returned) + load_type: BaseType, + // The data type to return (None for unit) + ret_type: Option, + // The size of tuple to load/store + tuple_len: usize, + // Whether a vnum argument is present + vnum: bool, + // Is the intrinsic first/non-faulting? + uses_ffr: bool, + // Is it a prefetch? 
+ is_prf: bool, + // The size of data loaded with svld1ro/q intrinsics + replicate_width: Option, + // Scalable vector of pointers to load from + gather_bases_type: Option, + // Scalar offset, paired with bases + gather_offset_type: Option, + // Scalar index, paired with bases + gather_index_type: Option, + // Scalable vector of offsets + gather_offsets_type: Option, + // Scalable vector of indices + gather_indices_type: Option, +} + +impl LdIntrCharacteristics { + fn new(intr: &Intrinsic) -> Result { + let input = intr.input.types.first().unwrap().get(0).unwrap(); + let load_type = input + .get(intr.test.get_typeset_index().unwrap()) + .and_then(InputType::typekind) + .and_then(TypeKind::base_type) + .unwrap(); + + let ret_type = intr.signature.return_type.clone(); + + let name = intr.signature.fn_name().to_string(); + let tuple_len = name + .chars() + .find(|c| c.is_numeric()) + .and_then(|c| c.to_digit(10)) + .unwrap_or(1) as usize; + + let uses_ffr = name.starts_with("svldff") || name.starts_with("svldnf"); + + let is_prf = name.starts_with("svprf"); + + let replicate_width = if name.starts_with("svld1ro") { + Some(256) + } else if name.starts_with("svld1rq") { + Some(128) + } else { + None + }; + + let get_ty_of_arg = |name: &str| { + intr.signature + .arguments + .iter() + .find(|a| a.name.to_string() == name) + .map(|a| a.kind.clone()) + }; + + let gather_bases_type = get_ty_of_arg("bases"); + let gather_offset_type = get_ty_of_arg("offset"); + let gather_index_type = get_ty_of_arg("index"); + let gather_offsets_type = get_ty_of_arg("offsets"); + let gather_indices_type = get_ty_of_arg("indices"); + + Ok(LdIntrCharacteristics { + load_type: *load_type, + ret_type, + tuple_len, + vnum: name.contains("vnum"), + uses_ffr, + is_prf, + replicate_width, + gather_bases_type, + gather_offset_type, + gather_index_type, + gather_offsets_type, + gather_indices_type, + }) + } +} + +lazy_static! 
{ + static ref PREAMBLE: String = format!( + r#"#![allow(unused)] + +use super::*; +use std::boxed::Box; +use std::convert::{{TryFrom, TryInto}}; +use std::sync::LazyLock; +use std::vec::Vec; +use stdarch_test::simd_test; + +static F32_DATA: LazyLock<[f32; {LEN_F32} * {NUM_VECS}]> = LazyLock::new(|| {{ + (0..{LEN_F32} * {NUM_VECS}) + .map(|i| i as f32) + .collect::>() + .try_into() + .expect("f32 data incorrectly initialised") +}}); +static F64_DATA: LazyLock<[f64; {LEN_F64} * {NUM_VECS}]> = LazyLock::new(|| {{ + (0..{LEN_F64} * {NUM_VECS}) + .map(|i| i as f64) + .collect::>() + .try_into() + .expect("f64 data incorrectly initialised") +}}); +static I8_DATA: LazyLock<[i8; {LEN_I8} * {NUM_VECS}]> = LazyLock::new(|| {{ + (0..{LEN_I8} * {NUM_VECS}) + .map(|i| ((i + 128) % 256 - 128) as i8) + .collect::>() + .try_into() + .expect("i8 data incorrectly initialised") +}}); +static I16_DATA: LazyLock<[i16; {LEN_I16} * {NUM_VECS}]> = LazyLock::new(|| {{ + (0..{LEN_I16} * {NUM_VECS}) + .map(|i| i as i16) + .collect::>() + .try_into() + .expect("i16 data incorrectly initialised") +}}); +static I32_DATA: LazyLock<[i32; {LEN_I32} * {NUM_VECS}]> = LazyLock::new(|| {{ + (0..{LEN_I32} * {NUM_VECS}) + .map(|i| i as i32) + .collect::>() + .try_into() + .expect("i32 data incorrectly initialised") +}}); +static I64_DATA: LazyLock<[i64; {LEN_I64} * {NUM_VECS}]> = LazyLock::new(|| {{ + (0..{LEN_I64} * {NUM_VECS}) + .map(|i| i as i64) + .collect::>() + .try_into() + .expect("i64 data incorrectly initialised") +}}); +static U8_DATA: LazyLock<[u8; {LEN_U8} * {NUM_VECS}]> = LazyLock::new(|| {{ + (0..{LEN_U8} * {NUM_VECS}) + .map(|i| i as u8) + .collect::>() + .try_into() + .expect("u8 data incorrectly initialised") +}}); +static U16_DATA: LazyLock<[u16; {LEN_U16} * {NUM_VECS}]> = LazyLock::new(|| {{ + (0..{LEN_U16} * {NUM_VECS}) + .map(|i| i as u16) + .collect::>() + .try_into() + .expect("u16 data incorrectly initialised") +}}); +static U32_DATA: LazyLock<[u32; {LEN_U32} * {NUM_VECS}]> = LazyLock::new(|| {{ + (0..{LEN_U32} * {NUM_VECS}) + .map(|i| i as u32) + .collect::>() + .try_into() + .expect("u32 data incorrectly initialised") +}}); +static U64_DATA: LazyLock<[u64; {LEN_U64} * {NUM_VECS}]> = LazyLock::new(|| {{ + (0..{LEN_U64} * {NUM_VECS}) + .map(|i| i as u64) + .collect::>() + .try_into() + .expect("u64 data incorrectly initialised") +}}); + +#[target_feature(enable = "sve")] +fn assert_vector_matches_f32(vector: svfloat32_t, expected: svfloat32_t) {{ + let defined = svrdffr(); + assert!(svptest_first(svptrue_b32(), defined)); + let cmp = svcmpne_f32(defined, vector, expected); + assert!(!svptest_any(defined, cmp)) +}} + +#[target_feature(enable = "sve")] +fn assert_vector_matches_f64(vector: svfloat64_t, expected: svfloat64_t) {{ + let defined = svrdffr(); + assert!(svptest_first(svptrue_b64(), defined)); + let cmp = svcmpne_f64(defined, vector, expected); + assert!(!svptest_any(defined, cmp)) +}} + +#[target_feature(enable = "sve")] +fn assert_vector_matches_i8(vector: svint8_t, expected: svint8_t) {{ + let defined = svrdffr(); + assert!(svptest_first(svptrue_b8(), defined)); + let cmp = svcmpne_s8(defined, vector, expected); + assert!(!svptest_any(defined, cmp)) +}} + +#[target_feature(enable = "sve")] +fn assert_vector_matches_i16(vector: svint16_t, expected: svint16_t) {{ + let defined = svrdffr(); + assert!(svptest_first(svptrue_b16(), defined)); + let cmp = svcmpne_s16(defined, vector, expected); + assert!(!svptest_any(defined, cmp)) +}} + +#[target_feature(enable = "sve")] +fn 
assert_vector_matches_i32(vector: svint32_t, expected: svint32_t) {{ + let defined = svrdffr(); + assert!(svptest_first(svptrue_b32(), defined)); + let cmp = svcmpne_s32(defined, vector, expected); + assert!(!svptest_any(defined, cmp)) +}} + +#[target_feature(enable = "sve")] +fn assert_vector_matches_i64(vector: svint64_t, expected: svint64_t) {{ + let defined = svrdffr(); + assert!(svptest_first(svptrue_b64(), defined)); + let cmp = svcmpne_s64(defined, vector, expected); + assert!(!svptest_any(defined, cmp)) +}} + +#[target_feature(enable = "sve")] +fn assert_vector_matches_u8(vector: svuint8_t, expected: svuint8_t) {{ + let defined = svrdffr(); + assert!(svptest_first(svptrue_b8(), defined)); + let cmp = svcmpne_u8(defined, vector, expected); + assert!(!svptest_any(defined, cmp)) +}} + +#[target_feature(enable = "sve")] +fn assert_vector_matches_u16(vector: svuint16_t, expected: svuint16_t) {{ + let defined = svrdffr(); + assert!(svptest_first(svptrue_b16(), defined)); + let cmp = svcmpne_u16(defined, vector, expected); + assert!(!svptest_any(defined, cmp)) +}} + +#[target_feature(enable = "sve")] +fn assert_vector_matches_u32(vector: svuint32_t, expected: svuint32_t) {{ + let defined = svrdffr(); + assert!(svptest_first(svptrue_b32(), defined)); + let cmp = svcmpne_u32(defined, vector, expected); + assert!(!svptest_any(defined, cmp)) +}} + +#[target_feature(enable = "sve")] +fn assert_vector_matches_u64(vector: svuint64_t, expected: svuint64_t) {{ + let defined = svrdffr(); + assert!(svptest_first(svptrue_b64(), defined)); + let cmp = svcmpne_u64(defined, vector, expected); + assert!(!svptest_any(defined, cmp)) +}} +"# + ); +} + +lazy_static! { + static ref MANUAL_TESTS: String = format!( + "#[simd_test(enable = \"sve\")] +unsafe fn test_ffr() {{ + svsetffr(); + let ffr = svrdffr(); + assert_vector_matches_u8(svdup_n_u8_z(ffr, 1), svindex_u8(1, 0)); + let pred = svdupq_n_b8(true, false, true, false, true, false, true, false, + true, false, true, false, true, false, true, false); + svwrffr(pred); + let ffr = svrdffr_z(svptrue_b8()); + assert_vector_matches_u8(svdup_n_u8_z(ffr, 1), svdup_n_u8_z(pred, 1)); +}} +" + ); +} diff --git a/library/stdarch/crates/stdarch-gen-arm/src/main.rs b/library/stdarch/crates/stdarch-gen-arm/src/main.rs new file mode 100644 index 0000000000000..9bf7d0981deb9 --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/src/main.rs @@ -0,0 +1,311 @@ +#![feature(pattern)] + +mod assert_instr; +mod big_endian; +mod context; +mod expression; +mod fn_suffix; +mod input; +mod intrinsic; +mod load_store_tests; +mod matching; +mod predicate_forms; +mod typekinds; +mod wildcards; +mod wildstring; + +use intrinsic::Test; +use itertools::Itertools; +use quote::quote; +use std::fs::File; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; +use walkdir::WalkDir; + +fn main() -> Result<(), String> { + parse_args() + .into_iter() + .map(|(filepath, out)| { + File::open(&filepath) + .map(|f| (f, filepath, out)) + .map_err(|e| format!("could not read input file: {e}")) + }) + .map(|res| { + let (file, filepath, out) = res?; + serde_yaml::from_reader(file) + .map(|input: input::GeneratorInput| (input, filepath, out)) + .map_err(|e| format!("could not parse input file: {e}")) + }) + .collect::, _>>()? 
+ .into_iter() + .map(|(input, filepath, out)| { + let intrinsics = input.intrinsics.into_iter() + .map(|intrinsic| { + intrinsic.generate_variants(&input.ctx) + }) + .try_collect() + .map(|mut vv: Vec<_>| { + vv.sort_by_cached_key(|variants| { + variants.first().map_or_else(String::default, |variant| { + variant.signature.fn_name().to_string() + }) + }); + vv.into_iter().flatten().collect_vec() + })?; + + if filepath.ends_with("sve.spec.yml") || filepath.ends_with("sve2.spec.yml") { + let loads = intrinsics.iter() + .filter_map(|i| { + if matches!(i.test, Test::Load(..)) { + Some(i.clone()) + } else { + None + } + }).collect(); + let stores = intrinsics.iter() + .filter_map(|i| { + if matches!(i.test, Test::Store(..)) { + Some(i.clone()) + } else { + None + } + }).collect(); + load_store_tests::generate_load_store_tests(loads, stores, out.as_ref().map(|o| make_tests_filepath(&filepath, o)).as_ref())?; + } + + Ok(( + input::GeneratorInput { + intrinsics, + ctx: input.ctx, + }, + filepath, + out, + )) + }) + .try_for_each( + |result: context::Result<(input::GeneratorInput, PathBuf, Option)>| -> context::Result { + let (generated, filepath, out) = result?; + + let w = match out { + Some(out) => Box::new( + File::create(make_output_filepath(&filepath, &out)) + .map_err(|e| format!("could not create output file: {e}"))?, + ) as Box, + None => Box::new(std::io::stdout()) as Box, + }; + + generate_file(generated, w) + .map_err(|e| format!("could not generate output file: {e}")) + }, + ) +} + +fn parse_args() -> Vec<(PathBuf, Option)> { + let mut args_it = std::env::args().skip(1); + assert!( + 1 <= args_it.len() && args_it.len() <= 2, + "Usage: cargo run -p stdarch-gen-arm -- INPUT_DIR [OUTPUT_DIR]\n\ + where:\n\ + - INPUT_DIR contains a tree like: INPUT_DIR//.spec.yml\n\ + - OUTPUT_DIR is a directory like: crates/core_arch/src/" + ); + + let in_path = Path::new(args_it.next().unwrap().as_str()).to_path_buf(); + assert!( + in_path.exists() && in_path.is_dir(), + "invalid path {in_path:#?} given" + ); + + let out_dir = if let Some(dir) = args_it.next() { + let out_path = Path::new(dir.as_str()).to_path_buf(); + assert!( + out_path.exists() && out_path.is_dir(), + "invalid path {out_path:#?} given" + ); + Some(out_path) + } else { + std::env::current_exe() + .map(|mut f| { + f.pop(); + f.push("../../crates/core_arch/src/"); + f.exists().then_some(f) + }) + .ok() + .flatten() + }; + + WalkDir::new(in_path) + .into_iter() + .filter_map(Result::ok) + .filter(|f| f.file_type().is_file()) + .map(|f| (f.into_path(), out_dir.clone())) + .collect() +} + +fn generate_file( + generated_input: input::GeneratorInput, + mut out: Box, +) -> std::io::Result<()> { + write!( + out, + r#"// This code is automatically generated. DO NOT MODIFY. +// +// Instead, modify `crates/stdarch-gen-arm/spec/` and run the following command to re-generate this file: +// +// ``` +// cargo run --bin=stdarch-gen-arm -- crates/stdarch-gen-arm/spec +// ``` +#![allow(improper_ctypes)] + +#[cfg(test)] +use stdarch_test::assert_instr; + +use super::*;{uses_neon} + +"#, + uses_neon = if generated_input.ctx.uses_neon_types { + "\nuse crate::core_arch::arch::aarch64::*;" + } else { + "" + }, + )?; + let intrinsics = generated_input.intrinsics; + format_code(out, quote! 
{ #(#intrinsics)* })?; + Ok(()) +} + +pub fn format_code( + mut output: impl std::io::Write, + input: impl std::fmt::Display, +) -> std::io::Result<()> { + let proc = Command::new("rustfmt") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn()?; + write!(proc.stdin.as_ref().unwrap(), "{input}")?; + output.write_all(proc.wait_with_output()?.stdout.as_slice()) +} + +/// Derive an output file path from an input file path and an output directory. +/// +/// `in_filepath` is expected to have a structure like: +/// ...//.spec.yml +/// +/// The resulting output path will have a structure like: +/// ///generated.rs +/// +/// Panics if the resulting name is empty, or if file_name() is not UTF-8. +fn make_output_filepath(in_filepath: &Path, out_dirpath: &Path) -> PathBuf { + make_filepath(in_filepath, out_dirpath, |_name: &str| { + "generated.rs".to_owned() + }) +} + +fn make_tests_filepath(in_filepath: &Path, out_dirpath: &Path) -> PathBuf { + make_filepath(in_filepath, out_dirpath, |name: &str| { + format!("ld_st_tests_{name}.rs") + }) +} + +fn make_filepath String>( + in_filepath: &Path, + out_dirpath: &Path, + name_formatter: F, +) -> PathBuf { + let mut parts = in_filepath.components().rev().map(|f| { + f.as_os_str() + .to_str() + .expect("Inputs must have valid, UTF-8 file_name()") + }); + let yml = parts.next().expect("Not enough input path elements."); + let feature = parts.next().expect("Not enough input path elements."); + + let arch = yml + .strip_suffix(".yml") + .expect("Expected .yml file input.") + .strip_suffix(".spec") + .expect("Expected .spec.yml file input."); + if arch.is_empty() { + panic!("Extended ARCH.spec.yml file input."); + } + + let mut output = out_dirpath.to_path_buf(); + output.push(arch); + output.push(feature); + output.push(name_formatter(arch)); + output +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn infer_output_file() { + macro_rules! t { + ($src:expr, $outdir:expr, $dst:expr, $ldst:expr) => { + let src: PathBuf = $src.iter().collect(); + let outdir: PathBuf = $outdir.iter().collect(); + let dst: PathBuf = $dst.iter().collect(); + let ldst: PathBuf = $ldst.iter().collect(); + assert_eq!(make_output_filepath(&src, &outdir), dst); + assert_eq!(make_tests_filepath(&src, &outdir), ldst); + }; + } + // Documented usage. + t!( + ["FEAT", "ARCH.spec.yml"], + [""], + ["ARCH", "FEAT", "generated.rs"], + ["ARCH", "FEAT", "ld_st_tests_ARCH.rs"] + ); + t!( + ["x", "y", "FEAT", "ARCH.spec.yml"], + ["out"], + ["out", "ARCH", "FEAT", "generated.rs"], + ["out", "ARCH", "FEAT", "ld_st_tests_ARCH.rs"] + ); + t!( + ["p", "q", "FEAT", "ARCH.spec.yml"], + ["a", "b"], + ["a", "b", "ARCH", "FEAT", "generated.rs"], + ["a", "b", "ARCH", "FEAT", "ld_st_tests_ARCH.rs"] + ); + // Extra extensions get treated as part of the stem. 
+ t!( + ["FEAT", "ARCH.variant.spec.yml"], + ["out"], + ["out", "ARCH.variant", "FEAT", "generated.rs"], + ["out", "ARCH.variant", "FEAT", "ld_st_tests_ARCH.variant.rs"] + ); + } + + #[test] + #[should_panic] + fn infer_output_file_no_stem() { + let src = PathBuf::from("FEAT/.spec.yml"); + make_output_filepath(&src, Path::new("")); + } + + #[test] + #[should_panic] + fn infer_output_file_no_feat() { + let src = PathBuf::from("ARCH.spec.yml"); + make_output_filepath(&src, Path::new("")); + } + + #[test] + #[should_panic] + fn infer_output_file_ldst_no_stem() { + let src = PathBuf::from("FEAT/.spec.yml"); + make_tests_filepath(&src, Path::new("")); + } + + #[test] + #[should_panic] + fn infer_output_file_ldst_no_feat() { + let src = PathBuf::from("ARCH.spec.yml"); + make_tests_filepath(&src, Path::new("")); + } +} diff --git a/library/stdarch/crates/stdarch-gen-arm/src/matching.rs b/library/stdarch/crates/stdarch-gen-arm/src/matching.rs new file mode 100644 index 0000000000000..0c48062042827 --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/src/matching.rs @@ -0,0 +1,170 @@ +use proc_macro2::TokenStream; +use quote::ToTokens; +use serde::{Deserialize, Serialize}; +use std::fmt; + +use crate::context::{self, LocalContext}; +use crate::typekinds::{BaseType, BaseTypeKind, TypeKind}; + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct MatchSizeValues { + pub default: T, + pub byte: Option, + pub halfword: Option, + pub doubleword: Option, +} + +impl MatchSizeValues { + pub fn get(&mut self, ty: &TypeKind, ctx: &LocalContext) -> context::Result<&T> { + let base_ty = if let Some(w) = ty.wildcard() { + ctx.provide_type_wildcard(w)? + } else { + ty.clone() + }; + + if let BaseType::Sized(_, bitsize) = base_ty.base_type().unwrap() { + match (bitsize, &self.byte, &self.halfword, &self.doubleword) { + (64, _, _, Some(v)) | (16, _, Some(v), _) | (8, Some(v), _, _) => Ok(v), + _ => Ok(&self.default), + } + } else { + Err(format!("cannot match bitsize to unsized type {ty:?}!")) + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct MatchKindValues { + pub default: T, + pub float: Option, + pub unsigned: Option, +} + +impl MatchKindValues { + pub fn get(&mut self, ty: &TypeKind, ctx: &LocalContext) -> context::Result<&T> { + let base_ty = if let Some(w) = ty.wildcard() { + ctx.provide_type_wildcard(w)? + } else { + ty.clone() + }; + + match ( + base_ty.base_type().unwrap().kind(), + &self.float, + &self.unsigned, + ) { + (BaseTypeKind::Float, Some(v), _) | (BaseTypeKind::UInt, _, Some(v)) => Ok(v), + _ => Ok(&self.default), + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged, deny_unknown_fields)] +pub enum SizeMatchable { + Matched(T), + Unmatched { + match_size: Option, + #[serde(flatten)] + values: MatchSizeValues>, + }, +} + +impl SizeMatchable { + pub fn perform_match(&mut self, ctx: &LocalContext) -> context::Result { + match self { + Self::Unmatched { + match_size: None, + values: MatchSizeValues { default, .. 
}, + } => *self = Self::Matched(*default.to_owned()), + Self::Unmatched { + match_size: Some(ty), + values, + } => *self = Self::Matched(*values.get(ty, ctx)?.to_owned()), + _ => {} + } + Ok(()) + } +} + +impl AsRef for SizeMatchable { + fn as_ref(&self) -> &T { + if let SizeMatchable::Matched(v) = self { + v + } else { + panic!("no match for {self:?} was performed"); + } + } +} + +impl AsMut for SizeMatchable { + fn as_mut(&mut self) -> &mut T { + if let SizeMatchable::Matched(v) = self { + v + } else { + panic!("no match for {self:?} was performed"); + } + } +} + +impl ToTokens for SizeMatchable { + fn to_tokens(&self, tokens: &mut TokenStream) { + self.as_ref().to_tokens(tokens) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged, deny_unknown_fields)] +pub enum KindMatchable { + Matched(T), + Unmatched { + match_kind: Option, + #[serde(flatten)] + values: MatchKindValues>, + }, +} + +impl KindMatchable { + pub fn perform_match(&mut self, ctx: &LocalContext) -> context::Result { + match self { + Self::Unmatched { + match_kind: None, + values: MatchKindValues { default, .. }, + } => *self = Self::Matched(*default.to_owned()), + Self::Unmatched { + match_kind: Some(ty), + values, + } => *self = Self::Matched(*values.get(ty, ctx)?.to_owned()), + _ => {} + } + Ok(()) + } +} + +impl AsRef for KindMatchable { + fn as_ref(&self) -> &T { + if let KindMatchable::Matched(v) = self { + v + } else { + panic!("no match for {self:?} was performed"); + } + } +} + +impl AsMut for KindMatchable { + fn as_mut(&mut self) -> &mut T { + if let KindMatchable::Matched(v) = self { + v + } else { + panic!("no match for {self:?} was performed"); + } + } +} + +impl ToTokens for KindMatchable { + fn to_tokens(&self, tokens: &mut TokenStream) { + self.as_ref().to_tokens(tokens) + } +} diff --git a/library/stdarch/crates/stdarch-gen-arm/src/predicate_forms.rs b/library/stdarch/crates/stdarch-gen-arm/src/predicate_forms.rs new file mode 100644 index 0000000000000..02789bf7eb0b7 --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/src/predicate_forms.rs @@ -0,0 +1,249 @@ +use serde::{Deserialize, Serialize}; +use serde_with::{DeserializeFromStr, SerializeDisplay}; +use std::fmt; +use std::str::FromStr; + +use crate::context; +use crate::expression::{Expression, FnCall, IdentifierType}; +use crate::intrinsic::Intrinsic; +use crate::typekinds::{ToRepr, TypeKind}; +use crate::wildcards::Wildcard; +use crate::wildstring::WildString; + +const ZEROING_SUFFIX: &str = "_z"; +const MERGING_SUFFIX: &str = "_m"; +const DONT_CARE_SUFFIX: &str = "_x"; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(untagged)] +pub enum ZeroingMethod { + /// Drop the specified argument and replace it with a zeroinitializer + Drop { drop: WildString }, + /// Apply zero selection to the specified variable when zeroing + Select { select: WildString }, +} + +impl PartialOrd for ZeroingMethod { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for ZeroingMethod { + fn cmp(&self, _: &Self) -> std::cmp::Ordering { + std::cmp::Ordering::Equal + } +} + +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +pub enum DontCareMethod { + #[default] + Inferred, + AsZeroing, + AsMerging, +} + +#[derive(Debug, Clone, Default, PartialEq, Eq, Deserialize, Serialize)] +pub struct PredicationMethods { + /// Zeroing method, if the zeroing predicate form is used + #[serde(default)] + pub zeroing_method: Option, + /// Don't care 
method, if the don't care predicate form is used + #[serde(default)] + pub dont_care_method: DontCareMethod, +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +pub enum PredicateForm { + /// Enables merging predicate form + Merging, + /// Enables "don't care" predicate form. + DontCare(DontCareMethod), + /// Enables zeroing predicate form. If LLVM zeroselection is performed, then + /// set the `select` field to the variable that gets set. Otherwise set the + /// `drop` field if the zeroinitializer replaces a predicate when merging. + Zeroing(ZeroingMethod), +} + +impl PredicateForm { + pub fn get_suffix(&self) -> &'static str { + match self { + PredicateForm::Zeroing { .. } => ZEROING_SUFFIX, + PredicateForm::Merging => MERGING_SUFFIX, + PredicateForm::DontCare { .. } => DONT_CARE_SUFFIX, + } + } + + pub fn make_zeroinitializer(ty: &TypeKind) -> Expression { + FnCall::new_expression( + format!("svdup_n_{}", ty.acle_notation_repr()) + .parse() + .unwrap(), + vec![if ty.base_type().unwrap().is_float() { + Expression::FloatConstant(0.0) + } else { + Expression::IntConstant(0) + }], + ) + } + + pub fn make_zeroselector(pg_var: WildString, op_var: WildString, ty: &TypeKind) -> Expression { + FnCall::new_expression( + format!("svsel_{}", ty.acle_notation_repr()) + .parse() + .unwrap(), + vec![ + Expression::Identifier(pg_var, IdentifierType::Variable), + Expression::Identifier(op_var, IdentifierType::Variable), + Self::make_zeroinitializer(ty), + ], + ) + } + + pub fn post_build(&self, intrinsic: &mut Intrinsic) -> context::Result { + // Drop the argument + match self { + PredicateForm::Zeroing(ZeroingMethod::Drop { drop: drop_var }) => { + intrinsic.signature.drop_argument(drop_var)? + } + PredicateForm::DontCare(DontCareMethod::AsZeroing) => { + if let ZeroingMethod::Drop { drop } = intrinsic + .input + .predication_methods + .zeroing_method + .to_owned() + .ok_or_else(|| { + "DontCareMethod::AsZeroing without zeroing method.".to_string() + })? + { + intrinsic.signature.drop_argument(&drop)? + } + } + _ => {} + } + + Ok(()) + } + + fn infer_dont_care(mask: &PredicationMask, methods: &PredicationMethods) -> PredicateForm { + let method = if methods.dont_care_method == DontCareMethod::Inferred { + if mask.has_zeroing() + && matches!(methods.zeroing_method, Some(ZeroingMethod::Drop { .. })) + { + DontCareMethod::AsZeroing + } else { + DontCareMethod::AsMerging + } + } else { + methods.dont_care_method + }; + + PredicateForm::DontCare(method) + } + + pub fn compile_list( + mask: &PredicationMask, + methods: &PredicationMethods, + ) -> context::Result> { + let mut forms = Vec::new(); + + if mask.has_merging() { + forms.push(PredicateForm::Merging) + } + + if mask.has_dont_care() { + forms.push(Self::infer_dont_care(mask, methods)) + } + + if mask.has_zeroing() { + if let Some(method) = methods.zeroing_method.to_owned() { + forms.push(PredicateForm::Zeroing(method)) + } else { + return Err( + "cannot create a zeroing variant without a zeroing method specified!" 
+ .to_string(), + ); + } + } + + Ok(forms) + } +} + +#[derive( + Debug, Clone, Copy, Default, PartialEq, Eq, Hash, DeserializeFromStr, SerializeDisplay, +)] +pub struct PredicationMask { + /// Merging + m: bool, + /// Don't care + x: bool, + /// Zeroing + z: bool, +} + +impl PredicationMask { + pub fn has_merging(&self) -> bool { + self.m + } + + pub fn has_dont_care(&self) -> bool { + self.x + } + + pub fn has_zeroing(&self) -> bool { + self.z + } +} + +impl FromStr for PredicationMask { + type Err = String; + + fn from_str(s: &str) -> Result { + let mut result = Self::default(); + for kind in s.bytes() { + match kind { + b'm' => result.m = true, + b'x' => result.x = true, + b'z' => result.z = true, + _ => { + return Err(format!( + "unknown predicate form modifier: {}", + char::from(kind) + )); + } + } + } + + if result.m || result.x || result.z { + Ok(result) + } else { + Err("invalid predication mask".to_string()) + } + } +} + +impl fmt::Display for PredicationMask { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.m.then(|| write!(f, "m")).transpose()?; + self.x.then(|| write!(f, "x")).transpose()?; + self.z.then(|| write!(f, "z")).transpose().map(|_| ()) + } +} + +impl TryFrom<&WildString> for PredicationMask { + type Error = String; + + fn try_from(value: &WildString) -> Result { + value + .wildcards() + .find_map(|w| { + if let Wildcard::PredicateForms(mask) = w { + Some(*mask) + } else { + None + } + }) + .ok_or_else(|| "no predicate forms were specified in the name".to_string()) + } +} diff --git a/library/stdarch/crates/stdarch-gen-arm/src/typekinds.rs b/library/stdarch/crates/stdarch-gen-arm/src/typekinds.rs new file mode 100644 index 0000000000000..7c697cb7c0c43 --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/src/typekinds.rs @@ -0,0 +1,1051 @@ +use lazy_static::lazy_static; +use proc_macro2::TokenStream; +use quote::{ToTokens, TokenStreamExt, quote}; +use regex::Regex; +use serde_with::{DeserializeFromStr, SerializeDisplay}; +use std::fmt; +use std::str::FromStr; + +use crate::context; +use crate::expression::{Expression, FnCall}; +use crate::intrinsic::AccessLevel; +use crate::wildcards::Wildcard; + +const VECTOR_FULL_REGISTER_SIZE: u32 = 128; +const VECTOR_HALF_REGISTER_SIZE: u32 = VECTOR_FULL_REGISTER_SIZE / 2; + +#[derive(Debug, Clone, Copy)] +pub enum TypeRepr { + C, + Rust, + LLVMMachine, + ACLENotation, + Size, + SizeLiteral, + TypeKind, + SizeInBytesLog2, +} + +pub trait ToRepr { + fn repr(&self, repr: TypeRepr) -> String; + + fn c_repr(&self) -> String { + self.repr(TypeRepr::C) + } + + fn rust_repr(&self) -> String { + self.repr(TypeRepr::Rust) + } + + fn llvm_machine_repr(&self) -> String { + self.repr(TypeRepr::LLVMMachine) + } + + fn acle_notation_repr(&self) -> String { + self.repr(TypeRepr::ACLENotation) + } + + fn size(&self) -> String { + self.repr(TypeRepr::Size) + } + + fn size_literal(&self) -> String { + self.repr(TypeRepr::SizeLiteral) + } + + fn type_kind(&self) -> String { + self.repr(TypeRepr::TypeKind) + } + + fn size_in_bytes_log2(&self) -> String { + self.repr(TypeRepr::SizeInBytesLog2) + } +} + +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] +pub struct TypeKindOptions { + f: bool, + s: bool, + u: bool, + p: bool, +} + +impl TypeKindOptions { + pub fn contains(&self, kind: BaseTypeKind) -> bool { + match kind { + BaseTypeKind::Float => self.f, + BaseTypeKind::Int => self.s, + BaseTypeKind::UInt => self.u, + BaseTypeKind::Poly => self.p, + BaseTypeKind::Bool => false, + } + } +} + +impl FromStr for 
TypeKindOptions { + type Err = String; + + fn from_str(s: &str) -> Result { + let mut result = Self::default(); + for kind in s.bytes() { + match kind { + b'f' => result.f = true, + b's' => result.s = true, + b'u' => result.u = true, + b'p' => result.p = true, + _ => { + return Err(format!("unknown type kind: {}", char::from(kind))); + } + } + } + Ok(result) + } +} + +impl fmt::Display for TypeKindOptions { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.f.then(|| write!(f, "f")).transpose()?; + self.s.then(|| write!(f, "s")).transpose()?; + self.u.then(|| write!(f, "u")).transpose().map(|_| ()) + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum BaseTypeKind { + Float, + Int, + UInt, + Bool, + Poly, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum BaseType { + Sized(BaseTypeKind, u32), + Unsized(BaseTypeKind), +} + +#[derive( + Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, SerializeDisplay, DeserializeFromStr, +)] +pub enum VectorTupleSize { + Two, + Three, + Four, +} + +impl VectorTupleSize { + pub fn to_int(self) -> u32 { + match self { + Self::Two => 2, + Self::Three => 3, + Self::Four => 4, + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct VectorType { + base_type: BaseType, + lanes: u32, + is_scalable: bool, + tuple_size: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, SerializeDisplay, DeserializeFromStr)] +pub enum TypeKind { + Vector(VectorType), + Base(BaseType), + Pointer(Box, AccessLevel), + Custom(String), + Wildcard(Wildcard), +} + +impl TypeKind { + pub fn base_type(&self) -> Option<&BaseType> { + match self { + Self::Vector(t) => Some(t.base_type()), + Self::Pointer(t, _) => t.base_type(), + Self::Base(t) => Some(t), + Self::Wildcard(..) => None, + Self::Custom(..) => None, + } + } + + pub fn base_type_mut(&mut self) -> Option<&mut BaseType> { + match self { + Self::Vector(t) => Some(t.base_type_mut()), + Self::Pointer(t, _) => t.base_type_mut(), + Self::Base(t) => Some(t), + Self::Wildcard(..) => None, + Self::Custom(..) => None, + } + } + + pub fn populate_wildcard(&mut self, type_kind: TypeKind) -> context::Result { + match self { + Self::Wildcard(..) => *self = type_kind, + Self::Pointer(t, _) => t.populate_wildcard(type_kind)?, + _ => return Err("no wildcard available to populate".to_string()), + } + Ok(()) + } + + pub fn base(&self) -> Option<&BaseType> { + match self { + Self::Base(ty) => Some(ty), + Self::Pointer(tk, _) => tk.base(), + Self::Vector(ty) => Some(&ty.base_type), + _ => None, + } + } + + pub fn vector(&self) -> Option<&VectorType> { + match self { + Self::Vector(ty) => Some(ty), + _ => None, + } + } + + pub fn vector_mut(&mut self) -> Option<&mut VectorType> { + match self { + Self::Vector(ty) => Some(ty), + _ => None, + } + } + + pub fn wildcard(&self) -> Option<&Wildcard> { + match self { + Self::Wildcard(w) => Some(w), + Self::Pointer(w, _) => w.wildcard(), + _ => None, + } + } + + pub fn make_predicate_from(ty: &TypeKind) -> context::Result { + Ok(TypeKind::Vector(VectorType::make_predicate_from_bitsize( + ty.base_type() + .ok_or_else(|| format!("cannot infer predicate from type {ty}"))? 
+ .get_size() + .map_err(|_| format!("cannot infer predicate from unsized type {ty}"))?, + ))) + } + + pub fn make_vector( + from: TypeKind, + is_scalable: bool, + tuple_size: Option, + ) -> context::Result { + from.base().cloned().map_or_else( + || Err(format!("cannot make a vector type out of {from}!")), + |base| { + let vt = VectorType::make_from_base(base, is_scalable, tuple_size); + Ok(TypeKind::Vector(vt)) + }, + ) + } + + /// Return a new expression that converts the provided `expr` from type `other` to `self`. + /// + /// Conversions are bitwise over the whole value, like `transmute`, though `transmute` + /// itself is only used as a last resort. + /// + /// This can fail (returning `None`) due to incompatible types, and many conversions are simply + /// unimplemented. + pub fn express_reinterpretation_from( + &self, + other: &TypeKind, + expr: impl Into, + ) -> Option { + if self == other { + Some(expr.into()) + } else if let (Some(self_vty), Some(other_vty)) = (self.vector(), other.vector()) { + if self_vty.is_scalable + && self_vty.tuple_size.is_none() + && other_vty.is_scalable + && other_vty.tuple_size.is_none() + { + // Plain scalable vectors. + use BaseTypeKind::*; + match (self_vty.base_type, other_vty.base_type) { + (BaseType::Sized(Int, self_size), BaseType::Sized(UInt, other_size)) + if self_size == other_size => + { + Some(Expression::MethodCall( + Box::new(expr.into()), + "as_signed".parse().unwrap(), + vec![], + )) + } + (BaseType::Sized(UInt, self_size), BaseType::Sized(Int, other_size)) + if self_size == other_size => + { + Some(Expression::MethodCall( + Box::new(expr.into()), + "as_unsigned".parse().unwrap(), + vec![], + )) + } + ( + BaseType::Sized(Float | Int | UInt, _), + BaseType::Sized(Float | Int | UInt, _), + ) => Some(FnCall::new_expression( + // Conversions between float and (u)int, or where the lane size changes. + "simd_reinterpret".parse().unwrap(), + vec![expr.into()], + )), + _ => None, + } + } else { + // Tuples and fixed-width vectors. + None + } + } else { + // Scalar types. + None + } + } +} + +impl FromStr for TypeKind { + type Err = String; + + fn from_str(s: &str) -> Result { + Ok(match s { + s if s.starts_with('{') && s.ends_with('}') => { + Self::Wildcard(s[1..s.len() - 1].trim().parse()?) 
+ } + s if s.starts_with('*') => { + let mut split = s[1..].split_whitespace(); + let (ty, rw) = match (split.clone().count(), split.next(), split.next()) { + (2, Some("mut"), Some(ty)) => (ty, AccessLevel::RW), + (2, Some("const"), Some(ty)) => (ty, AccessLevel::R), + (1, Some(ty), None) => (ty, AccessLevel::R), + _ => return Err(format!("invalid pointer type {s:#?} given")), + }; + Self::Pointer(Box::new(ty.parse()?), rw) + } + _ => s + .parse::() + .map(TypeKind::Vector) + .or_else(|_| s.parse::().map(TypeKind::Base)) + .unwrap_or_else(|_| TypeKind::Custom(s.to_string())), + }) + } +} + +impl fmt::Display for TypeKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Vector(ty) => write!(f, "{ty}"), + Self::Pointer(ty, _) => write!(f, "{ty}"), + Self::Base(ty) => write!(f, "{ty}"), + Self::Wildcard(w) => write!(f, "{{{w}}}"), + Self::Custom(s) => write!(f, "{s}"), + } + } +} + +impl ToRepr for TypeKind { + fn repr(&self, repr: TypeRepr) -> String { + match self { + Self::Vector(ty) => ty.repr(repr), + Self::Pointer(ty, _) => ty.repr(repr), + Self::Base(ty) => ty.repr(repr), + Self::Wildcard(w) => format!("{w}"), + Self::Custom(s) => s.to_string(), + } + } +} + +impl ToTokens for TypeKind { + fn to_tokens(&self, tokens: &mut TokenStream) { + if let Self::Pointer(_, rw) = self { + tokens.append_all(match rw { + AccessLevel::RW => quote! { *mut }, + AccessLevel::R => quote! { *const }, + }) + } + + tokens.append_all( + self.to_string() + .parse::() + .expect("invalid syntax"), + ) + } +} + +impl PartialOrd for TypeKind { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl From<&TypeKind> for usize { + fn from(ty: &TypeKind) -> Self { + match ty { + TypeKind::Base(_) => 1, + TypeKind::Pointer(_, _) => 2, + TypeKind::Vector(_) => 3, + TypeKind::Custom(_) => 4, + TypeKind::Wildcard(_) => 5, + } + } +} + +impl Ord for TypeKind { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use std::cmp::Ordering::*; + + let self_int: usize = self.into(); + let other_int: usize = other.into(); + + if self_int == other_int { + match (self, other) { + (TypeKind::Base(ty1), TypeKind::Base(ty2)) => ty1.cmp(ty2), + (TypeKind::Pointer(ty1, _), TypeKind::Pointer(ty2, _)) => ty1.cmp(ty2), + (TypeKind::Vector(vt1), TypeKind::Vector(vt2)) => vt1.cmp(vt2), + (TypeKind::Custom(s1), TypeKind::Custom(s2)) => s1.cmp(s2), + (TypeKind::Wildcard(..), TypeKind::Wildcard(..)) => Equal, + _ => unreachable!(), + } + } else { + self_int.cmp(&other_int) + } + } +} + +impl VectorType { + pub fn base_type(&self) -> &BaseType { + &self.base_type + } + + pub fn base_type_mut(&mut self) -> &mut BaseType { + &mut self.base_type + } + + fn sanitise_lanes( + mut base_type: BaseType, + lanes: Option, + ) -> Result<(BaseType, u32), String> { + let lanes = match (base_type, lanes) { + (BaseType::Sized(BaseTypeKind::Bool, lanes), None) => { + base_type = BaseType::Sized(BaseTypeKind::Bool, VECTOR_FULL_REGISTER_SIZE / lanes); + lanes + } + (BaseType::Unsized(BaseTypeKind::Bool), None) => { + base_type = BaseType::Sized(BaseTypeKind::Bool, 8); + 16 + } + (BaseType::Sized(_, size), None) => VECTOR_FULL_REGISTER_SIZE / size, + (BaseType::Sized(_, size), Some(lanes)) => match size * lanes { + VECTOR_FULL_REGISTER_SIZE | VECTOR_HALF_REGISTER_SIZE => lanes, + _ => return Err("invalid number of lanes".to_string()), + }, + _ => return Err("cannot infer number of lanes".to_string()), + }; + + Ok((base_type, lanes)) + } + + pub fn make_from_base( + base_ty: BaseType, + 
is_scalable: bool, + tuple_size: Option, + ) -> VectorType { + #[allow(clippy::collapsible_if)] + if is_scalable { + if let BaseType::Sized(BaseTypeKind::Bool, size) = base_ty { + return Self::make_predicate_from_bitsize(size); + } + } + + let (base_type, lanes) = Self::sanitise_lanes(base_ty, None).unwrap(); + + VectorType { + base_type, + lanes, + is_scalable, + tuple_size, + } + } + + pub fn make_predicate_from_bitsize(size: u32) -> VectorType { + VectorType { + base_type: BaseType::Sized(BaseTypeKind::Bool, size), + lanes: (VECTOR_FULL_REGISTER_SIZE / size), + is_scalable: true, + tuple_size: None, + } + } + + pub fn cast_base_type_as(&mut self, ty: BaseType) { + self.base_type = ty + } + + pub fn lanes(&self) -> u32 { + self.lanes + } + + pub fn tuple_size(&self) -> Option { + self.tuple_size + } +} + +impl FromStr for VectorType { + type Err = String; + + fn from_str(s: &str) -> Result { + lazy_static! { + static ref RE: Regex = Regex::new(r"^(?:(?:sv(?P(?:uint|int|bool|float)(?:\d+)?))|(?:(?P(?:uint|int|bool|poly|float)(?:\d+)?)x(?P(?:\d+)?)))(?:x(?P2|3|4))?_t$").unwrap(); + } + + if let Some(c) = RE.captures(s) { + let (base_type, lanes) = Self::sanitise_lanes( + c.name("sv_ty") + .or_else(|| c.name("ty")) + .map(<&str>::from) + .map(BaseType::from_str) + .unwrap()?, + c.name("lanes") + .map(<&str>::from) + .map(u32::from_str) + .transpose() + .unwrap(), + ) + .map_err(|e| format!("invalid {s:#?} vector type: {e}"))?; + + let tuple_size = c + .name("tuple_size") + .map(<&str>::from) + .map(VectorTupleSize::from_str) + .transpose() + .unwrap(); + + Ok(VectorType { + base_type, + is_scalable: c.name("sv_ty").is_some(), + lanes, + tuple_size, + }) + } else { + Err(format!("invalid vector type {s:#?} given")) + } + } +} + +impl ToRepr for VectorType { + fn repr(&self, repr: TypeRepr) -> String { + let make_llvm_repr = |show_unsigned| { + format!( + "{}v{}{}", + if self.is_scalable { "nx" } else { "" }, + self.lanes * (self.tuple_size.map(usize::from).unwrap_or(1) as u32), + match self.base_type { + BaseType::Sized(BaseTypeKind::UInt, size) if show_unsigned => + format!("u{size}"), + _ => self.base_type.llvm_machine_repr(), + } + ) + }; + + if matches!(repr, TypeRepr::ACLENotation) { + self.base_type.acle_notation_repr() + } else if matches!(repr, TypeRepr::LLVMMachine) { + make_llvm_repr(false) + } else if self.is_scalable { + match (self.base_type, self.lanes, self.tuple_size) { + (BaseType::Sized(BaseTypeKind::Bool, _), 16, _) => "svbool_t".to_string(), + (BaseType::Sized(BaseTypeKind::Bool, _), lanes, _) => format!("svbool{lanes}_t"), + (BaseType::Sized(_, size), lanes, _) + if VECTOR_FULL_REGISTER_SIZE != (size * lanes) => + { + // Special internal type case + make_llvm_repr(true) + } + (ty, _, None) => format!("sv{}_t", ty.c_repr()), + (ty, _, Some(tuple_size)) => format!("sv{}x{tuple_size}_t", ty.c_repr()), + } + } else { + match self.tuple_size { + Some(tuple_size) => format!( + "{}x{}x{}_t", + self.base_type.c_repr(), + self.lanes, + tuple_size + ), + None => format!("{}x{}_t", self.base_type.c_repr(), self.lanes), + } + } + } +} + +impl fmt::Display for VectorType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.c_repr()) + } +} + +impl From for usize { + fn from(t: VectorTupleSize) -> Self { + match t { + VectorTupleSize::Two => 2, + VectorTupleSize::Three => 3, + VectorTupleSize::Four => 4, + } + } +} + +impl FromStr for VectorTupleSize { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "2" => Ok(Self::Two), + "3" 
=> Ok(Self::Three), + "4" => Ok(Self::Four), + _ => Err(format!("invalid vector tuple size `{s}` provided")), + } + } +} + +impl TryFrom for VectorTupleSize { + type Error = String; + + fn try_from(value: usize) -> Result { + match value { + 2 => Ok(Self::Two), + 3 => Ok(Self::Three), + 4 => Ok(Self::Four), + _ => Err(format!("invalid vector tuple size `{value}` provided")), + } + } +} + +impl fmt::Display for VectorTupleSize { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", usize::from(*self)) + } +} + +impl FromStr for BaseTypeKind { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "float" | "f" => Ok(Self::Float), + "int" | "i" => Ok(Self::Int), + "uint" | "u" => Ok(Self::UInt), + "poly" | "p" => Ok(Self::Poly), + "bool" | "b" => Ok(Self::Bool), + _ => Err(format!("no match for {s}")), + } + } +} + +impl ToRepr for BaseTypeKind { + fn repr(&self, repr: TypeRepr) -> String { + match (repr, self) { + (TypeRepr::C, Self::Float) => "float", + (TypeRepr::C, Self::Int) => "int", + (TypeRepr::C, Self::UInt) => "uint", + (TypeRepr::C, Self::Poly) => "poly", + (TypeRepr::Rust | TypeRepr::LLVMMachine | TypeRepr::ACLENotation, Self::Float) => "f", + (TypeRepr::Rust, Self::Int) | (TypeRepr::LLVMMachine, Self::Int | Self::UInt) => "i", + (TypeRepr::Rust | TypeRepr::ACLENotation, Self::UInt) => "u", + (TypeRepr::Rust | TypeRepr::LLVMMachine | TypeRepr::ACLENotation, Self::Poly) => "p", + (TypeRepr::ACLENotation, Self::Int) => "s", + (TypeRepr::ACLENotation, Self::Bool) => "b", + (_, Self::Bool) => "bool", + _ => { + unreachable!("no base type kind available for representation {repr:?}") + } + } + .to_string() + } +} + +impl fmt::Display for BaseTypeKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.c_repr()) + } +} + +impl BaseType { + pub fn get_size(&self) -> Result { + match self { + Self::Sized(_, size) => Ok(*size), + _ => Err(format!("unexpected invalid base type given {self:#?}")), + } + } + + pub fn kind(&self) -> &BaseTypeKind { + match self { + BaseType::Sized(kind, _) | BaseType::Unsized(kind) => kind, + } + } + + pub fn is_bool(&self) -> bool { + self.kind() == &BaseTypeKind::Bool + } + + pub fn is_float(&self) -> bool { + self.kind() == &BaseTypeKind::Float + } +} + +impl FromStr for BaseType { + type Err = String; + + fn from_str(s: &str) -> Result { + lazy_static! 
{ + static ref RE: Regex = Regex::new(r"^(?P[a-zA-Z]+)(?P\d+)?(_t)?$").unwrap(); + } + + if let Some(c) = RE.captures(s) { + let kind = c["kind"].parse()?; + let size = c + .name("size") + .map(<&str>::from) + .map(u32::from_str) + .transpose() + .unwrap(); + match size { + Some(size) => Ok(Self::Sized(kind, size)), + None => Ok(Self::Unsized(kind)), + } + } else { + Err(format!("failed to parse type `{s}`")) + } + } +} + +impl ToRepr for BaseType { + fn repr(&self, repr: TypeRepr) -> String { + use BaseType::*; + use BaseTypeKind::*; + use TypeRepr::*; + match (self, &repr) { + (Sized(Bool, _) | Unsized(Bool), LLVMMachine) => "i1".to_string(), + (Sized(_, size), SizeLiteral) if *size == 8 => "b".to_string(), + (Sized(_, size), SizeLiteral) if *size == 16 => "h".to_string(), + (Sized(_, size), SizeLiteral) if *size == 32 => "w".to_string(), + (Sized(_, size), SizeLiteral) if *size == 64 => "d".to_string(), + (Sized(_, size), SizeLiteral) if *size == 128 => "q".to_string(), + (_, SizeLiteral) => unreachable!("cannot represent {self:#?} as size literal"), + (Sized(Float, _) | Unsized(Float), TypeKind) => "f".to_string(), + (Sized(Int, _) | Unsized(Int), TypeKind) => "s".to_string(), + (Sized(UInt, _) | Unsized(UInt), TypeKind) => "u".to_string(), + (Sized(_, size), Size) => size.to_string(), + (Sized(_, size), SizeInBytesLog2) => { + assert!(size.is_power_of_two() && *size >= 8); + (size >> 3).trailing_zeros().to_string() + } + (Sized(kind, size), _) => format!("{}{size}", kind.repr(repr)), + (Unsized(kind), _) => kind.repr(repr), + } + } +} + +impl fmt::Display for BaseType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.rust_repr()) + } +} + +#[cfg(test)] +mod tests { + use crate::typekinds::*; + + #[test] + fn test_predicate() { + assert_eq!( + "svbool_t".parse::().unwrap(), + TypeKind::Vector(VectorType { + base_type: BaseType::Sized(BaseTypeKind::Bool, 8), + is_scalable: true, + lanes: 16, + tuple_size: None + }) + ); + } + + #[test] + fn test_llvm_internal_predicate() { + assert_eq!( + "svbool4_t".parse::().unwrap(), + TypeKind::Vector(VectorType { + base_type: BaseType::Sized(BaseTypeKind::Bool, 32), + is_scalable: true, + lanes: 4, + tuple_size: None + }) + ); + } + + #[test] + fn test_llvm_internal_predicate_llvm() { + assert_eq!( + "svbool4_t".parse::().unwrap().llvm_machine_repr(), + "nxv4i1" + ); + } + + #[test] + fn test_llvm_internal_predicate_acle() { + assert_eq!( + "svbool4_t" + .parse::() + .unwrap() + .acle_notation_repr(), + "b32" + ); + } + + #[test] + fn test_predicate_from_bitsize() { + let pg = VectorType::make_predicate_from_bitsize(32); + assert_eq!(pg.acle_notation_repr(), "b32"); + assert_eq!(pg, "svbool4_t".parse().unwrap()); + assert_eq!(pg.lanes, 4); + assert_eq!(pg.base_type, BaseType::Sized(BaseTypeKind::Bool, 32)); + } + + #[test] + fn test_scalable_single() { + assert_eq!( + "svuint8_t".parse::().unwrap(), + TypeKind::Vector(VectorType { + base_type: BaseType::Sized(BaseTypeKind::UInt, 8), + is_scalable: true, + lanes: 16, + tuple_size: None + }) + ); + } + + #[test] + fn test_scalable_tuple() { + assert_eq!( + "svint64x3_t".parse::().unwrap(), + TypeKind::Vector(VectorType { + base_type: BaseType::Sized(BaseTypeKind::Int, 64), + is_scalable: true, + lanes: 2, + tuple_size: Some(VectorTupleSize::Three), + }) + ); + } + + #[test] + fn test_scalable_single_llvm() { + assert_eq!( + "svuint32_t" + .parse::() + .unwrap() + .llvm_machine_repr(), + "nxv4i32" + ); + } + + #[test] + fn test_scalable_tuple_llvm() { + assert_eq!( 
+ "svint32x4_t" + .parse::() + .unwrap() + .llvm_machine_repr(), + "nxv16i32" + ); + } + + #[test] + fn test_vector_single_full() { + assert_eq!( + "uint32x4_t".parse::().unwrap(), + TypeKind::Vector(VectorType { + base_type: BaseType::Sized(BaseTypeKind::UInt, 32), + is_scalable: false, + lanes: 4, + tuple_size: None, + }) + ); + } + + #[test] + fn test_vector_single_half() { + assert_eq!( + "uint32x2_t".parse::().unwrap(), + TypeKind::Vector(VectorType { + base_type: BaseType::Sized(BaseTypeKind::UInt, 32), + is_scalable: false, + lanes: 2, + tuple_size: None, + }) + ); + } + + #[test] + fn test_vector_tuple() { + assert_eq!( + "uint64x2x4_t".parse::().unwrap(), + TypeKind::Vector(VectorType { + base_type: BaseType::Sized(BaseTypeKind::UInt, 64), + is_scalable: false, + lanes: 2, + tuple_size: Some(VectorTupleSize::Four), + }) + ); + } + + #[test] + fn test_const_pointer() { + let p = "*u32".parse::().unwrap(); + assert_eq!( + p, + TypeKind::Pointer( + Box::new(TypeKind::Base(BaseType::Sized(BaseTypeKind::UInt, 32))), + AccessLevel::R + ) + ); + assert_eq!(p.to_token_stream().to_string(), "* const u32") + } + + #[test] + fn test_mut_pointer() { + let p = "*mut u32".parse::().unwrap(); + assert_eq!( + p, + TypeKind::Pointer( + Box::new(TypeKind::Base(BaseType::Sized(BaseTypeKind::UInt, 32))), + AccessLevel::RW + ) + ); + assert_eq!(p.to_token_stream().to_string(), "* mut u32") + } + + #[test] + #[should_panic] + fn test_invalid_vector_single() { + assert_eq!( + "uint32x8_t".parse::().unwrap(), + TypeKind::Vector(VectorType { + base_type: BaseType::Sized(BaseTypeKind::UInt, 32), + is_scalable: false, + lanes: 8, + tuple_size: None, + }) + ); + } + + #[test] + #[should_panic] + fn test_invalid_vector_tuple() { + assert_eq!( + "uint32x4x5_t".parse::().unwrap(), + TypeKind::Vector(VectorType { + base_type: BaseType::Sized(BaseTypeKind::UInt, 32), + is_scalable: false, + lanes: 8, + tuple_size: None, // cannot represent + }) + ); + } + + #[test] + fn test_base() { + assert_eq!( + "u32".parse::().unwrap(), + TypeKind::Base(BaseType::Sized(BaseTypeKind::UInt, 32)), + ) + } + + #[test] + fn test_custom() { + assert_eq!( + "svpattern".parse::().unwrap(), + TypeKind::Custom("svpattern".to_string()), + ) + } + + #[test] + fn test_wildcard_type() { + assert_eq!( + "{type}".parse::().unwrap(), + TypeKind::Wildcard(Wildcard::Type(None)), + ) + } + + #[test] + fn test_wildcard_typeset() { + assert_eq!( + "{type[0]}".parse::().unwrap(), + TypeKind::Wildcard(Wildcard::Type(Some(0))), + ) + } + + #[test] + fn test_wildcard_sve_type() { + assert_eq!( + "{sve_type}".parse::().unwrap(), + TypeKind::Wildcard(Wildcard::SVEType(None, None)), + ) + } + + #[test] + fn test_wildcard_sve_typeset() { + assert_eq!( + "{sve_type[0]}".parse::().unwrap(), + TypeKind::Wildcard(Wildcard::SVEType(Some(0), None)), + ) + } + + #[test] + fn test_wildcard_sve_tuple_type() { + assert_eq!( + "{sve_type_x2}".parse::().unwrap(), + TypeKind::Wildcard(Wildcard::SVEType(None, Some(VectorTupleSize::Two))), + ) + } + + #[test] + fn test_wildcard_sve_tuple_typeset() { + assert_eq!( + "{sve_type_x2[0]}".parse::().unwrap(), + TypeKind::Wildcard(Wildcard::SVEType(Some(0), Some(VectorTupleSize::Two))), + ) + } + + #[test] + fn test_wildcard_predicate() { + assert_eq!( + "{predicate}".parse::().unwrap(), + TypeKind::Wildcard(Wildcard::Predicate(None)) + ) + } + + #[test] + fn test_wildcard_scale() { + assert_eq!( + "{sve_type as i8}".parse::().unwrap(), + TypeKind::Wildcard(Wildcard::Scale( + Box::new(Wildcard::SVEType(None, None)), + 
Box::new(TypeKind::Base(BaseType::Sized(BaseTypeKind::Int, 8))) + )) + ) + } + + #[test] + fn test_size_in_bytes_log2() { + assert_eq!("i8".parse::().unwrap().size_in_bytes_log2(), "0"); + assert_eq!("i16".parse::().unwrap().size_in_bytes_log2(), "1"); + assert_eq!("i32".parse::().unwrap().size_in_bytes_log2(), "2"); + assert_eq!("i64".parse::().unwrap().size_in_bytes_log2(), "3") + } + + #[test] + #[should_panic] + fn test_invalid_size_in_bytes_log2() { + "i9".parse::().unwrap().size_in_bytes_log2(); + } +} diff --git a/library/stdarch/crates/stdarch-gen-arm/src/wildcards.rs b/library/stdarch/crates/stdarch-gen-arm/src/wildcards.rs new file mode 100644 index 0000000000000..25aa803489270 --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/src/wildcards.rs @@ -0,0 +1,197 @@ +use lazy_static::lazy_static; +use regex::Regex; +use serde_with::{DeserializeFromStr, SerializeDisplay}; +use std::fmt; +use std::str::FromStr; + +use crate::{ + fn_suffix::SuffixKind, + predicate_forms::PredicationMask, + typekinds::{ToRepr, TypeKind, TypeKindOptions, VectorTupleSize}, +}; + +#[derive(Debug, Clone, PartialEq, Eq, Hash, SerializeDisplay, DeserializeFromStr)] +pub enum Wildcard { + Type(Option), + /// NEON type derivated by a base type + NEONType(Option, Option, Option), + /// SVE type derivated by a base type + SVEType(Option, Option), + /// Integer representation of bitsize + Size(Option), + /// Integer representation of bitsize minus one + SizeMinusOne(Option), + /// Literal representation of the bitsize: b(yte), h(half), w(ord) or d(ouble) + SizeLiteral(Option), + /// Literal representation of the type kind: f(loat), s(igned), u(nsigned) + TypeKind(Option, Option), + /// Log2 of the size in bytes + SizeInBytesLog2(Option), + /// Predicate to be inferred from the specified type + Predicate(Option), + /// Predicate to be inferred from the greatest type + MaxPredicate, + + Scale(Box, Box), + + // Other wildcards + LLVMLink, + NVariant, + /// Predicate forms to use and placeholder for a predicate form function name modifier + PredicateForms(PredicationMask), + + /// User-set wildcard through `substitutions` + Custom(String), +} + +impl Wildcard { + pub fn is_nonpredicate_type(&self) -> bool { + matches!( + self, + Wildcard::Type(..) | Wildcard::NEONType(..) | Wildcard::SVEType(..) + ) + } + + pub fn get_typeset_index(&self) -> Option { + match self { + Wildcard::Type(idx) | Wildcard::NEONType(idx, ..) | Wildcard::SVEType(idx, ..) => { + Some(idx.unwrap_or(0)) + } + _ => None, + } + } +} + +impl FromStr for Wildcard { + type Err = String; + + fn from_str(s: &str) -> Result { + lazy_static! 
{ + static ref RE: Regex = Regex::new(r"^(?P\w+?)(?:_x(?P[2-4]))?(?:\[(?P\d+)\])?(?:\.(?P\w+))?(?:\s+as\s+(?P.*?))?$").unwrap(); + } + + if let Some(c) = RE.captures(s) { + let wildcard_name = &c["wildcard"]; + let inputset_index = c + .name("index") + .map(<&str>::from) + .map(usize::from_str) + .transpose() + .map_err(|_| format!("{:#?} is not a valid type index", &c["index"]))?; + let tuple_size = c + .name("tuple_size") + .map(<&str>::from) + .map(VectorTupleSize::from_str) + .transpose() + .map_err(|_| format!("{:#?} is not a valid tuple size", &c["tuple_size"]))?; + let modifiers = c.name("modifiers").map(<&str>::from); + + let wildcard = match (wildcard_name, inputset_index, tuple_size, modifiers) { + ("type", index, None, None) => Ok(Wildcard::Type(index)), + ("neon_type", index, tuple, modifier) => { + if let Some(str_suffix) = modifier { + let suffix_kind = SuffixKind::from_str(str_suffix); + return Ok(Wildcard::NEONType(index, tuple, Some(suffix_kind.unwrap()))); + } else { + Ok(Wildcard::NEONType(index, tuple, None)) + } + } + ("sve_type", index, tuple, None) => Ok(Wildcard::SVEType(index, tuple)), + ("size", index, None, None) => Ok(Wildcard::Size(index)), + ("size_minus_one", index, None, None) => Ok(Wildcard::SizeMinusOne(index)), + ("size_literal", index, None, None) => Ok(Wildcard::SizeLiteral(index)), + ("type_kind", index, None, modifiers) => Ok(Wildcard::TypeKind( + index, + modifiers.map(|modifiers| modifiers.parse()).transpose()?, + )), + ("size_in_bytes_log2", index, None, None) => Ok(Wildcard::SizeInBytesLog2(index)), + ("predicate", index, None, None) => Ok(Wildcard::Predicate(index)), + ("max_predicate", None, None, None) => Ok(Wildcard::MaxPredicate), + ("llvm_link", None, None, None) => Ok(Wildcard::LLVMLink), + ("_n", None, None, None) => Ok(Wildcard::NVariant), + (w, None, None, None) if w.starts_with('_') => { + // test for predicate forms + let pf_mask = PredicationMask::from_str(&w[1..]); + if let Ok(mask) = pf_mask { + if mask.has_merging() { + Ok(Wildcard::PredicateForms(mask)) + } else { + Err("cannot add predication without a Merging form".to_string()) + } + } else { + Err(format!("invalid wildcard `{s:#?}`")) + } + } + (cw, None, None, None) => Ok(Wildcard::Custom(cw.to_string())), + _ => Err(format!("invalid wildcard `{s:#?}`")), + }?; + + let scale_to = c + .name("scale_to") + .map(<&str>::from) + .map(TypeKind::from_str) + .transpose() + .map_err(|_| format!("{:#?} is not a valid type", &c["scale_to"]))?; + + if let Some(scale_to) = scale_to { + Ok(Wildcard::Scale(Box::new(wildcard), Box::new(scale_to))) + } else { + Ok(wildcard) + } + } else { + Err(format!("## invalid wildcard `{s:#?}`")) + } + } +} + +impl fmt::Display for Wildcard { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Type(None) => write!(f, "type"), + Self::Type(Some(index)) => write!(f, "type[{index}]"), + Self::NEONType(None, None, None) => write!(f, "neon_type"), + Self::NEONType(None, None, Some(suffix_kind)) => write!(f, "neon_type.{suffix_kind}"), + Self::NEONType(Some(index), None, None) => write!(f, "neon_type[{index}]"), + Self::NEONType(Some(index), None, Some(suffix_kind)) => { + write!(f, "neon_type[{index}].{suffix_kind}") + } + Self::NEONType(None, Some(tuple_size), Some(suffix_kind)) => { + write!(f, "neon_type_x{tuple_size}.{suffix_kind}") + } + Self::NEONType(None, Some(tuple_size), None) => write!(f, "neon_type_x{tuple_size}"), + Self::NEONType(Some(index), Some(tuple_size), None) => { + write!(f, 
"neon_type_x{tuple_size}[{index}]") + } + Self::NEONType(Some(index), Some(tuple_size), Some(suffix_kind)) => { + write!(f, "neon_type_x{tuple_size}[{index}].{suffix_kind}") + } + Self::SVEType(None, None) => write!(f, "sve_type"), + Self::SVEType(Some(index), None) => write!(f, "sve_type[{index}]"), + Self::SVEType(None, Some(tuple_size)) => write!(f, "sve_type_x{tuple_size}"), + Self::SVEType(Some(index), Some(tuple_size)) => { + write!(f, "sve_type_x{tuple_size}[{index}]") + } + Self::Size(None) => write!(f, "size"), + Self::Size(Some(index)) => write!(f, "size[{index}]"), + Self::SizeMinusOne(None) => write!(f, "size_minus_one"), + Self::SizeMinusOne(Some(index)) => write!(f, "size_minus_one[{index}]"), + Self::SizeLiteral(None) => write!(f, "size_literal"), + Self::SizeLiteral(Some(index)) => write!(f, "size_literal[{index}]"), + Self::TypeKind(None, None) => write!(f, "type_kind"), + Self::TypeKind(None, Some(opts)) => write!(f, "type_kind.{opts}"), + Self::TypeKind(Some(index), None) => write!(f, "type_kind[{index}]"), + Self::TypeKind(Some(index), Some(opts)) => write!(f, "type_kind[{index}].{opts}"), + Self::SizeInBytesLog2(None) => write!(f, "size_in_bytes_log2"), + Self::SizeInBytesLog2(Some(index)) => write!(f, "size_in_bytes_log2[{index}]"), + Self::Predicate(None) => write!(f, "predicate"), + Self::Predicate(Some(index)) => write!(f, "predicate[{index}]"), + Self::MaxPredicate => write!(f, "max_predicate"), + Self::LLVMLink => write!(f, "llvm_link"), + Self::NVariant => write!(f, "_n"), + Self::PredicateForms(mask) => write!(f, "_{mask}"), + + Self::Scale(wildcard, ty) => write!(f, "{wildcard} as {}", ty.rust_repr()), + Self::Custom(cw) => write!(f, "{cw}"), + } + } +} diff --git a/library/stdarch/crates/stdarch-gen-arm/src/wildstring.rs b/library/stdarch/crates/stdarch-gen-arm/src/wildstring.rs new file mode 100644 index 0000000000000..4f8cc67f5e019 --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-arm/src/wildstring.rs @@ -0,0 +1,399 @@ +use itertools::Itertools; +use proc_macro2::TokenStream; +use quote::{ToTokens, TokenStreamExt, quote}; +use serde_with::{DeserializeFromStr, SerializeDisplay}; +use std::str::pattern::Pattern; +use std::{fmt, str::FromStr}; + +use crate::context::LocalContext; +use crate::fn_suffix::make_neon_suffix; +use crate::typekinds::{ToRepr, TypeRepr}; +use crate::wildcards::Wildcard; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum WildStringPart { + String(String), + Wildcard(Wildcard), +} + +/// Wildcard-able string +#[derive(Debug, Clone, PartialEq, Eq, Default, SerializeDisplay, DeserializeFromStr)] +pub struct WildString(pub Vec); + +impl WildString { + pub fn has_wildcards(&self) -> bool { + for part in self.0.iter() { + if let WildStringPart::Wildcard(..) 
= part { + return true; + } + } + + false + } + + pub fn wildcards(&self) -> impl Iterator + '_ { + self.0.iter().filter_map(|part| match part { + WildStringPart::Wildcard(w) => Some(w), + _ => None, + }) + } + + pub fn iter(&self) -> impl Iterator + '_ { + self.0.iter() + } + + pub fn iter_mut(&mut self) -> impl Iterator + '_ { + self.0.iter_mut() + } + + pub fn starts_with(&self, s2: &str) -> bool { + self.to_string().starts_with(s2) + } + + pub fn prepend_str(&mut self, s: impl Into) { + self.0.insert(0, WildStringPart::String(s.into())) + } + + pub fn push_str(&mut self, s: impl Into) { + self.0.push(WildStringPart::String(s.into())) + } + + pub fn push_wildcard(&mut self, w: Wildcard) { + self.0.push(WildStringPart::Wildcard(w)) + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn replace
<P>
(&self, from: P, to: &str) -> WildString + where + P: Pattern + Copy, + { + WildString( + self.0 + .iter() + .map(|part| match part { + WildStringPart::String(s) => WildStringPart::String(s.replace(from, to)), + part => part.clone(), + }) + .collect_vec(), + ) + } + + pub fn build_acle(&mut self, ctx: &LocalContext) -> Result<(), String> { + self.build(ctx, TypeRepr::ACLENotation) + } + + pub fn build_neon_intrinsic_signature(&mut self, ctx: &LocalContext) -> Result<(), String> { + let repr = TypeRepr::ACLENotation; + self.iter_mut().try_for_each(|wp| -> Result<(), String> { + if let WildStringPart::Wildcard(w) = wp { + match w { + &mut Wildcard::NEONType(_, _, ref maybe_suffix_kind) => { + if let Some(suffix_kind) = maybe_suffix_kind { + let x = ctx.provide_type_wildcard(w).unwrap(); + *wp = WildStringPart::String(make_neon_suffix(x, *suffix_kind)) + } else { + *wp = WildString::make_default_build(ctx, repr, w) + } + } + _ => *wp = WildString::make_default_build(ctx, repr, w), + } + } + Ok(()) + }) + } + + pub fn build(&mut self, ctx: &LocalContext, repr: TypeRepr) -> Result<(), String> { + match repr { + TypeRepr::ACLENotation | TypeRepr::LLVMMachine => { + self.iter_mut().try_for_each(|wp| -> Result<(), String> { + if let WildStringPart::Wildcard(w) = wp { + match w { + &mut Wildcard::NEONType(_, _, ref maybe_suffix_kind) => { + if let Some(suffix_kind) = maybe_suffix_kind { + let x = ctx.provide_type_wildcard(w).unwrap(); + *wp = WildStringPart::String(make_neon_suffix(x, *suffix_kind)) + } else { + *wp = WildString::make_default_build(ctx, repr, w) + } + } + _ => *wp = WildString::make_default_build(ctx, repr, w), + } + } + Ok(()) + }) + } + _ => self.iter_mut().try_for_each(|wp| -> Result<(), String> { + if let WildStringPart::Wildcard(w) = wp { + *wp = WildString::make_default_build(ctx, repr, w); + } + Ok(()) + }), + } + } + + fn make_default_build(ctx: &LocalContext, repr: TypeRepr, w: &mut Wildcard) -> WildStringPart { + WildStringPart::String( + ctx.provide_substitution_wildcard(w) + .or_else(|_| ctx.provide_type_wildcard(w).map(|ty| ty.repr(repr))) + .unwrap(), + ) + } +} + +impl From for WildString { + fn from(s: String) -> Self { + WildString(vec![WildStringPart::String(s)]) + } +} + +impl FromStr for WildString { + type Err = String; + + fn from_str(s: &str) -> Result { + enum State { + Normal { start: usize }, + Wildcard { start: usize, count: usize }, + EscapeTokenOpen { start: usize, at: usize }, + EscapeTokenClose { start: usize, at: usize }, + } + + let mut ws = WildString::default(); + match s + .char_indices() + .try_fold(State::Normal { start: 0 }, |state, (idx, ch)| { + match (state, ch) { + (State::Normal { start }, '{') => Ok(State::EscapeTokenOpen { start, at: idx }), + (State::Normal { start }, '}') => { + Ok(State::EscapeTokenClose { start, at: idx }) + } + (State::EscapeTokenOpen { start, at }, '{') + | (State::EscapeTokenClose { start, at }, '}') => { + if start < at { + ws.push_str(&s[start..at]) + } + + Ok(State::Normal { start: idx }) + } + (State::EscapeTokenOpen { at, .. }, '}') => Err(format!( + "empty wildcard given in string {s:?} at position {at}" + )), + (State::EscapeTokenOpen { start, at }, _) => { + if start < at { + ws.push_str(&s[start..at]) + } + + Ok(State::Wildcard { + start: idx, + count: 0, + }) + } + (State::EscapeTokenClose { at, .. 
}, _) => Err(format!( + "closing a non-wildcard/bad escape in string {s:?} at position {at}" + )), + // Nesting wildcards is only supported for `{foo as {bar}}`, wildcards cannot be + // nested at the start of a WildString. + (State::Wildcard { start, count }, '{') => Ok(State::Wildcard { + start, + count: count + 1, + }), + (State::Wildcard { start, count: 0 }, '}') => { + ws.push_wildcard(s[start..idx].parse()?); + Ok(State::Normal { start: idx + 1 }) + } + (State::Wildcard { start, count }, '}') => Ok(State::Wildcard { + start, + count: count - 1, + }), + (state @ State::Normal { .. }, _) | (state @ State::Wildcard { .. }, _) => { + Ok(state) + } + } + })? { + State::Normal { start } => { + if start < s.len() { + ws.push_str(&s[start..]); + } + + Ok(ws) + } + State::EscapeTokenOpen { at, .. } | State::Wildcard { start: at, .. } => Err(format!( + "unclosed wildcard in string {s:?} at position {at}" + )), + State::EscapeTokenClose { at, .. } => Err(format!( + "closing a non-wildcard/bad escape in string {s:?} at position {at}" + )), + } + } +} + +impl fmt::Display for WildString { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{}", + self.0 + .iter() + .map(|part| match part { + WildStringPart::String(s) => s.to_owned(), + WildStringPart::Wildcard(w) => format!("{{{w}}}"), + }) + .join("") + ) + } +} + +impl ToTokens for WildString { + fn to_tokens(&self, tokens: &mut TokenStream) { + assert!( + !self.has_wildcards(), + "cannot convert string with wildcards {self:?} to TokenStream" + ); + let str = self.to_string(); + tokens.append_all(quote! { #str }) + } +} + +#[cfg(test)] +mod tests { + use crate::typekinds::*; + use crate::wildstring::*; + + #[test] + fn test_empty_string() { + let ws: WildString = "".parse().unwrap(); + assert_eq!(ws.0.len(), 0); + } + + #[test] + fn test_plain_string() { + let ws: WildString = "plain string".parse().unwrap(); + assert_eq!(ws.0.len(), 1); + assert_eq!( + ws, + WildString(vec![WildStringPart::String("plain string".to_string())]) + ) + } + + #[test] + fn test_escaped_curly_brackets() { + let ws: WildString = "VALUE = {{value}}".parse().unwrap(); + assert_eq!(ws.to_string(), "VALUE = {value}"); + assert!(!ws.has_wildcards()); + } + + #[test] + fn test_escaped_curly_brackets_wildcard() { + let ws: WildString = "TYPE = {{{type}}}".parse().unwrap(); + assert_eq!(ws.to_string(), "TYPE = {{type}}"); + assert_eq!(ws.0.len(), 4); + assert!(ws.has_wildcards()); + } + + #[test] + fn test_wildcard_right_boundary() { + let s = "string test {type}"; + let ws: WildString = s.parse().unwrap(); + assert_eq!(&ws.to_string(), s); + assert!(ws.has_wildcards()); + } + + #[test] + fn test_wildcard_left_boundary() { + let s = "{type} string test"; + let ws: WildString = s.parse().unwrap(); + assert_eq!(&ws.to_string(), s); + assert!(ws.has_wildcards()); + } + + #[test] + fn test_recursive_wildcard() { + let s = "string test {type[0] as {type[1]}}"; + let ws: WildString = s.parse().unwrap(); + + assert_eq!(ws.0.len(), 2); + assert_eq!( + ws, + WildString(vec![ + WildStringPart::String("string test ".to_string()), + WildStringPart::Wildcard(Wildcard::Scale( + Box::new(Wildcard::Type(Some(0))), + Box::new(TypeKind::Wildcard(Wildcard::Type(Some(1)))), + )) + ]) + ); + } + + #[test] + fn test_scale_wildcard() { + let s = "string {type[0] as i8} test"; + let ws: WildString = s.parse().unwrap(); + + assert_eq!(ws.0.len(), 3); + assert_eq!( + ws, + WildString(vec![ + WildStringPart::String("string ".to_string()), + 
WildStringPart::Wildcard(Wildcard::Scale( + Box::new(Wildcard::Type(Some(0))), + Box::new(TypeKind::Base(BaseType::Sized(BaseTypeKind::Int, 8))), + )), + WildStringPart::String(" test".to_string()) + ]) + ); + } + + #[test] + fn test_solitaire_wildcard() { + let ws: WildString = "{type}".parse().unwrap(); + assert_eq!(ws.0.len(), 1); + assert_eq!( + ws, + WildString(vec![WildStringPart::Wildcard(Wildcard::Type(None))]) + ) + } + + #[test] + fn test_empty_wildcard() { + "string {}" + .parse::() + .expect_err("expected parse error"); + } + + #[test] + fn test_invalid_open_wildcard_right() { + "string {" + .parse::() + .expect_err("expected parse error"); + } + + #[test] + fn test_invalid_close_wildcard_right() { + "string }" + .parse::() + .expect_err("expected parse error"); + } + + #[test] + fn test_invalid_open_wildcard_left() { + "{string" + .parse::() + .expect_err("expected parse error"); + } + + #[test] + fn test_invalid_close_wildcard_left() { + "}string" + .parse::() + .expect_err("expected parse error"); + } + + #[test] + fn test_consecutive_wildcards() { + let s = "svprf{size_literal[1]}_gather_{type[0]}{index_or_offset}"; + let ws: WildString = s.parse().unwrap(); + assert_eq!(ws.to_string(), s) + } +} diff --git a/library/stdarch/crates/stdarch-gen-loongarch/Cargo.toml b/library/stdarch/crates/stdarch-gen-loongarch/Cargo.toml new file mode 100644 index 0000000000000..d3ac607c5576c --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-loongarch/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "stdarch-gen-loongarch" +version = "0.1.0" +authors = ["ZHAI Xiang ", "WANG Rui "] +edition = "2024" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +rand = "0.8.5" diff --git a/library/stdarch/crates/stdarch-gen-loongarch/README.md b/library/stdarch/crates/stdarch-gen-loongarch/README.md new file mode 100644 index 0000000000000..1fc81483a12e7 --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-loongarch/README.md @@ -0,0 +1,35 @@ +# LoongArch LSX/LASX intrinsic code generator + +A small tool that allows to quickly generate intrinsics for the LoongArch LSX/LASX architectures. + +The specification for the intrinsics can be found in `lsx.spec` or `lasx.spec`. + +To run and re-generate the code run the following from the root of the `stdarch` crate. 
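+
+Each entry in the spec files is a short key/value block describing one intrinsic; for example, the first entry of `lasx.spec` is:
+
+```
+/// lasx_xvsll_b
+name = lasx_xvsll_b
+asm-fmts = xd, xj, xk
+data-types = V32QI, V32QI, V32QI
+```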
+ +LSX: +``` +# Generate bindings +OUT_DIR=`pwd`/crates/stdarch-gen-loongarch cargo run -p stdarch-gen-loongarch -- crates/stdarch-gen-loongarch/lsxintrin.h +OUT_DIR=`pwd`/crates/core_arch cargo run -p stdarch-gen-loongarch -- crates/stdarch-gen-loongarch/lsx.spec +rustfmt crates/core_arch/src/loongarch64/lsx/generated.rs + +# Generate tests +OUT_DIR=`pwd`/crates/stdarch-gen-loongarch cargo run -p stdarch-gen-loongarch -- crates/stdarch-gen-loongarch/lsx.spec test +loongarch64-unknown-linux-gnu-gcc -static -o lsx crates/stdarch-gen-loongarch/lsx.c -mlasx -mfrecipe +qemu-loongarch64 ./lsx > crates/core_arch/src/loongarch64/lsx/tests.rs +rustfmt crates/core_arch/src/loongarch64/lsx/tests.rs +``` + +LASX: +``` +# Generate bindings +OUT_DIR=`pwd`/crates/stdarch-gen-loongarch cargo run -p stdarch-gen-loongarch -- crates/stdarch-gen-loongarch/lasxintrin.h +OUT_DIR=`pwd`/crates/core_arch cargo run -p stdarch-gen-loongarch -- crates/stdarch-gen-loongarch/lasx.spec +rustfmt crates/core_arch/src/loongarch64/lasx/generated.rs + +# Generate tests +OUT_DIR=`pwd`/crates/stdarch-gen-loongarch cargo run -p stdarch-gen-loongarch -- crates/stdarch-gen-loongarch/lasx.spec test +loongarch64-unknown-linux-gnu-gcc -static -o lasx crates/stdarch-gen-loongarch/lasx.c -mlasx -mfrecipe +qemu-loongarch64 ./lasx > crates/core_arch/src/loongarch64/lasx/tests.rs +rustfmt crates/core_arch/src/loongarch64/lasx/tests.rs +``` diff --git a/library/stdarch/crates/stdarch-gen-loongarch/lasx.spec b/library/stdarch/crates/stdarch-gen-loongarch/lasx.spec new file mode 100644 index 0000000000000..e3bdfcb5e9faa --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-loongarch/lasx.spec @@ -0,0 +1,3705 @@ +// This code is automatically generated. DO NOT MODIFY. +// ``` +// OUT_DIR=`pwd`/crates/stdarch-gen-loongarch cargo run -p stdarch-gen-loongarch -- crates/stdarch-gen-loongarch/lasxintrin.h +// ``` + +/// lasx_xvsll_b +name = lasx_xvsll_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvsll_h +name = lasx_xvsll_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvsll_w +name = lasx_xvsll_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvsll_d +name = lasx_xvsll_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvslli_b +name = lasx_xvslli_b +asm-fmts = xd, xj, ui3 +data-types = V32QI, V32QI, UQI + +/// lasx_xvslli_h +name = lasx_xvslli_h +asm-fmts = xd, xj, ui4 +data-types = V16HI, V16HI, UQI + +/// lasx_xvslli_w +name = lasx_xvslli_w +asm-fmts = xd, xj, ui5 +data-types = V8SI, V8SI, UQI + +/// lasx_xvslli_d +name = lasx_xvslli_d +asm-fmts = xd, xj, ui6 +data-types = V4DI, V4DI, UQI + +/// lasx_xvsra_b +name = lasx_xvsra_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvsra_h +name = lasx_xvsra_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvsra_w +name = lasx_xvsra_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvsra_d +name = lasx_xvsra_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvsrai_b +name = lasx_xvsrai_b +asm-fmts = xd, xj, ui3 +data-types = V32QI, V32QI, UQI + +/// lasx_xvsrai_h +name = lasx_xvsrai_h +asm-fmts = xd, xj, ui4 +data-types = V16HI, V16HI, UQI + +/// lasx_xvsrai_w +name = lasx_xvsrai_w +asm-fmts = xd, xj, ui5 +data-types = V8SI, V8SI, UQI + +/// lasx_xvsrai_d +name = lasx_xvsrai_d +asm-fmts = xd, xj, ui6 +data-types = V4DI, V4DI, UQI + +/// lasx_xvsrar_b +name = lasx_xvsrar_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// 
lasx_xvsrar_h +name = lasx_xvsrar_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvsrar_w +name = lasx_xvsrar_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvsrar_d +name = lasx_xvsrar_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvsrari_b +name = lasx_xvsrari_b +asm-fmts = xd, xj, ui3 +data-types = V32QI, V32QI, UQI + +/// lasx_xvsrari_h +name = lasx_xvsrari_h +asm-fmts = xd, xj, ui4 +data-types = V16HI, V16HI, UQI + +/// lasx_xvsrari_w +name = lasx_xvsrari_w +asm-fmts = xd, xj, ui5 +data-types = V8SI, V8SI, UQI + +/// lasx_xvsrari_d +name = lasx_xvsrari_d +asm-fmts = xd, xj, ui6 +data-types = V4DI, V4DI, UQI + +/// lasx_xvsrl_b +name = lasx_xvsrl_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvsrl_h +name = lasx_xvsrl_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvsrl_w +name = lasx_xvsrl_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvsrl_d +name = lasx_xvsrl_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvsrli_b +name = lasx_xvsrli_b +asm-fmts = xd, xj, ui3 +data-types = V32QI, V32QI, UQI + +/// lasx_xvsrli_h +name = lasx_xvsrli_h +asm-fmts = xd, xj, ui4 +data-types = V16HI, V16HI, UQI + +/// lasx_xvsrli_w +name = lasx_xvsrli_w +asm-fmts = xd, xj, ui5 +data-types = V8SI, V8SI, UQI + +/// lasx_xvsrli_d +name = lasx_xvsrli_d +asm-fmts = xd, xj, ui6 +data-types = V4DI, V4DI, UQI + +/// lasx_xvsrlr_b +name = lasx_xvsrlr_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvsrlr_h +name = lasx_xvsrlr_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvsrlr_w +name = lasx_xvsrlr_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvsrlr_d +name = lasx_xvsrlr_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvsrlri_b +name = lasx_xvsrlri_b +asm-fmts = xd, xj, ui3 +data-types = V32QI, V32QI, UQI + +/// lasx_xvsrlri_h +name = lasx_xvsrlri_h +asm-fmts = xd, xj, ui4 +data-types = V16HI, V16HI, UQI + +/// lasx_xvsrlri_w +name = lasx_xvsrlri_w +asm-fmts = xd, xj, ui5 +data-types = V8SI, V8SI, UQI + +/// lasx_xvsrlri_d +name = lasx_xvsrlri_d +asm-fmts = xd, xj, ui6 +data-types = V4DI, V4DI, UQI + +/// lasx_xvbitclr_b +name = lasx_xvbitclr_b +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvbitclr_h +name = lasx_xvbitclr_h +asm-fmts = xd, xj, xk +data-types = UV16HI, UV16HI, UV16HI + +/// lasx_xvbitclr_w +name = lasx_xvbitclr_w +asm-fmts = xd, xj, xk +data-types = UV8SI, UV8SI, UV8SI + +/// lasx_xvbitclr_d +name = lasx_xvbitclr_d +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI + +/// lasx_xvbitclri_b +name = lasx_xvbitclri_b +asm-fmts = xd, xj, ui3 +data-types = UV32QI, UV32QI, UQI + +/// lasx_xvbitclri_h +name = lasx_xvbitclri_h +asm-fmts = xd, xj, ui4 +data-types = UV16HI, UV16HI, UQI + +/// lasx_xvbitclri_w +name = lasx_xvbitclri_w +asm-fmts = xd, xj, ui5 +data-types = UV8SI, UV8SI, UQI + +/// lasx_xvbitclri_d +name = lasx_xvbitclri_d +asm-fmts = xd, xj, ui6 +data-types = UV4DI, UV4DI, UQI + +/// lasx_xvbitset_b +name = lasx_xvbitset_b +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvbitset_h +name = lasx_xvbitset_h +asm-fmts = xd, xj, xk +data-types = UV16HI, UV16HI, UV16HI + +/// lasx_xvbitset_w +name = lasx_xvbitset_w +asm-fmts = xd, xj, xk +data-types = UV8SI, UV8SI, UV8SI + +/// lasx_xvbitset_d +name = lasx_xvbitset_d +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI + +/// lasx_xvbitseti_b 
+name = lasx_xvbitseti_b +asm-fmts = xd, xj, ui3 +data-types = UV32QI, UV32QI, UQI + +/// lasx_xvbitseti_h +name = lasx_xvbitseti_h +asm-fmts = xd, xj, ui4 +data-types = UV16HI, UV16HI, UQI + +/// lasx_xvbitseti_w +name = lasx_xvbitseti_w +asm-fmts = xd, xj, ui5 +data-types = UV8SI, UV8SI, UQI + +/// lasx_xvbitseti_d +name = lasx_xvbitseti_d +asm-fmts = xd, xj, ui6 +data-types = UV4DI, UV4DI, UQI + +/// lasx_xvbitrev_b +name = lasx_xvbitrev_b +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvbitrev_h +name = lasx_xvbitrev_h +asm-fmts = xd, xj, xk +data-types = UV16HI, UV16HI, UV16HI + +/// lasx_xvbitrev_w +name = lasx_xvbitrev_w +asm-fmts = xd, xj, xk +data-types = UV8SI, UV8SI, UV8SI + +/// lasx_xvbitrev_d +name = lasx_xvbitrev_d +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI + +/// lasx_xvbitrevi_b +name = lasx_xvbitrevi_b +asm-fmts = xd, xj, ui3 +data-types = UV32QI, UV32QI, UQI + +/// lasx_xvbitrevi_h +name = lasx_xvbitrevi_h +asm-fmts = xd, xj, ui4 +data-types = UV16HI, UV16HI, UQI + +/// lasx_xvbitrevi_w +name = lasx_xvbitrevi_w +asm-fmts = xd, xj, ui5 +data-types = UV8SI, UV8SI, UQI + +/// lasx_xvbitrevi_d +name = lasx_xvbitrevi_d +asm-fmts = xd, xj, ui6 +data-types = UV4DI, UV4DI, UQI + +/// lasx_xvadd_b +name = lasx_xvadd_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvadd_h +name = lasx_xvadd_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvadd_w +name = lasx_xvadd_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvadd_d +name = lasx_xvadd_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvaddi_bu +name = lasx_xvaddi_bu +asm-fmts = xd, xj, ui5 +data-types = V32QI, V32QI, UQI + +/// lasx_xvaddi_hu +name = lasx_xvaddi_hu +asm-fmts = xd, xj, ui5 +data-types = V16HI, V16HI, UQI + +/// lasx_xvaddi_wu +name = lasx_xvaddi_wu +asm-fmts = xd, xj, ui5 +data-types = V8SI, V8SI, UQI + +/// lasx_xvaddi_du +name = lasx_xvaddi_du +asm-fmts = xd, xj, ui5 +data-types = V4DI, V4DI, UQI + +/// lasx_xvsub_b +name = lasx_xvsub_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvsub_h +name = lasx_xvsub_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvsub_w +name = lasx_xvsub_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvsub_d +name = lasx_xvsub_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvsubi_bu +name = lasx_xvsubi_bu +asm-fmts = xd, xj, ui5 +data-types = V32QI, V32QI, UQI + +/// lasx_xvsubi_hu +name = lasx_xvsubi_hu +asm-fmts = xd, xj, ui5 +data-types = V16HI, V16HI, UQI + +/// lasx_xvsubi_wu +name = lasx_xvsubi_wu +asm-fmts = xd, xj, ui5 +data-types = V8SI, V8SI, UQI + +/// lasx_xvsubi_du +name = lasx_xvsubi_du +asm-fmts = xd, xj, ui5 +data-types = V4DI, V4DI, UQI + +/// lasx_xvmax_b +name = lasx_xvmax_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvmax_h +name = lasx_xvmax_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvmax_w +name = lasx_xvmax_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvmax_d +name = lasx_xvmax_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvmaxi_b +name = lasx_xvmaxi_b +asm-fmts = xd, xj, si5 +data-types = V32QI, V32QI, QI + +/// lasx_xvmaxi_h +name = lasx_xvmaxi_h +asm-fmts = xd, xj, si5 +data-types = V16HI, V16HI, QI + +/// lasx_xvmaxi_w +name = lasx_xvmaxi_w +asm-fmts = xd, xj, si5 +data-types = V8SI, V8SI, QI + +/// lasx_xvmaxi_d +name = lasx_xvmaxi_d +asm-fmts = xd, 
xj, si5 +data-types = V4DI, V4DI, QI + +/// lasx_xvmax_bu +name = lasx_xvmax_bu +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvmax_hu +name = lasx_xvmax_hu +asm-fmts = xd, xj, xk +data-types = UV16HI, UV16HI, UV16HI + +/// lasx_xvmax_wu +name = lasx_xvmax_wu +asm-fmts = xd, xj, xk +data-types = UV8SI, UV8SI, UV8SI + +/// lasx_xvmax_du +name = lasx_xvmax_du +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI + +/// lasx_xvmaxi_bu +name = lasx_xvmaxi_bu +asm-fmts = xd, xj, ui5 +data-types = UV32QI, UV32QI, UQI + +/// lasx_xvmaxi_hu +name = lasx_xvmaxi_hu +asm-fmts = xd, xj, ui5 +data-types = UV16HI, UV16HI, UQI + +/// lasx_xvmaxi_wu +name = lasx_xvmaxi_wu +asm-fmts = xd, xj, ui5 +data-types = UV8SI, UV8SI, UQI + +/// lasx_xvmaxi_du +name = lasx_xvmaxi_du +asm-fmts = xd, xj, ui5 +data-types = UV4DI, UV4DI, UQI + +/// lasx_xvmin_b +name = lasx_xvmin_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvmin_h +name = lasx_xvmin_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvmin_w +name = lasx_xvmin_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvmin_d +name = lasx_xvmin_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvmini_b +name = lasx_xvmini_b +asm-fmts = xd, xj, si5 +data-types = V32QI, V32QI, QI + +/// lasx_xvmini_h +name = lasx_xvmini_h +asm-fmts = xd, xj, si5 +data-types = V16HI, V16HI, QI + +/// lasx_xvmini_w +name = lasx_xvmini_w +asm-fmts = xd, xj, si5 +data-types = V8SI, V8SI, QI + +/// lasx_xvmini_d +name = lasx_xvmini_d +asm-fmts = xd, xj, si5 +data-types = V4DI, V4DI, QI + +/// lasx_xvmin_bu +name = lasx_xvmin_bu +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvmin_hu +name = lasx_xvmin_hu +asm-fmts = xd, xj, xk +data-types = UV16HI, UV16HI, UV16HI + +/// lasx_xvmin_wu +name = lasx_xvmin_wu +asm-fmts = xd, xj, xk +data-types = UV8SI, UV8SI, UV8SI + +/// lasx_xvmin_du +name = lasx_xvmin_du +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI + +/// lasx_xvmini_bu +name = lasx_xvmini_bu +asm-fmts = xd, xj, ui5 +data-types = UV32QI, UV32QI, UQI + +/// lasx_xvmini_hu +name = lasx_xvmini_hu +asm-fmts = xd, xj, ui5 +data-types = UV16HI, UV16HI, UQI + +/// lasx_xvmini_wu +name = lasx_xvmini_wu +asm-fmts = xd, xj, ui5 +data-types = UV8SI, UV8SI, UQI + +/// lasx_xvmini_du +name = lasx_xvmini_du +asm-fmts = xd, xj, ui5 +data-types = UV4DI, UV4DI, UQI + +/// lasx_xvseq_b +name = lasx_xvseq_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvseq_h +name = lasx_xvseq_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvseq_w +name = lasx_xvseq_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvseq_d +name = lasx_xvseq_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvseqi_b +name = lasx_xvseqi_b +asm-fmts = xd, xj, si5 +data-types = V32QI, V32QI, QI + +/// lasx_xvseqi_h +name = lasx_xvseqi_h +asm-fmts = xd, xj, si5 +data-types = V16HI, V16HI, QI + +/// lasx_xvseqi_w +name = lasx_xvseqi_w +asm-fmts = xd, xj, si5 +data-types = V8SI, V8SI, QI + +/// lasx_xvseqi_d +name = lasx_xvseqi_d +asm-fmts = xd, xj, si5 +data-types = V4DI, V4DI, QI + +/// lasx_xvslt_b +name = lasx_xvslt_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvslt_h +name = lasx_xvslt_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvslt_w +name = lasx_xvslt_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvslt_d +name = lasx_xvslt_d +asm-fmts 
= xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvslti_b +name = lasx_xvslti_b +asm-fmts = xd, xj, si5 +data-types = V32QI, V32QI, QI + +/// lasx_xvslti_h +name = lasx_xvslti_h +asm-fmts = xd, xj, si5 +data-types = V16HI, V16HI, QI + +/// lasx_xvslti_w +name = lasx_xvslti_w +asm-fmts = xd, xj, si5 +data-types = V8SI, V8SI, QI + +/// lasx_xvslti_d +name = lasx_xvslti_d +asm-fmts = xd, xj, si5 +data-types = V4DI, V4DI, QI + +/// lasx_xvslt_bu +name = lasx_xvslt_bu +asm-fmts = xd, xj, xk +data-types = V32QI, UV32QI, UV32QI + +/// lasx_xvslt_hu +name = lasx_xvslt_hu +asm-fmts = xd, xj, xk +data-types = V16HI, UV16HI, UV16HI + +/// lasx_xvslt_wu +name = lasx_xvslt_wu +asm-fmts = xd, xj, xk +data-types = V8SI, UV8SI, UV8SI + +/// lasx_xvslt_du +name = lasx_xvslt_du +asm-fmts = xd, xj, xk +data-types = V4DI, UV4DI, UV4DI + +/// lasx_xvslti_bu +name = lasx_xvslti_bu +asm-fmts = xd, xj, ui5 +data-types = V32QI, UV32QI, UQI + +/// lasx_xvslti_hu +name = lasx_xvslti_hu +asm-fmts = xd, xj, ui5 +data-types = V16HI, UV16HI, UQI + +/// lasx_xvslti_wu +name = lasx_xvslti_wu +asm-fmts = xd, xj, ui5 +data-types = V8SI, UV8SI, UQI + +/// lasx_xvslti_du +name = lasx_xvslti_du +asm-fmts = xd, xj, ui5 +data-types = V4DI, UV4DI, UQI + +/// lasx_xvsle_b +name = lasx_xvsle_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvsle_h +name = lasx_xvsle_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvsle_w +name = lasx_xvsle_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvsle_d +name = lasx_xvsle_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvslei_b +name = lasx_xvslei_b +asm-fmts = xd, xj, si5 +data-types = V32QI, V32QI, QI + +/// lasx_xvslei_h +name = lasx_xvslei_h +asm-fmts = xd, xj, si5 +data-types = V16HI, V16HI, QI + +/// lasx_xvslei_w +name = lasx_xvslei_w +asm-fmts = xd, xj, si5 +data-types = V8SI, V8SI, QI + +/// lasx_xvslei_d +name = lasx_xvslei_d +asm-fmts = xd, xj, si5 +data-types = V4DI, V4DI, QI + +/// lasx_xvsle_bu +name = lasx_xvsle_bu +asm-fmts = xd, xj, xk +data-types = V32QI, UV32QI, UV32QI + +/// lasx_xvsle_hu +name = lasx_xvsle_hu +asm-fmts = xd, xj, xk +data-types = V16HI, UV16HI, UV16HI + +/// lasx_xvsle_wu +name = lasx_xvsle_wu +asm-fmts = xd, xj, xk +data-types = V8SI, UV8SI, UV8SI + +/// lasx_xvsle_du +name = lasx_xvsle_du +asm-fmts = xd, xj, xk +data-types = V4DI, UV4DI, UV4DI + +/// lasx_xvslei_bu +name = lasx_xvslei_bu +asm-fmts = xd, xj, ui5 +data-types = V32QI, UV32QI, UQI + +/// lasx_xvslei_hu +name = lasx_xvslei_hu +asm-fmts = xd, xj, ui5 +data-types = V16HI, UV16HI, UQI + +/// lasx_xvslei_wu +name = lasx_xvslei_wu +asm-fmts = xd, xj, ui5 +data-types = V8SI, UV8SI, UQI + +/// lasx_xvslei_du +name = lasx_xvslei_du +asm-fmts = xd, xj, ui5 +data-types = V4DI, UV4DI, UQI + +/// lasx_xvsat_b +name = lasx_xvsat_b +asm-fmts = xd, xj, ui3 +data-types = V32QI, V32QI, UQI + +/// lasx_xvsat_h +name = lasx_xvsat_h +asm-fmts = xd, xj, ui4 +data-types = V16HI, V16HI, UQI + +/// lasx_xvsat_w +name = lasx_xvsat_w +asm-fmts = xd, xj, ui5 +data-types = V8SI, V8SI, UQI + +/// lasx_xvsat_d +name = lasx_xvsat_d +asm-fmts = xd, xj, ui6 +data-types = V4DI, V4DI, UQI + +/// lasx_xvsat_bu +name = lasx_xvsat_bu +asm-fmts = xd, xj, ui3 +data-types = UV32QI, UV32QI, UQI + +/// lasx_xvsat_hu +name = lasx_xvsat_hu +asm-fmts = xd, xj, ui4 +data-types = UV16HI, UV16HI, UQI + +/// lasx_xvsat_wu +name = lasx_xvsat_wu +asm-fmts = xd, xj, ui5 +data-types = UV8SI, UV8SI, UQI + +/// lasx_xvsat_du +name = lasx_xvsat_du 
+asm-fmts = xd, xj, ui6 +data-types = UV4DI, UV4DI, UQI + +/// lasx_xvadda_b +name = lasx_xvadda_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvadda_h +name = lasx_xvadda_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvadda_w +name = lasx_xvadda_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvadda_d +name = lasx_xvadda_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvsadd_b +name = lasx_xvsadd_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvsadd_h +name = lasx_xvsadd_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvsadd_w +name = lasx_xvsadd_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvsadd_d +name = lasx_xvsadd_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvsadd_bu +name = lasx_xvsadd_bu +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvsadd_hu +name = lasx_xvsadd_hu +asm-fmts = xd, xj, xk +data-types = UV16HI, UV16HI, UV16HI + +/// lasx_xvsadd_wu +name = lasx_xvsadd_wu +asm-fmts = xd, xj, xk +data-types = UV8SI, UV8SI, UV8SI + +/// lasx_xvsadd_du +name = lasx_xvsadd_du +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI + +/// lasx_xvavg_b +name = lasx_xvavg_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvavg_h +name = lasx_xvavg_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvavg_w +name = lasx_xvavg_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvavg_d +name = lasx_xvavg_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvavg_bu +name = lasx_xvavg_bu +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvavg_hu +name = lasx_xvavg_hu +asm-fmts = xd, xj, xk +data-types = UV16HI, UV16HI, UV16HI + +/// lasx_xvavg_wu +name = lasx_xvavg_wu +asm-fmts = xd, xj, xk +data-types = UV8SI, UV8SI, UV8SI + +/// lasx_xvavg_du +name = lasx_xvavg_du +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI + +/// lasx_xvavgr_b +name = lasx_xvavgr_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvavgr_h +name = lasx_xvavgr_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvavgr_w +name = lasx_xvavgr_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvavgr_d +name = lasx_xvavgr_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvavgr_bu +name = lasx_xvavgr_bu +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvavgr_hu +name = lasx_xvavgr_hu +asm-fmts = xd, xj, xk +data-types = UV16HI, UV16HI, UV16HI + +/// lasx_xvavgr_wu +name = lasx_xvavgr_wu +asm-fmts = xd, xj, xk +data-types = UV8SI, UV8SI, UV8SI + +/// lasx_xvavgr_du +name = lasx_xvavgr_du +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI + +/// lasx_xvssub_b +name = lasx_xvssub_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvssub_h +name = lasx_xvssub_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvssub_w +name = lasx_xvssub_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvssub_d +name = lasx_xvssub_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvssub_bu +name = lasx_xvssub_bu +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvssub_hu +name = lasx_xvssub_hu +asm-fmts = xd, xj, xk +data-types = UV16HI, UV16HI, UV16HI + +/// lasx_xvssub_wu +name = lasx_xvssub_wu +asm-fmts = xd, xj, xk +data-types = UV8SI, 
UV8SI, UV8SI + +/// lasx_xvssub_du +name = lasx_xvssub_du +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI + +/// lasx_xvabsd_b +name = lasx_xvabsd_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvabsd_h +name = lasx_xvabsd_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvabsd_w +name = lasx_xvabsd_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvabsd_d +name = lasx_xvabsd_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvabsd_bu +name = lasx_xvabsd_bu +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvabsd_hu +name = lasx_xvabsd_hu +asm-fmts = xd, xj, xk +data-types = UV16HI, UV16HI, UV16HI + +/// lasx_xvabsd_wu +name = lasx_xvabsd_wu +asm-fmts = xd, xj, xk +data-types = UV8SI, UV8SI, UV8SI + +/// lasx_xvabsd_du +name = lasx_xvabsd_du +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI + +/// lasx_xvmul_b +name = lasx_xvmul_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvmul_h +name = lasx_xvmul_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvmul_w +name = lasx_xvmul_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvmul_d +name = lasx_xvmul_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvmadd_b +name = lasx_xvmadd_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI, V32QI + +/// lasx_xvmadd_h +name = lasx_xvmadd_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI, V16HI + +/// lasx_xvmadd_w +name = lasx_xvmadd_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI, V8SI + +/// lasx_xvmadd_d +name = lasx_xvmadd_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI, V4DI + +/// lasx_xvmsub_b +name = lasx_xvmsub_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI, V32QI + +/// lasx_xvmsub_h +name = lasx_xvmsub_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI, V16HI + +/// lasx_xvmsub_w +name = lasx_xvmsub_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI, V8SI + +/// lasx_xvmsub_d +name = lasx_xvmsub_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI, V4DI + +/// lasx_xvdiv_b +name = lasx_xvdiv_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvdiv_h +name = lasx_xvdiv_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvdiv_w +name = lasx_xvdiv_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvdiv_d +name = lasx_xvdiv_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvdiv_bu +name = lasx_xvdiv_bu +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvdiv_hu +name = lasx_xvdiv_hu +asm-fmts = xd, xj, xk +data-types = UV16HI, UV16HI, UV16HI + +/// lasx_xvdiv_wu +name = lasx_xvdiv_wu +asm-fmts = xd, xj, xk +data-types = UV8SI, UV8SI, UV8SI + +/// lasx_xvdiv_du +name = lasx_xvdiv_du +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI + +/// lasx_xvhaddw_h_b +name = lasx_xvhaddw_h_b +asm-fmts = xd, xj, xk +data-types = V16HI, V32QI, V32QI + +/// lasx_xvhaddw_w_h +name = lasx_xvhaddw_w_h +asm-fmts = xd, xj, xk +data-types = V8SI, V16HI, V16HI + +/// lasx_xvhaddw_d_w +name = lasx_xvhaddw_d_w +asm-fmts = xd, xj, xk +data-types = V4DI, V8SI, V8SI + +/// lasx_xvhaddw_hu_bu +name = lasx_xvhaddw_hu_bu +asm-fmts = xd, xj, xk +data-types = UV16HI, UV32QI, UV32QI + +/// lasx_xvhaddw_wu_hu +name = lasx_xvhaddw_wu_hu +asm-fmts = xd, xj, xk +data-types = UV8SI, UV16HI, UV16HI + +/// lasx_xvhaddw_du_wu +name = lasx_xvhaddw_du_wu +asm-fmts = xd, xj, 
xk +data-types = UV4DI, UV8SI, UV8SI + +/// lasx_xvhsubw_h_b +name = lasx_xvhsubw_h_b +asm-fmts = xd, xj, xk +data-types = V16HI, V32QI, V32QI + +/// lasx_xvhsubw_w_h +name = lasx_xvhsubw_w_h +asm-fmts = xd, xj, xk +data-types = V8SI, V16HI, V16HI + +/// lasx_xvhsubw_d_w +name = lasx_xvhsubw_d_w +asm-fmts = xd, xj, xk +data-types = V4DI, V8SI, V8SI + +/// lasx_xvhsubw_hu_bu +name = lasx_xvhsubw_hu_bu +asm-fmts = xd, xj, xk +data-types = V16HI, UV32QI, UV32QI + +/// lasx_xvhsubw_wu_hu +name = lasx_xvhsubw_wu_hu +asm-fmts = xd, xj, xk +data-types = V8SI, UV16HI, UV16HI + +/// lasx_xvhsubw_du_wu +name = lasx_xvhsubw_du_wu +asm-fmts = xd, xj, xk +data-types = V4DI, UV8SI, UV8SI + +/// lasx_xvmod_b +name = lasx_xvmod_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvmod_h +name = lasx_xvmod_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvmod_w +name = lasx_xvmod_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvmod_d +name = lasx_xvmod_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvmod_bu +name = lasx_xvmod_bu +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvmod_hu +name = lasx_xvmod_hu +asm-fmts = xd, xj, xk +data-types = UV16HI, UV16HI, UV16HI + +/// lasx_xvmod_wu +name = lasx_xvmod_wu +asm-fmts = xd, xj, xk +data-types = UV8SI, UV8SI, UV8SI + +/// lasx_xvmod_du +name = lasx_xvmod_du +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI + +/// lasx_xvrepl128vei_b +name = lasx_xvrepl128vei_b +asm-fmts = xd, xj, ui4 +data-types = V32QI, V32QI, UQI + +/// lasx_xvrepl128vei_h +name = lasx_xvrepl128vei_h +asm-fmts = xd, xj, ui3 +data-types = V16HI, V16HI, UQI + +/// lasx_xvrepl128vei_w +name = lasx_xvrepl128vei_w +asm-fmts = xd, xj, ui2 +data-types = V8SI, V8SI, UQI + +/// lasx_xvrepl128vei_d +name = lasx_xvrepl128vei_d +asm-fmts = xd, xj, ui1 +data-types = V4DI, V4DI, UQI + +/// lasx_xvpickev_b +name = lasx_xvpickev_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvpickev_h +name = lasx_xvpickev_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvpickev_w +name = lasx_xvpickev_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvpickev_d +name = lasx_xvpickev_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvpickod_b +name = lasx_xvpickod_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvpickod_h +name = lasx_xvpickod_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvpickod_w +name = lasx_xvpickod_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvpickod_d +name = lasx_xvpickod_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvilvh_b +name = lasx_xvilvh_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvilvh_h +name = lasx_xvilvh_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvilvh_w +name = lasx_xvilvh_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvilvh_d +name = lasx_xvilvh_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvilvl_b +name = lasx_xvilvl_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvilvl_h +name = lasx_xvilvl_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvilvl_w +name = lasx_xvilvl_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvilvl_d +name = lasx_xvilvl_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvpackev_b +name = 
lasx_xvpackev_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvpackev_h +name = lasx_xvpackev_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvpackev_w +name = lasx_xvpackev_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvpackev_d +name = lasx_xvpackev_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvpackod_b +name = lasx_xvpackod_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvpackod_h +name = lasx_xvpackod_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvpackod_w +name = lasx_xvpackod_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvpackod_d +name = lasx_xvpackod_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvshuf_b +name = lasx_xvshuf_b +asm-fmts = xd, xj, xk, xa +data-types = V32QI, V32QI, V32QI, V32QI + +/// lasx_xvshuf_h +name = lasx_xvshuf_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI, V16HI + +/// lasx_xvshuf_w +name = lasx_xvshuf_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI, V8SI + +/// lasx_xvshuf_d +name = lasx_xvshuf_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI, V4DI + +/// lasx_xvand_v +name = lasx_xvand_v +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvandi_b +name = lasx_xvandi_b +asm-fmts = xd, xj, ui8 +data-types = UV32QI, UV32QI, UQI + +/// lasx_xvor_v +name = lasx_xvor_v +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvori_b +name = lasx_xvori_b +asm-fmts = xd, xj, ui8 +data-types = UV32QI, UV32QI, UQI + +/// lasx_xvnor_v +name = lasx_xvnor_v +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvnori_b +name = lasx_xvnori_b +asm-fmts = xd, xj, ui8 +data-types = UV32QI, UV32QI, UQI + +/// lasx_xvxor_v +name = lasx_xvxor_v +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvxori_b +name = lasx_xvxori_b +asm-fmts = xd, xj, ui8 +data-types = UV32QI, UV32QI, UQI + +/// lasx_xvbitsel_v +name = lasx_xvbitsel_v +asm-fmts = xd, xj, xk, xa +data-types = UV32QI, UV32QI, UV32QI, UV32QI + +/// lasx_xvbitseli_b +name = lasx_xvbitseli_b +asm-fmts = xd, xj, ui8 +data-types = UV32QI, UV32QI, UV32QI, USI + +/// lasx_xvshuf4i_b +name = lasx_xvshuf4i_b +asm-fmts = xd, xj, ui8 +data-types = V32QI, V32QI, USI + +/// lasx_xvshuf4i_h +name = lasx_xvshuf4i_h +asm-fmts = xd, xj, ui8 +data-types = V16HI, V16HI, USI + +/// lasx_xvshuf4i_w +name = lasx_xvshuf4i_w +asm-fmts = xd, xj, ui8 +data-types = V8SI, V8SI, USI + +/// lasx_xvreplgr2vr_b +name = lasx_xvreplgr2vr_b +asm-fmts = xd, rj +data-types = V32QI, SI + +/// lasx_xvreplgr2vr_h +name = lasx_xvreplgr2vr_h +asm-fmts = xd, rj +data-types = V16HI, SI + +/// lasx_xvreplgr2vr_w +name = lasx_xvreplgr2vr_w +asm-fmts = xd, rj +data-types = V8SI, SI + +/// lasx_xvreplgr2vr_d +name = lasx_xvreplgr2vr_d +asm-fmts = xd, rj +data-types = V4DI, DI + +/// lasx_xvpcnt_b +name = lasx_xvpcnt_b +asm-fmts = xd, xj +data-types = V32QI, V32QI + +/// lasx_xvpcnt_h +name = lasx_xvpcnt_h +asm-fmts = xd, xj +data-types = V16HI, V16HI + +/// lasx_xvpcnt_w +name = lasx_xvpcnt_w +asm-fmts = xd, xj +data-types = V8SI, V8SI + +/// lasx_xvpcnt_d +name = lasx_xvpcnt_d +asm-fmts = xd, xj +data-types = V4DI, V4DI + +/// lasx_xvclo_b +name = lasx_xvclo_b +asm-fmts = xd, xj +data-types = V32QI, V32QI + +/// lasx_xvclo_h +name = lasx_xvclo_h +asm-fmts = xd, xj +data-types = V16HI, V16HI + +/// lasx_xvclo_w +name = lasx_xvclo_w +asm-fmts = xd, xj +data-types = V8SI, V8SI + 
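Note on the immediate forms in these entries: an operand kind such as ui3, ui4, ui5, ui6 or ui8 is an unsigned immediate of that bit width, si5/si8/si10/si12 are signed immediates, and the first entry of data-types is the destination (return) type while the remaining entries describe the sources in asm-fmts order. As a rough illustration of that convention, the sketch below models the si5 operand of lasx_xvmaxi_w with a const-generic parameter; the v8i32 type and the function name are placeholders for this sketch only, not the bindings the generator emits, and the real bindings would reject an out-of-range immediate at compile time rather than at run time.

// Sketch only: scalar model of `xvmaxi.w xd, xj, si5`
// (spec entry lasx_xvmaxi_w: data-types = V8SI, V8SI, QI).
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct v8i32(pub [i32; 8]); // placeholder for a 256-bit vector of 8 x i32

pub fn lasx_xvmaxi_w_sketch<const IMM_S5: i32>(xj: v8i32) -> v8i32 {
    // An si5 field holds a signed 5-bit value, i.e. -16..=15.
    assert!(IMM_S5 >= -16 && IMM_S5 <= 15, "si5 immediate out of range");
    // Element-wise signed max against the immediate, standing in for the instruction.
    v8i32(xj.0.map(|x| x.max(IMM_S5)))
}

fn main() {
    let xj = v8i32([-3, 5, 0, 7, -20, 1, 2, 3]);
    assert_eq!(lasx_xvmaxi_w_sketch::<4>(xj).0, [4, 5, 4, 7, 4, 4, 4, 4]);
}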
+/// lasx_xvclo_d +name = lasx_xvclo_d +asm-fmts = xd, xj +data-types = V4DI, V4DI + +/// lasx_xvclz_b +name = lasx_xvclz_b +asm-fmts = xd, xj +data-types = V32QI, V32QI + +/// lasx_xvclz_h +name = lasx_xvclz_h +asm-fmts = xd, xj +data-types = V16HI, V16HI + +/// lasx_xvclz_w +name = lasx_xvclz_w +asm-fmts = xd, xj +data-types = V8SI, V8SI + +/// lasx_xvclz_d +name = lasx_xvclz_d +asm-fmts = xd, xj +data-types = V4DI, V4DI + +/// lasx_xvfadd_s +name = lasx_xvfadd_s +asm-fmts = xd, xj, xk +data-types = V8SF, V8SF, V8SF + +/// lasx_xvfadd_d +name = lasx_xvfadd_d +asm-fmts = xd, xj, xk +data-types = V4DF, V4DF, V4DF + +/// lasx_xvfsub_s +name = lasx_xvfsub_s +asm-fmts = xd, xj, xk +data-types = V8SF, V8SF, V8SF + +/// lasx_xvfsub_d +name = lasx_xvfsub_d +asm-fmts = xd, xj, xk +data-types = V4DF, V4DF, V4DF + +/// lasx_xvfmul_s +name = lasx_xvfmul_s +asm-fmts = xd, xj, xk +data-types = V8SF, V8SF, V8SF + +/// lasx_xvfmul_d +name = lasx_xvfmul_d +asm-fmts = xd, xj, xk +data-types = V4DF, V4DF, V4DF + +/// lasx_xvfdiv_s +name = lasx_xvfdiv_s +asm-fmts = xd, xj, xk +data-types = V8SF, V8SF, V8SF + +/// lasx_xvfdiv_d +name = lasx_xvfdiv_d +asm-fmts = xd, xj, xk +data-types = V4DF, V4DF, V4DF + +/// lasx_xvfcvt_h_s +name = lasx_xvfcvt_h_s +asm-fmts = xd, xj, xk +data-types = V16HI, V8SF, V8SF + +/// lasx_xvfcvt_s_d +name = lasx_xvfcvt_s_d +asm-fmts = xd, xj, xk +data-types = V8SF, V4DF, V4DF + +/// lasx_xvfmin_s +name = lasx_xvfmin_s +asm-fmts = xd, xj, xk +data-types = V8SF, V8SF, V8SF + +/// lasx_xvfmin_d +name = lasx_xvfmin_d +asm-fmts = xd, xj, xk +data-types = V4DF, V4DF, V4DF + +/// lasx_xvfmina_s +name = lasx_xvfmina_s +asm-fmts = xd, xj, xk +data-types = V8SF, V8SF, V8SF + +/// lasx_xvfmina_d +name = lasx_xvfmina_d +asm-fmts = xd, xj, xk +data-types = V4DF, V4DF, V4DF + +/// lasx_xvfmax_s +name = lasx_xvfmax_s +asm-fmts = xd, xj, xk +data-types = V8SF, V8SF, V8SF + +/// lasx_xvfmax_d +name = lasx_xvfmax_d +asm-fmts = xd, xj, xk +data-types = V4DF, V4DF, V4DF + +/// lasx_xvfmaxa_s +name = lasx_xvfmaxa_s +asm-fmts = xd, xj, xk +data-types = V8SF, V8SF, V8SF + +/// lasx_xvfmaxa_d +name = lasx_xvfmaxa_d +asm-fmts = xd, xj, xk +data-types = V4DF, V4DF, V4DF + +/// lasx_xvfclass_s +name = lasx_xvfclass_s +asm-fmts = xd, xj +data-types = V8SI, V8SF + +/// lasx_xvfclass_d +name = lasx_xvfclass_d +asm-fmts = xd, xj +data-types = V4DI, V4DF + +/// lasx_xvfsqrt_s +name = lasx_xvfsqrt_s +asm-fmts = xd, xj +data-types = V8SF, V8SF + +/// lasx_xvfsqrt_d +name = lasx_xvfsqrt_d +asm-fmts = xd, xj +data-types = V4DF, V4DF + +/// lasx_xvfrecip_s +name = lasx_xvfrecip_s +asm-fmts = xd, xj +data-types = V8SF, V8SF + +/// lasx_xvfrecip_d +name = lasx_xvfrecip_d +asm-fmts = xd, xj +data-types = V4DF, V4DF + +/// lasx_xvfrecipe_s +name = lasx_xvfrecipe_s +asm-fmts = xd, xj +data-types = V8SF, V8SF + +/// lasx_xvfrecipe_d +name = lasx_xvfrecipe_d +asm-fmts = xd, xj +data-types = V4DF, V4DF + +/// lasx_xvfrsqrte_s +name = lasx_xvfrsqrte_s +asm-fmts = xd, xj +data-types = V8SF, V8SF + +/// lasx_xvfrsqrte_d +name = lasx_xvfrsqrte_d +asm-fmts = xd, xj +data-types = V4DF, V4DF + +/// lasx_xvfrint_s +name = lasx_xvfrint_s +asm-fmts = xd, xj +data-types = V8SF, V8SF + +/// lasx_xvfrint_d +name = lasx_xvfrint_d +asm-fmts = xd, xj +data-types = V4DF, V4DF + +/// lasx_xvfrsqrt_s +name = lasx_xvfrsqrt_s +asm-fmts = xd, xj +data-types = V8SF, V8SF + +/// lasx_xvfrsqrt_d +name = lasx_xvfrsqrt_d +asm-fmts = xd, xj +data-types = V4DF, V4DF + +/// lasx_xvflogb_s +name = lasx_xvflogb_s +asm-fmts = xd, xj +data-types = V8SF, V8SF + 
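Entries whose data-types list one more type than asm-fmts has operands (for example the xvmadd_*, xvmsub_*, xvfrstp_*, xvextrins_*, xvbitseli_b and xvshuf_h/w/d entries, and the xvmaddwev_*/xvmaddwod_* families further down) are the forms where xd is read as well as written: the leading type is the returned xd and the second type is the same xd consumed on input. Below is a minimal scalar model of that reading for lasx_xvmadd_w, assuming the usual per-lane xd + xj * xk multiply-accumulate; the v8i32 type and the function name are placeholders, not the generated API.

// Sketch only: accumulator convention of `xvmadd.w xd, xj, xk`
// (spec entry lasx_xvmadd_w: data-types = V8SI, V8SI, V8SI, V8SI).
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct v8i32(pub [i32; 8]); // placeholder for a 256-bit vector of 8 x i32

// The first data-type is the returned xd; the second is the same xd consumed
// as the accumulator, followed by the xj and xk sources.
pub fn lasx_xvmadd_w_model(xd: v8i32, xj: v8i32, xk: v8i32) -> v8i32 {
    let mut out = [0i32; 8];
    for i in 0..8 {
        out[i] = xd.0[i].wrapping_add(xj.0[i].wrapping_mul(xk.0[i]));
    }
    v8i32(out)
}

fn main() {
    let acc = v8i32([1; 8]);
    assert_eq!(
        lasx_xvmadd_w_model(acc, v8i32([2; 8]), v8i32([3; 8])),
        v8i32([7; 8])
    );
}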
+/// lasx_xvflogb_d +name = lasx_xvflogb_d +asm-fmts = xd, xj +data-types = V4DF, V4DF + +/// lasx_xvfcvth_s_h +name = lasx_xvfcvth_s_h +asm-fmts = xd, xj +data-types = V8SF, V16HI + +/// lasx_xvfcvth_d_s +name = lasx_xvfcvth_d_s +asm-fmts = xd, xj +data-types = V4DF, V8SF + +/// lasx_xvfcvtl_s_h +name = lasx_xvfcvtl_s_h +asm-fmts = xd, xj +data-types = V8SF, V16HI + +/// lasx_xvfcvtl_d_s +name = lasx_xvfcvtl_d_s +asm-fmts = xd, xj +data-types = V4DF, V8SF + +/// lasx_xvftint_w_s +name = lasx_xvftint_w_s +asm-fmts = xd, xj +data-types = V8SI, V8SF + +/// lasx_xvftint_l_d +name = lasx_xvftint_l_d +asm-fmts = xd, xj +data-types = V4DI, V4DF + +/// lasx_xvftint_wu_s +name = lasx_xvftint_wu_s +asm-fmts = xd, xj +data-types = UV8SI, V8SF + +/// lasx_xvftint_lu_d +name = lasx_xvftint_lu_d +asm-fmts = xd, xj +data-types = UV4DI, V4DF + +/// lasx_xvftintrz_w_s +name = lasx_xvftintrz_w_s +asm-fmts = xd, xj +data-types = V8SI, V8SF + +/// lasx_xvftintrz_l_d +name = lasx_xvftintrz_l_d +asm-fmts = xd, xj +data-types = V4DI, V4DF + +/// lasx_xvftintrz_wu_s +name = lasx_xvftintrz_wu_s +asm-fmts = xd, xj +data-types = UV8SI, V8SF + +/// lasx_xvftintrz_lu_d +name = lasx_xvftintrz_lu_d +asm-fmts = xd, xj +data-types = UV4DI, V4DF + +/// lasx_xvffint_s_w +name = lasx_xvffint_s_w +asm-fmts = xd, xj +data-types = V8SF, V8SI + +/// lasx_xvffint_d_l +name = lasx_xvffint_d_l +asm-fmts = xd, xj +data-types = V4DF, V4DI + +/// lasx_xvffint_s_wu +name = lasx_xvffint_s_wu +asm-fmts = xd, xj +data-types = V8SF, UV8SI + +/// lasx_xvffint_d_lu +name = lasx_xvffint_d_lu +asm-fmts = xd, xj +data-types = V4DF, UV4DI + +/// lasx_xvreplve_b +name = lasx_xvreplve_b +asm-fmts = xd, xj, rk +data-types = V32QI, V32QI, SI + +/// lasx_xvreplve_h +name = lasx_xvreplve_h +asm-fmts = xd, xj, rk +data-types = V16HI, V16HI, SI + +/// lasx_xvreplve_w +name = lasx_xvreplve_w +asm-fmts = xd, xj, rk +data-types = V8SI, V8SI, SI + +/// lasx_xvreplve_d +name = lasx_xvreplve_d +asm-fmts = xd, xj, rk +data-types = V4DI, V4DI, SI + +/// lasx_xvpermi_w +name = lasx_xvpermi_w +asm-fmts = xd, xj, ui8 +data-types = V8SI, V8SI, V8SI, USI + +/// lasx_xvandn_v +name = lasx_xvandn_v +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvneg_b +name = lasx_xvneg_b +asm-fmts = xd, xj +data-types = V32QI, V32QI + +/// lasx_xvneg_h +name = lasx_xvneg_h +asm-fmts = xd, xj +data-types = V16HI, V16HI + +/// lasx_xvneg_w +name = lasx_xvneg_w +asm-fmts = xd, xj +data-types = V8SI, V8SI + +/// lasx_xvneg_d +name = lasx_xvneg_d +asm-fmts = xd, xj +data-types = V4DI, V4DI + +/// lasx_xvmuh_b +name = lasx_xvmuh_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvmuh_h +name = lasx_xvmuh_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvmuh_w +name = lasx_xvmuh_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvmuh_d +name = lasx_xvmuh_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvmuh_bu +name = lasx_xvmuh_bu +asm-fmts = xd, xj, xk +data-types = UV32QI, UV32QI, UV32QI + +/// lasx_xvmuh_hu +name = lasx_xvmuh_hu +asm-fmts = xd, xj, xk +data-types = UV16HI, UV16HI, UV16HI + +/// lasx_xvmuh_wu +name = lasx_xvmuh_wu +asm-fmts = xd, xj, xk +data-types = UV8SI, UV8SI, UV8SI + +/// lasx_xvmuh_du +name = lasx_xvmuh_du +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI + +/// lasx_xvsllwil_h_b +name = lasx_xvsllwil_h_b +asm-fmts = xd, xj, ui3 +data-types = V16HI, V32QI, UQI + +/// lasx_xvsllwil_w_h +name = lasx_xvsllwil_w_h +asm-fmts = xd, xj, ui4 +data-types = 
V8SI, V16HI, UQI + +/// lasx_xvsllwil_d_w +name = lasx_xvsllwil_d_w +asm-fmts = xd, xj, ui5 +data-types = V4DI, V8SI, UQI + +/// lasx_xvsllwil_hu_bu +name = lasx_xvsllwil_hu_bu +asm-fmts = xd, xj, ui3 +data-types = UV16HI, UV32QI, UQI + +/// lasx_xvsllwil_wu_hu +name = lasx_xvsllwil_wu_hu +asm-fmts = xd, xj, ui4 +data-types = UV8SI, UV16HI, UQI + +/// lasx_xvsllwil_du_wu +name = lasx_xvsllwil_du_wu +asm-fmts = xd, xj, ui5 +data-types = UV4DI, UV8SI, UQI + +/// lasx_xvsran_b_h +name = lasx_xvsran_b_h +asm-fmts = xd, xj, xk +data-types = V32QI, V16HI, V16HI + +/// lasx_xvsran_h_w +name = lasx_xvsran_h_w +asm-fmts = xd, xj, xk +data-types = V16HI, V8SI, V8SI + +/// lasx_xvsran_w_d +name = lasx_xvsran_w_d +asm-fmts = xd, xj, xk +data-types = V8SI, V4DI, V4DI + +/// lasx_xvssran_b_h +name = lasx_xvssran_b_h +asm-fmts = xd, xj, xk +data-types = V32QI, V16HI, V16HI + +/// lasx_xvssran_h_w +name = lasx_xvssran_h_w +asm-fmts = xd, xj, xk +data-types = V16HI, V8SI, V8SI + +/// lasx_xvssran_w_d +name = lasx_xvssran_w_d +asm-fmts = xd, xj, xk +data-types = V8SI, V4DI, V4DI + +/// lasx_xvssran_bu_h +name = lasx_xvssran_bu_h +asm-fmts = xd, xj, xk +data-types = UV32QI, UV16HI, UV16HI + +/// lasx_xvssran_hu_w +name = lasx_xvssran_hu_w +asm-fmts = xd, xj, xk +data-types = UV16HI, UV8SI, UV8SI + +/// lasx_xvssran_wu_d +name = lasx_xvssran_wu_d +asm-fmts = xd, xj, xk +data-types = UV8SI, UV4DI, UV4DI + +/// lasx_xvsrarn_b_h +name = lasx_xvsrarn_b_h +asm-fmts = xd, xj, xk +data-types = V32QI, V16HI, V16HI + +/// lasx_xvsrarn_h_w +name = lasx_xvsrarn_h_w +asm-fmts = xd, xj, xk +data-types = V16HI, V8SI, V8SI + +/// lasx_xvsrarn_w_d +name = lasx_xvsrarn_w_d +asm-fmts = xd, xj, xk +data-types = V8SI, V4DI, V4DI + +/// lasx_xvssrarn_b_h +name = lasx_xvssrarn_b_h +asm-fmts = xd, xj, xk +data-types = V32QI, V16HI, V16HI + +/// lasx_xvssrarn_h_w +name = lasx_xvssrarn_h_w +asm-fmts = xd, xj, xk +data-types = V16HI, V8SI, V8SI + +/// lasx_xvssrarn_w_d +name = lasx_xvssrarn_w_d +asm-fmts = xd, xj, xk +data-types = V8SI, V4DI, V4DI + +/// lasx_xvssrarn_bu_h +name = lasx_xvssrarn_bu_h +asm-fmts = xd, xj, xk +data-types = UV32QI, UV16HI, UV16HI + +/// lasx_xvssrarn_hu_w +name = lasx_xvssrarn_hu_w +asm-fmts = xd, xj, xk +data-types = UV16HI, UV8SI, UV8SI + +/// lasx_xvssrarn_wu_d +name = lasx_xvssrarn_wu_d +asm-fmts = xd, xj, xk +data-types = UV8SI, UV4DI, UV4DI + +/// lasx_xvsrln_b_h +name = lasx_xvsrln_b_h +asm-fmts = xd, xj, xk +data-types = V32QI, V16HI, V16HI + +/// lasx_xvsrln_h_w +name = lasx_xvsrln_h_w +asm-fmts = xd, xj, xk +data-types = V16HI, V8SI, V8SI + +/// lasx_xvsrln_w_d +name = lasx_xvsrln_w_d +asm-fmts = xd, xj, xk +data-types = V8SI, V4DI, V4DI + +/// lasx_xvssrln_bu_h +name = lasx_xvssrln_bu_h +asm-fmts = xd, xj, xk +data-types = UV32QI, UV16HI, UV16HI + +/// lasx_xvssrln_hu_w +name = lasx_xvssrln_hu_w +asm-fmts = xd, xj, xk +data-types = UV16HI, UV8SI, UV8SI + +/// lasx_xvssrln_wu_d +name = lasx_xvssrln_wu_d +asm-fmts = xd, xj, xk +data-types = UV8SI, UV4DI, UV4DI + +/// lasx_xvsrlrn_b_h +name = lasx_xvsrlrn_b_h +asm-fmts = xd, xj, xk +data-types = V32QI, V16HI, V16HI + +/// lasx_xvsrlrn_h_w +name = lasx_xvsrlrn_h_w +asm-fmts = xd, xj, xk +data-types = V16HI, V8SI, V8SI + +/// lasx_xvsrlrn_w_d +name = lasx_xvsrlrn_w_d +asm-fmts = xd, xj, xk +data-types = V8SI, V4DI, V4DI + +/// lasx_xvssrlrn_bu_h +name = lasx_xvssrlrn_bu_h +asm-fmts = xd, xj, xk +data-types = UV32QI, UV16HI, UV16HI + +/// lasx_xvssrlrn_hu_w +name = lasx_xvssrlrn_hu_w +asm-fmts = xd, xj, xk +data-types = UV16HI, UV8SI, UV8SI + +/// 
lasx_xvssrlrn_wu_d +name = lasx_xvssrlrn_wu_d +asm-fmts = xd, xj, xk +data-types = UV8SI, UV4DI, UV4DI + +/// lasx_xvfrstpi_b +name = lasx_xvfrstpi_b +asm-fmts = xd, xj, ui5 +data-types = V32QI, V32QI, V32QI, UQI + +/// lasx_xvfrstpi_h +name = lasx_xvfrstpi_h +asm-fmts = xd, xj, ui5 +data-types = V16HI, V16HI, V16HI, UQI + +/// lasx_xvfrstp_b +name = lasx_xvfrstp_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI, V32QI + +/// lasx_xvfrstp_h +name = lasx_xvfrstp_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI, V16HI + +/// lasx_xvshuf4i_d +name = lasx_xvshuf4i_d +asm-fmts = xd, xj, ui8 +data-types = V4DI, V4DI, V4DI, USI + +/// lasx_xvbsrl_v +name = lasx_xvbsrl_v +asm-fmts = xd, xj, ui5 +data-types = V32QI, V32QI, UQI + +/// lasx_xvbsll_v +name = lasx_xvbsll_v +asm-fmts = xd, xj, ui5 +data-types = V32QI, V32QI, UQI + +/// lasx_xvextrins_b +name = lasx_xvextrins_b +asm-fmts = xd, xj, ui8 +data-types = V32QI, V32QI, V32QI, USI + +/// lasx_xvextrins_h +name = lasx_xvextrins_h +asm-fmts = xd, xj, ui8 +data-types = V16HI, V16HI, V16HI, USI + +/// lasx_xvextrins_w +name = lasx_xvextrins_w +asm-fmts = xd, xj, ui8 +data-types = V8SI, V8SI, V8SI, USI + +/// lasx_xvextrins_d +name = lasx_xvextrins_d +asm-fmts = xd, xj, ui8 +data-types = V4DI, V4DI, V4DI, USI + +/// lasx_xvmskltz_b +name = lasx_xvmskltz_b +asm-fmts = xd, xj +data-types = V32QI, V32QI + +/// lasx_xvmskltz_h +name = lasx_xvmskltz_h +asm-fmts = xd, xj +data-types = V16HI, V16HI + +/// lasx_xvmskltz_w +name = lasx_xvmskltz_w +asm-fmts = xd, xj +data-types = V8SI, V8SI + +/// lasx_xvmskltz_d +name = lasx_xvmskltz_d +asm-fmts = xd, xj +data-types = V4DI, V4DI + +/// lasx_xvsigncov_b +name = lasx_xvsigncov_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvsigncov_h +name = lasx_xvsigncov_h +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V16HI + +/// lasx_xvsigncov_w +name = lasx_xvsigncov_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvsigncov_d +name = lasx_xvsigncov_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvfmadd_s +name = lasx_xvfmadd_s +asm-fmts = xd, xj, xk, xa +data-types = V8SF, V8SF, V8SF, V8SF + +/// lasx_xvfmadd_d +name = lasx_xvfmadd_d +asm-fmts = xd, xj, xk, xa +data-types = V4DF, V4DF, V4DF, V4DF + +/// lasx_xvfmsub_s +name = lasx_xvfmsub_s +asm-fmts = xd, xj, xk, xa +data-types = V8SF, V8SF, V8SF, V8SF + +/// lasx_xvfmsub_d +name = lasx_xvfmsub_d +asm-fmts = xd, xj, xk, xa +data-types = V4DF, V4DF, V4DF, V4DF + +/// lasx_xvfnmadd_s +name = lasx_xvfnmadd_s +asm-fmts = xd, xj, xk, xa +data-types = V8SF, V8SF, V8SF, V8SF + +/// lasx_xvfnmadd_d +name = lasx_xvfnmadd_d +asm-fmts = xd, xj, xk, xa +data-types = V4DF, V4DF, V4DF, V4DF + +/// lasx_xvfnmsub_s +name = lasx_xvfnmsub_s +asm-fmts = xd, xj, xk, xa +data-types = V8SF, V8SF, V8SF, V8SF + +/// lasx_xvfnmsub_d +name = lasx_xvfnmsub_d +asm-fmts = xd, xj, xk, xa +data-types = V4DF, V4DF, V4DF, V4DF + +/// lasx_xvftintrne_w_s +name = lasx_xvftintrne_w_s +asm-fmts = xd, xj +data-types = V8SI, V8SF + +/// lasx_xvftintrne_l_d +name = lasx_xvftintrne_l_d +asm-fmts = xd, xj +data-types = V4DI, V4DF + +/// lasx_xvftintrp_w_s +name = lasx_xvftintrp_w_s +asm-fmts = xd, xj +data-types = V8SI, V8SF + +/// lasx_xvftintrp_l_d +name = lasx_xvftintrp_l_d +asm-fmts = xd, xj +data-types = V4DI, V4DF + +/// lasx_xvftintrm_w_s +name = lasx_xvftintrm_w_s +asm-fmts = xd, xj +data-types = V8SI, V8SF + +/// lasx_xvftintrm_l_d +name = lasx_xvftintrm_l_d +asm-fmts = xd, xj +data-types = V4DI, V4DF + +/// 
lasx_xvftint_w_d +name = lasx_xvftint_w_d +asm-fmts = xd, xj, xk +data-types = V8SI, V4DF, V4DF + +/// lasx_xvffint_s_l +name = lasx_xvffint_s_l +asm-fmts = xd, xj, xk +data-types = V8SF, V4DI, V4DI + +/// lasx_xvftintrz_w_d +name = lasx_xvftintrz_w_d +asm-fmts = xd, xj, xk +data-types = V8SI, V4DF, V4DF + +/// lasx_xvftintrp_w_d +name = lasx_xvftintrp_w_d +asm-fmts = xd, xj, xk +data-types = V8SI, V4DF, V4DF + +/// lasx_xvftintrm_w_d +name = lasx_xvftintrm_w_d +asm-fmts = xd, xj, xk +data-types = V8SI, V4DF, V4DF + +/// lasx_xvftintrne_w_d +name = lasx_xvftintrne_w_d +asm-fmts = xd, xj, xk +data-types = V8SI, V4DF, V4DF + +/// lasx_xvftinth_l_s +name = lasx_xvftinth_l_s +asm-fmts = xd, xj +data-types = V4DI, V8SF + +/// lasx_xvftintl_l_s +name = lasx_xvftintl_l_s +asm-fmts = xd, xj +data-types = V4DI, V8SF + +/// lasx_xvffinth_d_w +name = lasx_xvffinth_d_w +asm-fmts = xd, xj +data-types = V4DF, V8SI + +/// lasx_xvffintl_d_w +name = lasx_xvffintl_d_w +asm-fmts = xd, xj +data-types = V4DF, V8SI + +/// lasx_xvftintrzh_l_s +name = lasx_xvftintrzh_l_s +asm-fmts = xd, xj +data-types = V4DI, V8SF + +/// lasx_xvftintrzl_l_s +name = lasx_xvftintrzl_l_s +asm-fmts = xd, xj +data-types = V4DI, V8SF + +/// lasx_xvftintrph_l_s +name = lasx_xvftintrph_l_s +asm-fmts = xd, xj +data-types = V4DI, V8SF + +/// lasx_xvftintrpl_l_s +name = lasx_xvftintrpl_l_s +asm-fmts = xd, xj +data-types = V4DI, V8SF + +/// lasx_xvftintrmh_l_s +name = lasx_xvftintrmh_l_s +asm-fmts = xd, xj +data-types = V4DI, V8SF + +/// lasx_xvftintrml_l_s +name = lasx_xvftintrml_l_s +asm-fmts = xd, xj +data-types = V4DI, V8SF + +/// lasx_xvftintrneh_l_s +name = lasx_xvftintrneh_l_s +asm-fmts = xd, xj +data-types = V4DI, V8SF + +/// lasx_xvftintrnel_l_s +name = lasx_xvftintrnel_l_s +asm-fmts = xd, xj +data-types = V4DI, V8SF + +/// lasx_xvfrintrne_s +name = lasx_xvfrintrne_s +asm-fmts = xd, xj +data-types = V8SF, V8SF + +/// lasx_xvfrintrne_d +name = lasx_xvfrintrne_d +asm-fmts = xd, xj +data-types = V4DF, V4DF + +/// lasx_xvfrintrz_s +name = lasx_xvfrintrz_s +asm-fmts = xd, xj +data-types = V8SF, V8SF + +/// lasx_xvfrintrz_d +name = lasx_xvfrintrz_d +asm-fmts = xd, xj +data-types = V4DF, V4DF + +/// lasx_xvfrintrp_s +name = lasx_xvfrintrp_s +asm-fmts = xd, xj +data-types = V8SF, V8SF + +/// lasx_xvfrintrp_d +name = lasx_xvfrintrp_d +asm-fmts = xd, xj +data-types = V4DF, V4DF + +/// lasx_xvfrintrm_s +name = lasx_xvfrintrm_s +asm-fmts = xd, xj +data-types = V8SF, V8SF + +/// lasx_xvfrintrm_d +name = lasx_xvfrintrm_d +asm-fmts = xd, xj +data-types = V4DF, V4DF + +/// lasx_xvld +name = lasx_xvld +asm-fmts = xd, rj, si12 +data-types = V32QI, CVPOINTER, SI + +/// lasx_xvst +name = lasx_xvst +asm-fmts = xd, rj, si12 +data-types = VOID, V32QI, CVPOINTER, SI + +/// lasx_xvstelm_b +name = lasx_xvstelm_b +asm-fmts = xd, rj, si8, idx +data-types = VOID, V32QI, CVPOINTER, SI, UQI + +/// lasx_xvstelm_h +name = lasx_xvstelm_h +asm-fmts = xd, rj, si8, idx +data-types = VOID, V16HI, CVPOINTER, SI, UQI + +/// lasx_xvstelm_w +name = lasx_xvstelm_w +asm-fmts = xd, rj, si8, idx +data-types = VOID, V8SI, CVPOINTER, SI, UQI + +/// lasx_xvstelm_d +name = lasx_xvstelm_d +asm-fmts = xd, rj, si8, idx +data-types = VOID, V4DI, CVPOINTER, SI, UQI + +/// lasx_xvinsve0_w +name = lasx_xvinsve0_w +asm-fmts = xd, xj, ui3 +data-types = V8SI, V8SI, V8SI, UQI + +/// lasx_xvinsve0_d +name = lasx_xvinsve0_d +asm-fmts = xd, xj, ui2 +data-types = V4DI, V4DI, V4DI, UQI + +/// lasx_xvpickve_w +name = lasx_xvpickve_w +asm-fmts = xd, xj, ui3 +data-types = V8SI, V8SI, UQI + +/// 
lasx_xvpickve_d +name = lasx_xvpickve_d +asm-fmts = xd, xj, ui2 +data-types = V4DI, V4DI, UQI + +/// lasx_xvssrlrn_b_h +name = lasx_xvssrlrn_b_h +asm-fmts = xd, xj, xk +data-types = V32QI, V16HI, V16HI + +/// lasx_xvssrlrn_h_w +name = lasx_xvssrlrn_h_w +asm-fmts = xd, xj, xk +data-types = V16HI, V8SI, V8SI + +/// lasx_xvssrlrn_w_d +name = lasx_xvssrlrn_w_d +asm-fmts = xd, xj, xk +data-types = V8SI, V4DI, V4DI + +/// lasx_xvssrln_b_h +name = lasx_xvssrln_b_h +asm-fmts = xd, xj, xk +data-types = V32QI, V16HI, V16HI + +/// lasx_xvssrln_h_w +name = lasx_xvssrln_h_w +asm-fmts = xd, xj, xk +data-types = V16HI, V8SI, V8SI + +/// lasx_xvssrln_w_d +name = lasx_xvssrln_w_d +asm-fmts = xd, xj, xk +data-types = V8SI, V4DI, V4DI + +/// lasx_xvorn_v +name = lasx_xvorn_v +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvldi +name = lasx_xvldi +asm-fmts = xd, i13 +data-types = V4DI, HI + +/// lasx_xvldx +name = lasx_xvldx +asm-fmts = xd, rj, rk +data-types = V32QI, CVPOINTER, DI + +/// lasx_xvstx +name = lasx_xvstx +asm-fmts = xd, rj, rk +data-types = VOID, V32QI, CVPOINTER, DI + +/// lasx_xvextl_qu_du +name = lasx_xvextl_qu_du +asm-fmts = xd, xj +data-types = UV4DI, UV4DI + +/// lasx_xvinsgr2vr_w +name = lasx_xvinsgr2vr_w +asm-fmts = xd, rj, ui3 +data-types = V8SI, V8SI, SI, UQI + +/// lasx_xvinsgr2vr_d +name = lasx_xvinsgr2vr_d +asm-fmts = xd, rj, ui2 +data-types = V4DI, V4DI, DI, UQI + +/// lasx_xvreplve0_b +name = lasx_xvreplve0_b +asm-fmts = xd, xj +data-types = V32QI, V32QI + +/// lasx_xvreplve0_h +name = lasx_xvreplve0_h +asm-fmts = xd, xj +data-types = V16HI, V16HI + +/// lasx_xvreplve0_w +name = lasx_xvreplve0_w +asm-fmts = xd, xj +data-types = V8SI, V8SI + +/// lasx_xvreplve0_d +name = lasx_xvreplve0_d +asm-fmts = xd, xj +data-types = V4DI, V4DI + +/// lasx_xvreplve0_q +name = lasx_xvreplve0_q +asm-fmts = xd, xj +data-types = V32QI, V32QI + +/// lasx_vext2xv_h_b +name = lasx_vext2xv_h_b +asm-fmts = xd, xj +data-types = V16HI, V32QI + +/// lasx_vext2xv_w_h +name = lasx_vext2xv_w_h +asm-fmts = xd, xj +data-types = V8SI, V16HI + +/// lasx_vext2xv_d_w +name = lasx_vext2xv_d_w +asm-fmts = xd, xj +data-types = V4DI, V8SI + +/// lasx_vext2xv_w_b +name = lasx_vext2xv_w_b +asm-fmts = xd, xj +data-types = V8SI, V32QI + +/// lasx_vext2xv_d_h +name = lasx_vext2xv_d_h +asm-fmts = xd, xj +data-types = V4DI, V16HI + +/// lasx_vext2xv_d_b +name = lasx_vext2xv_d_b +asm-fmts = xd, xj +data-types = V4DI, V32QI + +/// lasx_vext2xv_hu_bu +name = lasx_vext2xv_hu_bu +asm-fmts = xd, xj +data-types = V16HI, V32QI + +/// lasx_vext2xv_wu_hu +name = lasx_vext2xv_wu_hu +asm-fmts = xd, xj +data-types = V8SI, V16HI + +/// lasx_vext2xv_du_wu +name = lasx_vext2xv_du_wu +asm-fmts = xd, xj +data-types = V4DI, V8SI + +/// lasx_vext2xv_wu_bu +name = lasx_vext2xv_wu_bu +asm-fmts = xd, xj +data-types = V8SI, V32QI + +/// lasx_vext2xv_du_hu +name = lasx_vext2xv_du_hu +asm-fmts = xd, xj +data-types = V4DI, V16HI + +/// lasx_vext2xv_du_bu +name = lasx_vext2xv_du_bu +asm-fmts = xd, xj +data-types = V4DI, V32QI + +/// lasx_xvpermi_q +name = lasx_xvpermi_q +asm-fmts = xd, xj, ui8 +data-types = V32QI, V32QI, V32QI, USI + +/// lasx_xvpermi_d +name = lasx_xvpermi_d +asm-fmts = xd, xj, ui8 +data-types = V4DI, V4DI, USI + +/// lasx_xvperm_w +name = lasx_xvperm_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvldrepl_b +name = lasx_xvldrepl_b +asm-fmts = xd, rj, si12 +data-types = V32QI, CVPOINTER, SI + +/// lasx_xvldrepl_h +name = lasx_xvldrepl_h +asm-fmts = xd, rj, si11 +data-types = V16HI, CVPOINTER, 
SI + +/// lasx_xvldrepl_w +name = lasx_xvldrepl_w +asm-fmts = xd, rj, si10 +data-types = V8SI, CVPOINTER, SI + +/// lasx_xvldrepl_d +name = lasx_xvldrepl_d +asm-fmts = xd, rj, si9 +data-types = V4DI, CVPOINTER, SI + +/// lasx_xvpickve2gr_w +name = lasx_xvpickve2gr_w +asm-fmts = rd, xj, ui3 +data-types = SI, V8SI, UQI + +/// lasx_xvpickve2gr_wu +name = lasx_xvpickve2gr_wu +asm-fmts = rd, xj, ui3 +data-types = USI, V8SI, UQI + +/// lasx_xvpickve2gr_d +name = lasx_xvpickve2gr_d +asm-fmts = rd, xj, ui2 +data-types = DI, V4DI, UQI + +/// lasx_xvpickve2gr_du +name = lasx_xvpickve2gr_du +asm-fmts = rd, xj, ui2 +data-types = UDI, V4DI, UQI + +/// lasx_xvaddwev_q_d +name = lasx_xvaddwev_q_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvaddwev_d_w +name = lasx_xvaddwev_d_w +asm-fmts = xd, xj, xk +data-types = V4DI, V8SI, V8SI + +/// lasx_xvaddwev_w_h +name = lasx_xvaddwev_w_h +asm-fmts = xd, xj, xk +data-types = V8SI, V16HI, V16HI + +/// lasx_xvaddwev_h_b +name = lasx_xvaddwev_h_b +asm-fmts = xd, xj, xk +data-types = V16HI, V32QI, V32QI + +/// lasx_xvaddwev_q_du +name = lasx_xvaddwev_q_du +asm-fmts = xd, xj, xk +data-types = V4DI, UV4DI, UV4DI + +/// lasx_xvaddwev_d_wu +name = lasx_xvaddwev_d_wu +asm-fmts = xd, xj, xk +data-types = V4DI, UV8SI, UV8SI + +/// lasx_xvaddwev_w_hu +name = lasx_xvaddwev_w_hu +asm-fmts = xd, xj, xk +data-types = V8SI, UV16HI, UV16HI + +/// lasx_xvaddwev_h_bu +name = lasx_xvaddwev_h_bu +asm-fmts = xd, xj, xk +data-types = V16HI, UV32QI, UV32QI + +/// lasx_xvsubwev_q_d +name = lasx_xvsubwev_q_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvsubwev_d_w +name = lasx_xvsubwev_d_w +asm-fmts = xd, xj, xk +data-types = V4DI, V8SI, V8SI + +/// lasx_xvsubwev_w_h +name = lasx_xvsubwev_w_h +asm-fmts = xd, xj, xk +data-types = V8SI, V16HI, V16HI + +/// lasx_xvsubwev_h_b +name = lasx_xvsubwev_h_b +asm-fmts = xd, xj, xk +data-types = V16HI, V32QI, V32QI + +/// lasx_xvsubwev_q_du +name = lasx_xvsubwev_q_du +asm-fmts = xd, xj, xk +data-types = V4DI, UV4DI, UV4DI + +/// lasx_xvsubwev_d_wu +name = lasx_xvsubwev_d_wu +asm-fmts = xd, xj, xk +data-types = V4DI, UV8SI, UV8SI + +/// lasx_xvsubwev_w_hu +name = lasx_xvsubwev_w_hu +asm-fmts = xd, xj, xk +data-types = V8SI, UV16HI, UV16HI + +/// lasx_xvsubwev_h_bu +name = lasx_xvsubwev_h_bu +asm-fmts = xd, xj, xk +data-types = V16HI, UV32QI, UV32QI + +/// lasx_xvmulwev_q_d +name = lasx_xvmulwev_q_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvmulwev_d_w +name = lasx_xvmulwev_d_w +asm-fmts = xd, xj, xk +data-types = V4DI, V8SI, V8SI + +/// lasx_xvmulwev_w_h +name = lasx_xvmulwev_w_h +asm-fmts = xd, xj, xk +data-types = V8SI, V16HI, V16HI + +/// lasx_xvmulwev_h_b +name = lasx_xvmulwev_h_b +asm-fmts = xd, xj, xk +data-types = V16HI, V32QI, V32QI + +/// lasx_xvmulwev_q_du +name = lasx_xvmulwev_q_du +asm-fmts = xd, xj, xk +data-types = V4DI, UV4DI, UV4DI + +/// lasx_xvmulwev_d_wu +name = lasx_xvmulwev_d_wu +asm-fmts = xd, xj, xk +data-types = V4DI, UV8SI, UV8SI + +/// lasx_xvmulwev_w_hu +name = lasx_xvmulwev_w_hu +asm-fmts = xd, xj, xk +data-types = V8SI, UV16HI, UV16HI + +/// lasx_xvmulwev_h_bu +name = lasx_xvmulwev_h_bu +asm-fmts = xd, xj, xk +data-types = V16HI, UV32QI, UV32QI + +/// lasx_xvaddwod_q_d +name = lasx_xvaddwod_q_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvaddwod_d_w +name = lasx_xvaddwod_d_w +asm-fmts = xd, xj, xk +data-types = V4DI, V8SI, V8SI + +/// lasx_xvaddwod_w_h +name = lasx_xvaddwod_w_h +asm-fmts = xd, xj, xk +data-types = V8SI, V16HI, V16HI 
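The wev/wod suffixes in the widening entries in this part of the list stand for operations on the even- and odd-indexed lanes respectively, which is why the destination type is twice as wide as the sources (V4DI produced from two V8SI inputs), and the mixed _du_d, _wu_w, _hu_h and _bu_b forms pair an unsigned first source with a signed second source. A scalar sketch of the even-lane case follows, using placeholder vector types rather than the generated ones and assuming the conventional sign-extend-then-add semantics.

// Sketch only: widening-even convention of `xvaddwev.d.w xd, xj, xk`
// (spec entry lasx_xvaddwev_d_w: data-types = V4DI, V8SI, V8SI).
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct v8i32(pub [i32; 8]); // placeholder narrow source vector
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct v4i64(pub [i64; 4]); // placeholder widened destination vector

// Even-indexed 32-bit lanes are sign-extended to 64 bits and added;
// the wod variants would use the odd-indexed lanes instead.
pub fn lasx_xvaddwev_d_w_model(xj: v8i32, xk: v8i32) -> v4i64 {
    let mut out = [0i64; 4];
    for i in 0..4 {
        out[i] = xj.0[2 * i] as i64 + xk.0[2 * i] as i64;
    }
    v4i64(out)
}

fn main() {
    let xj = v8i32([1, 9, 2, 9, 3, 9, 4, 9]);
    let xk = v8i32([10, -9, 20, -9, 30, -9, 40, -9]);
    assert_eq!(lasx_xvaddwev_d_w_model(xj, xk), v4i64([11, 22, 33, 44]));
}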
+ +/// lasx_xvaddwod_h_b +name = lasx_xvaddwod_h_b +asm-fmts = xd, xj, xk +data-types = V16HI, V32QI, V32QI + +/// lasx_xvaddwod_q_du +name = lasx_xvaddwod_q_du +asm-fmts = xd, xj, xk +data-types = V4DI, UV4DI, UV4DI + +/// lasx_xvaddwod_d_wu +name = lasx_xvaddwod_d_wu +asm-fmts = xd, xj, xk +data-types = V4DI, UV8SI, UV8SI + +/// lasx_xvaddwod_w_hu +name = lasx_xvaddwod_w_hu +asm-fmts = xd, xj, xk +data-types = V8SI, UV16HI, UV16HI + +/// lasx_xvaddwod_h_bu +name = lasx_xvaddwod_h_bu +asm-fmts = xd, xj, xk +data-types = V16HI, UV32QI, UV32QI + +/// lasx_xvsubwod_q_d +name = lasx_xvsubwod_q_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvsubwod_d_w +name = lasx_xvsubwod_d_w +asm-fmts = xd, xj, xk +data-types = V4DI, V8SI, V8SI + +/// lasx_xvsubwod_w_h +name = lasx_xvsubwod_w_h +asm-fmts = xd, xj, xk +data-types = V8SI, V16HI, V16HI + +/// lasx_xvsubwod_h_b +name = lasx_xvsubwod_h_b +asm-fmts = xd, xj, xk +data-types = V16HI, V32QI, V32QI + +/// lasx_xvsubwod_q_du +name = lasx_xvsubwod_q_du +asm-fmts = xd, xj, xk +data-types = V4DI, UV4DI, UV4DI + +/// lasx_xvsubwod_d_wu +name = lasx_xvsubwod_d_wu +asm-fmts = xd, xj, xk +data-types = V4DI, UV8SI, UV8SI + +/// lasx_xvsubwod_w_hu +name = lasx_xvsubwod_w_hu +asm-fmts = xd, xj, xk +data-types = V8SI, UV16HI, UV16HI + +/// lasx_xvsubwod_h_bu +name = lasx_xvsubwod_h_bu +asm-fmts = xd, xj, xk +data-types = V16HI, UV32QI, UV32QI + +/// lasx_xvmulwod_q_d +name = lasx_xvmulwod_q_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvmulwod_d_w +name = lasx_xvmulwod_d_w +asm-fmts = xd, xj, xk +data-types = V4DI, V8SI, V8SI + +/// lasx_xvmulwod_w_h +name = lasx_xvmulwod_w_h +asm-fmts = xd, xj, xk +data-types = V8SI, V16HI, V16HI + +/// lasx_xvmulwod_h_b +name = lasx_xvmulwod_h_b +asm-fmts = xd, xj, xk +data-types = V16HI, V32QI, V32QI + +/// lasx_xvmulwod_q_du +name = lasx_xvmulwod_q_du +asm-fmts = xd, xj, xk +data-types = V4DI, UV4DI, UV4DI + +/// lasx_xvmulwod_d_wu +name = lasx_xvmulwod_d_wu +asm-fmts = xd, xj, xk +data-types = V4DI, UV8SI, UV8SI + +/// lasx_xvmulwod_w_hu +name = lasx_xvmulwod_w_hu +asm-fmts = xd, xj, xk +data-types = V8SI, UV16HI, UV16HI + +/// lasx_xvmulwod_h_bu +name = lasx_xvmulwod_h_bu +asm-fmts = xd, xj, xk +data-types = V16HI, UV32QI, UV32QI + +/// lasx_xvaddwev_d_wu_w +name = lasx_xvaddwev_d_wu_w +asm-fmts = xd, xj, xk +data-types = V4DI, UV8SI, V8SI + +/// lasx_xvaddwev_w_hu_h +name = lasx_xvaddwev_w_hu_h +asm-fmts = xd, xj, xk +data-types = V8SI, UV16HI, V16HI + +/// lasx_xvaddwev_h_bu_b +name = lasx_xvaddwev_h_bu_b +asm-fmts = xd, xj, xk +data-types = V16HI, UV32QI, V32QI + +/// lasx_xvmulwev_d_wu_w +name = lasx_xvmulwev_d_wu_w +asm-fmts = xd, xj, xk +data-types = V4DI, UV8SI, V8SI + +/// lasx_xvmulwev_w_hu_h +name = lasx_xvmulwev_w_hu_h +asm-fmts = xd, xj, xk +data-types = V8SI, UV16HI, V16HI + +/// lasx_xvmulwev_h_bu_b +name = lasx_xvmulwev_h_bu_b +asm-fmts = xd, xj, xk +data-types = V16HI, UV32QI, V32QI + +/// lasx_xvaddwod_d_wu_w +name = lasx_xvaddwod_d_wu_w +asm-fmts = xd, xj, xk +data-types = V4DI, UV8SI, V8SI + +/// lasx_xvaddwod_w_hu_h +name = lasx_xvaddwod_w_hu_h +asm-fmts = xd, xj, xk +data-types = V8SI, UV16HI, V16HI + +/// lasx_xvaddwod_h_bu_b +name = lasx_xvaddwod_h_bu_b +asm-fmts = xd, xj, xk +data-types = V16HI, UV32QI, V32QI + +/// lasx_xvmulwod_d_wu_w +name = lasx_xvmulwod_d_wu_w +asm-fmts = xd, xj, xk +data-types = V4DI, UV8SI, V8SI + +/// lasx_xvmulwod_w_hu_h +name = lasx_xvmulwod_w_hu_h +asm-fmts = xd, xj, xk +data-types = V8SI, UV16HI, V16HI + +/// 
lasx_xvmulwod_h_bu_b +name = lasx_xvmulwod_h_bu_b +asm-fmts = xd, xj, xk +data-types = V16HI, UV32QI, V32QI + +/// lasx_xvhaddw_q_d +name = lasx_xvhaddw_q_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvhaddw_qu_du +name = lasx_xvhaddw_qu_du +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI + +/// lasx_xvhsubw_q_d +name = lasx_xvhsubw_q_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvhsubw_qu_du +name = lasx_xvhsubw_qu_du +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI + +/// lasx_xvmaddwev_q_d +name = lasx_xvmaddwev_q_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI, V4DI + +/// lasx_xvmaddwev_d_w +name = lasx_xvmaddwev_d_w +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V8SI, V8SI + +/// lasx_xvmaddwev_w_h +name = lasx_xvmaddwev_w_h +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V16HI, V16HI + +/// lasx_xvmaddwev_h_b +name = lasx_xvmaddwev_h_b +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V32QI, V32QI + +/// lasx_xvmaddwev_q_du +name = lasx_xvmaddwev_q_du +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI, UV4DI + +/// lasx_xvmaddwev_d_wu +name = lasx_xvmaddwev_d_wu +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV8SI, UV8SI + +/// lasx_xvmaddwev_w_hu +name = lasx_xvmaddwev_w_hu +asm-fmts = xd, xj, xk +data-types = UV8SI, UV8SI, UV16HI, UV16HI + +/// lasx_xvmaddwev_h_bu +name = lasx_xvmaddwev_h_bu +asm-fmts = xd, xj, xk +data-types = UV16HI, UV16HI, UV32QI, UV32QI + +/// lasx_xvmaddwod_q_d +name = lasx_xvmaddwod_q_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI, V4DI + +/// lasx_xvmaddwod_d_w +name = lasx_xvmaddwod_d_w +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V8SI, V8SI + +/// lasx_xvmaddwod_w_h +name = lasx_xvmaddwod_w_h +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V16HI, V16HI + +/// lasx_xvmaddwod_h_b +name = lasx_xvmaddwod_h_b +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, V32QI, V32QI + +/// lasx_xvmaddwod_q_du +name = lasx_xvmaddwod_q_du +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV4DI, UV4DI + +/// lasx_xvmaddwod_d_wu +name = lasx_xvmaddwod_d_wu +asm-fmts = xd, xj, xk +data-types = UV4DI, UV4DI, UV8SI, UV8SI + +/// lasx_xvmaddwod_w_hu +name = lasx_xvmaddwod_w_hu +asm-fmts = xd, xj, xk +data-types = UV8SI, UV8SI, UV16HI, UV16HI + +/// lasx_xvmaddwod_h_bu +name = lasx_xvmaddwod_h_bu +asm-fmts = xd, xj, xk +data-types = UV16HI, UV16HI, UV32QI, UV32QI + +/// lasx_xvmaddwev_q_du_d +name = lasx_xvmaddwev_q_du_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, UV4DI, V4DI + +/// lasx_xvmaddwev_d_wu_w +name = lasx_xvmaddwev_d_wu_w +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, UV8SI, V8SI + +/// lasx_xvmaddwev_w_hu_h +name = lasx_xvmaddwev_w_hu_h +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, UV16HI, V16HI + +/// lasx_xvmaddwev_h_bu_b +name = lasx_xvmaddwev_h_bu_b +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, UV32QI, V32QI + +/// lasx_xvmaddwod_q_du_d +name = lasx_xvmaddwod_q_du_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, UV4DI, V4DI + +/// lasx_xvmaddwod_d_wu_w +name = lasx_xvmaddwod_d_wu_w +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, UV8SI, V8SI + +/// lasx_xvmaddwod_w_hu_h +name = lasx_xvmaddwod_w_hu_h +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, UV16HI, V16HI + +/// lasx_xvmaddwod_h_bu_b +name = lasx_xvmaddwod_h_bu_b +asm-fmts = xd, xj, xk +data-types = V16HI, V16HI, UV32QI, V32QI + +/// lasx_xvrotr_b +name = lasx_xvrotr_b +asm-fmts = xd, xj, xk +data-types = V32QI, V32QI, V32QI + +/// lasx_xvrotr_h +name = lasx_xvrotr_h +asm-fmts = xd, xj, xk 
+data-types = V16HI, V16HI, V16HI + +/// lasx_xvrotr_w +name = lasx_xvrotr_w +asm-fmts = xd, xj, xk +data-types = V8SI, V8SI, V8SI + +/// lasx_xvrotr_d +name = lasx_xvrotr_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvadd_q +name = lasx_xvadd_q +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvsub_q +name = lasx_xvsub_q +asm-fmts = xd, xj, xk +data-types = V4DI, V4DI, V4DI + +/// lasx_xvaddwev_q_du_d +name = lasx_xvaddwev_q_du_d +asm-fmts = xd, xj, xk +data-types = V4DI, UV4DI, V4DI + +/// lasx_xvaddwod_q_du_d +name = lasx_xvaddwod_q_du_d +asm-fmts = xd, xj, xk +data-types = V4DI, UV4DI, V4DI + +/// lasx_xvmulwev_q_du_d +name = lasx_xvmulwev_q_du_d +asm-fmts = xd, xj, xk +data-types = V4DI, UV4DI, V4DI + +/// lasx_xvmulwod_q_du_d +name = lasx_xvmulwod_q_du_d +asm-fmts = xd, xj, xk +data-types = V4DI, UV4DI, V4DI + +/// lasx_xvmskgez_b +name = lasx_xvmskgez_b +asm-fmts = xd, xj +data-types = V32QI, V32QI + +/// lasx_xvmsknz_b +name = lasx_xvmsknz_b +asm-fmts = xd, xj +data-types = V32QI, V32QI + +/// lasx_xvexth_h_b +name = lasx_xvexth_h_b +asm-fmts = xd, xj +data-types = V16HI, V32QI + +/// lasx_xvexth_w_h +name = lasx_xvexth_w_h +asm-fmts = xd, xj +data-types = V8SI, V16HI + +/// lasx_xvexth_d_w +name = lasx_xvexth_d_w +asm-fmts = xd, xj +data-types = V4DI, V8SI + +/// lasx_xvexth_q_d +name = lasx_xvexth_q_d +asm-fmts = xd, xj +data-types = V4DI, V4DI + +/// lasx_xvexth_hu_bu +name = lasx_xvexth_hu_bu +asm-fmts = xd, xj +data-types = UV16HI, UV32QI + +/// lasx_xvexth_wu_hu +name = lasx_xvexth_wu_hu +asm-fmts = xd, xj +data-types = UV8SI, UV16HI + +/// lasx_xvexth_du_wu +name = lasx_xvexth_du_wu +asm-fmts = xd, xj +data-types = UV4DI, UV8SI + +/// lasx_xvexth_qu_du +name = lasx_xvexth_qu_du +asm-fmts = xd, xj +data-types = UV4DI, UV4DI + +/// lasx_xvrotri_b +name = lasx_xvrotri_b +asm-fmts = xd, xj, ui3 +data-types = V32QI, V32QI, UQI + +/// lasx_xvrotri_h +name = lasx_xvrotri_h +asm-fmts = xd, xj, ui4 +data-types = V16HI, V16HI, UQI + +/// lasx_xvrotri_w +name = lasx_xvrotri_w +asm-fmts = xd, xj, ui5 +data-types = V8SI, V8SI, UQI + +/// lasx_xvrotri_d +name = lasx_xvrotri_d +asm-fmts = xd, xj, ui6 +data-types = V4DI, V4DI, UQI + +/// lasx_xvextl_q_d +name = lasx_xvextl_q_d +asm-fmts = xd, xj +data-types = V4DI, V4DI + +/// lasx_xvsrlni_b_h +name = lasx_xvsrlni_b_h +asm-fmts = xd, xj, ui4 +data-types = V32QI, V32QI, V32QI, USI + +/// lasx_xvsrlni_h_w +name = lasx_xvsrlni_h_w +asm-fmts = xd, xj, ui5 +data-types = V16HI, V16HI, V16HI, USI + +/// lasx_xvsrlni_w_d +name = lasx_xvsrlni_w_d +asm-fmts = xd, xj, ui6 +data-types = V8SI, V8SI, V8SI, USI + +/// lasx_xvsrlni_d_q +name = lasx_xvsrlni_d_q +asm-fmts = xd, xj, ui7 +data-types = V4DI, V4DI, V4DI, USI + +/// lasx_xvsrlrni_b_h +name = lasx_xvsrlrni_b_h +asm-fmts = xd, xj, ui4 +data-types = V32QI, V32QI, V32QI, USI + +/// lasx_xvsrlrni_h_w +name = lasx_xvsrlrni_h_w +asm-fmts = xd, xj, ui5 +data-types = V16HI, V16HI, V16HI, USI + +/// lasx_xvsrlrni_w_d +name = lasx_xvsrlrni_w_d +asm-fmts = xd, xj, ui6 +data-types = V8SI, V8SI, V8SI, USI + +/// lasx_xvsrlrni_d_q +name = lasx_xvsrlrni_d_q +asm-fmts = xd, xj, ui7 +data-types = V4DI, V4DI, V4DI, USI + +/// lasx_xvssrlni_b_h +name = lasx_xvssrlni_b_h +asm-fmts = xd, xj, ui4 +data-types = V32QI, V32QI, V32QI, USI + +/// lasx_xvssrlni_h_w +name = lasx_xvssrlni_h_w +asm-fmts = xd, xj, ui5 +data-types = V16HI, V16HI, V16HI, USI + +/// lasx_xvssrlni_w_d +name = lasx_xvssrlni_w_d +asm-fmts = xd, xj, ui6 +data-types = V8SI, V8SI, V8SI, USI + +/// lasx_xvssrlni_d_q 
+name = lasx_xvssrlni_d_q +asm-fmts = xd, xj, ui7 +data-types = V4DI, V4DI, V4DI, USI + +/// lasx_xvssrlni_bu_h +name = lasx_xvssrlni_bu_h +asm-fmts = xd, xj, ui4 +data-types = UV32QI, UV32QI, V32QI, USI + +/// lasx_xvssrlni_hu_w +name = lasx_xvssrlni_hu_w +asm-fmts = xd, xj, ui5 +data-types = UV16HI, UV16HI, V16HI, USI + +/// lasx_xvssrlni_wu_d +name = lasx_xvssrlni_wu_d +asm-fmts = xd, xj, ui6 +data-types = UV8SI, UV8SI, V8SI, USI + +/// lasx_xvssrlni_du_q +name = lasx_xvssrlni_du_q +asm-fmts = xd, xj, ui7 +data-types = UV4DI, UV4DI, V4DI, USI + +/// lasx_xvssrlrni_b_h +name = lasx_xvssrlrni_b_h +asm-fmts = xd, xj, ui4 +data-types = V32QI, V32QI, V32QI, USI + +/// lasx_xvssrlrni_h_w +name = lasx_xvssrlrni_h_w +asm-fmts = xd, xj, ui5 +data-types = V16HI, V16HI, V16HI, USI + +/// lasx_xvssrlrni_w_d +name = lasx_xvssrlrni_w_d +asm-fmts = xd, xj, ui6 +data-types = V8SI, V8SI, V8SI, USI + +/// lasx_xvssrlrni_d_q +name = lasx_xvssrlrni_d_q +asm-fmts = xd, xj, ui7 +data-types = V4DI, V4DI, V4DI, USI + +/// lasx_xvssrlrni_bu_h +name = lasx_xvssrlrni_bu_h +asm-fmts = xd, xj, ui4 +data-types = UV32QI, UV32QI, V32QI, USI + +/// lasx_xvssrlrni_hu_w +name = lasx_xvssrlrni_hu_w +asm-fmts = xd, xj, ui5 +data-types = UV16HI, UV16HI, V16HI, USI + +/// lasx_xvssrlrni_wu_d +name = lasx_xvssrlrni_wu_d +asm-fmts = xd, xj, ui6 +data-types = UV8SI, UV8SI, V8SI, USI + +/// lasx_xvssrlrni_du_q +name = lasx_xvssrlrni_du_q +asm-fmts = xd, xj, ui7 +data-types = UV4DI, UV4DI, V4DI, USI + +/// lasx_xvsrani_b_h +name = lasx_xvsrani_b_h +asm-fmts = xd, xj, ui4 +data-types = V32QI, V32QI, V32QI, USI + +/// lasx_xvsrani_h_w +name = lasx_xvsrani_h_w +asm-fmts = xd, xj, ui5 +data-types = V16HI, V16HI, V16HI, USI + +/// lasx_xvsrani_w_d +name = lasx_xvsrani_w_d +asm-fmts = xd, xj, ui6 +data-types = V8SI, V8SI, V8SI, USI + +/// lasx_xvsrani_d_q +name = lasx_xvsrani_d_q +asm-fmts = xd, xj, ui7 +data-types = V4DI, V4DI, V4DI, USI + +/// lasx_xvsrarni_b_h +name = lasx_xvsrarni_b_h +asm-fmts = xd, xj, ui4 +data-types = V32QI, V32QI, V32QI, USI + +/// lasx_xvsrarni_h_w +name = lasx_xvsrarni_h_w +asm-fmts = xd, xj, ui5 +data-types = V16HI, V16HI, V16HI, USI + +/// lasx_xvsrarni_w_d +name = lasx_xvsrarni_w_d +asm-fmts = xd, xj, ui6 +data-types = V8SI, V8SI, V8SI, USI + +/// lasx_xvsrarni_d_q +name = lasx_xvsrarni_d_q +asm-fmts = xd, xj, ui7 +data-types = V4DI, V4DI, V4DI, USI + +/// lasx_xvssrani_b_h +name = lasx_xvssrani_b_h +asm-fmts = xd, xj, ui4 +data-types = V32QI, V32QI, V32QI, USI + +/// lasx_xvssrani_h_w +name = lasx_xvssrani_h_w +asm-fmts = xd, xj, ui5 +data-types = V16HI, V16HI, V16HI, USI + +/// lasx_xvssrani_w_d +name = lasx_xvssrani_w_d +asm-fmts = xd, xj, ui6 +data-types = V8SI, V8SI, V8SI, USI + +/// lasx_xvssrani_d_q +name = lasx_xvssrani_d_q +asm-fmts = xd, xj, ui7 +data-types = V4DI, V4DI, V4DI, USI + +/// lasx_xvssrani_bu_h +name = lasx_xvssrani_bu_h +asm-fmts = xd, xj, ui4 +data-types = UV32QI, UV32QI, V32QI, USI + +/// lasx_xvssrani_hu_w +name = lasx_xvssrani_hu_w +asm-fmts = xd, xj, ui5 +data-types = UV16HI, UV16HI, V16HI, USI + +/// lasx_xvssrani_wu_d +name = lasx_xvssrani_wu_d +asm-fmts = xd, xj, ui6 +data-types = UV8SI, UV8SI, V8SI, USI + +/// lasx_xvssrani_du_q +name = lasx_xvssrani_du_q +asm-fmts = xd, xj, ui7 +data-types = UV4DI, UV4DI, V4DI, USI + +/// lasx_xvssrarni_b_h +name = lasx_xvssrarni_b_h +asm-fmts = xd, xj, ui4 +data-types = V32QI, V32QI, V32QI, USI + +/// lasx_xvssrarni_h_w +name = lasx_xvssrarni_h_w +asm-fmts = xd, xj, ui5 +data-types = V16HI, V16HI, V16HI, USI + +/// lasx_xvssrarni_w_d 
+name = lasx_xvssrarni_w_d +asm-fmts = xd, xj, ui6 +data-types = V8SI, V8SI, V8SI, USI + +/// lasx_xvssrarni_d_q +name = lasx_xvssrarni_d_q +asm-fmts = xd, xj, ui7 +data-types = V4DI, V4DI, V4DI, USI + +/// lasx_xvssrarni_bu_h +name = lasx_xvssrarni_bu_h +asm-fmts = xd, xj, ui4 +data-types = UV32QI, UV32QI, V32QI, USI + +/// lasx_xvssrarni_hu_w +name = lasx_xvssrarni_hu_w +asm-fmts = xd, xj, ui5 +data-types = UV16HI, UV16HI, V16HI, USI + +/// lasx_xvssrarni_wu_d +name = lasx_xvssrarni_wu_d +asm-fmts = xd, xj, ui6 +data-types = UV8SI, UV8SI, V8SI, USI + +/// lasx_xvssrarni_du_q +name = lasx_xvssrarni_du_q +asm-fmts = xd, xj, ui7 +data-types = UV4DI, UV4DI, V4DI, USI + +/// lasx_xbnz_b +name = lasx_xbnz_b +asm-fmts = cd, xj +data-types = SI, UV32QI + +/// lasx_xbnz_d +name = lasx_xbnz_d +asm-fmts = cd, xj +data-types = SI, UV4DI + +/// lasx_xbnz_h +name = lasx_xbnz_h +asm-fmts = cd, xj +data-types = SI, UV16HI + +/// lasx_xbnz_v +name = lasx_xbnz_v +asm-fmts = cd, xj +data-types = SI, UV32QI + +/// lasx_xbnz_w +name = lasx_xbnz_w +asm-fmts = cd, xj +data-types = SI, UV8SI + +/// lasx_xbz_b +name = lasx_xbz_b +asm-fmts = cd, xj +data-types = SI, UV32QI + +/// lasx_xbz_d +name = lasx_xbz_d +asm-fmts = cd, xj +data-types = SI, UV4DI + +/// lasx_xbz_h +name = lasx_xbz_h +asm-fmts = cd, xj +data-types = SI, UV16HI + +/// lasx_xbz_v +name = lasx_xbz_v +asm-fmts = cd, xj +data-types = SI, UV32QI + +/// lasx_xbz_w +name = lasx_xbz_w +asm-fmts = cd, xj +data-types = SI, UV8SI + +/// lasx_xvfcmp_caf_d +name = lasx_xvfcmp_caf_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_caf_s +name = lasx_xvfcmp_caf_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_ceq_d +name = lasx_xvfcmp_ceq_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_ceq_s +name = lasx_xvfcmp_ceq_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_cle_d +name = lasx_xvfcmp_cle_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_cle_s +name = lasx_xvfcmp_cle_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_clt_d +name = lasx_xvfcmp_clt_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_clt_s +name = lasx_xvfcmp_clt_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_cne_d +name = lasx_xvfcmp_cne_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_cne_s +name = lasx_xvfcmp_cne_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_cor_d +name = lasx_xvfcmp_cor_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_cor_s +name = lasx_xvfcmp_cor_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_cueq_d +name = lasx_xvfcmp_cueq_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_cueq_s +name = lasx_xvfcmp_cueq_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_cule_d +name = lasx_xvfcmp_cule_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_cule_s +name = lasx_xvfcmp_cule_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_cult_d +name = lasx_xvfcmp_cult_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_cult_s +name = lasx_xvfcmp_cult_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_cun_d +name = lasx_xvfcmp_cun_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_cune_d +name = lasx_xvfcmp_cune_d 
+asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_cune_s +name = lasx_xvfcmp_cune_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_cun_s +name = lasx_xvfcmp_cun_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_saf_d +name = lasx_xvfcmp_saf_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_saf_s +name = lasx_xvfcmp_saf_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_seq_d +name = lasx_xvfcmp_seq_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_seq_s +name = lasx_xvfcmp_seq_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_sle_d +name = lasx_xvfcmp_sle_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_sle_s +name = lasx_xvfcmp_sle_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_slt_d +name = lasx_xvfcmp_slt_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_slt_s +name = lasx_xvfcmp_slt_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_sne_d +name = lasx_xvfcmp_sne_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_sne_s +name = lasx_xvfcmp_sne_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_sor_d +name = lasx_xvfcmp_sor_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_sor_s +name = lasx_xvfcmp_sor_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_sueq_d +name = lasx_xvfcmp_sueq_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_sueq_s +name = lasx_xvfcmp_sueq_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_sule_d +name = lasx_xvfcmp_sule_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_sule_s +name = lasx_xvfcmp_sule_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_sult_d +name = lasx_xvfcmp_sult_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_sult_s +name = lasx_xvfcmp_sult_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_sun_d +name = lasx_xvfcmp_sun_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_sune_d +name = lasx_xvfcmp_sune_d +asm-fmts = xd, xj, xk +data-types = V4DI, V4DF, V4DF + +/// lasx_xvfcmp_sune_s +name = lasx_xvfcmp_sune_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvfcmp_sun_s +name = lasx_xvfcmp_sun_s +asm-fmts = xd, xj, xk +data-types = V8SI, V8SF, V8SF + +/// lasx_xvpickve_d_f +name = lasx_xvpickve_d_f +asm-fmts = xd, xj, ui2 +data-types = V4DF, V4DF, UQI + +/// lasx_xvpickve_w_f +name = lasx_xvpickve_w_f +asm-fmts = xd, xj, ui3 +data-types = V8SF, V8SF, UQI + +/// lasx_xvrepli_b +name = lasx_xvrepli_b +asm-fmts = xd, si10 +data-types = V32QI, HI + +/// lasx_xvrepli_d +name = lasx_xvrepli_d +asm-fmts = xd, si10 +data-types = V4DI, HI + +/// lasx_xvrepli_h +name = lasx_xvrepli_h +asm-fmts = xd, si10 +data-types = V16HI, HI + +/// lasx_xvrepli_w +name = lasx_xvrepli_w +asm-fmts = xd, si10 +data-types = V8SI, HI + diff --git a/library/stdarch/crates/stdarch-gen-loongarch/lasxintrin.h b/library/stdarch/crates/stdarch-gen-loongarch/lasxintrin.h new file mode 100644 index 0000000000000..c525b6106b897 --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-loongarch/lasxintrin.h @@ -0,0 +1,5376 @@ +/* + * 
https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/lasxintrin.h;hb=61f1001f2f4ab9128e5eb6e9a4adbbb0f9f0bc75 + */ + +/* LARCH Loongson ASX intrinsics include file. + + Copyright (C) 2018-2024 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _GCC_LOONGSON_ASXINTRIN_H +#define _GCC_LOONGSON_ASXINTRIN_H 1 + +#if defined(__loongarch_asx) + +typedef signed char v32i8 __attribute__ ((vector_size(32), aligned(32))); +typedef signed char v32i8_b __attribute__ ((vector_size(32), aligned(1))); +typedef unsigned char v32u8 __attribute__ ((vector_size(32), aligned(32))); +typedef unsigned char v32u8_b __attribute__ ((vector_size(32), aligned(1))); +typedef short v16i16 __attribute__ ((vector_size(32), aligned(32))); +typedef short v16i16_h __attribute__ ((vector_size(32), aligned(2))); +typedef unsigned short v16u16 __attribute__ ((vector_size(32), aligned(32))); +typedef unsigned short v16u16_h __attribute__ ((vector_size(32), aligned(2))); +typedef int v8i32 __attribute__ ((vector_size(32), aligned(32))); +typedef int v8i32_w __attribute__ ((vector_size(32), aligned(4))); +typedef unsigned int v8u32 __attribute__ ((vector_size(32), aligned(32))); +typedef unsigned int v8u32_w __attribute__ ((vector_size(32), aligned(4))); +typedef long long v4i64 __attribute__ ((vector_size(32), aligned(32))); +typedef long long v4i64_d __attribute__ ((vector_size(32), aligned(8))); +typedef unsigned long long v4u64 __attribute__ ((vector_size(32), aligned(32))); +typedef unsigned long long v4u64_d __attribute__ ((vector_size(32), aligned(8))); +typedef float v8f32 __attribute__ ((vector_size(32), aligned(32))); +typedef float v8f32_w __attribute__ ((vector_size(32), aligned(4))); +typedef double v4f64 __attribute__ ((vector_size(32), aligned(32))); +typedef double v4f64_d __attribute__ ((vector_size(32), aligned(8))); +typedef float __m256 __attribute__ ((__vector_size__ (32), + __may_alias__)); +typedef long long __m256i __attribute__ ((__vector_size__ (32), + __may_alias__)); +typedef double __m256d __attribute__ ((__vector_size__ (32), + __may_alias__)); + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsll_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsll_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI.
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsll_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsll_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsll_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsll_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsll_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsll_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: V32QI, V32QI, UQI. */ +#define __lasx_xvslli_b(/*__m256i*/ _1, /*ui3*/ _2) \ + ((__m256i)__builtin_lasx_xvslli_b ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V16HI, V16HI, UQI. */ +#define __lasx_xvslli_h(/*__m256i*/ _1, /*ui4*/ _2) \ + ((__m256i)__builtin_lasx_xvslli_h ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V8SI, V8SI, UQI. */ +#define __lasx_xvslli_w(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvslli_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: V4DI, V4DI, UQI. */ +#define __lasx_xvslli_d(/*__m256i*/ _1, /*ui6*/ _2) \ + ((__m256i)__builtin_lasx_xvslli_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsra_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsra_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsra_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsra_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsra_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsra_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsra_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsra_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: V32QI, V32QI, UQI. */ +#define __lasx_xvsrai_b(/*__m256i*/ _1, /*ui3*/ _2) \ + ((__m256i)__builtin_lasx_xvsrai_b ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V16HI, V16HI, UQI. */ +#define __lasx_xvsrai_h(/*__m256i*/ _1, /*ui4*/ _2) \ + ((__m256i)__builtin_lasx_xvsrai_h ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. 
*/ +/* Data types in instruction templates: V8SI, V8SI, UQI. */ +#define __lasx_xvsrai_w(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvsrai_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: V4DI, V4DI, UQI. */ +#define __lasx_xvsrai_d(/*__m256i*/ _1, /*ui6*/ _2) \ + ((__m256i)__builtin_lasx_xvsrai_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrar_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrar_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrar_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrar_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrar_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrar_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrar_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrar_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: V32QI, V32QI, UQI. */ +#define __lasx_xvsrari_b(/*__m256i*/ _1, /*ui3*/ _2) \ + ((__m256i)__builtin_lasx_xvsrari_b ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V16HI, V16HI, UQI. */ +#define __lasx_xvsrari_h(/*__m256i*/ _1, /*ui4*/ _2) \ + ((__m256i)__builtin_lasx_xvsrari_h ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V8SI, V8SI, UQI. */ +#define __lasx_xvsrari_w(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvsrari_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: V4DI, V4DI, UQI. */ +#define __lasx_xvsrari_d(/*__m256i*/ _1, /*ui6*/ _2) \ + ((__m256i)__builtin_lasx_xvsrari_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrl_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrl_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrl_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrl_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrl_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrl_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrl_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrl_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: V32QI, V32QI, UQI. */ +#define __lasx_xvsrli_b(/*__m256i*/ _1, /*ui3*/ _2) \ + ((__m256i)__builtin_lasx_xvsrli_b ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V16HI, V16HI, UQI. */ +#define __lasx_xvsrli_h(/*__m256i*/ _1, /*ui4*/ _2) \ + ((__m256i)__builtin_lasx_xvsrli_h ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V8SI, V8SI, UQI. */ +#define __lasx_xvsrli_w(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvsrli_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: V4DI, V4DI, UQI. */ +#define __lasx_xvsrli_d(/*__m256i*/ _1, /*ui6*/ _2) \ + ((__m256i)__builtin_lasx_xvsrli_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrlr_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrlr_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrlr_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrlr_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrlr_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrlr_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrlr_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrlr_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: V32QI, V32QI, UQI. */ +#define __lasx_xvsrlri_b(/*__m256i*/ _1, /*ui3*/ _2) \ + ((__m256i)__builtin_lasx_xvsrlri_b ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V16HI, V16HI, UQI. */ +#define __lasx_xvsrlri_h(/*__m256i*/ _1, /*ui4*/ _2) \ + ((__m256i)__builtin_lasx_xvsrlri_h ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V8SI, V8SI, UQI. */ +#define __lasx_xvsrlri_w(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvsrlri_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: V4DI, V4DI, UQI. 
*/ +#define __lasx_xvsrlri_d(/*__m256i*/ _1, /*ui6*/ _2) \ + ((__m256i)__builtin_lasx_xvsrlri_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvbitclr_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvbitclr_b ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvbitclr_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvbitclr_h ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvbitclr_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvbitclr_w ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvbitclr_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvbitclr_d ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: UV32QI, UV32QI, UQI. */ +#define __lasx_xvbitclri_b(/*__m256i*/ _1, /*ui3*/ _2) \ + ((__m256i)__builtin_lasx_xvbitclri_b ((v32u8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: UV16HI, UV16HI, UQI. */ +#define __lasx_xvbitclri_h(/*__m256i*/ _1, /*ui4*/ _2) \ + ((__m256i)__builtin_lasx_xvbitclri_h ((v16u16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV8SI, UV8SI, UQI. */ +#define __lasx_xvbitclri_w(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvbitclri_w ((v8u32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: UV4DI, UV4DI, UQI. */ +#define __lasx_xvbitclri_d(/*__m256i*/ _1, /*ui6*/ _2) \ + ((__m256i)__builtin_lasx_xvbitclri_d ((v4u64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvbitset_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvbitset_b ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvbitset_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvbitset_h ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvbitset_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvbitset_w ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvbitset_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvbitset_d ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: UV32QI, UV32QI, UQI. */ +#define __lasx_xvbitseti_b(/*__m256i*/ _1, /*ui3*/ _2) \ + ((__m256i)__builtin_lasx_xvbitseti_b ((v32u8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: UV16HI, UV16HI, UQI. */ +#define __lasx_xvbitseti_h(/*__m256i*/ _1, /*ui4*/ _2) \ + ((__m256i)__builtin_lasx_xvbitseti_h ((v16u16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV8SI, UV8SI, UQI. */ +#define __lasx_xvbitseti_w(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvbitseti_w ((v8u32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: UV4DI, UV4DI, UQI. */ +#define __lasx_xvbitseti_d(/*__m256i*/ _1, /*ui6*/ _2) \ + ((__m256i)__builtin_lasx_xvbitseti_d ((v4u64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvbitrev_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvbitrev_b ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvbitrev_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvbitrev_h ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvbitrev_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvbitrev_w ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvbitrev_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvbitrev_d ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: UV32QI, UV32QI, UQI. */ +#define __lasx_xvbitrevi_b(/*__m256i*/ _1, /*ui3*/ _2) \ + ((__m256i)__builtin_lasx_xvbitrevi_b ((v32u8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: UV16HI, UV16HI, UQI. */ +#define __lasx_xvbitrevi_h(/*__m256i*/ _1, /*ui4*/ _2) \ + ((__m256i)__builtin_lasx_xvbitrevi_h ((v16u16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV8SI, UV8SI, UQI. */ +#define __lasx_xvbitrevi_w(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvbitrevi_w ((v8u32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: UV4DI, UV4DI, UQI. */ +#define __lasx_xvbitrevi_d(/*__m256i*/ _1, /*ui6*/ _2) \ + ((__m256i)__builtin_lasx_xvbitrevi_d ((v4u64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvadd_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvadd_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvadd_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvadd_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvadd_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvadd_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvadd_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvadd_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V32QI, V32QI, UQI. */ +#define __lasx_xvaddi_bu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvaddi_bu ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V16HI, V16HI, UQI. */ +#define __lasx_xvaddi_hu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvaddi_hu ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V8SI, V8SI, UQI. */ +#define __lasx_xvaddi_wu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvaddi_wu ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V4DI, V4DI, UQI. */ +#define __lasx_xvaddi_du(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvaddi_du ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsub_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsub_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsub_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsub_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsub_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsub_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsub_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsub_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V32QI, V32QI, UQI. 
*/ +#define __lasx_xvsubi_bu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvsubi_bu ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V16HI, V16HI, UQI. */ +#define __lasx_xvsubi_hu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvsubi_hu ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V8SI, V8SI, UQI. */ +#define __lasx_xvsubi_wu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvsubi_wu ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V4DI, V4DI, UQI. */ +#define __lasx_xvsubi_du(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvsubi_du ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmax_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmax_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmax_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmax_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmax_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmax_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmax_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmax_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V32QI, V32QI, QI. */ +#define __lasx_xvmaxi_b(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvmaxi_b ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V16HI, V16HI, QI. */ +#define __lasx_xvmaxi_h(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvmaxi_h ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V8SI, V8SI, QI. */ +#define __lasx_xvmaxi_w(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvmaxi_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V4DI, V4DI, QI. */ +#define __lasx_xvmaxi_d(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvmaxi_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmax_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmax_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV16HI, UV16HI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmax_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmax_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmax_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmax_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmax_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmax_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV32QI, UV32QI, UQI. */ +#define __lasx_xvmaxi_bu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvmaxi_bu ((v32u8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV16HI, UV16HI, UQI. */ +#define __lasx_xvmaxi_hu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvmaxi_hu ((v16u16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV8SI, UV8SI, UQI. */ +#define __lasx_xvmaxi_wu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvmaxi_wu ((v8u32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV4DI, UV4DI, UQI. */ +#define __lasx_xvmaxi_du(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvmaxi_du ((v4u64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmin_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmin_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmin_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmin_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmin_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmin_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmin_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmin_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V32QI, V32QI, QI. */ +#define __lasx_xvmini_b(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvmini_b ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V16HI, V16HI, QI. */ +#define __lasx_xvmini_h(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvmini_h ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, si5. 
*/ +/* Data types in instruction templates: V8SI, V8SI, QI. */ +#define __lasx_xvmini_w(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvmini_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V4DI, V4DI, QI. */ +#define __lasx_xvmini_d(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvmini_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmin_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmin_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmin_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmin_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmin_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmin_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmin_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmin_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV32QI, UV32QI, UQI. */ +#define __lasx_xvmini_bu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvmini_bu ((v32u8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV16HI, UV16HI, UQI. */ +#define __lasx_xvmini_hu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvmini_hu ((v16u16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV8SI, UV8SI, UQI. */ +#define __lasx_xvmini_wu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvmini_wu ((v8u32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV4DI, UV4DI, UQI. */ +#define __lasx_xvmini_du(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvmini_du ((v4u64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvseq_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvseq_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvseq_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvseq_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvseq_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvseq_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvseq_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvseq_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V32QI, V32QI, QI. */ +#define __lasx_xvseqi_b(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvseqi_b ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V16HI, V16HI, QI. */ +#define __lasx_xvseqi_h(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvseqi_h ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V8SI, V8SI, QI. */ +#define __lasx_xvseqi_w(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvseqi_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V4DI, V4DI, QI. */ +#define __lasx_xvseqi_d(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvseqi_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvslt_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvslt_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvslt_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvslt_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvslt_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvslt_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvslt_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvslt_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V32QI, V32QI, QI. */ +#define __lasx_xvslti_b(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvslti_b ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V16HI, V16HI, QI. */ +#define __lasx_xvslti_h(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvslti_h ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V8SI, V8SI, QI. */ +#define __lasx_xvslti_w(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvslti_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V4DI, V4DI, QI. 
*/ +#define __lasx_xvslti_d(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvslti_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvslt_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvslt_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvslt_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvslt_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvslt_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvslt_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvslt_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvslt_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V32QI, UV32QI, UQI. */ +#define __lasx_xvslti_bu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvslti_bu ((v32u8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V16HI, UV16HI, UQI. */ +#define __lasx_xvslti_hu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvslti_hu ((v16u16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V8SI, UV8SI, UQI. */ +#define __lasx_xvslti_wu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvslti_wu ((v8u32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V4DI, UV4DI, UQI. */ +#define __lasx_xvslti_du(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvslti_du ((v4u64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsle_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsle_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsle_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsle_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsle_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsle_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsle_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsle_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V32QI, V32QI, QI. */ +#define __lasx_xvslei_b(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvslei_b ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V16HI, V16HI, QI. */ +#define __lasx_xvslei_h(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvslei_h ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V8SI, V8SI, QI. */ +#define __lasx_xvslei_w(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvslei_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, si5. */ +/* Data types in instruction templates: V4DI, V4DI, QI. */ +#define __lasx_xvslei_d(/*__m256i*/ _1, /*si5*/ _2) \ + ((__m256i)__builtin_lasx_xvslei_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsle_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsle_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsle_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsle_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsle_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsle_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsle_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsle_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V32QI, UV32QI, UQI. */ +#define __lasx_xvslei_bu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvslei_bu ((v32u8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V16HI, UV16HI, UQI. */ +#define __lasx_xvslei_hu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvslei_hu ((v16u16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V8SI, UV8SI, UQI. */ +#define __lasx_xvslei_wu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvslei_wu ((v8u32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V4DI, UV4DI, UQI. */ +#define __lasx_xvslei_du(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvslei_du ((v4u64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: V32QI, V32QI, UQI. 
*/ +#define __lasx_xvsat_b(/*__m256i*/ _1, /*ui3*/ _2) \ + ((__m256i)__builtin_lasx_xvsat_b ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V16HI, V16HI, UQI. */ +#define __lasx_xvsat_h(/*__m256i*/ _1, /*ui4*/ _2) \ + ((__m256i)__builtin_lasx_xvsat_h ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V8SI, V8SI, UQI. */ +#define __lasx_xvsat_w(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvsat_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: V4DI, V4DI, UQI. */ +#define __lasx_xvsat_d(/*__m256i*/ _1, /*ui6*/ _2) \ + ((__m256i)__builtin_lasx_xvsat_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: UV32QI, UV32QI, UQI. */ +#define __lasx_xvsat_bu(/*__m256i*/ _1, /*ui3*/ _2) \ + ((__m256i)__builtin_lasx_xvsat_bu ((v32u8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: UV16HI, UV16HI, UQI. */ +#define __lasx_xvsat_hu(/*__m256i*/ _1, /*ui4*/ _2) \ + ((__m256i)__builtin_lasx_xvsat_hu ((v16u16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV8SI, UV8SI, UQI. */ +#define __lasx_xvsat_wu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvsat_wu ((v8u32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: UV4DI, UV4DI, UQI. */ +#define __lasx_xvsat_du(/*__m256i*/ _1, /*ui6*/ _2) \ + ((__m256i)__builtin_lasx_xvsat_du ((v4u64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvadda_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvadda_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvadda_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvadda_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvadda_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvadda_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvadda_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvadda_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsadd_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsadd_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsadd_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsadd_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsadd_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsadd_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsadd_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsadd_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsadd_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsadd_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsadd_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsadd_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsadd_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsadd_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsadd_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsadd_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavg_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavg_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavg_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavg_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavg_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavg_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavg_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavg_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavg_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavg_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavg_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavg_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavg_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavg_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavg_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavg_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavgr_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavgr_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavgr_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavgr_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavgr_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavgr_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavgr_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavgr_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavgr_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavgr_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavgr_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavgr_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavgr_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavgr_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI. 
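
   Editorial sketch, not part of the vendored header: both an averaging and
   a rounding-averaging family are declared in this block; the reading that
   xvavg_* truncates the per-lane average while xvavgr_* rounds it is an
   assumption based on the mnemonics, not something the header states. An
   illustrative helper:

     __m256i avg_u8 (__m256i x, __m256i y)
     {
       return __lasx_xvavgr_bu (x, y);  // per-byte unsigned average, rounding variant (assumed)
     }
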
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvavgr_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvavgr_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssub_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssub_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssub_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssub_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssub_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssub_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssub_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssub_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssub_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssub_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssub_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssub_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssub_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssub_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssub_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssub_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvabsd_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvabsd_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvabsd_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvabsd_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. 
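
   Editorial sketch, not part of the vendored header: two common pairings
   of the intrinsics above are unsigned saturating subtraction and
   absolute difference. Only the declarations shown are assumed; the
   "clamp at zero" and "absolute difference" readings follow the ssub and
   absd mnemonics. Illustrative helpers:

     __m256i sub_sat_u8 (__m256i x, __m256i y)
     {
       return __lasx_xvssub_bu (x, y);   // clamps to 0 instead of wrapping (assumed)
     }

     __m256i absdiff_i8 (__m256i x, __m256i y)
     {
       return __lasx_xvabsd_b (x, y);    // per-byte |x - y| on signed data (assumed)
     }
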
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvabsd_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvabsd_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvabsd_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvabsd_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvabsd_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvabsd_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvabsd_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvabsd_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvabsd_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvabsd_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvabsd_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvabsd_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmul_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmul_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmul_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmul_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmul_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmul_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmul_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmul_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmadd_b (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmadd_b ((v32i8)_1, (v32i8)_2, (v32i8)_3); +} + +/* Assembly instruction format: xd, xj, xk. 
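
   Editorial sketch, not part of the vendored header: the xvmadd_* comments
   list four operand types for a three-register format, i.e. xd is both a
   source and the destination, which suggests the first argument acts as
   the accumulator. Treat the exact formula (acc + a * b per lane) as an
   assumption. Illustrative helper:

     __m256i mac_i8 (__m256i acc, __m256i a, __m256i b)
     {
       return __lasx_xvmadd_b (acc, a, b);   // assumed: acc + a * b, lane-wise
     }
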
*/ +/* Data types in instruction templates: V16HI, V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmadd_h (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmadd_h ((v16i16)_1, (v16i16)_2, (v16i16)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmadd_w (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmadd_w ((v8i32)_1, (v8i32)_2, (v8i32)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmadd_d (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmadd_d ((v4i64)_1, (v4i64)_2, (v4i64)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmsub_b (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmsub_b ((v32i8)_1, (v32i8)_2, (v32i8)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmsub_h (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmsub_h ((v16i16)_1, (v16i16)_2, (v16i16)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmsub_w (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmsub_w ((v8i32)_1, (v8i32)_2, (v8i32)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmsub_d (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmsub_d ((v4i64)_1, (v4i64)_2, (v4i64)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvdiv_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvdiv_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvdiv_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvdiv_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvdiv_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvdiv_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvdiv_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvdiv_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvdiv_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvdiv_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvdiv_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvdiv_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvdiv_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvdiv_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvdiv_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvdiv_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhaddw_h_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhaddw_h_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhaddw_w_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhaddw_w_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhaddw_d_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhaddw_d_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhaddw_hu_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhaddw_hu_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhaddw_wu_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhaddw_wu_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhaddw_du_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhaddw_du_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. 
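
   Editorial sketch, not part of the vendored header: the xvhaddw_* forms
   widen their result (e.g. UV16HI from two UV32QI inputs, per the type
   templates above). Passing the same vector twice yields widened sums of
   adjacent element pairs, assuming the usual haddw behaviour of combining
   even-indexed elements of one source with odd-indexed elements of the
   other. Illustrative helper:

     __m256i pair_sums_u8 (__m256i v)
     {
       return __lasx_xvhaddw_hu_bu (v, v);   // 16-bit sums of adjacent unsigned bytes (assumed)
     }
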
*/ +/* Data types in instruction templates: V16HI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhsubw_h_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhsubw_h_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhsubw_w_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhsubw_w_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhsubw_d_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhsubw_d_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhsubw_hu_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhsubw_hu_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhsubw_wu_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhsubw_wu_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhsubw_du_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhsubw_du_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmod_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmod_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmod_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmod_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmod_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmod_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmod_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmod_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. 
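
   Editorial sketch, not part of the vendored header: xvmod_* is the
   lane-wise remainder companion to xvdiv_*; only the declarations above
   are assumed. Illustrative helper:

     __m256i rem_i32 (__m256i num, __m256i den)
     {
       return __lasx_xvmod_w (num, den);   // remainder of lane-wise signed division (assumed)
     }
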
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmod_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmod_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmod_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmod_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmod_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmod_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmod_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmod_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V32QI, V32QI, UQI. */ +#define __lasx_xvrepl128vei_b(/*__m256i*/ _1, /*ui4*/ _2) \ + ((__m256i)__builtin_lasx_xvrepl128vei_b ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: V16HI, V16HI, UQI. */ +#define __lasx_xvrepl128vei_h(/*__m256i*/ _1, /*ui3*/ _2) \ + ((__m256i)__builtin_lasx_xvrepl128vei_h ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui2. */ +/* Data types in instruction templates: V8SI, V8SI, UQI. */ +#define __lasx_xvrepl128vei_w(/*__m256i*/ _1, /*ui2*/ _2) \ + ((__m256i)__builtin_lasx_xvrepl128vei_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui1. */ +/* Data types in instruction templates: V4DI, V4DI, UQI. */ +#define __lasx_xvrepl128vei_d(/*__m256i*/ _1, /*ui1*/ _2) \ + ((__m256i)__builtin_lasx_xvrepl128vei_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpickev_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpickev_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpickev_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpickev_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpickev_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpickev_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpickev_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpickev_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. 
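
   Editorial sketch, not part of the vendored header: the xvrepl128vei_*
   index immediates only span one 128-bit half (ui4 for 32 bytes, ui2 for
   8 words), so the replication is per 128-bit lane; the element index must
   also be a compile-time constant because these are macros. Illustrative
   helper:

     __m256i splat_lane0_w (__m256i v)
     {
       return __lasx_xvrepl128vei_w (v, 0);  // broadcast element 0 within each 128-bit half (assumed)
     }
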
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpickod_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpickod_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpickod_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpickod_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpickod_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpickod_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpickod_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpickod_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvilvh_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvilvh_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvilvh_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvilvh_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvilvh_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvilvh_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvilvh_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvilvh_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvilvl_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvilvl_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvilvl_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvilvl_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvilvl_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvilvl_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. 
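
   Editorial sketch, not part of the vendored header: xvpickev_* and
   xvpickod_* gather the even- and odd-indexed elements of their two
   sources, and xvilvl_* and xvilvh_* interleave two sources; the exact
   ordering across the two 128-bit halves is not spelled out here and is
   left as an assumption. Illustrative helper:

     __m256i even_halfwords (__m256i a, __m256i b)
     {
       return __lasx_xvpickev_h (a, b);   // even-indexed 16-bit elements of both sources (assumed)
     }
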
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvilvl_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvilvl_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpackev_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpackev_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpackev_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpackev_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpackev_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpackev_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpackev_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpackev_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpackod_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpackod_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpackod_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpackod_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpackod_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpackod_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpackod_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvpackod_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk, xa. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvshuf_b (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvshuf_b ((v32i8)_1, (v32i8)_2, (v32i8)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI, V16HI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvshuf_h (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvshuf_h ((v16i16)_1, (v16i16)_2, (v16i16)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvshuf_w (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvshuf_w ((v8i32)_1, (v8i32)_2, (v8i32)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvshuf_d (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvshuf_d ((v4i64)_1, (v4i64)_2, (v4i64)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvand_v (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvand_v ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, ui8. */ +/* Data types in instruction templates: UV32QI, UV32QI, UQI. */ +#define __lasx_xvandi_b(/*__m256i*/ _1, /*ui8*/ _2) \ + ((__m256i)__builtin_lasx_xvandi_b ((v32u8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvor_v (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvor_v ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, ui8. */ +/* Data types in instruction templates: UV32QI, UV32QI, UQI. */ +#define __lasx_xvori_b(/*__m256i*/ _1, /*ui8*/ _2) \ + ((__m256i)__builtin_lasx_xvori_b ((v32u8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvnor_v (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvnor_v ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, ui8. */ +/* Data types in instruction templates: UV32QI, UV32QI, UQI. */ +#define __lasx_xvnori_b(/*__m256i*/ _1, /*ui8*/ _2) \ + ((__m256i)__builtin_lasx_xvnori_b ((v32u8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvxor_v (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvxor_v ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, ui8. */ +/* Data types in instruction templates: UV32QI, UV32QI, UQI. */ +#define __lasx_xvxori_b(/*__m256i*/ _1, /*ui8*/ _2) \ + ((__m256i)__builtin_lasx_xvxori_b ((v32u8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk, xa. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvbitsel_v (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvbitsel_v ((v32u8)_1, (v32u8)_2, (v32u8)_3); +} + +/* Assembly instruction format: xd, xj, ui8. 
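
   Editorial sketch, not part of the vendored header: the _v bitwise forms
   take byte-typed operands and are lane-width agnostic, so any integer
   layout can be combined with them. A select-by-mask written only with the
   operations declared above (nor of a value with itself is a bitwise NOT);
   the helper name is invented here:

     __m256i merge_by_mask (__m256i mask, __m256i a, __m256i b)
     {
       __m256i not_mask = __lasx_xvnor_v (mask, mask);        // ~mask
       return __lasx_xvor_v (__lasx_xvand_v (mask, a),
                             __lasx_xvand_v (not_mask, b));   // (mask & a) | (~mask & b)
     }
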
*/ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI, USI. */ +#define __lasx_xvbitseli_b(/*__m256i*/ _1, /*__m256i*/ _2, /*ui8*/ _3) \ + ((__m256i)__builtin_lasx_xvbitseli_b ((v32u8)(_1), (v32u8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui8. */ +/* Data types in instruction templates: V32QI, V32QI, USI. */ +#define __lasx_xvshuf4i_b(/*__m256i*/ _1, /*ui8*/ _2) \ + ((__m256i)__builtin_lasx_xvshuf4i_b ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui8. */ +/* Data types in instruction templates: V16HI, V16HI, USI. */ +#define __lasx_xvshuf4i_h(/*__m256i*/ _1, /*ui8*/ _2) \ + ((__m256i)__builtin_lasx_xvshuf4i_h ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui8. */ +/* Data types in instruction templates: V8SI, V8SI, USI. */ +#define __lasx_xvshuf4i_w(/*__m256i*/ _1, /*ui8*/ _2) \ + ((__m256i)__builtin_lasx_xvshuf4i_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, rj. */ +/* Data types in instruction templates: V32QI, SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvreplgr2vr_b (int _1) +{ + return (__m256i)__builtin_lasx_xvreplgr2vr_b ((int)_1); +} + +/* Assembly instruction format: xd, rj. */ +/* Data types in instruction templates: V16HI, SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvreplgr2vr_h (int _1) +{ + return (__m256i)__builtin_lasx_xvreplgr2vr_h ((int)_1); +} + +/* Assembly instruction format: xd, rj. */ +/* Data types in instruction templates: V8SI, SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvreplgr2vr_w (int _1) +{ + return (__m256i)__builtin_lasx_xvreplgr2vr_w ((int)_1); +} + +/* Assembly instruction format: xd, rj. */ +/* Data types in instruction templates: V4DI, DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvreplgr2vr_d (long int _1) +{ + return (__m256i)__builtin_lasx_xvreplgr2vr_d ((long int)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpcnt_b (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvpcnt_b ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpcnt_h (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvpcnt_h ((v16i16)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpcnt_w (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvpcnt_w ((v8i32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvpcnt_d (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvpcnt_d ((v4i64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvclo_b (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvclo_b ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj. 
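
   Editorial sketch, not part of the vendored header: xvreplgr2vr_*
   broadcasts a general-purpose register into every lane, and xvpcnt_* is a
   per-lane population count; only those declarations are assumed. An
   illustrative helper:

     __m256i splat_and_popcount (int x)
     {
       __m256i v = __lasx_xvreplgr2vr_w (x);   // broadcast x to all eight 32-bit lanes
       return __lasx_xvpcnt_w (v);             // popcount of each 32-bit lane
     }
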
*/ +/* Data types in instruction templates: V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvclo_h (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvclo_h ((v16i16)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvclo_w (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvclo_w ((v8i32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvclo_d (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvclo_d ((v4i64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvclz_b (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvclz_b ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvclz_h (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvclz_h ((v16i16)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvclz_w (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvclz_w ((v8i32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvclz_d (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvclz_d ((v4i64)_1); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SF, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfadd_s (__m256 _1, __m256 _2) +{ + return (__m256)__builtin_lasx_xvfadd_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DF, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfadd_d (__m256d _1, __m256d _2) +{ + return (__m256d)__builtin_lasx_xvfadd_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SF, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfsub_s (__m256 _1, __m256 _2) +{ + return (__m256)__builtin_lasx_xvfsub_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DF, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfsub_d (__m256d _1, __m256d _2) +{ + return (__m256d)__builtin_lasx_xvfsub_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SF, V8SF, V8SF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfmul_s (__m256 _1, __m256 _2) +{ + return (__m256)__builtin_lasx_xvfmul_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DF, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfmul_d (__m256d _1, __m256d _2) +{ + return (__m256d)__builtin_lasx_xvfmul_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SF, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfdiv_s (__m256 _1, __m256 _2) +{ + return (__m256)__builtin_lasx_xvfdiv_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DF, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfdiv_d (__m256d _1, __m256d _2) +{ + return (__m256d)__builtin_lasx_xvfdiv_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcvt_h_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcvt_h_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SF, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfcvt_s_d (__m256d _1, __m256d _2) +{ + return (__m256)__builtin_lasx_xvfcvt_s_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SF, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfmin_s (__m256 _1, __m256 _2) +{ + return (__m256)__builtin_lasx_xvfmin_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DF, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfmin_d (__m256d _1, __m256d _2) +{ + return (__m256d)__builtin_lasx_xvfmin_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SF, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfmina_s (__m256 _1, __m256 _2) +{ + return (__m256)__builtin_lasx_xvfmina_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DF, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfmina_d (__m256d _1, __m256d _2) +{ + return (__m256d)__builtin_lasx_xvfmina_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SF, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfmax_s (__m256 _1, __m256 _2) +{ + return (__m256)__builtin_lasx_xvfmax_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DF, V4DF, V4DF. 
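
   Editorial sketch, not part of the vendored header: a float clamp built
   from the single-precision min and max intrinsics above (__m256 carries
   eight f32 lanes per the V8SF templates). NaN propagation is left
   unspecified here and is not assumed. Illustrative helper:

     __m256 clamp_f32 (__m256 x, __m256 lo, __m256 hi)
     {
       return __lasx_xvfmin_s (__lasx_xvfmax_s (x, lo), hi);  // lane-wise clamp to [lo, hi]
     }
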
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfmax_d (__m256d _1, __m256d _2) +{ + return (__m256d)__builtin_lasx_xvfmax_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SF, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfmaxa_s (__m256 _1, __m256 _2) +{ + return (__m256)__builtin_lasx_xvfmaxa_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DF, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfmaxa_d (__m256d _1, __m256d _2) +{ + return (__m256d)__builtin_lasx_xvfmaxa_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfclass_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvfclass_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfclass_d (__m256d _1) +{ + return (__m256i)__builtin_lasx_xvfclass_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfsqrt_s (__m256 _1) +{ + return (__m256)__builtin_lasx_xvfsqrt_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfsqrt_d (__m256d _1) +{ + return (__m256d)__builtin_lasx_xvfsqrt_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfrecip_s (__m256 _1) +{ + return (__m256)__builtin_lasx_xvfrecip_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfrecip_d (__m256d _1) +{ + return (__m256d)__builtin_lasx_xvfrecip_d ((v4f64)_1); +} + +#if defined(__loongarch_frecipe) +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfrecipe_s (__m256 _1) +{ + return (__m256)__builtin_lasx_xvfrecipe_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfrecipe_d (__m256d _1) +{ + return (__m256d)__builtin_lasx_xvfrecipe_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfrsqrte_s (__m256 _1) +{ + return (__m256)__builtin_lasx_xvfrsqrte_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. 
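
   Editorial sketch, not part of the vendored header: the estimate forms
   are only declared when __loongarch_frecipe is defined, so portable code
   can fall back to the full-precision reciprocal. Reading frecipe as the
   "estimate" variant of frecip is an assumption drawn from the naming and
   the feature guard. Illustrative helper:

     static inline __m256 recip_f32 (__m256 x)
     {
     #if defined (__loongarch_frecipe)
       return __lasx_xvfrecipe_s (x);   // fast reciprocal estimate (assumed)
     #else
       return __lasx_xvfrecip_s (x);    // lane-wise reciprocal
     #endif
     }
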
*/ +/* Data types in instruction templates: V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfrsqrte_d (__m256d _1) +{ + return (__m256d)__builtin_lasx_xvfrsqrte_d ((v4f64)_1); +} +#endif + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfrint_s (__m256 _1) +{ + return (__m256)__builtin_lasx_xvfrint_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfrint_d (__m256d _1) +{ + return (__m256d)__builtin_lasx_xvfrint_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfrsqrt_s (__m256 _1) +{ + return (__m256)__builtin_lasx_xvfrsqrt_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfrsqrt_d (__m256d _1) +{ + return (__m256d)__builtin_lasx_xvfrsqrt_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvflogb_s (__m256 _1) +{ + return (__m256)__builtin_lasx_xvflogb_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvflogb_d (__m256d _1) +{ + return (__m256d)__builtin_lasx_xvflogb_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SF, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfcvth_s_h (__m256i _1) +{ + return (__m256)__builtin_lasx_xvfcvth_s_h ((v16i16)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfcvth_d_s (__m256 _1) +{ + return (__m256d)__builtin_lasx_xvfcvth_d_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SF, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfcvtl_s_h (__m256i _1) +{ + return (__m256)__builtin_lasx_xvfcvtl_s_h ((v16i16)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfcvtl_d_s (__m256 _1) +{ + return (__m256d)__builtin_lasx_xvfcvtl_d_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftint_w_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftint_w_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V4DF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftint_l_d (__m256d _1) +{ + return (__m256i)__builtin_lasx_xvftint_l_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: UV8SI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftint_wu_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftint_wu_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: UV4DI, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftint_lu_d (__m256d _1) +{ + return (__m256i)__builtin_lasx_xvftint_lu_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrz_w_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftintrz_w_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrz_l_d (__m256d _1) +{ + return (__m256i)__builtin_lasx_xvftintrz_l_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: UV8SI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrz_wu_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftintrz_wu_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: UV4DI, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrz_lu_d (__m256d _1) +{ + return (__m256i)__builtin_lasx_xvftintrz_lu_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SF, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvffint_s_w (__m256i _1) +{ + return (__m256)__builtin_lasx_xvffint_s_w ((v8i32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvffint_d_l (__m256i _1) +{ + return (__m256d)__builtin_lasx_xvffint_d_l ((v4i64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SF, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvffint_s_wu (__m256i _1) +{ + return (__m256)__builtin_lasx_xvffint_s_wu ((v8u32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvffint_d_lu (__m256i _1) +{ + return (__m256d)__builtin_lasx_xvffint_d_lu ((v4u64)_1); +} + +/* Assembly instruction format: xd, xj, rk. */ +/* Data types in instruction templates: V32QI, V32QI, SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvreplve_b (__m256i _1, int _2) +{ + return (__m256i)__builtin_lasx_xvreplve_b ((v32i8)_1, (int)_2); +} + +/* Assembly instruction format: xd, xj, rk. */ +/* Data types in instruction templates: V16HI, V16HI, SI. 
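
   Editorial sketch, not part of the vendored header: xvftintrz_w_s
   converts eight f32 lanes to i32 and xvffint_s_w converts back; reading
   the rz suffix as "round toward zero" (C-style truncation) is an
   assumption from the mnemonic. Illustrative helper:

     __m256 truncate_to_whole (__m256 x)
     {
       __m256i i = __lasx_xvftintrz_w_s (x);   // f32 -> i32, truncating (assumed)
       return __lasx_xvffint_s_w (i);          // i32 -> f32
     }
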
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvreplve_h (__m256i _1, int _2) +{ + return (__m256i)__builtin_lasx_xvreplve_h ((v16i16)_1, (int)_2); +} + +/* Assembly instruction format: xd, xj, rk. */ +/* Data types in instruction templates: V8SI, V8SI, SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvreplve_w (__m256i _1, int _2) +{ + return (__m256i)__builtin_lasx_xvreplve_w ((v8i32)_1, (int)_2); +} + +/* Assembly instruction format: xd, xj, rk. */ +/* Data types in instruction templates: V4DI, V4DI, SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvreplve_d (__m256i _1, int _2) +{ + return (__m256i)__builtin_lasx_xvreplve_d ((v4i64)_1, (int)_2); +} + +/* Assembly instruction format: xd, xj, ui8. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI, USI. */ +#define __lasx_xvpermi_w(/*__m256i*/ _1, /*__m256i*/ _2, /*ui8*/ _3) \ + ((__m256i)__builtin_lasx_xvpermi_w ((v8i32)(_1), (v8i32)(_2), (_3))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvandn_v (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvandn_v ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvneg_b (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvneg_b ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvneg_h (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvneg_h ((v16i16)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvneg_w (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvneg_w ((v8i32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvneg_d (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvneg_d ((v4i64)_1); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmuh_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmuh_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmuh_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmuh_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. 
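
   Editorial sketch, not part of the vendored header: xvneg_* negates each
   lane and xvmuh_* returns the high half of the lane-wise product (per the
   muh mnemonic; treat that as an assumption). Illustrative helper:

     __m256i neg_hi_product_i16 (__m256i a, __m256i b)
     {
       __m256i hi = __lasx_xvmuh_h (a, b);   // upper 16 bits of each 16x16 product (assumed)
       return __lasx_xvneg_h (hi);           // lane-wise negate
     }
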
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmuh_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmuh_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmuh_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmuh_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmuh_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmuh_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmuh_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmuh_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmuh_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmuh_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmuh_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmuh_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: V16HI, V32QI, UQI. */ +#define __lasx_xvsllwil_h_b(/*__m256i*/ _1, /*ui3*/ _2) \ + ((__m256i)__builtin_lasx_xvsllwil_h_b ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V8SI, V16HI, UQI. */ +#define __lasx_xvsllwil_w_h(/*__m256i*/ _1, /*ui4*/ _2) \ + ((__m256i)__builtin_lasx_xvsllwil_w_h ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V4DI, V8SI, UQI. */ +#define __lasx_xvsllwil_d_w(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvsllwil_d_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: UV16HI, UV32QI, UQI. */ +#define __lasx_xvsllwil_hu_bu(/*__m256i*/ _1, /*ui3*/ _2) \ + ((__m256i)__builtin_lasx_xvsllwil_hu_bu ((v32u8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: UV8SI, UV16HI, UQI. */ +#define __lasx_xvsllwil_wu_hu(/*__m256i*/ _1, /*ui4*/ _2) \ + ((__m256i)__builtin_lasx_xvsllwil_wu_hu ((v16u16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV4DI, UV8SI, UQI. */ +#define __lasx_xvsllwil_du_wu(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvsllwil_du_wu ((v8u32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V16HI, V16HI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsran_b_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsran_b_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsran_h_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsran_h_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsran_w_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsran_w_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssran_b_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssran_b_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssran_h_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssran_h_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssran_w_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssran_w_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssran_bu_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssran_bu_h ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssran_hu_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssran_hu_w ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssran_wu_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssran_wu_d ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrarn_b_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrarn_b_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrarn_h_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrarn_h_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. 
*/ +/* Data types in instruction templates: V8SI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrarn_w_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrarn_w_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrarn_b_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrarn_b_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrarn_h_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrarn_h_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrarn_w_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrarn_w_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrarn_bu_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrarn_bu_h ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrarn_hu_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrarn_hu_w ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrarn_wu_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrarn_wu_d ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrln_b_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrln_b_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrln_h_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrln_h_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrln_w_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrln_w_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV16HI, UV16HI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrln_bu_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrln_bu_h ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrln_hu_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrln_hu_w ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrln_wu_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrln_wu_d ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrlrn_b_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrlrn_b_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrlrn_h_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrlrn_h_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsrlrn_w_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsrlrn_w_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV32QI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrlrn_bu_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrlrn_bu_h ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrlrn_hu_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrlrn_hu_w ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrlrn_wu_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrlrn_wu_d ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI, UQI. */ +#define __lasx_xvfrstpi_b(/*__m256i*/ _1, /*__m256i*/ _2, /*ui5*/ _3) \ + ((__m256i)__builtin_lasx_xvfrstpi_b ((v32i8)(_1), (v32i8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI, UQI. */ +#define __lasx_xvfrstpi_h(/*__m256i*/ _1, /*__m256i*/ _2, /*ui5*/ _3) \ + ((__m256i)__builtin_lasx_xvfrstpi_h ((v16i16)(_1), (v16i16)(_2), (_3))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI, V32QI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfrstp_b (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvfrstp_b ((v32i8)_1, (v32i8)_2, (v32i8)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfrstp_h (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvfrstp_h ((v16i16)_1, (v16i16)_2, (v16i16)_3); +} + +/* Assembly instruction format: xd, xj, ui8. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, USI. */ +#define __lasx_xvshuf4i_d(/*__m256i*/ _1, /*__m256i*/ _2, /*ui8*/ _3) \ + ((__m256i)__builtin_lasx_xvshuf4i_d ((v4i64)(_1), (v4i64)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V32QI, V32QI, UQI. */ +#define __lasx_xvbsrl_v(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvbsrl_v ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V32QI, V32QI, UQI. */ +#define __lasx_xvbsll_v(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvbsll_v ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui8. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI, USI. */ +#define __lasx_xvextrins_b(/*__m256i*/ _1, /*__m256i*/ _2, /*ui8*/ _3) \ + ((__m256i)__builtin_lasx_xvextrins_b ((v32i8)(_1), (v32i8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui8. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI, USI. */ +#define __lasx_xvextrins_h(/*__m256i*/ _1, /*__m256i*/ _2, /*ui8*/ _3) \ + ((__m256i)__builtin_lasx_xvextrins_h ((v16i16)(_1), (v16i16)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui8. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI, USI. */ +#define __lasx_xvextrins_w(/*__m256i*/ _1, /*__m256i*/ _2, /*ui8*/ _3) \ + ((__m256i)__builtin_lasx_xvextrins_w ((v8i32)(_1), (v8i32)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui8. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, USI. */ +#define __lasx_xvextrins_d(/*__m256i*/ _1, /*__m256i*/ _2, /*ui8*/ _3) \ + ((__m256i)__builtin_lasx_xvextrins_d ((v4i64)(_1), (v4i64)(_2), (_3))) + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmskltz_b (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvmskltz_b ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmskltz_h (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvmskltz_h ((v16i16)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmskltz_w (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvmskltz_w ((v8i32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V4DI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmskltz_d (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvmskltz_d ((v4i64)_1); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsigncov_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsigncov_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsigncov_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsigncov_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsigncov_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsigncov_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsigncov_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsigncov_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk, xa. */ +/* Data types in instruction templates: V8SF, V8SF, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfmadd_s (__m256 _1, __m256 _2, __m256 _3) +{ + return (__m256)__builtin_lasx_xvfmadd_s ((v8f32)_1, (v8f32)_2, (v8f32)_3); +} + +/* Assembly instruction format: xd, xj, xk, xa. */ +/* Data types in instruction templates: V4DF, V4DF, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfmadd_d (__m256d _1, __m256d _2, __m256d _3) +{ + return (__m256d)__builtin_lasx_xvfmadd_d ((v4f64)_1, (v4f64)_2, (v4f64)_3); +} + +/* Assembly instruction format: xd, xj, xk, xa. */ +/* Data types in instruction templates: V8SF, V8SF, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfmsub_s (__m256 _1, __m256 _2, __m256 _3) +{ + return (__m256)__builtin_lasx_xvfmsub_s ((v8f32)_1, (v8f32)_2, (v8f32)_3); +} + +/* Assembly instruction format: xd, xj, xk, xa. */ +/* Data types in instruction templates: V4DF, V4DF, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfmsub_d (__m256d _1, __m256d _2, __m256d _3) +{ + return (__m256d)__builtin_lasx_xvfmsub_d ((v4f64)_1, (v4f64)_2, (v4f64)_3); +} + +/* Assembly instruction format: xd, xj, xk, xa. */ +/* Data types in instruction templates: V8SF, V8SF, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfnmadd_s (__m256 _1, __m256 _2, __m256 _3) +{ + return (__m256)__builtin_lasx_xvfnmadd_s ((v8f32)_1, (v8f32)_2, (v8f32)_3); +} + +/* Assembly instruction format: xd, xj, xk, xa. */ +/* Data types in instruction templates: V4DF, V4DF, V4DF, V4DF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfnmadd_d (__m256d _1, __m256d _2, __m256d _3) +{ + return (__m256d)__builtin_lasx_xvfnmadd_d ((v4f64)_1, (v4f64)_2, (v4f64)_3); +} + +/* Assembly instruction format: xd, xj, xk, xa. */ +/* Data types in instruction templates: V8SF, V8SF, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfnmsub_s (__m256 _1, __m256 _2, __m256 _3) +{ + return (__m256)__builtin_lasx_xvfnmsub_s ((v8f32)_1, (v8f32)_2, (v8f32)_3); +} + +/* Assembly instruction format: xd, xj, xk, xa. */ +/* Data types in instruction templates: V4DF, V4DF, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfnmsub_d (__m256d _1, __m256d _2, __m256d _3) +{ + return (__m256d)__builtin_lasx_xvfnmsub_d ((v4f64)_1, (v4f64)_2, (v4f64)_3); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrne_w_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftintrne_w_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrne_l_d (__m256d _1) +{ + return (__m256i)__builtin_lasx_xvftintrne_l_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrp_w_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftintrp_w_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrp_l_d (__m256d _1) +{ + return (__m256i)__builtin_lasx_xvftintrp_l_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrm_w_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftintrm_w_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrm_l_d (__m256d _1) +{ + return (__m256i)__builtin_lasx_xvftintrm_l_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftint_w_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvftint_w_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SF, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvffint_s_l (__m256i _1, __m256i _2) +{ + return (__m256)__builtin_lasx_xvffint_s_l ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V4DF, V4DF. 
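*/
+
+/* Editorial sketch, not part of the vendored GCC <lasxintrin.h> content: the
+   fused multiply-add wrapper __lasx_xvfmadd_s and the round-to-nearest-even
+   conversion __lasx_xvftintrne_w_s defined above combine naturally when a
+   scaled-and-biased f32 vector has to end up as int32 lanes.  The helper name
+   lasx_fma_to_i32 is hypothetical, introduced only for illustration.
+
+     static inline __m256i lasx_fma_to_i32 (__m256 x, __m256 scale, __m256 bias)
+     {
+       __m256 y = __lasx_xvfmadd_s (x, scale, bias);   // y = x * scale + bias
+       return __lasx_xvftintrne_w_s (y);               // f32 -> i32, nearest even
+     }
+*/
+
+/* xvftintrz_w_d follows; assembly format and data types as noted above.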
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrz_w_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvftintrz_w_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrp_w_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvftintrp_w_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrm_w_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvftintrm_w_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrne_w_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvftintrne_w_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftinth_l_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftinth_l_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintl_l_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftintl_l_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvffinth_d_w (__m256i _1) +{ + return (__m256d)__builtin_lasx_xvffinth_d_w ((v8i32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvffintl_d_w (__m256i _1) +{ + return (__m256d)__builtin_lasx_xvffintl_d_w ((v8i32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrzh_l_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftintrzh_l_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrzl_l_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftintrzl_l_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrph_l_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftintrph_l_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V8SF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrpl_l_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftintrpl_l_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrmh_l_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftintrmh_l_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrml_l_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftintrml_l_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrneh_l_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftintrneh_l_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvftintrnel_l_s (__m256 _1) +{ + return (__m256i)__builtin_lasx_xvftintrnel_l_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfrintrne_s (__m256 _1) +{ + return (__m256)__builtin_lasx_xvfrintrne_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfrintrne_d (__m256d _1) +{ + return (__m256d)__builtin_lasx_xvfrintrne_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfrintrz_s (__m256 _1) +{ + return (__m256)__builtin_lasx_xvfrintrz_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfrintrz_d (__m256d _1) +{ + return (__m256d)__builtin_lasx_xvfrintrz_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfrintrp_s (__m256 _1) +{ + return (__m256)__builtin_lasx_xvfrintrp_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfrintrp_d (__m256d _1) +{ + return (__m256d)__builtin_lasx_xvfrintrp_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_xvfrintrm_s (__m256 _1) +{ + return (__m256)__builtin_lasx_xvfrintrm_s ((v8f32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DF, V4DF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_xvfrintrm_d (__m256d _1) +{ + return (__m256d)__builtin_lasx_xvfrintrm_d ((v4f64)_1); +} + +/* Assembly instruction format: xd, rj, si12. */ +/* Data types in instruction templates: V32QI, CVPOINTER, SI. */ +#define __lasx_xvld(/*void **/ _1, /*si12*/ _2) \ + ((__m256i)__builtin_lasx_xvld ((void *)(_1), (_2))) + +/* Assembly instruction format: xd, rj, si12. */ +/* Data types in instruction templates: VOID, V32QI, CVPOINTER, SI. */ +#define __lasx_xvst(/*__m256i*/ _1, /*void **/ _2, /*si12*/ _3) \ + ((void)__builtin_lasx_xvst ((v32i8)(_1), (void *)(_2), (_3))) + +/* Assembly instruction format: xd, rj, si8, idx. */ +/* Data types in instruction templates: VOID, V32QI, CVPOINTER, SI, UQI. */ +#define __lasx_xvstelm_b(/*__m256i*/ _1, /*void **/ _2, /*si8*/ _3, /*idx*/ _4) \ + ((void)__builtin_lasx_xvstelm_b ((v32i8)(_1), (void *)(_2), (_3), (_4))) + +/* Assembly instruction format: xd, rj, si8, idx. */ +/* Data types in instruction templates: VOID, V16HI, CVPOINTER, SI, UQI. */ +#define __lasx_xvstelm_h(/*__m256i*/ _1, /*void **/ _2, /*si8*/ _3, /*idx*/ _4) \ + ((void)__builtin_lasx_xvstelm_h ((v16i16)(_1), (void *)(_2), (_3), (_4))) + +/* Assembly instruction format: xd, rj, si8, idx. */ +/* Data types in instruction templates: VOID, V8SI, CVPOINTER, SI, UQI. */ +#define __lasx_xvstelm_w(/*__m256i*/ _1, /*void **/ _2, /*si8*/ _3, /*idx*/ _4) \ + ((void)__builtin_lasx_xvstelm_w ((v8i32)(_1), (void *)(_2), (_3), (_4))) + +/* Assembly instruction format: xd, rj, si8, idx. */ +/* Data types in instruction templates: VOID, V4DI, CVPOINTER, SI, UQI. */ +#define __lasx_xvstelm_d(/*__m256i*/ _1, /*void **/ _2, /*si8*/ _3, /*idx*/ _4) \ + ((void)__builtin_lasx_xvstelm_d ((v4i64)(_1), (void *)(_2), (_3), (_4))) + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI, UQI. */ +#define __lasx_xvinsve0_w(/*__m256i*/ _1, /*__m256i*/ _2, /*ui3*/ _3) \ + ((__m256i)__builtin_lasx_xvinsve0_w ((v8i32)(_1), (v8i32)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui2. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, UQI. */ +#define __lasx_xvinsve0_d(/*__m256i*/ _1, /*__m256i*/ _2, /*ui2*/ _3) \ + ((__m256i)__builtin_lasx_xvinsve0_d ((v4i64)(_1), (v4i64)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: V8SI, V8SI, UQI. */ +#define __lasx_xvpickve_w(/*__m256i*/ _1, /*ui3*/ _2) \ + ((__m256i)__builtin_lasx_xvpickve_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui2. */ +/* Data types in instruction templates: V4DI, V4DI, UQI. */ +#define __lasx_xvpickve_d(/*__m256i*/ _1, /*ui2*/ _2) \ + ((__m256i)__builtin_lasx_xvpickve_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrlrn_b_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrlrn_b_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrlrn_h_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrlrn_h_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. 
*/ +/* Data types in instruction templates: V8SI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrlrn_w_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrlrn_w_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrln_b_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrln_b_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrln_h_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrln_h_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvssrln_w_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvssrln_w_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvorn_v (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvorn_v ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, i13. */ +/* Data types in instruction templates: V4DI, HI. */ +#define __lasx_xvldi(/*i13*/ _1) \ + ((__m256i)__builtin_lasx_xvldi ((_1))) + +/* Assembly instruction format: xd, rj, rk. */ +/* Data types in instruction templates: V32QI, CVPOINTER, DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvldx (void * _1, long int _2) +{ + return (__m256i)__builtin_lasx_xvldx ((void *)_1, (long int)_2); +} + +/* Assembly instruction format: xd, rj, rk. */ +/* Data types in instruction templates: VOID, V32QI, CVPOINTER, DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +void __lasx_xvstx (__m256i _1, void * _2, long int _3) +{ + return (void)__builtin_lasx_xvstx ((v32i8)_1, (void *)_2, (long int)_3); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvextl_qu_du (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvextl_qu_du ((v4u64)_1); +} + +/* Assembly instruction format: xd, rj, ui3. */ +/* Data types in instruction templates: V8SI, V8SI, SI, UQI. */ +#define __lasx_xvinsgr2vr_w(/*__m256i*/ _1, /*int*/ _2, /*ui3*/ _3) \ + ((__m256i)__builtin_lasx_xvinsgr2vr_w ((v8i32)(_1), (int)(_2), (_3))) + +/* Assembly instruction format: xd, rj, ui2. */ +/* Data types in instruction templates: V4DI, V4DI, DI, UQI. */ +#define __lasx_xvinsgr2vr_d(/*__m256i*/ _1, /*long int*/ _2, /*ui2*/ _3) \ + ((__m256i)__builtin_lasx_xvinsgr2vr_d ((v4i64)(_1), (long int)(_2), (_3))) + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V32QI, V32QI. 
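*/
+
+/* Editorial sketch, not part of the vendored GCC <lasxintrin.h> content: the
+   __lasx_xvld / __lasx_xvst macros defined earlier take a base pointer plus a
+   12-bit signed immediate byte offset, so a 256-bit value can be loaded,
+   transformed with any of the wrappers (here __lasx_xvneg_w), and stored back.
+   The helper name lasx_negate8_i32 is hypothetical, introduced only for
+   illustration.
+
+     static inline void lasx_negate8_i32 (int *p)   // p points at 8 int32 values
+     {
+       __m256i v = __lasx_xvld (p, 0);   // load 32 bytes from p + 0
+       v = __lasx_xvneg_w (v);           // negate each 32-bit lane
+       __lasx_xvst (v, p, 0);            // store the result back to p + 0
+     }
+*/
+
+/* xvreplve0_b follows; assembly format and data types as noted above.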
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvreplve0_b (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvreplve0_b ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvreplve0_h (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvreplve0_h ((v16i16)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvreplve0_w (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvreplve0_w ((v8i32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvreplve0_d (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvreplve0_d ((v4i64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvreplve0_q (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvreplve0_q ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V16HI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_vext2xv_h_b (__m256i _1) +{ + return (__m256i)__builtin_lasx_vext2xv_h_b ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_vext2xv_w_h (__m256i _1) +{ + return (__m256i)__builtin_lasx_vext2xv_w_h ((v16i16)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_vext2xv_d_w (__m256i _1) +{ + return (__m256i)__builtin_lasx_vext2xv_d_w ((v8i32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_vext2xv_w_b (__m256i _1) +{ + return (__m256i)__builtin_lasx_vext2xv_w_b ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_vext2xv_d_h (__m256i _1) +{ + return (__m256i)__builtin_lasx_vext2xv_d_h ((v16i16)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_vext2xv_d_b (__m256i _1) +{ + return (__m256i)__builtin_lasx_vext2xv_d_b ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V16HI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_vext2xv_hu_bu (__m256i _1) +{ + return (__m256i)__builtin_lasx_vext2xv_hu_bu ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V16HI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_vext2xv_wu_hu (__m256i _1) +{ + return (__m256i)__builtin_lasx_vext2xv_wu_hu ((v16i16)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_vext2xv_du_wu (__m256i _1) +{ + return (__m256i)__builtin_lasx_vext2xv_du_wu ((v8i32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_vext2xv_wu_bu (__m256i _1) +{ + return (__m256i)__builtin_lasx_vext2xv_wu_bu ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_vext2xv_du_hu (__m256i _1) +{ + return (__m256i)__builtin_lasx_vext2xv_du_hu ((v16i16)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_vext2xv_du_bu (__m256i _1) +{ + return (__m256i)__builtin_lasx_vext2xv_du_bu ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj, ui8. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI, USI. */ +#define __lasx_xvpermi_q(/*__m256i*/ _1, /*__m256i*/ _2, /*ui8*/ _3) \ + ((__m256i)__builtin_lasx_xvpermi_q ((v32i8)(_1), (v32i8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui8. */ +/* Data types in instruction templates: V4DI, V4DI, USI. */ +#define __lasx_xvpermi_d(/*__m256i*/ _1, /*ui8*/ _2) \ + ((__m256i)__builtin_lasx_xvpermi_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvperm_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvperm_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, rj, si12. */ +/* Data types in instruction templates: V32QI, CVPOINTER, SI. */ +#define __lasx_xvldrepl_b(/*void **/ _1, /*si12*/ _2) \ + ((__m256i)__builtin_lasx_xvldrepl_b ((void *)(_1), (_2))) + +/* Assembly instruction format: xd, rj, si11. */ +/* Data types in instruction templates: V16HI, CVPOINTER, SI. */ +#define __lasx_xvldrepl_h(/*void **/ _1, /*si11*/ _2) \ + ((__m256i)__builtin_lasx_xvldrepl_h ((void *)(_1), (_2))) + +/* Assembly instruction format: xd, rj, si10. */ +/* Data types in instruction templates: V8SI, CVPOINTER, SI. */ +#define __lasx_xvldrepl_w(/*void **/ _1, /*si10*/ _2) \ + ((__m256i)__builtin_lasx_xvldrepl_w ((void *)(_1), (_2))) + +/* Assembly instruction format: xd, rj, si9. */ +/* Data types in instruction templates: V4DI, CVPOINTER, SI. */ +#define __lasx_xvldrepl_d(/*void **/ _1, /*si9*/ _2) \ + ((__m256i)__builtin_lasx_xvldrepl_d ((void *)(_1), (_2))) + +/* Assembly instruction format: rd, xj, ui3. */ +/* Data types in instruction templates: SI, V8SI, UQI. */ +#define __lasx_xvpickve2gr_w(/*__m256i*/ _1, /*ui3*/ _2) \ + ((int)__builtin_lasx_xvpickve2gr_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: rd, xj, ui3. */ +/* Data types in instruction templates: USI, V8SI, UQI. 
*/ +#define __lasx_xvpickve2gr_wu(/*__m256i*/ _1, /*ui3*/ _2) \ + ((unsigned int)__builtin_lasx_xvpickve2gr_wu ((v8i32)(_1), (_2))) + +/* Assembly instruction format: rd, xj, ui2. */ +/* Data types in instruction templates: DI, V4DI, UQI. */ +#define __lasx_xvpickve2gr_d(/*__m256i*/ _1, /*ui2*/ _2) \ + ((long int)__builtin_lasx_xvpickve2gr_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: rd, xj, ui2. */ +/* Data types in instruction templates: UDI, V4DI, UQI. */ +#define __lasx_xvpickve2gr_du(/*__m256i*/ _1, /*ui2*/ _2) \ + ((unsigned long int)__builtin_lasx_xvpickve2gr_du ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwev_q_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwev_q_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwev_d_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwev_d_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwev_w_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwev_w_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwev_h_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwev_h_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwev_q_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwev_q_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwev_d_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwev_d_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwev_w_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwev_w_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwev_h_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwev_h_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwev_q_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwev_q_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwev_d_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwev_d_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwev_w_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwev_w_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwev_h_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwev_h_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwev_q_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwev_q_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwev_d_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwev_d_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwev_w_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwev_w_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwev_h_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwev_h_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwev_q_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwev_q_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwev_d_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwev_d_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V16HI, V16HI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwev_w_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwev_w_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwev_h_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwev_h_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwev_q_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwev_q_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwev_d_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwev_d_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwev_w_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwev_w_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwev_h_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwev_h_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwod_q_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwod_q_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwod_d_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwod_d_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwod_w_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwod_w_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwod_h_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwod_h_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV4DI, UV4DI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwod_q_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwod_q_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwod_d_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwod_d_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwod_w_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwod_w_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwod_h_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwod_h_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwod_q_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwod_q_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwod_d_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwod_d_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwod_w_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwod_w_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwod_h_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwod_h_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwod_q_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwod_q_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwod_d_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwod_d_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, UV16HI, UV16HI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwod_w_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwod_w_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsubwod_h_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsubwod_h_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwod_q_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwod_q_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwod_d_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwod_d_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwod_w_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwod_w_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwod_h_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwod_h_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwod_q_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwod_q_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwod_d_wu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwod_d_wu ((v8u32)_1, (v8u32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwod_w_hu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwod_w_hu ((v16u16)_1, (v16u16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwod_h_bu (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwod_h_bu ((v32u8)_1, (v32u8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV8SI, V8SI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwev_d_wu_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwev_d_wu_w ((v8u32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, UV16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwev_w_hu_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwev_w_hu_h ((v16u16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, UV32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwev_h_bu_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwev_h_bu_b ((v32u8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwev_d_wu_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwev_d_wu_w ((v8u32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, UV16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwev_w_hu_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwev_w_hu_h ((v16u16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, UV32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwev_h_bu_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwev_h_bu_b ((v32u8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwod_d_wu_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwod_d_wu_w ((v8u32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, UV16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwod_w_hu_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwod_w_hu_h ((v16u16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, UV32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwod_h_bu_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwod_h_bu_b ((v32u8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwod_d_wu_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwod_d_wu_w ((v8u32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, UV16HI, V16HI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwod_w_hu_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwod_w_hu_h ((v16u16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, UV32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwod_h_bu_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwod_h_bu_b ((v32u8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhaddw_q_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhaddw_q_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhaddw_qu_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhaddw_qu_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhsubw_q_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhsubw_q_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvhsubw_qu_du (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvhsubw_qu_du ((v4u64)_1, (v4u64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwev_q_d (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwev_q_d ((v4i64)_1, (v4i64)_2, (v4i64)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwev_d_w (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwev_d_w ((v4i64)_1, (v8i32)_2, (v8i32)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwev_w_h (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwev_w_h ((v8i32)_1, (v16i16)_2, (v16i16)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwev_h_b (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwev_h_b ((v16i16)_1, (v32i8)_2, (v32i8)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI, UV4DI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwev_q_du (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwev_q_du ((v4u64)_1, (v4u64)_2, (v4u64)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwev_d_wu (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwev_d_wu ((v4u64)_1, (v8u32)_2, (v8u32)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV8SI, UV8SI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwev_w_hu (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwev_w_hu ((v8u32)_1, (v16u16)_2, (v16u16)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV16HI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwev_h_bu (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwev_h_bu ((v16u16)_1, (v32u8)_2, (v32u8)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwod_q_d (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwod_q_d ((v4i64)_1, (v4i64)_2, (v4i64)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwod_d_w (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwod_d_w ((v4i64)_1, (v8i32)_2, (v8i32)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwod_w_h (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwod_w_h ((v8i32)_1, (v16i16)_2, (v16i16)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwod_h_b (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwod_h_b ((v16i16)_1, (v32i8)_2, (v32i8)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwod_q_du (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwod_q_du ((v4u64)_1, (v4u64)_2, (v4u64)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV4DI, UV4DI, UV8SI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwod_d_wu (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwod_d_wu ((v4u64)_1, (v8u32)_2, (v8u32)_3); +} + +/* Assembly instruction format: xd, xj, xk. 
*/ +/* Data types in instruction templates: UV8SI, UV8SI, UV16HI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwod_w_hu (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwod_w_hu ((v8u32)_1, (v16u16)_2, (v16u16)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: UV16HI, UV16HI, UV32QI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwod_h_bu (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwod_h_bu ((v16u16)_1, (v32u8)_2, (v32u8)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, UV4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwev_q_du_d (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwev_q_du_d ((v4i64)_1, (v4u64)_2, (v4i64)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, UV8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwev_d_wu_w (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwev_d_wu_w ((v4i64)_1, (v8u32)_2, (v8i32)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, UV16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwev_w_hu_h (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwev_w_hu_h ((v8i32)_1, (v16u16)_2, (v16i16)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, UV32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwev_h_bu_b (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwev_h_bu_b ((v16i16)_1, (v32u8)_2, (v32i8)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, UV4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwod_q_du_d (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwod_q_du_d ((v4i64)_1, (v4u64)_2, (v4i64)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, UV8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwod_d_wu_w (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwod_d_wu_w ((v4i64)_1, (v8u32)_2, (v8i32)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, UV16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwod_w_hu_h (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwod_w_hu_h ((v8i32)_1, (v16u16)_2, (v16i16)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, UV32QI, V32QI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmaddwod_h_bu_b (__m256i _1, __m256i _2, __m256i _3) +{ + return (__m256i)__builtin_lasx_xvmaddwod_h_bu_b ((v16i16)_1, (v32u8)_2, (v32i8)_3); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvrotr_b (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvrotr_b ((v32i8)_1, (v32i8)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvrotr_h (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvrotr_h ((v16i16)_1, (v16i16)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvrotr_w (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvrotr_w ((v8i32)_1, (v8i32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvrotr_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvrotr_d ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvadd_q (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvadd_q ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvsub_q (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvsub_q ((v4i64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwev_q_du_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwev_q_du_d ((v4u64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvaddwod_q_du_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvaddwod_q_du_d ((v4u64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwev_q_du_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwev_q_du_d ((v4u64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, UV4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmulwod_q_du_d (__m256i _1, __m256i _2) +{ + return (__m256i)__builtin_lasx_xvmulwod_q_du_d ((v4u64)_1, (v4i64)_2); +} + +/* Assembly instruction format: xd, xj. 
*/ +/* Data types in instruction templates: V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmskgez_b (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvmskgez_b ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V32QI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvmsknz_b (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvmsknz_b ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V16HI, V32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvexth_h_b (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvexth_h_b ((v32i8)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V8SI, V16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvexth_w_h (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvexth_w_h ((v16i16)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvexth_d_w (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvexth_d_w ((v8i32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvexth_q_d (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvexth_q_d ((v4i64)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: UV16HI, UV32QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvexth_hu_bu (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvexth_hu_bu ((v32u8)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: UV8SI, UV16HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvexth_wu_hu (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvexth_wu_hu ((v16u16)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: UV4DI, UV8SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvexth_du_wu (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvexth_du_wu ((v8u32)_1); +} + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: UV4DI, UV4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvexth_qu_du (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvexth_qu_du ((v4u64)_1); +} + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: V32QI, V32QI, UQI. */ +#define __lasx_xvrotri_b(/*__m256i*/ _1, /*ui3*/ _2) \ + ((__m256i)__builtin_lasx_xvrotri_b ((v32i8)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V16HI, V16HI, UQI. */ +#define __lasx_xvrotri_h(/*__m256i*/ _1, /*ui4*/ _2) \ + ((__m256i)__builtin_lasx_xvrotri_h ((v16i16)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V8SI, V8SI, UQI. 
*/ +#define __lasx_xvrotri_w(/*__m256i*/ _1, /*ui5*/ _2) \ + ((__m256i)__builtin_lasx_xvrotri_w ((v8i32)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: V4DI, V4DI, UQI. */ +#define __lasx_xvrotri_d(/*__m256i*/ _1, /*ui6*/ _2) \ + ((__m256i)__builtin_lasx_xvrotri_d ((v4i64)(_1), (_2))) + +/* Assembly instruction format: xd, xj. */ +/* Data types in instruction templates: V4DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvextl_q_d (__m256i _1) +{ + return (__m256i)__builtin_lasx_xvextl_q_d ((v4i64)_1); +} + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI, USI. */ +#define __lasx_xvsrlni_b_h(/*__m256i*/ _1, /*__m256i*/ _2, /*ui4*/ _3) \ + ((__m256i)__builtin_lasx_xvsrlni_b_h ((v32i8)(_1), (v32i8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI, USI. */ +#define __lasx_xvsrlni_h_w(/*__m256i*/ _1, /*__m256i*/ _2, /*ui5*/ _3) \ + ((__m256i)__builtin_lasx_xvsrlni_h_w ((v16i16)(_1), (v16i16)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI, USI. */ +#define __lasx_xvsrlni_w_d(/*__m256i*/ _1, /*__m256i*/ _2, /*ui6*/ _3) \ + ((__m256i)__builtin_lasx_xvsrlni_w_d ((v8i32)(_1), (v8i32)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui7. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, USI. */ +#define __lasx_xvsrlni_d_q(/*__m256i*/ _1, /*__m256i*/ _2, /*ui7*/ _3) \ + ((__m256i)__builtin_lasx_xvsrlni_d_q ((v4i64)(_1), (v4i64)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI, USI. */ +#define __lasx_xvsrlrni_b_h(/*__m256i*/ _1, /*__m256i*/ _2, /*ui4*/ _3) \ + ((__m256i)__builtin_lasx_xvsrlrni_b_h ((v32i8)(_1), (v32i8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI, USI. */ +#define __lasx_xvsrlrni_h_w(/*__m256i*/ _1, /*__m256i*/ _2, /*ui5*/ _3) \ + ((__m256i)__builtin_lasx_xvsrlrni_h_w ((v16i16)(_1), (v16i16)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI, USI. */ +#define __lasx_xvsrlrni_w_d(/*__m256i*/ _1, /*__m256i*/ _2, /*ui6*/ _3) \ + ((__m256i)__builtin_lasx_xvsrlrni_w_d ((v8i32)(_1), (v8i32)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui7. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, USI. */ +#define __lasx_xvsrlrni_d_q(/*__m256i*/ _1, /*__m256i*/ _2, /*ui7*/ _3) \ + ((__m256i)__builtin_lasx_xvsrlrni_d_q ((v4i64)(_1), (v4i64)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI, USI. */ +#define __lasx_xvssrlni_b_h(/*__m256i*/ _1, /*__m256i*/ _2, /*ui4*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlni_b_h ((v32i8)(_1), (v32i8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI, USI. */ +#define __lasx_xvssrlni_h_w(/*__m256i*/ _1, /*__m256i*/ _2, /*ui5*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlni_h_w ((v16i16)(_1), (v16i16)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI, USI. 
*/ +#define __lasx_xvssrlni_w_d(/*__m256i*/ _1, /*__m256i*/ _2, /*ui6*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlni_w_d ((v8i32)(_1), (v8i32)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui7. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, USI. */ +#define __lasx_xvssrlni_d_q(/*__m256i*/ _1, /*__m256i*/ _2, /*ui7*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlni_d_q ((v4i64)(_1), (v4i64)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: UV32QI, UV32QI, V32QI, USI. */ +#define __lasx_xvssrlni_bu_h(/*__m256i*/ _1, /*__m256i*/ _2, /*ui4*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlni_bu_h ((v32u8)(_1), (v32i8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV16HI, UV16HI, V16HI, USI. */ +#define __lasx_xvssrlni_hu_w(/*__m256i*/ _1, /*__m256i*/ _2, /*ui5*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlni_hu_w ((v16u16)(_1), (v16i16)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: UV8SI, UV8SI, V8SI, USI. */ +#define __lasx_xvssrlni_wu_d(/*__m256i*/ _1, /*__m256i*/ _2, /*ui6*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlni_wu_d ((v8u32)(_1), (v8i32)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui7. */ +/* Data types in instruction templates: UV4DI, UV4DI, V4DI, USI. */ +#define __lasx_xvssrlni_du_q(/*__m256i*/ _1, /*__m256i*/ _2, /*ui7*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlni_du_q ((v4u64)(_1), (v4i64)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI, USI. */ +#define __lasx_xvssrlrni_b_h(/*__m256i*/ _1, /*__m256i*/ _2, /*ui4*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlrni_b_h ((v32i8)(_1), (v32i8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI, USI. */ +#define __lasx_xvssrlrni_h_w(/*__m256i*/ _1, /*__m256i*/ _2, /*ui5*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlrni_h_w ((v16i16)(_1), (v16i16)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI, USI. */ +#define __lasx_xvssrlrni_w_d(/*__m256i*/ _1, /*__m256i*/ _2, /*ui6*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlrni_w_d ((v8i32)(_1), (v8i32)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui7. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, USI. */ +#define __lasx_xvssrlrni_d_q(/*__m256i*/ _1, /*__m256i*/ _2, /*ui7*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlrni_d_q ((v4i64)(_1), (v4i64)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: UV32QI, UV32QI, V32QI, USI. */ +#define __lasx_xvssrlrni_bu_h(/*__m256i*/ _1, /*__m256i*/ _2, /*ui4*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlrni_bu_h ((v32u8)(_1), (v32i8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV16HI, UV16HI, V16HI, USI. */ +#define __lasx_xvssrlrni_hu_w(/*__m256i*/ _1, /*__m256i*/ _2, /*ui5*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlrni_hu_w ((v16u16)(_1), (v16i16)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: UV8SI, UV8SI, V8SI, USI. */ +#define __lasx_xvssrlrni_wu_d(/*__m256i*/ _1, /*__m256i*/ _2, /*ui6*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlrni_wu_d ((v8u32)(_1), (v8i32)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui7. */ +/* Data types in instruction templates: UV4DI, UV4DI, V4DI, USI. 
*/ +#define __lasx_xvssrlrni_du_q(/*__m256i*/ _1, /*__m256i*/ _2, /*ui7*/ _3) \ + ((__m256i)__builtin_lasx_xvssrlrni_du_q ((v4u64)(_1), (v4i64)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI, USI. */ +#define __lasx_xvsrani_b_h(/*__m256i*/ _1, /*__m256i*/ _2, /*ui4*/ _3) \ + ((__m256i)__builtin_lasx_xvsrani_b_h ((v32i8)(_1), (v32i8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI, USI. */ +#define __lasx_xvsrani_h_w(/*__m256i*/ _1, /*__m256i*/ _2, /*ui5*/ _3) \ + ((__m256i)__builtin_lasx_xvsrani_h_w ((v16i16)(_1), (v16i16)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI, USI. */ +#define __lasx_xvsrani_w_d(/*__m256i*/ _1, /*__m256i*/ _2, /*ui6*/ _3) \ + ((__m256i)__builtin_lasx_xvsrani_w_d ((v8i32)(_1), (v8i32)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui7. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, USI. */ +#define __lasx_xvsrani_d_q(/*__m256i*/ _1, /*__m256i*/ _2, /*ui7*/ _3) \ + ((__m256i)__builtin_lasx_xvsrani_d_q ((v4i64)(_1), (v4i64)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI, USI. */ +#define __lasx_xvsrarni_b_h(/*__m256i*/ _1, /*__m256i*/ _2, /*ui4*/ _3) \ + ((__m256i)__builtin_lasx_xvsrarni_b_h ((v32i8)(_1), (v32i8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI, USI. */ +#define __lasx_xvsrarni_h_w(/*__m256i*/ _1, /*__m256i*/ _2, /*ui5*/ _3) \ + ((__m256i)__builtin_lasx_xvsrarni_h_w ((v16i16)(_1), (v16i16)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI, USI. */ +#define __lasx_xvsrarni_w_d(/*__m256i*/ _1, /*__m256i*/ _2, /*ui6*/ _3) \ + ((__m256i)__builtin_lasx_xvsrarni_w_d ((v8i32)(_1), (v8i32)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui7. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, USI. */ +#define __lasx_xvsrarni_d_q(/*__m256i*/ _1, /*__m256i*/ _2, /*ui7*/ _3) \ + ((__m256i)__builtin_lasx_xvsrarni_d_q ((v4i64)(_1), (v4i64)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI, USI. */ +#define __lasx_xvssrani_b_h(/*__m256i*/ _1, /*__m256i*/ _2, /*ui4*/ _3) \ + ((__m256i)__builtin_lasx_xvssrani_b_h ((v32i8)(_1), (v32i8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI, USI. */ +#define __lasx_xvssrani_h_w(/*__m256i*/ _1, /*__m256i*/ _2, /*ui5*/ _3) \ + ((__m256i)__builtin_lasx_xvssrani_h_w ((v16i16)(_1), (v16i16)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI, USI. */ +#define __lasx_xvssrani_w_d(/*__m256i*/ _1, /*__m256i*/ _2, /*ui6*/ _3) \ + ((__m256i)__builtin_lasx_xvssrani_w_d ((v8i32)(_1), (v8i32)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui7. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, USI. */ +#define __lasx_xvssrani_d_q(/*__m256i*/ _1, /*__m256i*/ _2, /*ui7*/ _3) \ + ((__m256i)__builtin_lasx_xvssrani_d_q ((v4i64)(_1), (v4i64)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: UV32QI, UV32QI, V32QI, USI. 
*/ +#define __lasx_xvssrani_bu_h(/*__m256i*/ _1, /*__m256i*/ _2, /*ui4*/ _3) \ + ((__m256i)__builtin_lasx_xvssrani_bu_h ((v32u8)(_1), (v32i8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV16HI, UV16HI, V16HI, USI. */ +#define __lasx_xvssrani_hu_w(/*__m256i*/ _1, /*__m256i*/ _2, /*ui5*/ _3) \ + ((__m256i)__builtin_lasx_xvssrani_hu_w ((v16u16)(_1), (v16i16)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: UV8SI, UV8SI, V8SI, USI. */ +#define __lasx_xvssrani_wu_d(/*__m256i*/ _1, /*__m256i*/ _2, /*ui6*/ _3) \ + ((__m256i)__builtin_lasx_xvssrani_wu_d ((v8u32)(_1), (v8i32)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui7. */ +/* Data types in instruction templates: UV4DI, UV4DI, V4DI, USI. */ +#define __lasx_xvssrani_du_q(/*__m256i*/ _1, /*__m256i*/ _2, /*ui7*/ _3) \ + ((__m256i)__builtin_lasx_xvssrani_du_q ((v4u64)(_1), (v4i64)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: V32QI, V32QI, V32QI, USI. */ +#define __lasx_xvssrarni_b_h(/*__m256i*/ _1, /*__m256i*/ _2, /*ui4*/ _3) \ + ((__m256i)__builtin_lasx_xvssrarni_b_h ((v32i8)(_1), (v32i8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: V16HI, V16HI, V16HI, USI. */ +#define __lasx_xvssrarni_h_w(/*__m256i*/ _1, /*__m256i*/ _2, /*ui5*/ _3) \ + ((__m256i)__builtin_lasx_xvssrarni_h_w ((v16i16)(_1), (v16i16)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: V8SI, V8SI, V8SI, USI. */ +#define __lasx_xvssrarni_w_d(/*__m256i*/ _1, /*__m256i*/ _2, /*ui6*/ _3) \ + ((__m256i)__builtin_lasx_xvssrarni_w_d ((v8i32)(_1), (v8i32)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui7. */ +/* Data types in instruction templates: V4DI, V4DI, V4DI, USI. */ +#define __lasx_xvssrarni_d_q(/*__m256i*/ _1, /*__m256i*/ _2, /*ui7*/ _3) \ + ((__m256i)__builtin_lasx_xvssrarni_d_q ((v4i64)(_1), (v4i64)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui4. */ +/* Data types in instruction templates: UV32QI, UV32QI, V32QI, USI. */ +#define __lasx_xvssrarni_bu_h(/*__m256i*/ _1, /*__m256i*/ _2, /*ui4*/ _3) \ + ((__m256i)__builtin_lasx_xvssrarni_bu_h ((v32u8)(_1), (v32i8)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui5. */ +/* Data types in instruction templates: UV16HI, UV16HI, V16HI, USI. */ +#define __lasx_xvssrarni_hu_w(/*__m256i*/ _1, /*__m256i*/ _2, /*ui5*/ _3) \ + ((__m256i)__builtin_lasx_xvssrarni_hu_w ((v16u16)(_1), (v16i16)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui6. */ +/* Data types in instruction templates: UV8SI, UV8SI, V8SI, USI. */ +#define __lasx_xvssrarni_wu_d(/*__m256i*/ _1, /*__m256i*/ _2, /*ui6*/ _3) \ + ((__m256i)__builtin_lasx_xvssrarni_wu_d ((v8u32)(_1), (v8i32)(_2), (_3))) + +/* Assembly instruction format: xd, xj, ui7. */ +/* Data types in instruction templates: UV4DI, UV4DI, V4DI, USI. */ +#define __lasx_xvssrarni_du_q(/*__m256i*/ _1, /*__m256i*/ _2, /*ui7*/ _3) \ + ((__m256i)__builtin_lasx_xvssrarni_du_q ((v4u64)(_1), (v4i64)(_2), (_3))) + +/* Assembly instruction format: cd, xj. */ +/* Data types in instruction templates: SI, UV32QI. */ +#define __lasx_xbnz_b(/*__m256i*/ _1) \ + ((int)__builtin_lasx_xbnz_b ((v32u8)(_1))) + +/* Assembly instruction format: cd, xj. */ +/* Data types in instruction templates: SI, UV4DI. 
*/ +#define __lasx_xbnz_d(/*__m256i*/ _1) \ + ((int)__builtin_lasx_xbnz_d ((v4u64)(_1))) + +/* Assembly instruction format: cd, xj. */ +/* Data types in instruction templates: SI, UV16HI. */ +#define __lasx_xbnz_h(/*__m256i*/ _1) \ + ((int)__builtin_lasx_xbnz_h ((v16u16)(_1))) + +/* Assembly instruction format: cd, xj. */ +/* Data types in instruction templates: SI, UV32QI. */ +#define __lasx_xbnz_v(/*__m256i*/ _1) \ + ((int)__builtin_lasx_xbnz_v ((v32u8)(_1))) + +/* Assembly instruction format: cd, xj. */ +/* Data types in instruction templates: SI, UV8SI. */ +#define __lasx_xbnz_w(/*__m256i*/ _1) \ + ((int)__builtin_lasx_xbnz_w ((v8u32)(_1))) + +/* Assembly instruction format: cd, xj. */ +/* Data types in instruction templates: SI, UV32QI. */ +#define __lasx_xbz_b(/*__m256i*/ _1) \ + ((int)__builtin_lasx_xbz_b ((v32u8)(_1))) + +/* Assembly instruction format: cd, xj. */ +/* Data types in instruction templates: SI, UV4DI. */ +#define __lasx_xbz_d(/*__m256i*/ _1) \ + ((int)__builtin_lasx_xbz_d ((v4u64)(_1))) + +/* Assembly instruction format: cd, xj. */ +/* Data types in instruction templates: SI, UV16HI. */ +#define __lasx_xbz_h(/*__m256i*/ _1) \ + ((int)__builtin_lasx_xbz_h ((v16u16)(_1))) + +/* Assembly instruction format: cd, xj. */ +/* Data types in instruction templates: SI, UV32QI. */ +#define __lasx_xbz_v(/*__m256i*/ _1) \ + ((int)__builtin_lasx_xbz_v ((v32u8)(_1))) + +/* Assembly instruction format: cd, xj. */ +/* Data types in instruction templates: SI, UV8SI. */ +#define __lasx_xbz_w(/*__m256i*/ _1) \ + ((int)__builtin_lasx_xbz_w ((v8u32)(_1))) + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_caf_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_caf_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_caf_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_caf_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_ceq_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_ceq_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_ceq_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_ceq_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cle_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cle_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cle_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cle_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_clt_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_clt_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_clt_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_clt_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cne_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cne_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cne_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cne_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cor_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cor_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cor_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cor_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cueq_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cueq_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cueq_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cueq_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cule_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cule_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cule_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cule_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. 
*/ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cult_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cult_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cult_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cult_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cun_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cun_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cune_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cune_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cune_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cune_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_cun_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_cun_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_saf_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_saf_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_saf_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_saf_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_seq_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_seq_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_seq_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_seq_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sle_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sle_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sle_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sle_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_slt_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_slt_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_slt_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_slt_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sne_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sne_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sne_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sne_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sor_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sor_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sor_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sor_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sueq_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sueq_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sueq_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sueq_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sule_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sule_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. 
*/ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sule_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sule_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sult_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sult_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sult_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sult_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sun_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sun_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V4DI, V4DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sune_d (__m256d _1, __m256d _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sune_d ((v4f64)_1, (v4f64)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sune_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sune_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, xk. */ +/* Data types in instruction templates: V8SI, V8SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_xvfcmp_sun_s (__m256 _1, __m256 _2) +{ + return (__m256i)__builtin_lasx_xvfcmp_sun_s ((v8f32)_1, (v8f32)_2); +} + +/* Assembly instruction format: xd, xj, ui2. */ +/* Data types in instruction templates: V4DF, V4DF, UQI. */ +#define __lasx_xvpickve_d_f(/*__m256d*/ _1, /*ui2*/ _2) \ + ((__m256d)__builtin_lasx_xvpickve_d_f ((v4f64)(_1), (_2))) + +/* Assembly instruction format: xd, xj, ui3. */ +/* Data types in instruction templates: V8SF, V8SF, UQI. */ +#define __lasx_xvpickve_w_f(/*__m256*/ _1, /*ui3*/ _2) \ + ((__m256)__builtin_lasx_xvpickve_w_f ((v8f32)(_1), (_2))) + +/* Assembly instruction format: xd, si10. */ +/* Data types in instruction templates: V32QI, HI. */ +#define __lasx_xvrepli_b(/*si10*/ _1) \ + ((__m256i)__builtin_lasx_xvrepli_b ((_1))) + +/* Assembly instruction format: xd, si10. */ +/* Data types in instruction templates: V4DI, HI. */ +#define __lasx_xvrepli_d(/*si10*/ _1) \ + ((__m256i)__builtin_lasx_xvrepli_d ((_1))) + +/* Assembly instruction format: xd, si10. */ +/* Data types in instruction templates: V16HI, HI. */ +#define __lasx_xvrepli_h(/*si10*/ _1) \ + ((__m256i)__builtin_lasx_xvrepli_h ((_1))) + +/* Assembly instruction format: xd, si10. */ +/* Data types in instruction templates: V8SI, HI. */ +#define __lasx_xvrepli_w(/*si10*/ _1) \ + ((__m256i)__builtin_lasx_xvrepli_w ((_1))) + +#endif /* defined(__loongarch_asx). */ +#endif /* _GCC_LOONGSON_ASXINTRIN_H. 
*/ diff --git a/library/stdarch/crates/stdarch-gen-loongarch/lsx.spec b/library/stdarch/crates/stdarch-gen-loongarch/lsx.spec new file mode 100644 index 0000000000000..dc835770d566e --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-loongarch/lsx.spec @@ -0,0 +1,3605 @@ +// This code is automatically generated. DO NOT MODIFY. +// ``` +// OUT_DIR=`pwd`/crates/stdarch-gen-loongarch cargo run -p stdarch-gen-loongarch -- crates/stdarch-gen-loongarch/lsxintrin.h +// ``` + +/// lsx_vsll_b +name = lsx_vsll_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vsll_h +name = lsx_vsll_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vsll_w +name = lsx_vsll_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vsll_d +name = lsx_vsll_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vslli_b +name = lsx_vslli_b +asm-fmts = vd, vj, ui3 +data-types = V16QI, V16QI, UQI + +/// lsx_vslli_h +name = lsx_vslli_h +asm-fmts = vd, vj, ui4 +data-types = V8HI, V8HI, UQI + +/// lsx_vslli_w +name = lsx_vslli_w +asm-fmts = vd, vj, ui5 +data-types = V4SI, V4SI, UQI + +/// lsx_vslli_d +name = lsx_vslli_d +asm-fmts = vd, vj, ui6 +data-types = V2DI, V2DI, UQI + +/// lsx_vsra_b +name = lsx_vsra_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vsra_h +name = lsx_vsra_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vsra_w +name = lsx_vsra_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vsra_d +name = lsx_vsra_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vsrai_b +name = lsx_vsrai_b +asm-fmts = vd, vj, ui3 +data-types = V16QI, V16QI, UQI + +/// lsx_vsrai_h +name = lsx_vsrai_h +asm-fmts = vd, vj, ui4 +data-types = V8HI, V8HI, UQI + +/// lsx_vsrai_w +name = lsx_vsrai_w +asm-fmts = vd, vj, ui5 +data-types = V4SI, V4SI, UQI + +/// lsx_vsrai_d +name = lsx_vsrai_d +asm-fmts = vd, vj, ui6 +data-types = V2DI, V2DI, UQI + +/// lsx_vsrar_b +name = lsx_vsrar_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vsrar_h +name = lsx_vsrar_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vsrar_w +name = lsx_vsrar_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vsrar_d +name = lsx_vsrar_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vsrari_b +name = lsx_vsrari_b +asm-fmts = vd, vj, ui3 +data-types = V16QI, V16QI, UQI + +/// lsx_vsrari_h +name = lsx_vsrari_h +asm-fmts = vd, vj, ui4 +data-types = V8HI, V8HI, UQI + +/// lsx_vsrari_w +name = lsx_vsrari_w +asm-fmts = vd, vj, ui5 +data-types = V4SI, V4SI, UQI + +/// lsx_vsrari_d +name = lsx_vsrari_d +asm-fmts = vd, vj, ui6 +data-types = V2DI, V2DI, UQI + +/// lsx_vsrl_b +name = lsx_vsrl_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vsrl_h +name = lsx_vsrl_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vsrl_w +name = lsx_vsrl_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vsrl_d +name = lsx_vsrl_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vsrli_b +name = lsx_vsrli_b +asm-fmts = vd, vj, ui3 +data-types = V16QI, V16QI, UQI + +/// lsx_vsrli_h +name = lsx_vsrli_h +asm-fmts = vd, vj, ui4 +data-types = V8HI, V8HI, UQI + +/// lsx_vsrli_w +name = lsx_vsrli_w +asm-fmts = vd, vj, ui5 +data-types = V4SI, V4SI, UQI + +/// lsx_vsrli_d +name = lsx_vsrli_d +asm-fmts = vd, vj, ui6 +data-types = V2DI, V2DI, UQI + +/// lsx_vsrlr_b +name = lsx_vsrlr_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + 
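For orientation, each spec entry above pairs an intrinsic name with its assembly operand format (asm-fmts) and the operand types of its instruction template (data-types); these records are the generator's input and describe the same wrapper shape as the LASX header earlier in this diff. A minimal sketch of the C form that an entry such as lsx_vsll_b (vd, vj, vk; V16QI, V16QI, V16QI) corresponds to, assuming GCC's lsxintrin.h mirrors the lasxintrin.h pattern above — the __m128i and v16i8 names are the 128-bit analogues of __m256i and v32i8 and are an assumption here, not part of this diff:

/* Sketch only: a register-register LSX wrapper in the style of the LASX
   wrappers above.  Assumes the header defines __m128i and v16i8 the same
   way lasxintrin.h defines __m256i and v32i8.  */
extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__m128i __lsx_vsll_b (__m128i _1, __m128i _2)
{
  return (__m128i)__builtin_lsx_vsll_b ((v16i8)_1, (v16i8)_2);
}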
+/// lsx_vsrlr_h +name = lsx_vsrlr_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vsrlr_w +name = lsx_vsrlr_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vsrlr_d +name = lsx_vsrlr_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vsrlri_b +name = lsx_vsrlri_b +asm-fmts = vd, vj, ui3 +data-types = V16QI, V16QI, UQI + +/// lsx_vsrlri_h +name = lsx_vsrlri_h +asm-fmts = vd, vj, ui4 +data-types = V8HI, V8HI, UQI + +/// lsx_vsrlri_w +name = lsx_vsrlri_w +asm-fmts = vd, vj, ui5 +data-types = V4SI, V4SI, UQI + +/// lsx_vsrlri_d +name = lsx_vsrlri_d +asm-fmts = vd, vj, ui6 +data-types = V2DI, V2DI, UQI + +/// lsx_vbitclr_b +name = lsx_vbitclr_b +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vbitclr_h +name = lsx_vbitclr_h +asm-fmts = vd, vj, vk +data-types = UV8HI, UV8HI, UV8HI + +/// lsx_vbitclr_w +name = lsx_vbitclr_w +asm-fmts = vd, vj, vk +data-types = UV4SI, UV4SI, UV4SI + +/// lsx_vbitclr_d +name = lsx_vbitclr_d +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI + +/// lsx_vbitclri_b +name = lsx_vbitclri_b +asm-fmts = vd, vj, ui3 +data-types = UV16QI, UV16QI, UQI + +/// lsx_vbitclri_h +name = lsx_vbitclri_h +asm-fmts = vd, vj, ui4 +data-types = UV8HI, UV8HI, UQI + +/// lsx_vbitclri_w +name = lsx_vbitclri_w +asm-fmts = vd, vj, ui5 +data-types = UV4SI, UV4SI, UQI + +/// lsx_vbitclri_d +name = lsx_vbitclri_d +asm-fmts = vd, vj, ui6 +data-types = UV2DI, UV2DI, UQI + +/// lsx_vbitset_b +name = lsx_vbitset_b +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vbitset_h +name = lsx_vbitset_h +asm-fmts = vd, vj, vk +data-types = UV8HI, UV8HI, UV8HI + +/// lsx_vbitset_w +name = lsx_vbitset_w +asm-fmts = vd, vj, vk +data-types = UV4SI, UV4SI, UV4SI + +/// lsx_vbitset_d +name = lsx_vbitset_d +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI + +/// lsx_vbitseti_b +name = lsx_vbitseti_b +asm-fmts = vd, vj, ui3 +data-types = UV16QI, UV16QI, UQI + +/// lsx_vbitseti_h +name = lsx_vbitseti_h +asm-fmts = vd, vj, ui4 +data-types = UV8HI, UV8HI, UQI + +/// lsx_vbitseti_w +name = lsx_vbitseti_w +asm-fmts = vd, vj, ui5 +data-types = UV4SI, UV4SI, UQI + +/// lsx_vbitseti_d +name = lsx_vbitseti_d +asm-fmts = vd, vj, ui6 +data-types = UV2DI, UV2DI, UQI + +/// lsx_vbitrev_b +name = lsx_vbitrev_b +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vbitrev_h +name = lsx_vbitrev_h +asm-fmts = vd, vj, vk +data-types = UV8HI, UV8HI, UV8HI + +/// lsx_vbitrev_w +name = lsx_vbitrev_w +asm-fmts = vd, vj, vk +data-types = UV4SI, UV4SI, UV4SI + +/// lsx_vbitrev_d +name = lsx_vbitrev_d +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI + +/// lsx_vbitrevi_b +name = lsx_vbitrevi_b +asm-fmts = vd, vj, ui3 +data-types = UV16QI, UV16QI, UQI + +/// lsx_vbitrevi_h +name = lsx_vbitrevi_h +asm-fmts = vd, vj, ui4 +data-types = UV8HI, UV8HI, UQI + +/// lsx_vbitrevi_w +name = lsx_vbitrevi_w +asm-fmts = vd, vj, ui5 +data-types = UV4SI, UV4SI, UQI + +/// lsx_vbitrevi_d +name = lsx_vbitrevi_d +asm-fmts = vd, vj, ui6 +data-types = UV2DI, UV2DI, UQI + +/// lsx_vadd_b +name = lsx_vadd_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vadd_h +name = lsx_vadd_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vadd_w +name = lsx_vadd_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vadd_d +name = lsx_vadd_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vaddi_bu +name = lsx_vaddi_bu +asm-fmts = vd, vj, ui5 +data-types = V16QI, V16QI, UQI + 
+/// lsx_vaddi_hu +name = lsx_vaddi_hu +asm-fmts = vd, vj, ui5 +data-types = V8HI, V8HI, UQI + +/// lsx_vaddi_wu +name = lsx_vaddi_wu +asm-fmts = vd, vj, ui5 +data-types = V4SI, V4SI, UQI + +/// lsx_vaddi_du +name = lsx_vaddi_du +asm-fmts = vd, vj, ui5 +data-types = V2DI, V2DI, UQI + +/// lsx_vsub_b +name = lsx_vsub_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vsub_h +name = lsx_vsub_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vsub_w +name = lsx_vsub_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vsub_d +name = lsx_vsub_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vsubi_bu +name = lsx_vsubi_bu +asm-fmts = vd, vj, ui5 +data-types = V16QI, V16QI, UQI + +/// lsx_vsubi_hu +name = lsx_vsubi_hu +asm-fmts = vd, vj, ui5 +data-types = V8HI, V8HI, UQI + +/// lsx_vsubi_wu +name = lsx_vsubi_wu +asm-fmts = vd, vj, ui5 +data-types = V4SI, V4SI, UQI + +/// lsx_vsubi_du +name = lsx_vsubi_du +asm-fmts = vd, vj, ui5 +data-types = V2DI, V2DI, UQI + +/// lsx_vmax_b +name = lsx_vmax_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vmax_h +name = lsx_vmax_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vmax_w +name = lsx_vmax_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vmax_d +name = lsx_vmax_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vmaxi_b +name = lsx_vmaxi_b +asm-fmts = vd, vj, si5 +data-types = V16QI, V16QI, QI + +/// lsx_vmaxi_h +name = lsx_vmaxi_h +asm-fmts = vd, vj, si5 +data-types = V8HI, V8HI, QI + +/// lsx_vmaxi_w +name = lsx_vmaxi_w +asm-fmts = vd, vj, si5 +data-types = V4SI, V4SI, QI + +/// lsx_vmaxi_d +name = lsx_vmaxi_d +asm-fmts = vd, vj, si5 +data-types = V2DI, V2DI, QI + +/// lsx_vmax_bu +name = lsx_vmax_bu +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vmax_hu +name = lsx_vmax_hu +asm-fmts = vd, vj, vk +data-types = UV8HI, UV8HI, UV8HI + +/// lsx_vmax_wu +name = lsx_vmax_wu +asm-fmts = vd, vj, vk +data-types = UV4SI, UV4SI, UV4SI + +/// lsx_vmax_du +name = lsx_vmax_du +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI + +/// lsx_vmaxi_bu +name = lsx_vmaxi_bu +asm-fmts = vd, vj, ui5 +data-types = UV16QI, UV16QI, UQI + +/// lsx_vmaxi_hu +name = lsx_vmaxi_hu +asm-fmts = vd, vj, ui5 +data-types = UV8HI, UV8HI, UQI + +/// lsx_vmaxi_wu +name = lsx_vmaxi_wu +asm-fmts = vd, vj, ui5 +data-types = UV4SI, UV4SI, UQI + +/// lsx_vmaxi_du +name = lsx_vmaxi_du +asm-fmts = vd, vj, ui5 +data-types = UV2DI, UV2DI, UQI + +/// lsx_vmin_b +name = lsx_vmin_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vmin_h +name = lsx_vmin_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vmin_w +name = lsx_vmin_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vmin_d +name = lsx_vmin_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vmini_b +name = lsx_vmini_b +asm-fmts = vd, vj, si5 +data-types = V16QI, V16QI, QI + +/// lsx_vmini_h +name = lsx_vmini_h +asm-fmts = vd, vj, si5 +data-types = V8HI, V8HI, QI + +/// lsx_vmini_w +name = lsx_vmini_w +asm-fmts = vd, vj, si5 +data-types = V4SI, V4SI, QI + +/// lsx_vmini_d +name = lsx_vmini_d +asm-fmts = vd, vj, si5 +data-types = V2DI, V2DI, QI + +/// lsx_vmin_bu +name = lsx_vmin_bu +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vmin_hu +name = lsx_vmin_hu +asm-fmts = vd, vj, vk +data-types = UV8HI, UV8HI, UV8HI + +/// lsx_vmin_wu +name = lsx_vmin_wu +asm-fmts = vd, vj, vk +data-types = 
UV4SI, UV4SI, UV4SI + +/// lsx_vmin_du +name = lsx_vmin_du +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI + +/// lsx_vmini_bu +name = lsx_vmini_bu +asm-fmts = vd, vj, ui5 +data-types = UV16QI, UV16QI, UQI + +/// lsx_vmini_hu +name = lsx_vmini_hu +asm-fmts = vd, vj, ui5 +data-types = UV8HI, UV8HI, UQI + +/// lsx_vmini_wu +name = lsx_vmini_wu +asm-fmts = vd, vj, ui5 +data-types = UV4SI, UV4SI, UQI + +/// lsx_vmini_du +name = lsx_vmini_du +asm-fmts = vd, vj, ui5 +data-types = UV2DI, UV2DI, UQI + +/// lsx_vseq_b +name = lsx_vseq_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vseq_h +name = lsx_vseq_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vseq_w +name = lsx_vseq_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vseq_d +name = lsx_vseq_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vseqi_b +name = lsx_vseqi_b +asm-fmts = vd, vj, si5 +data-types = V16QI, V16QI, QI + +/// lsx_vseqi_h +name = lsx_vseqi_h +asm-fmts = vd, vj, si5 +data-types = V8HI, V8HI, QI + +/// lsx_vseqi_w +name = lsx_vseqi_w +asm-fmts = vd, vj, si5 +data-types = V4SI, V4SI, QI + +/// lsx_vseqi_d +name = lsx_vseqi_d +asm-fmts = vd, vj, si5 +data-types = V2DI, V2DI, QI + +/// lsx_vslti_b +name = lsx_vslti_b +asm-fmts = vd, vj, si5 +data-types = V16QI, V16QI, QI + +/// lsx_vslt_b +name = lsx_vslt_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vslt_h +name = lsx_vslt_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vslt_w +name = lsx_vslt_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vslt_d +name = lsx_vslt_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vslti_h +name = lsx_vslti_h +asm-fmts = vd, vj, si5 +data-types = V8HI, V8HI, QI + +/// lsx_vslti_w +name = lsx_vslti_w +asm-fmts = vd, vj, si5 +data-types = V4SI, V4SI, QI + +/// lsx_vslti_d +name = lsx_vslti_d +asm-fmts = vd, vj, si5 +data-types = V2DI, V2DI, QI + +/// lsx_vslt_bu +name = lsx_vslt_bu +asm-fmts = vd, vj, vk +data-types = V16QI, UV16QI, UV16QI + +/// lsx_vslt_hu +name = lsx_vslt_hu +asm-fmts = vd, vj, vk +data-types = V8HI, UV8HI, UV8HI + +/// lsx_vslt_wu +name = lsx_vslt_wu +asm-fmts = vd, vj, vk +data-types = V4SI, UV4SI, UV4SI + +/// lsx_vslt_du +name = lsx_vslt_du +asm-fmts = vd, vj, vk +data-types = V2DI, UV2DI, UV2DI + +/// lsx_vslti_bu +name = lsx_vslti_bu +asm-fmts = vd, vj, ui5 +data-types = V16QI, UV16QI, UQI + +/// lsx_vslti_hu +name = lsx_vslti_hu +asm-fmts = vd, vj, ui5 +data-types = V8HI, UV8HI, UQI + +/// lsx_vslti_wu +name = lsx_vslti_wu +asm-fmts = vd, vj, ui5 +data-types = V4SI, UV4SI, UQI + +/// lsx_vslti_du +name = lsx_vslti_du +asm-fmts = vd, vj, ui5 +data-types = V2DI, UV2DI, UQI + +/// lsx_vsle_b +name = lsx_vsle_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vsle_h +name = lsx_vsle_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vsle_w +name = lsx_vsle_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vsle_d +name = lsx_vsle_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vslei_b +name = lsx_vslei_b +asm-fmts = vd, vj, si5 +data-types = V16QI, V16QI, QI + +/// lsx_vslei_h +name = lsx_vslei_h +asm-fmts = vd, vj, si5 +data-types = V8HI, V8HI, QI + +/// lsx_vslei_w +name = lsx_vslei_w +asm-fmts = vd, vj, si5 +data-types = V4SI, V4SI, QI + +/// lsx_vslei_d +name = lsx_vslei_d +asm-fmts = vd, vj, si5 +data-types = V2DI, V2DI, QI + +/// lsx_vsle_bu +name = lsx_vsle_bu +asm-fmts = vd, vj, vk 
+data-types = V16QI, UV16QI, UV16QI + +/// lsx_vsle_hu +name = lsx_vsle_hu +asm-fmts = vd, vj, vk +data-types = V8HI, UV8HI, UV8HI + +/// lsx_vsle_wu +name = lsx_vsle_wu +asm-fmts = vd, vj, vk +data-types = V4SI, UV4SI, UV4SI + +/// lsx_vsle_du +name = lsx_vsle_du +asm-fmts = vd, vj, vk +data-types = V2DI, UV2DI, UV2DI + +/// lsx_vslei_bu +name = lsx_vslei_bu +asm-fmts = vd, vj, ui5 +data-types = V16QI, UV16QI, UQI + +/// lsx_vslei_hu +name = lsx_vslei_hu +asm-fmts = vd, vj, ui5 +data-types = V8HI, UV8HI, UQI + +/// lsx_vslei_wu +name = lsx_vslei_wu +asm-fmts = vd, vj, ui5 +data-types = V4SI, UV4SI, UQI + +/// lsx_vslei_du +name = lsx_vslei_du +asm-fmts = vd, vj, ui5 +data-types = V2DI, UV2DI, UQI + +/// lsx_vsat_b +name = lsx_vsat_b +asm-fmts = vd, vj, ui3 +data-types = V16QI, V16QI, UQI + +/// lsx_vsat_h +name = lsx_vsat_h +asm-fmts = vd, vj, ui4 +data-types = V8HI, V8HI, UQI + +/// lsx_vsat_w +name = lsx_vsat_w +asm-fmts = vd, vj, ui5 +data-types = V4SI, V4SI, UQI + +/// lsx_vsat_d +name = lsx_vsat_d +asm-fmts = vd, vj, ui6 +data-types = V2DI, V2DI, UQI + +/// lsx_vsat_bu +name = lsx_vsat_bu +asm-fmts = vd, vj, ui3 +data-types = UV16QI, UV16QI, UQI + +/// lsx_vsat_hu +name = lsx_vsat_hu +asm-fmts = vd, vj, ui4 +data-types = UV8HI, UV8HI, UQI + +/// lsx_vsat_wu +name = lsx_vsat_wu +asm-fmts = vd, vj, ui5 +data-types = UV4SI, UV4SI, UQI + +/// lsx_vsat_du +name = lsx_vsat_du +asm-fmts = vd, vj, ui6 +data-types = UV2DI, UV2DI, UQI + +/// lsx_vadda_b +name = lsx_vadda_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vadda_h +name = lsx_vadda_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vadda_w +name = lsx_vadda_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vadda_d +name = lsx_vadda_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vsadd_b +name = lsx_vsadd_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vsadd_h +name = lsx_vsadd_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vsadd_w +name = lsx_vsadd_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vsadd_d +name = lsx_vsadd_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vsadd_bu +name = lsx_vsadd_bu +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vsadd_hu +name = lsx_vsadd_hu +asm-fmts = vd, vj, vk +data-types = UV8HI, UV8HI, UV8HI + +/// lsx_vsadd_wu +name = lsx_vsadd_wu +asm-fmts = vd, vj, vk +data-types = UV4SI, UV4SI, UV4SI + +/// lsx_vsadd_du +name = lsx_vsadd_du +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI + +/// lsx_vavg_b +name = lsx_vavg_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vavg_h +name = lsx_vavg_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vavg_w +name = lsx_vavg_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vavg_d +name = lsx_vavg_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vavg_bu +name = lsx_vavg_bu +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vavg_hu +name = lsx_vavg_hu +asm-fmts = vd, vj, vk +data-types = UV8HI, UV8HI, UV8HI + +/// lsx_vavg_wu +name = lsx_vavg_wu +asm-fmts = vd, vj, vk +data-types = UV4SI, UV4SI, UV4SI + +/// lsx_vavg_du +name = lsx_vavg_du +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI + +/// lsx_vavgr_b +name = lsx_vavgr_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vavgr_h +name = lsx_vavgr_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// 
lsx_vavgr_w +name = lsx_vavgr_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vavgr_d +name = lsx_vavgr_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vavgr_bu +name = lsx_vavgr_bu +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vavgr_hu +name = lsx_vavgr_hu +asm-fmts = vd, vj, vk +data-types = UV8HI, UV8HI, UV8HI + +/// lsx_vavgr_wu +name = lsx_vavgr_wu +asm-fmts = vd, vj, vk +data-types = UV4SI, UV4SI, UV4SI + +/// lsx_vavgr_du +name = lsx_vavgr_du +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI + +/// lsx_vssub_b +name = lsx_vssub_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vssub_h +name = lsx_vssub_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vssub_w +name = lsx_vssub_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vssub_d +name = lsx_vssub_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vssub_bu +name = lsx_vssub_bu +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vssub_hu +name = lsx_vssub_hu +asm-fmts = vd, vj, vk +data-types = UV8HI, UV8HI, UV8HI + +/// lsx_vssub_wu +name = lsx_vssub_wu +asm-fmts = vd, vj, vk +data-types = UV4SI, UV4SI, UV4SI + +/// lsx_vssub_du +name = lsx_vssub_du +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI + +/// lsx_vabsd_b +name = lsx_vabsd_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vabsd_h +name = lsx_vabsd_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vabsd_w +name = lsx_vabsd_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vabsd_d +name = lsx_vabsd_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vabsd_bu +name = lsx_vabsd_bu +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vabsd_hu +name = lsx_vabsd_hu +asm-fmts = vd, vj, vk +data-types = UV8HI, UV8HI, UV8HI + +/// lsx_vabsd_wu +name = lsx_vabsd_wu +asm-fmts = vd, vj, vk +data-types = UV4SI, UV4SI, UV4SI + +/// lsx_vabsd_du +name = lsx_vabsd_du +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI + +/// lsx_vmul_b +name = lsx_vmul_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vmul_h +name = lsx_vmul_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vmul_w +name = lsx_vmul_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vmul_d +name = lsx_vmul_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vmadd_b +name = lsx_vmadd_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI, V16QI + +/// lsx_vmadd_h +name = lsx_vmadd_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI, V8HI + +/// lsx_vmadd_w +name = lsx_vmadd_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI, V4SI + +/// lsx_vmadd_d +name = lsx_vmadd_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI, V2DI + +/// lsx_vmsub_b +name = lsx_vmsub_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI, V16QI + +/// lsx_vmsub_h +name = lsx_vmsub_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI, V8HI + +/// lsx_vmsub_w +name = lsx_vmsub_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI, V4SI + +/// lsx_vmsub_d +name = lsx_vmsub_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI, V2DI + +/// lsx_vdiv_b +name = lsx_vdiv_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vdiv_h +name = lsx_vdiv_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vdiv_w +name = lsx_vdiv_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, 
V4SI + +/// lsx_vdiv_d +name = lsx_vdiv_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vdiv_bu +name = lsx_vdiv_bu +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vdiv_hu +name = lsx_vdiv_hu +asm-fmts = vd, vj, vk +data-types = UV8HI, UV8HI, UV8HI + +/// lsx_vdiv_wu +name = lsx_vdiv_wu +asm-fmts = vd, vj, vk +data-types = UV4SI, UV4SI, UV4SI + +/// lsx_vdiv_du +name = lsx_vdiv_du +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI + +/// lsx_vhaddw_h_b +name = lsx_vhaddw_h_b +asm-fmts = vd, vj, vk +data-types = V8HI, V16QI, V16QI + +/// lsx_vhaddw_w_h +name = lsx_vhaddw_w_h +asm-fmts = vd, vj, vk +data-types = V4SI, V8HI, V8HI + +/// lsx_vhaddw_d_w +name = lsx_vhaddw_d_w +asm-fmts = vd, vj, vk +data-types = V2DI, V4SI, V4SI + +/// lsx_vhaddw_hu_bu +name = lsx_vhaddw_hu_bu +asm-fmts = vd, vj, vk +data-types = UV8HI, UV16QI, UV16QI + +/// lsx_vhaddw_wu_hu +name = lsx_vhaddw_wu_hu +asm-fmts = vd, vj, vk +data-types = UV4SI, UV8HI, UV8HI + +/// lsx_vhaddw_du_wu +name = lsx_vhaddw_du_wu +asm-fmts = vd, vj, vk +data-types = UV2DI, UV4SI, UV4SI + +/// lsx_vhsubw_h_b +name = lsx_vhsubw_h_b +asm-fmts = vd, vj, vk +data-types = V8HI, V16QI, V16QI + +/// lsx_vhsubw_w_h +name = lsx_vhsubw_w_h +asm-fmts = vd, vj, vk +data-types = V4SI, V8HI, V8HI + +/// lsx_vhsubw_d_w +name = lsx_vhsubw_d_w +asm-fmts = vd, vj, vk +data-types = V2DI, V4SI, V4SI + +/// lsx_vhsubw_hu_bu +name = lsx_vhsubw_hu_bu +asm-fmts = vd, vj, vk +data-types = V8HI, UV16QI, UV16QI + +/// lsx_vhsubw_wu_hu +name = lsx_vhsubw_wu_hu +asm-fmts = vd, vj, vk +data-types = V4SI, UV8HI, UV8HI + +/// lsx_vhsubw_du_wu +name = lsx_vhsubw_du_wu +asm-fmts = vd, vj, vk +data-types = V2DI, UV4SI, UV4SI + +/// lsx_vmod_b +name = lsx_vmod_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vmod_h +name = lsx_vmod_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vmod_w +name = lsx_vmod_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vmod_d +name = lsx_vmod_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vmod_bu +name = lsx_vmod_bu +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vmod_hu +name = lsx_vmod_hu +asm-fmts = vd, vj, vk +data-types = UV8HI, UV8HI, UV8HI + +/// lsx_vmod_wu +name = lsx_vmod_wu +asm-fmts = vd, vj, vk +data-types = UV4SI, UV4SI, UV4SI + +/// lsx_vmod_du +name = lsx_vmod_du +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI + +/// lsx_vreplve_b +name = lsx_vreplve_b +asm-fmts = vd, vj, rk +data-types = V16QI, V16QI, SI + +/// lsx_vreplve_h +name = lsx_vreplve_h +asm-fmts = vd, vj, rk +data-types = V8HI, V8HI, SI + +/// lsx_vreplve_w +name = lsx_vreplve_w +asm-fmts = vd, vj, rk +data-types = V4SI, V4SI, SI + +/// lsx_vreplve_d +name = lsx_vreplve_d +asm-fmts = vd, vj, rk +data-types = V2DI, V2DI, SI + +/// lsx_vreplvei_b +name = lsx_vreplvei_b +asm-fmts = vd, vj, ui4 +data-types = V16QI, V16QI, UQI + +/// lsx_vreplvei_h +name = lsx_vreplvei_h +asm-fmts = vd, vj, ui3 +data-types = V8HI, V8HI, UQI + +/// lsx_vreplvei_w +name = lsx_vreplvei_w +asm-fmts = vd, vj, ui2 +data-types = V4SI, V4SI, UQI + +/// lsx_vreplvei_d +name = lsx_vreplvei_d +asm-fmts = vd, vj, ui1 +data-types = V2DI, V2DI, UQI + +/// lsx_vpickev_b +name = lsx_vpickev_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vpickev_h +name = lsx_vpickev_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vpickev_w +name = lsx_vpickev_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// 
lsx_vpickev_d +name = lsx_vpickev_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vpickod_b +name = lsx_vpickod_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vpickod_h +name = lsx_vpickod_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vpickod_w +name = lsx_vpickod_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vpickod_d +name = lsx_vpickod_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vilvh_b +name = lsx_vilvh_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vilvh_h +name = lsx_vilvh_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vilvh_w +name = lsx_vilvh_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vilvh_d +name = lsx_vilvh_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vilvl_b +name = lsx_vilvl_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vilvl_h +name = lsx_vilvl_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vilvl_w +name = lsx_vilvl_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vilvl_d +name = lsx_vilvl_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vpackev_b +name = lsx_vpackev_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vpackev_h +name = lsx_vpackev_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vpackev_w +name = lsx_vpackev_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vpackev_d +name = lsx_vpackev_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vpackod_b +name = lsx_vpackod_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vpackod_h +name = lsx_vpackod_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vpackod_w +name = lsx_vpackod_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vpackod_d +name = lsx_vpackod_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vshuf_h +name = lsx_vshuf_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI, V8HI + +/// lsx_vshuf_w +name = lsx_vshuf_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI, V4SI + +/// lsx_vshuf_d +name = lsx_vshuf_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI, V2DI + +/// lsx_vand_v +name = lsx_vand_v +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vandi_b +name = lsx_vandi_b +asm-fmts = vd, vj, ui8 +data-types = UV16QI, UV16QI, UQI + +/// lsx_vor_v +name = lsx_vor_v +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vori_b +name = lsx_vori_b +asm-fmts = vd, vj, ui8 +data-types = UV16QI, UV16QI, UQI + +/// lsx_vnor_v +name = lsx_vnor_v +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vnori_b +name = lsx_vnori_b +asm-fmts = vd, vj, ui8 +data-types = UV16QI, UV16QI, UQI + +/// lsx_vxor_v +name = lsx_vxor_v +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vxori_b +name = lsx_vxori_b +asm-fmts = vd, vj, ui8 +data-types = UV16QI, UV16QI, UQI + +/// lsx_vbitsel_v +name = lsx_vbitsel_v +asm-fmts = vd, vj, vk, va +data-types = UV16QI, UV16QI, UV16QI, UV16QI + +/// lsx_vbitseli_b +name = lsx_vbitseli_b +asm-fmts = vd, vj, ui8 +data-types = UV16QI, UV16QI, UV16QI, USI + +/// lsx_vshuf4i_b +name = lsx_vshuf4i_b +asm-fmts = vd, vj, ui8 +data-types = V16QI, V16QI, USI + +/// lsx_vshuf4i_h +name = lsx_vshuf4i_h +asm-fmts = vd, vj, ui8 +data-types = V8HI, V8HI, USI + +/// lsx_vshuf4i_w +name = lsx_vshuf4i_w 
+asm-fmts = vd, vj, ui8 +data-types = V4SI, V4SI, USI + +/// lsx_vreplgr2vr_b +name = lsx_vreplgr2vr_b +asm-fmts = vd, rj +data-types = V16QI, SI + +/// lsx_vreplgr2vr_h +name = lsx_vreplgr2vr_h +asm-fmts = vd, rj +data-types = V8HI, SI + +/// lsx_vreplgr2vr_w +name = lsx_vreplgr2vr_w +asm-fmts = vd, rj +data-types = V4SI, SI + +/// lsx_vreplgr2vr_d +name = lsx_vreplgr2vr_d +asm-fmts = vd, rj +data-types = V2DI, DI + +/// lsx_vpcnt_b +name = lsx_vpcnt_b +asm-fmts = vd, vj +data-types = V16QI, V16QI + +/// lsx_vpcnt_h +name = lsx_vpcnt_h +asm-fmts = vd, vj +data-types = V8HI, V8HI + +/// lsx_vpcnt_w +name = lsx_vpcnt_w +asm-fmts = vd, vj +data-types = V4SI, V4SI + +/// lsx_vpcnt_d +name = lsx_vpcnt_d +asm-fmts = vd, vj +data-types = V2DI, V2DI + +/// lsx_vclo_b +name = lsx_vclo_b +asm-fmts = vd, vj +data-types = V16QI, V16QI + +/// lsx_vclo_h +name = lsx_vclo_h +asm-fmts = vd, vj +data-types = V8HI, V8HI + +/// lsx_vclo_w +name = lsx_vclo_w +asm-fmts = vd, vj +data-types = V4SI, V4SI + +/// lsx_vclo_d +name = lsx_vclo_d +asm-fmts = vd, vj +data-types = V2DI, V2DI + +/// lsx_vclz_b +name = lsx_vclz_b +asm-fmts = vd, vj +data-types = V16QI, V16QI + +/// lsx_vclz_h +name = lsx_vclz_h +asm-fmts = vd, vj +data-types = V8HI, V8HI + +/// lsx_vclz_w +name = lsx_vclz_w +asm-fmts = vd, vj +data-types = V4SI, V4SI + +/// lsx_vclz_d +name = lsx_vclz_d +asm-fmts = vd, vj +data-types = V2DI, V2DI + +/// lsx_vpickve2gr_b +name = lsx_vpickve2gr_b +asm-fmts = rd, vj, ui4 +data-types = SI, V16QI, UQI + +/// lsx_vpickve2gr_h +name = lsx_vpickve2gr_h +asm-fmts = rd, vj, ui3 +data-types = SI, V8HI, UQI + +/// lsx_vpickve2gr_w +name = lsx_vpickve2gr_w +asm-fmts = rd, vj, ui2 +data-types = SI, V4SI, UQI + +/// lsx_vpickve2gr_d +name = lsx_vpickve2gr_d +asm-fmts = rd, vj, ui1 +data-types = DI, V2DI, UQI + +/// lsx_vpickve2gr_bu +name = lsx_vpickve2gr_bu +asm-fmts = rd, vj, ui4 +data-types = USI, V16QI, UQI + +/// lsx_vpickve2gr_hu +name = lsx_vpickve2gr_hu +asm-fmts = rd, vj, ui3 +data-types = USI, V8HI, UQI + +/// lsx_vpickve2gr_wu +name = lsx_vpickve2gr_wu +asm-fmts = rd, vj, ui2 +data-types = USI, V4SI, UQI + +/// lsx_vpickve2gr_du +name = lsx_vpickve2gr_du +asm-fmts = rd, vj, ui1 +data-types = UDI, V2DI, UQI + +/// lsx_vinsgr2vr_b +name = lsx_vinsgr2vr_b +asm-fmts = vd, rj, ui4 +data-types = V16QI, V16QI, SI, UQI + +/// lsx_vinsgr2vr_h +name = lsx_vinsgr2vr_h +asm-fmts = vd, rj, ui3 +data-types = V8HI, V8HI, SI, UQI + +/// lsx_vinsgr2vr_w +name = lsx_vinsgr2vr_w +asm-fmts = vd, rj, ui2 +data-types = V4SI, V4SI, SI, UQI + +/// lsx_vinsgr2vr_d +name = lsx_vinsgr2vr_d +asm-fmts = vd, rj, ui1 +data-types = V2DI, V2DI, DI, UQI + +/// lsx_vfadd_s +name = lsx_vfadd_s +asm-fmts = vd, vj, vk +data-types = V4SF, V4SF, V4SF + +/// lsx_vfadd_d +name = lsx_vfadd_d +asm-fmts = vd, vj, vk +data-types = V2DF, V2DF, V2DF + +/// lsx_vfsub_s +name = lsx_vfsub_s +asm-fmts = vd, vj, vk +data-types = V4SF, V4SF, V4SF + +/// lsx_vfsub_d +name = lsx_vfsub_d +asm-fmts = vd, vj, vk +data-types = V2DF, V2DF, V2DF + +/// lsx_vfmul_s +name = lsx_vfmul_s +asm-fmts = vd, vj, vk +data-types = V4SF, V4SF, V4SF + +/// lsx_vfmul_d +name = lsx_vfmul_d +asm-fmts = vd, vj, vk +data-types = V2DF, V2DF, V2DF + +/// lsx_vfdiv_s +name = lsx_vfdiv_s +asm-fmts = vd, vj, vk +data-types = V4SF, V4SF, V4SF + +/// lsx_vfdiv_d +name = lsx_vfdiv_d +asm-fmts = vd, vj, vk +data-types = V2DF, V2DF, V2DF + +/// lsx_vfcvt_h_s +name = lsx_vfcvt_h_s +asm-fmts = vd, vj, vk +data-types = V8HI, V4SF, V4SF + +/// lsx_vfcvt_s_d +name = lsx_vfcvt_s_d +asm-fmts = vd, vj, 
vk +data-types = V4SF, V2DF, V2DF + +/// lsx_vfmin_s +name = lsx_vfmin_s +asm-fmts = vd, vj, vk +data-types = V4SF, V4SF, V4SF + +/// lsx_vfmin_d +name = lsx_vfmin_d +asm-fmts = vd, vj, vk +data-types = V2DF, V2DF, V2DF + +/// lsx_vfmina_s +name = lsx_vfmina_s +asm-fmts = vd, vj, vk +data-types = V4SF, V4SF, V4SF + +/// lsx_vfmina_d +name = lsx_vfmina_d +asm-fmts = vd, vj, vk +data-types = V2DF, V2DF, V2DF + +/// lsx_vfmax_s +name = lsx_vfmax_s +asm-fmts = vd, vj, vk +data-types = V4SF, V4SF, V4SF + +/// lsx_vfmax_d +name = lsx_vfmax_d +asm-fmts = vd, vj, vk +data-types = V2DF, V2DF, V2DF + +/// lsx_vfmaxa_s +name = lsx_vfmaxa_s +asm-fmts = vd, vj, vk +data-types = V4SF, V4SF, V4SF + +/// lsx_vfmaxa_d +name = lsx_vfmaxa_d +asm-fmts = vd, vj, vk +data-types = V2DF, V2DF, V2DF + +/// lsx_vfclass_s +name = lsx_vfclass_s +asm-fmts = vd, vj +data-types = V4SI, V4SF + +/// lsx_vfclass_d +name = lsx_vfclass_d +asm-fmts = vd, vj +data-types = V2DI, V2DF + +/// lsx_vfsqrt_s +name = lsx_vfsqrt_s +asm-fmts = vd, vj +data-types = V4SF, V4SF + +/// lsx_vfsqrt_d +name = lsx_vfsqrt_d +asm-fmts = vd, vj +data-types = V2DF, V2DF + +/// lsx_vfrecip_s +name = lsx_vfrecip_s +asm-fmts = vd, vj +data-types = V4SF, V4SF + +/// lsx_vfrecip_d +name = lsx_vfrecip_d +asm-fmts = vd, vj +data-types = V2DF, V2DF + +/// lsx_vfrecipe_s +name = lsx_vfrecipe_s +asm-fmts = vd, vj +data-types = V4SF, V4SF + +/// lsx_vfrecipe_d +name = lsx_vfrecipe_d +asm-fmts = vd, vj +data-types = V2DF, V2DF + +/// lsx_vfrsqrte_s +name = lsx_vfrsqrte_s +asm-fmts = vd, vj +data-types = V4SF, V4SF + +/// lsx_vfrsqrte_d +name = lsx_vfrsqrte_d +asm-fmts = vd, vj +data-types = V2DF, V2DF + +/// lsx_vfrint_s +name = lsx_vfrint_s +asm-fmts = vd, vj +data-types = V4SF, V4SF + +/// lsx_vfrint_d +name = lsx_vfrint_d +asm-fmts = vd, vj +data-types = V2DF, V2DF + +/// lsx_vfrsqrt_s +name = lsx_vfrsqrt_s +asm-fmts = vd, vj +data-types = V4SF, V4SF + +/// lsx_vfrsqrt_d +name = lsx_vfrsqrt_d +asm-fmts = vd, vj +data-types = V2DF, V2DF + +/// lsx_vflogb_s +name = lsx_vflogb_s +asm-fmts = vd, vj +data-types = V4SF, V4SF + +/// lsx_vflogb_d +name = lsx_vflogb_d +asm-fmts = vd, vj +data-types = V2DF, V2DF + +/// lsx_vfcvth_s_h +name = lsx_vfcvth_s_h +asm-fmts = vd, vj +data-types = V4SF, V8HI + +/// lsx_vfcvth_d_s +name = lsx_vfcvth_d_s +asm-fmts = vd, vj +data-types = V2DF, V4SF + +/// lsx_vfcvtl_s_h +name = lsx_vfcvtl_s_h +asm-fmts = vd, vj +data-types = V4SF, V8HI + +/// lsx_vfcvtl_d_s +name = lsx_vfcvtl_d_s +asm-fmts = vd, vj +data-types = V2DF, V4SF + +/// lsx_vftint_w_s +name = lsx_vftint_w_s +asm-fmts = vd, vj +data-types = V4SI, V4SF + +/// lsx_vftint_l_d +name = lsx_vftint_l_d +asm-fmts = vd, vj +data-types = V2DI, V2DF + +/// lsx_vftint_wu_s +name = lsx_vftint_wu_s +asm-fmts = vd, vj +data-types = UV4SI, V4SF + +/// lsx_vftint_lu_d +name = lsx_vftint_lu_d +asm-fmts = vd, vj +data-types = UV2DI, V2DF + +/// lsx_vftintrz_w_s +name = lsx_vftintrz_w_s +asm-fmts = vd, vj +data-types = V4SI, V4SF + +/// lsx_vftintrz_l_d +name = lsx_vftintrz_l_d +asm-fmts = vd, vj +data-types = V2DI, V2DF + +/// lsx_vftintrz_wu_s +name = lsx_vftintrz_wu_s +asm-fmts = vd, vj +data-types = UV4SI, V4SF + +/// lsx_vftintrz_lu_d +name = lsx_vftintrz_lu_d +asm-fmts = vd, vj +data-types = UV2DI, V2DF + +/// lsx_vffint_s_w +name = lsx_vffint_s_w +asm-fmts = vd, vj +data-types = V4SF, V4SI + +/// lsx_vffint_d_l +name = lsx_vffint_d_l +asm-fmts = vd, vj +data-types = V2DF, V2DI + +/// lsx_vffint_s_wu +name = lsx_vffint_s_wu +asm-fmts = vd, vj +data-types = V4SF, UV4SI + +/// 
lsx_vffint_d_lu +name = lsx_vffint_d_lu +asm-fmts = vd, vj +data-types = V2DF, UV2DI + +/// lsx_vandn_v +name = lsx_vandn_v +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vneg_b +name = lsx_vneg_b +asm-fmts = vd, vj +data-types = V16QI, V16QI + +/// lsx_vneg_h +name = lsx_vneg_h +asm-fmts = vd, vj +data-types = V8HI, V8HI + +/// lsx_vneg_w +name = lsx_vneg_w +asm-fmts = vd, vj +data-types = V4SI, V4SI + +/// lsx_vneg_d +name = lsx_vneg_d +asm-fmts = vd, vj +data-types = V2DI, V2DI + +/// lsx_vmuh_b +name = lsx_vmuh_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vmuh_h +name = lsx_vmuh_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vmuh_w +name = lsx_vmuh_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vmuh_d +name = lsx_vmuh_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vmuh_bu +name = lsx_vmuh_bu +asm-fmts = vd, vj, vk +data-types = UV16QI, UV16QI, UV16QI + +/// lsx_vmuh_hu +name = lsx_vmuh_hu +asm-fmts = vd, vj, vk +data-types = UV8HI, UV8HI, UV8HI + +/// lsx_vmuh_wu +name = lsx_vmuh_wu +asm-fmts = vd, vj, vk +data-types = UV4SI, UV4SI, UV4SI + +/// lsx_vmuh_du +name = lsx_vmuh_du +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI + +/// lsx_vsllwil_h_b +name = lsx_vsllwil_h_b +asm-fmts = vd, vj, ui3 +data-types = V8HI, V16QI, UQI + +/// lsx_vsllwil_w_h +name = lsx_vsllwil_w_h +asm-fmts = vd, vj, ui4 +data-types = V4SI, V8HI, UQI + +/// lsx_vsllwil_d_w +name = lsx_vsllwil_d_w +asm-fmts = vd, vj, ui5 +data-types = V2DI, V4SI, UQI + +/// lsx_vsllwil_hu_bu +name = lsx_vsllwil_hu_bu +asm-fmts = vd, vj, ui3 +data-types = UV8HI, UV16QI, UQI + +/// lsx_vsllwil_wu_hu +name = lsx_vsllwil_wu_hu +asm-fmts = vd, vj, ui4 +data-types = UV4SI, UV8HI, UQI + +/// lsx_vsllwil_du_wu +name = lsx_vsllwil_du_wu +asm-fmts = vd, vj, ui5 +data-types = UV2DI, UV4SI, UQI + +/// lsx_vsran_b_h +name = lsx_vsran_b_h +asm-fmts = vd, vj, vk +data-types = V16QI, V8HI, V8HI + +/// lsx_vsran_h_w +name = lsx_vsran_h_w +asm-fmts = vd, vj, vk +data-types = V8HI, V4SI, V4SI + +/// lsx_vsran_w_d +name = lsx_vsran_w_d +asm-fmts = vd, vj, vk +data-types = V4SI, V2DI, V2DI + +/// lsx_vssran_b_h +name = lsx_vssran_b_h +asm-fmts = vd, vj, vk +data-types = V16QI, V8HI, V8HI + +/// lsx_vssran_h_w +name = lsx_vssran_h_w +asm-fmts = vd, vj, vk +data-types = V8HI, V4SI, V4SI + +/// lsx_vssran_w_d +name = lsx_vssran_w_d +asm-fmts = vd, vj, vk +data-types = V4SI, V2DI, V2DI + +/// lsx_vssran_bu_h +name = lsx_vssran_bu_h +asm-fmts = vd, vj, vk +data-types = UV16QI, UV8HI, UV8HI + +/// lsx_vssran_hu_w +name = lsx_vssran_hu_w +asm-fmts = vd, vj, vk +data-types = UV8HI, UV4SI, UV4SI + +/// lsx_vssran_wu_d +name = lsx_vssran_wu_d +asm-fmts = vd, vj, vk +data-types = UV4SI, UV2DI, UV2DI + +/// lsx_vsrarn_b_h +name = lsx_vsrarn_b_h +asm-fmts = vd, vj, vk +data-types = V16QI, V8HI, V8HI + +/// lsx_vsrarn_h_w +name = lsx_vsrarn_h_w +asm-fmts = vd, vj, vk +data-types = V8HI, V4SI, V4SI + +/// lsx_vsrarn_w_d +name = lsx_vsrarn_w_d +asm-fmts = vd, vj, vk +data-types = V4SI, V2DI, V2DI + +/// lsx_vssrarn_b_h +name = lsx_vssrarn_b_h +asm-fmts = vd, vj, vk +data-types = V16QI, V8HI, V8HI + +/// lsx_vssrarn_h_w +name = lsx_vssrarn_h_w +asm-fmts = vd, vj, vk +data-types = V8HI, V4SI, V4SI + +/// lsx_vssrarn_w_d +name = lsx_vssrarn_w_d +asm-fmts = vd, vj, vk +data-types = V4SI, V2DI, V2DI + +/// lsx_vssrarn_bu_h +name = lsx_vssrarn_bu_h +asm-fmts = vd, vj, vk +data-types = UV16QI, UV8HI, UV8HI + +/// lsx_vssrarn_hu_w +name = lsx_vssrarn_hu_w 
+asm-fmts = vd, vj, vk +data-types = UV8HI, UV4SI, UV4SI + +/// lsx_vssrarn_wu_d +name = lsx_vssrarn_wu_d +asm-fmts = vd, vj, vk +data-types = UV4SI, UV2DI, UV2DI + +/// lsx_vsrln_b_h +name = lsx_vsrln_b_h +asm-fmts = vd, vj, vk +data-types = V16QI, V8HI, V8HI + +/// lsx_vsrln_h_w +name = lsx_vsrln_h_w +asm-fmts = vd, vj, vk +data-types = V8HI, V4SI, V4SI + +/// lsx_vsrln_w_d +name = lsx_vsrln_w_d +asm-fmts = vd, vj, vk +data-types = V4SI, V2DI, V2DI + +/// lsx_vssrln_bu_h +name = lsx_vssrln_bu_h +asm-fmts = vd, vj, vk +data-types = UV16QI, UV8HI, UV8HI + +/// lsx_vssrln_hu_w +name = lsx_vssrln_hu_w +asm-fmts = vd, vj, vk +data-types = UV8HI, UV4SI, UV4SI + +/// lsx_vssrln_wu_d +name = lsx_vssrln_wu_d +asm-fmts = vd, vj, vk +data-types = UV4SI, UV2DI, UV2DI + +/// lsx_vsrlrn_b_h +name = lsx_vsrlrn_b_h +asm-fmts = vd, vj, vk +data-types = V16QI, V8HI, V8HI + +/// lsx_vsrlrn_h_w +name = lsx_vsrlrn_h_w +asm-fmts = vd, vj, vk +data-types = V8HI, V4SI, V4SI + +/// lsx_vsrlrn_w_d +name = lsx_vsrlrn_w_d +asm-fmts = vd, vj, vk +data-types = V4SI, V2DI, V2DI + +/// lsx_vssrlrn_bu_h +name = lsx_vssrlrn_bu_h +asm-fmts = vd, vj, vk +data-types = UV16QI, UV8HI, UV8HI + +/// lsx_vssrlrn_hu_w +name = lsx_vssrlrn_hu_w +asm-fmts = vd, vj, vk +data-types = UV8HI, UV4SI, UV4SI + +/// lsx_vssrlrn_wu_d +name = lsx_vssrlrn_wu_d +asm-fmts = vd, vj, vk +data-types = UV4SI, UV2DI, UV2DI + +/// lsx_vfrstpi_b +name = lsx_vfrstpi_b +asm-fmts = vd, vj, ui5 +data-types = V16QI, V16QI, V16QI, UQI + +/// lsx_vfrstpi_h +name = lsx_vfrstpi_h +asm-fmts = vd, vj, ui5 +data-types = V8HI, V8HI, V8HI, UQI + +/// lsx_vfrstp_b +name = lsx_vfrstp_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI, V16QI + +/// lsx_vfrstp_h +name = lsx_vfrstp_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI, V8HI + +/// lsx_vshuf4i_d +name = lsx_vshuf4i_d +asm-fmts = vd, vj, ui8 +data-types = V2DI, V2DI, V2DI, USI + +/// lsx_vbsrl_v +name = lsx_vbsrl_v +asm-fmts = vd, vj, ui5 +data-types = V16QI, V16QI, UQI + +/// lsx_vbsll_v +name = lsx_vbsll_v +asm-fmts = vd, vj, ui5 +data-types = V16QI, V16QI, UQI + +/// lsx_vextrins_b +name = lsx_vextrins_b +asm-fmts = vd, vj, ui8 +data-types = V16QI, V16QI, V16QI, USI + +/// lsx_vextrins_h +name = lsx_vextrins_h +asm-fmts = vd, vj, ui8 +data-types = V8HI, V8HI, V8HI, USI + +/// lsx_vextrins_w +name = lsx_vextrins_w +asm-fmts = vd, vj, ui8 +data-types = V4SI, V4SI, V4SI, USI + +/// lsx_vextrins_d +name = lsx_vextrins_d +asm-fmts = vd, vj, ui8 +data-types = V2DI, V2DI, V2DI, USI + +/// lsx_vmskltz_b +name = lsx_vmskltz_b +asm-fmts = vd, vj +data-types = V16QI, V16QI + +/// lsx_vmskltz_h +name = lsx_vmskltz_h +asm-fmts = vd, vj +data-types = V8HI, V8HI + +/// lsx_vmskltz_w +name = lsx_vmskltz_w +asm-fmts = vd, vj +data-types = V4SI, V4SI + +/// lsx_vmskltz_d +name = lsx_vmskltz_d +asm-fmts = vd, vj +data-types = V2DI, V2DI + +/// lsx_vsigncov_b +name = lsx_vsigncov_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vsigncov_h +name = lsx_vsigncov_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vsigncov_w +name = lsx_vsigncov_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vsigncov_d +name = lsx_vsigncov_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vfmadd_s +name = lsx_vfmadd_s +asm-fmts = vd, vj, vk, va +data-types = V4SF, V4SF, V4SF, V4SF + +/// lsx_vfmadd_d +name = lsx_vfmadd_d +asm-fmts = vd, vj, vk, va +data-types = V2DF, V2DF, V2DF, V2DF + +/// lsx_vfmsub_s +name = lsx_vfmsub_s +asm-fmts = vd, vj, vk, va +data-types 
= V4SF, V4SF, V4SF, V4SF + +/// lsx_vfmsub_d +name = lsx_vfmsub_d +asm-fmts = vd, vj, vk, va +data-types = V2DF, V2DF, V2DF, V2DF + +/// lsx_vfnmadd_s +name = lsx_vfnmadd_s +asm-fmts = vd, vj, vk, va +data-types = V4SF, V4SF, V4SF, V4SF + +/// lsx_vfnmadd_d +name = lsx_vfnmadd_d +asm-fmts = vd, vj, vk, va +data-types = V2DF, V2DF, V2DF, V2DF + +/// lsx_vfnmsub_s +name = lsx_vfnmsub_s +asm-fmts = vd, vj, vk, va +data-types = V4SF, V4SF, V4SF, V4SF + +/// lsx_vfnmsub_d +name = lsx_vfnmsub_d +asm-fmts = vd, vj, vk, va +data-types = V2DF, V2DF, V2DF, V2DF + +/// lsx_vftintrne_w_s +name = lsx_vftintrne_w_s +asm-fmts = vd, vj +data-types = V4SI, V4SF + +/// lsx_vftintrne_l_d +name = lsx_vftintrne_l_d +asm-fmts = vd, vj +data-types = V2DI, V2DF + +/// lsx_vftintrp_w_s +name = lsx_vftintrp_w_s +asm-fmts = vd, vj +data-types = V4SI, V4SF + +/// lsx_vftintrp_l_d +name = lsx_vftintrp_l_d +asm-fmts = vd, vj +data-types = V2DI, V2DF + +/// lsx_vftintrm_w_s +name = lsx_vftintrm_w_s +asm-fmts = vd, vj +data-types = V4SI, V4SF + +/// lsx_vftintrm_l_d +name = lsx_vftintrm_l_d +asm-fmts = vd, vj +data-types = V2DI, V2DF + +/// lsx_vftint_w_d +name = lsx_vftint_w_d +asm-fmts = vd, vj, vk +data-types = V4SI, V2DF, V2DF + +/// lsx_vffint_s_l +name = lsx_vffint_s_l +asm-fmts = vd, vj, vk +data-types = V4SF, V2DI, V2DI + +/// lsx_vftintrz_w_d +name = lsx_vftintrz_w_d +asm-fmts = vd, vj, vk +data-types = V4SI, V2DF, V2DF + +/// lsx_vftintrp_w_d +name = lsx_vftintrp_w_d +asm-fmts = vd, vj, vk +data-types = V4SI, V2DF, V2DF + +/// lsx_vftintrm_w_d +name = lsx_vftintrm_w_d +asm-fmts = vd, vj, vk +data-types = V4SI, V2DF, V2DF + +/// lsx_vftintrne_w_d +name = lsx_vftintrne_w_d +asm-fmts = vd, vj, vk +data-types = V4SI, V2DF, V2DF + +/// lsx_vftintl_l_s +name = lsx_vftintl_l_s +asm-fmts = vd, vj +data-types = V2DI, V4SF + +/// lsx_vftinth_l_s +name = lsx_vftinth_l_s +asm-fmts = vd, vj +data-types = V2DI, V4SF + +/// lsx_vffinth_d_w +name = lsx_vffinth_d_w +asm-fmts = vd, vj +data-types = V2DF, V4SI + +/// lsx_vffintl_d_w +name = lsx_vffintl_d_w +asm-fmts = vd, vj +data-types = V2DF, V4SI + +/// lsx_vftintrzl_l_s +name = lsx_vftintrzl_l_s +asm-fmts = vd, vj +data-types = V2DI, V4SF + +/// lsx_vftintrzh_l_s +name = lsx_vftintrzh_l_s +asm-fmts = vd, vj +data-types = V2DI, V4SF + +/// lsx_vftintrpl_l_s +name = lsx_vftintrpl_l_s +asm-fmts = vd, vj +data-types = V2DI, V4SF + +/// lsx_vftintrph_l_s +name = lsx_vftintrph_l_s +asm-fmts = vd, vj +data-types = V2DI, V4SF + +/// lsx_vftintrml_l_s +name = lsx_vftintrml_l_s +asm-fmts = vd, vj +data-types = V2DI, V4SF + +/// lsx_vftintrmh_l_s +name = lsx_vftintrmh_l_s +asm-fmts = vd, vj +data-types = V2DI, V4SF + +/// lsx_vftintrnel_l_s +name = lsx_vftintrnel_l_s +asm-fmts = vd, vj +data-types = V2DI, V4SF + +/// lsx_vftintrneh_l_s +name = lsx_vftintrneh_l_s +asm-fmts = vd, vj +data-types = V2DI, V4SF + +/// lsx_vfrintrne_s +name = lsx_vfrintrne_s +asm-fmts = vd, vj +data-types = V4SF, V4SF + +/// lsx_vfrintrne_d +name = lsx_vfrintrne_d +asm-fmts = vd, vj +data-types = V2DF, V2DF + +/// lsx_vfrintrz_s +name = lsx_vfrintrz_s +asm-fmts = vd, vj +data-types = V4SF, V4SF + +/// lsx_vfrintrz_d +name = lsx_vfrintrz_d +asm-fmts = vd, vj +data-types = V2DF, V2DF + +/// lsx_vfrintrp_s +name = lsx_vfrintrp_s +asm-fmts = vd, vj +data-types = V4SF, V4SF + +/// lsx_vfrintrp_d +name = lsx_vfrintrp_d +asm-fmts = vd, vj +data-types = V2DF, V2DF + +/// lsx_vfrintrm_s +name = lsx_vfrintrm_s +asm-fmts = vd, vj +data-types = V4SF, V4SF + +/// lsx_vfrintrm_d +name = lsx_vfrintrm_d +asm-fmts = vd, vj 
+data-types = V2DF, V2DF + +/// lsx_vstelm_b +name = lsx_vstelm_b +asm-fmts = vd, rj, si8, idx +data-types = VOID, V16QI, CVPOINTER, SI, UQI + +/// lsx_vstelm_h +name = lsx_vstelm_h +asm-fmts = vd, rj, si8, idx +data-types = VOID, V8HI, CVPOINTER, SI, UQI + +/// lsx_vstelm_w +name = lsx_vstelm_w +asm-fmts = vd, rj, si8, idx +data-types = VOID, V4SI, CVPOINTER, SI, UQI + +/// lsx_vstelm_d +name = lsx_vstelm_d +asm-fmts = vd, rj, si8, idx +data-types = VOID, V2DI, CVPOINTER, SI, UQI + +/// lsx_vaddwev_d_w +name = lsx_vaddwev_d_w +asm-fmts = vd, vj, vk +data-types = V2DI, V4SI, V4SI + +/// lsx_vaddwev_w_h +name = lsx_vaddwev_w_h +asm-fmts = vd, vj, vk +data-types = V4SI, V8HI, V8HI + +/// lsx_vaddwev_h_b +name = lsx_vaddwev_h_b +asm-fmts = vd, vj, vk +data-types = V8HI, V16QI, V16QI + +/// lsx_vaddwod_d_w +name = lsx_vaddwod_d_w +asm-fmts = vd, vj, vk +data-types = V2DI, V4SI, V4SI + +/// lsx_vaddwod_w_h +name = lsx_vaddwod_w_h +asm-fmts = vd, vj, vk +data-types = V4SI, V8HI, V8HI + +/// lsx_vaddwod_h_b +name = lsx_vaddwod_h_b +asm-fmts = vd, vj, vk +data-types = V8HI, V16QI, V16QI + +/// lsx_vaddwev_d_wu +name = lsx_vaddwev_d_wu +asm-fmts = vd, vj, vk +data-types = V2DI, UV4SI, UV4SI + +/// lsx_vaddwev_w_hu +name = lsx_vaddwev_w_hu +asm-fmts = vd, vj, vk +data-types = V4SI, UV8HI, UV8HI + +/// lsx_vaddwev_h_bu +name = lsx_vaddwev_h_bu +asm-fmts = vd, vj, vk +data-types = V8HI, UV16QI, UV16QI + +/// lsx_vaddwod_d_wu +name = lsx_vaddwod_d_wu +asm-fmts = vd, vj, vk +data-types = V2DI, UV4SI, UV4SI + +/// lsx_vaddwod_w_hu +name = lsx_vaddwod_w_hu +asm-fmts = vd, vj, vk +data-types = V4SI, UV8HI, UV8HI + +/// lsx_vaddwod_h_bu +name = lsx_vaddwod_h_bu +asm-fmts = vd, vj, vk +data-types = V8HI, UV16QI, UV16QI + +/// lsx_vaddwev_d_wu_w +name = lsx_vaddwev_d_wu_w +asm-fmts = vd, vj, vk +data-types = V2DI, UV4SI, V4SI + +/// lsx_vaddwev_w_hu_h +name = lsx_vaddwev_w_hu_h +asm-fmts = vd, vj, vk +data-types = V4SI, UV8HI, V8HI + +/// lsx_vaddwev_h_bu_b +name = lsx_vaddwev_h_bu_b +asm-fmts = vd, vj, vk +data-types = V8HI, UV16QI, V16QI + +/// lsx_vaddwod_d_wu_w +name = lsx_vaddwod_d_wu_w +asm-fmts = vd, vj, vk +data-types = V2DI, UV4SI, V4SI + +/// lsx_vaddwod_w_hu_h +name = lsx_vaddwod_w_hu_h +asm-fmts = vd, vj, vk +data-types = V4SI, UV8HI, V8HI + +/// lsx_vaddwod_h_bu_b +name = lsx_vaddwod_h_bu_b +asm-fmts = vd, vj, vk +data-types = V8HI, UV16QI, V16QI + +/// lsx_vsubwev_d_w +name = lsx_vsubwev_d_w +asm-fmts = vd, vj, vk +data-types = V2DI, V4SI, V4SI + +/// lsx_vsubwev_w_h +name = lsx_vsubwev_w_h +asm-fmts = vd, vj, vk +data-types = V4SI, V8HI, V8HI + +/// lsx_vsubwev_h_b +name = lsx_vsubwev_h_b +asm-fmts = vd, vj, vk +data-types = V8HI, V16QI, V16QI + +/// lsx_vsubwod_d_w +name = lsx_vsubwod_d_w +asm-fmts = vd, vj, vk +data-types = V2DI, V4SI, V4SI + +/// lsx_vsubwod_w_h +name = lsx_vsubwod_w_h +asm-fmts = vd, vj, vk +data-types = V4SI, V8HI, V8HI + +/// lsx_vsubwod_h_b +name = lsx_vsubwod_h_b +asm-fmts = vd, vj, vk +data-types = V8HI, V16QI, V16QI + +/// lsx_vsubwev_d_wu +name = lsx_vsubwev_d_wu +asm-fmts = vd, vj, vk +data-types = V2DI, UV4SI, UV4SI + +/// lsx_vsubwev_w_hu +name = lsx_vsubwev_w_hu +asm-fmts = vd, vj, vk +data-types = V4SI, UV8HI, UV8HI + +/// lsx_vsubwev_h_bu +name = lsx_vsubwev_h_bu +asm-fmts = vd, vj, vk +data-types = V8HI, UV16QI, UV16QI + +/// lsx_vsubwod_d_wu +name = lsx_vsubwod_d_wu +asm-fmts = vd, vj, vk +data-types = V2DI, UV4SI, UV4SI + +/// lsx_vsubwod_w_hu +name = lsx_vsubwod_w_hu +asm-fmts = vd, vj, vk +data-types = V4SI, UV8HI, UV8HI + +/// lsx_vsubwod_h_bu +name = 
lsx_vsubwod_h_bu +asm-fmts = vd, vj, vk +data-types = V8HI, UV16QI, UV16QI + +/// lsx_vaddwev_q_d +name = lsx_vaddwev_q_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vaddwod_q_d +name = lsx_vaddwod_q_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vaddwev_q_du +name = lsx_vaddwev_q_du +asm-fmts = vd, vj, vk +data-types = V2DI, UV2DI, UV2DI + +/// lsx_vaddwod_q_du +name = lsx_vaddwod_q_du +asm-fmts = vd, vj, vk +data-types = V2DI, UV2DI, UV2DI + +/// lsx_vsubwev_q_d +name = lsx_vsubwev_q_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vsubwod_q_d +name = lsx_vsubwod_q_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vsubwev_q_du +name = lsx_vsubwev_q_du +asm-fmts = vd, vj, vk +data-types = V2DI, UV2DI, UV2DI + +/// lsx_vsubwod_q_du +name = lsx_vsubwod_q_du +asm-fmts = vd, vj, vk +data-types = V2DI, UV2DI, UV2DI + +/// lsx_vaddwev_q_du_d +name = lsx_vaddwev_q_du_d +asm-fmts = vd, vj, vk +data-types = V2DI, UV2DI, V2DI + +/// lsx_vaddwod_q_du_d +name = lsx_vaddwod_q_du_d +asm-fmts = vd, vj, vk +data-types = V2DI, UV2DI, V2DI + +/// lsx_vmulwev_d_w +name = lsx_vmulwev_d_w +asm-fmts = vd, vj, vk +data-types = V2DI, V4SI, V4SI + +/// lsx_vmulwev_w_h +name = lsx_vmulwev_w_h +asm-fmts = vd, vj, vk +data-types = V4SI, V8HI, V8HI + +/// lsx_vmulwev_h_b +name = lsx_vmulwev_h_b +asm-fmts = vd, vj, vk +data-types = V8HI, V16QI, V16QI + +/// lsx_vmulwod_d_w +name = lsx_vmulwod_d_w +asm-fmts = vd, vj, vk +data-types = V2DI, V4SI, V4SI + +/// lsx_vmulwod_w_h +name = lsx_vmulwod_w_h +asm-fmts = vd, vj, vk +data-types = V4SI, V8HI, V8HI + +/// lsx_vmulwod_h_b +name = lsx_vmulwod_h_b +asm-fmts = vd, vj, vk +data-types = V8HI, V16QI, V16QI + +/// lsx_vmulwev_d_wu +name = lsx_vmulwev_d_wu +asm-fmts = vd, vj, vk +data-types = V2DI, UV4SI, UV4SI + +/// lsx_vmulwev_w_hu +name = lsx_vmulwev_w_hu +asm-fmts = vd, vj, vk +data-types = V4SI, UV8HI, UV8HI + +/// lsx_vmulwev_h_bu +name = lsx_vmulwev_h_bu +asm-fmts = vd, vj, vk +data-types = V8HI, UV16QI, UV16QI + +/// lsx_vmulwod_d_wu +name = lsx_vmulwod_d_wu +asm-fmts = vd, vj, vk +data-types = V2DI, UV4SI, UV4SI + +/// lsx_vmulwod_w_hu +name = lsx_vmulwod_w_hu +asm-fmts = vd, vj, vk +data-types = V4SI, UV8HI, UV8HI + +/// lsx_vmulwod_h_bu +name = lsx_vmulwod_h_bu +asm-fmts = vd, vj, vk +data-types = V8HI, UV16QI, UV16QI + +/// lsx_vmulwev_d_wu_w +name = lsx_vmulwev_d_wu_w +asm-fmts = vd, vj, vk +data-types = V2DI, UV4SI, V4SI + +/// lsx_vmulwev_w_hu_h +name = lsx_vmulwev_w_hu_h +asm-fmts = vd, vj, vk +data-types = V4SI, UV8HI, V8HI + +/// lsx_vmulwev_h_bu_b +name = lsx_vmulwev_h_bu_b +asm-fmts = vd, vj, vk +data-types = V8HI, UV16QI, V16QI + +/// lsx_vmulwod_d_wu_w +name = lsx_vmulwod_d_wu_w +asm-fmts = vd, vj, vk +data-types = V2DI, UV4SI, V4SI + +/// lsx_vmulwod_w_hu_h +name = lsx_vmulwod_w_hu_h +asm-fmts = vd, vj, vk +data-types = V4SI, UV8HI, V8HI + +/// lsx_vmulwod_h_bu_b +name = lsx_vmulwod_h_bu_b +asm-fmts = vd, vj, vk +data-types = V8HI, UV16QI, V16QI + +/// lsx_vmulwev_q_d +name = lsx_vmulwev_q_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vmulwod_q_d +name = lsx_vmulwod_q_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vmulwev_q_du +name = lsx_vmulwev_q_du +asm-fmts = vd, vj, vk +data-types = V2DI, UV2DI, UV2DI + +/// lsx_vmulwod_q_du +name = lsx_vmulwod_q_du +asm-fmts = vd, vj, vk +data-types = V2DI, UV2DI, UV2DI + +/// lsx_vmulwev_q_du_d +name = lsx_vmulwev_q_du_d +asm-fmts = vd, vj, vk +data-types = V2DI, UV2DI, V2DI + +/// 
lsx_vmulwod_q_du_d +name = lsx_vmulwod_q_du_d +asm-fmts = vd, vj, vk +data-types = V2DI, UV2DI, V2DI + +/// lsx_vhaddw_q_d +name = lsx_vhaddw_q_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vhaddw_qu_du +name = lsx_vhaddw_qu_du +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI + +/// lsx_vhsubw_q_d +name = lsx_vhsubw_q_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vhsubw_qu_du +name = lsx_vhsubw_qu_du +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI + +/// lsx_vmaddwev_d_w +name = lsx_vmaddwev_d_w +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V4SI, V4SI + +/// lsx_vmaddwev_w_h +name = lsx_vmaddwev_w_h +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V8HI, V8HI + +/// lsx_vmaddwev_h_b +name = lsx_vmaddwev_h_b +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V16QI, V16QI + +/// lsx_vmaddwev_d_wu +name = lsx_vmaddwev_d_wu +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV4SI, UV4SI + +/// lsx_vmaddwev_w_hu +name = lsx_vmaddwev_w_hu +asm-fmts = vd, vj, vk +data-types = UV4SI, UV4SI, UV8HI, UV8HI + +/// lsx_vmaddwev_h_bu +name = lsx_vmaddwev_h_bu +asm-fmts = vd, vj, vk +data-types = UV8HI, UV8HI, UV16QI, UV16QI + +/// lsx_vmaddwod_d_w +name = lsx_vmaddwod_d_w +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V4SI, V4SI + +/// lsx_vmaddwod_w_h +name = lsx_vmaddwod_w_h +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V8HI, V8HI + +/// lsx_vmaddwod_h_b +name = lsx_vmaddwod_h_b +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V16QI, V16QI + +/// lsx_vmaddwod_d_wu +name = lsx_vmaddwod_d_wu +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV4SI, UV4SI + +/// lsx_vmaddwod_w_hu +name = lsx_vmaddwod_w_hu +asm-fmts = vd, vj, vk +data-types = UV4SI, UV4SI, UV8HI, UV8HI + +/// lsx_vmaddwod_h_bu +name = lsx_vmaddwod_h_bu +asm-fmts = vd, vj, vk +data-types = UV8HI, UV8HI, UV16QI, UV16QI + +/// lsx_vmaddwev_d_wu_w +name = lsx_vmaddwev_d_wu_w +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, UV4SI, V4SI + +/// lsx_vmaddwev_w_hu_h +name = lsx_vmaddwev_w_hu_h +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, UV8HI, V8HI + +/// lsx_vmaddwev_h_bu_b +name = lsx_vmaddwev_h_bu_b +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, UV16QI, V16QI + +/// lsx_vmaddwod_d_wu_w +name = lsx_vmaddwod_d_wu_w +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, UV4SI, V4SI + +/// lsx_vmaddwod_w_hu_h +name = lsx_vmaddwod_w_hu_h +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, UV8HI, V8HI + +/// lsx_vmaddwod_h_bu_b +name = lsx_vmaddwod_h_bu_b +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, UV16QI, V16QI + +/// lsx_vmaddwev_q_d +name = lsx_vmaddwev_q_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI, V2DI + +/// lsx_vmaddwod_q_d +name = lsx_vmaddwod_q_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI, V2DI + +/// lsx_vmaddwev_q_du +name = lsx_vmaddwev_q_du +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI, UV2DI + +/// lsx_vmaddwod_q_du +name = lsx_vmaddwod_q_du +asm-fmts = vd, vj, vk +data-types = UV2DI, UV2DI, UV2DI, UV2DI + +/// lsx_vmaddwev_q_du_d +name = lsx_vmaddwev_q_du_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, UV2DI, V2DI + +/// lsx_vmaddwod_q_du_d +name = lsx_vmaddwod_q_du_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, UV2DI, V2DI + +/// lsx_vrotr_b +name = lsx_vrotr_b +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vrotr_h +name = lsx_vrotr_h +asm-fmts = vd, vj, vk +data-types = V8HI, V8HI, V8HI + +/// lsx_vrotr_w +name = lsx_vrotr_w +asm-fmts = vd, vj, vk +data-types = V4SI, V4SI, V4SI + +/// lsx_vrotr_d +name = 
lsx_vrotr_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vadd_q +name = lsx_vadd_q +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vsub_q +name = lsx_vsub_q +asm-fmts = vd, vj, vk +data-types = V2DI, V2DI, V2DI + +/// lsx_vldrepl_b +name = lsx_vldrepl_b +asm-fmts = vd, rj, si12 +data-types = V16QI, CVPOINTER, SI + +/// lsx_vldrepl_h +name = lsx_vldrepl_h +asm-fmts = vd, rj, si11 +data-types = V8HI, CVPOINTER, SI + +/// lsx_vldrepl_w +name = lsx_vldrepl_w +asm-fmts = vd, rj, si10 +data-types = V4SI, CVPOINTER, SI + +/// lsx_vldrepl_d +name = lsx_vldrepl_d +asm-fmts = vd, rj, si9 +data-types = V2DI, CVPOINTER, SI + +/// lsx_vmskgez_b +name = lsx_vmskgez_b +asm-fmts = vd, vj +data-types = V16QI, V16QI + +/// lsx_vmsknz_b +name = lsx_vmsknz_b +asm-fmts = vd, vj +data-types = V16QI, V16QI + +/// lsx_vexth_h_b +name = lsx_vexth_h_b +asm-fmts = vd, vj +data-types = V8HI, V16QI + +/// lsx_vexth_w_h +name = lsx_vexth_w_h +asm-fmts = vd, vj +data-types = V4SI, V8HI + +/// lsx_vexth_d_w +name = lsx_vexth_d_w +asm-fmts = vd, vj +data-types = V2DI, V4SI + +/// lsx_vexth_q_d +name = lsx_vexth_q_d +asm-fmts = vd, vj +data-types = V2DI, V2DI + +/// lsx_vexth_hu_bu +name = lsx_vexth_hu_bu +asm-fmts = vd, vj +data-types = UV8HI, UV16QI + +/// lsx_vexth_wu_hu +name = lsx_vexth_wu_hu +asm-fmts = vd, vj +data-types = UV4SI, UV8HI + +/// lsx_vexth_du_wu +name = lsx_vexth_du_wu +asm-fmts = vd, vj +data-types = UV2DI, UV4SI + +/// lsx_vexth_qu_du +name = lsx_vexth_qu_du +asm-fmts = vd, vj +data-types = UV2DI, UV2DI + +/// lsx_vrotri_b +name = lsx_vrotri_b +asm-fmts = vd, vj, ui3 +data-types = V16QI, V16QI, UQI + +/// lsx_vrotri_h +name = lsx_vrotri_h +asm-fmts = vd, vj, ui4 +data-types = V8HI, V8HI, UQI + +/// lsx_vrotri_w +name = lsx_vrotri_w +asm-fmts = vd, vj, ui5 +data-types = V4SI, V4SI, UQI + +/// lsx_vrotri_d +name = lsx_vrotri_d +asm-fmts = vd, vj, ui6 +data-types = V2DI, V2DI, UQI + +/// lsx_vextl_q_d +name = lsx_vextl_q_d +asm-fmts = vd, vj +data-types = V2DI, V2DI + +/// lsx_vsrlni_b_h +name = lsx_vsrlni_b_h +asm-fmts = vd, vj, ui4 +data-types = V16QI, V16QI, V16QI, USI + +/// lsx_vsrlni_h_w +name = lsx_vsrlni_h_w +asm-fmts = vd, vj, ui5 +data-types = V8HI, V8HI, V8HI, USI + +/// lsx_vsrlni_w_d +name = lsx_vsrlni_w_d +asm-fmts = vd, vj, ui6 +data-types = V4SI, V4SI, V4SI, USI + +/// lsx_vsrlni_d_q +name = lsx_vsrlni_d_q +asm-fmts = vd, vj, ui7 +data-types = V2DI, V2DI, V2DI, USI + +/// lsx_vsrlrni_b_h +name = lsx_vsrlrni_b_h +asm-fmts = vd, vj, ui4 +data-types = V16QI, V16QI, V16QI, USI + +/// lsx_vsrlrni_h_w +name = lsx_vsrlrni_h_w +asm-fmts = vd, vj, ui5 +data-types = V8HI, V8HI, V8HI, USI + +/// lsx_vsrlrni_w_d +name = lsx_vsrlrni_w_d +asm-fmts = vd, vj, ui6 +data-types = V4SI, V4SI, V4SI, USI + +/// lsx_vsrlrni_d_q +name = lsx_vsrlrni_d_q +asm-fmts = vd, vj, ui7 +data-types = V2DI, V2DI, V2DI, USI + +/// lsx_vssrlni_b_h +name = lsx_vssrlni_b_h +asm-fmts = vd, vj, ui4 +data-types = V16QI, V16QI, V16QI, USI + +/// lsx_vssrlni_h_w +name = lsx_vssrlni_h_w +asm-fmts = vd, vj, ui5 +data-types = V8HI, V8HI, V8HI, USI + +/// lsx_vssrlni_w_d +name = lsx_vssrlni_w_d +asm-fmts = vd, vj, ui6 +data-types = V4SI, V4SI, V4SI, USI + +/// lsx_vssrlni_d_q +name = lsx_vssrlni_d_q +asm-fmts = vd, vj, ui7 +data-types = V2DI, V2DI, V2DI, USI + +/// lsx_vssrlni_bu_h +name = lsx_vssrlni_bu_h +asm-fmts = vd, vj, ui4 +data-types = UV16QI, UV16QI, V16QI, USI + +/// lsx_vssrlni_hu_w +name = lsx_vssrlni_hu_w +asm-fmts = vd, vj, ui5 +data-types = UV8HI, UV8HI, V8HI, USI + +/// 
lsx_vssrlni_wu_d +name = lsx_vssrlni_wu_d +asm-fmts = vd, vj, ui6 +data-types = UV4SI, UV4SI, V4SI, USI + +/// lsx_vssrlni_du_q +name = lsx_vssrlni_du_q +asm-fmts = vd, vj, ui7 +data-types = UV2DI, UV2DI, V2DI, USI + +/// lsx_vssrlrni_b_h +name = lsx_vssrlrni_b_h +asm-fmts = vd, vj, ui4 +data-types = V16QI, V16QI, V16QI, USI + +/// lsx_vssrlrni_h_w +name = lsx_vssrlrni_h_w +asm-fmts = vd, vj, ui5 +data-types = V8HI, V8HI, V8HI, USI + +/// lsx_vssrlrni_w_d +name = lsx_vssrlrni_w_d +asm-fmts = vd, vj, ui6 +data-types = V4SI, V4SI, V4SI, USI + +/// lsx_vssrlrni_d_q +name = lsx_vssrlrni_d_q +asm-fmts = vd, vj, ui7 +data-types = V2DI, V2DI, V2DI, USI + +/// lsx_vssrlrni_bu_h +name = lsx_vssrlrni_bu_h +asm-fmts = vd, vj, ui4 +data-types = UV16QI, UV16QI, V16QI, USI + +/// lsx_vssrlrni_hu_w +name = lsx_vssrlrni_hu_w +asm-fmts = vd, vj, ui5 +data-types = UV8HI, UV8HI, V8HI, USI + +/// lsx_vssrlrni_wu_d +name = lsx_vssrlrni_wu_d +asm-fmts = vd, vj, ui6 +data-types = UV4SI, UV4SI, V4SI, USI + +/// lsx_vssrlrni_du_q +name = lsx_vssrlrni_du_q +asm-fmts = vd, vj, ui7 +data-types = UV2DI, UV2DI, V2DI, USI + +/// lsx_vsrani_b_h +name = lsx_vsrani_b_h +asm-fmts = vd, vj, ui4 +data-types = V16QI, V16QI, V16QI, USI + +/// lsx_vsrani_h_w +name = lsx_vsrani_h_w +asm-fmts = vd, vj, ui5 +data-types = V8HI, V8HI, V8HI, USI + +/// lsx_vsrani_w_d +name = lsx_vsrani_w_d +asm-fmts = vd, vj, ui6 +data-types = V4SI, V4SI, V4SI, USI + +/// lsx_vsrani_d_q +name = lsx_vsrani_d_q +asm-fmts = vd, vj, ui7 +data-types = V2DI, V2DI, V2DI, USI + +/// lsx_vsrarni_b_h +name = lsx_vsrarni_b_h +asm-fmts = vd, vj, ui4 +data-types = V16QI, V16QI, V16QI, USI + +/// lsx_vsrarni_h_w +name = lsx_vsrarni_h_w +asm-fmts = vd, vj, ui5 +data-types = V8HI, V8HI, V8HI, USI + +/// lsx_vsrarni_w_d +name = lsx_vsrarni_w_d +asm-fmts = vd, vj, ui6 +data-types = V4SI, V4SI, V4SI, USI + +/// lsx_vsrarni_d_q +name = lsx_vsrarni_d_q +asm-fmts = vd, vj, ui7 +data-types = V2DI, V2DI, V2DI, USI + +/// lsx_vssrani_b_h +name = lsx_vssrani_b_h +asm-fmts = vd, vj, ui4 +data-types = V16QI, V16QI, V16QI, USI + +/// lsx_vssrani_h_w +name = lsx_vssrani_h_w +asm-fmts = vd, vj, ui5 +data-types = V8HI, V8HI, V8HI, USI + +/// lsx_vssrani_w_d +name = lsx_vssrani_w_d +asm-fmts = vd, vj, ui6 +data-types = V4SI, V4SI, V4SI, USI + +/// lsx_vssrani_d_q +name = lsx_vssrani_d_q +asm-fmts = vd, vj, ui7 +data-types = V2DI, V2DI, V2DI, USI + +/// lsx_vssrani_bu_h +name = lsx_vssrani_bu_h +asm-fmts = vd, vj, ui4 +data-types = UV16QI, UV16QI, V16QI, USI + +/// lsx_vssrani_hu_w +name = lsx_vssrani_hu_w +asm-fmts = vd, vj, ui5 +data-types = UV8HI, UV8HI, V8HI, USI + +/// lsx_vssrani_wu_d +name = lsx_vssrani_wu_d +asm-fmts = vd, vj, ui6 +data-types = UV4SI, UV4SI, V4SI, USI + +/// lsx_vssrani_du_q +name = lsx_vssrani_du_q +asm-fmts = vd, vj, ui7 +data-types = UV2DI, UV2DI, V2DI, USI + +/// lsx_vssrarni_b_h +name = lsx_vssrarni_b_h +asm-fmts = vd, vj, ui4 +data-types = V16QI, V16QI, V16QI, USI + +/// lsx_vssrarni_h_w +name = lsx_vssrarni_h_w +asm-fmts = vd, vj, ui5 +data-types = V8HI, V8HI, V8HI, USI + +/// lsx_vssrarni_w_d +name = lsx_vssrarni_w_d +asm-fmts = vd, vj, ui6 +data-types = V4SI, V4SI, V4SI, USI + +/// lsx_vssrarni_d_q +name = lsx_vssrarni_d_q +asm-fmts = vd, vj, ui7 +data-types = V2DI, V2DI, V2DI, USI + +/// lsx_vssrarni_bu_h +name = lsx_vssrarni_bu_h +asm-fmts = vd, vj, ui4 +data-types = UV16QI, UV16QI, V16QI, USI + +/// lsx_vssrarni_hu_w +name = lsx_vssrarni_hu_w +asm-fmts = vd, vj, ui5 +data-types = UV8HI, UV8HI, V8HI, USI + +/// lsx_vssrarni_wu_d +name = 
lsx_vssrarni_wu_d +asm-fmts = vd, vj, ui6 +data-types = UV4SI, UV4SI, V4SI, USI + +/// lsx_vssrarni_du_q +name = lsx_vssrarni_du_q +asm-fmts = vd, vj, ui7 +data-types = UV2DI, UV2DI, V2DI, USI + +/// lsx_vpermi_w +name = lsx_vpermi_w +asm-fmts = vd, vj, ui8 +data-types = V4SI, V4SI, V4SI, USI + +/// lsx_vld +name = lsx_vld +asm-fmts = vd, rj, si12 +data-types = V16QI, CVPOINTER, SI + +/// lsx_vst +name = lsx_vst +asm-fmts = vd, rj, si12 +data-types = VOID, V16QI, CVPOINTER, SI + +/// lsx_vssrlrn_b_h +name = lsx_vssrlrn_b_h +asm-fmts = vd, vj, vk +data-types = V16QI, V8HI, V8HI + +/// lsx_vssrlrn_h_w +name = lsx_vssrlrn_h_w +asm-fmts = vd, vj, vk +data-types = V8HI, V4SI, V4SI + +/// lsx_vssrlrn_w_d +name = lsx_vssrlrn_w_d +asm-fmts = vd, vj, vk +data-types = V4SI, V2DI, V2DI + +/// lsx_vssrln_b_h +name = lsx_vssrln_b_h +asm-fmts = vd, vj, vk +data-types = V16QI, V8HI, V8HI + +/// lsx_vssrln_h_w +name = lsx_vssrln_h_w +asm-fmts = vd, vj, vk +data-types = V8HI, V4SI, V4SI + +/// lsx_vssrln_w_d +name = lsx_vssrln_w_d +asm-fmts = vd, vj, vk +data-types = V4SI, V2DI, V2DI + +/// lsx_vorn_v +name = lsx_vorn_v +asm-fmts = vd, vj, vk +data-types = V16QI, V16QI, V16QI + +/// lsx_vldi +name = lsx_vldi +asm-fmts = vd, i13 +data-types = V2DI, HI + +/// lsx_vshuf_b +name = lsx_vshuf_b +asm-fmts = vd, vj, vk, va +data-types = V16QI, V16QI, V16QI, V16QI + +/// lsx_vldx +name = lsx_vldx +asm-fmts = vd, rj, rk +data-types = V16QI, CVPOINTER, DI + +/// lsx_vstx +name = lsx_vstx +asm-fmts = vd, rj, rk +data-types = VOID, V16QI, CVPOINTER, DI + +/// lsx_vextl_qu_du +name = lsx_vextl_qu_du +asm-fmts = vd, vj +data-types = UV2DI, UV2DI + +/// lsx_bnz_b +name = lsx_bnz_b +asm-fmts = cd, vj +data-types = SI, UV16QI + +/// lsx_bnz_d +name = lsx_bnz_d +asm-fmts = cd, vj +data-types = SI, UV2DI + +/// lsx_bnz_h +name = lsx_bnz_h +asm-fmts = cd, vj +data-types = SI, UV8HI + +/// lsx_bnz_v +name = lsx_bnz_v +asm-fmts = cd, vj +data-types = SI, UV16QI + +/// lsx_bnz_w +name = lsx_bnz_w +asm-fmts = cd, vj +data-types = SI, UV4SI + +/// lsx_bz_b +name = lsx_bz_b +asm-fmts = cd, vj +data-types = SI, UV16QI + +/// lsx_bz_d +name = lsx_bz_d +asm-fmts = cd, vj +data-types = SI, UV2DI + +/// lsx_bz_h +name = lsx_bz_h +asm-fmts = cd, vj +data-types = SI, UV8HI + +/// lsx_bz_v +name = lsx_bz_v +asm-fmts = cd, vj +data-types = SI, UV16QI + +/// lsx_bz_w +name = lsx_bz_w +asm-fmts = cd, vj +data-types = SI, UV4SI + +/// lsx_vfcmp_caf_d +name = lsx_vfcmp_caf_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_caf_s +name = lsx_vfcmp_caf_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_ceq_d +name = lsx_vfcmp_ceq_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_ceq_s +name = lsx_vfcmp_ceq_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_cle_d +name = lsx_vfcmp_cle_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_cle_s +name = lsx_vfcmp_cle_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_clt_d +name = lsx_vfcmp_clt_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_clt_s +name = lsx_vfcmp_clt_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_cne_d +name = lsx_vfcmp_cne_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_cne_s +name = lsx_vfcmp_cne_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_cor_d +name = lsx_vfcmp_cor_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_cor_s 
+name = lsx_vfcmp_cor_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_cueq_d +name = lsx_vfcmp_cueq_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_cueq_s +name = lsx_vfcmp_cueq_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_cule_d +name = lsx_vfcmp_cule_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_cule_s +name = lsx_vfcmp_cule_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_cult_d +name = lsx_vfcmp_cult_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_cult_s +name = lsx_vfcmp_cult_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_cun_d +name = lsx_vfcmp_cun_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_cune_d +name = lsx_vfcmp_cune_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_cune_s +name = lsx_vfcmp_cune_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_cun_s +name = lsx_vfcmp_cun_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_saf_d +name = lsx_vfcmp_saf_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_saf_s +name = lsx_vfcmp_saf_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_seq_d +name = lsx_vfcmp_seq_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_seq_s +name = lsx_vfcmp_seq_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_sle_d +name = lsx_vfcmp_sle_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_sle_s +name = lsx_vfcmp_sle_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_slt_d +name = lsx_vfcmp_slt_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_slt_s +name = lsx_vfcmp_slt_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_sne_d +name = lsx_vfcmp_sne_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_sne_s +name = lsx_vfcmp_sne_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_sor_d +name = lsx_vfcmp_sor_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_sor_s +name = lsx_vfcmp_sor_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_sueq_d +name = lsx_vfcmp_sueq_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_sueq_s +name = lsx_vfcmp_sueq_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_sule_d +name = lsx_vfcmp_sule_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_sule_s +name = lsx_vfcmp_sule_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_sult_d +name = lsx_vfcmp_sult_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_sult_s +name = lsx_vfcmp_sult_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_sun_d +name = lsx_vfcmp_sun_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_sune_d +name = lsx_vfcmp_sune_d +asm-fmts = vd, vj, vk +data-types = V2DI, V2DF, V2DF + +/// lsx_vfcmp_sune_s +name = lsx_vfcmp_sune_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vfcmp_sun_s +name = lsx_vfcmp_sun_s +asm-fmts = vd, vj, vk +data-types = V4SI, V4SF, V4SF + +/// lsx_vrepli_b +name = lsx_vrepli_b +asm-fmts = vd, si10 +data-types = V16QI, HI + +/// lsx_vrepli_d +name = lsx_vrepli_d +asm-fmts = vd, si10 +data-types = V2DI, HI + +/// lsx_vrepli_h +name = lsx_vrepli_h 
+asm-fmts = vd, si10 +data-types = V8HI, HI + +/// lsx_vrepli_w +name = lsx_vrepli_w +asm-fmts = vd, si10 +data-types = V4SI, HI + diff --git a/library/stdarch/crates/stdarch-gen-loongarch/lsxintrin.h b/library/stdarch/crates/stdarch-gen-loongarch/lsxintrin.h new file mode 100644 index 0000000000000..943f2df913e4d --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-loongarch/lsxintrin.h @@ -0,0 +1,5219 @@ +/* + * https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/lsxintrin.h;hb=61f1001f2f4ab9128e5eb6e9a4adbbb0f9f0bc75 + */ + +/* LARCH Loongson SX intrinsics include file. + + Copyright (C) 2018-2024 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _GCC_LOONGSON_SXINTRIN_H +#define _GCC_LOONGSON_SXINTRIN_H 1 + +#if defined(__loongarch_sx) +typedef signed char v16i8 __attribute__ ((vector_size(16), aligned(16))); +typedef signed char v16i8_b __attribute__ ((vector_size(16), aligned(1))); +typedef unsigned char v16u8 __attribute__ ((vector_size(16), aligned(16))); +typedef unsigned char v16u8_b __attribute__ ((vector_size(16), aligned(1))); +typedef short v8i16 __attribute__ ((vector_size(16), aligned(16))); +typedef short v8i16_h __attribute__ ((vector_size(16), aligned(2))); +typedef unsigned short v8u16 __attribute__ ((vector_size(16), aligned(16))); +typedef unsigned short v8u16_h __attribute__ ((vector_size(16), aligned(2))); +typedef int v4i32 __attribute__ ((vector_size(16), aligned(16))); +typedef int v4i32_w __attribute__ ((vector_size(16), aligned(4))); +typedef unsigned int v4u32 __attribute__ ((vector_size(16), aligned(16))); +typedef unsigned int v4u32_w __attribute__ ((vector_size(16), aligned(4))); +typedef long long v2i64 __attribute__ ((vector_size(16), aligned(16))); +typedef long long v2i64_d __attribute__ ((vector_size(16), aligned(8))); +typedef unsigned long long v2u64 __attribute__ ((vector_size(16), aligned(16))); +typedef unsigned long long v2u64_d __attribute__ ((vector_size(16), aligned(8))); +typedef float v4f32 __attribute__ ((vector_size(16), aligned(16))); +typedef float v4f32_w __attribute__ ((vector_size(16), aligned(4))); +typedef double v2f64 __attribute__ ((vector_size(16), aligned(16))); +typedef double v2f64_d __attribute__ ((vector_size(16), aligned(8))); + +typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); +typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); +typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI.
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsll_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsll_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsll_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsll_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsll_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsll_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsll_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsll_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, ui3. */ +/* Data types in instruction templates: V16QI, V16QI, UQI. */ +#define __lsx_vslli_b(/*__m128i*/ _1, /*ui3*/ _2) \ + ((__m128i)__builtin_lsx_vslli_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V8HI, V8HI, UQI. */ +#define __lsx_vslli_h(/*__m128i*/ _1, /*ui4*/ _2) \ + ((__m128i)__builtin_lsx_vslli_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V4SI, V4SI, UQI. */ +#define __lsx_vslli_w(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vslli_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: V2DI, V2DI, UQI. */ +#define __lsx_vslli_d(/*__m128i*/ _1, /*ui6*/ _2) \ + ((__m128i)__builtin_lsx_vslli_d ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsra_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsra_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsra_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsra_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsra_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsra_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsra_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsra_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, ui3. */ +/* Data types in instruction templates: V16QI, V16QI, UQI. */ +#define __lsx_vsrai_b(/*__m128i*/ _1, /*ui3*/ _2) \ + ((__m128i)__builtin_lsx_vsrai_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui4. 
*/ +/* Data types in instruction templates: V8HI, V8HI, UQI. */ +#define __lsx_vsrai_h(/*__m128i*/ _1, /*ui4*/ _2) \ + ((__m128i)__builtin_lsx_vsrai_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V4SI, V4SI, UQI. */ +#define __lsx_vsrai_w(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vsrai_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: V2DI, V2DI, UQI. */ +#define __lsx_vsrai_d(/*__m128i*/ _1, /*ui6*/ _2) \ + ((__m128i)__builtin_lsx_vsrai_d ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrar_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrar_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrar_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrar_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrar_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrar_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrar_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrar_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, ui3. */ +/* Data types in instruction templates: V16QI, V16QI, UQI. */ +#define __lsx_vsrari_b(/*__m128i*/ _1, /*ui3*/ _2) \ + ((__m128i)__builtin_lsx_vsrari_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V8HI, V8HI, UQI. */ +#define __lsx_vsrari_h(/*__m128i*/ _1, /*ui4*/ _2) \ + ((__m128i)__builtin_lsx_vsrari_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V4SI, V4SI, UQI. */ +#define __lsx_vsrari_w(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vsrari_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: V2DI, V2DI, UQI. */ +#define __lsx_vsrari_d(/*__m128i*/ _1, /*ui6*/ _2) \ + ((__m128i)__builtin_lsx_vsrari_d ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrl_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrl_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrl_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrl_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrl_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrl_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrl_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrl_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, ui3. */ +/* Data types in instruction templates: V16QI, V16QI, UQI. */ +#define __lsx_vsrli_b(/*__m128i*/ _1, /*ui3*/ _2) \ + ((__m128i)__builtin_lsx_vsrli_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V8HI, V8HI, UQI. */ +#define __lsx_vsrli_h(/*__m128i*/ _1, /*ui4*/ _2) \ + ((__m128i)__builtin_lsx_vsrli_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V4SI, V4SI, UQI. */ +#define __lsx_vsrli_w(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vsrli_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: V2DI, V2DI, UQI. */ +#define __lsx_vsrli_d(/*__m128i*/ _1, /*ui6*/ _2) \ + ((__m128i)__builtin_lsx_vsrli_d ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrlr_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrlr_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrlr_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrlr_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrlr_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrlr_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrlr_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrlr_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, ui3. */ +/* Data types in instruction templates: V16QI, V16QI, UQI. */ +#define __lsx_vsrlri_b(/*__m128i*/ _1, /*ui3*/ _2) \ + ((__m128i)__builtin_lsx_vsrlri_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V8HI, V8HI, UQI. */ +#define __lsx_vsrlri_h(/*__m128i*/ _1, /*ui4*/ _2) \ + ((__m128i)__builtin_lsx_vsrlri_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V4SI, V4SI, UQI. */ +#define __lsx_vsrlri_w(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vsrlri_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: V2DI, V2DI, UQI. 
*/ +#define __lsx_vsrlri_d(/*__m128i*/ _1, /*ui6*/ _2) \ + ((__m128i)__builtin_lsx_vsrlri_d ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vbitclr_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vbitclr_b ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vbitclr_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vbitclr_h ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vbitclr_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vbitclr_w ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vbitclr_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vbitclr_d ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, ui3. */ +/* Data types in instruction templates: UV16QI, UV16QI, UQI. */ +#define __lsx_vbitclri_b(/*__m128i*/ _1, /*ui3*/ _2) \ + ((__m128i)__builtin_lsx_vbitclri_b ((v16u8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: UV8HI, UV8HI, UQI. */ +#define __lsx_vbitclri_h(/*__m128i*/ _1, /*ui4*/ _2) \ + ((__m128i)__builtin_lsx_vbitclri_h ((v8u16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV4SI, UV4SI, UQI. */ +#define __lsx_vbitclri_w(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vbitclri_w ((v4u32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: UV2DI, UV2DI, UQI. */ +#define __lsx_vbitclri_d(/*__m128i*/ _1, /*ui6*/ _2) \ + ((__m128i)__builtin_lsx_vbitclri_d ((v2u64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vbitset_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vbitset_b ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vbitset_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vbitset_h ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vbitset_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vbitset_w ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vbitset_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vbitset_d ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, ui3. */ +/* Data types in instruction templates: UV16QI, UV16QI, UQI. */ +#define __lsx_vbitseti_b(/*__m128i*/ _1, /*ui3*/ _2) \ + ((__m128i)__builtin_lsx_vbitseti_b ((v16u8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: UV8HI, UV8HI, UQI. */ +#define __lsx_vbitseti_h(/*__m128i*/ _1, /*ui4*/ _2) \ + ((__m128i)__builtin_lsx_vbitseti_h ((v8u16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV4SI, UV4SI, UQI. */ +#define __lsx_vbitseti_w(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vbitseti_w ((v4u32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: UV2DI, UV2DI, UQI. */ +#define __lsx_vbitseti_d(/*__m128i*/ _1, /*ui6*/ _2) \ + ((__m128i)__builtin_lsx_vbitseti_d ((v2u64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vbitrev_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vbitrev_b ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vbitrev_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vbitrev_h ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vbitrev_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vbitrev_w ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vbitrev_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vbitrev_d ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, ui3. */ +/* Data types in instruction templates: UV16QI, UV16QI, UQI. */ +#define __lsx_vbitrevi_b(/*__m128i*/ _1, /*ui3*/ _2) \ + ((__m128i)__builtin_lsx_vbitrevi_b ((v16u8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: UV8HI, UV8HI, UQI. */ +#define __lsx_vbitrevi_h(/*__m128i*/ _1, /*ui4*/ _2) \ + ((__m128i)__builtin_lsx_vbitrevi_h ((v8u16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV4SI, UV4SI, UQI. */ +#define __lsx_vbitrevi_w(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vbitrevi_w ((v4u32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: UV2DI, UV2DI, UQI. */ +#define __lsx_vbitrevi_d(/*__m128i*/ _1, /*ui6*/ _2) \ + ((__m128i)__builtin_lsx_vbitrevi_d ((v2u64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vadd_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vadd_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vadd_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vadd_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vadd_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vadd_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vadd_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vadd_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V16QI, V16QI, UQI. */ +#define __lsx_vaddi_bu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vaddi_bu ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V8HI, V8HI, UQI. */ +#define __lsx_vaddi_hu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vaddi_hu ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V4SI, V4SI, UQI. */ +#define __lsx_vaddi_wu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vaddi_wu ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V2DI, V2DI, UQI. */ +#define __lsx_vaddi_du(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vaddi_du ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsub_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsub_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsub_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsub_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsub_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsub_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsub_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsub_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V16QI, V16QI, UQI. */ +#define __lsx_vsubi_bu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vsubi_bu ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. 
*/ +/* Data types in instruction templates: V8HI, V8HI, UQI. */ +#define __lsx_vsubi_hu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vsubi_hu ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V4SI, V4SI, UQI. */ +#define __lsx_vsubi_wu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vsubi_wu ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V2DI, V2DI, UQI. */ +#define __lsx_vsubi_du(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vsubi_du ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmax_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmax_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmax_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmax_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmax_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmax_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmax_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmax_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V16QI, V16QI, QI. */ +#define __lsx_vmaxi_b(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vmaxi_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V8HI, V8HI, QI. */ +#define __lsx_vmaxi_h(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vmaxi_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V4SI, V4SI, QI. */ +#define __lsx_vmaxi_w(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vmaxi_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V2DI, V2DI, QI. */ +#define __lsx_vmaxi_d(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vmaxi_d ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmax_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmax_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmax_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmax_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV4SI, UV4SI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmax_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmax_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmax_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmax_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV16QI, UV16QI, UQI. */ +#define __lsx_vmaxi_bu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vmaxi_bu ((v16u8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV8HI, UV8HI, UQI. */ +#define __lsx_vmaxi_hu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vmaxi_hu ((v8u16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV4SI, UV4SI, UQI. */ +#define __lsx_vmaxi_wu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vmaxi_wu ((v4u32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV2DI, UV2DI, UQI. */ +#define __lsx_vmaxi_du(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vmaxi_du ((v2u64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmin_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmin_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmin_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmin_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmin_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmin_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmin_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmin_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V16QI, V16QI, QI. */ +#define __lsx_vmini_b(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vmini_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V8HI, V8HI, QI. */ +#define __lsx_vmini_h(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vmini_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V4SI, V4SI, QI. */ +#define __lsx_vmini_w(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vmini_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V2DI, V2DI, QI. 
*/ +#define __lsx_vmini_d(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vmini_d ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmin_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmin_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmin_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmin_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmin_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmin_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmin_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmin_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV16QI, UV16QI, UQI. */ +#define __lsx_vmini_bu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vmini_bu ((v16u8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV8HI, UV8HI, UQI. */ +#define __lsx_vmini_hu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vmini_hu ((v8u16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV4SI, UV4SI, UQI. */ +#define __lsx_vmini_wu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vmini_wu ((v4u32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV2DI, UV2DI, UQI. */ +#define __lsx_vmini_du(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vmini_du ((v2u64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vseq_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vseq_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vseq_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vseq_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vseq_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vseq_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vseq_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vseq_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V16QI, V16QI, QI. */ +#define __lsx_vseqi_b(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vseqi_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V8HI, V8HI, QI. */ +#define __lsx_vseqi_h(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vseqi_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V4SI, V4SI, QI. */ +#define __lsx_vseqi_w(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vseqi_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V2DI, V2DI, QI. */ +#define __lsx_vseqi_d(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vseqi_d ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V16QI, V16QI, QI. */ +#define __lsx_vslti_b(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vslti_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vslt_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vslt_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vslt_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vslt_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vslt_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vslt_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vslt_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vslt_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V8HI, V8HI, QI. */ +#define __lsx_vslti_h(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vslti_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V4SI, V4SI, QI. */ +#define __lsx_vslti_w(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vslti_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V2DI, V2DI, QI. */ +#define __lsx_vslti_d(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vslti_d ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, UV16QI, UV16QI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vslt_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vslt_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vslt_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vslt_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vslt_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vslt_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vslt_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vslt_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V16QI, UV16QI, UQI. */ +#define __lsx_vslti_bu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vslti_bu ((v16u8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V8HI, UV8HI, UQI. */ +#define __lsx_vslti_hu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vslti_hu ((v8u16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V4SI, UV4SI, UQI. */ +#define __lsx_vslti_wu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vslti_wu ((v4u32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V2DI, UV2DI, UQI. */ +#define __lsx_vslti_du(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vslti_du ((v2u64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsle_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsle_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsle_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsle_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsle_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsle_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsle_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsle_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V16QI, V16QI, QI. 
*/ +#define __lsx_vslei_b(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vslei_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V8HI, V8HI, QI. */ +#define __lsx_vslei_h(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vslei_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V4SI, V4SI, QI. */ +#define __lsx_vslei_w(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vslei_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, si5. */ +/* Data types in instruction templates: V2DI, V2DI, QI. */ +#define __lsx_vslei_d(/*__m128i*/ _1, /*si5*/ _2) \ + ((__m128i)__builtin_lsx_vslei_d ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsle_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsle_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsle_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsle_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsle_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsle_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsle_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsle_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V16QI, UV16QI, UQI. */ +#define __lsx_vslei_bu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vslei_bu ((v16u8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V8HI, UV8HI, UQI. */ +#define __lsx_vslei_hu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vslei_hu ((v8u16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V4SI, UV4SI, UQI. */ +#define __lsx_vslei_wu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vslei_wu ((v4u32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V2DI, UV2DI, UQI. */ +#define __lsx_vslei_du(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vslei_du ((v2u64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui3. */ +/* Data types in instruction templates: V16QI, V16QI, UQI. */ +#define __lsx_vsat_b(/*__m128i*/ _1, /*ui3*/ _2) \ + ((__m128i)__builtin_lsx_vsat_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V8HI, V8HI, UQI. */ +#define __lsx_vsat_h(/*__m128i*/ _1, /*ui4*/ _2) \ + ((__m128i)__builtin_lsx_vsat_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V4SI, V4SI, UQI. 
*/ +#define __lsx_vsat_w(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vsat_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: V2DI, V2DI, UQI. */ +#define __lsx_vsat_d(/*__m128i*/ _1, /*ui6*/ _2) \ + ((__m128i)__builtin_lsx_vsat_d ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui3. */ +/* Data types in instruction templates: UV16QI, UV16QI, UQI. */ +#define __lsx_vsat_bu(/*__m128i*/ _1, /*ui3*/ _2) \ + ((__m128i)__builtin_lsx_vsat_bu ((v16u8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: UV8HI, UV8HI, UQI. */ +#define __lsx_vsat_hu(/*__m128i*/ _1, /*ui4*/ _2) \ + ((__m128i)__builtin_lsx_vsat_hu ((v8u16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV4SI, UV4SI, UQI. */ +#define __lsx_vsat_wu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vsat_wu ((v4u32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: UV2DI, UV2DI, UQI. */ +#define __lsx_vsat_du(/*__m128i*/ _1, /*ui6*/ _2) \ + ((__m128i)__builtin_lsx_vsat_du ((v2u64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vadda_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vadda_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vadda_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vadda_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vadda_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vadda_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vadda_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vadda_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsadd_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsadd_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsadd_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsadd_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsadd_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsadd_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsadd_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsadd_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsadd_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsadd_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsadd_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsadd_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsadd_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsadd_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsadd_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsadd_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavg_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavg_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavg_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavg_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavg_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavg_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavg_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavg_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavg_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavg_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavg_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavg_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV4SI, UV4SI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavg_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavg_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavg_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavg_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavgr_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavgr_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavgr_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavgr_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavgr_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavgr_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavgr_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavgr_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavgr_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavgr_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavgr_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavgr_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavgr_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavgr_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vavgr_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vavgr_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssub_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssub_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssub_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssub_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssub_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssub_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssub_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssub_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssub_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssub_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssub_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssub_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssub_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssub_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssub_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssub_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vabsd_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vabsd_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vabsd_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vabsd_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vabsd_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vabsd_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vabsd_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vabsd_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vabsd_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vabsd_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vabsd_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vabsd_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vabsd_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vabsd_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vabsd_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vabsd_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmul_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmul_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmul_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmul_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmul_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmul_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmul_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmul_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmadd_b (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmadd_b ((v16i8)_1, (v16i8)_2, (v16i8)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmadd_h (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmadd_h ((v8i16)_1, (v8i16)_2, (v8i16)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmadd_w (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmadd_w ((v4i32)_1, (v4i32)_2, (v4i32)_3); +} + +/* Assembly instruction format: vd, vj, vk. 
*/ +/* Data types in instruction templates: V2DI, V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmadd_d (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmadd_d ((v2i64)_1, (v2i64)_2, (v2i64)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmsub_b (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmsub_b ((v16i8)_1, (v16i8)_2, (v16i8)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmsub_h (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmsub_h ((v8i16)_1, (v8i16)_2, (v8i16)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmsub_w (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmsub_w ((v4i32)_1, (v4i32)_2, (v4i32)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmsub_d (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmsub_d ((v2i64)_1, (v2i64)_2, (v2i64)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vdiv_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vdiv_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vdiv_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vdiv_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vdiv_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vdiv_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vdiv_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vdiv_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vdiv_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vdiv_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV8HI, UV8HI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vdiv_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vdiv_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vdiv_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vdiv_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vdiv_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vdiv_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhaddw_h_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhaddw_h_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhaddw_w_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhaddw_w_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhaddw_d_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhaddw_d_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhaddw_hu_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhaddw_hu_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhaddw_wu_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhaddw_wu_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhaddw_du_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhaddw_du_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhsubw_h_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhsubw_h_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhsubw_w_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhsubw_w_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V4SI, V4SI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhsubw_d_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhsubw_d_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhsubw_hu_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhsubw_hu_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhsubw_wu_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhsubw_wu_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhsubw_du_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhsubw_du_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmod_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmod_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmod_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmod_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmod_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmod_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmod_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmod_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmod_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmod_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmod_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmod_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmod_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmod_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmod_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmod_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, rk. */ +/* Data types in instruction templates: V16QI, V16QI, SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vreplve_b (__m128i _1, int _2) +{ + return (__m128i)__builtin_lsx_vreplve_b ((v16i8)_1, (int)_2); +} + +/* Assembly instruction format: vd, vj, rk. */ +/* Data types in instruction templates: V8HI, V8HI, SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vreplve_h (__m128i _1, int _2) +{ + return (__m128i)__builtin_lsx_vreplve_h ((v8i16)_1, (int)_2); +} + +/* Assembly instruction format: vd, vj, rk. */ +/* Data types in instruction templates: V4SI, V4SI, SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vreplve_w (__m128i _1, int _2) +{ + return (__m128i)__builtin_lsx_vreplve_w ((v4i32)_1, (int)_2); +} + +/* Assembly instruction format: vd, vj, rk. */ +/* Data types in instruction templates: V2DI, V2DI, SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vreplve_d (__m128i _1, int _2) +{ + return (__m128i)__builtin_lsx_vreplve_d ((v2i64)_1, (int)_2); +} + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V16QI, V16QI, UQI. */ +#define __lsx_vreplvei_b(/*__m128i*/ _1, /*ui4*/ _2) \ + ((__m128i)__builtin_lsx_vreplvei_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui3. */ +/* Data types in instruction templates: V8HI, V8HI, UQI. */ +#define __lsx_vreplvei_h(/*__m128i*/ _1, /*ui3*/ _2) \ + ((__m128i)__builtin_lsx_vreplvei_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui2. */ +/* Data types in instruction templates: V4SI, V4SI, UQI. */ +#define __lsx_vreplvei_w(/*__m128i*/ _1, /*ui2*/ _2) \ + ((__m128i)__builtin_lsx_vreplvei_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui1. */ +/* Data types in instruction templates: V2DI, V2DI, UQI. */ +#define __lsx_vreplvei_d(/*__m128i*/ _1, /*ui1*/ _2) \ + ((__m128i)__builtin_lsx_vreplvei_d ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpickev_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpickev_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpickev_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpickev_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpickev_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpickev_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpickev_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpickev_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpickod_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpickod_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpickod_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpickod_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpickod_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpickod_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpickod_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpickod_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vilvh_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vilvh_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vilvh_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vilvh_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vilvh_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vilvh_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vilvh_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vilvh_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vilvl_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vilvl_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vilvl_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vilvl_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vilvl_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vilvl_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vilvl_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vilvl_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpackev_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpackev_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpackev_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpackev_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpackev_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpackev_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpackev_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpackev_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpackod_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpackod_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpackod_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpackod_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpackod_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpackod_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpackod_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vpackod_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vshuf_h (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vshuf_h ((v8i16)_1, (v8i16)_2, (v8i16)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI, V4SI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vshuf_w (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vshuf_w ((v4i32)_1, (v4i32)_2, (v4i32)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vshuf_d (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vshuf_d ((v2i64)_1, (v2i64)_2, (v2i64)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vand_v (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vand_v ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, ui8. */ +/* Data types in instruction templates: UV16QI, UV16QI, UQI. */ +#define __lsx_vandi_b(/*__m128i*/ _1, /*ui8*/ _2) \ + ((__m128i)__builtin_lsx_vandi_b ((v16u8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vor_v (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vor_v ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, ui8. */ +/* Data types in instruction templates: UV16QI, UV16QI, UQI. */ +#define __lsx_vori_b(/*__m128i*/ _1, /*ui8*/ _2) \ + ((__m128i)__builtin_lsx_vori_b ((v16u8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vnor_v (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vnor_v ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, ui8. */ +/* Data types in instruction templates: UV16QI, UV16QI, UQI. */ +#define __lsx_vnori_b(/*__m128i*/ _1, /*ui8*/ _2) \ + ((__m128i)__builtin_lsx_vnori_b ((v16u8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vxor_v (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vxor_v ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, ui8. */ +/* Data types in instruction templates: UV16QI, UV16QI, UQI. */ +#define __lsx_vxori_b(/*__m128i*/ _1, /*ui8*/ _2) \ + ((__m128i)__builtin_lsx_vxori_b ((v16u8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk, va. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vbitsel_v (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vbitsel_v ((v16u8)_1, (v16u8)_2, (v16u8)_3); +} + +/* Assembly instruction format: vd, vj, ui8. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI, USI. */ +#define __lsx_vbitseli_b(/*__m128i*/ _1, /*__m128i*/ _2, /*ui8*/ _3) \ + ((__m128i)__builtin_lsx_vbitseli_b ((v16u8)(_1), (v16u8)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui8. */ +/* Data types in instruction templates: V16QI, V16QI, USI. 
*/ +#define __lsx_vshuf4i_b(/*__m128i*/ _1, /*ui8*/ _2) \ + ((__m128i)__builtin_lsx_vshuf4i_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui8. */ +/* Data types in instruction templates: V8HI, V8HI, USI. */ +#define __lsx_vshuf4i_h(/*__m128i*/ _1, /*ui8*/ _2) \ + ((__m128i)__builtin_lsx_vshuf4i_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui8. */ +/* Data types in instruction templates: V4SI, V4SI, USI. */ +#define __lsx_vshuf4i_w(/*__m128i*/ _1, /*ui8*/ _2) \ + ((__m128i)__builtin_lsx_vshuf4i_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, rj. */ +/* Data types in instruction templates: V16QI, SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vreplgr2vr_b (int _1) +{ + return (__m128i)__builtin_lsx_vreplgr2vr_b ((int)_1); +} + +/* Assembly instruction format: vd, rj. */ +/* Data types in instruction templates: V8HI, SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vreplgr2vr_h (int _1) +{ + return (__m128i)__builtin_lsx_vreplgr2vr_h ((int)_1); +} + +/* Assembly instruction format: vd, rj. */ +/* Data types in instruction templates: V4SI, SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vreplgr2vr_w (int _1) +{ + return (__m128i)__builtin_lsx_vreplgr2vr_w ((int)_1); +} + +/* Assembly instruction format: vd, rj. */ +/* Data types in instruction templates: V2DI, DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vreplgr2vr_d (long int _1) +{ + return (__m128i)__builtin_lsx_vreplgr2vr_d ((long int)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpcnt_b (__m128i _1) +{ + return (__m128i)__builtin_lsx_vpcnt_b ((v16i8)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpcnt_h (__m128i _1) +{ + return (__m128i)__builtin_lsx_vpcnt_h ((v8i16)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpcnt_w (__m128i _1) +{ + return (__m128i)__builtin_lsx_vpcnt_w ((v4i32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vpcnt_d (__m128i _1) +{ + return (__m128i)__builtin_lsx_vpcnt_d ((v2i64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vclo_b (__m128i _1) +{ + return (__m128i)__builtin_lsx_vclo_b ((v16i8)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vclo_h (__m128i _1) +{ + return (__m128i)__builtin_lsx_vclo_h ((v8i16)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SI, V4SI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vclo_w (__m128i _1) +{ + return (__m128i)__builtin_lsx_vclo_w ((v4i32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vclo_d (__m128i _1) +{ + return (__m128i)__builtin_lsx_vclo_d ((v2i64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vclz_b (__m128i _1) +{ + return (__m128i)__builtin_lsx_vclz_b ((v16i8)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vclz_h (__m128i _1) +{ + return (__m128i)__builtin_lsx_vclz_h ((v8i16)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vclz_w (__m128i _1) +{ + return (__m128i)__builtin_lsx_vclz_w ((v4i32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vclz_d (__m128i _1) +{ + return (__m128i)__builtin_lsx_vclz_d ((v2i64)_1); +} + +/* Assembly instruction format: rd, vj, ui4. */ +/* Data types in instruction templates: SI, V16QI, UQI. */ +#define __lsx_vpickve2gr_b(/*__m128i*/ _1, /*ui4*/ _2) \ + ((int)__builtin_lsx_vpickve2gr_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: rd, vj, ui3. */ +/* Data types in instruction templates: SI, V8HI, UQI. */ +#define __lsx_vpickve2gr_h(/*__m128i*/ _1, /*ui3*/ _2) \ + ((int)__builtin_lsx_vpickve2gr_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: rd, vj, ui2. */ +/* Data types in instruction templates: SI, V4SI, UQI. */ +#define __lsx_vpickve2gr_w(/*__m128i*/ _1, /*ui2*/ _2) \ + ((int)__builtin_lsx_vpickve2gr_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: rd, vj, ui1. */ +/* Data types in instruction templates: DI, V2DI, UQI. */ +#define __lsx_vpickve2gr_d(/*__m128i*/ _1, /*ui1*/ _2) \ + ((long int)__builtin_lsx_vpickve2gr_d ((v2i64)(_1), (_2))) + +/* Assembly instruction format: rd, vj, ui4. */ +/* Data types in instruction templates: USI, V16QI, UQI. */ +#define __lsx_vpickve2gr_bu(/*__m128i*/ _1, /*ui4*/ _2) \ + ((unsigned int)__builtin_lsx_vpickve2gr_bu ((v16i8)(_1), (_2))) + +/* Assembly instruction format: rd, vj, ui3. */ +/* Data types in instruction templates: USI, V8HI, UQI. */ +#define __lsx_vpickve2gr_hu(/*__m128i*/ _1, /*ui3*/ _2) \ + ((unsigned int)__builtin_lsx_vpickve2gr_hu ((v8i16)(_1), (_2))) + +/* Assembly instruction format: rd, vj, ui2. */ +/* Data types in instruction templates: USI, V4SI, UQI. */ +#define __lsx_vpickve2gr_wu(/*__m128i*/ _1, /*ui2*/ _2) \ + ((unsigned int)__builtin_lsx_vpickve2gr_wu ((v4i32)(_1), (_2))) + +/* Assembly instruction format: rd, vj, ui1. */ +/* Data types in instruction templates: UDI, V2DI, UQI. */ +#define __lsx_vpickve2gr_du(/*__m128i*/ _1, /*ui1*/ _2) \ + ((unsigned long int)__builtin_lsx_vpickve2gr_du ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, rj, ui4. */ +/* Data types in instruction templates: V16QI, V16QI, SI, UQI. 
*/ +#define __lsx_vinsgr2vr_b(/*__m128i*/ _1, /*int*/ _2, /*ui4*/ _3) \ + ((__m128i)__builtin_lsx_vinsgr2vr_b ((v16i8)(_1), (int)(_2), (_3))) + +/* Assembly instruction format: vd, rj, ui3. */ +/* Data types in instruction templates: V8HI, V8HI, SI, UQI. */ +#define __lsx_vinsgr2vr_h(/*__m128i*/ _1, /*int*/ _2, /*ui3*/ _3) \ + ((__m128i)__builtin_lsx_vinsgr2vr_h ((v8i16)(_1), (int)(_2), (_3))) + +/* Assembly instruction format: vd, rj, ui2. */ +/* Data types in instruction templates: V4SI, V4SI, SI, UQI. */ +#define __lsx_vinsgr2vr_w(/*__m128i*/ _1, /*int*/ _2, /*ui2*/ _3) \ + ((__m128i)__builtin_lsx_vinsgr2vr_w ((v4i32)(_1), (int)(_2), (_3))) + +/* Assembly instruction format: vd, rj, ui1. */ +/* Data types in instruction templates: V2DI, V2DI, DI, UQI. */ +#define __lsx_vinsgr2vr_d(/*__m128i*/ _1, /*long int*/ _2, /*ui1*/ _3) \ + ((__m128i)__builtin_lsx_vinsgr2vr_d ((v2i64)(_1), (long int)(_2), (_3))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SF, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfadd_s (__m128 _1, __m128 _2) +{ + return (__m128)__builtin_lsx_vfadd_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DF, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfadd_d (__m128d _1, __m128d _2) +{ + return (__m128d)__builtin_lsx_vfadd_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SF, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfsub_s (__m128 _1, __m128 _2) +{ + return (__m128)__builtin_lsx_vfsub_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DF, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfsub_d (__m128d _1, __m128d _2) +{ + return (__m128d)__builtin_lsx_vfsub_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SF, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfmul_s (__m128 _1, __m128 _2) +{ + return (__m128)__builtin_lsx_vfmul_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DF, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfmul_d (__m128d _1, __m128d _2) +{ + return (__m128d)__builtin_lsx_vfmul_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SF, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfdiv_s (__m128 _1, __m128 _2) +{ + return (__m128)__builtin_lsx_vfdiv_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DF, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfdiv_d (__m128d _1, __m128d _2) +{ + return (__m128d)__builtin_lsx_vfdiv_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V4SF, V4SF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcvt_h_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcvt_h_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SF, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfcvt_s_d (__m128d _1, __m128d _2) +{ + return (__m128)__builtin_lsx_vfcvt_s_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SF, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfmin_s (__m128 _1, __m128 _2) +{ + return (__m128)__builtin_lsx_vfmin_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DF, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfmin_d (__m128d _1, __m128d _2) +{ + return (__m128d)__builtin_lsx_vfmin_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SF, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfmina_s (__m128 _1, __m128 _2) +{ + return (__m128)__builtin_lsx_vfmina_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DF, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfmina_d (__m128d _1, __m128d _2) +{ + return (__m128d)__builtin_lsx_vfmina_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SF, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfmax_s (__m128 _1, __m128 _2) +{ + return (__m128)__builtin_lsx_vfmax_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DF, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfmax_d (__m128d _1, __m128d _2) +{ + return (__m128d)__builtin_lsx_vfmax_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SF, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfmaxa_s (__m128 _1, __m128 _2) +{ + return (__m128)__builtin_lsx_vfmaxa_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DF, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfmaxa_d (__m128d _1, __m128d _2) +{ + return (__m128d)__builtin_lsx_vfmaxa_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfclass_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vfclass_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V2DF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfclass_d (__m128d _1) +{ + return (__m128i)__builtin_lsx_vfclass_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfsqrt_s (__m128 _1) +{ + return (__m128)__builtin_lsx_vfsqrt_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfsqrt_d (__m128d _1) +{ + return (__m128d)__builtin_lsx_vfsqrt_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfrecip_s (__m128 _1) +{ + return (__m128)__builtin_lsx_vfrecip_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfrecip_d (__m128d _1) +{ + return (__m128d)__builtin_lsx_vfrecip_d ((v2f64)_1); +} + +#if defined(__loongarch_frecipe) +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfrecipe_s (__m128 _1) +{ + return (__m128)__builtin_lsx_vfrecipe_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfrecipe_d (__m128d _1) +{ + return (__m128d)__builtin_lsx_vfrecipe_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfrsqrte_s (__m128 _1) +{ + return (__m128)__builtin_lsx_vfrsqrte_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfrsqrte_d (__m128d _1) +{ + return (__m128d)__builtin_lsx_vfrsqrte_d ((v2f64)_1); +} +#endif + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfrint_s (__m128 _1) +{ + return (__m128)__builtin_lsx_vfrint_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfrint_d (__m128d _1) +{ + return (__m128d)__builtin_lsx_vfrint_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfrsqrt_s (__m128 _1) +{ + return (__m128)__builtin_lsx_vfrsqrt_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V2DF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfrsqrt_d (__m128d _1) +{ + return (__m128d)__builtin_lsx_vfrsqrt_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vflogb_s (__m128 _1) +{ + return (__m128)__builtin_lsx_vflogb_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vflogb_d (__m128d _1) +{ + return (__m128d)__builtin_lsx_vflogb_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SF, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfcvth_s_h (__m128i _1) +{ + return (__m128)__builtin_lsx_vfcvth_s_h ((v8i16)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfcvth_d_s (__m128 _1) +{ + return (__m128d)__builtin_lsx_vfcvth_d_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SF, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfcvtl_s_h (__m128i _1) +{ + return (__m128)__builtin_lsx_vfcvtl_s_h ((v8i16)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfcvtl_d_s (__m128 _1) +{ + return (__m128d)__builtin_lsx_vfcvtl_d_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftint_w_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftint_w_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftint_l_d (__m128d _1) +{ + return (__m128i)__builtin_lsx_vftint_l_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: UV4SI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftint_wu_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftint_wu_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: UV2DI, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftint_lu_d (__m128d _1) +{ + return (__m128i)__builtin_lsx_vftint_lu_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrz_w_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftintrz_w_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V2DF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrz_l_d (__m128d _1) +{ + return (__m128i)__builtin_lsx_vftintrz_l_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: UV4SI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrz_wu_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftintrz_wu_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: UV2DI, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrz_lu_d (__m128d _1) +{ + return (__m128i)__builtin_lsx_vftintrz_lu_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SF, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vffint_s_w (__m128i _1) +{ + return (__m128)__builtin_lsx_vffint_s_w ((v4i32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vffint_d_l (__m128i _1) +{ + return (__m128d)__builtin_lsx_vffint_d_l ((v2i64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SF, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vffint_s_wu (__m128i _1) +{ + return (__m128)__builtin_lsx_vffint_s_wu ((v4u32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vffint_d_lu (__m128i _1) +{ + return (__m128d)__builtin_lsx_vffint_d_lu ((v2u64)_1); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vandn_v (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vandn_v ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vneg_b (__m128i _1) +{ + return (__m128i)__builtin_lsx_vneg_b ((v16i8)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vneg_h (__m128i _1) +{ + return (__m128i)__builtin_lsx_vneg_h ((v8i16)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vneg_w (__m128i _1) +{ + return (__m128i)__builtin_lsx_vneg_w ((v4i32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vneg_d (__m128i _1) +{ + return (__m128i)__builtin_lsx_vneg_d ((v2i64)_1); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmuh_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmuh_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmuh_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmuh_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmuh_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmuh_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmuh_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmuh_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmuh_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmuh_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmuh_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmuh_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmuh_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmuh_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmuh_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmuh_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, ui3. */ +/* Data types in instruction templates: V8HI, V16QI, UQI. */ +#define __lsx_vsllwil_h_b(/*__m128i*/ _1, /*ui3*/ _2) \ + ((__m128i)__builtin_lsx_vsllwil_h_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V4SI, V8HI, UQI. */ +#define __lsx_vsllwil_w_h(/*__m128i*/ _1, /*ui4*/ _2) \ + ((__m128i)__builtin_lsx_vsllwil_w_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V2DI, V4SI, UQI. */ +#define __lsx_vsllwil_d_w(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vsllwil_d_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui3. */ +/* Data types in instruction templates: UV8HI, UV16QI, UQI. */ +#define __lsx_vsllwil_hu_bu(/*__m128i*/ _1, /*ui3*/ _2) \ + ((__m128i)__builtin_lsx_vsllwil_hu_bu ((v16u8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: UV4SI, UV8HI, UQI. 
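*/

/* [Editor's note: illustrative sketch, not part of the vendored header or of
   this patch.  The shift-and-widen operations above are exposed as macros
   rather than inline functions because their ui3/ui4/ui5 operand maps to an
   immediate field of the instruction and must be an integer constant
   expression; a runtime shift amount will not compile.  The function name
   below is hypothetical.]  */

static inline __m128i
sllwil_h_b_by2_sketch (__m128i v)
{
  return __lsx_vsllwil_h_b (v, 2);   /* 2 is a compile-time constant, as required */
}

/* [End of editor's sketch.  The vendored header resumes below with
   __lsx_vsllwil_wu_hu, described by the comments just above this note.]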
*/ +#define __lsx_vsllwil_wu_hu(/*__m128i*/ _1, /*ui4*/ _2) \ + ((__m128i)__builtin_lsx_vsllwil_wu_hu ((v8u16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV2DI, UV4SI, UQI. */ +#define __lsx_vsllwil_du_wu(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vsllwil_du_wu ((v4u32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsran_b_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsran_b_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsran_h_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsran_h_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsran_w_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsran_w_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssran_b_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssran_b_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssran_h_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssran_h_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssran_w_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssran_w_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssran_bu_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssran_bu_h ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssran_hu_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssran_hu_w ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssran_wu_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssran_wu_d ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V8HI, V8HI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrarn_b_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrarn_b_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrarn_h_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrarn_h_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrarn_w_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrarn_w_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrarn_b_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrarn_b_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrarn_h_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrarn_h_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrarn_w_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrarn_w_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrarn_bu_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrarn_bu_h ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrarn_hu_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrarn_hu_w ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrarn_wu_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrarn_wu_d ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrln_b_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrln_b_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrln_h_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrln_h_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V2DI, V2DI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrln_w_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrln_w_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrln_bu_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrln_bu_h ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrln_hu_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrln_hu_w ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrln_wu_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrln_wu_d ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrlrn_b_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrlrn_b_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrlrn_h_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrlrn_h_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsrlrn_w_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsrlrn_w_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV16QI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrlrn_bu_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrlrn_bu_h ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrlrn_hu_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrlrn_hu_w ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrlrn_wu_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrlrn_wu_d ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI, UQI. */ +#define __lsx_vfrstpi_b(/*__m128i*/ _1, /*__m128i*/ _2, /*ui5*/ _3) \ + ((__m128i)__builtin_lsx_vfrstpi_b ((v16i8)(_1), (v16i8)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI, UQI. 
*/ +#define __lsx_vfrstpi_h(/*__m128i*/ _1, /*__m128i*/ _2, /*ui5*/ _3) \ + ((__m128i)__builtin_lsx_vfrstpi_h ((v8i16)(_1), (v8i16)(_2), (_3))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfrstp_b (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vfrstp_b ((v16i8)_1, (v16i8)_2, (v16i8)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfrstp_h (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vfrstp_h ((v8i16)_1, (v8i16)_2, (v8i16)_3); +} + +/* Assembly instruction format: vd, vj, ui8. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI, USI. */ +#define __lsx_vshuf4i_d(/*__m128i*/ _1, /*__m128i*/ _2, /*ui8*/ _3) \ + ((__m128i)__builtin_lsx_vshuf4i_d ((v2i64)(_1), (v2i64)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V16QI, V16QI, UQI. */ +#define __lsx_vbsrl_v(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vbsrl_v ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V16QI, V16QI, UQI. */ +#define __lsx_vbsll_v(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vbsll_v ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui8. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI, USI. */ +#define __lsx_vextrins_b(/*__m128i*/ _1, /*__m128i*/ _2, /*ui8*/ _3) \ + ((__m128i)__builtin_lsx_vextrins_b ((v16i8)(_1), (v16i8)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui8. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI, USI. */ +#define __lsx_vextrins_h(/*__m128i*/ _1, /*__m128i*/ _2, /*ui8*/ _3) \ + ((__m128i)__builtin_lsx_vextrins_h ((v8i16)(_1), (v8i16)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui8. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI, USI. */ +#define __lsx_vextrins_w(/*__m128i*/ _1, /*__m128i*/ _2, /*ui8*/ _3) \ + ((__m128i)__builtin_lsx_vextrins_w ((v4i32)(_1), (v4i32)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui8. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI, USI. */ +#define __lsx_vextrins_d(/*__m128i*/ _1, /*__m128i*/ _2, /*ui8*/ _3) \ + ((__m128i)__builtin_lsx_vextrins_d ((v2i64)(_1), (v2i64)(_2), (_3))) + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmskltz_b (__m128i _1) +{ + return (__m128i)__builtin_lsx_vmskltz_b ((v16i8)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmskltz_h (__m128i _1) +{ + return (__m128i)__builtin_lsx_vmskltz_h ((v8i16)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmskltz_w (__m128i _1) +{ + return (__m128i)__builtin_lsx_vmskltz_w ((v4i32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V2DI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmskltz_d (__m128i _1) +{ + return (__m128i)__builtin_lsx_vmskltz_d ((v2i64)_1); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsigncov_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsigncov_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsigncov_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsigncov_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsigncov_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsigncov_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsigncov_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsigncov_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk, va. */ +/* Data types in instruction templates: V4SF, V4SF, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfmadd_s (__m128 _1, __m128 _2, __m128 _3) +{ + return (__m128)__builtin_lsx_vfmadd_s ((v4f32)_1, (v4f32)_2, (v4f32)_3); +} + +/* Assembly instruction format: vd, vj, vk, va. */ +/* Data types in instruction templates: V2DF, V2DF, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfmadd_d (__m128d _1, __m128d _2, __m128d _3) +{ + return (__m128d)__builtin_lsx_vfmadd_d ((v2f64)_1, (v2f64)_2, (v2f64)_3); +} + +/* Assembly instruction format: vd, vj, vk, va. */ +/* Data types in instruction templates: V4SF, V4SF, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfmsub_s (__m128 _1, __m128 _2, __m128 _3) +{ + return (__m128)__builtin_lsx_vfmsub_s ((v4f32)_1, (v4f32)_2, (v4f32)_3); +} + +/* Assembly instruction format: vd, vj, vk, va. */ +/* Data types in instruction templates: V2DF, V2DF, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfmsub_d (__m128d _1, __m128d _2, __m128d _3) +{ + return (__m128d)__builtin_lsx_vfmsub_d ((v2f64)_1, (v2f64)_2, (v2f64)_3); +} + +/* Assembly instruction format: vd, vj, vk, va. */ +/* Data types in instruction templates: V4SF, V4SF, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfnmadd_s (__m128 _1, __m128 _2, __m128 _3) +{ + return (__m128)__builtin_lsx_vfnmadd_s ((v4f32)_1, (v4f32)_2, (v4f32)_3); +} + +/* Assembly instruction format: vd, vj, vk, va. */ +/* Data types in instruction templates: V2DF, V2DF, V2DF, V2DF. 
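*/

/* [Editor's note: illustrative sketch, not part of the vendored header or of
   this patch.  It shows the four-operand fused multiply-add wrapper defined
   above, assuming the usual lane-wise a * b + c semantics of vfmadd.s; the
   name below is hypothetical.]  */

static inline __m128
fma_lanes_sketch (__m128 a, __m128 b, __m128 c)
{
  return __lsx_vfmadd_s (a, b, c);   /* a[i] * b[i] + c[i] for each of the 4 lanes */
}

/* [End of editor's sketch.  The vendored header resumes below with
   __lsx_vfnmadd_d, described by the comments just above this note.]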
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfnmadd_d (__m128d _1, __m128d _2, __m128d _3) +{ + return (__m128d)__builtin_lsx_vfnmadd_d ((v2f64)_1, (v2f64)_2, (v2f64)_3); +} + +/* Assembly instruction format: vd, vj, vk, va. */ +/* Data types in instruction templates: V4SF, V4SF, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfnmsub_s (__m128 _1, __m128 _2, __m128 _3) +{ + return (__m128)__builtin_lsx_vfnmsub_s ((v4f32)_1, (v4f32)_2, (v4f32)_3); +} + +/* Assembly instruction format: vd, vj, vk, va. */ +/* Data types in instruction templates: V2DF, V2DF, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfnmsub_d (__m128d _1, __m128d _2, __m128d _3) +{ + return (__m128d)__builtin_lsx_vfnmsub_d ((v2f64)_1, (v2f64)_2, (v2f64)_3); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrne_w_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftintrne_w_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrne_l_d (__m128d _1) +{ + return (__m128i)__builtin_lsx_vftintrne_l_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrp_w_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftintrp_w_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrp_l_d (__m128d _1) +{ + return (__m128i)__builtin_lsx_vftintrp_l_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrm_w_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftintrm_w_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrm_l_d (__m128d _1) +{ + return (__m128i)__builtin_lsx_vftintrm_l_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftint_w_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vftint_w_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SF, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vffint_s_l (__m128i _1, __m128i _2) +{ + return (__m128)__builtin_lsx_vffint_s_l ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V2DF, V2DF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrz_w_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vftintrz_w_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrp_w_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vftintrp_w_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrm_w_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vftintrm_w_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrne_w_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vftintrne_w_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintl_l_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftintl_l_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftinth_l_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftinth_l_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vffinth_d_w (__m128i _1) +{ + return (__m128d)__builtin_lsx_vffinth_d_w ((v4i32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vffintl_d_w (__m128i _1) +{ + return (__m128d)__builtin_lsx_vffintl_d_w ((v4i32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrzl_l_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftintrzl_l_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrzh_l_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftintrzh_l_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrpl_l_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftintrpl_l_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrph_l_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftintrph_l_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. 
*/ +/* Data types in instruction templates: V2DI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrml_l_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftintrml_l_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrmh_l_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftintrmh_l_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrnel_l_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftintrnel_l_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vftintrneh_l_s (__m128 _1) +{ + return (__m128i)__builtin_lsx_vftintrneh_l_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfrintrne_s (__m128 _1) +{ + return (__m128)__builtin_lsx_vfrintrne_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfrintrne_d (__m128d _1) +{ + return (__m128d)__builtin_lsx_vfrintrne_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfrintrz_s (__m128 _1) +{ + return (__m128)__builtin_lsx_vfrintrz_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfrintrz_d (__m128d _1) +{ + return (__m128d)__builtin_lsx_vfrintrz_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfrintrp_s (__m128 _1) +{ + return (__m128)__builtin_lsx_vfrintrp_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfrintrp_d (__m128d _1) +{ + return (__m128d)__builtin_lsx_vfrintrp_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lsx_vfrintrm_s (__m128 _1) +{ + return (__m128)__builtin_lsx_vfrintrm_s ((v4f32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lsx_vfrintrm_d (__m128d _1) +{ + return (__m128d)__builtin_lsx_vfrintrm_d ((v2f64)_1); +} + +/* Assembly instruction format: vd, rj, si8, idx. */ +/* Data types in instruction templates: VOID, V16QI, CVPOINTER, SI, UQI. 
*/ +#define __lsx_vstelm_b(/*__m128i*/ _1, /*void **/ _2, /*si8*/ _3, /*idx*/ _4) \ + ((void)__builtin_lsx_vstelm_b ((v16i8)(_1), (void *)(_2), (_3), (_4))) + +/* Assembly instruction format: vd, rj, si8, idx. */ +/* Data types in instruction templates: VOID, V8HI, CVPOINTER, SI, UQI. */ +#define __lsx_vstelm_h(/*__m128i*/ _1, /*void **/ _2, /*si8*/ _3, /*idx*/ _4) \ + ((void)__builtin_lsx_vstelm_h ((v8i16)(_1), (void *)(_2), (_3), (_4))) + +/* Assembly instruction format: vd, rj, si8, idx. */ +/* Data types in instruction templates: VOID, V4SI, CVPOINTER, SI, UQI. */ +#define __lsx_vstelm_w(/*__m128i*/ _1, /*void **/ _2, /*si8*/ _3, /*idx*/ _4) \ + ((void)__builtin_lsx_vstelm_w ((v4i32)(_1), (void *)(_2), (_3), (_4))) + +/* Assembly instruction format: vd, rj, si8, idx. */ +/* Data types in instruction templates: VOID, V2DI, CVPOINTER, SI, UQI. */ +#define __lsx_vstelm_d(/*__m128i*/ _1, /*void **/ _2, /*si8*/ _3, /*idx*/ _4) \ + ((void)__builtin_lsx_vstelm_d ((v2i64)(_1), (void *)(_2), (_3), (_4))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwev_d_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwev_d_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwev_w_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwev_w_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwev_h_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwev_h_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwod_d_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwod_d_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwod_w_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwod_w_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwod_h_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwod_h_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwev_d_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwev_d_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, UV8HI, UV8HI. 
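*/

/* [Editor's note: illustrative sketch, not part of the vendored header or of
   this patch.  The vstelm macros above store a single element of a vector to
   memory; both the displacement and the element index map to immediate fields
   and must be integer constants.  The name below is hypothetical.]  */

static inline void
store_lane0_sketch (__m128i v, int *dst)
{
  __lsx_vstelm_w (v, dst, 0, 0);   /* store element 0 of v to *dst */
}

/* [End of editor's sketch.  The vendored header resumes below with
   __lsx_vaddwev_w_hu, described by the comments just above this note.]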
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwev_w_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwev_w_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwev_h_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwev_h_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwod_d_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwod_d_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwod_w_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwod_w_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwod_h_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwod_h_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwev_d_wu_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwev_d_wu_w ((v4u32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, UV8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwev_w_hu_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwev_w_hu_h ((v8u16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, UV16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwev_h_bu_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwev_h_bu_b ((v16u8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwod_d_wu_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwod_d_wu_w ((v4u32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, UV8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwod_w_hu_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwod_w_hu_h ((v8u16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, UV16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwod_h_bu_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwod_h_bu_b ((v16u8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. 
*/ +/* Data types in instruction templates: V2DI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwev_d_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwev_d_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwev_w_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwev_w_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwev_h_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwev_h_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwod_d_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwod_d_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwod_w_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwod_w_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwod_h_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwod_h_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwev_d_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwev_d_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwev_w_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwev_w_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwev_h_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwev_h_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwod_d_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwod_d_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, UV8HI, UV8HI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwod_w_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwod_w_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwod_h_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwod_h_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwev_q_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwev_q_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwod_q_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwod_q_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwev_q_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwev_q_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwod_q_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwod_q_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwev_q_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwev_q_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwod_q_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwod_q_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwev_q_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwev_q_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsubwod_q_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsubwod_q_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwev_q_du_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwev_q_du_d ((v2u64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. 
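*/

/* [Editor's note: illustrative sketch, not part of the vendored header or of
   this patch.  The widening additions above come in "ev"/"od" pairs that
   operate on the even- and odd-indexed source lanes respectively, so covering
   all sixteen byte lanes of two vectors takes both calls; names below are
   hypothetical.]  */

static inline void
widen_add_bytes_sketch (__m128i a, __m128i b, __m128i *even_sums, __m128i *odd_sums)
{
  *even_sums = __lsx_vaddwev_h_b (a, b);   /* 8 x i16 sums of the even byte lanes */
  *odd_sums  = __lsx_vaddwod_h_b (a, b);   /* 8 x i16 sums of the odd byte lanes */
}

/* [End of editor's sketch.  The vendored header resumes below with
   __lsx_vaddwod_q_du_d, whose format comment appears just above this note.]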
*/ +/* Data types in instruction templates: V2DI, UV2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vaddwod_q_du_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vaddwod_q_du_d ((v2u64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwev_d_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwev_d_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwev_w_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwev_w_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwev_h_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwev_h_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwod_d_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwod_d_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwod_w_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwod_w_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwod_h_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwod_h_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwev_d_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwev_d_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwev_w_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwev_w_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwev_h_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwev_h_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV4SI, UV4SI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwod_d_wu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwod_d_wu ((v4u32)_1, (v4u32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwod_w_hu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwod_w_hu ((v8u16)_1, (v8u16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwod_h_bu (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwod_h_bu ((v16u8)_1, (v16u8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwev_d_wu_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwev_d_wu_w ((v4u32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, UV8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwev_w_hu_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwev_w_hu_h ((v8u16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, UV16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwev_h_bu_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwev_h_bu_b ((v16u8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwod_d_wu_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwod_d_wu_w ((v4u32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, UV8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwod_w_hu_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwod_w_hu_h ((v8u16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, UV16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwod_h_bu_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwod_h_bu_b ((v16u8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwev_q_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwev_q_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwod_q_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwod_q_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. 
*/ +/* Data types in instruction templates: V2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwev_q_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwev_q_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwod_q_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwod_q_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwev_q_du_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwev_q_du_d ((v2u64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, UV2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmulwod_q_du_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vmulwod_q_du_d ((v2u64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhaddw_q_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhaddw_q_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhaddw_qu_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhaddw_qu_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhsubw_q_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhsubw_q_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vhsubw_qu_du (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vhsubw_qu_du ((v2u64)_1, (v2u64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwev_d_w (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwev_d_w ((v2i64)_1, (v4i32)_2, (v4i32)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwev_w_h (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwev_w_h ((v4i32)_1, (v8i16)_2, (v8i16)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V16QI, V16QI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwev_h_b (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwev_h_b ((v8i16)_1, (v16i8)_2, (v16i8)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwev_d_wu (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwev_d_wu ((v2u64)_1, (v4u32)_2, (v4u32)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV4SI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwev_w_hu (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwev_w_hu ((v4u32)_1, (v8u16)_2, (v8u16)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV8HI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwev_h_bu (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwev_h_bu ((v8u16)_1, (v16u8)_2, (v16u8)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwod_d_w (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwod_d_w ((v2i64)_1, (v4i32)_2, (v4i32)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwod_w_h (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwod_w_h ((v4i32)_1, (v8i16)_2, (v8i16)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwod_h_b (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwod_h_b ((v8i16)_1, (v16i8)_2, (v16i8)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV4SI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwod_d_wu (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwod_d_wu ((v2u64)_1, (v4u32)_2, (v4u32)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV4SI, UV4SI, UV8HI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwod_w_hu (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwod_w_hu ((v4u32)_1, (v8u16)_2, (v8u16)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV8HI, UV8HI, UV16QI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwod_h_bu (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwod_h_bu ((v8u16)_1, (v16u8)_2, (v16u8)_3); +} + +/* Assembly instruction format: vd, vj, vk. 
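*/

/* [Editor's note: illustrative sketch, not part of the vendored header or of
   this patch.  It assumes the "ev"/"od" widening multiply-accumulate
   intrinsics above multiply the even- and odd-indexed i16 lanes of their
   second and third operands and accumulate into the i32 lanes of the first;
   under that assumption, chaining both gives a pairwise dot-product
   accumulation.  The name below is hypothetical.]  */

static inline __m128i
dot_accumulate_sketch (__m128i acc, __m128i a, __m128i b)
{
  acc = __lsx_vmaddwev_w_h (acc, a, b);   /* acc[i] += a[2i]   * b[2i]   */
  acc = __lsx_vmaddwod_w_h (acc, a, b);   /* acc[i] += a[2i+1] * b[2i+1] */
  return acc;
}

/* [End of editor's sketch.  The vendored header resumes below with
   __lsx_vmaddwev_d_wu_w, whose format comment appears just above this note.]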
*/ +/* Data types in instruction templates: V2DI, V2DI, UV4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwev_d_wu_w (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwev_d_wu_w ((v2i64)_1, (v4u32)_2, (v4i32)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, UV8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwev_w_hu_h (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwev_w_hu_h ((v4i32)_1, (v8u16)_2, (v8i16)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, UV16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwev_h_bu_b (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwev_h_bu_b ((v8i16)_1, (v16u8)_2, (v16i8)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, UV4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwod_d_wu_w (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwod_d_wu_w ((v2i64)_1, (v4u32)_2, (v4i32)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, UV8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwod_w_hu_h (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwod_w_hu_h ((v4i32)_1, (v8u16)_2, (v8i16)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, UV16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwod_h_bu_b (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwod_h_bu_b ((v8i16)_1, (v16u8)_2, (v16i8)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwev_q_d (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwev_q_d ((v2i64)_1, (v2i64)_2, (v2i64)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwod_q_d (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwod_q_d ((v2i64)_1, (v2i64)_2, (v2i64)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwev_q_du (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwev_q_du ((v2u64)_1, (v2u64)_2, (v2u64)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: UV2DI, UV2DI, UV2DI, UV2DI. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwod_q_du (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwod_q_du ((v2u64)_1, (v2u64)_2, (v2u64)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, UV2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwev_q_du_d (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwev_q_du_d ((v2i64)_1, (v2u64)_2, (v2i64)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, UV2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmaddwod_q_du_d (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vmaddwod_q_du_d ((v2i64)_1, (v2u64)_2, (v2i64)_3); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vrotr_b (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vrotr_b ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vrotr_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vrotr_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vrotr_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vrotr_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vrotr_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vrotr_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vadd_q (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vadd_q ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vsub_q (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vsub_q ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, rj, si12. */ +/* Data types in instruction templates: V16QI, CVPOINTER, SI. */ +#define __lsx_vldrepl_b(/*void **/ _1, /*si12*/ _2) \ + ((__m128i)__builtin_lsx_vldrepl_b ((void *)(_1), (_2))) + +/* Assembly instruction format: vd, rj, si11. */ +/* Data types in instruction templates: V8HI, CVPOINTER, SI. */ +#define __lsx_vldrepl_h(/*void **/ _1, /*si11*/ _2) \ + ((__m128i)__builtin_lsx_vldrepl_h ((void *)(_1), (_2))) + +/* Assembly instruction format: vd, rj, si10. */ +/* Data types in instruction templates: V4SI, CVPOINTER, SI. 
*/ +#define __lsx_vldrepl_w(/*void **/ _1, /*si10*/ _2) \ + ((__m128i)__builtin_lsx_vldrepl_w ((void *)(_1), (_2))) + +/* Assembly instruction format: vd, rj, si9. */ +/* Data types in instruction templates: V2DI, CVPOINTER, SI. */ +#define __lsx_vldrepl_d(/*void **/ _1, /*si9*/ _2) \ + ((__m128i)__builtin_lsx_vldrepl_d ((void *)(_1), (_2))) + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmskgez_b (__m128i _1) +{ + return (__m128i)__builtin_lsx_vmskgez_b ((v16i8)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vmsknz_b (__m128i _1) +{ + return (__m128i)__builtin_lsx_vmsknz_b ((v16i8)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V8HI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vexth_h_b (__m128i _1) +{ + return (__m128i)__builtin_lsx_vexth_h_b ((v16i8)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V4SI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vexth_w_h (__m128i _1) +{ + return (__m128i)__builtin_lsx_vexth_w_h ((v8i16)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vexth_d_w (__m128i _1) +{ + return (__m128i)__builtin_lsx_vexth_d_w ((v4i32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vexth_q_d (__m128i _1) +{ + return (__m128i)__builtin_lsx_vexth_q_d ((v2i64)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: UV8HI, UV16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vexth_hu_bu (__m128i _1) +{ + return (__m128i)__builtin_lsx_vexth_hu_bu ((v16u8)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: UV4SI, UV8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vexth_wu_hu (__m128i _1) +{ + return (__m128i)__builtin_lsx_vexth_wu_hu ((v8u16)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: UV2DI, UV4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vexth_du_wu (__m128i _1) +{ + return (__m128i)__builtin_lsx_vexth_du_wu ((v4u32)_1); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vexth_qu_du (__m128i _1) +{ + return (__m128i)__builtin_lsx_vexth_qu_du ((v2u64)_1); +} + +/* Assembly instruction format: vd, vj, ui3. */ +/* Data types in instruction templates: V16QI, V16QI, UQI. */ +#define __lsx_vrotri_b(/*__m128i*/ _1, /*ui3*/ _2) \ + ((__m128i)__builtin_lsx_vrotri_b ((v16i8)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V8HI, V8HI, UQI. 
*/ +#define __lsx_vrotri_h(/*__m128i*/ _1, /*ui4*/ _2) \ + ((__m128i)__builtin_lsx_vrotri_h ((v8i16)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V4SI, V4SI, UQI. */ +#define __lsx_vrotri_w(/*__m128i*/ _1, /*ui5*/ _2) \ + ((__m128i)__builtin_lsx_vrotri_w ((v4i32)(_1), (_2))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: V2DI, V2DI, UQI. */ +#define __lsx_vrotri_d(/*__m128i*/ _1, /*ui6*/ _2) \ + ((__m128i)__builtin_lsx_vrotri_d ((v2i64)(_1), (_2))) + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vextl_q_d (__m128i _1) +{ + return (__m128i)__builtin_lsx_vextl_q_d ((v2i64)_1); +} + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI, USI. */ +#define __lsx_vsrlni_b_h(/*__m128i*/ _1, /*__m128i*/ _2, /*ui4*/ _3) \ + ((__m128i)__builtin_lsx_vsrlni_b_h ((v16i8)(_1), (v16i8)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI, USI. */ +#define __lsx_vsrlni_h_w(/*__m128i*/ _1, /*__m128i*/ _2, /*ui5*/ _3) \ + ((__m128i)__builtin_lsx_vsrlni_h_w ((v8i16)(_1), (v8i16)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI, USI. */ +#define __lsx_vsrlni_w_d(/*__m128i*/ _1, /*__m128i*/ _2, /*ui6*/ _3) \ + ((__m128i)__builtin_lsx_vsrlni_w_d ((v4i32)(_1), (v4i32)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui7. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI, USI. */ +#define __lsx_vsrlni_d_q(/*__m128i*/ _1, /*__m128i*/ _2, /*ui7*/ _3) \ + ((__m128i)__builtin_lsx_vsrlni_d_q ((v2i64)(_1), (v2i64)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI, USI. */ +#define __lsx_vsrlrni_b_h(/*__m128i*/ _1, /*__m128i*/ _2, /*ui4*/ _3) \ + ((__m128i)__builtin_lsx_vsrlrni_b_h ((v16i8)(_1), (v16i8)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI, USI. */ +#define __lsx_vsrlrni_h_w(/*__m128i*/ _1, /*__m128i*/ _2, /*ui5*/ _3) \ + ((__m128i)__builtin_lsx_vsrlrni_h_w ((v8i16)(_1), (v8i16)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI, USI. */ +#define __lsx_vsrlrni_w_d(/*__m128i*/ _1, /*__m128i*/ _2, /*ui6*/ _3) \ + ((__m128i)__builtin_lsx_vsrlrni_w_d ((v4i32)(_1), (v4i32)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui7. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI, USI. */ +#define __lsx_vsrlrni_d_q(/*__m128i*/ _1, /*__m128i*/ _2, /*ui7*/ _3) \ + ((__m128i)__builtin_lsx_vsrlrni_d_q ((v2i64)(_1), (v2i64)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI, USI. */ +#define __lsx_vssrlni_b_h(/*__m128i*/ _1, /*__m128i*/ _2, /*ui4*/ _3) \ + ((__m128i)__builtin_lsx_vssrlni_b_h ((v16i8)(_1), (v16i8)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI, USI. */ +#define __lsx_vssrlni_h_w(/*__m128i*/ _1, /*__m128i*/ _2, /*ui5*/ _3) \ + ((__m128i)__builtin_lsx_vssrlni_h_w ((v8i16)(_1), (v8i16)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui6. 
*/ +/* Data types in instruction templates: V4SI, V4SI, V4SI, USI. */ +#define __lsx_vssrlni_w_d(/*__m128i*/ _1, /*__m128i*/ _2, /*ui6*/ _3) \ + ((__m128i)__builtin_lsx_vssrlni_w_d ((v4i32)(_1), (v4i32)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui7. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI, USI. */ +#define __lsx_vssrlni_d_q(/*__m128i*/ _1, /*__m128i*/ _2, /*ui7*/ _3) \ + ((__m128i)__builtin_lsx_vssrlni_d_q ((v2i64)(_1), (v2i64)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: UV16QI, UV16QI, V16QI, USI. */ +#define __lsx_vssrlni_bu_h(/*__m128i*/ _1, /*__m128i*/ _2, /*ui4*/ _3) \ + ((__m128i)__builtin_lsx_vssrlni_bu_h ((v16u8)(_1), (v16i8)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV8HI, UV8HI, V8HI, USI. */ +#define __lsx_vssrlni_hu_w(/*__m128i*/ _1, /*__m128i*/ _2, /*ui5*/ _3) \ + ((__m128i)__builtin_lsx_vssrlni_hu_w ((v8u16)(_1), (v8i16)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: UV4SI, UV4SI, V4SI, USI. */ +#define __lsx_vssrlni_wu_d(/*__m128i*/ _1, /*__m128i*/ _2, /*ui6*/ _3) \ + ((__m128i)__builtin_lsx_vssrlni_wu_d ((v4u32)(_1), (v4i32)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui7. */ +/* Data types in instruction templates: UV2DI, UV2DI, V2DI, USI. */ +#define __lsx_vssrlni_du_q(/*__m128i*/ _1, /*__m128i*/ _2, /*ui7*/ _3) \ + ((__m128i)__builtin_lsx_vssrlni_du_q ((v2u64)(_1), (v2i64)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI, USI. */ +#define __lsx_vssrlrni_b_h(/*__m128i*/ _1, /*__m128i*/ _2, /*ui4*/ _3) \ + ((__m128i)__builtin_lsx_vssrlrni_b_h ((v16i8)(_1), (v16i8)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI, USI. */ +#define __lsx_vssrlrni_h_w(/*__m128i*/ _1, /*__m128i*/ _2, /*ui5*/ _3) \ + ((__m128i)__builtin_lsx_vssrlrni_h_w ((v8i16)(_1), (v8i16)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI, USI. */ +#define __lsx_vssrlrni_w_d(/*__m128i*/ _1, /*__m128i*/ _2, /*ui6*/ _3) \ + ((__m128i)__builtin_lsx_vssrlrni_w_d ((v4i32)(_1), (v4i32)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui7. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI, USI. */ +#define __lsx_vssrlrni_d_q(/*__m128i*/ _1, /*__m128i*/ _2, /*ui7*/ _3) \ + ((__m128i)__builtin_lsx_vssrlrni_d_q ((v2i64)(_1), (v2i64)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: UV16QI, UV16QI, V16QI, USI. */ +#define __lsx_vssrlrni_bu_h(/*__m128i*/ _1, /*__m128i*/ _2, /*ui4*/ _3) \ + ((__m128i)__builtin_lsx_vssrlrni_bu_h ((v16u8)(_1), (v16i8)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV8HI, UV8HI, V8HI, USI. */ +#define __lsx_vssrlrni_hu_w(/*__m128i*/ _1, /*__m128i*/ _2, /*ui5*/ _3) \ + ((__m128i)__builtin_lsx_vssrlrni_hu_w ((v8u16)(_1), (v8i16)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: UV4SI, UV4SI, V4SI, USI. */ +#define __lsx_vssrlrni_wu_d(/*__m128i*/ _1, /*__m128i*/ _2, /*ui6*/ _3) \ + ((__m128i)__builtin_lsx_vssrlrni_wu_d ((v4u32)(_1), (v4i32)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui7. */ +/* Data types in instruction templates: UV2DI, UV2DI, V2DI, USI. 
*/ +#define __lsx_vssrlrni_du_q(/*__m128i*/ _1, /*__m128i*/ _2, /*ui7*/ _3) \ + ((__m128i)__builtin_lsx_vssrlrni_du_q ((v2u64)(_1), (v2i64)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI, USI. */ +#define __lsx_vsrani_b_h(/*__m128i*/ _1, /*__m128i*/ _2, /*ui4*/ _3) \ + ((__m128i)__builtin_lsx_vsrani_b_h ((v16i8)(_1), (v16i8)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI, USI. */ +#define __lsx_vsrani_h_w(/*__m128i*/ _1, /*__m128i*/ _2, /*ui5*/ _3) \ + ((__m128i)__builtin_lsx_vsrani_h_w ((v8i16)(_1), (v8i16)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI, USI. */ +#define __lsx_vsrani_w_d(/*__m128i*/ _1, /*__m128i*/ _2, /*ui6*/ _3) \ + ((__m128i)__builtin_lsx_vsrani_w_d ((v4i32)(_1), (v4i32)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui7. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI, USI. */ +#define __lsx_vsrani_d_q(/*__m128i*/ _1, /*__m128i*/ _2, /*ui7*/ _3) \ + ((__m128i)__builtin_lsx_vsrani_d_q ((v2i64)(_1), (v2i64)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI, USI. */ +#define __lsx_vsrarni_b_h(/*__m128i*/ _1, /*__m128i*/ _2, /*ui4*/ _3) \ + ((__m128i)__builtin_lsx_vsrarni_b_h ((v16i8)(_1), (v16i8)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI, USI. */ +#define __lsx_vsrarni_h_w(/*__m128i*/ _1, /*__m128i*/ _2, /*ui5*/ _3) \ + ((__m128i)__builtin_lsx_vsrarni_h_w ((v8i16)(_1), (v8i16)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI, USI. */ +#define __lsx_vsrarni_w_d(/*__m128i*/ _1, /*__m128i*/ _2, /*ui6*/ _3) \ + ((__m128i)__builtin_lsx_vsrarni_w_d ((v4i32)(_1), (v4i32)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui7. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI, USI. */ +#define __lsx_vsrarni_d_q(/*__m128i*/ _1, /*__m128i*/ _2, /*ui7*/ _3) \ + ((__m128i)__builtin_lsx_vsrarni_d_q ((v2i64)(_1), (v2i64)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI, USI. */ +#define __lsx_vssrani_b_h(/*__m128i*/ _1, /*__m128i*/ _2, /*ui4*/ _3) \ + ((__m128i)__builtin_lsx_vssrani_b_h ((v16i8)(_1), (v16i8)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI, USI. */ +#define __lsx_vssrani_h_w(/*__m128i*/ _1, /*__m128i*/ _2, /*ui5*/ _3) \ + ((__m128i)__builtin_lsx_vssrani_h_w ((v8i16)(_1), (v8i16)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI, USI. */ +#define __lsx_vssrani_w_d(/*__m128i*/ _1, /*__m128i*/ _2, /*ui6*/ _3) \ + ((__m128i)__builtin_lsx_vssrani_w_d ((v4i32)(_1), (v4i32)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui7. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI, USI. */ +#define __lsx_vssrani_d_q(/*__m128i*/ _1, /*__m128i*/ _2, /*ui7*/ _3) \ + ((__m128i)__builtin_lsx_vssrani_d_q ((v2i64)(_1), (v2i64)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: UV16QI, UV16QI, V16QI, USI. 
*/ +#define __lsx_vssrani_bu_h(/*__m128i*/ _1, /*__m128i*/ _2, /*ui4*/ _3) \ + ((__m128i)__builtin_lsx_vssrani_bu_h ((v16u8)(_1), (v16i8)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV8HI, UV8HI, V8HI, USI. */ +#define __lsx_vssrani_hu_w(/*__m128i*/ _1, /*__m128i*/ _2, /*ui5*/ _3) \ + ((__m128i)__builtin_lsx_vssrani_hu_w ((v8u16)(_1), (v8i16)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: UV4SI, UV4SI, V4SI, USI. */ +#define __lsx_vssrani_wu_d(/*__m128i*/ _1, /*__m128i*/ _2, /*ui6*/ _3) \ + ((__m128i)__builtin_lsx_vssrani_wu_d ((v4u32)(_1), (v4i32)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui7. */ +/* Data types in instruction templates: UV2DI, UV2DI, V2DI, USI. */ +#define __lsx_vssrani_du_q(/*__m128i*/ _1, /*__m128i*/ _2, /*ui7*/ _3) \ + ((__m128i)__builtin_lsx_vssrani_du_q ((v2u64)(_1), (v2i64)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI, USI. */ +#define __lsx_vssrarni_b_h(/*__m128i*/ _1, /*__m128i*/ _2, /*ui4*/ _3) \ + ((__m128i)__builtin_lsx_vssrarni_b_h ((v16i8)(_1), (v16i8)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: V8HI, V8HI, V8HI, USI. */ +#define __lsx_vssrarni_h_w(/*__m128i*/ _1, /*__m128i*/ _2, /*ui5*/ _3) \ + ((__m128i)__builtin_lsx_vssrarni_h_w ((v8i16)(_1), (v8i16)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI, USI. */ +#define __lsx_vssrarni_w_d(/*__m128i*/ _1, /*__m128i*/ _2, /*ui6*/ _3) \ + ((__m128i)__builtin_lsx_vssrarni_w_d ((v4i32)(_1), (v4i32)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui7. */ +/* Data types in instruction templates: V2DI, V2DI, V2DI, USI. */ +#define __lsx_vssrarni_d_q(/*__m128i*/ _1, /*__m128i*/ _2, /*ui7*/ _3) \ + ((__m128i)__builtin_lsx_vssrarni_d_q ((v2i64)(_1), (v2i64)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui4. */ +/* Data types in instruction templates: UV16QI, UV16QI, V16QI, USI. */ +#define __lsx_vssrarni_bu_h(/*__m128i*/ _1, /*__m128i*/ _2, /*ui4*/ _3) \ + ((__m128i)__builtin_lsx_vssrarni_bu_h ((v16u8)(_1), (v16i8)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui5. */ +/* Data types in instruction templates: UV8HI, UV8HI, V8HI, USI. */ +#define __lsx_vssrarni_hu_w(/*__m128i*/ _1, /*__m128i*/ _2, /*ui5*/ _3) \ + ((__m128i)__builtin_lsx_vssrarni_hu_w ((v8u16)(_1), (v8i16)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui6. */ +/* Data types in instruction templates: UV4SI, UV4SI, V4SI, USI. */ +#define __lsx_vssrarni_wu_d(/*__m128i*/ _1, /*__m128i*/ _2, /*ui6*/ _3) \ + ((__m128i)__builtin_lsx_vssrarni_wu_d ((v4u32)(_1), (v4i32)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui7. */ +/* Data types in instruction templates: UV2DI, UV2DI, V2DI, USI. */ +#define __lsx_vssrarni_du_q(/*__m128i*/ _1, /*__m128i*/ _2, /*ui7*/ _3) \ + ((__m128i)__builtin_lsx_vssrarni_du_q ((v2u64)(_1), (v2i64)(_2), (_3))) + +/* Assembly instruction format: vd, vj, ui8. */ +/* Data types in instruction templates: V4SI, V4SI, V4SI, USI. */ +#define __lsx_vpermi_w(/*__m128i*/ _1, /*__m128i*/ _2, /*ui8*/ _3) \ + ((__m128i)__builtin_lsx_vpermi_w ((v4i32)(_1), (v4i32)(_2), (_3))) + +/* Assembly instruction format: vd, rj, si12. */ +/* Data types in instruction templates: V16QI, CVPOINTER, SI. 
*/ +#define __lsx_vld(/*void **/ _1, /*si12*/ _2) \ + ((__m128i)__builtin_lsx_vld ((void *)(_1), (_2))) + +/* Assembly instruction format: vd, rj, si12. */ +/* Data types in instruction templates: VOID, V16QI, CVPOINTER, SI. */ +#define __lsx_vst(/*__m128i*/ _1, /*void **/ _2, /*si12*/ _3) \ + ((void)__builtin_lsx_vst ((v16i8)(_1), (void *)(_2), (_3))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrlrn_b_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrlrn_b_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrlrn_h_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrlrn_h_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrlrn_w_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrlrn_w_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V8HI, V8HI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrln_b_h (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrln_b_h ((v8i16)_1, (v8i16)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V8HI, V4SI, V4SI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrln_h_w (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrln_h_w ((v4i32)_1, (v4i32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vssrln_w_d (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vssrln_w_d ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vorn_v (__m128i _1, __m128i _2) +{ + return (__m128i)__builtin_lsx_vorn_v ((v16i8)_1, (v16i8)_2); +} + +/* Assembly instruction format: vd, i13. */ +/* Data types in instruction templates: V2DI, HI. */ +#define __lsx_vldi(/*i13*/ _1) \ + ((__m128i)__builtin_lsx_vldi ((_1))) + +/* Assembly instruction format: vd, vj, vk, va. */ +/* Data types in instruction templates: V16QI, V16QI, V16QI, V16QI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vshuf_b (__m128i _1, __m128i _2, __m128i _3) +{ + return (__m128i)__builtin_lsx_vshuf_b ((v16i8)_1, (v16i8)_2, (v16i8)_3); +} + +/* Assembly instruction format: vd, rj, rk. */ +/* Data types in instruction templates: V16QI, CVPOINTER, DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vldx (void * _1, long int _2) +{ + return (__m128i)__builtin_lsx_vldx ((void *)_1, (long int)_2); +} + +/* Assembly instruction format: vd, rj, rk. 
*/ +/* Data types in instruction templates: VOID, V16QI, CVPOINTER, DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +void __lsx_vstx (__m128i _1, void * _2, long int _3) +{ + return (void)__builtin_lsx_vstx ((v16i8)_1, (void *)_2, (long int)_3); +} + +/* Assembly instruction format: vd, vj. */ +/* Data types in instruction templates: UV2DI, UV2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vextl_qu_du (__m128i _1) +{ + return (__m128i)__builtin_lsx_vextl_qu_du ((v2u64)_1); +} + +/* Assembly instruction format: cd, vj. */ +/* Data types in instruction templates: SI, UV16QI. */ +#define __lsx_bnz_b(/*__m128i*/ _1) \ + ((int)__builtin_lsx_bnz_b ((v16u8)(_1))) + +/* Assembly instruction format: cd, vj. */ +/* Data types in instruction templates: SI, UV2DI. */ +#define __lsx_bnz_d(/*__m128i*/ _1) \ + ((int)__builtin_lsx_bnz_d ((v2u64)(_1))) + +/* Assembly instruction format: cd, vj. */ +/* Data types in instruction templates: SI, UV8HI. */ +#define __lsx_bnz_h(/*__m128i*/ _1) \ + ((int)__builtin_lsx_bnz_h ((v8u16)(_1))) + +/* Assembly instruction format: cd, vj. */ +/* Data types in instruction templates: SI, UV16QI. */ +#define __lsx_bnz_v(/*__m128i*/ _1) \ + ((int)__builtin_lsx_bnz_v ((v16u8)(_1))) + +/* Assembly instruction format: cd, vj. */ +/* Data types in instruction templates: SI, UV4SI. */ +#define __lsx_bnz_w(/*__m128i*/ _1) \ + ((int)__builtin_lsx_bnz_w ((v4u32)(_1))) + +/* Assembly instruction format: cd, vj. */ +/* Data types in instruction templates: SI, UV16QI. */ +#define __lsx_bz_b(/*__m128i*/ _1) \ + ((int)__builtin_lsx_bz_b ((v16u8)(_1))) + +/* Assembly instruction format: cd, vj. */ +/* Data types in instruction templates: SI, UV2DI. */ +#define __lsx_bz_d(/*__m128i*/ _1) \ + ((int)__builtin_lsx_bz_d ((v2u64)(_1))) + +/* Assembly instruction format: cd, vj. */ +/* Data types in instruction templates: SI, UV8HI. */ +#define __lsx_bz_h(/*__m128i*/ _1) \ + ((int)__builtin_lsx_bz_h ((v8u16)(_1))) + +/* Assembly instruction format: cd, vj. */ +/* Data types in instruction templates: SI, UV16QI. */ +#define __lsx_bz_v(/*__m128i*/ _1) \ + ((int)__builtin_lsx_bz_v ((v16u8)(_1))) + +/* Assembly instruction format: cd, vj. */ +/* Data types in instruction templates: SI, UV4SI. */ +#define __lsx_bz_w(/*__m128i*/ _1) \ + ((int)__builtin_lsx_bz_w ((v4u32)(_1))) + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_caf_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_caf_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_caf_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_caf_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_ceq_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_ceq_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_ceq_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_ceq_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cle_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cle_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cle_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cle_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_clt_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_clt_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_clt_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_clt_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cne_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cne_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cne_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cne_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cor_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cor_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cor_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cor_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cueq_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cueq_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cueq_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cueq_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cule_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cule_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cule_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cule_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cult_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cult_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cult_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cult_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cun_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cun_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cune_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cune_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cune_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cune_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_cun_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_cun_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_saf_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_saf_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_saf_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_saf_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_seq_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_seq_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_seq_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_seq_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sle_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sle_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sle_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sle_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_slt_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_slt_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_slt_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_slt_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sne_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sne_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sne_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sne_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sor_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sor_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sor_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sor_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sueq_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sueq_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sueq_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sueq_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sule_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sule_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sule_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sule_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sult_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sult_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sult_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sult_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sun_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sun_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V2DI, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sune_d (__m128d _1, __m128d _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sune_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sune_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sune_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, vj, vk. */ +/* Data types in instruction templates: V4SI, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lsx_vfcmp_sun_s (__m128 _1, __m128 _2) +{ + return (__m128i)__builtin_lsx_vfcmp_sun_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: vd, si10. */ +/* Data types in instruction templates: V16QI, HI. */ +#define __lsx_vrepli_b(/*si10*/ _1) \ + ((__m128i)__builtin_lsx_vrepli_b ((_1))) + +/* Assembly instruction format: vd, si10. */ +/* Data types in instruction templates: V2DI, HI. */ +#define __lsx_vrepli_d(/*si10*/ _1) \ + ((__m128i)__builtin_lsx_vrepli_d ((_1))) + +/* Assembly instruction format: vd, si10. */ +/* Data types in instruction templates: V8HI, HI. */ +#define __lsx_vrepli_h(/*si10*/ _1) \ + ((__m128i)__builtin_lsx_vrepli_h ((_1))) + +/* Assembly instruction format: vd, si10. */ +/* Data types in instruction templates: V4SI, HI. 
*/ +#define __lsx_vrepli_w(/*si10*/ _1) \ + ((__m128i)__builtin_lsx_vrepli_w ((_1))) + +#endif /* defined(__loongarch_sx) */ +#endif /* _GCC_LOONGSON_SXINTRIN_H */ diff --git a/library/stdarch/crates/stdarch-gen-loongarch/src/main.rs b/library/stdarch/crates/stdarch-gen-loongarch/src/main.rs new file mode 100644 index 0000000000000..aa9990b6ccd13 --- /dev/null +++ b/library/stdarch/crates/stdarch-gen-loongarch/src/main.rs @@ -0,0 +1,1551 @@ +use std::env; +use std::fmt; +use std::fs::File; +use std::io::prelude::*; +use std::io::{self, BufReader}; +use std::path::PathBuf; + +/// Complete lines of generated source. +/// +/// This enables common generation tasks to be factored out without precluding basic +/// context-specific formatting. +/// +/// The convention in this generator is to prefix (not suffix) lines with a newline, so the +/// implementation of `std::fmt::Display` behaves in the same way. +struct Lines { + indent: usize, + lines: Vec<String>, +} + +impl Lines { + fn single(line: String) -> Self { + Self::from(vec![line]) + } +} + +impl From<Vec<String>> for Lines { + fn from(lines: Vec<String>) -> Self { + Self { indent: 0, lines } + } +} + +impl std::fmt::Display for Lines { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> fmt::Result { + for line in self.lines.iter() { + write!(f, "\n{:width$}{line}", "", width = self.indent)?; + } + Ok(()) + } +} + +#[derive(Clone, Copy, PartialEq)] +enum TargetFeature { + Lsx, + Lasx, +} + +impl TargetFeature { + fn new(ext: &str) -> TargetFeature { + match ext { + "lasx" => Self::Lasx, + _ => Self::Lsx, + } + } + + /// A string for use with `#[target_feature(...)]`. + fn as_target_feature_arg(&self, ins: &str) -> String { + let vec = match *self { + // Features included with LoongArch64 LSX and LASX. + Self::Lsx => "lsx", + Self::Lasx => "lasx", + }; + let frecipe = match ins { + "lsx_vfrecipe_s" | "lsx_vfrecipe_d" | "lsx_vfrsqrte_s" | "lsx_vfrsqrte_d" + | "lasx_xvfrecipe_s" | "lasx_xvfrecipe_d" | "lasx_xvfrsqrte_s" | "lasx_xvfrsqrte_d" => { + ",frecipe" + } + _ => "", + }; + format!("{vec}{frecipe}") + } + + fn attr(name: &str, value: impl fmt::Display) -> String { + format!(r#"#[{name}(enable = "{value}")]"#) + } + + /// Generate a target_feature attribute + fn to_target_feature_attr(self, ins: &str) -> Lines { + Lines::single(Self::attr( + "target_feature", + self.as_target_feature_arg(ins), + )) + } + + fn bytes(&self) -> u8 { + match *self { + // Features included with LoongArch64 LSX and LASX. + Self::Lsx => 16, + Self::Lasx => 32, + } + } +} + +fn gen_spec(in_file: String, ext_name: &str) -> io::Result<()> { + let f = File::open(in_file.clone()).unwrap_or_else(|_| panic!("Failed to open {in_file}")); + let f = BufReader::new(f); + let mut out = format!( + r#"// This code is automatically generated. DO NOT MODIFY.
+// ``` +// OUT_DIR=`pwd`/crates/stdarch-gen-loongarch cargo run -p stdarch-gen-loongarch -- {in_file} +// ``` +"# + ); + out.push('\n'); + + let mut asm_fmts = String::new(); + let mut data_types = String::new(); + let fn_pat = format!("__{ext_name}_"); + for line in f.lines() { + let line = line.unwrap(); + if line.is_empty() { + continue; + } + + if let Some(s) = line.find("/* Assembly instruction format:") { + let e = line.find('.').unwrap(); + asm_fmts = line.get(s + 31..e).unwrap().trim().to_string(); + } else if let Some(s) = line.find("/* Data types in instruction templates:") { + let e = line.find('.').unwrap(); + data_types = line.get(s + 39..e).unwrap().trim().to_string(); + } else if let Some(s) = line.find(fn_pat.as_str()) { + let e = line.find('(').unwrap(); + let name = line.get(s + 2..e).unwrap().trim().to_string(); + out.push_str(&format!("/// {name}\n")); + out.push_str(&format!("name = {name}\n")); + out.push_str(&format!("asm-fmts = {asm_fmts}\n")); + out.push_str(&format!("data-types = {data_types}\n")); + out.push('\n'); + } + } + + let out_dir_path: PathBuf = PathBuf::from(env::var("OUT_DIR").unwrap()); + std::fs::create_dir_all(&out_dir_path)?; + let mut f = File::create(out_dir_path.join(format!("{ext_name}.spec")))?; + f.write_all(out.as_bytes())?; + Ok(()) +} + +fn gen_bind(in_file: String, ext_name: &str) -> io::Result<()> { + let f = File::open(in_file.clone()).unwrap_or_else(|_| panic!("Failed to open {in_file}")); + let f = BufReader::new(f); + + let target: TargetFeature = TargetFeature::new(ext_name); + let mut para_num; + let mut current_name: Option<String> = None; + let mut asm_fmts: Vec<String> = Vec::new(); + let mut link_function_str = String::new(); + let mut function_str = String::new(); + let mut out = String::new(); + + out.push_str(&format!( + r#"// This code is automatically generated. DO NOT MODIFY. +// +// Instead, modify `{in_file}` and run the following command to re-generate this file: +// +// ``` +// OUT_DIR=`pwd`/crates/core_arch cargo run -p stdarch-gen-loongarch -- {in_file} +// ``` + +use super::types::*; +"# + )); + + out.push_str( + r#" +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { +"#, + ); + + for line in f.lines() { + let line = line.unwrap(); + if line.is_empty() { + continue; + } + if let Some(name) = line.strip_prefix("name = ") { + current_name = Some(String::from(name)); + } else if line.starts_with("asm-fmts = ") { + asm_fmts = line[10..] + .split(',') + .map(|v| v.trim().to_string()) + .collect(); + } else if line.starts_with("data-types = ") { + let current_name = current_name.clone().unwrap(); + let data_types: Vec<&str> = line + .get(12..)
+ .unwrap() + .split(',') + .map(|e| e.trim()) + .collect(); + let in_t; + let out_t; + if data_types.len() == 2 { + in_t = [data_types[1], "NULL", "NULL", "NULL"]; + out_t = data_types[0]; + para_num = 1; + } else if data_types.len() == 3 { + in_t = [data_types[1], data_types[2], "NULL", "NULL"]; + out_t = data_types[0]; + para_num = 2; + } else if data_types.len() == 4 { + in_t = [data_types[1], data_types[2], data_types[3], "NULL"]; + out_t = data_types[0]; + para_num = 3; + } else if data_types.len() == 5 { + in_t = [data_types[1], data_types[2], data_types[3], data_types[4]]; + out_t = data_types[0]; + para_num = 4; + } else { + panic!("DEBUG: line: {0} len: {1}", line, data_types.len()); + } + + let (link_function, function) = + gen_bind_body(&current_name, &asm_fmts, &in_t, out_t, para_num, target); + link_function_str.push_str(&link_function); + function_str.push_str(&function); + } + } + out.push_str(&link_function_str); + out.push_str("}\n"); + out.push_str(&function_str); + + let out_path: PathBuf = + PathBuf::from(env::var("OUT_DIR").unwrap_or("crates/core_arch".to_string())) + .join("src") + .join("loongarch64") + .join(ext_name); + std::fs::create_dir_all(&out_path)?; + + let mut file = File::create(out_path.join("generated.rs"))?; + file.write_all(out.as_bytes())?; + Ok(()) +} + +fn gen_bind_body( + current_name: &str, + asm_fmts: &[String], + in_t: &[&str; 4], + out_t: &str, + para_num: i32, + target: TargetFeature, +) -> (String, String) { + let type_to_rst = |t: &str, s: bool| -> &str { + match (t, s) { + ("V16QI", _) => "v16i8", + ("V32QI", _) => "v32i8", + ("V8HI", _) => "v8i16", + ("V16HI", _) => "v16i16", + ("V4SI", _) => "v4i32", + ("V8SI", _) => "v8i32", + ("V2DI", _) => "v2i64", + ("V4DI", _) => "v4i64", + ("UV16QI", _) => "v16u8", + ("UV32QI", _) => "v32u8", + ("UV8HI", _) => "v8u16", + ("UV16HI", _) => "v16u16", + ("UV4SI", _) => "v4u32", + ("UV8SI", _) => "v8u32", + ("UV2DI", _) => "v2u64", + ("UV4DI", _) => "v4u64", + ("SI", _) => "i32", + ("DI", _) => "i64", + ("USI", _) => "u32", + ("UDI", _) => "u64", + ("V4SF", _) => "v4f32", + ("V8SF", _) => "v8f32", + ("V2DF", _) => "v2f64", + ("V4DF", _) => "v4f64", + ("UQI", _) => "u32", + ("QI", _) => "i32", + ("CVPOINTER", false) => "*const i8", + ("CVPOINTER", true) => "*mut i8", + ("HI", _) => "i32", + (_, _) => panic!("unknown type: {t}"), + } + }; + + let is_store = current_name.to_string().contains("vst"); + let link_function = { + let fn_decl = { + let fn_output = if out_t.to_lowercase() == "void" { + String::new() + } else { + format!("-> {}", type_to_rst(out_t, is_store)) + }; + let fn_inputs = match para_num { + 1 => format!("(a: {})", type_to_rst(in_t[0], is_store)), + 2 => format!( + "(a: {}, b: {})", + type_to_rst(in_t[0], is_store), + type_to_rst(in_t[1], is_store) + ), + 3 => format!( + "(a: {}, b: {}, c: {})", + type_to_rst(in_t[0], is_store), + type_to_rst(in_t[1], is_store), + type_to_rst(in_t[2], is_store) + ), + 4 => format!( + "(a: {}, b: {}, c: {}, d: {})", + type_to_rst(in_t[0], is_store), + type_to_rst(in_t[1], is_store), + type_to_rst(in_t[2], is_store), + type_to_rst(in_t[3], is_store) + ), + _ => panic!("unsupported parameter number"), + }; + format!("fn __{current_name}{fn_inputs} {fn_output};") + }; + let function = format!( + r#" #[link_name = "llvm.loongarch.{}"] + {fn_decl} +"#, + current_name.replace('_', ".") + ); + function + }; + + let type_to_imm = |t| -> i8 { + match t { + 'b' => 4, + 'h' => 3, + 'w' => 2, + 'd' => 1, + _ => panic!("unsupported type"), + } + }; + let mut
rustc_legacy_const_generics = ""; + let fn_decl = { + let fn_output = if out_t.to_lowercase() == "void" { + String::new() + } else { + format!("-> {} ", type_to_rst(out_t, is_store)) + }; + let mut fn_inputs = match para_num { + 1 => format!("(a: {})", type_to_rst(in_t[0], is_store)), + 2 => format!( + "(a: {}, b: {})", + type_to_rst(in_t[0], is_store), + type_to_rst(in_t[1], is_store) + ), + 3 => format!( + "(a: {}, b: {}, c: {})", + type_to_rst(in_t[0], is_store), + type_to_rst(in_t[1], is_store), + type_to_rst(in_t[2], is_store) + ), + 4 => format!( + "(a: {}, b: {}, c: {}, d: {})", + type_to_rst(in_t[0], is_store), + type_to_rst(in_t[1], is_store), + type_to_rst(in_t[2], is_store), + type_to_rst(in_t[3], is_store) + ), + _ => panic!("unsupported parameter number"), + }; + if para_num == 1 && in_t[0] == "HI" { + fn_inputs = match asm_fmts[1].as_str() { + "si13" | "i13" => format!("()", type_to_rst(in_t[0], is_store)), + "si10" => format!("()", type_to_rst(in_t[0], is_store)), + _ => panic!("unsupported assembly format: {}", asm_fmts[1]), + }; + rustc_legacy_const_generics = "rustc_legacy_const_generics(0)"; + } else if para_num == 2 && (in_t[1] == "UQI" || in_t[1] == "USI") { + fn_inputs = if asm_fmts[2].starts_with("ui") { + format!( + "(a: {0})", + type_to_rst(in_t[0], is_store), + type_to_rst(in_t[1], is_store), + asm_fmts[2].get(2..).unwrap() + ) + } else { + panic!("unsupported assembly format: {}", asm_fmts[2]); + }; + rustc_legacy_const_generics = "rustc_legacy_const_generics(1)"; + } else if para_num == 2 && in_t[1] == "QI" { + fn_inputs = if asm_fmts[2].starts_with("si") { + format!( + "(a: {0})", + type_to_rst(in_t[0], is_store), + type_to_rst(in_t[1], is_store), + asm_fmts[2].get(2..).unwrap() + ) + } else { + panic!("unsupported assembly format: {}", asm_fmts[2]); + }; + rustc_legacy_const_generics = "rustc_legacy_const_generics(1)"; + } else if para_num == 2 && in_t[0] == "CVPOINTER" && in_t[1] == "SI" { + fn_inputs = if asm_fmts[2].starts_with("si") { + format!( + "(mem_addr: {0})", + type_to_rst(in_t[0], is_store), + type_to_rst(in_t[1], is_store), + asm_fmts[2].get(2..).unwrap() + ) + } else { + panic!("unsupported assembly format: {}", asm_fmts[2]); + }; + rustc_legacy_const_generics = "rustc_legacy_const_generics(1)"; + } else if para_num == 2 && in_t[0] == "CVPOINTER" && in_t[1] == "DI" { + fn_inputs = match asm_fmts[2].as_str() { + "rk" => format!( + "(mem_addr: {}, b: {})", + type_to_rst(in_t[0], is_store), + type_to_rst(in_t[1], is_store) + ), + _ => panic!("unsupported assembly format: {}", asm_fmts[2]), + }; + } else if para_num == 3 && (in_t[2] == "USI" || in_t[2] == "UQI") { + fn_inputs = if asm_fmts[2].starts_with("ui") { + format!( + "(a: {0}, b: {1})", + type_to_rst(in_t[0], is_store), + type_to_rst(in_t[1], is_store), + type_to_rst(in_t[2], is_store), + asm_fmts[2].get(2..).unwrap() + ) + } else { + panic!("unsupported assembly format: {}", asm_fmts[2]) + }; + rustc_legacy_const_generics = "rustc_legacy_const_generics(2)"; + } else if para_num == 3 && in_t[1] == "CVPOINTER" && in_t[2] == "SI" { + fn_inputs = match asm_fmts[2].as_str() { + "si12" => format!( + "(a: {0}, mem_addr: {1})", + type_to_rst(in_t[0], is_store), + type_to_rst(in_t[1], is_store), + type_to_rst(in_t[2], is_store) + ), + _ => panic!("unsupported assembly format: {}", asm_fmts[2]), + }; + rustc_legacy_const_generics = "rustc_legacy_const_generics(2)"; + } else if para_num == 3 && in_t[1] == "CVPOINTER" && in_t[2] == "DI" { + fn_inputs = match asm_fmts[2].as_str() { + "rk" => format!( + 
"(a: {}, mem_addr: {}, b: {})", + type_to_rst(in_t[0], is_store), + type_to_rst(in_t[1], is_store), + type_to_rst(in_t[2], is_store) + ), + _ => panic!("unsupported assembly format: {}", asm_fmts[2]), + }; + } else if para_num == 4 { + fn_inputs = match (asm_fmts[2].as_str(), current_name.chars().last().unwrap()) { + ("si8", t) => format!( + "(a: {0}, mem_addr: {1})", + type_to_rst(in_t[0], is_store), + type_to_rst(in_t[1], is_store), + type_to_rst(in_t[2], is_store), + type_to_rst(in_t[3], is_store), + type_to_imm(t), + ), + (_, _) => panic!( + "unsupported assembly format: {} for {}", + asm_fmts[2], current_name + ), + }; + rustc_legacy_const_generics = "rustc_legacy_const_generics(2, 3)"; + } + format!("pub unsafe fn {current_name}{fn_inputs} {fn_output}") + }; + let mut call_params = { + match para_num { + 1 => format!("__{current_name}(a)"), + 2 => format!("__{current_name}(a, b)"), + 3 => format!("__{current_name}(a, b, c)"), + 4 => format!("__{current_name}(a, b, c, d)"), + _ => panic!("unsupported parameter number"), + } + }; + if para_num == 1 && in_t[0] == "HI" { + call_params = match asm_fmts[1].as_str() { + "si10" => { + format!("static_assert_simm_bits!(IMM_S10, 10);\n __{current_name}(IMM_S10)") + } + "i13" => { + format!("static_assert_simm_bits!(IMM_S13, 13);\n __{current_name}(IMM_S13)") + } + _ => panic!("unsupported assembly format: {}", asm_fmts[2]), + } + } else if para_num == 2 && (in_t[1] == "UQI" || in_t[1] == "USI") { + call_params = if asm_fmts[2].starts_with("ui") { + format!( + "static_assert_uimm_bits!(IMM{0}, {0});\n __{current_name}(a, IMM{0})", + asm_fmts[2].get(2..).unwrap() + ) + } else { + panic!("unsupported assembly format: {}", asm_fmts[2]) + }; + } else if para_num == 2 && in_t[1] == "QI" { + call_params = match asm_fmts[2].as_str() { + "si5" => { + format!("static_assert_simm_bits!(IMM_S5, 5);\n __{current_name}(a, IMM_S5)") + } + _ => panic!("unsupported assembly format: {}", asm_fmts[2]), + }; + } else if para_num == 2 && in_t[0] == "CVPOINTER" && in_t[1] == "SI" { + call_params = if asm_fmts[2].starts_with("si") { + format!( + "static_assert_simm_bits!(IMM_S{0}, {0});\n __{current_name}(mem_addr, IMM_S{0})", + asm_fmts[2].get(2..).unwrap() + ) + } else { + panic!("unsupported assembly format: {}", asm_fmts[2]) + } + } else if para_num == 2 && in_t[0] == "CVPOINTER" && in_t[1] == "DI" { + call_params = match asm_fmts[2].as_str() { + "rk" => format!("__{current_name}(mem_addr, b)"), + _ => panic!("unsupported assembly format: {}", asm_fmts[2]), + }; + } else if para_num == 3 && (in_t[2] == "USI" || in_t[2] == "UQI") { + call_params = if asm_fmts[2].starts_with("ui") { + format!( + "static_assert_uimm_bits!(IMM{0}, {0});\n __{current_name}(a, b, IMM{0})", + asm_fmts[2].get(2..).unwrap() + ) + } else { + panic!("unsupported assembly format: {}", asm_fmts[2]) + } + } else if para_num == 3 && in_t[1] == "CVPOINTER" && in_t[2] == "SI" { + call_params = match asm_fmts[2].as_str() { + "si12" => format!( + "static_assert_simm_bits!(IMM_S12, 12);\n __{current_name}(a, mem_addr, IMM_S12)" + ), + _ => panic!("unsupported assembly format: {}", asm_fmts[2]), + }; + } else if para_num == 3 && in_t[1] == "CVPOINTER" && in_t[2] == "DI" { + call_params = match asm_fmts[2].as_str() { + "rk" => format!("__{current_name}(a, mem_addr, b)"), + _ => panic!("unsupported assembly format: {}", asm_fmts[2]), + }; + } else if para_num == 4 { + call_params = match (asm_fmts[2].as_str(), current_name.chars().last().unwrap()) { + ("si8", t) => format!( + 
"static_assert_simm_bits!(IMM_S8, 8);\n static_assert_uimm_bits!(IMM{0}, {0});\n __{current_name}(a, mem_addr, IMM_S8, IMM{0})", + type_to_imm(t) + ), + (_, _) => panic!( + "unsupported assembly format: {} for {}", + asm_fmts[2], current_name + ), + } + } + let function = if !rustc_legacy_const_generics.is_empty() { + format!( + r#" +#[inline]{target_feature} +#[{rustc_legacy_const_generics}] +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +{fn_decl}{{ + {call_params} +}} +"#, + target_feature = target.to_target_feature_attr(current_name) + ) + } else { + format!( + r#" +#[inline]{target_feature} +#[unstable(feature = "stdarch_loongarch", issue = "117427")] +{fn_decl}{{ + {call_params} +}} +"#, + target_feature = target.to_target_feature_attr(current_name) + ) + }; + (link_function, function) +} + +fn gen_test(in_file: String, ext_name: &str) -> io::Result<()> { + let f = File::open(in_file.clone()).unwrap_or_else(|_| panic!("Failed to open {in_file}")); + let f = BufReader::new(f); + + let target: TargetFeature = TargetFeature::new(ext_name); + let mut para_num; + let mut current_name: Option = None; + let mut asm_fmts: Vec = Vec::new(); + let mut impl_function_str = String::new(); + let mut call_function_str = String::new(); + let mut out = String::new(); + + out.push_str(&format!( + r#"/* + * This code is automatically generated. DO NOT MODIFY. + * + * Instead, modify `{in_file}` and run the following command to re-generate this file: + * + * ``` + * OUT_DIR=`pwd`/crates/stdarch-gen-loongarch cargo run -p stdarch-gen-loongarch -- {in_file} test + * ``` + */ + +#include +#include +#include +#include + +union v16qi +{{ + __m128i v; + int64_t i64[2]; + int8_t i8[16]; +}}; + +union v32qi +{{ + __m256i v; + int64_t i64[4]; + int8_t i8[32]; +}}; + +union v8hi +{{ + __m128i v; + int64_t i64[2]; + int16_t i16[8]; +}}; + +union v16hi +{{ + __m256i v; + int64_t i64[4]; + int16_t i16[16]; +}}; + +union v4si +{{ + __m128i v; + int64_t i64[2]; + int32_t i32[4]; +}}; + +union v8si +{{ + __m256i v; + int64_t i64[4]; + int32_t i32[8]; +}}; + +union v2di +{{ + __m128i v; + int64_t i64[2]; +}}; + +union v4di +{{ + __m256i v; + int64_t i64[4]; +}}; + +union uv16qi +{{ + __m128i v; + uint64_t i64[2]; + uint8_t i8[16]; +}}; + +union uv32qi +{{ + __m256i v; + uint64_t i64[4]; + uint8_t i8[32]; +}}; + +union uv8hi +{{ + __m128i v; + uint64_t i64[2]; + uint16_t i16[8]; +}}; + +union uv16hi +{{ + __m256i v; + uint64_t i64[4]; + uint16_t i16[16]; +}}; + +union uv4si +{{ + __m128i v; + uint64_t i64[2]; + uint32_t i32[4]; +}}; + +union uv8si +{{ + __m256i v; + uint64_t i64[4]; + uint32_t i32[8]; +}}; + +union uv2di +{{ + __m128i v; + uint64_t i64[2]; +}}; + +union uv4di +{{ + __m256i v; + uint64_t i64[4]; +}}; + +union v4sf +{{ + __m128 v; + int64_t i64[2]; + uint32_t i32[2]; + float f32[4]; +}}; + +union v8sf +{{ + __m256 v; + int64_t i64[4]; + uint32_t i32[4]; + float f32[8]; +}}; + +union v2df +{{ + __m128d v; + uint64_t i64[2]; + double f64[2]; +}}; + +union v4df +{{ + __m256d v; + uint64_t i64[4]; + double f64[4]; +}}; +"# + )); + + for line in f.lines() { + let line = line.unwrap(); + if line.is_empty() { + continue; + } + if let Some(name) = line.strip_prefix("name = ") { + current_name = Some(String::from(name)); + } else if line.starts_with("asm-fmts = ") { + asm_fmts = line[10..] + .split(',') + .map(|v| v.trim().to_string()) + .collect(); + } else if line.starts_with("data-types = ") { + let current_name = current_name.clone().unwrap(); + let data_types: Vec<&str> = line + .get(12..) 
+ .unwrap() + .split(',') + .map(|e| e.trim()) + .collect(); + let in_t; + let out_t; + if data_types.len() == 2 { + in_t = [data_types[1], "NULL", "NULL", "NULL"]; + out_t = data_types[0]; + para_num = 1; + } else if data_types.len() == 3 { + in_t = [data_types[1], data_types[2], "NULL", "NULL"]; + out_t = data_types[0]; + para_num = 2; + } else if data_types.len() == 4 { + in_t = [data_types[1], data_types[2], data_types[3], "NULL"]; + out_t = data_types[0]; + para_num = 3; + } else if data_types.len() == 5 { + in_t = [data_types[1], data_types[2], data_types[3], data_types[4]]; + out_t = data_types[0]; + para_num = 4; + } else { + panic!("DEBUG: line: {0} len: {1}", line, data_types.len()); + } + + let (link_function, function) = + gen_test_body(¤t_name, &asm_fmts, &in_t, out_t, para_num, target); + impl_function_str.push_str(&link_function); + call_function_str.push_str(&function); + } + } + out.push_str(&impl_function_str); + out.push('\n'); + out.push_str("int main(int argc, char *argv[])\n"); + out.push_str("{\n"); + out.push_str(" printf(\"// This code is automatically generated. DO NOT MODIFY.\\n\");\n"); + out.push_str(" printf(\"// See crates/stdarch-gen-loongarch/README.md\\n\\n\");\n"); + out.push_str(" printf(\"use crate::{\\n\");\n"); + out.push_str(" printf(\" core_arch::{loongarch64::*, simd::*},\\n\");\n"); + out.push_str(" printf(\" mem::transmute,\\n\");\n"); + out.push_str(" printf(\"};\\n\");\n"); + out.push_str(" printf(\"use stdarch_test::simd_test;\\n\");\n"); + out.push_str(&call_function_str); + out.push_str(" return 0;\n"); + out.push('}'); + + let out_dir_path: PathBuf = PathBuf::from(env::var("OUT_DIR").unwrap()); + std::fs::create_dir_all(&out_dir_path)?; + let mut f = File::create(out_dir_path.join(format!("{ext_name}.c")))?; + f.write_all(out.as_bytes())?; + Ok(()) +} + +fn gen_test_body( + current_name: &str, + asm_fmts: &[String], + in_t: &[&str; 4], + out_t: &str, + para_num: i32, + target: TargetFeature, +) -> (String, String) { + let rand_i32 = |bits: u8| -> i32 { + let val = rand::random::(); + let bits = 32 - bits; + (val << bits) >> bits + }; + let rand_u32 = |bits: u8| -> u32 { + let val = rand::random::(); + let bits = 32 - bits; + (val << bits) >> bits + }; + let rand_i64 = || -> i64 { rand::random::() }; + let rand_u64 = || -> u64 { rand::random::() }; + let rand_f32 = || -> f32 { rand::random::() }; + let rand_f64 = || -> f64 { rand::random::() }; + let type_to_ct = |t: &str| -> &str { + match t { + "V16QI" => "union v16qi", + "V32QI" => "union v32qi", + "V8HI" => "union v8hi", + "V16HI" => "union v16hi", + "V4SI" => "union v4si", + "V8SI" => "union v8si", + "V2DI" => "union v2di", + "V4DI" => "union v4di", + "UV16QI" => "union uv16qi", + "UV32QI" => "union uv32qi", + "UV8HI" => "union uv8hi", + "UV16HI" => "union uv16hi", + "UV4SI" => "union uv4si", + "UV8SI" => "union uv8si", + "UV2DI" => "union uv2di", + "UV4DI" => "union uv4di", + "SI" => "int32_t", + "DI" => "int64_t", + "USI" => "uint32_t", + "UDI" => "uint64_t", + "V4SF" => "union v4sf", + "V8SF" => "union v8sf", + "V2DF" => "union v2df", + "V4DF" => "union v4df", + "UQI" => "uint32_t", + "QI" => "int32_t", + "CVPOINTER" => "void*", + "HI" => "int32_t", + _ => panic!("unknown type: {t}"), + } + }; + let type_to_va = |v: &str, t: &str| -> String { + let n = if v.starts_with('_') { + v.get(1..).unwrap() + } else { + v + }; + let mut out = String::new(); + match t { + "A16QI" => { + for i in 0..16 { + out.push_str(&format!(" {v}.i8[{i}] = {};\n", rand_i32(8))); + } + out.push_str(&format!(" 
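The spec files consumed here are simple line-oriented `key = value` records, and the length of the `data-types` list decides the parameter count. The sketch below parses one such record; the record itself is invented for illustration and may not match any real spec entry exactly.

```rust
// Sketch of the line-oriented spec format parsed by gen_bind/gen_test.
// The record below is made up for the example.
fn main() {
    let spec = "name = lsx_vslli_b\nasm-fmts = vd, vj, ui3\ndata-types = V16QI, V16QI, UQI";

    let mut name = String::new();
    let mut asm_fmts: Vec<String> = Vec::new();
    let mut data_types: Vec<String> = Vec::new();

    for line in spec.lines() {
        if let Some(v) = line.strip_prefix("name = ") {
            name = v.to_string();
        } else if let Some(v) = line.strip_prefix("asm-fmts = ") {
            asm_fmts = v.split(',').map(|s| s.trim().to_string()).collect();
        } else if let Some(v) = line.strip_prefix("data-types = ") {
            data_types = v.split(',').map(|s| s.trim().to_string()).collect();
        }
    }

    // data_types[0] is the return type; the rest are parameters, so para_num is 2 here.
    assert_eq!(name, "lsx_vslli_b");
    assert_eq!(asm_fmts, ["vd", "vj", "ui3"]);
    assert_eq!(data_types, ["V16QI", "V16QI", "UQI"]);
    assert_eq!(data_types.len() - 1, 2);
}
```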
printf(\" let {n}: [i8; 16] = [%d")); + for _ in 1..16 { + out.push_str(", %d"); + } + out.push_str(&format!("];\\n\",\n {v}.i8[0]")); + for i in 1..16 { + out.push_str(&format!(", {v}.i8[{i}]")); + } + } + "AM16QI" => { + for i in 0..16 { + out.push_str(&format!(" {v}.i8[{i}] = {};\n", rand_i32(8))); + } + out.push_str(&format!(" printf(\" let mut {n}: [i8; 16] = [%d")); + for _ in 1..16 { + out.push_str(", %d"); + } + out.push_str(&format!("];\\n\",\n {v}.i8[0]")); + for i in 1..16 { + out.push_str(&format!(", {v}.i8[{i}]")); + } + } + "V16QI" => { + for i in 0..16 { + out.push_str(&format!(" {v}.i8[{i}] = {};\n", rand_i32(8))); + } + out.push_str(&format!(" printf(\" let {n} = i8x16::new(%d")); + for _ in 1..16 { + out.push_str(", %d"); + } + out.push_str(&format!(");\\n\",\n {v}.i8[0]")); + for i in 1..16 { + out.push_str(&format!(", {v}.i8[{i}]")); + } + } + "V32QI" => { + for i in 0..32 { + out.push_str(&format!(" {v}.i8[{i}] = {};\n", rand_i32(8))); + } + out.push_str(&format!(" printf(\" let {n} = i8x32::new(%d")); + for _ in 1..32 { + out.push_str(", %d"); + } + out.push_str(&format!(");\\n\",\n {v}.i8[0]")); + for i in 1..32 { + out.push_str(&format!(", {v}.i8[{i}]")); + } + } + "A32QI" => { + for i in 0..32 { + out.push_str(&format!(" {v}.i8[{i}] = {};\n", rand_i32(8))); + } + out.push_str(&format!(" printf(\" let {n}: [i8; 32] = [%d")); + for _ in 1..32 { + out.push_str(", %d"); + } + out.push_str(&format!("];\\n\",\n {v}.i8[0]")); + for i in 1..32 { + out.push_str(&format!(", {v}.i8[{i}]")); + } + } + "AM32QI" => { + for i in 0..32 { + out.push_str(&format!(" {v}.i8[{i}] = {};\n", rand_i32(8))); + } + out.push_str(&format!(" printf(\" let mut {n}: [i8; 32] = [%d")); + for _ in 1..32 { + out.push_str(", %d"); + } + out.push_str(&format!("];\\n\",\n {v}.i8[0]")); + for i in 1..32 { + out.push_str(&format!(", {v}.i8[{i}]")); + } + } + "V8HI" => { + for i in 0..8 { + out.push_str(&format!(" {v}.i16[{i}] = {};\n", rand_i32(16))); + } + out.push_str(&format!(" printf(\" let {n} = i16x8::new(%d")); + for _ in 1..8 { + out.push_str(", %d"); + } + out.push_str(&format!(");\\n\",\n {v}.i16[0]")); + for i in 1..8 { + out.push_str(&format!(", {v}.i16[{i}]")); + } + } + "V16HI" => { + for i in 0..16 { + out.push_str(&format!(" {v}.i16[{i}] = {};\n", rand_i32(16))); + } + out.push_str(&format!(" printf(\" let {n} = i16x16::new(%d")); + for _ in 1..16 { + out.push_str(", %d"); + } + out.push_str(&format!(");\\n\",\n {v}.i16[0]")); + for i in 1..16 { + out.push_str(&format!(", {v}.i16[{i}]")); + } + } + "V4SI" => { + for i in 0..4 { + out.push_str(&format!(" {v}.i32[{i}] = {};\n", rand_i32(32))); + } + out.push_str(&format!(" printf(\" let {n} = i32x4::new(%d")); + for _ in 1..4 { + out.push_str(", %d"); + } + out.push_str(&format!(");\\n\",\n {v}.i32[0]")); + for i in 1..4 { + out.push_str(&format!(", {v}.i32[{i}]")); + } + } + "V8SI" => { + for i in 0..8 { + out.push_str(&format!(" {v}.i32[{i}] = {};\n", rand_i32(32))); + } + out.push_str(&format!(" printf(\" let {n} = i32x8::new(%d")); + for _ in 1..8 { + out.push_str(", %d"); + } + out.push_str(&format!(");\\n\",\n {v}.i32[0]")); + for i in 1..8 { + out.push_str(&format!(", {v}.i32[{i}]")); + } + } + "V2DI" => { + for i in 0..2 { + out.push_str(&format!(" {v}.i64[{i}] = {}L;\n", rand_i64())); + } + out.push_str(&format!(" printf(\" let {n} = i64x2::new(%ld")); + for _ in 1..2 { + out.push_str(", %ld"); + } + out.push_str(&format!(");\\n\",\n {v}.i64[0]")); + for i in 1..2 { + out.push_str(&format!(", {v}.i64[{i}]")); + } + } + "V4DI" => { 
+ for i in 0..4 { + out.push_str(&format!(" {v}.i64[{i}] = {}L;\n", rand_i64())); + } + out.push_str(&format!(" printf(\" let {n} = i64x4::new(%ld")); + for _ in 1..4 { + out.push_str(", %ld"); + } + out.push_str(&format!(");\\n\",\n {v}.i64[0]")); + for i in 1..4 { + out.push_str(&format!(", {v}.i64[{i}]")); + } + } + "UV16QI" => { + for i in 0..16 { + out.push_str(&format!(" {v}.i8[{i}] = {};\n", rand_u32(8))); + } + out.push_str(&format!(" printf(\" let {n} = u8x16::new(%u")); + for _ in 1..16 { + out.push_str(", %u"); + } + out.push_str(&format!(");\\n\",\n {v}.i8[0]")); + for i in 1..16 { + out.push_str(&format!(", {v}.i8[{i}]")); + } + } + "UV32QI" => { + for i in 0..32 { + out.push_str(&format!(" {v}.i8[{i}] = {};\n", rand_u32(8))); + } + out.push_str(&format!(" printf(\" let {n} = u8x32::new(%u")); + for _ in 1..32 { + out.push_str(", %u"); + } + out.push_str(&format!(");\\n\",\n {v}.i8[0]")); + for i in 1..32 { + out.push_str(&format!(", {v}.i8[{i}]")); + } + } + "UV8HI" => { + for i in 0..8 { + out.push_str(&format!(" {v}.i16[{i}] = {};\n", rand_u32(16))); + } + out.push_str(&format!(" printf(\" let {n} = u16x8::new(%u")); + for _ in 1..8 { + out.push_str(", %u"); + } + out.push_str(&format!(");\\n\",\n {v}.i16[0]")); + for i in 1..8 { + out.push_str(&format!(", {v}.i16[{i}]")); + } + } + "UV16HI" => { + for i in 0..16 { + out.push_str(&format!(" {v}.i16[{i}] = {};\n", rand_u32(16))); + } + out.push_str(&format!(" printf(\" let {n} = u16x16::new(%u")); + for _ in 1..16 { + out.push_str(", %u"); + } + out.push_str(&format!(");\\n\",\n {v}.i16[0]")); + for i in 1..16 { + out.push_str(&format!(", {v}.i16[{i}]")); + } + } + "UV4SI" => { + for i in 0..4 { + out.push_str(&format!(" {v}.i32[{i}] = {};\n", rand_u32(32))); + } + out.push_str(&format!(" printf(\" let {n} = u32x4::new(%u")); + for _ in 1..4 { + out.push_str(", %u"); + } + out.push_str(&format!(");\\n\",\n {v}.i32[0]")); + for i in 1..4 { + out.push_str(&format!(", {v}.i32[{i}]")); + } + } + "UV8SI" => { + for i in 0..8 { + out.push_str(&format!(" {v}.i32[{i}] = {};\n", rand_u32(32))); + } + out.push_str(&format!(" printf(\" let {n} = u32x8::new(%u")); + for _ in 1..8 { + out.push_str(", %u"); + } + out.push_str(&format!(");\\n\",\n {v}.i32[0]")); + for i in 1..8 { + out.push_str(&format!(", {v}.i32[{i}]")); + } + } + "UV2DI" => { + for i in 0..2 { + out.push_str(&format!(" {v}.i64[{i}] = {}UL;\n", rand_u64())); + } + out.push_str(&format!(" printf(\" let {n} = u64x2::new(%lu")); + for _ in 1..2 { + out.push_str(", %lu"); + } + out.push_str(&format!(");\\n\",\n {v}.i64[0]")); + for i in 1..2 { + out.push_str(&format!(", {v}.i64[{i}]")); + } + } + "UV4DI" => { + for i in 0..4 { + out.push_str(&format!(" {v}.i64[{i}] = {}UL;\n", rand_u64())); + } + out.push_str(&format!(" printf(\" let {n} = u64x4::new(%lu")); + for _ in 1..4 { + out.push_str(", %lu"); + } + out.push_str(&format!(");\\n\",\n {v}.i64[0]")); + for i in 1..4 { + out.push_str(&format!(", {v}.i64[{i}]")); + } + } + "V4SF" => { + for i in 0..4 { + out.push_str(&format!(" {v}.f32[{i}] = {};\n", rand_f32())); + } + out.push_str(&format!(" printf(\" let {n} = u32x4::new(%u")); + for _ in 1..4 { + out.push_str(", %u"); + } + out.push_str(&format!(");\\n\",\n {v}.i32[0]")); + for i in 1..4 { + out.push_str(&format!(", {v}.i32[{i}]")); + } + } + "V8SF" => { + for i in 0..8 { + out.push_str(&format!(" {v}.f32[{i}] = {};\n", rand_f32())); + } + out.push_str(&format!(" printf(\" let {n} = u32x8::new(%u")); + for _ in 1..8 { + out.push_str(", %u"); + } + 
out.push_str(&format!(");\\n\",\n {v}.i32[0]")); + for i in 1..8 { + out.push_str(&format!(", {v}.i32[{i}]")); + } + } + "V2DF" => { + for i in 0..2 { + out.push_str(&format!(" {v}.f64[{i}] = {};\n", rand_f64())); + } + out.push_str(&format!(" printf(\" let {n} = u64x2::new(%lu")); + for _ in 1..2 { + out.push_str(", %lu"); + } + out.push_str(&format!(");\\n\",\n {v}.i64[0]")); + for i in 1..2 { + out.push_str(&format!(", {v}.i64[{i}]")); + } + } + "V4DF" => { + for i in 0..4 { + out.push_str(&format!(" {v}.f64[{i}] = {};\n", rand_f64())); + } + out.push_str(&format!(" printf(\" let {n} = u64x4::new(%lu")); + for _ in 1..4 { + out.push_str(", %lu"); + } + out.push_str(&format!(");\\n\",\n {v}.i64[0]")); + for i in 1..4 { + out.push_str(&format!(", {v}.i64[{i}]")); + } + } + "SI" | "DI" | "USI" | "UDI" | "UQI" | "QI" | "CVPOINTER" | "HI" => (), + _ => panic!("unknown type: {t}"), + } + if !out.is_empty() { + out.push_str(");"); + } + out + }; + let type_to_rp = |t: &str| -> &str { + match t { + "SI" => " printf(\" let r: i32 = %d;\\n\", o);", + "DI" => " printf(\" let r: i64 = %ld;\\n\", o);", + "USI" => " printf(\" let r: u32 = %u;\\n\", o);", + "UDI" => " printf(\" let r: u64 = %lu;\\n\", o);", + "UQI" => " printf(\" let r: u32 = %u;\\n\", o);", + "QI" => " printf(\" let r: i32 = %d;\\n\", o);", + "HI" => " printf(\" let r: i32 = %d;\\n\", o);", + "V32QI" | "V16HI" | "V8SI" | "V4DI" | "UV32QI" | "UV16HI" | "UV8SI" | "UV4DI" + | "V8SF" | "V4DF" => { + " printf(\" let r = i64x4::new(%ld, %ld, %ld, %ld);\\n\", o.i64[0], o.i64[1], o.i64[2], o.i64[3]);" + } + _ => " printf(\" let r = i64x2::new(%ld, %ld);\\n\", o.i64[0], o.i64[1]);", + } + }; + let type_to_rx = |t: &str| -> &str { + match t { + "SI" | "DI" | "USI" | "UDI" | "UQI" | "QI" | "HI" => "o", + _ => "o.v", + } + }; + let type_to_imm = |t| -> i8 { + match t { + 'b' => 4, + 'h' => 3, + 'w' => 2, + 'd' => 1, + _ => panic!("unsupported type"), + } + }; + + let impl_function = { + let fn_output = if out_t.to_lowercase() == "void" { + String::new() + } else { + format!(" {} o;", type_to_ct(out_t)) + }; + let mut fn_inputs = match para_num { + 1 => format!( + " {} a;\n{}", + type_to_ct(in_t[0]), + type_to_va("a", in_t[0]) + ), + 2 => format!( + " {} a;\n{}\n {} b;\n{}", + type_to_ct(in_t[0]), + type_to_va("a", in_t[0]), + type_to_ct(in_t[1]), + type_to_va("b", in_t[1]) + ), + 3 => format!( + " {} a;\n{}\n {} b;\n{}\n {} c;\n{}", + type_to_ct(in_t[0]), + type_to_va("a", in_t[0]), + type_to_ct(in_t[1]), + type_to_va("b", in_t[1]), + type_to_ct(in_t[2]), + type_to_va("c", in_t[2]) + ), + 4 => format!( + " {} a;\n{}\n {} b;\n{}\n {} c;\n{}\n {} d;\n{}", + type_to_ct(in_t[0]), + type_to_va("a", in_t[0]), + type_to_ct(in_t[1]), + type_to_va("b", in_t[1]), + type_to_ct(in_t[2]), + type_to_va("c", in_t[2]), + type_to_ct(in_t[3]), + type_to_va("d", in_t[3]) + ), + _ => panic!("unsupported parameter number"), + }; + let mut fn_params = match para_num { + 1 => "(a.v)".to_string(), + 2 => "(a.v, b.v)".to_string(), + 3 => "(a.v, b.v, c.v)".to_string(), + 4 => "(a.v, b.v, c.v, d.v)".to_string(), + _ => "unsupported parameter number".to_string(), + }; + let mut as_params = match para_num { + 1 => "(transmute(a))".to_string(), + 2 => "(transmute(a), transmute(b))".to_string(), + 3 => "(transmute(a), transmute(b), transmute(c))".to_string(), + 4 => "(transmute(a), transmute(b), transmute(c), transmute(d))".to_string(), + _ => panic!("unsupported parameter number"), + }; + let mut as_args = String::new(); + if para_num == 1 && in_t[0] == "HI" { + fn_inputs = 
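The test generator works by emitting a C program whose only job is to `printf` Rust source: each helper fills the operand unions with random lane values, calls the real compiler builtin, and prints a `#[simd_test]` function asserting the observed result. The Rust-only sketch below shows the same emit-a-test-as-text idea with fixed example lanes and an example intrinsic name instead of values computed by the C harness.

```rust
// Sketch of the "print a Rust test" technique used by gen_test/gen_test_body.
// Lane values and the intrinsic name are fixed examples; the real harness
// computes `expected` by calling the actual builtin from C.
fn emit_test(name: &str, a: [i32; 4], b: [i32; 4], expected: [i32; 4]) -> String {
    let lanes =
        |v: [i32; 4]| v.iter().map(|x| x.to_string()).collect::<Vec<_>>().join(", ");
    let mut out = String::new();
    out.push_str("#[simd_test(enable = \"lsx\")]\n");
    out.push_str(&format!("unsafe fn test_{name}() {{\n"));
    out.push_str(&format!("    let a = i32x4::new({});\n", lanes(a)));
    out.push_str(&format!("    let b = i32x4::new({});\n", lanes(b)));
    out.push_str(&format!("    let r = i32x4::new({});\n", lanes(expected)));
    out.push_str(&format!(
        "    assert_eq!(r, transmute({name}(transmute(a), transmute(b))));\n"
    ));
    out.push_str("}\n");
    out
}

fn main() {
    print!(
        "{}",
        emit_test("lsx_vadd_w", [1, 2, 3, 4], [10, 20, 30, 40], [11, 22, 33, 44])
    );
}
```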
"".to_string(); + match asm_fmts[1].as_str() { + "si13" => { + let val = rand_i32(13); + fn_params = format!("({val})"); + as_params = format!("::<{val}>()"); + } + "i13" => { + let val = rand_u32(12); + fn_params = format!("({val})"); + as_params = format!("::<{val}>()"); + } + "si10" => { + let val = rand_i32(10); + fn_params = format!("({val})"); + as_params = format!("::<{val}>()"); + } + _ => panic!("unsupported assembly format: {}", asm_fmts[1]), + } + } else if para_num == 1 + && (in_t[0] == "SI" || in_t[0] == "DI") + && asm_fmts[1].starts_with("rj") + { + fn_params = "(a)".to_string(); + if in_t[0] == "SI" { + as_params = "(%d)".to_string(); + } else { + as_params = "(%ld)".to_string(); + } + as_args = ", a".to_string(); + } else if para_num == 2 && (in_t[1] == "UQI" || in_t[1] == "USI") { + if asm_fmts[2].starts_with("ui") { + fn_inputs = format!( + " {} a;\n{}", + type_to_ct(in_t[0]), + type_to_va("a", in_t[0]) + ); + let val = rand_u32(asm_fmts[2].get(2..).unwrap().parse::().unwrap()); + fn_params = format!("(a.v, {val})"); + as_params = format!("::<{val}>(transmute(a))"); + } else { + panic!("unsupported assembly format: {}", asm_fmts[2]); + } + } else if para_num == 2 && in_t[1] == "QI" { + if asm_fmts[2].starts_with("si") { + fn_inputs = format!( + " {} a;\n{}", + type_to_ct(in_t[0]), + type_to_va("a", in_t[0]) + ); + let val = rand_i32(asm_fmts[2].get(2..).unwrap().parse::().unwrap()); + fn_params = format!("(a.v, {val})"); + as_params = format!("::<{val}>(transmute(a))"); + } else { + panic!("unsupported assembly format: {}", asm_fmts[2]); + } + } else if para_num == 2 && in_t[1] == "SI" && asm_fmts[2].starts_with("rk") { + fn_params = "(a.v, b)".to_string(); + as_params = "(transmute(a), %d)".to_string(); + as_args = ", b".to_string(); + } else if para_num == 2 && in_t[0] == "CVPOINTER" && in_t[1] == "SI" { + if asm_fmts[2].starts_with("si") { + fn_inputs = format!( + " union v{}qi _a;\n{}\n {} a = &_a;", + target.bytes(), + type_to_va( + "_a", + if target == TargetFeature::Lsx { + "A16QI" + } else { + "A32QI" + } + ), + type_to_ct(in_t[0]) + ); + fn_params = "(a, 0)".to_string(); + as_params = "::<0>(a.as_ptr())".to_string(); + } else { + panic!("unsupported assembly format: {}", asm_fmts[2]); + } + } else if para_num == 2 && in_t[0] == "CVPOINTER" && in_t[1] == "DI" { + if asm_fmts[2].as_str() == "rk" { + fn_inputs = format!( + " union v{}qi _a;\n{}\n {} a = &_a;", + target.bytes(), + type_to_va( + "_a", + if target == TargetFeature::Lsx { + "A16QI" + } else { + "A32QI" + } + ), + type_to_ct(in_t[0]) + ); + fn_params = "(a, 0)".to_string(); + as_params = "(a.as_ptr(), 0)".to_string(); + } else { + panic!("unsupported assembly format: {}", asm_fmts[2]); + } + } else if para_num == 3 && in_t[2] == "UQI" && asm_fmts[1].starts_with("rj") { + if asm_fmts[2].starts_with("ui") { + fn_inputs = format!( + " {} a;\n{}", + type_to_ct(in_t[0]), + type_to_va("a", in_t[0]) + ); + let ival = rand_i32(32); + let uval = rand_u32(asm_fmts[2].get(2..).unwrap().parse::().unwrap()); + fn_params = format!("(a.v, {ival}, {uval})"); + as_params = format!("::<{uval}>(transmute(a), {ival})"); + } else { + panic!("unsupported assembly format: {}", asm_fmts[2]); + } + } else if para_num == 3 && (in_t[2] == "USI" || in_t[2] == "UQI") { + if asm_fmts[2].starts_with("ui") { + fn_inputs = format!( + " {} a;\n{}\n {} b;\n{}", + type_to_ct(in_t[0]), + type_to_va("a", in_t[0]), + type_to_ct(in_t[1]), + type_to_va("b", in_t[1]), + ); + let val = rand_u32(asm_fmts[2].get(2..).unwrap().parse::().unwrap()); + 
fn_params = format!("(a.v, b.v, {val})"); + as_params = format!("::<{val}>(transmute(a), transmute(b))"); + } else { + panic!("unsupported assembly format: {}", asm_fmts[2]); + } + } else if para_num == 3 && in_t[1] == "CVPOINTER" && in_t[2] == "SI" { + if asm_fmts[2].as_str() == "si12" { + fn_inputs = format!( + " {} a;\n{}\n union v{}qi o;\n{}\n {} b = &o;", + type_to_ct(in_t[0]), + type_to_va("a", in_t[0]), + target.bytes(), + type_to_va( + "o", + if target == TargetFeature::Lsx { + "AM16QI" + } else { + "AM32QI" + } + ), + type_to_ct(in_t[1]) + ); + fn_params = "(a.v, b, 0)".to_string(); + as_params = "::<0>(transmute(a), o.as_mut_ptr())".to_string(); + } else { + panic!("unsupported assembly format: {}", asm_fmts[2]); + } + } else if para_num == 3 && in_t[1] == "CVPOINTER" && in_t[2] == "DI" { + if asm_fmts[2].as_str() == "rk" { + fn_inputs = format!( + " {} a;\n{}\n union v{}qi o;\n{}\n {} b = &o;", + type_to_ct(in_t[0]), + type_to_va("a", in_t[0]), + target.bytes(), + type_to_va( + "o", + if target == TargetFeature::Lsx { + "AM16QI" + } else { + "AM32QI" + } + ), + type_to_ct(in_t[1]) + ); + fn_params = "(a.v, b, 0)".to_string(); + as_params = "(transmute(a), o.as_mut_ptr(), 0)".to_string(); + } else { + panic!("unsupported assembly format: {}", asm_fmts[2]); + } + } else if para_num == 4 { + match (asm_fmts[2].as_str(), current_name.chars().last().unwrap()) { + ("si8", t) => { + fn_inputs = format!( + " {} a;\n{}\n union v{}qi o;\n{}\n {} b = &o;", + type_to_ct(in_t[0]), + type_to_va("a", in_t[0]), + target.bytes(), + type_to_va( + "o", + if target == TargetFeature::Lsx { + "AM16QI" + } else { + "AM32QI" + } + ), + type_to_ct(in_t[1]) + ); + let val = rand_u32(type_to_imm(t).try_into().unwrap()); + fn_params = format!("(a.v, b, 0, {val})"); + as_params = format!("::<0, {val}>(transmute(a), o.as_mut_ptr())"); + } + (_, _) => panic!( + "unsupported assembly format: {} for {}", + asm_fmts[2], current_name + ), + }; + } + let fn_docall = if out_t.to_lowercase() == "void" { + format!(" __{current_name}{fn_params};") + } else { + format!(" {} = __{current_name}{fn_params};", type_to_rx(out_t)) + }; + let fn_result = if out_t.to_lowercase() == "void" { + if target == TargetFeature::Lsx { + type_to_rp("V16QI") + } else { + type_to_rp("V32QI") + } + } else { + type_to_rp(out_t) + }; + let fn_assert = { + if out_t.to_lowercase() == "void" { + format!( + " printf(\"\\n {current_name}{as_params};\\n assert_eq!(r, transmute(o));\\n\"{as_args});" + ) + } else { + format!( + " printf(\"\\n assert_eq!(r, transmute({current_name}{as_params}));\\n\"{as_args});" + ) + } + }; + format!( + r#" +static void {current_name}(void) +{{ + printf("\n#[simd_test(enable = \"{}\")]\n"); + printf("unsafe fn test_{current_name}() {{\n"); +{fn_inputs} +{fn_output} +{fn_docall} +{fn_result} +{fn_assert} + printf("}}\n"); +}} +"#, + target.as_target_feature_arg(current_name) + ) + }; + let call_function = format!(" {current_name}();\n"); + (impl_function, call_function) +} + +pub fn main() -> io::Result<()> { + let args: Vec = env::args().collect(); + let in_file = args.get(1).cloned().expect("Input file missing!"); + let in_file_path = PathBuf::from(&in_file); + let in_file_name = in_file_path + .file_name() + .unwrap() + .to_os_string() + .into_string() + .unwrap(); + + let ext_name = if in_file_name.starts_with("lasx") { + "lasx" + } else { + "lsx" + }; + + if in_file_name.ends_with(".h") { + gen_spec(in_file, ext_name) + } else if args.get(2).is_some() { + gen_test(in_file, ext_name) + } else { + 
gen_bind(in_file, ext_name) + } +} diff --git a/library/stdarch/crates/stdarch-test/Cargo.toml b/library/stdarch/crates/stdarch-test/Cargo.toml new file mode 100644 index 0000000000000..e4791e4ec5251 --- /dev/null +++ b/library/stdarch/crates/stdarch-test/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "stdarch-test" +version = "0.1.0" +authors = ["Alex Crichton "] +edition = "2024" + +[dependencies] +assert-instr-macro = { path = "../assert-instr-macro" } +simd-test-macro = { path = "../simd-test-macro" } +lazy_static = "1.0" +rustc-demangle = "0.1.8" +cfg-if = "1.0" + +[target.'cfg(windows)'.dependencies] +cc = "1.0" + +# We use a crates.io dependency to disassemble wasm binaries to look for +# instructions for `#[assert_instr]`. Note that we use an `=` dependency here +# instead of a floating dependency because the text format for wasm changes over +# time, and we want to make updates to this explicit rather than automatically +# picking up updates which might break CI with new instruction names. +[target.'cfg(target_arch = "wasm32")'.dependencies] +wasmprinter = "=0.2.67" + +[features] +default = [] diff --git a/library/stdarch/crates/stdarch-test/src/disassembly.rs b/library/stdarch/crates/stdarch-test/src/disassembly.rs new file mode 100644 index 0000000000000..f5167ea8d8ef3 --- /dev/null +++ b/library/stdarch/crates/stdarch-test/src/disassembly.rs @@ -0,0 +1,208 @@ +//! Disassembly calling function for most targets. + +use crate::Function; +use std::{collections::HashSet, env, str}; + +// Extracts the "shim" name from the `symbol`. +fn normalize(mut symbol: &str) -> String { + // Remove trailing colon: + if symbol.ends_with(':') { + symbol = &symbol[..symbol.len() - 1]; + } + if symbol.ends_with('>') { + symbol = &symbol[..symbol.len() - 1]; + } + if let Some(idx) = symbol.find('<') { + symbol = &symbol[idx + 1..]; + } + + let mut symbol = rustc_demangle::demangle(symbol).to_string(); + symbol = match symbol.rfind("::h") { + Some(i) => symbol[..i].to_string(), + None => symbol.to_string(), + }; + + // Remove Rust paths + if let Some(last_colon) = symbol.rfind(':') { + symbol = symbol[last_colon + 1..].to_string(); + } + + // Normalize to no leading underscore to handle platforms that may + // inject extra ones in symbol names. + while symbol.starts_with('_') || symbol.starts_with('.') { + symbol.remove(0); + } + // Windows/x86 has a suffix such as @@4. 
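`normalize` boils a disassembly label down to the bare shim name so it can be matched against the parsed function table. The sketch below reproduces the trimming steps on an already-demangled label; the demangling itself is omitted and the sample symbol is made up.

```rust
// Sketch of the label-trimming steps in `normalize` (demangling omitted; the
// input below is a made-up, already-demangled label).
fn trim_label(mut label: &str) -> String {
    // Drop the trailing ':' that the disassembler prints after a symbol.
    label = label.strip_suffix(':').unwrap_or(label);
    let mut s = label.to_string();
    // Drop the hash suffix rustc appends, e.g. "::h0123456789abcdef".
    if let Some(i) = s.rfind("::h") {
        s.truncate(i);
    }
    // Keep only the last path segment.
    if let Some(i) = s.rfind(':') {
        s = s[i + 1..].to_string();
    }
    // Strip platform-injected leading underscores or dots.
    while s.starts_with('_') || s.starts_with('.') {
        s.remove(0);
    }
    s
}

fn main() {
    let label = "_core_arch::x86::sse2::stdarch_test_shim_mm_add_epi16::h1f00aa55aa55aa55:";
    assert_eq!(trim_label(label), "stdarch_test_shim_mm_add_epi16");
}
```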
+ if let Some(idx) = symbol.find("@@") { + symbol = symbol[..idx].to_string(); + } + symbol +} + +#[cfg(target_env = "msvc")] +pub(crate) fn disassemble_myself() -> HashSet { + let me = env::current_exe().expect("failed to get current exe"); + + let target = if cfg!(target_arch = "x86_64") { + "x86_64-pc-windows-msvc" + } else if cfg!(target_arch = "x86") { + "i686-pc-windows-msvc" + } else if cfg!(target_arch = "aarch64") { + "aarch64-pc-windows-msvc" + } else { + panic!("disassembly unimplemented") + }; + let mut cmd = + cc::windows_registry::find(target, "dumpbin.exe").expect("failed to find `dumpbin` tool"); + let output = cmd + .arg("/DISASM:NOBYTES") + .arg(&me) + .output() + .expect("failed to execute dumpbin"); + println!( + "{}\n{}", + output.status, + String::from_utf8_lossy(&output.stderr) + ); + assert!(output.status.success()); + // Windows does not return valid UTF-8 output: + parse(&String::from_utf8_lossy(Vec::leak(output.stdout))) +} + +#[cfg(not(target_env = "msvc"))] +pub(crate) fn disassemble_myself() -> HashSet { + let me = env::current_exe().expect("failed to get current exe"); + + let objdump = env::var("OBJDUMP").unwrap_or_else(|_| "objdump".to_string()); + let add_args = if cfg!(target_vendor = "apple") && cfg!(target_arch = "aarch64") { + // Target features need to be enabled for LLVM objdump on Darwin ARM64 + vec!["--mattr=+v8.6a,+crypto,+tme"] + } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) { + vec!["--mattr=+zk,+zks,+zbc,+zbb"] + } else { + vec![] + }; + let output = std::process::Command::new(objdump.clone()) + .arg("--disassemble") + .arg("--no-show-raw-insn") + .args(add_args) + .arg(&me) + .output() + .unwrap_or_else(|_| panic!("failed to execute objdump. OBJDUMP={objdump}")); + println!( + "{}\n{}", + output.status, + String::from_utf8_lossy(&output.stderr) + ); + assert!(output.status.success()); + + let disassembly = String::from_utf8_lossy(Vec::leak(output.stdout)); + + parse(&disassembly) +} + +fn parse(output: &str) -> HashSet { + let mut lines = output.lines(); + + println!( + "First 100 lines of the disassembly input containing {} lines:", + lines.clone().count() + ); + for line in output.lines().take(100) { + println!("{line}"); + } + + let mut functions = HashSet::new(); + let mut cached_header = None; + while let Some(header) = cached_header.take().or_else(|| lines.next()) { + if !header.ends_with(':') || !header.contains("stdarch_test_shim") { + continue; + } + eprintln!("header: {header}"); + let symbol = normalize(header); + eprintln!("normalized symbol: {symbol}"); + let mut instructions = Vec::new(); + for instruction in lines.by_ref() { + if instruction.ends_with(':') { + cached_header = Some(instruction); + break; + } + if instruction.is_empty() { + cached_header = None; + break; + } + let mut parts = if cfg!(target_env = "msvc") { + // Each line looks like: + // + // > $addr: $instr.. + instruction + .split(&[' ', ',']) + .filter(|&x| !x.is_empty()) + .skip(1) + .map(str::to_lowercase) + .skip_while(|s| matches!(&**s, "lock" | "vex")) // skip x86-specific prefix + .collect::>() + } else { + // objdump with --no-show-raw-insn + // Each line of instructions should look like: + // + // $rel_offset: $instruction... 
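Both parser branches reduce a raw disassembly line to just the mnemonic and operands: the leading address column is skipped and assembler prefixes such as `lock` are dropped before the parts are re-joined. A stand-alone sketch of the objdump-style splitting follows; the sample line is invented.

```rust
// Sketch of the per-line splitting used by parse(): drop the "$addr:" column,
// skip x86 prefixes, keep mnemonic + operands. The sample line is made up.
fn split_instruction(line: &str) -> Vec<String> {
    line.split_whitespace()
        .skip(1) // skip the "401000:" address column
        .skip_while(|s| matches!(*s, "lock" | "{evex}" | "{vex}"))
        .map(str::to_string)
        .collect()
}

fn main() {
    let line = "  401000:\tlock xadd %eax,(%rdi)";
    assert_eq!(split_instruction(line), ["xadd", "%eax,(%rdi)"]);
}
```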
+ instruction + .split_whitespace() + .skip(1) + .skip_while(|s| matches!(*s, "lock" | "{evex}" | "{vex}")) // skip x86-specific prefix + .map(ToString::to_string) + .collect::>() + }; + + if cfg!(any(target_arch = "aarch64", target_arch = "arm64ec")) { + // Normalize [us]shll.* ..., #0 instructions to the preferred form: [us]xtl.* ... + // as neither LLVM objdump nor dumpbin does that. + // See https://developer.arm.com/documentation/ddi0602/latest/SIMD-FP-Instructions/UXTL--UXTL2--Unsigned-extend-Long--an-alias-of-USHLL--USHLL2- + // and https://developer.arm.com/documentation/ddi0602/latest/SIMD-FP-Instructions/SXTL--SXTL2--Signed-extend-Long--an-alias-of-SSHLL--SSHLL2- + // for details. + fn is_shll(instr: &str) -> bool { + if cfg!(target_env = "msvc") { + instr.starts_with("ushll") || instr.starts_with("sshll") + } else { + instr.starts_with("ushll.") || instr.starts_with("sshll.") + } + } + match (parts.first(), parts.last()) { + (Some(instr), Some(last_arg)) if is_shll(instr) && last_arg == "#0" => { + assert_eq!(parts.len(), 4); + let mut new_parts = Vec::with_capacity(3); + let new_instr = format!("{}{}{}", &instr[..1], "xtl", &instr[5..]); + new_parts.push(new_instr); + new_parts.push(parts[1].clone()); + new_parts.push(parts[2][0..parts[2].len() - 1].to_owned()); // strip trailing comma + parts = new_parts; + } + // dumpbin uses "ins" instead of "mov" + (Some(instr), _) if cfg!(target_env = "msvc") && instr == "ins" => { + parts[0] = "mov".to_string() + } + _ => {} + }; + } + + instructions.push(parts.join(" ")); + if matches!(&**instructions.last().unwrap(), "ret" | "retq") { + cached_header = None; + break; + } + } + let function = Function { + name: symbol, + instrs: instructions, + }; + assert!(functions.insert(function)); + } + + eprintln!("all found functions dump:"); + for k in &functions { + eprintln!(" f: {}", k.name); + } + + functions +} diff --git a/library/stdarch/crates/stdarch-test/src/lib.rs b/library/stdarch/crates/stdarch-test/src/lib.rs new file mode 100644 index 0000000000000..f6614f6d51c90 --- /dev/null +++ b/library/stdarch/crates/stdarch-test/src/lib.rs @@ -0,0 +1,218 @@ +//! Runtime support needed for testing the stdarch crate. +//! +//! This basically just disassembles the current executable and then parses the +//! output once globally and then provides the `assert` function which makes +//! assertions about the disassembly of a function. +#![deny(rust_2018_idioms)] +#![allow(clippy::missing_docs_in_private_items, clippy::print_stdout)] + +#[macro_use] +extern crate lazy_static; +#[macro_use] +extern crate cfg_if; + +pub use assert_instr_macro::*; +pub use simd_test_macro::*; +use std::{cmp, collections::HashSet, env, hash, hint::black_box, str}; + +cfg_if! { + if #[cfg(target_arch = "wasm32")] { + pub mod wasm; + use wasm::disassemble_myself; + } else { + mod disassembly; + use crate::disassembly::disassemble_myself; + } +} + +lazy_static! { + static ref DISASSEMBLY: HashSet = disassemble_myself(); +} + +#[derive(Debug)] +struct Function { + name: String, + instrs: Vec, +} +impl Function { + fn new(n: &str) -> Self { + Self { + name: n.to_string(), + instrs: Vec::new(), + } + } +} + +impl cmp::PartialEq for Function { + fn eq(&self, other: &Self) -> bool { + self.name == other.name + } +} +impl cmp::Eq for Function {} + +impl hash::Hash for Function { + fn hash(&self, state: &mut H) { + self.name.hash(state) + } +} + +/// Main entry point for this crate, called by the `#[assert_instr]` macro. 
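`Function` deliberately hashes and compares by name only, so the disassembly `HashSet` can be probed with a bare `Function::new(fnname)` and still yield the stored entry together with its instruction list. A minimal sketch of that keyed-by-one-field lookup pattern, using a stand-in type:

```rust
use std::collections::HashSet;

// Sketch of keying a HashSet by one field: equality/hashing use `name` only,
// so a probe with an empty instruction list still finds the real entry.
#[derive(Debug)]
struct Func {
    name: String,
    instrs: Vec<String>,
}

impl PartialEq for Func {
    fn eq(&self, other: &Self) -> bool {
        self.name == other.name
    }
}
impl Eq for Func {}
impl std::hash::Hash for Func {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        self.name.hash(state)
    }
}

fn main() {
    let mut set = HashSet::new();
    set.insert(Func {
        name: "stdarch_test_shim_example".to_string(),
        instrs: vec!["add".to_string(), "ret".to_string()],
    });

    // Probe with name only, as `assert` does with `Function::new(fnname)`.
    let probe = Func {
        name: "stdarch_test_shim_example".to_string(),
        instrs: Vec::new(),
    };
    let found = set.get(&probe).expect("function not found");
    assert_eq!(found.instrs.len(), 2);
}
```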
+/// +/// This asserts that the function at `fnptr` contains the instruction +/// `expected` provided. +pub fn assert(shim_addr: usize, fnname: &str, expected: &str) { + // Make sure that the shim is not removed + black_box(shim_addr); + + //eprintln!("shim name: {fnname}"); + let function = &DISASSEMBLY + .get(&Function::new(fnname)) + .unwrap_or_else(|| panic!("function \"{fnname}\" not found in the disassembly")); + //eprintln!(" function: {:?}", function); + + let mut instrs = &function.instrs[..]; + while instrs.last().is_some_and(|s| s == "nop" || s == "int3") { + instrs = &instrs[..instrs.len() - 1]; + } + + // Look for `expected` as the first part of any instruction in this + // function, e.g., tzcntl in tzcntl %rax,%rax. + // + // There are two cases when the expected instruction is nop: + // 1. The expected intrinsic is compiled away so we can't + // check for it - aka the intrinsic is not generating any code. + // 2. It is a mark, indicating that the instruction will be + // compiled into other instructions - mainly because of llvm + // optimization. + let expected = if expected == "unknown" { + "" // Workaround for rust-lang/stdarch#1674, todo: remove when the issue is fixed + } else { + expected + }; + let found = expected == "nop" || instrs.iter().any(|s| s.starts_with(expected)); + + // Look for subroutine call instructions in the disassembly to detect whether + // inlining failed: all intrinsics are `#[inline(always)]`, so calling one + // intrinsic from another should not generate subroutine call instructions. + let inlining_failed = if cfg!(target_arch = "x86_64") || cfg!(target_arch = "wasm32") { + instrs.iter().any(|s| s.starts_with("call ")) + } else if cfg!(target_arch = "x86") { + instrs.windows(2).any(|s| { + // On 32-bit x86 position independent code will call itself and be + // immediately followed by a `pop` to learn about the current address. + // Let's not take that into account when considering whether a function + // failed inlining something. + s[0].starts_with("call ") && s[1].starts_with("pop") // FIXME: original logic but does not match comment + }) + } else if cfg!(any( + target_arch = "aarch64", + target_arch = "arm64ec", + target_arch = "powerpc", + target_arch = "powerpc64" + )) { + instrs.iter().any(|s| s.starts_with("bl ")) + } else { + // FIXME: Add detection for other archs + false + }; + + let instruction_limit = std::env::var("STDARCH_ASSERT_INSTR_LIMIT") + .ok() + .map_or_else( + || match expected { + // `cpuid` returns a pretty big aggregate structure, so exempt + // it from the slightly more restrictive 22 instructions below. + "cpuid" => 30, + + // These require 8 loads and stores, so it _just_ overflows the limit + "aesencwide128kl" | "aesencwide256kl" | "aesdecwide128kl" | "aesdecwide256kl" => 24, + + // Apparently, on Windows, LLVM generates a bunch of + // saves/restores of xmm registers around these instructions, + // which exceeds the limit of 20 below. As it seems dictated by + // Windows's ABI (I believe?), we probably can't do much + // about it. + "vzeroall" | "vzeroupper" if cfg!(windows) => 30, + + // Intrinsics using `cvtpi2ps` are typically "composites" and + // in some cases exceed the limit. 
+ "cvtpi2ps" => 25, + // core_arch/src/arm_shared/simd32 + // vfmaq_n_f32_vfma : #instructions = 26 >= 22 (limit) + "usad8" | "vfma" | "vfms" => 27, + "qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29, + // core_arch/src/arm_shared/simd32 + // vst1q_s64_x4_vst1 : #instructions = 27 >= 22 (limit) + "vld3" => 28, + // core_arch/src/arm_shared/simd32 + // vld4q_lane_u32_vld4 : #instructions = 36 >= 22 (limit) + "vld4" => 37, + // core_arch/src/arm_shared/simd32 + // vst1q_s64_x4_vst1 : #instructions = 40 >= 22 (limit) + "vst1" => 41, + // core_arch/src/arm_shared/simd32 + // vst3q_u32_vst3 : #instructions = 25 >= 22 (limit) + "vst3" => 26, + // core_arch/src/arm_shared/simd32 + // vst4q_u32_vst4 : #instructions = 33 >= 22 (limit) + "vst4" => 34, + + // core_arch/src/arm_shared/simd32 + // vst1q_p64_x4_nop : #instructions = 33 >= 22 (limit) + "nop" if fnname.contains("vst1q_p64") => 34, + + // Original limit was 20 instructions, but ARM DSP Intrinsics + // are exactly 20 instructions long. So, bump the limit to 22 + // instead of adding here a long list of exceptions. + _ => { + // aarch64_be may add reverse instructions which increases + // the number of instructions generated. + if cfg!(all(target_endian = "big", target_arch = "aarch64")) { + 32 + } else { + 22 + } + } + }, + |v| v.parse().unwrap(), + ); + let probably_only_one_instruction = instrs.len() < instruction_limit; + + if found && probably_only_one_instruction && !inlining_failed { + return; + } + + // Help debug by printing out the found disassembly, and then panic as we + // didn't find the instruction. + println!("disassembly for {fnname}: ",); + for (i, instr) in instrs.iter().enumerate() { + println!("\t{i:2}: {instr}"); + } + + if !found { + panic!("failed to find instruction `{expected}` in the disassembly"); + } else if !probably_only_one_instruction { + panic!( + "instruction found, but the disassembly contains too many \ + instructions: #instructions = {} >= {} (limit)", + instrs.len(), + instruction_limit + ); + } else if inlining_failed { + panic!( + "instruction found, but the disassembly contains subroutine \ + call instructions, which hint that inlining failed" + ); + } +} + +pub fn assert_skip_test_ok(name: &str, missing_features: &[&str]) { + println!("Skipping test `{name}` due to missing target features:"); + for feature in missing_features { + println!(" - {feature}"); + } + match env::var("STDARCH_TEST_EVERYTHING") { + Ok(_) => panic!("skipped test `{name}` when it shouldn't be skipped"), + Err(_) => println!("Set STDARCH_TEST_EVERYTHING to make this an error."), + } +} diff --git a/library/stdarch/crates/stdarch-test/src/wasm.rs b/library/stdarch/crates/stdarch-test/src/wasm.rs new file mode 100644 index 0000000000000..bf411c12148e2 --- /dev/null +++ b/library/stdarch/crates/stdarch-test/src/wasm.rs @@ -0,0 +1,55 @@ +//! Disassembly calling function for `wasm32` targets. + +use crate::Function; +use std::collections::HashSet; + +pub(crate) fn disassemble_myself() -> HashSet { + // Use `std::env::args` to find the path to our executable. Assume the + // environment is configured such that we can read that file. Read it and + // use the `wasmprinter` crate to transform the binary to text, then search + // the text for appropriately named functions. 
+ let me = std::env::args() + .next() + .expect("failed to find current wasm file"); + let output = wasmprinter::print_file(&me).unwrap(); + + let mut ret: HashSet = HashSet::new(); + let mut lines = output.lines().map(|s| s.trim()); + while let Some(line) = lines.next() { + // If this isn't a function, we don't care about it. + if !line.starts_with("(func ") { + continue; + } + + let mut function = Function { + name: String::new(), + instrs: Vec::new(), + }; + + // Empty functions will end in `))` so there's nothing to do, otherwise + // we'll have a bunch of following lines which are instructions. + // + // Lines that have an imbalanced `)` mark the end of a function. + if !line.ends_with("))") { + while let Some(line) = lines.next() { + function.instrs.push(line.to_string()); + if !line.starts_with("(") && line.ends_with(")") { + break; + } + } + } + // The second element here split on whitespace should be the name of + // the function, skipping the type/params/results + function.name = line.split_whitespace().nth(1).unwrap().to_string(); + if function.name.starts_with("$") { + function.name = function.name[1..].to_string() + } + + if !function.name.contains("stdarch_test_shim") { + continue; + } + + assert!(ret.insert(function)); + } + return ret; +} diff --git a/library/stdarch/crates/stdarch-verify/.gitattributes b/library/stdarch/crates/stdarch-verify/.gitattributes new file mode 100644 index 0000000000000..621fdea6f7d66 --- /dev/null +++ b/library/stdarch/crates/stdarch-verify/.gitattributes @@ -0,0 +1 @@ +*.xml binary diff --git a/library/stdarch/crates/stdarch-verify/Cargo.toml b/library/stdarch/crates/stdarch-verify/Cargo.toml new file mode 100644 index 0000000000000..c82a1262d04fc --- /dev/null +++ b/library/stdarch/crates/stdarch-verify/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "stdarch-verify" +version = "0.1.0" +authors = ["Alex Crichton "] +edition = "2024" + +[dependencies] +proc-macro2 = "1.0" +quote = "1.0" +syn = { version = "2.0", features = ["full"] } + +[lib] +proc-macro = true +test = false + +[dev-dependencies] +serde = { version = "1.0", features = ['derive'] } +serde_json = "1.0.96" +quick-xml = { version = "0.33.0", features = ["serialize", "overlapped-lists"] } diff --git a/library/stdarch/crates/stdarch-verify/build.rs b/library/stdarch/crates/stdarch-verify/build.rs new file mode 100644 index 0000000000000..c0dc81b6a6131 --- /dev/null +++ b/library/stdarch/crates/stdarch-verify/build.rs @@ -0,0 +1,28 @@ +use std::path::Path; + +fn main() { + let dir = Path::new(env!("CARGO_MANIFEST_DIR")); + let root = dir.parent().unwrap(); + eprintln!("root: {}", root.display()); + walk(&root.join("core_arch/src/x86")); + walk(&root.join("core_arch/src/x86_64")); + walk(&root.join("core_arch/src/arm")); + walk(&root.join("core_arch/src/aarch64")); +} + +fn walk(root: &Path) { + for file in root.read_dir().unwrap() { + eprintln!("root: {}", root.display()); + let file = file.unwrap(); + if file.file_type().unwrap().is_dir() { + walk(&file.path()); + continue; + } + let path = file.path(); + if path.extension().and_then(|s| s.to_str()) != Some("rs") { + continue; + } + + println!("cargo:rerun-if-changed={}", path.display()); + } +} diff --git a/library/stdarch/crates/stdarch-verify/mips-msa.h b/library/stdarch/crates/stdarch-verify/mips-msa.h new file mode 100644 index 0000000000000..881f1918f6bd2 --- /dev/null +++ b/library/stdarch/crates/stdarch-verify/mips-msa.h @@ -0,0 +1,707 @@ +v16i8 __builtin_msa_add_a_b (v16i8, v16i8); +v8i16 __builtin_msa_add_a_h (v8i16, v8i16); 
+v4i32 __builtin_msa_add_a_w (v4i32, v4i32); +v2i64 __builtin_msa_add_a_d (v2i64, v2i64); + +v16i8 __builtin_msa_adds_a_b (v16i8, v16i8); +v8i16 __builtin_msa_adds_a_h (v8i16, v8i16); +v4i32 __builtin_msa_adds_a_w (v4i32, v4i32); +v2i64 __builtin_msa_adds_a_d (v2i64, v2i64); + +v16i8 __builtin_msa_adds_s_b (v16i8, v16i8); +v8i16 __builtin_msa_adds_s_h (v8i16, v8i16); +v4i32 __builtin_msa_adds_s_w (v4i32, v4i32); +v2i64 __builtin_msa_adds_s_d (v2i64, v2i64); + +v16u8 __builtin_msa_adds_u_b (v16u8, v16u8); +v8u16 __builtin_msa_adds_u_h (v8u16, v8u16); +v4u32 __builtin_msa_adds_u_w (v4u32, v4u32); +v2u64 __builtin_msa_adds_u_d (v2u64, v2u64); + +v16i8 __builtin_msa_addv_b (v16i8, v16i8); +v8i16 __builtin_msa_addv_h (v8i16, v8i16); +v4i32 __builtin_msa_addv_w (v4i32, v4i32); +v2i64 __builtin_msa_addv_d (v2i64, v2i64); + +v16i8 __builtin_msa_addvi_b (v16i8, imm0_31); +v8i16 __builtin_msa_addvi_h (v8i16, imm0_31); +v4i32 __builtin_msa_addvi_w (v4i32, imm0_31); +v2i64 __builtin_msa_addvi_d (v2i64, imm0_31); + +v16u8 __builtin_msa_and_v (v16u8, v16u8); + +v16u8 __builtin_msa_andi_b (v16u8, imm0_255); + +v16i8 __builtin_msa_asub_s_b (v16i8, v16i8); +v8i16 __builtin_msa_asub_s_h (v8i16, v8i16); +v4i32 __builtin_msa_asub_s_w (v4i32, v4i32); +v2i64 __builtin_msa_asub_s_d (v2i64, v2i64); + +v16u8 __builtin_msa_asub_u_b (v16u8, v16u8); +v8u16 __builtin_msa_asub_u_h (v8u16, v8u16); +v4u32 __builtin_msa_asub_u_w (v4u32, v4u32); +v2u64 __builtin_msa_asub_u_d (v2u64, v2u64); + +v16i8 __builtin_msa_ave_s_b (v16i8, v16i8); +v8i16 __builtin_msa_ave_s_h (v8i16, v8i16); +v4i32 __builtin_msa_ave_s_w (v4i32, v4i32); +v2i64 __builtin_msa_ave_s_d (v2i64, v2i64); + +v16u8 __builtin_msa_ave_u_b (v16u8, v16u8); +v8u16 __builtin_msa_ave_u_h (v8u16, v8u16); +v4u32 __builtin_msa_ave_u_w (v4u32, v4u32); +v2u64 __builtin_msa_ave_u_d (v2u64, v2u64); + +v16i8 __builtin_msa_aver_s_b (v16i8, v16i8); +v8i16 __builtin_msa_aver_s_h (v8i16, v8i16); +v4i32 __builtin_msa_aver_s_w (v4i32, v4i32); +v2i64 __builtin_msa_aver_s_d (v2i64, v2i64); + +v16u8 __builtin_msa_aver_u_b (v16u8, v16u8); +v8u16 __builtin_msa_aver_u_h (v8u16, v8u16); +v4u32 __builtin_msa_aver_u_w (v4u32, v4u32); +v2u64 __builtin_msa_aver_u_d (v2u64, v2u64); + +v16u8 __builtin_msa_bclr_b (v16u8, v16u8); +v8u16 __builtin_msa_bclr_h (v8u16, v8u16); +v4u32 __builtin_msa_bclr_w (v4u32, v4u32); +v2u64 __builtin_msa_bclr_d (v2u64, v2u64); + +v16u8 __builtin_msa_bclri_b (v16u8, imm0_7); +v8u16 __builtin_msa_bclri_h (v8u16, imm0_15); +v4u32 __builtin_msa_bclri_w (v4u32, imm0_31); +v2u64 __builtin_msa_bclri_d (v2u64, imm0_63); + +v16u8 __builtin_msa_binsl_b (v16u8, v16u8, v16u8); +v8u16 __builtin_msa_binsl_h (v8u16, v8u16, v8u16); +v4u32 __builtin_msa_binsl_w (v4u32, v4u32, v4u32); +v2u64 __builtin_msa_binsl_d (v2u64, v2u64, v2u64); + +v16u8 __builtin_msa_binsli_b (v16u8, v16u8, imm0_7); +v8u16 __builtin_msa_binsli_h (v8u16, v8u16, imm0_15); +v4u32 __builtin_msa_binsli_w (v4u32, v4u32, imm0_31); +v2u64 __builtin_msa_binsli_d (v2u64, v2u64, imm0_63); + +v16u8 __builtin_msa_binsr_b (v16u8, v16u8, v16u8); +v8u16 __builtin_msa_binsr_h (v8u16, v8u16, v8u16); +v4u32 __builtin_msa_binsr_w (v4u32, v4u32, v4u32); +v2u64 __builtin_msa_binsr_d (v2u64, v2u64, v2u64); + +v16u8 __builtin_msa_binsri_b (v16u8, v16u8, imm0_7); +v8u16 __builtin_msa_binsri_h (v8u16, v8u16, imm0_15); +v4u32 __builtin_msa_binsri_w (v4u32, v4u32, imm0_31); +v2u64 __builtin_msa_binsri_d (v2u64, v2u64, imm0_63); + +v16u8 __builtin_msa_bmnz_v (v16u8, v16u8, v16u8); + +v16u8 __builtin_msa_bmnzi_b (v16u8, v16u8, 
imm0_255); + +v16u8 __builtin_msa_bmz_v (v16u8, v16u8, v16u8); + +v16u8 __builtin_msa_bmzi_b (v16u8, v16u8, imm0_255); + +v16u8 __builtin_msa_bneg_b (v16u8, v16u8); +v8u16 __builtin_msa_bneg_h (v8u16, v8u16); +v4u32 __builtin_msa_bneg_w (v4u32, v4u32); +v2u64 __builtin_msa_bneg_d (v2u64, v2u64); + +v16u8 __builtin_msa_bnegi_b (v16u8, imm0_7); +v8u16 __builtin_msa_bnegi_h (v8u16, imm0_15); +v4u32 __builtin_msa_bnegi_w (v4u32, imm0_31); +v2u64 __builtin_msa_bnegi_d (v2u64, imm0_63); + +i32 __builtin_msa_bnz_b (v16u8); +i32 __builtin_msa_bnz_h (v8u16); +i32 __builtin_msa_bnz_w (v4u32); +i32 __builtin_msa_bnz_d (v2u64); + +i32 __builtin_msa_bnz_v (v16u8); + +v16u8 __builtin_msa_bsel_v (v16u8, v16u8, v16u8); + +v16u8 __builtin_msa_bseli_b (v16u8, v16u8, imm0_255); + +v16u8 __builtin_msa_bset_b (v16u8, v16u8); +v8u16 __builtin_msa_bset_h (v8u16, v8u16); +v4u32 __builtin_msa_bset_w (v4u32, v4u32); +v2u64 __builtin_msa_bset_d (v2u64, v2u64); + +v16u8 __builtin_msa_bseti_b (v16u8, imm0_7); +v8u16 __builtin_msa_bseti_h (v8u16, imm0_15); +v4u32 __builtin_msa_bseti_w (v4u32, imm0_31); +v2u64 __builtin_msa_bseti_d (v2u64, imm0_63); + +i32 __builtin_msa_bz_b (v16u8); +i32 __builtin_msa_bz_h (v8u16); +i32 __builtin_msa_bz_w (v4u32); +i32 __builtin_msa_bz_d (v2u64); + +i32 __builtin_msa_bz_v (v16u8); + +v16i8 __builtin_msa_ceq_b (v16i8, v16i8); +v8i16 __builtin_msa_ceq_h (v8i16, v8i16); +v4i32 __builtin_msa_ceq_w (v4i32, v4i32); +v2i64 __builtin_msa_ceq_d (v2i64, v2i64); + +v16i8 __builtin_msa_ceqi_b (v16i8, imm_n16_15); +v8i16 __builtin_msa_ceqi_h (v8i16, imm_n16_15); +v4i32 __builtin_msa_ceqi_w (v4i32, imm_n16_15); +v2i64 __builtin_msa_ceqi_d (v2i64, imm_n16_15); + +i32 __builtin_msa_cfcmsa (imm0_31); + +v16i8 __builtin_msa_cle_s_b (v16i8, v16i8); +v8i16 __builtin_msa_cle_s_h (v8i16, v8i16); +v4i32 __builtin_msa_cle_s_w (v4i32, v4i32); +v2i64 __builtin_msa_cle_s_d (v2i64, v2i64); + +v16i8 __builtin_msa_cle_u_b (v16u8, v16u8); +v8i16 __builtin_msa_cle_u_h (v8u16, v8u16); +v4i32 __builtin_msa_cle_u_w (v4u32, v4u32); +v2i64 __builtin_msa_cle_u_d (v2u64, v2u64); + +v16i8 __builtin_msa_clei_s_b (v16i8, imm_n16_15); +v8i16 __builtin_msa_clei_s_h (v8i16, imm_n16_15); +v4i32 __builtin_msa_clei_s_w (v4i32, imm_n16_15); +v2i64 __builtin_msa_clei_s_d (v2i64, imm_n16_15); + +v16i8 __builtin_msa_clei_u_b (v16u8, imm0_31); +v8i16 __builtin_msa_clei_u_h (v8u16, imm0_31); +v4i32 __builtin_msa_clei_u_w (v4u32, imm0_31); +v2i64 __builtin_msa_clei_u_d (v2u64, imm0_31); + +v16i8 __builtin_msa_clt_s_b (v16i8, v16i8); +v8i16 __builtin_msa_clt_s_h (v8i16, v8i16); +v4i32 __builtin_msa_clt_s_w (v4i32, v4i32); +v2i64 __builtin_msa_clt_s_d (v2i64, v2i64); + +v16i8 __builtin_msa_clt_u_b (v16u8, v16u8); +v8i16 __builtin_msa_clt_u_h (v8u16, v8u16); +v4i32 __builtin_msa_clt_u_w (v4u32, v4u32); +v2i64 __builtin_msa_clt_u_d (v2u64, v2u64); + +v16i8 __builtin_msa_clti_s_b (v16i8, imm_n16_15); +v8i16 __builtin_msa_clti_s_h (v8i16, imm_n16_15); +v4i32 __builtin_msa_clti_s_w (v4i32, imm_n16_15); +v2i64 __builtin_msa_clti_s_d (v2i64, imm_n16_15); + +v16i8 __builtin_msa_clti_u_b (v16u8, imm0_31); +v8i16 __builtin_msa_clti_u_h (v8u16, imm0_31); +v4i32 __builtin_msa_clti_u_w (v4u32, imm0_31); +v2i64 __builtin_msa_clti_u_d (v2u64, imm0_31); + +i32 __builtin_msa_copy_s_b (v16i8, imm0_15); +i32 __builtin_msa_copy_s_h (v8i16, imm0_7); +i32 __builtin_msa_copy_s_w (v4i32, imm0_3); +i64 __builtin_msa_copy_s_d (v2i64, imm0_1); + +u32 __builtin_msa_copy_u_b (v16i8, imm0_15); +u32 __builtin_msa_copy_u_h (v8i16, imm0_7); +u32 __builtin_msa_copy_u_w 
(v4i32, imm0_3); +u64 __builtin_msa_copy_u_d (v2i64, imm0_1); + +void __builtin_msa_ctcmsa (imm0_31, i32); + +v16i8 __builtin_msa_div_s_b (v16i8, v16i8); +v8i16 __builtin_msa_div_s_h (v8i16, v8i16); +v4i32 __builtin_msa_div_s_w (v4i32, v4i32); +v2i64 __builtin_msa_div_s_d (v2i64, v2i64); + +v16u8 __builtin_msa_div_u_b (v16u8, v16u8); +v8u16 __builtin_msa_div_u_h (v8u16, v8u16); +v4u32 __builtin_msa_div_u_w (v4u32, v4u32); +v2u64 __builtin_msa_div_u_d (v2u64, v2u64); + +v8i16 __builtin_msa_dotp_s_h (v16i8, v16i8); +v4i32 __builtin_msa_dotp_s_w (v8i16, v8i16); +v2i64 __builtin_msa_dotp_s_d (v4i32, v4i32); + +v8u16 __builtin_msa_dotp_u_h (v16u8, v16u8); +v4u32 __builtin_msa_dotp_u_w (v8u16, v8u16); +v2u64 __builtin_msa_dotp_u_d (v4u32, v4u32); + +v8i16 __builtin_msa_dpadd_s_h (v8i16, v16i8, v16i8); +v4i32 __builtin_msa_dpadd_s_w (v4i32, v8i16, v8i16); +v2i64 __builtin_msa_dpadd_s_d (v2i64, v4i32, v4i32); + +v8u16 __builtin_msa_dpadd_u_h (v8u16, v16u8, v16u8); +v4u32 __builtin_msa_dpadd_u_w (v4u32, v8u16, v8u16); +v2u64 __builtin_msa_dpadd_u_d (v2u64, v4u32, v4u32); + +v8i16 __builtin_msa_dpsub_s_h (v8i16, v16i8, v16i8); +v4i32 __builtin_msa_dpsub_s_w (v4i32, v8i16, v8i16); +v2i64 __builtin_msa_dpsub_s_d (v2i64, v4i32, v4i32); + +v8i16 __builtin_msa_dpsub_u_h (v8i16, v16u8, v16u8); +v4i32 __builtin_msa_dpsub_u_w (v4i32, v8u16, v8u16); +v2i64 __builtin_msa_dpsub_u_d (v2i64, v4u32, v4u32); + +v4f32 __builtin_msa_fadd_w (v4f32, v4f32); +v2f64 __builtin_msa_fadd_d (v2f64, v2f64); + +v4i32 __builtin_msa_fcaf_w (v4f32, v4f32); +v2i64 __builtin_msa_fcaf_d (v2f64, v2f64); + +v4i32 __builtin_msa_fceq_w (v4f32, v4f32); +v2i64 __builtin_msa_fceq_d (v2f64, v2f64); + +v4i32 __builtin_msa_fclass_w (v4f32); +v2i64 __builtin_msa_fclass_d (v2f64); + +v4i32 __builtin_msa_fcle_w (v4f32, v4f32); +v2i64 __builtin_msa_fcle_d (v2f64, v2f64); + +v4i32 __builtin_msa_fclt_w (v4f32, v4f32); +v2i64 __builtin_msa_fclt_d (v2f64, v2f64); + +v4i32 __builtin_msa_fcne_w (v4f32, v4f32); +v2i64 __builtin_msa_fcne_d (v2f64, v2f64); + +v4i32 __builtin_msa_fcor_w (v4f32, v4f32); +v2i64 __builtin_msa_fcor_d (v2f64, v2f64); + +v4i32 __builtin_msa_fcueq_w (v4f32, v4f32); +v2i64 __builtin_msa_fcueq_d (v2f64, v2f64); + +v4i32 __builtin_msa_fcule_w (v4f32, v4f32); +v2i64 __builtin_msa_fcule_d (v2f64, v2f64); + +v4i32 __builtin_msa_fcult_w (v4f32, v4f32); +v2i64 __builtin_msa_fcult_d (v2f64, v2f64); + +v4i32 __builtin_msa_fcun_w (v4f32, v4f32); +v2i64 __builtin_msa_fcun_d (v2f64, v2f64); + +v4i32 __builtin_msa_fcune_w (v4f32, v4f32); +v2i64 __builtin_msa_fcune_d (v2f64, v2f64); + +v4f32 __builtin_msa_fdiv_w (v4f32, v4f32); +v2f64 __builtin_msa_fdiv_d (v2f64, v2f64); + +v8i16 __builtin_msa_fexdo_h (v4f32, v4f32); +v4f32 __builtin_msa_fexdo_w (v2f64, v2f64); + +v4f32 __builtin_msa_fexp2_w (v4f32, v4i32); +v2f64 __builtin_msa_fexp2_d (v2f64, v2i64); + +v4f32 __builtin_msa_fexupl_w (v8i16); +v2f64 __builtin_msa_fexupl_d (v4f32); + +v4f32 __builtin_msa_fexupr_w (v8i16); +v2f64 __builtin_msa_fexupr_d (v4f32); + +v4f32 __builtin_msa_ffint_s_w (v4i32); +v2f64 __builtin_msa_ffint_s_d (v2i64); + +v4f32 __builtin_msa_ffint_u_w (v4u32); +v2f64 __builtin_msa_ffint_u_d (v2u64); + +v4f32 __builtin_msa_ffql_w (v8i16); +v2f64 __builtin_msa_ffql_d (v4i32); + +v4f32 __builtin_msa_ffqr_w (v8i16); +v2f64 __builtin_msa_ffqr_d (v4i32); + +v16i8 __builtin_msa_fill_b (i32); +v8i16 __builtin_msa_fill_h (i32); +v4i32 __builtin_msa_fill_w (i32); +v2i64 __builtin_msa_fill_d (i64); + +v4f32 __builtin_msa_flog2_w (v4f32); +v2f64 __builtin_msa_flog2_d (v2f64); + +v4f32 
__builtin_msa_fmadd_w (v4f32, v4f32, v4f32); +v2f64 __builtin_msa_fmadd_d (v2f64, v2f64, v2f64); + +v4f32 __builtin_msa_fmax_w (v4f32, v4f32); +v2f64 __builtin_msa_fmax_d (v2f64, v2f64); + +v4f32 __builtin_msa_fmax_a_w (v4f32, v4f32); +v2f64 __builtin_msa_fmax_a_d (v2f64, v2f64); + +v4f32 __builtin_msa_fmin_w (v4f32, v4f32); +v2f64 __builtin_msa_fmin_d (v2f64, v2f64); + +v4f32 __builtin_msa_fmin_a_w (v4f32, v4f32); +v2f64 __builtin_msa_fmin_a_d (v2f64, v2f64); + +v4f32 __builtin_msa_fmsub_w (v4f32, v4f32, v4f32); +v2f64 __builtin_msa_fmsub_d (v2f64, v2f64, v2f64); + +v4f32 __builtin_msa_fmul_w (v4f32, v4f32); +v2f64 __builtin_msa_fmul_d (v2f64, v2f64); + +v4f32 __builtin_msa_frint_w (v4f32); +v2f64 __builtin_msa_frint_d (v2f64); + +v4f32 __builtin_msa_frcp_w (v4f32); +v2f64 __builtin_msa_frcp_d (v2f64); + +v4f32 __builtin_msa_frsqrt_w (v4f32); +v2f64 __builtin_msa_frsqrt_d (v2f64); + +v4i32 __builtin_msa_fsaf_w (v4f32, v4f32); +v2i64 __builtin_msa_fsaf_d (v2f64, v2f64); + +v4i32 __builtin_msa_fseq_w (v4f32, v4f32); +v2i64 __builtin_msa_fseq_d (v2f64, v2f64); + +v4i32 __builtin_msa_fsle_w (v4f32, v4f32); +v2i64 __builtin_msa_fsle_d (v2f64, v2f64); + +v4i32 __builtin_msa_fslt_w (v4f32, v4f32); +v2i64 __builtin_msa_fslt_d (v2f64, v2f64); + +v4i32 __builtin_msa_fsne_w (v4f32, v4f32); +v2i64 __builtin_msa_fsne_d (v2f64, v2f64); + +v4i32 __builtin_msa_fsor_w (v4f32, v4f32); +v2i64 __builtin_msa_fsor_d (v2f64, v2f64); + +v4f32 __builtin_msa_fsqrt_w (v4f32); +v2f64 __builtin_msa_fsqrt_d (v2f64); + +v4f32 __builtin_msa_fsub_w (v4f32, v4f32); +v2f64 __builtin_msa_fsub_d (v2f64, v2f64); + +v4i32 __builtin_msa_fsueq_w (v4f32, v4f32); +v2i64 __builtin_msa_fsueq_d (v2f64, v2f64); + +v4i32 __builtin_msa_fsule_w (v4f32, v4f32); +v2i64 __builtin_msa_fsule_d (v2f64, v2f64); + +v4i32 __builtin_msa_fsult_w (v4f32, v4f32); +v2i64 __builtin_msa_fsult_d (v2f64, v2f64); + +v4i32 __builtin_msa_fsun_w (v4f32, v4f32); +v2i64 __builtin_msa_fsun_d (v2f64, v2f64); + +v4i32 __builtin_msa_fsune_w (v4f32, v4f32); +v2i64 __builtin_msa_fsune_d (v2f64, v2f64); + +v4i32 __builtin_msa_ftint_s_w (v4f32); +v2i64 __builtin_msa_ftint_s_d (v2f64); + +v4u32 __builtin_msa_ftint_u_w (v4f32); +v2u64 __builtin_msa_ftint_u_d (v2f64); + +v8i16 __builtin_msa_ftq_h (v4f32, v4f32); +v4i32 __builtin_msa_ftq_w (v2f64, v2f64); + +v4i32 __builtin_msa_ftrunc_s_w (v4f32); +v2i64 __builtin_msa_ftrunc_s_d (v2f64); + +v4u32 __builtin_msa_ftrunc_u_w (v4f32); +v2u64 __builtin_msa_ftrunc_u_d (v2f64); + +v8i16 __builtin_msa_hadd_s_h (v16i8, v16i8); +v4i32 __builtin_msa_hadd_s_w (v8i16, v8i16); +v2i64 __builtin_msa_hadd_s_d (v4i32, v4i32); + +v8u16 __builtin_msa_hadd_u_h (v16u8, v16u8); +v4u32 __builtin_msa_hadd_u_w (v8u16, v8u16); +v2u64 __builtin_msa_hadd_u_d (v4u32, v4u32); + +v8i16 __builtin_msa_hsub_s_h (v16i8, v16i8); +v4i32 __builtin_msa_hsub_s_w (v8i16, v8i16); +v2i64 __builtin_msa_hsub_s_d (v4i32, v4i32); + +v8i16 __builtin_msa_hsub_u_h (v16u8, v16u8); +v4i32 __builtin_msa_hsub_u_w (v8u16, v8u16); +v2i64 __builtin_msa_hsub_u_d (v4u32, v4u32); + +v16i8 __builtin_msa_ilvev_b (v16i8, v16i8); +v8i16 __builtin_msa_ilvev_h (v8i16, v8i16); +v4i32 __builtin_msa_ilvev_w (v4i32, v4i32); +v2i64 __builtin_msa_ilvev_d (v2i64, v2i64); + +v16i8 __builtin_msa_ilvl_b (v16i8, v16i8); +v8i16 __builtin_msa_ilvl_h (v8i16, v8i16); +v4i32 __builtin_msa_ilvl_w (v4i32, v4i32); +v2i64 __builtin_msa_ilvl_d (v2i64, v2i64); + +v16i8 __builtin_msa_ilvod_b (v16i8, v16i8); +v8i16 __builtin_msa_ilvod_h (v8i16, v8i16); +v4i32 __builtin_msa_ilvod_w (v4i32, v4i32); +v2i64 
__builtin_msa_ilvod_d (v2i64, v2i64); + +v16i8 __builtin_msa_ilvr_b (v16i8, v16i8); +v8i16 __builtin_msa_ilvr_h (v8i16, v8i16); +v4i32 __builtin_msa_ilvr_w (v4i32, v4i32); +v2i64 __builtin_msa_ilvr_d (v2i64, v2i64); + +v16i8 __builtin_msa_insert_b (v16i8, imm0_15, i32); +v8i16 __builtin_msa_insert_h (v8i16, imm0_7, i32); +v4i32 __builtin_msa_insert_w (v4i32, imm0_3, i32); +v2i64 __builtin_msa_insert_d (v2i64, imm0_1, i64); + +v16i8 __builtin_msa_insve_b (v16i8, imm0_15, v16i8); +v8i16 __builtin_msa_insve_h (v8i16, imm0_7, v8i16); +v4i32 __builtin_msa_insve_w (v4i32, imm0_3, v4i32); +v2i64 __builtin_msa_insve_d (v2i64, imm0_1, v2i64); + +v16i8 __builtin_msa_ld_b (void *, imm_n512_511); +v8i16 __builtin_msa_ld_h (void *, imm_n1024_1022); +v4i32 __builtin_msa_ld_w (void *, imm_n2048_2044); +v2i64 __builtin_msa_ld_d (void *, imm_n4096_4088); + +v16i8 __builtin_msa_ldi_b (imm_n512_511); +v8i16 __builtin_msa_ldi_h (imm_n512_511); +v4i32 __builtin_msa_ldi_w (imm_n512_511); +v2i64 __builtin_msa_ldi_d (imm_n512_511); + +v8i16 __builtin_msa_madd_q_h (v8i16, v8i16, v8i16); +v4i32 __builtin_msa_madd_q_w (v4i32, v4i32, v4i32); + +v8i16 __builtin_msa_maddr_q_h (v8i16, v8i16, v8i16); +v4i32 __builtin_msa_maddr_q_w (v4i32, v4i32, v4i32); + +v16i8 __builtin_msa_maddv_b (v16i8, v16i8, v16i8); +v8i16 __builtin_msa_maddv_h (v8i16, v8i16, v8i16); +v4i32 __builtin_msa_maddv_w (v4i32, v4i32, v4i32); +v2i64 __builtin_msa_maddv_d (v2i64, v2i64, v2i64); + +v16i8 __builtin_msa_max_a_b (v16i8, v16i8); +v8i16 __builtin_msa_max_a_h (v8i16, v8i16); +v4i32 __builtin_msa_max_a_w (v4i32, v4i32); +v2i64 __builtin_msa_max_a_d (v2i64, v2i64); + +v16i8 __builtin_msa_max_s_b (v16i8, v16i8); +v8i16 __builtin_msa_max_s_h (v8i16, v8i16); +v4i32 __builtin_msa_max_s_w (v4i32, v4i32); +v2i64 __builtin_msa_max_s_d (v2i64, v2i64); + +v16u8 __builtin_msa_max_u_b (v16u8, v16u8); +v8u16 __builtin_msa_max_u_h (v8u16, v8u16); +v4u32 __builtin_msa_max_u_w (v4u32, v4u32); +v2u64 __builtin_msa_max_u_d (v2u64, v2u64); + +v16i8 __builtin_msa_maxi_s_b (v16i8, imm_n16_15); +v8i16 __builtin_msa_maxi_s_h (v8i16, imm_n16_15); +v4i32 __builtin_msa_maxi_s_w (v4i32, imm_n16_15); +v2i64 __builtin_msa_maxi_s_d (v2i64, imm_n16_15); + +v16u8 __builtin_msa_maxi_u_b (v16u8, imm0_31); +v8u16 __builtin_msa_maxi_u_h (v8u16, imm0_31); +v4u32 __builtin_msa_maxi_u_w (v4u32, imm0_31); +v2u64 __builtin_msa_maxi_u_d (v2u64, imm0_31); + +v16i8 __builtin_msa_min_a_b (v16i8, v16i8); +v8i16 __builtin_msa_min_a_h (v8i16, v8i16); +v4i32 __builtin_msa_min_a_w (v4i32, v4i32); +v2i64 __builtin_msa_min_a_d (v2i64, v2i64); + +v16i8 __builtin_msa_min_s_b (v16i8, v16i8); +v8i16 __builtin_msa_min_s_h (v8i16, v8i16); +v4i32 __builtin_msa_min_s_w (v4i32, v4i32); +v2i64 __builtin_msa_min_s_d (v2i64, v2i64); + +v16u8 __builtin_msa_min_u_b (v16u8, v16u8); +v8u16 __builtin_msa_min_u_h (v8u16, v8u16); +v4u32 __builtin_msa_min_u_w (v4u32, v4u32); +v2u64 __builtin_msa_min_u_d (v2u64, v2u64); + +v16i8 __builtin_msa_mini_s_b (v16i8, imm_n16_15); +v8i16 __builtin_msa_mini_s_h (v8i16, imm_n16_15); +v4i32 __builtin_msa_mini_s_w (v4i32, imm_n16_15); +v2i64 __builtin_msa_mini_s_d (v2i64, imm_n16_15); + +v16u8 __builtin_msa_mini_u_b (v16u8, imm0_31); +v8u16 __builtin_msa_mini_u_h (v8u16, imm0_31); +v4u32 __builtin_msa_mini_u_w (v4u32, imm0_31); +v2u64 __builtin_msa_mini_u_d (v2u64, imm0_31); + +v16i8 __builtin_msa_mod_s_b (v16i8, v16i8); +v8i16 __builtin_msa_mod_s_h (v8i16, v8i16); +v4i32 __builtin_msa_mod_s_w (v4i32, v4i32); +v2i64 __builtin_msa_mod_s_d (v2i64, v2i64); + +v16u8 
__builtin_msa_mod_u_b (v16u8, v16u8); +v8u16 __builtin_msa_mod_u_h (v8u16, v8u16); +v4u32 __builtin_msa_mod_u_w (v4u32, v4u32); +v2u64 __builtin_msa_mod_u_d (v2u64, v2u64); + +v16i8 __builtin_msa_move_v (v16i8); + +v8i16 __builtin_msa_msub_q_h (v8i16, v8i16, v8i16); +v4i32 __builtin_msa_msub_q_w (v4i32, v4i32, v4i32); + +v8i16 __builtin_msa_msubr_q_h (v8i16, v8i16, v8i16); +v4i32 __builtin_msa_msubr_q_w (v4i32, v4i32, v4i32); + +v16i8 __builtin_msa_msubv_b (v16i8, v16i8, v16i8); +v8i16 __builtin_msa_msubv_h (v8i16, v8i16, v8i16); +v4i32 __builtin_msa_msubv_w (v4i32, v4i32, v4i32); +v2i64 __builtin_msa_msubv_d (v2i64, v2i64, v2i64); + +v8i16 __builtin_msa_mul_q_h (v8i16, v8i16); +v4i32 __builtin_msa_mul_q_w (v4i32, v4i32); + +v8i16 __builtin_msa_mulr_q_h (v8i16, v8i16); +v4i32 __builtin_msa_mulr_q_w (v4i32, v4i32); + +v16i8 __builtin_msa_mulv_b (v16i8, v16i8); +v8i16 __builtin_msa_mulv_h (v8i16, v8i16); +v4i32 __builtin_msa_mulv_w (v4i32, v4i32); +v2i64 __builtin_msa_mulv_d (v2i64, v2i64); + +v16i8 __builtin_msa_nloc_b (v16i8); +v8i16 __builtin_msa_nloc_h (v8i16); +v4i32 __builtin_msa_nloc_w (v4i32); +v2i64 __builtin_msa_nloc_d (v2i64); + +v16i8 __builtin_msa_nlzc_b (v16i8); +v8i16 __builtin_msa_nlzc_h (v8i16); +v4i32 __builtin_msa_nlzc_w (v4i32); +v2i64 __builtin_msa_nlzc_d (v2i64); + +v16u8 __builtin_msa_nor_v (v16u8, v16u8); + +v16u8 __builtin_msa_nori_b (v16u8, imm0_255); + +v16u8 __builtin_msa_or_v (v16u8, v16u8); + +v16u8 __builtin_msa_ori_b (v16u8, imm0_255); + +v16i8 __builtin_msa_pckev_b (v16i8, v16i8); +v8i16 __builtin_msa_pckev_h (v8i16, v8i16); +v4i32 __builtin_msa_pckev_w (v4i32, v4i32); +v2i64 __builtin_msa_pckev_d (v2i64, v2i64); + +v16i8 __builtin_msa_pckod_b (v16i8, v16i8); +v8i16 __builtin_msa_pckod_h (v8i16, v8i16); +v4i32 __builtin_msa_pckod_w (v4i32, v4i32); +v2i64 __builtin_msa_pckod_d (v2i64, v2i64); + +v16i8 __builtin_msa_pcnt_b (v16i8); +v8i16 __builtin_msa_pcnt_h (v8i16); +v4i32 __builtin_msa_pcnt_w (v4i32); +v2i64 __builtin_msa_pcnt_d (v2i64); + +v16i8 __builtin_msa_sat_s_b (v16i8, imm0_7); +v8i16 __builtin_msa_sat_s_h (v8i16, imm0_15); +v4i32 __builtin_msa_sat_s_w (v4i32, imm0_31); +v2i64 __builtin_msa_sat_s_d (v2i64, imm0_63); + +v16u8 __builtin_msa_sat_u_b (v16u8, imm0_7); +v8u16 __builtin_msa_sat_u_h (v8u16, imm0_15); +v4u32 __builtin_msa_sat_u_w (v4u32, imm0_31); +v2u64 __builtin_msa_sat_u_d (v2u64, imm0_63); + +v16i8 __builtin_msa_shf_b (v16i8, imm0_255); +v8i16 __builtin_msa_shf_h (v8i16, imm0_255); +v4i32 __builtin_msa_shf_w (v4i32, imm0_255); + +v16i8 __builtin_msa_sld_b (v16i8, v16i8, i32); +v8i16 __builtin_msa_sld_h (v8i16, v8i16, i32); +v4i32 __builtin_msa_sld_w (v4i32, v4i32, i32); +v2i64 __builtin_msa_sld_d (v2i64, v2i64, i32); + +v16i8 __builtin_msa_sldi_b (v16i8, v16i8, imm0_15); +v8i16 __builtin_msa_sldi_h (v8i16, v8i16, imm0_7); +v4i32 __builtin_msa_sldi_w (v4i32, v4i32, imm0_3); +v2i64 __builtin_msa_sldi_d (v2i64, v2i64, imm0_1); + +v16i8 __builtin_msa_sll_b (v16i8, v16i8); +v8i16 __builtin_msa_sll_h (v8i16, v8i16); +v4i32 __builtin_msa_sll_w (v4i32, v4i32); +v2i64 __builtin_msa_sll_d (v2i64, v2i64); + +v16i8 __builtin_msa_slli_b (v16i8, imm0_7); +v8i16 __builtin_msa_slli_h (v8i16, imm0_15); +v4i32 __builtin_msa_slli_w (v4i32, imm0_31); +v2i64 __builtin_msa_slli_d (v2i64, imm0_63); + +v16i8 __builtin_msa_splat_b (v16i8, i32); +v8i16 __builtin_msa_splat_h (v8i16, i32); +v4i32 __builtin_msa_splat_w (v4i32, i32); +v2i64 __builtin_msa_splat_d (v2i64, i32); + +v16i8 __builtin_msa_splati_b (v16i8, imm0_15); +v8i16 __builtin_msa_splati_h (v8i16, 
imm0_7); +v4i32 __builtin_msa_splati_w (v4i32, imm0_3); +v2i64 __builtin_msa_splati_d (v2i64, imm0_1); + +v16i8 __builtin_msa_sra_b (v16i8, v16i8); +v8i16 __builtin_msa_sra_h (v8i16, v8i16); +v4i32 __builtin_msa_sra_w (v4i32, v4i32); +v2i64 __builtin_msa_sra_d (v2i64, v2i64); + +v16i8 __builtin_msa_srai_b (v16i8, imm0_7); +v8i16 __builtin_msa_srai_h (v8i16, imm0_15); +v4i32 __builtin_msa_srai_w (v4i32, imm0_31); +v2i64 __builtin_msa_srai_d (v2i64, imm0_63); + +v16i8 __builtin_msa_srar_b (v16i8, v16i8); +v8i16 __builtin_msa_srar_h (v8i16, v8i16); +v4i32 __builtin_msa_srar_w (v4i32, v4i32); +v2i64 __builtin_msa_srar_d (v2i64, v2i64); + +v16i8 __builtin_msa_srari_b (v16i8, imm0_7); +v8i16 __builtin_msa_srari_h (v8i16, imm0_15); +v4i32 __builtin_msa_srari_w (v4i32, imm0_31); +v2i64 __builtin_msa_srari_d (v2i64, imm0_63); + +v16i8 __builtin_msa_srl_b (v16i8, v16i8); +v8i16 __builtin_msa_srl_h (v8i16, v8i16); +v4i32 __builtin_msa_srl_w (v4i32, v4i32); +v2i64 __builtin_msa_srl_d (v2i64, v2i64); + +v16i8 __builtin_msa_srli_b (v16i8, imm0_7); +v8i16 __builtin_msa_srli_h (v8i16, imm0_15); +v4i32 __builtin_msa_srli_w (v4i32, imm0_31); +v2i64 __builtin_msa_srli_d (v2i64, imm0_63); + +v16i8 __builtin_msa_srlr_b (v16i8, v16i8); +v8i16 __builtin_msa_srlr_h (v8i16, v8i16); +v4i32 __builtin_msa_srlr_w (v4i32, v4i32); +v2i64 __builtin_msa_srlr_d (v2i64, v2i64); + +v16i8 __builtin_msa_srlri_b (v16i8, imm0_7); +v8i16 __builtin_msa_srlri_h (v8i16, imm0_15); +v4i32 __builtin_msa_srlri_w (v4i32, imm0_31); +v2i64 __builtin_msa_srlri_d (v2i64, imm0_63); + +void __builtin_msa_st_b (v16i8, void *, imm_n512_511); +void __builtin_msa_st_h (v8i16, void *, imm_n1024_1022); +void __builtin_msa_st_w (v4i32, void *, imm_n2048_2044); +void __builtin_msa_st_d (v2i64, void *, imm_n4096_4088); + +v16i8 __builtin_msa_subs_s_b (v16i8, v16i8); +v8i16 __builtin_msa_subs_s_h (v8i16, v8i16); +v4i32 __builtin_msa_subs_s_w (v4i32, v4i32); +v2i64 __builtin_msa_subs_s_d (v2i64, v2i64); + +v16u8 __builtin_msa_subs_u_b (v16u8, v16u8); +v8u16 __builtin_msa_subs_u_h (v8u16, v8u16); +v4u32 __builtin_msa_subs_u_w (v4u32, v4u32); +v2u64 __builtin_msa_subs_u_d (v2u64, v2u64); + +v16u8 __builtin_msa_subsus_u_b (v16u8, v16i8); +v8u16 __builtin_msa_subsus_u_h (v8u16, v8i16); +v4u32 __builtin_msa_subsus_u_w (v4u32, v4i32); +v2u64 __builtin_msa_subsus_u_d (v2u64, v2i64); + +v16i8 __builtin_msa_subsuu_s_b (v16u8, v16u8); +v8i16 __builtin_msa_subsuu_s_h (v8u16, v8u16); +v4i32 __builtin_msa_subsuu_s_w (v4u32, v4u32); +v2i64 __builtin_msa_subsuu_s_d (v2u64, v2u64); + +v16i8 __builtin_msa_subv_b (v16i8, v16i8); +v8i16 __builtin_msa_subv_h (v8i16, v8i16); +v4i32 __builtin_msa_subv_w (v4i32, v4i32); +v2i64 __builtin_msa_subv_d (v2i64, v2i64); + +v16i8 __builtin_msa_subvi_b (v16i8, imm0_31); +v8i16 __builtin_msa_subvi_h (v8i16, imm0_31); +v4i32 __builtin_msa_subvi_w (v4i32, imm0_31); +v2i64 __builtin_msa_subvi_d (v2i64, imm0_31); + +v16i8 __builtin_msa_vshf_b (v16i8, v16i8, v16i8); +v8i16 __builtin_msa_vshf_h (v8i16, v8i16, v8i16); +v4i32 __builtin_msa_vshf_w (v4i32, v4i32, v4i32); +v2i64 __builtin_msa_vshf_d (v2i64, v2i64, v2i64); + +v16u8 __builtin_msa_xor_v (v16u8, v16u8); + +v16u8 __builtin_msa_xori_b (v16u8, imm0_255); diff --git a/library/stdarch/crates/stdarch-verify/src/lib.rs b/library/stdarch/crates/stdarch-verify/src/lib.rs new file mode 100644 index 0000000000000..c81f5f45bcce4 --- /dev/null +++ b/library/stdarch/crates/stdarch-verify/src/lib.rs @@ -0,0 +1,583 @@ +#![deny(rust_2018_idioms)] +#[macro_use] +extern crate quote; +#[macro_use] 
+extern crate syn; + +use proc_macro::TokenStream; +use std::{fs::File, io::Read, path::Path}; +use syn::ext::IdentExt; +use syn::parse::Parser as _; + +#[proc_macro] +pub fn x86_functions(input: TokenStream) -> TokenStream { + functions(input, &["core_arch/src/x86", "core_arch/src/x86_64"]) +} + +#[proc_macro] +pub fn arm_functions(input: TokenStream) -> TokenStream { + functions( + input, + &[ + "core_arch/src/arm", + "core_arch/src/aarch64", + "core_arch/src/arm_shared/neon", + ], + ) +} + +#[proc_macro] +pub fn mips_functions(input: TokenStream) -> TokenStream { + functions(input, &["core_arch/src/mips"]) +} + +fn functions(input: TokenStream, dirs: &[&str]) -> TokenStream { + let dir = Path::new(env!("CARGO_MANIFEST_DIR")); + let root = dir.parent().expect("root-dir not found"); + + let mut files = Vec::new(); + for dir in dirs { + walk(&root.join(dir), &mut files); + } + assert!(!files.is_empty()); + + let mut functions = Vec::new(); + for &mut (ref mut file, ref path) in &mut files { + for mut item in file.items.drain(..) { + match item { + syn::Item::Fn(f) => functions.push((f, path)), + syn::Item::Mod(ref mut m) => { + if let Some(ref mut m) = m.content { + for i in m.1.drain(..) { + if let syn::Item::Fn(f) = i { + functions.push((f, path)) + } + } + } + } + _ => (), + } + } + } + assert!(!functions.is_empty()); + + let mut tests = std::collections::HashSet::::new(); + for f in &functions { + let id = format!("{}", f.0.sig.ident); + if id.starts_with("test_") { + tests.insert(id); + } + } + assert!(!tests.is_empty()); + + functions.retain(|(f, _)| matches!(f.vis, syn::Visibility::Public(_))); + assert!(!functions.is_empty()); + + let input = proc_macro2::TokenStream::from(input); + + let functions = functions + .iter() + .map(|&(ref f, path)| { + let name = &f.sig.ident; + // println!("{name}"); + let mut arguments = Vec::new(); + let mut const_arguments = Vec::new(); + for input in f.sig.inputs.iter() { + let ty = match *input { + syn::FnArg::Typed(ref c) => &c.ty, + _ => panic!("invalid argument on {name}"), + }; + arguments.push(to_type(ty)); + } + for generic in f.sig.generics.params.iter() { + match *generic { + syn::GenericParam::Const(ref c) => const_arguments.push(to_type(&c.ty)), + syn::GenericParam::Type(ref _t) => (), + _ => panic!("invalid generic argument on {name}"), + }; + } + let ret = match f.sig.output { + syn::ReturnType::Default => quote! { None }, + syn::ReturnType::Type(_, ref t) => { + let ty = to_type(t); + quote! { Some(#ty) } + } + }; + let instrs = find_instrs(&f.attrs); + let target_feature = if let Some(i) = find_target_feature(&f.attrs) { + quote! { Some(#i) } + } else { + quote! { None } + }; + + let required_const = find_required_const("rustc_args_required_const", &f.attrs); + let mut legacy_const_generics = + find_required_const("rustc_legacy_const_generics", &f.attrs); + if !required_const.is_empty() && !legacy_const_generics.is_empty() { + panic!( + "Can't have both #[rustc_args_required_const] and \ + #[rustc_legacy_const_generics]" + ); + } + + // The list of required consts, used to verify the arguments, comes from either the + // `rustc_args_required_const` or the `rustc_legacy_const_generics` attribute. 
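As a side note on the splice step that follows, here is a standalone sketch (not part of the patch) of what the index handling amounts to: the positions recorded in `#[rustc_legacy_const_generics(...)]` give each const generic's slot in the original C-style signature, so its type is inserted back into the positional argument list at that index. The intrinsic named in `main` is hypothetical.

```rust
fn splice_const_args(
    mut args: Vec<&'static str>,
    mut consts: Vec<(usize, &'static str)>,
) -> Vec<&'static str> {
    // Ascending order makes each `insert` land at its final position.
    consts.sort_by_key(|&(idx, _)| idx);
    for (idx, ty) in consts {
        args.insert(idx, ty);
    }
    args
}

fn main() {
    // e.g. a hypothetical `_mm_example<const IMM8: i32>(a: __m128i, b: __m128i)`
    // carrying `#[rustc_legacy_const_generics(2)]`: IMM8 was argument 2 in the
    // original signature, so its type slots back in at index 2.
    let spliced = splice_const_args(vec!["&M128I", "&M128I"], vec![(2, "&I32")]);
    assert_eq!(spliced, ["&M128I", "&M128I", "&I32"]);
}
```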
+ let required_const = if required_const.is_empty() { + legacy_const_generics.clone() + } else { + required_const + }; + + legacy_const_generics.sort(); + for (idx, ty) in legacy_const_generics + .into_iter() + .zip(const_arguments.into_iter()) + { + arguments.insert(idx, ty); + } + + // strip leading underscore from fn name when building a test + // _mm_foo -> mm_foo such that the test name is test_mm_foo. + let test_name_string = format!("{name}"); + let mut test_name_id = test_name_string.as_str(); + while test_name_id.starts_with('_') { + test_name_id = &test_name_id[1..]; + } + let has_test = tests.contains(&format!("test_{test_name_id}")); + + let doc = find_doc(&f.attrs); + + quote! { + Function { + name: stringify!(#name), + arguments: &[#(#arguments),*], + ret: #ret, + target_feature: #target_feature, + instrs: &[#(#instrs),*], + file: stringify!(#path), + required_const: &[#(#required_const),*], + has_test: #has_test, + doc: #doc + } + } + }) + .collect::>(); + + let ret = quote! { #input: &[Function] = &[#(#functions),*]; }; + // println!("{ret}"); + ret.into() +} + +fn to_type(t: &syn::Type) -> proc_macro2::TokenStream { + match *t { + syn::Type::Path(ref p) => match extract_path_ident(&p.path).to_string().as_ref() { + // x86 ... + "__m128" => quote! { &M128 }, + "__m128bh" => quote! { &M128BH }, + "__m128d" => quote! { &M128D }, + "__m128h" => quote! { &M128H }, + "__m128i" => quote! { &M128I }, + "__m256" => quote! { &M256 }, + "__m256bh" => quote! { &M256BH }, + "__m256d" => quote! { &M256D }, + "__m256h" => quote! { &M256H }, + "__m256i" => quote! { &M256I }, + "__m512" => quote! { &M512 }, + "__m512bh" => quote! { &M512BH }, + "__m512d" => quote! { &M512D }, + "__m512h" => quote! { &M512H }, + "__m512i" => quote! { &M512I }, + "__mmask8" => quote! { &MMASK8 }, + "__mmask16" => quote! { &MMASK16 }, + "__mmask32" => quote! { &MMASK32 }, + "__mmask64" => quote! { &MMASK64 }, + "_MM_CMPINT_ENUM" => quote! { &MM_CMPINT_ENUM }, + "_MM_MANTISSA_NORM_ENUM" => quote! { &MM_MANTISSA_NORM_ENUM }, + "_MM_MANTISSA_SIGN_ENUM" => quote! { &MM_MANTISSA_SIGN_ENUM }, + "_MM_PERM_ENUM" => quote! { &MM_PERM_ENUM }, + "bool" => quote! { &BOOL }, + "bf16" => quote! { &BF16 }, + "f16" => quote! { &F16 }, + "f32" => quote! { &F32 }, + "f64" => quote! { &F64 }, + "i16" => quote! { &I16 }, + "i32" => quote! { &I32 }, + "i64" => quote! { &I64 }, + "i8" => quote! { &I8 }, + "u16" => quote! { &U16 }, + "u32" => quote! { &U32 }, + "u64" => quote! { &U64 }, + "u128" => quote! { &U128 }, + "usize" => quote! { &USIZE }, + "u8" => quote! { &U8 }, + "p8" => quote! { &P8 }, + "p16" => quote! { &P16 }, + "Ordering" => quote! { &ORDERING }, + "CpuidResult" => quote! { &CPUID }, + + // arm ... + "int8x4_t" => quote! { &I8X4 }, + "int8x8_t" => quote! { &I8X8 }, + "int8x8x2_t" => quote! { &I8X8X2 }, + "int8x8x3_t" => quote! { &I8X8X3 }, + "int8x8x4_t" => quote! { &I8X8X4 }, + "int8x16x2_t" => quote! { &I8X16X2 }, + "int8x16x3_t" => quote! { &I8X16X3 }, + "int8x16x4_t" => quote! { &I8X16X4 }, + "int8x16_t" => quote! { &I8X16 }, + "int16x2_t" => quote! { &I16X2 }, + "int16x4_t" => quote! { &I16X4 }, + "int16x4x2_t" => quote! { &I16X4X2 }, + "int16x4x3_t" => quote! { &I16X4X3 }, + "int16x4x4_t" => quote! { &I16X4X4 }, + "int16x8_t" => quote! { &I16X8 }, + "int16x8x2_t" => quote! { &I16X8X2 }, + "int16x8x3_t" => quote! { &I16X8X3 }, + "int16x8x4_t" => quote! { &I16X8X4 }, + "int32x2_t" => quote! { &I32X2 }, + "int32x2x2_t" => quote! { &I32X2X2 }, + "int32x2x3_t" => quote! { &I32X2X3 }, + "int32x2x4_t" => quote! 
{ &I32X2X4 }, + "int32x4_t" => quote! { &I32X4 }, + "int32x4x2_t" => quote! { &I32X4X2 }, + "int32x4x3_t" => quote! { &I32X4X3 }, + "int32x4x4_t" => quote! { &I32X4X4 }, + "int64x1_t" => quote! { &I64X1 }, + "int64x1x2_t" => quote! { &I64X1X2 }, + "int64x1x3_t" => quote! { &I64X1X3 }, + "int64x1x4_t" => quote! { &I64X1X4 }, + "int64x2_t" => quote! { &I64X2 }, + "int64x2x2_t" => quote! { &I64X2X2 }, + "int64x2x3_t" => quote! { &I64X2X3 }, + "int64x2x4_t" => quote! { &I64X2X4 }, + "uint8x8_t" => quote! { &U8X8 }, + "uint8x4_t" => quote! { &U8X4 }, + "uint8x8x2_t" => quote! { &U8X8X2 }, + "uint8x16x2_t" => quote! { &U8X16X2 }, + "uint8x16x3_t" => quote! { &U8X16X3 }, + "uint8x16x4_t" => quote! { &U8X16X4 }, + "uint8x8x3_t" => quote! { &U8X8X3 }, + "uint8x8x4_t" => quote! { &U8X8X4 }, + "uint8x16_t" => quote! { &U8X16 }, + "uint16x4_t" => quote! { &U16X4 }, + "uint16x4x2_t" => quote! { &U16X4X2 }, + "uint16x4x3_t" => quote! { &U16X4X3 }, + "uint16x4x4_t" => quote! { &U16X4X4 }, + "uint16x8_t" => quote! { &U16X8 }, + "uint16x8x2_t" => quote! { &U16X8X2 }, + "uint16x8x3_t" => quote! { &U16X8X3 }, + "uint16x8x4_t" => quote! { &U16X8X4 }, + "uint32x2_t" => quote! { &U32X2 }, + "uint32x2x2_t" => quote! { &U32X2X2 }, + "uint32x2x3_t" => quote! { &U32X2X3 }, + "uint32x2x4_t" => quote! { &U32X2X4 }, + "uint32x4_t" => quote! { &U32X4 }, + "uint32x4x2_t" => quote! { &U32X4X2 }, + "uint32x4x3_t" => quote! { &U32X4X3 }, + "uint32x4x4_t" => quote! { &U32X4X4 }, + "uint64x1_t" => quote! { &U64X1 }, + "uint64x1x2_t" => quote! { &U64X1X2 }, + "uint64x1x3_t" => quote! { &U64X1X3 }, + "uint64x1x4_t" => quote! { &U64X1X4 }, + "uint64x2_t" => quote! { &U64X2 }, + "uint64x2x2_t" => quote! { &U64X2X2 }, + "uint64x2x3_t" => quote! { &U64X2X3 }, + "uint64x2x4_t" => quote! { &U64X2X4 }, + "float16x2_t" => quote! { &F16X2 }, + "float16x4_t" => quote! { &F16X4 }, + "float16x4x2_t" => quote! { &F16X4X2 }, + "float16x4x3_t" => quote! { &F16X4X3 }, + "float16x4x4_t" => quote! { &F16X4X4 }, + "float16x8_t" => quote! { &F16X8 }, + "float16x8x2_t" => quote! { &F16X8X2 }, + "float16x8x3_t" => quote! { &F16X8X3 }, + "float16x8x4_t" => quote! { &F16X8X4 }, + "float32x2_t" => quote! { &F32X2 }, + "float32x2x2_t" => quote! { &F32X2X2 }, + "float32x2x3_t" => quote! { &F32X2X3 }, + "float32x2x4_t" => quote! { &F32X2X4 }, + "float32x4_t" => quote! { &F32X4 }, + "float32x4x2_t" => quote! { &F32X4X2 }, + "float32x4x3_t" => quote! { &F32X4X3 }, + "float32x4x4_t" => quote! { &F32X4X4 }, + "float64x1_t" => quote! { &F64X1 }, + "float64x1x2_t" => quote! { &F64X1X2 }, + "float64x1x3_t" => quote! { &F64X1X3 }, + "float64x1x4_t" => quote! { &F64X1X4 }, + "float64x2_t" => quote! { &F64X2 }, + "float64x2x2_t" => quote! { &F64X2X2 }, + "float64x2x3_t" => quote! { &F64X2X3 }, + "float64x2x4_t" => quote! { &F64X2X4 }, + "poly8x8_t" => quote! { &POLY8X8 }, + "poly8x8x2_t" => quote! { &POLY8X8X2 }, + "poly8x8x3_t" => quote! { &POLY8X8X3 }, + "poly8x8x4_t" => quote! { &POLY8X8X4 }, + "poly8x16x2_t" => quote! { &POLY8X16X2 }, + "poly8x16x3_t" => quote! { &POLY8X16X3 }, + "poly8x16x4_t" => quote! { &POLY8X16X4 }, + "p64" => quote! { &P64 }, + "poly64x1_t" => quote! { &POLY64X1 }, + "poly64x2_t" => quote! { &POLY64X2 }, + "poly8x16_t" => quote! { &POLY8X16 }, + "poly16x4_t" => quote! { &POLY16X4 }, + "poly16x4x2_t" => quote! { &P16X4X2 }, + "poly16x4x3_t" => quote! { &P16X4X3 }, + "poly16x4x4_t" => quote! { &P16X4X4 }, + "poly16x8_t" => quote! { &POLY16X8 }, + "poly16x8x2_t" => quote! { &P16X8X2 }, + "poly16x8x3_t" => quote! 
{ &P16X8X3 }, + "poly16x8x4_t" => quote! { &P16X8X4 }, + "poly64x1x2_t" => quote! { &P64X1X2 }, + "poly64x1x3_t" => quote! { &P64X1X3 }, + "poly64x1x4_t" => quote! { &P64X1X4 }, + "poly64x2x2_t" => quote! { &P64X2X2 }, + "poly64x2x3_t" => quote! { &P64X2X3 }, + "poly64x2x4_t" => quote! { &P64X2X4 }, + "p128" => quote! { &P128 }, + + "v16i8" => quote! { &v16i8 }, + "v8i16" => quote! { &v8i16 }, + "v4i32" => quote! { &v4i32 }, + "v2i64" => quote! { &v2i64 }, + "v16u8" => quote! { &v16u8 }, + "v8u16" => quote! { &v8u16 }, + "v4u32" => quote! { &v4u32 }, + "v2u64" => quote! { &v2u64 }, + "v8f16" => quote! { &v8f16 }, + "v4f32" => quote! { &v4f32 }, + "v2f64" => quote! { &v2f64 }, + + // Generic types + "T" => quote! { &GENERICT }, + "U" => quote! { &GENERICU }, + + s => panic!("unsupported type: \"{s}\""), + }, + syn::Type::Ptr(syn::TypePtr { + ref elem, + ref mutability, + .. + }) + | syn::Type::Reference(syn::TypeReference { + ref elem, + ref mutability, + .. + }) => { + // Both pointers and references can have a mut token (*mut and &mut) + if mutability.is_some() { + let tokens = to_type(elem); + quote! { &Type::MutPtr(#tokens) } + } else { + // If they don't (*const or &) then they are "const" + let tokens = to_type(elem); + quote! { &Type::ConstPtr(#tokens) } + } + } + + syn::Type::Slice(_) => panic!("unsupported slice"), + syn::Type::Array(_) => panic!("unsupported array"), + syn::Type::Tuple(_) => quote! { &TUPLE }, + syn::Type::Never(_) => quote! { &NEVER }, + _ => panic!("unsupported type"), + } +} + +fn extract_path_ident(path: &syn::Path) -> syn::Ident { + if path.leading_colon.is_some() { + panic!("unsupported leading colon in path") + } + if path.segments.len() != 1 { + panic!("unsupported path that needs name resolution") + } + match path.segments.first().expect("segment not found").arguments { + syn::PathArguments::None => {} + _ => panic!("unsupported path that has path arguments"), + } + path.segments + .first() + .expect("segment not found") + .ident + .clone() +} + +fn walk(root: &Path, files: &mut Vec<(syn::File, String)>) { + for file in root.read_dir().unwrap() { + let file = file.unwrap(); + if file.file_type().unwrap().is_dir() { + walk(&file.path(), files); + continue; + } + let path = file.path(); + if path.extension().and_then(std::ffi::OsStr::to_str) != Some("rs") { + continue; + } + + if path.file_name().and_then(std::ffi::OsStr::to_str) == Some("test.rs") { + continue; + } + + let mut contents = String::new(); + File::open(&path) + .unwrap_or_else(|_| panic!("can't open file at path: {}", path.display())) + .read_to_string(&mut contents) + .expect("failed to read file to string"); + + files.push(( + syn::parse_str::(&contents).expect("failed to parse"), + path.display().to_string(), + )); + } +} + +fn find_instrs(attrs: &[syn::Attribute]) -> Vec { + struct AssertInstr { + instr: Option, + } + + // A small custom parser to parse out the instruction in `assert_instr`. + // + // TODO: should probably just reuse `Invoc` from the `assert-instr-macro` + // crate. 
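For orientation, the attribute shape this parser targets, and the string it is expected to produce, look roughly like the following (the intrinsic shown is hypothetical); the sketch restates the extraction loop on pre-lexed tokens instead of a `syn` parse stream.

```rust
// Attribute shape scanned by `find_instrs`:
//
//     #[cfg_attr(test, assert_instr(vmax.s8))]
//
// Identifiers, string literals and `.` tokens inside `assert_instr(...)` are
// concatenated; everything from the first `,` onwards (extra options) is ignored.
enum Tok {
    Ident(&'static str),
    Str(&'static str),
    Dot,
    Comma,
}

fn extract_instr(tokens: &[Tok]) -> String {
    let mut instr = String::new();
    for t in tokens {
        match t {
            Tok::Ident(s) | Tok::Str(s) => instr.push_str(s),
            Tok::Dot => instr.push('.'),
            Tok::Comma => break, // ignore trailing assert_instr options
        }
    }
    instr
}

fn main() {
    let unquoted = [
        Tok::Ident("vmax"),
        Tok::Dot,
        Tok::Ident("s8"),
        Tok::Comma,
        Tok::Ident("ignored_option"),
    ];
    assert_eq!(extract_instr(&unquoted), "vmax.s8");
    // A quoted form like `assert_instr("vadd.i8")` arrives as one string literal.
    assert_eq!(extract_instr(&[Tok::Str("vadd.i8")]), "vadd.i8");
}
```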
+ impl syn::parse::Parse for AssertInstr { + fn parse(input: syn::parse::ParseStream<'_>) -> syn::Result { + let _ = input.parse::().unwrap(); + let _ = input.parse::().unwrap(); + + match input.parse::() { + Ok(ident) if ident == "assert_instr" => {} + _ => { + while !input.is_empty() { + // consume everything + drop(input.parse::()); + } + return Ok(Self { instr: None }); + } + } + + let instrs; + parenthesized!(instrs in input); + + let mut instr = String::new(); + while !instrs.is_empty() { + if let Ok(lit) = instrs.parse::() { + instr.push_str(&lit.value()); + } else if let Ok(ident) = instrs.call(syn::Ident::parse_any) { + instr.push_str(&ident.to_string()); + } else if instrs.parse::().is_ok() { + instr.push('.'); + } else if instrs.parse::().is_ok() { + // consume everything remaining + drop(instrs.parse::()); + break; + } else { + return Err(input.error("failed to parse instruction")); + } + } + Ok(Self { instr: Some(instr) }) + } + } + + attrs + .iter() + .filter_map(|a| { + if let syn::Meta::List(ref l) = a.meta { + if l.path.is_ident("cfg_attr") { + Some(l) + } else { + None + } + } else { + None + } + }) + .filter_map(|l| syn::parse2::(l.tokens.clone()).unwrap().instr) + .collect() +} + +fn find_target_feature(attrs: &[syn::Attribute]) -> Option { + attrs + .iter() + .flat_map(|a| { + #[allow(clippy::collapsible_if)] + if let syn::Meta::List(ref l) = a.meta { + if l.path.is_ident("target_feature") { + if let Ok(l) = + syn::punctuated::Punctuated::::parse_terminated + .parse2(l.tokens.clone()) + { + return l; + } + } + } + syn::punctuated::Punctuated::new() + }) + .find_map(|m| match m { + syn::Meta::NameValue(i) if i.path.is_ident("enable") => { + if let syn::Expr::Lit(lit) = i.value { + Some(lit.lit) + } else { + None + } + } + _ => None, + }) +} + +fn find_doc(attrs: &[syn::Attribute]) -> String { + attrs + .iter() + .filter_map(|a| { + #[allow(clippy::collapsible_if)] + if let syn::Meta::NameValue(ref l) = a.meta { + if l.path.is_ident("doc") { + if let syn::Expr::Lit(syn::ExprLit { + lit: syn::Lit::Str(ref s), + .. 
+ }) = l.value + { + return Some(s.value()); + } + } + } + None + }) + .collect() +} + +fn find_required_const(name: &str, attrs: &[syn::Attribute]) -> Vec { + attrs + .iter() + .filter_map(|a| { + if let syn::Meta::List(ref l) = a.meta { + Some(l) + } else { + None + } + }) + .flat_map(|l| { + if l.path.segments[0].ident == name { + syn::parse2::(l.tokens.clone()) + .unwrap() + .args + } else { + Vec::new() + } + }) + .collect() +} + +struct RustcArgsRequiredConst { + args: Vec, +} + +impl syn::parse::Parse for RustcArgsRequiredConst { + fn parse(input: syn::parse::ParseStream<'_>) -> syn::Result { + let list = syn::punctuated::Punctuated::::parse_terminated(input)?; + Ok(Self { + args: list + .into_iter() + .map(|a| a.base10_parse::()) + .collect::>()?, + }) + } +} diff --git a/library/stdarch/crates/stdarch-verify/tests/arm.rs b/library/stdarch/crates/stdarch-verify/tests/arm.rs new file mode 100644 index 0000000000000..a35b8175fb223 --- /dev/null +++ b/library/stdarch/crates/stdarch-verify/tests/arm.rs @@ -0,0 +1,745 @@ +#![allow(unused)] + +use std::collections::HashMap; + +use serde::Deserialize; + +struct Function { + name: &'static str, + arguments: &'static [&'static Type], + ret: Option<&'static Type>, + target_feature: Option<&'static str>, + instrs: &'static [&'static str], + file: &'static str, + required_const: &'static [usize], + has_test: bool, + doc: &'static str, +} + +static F16: Type = Type::PrimFloat(16); +static F32: Type = Type::PrimFloat(32); +static F64: Type = Type::PrimFloat(64); +static I16: Type = Type::PrimSigned(16); +static I32: Type = Type::PrimSigned(32); +static I64: Type = Type::PrimSigned(64); +static I8: Type = Type::PrimSigned(8); +static U16: Type = Type::PrimUnsigned(16); +static U32: Type = Type::PrimUnsigned(32); +static U64: Type = Type::PrimUnsigned(64); +static U8: Type = Type::PrimUnsigned(8); +static NEVER: Type = Type::Never; +static GENERICT: Type = Type::GenericParam("T"); +static GENERICU: Type = Type::GenericParam("U"); + +static F16X4: Type = Type::F(16, 4, 1); +static F16X4X2: Type = Type::F(16, 4, 2); +static F16X4X3: Type = Type::F(16, 4, 3); +static F16X4X4: Type = Type::F(16, 4, 4); +static F16X8: Type = Type::F(16, 8, 1); +static F16X8X2: Type = Type::F(16, 8, 2); +static F16X8X3: Type = Type::F(16, 8, 3); +static F16X8X4: Type = Type::F(16, 8, 4); +static F32X2: Type = Type::F(32, 2, 1); +static F32X2X2: Type = Type::F(32, 2, 2); +static F32X2X3: Type = Type::F(32, 2, 3); +static F32X2X4: Type = Type::F(32, 2, 4); +static F32X4: Type = Type::F(32, 4, 1); +static F32X4X2: Type = Type::F(32, 4, 2); +static F32X4X3: Type = Type::F(32, 4, 3); +static F32X4X4: Type = Type::F(32, 4, 4); +static F64X1: Type = Type::F(64, 1, 1); +static F64X1X2: Type = Type::F(64, 1, 2); +static F64X1X3: Type = Type::F(64, 1, 3); +static F64X1X4: Type = Type::F(64, 1, 4); +static F64X2: Type = Type::F(64, 2, 1); +static F64X2X2: Type = Type::F(64, 2, 2); +static F64X2X3: Type = Type::F(64, 2, 3); +static F64X2X4: Type = Type::F(64, 2, 4); +static I16X2: Type = Type::I(16, 2, 1); +static I16X4: Type = Type::I(16, 4, 1); +static I16X4X2: Type = Type::I(16, 4, 2); +static I16X4X3: Type = Type::I(16, 4, 3); +static I16X4X4: Type = Type::I(16, 4, 4); +static I16X8: Type = Type::I(16, 8, 1); +static I16X8X2: Type = Type::I(16, 8, 2); +static I16X8X3: Type = Type::I(16, 8, 3); +static I16X8X4: Type = Type::I(16, 8, 4); +static I32X2: Type = Type::I(32, 2, 1); +static I32X2X2: Type = Type::I(32, 2, 2); +static I32X2X3: Type = Type::I(32, 2, 3); +static I32X2X4: 
Type = Type::I(32, 2, 4); +static I32X4: Type = Type::I(32, 4, 1); +static I32X4X2: Type = Type::I(32, 4, 2); +static I32X4X3: Type = Type::I(32, 4, 3); +static I32X4X4: Type = Type::I(32, 4, 4); +static I64X1: Type = Type::I(64, 1, 1); +static I64X1X2: Type = Type::I(64, 1, 2); +static I64X1X3: Type = Type::I(64, 1, 3); +static I64X1X4: Type = Type::I(64, 1, 4); +static I64X2: Type = Type::I(64, 2, 1); +static I64X2X2: Type = Type::I(64, 2, 2); +static I64X2X3: Type = Type::I(64, 2, 3); +static I64X2X4: Type = Type::I(64, 2, 4); +static I8X16: Type = Type::I(8, 16, 1); +static I8X16X2: Type = Type::I(8, 16, 2); +static I8X16X3: Type = Type::I(8, 16, 3); +static I8X16X4: Type = Type::I(8, 16, 4); +static I8X4: Type = Type::I(8, 4, 1); +static I8X8: Type = Type::I(8, 8, 1); +static I8X8X2: Type = Type::I(8, 8, 2); +static I8X8X3: Type = Type::I(8, 8, 3); +static I8X8X4: Type = Type::I(8, 8, 4); +static P128: Type = Type::PrimPoly(128); +static P16: Type = Type::PrimPoly(16); +static P16X4X2: Type = Type::P(16, 4, 2); +static P16X4X3: Type = Type::P(16, 4, 3); +static P16X4X4: Type = Type::P(16, 4, 4); +static P16X8X2: Type = Type::P(16, 8, 2); +static P16X8X3: Type = Type::P(16, 8, 3); +static P16X8X4: Type = Type::P(16, 8, 4); +static P64: Type = Type::PrimPoly(64); +static P64X1X2: Type = Type::P(64, 1, 2); +static P64X1X3: Type = Type::P(64, 1, 3); +static P64X1X4: Type = Type::P(64, 1, 4); +static P64X2X2: Type = Type::P(64, 2, 2); +static P64X2X3: Type = Type::P(64, 2, 3); +static P64X2X4: Type = Type::P(64, 2, 4); +static P8: Type = Type::PrimPoly(8); +static POLY16X4: Type = Type::P(16, 4, 1); +static POLY16X8: Type = Type::P(16, 8, 1); +static POLY64X1: Type = Type::P(64, 1, 1); +static POLY64X2: Type = Type::P(64, 2, 1); +static POLY8X16: Type = Type::P(8, 16, 1); +static POLY8X16X2: Type = Type::P(8, 16, 2); +static POLY8X16X3: Type = Type::P(8, 16, 3); +static POLY8X16X4: Type = Type::P(8, 16, 4); +static POLY8X8: Type = Type::P(8, 8, 1); +static POLY8X8X2: Type = Type::P(8, 8, 2); +static POLY8X8X3: Type = Type::P(8, 8, 3); +static POLY8X8X4: Type = Type::P(8, 8, 4); +static U16X4: Type = Type::U(16, 4, 1); +static U16X4X2: Type = Type::U(16, 4, 2); +static U16X4X3: Type = Type::U(16, 4, 3); +static U16X4X4: Type = Type::U(16, 4, 4); +static U16X8: Type = Type::U(16, 8, 1); +static U16X8X2: Type = Type::U(16, 8, 2); +static U16X8X3: Type = Type::U(16, 8, 3); +static U16X8X4: Type = Type::U(16, 8, 4); +static U32X2: Type = Type::U(32, 2, 1); +static U32X2X2: Type = Type::U(32, 2, 2); +static U32X2X3: Type = Type::U(32, 2, 3); +static U32X2X4: Type = Type::U(32, 2, 4); +static U32X4: Type = Type::U(32, 4, 1); +static U32X4X2: Type = Type::U(32, 4, 2); +static U32X4X3: Type = Type::U(32, 4, 3); +static U32X4X4: Type = Type::U(32, 4, 4); +static U64X1: Type = Type::U(64, 1, 1); +static U64X1X2: Type = Type::U(64, 1, 2); +static U64X1X3: Type = Type::U(64, 1, 3); +static U64X1X4: Type = Type::U(64, 1, 4); +static U64X2: Type = Type::U(64, 2, 1); +static U64X2X2: Type = Type::U(64, 2, 2); +static U64X2X3: Type = Type::U(64, 2, 3); +static U64X2X4: Type = Type::U(64, 2, 4); +static U8X16: Type = Type::U(8, 16, 1); +static U8X16X2: Type = Type::U(8, 16, 2); +static U8X16X3: Type = Type::U(8, 16, 3); +static U8X16X4: Type = Type::U(8, 16, 4); +static U8X8: Type = Type::U(8, 8, 1); +static U8X4: Type = Type::U(8, 4, 1); +static U8X8X2: Type = Type::U(8, 8, 2); +static U8X8X3: Type = Type::U(8, 8, 3); +static U8X8X4: Type = Type::U(8, 8, 4); + +#[derive(Debug, Copy, Clone, PartialEq)] 
+enum Type { + PrimFloat(u8), + PrimSigned(u8), + PrimUnsigned(u8), + PrimPoly(u8), + MutPtr(&'static Type), + ConstPtr(&'static Type), + GenericParam(&'static str), + I(u8, u8, u8), + U(u8, u8, u8), + P(u8, u8, u8), + F(u8, u8, u8), + Never, +} + +stdarch_verify::arm_functions!(static FUNCTIONS); + +macro_rules! bail { + ($($t:tt)*) => (return Err(format!($($t)*))) +} + +#[test] +fn verify_all_signatures() { + // Reference: https://developer.arm.com/architectures/instruction-sets/intrinsics + let json = include_bytes!("../../../intrinsics_data/arm_intrinsics.json"); + let intrinsics: Vec = serde_json::from_slice(json).unwrap(); + let map = parse_intrinsics(intrinsics); + + let mut all_valid = true; + for rust in FUNCTIONS { + if !rust.has_test { + let skip = [ + "vaddq_s64", + "vaddq_u64", + "vrsqrte_f32", + "vtbl1_s8", + "vtbl1_u8", + "vtbl1_p8", + "vtbl2_s8", + "vtbl2_u8", + "vtbl2_p8", + "vtbl3_s8", + "vtbl3_u8", + "vtbl3_p8", + "vtbl4_s8", + "vtbl4_u8", + "vtbl4_p8", + "vtbx1_s8", + "vtbx1_u8", + "vtbx1_p8", + "vtbx2_s8", + "vtbx2_u8", + "vtbx2_p8", + "vtbx3_s8", + "vtbx3_u8", + "vtbx3_p8", + "vtbx4_s8", + "vtbx4_u8", + "vtbx4_p8", + "udf", + "_clz_u8", + "_clz_u16", + "_clz_u32", + "_rbit_u32", + "_rev_u16", + "_rev_u32", + "__breakpoint", + "vpminq_f32", + "vpminq_f64", + "vpmaxq_f32", + "vpmaxq_f64", + "vcombine_s8", + "vcombine_s16", + "vcombine_s32", + "vcombine_s64", + "vcombine_u8", + "vcombine_u16", + "vcombine_u32", + "vcombine_u64", + "vcombine_p64", + "vcombine_f32", + "vcombine_p8", + "vcombine_p16", + "vcombine_f64", + "vtbl1_s8", + "vtbl1_u8", + "vtbl1_p8", + "vtbl2_s8", + "vtbl2_u8", + "vtbl2_p8", + "vtbl3_s8", + "vtbl3_u8", + "vtbl3_p8", + "vtbl4_s8", + "vtbl4_u8", + "vtbl4_p8", + "vtbx1_s8", + "vtbx1_u8", + "vtbx1_p8", + "vtbx2_s8", + "vtbx2_u8", + "vtbx2_p8", + "vtbx3_s8", + "vtbx3_u8", + "vtbx3_p8", + "vtbx4_s8", + "vtbx4_u8", + "vtbx4_p8", + "vqtbl1_s8", + "vqtbl1q_s8", + "vqtbl1_u8", + "vqtbl1q_u8", + "vqtbl1_p8", + "vqtbl1q_p8", + "vqtbx1_s8", + "vqtbx1q_s8", + "vqtbx1_u8", + "vqtbx1q_u8", + "vqtbx1_p8", + "vqtbx1q_p8", + "vqtbl2_s8", + "vqtbl2q_s8", + "vqtbl2_u8", + "vqtbl2q_u8", + "vqtbl2_p8", + "vqtbl2q_p8", + "vqtbx2_s8", + "vqtbx2q_s8", + "vqtbx2_u8", + "vqtbx2q_u8", + "vqtbx2_p8", + "vqtbx2q_p8", + "vqtbl3_s8", + "vqtbl3q_s8", + "vqtbl3_u8", + "vqtbl3q_u8", + "vqtbl3_p8", + "vqtbl3q_p8", + "vqtbx3_s8", + "vqtbx3q_s8", + "vqtbx3_u8", + "vqtbx3q_u8", + "vqtbx3_p8", + "vqtbx3q_p8", + "vqtbl4_s8", + "vqtbl4q_s8", + "vqtbl4_u8", + "vqtbl4q_u8", + "vqtbl4_p8", + "vqtbl4q_p8", + "vqtbx4_s8", + "vqtbx4q_s8", + "vqtbx4_u8", + "vqtbx4q_u8", + "vqtbx4_p8", + "vqtbx4q_p8", + "brk", + "_rev_u64", + "_clz_u64", + "_rbit_u64", + "_cls_u32", + "_cls_u64", + "_prefetch", + "vsli_n_s8", + "vsliq_n_s8", + "vsli_n_s16", + "vsliq_n_s16", + "vsli_n_s32", + "vsliq_n_s32", + "vsli_n_s64", + "vsliq_n_s64", + "vsli_n_u8", + "vsliq_n_u8", + "vsli_n_u16", + "vsliq_n_u16", + "vsli_n_u32", + "vsliq_n_u32", + "vsli_n_u64", + "vsliq_n_u64", + "vsli_n_p8", + "vsliq_n_p8", + "vsli_n_p16", + "vsliq_n_p16", + "vsli_n_p64", + "vsliq_n_p64", + "vsri_n_s8", + "vsriq_n_s8", + "vsri_n_s16", + "vsriq_n_s16", + "vsri_n_s32", + "vsriq_n_s32", + "vsri_n_s64", + "vsriq_n_s64", + "vsri_n_u8", + "vsriq_n_u8", + "vsri_n_u16", + "vsriq_n_u16", + "vsri_n_u32", + "vsriq_n_u32", + "vsri_n_u64", + "vsriq_n_u64", + "vsri_n_p8", + "vsriq_n_p8", + "vsri_n_p16", + "vsriq_n_p16", + "vsri_n_p64", + "vsriq_n_p64", + "__smulbb", + "__smultb", + "__smulbt", + "__smultt", + "__smulwb", + "__smulwt", + "__qadd", + 
"__qsub", + "__qdbl", + "__smlabb", + "__smlabt", + "__smlatb", + "__smlatt", + "__smlawb", + "__smlawt", + "__qadd8", + "__qsub8", + "__qsub16", + "__qadd16", + "__qasx", + "__qsax", + "__sadd16", + "__sadd8", + "__smlad", + "__smlsd", + "__sasx", + "__sel", + "__shadd8", + "__shadd16", + "__shsub8", + "__usub8", + "__ssub8", + "__shsub16", + "__smuad", + "__smuadx", + "__smusd", + "__smusdx", + "__usad8", + "__usada8", + "__ldrex", + "__strex", + "__ldrexb", + "__strexb", + "__ldrexh", + "__strexh", + "__clrex", + "__dbg", + ]; + } + + // Skip some intrinsics that aren't NEON and are located in different + // places than the whitelists below. + match rust.name { + "brk" | "__breakpoint" | "udf" | "_prefetch" => continue, + _ => {} + } + // Skip some intrinsics that are present in GCC and Clang but + // are missing from the official documentation. + let skip_intrinsic_verify = [ + "vmov_n_p64", + "vmovq_n_p64", + "vreinterpret_p64_s64", + "vreinterpret_f32_p64", + "vreinterpretq_f32_p64", + "vreinterpretq_p64_p128", + "vreinterpretq_p128_p64", + "vreinterpretq_f32_p128", + "vtst_p16", + "vtstq_p16", + "__dbg", + ]; + let arm = match map.get(rust.name) { + Some(i) => i, + None => { + // Skip all these intrinsics as they're not listed in NEON + // descriptions online. + // + // TODO: we still need to verify these intrinsics or find a + // reference for them, need to figure out where though! + if !rust.file.ends_with("dsp.rs\"") + && !rust.file.ends_with("sat.rs\"") + && !rust.file.ends_with("simd32.rs\"") + && !rust.file.ends_with("v6.rs\"") + && !rust.file.ends_with("v7.rs\"") + && !rust.file.ends_with("v8.rs\"") + && !rust.file.ends_with("tme.rs\"") + && !rust.file.ends_with("mte.rs\"") + && !rust.file.ends_with("ex.rs\"") + && !skip_intrinsic_verify.contains(&rust.name) + { + println!( + "missing arm definition for {:?} in {}", + rust.name, rust.file + ); + all_valid = false; + } + continue; + } + }; + + if let Err(e) = matches(rust, arm) { + println!("failed to verify `{}`", rust.name); + println!(" * {e}"); + all_valid = false; + } + } + assert!(all_valid); +} + +fn matches(rust: &Function, arm: &Intrinsic) -> Result<(), String> { + if rust.ret != arm.ret.as_ref() { + bail!("mismatched return value") + } + if rust.arguments.len() != arm.arguments.len() { + bail!("mismatched argument lengths"); + } + + let mut nconst = 0; + let iter = rust.arguments.iter().zip(&arm.arguments).enumerate(); + for (i, (rust_ty, (arm, arm_const))) in iter { + if *rust_ty != arm { + bail!("mismatched arguments: {rust_ty:?} != {arm:?}") + } + if *arm_const { + nconst += 1; + if !rust.required_const.contains(&i) { + bail!("argument const mismatch"); + } + } + } + if nconst != rust.required_const.len() { + bail!("wrong number of const arguments"); + } + + if rust.instrs.is_empty() { + bail!( + "instruction not listed for `{}`, but arm lists {:?}", + rust.name, + arm.instruction + ); + } else if false + // TODO: This instruction checking logic needs work to handle multiple instructions and to only + // look at aarch64 insructions. + // The ACLE's listed instructions are a guideline only and compilers have the freedom to use + // different instructions in dfferent cases which makes this an unreliable testing method. It + // is of questionable value given the intrinsic test tool. 
+ { + for instr in rust.instrs { + if arm.instruction.starts_with(instr) { + continue; + } + // sometimes arm says `foo` and disassemblers say `vfoo`, or + // sometimes disassemblers say `vfoo` and arm says `sfoo` or `ffoo` + if instr.starts_with('v') + && (arm.instruction.starts_with(&instr[1..]) + || arm.instruction[1..].starts_with(&instr[1..])) + { + continue; + } + bail!( + "arm failed to list `{}` as an instruction for `{}` in {:?}", + instr, + rust.name, + arm.instruction, + ); + } + } + + // TODO: verify `target_feature`. + + Ok(()) +} + +#[derive(PartialEq)] +struct Intrinsic { + name: String, + ret: Option, + arguments: Vec<(Type, bool)>, + instruction: String, +} + +// These structures are similar to those in json_parser.rs in intrinsics-test +#[derive(Deserialize, Debug)] +struct JsonIntrinsic { + name: String, + arguments: Vec, + return_type: ReturnType, + #[serde(default)] + instructions: Vec>, +} + +#[derive(Deserialize, Debug)] +struct ReturnType { + value: String, +} + +fn parse_intrinsics(intrinsics: Vec) -> HashMap { + let mut ret = HashMap::new(); + for intr in intrinsics.into_iter() { + let f = parse_intrinsic(intr); + ret.insert(f.name.clone(), f); + } + ret +} + +fn parse_intrinsic(mut intr: JsonIntrinsic) -> Intrinsic { + let name = intr.name; + let ret = if intr.return_type.value == "void" { + None + } else { + Some(parse_ty(&intr.return_type.value)) + }; + + // This ignores multiple instructions and different optional sequences for now to mimic + // the old HTML scraping behaviour + let instruction = intr.instructions.swap_remove(0).swap_remove(0); + + let arguments = intr + .arguments + .iter() + .map(|s| { + let (ty, konst) = match s.strip_prefix("const") { + Some(stripped) => (stripped.trim_start(), true), + None => (s.as_str(), false), + }; + let ty = ty.rsplit_once(' ').unwrap().0; + (parse_ty(ty), konst) + }) + .collect::>(); + + Intrinsic { + name, + ret, + instruction, + arguments, + } +} + +fn parse_ty(s: &str) -> Type { + let suffix = " const *"; + if let Some(base) = s.strip_suffix(suffix) { + Type::ConstPtr(parse_ty_base(base)) + } else if let Some(base) = s.strip_suffix(" *") { + Type::MutPtr(parse_ty_base(base)) + } else { + *parse_ty_base(s) + } +} + +fn parse_ty_base(s: &str) -> &'static Type { + match s { + "float16_t" => &F16, + "float16x4_t" => &F16X4, + "float16x4x2_t" => &F16X4X2, + "float16x4x3_t" => &F16X4X3, + "float16x4x4_t" => &F16X4X4, + "float16x8_t" => &F16X8, + "float16x8x2_t" => &F16X8X2, + "float16x8x3_t" => &F16X8X3, + "float16x8x4_t" => &F16X8X4, + "float32_t" => &F32, + "float32x2_t" => &F32X2, + "float32x2x2_t" => &F32X2X2, + "float32x2x3_t" => &F32X2X3, + "float32x2x4_t" => &F32X2X4, + "float32x4_t" => &F32X4, + "float32x4x2_t" => &F32X4X2, + "float32x4x3_t" => &F32X4X3, + "float32x4x4_t" => &F32X4X4, + "float64_t" => &F64, + "float64x1_t" => &F64X1, + "float64x1x2_t" => &F64X1X2, + "float64x1x3_t" => &F64X1X3, + "float64x1x4_t" => &F64X1X4, + "float64x2_t" => &F64X2, + "float64x2x2_t" => &F64X2X2, + "float64x2x3_t" => &F64X2X3, + "float64x2x4_t" => &F64X2X4, + "int16_t" => &I16, + "int16x2_t" => &I16X2, + "int16x4_t" => &I16X4, + "int16x4x2_t" => &I16X4X2, + "int16x4x3_t" => &I16X4X3, + "int16x4x4_t" => &I16X4X4, + "int16x8_t" => &I16X8, + "int16x8x2_t" => &I16X8X2, + "int16x8x3_t" => &I16X8X3, + "int16x8x4_t" => &I16X8X4, + "int32_t" | "int" => &I32, + "int32x2_t" => &I32X2, + "int32x2x2_t" => &I32X2X2, + "int32x2x3_t" => &I32X2X3, + "int32x2x4_t" => &I32X2X4, + "int32x4_t" => &I32X4, + "int32x4x2_t" => &I32X4X2, + 
"int32x4x3_t" => &I32X4X3, + "int32x4x4_t" => &I32X4X4, + "int64_t" => &I64, + "int64x1_t" => &I64X1, + "int64x1x2_t" => &I64X1X2, + "int64x1x3_t" => &I64X1X3, + "int64x1x4_t" => &I64X1X4, + "int64x2_t" => &I64X2, + "int64x2x2_t" => &I64X2X2, + "int64x2x3_t" => &I64X2X3, + "int64x2x4_t" => &I64X2X4, + "int8_t" => &I8, + "int8x16_t" => &I8X16, + "int8x16x2_t" => &I8X16X2, + "int8x16x3_t" => &I8X16X3, + "int8x16x4_t" => &I8X16X4, + "int8x4_t" => &I8X4, + "int8x8_t" => &I8X8, + "int8x8x2_t" => &I8X8X2, + "int8x8x3_t" => &I8X8X3, + "int8x8x4_t" => &I8X8X4, + "poly128_t" => &P128, + "poly16_t" => &P16, + "poly16x4_t" => &POLY16X4, + "poly16x4x2_t" => &P16X4X2, + "poly16x4x3_t" => &P16X4X3, + "poly16x4x4_t" => &P16X4X4, + "poly16x8_t" => &POLY16X8, + "poly16x8x2_t" => &P16X8X2, + "poly16x8x3_t" => &P16X8X3, + "poly16x8x4_t" => &P16X8X4, + "poly64_t" => &P64, + "poly64x1_t" => &POLY64X1, + "poly64x1x2_t" => &P64X1X2, + "poly64x1x3_t" => &P64X1X3, + "poly64x1x4_t" => &P64X1X4, + "poly64x2_t" => &POLY64X2, + "poly64x2x2_t" => &P64X2X2, + "poly64x2x3_t" => &P64X2X3, + "poly64x2x4_t" => &P64X2X4, + "poly8_t" => &P8, + "poly8x16_t" => &POLY8X16, + "poly8x16x2_t" => &POLY8X16X2, + "poly8x16x3_t" => &POLY8X16X3, + "poly8x16x4_t" => &POLY8X16X4, + "poly8x8_t" => &POLY8X8, + "poly8x8x2_t" => &POLY8X8X2, + "poly8x8x3_t" => &POLY8X8X3, + "poly8x8x4_t" => &POLY8X8X4, + "uint16_t" => &U16, + "uint16x4_t" => &U16X4, + "uint16x4x2_t" => &U16X4X2, + "uint16x4x3_t" => &U16X4X3, + "uint16x4x4_t" => &U16X4X4, + "uint16x8_t" => &U16X8, + "uint16x8x2_t" => &U16X8X2, + "uint16x8x3_t" => &U16X8X3, + "uint16x8x4_t" => &U16X8X4, + "uint32_t" => &U32, + "uint32x2_t" => &U32X2, + "uint32x2x2_t" => &U32X2X2, + "uint32x2x3_t" => &U32X2X3, + "uint32x2x4_t" => &U32X2X4, + "uint32x4_t" => &U32X4, + "uint32x4x2_t" => &U32X4X2, + "uint32x4x3_t" => &U32X4X3, + "uint32x4x4_t" => &U32X4X4, + "uint64_t" => &U64, + "uint64x1_t" => &U64X1, + "uint64x1x2_t" => &U64X1X2, + "uint64x1x3_t" => &U64X1X3, + "uint64x1x4_t" => &U64X1X4, + "uint64x2_t" => &U64X2, + "uint64x2x2_t" => &U64X2X2, + "uint64x2x3_t" => &U64X2X3, + "uint64x2x4_t" => &U64X2X4, + "uint8_t" => &U8, + "uint8x16_t" => &U8X16, + "uint8x16x2_t" => &U8X16X2, + "uint8x16x3_t" => &U8X16X3, + "uint8x16x4_t" => &U8X16X4, + "uint8x8_t" => &U8X8, + "uint8x8x2_t" => &U8X8X2, + "uint8x8x3_t" => &U8X8X3, + "uint8x8x4_t" => &U8X8X4, + + _ => panic!("failed to parse json type {s:?}"), + } +} diff --git a/library/stdarch/crates/stdarch-verify/tests/mips.rs b/library/stdarch/crates/stdarch-verify/tests/mips.rs new file mode 100644 index 0000000000000..ba639c3f92f76 --- /dev/null +++ b/library/stdarch/crates/stdarch-verify/tests/mips.rs @@ -0,0 +1,367 @@ +//! 
Verification of MIPS MSA intrinsics +#![allow(unused, non_upper_case_globals, clippy::single_match)] + +// This file is obtained from +// https://gcc.gnu.org/onlinedocs//gcc/MIPS-SIMD-Architecture-Built-in-Functions.html +static HEADER: &str = include_str!("../mips-msa.h"); + +stdarch_verify::mips_functions!(static FUNCTIONS); + +struct Function { + name: &'static str, + arguments: &'static [&'static Type], + ret: Option<&'static Type>, + target_feature: Option<&'static str>, + instrs: &'static [&'static str], + file: &'static str, + required_const: &'static [usize], + has_test: bool, + doc: &'static str, +} + +static F16: Type = Type::PrimFloat(16); +static F32: Type = Type::PrimFloat(32); +static F64: Type = Type::PrimFloat(64); +static I8: Type = Type::PrimSigned(8); +static I16: Type = Type::PrimSigned(16); +static I32: Type = Type::PrimSigned(32); +static I64: Type = Type::PrimSigned(64); +static U8: Type = Type::PrimUnsigned(8); +static U16: Type = Type::PrimUnsigned(16); +static U32: Type = Type::PrimUnsigned(32); +static U64: Type = Type::PrimUnsigned(64); +static NEVER: Type = Type::Never; +static TUPLE: Type = Type::Tuple; +static v16i8: Type = Type::I(8, 16, 1); +static v8i16: Type = Type::I(16, 8, 1); +static v4i32: Type = Type::I(32, 4, 1); +static v2i64: Type = Type::I(64, 2, 1); +static v16u8: Type = Type::U(8, 16, 1); +static v8u16: Type = Type::U(16, 8, 1); +static v4u32: Type = Type::U(32, 4, 1); +static v2u64: Type = Type::U(64, 2, 1); +static v8f16: Type = Type::F(16, 8, 1); +static v4f32: Type = Type::F(32, 4, 1); +static v2f64: Type = Type::F(64, 2, 1); + +#[derive(Debug, Copy, Clone, PartialEq)] +enum Type { + PrimFloat(u8), + PrimSigned(u8), + PrimUnsigned(u8), + PrimPoly(u8), + MutPtr(&'static Type), + ConstPtr(&'static Type), + Tuple, + I(u8, u8, u8), + U(u8, u8, u8), + P(u8, u8, u8), + F(u8, u8, u8), + Never, +} + +#[derive(Copy, Clone, Debug, PartialEq)] +#[allow(non_camel_case_types)] +enum MsaTy { + v16i8, + v8i16, + v4i32, + v2i64, + v16u8, + v8u16, + v4u32, + v2u64, + v8f16, + v4f32, + v2f64, + imm0_1, + imm0_3, + imm0_7, + imm0_15, + imm0_31, + imm0_63, + imm0_255, + imm_n16_15, + imm_n512_511, + imm_n1024_1022, + imm_n2048_2044, + imm_n4096_4088, + i32, + u32, + i64, + u64, + Void, + MutVoidPtr, +} + +impl<'a> From<&'a str> for MsaTy { + fn from(s: &'a str) -> MsaTy { + match s { + "v16i8" => MsaTy::v16i8, + "v8i16" => MsaTy::v8i16, + "v4i32" => MsaTy::v4i32, + "v2i64" => MsaTy::v2i64, + "v16u8" => MsaTy::v16u8, + "v8u16" => MsaTy::v8u16, + "v4u32" => MsaTy::v4u32, + "v2u64" => MsaTy::v2u64, + "v8f16" => MsaTy::v8f16, + "v4f32" => MsaTy::v4f32, + "v2f64" => MsaTy::v2f64, + "imm0_1" => MsaTy::imm0_1, + "imm0_3" => MsaTy::imm0_3, + "imm0_7" => MsaTy::imm0_7, + "imm0_15" => MsaTy::imm0_15, + "imm0_31" => MsaTy::imm0_31, + "imm0_63" => MsaTy::imm0_63, + "imm0_255" => MsaTy::imm0_255, + "imm_n16_15" => MsaTy::imm_n16_15, + "imm_n512_511" => MsaTy::imm_n512_511, + "imm_n1024_1022" => MsaTy::imm_n1024_1022, + "imm_n2048_2044" => MsaTy::imm_n2048_2044, + "imm_n4096_4088" => MsaTy::imm_n4096_4088, + "i32" => MsaTy::i32, + "u32" => MsaTy::u32, + "i64" => MsaTy::i64, + "u64" => MsaTy::u64, + "void" => MsaTy::Void, + "void *" => MsaTy::MutVoidPtr, + v => panic!("unknown ty: \"{v}\""), + } + } +} + +#[derive(Debug, Clone)] +struct MsaIntrinsic { + id: String, + arg_tys: Vec, + ret_ty: MsaTy, + instruction: String, +} + +struct NoneError; + +impl std::convert::TryFrom<&'static str> for MsaIntrinsic { + // The intrinsics are just C function declarations of the form: + // 
$ret_ty __builtin_${fn_id}($($arg_ty),*); + type Error = NoneError; + fn try_from(line: &'static str) -> Result { + return inner(line).ok_or(NoneError); + + fn inner(line: &'static str) -> Option { + let first_whitespace = line.find(char::is_whitespace)?; + let ret_ty = &line[0..first_whitespace]; + let ret_ty = MsaTy::from(ret_ty); + + let first_parentheses = line.find('(')?; + assert!(first_parentheses > first_whitespace); + let id = &line[first_whitespace + 1..first_parentheses].trim(); + assert!(id.starts_with("__builtin")); + let mut id_str = "_".to_string(); + id_str += &id[9..]; + let id = id_str; + + let mut arg_tys = Vec::new(); + + let last_parentheses = line.find(')')?; + for arg in line[first_parentheses + 1..last_parentheses].split(',') { + let arg = arg.trim(); + arg_tys.push(MsaTy::from(arg)); + } + + // The instruction is the intrinsic name without the __msa_ prefix. + let instruction = &id[6..]; + let mut instruction = instruction.to_string(); + // With all underscores but the first one replaced with a `.` + if let Some(first_underscore) = instruction.find('_') { + let postfix = instruction[first_underscore + 1..].replace('_', "."); + instruction = instruction[0..=first_underscore].to_string(); + instruction += &postfix; + } + + Some(MsaIntrinsic { + id, + ret_ty, + arg_tys, + instruction, + }) + } + } +} + +#[test] +fn verify_all_signatures() { + // Parse the C intrinsic header file: + let mut intrinsics = std::collections::HashMap::::new(); + for line in HEADER.lines() { + if line.is_empty() { + continue; + } + + use std::convert::TryFrom; + let intrinsic: MsaIntrinsic = + TryFrom::try_from(line).unwrap_or_else(|_| panic!("failed to parse line: \"{line}\"")); + assert!(!intrinsics.contains_key(&intrinsic.id)); + intrinsics.insert(intrinsic.id.clone(), intrinsic); + } + + let mut all_valid = true; + for rust in FUNCTIONS { + if !rust.has_test { + let skip = [ + "__msa_ceqi_d", + "__msa_cfcmsa", + "__msa_clei_s_d", + "__msa_clti_s_d", + "__msa_ctcmsa", + "__msa_ldi_d", + "__msa_maxi_s_d", + "__msa_mini_s_d", + "break_", + ]; + if !skip.contains(&rust.name) { + println!( + "missing run-time test named `test_{}` for `{}`", + { + let mut id = rust.name; + while id.starts_with('_') { + id = &id[1..]; + } + id + }, + rust.name + ); + all_valid = false; + } + } + + // Skip some intrinsics that aren't part of MSA + match rust.name { + "break_" => continue, + _ => {} + } + let mips = match intrinsics.get(rust.name) { + Some(i) => i, + None => { + eprintln!( + "missing mips definition for {:?} in {}", + rust.name, rust.file + ); + all_valid = false; + continue; + } + }; + + if let Err(e) = matches(rust, mips) { + println!("failed to verify `{}`", rust.name); + println!(" * {e}"); + all_valid = false; + } + } + assert!(all_valid); +} + +fn matches(rust: &Function, mips: &MsaIntrinsic) -> Result<(), String> { + macro_rules! 
bail { + ($($t:tt)*) => (return Err(format!($($t)*))) + } + + if rust.ret.is_none() && mips.ret_ty != MsaTy::Void { + bail!("mismatched return value") + } + + if rust.arguments.len() != mips.arg_tys.len() { + bail!("mismatched argument lengths"); + } + + let mut nconst = 0; + for (i, (rust_arg, mips_arg)) in rust.arguments.iter().zip(mips.arg_tys.iter()).enumerate() { + match mips_arg { + MsaTy::v16i8 if **rust_arg == v16i8 => (), + MsaTy::v8i16 if **rust_arg == v8i16 => (), + MsaTy::v4i32 if **rust_arg == v4i32 => (), + MsaTy::v2i64 if **rust_arg == v2i64 => (), + MsaTy::v16u8 if **rust_arg == v16u8 => (), + MsaTy::v8u16 if **rust_arg == v8u16 => (), + MsaTy::v4u32 if **rust_arg == v4u32 => (), + MsaTy::v2u64 if **rust_arg == v2u64 => (), + MsaTy::v4f32 if **rust_arg == v4f32 => (), + MsaTy::v2f64 if **rust_arg == v2f64 => (), + MsaTy::imm0_1 + | MsaTy::imm0_3 + | MsaTy::imm0_7 + | MsaTy::imm0_15 + | MsaTy::imm0_31 + | MsaTy::imm0_63 + | MsaTy::imm0_255 + | MsaTy::imm_n16_15 + | MsaTy::imm_n512_511 + | MsaTy::imm_n1024_1022 + | MsaTy::imm_n2048_2044 + | MsaTy::imm_n4096_4088 + if **rust_arg == I32 => {} + MsaTy::i32 if **rust_arg == I32 => (), + MsaTy::i64 if **rust_arg == I64 => (), + MsaTy::u32 if **rust_arg == U32 => (), + MsaTy::u64 if **rust_arg == U64 => (), + MsaTy::MutVoidPtr if **rust_arg == Type::MutPtr(&U8) => (), + m => bail!( + "mismatched argument \"{}\"= \"{:?}\" != \"{:?}\"", + i, + m, + *rust_arg + ), + } + + let is_const = matches!( + mips_arg, + MsaTy::imm0_1 + | MsaTy::imm0_3 + | MsaTy::imm0_7 + | MsaTy::imm0_15 + | MsaTy::imm0_31 + | MsaTy::imm0_63 + | MsaTy::imm0_255 + | MsaTy::imm_n16_15 + | MsaTy::imm_n512_511 + | MsaTy::imm_n1024_1022 + | MsaTy::imm_n2048_2044 + | MsaTy::imm_n4096_4088 + ); + if is_const { + nconst += 1; + if !rust.required_const.contains(&i) { + bail!("argument const mismatch"); + } + } + } + + if nconst != rust.required_const.len() { + bail!("wrong number of const arguments"); + } + + if rust.target_feature != Some("msa") { + bail!("wrong target_feature"); + } + + if !rust.instrs.is_empty() { + // Normalize slightly to get rid of assembler differences + let actual = rust.instrs[0].replace('.', "_"); + let expected = mips.instruction.replace('.', "_"); + if actual != expected { + bail!( + "wrong instruction: \"{}\" != \"{}\"", + rust.instrs[0], + mips.instruction + ); + } + } else { + bail!( + "missing assert_instr for \"{}\" (should be \"{}\")", + mips.id, + mips.instruction + ); + } + + Ok(()) +} diff --git a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs new file mode 100644 index 0000000000000..02b6bdc76840e --- /dev/null +++ b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs @@ -0,0 +1,884 @@ +#![allow(unused, non_camel_case_types)] + +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::fs::File; +use std::io; +use std::io::{BufWriter, Write}; + +use serde::Deserialize; + +const PRINT_INSTRUCTION_VIOLATIONS: bool = false; +const GENERATE_MISSING_X86_MD: bool = false; +const SS: u8 = (8 * size_of::()) as u8; + +struct Function { + name: &'static str, + arguments: &'static [&'static Type], + ret: Option<&'static Type>, + target_feature: Option<&'static str>, + instrs: &'static [&'static str], + file: &'static str, + required_const: &'static [usize], + has_test: bool, + doc: &'static str, +} + +static BF16: Type = Type::BFloat16; +static F16: Type = Type::PrimFloat(16); +static F32: Type = Type::PrimFloat(32); +static F64: Type = Type::PrimFloat(64); +static 
I8: Type = Type::PrimSigned(8); +static I16: Type = Type::PrimSigned(16); +static I32: Type = Type::PrimSigned(32); +static I64: Type = Type::PrimSigned(64); +static U8: Type = Type::PrimUnsigned(8); +static U16: Type = Type::PrimUnsigned(16); +static U32: Type = Type::PrimUnsigned(32); +static U64: Type = Type::PrimUnsigned(64); +static U128: Type = Type::PrimUnsigned(128); +static USIZE: Type = Type::PrimUnsigned(SS); +static ORDERING: Type = Type::Ordering; + +static M128: Type = Type::M128; +static M128BH: Type = Type::M128BH; +static M128I: Type = Type::M128I; +static M128D: Type = Type::M128D; +static M128H: Type = Type::M128H; +static M256: Type = Type::M256; +static M256BH: Type = Type::M256BH; +static M256I: Type = Type::M256I; +static M256D: Type = Type::M256D; +static M256H: Type = Type::M256H; +static M512: Type = Type::M512; +static M512BH: Type = Type::M512BH; +static M512I: Type = Type::M512I; +static M512D: Type = Type::M512D; +static M512H: Type = Type::M512H; +static MMASK8: Type = Type::MMASK8; +static MMASK16: Type = Type::MMASK16; +static MMASK32: Type = Type::MMASK32; +static MMASK64: Type = Type::MMASK64; +static MM_CMPINT_ENUM: Type = Type::MM_CMPINT_ENUM; +static MM_MANTISSA_NORM_ENUM: Type = Type::MM_MANTISSA_NORM_ENUM; +static MM_MANTISSA_SIGN_ENUM: Type = Type::MM_MANTISSA_SIGN_ENUM; +static MM_PERM_ENUM: Type = Type::MM_PERM_ENUM; + +static TUPLE: Type = Type::Tuple; +static CPUID: Type = Type::CpuidResult; +static NEVER: Type = Type::Never; + +#[derive(Debug, PartialEq, Copy, Clone)] +enum Type { + PrimFloat(u8), + PrimSigned(u8), + PrimUnsigned(u8), + BFloat16, + MutPtr(&'static Type), + ConstPtr(&'static Type), + M128, + M128BH, + M128D, + M128H, + M128I, + M256, + M256BH, + M256D, + M256H, + M256I, + M512, + M512BH, + M512D, + M512H, + M512I, + MMASK8, + MMASK16, + MMASK32, + MMASK64, + MM_CMPINT_ENUM, + MM_MANTISSA_NORM_ENUM, + MM_MANTISSA_SIGN_ENUM, + MM_PERM_ENUM, + Tuple, + CpuidResult, + Never, + Ordering, +} + +stdarch_verify::x86_functions!(static FUNCTIONS); + +#[derive(Deserialize)] +struct Data { + #[serde(rename = "intrinsic", default)] + intrinsics: Vec, +} + +#[derive(Deserialize)] +struct Intrinsic { + #[serde(rename = "return")] + return_: Return, + #[serde(rename = "@name")] + name: String, + #[serde(rename = "@tech")] + tech: String, + #[serde(rename = "CPUID", default)] + cpuid: Vec, + #[serde(rename = "parameter", default)] + parameters: Vec, + #[serde(rename = "@sequence", default)] + generates_sequence: bool, + #[serde(default)] + instruction: Vec, +} + +#[derive(Deserialize)] +struct Parameter { + #[serde(rename = "@type")] + type_: String, + #[serde(rename = "@etype", default)] + etype: String, +} + +#[derive(Deserialize)] +struct Return { + #[serde(rename = "@type", default)] + type_: String, +} + +#[derive(Deserialize, Debug)] +struct Instruction { + #[serde(rename = "@name")] + name: String, +} + +macro_rules! bail { + ($($t:tt)*) => { return Err(format!($($t)*)) } +} + +#[test] +fn verify_all_signatures() { + // This XML document was downloaded from Intel's site. To update this you + // can visit intel's intrinsics guide online documentation: + // + // https://software.intel.com/sites/landingpage/IntrinsicsGuide/# + // + // Open up the network console and you'll see an xml file was downloaded + // (currently called data-3.6.9.xml). That's the file we downloaded + // here. 
+ let xml = include_bytes!("../x86-intel.xml"); + + let xml = &xml[..]; + let data: Data = quick_xml::de::from_reader(xml).expect("failed to deserialize xml"); + let mut map = HashMap::new(); + for intrinsic in &data.intrinsics { + map.entry(&intrinsic.name[..]) + .or_insert_with(Vec::new) + .push(intrinsic); + } + + let mut all_valid = true; + 'outer: for rust in FUNCTIONS { + if !rust.has_test { + // FIXME: this list should be almost empty + let skip = [ + // MXCSR - deprecated, immediate UB + "_mm_getcsr", + "_mm_setcsr", + "_MM_GET_EXCEPTION_MASK", + "_MM_GET_EXCEPTION_STATE", + "_MM_GET_FLUSH_ZERO_MODE", + "_MM_GET_ROUNDING_MODE", + "_MM_SET_EXCEPTION_MASK", + "_MM_SET_EXCEPTION_STATE", + "_MM_SET_FLUSH_ZERO_MODE", + "_MM_SET_ROUNDING_MODE", + // CPUID + "__cpuid_count", + "__cpuid", + "__get_cpuid_max", + // Privileged, see https://github.com/rust-lang/stdarch/issues/209 + "_xsetbv", + "_xsaves", + "_xrstors", + "_xsaves64", + "_xrstors64", + "_mm_loadiwkey", + // RDRAND + "_rdrand16_step", + "_rdrand32_step", + "_rdrand64_step", + "_rdseed16_step", + "_rdseed32_step", + "_rdseed64_step", + // Prefetch + "_mm_prefetch", + // CMPXCHG + "cmpxchg16b", + // Undefined + "_mm_undefined_ps", + "_mm_undefined_pd", + "_mm_undefined_si128", + "_mm_undefined_ph", + "_mm256_undefined_ps", + "_mm256_undefined_pd", + "_mm256_undefined_si256", + "_mm256_undefined_ph", + "_mm512_undefined_ps", + "_mm512_undefined_pd", + "_mm512_undefined_epi32", + "_mm512_undefined", + "_mm512_undefined_ph", + // Has doc-tests instead + "_mm256_shuffle_epi32", + "_mm256_unpackhi_epi8", + "_mm256_unpacklo_epi8", + "_mm256_unpackhi_epi16", + "_mm256_unpacklo_epi16", + "_mm256_unpackhi_epi32", + "_mm256_unpacklo_epi32", + "_mm256_unpackhi_epi64", + "_mm256_unpacklo_epi64", + // Has tests with some other intrinsic + "__writeeflags", + "_xrstor", + "_xrstor64", + "_fxrstor", + "_fxrstor64", + "_xend", + "_xabort_code", + // Aliases + "_mm_comige_ss", + "_mm_cvt_ss2si", + "_mm_cvtt_ss2si", + "_mm_cvt_si2ss", + "_mm_set_ps1", + "_mm_load_ps1", + "_mm_store_ps1", + "_mm_bslli_si128", + "_mm_bsrli_si128", + "_bextr2_u32", + "_mm_tzcnt_32", + "_mm256_bslli_epi128", + "_mm256_bsrli_epi128", + "_mm_cvtsi64x_si128", + "_mm_cvtsi128_si64x", + "_mm_cvtsi64x_sd", + "_bextr2_u64", + "_mm_tzcnt_64", + ]; + if !skip.contains(&rust.name) { + println!( + "missing run-time test named `test_{}` for `{}`", + { + let mut id = rust.name; + while id.starts_with('_') { + id = &id[1..]; + } + id + }, + rust.name + ); + all_valid = false; + } + } + + match rust.name { + // These aren't defined by Intel but they're defined by what appears + // to be all other compilers. For more information see + // rust-lang/stdarch#307, and otherwise these signatures + // have all been manually verified. 
+ "__readeflags" | + "__writeeflags" | + "__cpuid_count" | + "__cpuid" | + "__get_cpuid_max" | + "_MM_SHUFFLE" | + "_xabort_code" | + // Not listed with intel, but manually verified + "cmpxchg16b" + => continue, + _ => {} + } + + // these are all AMD-specific intrinsics + if let Some(feature) = rust.target_feature { + if feature.contains("sse4a") || feature.contains("tbm") { + continue; + } + } + + let intel = match map.remove(rust.name) { + Some(i) => i, + None => panic!("missing intel definition for {}", rust.name), + }; + + let mut errors = Vec::new(); + for intel in intel { + match matches(rust, intel) { + Ok(()) => continue 'outer, + Err(e) => errors.push(e), + } + } + println!("failed to verify `{}`", rust.name); + for error in errors { + println!(" * {error}"); + } + all_valid = false; + } + assert!(all_valid); + + if GENERATE_MISSING_X86_MD { + print_missing( + &map, + BufWriter::new(File::create("../core_arch/missing-x86.md").unwrap()), + ) + .unwrap(); + } +} + +fn print_missing(map: &HashMap<&str, Vec<&Intrinsic>>, mut f: impl Write) -> io::Result<()> { + let mut missing = BTreeMap::new(); // BTreeMap to keep the cpuids ordered + + // we cannot use SVML and MMX, and MPX is not in LLVM, and intrinsics without any cpuid requirement + // are accessible from safe rust + for intrinsic in map.values().flatten().filter(|intrinsic| { + intrinsic.tech != "SVML" + && intrinsic.tech != "MMX" + && !intrinsic.cpuid.is_empty() + && !intrinsic.cpuid.contains(&"MPX".to_string()) + && intrinsic.return_.type_ != "__m64" + && !intrinsic + .parameters + .iter() + .any(|param| param.type_.contains("__m64")) + }) { + missing + .entry(&intrinsic.cpuid) + .or_insert_with(Vec::new) + .push(intrinsic); + } + + for (k, v) in &mut missing { + v.sort_by_key(|intrinsic| &intrinsic.name); // sort to make the order of everything same + writeln!(f, "\n

<details><summary>{k:?}</summary><p>
\n")?; + for intel in v { + let url = format!( + "https://software.intel.com/sites/landingpage\ + /IntrinsicsGuide/#text={}", + intel.name + ); + writeln!(f, " * [ ] [`{}`]({url})", intel.name)?; + } + writeln!(f, "
</p></details>
\n")?; + } + + f.flush() +} + +fn check_target_features(rust: &Function, intel: &Intrinsic) -> Result<(), String> { + // Verify that all `#[target_feature]` annotations are correct, + // ensuring that we've actually enabled the right instruction + // set for this intrinsic. + match rust.name { + "_bswap" | "_bswap64" => {} + + // These don't actually have a target feature unlike their brethren with + // the `x` inside the name which requires adx + "_addcarry_u32" | "_addcarry_u64" | "_subborrow_u32" | "_subborrow_u64" => {} + + "_bittest" + | "_bittestandset" + | "_bittestandreset" + | "_bittestandcomplement" + | "_bittest64" + | "_bittestandset64" + | "_bittestandreset64" + | "_bittestandcomplement64" => {} + + _ => { + if intel.cpuid.is_empty() { + bail!("missing cpuid for {}", rust.name); + } + } + } + + let rust_features = match rust.target_feature { + Some(features) => features + .split(',') + .map(|feature| feature.to_string()) + .collect(), + None => HashSet::new(), + }; + + let mut intel_cpuids = HashSet::new(); + + for cpuid in &intel.cpuid { + // The pause intrinsic is in the SSE2 module, but it is backwards + // compatible with CPUs without SSE2, and it therefore does not need the + // target-feature attribute. + if rust.name == "_mm_pause" { + continue; + } + + // these flags on the rdtsc/rtdscp intrinsics we don't test for right + // now, but we may wish to add these one day! + // + // For more info see #308 + if *cpuid == "TSC" || *cpuid == "RDTSCP" { + continue; + } + + // Some CPUs support VAES/GFNI/VPCLMULQDQ without AVX512, even though + // the Intel documentation states that those instructions require + // AVX512VL. + if *cpuid == "AVX512VL" + && intel + .cpuid + .iter() + .any(|x| matches!(&**x, "VAES" | "GFNI" | "VPCLMULQDQ")) + { + continue; + } + + let cpuid = cpuid.to_lowercase().replace('_', ""); + + // Fix mismatching feature names: + let fixed_cpuid = match cpuid.as_ref() { + // The XML file names IFMA as "avx512ifma52", while Rust calls + // it "avx512ifma". 
+ "avx512ifma52" => String::from("avx512ifma"), + "xss" => String::from("xsaves"), + "keylocker" => String::from("kl"), + "keylockerwide" => String::from("widekl"), + _ => cpuid, + }; + + intel_cpuids.insert(fixed_cpuid); + } + + if intel_cpuids.contains("gfni") { + if rust.name.contains("mask") { + // LLVM requires avx512bw for all masked GFNI intrinsics, and also avx512vl for the 128- and 256-bit versions + if !rust.name.starts_with("_mm512") { + intel_cpuids.insert(String::from("avx512vl")); + } + intel_cpuids.insert(String::from("avx512bw")); + } else if rust.name.starts_with("_mm256") { + // LLVM requires AVX for all non-masked 256-bit GFNI intrinsics + intel_cpuids.insert(String::from("avx")); + } + } + + // Also, 512-bit vpclmulqdq intrisic requires avx512f + if &rust.name == &"_mm512_clmulepi64_epi128" { + intel_cpuids.insert(String::from("avx512f")); + } + + if rust_features != intel_cpuids { + bail!( + "Intel cpuids `{:?}` doesn't match Rust `{:?}` for {}", + intel_cpuids, + rust_features, + rust.name + ); + } + + Ok(()) +} + +fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { + check_target_features(rust, intel)?; + + if PRINT_INSTRUCTION_VIOLATIONS { + if rust.instrs.is_empty() { + if !intel.instruction.is_empty() && !intel.generates_sequence { + println!( + "instruction not listed for `{}`, but intel lists {:?}", + rust.name, intel.instruction + ); + } + + // If intel doesn't list any instructions and we do then don't + // bother trying to look for instructions in intel, we've just got + // some extra assertions on our end. + } else if !intel.instruction.is_empty() { + for instr in rust.instrs { + let asserting = intel + .instruction + .iter() + .any(|a| a.name.to_lowercase().starts_with(instr)); + if !asserting { + println!( + "intel failed to list `{}` as an instruction for `{}`", + instr, rust.name + ); + } + } + } + } + + // Make sure we've got the right return type. + if let Some(t) = rust.ret { + equate(t, &intel.return_.type_, "", intel, false)?; + } else if !intel.return_.type_.is_empty() && intel.return_.type_ != "void" { + bail!( + "{} returns `{}` with intel, void in rust", + rust.name, + intel.return_.type_ + ); + } + + // If there's no arguments on Rust's side intel may list one "void" + // argument, so handle that here. + if rust.arguments.is_empty() && intel.parameters.len() == 1 { + if intel.parameters[0].type_ != "void" { + bail!("rust has 0 arguments, intel has one for") + } + } else { + // Otherwise we want all parameters to be exactly the same + if rust.arguments.len() != intel.parameters.len() { + bail!("wrong number of arguments on {}", rust.name); + } + for (i, (a, b)) in intel.parameters.iter().zip(rust.arguments).enumerate() { + let is_const = rust.required_const.contains(&i); + equate(b, &a.type_, &a.etype, &intel, is_const)?; + } + } + + let any_i64 = rust + .arguments + .iter() + .cloned() + .chain(rust.ret) + .any(|arg| matches!(*arg, Type::PrimSigned(64) | Type::PrimUnsigned(64))); + let any_i64_exempt = match rust.name { + // These intrinsics have all been manually verified against Clang's + // headers to be available on x86, and the u64 arguments seem + // spurious I guess? 
+ "_xsave" | "_xrstor" | "_xsetbv" | "_xgetbv" | "_xsaveopt" | "_xsavec" | "_xsaves" + | "_xrstors" => true, + + // Apparently all of clang/msvc/gcc accept these intrinsics on + // 32-bit, so let's do the same + "_mm_set_epi64x" + | "_mm_set1_epi64x" + | "_mm256_set_epi64x" + | "_mm256_setr_epi64x" + | "_mm256_set1_epi64x" + | "_mm512_set1_epi64" + | "_mm256_mask_set1_epi64" + | "_mm256_maskz_set1_epi64" + | "_mm_mask_set1_epi64" + | "_mm_maskz_set1_epi64" + | "_mm512_set4_epi64" + | "_mm512_setr4_epi64" + | "_mm512_set_epi64" + | "_mm512_setr_epi64" + | "_mm512_reduce_add_epi64" + | "_mm512_mask_reduce_add_epi64" + | "_mm512_reduce_mul_epi64" + | "_mm512_mask_reduce_mul_epi64" + | "_mm512_reduce_max_epi64" + | "_mm512_mask_reduce_max_epi64" + | "_mm512_reduce_max_epu64" + | "_mm512_mask_reduce_max_epu64" + | "_mm512_reduce_min_epi64" + | "_mm512_mask_reduce_min_epi64" + | "_mm512_reduce_min_epu64" + | "_mm512_mask_reduce_min_epu64" + | "_mm512_reduce_and_epi64" + | "_mm512_mask_reduce_and_epi64" + | "_mm512_reduce_or_epi64" + | "_mm512_mask_reduce_or_epi64" + | "_mm512_mask_set1_epi64" + | "_mm512_maskz_set1_epi64" + | "_mm_cvt_roundss_si64" + | "_mm_cvt_roundss_i64" + | "_mm_cvt_roundss_u64" + | "_mm_cvtss_i64" + | "_mm_cvtss_u64" + | "_mm_cvt_roundsd_si64" + | "_mm_cvt_roundsd_i64" + | "_mm_cvt_roundsd_u64" + | "_mm_cvtsd_i64" + | "_mm_cvtsd_u64" + | "_mm_cvt_roundi64_ss" + | "_mm_cvt_roundi64_sd" + | "_mm_cvt_roundsi64_ss" + | "_mm_cvt_roundsi64_sd" + | "_mm_cvt_roundu64_ss" + | "_mm_cvt_roundu64_sd" + | "_mm_cvti64_ss" + | "_mm_cvti64_sd" + | "_mm_cvtt_roundss_si64" + | "_mm_cvtt_roundss_i64" + | "_mm_cvtt_roundss_u64" + | "_mm_cvttss_i64" + | "_mm_cvttss_u64" + | "_mm_cvtt_roundsd_si64" + | "_mm_cvtt_roundsd_i64" + | "_mm_cvtt_roundsd_u64" + | "_mm_cvttsd_i64" + | "_mm_cvttsd_u64" + | "_mm_cvtu64_ss" + | "_mm_cvtu64_sd" => true, + + // These return a 64-bit argument but they're assembled from other + // 32-bit registers, so these work on 32-bit just fine. See #308 for + // more info. 
+ "_rdtsc" | "__rdtscp" => true, + + _ => false, + }; + if any_i64 && !any_i64_exempt && !rust.file.contains("x86_64") { + bail!( + "intrinsic `{}` uses a 64-bit bare type but may be \ + available on 32-bit platforms", + rust.name + ); + } + if !rust.doc.contains("Intel") { + bail!("No link to Intel"); + } + let recognized_links = [ + "https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html", + "https://software.intel.com/sites/landingpage/IntrinsicsGuide/", + ]; + if !recognized_links.iter().any(|link| rust.doc.contains(link)) { + bail!("Unrecognized Intel Link"); + } + if !rust.doc.contains(&rust.name[1..]) { + // We can leave the leading underscore + bail!("Bad link to Intel"); + } + Ok(()) +} + +fn pointed_type(intrinsic: &Intrinsic) -> Result { + Ok( + if intrinsic.tech == "AMX" + || intrinsic + .cpuid + .iter() + .any(|cpuid| matches!(&**cpuid, "KEYLOCKER" | "KEYLOCKER_WIDE" | "XSAVE" | "FXSR")) + { + // AMX, KEYLOCKER and XSAVE intrinsics should take `*u8` + U8 + } else if intrinsic.name == "_mm_clflush" { + // Just a false match in the following logic + U8 + } else if ["_mm_storeu_si", "_mm_loadu_si"] + .iter() + .any(|x| intrinsic.name.starts_with(x)) + { + // These have already been stabilized, so cannot be changed anymore + U8 + } else if intrinsic.name.ends_with("i8") { + I8 + } else if intrinsic.name.ends_with("i16") { + I16 + } else if intrinsic.name.ends_with("i32") { + I32 + } else if intrinsic.name.ends_with("i64") { + I64 + } else if intrinsic.name.ends_with("i128") { + M128I + } else if intrinsic.name.ends_with("i256") { + M256I + } else if intrinsic.name.ends_with("i512") { + M512I + } else if intrinsic.name.ends_with("h") { + F16 + } else if intrinsic.name.ends_with("s") { + F32 + } else if intrinsic.name.ends_with("d") { + F64 + } else { + bail!( + "Don't know what type of *void to use for {}", + intrinsic.name + ); + }, + ) +} + +fn equate( + t: &Type, + intel: &str, + etype: &str, + intrinsic: &Intrinsic, + is_const: bool, +) -> Result<(), String> { + // Make pointer adjacent to the type: float * foo => float* foo + let mut intel = intel.replace(" *", "*"); + // Make mutability modifier adjacent to the pointer: + // float const * foo => float const* foo + intel = intel.replace("const *", "const*"); + // Normalize mutability modifier to after the type: + // const float* foo => float const* + if intel.starts_with("const") && intel.ends_with('*') { + intel = intel.replace("const ", ""); + intel = intel.replace('*', " const*"); + } + if etype == "IMM" || intel == "constexpr int" { + // The _bittest intrinsics claim to only accept immediates but actually + // accept run-time values as well. 
+ if !is_const && !intrinsic.name.starts_with("_bittest") { + bail!("argument required to be const but isn't"); + } + } else { + // const int must be an IMM + assert_ne!(intel, "const int"); + if is_const { + bail!("argument is const but shouldn't be"); + } + } + match (t, &intel[..]) { + (&Type::PrimFloat(16), "_Float16") => {} + (&Type::PrimFloat(32), "float") => {} + (&Type::PrimFloat(64), "double") => {} + (&Type::PrimSigned(8), "__int8" | "char") => {} + (&Type::PrimSigned(16), "__int16" | "short") => {} + (&Type::PrimSigned(32), "__int32" | "constexpr int" | "const int" | "int") => {} + (&Type::PrimSigned(64), "__int64" | "long long") => {} + (&Type::PrimUnsigned(8), "unsigned char") => {} + (&Type::PrimUnsigned(16), "unsigned short") => {} + (&Type::BFloat16, "__bfloat16") => {} + ( + &Type::PrimUnsigned(32), + "unsigned __int32" | "unsigned int" | "unsigned long" | "const unsigned int", + ) => {} + (&Type::PrimUnsigned(64), "unsigned __int64") => {} + (&Type::PrimUnsigned(SS), "size_t") => {} + + (&Type::M128, "__m128") => {} + (&Type::M128BH, "__m128bh") => {} + (&Type::M128I, "__m128i") => {} + (&Type::M128D, "__m128d") => {} + (&Type::M128H, "__m128h") => {} + (&Type::M256, "__m256") => {} + (&Type::M256BH, "__m256bh") => {} + (&Type::M256I, "__m256i") => {} + (&Type::M256D, "__m256d") => {} + (&Type::M256H, "__m256h") => {} + (&Type::M512, "__m512") => {} + (&Type::M512BH, "__m512bh") => {} + (&Type::M512I, "__m512i") => {} + (&Type::M512D, "__m512d") => {} + (&Type::M512H, "__m512h") => {} + (&Type::MMASK64, "__mmask64") => {} + (&Type::MMASK32, "__mmask32") => {} + (&Type::MMASK16, "__mmask16") => {} + (&Type::MMASK8, "__mmask8") => {} + + (&Type::MutPtr(_type), "void*") | (&Type::ConstPtr(_type), "void const*") => { + let pointed_type = pointed_type(intrinsic)?; + if _type != &pointed_type { + bail!( + "incorrect void pointer type {_type:?} in {}, should be pointer to {pointed_type:?}", + intrinsic.name, + ); + } + } + + (&Type::MutPtr(&Type::PrimFloat(32)), "float*") => {} + (&Type::MutPtr(&Type::PrimFloat(64)), "double*") => {} + (&Type::MutPtr(&Type::PrimSigned(8)), "char*") => {} + (&Type::MutPtr(&Type::PrimSigned(32)), "__int32*" | "int*") => {} + (&Type::MutPtr(&Type::PrimSigned(64)), "__int64*") => {} + (&Type::MutPtr(&Type::PrimUnsigned(8)), "unsigned char*") => {} + (&Type::MutPtr(&Type::PrimUnsigned(16)), "unsigned short*") => {} + (&Type::MutPtr(&Type::PrimUnsigned(32)), "unsigned int*" | "unsigned __int32*") => {} + (&Type::MutPtr(&Type::PrimUnsigned(64)), "unsigned __int64*") => {} + + (&Type::MutPtr(&Type::MMASK8), "__mmask8*") => {} + (&Type::MutPtr(&Type::MMASK32), "__mmask32*") => {} + (&Type::MutPtr(&Type::MMASK64), "__mmask64*") => {} + (&Type::MutPtr(&Type::MMASK16), "__mmask16*") => {} + + (&Type::MutPtr(&Type::M128), "__m128*") => {} + (&Type::MutPtr(&Type::M128BH), "__m128bh*") => {} + (&Type::MutPtr(&Type::M128I), "__m128i*") => {} + (&Type::MutPtr(&Type::M128D), "__m128d*") => {} + (&Type::MutPtr(&Type::M256), "__m256*") => {} + (&Type::MutPtr(&Type::M256BH), "__m256bh*") => {} + (&Type::MutPtr(&Type::M256I), "__m256i*") => {} + (&Type::MutPtr(&Type::M256D), "__m256d*") => {} + (&Type::MutPtr(&Type::M512), "__m512*") => {} + (&Type::MutPtr(&Type::M512BH), "__m512bh*") => {} + (&Type::MutPtr(&Type::M512I), "__m512i*") => {} + (&Type::MutPtr(&Type::M512D), "__m512d*") => {} + + (&Type::ConstPtr(&Type::PrimFloat(16)), "_Float16 const*") => {} + (&Type::ConstPtr(&Type::PrimFloat(32)), "float const*") => {} + (&Type::ConstPtr(&Type::PrimFloat(64)), 
"double const*") => {} + (&Type::ConstPtr(&Type::PrimSigned(8)), "char const*") => {} + (&Type::ConstPtr(&Type::PrimSigned(32)), "__int32 const*" | "int const*") => {} + (&Type::ConstPtr(&Type::PrimSigned(64)), "__int64 const*") => {} + (&Type::ConstPtr(&Type::PrimUnsigned(16)), "unsigned short const*") => {} + (&Type::ConstPtr(&Type::PrimUnsigned(32)), "unsigned int const*") => {} + (&Type::ConstPtr(&Type::PrimUnsigned(64)), "unsigned __int64 const*") => {} + (&Type::ConstPtr(&Type::BFloat16), "__bf16 const*") => {} + + (&Type::ConstPtr(&Type::M128), "__m128 const*") => {} + (&Type::ConstPtr(&Type::M128BH), "__m128bh const*") => {} + (&Type::ConstPtr(&Type::M128I), "__m128i const*") => {} + (&Type::ConstPtr(&Type::M128D), "__m128d const*") => {} + (&Type::ConstPtr(&Type::M128H), "__m128h const*") => {} + (&Type::ConstPtr(&Type::M256), "__m256 const*") => {} + (&Type::ConstPtr(&Type::M256BH), "__m256bh const*") => {} + (&Type::ConstPtr(&Type::M256I), "__m256i const*") => {} + (&Type::ConstPtr(&Type::M256D), "__m256d const*") => {} + (&Type::ConstPtr(&Type::M256H), "__m256h const*") => {} + (&Type::ConstPtr(&Type::M512), "__m512 const*") => {} + (&Type::ConstPtr(&Type::M512BH), "__m512bh const*") => {} + (&Type::ConstPtr(&Type::M512I), "__m512i const*") => {} + (&Type::ConstPtr(&Type::M512D), "__m512d const*") => {} + + (&Type::ConstPtr(&Type::MMASK8), "__mmask8*") => {} + (&Type::ConstPtr(&Type::MMASK16), "__mmask16*") => {} + (&Type::ConstPtr(&Type::MMASK32), "__mmask32*") => {} + (&Type::ConstPtr(&Type::MMASK64), "__mmask64*") => {} + + (&Type::MM_CMPINT_ENUM, "_MM_CMPINT_ENUM") => {} + (&Type::MM_MANTISSA_NORM_ENUM, "_MM_MANTISSA_NORM_ENUM") => {} + (&Type::MM_MANTISSA_SIGN_ENUM, "_MM_MANTISSA_SIGN_ENUM") => {} + (&Type::MM_PERM_ENUM, "_MM_PERM_ENUM") => {} + + // This is a macro (?) in C which seems to mutate its arguments, but + // that means that we're taking pointers to arguments in rust + // as we're not exposing it as a macro. + (&Type::MutPtr(&Type::M128), "__m128") if intrinsic.name == "_MM_TRANSPOSE4_PS" => {} + + // The _rdtsc intrinsic uses a __int64 return type, but this is a bug in + // the intrinsics guide: https://github.com/rust-lang/stdarch/issues/559 + // We have manually fixed the bug by changing the return type to `u64`. + (&Type::PrimUnsigned(64), "__int64") if intrinsic.name == "_rdtsc" => {} + + // The _bittest and _bittest64 intrinsics takes a mutable pointer in the + // intrinsics guide even though it never writes through the pointer: + (&Type::ConstPtr(&Type::PrimSigned(32)), "__int32*") if intrinsic.name == "_bittest" => {} + (&Type::ConstPtr(&Type::PrimSigned(64)), "__int64*") if intrinsic.name == "_bittest64" => {} + // The _xrstor, _fxrstor, _xrstor64, _fxrstor64 intrinsics take a + // mutable pointer in the intrinsics guide even though they never write + // through the pointer: + (&Type::ConstPtr(&Type::PrimUnsigned(8)), "void*") + if matches!( + &*intrinsic.name, + "_xrstor" | "_xrstor64" | "_fxrstor" | "_fxrstor64" + ) => {} + // The _mm_stream_load_si128 intrinsic take a mutable pointer in the intrinsics + // guide even though they never write through the pointer + (&Type::ConstPtr(&Type::M128I), "void*") if intrinsic.name == "_mm_stream_load_si128" => {} + /// Intel requires the mask argument for _mm_shuffle_ps to be an + // unsigned integer, but all other _mm_shuffle_.. intrinsics + // take a signed-integer. 
This breaks `_MM_SHUFFLE` for + // `_mm_shuffle_ps` + (&Type::PrimSigned(32), "unsigned int") if intrinsic.name == "_mm_shuffle_ps" => {} + + _ => bail!( + "failed to equate: `{intel}` and {t:?} for {}", + intrinsic.name + ), + } + Ok(()) +} diff --git a/library/stdarch/crates/stdarch-verify/x86-intel.xml b/library/stdarch/crates/stdarch-verify/x86-intel.xml new file mode 100644 index 0000000000000..41f2119e681f9 --- /dev/null +++ b/library/stdarch/crates/stdarch-verify/x86-intel.xml @@ -0,0 +1,158422 @@ + + + + + + + + Add unsigned 32-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry or overflow flag), and store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag). + +tmp[32:0] := a[31:0] + b[31:0] + (c_in > 0 ? 1 : 0) +MEM[out+31:out] := tmp[31:0] +dst[0] := tmp[32] +dst[7:1] := 0 + + + + ADX +
immintrin.h
+ Arithmetic +
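This ADX entry corresponds to the `_addcarry_u32` intrinsic referenced in the verification test above, which Rust exposes from `core::arch::x86_64`. A minimal usage sketch, assuming an x86_64 target; the wrapper name and values are illustrative:

    #[cfg(target_arch = "x86_64")]
    fn add_with_carry_demo() {
        use core::arch::x86_64::_addcarry_u32;

        let mut out = 0u32;
        // u32::MAX + 1 with no carry-in: the 32-bit result wraps to 0 and the
        // carry-out (bit 32 of the wide sum) is returned as 1.
        let carry = unsafe { _addcarry_u32(0, u32::MAX, 1, &mut out) };
        assert_eq!((carry, out), (1, 0));
    }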
+ + + + + + + Add unsigned 64-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry or overflow flag), and store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag). + +tmp[64:0] := a[63:0] + b[63:0] + (c_in > 0 ? 1 : 0) +MEM[out+63:out] := tmp[63:0] +dst[0] := tmp[64] +dst[7:1] := 0 + + + + ADX +
immintrin.h
+ Arithmetic +
+ + + + + Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst"." + a[127:0] := ShiftRows(a[127:0]) +a[127:0] := SubBytes(a[127:0]) +a[127:0] := MixColumns(a[127:0]) +dst[127:0] := a[127:0] XOR RoundKey[127:0] + + + AES +
wmmintrin.h
+ Cryptography +
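This round operation matches the `_mm_aesenc_si128` intrinsic that `core::arch::x86_64` exposes behind the `aes` target feature. A hedged sketch of a thin wrapper (the function name is illustrative; callers are expected to check `is_x86_feature_detected!("aes")` before calling it):

    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::{__m128i, _mm_aesenc_si128};

    /// One AES encryption round: ShiftRows, SubBytes, MixColumns, then XOR
    /// with the round key, as in the pseudocode above.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "aes")]
    unsafe fn aes_encrypt_round(state: __m128i, round_key: __m128i) -> __m128i {
        _mm_aesenc_si128(state, round_key)
    }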
+ + + + + Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst"." + a[127:0] := ShiftRows(a[127:0]) +a[127:0] := SubBytes(a[127:0]) +dst[127:0] := a[127:0] XOR RoundKey[127:0] + + + AES +
wmmintrin.h
+ Cryptography +
+ + + + + Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst". + a[127:0] := InvShiftRows(a[127:0]) +a[127:0] := InvSubBytes(a[127:0]) +a[127:0] := InvMixColumns(a[127:0]) +dst[127:0] := a[127:0] XOR RoundKey[127:0] + + + AES +
wmmintrin.h
+ Cryptography +
+ + + + + Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst". + a[127:0] := InvShiftRows(a[127:0]) +a[127:0] := InvSubBytes(a[127:0]) +dst[127:0] := a[127:0] XOR RoundKey[127:0] + + + AES +
wmmintrin.h
+ Cryptography +
+ + + + Perform the InvMixColumns transformation on "a" and store the result in "dst". + dst[127:0] := InvMixColumns(a[127:0]) + + + AES +
wmmintrin.h
+ Cryptography +
+ + + + + Assist in expanding the AES cipher key by computing steps towards generating a round key for encryption cipher using data from "a" and an 8-bit round constant specified in "imm8", and store the result in "dst"." + X3[31:0] := a[127:96] +X2[31:0] := a[95:64] +X1[31:0] := a[63:32] +X0[31:0] := a[31:0] +RCON[31:0] := ZeroExtend32(imm8[7:0]) +dst[31:0] := SubWord(X1) +dst[63:32] := RotWord(SubWord(X1)) XOR RCON +dst[95:64] := SubWord(X3) +dst[127:96] := RotWord(SubWord(X3)) XOR RCON + + + AES +
wmmintrin.h
+ Cryptography +
+ + + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in tiles "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "dst", and store the 32-bit result back to tile "dst". + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(a.row[m].bf16[2*k+0]) * FP32(b.row[k].bf16[2*n+0]) + tmp.fp32[n] += FP32(a.row[m].bf16[2*k+1]) * FP32(b.row[k].bf16[2*n+1]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-BF16 +
immintrin.h
+ Application-Targeted +
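The FP32(...) widening in the pseudocode above is the usual BF16-to-single conversion: BF16 is the upper 16 bits of an IEEE f32, so widening is a 16-bit left shift of the raw bits. A scalar sketch of one pair-accumulation step (names are illustrative, and the rounding behaviour of the real hardware is not modelled):

    // Widen a raw BF16 bit pattern to f32 (BF16 == the upper 16 bits of an f32).
    fn bf16_to_f32(bits: u16) -> f32 {
        f32::from_bits((bits as u32) << 16)
    }

    // One inner step of the tile dot-product: accumulate one BF16 pair from
    // each operand into a single-precision accumulator.
    fn dpbf16_pair(acc: f32, a: [u16; 2], b: [u16; 2]) -> f32 {
        acc + bf16_to_f32(a[0]) * bf16_to_f32(b[0]) + bf16_to_f32(a[1]) * bf16_to_f32(b[1])
    }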
+ + + Compute dot-product of BF16 (16-bit) floating-point pairs in tiles "src0" and "src1", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(src0.row[m].bf16[2*k+0]) * FP32(src1.row[k].bf16[2*n+0]) + tmp.fp32[n] += FP32(src0.row[m].bf16[2*k+1]) * FP32(src1.row[k].bf16[2*n+1]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-BF16 +
immintrin.h
+ Application-Targeted +
+ + + + + + + Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. Each dword element in input tiles "a" and "b" is interpreted as a complex number with FP16 real part and FP16 imaginary part. Calculates the imaginary part of the result. For each possible combination of (row of "a", column of "b"), it performs a set of multiplication and accumulations on all corresponding complex numbers (one from "a" and one from "b"). The imaginary part of the "a" element is multiplied with the real part of the corresponding "b" element, and the real part of the "a" element is multiplied with the imaginary part of the corresponding "b" elements. The two accumulated results are added, and then accumulated into the corresponding row and column of "dst". + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1]) + tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-COMPLEX +
immintrin.h
+ Application-Targeted +
+ + + + + + Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. Each dword element in input tiles "a" and "b" is interpreted as a complex number with FP16 real part and FP16 imaginary part. Calculates the real part of the result. For each possible combination of (row of "a", column of "b"), it performs a set of multiplication and accumulations on all corresponding complex numbers (one from "a" and one from "b"). The real part of the "a" element is multiplied with the real part of the corresponding "b" element, and the negated imaginary part of the "a" element is multiplied with the imaginary part of the corresponding "b" elements. The two accumulated results are added, and then accumulated into the corresponding row and column of "dst". + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0]) + tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-COMPLEX +
immintrin.h
+ Application-Targeted +
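Taken together, the two entries above accumulate an ordinary complex product: the real part is re(a)*re(b) - im(a)*im(b) and the imaginary part is re(a)*im(b) + im(a)*re(b). A scalar sketch with the FP16 components already widened to f32 (names are illustrative):

    // Accumulate one complex product a * b into (re_acc, im_acc), matching the
    // real-part and imaginary-part pseudocode of the two entries above.
    fn complex_mul_acc(re_acc: f32, im_acc: f32, a: (f32, f32), b: (f32, f32)) -> (f32, f32) {
        let (ar, ai) = a;
        let (br, bi) = b;
        (re_acc + ar * br - ai * bi, im_acc + ar * bi + ai * br)
    }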
+ + + Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. Each dword element in input tiles "src0" and "src1" is interpreted as a complex number with FP16 real part and FP16 imaginary part. This function calculates the imaginary part of the result. + + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(src0.row[m].fp16[2*k+0]) * FP32(src1.row[k].fp16[2*n+1]) + tmp.fp32[n] += FP32(src0.row[m].fp16[2*k+1]) * FP32(src1.row[k].fp16[2*n+0]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-COMPLEX +
immintrin.h
+ Application-Targeted +
+ + + Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. Each dword element in input tiles src0 and src1 is interpreted as a complex number with FP16 real part and FP16 imaginary part. This function calculates the real part of the result. + + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(src0.row[m].fp16[2*k+0]) * FP32(src1.row[k].fp16[2*n+0]) + tmp.fp32[n] += FP32(-src0.row[m].fp16[2*k+1]) * FP32(src1.row[k].fp16[2*n+1]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-COMPLEX +
immintrin.h
+ Application-Targeted +
+ + + + + + + Compute dot-product of FP16 (16-bit) floating-point pairs in tiles "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "dst", and store the 32-bit result back to tile "dst". + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0]) + tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-FP16 +
immintrin.h
+ Application-Targeted +
+ + + Compute dot-product of FP16 (16-bit) floating-point pairs in tiles "src0" and "src1", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(src0.row[m].fp16[2*k+0]) * FP32(src1.row[k].fp16[2*n+0]) + tmp.fp32[n] += FP32(src0.row[m].fp16[2*k+1]) * FP32(src1.row[k].fp16[2*n+1]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-FP16 +
immintrin.h
+ Application-Targeted +
+ + + + + + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of signed 8-bit integers in "a" with corresponding unsigned 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". + DEFINE DPBD(c, x, y) { + tmp1 := SignExtend32(x.byte[0]) * ZeroExtend32(y.byte[0]) + tmp2 := SignExtend32(x.byte[1]) * ZeroExtend32(y.byte[1]) + tmp3 := SignExtend32(x.byte[2]) * ZeroExtend32(y.byte[2]) + tmp4 := SignExtend32(x.byte[3]) * ZeroExtend32(y.byte[3]) + + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
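The DPBD helper above is a widening 4-byte dot product. A scalar sketch of the signed-by-unsigned variant used by this entry (the function name is illustrative; the i8 and u8 casts model SignExtend32 and ZeroExtend32 respectively):

    // Multiply four signed bytes of `x` with the corresponding unsigned bytes
    // of `y`, widen each product to i32, and add them to the accumulator `c`.
    fn dpbd_su(c: i32, x: [i8; 4], y: [u8; 4]) -> i32 {
        let mut acc = c;
        for i in 0..4 {
            acc = acc.wrapping_add(x[i] as i32 * y[i] as i32);
        }
        acc
    }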
+ + + + + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". + DEFINE DPBD(c, x, y) { + tmp1 := ZeroExtend32(x.byte[0]) * SignExtend32(y.byte[0]) + tmp2 := ZeroExtend32(x.byte[1]) * SignExtend32(y.byte[1]) + tmp3 := ZeroExtend32(x.byte[2]) * SignExtend32(y.byte[2]) + tmp4 := ZeroExtend32(x.byte[3]) * SignExtend32(y.byte[3]) + + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
+ + + + + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding unsigned 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". + DEFINE DPBD(c, x, y) { + tmp1 := ZeroExtend32(x.byte[0]) * ZeroExtend32(y.byte[0]) + tmp2 := ZeroExtend32(x.byte[1]) * ZeroExtend32(y.byte[1]) + tmp3 := ZeroExtend32(x.byte[2]) * ZeroExtend32(y.byte[2]) + tmp4 := ZeroExtend32(x.byte[3]) * ZeroExtend32(y.byte[3]) + + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
+ + + + + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of signed 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". + DEFINE DPBD(c, x, y) { + tmp1 := SignExtend32(x.byte[0]) * SignExtend32(y.byte[0]) + tmp2 := SignExtend32(x.byte[1]) * SignExtend32(y.byte[1]) + tmp3 := SignExtend32(x.byte[2]) * SignExtend32(y.byte[2]) + tmp4 := SignExtend32(x.byte[3]) * SignExtend32(y.byte[3]) + + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
+ + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of signed 8-bit integers in "src0" with corresponding signed 8-bit integers in "src1", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + DEFINE DPBD(c, x, y) { + tmp1 := SignExtend32(x.byte[0]) * SignExtend32(y.byte[0]) + tmp2 := SignExtend32(x.byte[1]) * SignExtend32(y.byte[1]) + tmp3 := SignExtend32(x.byte[2]) * SignExtend32(y.byte[2]) + tmp4 := SignExtend32(x.byte[3]) * SignExtend32(y.byte[3]) + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], src0.row[m].dword[k], src1.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
+ + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of signed 8-bit integers in "src0" with corresponding unsigned 8-bit integers in "src1", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + DEFINE DPBD(c, x, y) { + tmp1 := SignExtend32(x.byte[0]) * ZeroExtend32(y.byte[0]) + tmp2 := SignExtend32(x.byte[1]) * ZeroExtend32(y.byte[1]) + tmp3 := SignExtend32(x.byte[2]) * ZeroExtend32(y.byte[2]) + tmp4 := SignExtend32(x.byte[3]) * ZeroExtend32(y.byte[3]) + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], src0.row[m].dword[k], src1.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
+ + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "src0" with corresponding signed 8-bit integers in "src1", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + DEFINE DPBD(c, x, y) { + tmp1 := ZeroExtend32(x.byte[0]) * SignExtend32(y.byte[0]) + tmp2 := ZeroExtend32(x.byte[1]) * SignExtend32(y.byte[1]) + tmp3 := ZeroExtend32(x.byte[2]) * SignExtend32(y.byte[2]) + tmp4 := ZeroExtend32(x.byte[3]) * SignExtend32(y.byte[3]) + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], src0.row[m].dword[k], src1.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
+ + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "src0" with corresponding unsigned 8-bit integers in "src1", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + DEFINE DPBD(c, x, y) { + tmp1 := ZeroExtend32(x.byte[0]) * ZeroExtend32(y.byte[0]) + tmp2 := ZeroExtend32(x.byte[1]) * ZeroExtend32(y.byte[1]) + tmp3 := ZeroExtend32(x.byte[2]) * ZeroExtend32(y.byte[2]) + tmp4 := ZeroExtend32(x.byte[3]) * ZeroExtend32(y.byte[3]) + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], src0.row[m].dword[k], src1.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
+ + + + + Load tile configuration from a 64-byte memory location specified by "mem_addr". The tile configuration format is specified below, and includes the tile type pallette, the number of bytes per row, and the number of rows. If the specified pallette_id is zero, that signifies the init state for both the tile config and the tile data, and the tiles are zeroed. Any invalid configurations will result in #GP fault. + +// format of memory payload. each field is a byte. +// 0: palette +// 1: start_row +// 2-15: reserved, must be zero +// 16-17: tile0.colsb +// 18-19: tile1.colsb +// 20-21: tile2.colsb +// ... +// 30-31: tile7.colsb +// 32-47: reserved, must be zero +// 48: tile0.rows +// 49: tile1.rows +// 50: tile2.rows +// ... +// 55: tile7.rows +// 56-63: reserved, must be zero + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
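The 64-byte payload described above maps directly onto a plain byte buffer. A hedged sketch of building a configuration for a single tile (tmm0); the helper name is illustrative, and the resulting buffer would be handed to `_tile_loadconfig` (still an unstable intrinsic on the Rust side at the time of writing):

    /// Build the 64-byte tile configuration from the layout above: palette 1,
    /// start_row 0, and `rows` rows of `colsb` bytes for tile0; everything
    /// else, including the reserved ranges, stays zero.
    fn tile0_config(rows: u8, colsb: u16) -> [u8; 64] {
        let mut cfg = [0u8; 64];
        cfg[0] = 1;                                        // byte 0: palette
        cfg[16..18].copy_from_slice(&colsb.to_le_bytes()); // bytes 16-17: tile0.colsb
        cfg[48] = rows;                                    // byte 48: tile0.rows
        cfg
    }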
+ + + + Stores the current tile configuration to a 64-byte memory location specified by "mem_addr". The tile configuration format is specified below, and includes the tile type pallette, the number of bytes per row, and the number of rows. If tiles are not configured, all zeroes will be stored to memory. + +// format of memory payload. each field is a byte. +// 0: palette +// 1: start_row +// 2-15: reserved, must be zero +// 16-17: tile0.colsb +// 18-19: tile1.colsb +// 20-21: tile2.colsb +// ... +// 30-31: tile7.colsb +// 32-47: reserved, must be zero +// 48: tile0.rows +// 49: tile1.rows +// 50: tile2.rows +// ... +// 55: tile7.rows +// 56-63: reserved, must be zero + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + + + + Load tile rows from memory specifieid by "base" address and "stride" into destination tile "dst" using the tile configuration previously configured via "_tile_loadconfig". + start := tileconfig.startRow +IF start == 0 // not restarting, zero incoming state + tilezero(dst) +FI +nbytes := dst.colsb +DO WHILE start < dst.rows + memptr := base + start * stride + write_row_and_zero(dst, start, read_memory(memptr, nbytes), nbytes) + start := start + 1 +OD +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + + + + Load tile rows from memory specifieid by "base" address and "stride" into destination tile "dst" using the tile configuration previously configured via "_tile_loadconfig". This intrinsic provides a hint to the implementation that the data will likely not be reused in the near future and the data caching can be optimized accordingly. + start := tileconfig.startRow +IF start == 0 // not restarting, zero incoming state + tilezero(dst) +FI +nbytes := dst.colsb +DO WHILE start < dst.rows + memptr := base + start * stride + write_row_and_zero(dst, start, read_memory(memptr, nbytes), nbytes) + start := start + 1 +OD +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + Release the tile configuration to return to the init state, which releases all storage it currently holds. + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + + + + Store the tile specified by "src" to memory specifieid by "base" address and "stride" using the tile configuration previously configured via "_tile_loadconfig". + start := tileconfig.startRow +DO WHILE start < src.rows + memptr := base + start * stride + write_memory(memptr, src.colsb, src.row[start]) + start := start + 1 +OD +zero_tileconfig_start() + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + + Zero the tile specified by "tdest". + nbytes := palette_table[tileconfig.palette_id].bytes_per_row +FOR i := 0 TO palette_table[tileconfig.palette_id].max_rows-1 + FOR j := 0 TO nbytes-1 + tdest.row[i].byte[j] := 0 + ENDFOR +ENDFOR + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + Load tile rows from memory specifieid by "base" address and "stride" into destination tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + start := tileconfig.startRow +IF start == 0 // not restarting, zero incoming state + tilezero(dst) +FI +nbytes := dst.colsb +DO WHILE start < dst.rows + memptr := base + start * stride + write_row_and_zero(dst, start, read_memory(memptr, nbytes), nbytes) + start := start + 1 +OD +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + Store the tile specified by "src" to memory specifieid by "base" address and "stride". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + start := tileconfig.startRow +DO WHILE start < src.rows + memptr := base + start * stride + write_memory(memptr, src.colsb, src.row[start]) + start := start + 1 +OD +zero_tileconfig_start() + + + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + Load tile rows from memory specifieid by "base" address and "stride" into destination tile "dst". This intrinsic provides a hint to the implementation that the data will likely not be reused in the near future and the data caching can be optimized accordingly. The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + start := tileconfig.startRow +IF start == 0 // not restarting, zero incoming state + tilezero(dst) +FI +nbytes := dst.colsb +DO WHILE start < dst.rows + memptr := base + start * stride + write_row_and_zero(dst, start, read_memory(memptr, nbytes), nbytes) + start := start + 1 +OD +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + Zero the tile specified by "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + nbytes := palette_table[tileconfig.palette_id].bytes_per_row +FOR i := 0 TO palette_table[tileconfig.palette_id].max_rows-1 + FOR j := 0 TO nbytes-1 + tdest.row[i].byte[j] := 0 + ENDFOR +ENDFOR + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + + + Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ACOS(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
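These SVML-style entries all follow the same lane-wise pattern: apply the scalar operation independently to each packed element. Since SVML functions are not exposed by `core::arch` (the test above filters them out), a plain scalar model of the packed double-precision inverse cosine above is enough to illustrate it:

    // Reference model of the 4-lane packed ACOS: apply f64::acos per lane.
    fn acos_pd_model(a: [f64; 4]) -> [f64; 4] {
        a.map(f64::acos)
    }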
+ + + + Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ACOS(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ACOSH(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ACOSH(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ASIN(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ASIN(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ASINH(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ASINH(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ATAN(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ATAN(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ATANH(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ATANH(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := COS(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := COS(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := COSD(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := COSD(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := COSH(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := COSH(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + + Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + + Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
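As a minimal scalar sketch of the HYPOT pseudocode above (a reference translation, not the SVML implementation that backs the intrinsic), each lane is just `sqrt(a*a + b*b)`; the function name and array layout below are illustrative only.

```rust
/// Scalar reference for the _mm256_hypot_pd pseudocode: one array slot per lane.
fn hypot_pd_reference(a: [f64; 4], b: [f64; 4]) -> [f64; 4] {
    let mut dst = [0.0f64; 4];
    for j in 0..4 {
        // Literal translation of SQRT(POW(a, 2.0) + POW(b, 2.0)).
        // f64::hypot(a, b) computes the same value while avoiding
        // intermediate overflow/underflow and is usually preferable in scalar code.
        dst[j] = (a[j] * a[j] + b[j] * b[j]).sqrt();
    }
    dst
}
```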
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SIN(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SIN(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SIN(a[i+63:i]) + MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SIN(a[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
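A scalar sketch of the sincos pattern above, assuming the same contract as the pseudocode: the sine of each lane is returned and the cosine of the same lane is written out through a separate buffer. `f64::sin_cos` from the Rust standard library is used only as an analogy for computing both values from one argument reduction; the packed intrinsic itself is compiler-provided SVML.

```rust
/// Scalar reference for the _mm256_sincos_pd pseudocode.
fn sincos_pd_reference(a: [f64; 4], cos_out: &mut [f64; 4]) -> [f64; 4] {
    let mut dst = [0.0f64; 4];
    for j in 0..4 {
        // Both values come from a single call, mirroring how the intrinsic
        // stores SIN(a) in dst and COS(a) at mem_addr.
        let (s, c) = a[j].sin_cos();
        dst[j] = s;
        cos_out[j] = c;
    }
    dst
}
```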
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SIND(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SIND(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SINH(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SINH(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := TAN(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := TAN(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := TAND(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := TAND(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := TANH(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := TANH(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CubeRoot(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := CubeRoot(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CEXP(a[31:0], b[31:0]) { + result[31:0] := POW(FP32(e), a[31:0]) * COS(b[31:0]) + result[63:32] := POW(FP32(e), a[31:0]) * SIN(b[31:0]) + RETURN result +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CEXP(a[i+31:i], a[i+63:i+32]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CLOG(a[31:0], b[31:0]) { + result[31:0] := LOG(SQRT(POW(a, 2.0) + POW(b, 2.0))) + result[63:32] := ATAN2(b, a) + RETURN result +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CLOG(a[i+31:i], a[i+63:i+32]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CSQRT(a[31:0], b[31:0]) { + sign[31:0] := (b < 0.0) ? -FP32(1.0) : FP32(1.0) + result[31:0] := SQRT((a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) + result[63:32] := sign * SQRT((-a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) + RETURN result +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CSQRT(a[i+31:i], a[i+63:i+32]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
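The three complex helpers above (CEXP, CLOG, CSQRT) are easier to read as scalar functions on one (re, im) pair, matching the `vec.fp32[0] + i * vec.fp32[1]` layout. These are sketches that mirror the pseudocode formulas only; they are not the SVML implementations, and the function names are illustrative.

```rust
/// CEXP: e^(re + i*im) = e^re * (cos im + i sin im).
fn cexp(re: f32, im: f32) -> (f32, f32) {
    let r = re.exp(); // POW(e, re)
    (r * im.cos(), r * im.sin())
}

/// CLOG: ln(z) = (ln |z|, arg z), with arg z = ATAN2(im, re).
fn clog(re: f32, im: f32) -> (f32, f32) {
    ((re * re + im * im).sqrt().ln(), im.atan2(re))
}

/// CSQRT: principal square root via |z|, with the imaginary part taking
/// the sign of the input's imaginary part.
fn csqrt(re: f32, im: f32) -> (f32, f32) {
    let m = (re * re + im * im).sqrt(); // |z|
    let sign: f32 = if im < 0.0 { -1.0 } else { 1.0 };
    (((re + m) / 2.0).sqrt(), sign * ((m - re) / 2.0).sqrt())
}
```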
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POW(10.0, a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POW(2.0, a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
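As a scalar illustration of why the `- 1.0` above is part of the operation rather than applied afterwards: for inputs near zero, `exp(x) - 1.0` cancels away most significant digits, while a fused exp-minus-one keeps them. The snippet below uses Rust's `f64::exp_m1` purely as an analogy for the packed intrinsic's behavior; it is not the intrinsic itself.

```rust
fn expm1_demo() {
    let x = 1e-12f64;
    let naive = x.exp() - 1.0; // loses roughly half the significant digits
    let fused = x.exp_m1();    // stays accurate for small x
    println!("naive = {naive:e}, fused = {fused:e}");
}
```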
+ + + + Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := InvCubeRoot(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := InvCubeRoot(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := InvSQRT(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := InvSQRT(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := LOG(1.0 + a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LOG(1.0 + a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_pd". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := CDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := InverseCDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := InverseCDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ERF(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ERF(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := 1.0 - ERF(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := 1.0 - ERF(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i])) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := 1.0 / ERF(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := 1.0 / ERF(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + + Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 31 + i := 8*j + IF b[i+7:i] == 0 + #DE + FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 15 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 7 + i := 32*j + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 3 + i := 64*j + IF b[i+63:i] == 0 + #DE + FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 31 + i := 8*j + IF b[i+7:i] == 0 + #DE + FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 15 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 7 + i := 32*j + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 3 + i := 64*j + IF b[i+63:i] == 0 + #DE + FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed 32-bit integers into memory at "mem_addr". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
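A scalar sketch of the div/rem pair above: Rust's integer `/` and `%` already truncate toward zero, matching TRUNCATE and REMAINDER in the pseudocode, and the two results are tied together by `a == q*b + r`. The function name and array layout are illustrative; note that division by zero panics here, whereas the hardware sequence raises #DE.

```rust
/// Scalar reference for the packed 32-bit divide-with-remainder pseudocode.
fn divrem_epi32_reference(a: [i32; 8], b: [i32; 8], rem_out: &mut [i32; 8]) -> [i32; 8] {
    let mut q = [0i32; 8];
    for j in 0..8 {
        // Rust integer division truncates toward zero, matching TRUNCATE(...).
        q[j] = a[j] / b[j];
        rem_out[j] = a[j] % b[j];
        // Invariant: a[j] == q[j] * b[j] + rem_out[j] for truncated division.
    }
    q
}
```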
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst". + FOR j := 0 to 31 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst". + FOR j := 0 to 15 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst". + FOR j := 0 to 3 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst". + FOR j := 0 to 31 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst". + FOR j := 0 to 15 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst". + FOR j := 0 to 3 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed unsigned 32-bit integers into memory at "mem_addr". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CEIL(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := CEIL(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := FLOOR(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := FLOOR(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ROUND(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ROUND(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := TRUNCATE(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Miscellaneous +
+ + + + Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := TRUNCATE(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Miscellaneous +
+ + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Alternatively add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + b[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Alternatively add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + b[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + dst[i+63:i] := a[i+63:i] / b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := a[i+31:i] / b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + + Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". + +DEFINE DP(a[127:0], b[127:0], imm8[7:0]) { + FOR j := 0 to 3 + i := j*32 + IF imm8[(4+j)%8] + temp[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + temp[i+31:i] := FP32(0.0) + FI + ENDFOR + + sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0]) + + FOR j := 0 to 3 + i := j*32 + IF imm8[j%8] + tmpdst[i+31:i] := sum[31:0] + ELSE + tmpdst[i+31:i] := FP32(0.0) + FI + ENDFOR + RETURN tmpdst[127:0] +} +dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0]) +dst[255:128] := DP(a[255:128], b[255:128], imm8[7:0]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
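A scalar sketch of the per-lane DP helper above: the high four imm8 bits choose which products contribute to the sum, the low four choose which destination slots receive the broadcast sum. The 256-bit intrinsic applies this to each 128-bit lane independently. The summation order here is simplified relative to the pseudocode's (t3+t2)+(t1+t0) grouping, and the function name is illustrative.

```rust
/// One 128-bit lane of the dot-product pseudocode (four f32 elements per lane).
fn dp_lane(a: [f32; 4], b: [f32; 4], imm8: u8) -> [f32; 4] {
    let mut sum = 0.0f32;
    for j in 0..4 {
        // imm8[4+j] gates whether element j participates in the product sum.
        if imm8 & (1u8 << (4 + j)) != 0 {
            sum += a[j] * b[j];
        }
    }
    let mut dst = [0.0f32; 4];
    for j in 0..4 {
        // imm8[j] gates whether destination slot j receives the sum.
        if imm8 & (1u8 << j) != 0 {
            dst[j] = sum;
        }
    }
    dst
}
```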
+ + + + + Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[63:0] := a[127:64] + a[63:0] +dst[127:64] := b[127:64] + b[63:0] +dst[191:128] := a[255:192] + a[191:128] +dst[255:192] := b[255:192] + b[191:128] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[31:0] := a[63:32] + a[31:0] +dst[63:32] := a[127:96] + a[95:64] +dst[95:64] := b[63:32] + b[31:0] +dst[127:96] := b[127:96] + b[95:64] +dst[159:128] := a[191:160] + a[159:128] +dst[191:160] := a[255:224] + a[223:192] +dst[223:192] := b[191:160] + b[159:128] +dst[255:224] := b[255:224] + b[223:192] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
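The horizontal adds interleave pair sums from "a" and "b" within each 128-bit lane, which is easy to misread in the flat bit ranges above; a scalar sketch of the f64 variant makes the output order explicit (array slot j corresponds to the j-th 64-bit element).

```rust
/// Scalar reference for _mm256_hadd_pd: within each 128-bit lane the pair
/// sum from `a` lands in the even slot and the pair sum from `b` in the odd slot.
fn hadd_pd_reference(a: [f64; 4], b: [f64; 4]) -> [f64; 4] {
    [
        a[0] + a[1], // dst[63:0]
        b[0] + b[1], // dst[127:64]
        a[2] + a[3], // dst[191:128]
        b[2] + b[3], // dst[255:192]
    ]
}
```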
+ + + + + Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[63:0] := a[63:0] - a[127:64] +dst[127:64] := b[63:0] - b[127:64] +dst[191:128] := a[191:128] - a[255:192] +dst[255:192] := b[191:128] - b[255:192] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[31:0] := a[31:0] - a[63:32] +dst[63:32] := a[95:64] - a[127:96] +dst[95:64] := b[31:0] - b[63:32] +dst[127:96] := b[95:64] - b[127:96] +dst[159:128] := a[159:128] - a[191:160] +dst[191:160] := a[223:192] - a[255:224] +dst[223:192] := b[159:128] - b[191:160] +dst[255:224] := b[223:192] - b[255:224] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] * b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] * b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value. + +IF ((a[255:0] AND b[255:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +IF (((NOT a[255:0]) AND b[255:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +RETURN ZF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value. + +IF ((a[255:0] AND b[255:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +IF (((NOT a[255:0]) AND b[255:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +RETURN CF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. + +IF ((a[255:0] AND b[255:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +IF (((NOT a[255:0]) AND b[255:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +IF (ZF == 0 && CF == 0) + dst := 1 +ELSE + dst := 0 +FI + + + AVX +
immintrin.h
+ Logical +
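The three 256-bit integer tests above share one ZF/CF computation and differ only in which flag (or combination) they return. A scalar sketch with each operand held as four u64 words; the function names are illustrative stand-ins, not the real `_mm256_testz_si256`-family API.

```rust
/// Compute the (ZF, CF) pair from the pseudocode for the 256-bit integer tests.
fn test256_flags(a: [u64; 4], b: [u64; 4]) -> (bool, bool) {
    // ZF: (a AND b) is all-zero; CF: ((NOT a) AND b) is all-zero.
    let zf = a.iter().zip(b.iter()).all(|(x, y)| (x & y) == 0);
    let cf = a.iter().zip(b.iter()).all(|(x, y)| (!x & y) == 0);
    (zf, cf)
}

fn testz(a: [u64; 4], b: [u64; 4]) -> i32 { test256_flags(a, b).0 as i32 }
fn testc(a: [u64; 4], b: [u64; 4]) -> i32 { test256_flags(a, b).1 as i32 }
fn testnzc(a: [u64; 4], b: [u64; 4]) -> i32 {
    let (zf, cf) = test256_flags(a, b);
    (!zf && !cf) as i32
}
```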
+ + + + + Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. + +tmp[255:0] := a[255:0] AND b[255:0] +IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[255:0] := (NOT a[255:0]) AND b[255:0] +IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := ZF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. + +tmp[255:0] := a[255:0] AND b[255:0] +IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[255:0] := (NOT a[255:0]) AND b[255:0] +IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := CF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. + +tmp[255:0] := a[255:0] AND b[255:0] +IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[255:0] := (NOT a[255:0]) AND b[255:0] +IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) + CF := 1 +ELSE + CF := 0 +FI +IF (ZF == 0 && CF == 0) + dst := 1 +ELSE + dst := 0 +FI + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. + +tmp[127:0] := a[127:0] AND b[127:0] +IF (tmp[63] == 0 && tmp[127] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[127:0] := (NOT a[127:0]) AND b[127:0] +IF (tmp[63] == 0 && tmp[127] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := ZF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. + +tmp[127:0] := a[127:0] AND b[127:0] +IF (tmp[63] == 0 && tmp[127] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[127:0] := (NOT a[127:0]) AND b[127:0] +IF (tmp[63] == 0 && tmp[127] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := CF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. + +tmp[127:0] := a[127:0] AND b[127:0] +IF (tmp[63] == 0 && tmp[127] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[127:0] := (NOT a[127:0]) AND b[127:0] +IF (tmp[63] == 0 && tmp[127] == 0) + CF := 1 +ELSE + CF := 0 +FI +IF (ZF == 0 && CF == 0) + dst := 1 +ELSE + dst := 0 +FI + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. + +tmp[255:0] := a[255:0] AND b[255:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ + tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[255:0] := (NOT a[255:0]) AND b[255:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ + tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := ZF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. + +tmp[255:0] := a[255:0] AND b[255:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ + tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[255:0] := (NOT a[255:0]) AND b[255:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ + tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := CF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. + +tmp[255:0] := a[255:0] AND b[255:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ + tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[255:0] := (NOT a[255:0]) AND b[255:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ + tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) + CF := 1 +ELSE + CF := 0 +FI +IF (ZF == 0 && CF == 0) + dst := 1 +ELSE + dst := 0 +FI + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. + +tmp[127:0] := a[127:0] AND b[127:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[127:0] := (NOT a[127:0]) AND b[127:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := ZF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. + +tmp[127:0] := a[127:0] AND b[127:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[127:0] := (NOT a[127:0]) AND b[127:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := CF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. + +tmp[127:0] := a[127:0] AND b[127:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[127:0] := (NOT a[127:0]) AND b[127:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) + CF := 1 +ELSE + CF := 0 +FI +IF (ZF == 0 && CF == 0) + dst := 1 +ELSE + dst := 0 +FI + + + AVX +
immintrin.h
+ Logical +
+ + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF imm8[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF imm8[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF mask[i+63] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF mask[i+31] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
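The two blend forms above differ only in where the per-element selector comes from: the immediate form reads bit j of imm8, while the variable form reads the most significant (sign) bit of mask element j. A scalar sketch of the f64 variants under that reading; names and layout are illustrative.

```rust
/// Immediate-controlled blend: bit j of imm8 selects b[j] over a[j].
fn blend_pd(a: [f64; 4], b: [f64; 4], imm8: u8) -> [f64; 4] {
    core::array::from_fn(|j| if imm8 & (1 << j) != 0 { b[j] } else { a[j] })
}

/// Mask-controlled blend: mask[i+63] in the pseudocode is the sign bit of
/// each 64-bit mask element.
fn blendv_pd(a: [f64; 4], b: [f64; 4], mask: [f64; 4]) -> [f64; 4] {
    core::array::from_fn(|j| if mask[j].to_bits() >> 63 != 0 { b[j] } else { a[j] })
}
```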
+ + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst". + +dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
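A scalar sketch of the SELECT4 helper used by the shuffles and permutes in this group: each two-bit field of the control picks one of the four 32-bit elements of a 128-bit source lane. The lane function mirrors the pseudocode ordering (low two results from "a", high two from "b"); names are illustrative.

```rust
/// SELECT4 from the pseudocode: pick one of four f32 elements with a 2-bit control.
fn select4(src: [f32; 4], control: u8) -> f32 {
    src[(control & 0b11) as usize]
}

/// One 128-bit lane of the shuffle: successive 2-bit imm8 fields steer each slot.
fn shuffle_ps_lane(a: [f32; 4], b: [f32; 4], imm8: u8) -> [f32; 4] {
    [
        select4(a, imm8),      // imm8[1:0]
        select4(a, imm8 >> 2), // imm8[3:2]
        select4(b, imm8 >> 4), // imm8[5:4]
        select4(b, imm8 >> 6), // imm8[7:6]
    ]
}
```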
+ + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Extract a 32-bit integer from "a", selected with "index", and store the result in "dst". + +dst[31:0] := (a[255:0] >> (index[2:0] * 32))[31:0] + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Extract a 64-bit integer from "a", selected with "index", and store the result in "dst". + +dst[63:0] := (a[255:0] >> (index[1:0] * 64))[63:0] + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], b[1:0]) +dst[63:32] := SELECT4(a[127:0], b[33:32]) +dst[95:64] := SELECT4(a[127:0], b[65:64]) +dst[127:96] := SELECT4(a[127:0], b[97:96]) +dst[159:128] := SELECT4(a[255:128], b[129:128]) +dst[191:160] := SELECT4(a[255:128], b[161:160]) +dst[223:192] := SELECT4(a[255:128], b[193:192]) +dst[255:224] := SELECT4(a[255:128], b[225:224]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], b[1:0]) +dst[63:32] := SELECT4(a[127:0], b[33:32]) +dst[95:64] := SELECT4(a[127:0], b[65:64]) +dst[127:96] := SELECT4(a[127:0], b[97:96]) +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". + +IF (b[1] == 0) dst[63:0] := a[63:0]; FI +IF (b[1] == 1) dst[63:0] := a[127:64]; FI +IF (b[65] == 0) dst[127:64] := a[63:0]; FI +IF (b[65] == 1) dst[127:64] := a[127:64]; FI +IF (b[129] == 0) dst[191:128] := a[191:128]; FI +IF (b[129] == 1) dst[191:128] := a[255:192]; FI +IF (b[193] == 0) dst[255:192] := a[191:128]; FI +IF (b[193] == 1) dst[255:192] := a[255:192]; FI +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst". + +IF (b[1] == 0) dst[63:0] := a[63:0]; FI +IF (b[1] == 1) dst[63:0] := a[127:64]; FI +IF (b[65] == 0) dst[127:64] := a[63:0]; FI +IF (b[65] == 1) dst[127:64] := a[127:64]; FI +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) dst[255:192] := a[255:192]; FI +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". + +IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src1, src2, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src1[127:0] + 1: tmp[127:0] := src1[255:128] + 2: tmp[127:0] := src2[127:0] + 3: tmp[127:0] := src2[255:128] + ESAC + IF control[3] + tmp[127:0] := 0 + FI + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) +dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src1, src2, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src1[127:0] + 1: tmp[127:0] := src1[255:128] + 2: tmp[127:0] := src2[127:0] + 3: tmp[127:0] := src2[255:128] + ESAC + IF control[3] + tmp[127:0] := 0 + FI + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) +dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src1, src2, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src1[127:0] + 1: tmp[127:0] := src1[255:128] + 2: tmp[127:0] := src2[127:0] + 3: tmp[127:0] := src2[255:128] + ESAC + IF control[3] + tmp[127:0] := 0 + FI + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) +dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
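The three entries above are the two-source 128-bit lane shuffles; judging from the pseudocode they correspond to the _mm256_permute2f128_ps/_pd/_si256 intrinsics declared in immintrin.h (names assumed, the entries only carry the description). A minimal C sketch of the common "swap the two halves" pattern, compiled with -mavx:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* low 128-bit lane = {0,1,2,3}, high lane = {4,5,6,7} */
    __m256 a = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);

    /* imm8 = 0x01: bits [1:0] pick a's high lane for dst[127:0],
       bits [5:4] pick a's low lane for dst[255:128] -> halves swapped */
    __m256 swapped = _mm256_permute2f128_ps(a, a, 0x01);

    float out[8];
    _mm256_storeu_ps(out, swapped);
    for (int i = 0; i < 8; i++) printf("%.0f ", out[i]);  /* 4 5 6 7 0 1 2 3 */
    printf("\n");
    return 0;
}

Bit 3 of each imm8 nibble zeroes the selected lane instead, exactly as the SELECT4 pseudocode shows.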
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE (imm8[0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE imm8[0] OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 128 bits from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE (imm8[0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
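The three insert entries above (4 floats, 2 doubles, integer data) appear to be the _mm256_insertf128_ps/_pd/_si256 family (names assumed from the pseudocode). A common use is assembling a 256-bit vector from two 128-bit halves; the 128-to-256 cast used below is described further down in this file.

#include <immintrin.h>

/* Combine two __m128i halves into one __m256i: place `lo` in bits 127:0
   (via a cast whose upper bits are undefined) and insert `hi` into
   bits 255:128 with imm8 = 1, per the CASE(imm8[0]) pseudocode above. */
static __m256i combine_epi128(__m128i lo, __m128i hi) {
    __m256i v = _mm256_castsi128_si256(lo);
    return _mm256_insertf128_si256(v, hi, 1);
}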
+ + + + + + Copy "a" to "dst", and insert the 8-bit integer "i" into "dst" at the location specified by "index". + +dst[255:0] := a[255:0] +sel := index[4:0]*8 +dst[sel+7:sel] := i[7:0] + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "index". + +dst[255:0] := a[255:0] +sel := index[3:0]*16 +dst[sel+15:sel] := i[15:0] + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "index". + +dst[255:0] := a[255:0] +sel := index[2:0]*32 +dst[sel+31:sel] := i[31:0] + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "index". + +dst[255:0] := a[255:0] +sel := index[1:0]*64 +dst[sel+63:sel] := i[63:0] + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
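The four unpack entries operate per 128-bit lane, which is the usual surprise with the 256-bit forms: the low lane of the result interleaves the low lanes of "a" and "b", not the lowest elements of a 256-bit stream. A small C sketch, assuming these are the _mm256_unpacklo_ps/_mm256_unpackhi_ps intrinsics:

#include <immintrin.h>

/* Interleave xs and ys into (x0,y0,x1,y1, x4,y4,x5,y5) and
   (x2,y2,x3,y3, x6,y6,x7,y7): note the per-lane grouping. */
static void interleave_xy(__m256 xs, __m256 ys, __m256 *lo, __m256 *hi) {
    *lo = _mm256_unpacklo_ps(xs, ys);  /* low half of each 128-bit lane  */
    *hi = _mm256_unpackhi_ps(xs, ys);  /* high half of each 128-bit lane */
}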
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + + Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst". + [round_note] + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ROUND(a[i+63:i], rounding) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + + Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ROUND(a[i+31:i], rounding) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
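The two round entries take a "rounding" parameter; the [round_note] placeholder refers to the _MM_FROUND_* control constants from immintrin.h. A hedged C sketch, assuming these entries are _mm256_round_pd/_mm256_round_ps:

#include <immintrin.h>

/* Round to nearest (ties to even) without raising the precision (inexact)
   exception; the floor/ceil entries that follow are the fixed-mode variants. */
static __m256 round_nearest(__m256 a) {
    return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}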
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := FLOOR(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := CEIL(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := FLOOR(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CEIL(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Compare +
+ + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +dst[63:0] := ( a[63:0] OP b[63:0] ) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Compare +
+ + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +dst[31:0] := ( a[31:0] OP b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Compare +
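All of the cmp entries share the same 32-predicate table selected by imm8[4:0], and each element of the result is an all-ones or all-zeros mask. A sketch that counts how many elements fall below a threshold, assuming the 256-bit packed single-precision form is _mm256_cmp_ps and using the movemask intrinsic described later in this file:

#include <immintrin.h>

/* Count elements of v strictly less than limit.
   _CMP_LT_OQ is predicate 17 in the table above: ordered, non-signaling "<". */
static int count_below(__m256 v, float limit) {
    __m256 mask = _mm256_cmp_ps(v, _mm256_set1_ps(limit), _CMP_LT_OQ);
    int bits = _mm256_movemask_ps(mask);   /* one bit per 32-bit element      */
    return __builtin_popcount(bits);       /* GCC/Clang builtin, for brevity  */
}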
+ + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + m := j*64 + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 32*j + dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Convert +
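The convert entries mix width-preserving and width-changing forms; the narrowing ones (FP64 to FP32, FP64 to int32) necessarily produce a 128-bit result. A hedged sketch assuming the usual immintrin.h names:

#include <immintrin.h>

/* Narrow four doubles to four floats (a __m128), then truncate the same four
   doubles straight to 32-bit integers, mirroring the two FP64->... entries. */
static void narrow_examples(__m256d d, __m128 *as_f32, __m128i *as_i32) {
    *as_f32 = _mm256_cvtpd_ps(d);      /* rounds per the current MXCSR mode */
    *as_i32 = _mm256_cvttpd_epi32(d);  /* 'tt' form: truncate toward zero   */
}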
+ + + + Copy the lower single-precision (32-bit) floating-point element of "a" to "dst". + +dst[31:0] := a[31:0] + + + AVX +
immintrin.h
+ Convert +
+ + + + Copy the lower double-precision (64-bit) floating-point element of "a" to "dst". + +dst[63:0] := a[63:0] + + + AVX +
immintrin.h
+ Convert +
+ + + + Copy the lower 32-bit integer in "a" to "dst". + +dst[31:0] := a[31:0] + + + AVX +
immintrin.h
+ Convert +
+ + + + Zero the contents of all XMM or YMM registers. + YMM0[MAX:0] := 0 +YMM1[MAX:0] := 0 +YMM2[MAX:0] := 0 +YMM3[MAX:0] := 0 +YMM4[MAX:0] := 0 +YMM5[MAX:0] := 0 +YMM6[MAX:0] := 0 +YMM7[MAX:0] := 0 +IF _64_BIT_MODE + YMM8[MAX:0] := 0 + YMM9[MAX:0] := 0 + YMM10[MAX:0] := 0 + YMM11[MAX:0] := 0 + YMM12[MAX:0] := 0 + YMM13[MAX:0] := 0 + YMM14[MAX:0] := 0 + YMM15[MAX:0] := 0 +FI + + + AVX +
immintrin.h
+ General Support +
+ + + + Zero the upper 128 bits of all YMM registers; the lower 128-bits of the registers are unmodified. + YMM0[MAX:128] := 0 +YMM1[MAX:128] := 0 +YMM2[MAX:128] := 0 +YMM3[MAX:128] := 0 +YMM4[MAX:128] := 0 +YMM5[MAX:128] := 0 +YMM6[MAX:128] := 0 +YMM7[MAX:128] := 0 +IF _64_BIT_MODE + YMM8[MAX:128] := 0 + YMM9[MAX:128] := 0 + YMM10[MAX:128] := 0 + YMM11[MAX:128] := 0 + YMM12[MAX:128] := 0 + YMM13[MAX:128] := 0 + YMM14[MAX:128] := 0 + YMM15[MAX:128] := 0 +FI + + + AVX +
immintrin.h
+ General Support +
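These two entries appear to be _mm256_zeroall and _mm256_zeroupper (vzeroall/vzeroupper; names assumed, only the register pseudocode is shown). The zeroupper form is what compilers typically emit around transitions into legacy (non-VEX) SSE code to avoid AVX/SSE transition penalties; calling it by hand is normally only needed in hand-written kernels. A sketch:

#include <immintrin.h>

void hand_written_avx_kernel(float *dst, const float *src) {
    __m256 v = _mm256_loadu_ps(src);
    _mm256_storeu_ps(dst, _mm256_add_ps(v, v));
    /* Clear the upper halves of the YMM registers before returning into
       code that may run legacy SSE instructions. */
    _mm256_zeroupper();
}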
+ + + + Return vector of type __m256 with undefined elements. + AVX +
immintrin.h
+ General Support +
+ + + + Return vector of type __m256d with undefined elements. + AVX +
immintrin.h
+ General Support +
+ + + + Return vector of type __m256i with undefined elements. + AVX +
immintrin.h
+ General Support +
+ + + + Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst". + +tmp[31:0] := MEM[mem_addr+31:mem_addr] +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := tmp[31:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + Swizzle + + + Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst". + +tmp[31:0] := MEM[mem_addr+31:mem_addr] +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := tmp[31:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Load +
+ + Swizzle + + + Broadcast a double-precision (64-bit) floating-point element from memory to all elements of "dst". + +tmp[63:0] := MEM[mem_addr+63:mem_addr] +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := tmp[63:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + Swizzle + + + Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of "dst". + +tmp[127:0] := MEM[mem_addr+127:mem_addr] +dst[127:0] := tmp[127:0] +dst[255:128] := tmp[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + Swizzle + + + Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of "dst". + +tmp[127:0] := MEM[mem_addr+127:mem_addr] +dst[127:0] := tmp[127:0] +dst[255:128] := tmp[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
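The broadcast load entries splat either one scalar or one 128-bit block from memory into every position of "dst". A sketch assuming the scalar single-precision form is _mm256_broadcast_ss (vbroadcastss):

#include <immintrin.h>

/* y[i] += coeff * x[i] for 8 floats, broadcasting the scalar coefficient
   from memory into every element once. */
static void axpy8(float *y, const float *x, const float *coeff) {
    __m256 c  = _mm256_broadcast_ss(coeff);
    __m256 xv = _mm256_loadu_ps(x);
    __m256 yv = _mm256_loadu_ps(y);
    _mm256_storeu_ps(y, _mm256_add_ps(yv, _mm256_mul_ps(c, xv)));
}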
+ + + + Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + Load 256-bits of integer data from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + Load 256-bits of integer data from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). + +FOR j := 0 to 3 + i := j*64 + IF mask[i+63] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). + +FOR j := 0 to 1 + i := j*64 + IF mask[i+63] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). + +FOR j := 0 to 7 + i := j*32 + IF mask[i+31] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). + +FOR j := 0 to 3 + i := j*32 + IF mask[i+31] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Load +
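The four maskload entries zero any element whose mask high bit is clear, so the masked-off positions are zeroed rather than loaded; this makes them a common way to read a partial tail of an array. A hedged sketch, assuming the 256-bit single-precision form is _mm256_maskload_ps and building the mask with _mm256_setr_epi32 (described in the Set entries below):

#include <immintrin.h>

/* Load the first n floats of p (0 <= n <= 8); lanes >= n come back as 0.0f.
   Only the high bit of each mask element matters, so -1 means "load". */
static __m256 load_tail(const float *p, int n) {
    __m256i mask = _mm256_setr_epi32(
        0 < n ? -1 : 0, 1 < n ? -1 : 0, 2 < n ? -1 : 0, 3 < n ? -1 : 0,
        4 < n ? -1 : 0, 5 < n ? -1 : 0, 6 < n ? -1 : 0, 7 < n ? -1 : 0);
    return _mm256_maskload_ps(p, mask);
}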
+ + + + Load 256-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm256_loadu_si256" when the data crosses a cache line boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + + Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst". + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + +dst[127:0] := MEM[loaddr+127:loaddr] +dst[255:128] := MEM[hiaddr+127:hiaddr] +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Load +
+ + + + + Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst". + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + +dst[127:0] := MEM[loaddr+127:loaddr] +dst[255:128] := MEM[hiaddr+127:hiaddr] +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Load +
+ + + + + Load two 128-bit values (composed of integer data) from memory, and combine them into a 256-bit value in "dst". + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + +dst[127:0] := MEM[loaddr+127:loaddr] +dst[255:128] := MEM[hiaddr+127:hiaddr] +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Load +
+ + + + + Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits of integer data from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits of integer data from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask". + +FOR j := 0 to 3 + i := j*64 + IF mask[i+63] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask". + +FOR j := 0 to 1 + i := j*64 + IF mask[i+63] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask". + +FOR j := 0 to 7 + i := j*32 + IF mask[i+31] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask". + +FOR j := 0 to 3 + i := j*32 + IF mask[i+31] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits of integer data from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + + Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory at two different 128-bit locations. + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + +MEM[loaddr+127:loaddr] := a[127:0] +MEM[hiaddr+127:hiaddr] := a[255:128] + + AVX +
immintrin.h
+ Store +
+ + + + + + Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory at two different 128-bit locations. + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + +MEM[loaddr+127:loaddr] := a[127:0] +MEM[hiaddr+127:hiaddr] := a[255:128] + + AVX +
immintrin.h
+ Store +
+ + + + + + Store the high and low 128-bit halves (each composed of integer data) from "a" into memory at two different 128-bit locations. + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + +MEM[loaddr+127:loaddr] := a[127:0] +MEM[hiaddr+127:hiaddr] := a[255:128] + + AVX +
immintrin.h
+ Store +
+ + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". + +dst[31:0] := a[63:32] +dst[63:32] := a[63:32] +dst[95:64] := a[127:96] +dst[127:96] := a[127:96] +dst[159:128] := a[191:160] +dst[191:160] := a[191:160] +dst[223:192] := a[255:224] +dst[255:224] := a[255:224] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Move +
+ + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". + +dst[31:0] := a[31:0] +dst[63:32] := a[31:0] +dst[95:64] := a[95:64] +dst[127:96] := a[95:64] +dst[159:128] := a[159:128] +dst[191:160] := a[159:128] +dst[223:192] := a[223:192] +dst[255:224] := a[223:192] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Move +
+ + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst". + +dst[63:0] := a[63:0] +dst[127:64] := a[63:0] +dst[191:128] := a[191:128] +dst[255:192] := a[191:128] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Move +
+ + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := 1.0 / a[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a". + +FOR j := 0 to 3 + i := j*64 + IF a[i+63] + dst[j] := 1 + ELSE + dst[j] := 0 + FI +ENDFOR +dst[MAX:4] := 0 + + + AVX +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a". + +FOR j := 0 to 7 + i := j*32 + IF a[i+31] + dst[j] := 1 + ELSE + dst[j] := 0 + FI +ENDFOR +dst[MAX:8] := 0 + + + AVX +
immintrin.h
+ Miscellaneous +
+ + + + Return vector of type __m256d with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + Return vector of type __m256 with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + Return vector of type __m256i with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 +dst[191:128] := e2 +dst[255:192] := e3 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 +dst[159:128] := e4 +dst[191:160] := e5 +dst[223:192] := e6 +dst[255:224] := e7 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values. + +dst[7:0] := e0 +dst[15:8] := e1 +dst[23:16] := e2 +dst[31:24] := e3 +dst[39:32] := e4 +dst[47:40] := e5 +dst[55:48] := e6 +dst[63:56] := e7 +dst[71:64] := e8 +dst[79:72] := e9 +dst[87:80] := e10 +dst[95:88] := e11 +dst[103:96] := e12 +dst[111:104] := e13 +dst[119:112] := e14 +dst[127:120] := e15 +dst[135:128] := e16 +dst[143:136] := e17 +dst[151:144] := e18 +dst[159:152] := e19 +dst[167:160] := e20 +dst[175:168] := e21 +dst[183:176] := e22 +dst[191:184] := e23 +dst[199:192] := e24 +dst[207:200] := e25 +dst[215:208] := e26 +dst[223:216] := e27 +dst[231:224] := e28 +dst[239:232] := e29 +dst[247:240] := e30 +dst[255:248] := e31 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed 16-bit integers in "dst" with the supplied values. + +dst[15:0] := e0 +dst[31:16] := e1 +dst[47:32] := e2 +dst[63:48] := e3 +dst[79:64] := e4 +dst[95:80] := e5 +dst[111:96] := e6 +dst[127:112] := e7 +dst[143:128] := e8 +dst[159:144] := e9 +dst[175:160] := e10 +dst[191:176] := e11 +dst[207:192] := e12 +dst[223:208] := e13 +dst[239:224] := e14 +dst[255:240] := e15 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed 32-bit integers in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 +dst[159:128] := e4 +dst[191:160] := e5 +dst[223:192] := e6 +dst[255:224] := e7 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + Set packed 64-bit integers in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 +dst[191:128] := e2 +dst[255:192] := e3 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst[63:0] := e3 +dst[127:64] := e2 +dst[191:128] := e1 +dst[255:192] := e0 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst[31:0] := e7 +dst[63:32] := e6 +dst[95:64] := e5 +dst[127:96] := e4 +dst[159:128] := e3 +dst[191:160] := e2 +dst[223:192] := e1 +dst[255:224] := e0 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
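The set/setr pairs differ only in argument order: the set_* forms take the highest element first (e7 down to e0 for 32-bit elements), while the setr_* forms take them in memory order. A small sketch, assuming the names _mm256_set_ps and _mm256_setr_ps, showing that the two calls build the same vector:

#include <immintrin.h>
#include <assert.h>

int main(void) {
    __m256 a = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);  /* e7..e0: 0 is the lowest element */
    __m256 b = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7); /* e0..e7: memory order            */
    float fa[8], fb[8];
    _mm256_storeu_ps(fa, a);
    _mm256_storeu_ps(fb, b);
    for (int i = 0; i < 8; i++) assert(fa[i] == fb[i] && fa[i] == (float)i);
    return 0;
}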
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values in reverse order. + +dst[7:0] := e31 +dst[15:8] := e30 +dst[23:16] := e29 +dst[31:24] := e28 +dst[39:32] := e27 +dst[47:40] := e26 +dst[55:48] := e25 +dst[63:56] := e24 +dst[71:64] := e23 +dst[79:72] := e22 +dst[87:80] := e21 +dst[95:88] := e20 +dst[103:96] := e19 +dst[111:104] := e18 +dst[119:112] := e17 +dst[127:120] := e16 +dst[135:128] := e15 +dst[143:136] := e14 +dst[151:144] := e13 +dst[159:152] := e12 +dst[167:160] := e11 +dst[175:168] := e10 +dst[183:176] := e9 +dst[191:184] := e8 +dst[199:192] := e7 +dst[207:200] := e6 +dst[215:208] := e5 +dst[223:216] := e4 +dst[231:224] := e3 +dst[239:232] := e2 +dst[247:240] := e1 +dst[255:248] := e0 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed 16-bit integers in "dst" with the supplied values in reverse order. + +dst[15:0] := e15 +dst[31:16] := e14 +dst[47:32] := e13 +dst[63:48] := e12 +dst[79:64] := e11 +dst[95:80] := e10 +dst[111:96] := e9 +dst[127:112] := e8 +dst[143:128] := e7 +dst[159:144] := e6 +dst[175:160] := e5 +dst[191:176] := e4 +dst[207:192] := e3 +dst[223:208] := e2 +dst[239:224] := e1 +dst[255:240] := e0 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed 32-bit integers in "dst" with the supplied values in reverse order. + +dst[31:0] := e7 +dst[63:32] := e6 +dst[95:64] := e5 +dst[127:96] := e4 +dst[159:128] := e3 +dst[191:160] := e2 +dst[223:192] := e1 +dst[255:224] := e0 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + Set packed 64-bit integers in "dst" with the supplied values in reverse order. + +dst[63:0] := e3 +dst[127:64] := e2 +dst[191:128] := e1 +dst[255:192] := e0 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastb". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + Broadcast 16-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastw". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastd". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastq". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + Set packed __m256 vector "dst" with the supplied values. + +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + + Set packed __m256d vector "dst" with the supplied values. + +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + + Set packed __m256i vector "dst" with the supplied values. + +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + + Set packed __m256 vector "dst" with the supplied values. + +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + + Set packed __m256d vector "dst" with the supplied values. + +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + + Set packed __m256i vector "dst" with the supplied values. + +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + Cast vector of type __m256d to type __m256. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256 to type __m256d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256 to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256d to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256i to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256i to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m256; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128d to type __m256d; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m256i; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m256; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128d to type __m256d; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m256i; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
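The cast entries are free re-interpretations; the 128-to-256 casts leave the upper half undefined, while the zext variants guarantee zeros there. When the upper half will be read later, the zext form (or an explicit insert) is the safe choice. A sketch with assumed names; the _mm256_zext* spellings may require a reasonably recent compiler and header:

#include <immintrin.h>

static __m256 widen_undef(__m128 x)  { return _mm256_castps128_ps256(x); } /* upper 128 bits undefined   */
static __m256 widen_zeroed(__m128 x) { return _mm256_zextps128_ps256(x); } /* upper 128 bits forced to 0 */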
+ + + + + + + Extract an 8-bit integer from "a", selected with "index", and store the result in "dst". + +dst[7:0] := (a[255:0] >> (index[4:0] * 8))[7:0] + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Extract a 16-bit integer from "a", selected with "index", and store the result in "dst". + +dst[15:0] := (a[255:0] >> (index[3:0] * 16))[15:0] + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed 16-bit integers from "a" and "b" within 128-bit lanes using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF imm8[j%8] + dst[i+15:i] := b[i+15:i] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF imm8[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF imm8[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + IF mask[i+7] + dst[i+7:i] := b[i+7:i] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
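The AVX2 blend entries split into immediate-controlled forms (the selection is fixed per element position at compile time) and the byte-granular blendv form driven by the high bit of each mask byte. A sketch assuming the variable form is _mm256_blendv_epi8, compiled with -mavx2:

#include <immintrin.h>

/* Take the low byte of every 16-bit pair from a and the high byte from b:
   0xFF00 sets the high bit in every odd byte, and blendv selects b there. */
static __m256i mix_bytes(__m256i a, __m256i b) {
    const __m256i mask = _mm256_set1_epi16((short)0xFF00);
    return _mm256_blendv_epi8(a, b, mask);
}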
+ + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast 128 bits of integer data from "a" to all 128-bit lanes in "dst". + +dst[127:0] := a[127:0] +dst[255:128] := a[127:0] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast 128 bits of integer data from "a" to all 128-bit lanes in "dst". + +dst[127:0] := a[127:0] +dst[255:128] := a[127:0] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of integer data) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE (imm8[0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src1, src2, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src1[127:0] + 1: tmp[127:0] := src1[255:128] + 2: tmp[127:0] := src2[127:0] + 3: tmp[127:0] := src2[255:128] + ESAC + IF control[3] + tmp[127:0] := 0 + FI + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) +dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + dst[i+31:i] := a[id+31:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx". + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + dst[i+31:i] := a[id+31:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 8-bit integers in "a" within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[3:0] := b[i+3:i] + dst[i+7:i] := a[index*8+7:index*8] + FI + IF b[128+i+7] == 1 + dst[128+i+7:128+i] := 0 + ELSE + index[3:0] := b[128+i+3:128+i] + dst[128+i+7:128+i] := a[128+index*8+7:128+index*8] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
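The byte-shuffle entry above (assumed to be _mm256_shuffle_epi8, i.e. vpshufb) is the workhorse byte permute, but note from the pseudocode that each 128-bit lane only indexes into its own lane, and a set bit 7 in a control byte zeroes the destination byte. A classic use is a 32-bit byte swap:

#include <immintrin.h>

/* Reverse the byte order of each 32-bit element (endianness swap).
   The 16-byte pattern is repeated for both 128-bit lanes because the
   shuffle cannot move bytes across lanes. */
static __m256i bswap32_avx2(__m256i v) {
    const __m256i ctrl = _mm256_setr_epi8(
        3, 2, 1, 0,  7, 6, 5, 4,  11, 10, 9, 8,  15, 14, 13, 12,
        3, 2, 1, 0,  7, 6, 5, 4,  11, 10, 9, 8,  15, 14, 13, 12);
    return _mm256_shuffle_epi8(v, ctrl);
}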
+ + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst". + +dst[63:0] := a[63:0] +dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +dst[191:128] := a[191:128] +dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst". + +dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +dst[127:64] := a[127:64] +dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +dst[255:192] := a[255:192] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
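The eight unpack entries above give only Intel's lane-local pseudocode; as a quick orientation for reviewers, here is a minimal Rust sketch of the same interleave behaviour, assuming these rows correspond to the usual `core::arch::x86_64` bindings `_mm256_unpacklo_epi8` and `_mm256_unpackhi_epi8` (the intrinsic names themselves are not visible in this excerpt of the data file).

```rust
use std::arch::x86_64::*;
use std::mem::transmute;

#[target_feature(enable = "avx2")]
unsafe fn unpack_demo() {
    let a: [i8; 32] = core::array::from_fn(|i| i as i8);        // 0..=31
    let b: [i8; 32] = core::array::from_fn(|i| (64 + i) as i8); // 64..=95
    let va = _mm256_loadu_si256(a.as_ptr() as *const __m256i);
    let vb = _mm256_loadu_si256(b.as_ptr() as *const __m256i);
    // Low half of each 128-bit lane is interleaved: 0,64,1,65,... and 16,80,17,81,...
    let lo: [i8; 32] = transmute(_mm256_unpacklo_epi8(va, vb));
    // High half of each 128-bit lane: 8,72,9,73,... and 24,88,25,89,...
    let hi: [i8; 32] = transmute(_mm256_unpackhi_epi8(va, vb));
    assert_eq!(&lo[..4], &[0, 64, 1, 65]);
    assert_eq!(&lo[16..20], &[16, 80, 17, 81]);
    assert_eq!(&hi[..4], &[8, 72, 9, 73]);
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        unsafe { unpack_demo() }
    }
}
```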
+ + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := ABS(a[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ABS(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ABS(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
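The abs/max/min rows above compose naturally into a per-lane clamp. A short sketch, under the assumption that they map to `_mm256_abs_epi32`, `_mm256_max_epi32` and `_mm256_min_epi32` in `core::arch::x86_64`:

```rust
use std::arch::x86_64::*;
use std::mem::transmute;

#[target_feature(enable = "avx2")]
unsafe fn clamp_demo() {
    let x = _mm256_setr_epi32(-5, 7, -300, 42, 0, -1, 250, -250);
    // Absolute value of each signed 32-bit lane.
    let abs: [i32; 8] = transmute(_mm256_abs_epi32(x));
    assert_eq!(abs, [5, 7, 300, 42, 0, 1, 250, 250]);
    // Clamp each lane to [-100, 100] with a max/min pair.
    let clamped = _mm256_min_epi32(
        _mm256_max_epi32(x, _mm256_set1_epi32(-100)),
        _mm256_set1_epi32(100),
    );
    let clamped: [i32; 8] = transmute(clamped);
    assert_eq!(clamped, [-5, 7, -100, 42, 0, -1, 100, -100]);
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        unsafe { clamp_demo() }
    }
}
```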
+ + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := a[i+7:i] + b[i+7:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := a[i+15:i] + b[i+15:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
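The difference between the wrapping adds and the saturating adds described above is easiest to see on concrete lanes. A small sketch, assuming the `_mm256_add_epi8`, `_mm256_adds_epi8` and `_mm256_adds_epu8` bindings:

```rust
use std::arch::x86_64::*;
use std::mem::transmute;

#[target_feature(enable = "avx2")]
unsafe fn add_demo() {
    let a = _mm256_set1_epi8(100);
    let b = _mm256_set1_epi8(50);
    // Wrapping add: 100 + 50 = 150 wraps to -106 in each signed 8-bit lane.
    let wrapped: [i8; 32] = transmute(_mm256_add_epi8(a, b));
    assert_eq!(wrapped[0], -106);
    // Signed saturating add clamps to i8::MAX instead.
    let sat: [i8; 32] = transmute(_mm256_adds_epi8(a, b));
    assert_eq!(sat[0], 127);
    // Unsigned saturating add treats the same bits as u8 and clamps to 255.
    let usat: [u8; 32] = transmute(_mm256_adds_epu8(_mm256_set1_epi8(-1), b));
    assert_eq!(usat[0], 255);
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        unsafe { add_demo() }
    }
}
```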
+ + + + + Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". + +dst[15:0] := a[31:16] + a[15:0] +dst[31:16] := a[63:48] + a[47:32] +dst[47:32] := a[95:80] + a[79:64] +dst[63:48] := a[127:112] + a[111:96] +dst[79:64] := b[31:16] + b[15:0] +dst[95:80] := b[63:48] + b[47:32] +dst[111:96] := b[95:80] + b[79:64] +dst[127:112] := b[127:112] + b[111:96] +dst[143:128] := a[159:144] + a[143:128] +dst[159:144] := a[191:176] + a[175:160] +dst[175:160] := a[223:208] + a[207:192] +dst[191:176] := a[255:240] + a[239:224] +dst[207:192] := b[159:144] + b[143:128] +dst[223:208] := b[191:176] + b[175:160] +dst[239:224] := b[223:208] + b[207:192] +dst[255:240] := b[255:240] + b[239:224] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". + +dst[31:0] := a[63:32] + a[31:0] +dst[63:32] := a[127:96] + a[95:64] +dst[95:64] := b[63:32] + b[31:0] +dst[127:96] := b[127:96] + b[95:64] +dst[159:128] := a[191:160] + a[159:128] +dst[191:160] := a[255:224] + a[223:192] +dst[223:192] := b[191:160] + b[159:128] +dst[255:224] := b[255:224] + b[223:192] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". + +dst[15:0] := Saturate16(a[31:16] + a[15:0]) +dst[31:16] := Saturate16(a[63:48] + a[47:32]) +dst[47:32] := Saturate16(a[95:80] + a[79:64]) +dst[63:48] := Saturate16(a[127:112] + a[111:96]) +dst[79:64] := Saturate16(b[31:16] + b[15:0]) +dst[95:80] := Saturate16(b[63:48] + b[47:32]) +dst[111:96] := Saturate16(b[95:80] + b[79:64]) +dst[127:112] := Saturate16(b[127:112] + b[111:96]) +dst[143:128] := Saturate16(a[159:144] + a[143:128]) +dst[159:144] := Saturate16(a[191:176] + a[175:160]) +dst[175:160] := Saturate16(a[223:208] + a[207:192]) +dst[191:176] := Saturate16(a[255:240] + a[239:224]) +dst[207:192] := Saturate16(b[159:144] + b[143:128]) +dst[223:208] := Saturate16(b[191:176] + b[175:160]) +dst[239:224] := Saturate16(b[223:208] + b[207:192]) +dst[255:240] := Saturate16(b[255:240] + b[239:224]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". + +dst[15:0] := a[15:0] - a[31:16] +dst[31:16] := a[47:32] - a[63:48] +dst[47:32] := a[79:64] - a[95:80] +dst[63:48] := a[111:96] - a[127:112] +dst[79:64] := b[15:0] - b[31:16] +dst[95:80] := b[47:32] - b[63:48] +dst[111:96] := b[79:64] - b[95:80] +dst[127:112] := b[111:96] - b[127:112] +dst[143:128] := a[143:128] - a[159:144] +dst[159:144] := a[175:160] - a[191:176] +dst[175:160] := a[207:192] - a[223:208] +dst[191:176] := a[239:224] - a[255:240] +dst[207:192] := b[143:128] - b[159:144] +dst[223:208] := b[175:160] - b[191:176] +dst[239:224] := b[207:192] - b[223:208] +dst[255:240] := b[239:224] - b[255:240] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". + +dst[31:0] := a[31:0] - a[63:32] +dst[63:32] := a[95:64] - a[127:96] +dst[95:64] := b[31:0] - b[63:32] +dst[127:96] := b[95:64] - b[127:96] +dst[159:128] := a[159:128] - a[191:160] +dst[191:160] := a[223:192] - a[255:224] +dst[223:192] := b[159:128] - b[191:160] +dst[255:224] := b[223:192] - b[255:224] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". + +dst[15:0] := Saturate16(a[15:0] - a[31:16]) +dst[31:16] := Saturate16(a[47:32] - a[63:48]) +dst[47:32] := Saturate16(a[79:64] - a[95:80]) +dst[63:48] := Saturate16(a[111:96] - a[127:112]) +dst[79:64] := Saturate16(b[15:0] - b[31:16]) +dst[95:80] := Saturate16(b[47:32] - b[63:48]) +dst[111:96] := Saturate16(b[79:64] - b[95:80]) +dst[127:112] := Saturate16(b[111:96] - b[127:112]) +dst[143:128] := Saturate16(a[143:128] - a[159:144]) +dst[159:144] := Saturate16(a[175:160] - a[191:176]) +dst[175:160] := Saturate16(a[207:192] - a[223:208]) +dst[191:176] := Saturate16(a[239:224] - a[255:240]) +dst[207:192] := Saturate16(b[143:128] - b[159:144]) +dst[223:208] := Saturate16(b[175:160] - b[191:176]) +dst[239:224] := Saturate16(b[207:192] - b[223:208]) +dst[255:240] := Saturate16(b[239:224] - b[255:240]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
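The multiply-and-horizontally-add rows above are the usual building blocks of integer dot products. A hedged sketch combining what are assumed to be the `_mm256_madd_epi16` and `_mm256_hadd_epi32` bindings:

```rust
use std::arch::x86_64::*;
use std::mem::transmute;

#[target_feature(enable = "avx2")]
unsafe fn dot_demo() {
    // Multiply 16-bit lanes pairwise and sum each adjacent pair into a 32-bit lane.
    let a = _mm256_set1_epi16(3);
    let b = _mm256_set1_epi16(1000);
    let pairs: [i32; 8] = transmute(_mm256_madd_epi16(a, b));
    // Each 32-bit lane holds 3*1000 + 3*1000 = 6000.
    assert_eq!(pairs, [6000; 8]);
    // Horizontally adding the 32-bit partial sums narrows toward a scalar result.
    let summed: [i32; 8] = transmute(_mm256_hadd_epi32(
        _mm256_madd_epi16(a, b),
        _mm256_setzero_si256(),
    ));
    assert_eq!(summed[0], 12000);
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        unsafe { dot_demo() }
    }
}
```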
+ + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+31:i] * b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". + +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". + +FOR j := 0 to 7 + i := j*32 + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". + +FOR j := 0 to 31 + i := j*8 + tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +ENDFOR +FOR j := 0 to 3 + i := j*64 + dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \ + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] + dst[i+63:i+16] := 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
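The SAD row above is commonly used to sum absolute byte differences across a whole 32-byte block. A minimal sketch, assuming the entry is `_mm256_sad_epu8`:

```rust
use std::arch::x86_64::*;
use std::mem::transmute;

#[target_feature(enable = "avx2")]
unsafe fn sad_demo(x: &[u8; 32], y: &[u8; 32]) -> u64 {
    let vx = _mm256_loadu_si256(x.as_ptr() as *const __m256i);
    let vy = _mm256_loadu_si256(y.as_ptr() as *const __m256i);
    // Four 16-bit sums of absolute differences, one per 8-byte group,
    // each landing in the low bits of a 64-bit lane.
    let sads: [u64; 4] = transmute(_mm256_sad_epu8(vx, vy));
    sads.iter().sum()
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        let x = [10u8; 32];
        let y = [13u8; 32];
        // |10 - 13| summed over 32 bytes = 96.
        assert_eq!(unsafe { sad_demo(&x, &y) }, 96);
    }
}
```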
+ + + + + Negate packed signed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 31 + i := j*8 + IF b[i+7:i] < 0 + dst[i+7:i] := -(a[i+7:i]) + ELSE IF b[i+7:i] == 0 + dst[i+7:i] := 0 + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Negate packed signed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 15 + i := j*16 + IF b[i+15:i] < 0 + dst[i+15:i] := -(a[i+15:i]) + ELSE IF b[i+15:i] == 0 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Negate packed signed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 7 + i := j*32 + IF b[i+31:i] < 0 + dst[i+31:i] := -(a[i+31:i]) + ELSE IF b[i+31:i] == 0 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := a[i+7:i] - b[i+7:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := a[i+15:i] - b[i+15:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". + +FOR j := 0 to 1 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + dst[i+127:i] := tmp[127:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Miscellaneous +
+ + + + Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[j] := a[i+7] +ENDFOR + + + AVX2 +
immintrin.h
+ Miscellaneous +
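The movemask entry above pairs naturally with a byte-equality compare to scan 32 bytes at a time. A sketch assuming the `_mm256_movemask_epi8` and `_mm256_cmpeq_epi8` bindings (the compare entries appear further down in this file):

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx2")]
unsafe fn find_byte(haystack: &[u8; 32], needle: u8) -> Option<usize> {
    let chunk = _mm256_loadu_si256(haystack.as_ptr() as *const __m256i);
    // Compare all 32 bytes at once; equal lanes become 0xFF, others 0x00.
    let eq = _mm256_cmpeq_epi8(chunk, _mm256_set1_epi8(needle as i8));
    // Collapse the 32 lane sign bits into one 32-bit mask.
    let mask = _mm256_movemask_epi8(eq) as u32;
    if mask == 0 { None } else { Some(mask.trailing_zeros() as usize) }
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        let mut buf = [b'a'; 32];
        buf[17] = b'%';
        assert_eq!(unsafe { find_byte(&buf, b'%') }, Some(17));
    }
}
```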
+ + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". + Eight SADs are performed for each 128-bit lane using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8". + +DEFINE MPSADBW(a[127:0], b[127:0], imm8[2:0]) { + a_offset := imm8[2]*32 + b_offset := imm8[1:0]*32 + FOR j := 0 to 7 + i := j*8 + k := a_offset+i + l := b_offset + tmp[i*2+15:i*2] := ABS(Signed(a[k+7:k] - b[l+7:l])) + ABS(Signed(a[k+15:k+8] - b[l+15:l+8])) + \ + ABS(Signed(a[k+23:k+16] - b[l+23:l+16])) + ABS(Signed(a[k+31:k+24] - b[l+31:l+24])) + ENDFOR + RETURN tmp[127:0] +} +dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0]) +dst[255:128] := MPSADBW(a[255:128], b[255:128], imm8[5:3]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". + +dst[7:0] := Saturate8(a[15:0]) +dst[15:8] := Saturate8(a[31:16]) +dst[23:16] := Saturate8(a[47:32]) +dst[31:24] := Saturate8(a[63:48]) +dst[39:32] := Saturate8(a[79:64]) +dst[47:40] := Saturate8(a[95:80]) +dst[55:48] := Saturate8(a[111:96]) +dst[63:56] := Saturate8(a[127:112]) +dst[71:64] := Saturate8(b[15:0]) +dst[79:72] := Saturate8(b[31:16]) +dst[87:80] := Saturate8(b[47:32]) +dst[95:88] := Saturate8(b[63:48]) +dst[103:96] := Saturate8(b[79:64]) +dst[111:104] := Saturate8(b[95:80]) +dst[119:112] := Saturate8(b[111:96]) +dst[127:120] := Saturate8(b[127:112]) +dst[135:128] := Saturate8(a[143:128]) +dst[143:136] := Saturate8(a[159:144]) +dst[151:144] := Saturate8(a[175:160]) +dst[159:152] := Saturate8(a[191:176]) +dst[167:160] := Saturate8(a[207:192]) +dst[175:168] := Saturate8(a[223:208]) +dst[183:176] := Saturate8(a[239:224]) +dst[191:184] := Saturate8(a[255:240]) +dst[199:192] := Saturate8(b[143:128]) +dst[207:200] := Saturate8(b[159:144]) +dst[215:208] := Saturate8(b[175:160]) +dst[223:216] := Saturate8(b[191:176]) +dst[231:224] := Saturate8(b[207:192]) +dst[239:232] := Saturate8(b[223:208]) +dst[247:240] := Saturate8(b[239:224]) +dst[255:248] := Saturate8(b[255:240]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". + +dst[15:0] := Saturate16(a[31:0]) +dst[31:16] := Saturate16(a[63:32]) +dst[47:32] := Saturate16(a[95:64]) +dst[63:48] := Saturate16(a[127:96]) +dst[79:64] := Saturate16(b[31:0]) +dst[95:80] := Saturate16(b[63:32]) +dst[111:96] := Saturate16(b[95:64]) +dst[127:112] := Saturate16(b[127:96]) +dst[143:128] := Saturate16(a[159:128]) +dst[159:144] := Saturate16(a[191:160]) +dst[175:160] := Saturate16(a[223:192]) +dst[191:176] := Saturate16(a[255:224]) +dst[207:192] := Saturate16(b[159:128]) +dst[223:208] := Saturate16(b[191:160]) +dst[239:224] := Saturate16(b[223:192]) +dst[255:240] := Saturate16(b[255:224]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". + +dst[7:0] := SaturateU8(a[15:0]) +dst[15:8] := SaturateU8(a[31:16]) +dst[23:16] := SaturateU8(a[47:32]) +dst[31:24] := SaturateU8(a[63:48]) +dst[39:32] := SaturateU8(a[79:64]) +dst[47:40] := SaturateU8(a[95:80]) +dst[55:48] := SaturateU8(a[111:96]) +dst[63:56] := SaturateU8(a[127:112]) +dst[71:64] := SaturateU8(b[15:0]) +dst[79:72] := SaturateU8(b[31:16]) +dst[87:80] := SaturateU8(b[47:32]) +dst[95:88] := SaturateU8(b[63:48]) +dst[103:96] := SaturateU8(b[79:64]) +dst[111:104] := SaturateU8(b[95:80]) +dst[119:112] := SaturateU8(b[111:96]) +dst[127:120] := SaturateU8(b[127:112]) +dst[135:128] := SaturateU8(a[143:128]) +dst[143:136] := SaturateU8(a[159:144]) +dst[151:144] := SaturateU8(a[175:160]) +dst[159:152] := SaturateU8(a[191:176]) +dst[167:160] := SaturateU8(a[207:192]) +dst[175:168] := SaturateU8(a[223:208]) +dst[183:176] := SaturateU8(a[239:224]) +dst[191:184] := SaturateU8(a[255:240]) +dst[199:192] := SaturateU8(b[143:128]) +dst[207:200] := SaturateU8(b[159:144]) +dst[215:208] := SaturateU8(b[175:160]) +dst[223:216] := SaturateU8(b[191:176]) +dst[231:224] := SaturateU8(b[207:192]) +dst[239:232] := SaturateU8(b[223:208]) +dst[247:240] := SaturateU8(b[239:224]) +dst[255:248] := SaturateU8(b[255:240]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". + +dst[15:0] := SaturateU16(a[31:0]) +dst[31:16] := SaturateU16(a[63:32]) +dst[47:32] := SaturateU16(a[95:64]) +dst[63:48] := SaturateU16(a[127:96]) +dst[79:64] := SaturateU16(b[31:0]) +dst[95:80] := SaturateU16(b[63:32]) +dst[111:96] := SaturateU16(b[95:64]) +dst[127:112] := SaturateU16(b[127:96]) +dst[143:128] := SaturateU16(a[159:128]) +dst[159:144] := SaturateU16(a[191:160]) +dst[175:160] := SaturateU16(a[223:192]) +dst[191:176] := SaturateU16(a[255:224]) +dst[207:192] := SaturateU16(b[159:128]) +dst[223:208] := SaturateU16(b[191:160]) +dst[239:224] := SaturateU16(b[223:192]) +dst[255:240] := SaturateU16(b[255:224]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Miscellaneous +
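The pack rows above both saturate and interleave 128-bit lanes, which is easy to misread from the bit ranges alone. A sketch assuming the 32-to-16-bit signed variant is `_mm256_packs_epi32`:

```rust
use std::arch::x86_64::*;
use std::mem::transmute;

#[target_feature(enable = "avx2")]
unsafe fn narrow_demo() {
    let a = _mm256_setr_epi32(1, 40000, -7, -40000, 2, 3, 4, 5);
    let b = _mm256_set1_epi32(70000);
    // Signed saturation to 16 bits; note that the result interleaves 128-bit
    // lanes: [a lane 0, b lane 0, a lane 1, b lane 1].
    let packed: [i16; 16] = transmute(_mm256_packs_epi32(a, b));
    assert_eq!(&packed[..4], &[1, 32767, -7, -32768]);
    assert_eq!(packed[4], 32767);          // 70000 clamps to i16::MAX
    assert_eq!(&packed[8..12], &[2, 3, 4, 5]); // a's upper 128-bit lane
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        unsafe { narrow_demo() }
    }
}
```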
+ + + + + Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[255:0] := (a[255:0] AND b[255:0]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 256 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". + +dst[255:0] := ((NOT a[255:0]) AND b[255:0]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[255:0] := (a[255:0] OR b[255:0]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[255:0] := (a[255:0] XOR b[255:0]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Logical +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Probability/Statistics +
+ + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Probability/Statistics +
+ + + + + Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
+ + + + + Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
+ + + + + Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ( a[i+63:i] > b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
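The compare entries above produce all-ones lane masks, which is why they combine directly with the bitwise AND/ANDNOT rows earlier in this file. A sketch assuming the `_mm256_cmpgt_epi32` and `_mm256_and_si256` bindings:

```rust
use std::arch::x86_64::*;
use std::mem::transmute;

#[target_feature(enable = "avx2")]
unsafe fn threshold_demo() {
    let x = _mm256_setr_epi32(5, 25, 10, 99, -3, 40, 7, 60);
    // Greater-than produces an all-ones (0xFFFFFFFF) mask in each passing lane...
    let gt = _mm256_cmpgt_epi32(x, _mm256_set1_epi32(20));
    // ...which combines with the bitwise AND to keep only passing lanes.
    let kept: [i32; 8] = transmute(_mm256_and_si256(gt, x));
    assert_eq!(kept, [0, 25, 0, 99, 0, 40, 0, 60]);
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        unsafe { threshold_demo() }
    }
}
```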
+ + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j:= 0 to 7 + i := 32*j + k := 16*j + dst[i+31:i] := SignExtend32(a[k+15:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j:= 0 to 3 + i := 64*j + k := 16*j + dst[i+63:i] := SignExtend64(a[k+15:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j:= 0 to 3 + i := 64*j + k := 32*j + dst[i+63:i] := SignExtend64(a[k+31:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + l := j*16 + dst[l+15:l] := SignExtend16(a[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 8*j + dst[i+31:i] := SignExtend32(a[k+7:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 8*j + dst[i+63:i] := SignExtend64(a[k+7:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 16*j + dst[i+31:i] := ZeroExtend32(a[k+15:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j:= 0 to 3 + i := 64*j + k := 16*j + dst[i+63:i] := ZeroExtend64(a[k+15:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j:= 0 to 3 + i := 64*j + k := 32*j + dst[i+63:i] := ZeroExtend64(a[k+31:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + l := j*16 + dst[l+15:l] := ZeroExtend16(a[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 8*j + dst[i+31:i] := ZeroExtend32(a[k+7:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 8*j + dst[i+63:i] := ZeroExtend64(a[k+7:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
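The sign/zero-extend rows above all read a 128-bit (or narrower) source and produce a 256-bit result. A sketch of the byte-to-word case, assuming it is `_mm256_cvtepu8_epi16`:

```rust
use std::arch::x86_64::*;
use std::mem::transmute;

#[target_feature(enable = "avx2")]
unsafe fn widen_demo(bytes: &[u8; 16]) -> [i16; 16] {
    // Zero-extend 16 unsigned bytes (a 128-bit source) into 16 16-bit lanes.
    let v = _mm_loadu_si128(bytes.as_ptr() as *const __m128i);
    transmute(_mm256_cvtepu8_epi16(v))
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        let bytes: [u8; 16] = core::array::from_fn(|i| (i * 16) as u8);
        let wide = unsafe { widen_demo(&bytes) };
        assert_eq!(wide[15], 240); // 0xF0 widens without sign extension
    }
}
```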
+ + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:64] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:64] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
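The gather rows above compute each element address as base plus index times scale. A sketch of the 32-bit-index, 32-bit-element case, assuming it is exposed as `_mm256_i32gather_epi32` with the scale as a const generic parameter in current `core::arch`:

```rust
use std::arch::x86_64::*;
use std::mem::transmute;

#[target_feature(enable = "avx2")]
unsafe fn gather_demo(table: &[i32], idx: [i32; 8]) -> [i32; 8] {
    let vindex = _mm256_loadu_si256(idx.as_ptr() as *const __m256i);
    // SCALE = 4 turns each 32-bit index into a byte offset of index * 4.
    transmute(_mm256_i32gather_epi32::<4>(table.as_ptr(), vindex))
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        let table: Vec<i32> = (0..100).map(|i| i * 10).collect();
        let got = unsafe { gather_demo(&table, [0, 7, 3, 99, 1, 1, 42, 5]) };
        assert_eq!(got, [0, 70, 30, 990, 10, 10, 420, 50]);
    }
}
```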
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + IF mask[i+63] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + IF mask[i+63] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + IF mask[i+31] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + IF mask[i+31] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + IF mask[i+31] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + IF mask[i+31] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + IF mask[i+63] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + IF mask[i+63] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + IF mask[i+63] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + IF mask[i+63] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF mask[i+31] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:64] := 0 +dst[MAX:64] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF mask[i+31] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF mask[i+31] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:64] := 0 +dst[MAX:64] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF mask[i+31] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + IF mask[i+63] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + IF mask[i+63] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). + +FOR j := 0 to 3 + i := j*32 + IF mask[i+31] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). + +FOR j := 0 to 7 + i := j*32 + IF mask[i+31] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). + +FOR j := 0 to 1 + i := j*64 + IF mask[i+63] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). + +FOR j := 0 to 3 + i := j*64 + IF mask[i+63] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + Load 256-bits of integer data from memory into "dst" using a non-temporal memory hint. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). + +FOR j := 0 to 3 + i := j*32 + IF mask[i+31] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX2 +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). + +FOR j := 0 to 7 + i := j*32 + IF mask[i+31] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX2 +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). + +FOR j := 0 to 1 + i := j*64 + IF mask[i+63] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX2 +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). + +FOR j := 0 to 3 + i := j*64 + IF mask[i+63] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX2 +
immintrin.h
+ Store +
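The maskload and maskstore rows above use the most significant bit of each mask lane to decide whether an element is touched at all. A combined sketch, assuming the 32-bit variants are `_mm256_maskload_epi32` and `_mm256_maskstore_epi32`:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx2")]
unsafe fn masked_copy(src: &[i32; 8], dst: &mut [i32; 8], keep: [bool; 8]) {
    // A lane is active when the mask lane's most significant bit is set.
    let lanes: [i32; 8] = core::array::from_fn(|i| if keep[i] { -1 } else { 0 });
    let mask = _mm256_loadu_si256(lanes.as_ptr() as *const __m256i);
    // Inactive lanes load as zero ...
    let loaded = _mm256_maskload_epi32(src.as_ptr(), mask);
    // ... and are simply not written on the store side.
    _mm256_maskstore_epi32(dst.as_mut_ptr(), mask, loaded);
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        let src = [1, 2, 3, 4, 5, 6, 7, 8];
        let mut dst = [0; 8];
        let keep = [true, false, true, false, true, false, true, false];
        unsafe { masked_copy(&src, &mut dst, keep) };
        assert_eq!(dst, [1, 0, 3, 0, 5, 0, 7, 0]);
    }
}
```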
+ + + + + Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] << (tmp*8) +dst[255:128] := a[255:128] << (tmp*8) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] << (tmp*8) +dst[255:128] := a[255:128] << (tmp*8) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
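Editor's note: the shift-left entries above come in an immediate form and a form that reads the count from the low 64 bits of a separate vector; in both, a count at or beyond the element width zeroes the lane. A small Rust sketch of the two forms, assuming these entries describe `_mm256_slli_epi32` and `_mm256_sll_epi32` as their pseudocode suggests:

```rust
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx2") { return; }
    unsafe {
        let a = _mm256_set1_epi32(3);
        // Immediate form: every 32-bit lane is shifted left by 4.
        let by_imm = _mm256_slli_epi32::<4>(a);
        // Count-register form: the shift amount is the low 64 bits of an __m128i.
        let by_cnt = _mm256_sll_epi32(a, _mm_cvtsi32_si128(4));
        let x: [i32; 8] = std::mem::transmute(by_imm);
        let y: [i32; 8] = std::mem::transmute(by_cnt);
        assert_eq!(x, [48; 8]); // 3 << 4
        assert_eq!(x, y);
    }
}
```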
+ + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
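Editor's note: the variable-shift entries above differ from the forms before them in that every lane gets its own count, and an out-of-range count zeroes just that lane rather than the whole vector. A sketch assuming the 32-bit AVX2 variant is `_mm256_sllv_epi32`:

```rust
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx2") { return; }
    unsafe {
        let a = _mm256_set1_epi32(1);
        // Per-lane shift counts; counts >= 32 zero the lane instead of wrapping.
        let counts = _mm256_setr_epi32(0, 1, 2, 3, 4, 31, 32, 100);
        let shifted = _mm256_sllv_epi32(a, counts);
        let out: [u32; 8] = std::mem::transmute(shifted);
        assert_eq!(out, [1, 2, 4, 8, 16, 0x8000_0000, 0, 0]);
    }
}
```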
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
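Editor's note: the arithmetic right-shift entries above replicate the sign bit into the vacated positions, which is what distinguishes them from the logical shifts later in this block. A sketch assuming the immediate 32-bit form is `_mm256_srai_epi32`:

```rust
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx2") { return; }
    unsafe {
        let a = _mm256_set1_epi32(-64);
        // Arithmetic shift keeps the sign bit: -64 >> 3 == -8 in every lane.
        let small = _mm256_srai_epi32::<3>(a);
        // Shifting by 31 leaves only the replicated sign bit: -1 for negative lanes.
        let sign = _mm256_srai_epi32::<31>(a);
        let x: [i32; 8] = std::mem::transmute(small);
        let y: [i32; 8] = std::mem::transmute(sign);
        assert_eq!(x, [-8; 8]);
        assert_eq!(y, [-1; 8]);
    }
}
```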
+ + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] >> (tmp*8) +dst[255:128] := a[255:128] >> (tmp*8) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] >> (tmp*8) +dst[255:128] := a[255:128] >> (tmp*8) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
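Editor's note: the logical right-shift entries above shift in zeros, and the per-lane variants again zero any lane whose count is out of range. A sketch assuming the 64-bit forms are `_mm256_srli_epi64` and `_mm256_srlv_epi64`:

```rust
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx2") { return; }
    unsafe {
        let a = _mm256_set1_epi64x(-1); // all bits set in each 64-bit lane
        // Logical shift fills with zeros, unlike the arithmetic forms above.
        let srl = _mm256_srli_epi64::<60>(a);
        // Per-lane counts; a count >= 64 zeroes that lane.
        let srlv = _mm256_srlv_epi64(a, _mm256_setr_epi64x(0, 1, 63, 64));
        let x: [u64; 4] = std::mem::transmute(srl);
        let y: [u64; 4] = std::mem::transmute(srlv);
        assert_eq!(x, [0xF; 4]);
        assert_eq!(y, [u64::MAX, u64::MAX >> 1, 1, 0]);
    }
}
```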
+ + + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +FOR i := 0 to 1 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] +ENDFOR +FOR j := 0 to 3 + i := j*64 + dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +FOR i := 0 to 1 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] +ENDFOR +FOR j := 0 to 3 + i := j*64 + tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +FOR i := 0 to 1 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] +ENDFOR +FOR j := 0 to 3 + i := j*64 + tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +tmp.dword[0] := b.dword[ imm8[1:0] ] +tmp.dword[1] := b.dword[ imm8[3:2] ] +tmp.dword[2] := b.dword[ imm8[5:4] ] +tmp.dword[3] := b.dword[ imm8[7:6] ] +FOR j := 0 to 1 + i := j*64 + dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +tmp.dword[0] := b.dword[ imm8[1:0] ] +tmp.dword[1] := b.dword[ imm8[3:2] ] +tmp.dword[2] := b.dword[ imm8[5:4] ] +tmp.dword[3] := b.dword[ imm8[7:6] ] +FOR j := 0 to 1 + i := j*64 + tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +tmp.dword[0] := b.dword[ imm8[1:0] ] +tmp.dword[1] := b.dword[ imm8[3:2] ] +tmp.dword[2] := b.dword[ imm8[5:4] ] +tmp.dword[3] := b.dword[ imm8[7:6] ] +FOR j := 0 to 1 + i := j*64 + tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
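Editor's note: the six entries above describe the AVX-512VL double-block SAD operation and its mask/maskz forms (the names, which this block does not carry, appear to be `_mm256_dbsad_epu8`/`_mm_dbsad_epu8` and variants). The sketch below assumes a Rust toolchain where the AVX-512 intrinsics in `std::arch::x86_64` are usable (they were nightly-only behind `stdarch_x86_avx512` for a long time) and just checks the arithmetic on constant inputs:

```rust
use std::arch::x86_64::*;

fn main() {
    if !(is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512vl")) {
        return;
    }
    unsafe {
        let a = _mm256_set1_epi8(9);
        let b = _mm256_set1_epi8(4);
        // imm8 = 0 selects dword 0 of each 128-bit lane of `b` for every quadruplet.
        let sads = _mm256_dbsad_epu8::<0>(a, b);
        let out: [u16; 16] = std::mem::transmute(sads);
        // Each 16-bit result is the sum of four |9 - 4| byte differences.
        assert_eq!(out, [20u16; 16]);
    }
}
```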
+ + + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + tmp_dst[i+127:i] := tmp[127:0] +ENDFOR +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + tmp_dst[i+127:i] := tmp[127:0] +ENDFOR +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := b[i+7:i] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := b[i+7:i] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := b[i+15:i] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := b[i+15:i] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
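Editor's note: the blend entries above are plain per-element selects driven by a mask register: bit j picks "b" when set and "a" otherwise. Assuming the 256-bit 16-bit-element form is `_mm256_mask_blend_epi16`, a sketch:

```rust
use std::arch::x86_64::*;

fn main() {
    if !(is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512vl")) {
        return;
    }
    unsafe {
        let a = _mm256_set1_epi16(1);
        let b = _mm256_set1_epi16(2);
        // Bit j of the mask selects b (1) or a (0) for 16-bit element j.
        let k: __mmask16 = 0b1010_1010_1010_1010;
        let blended = _mm256_mask_blend_epi16(k, a, b);
        let out: [i16; 16] = std::mem::transmute(blended);
        assert_eq!(out, [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2]);
    }
}
```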
+ + + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + off := 16*idx[i+3:i] + dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := idx[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + off := 16*idx[i+3:i] + dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + off := 16*idx[i+3:i] + dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + off := 16*idx[i+3:i] + dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] +ENDFOR +dst[MAX:256] := 0 + + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + off := 16*idx[i+2:i] + dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := idx[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + off := 16*idx[i+2:i] + dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + off := 16*idx[i+2:i] + dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + off := 16*idx[i+2:i] + dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] +ENDFOR +dst[MAX:128] := 0 + + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
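Editor's note: the permutex2var entries above build each output element from either "a" or "b", with the low index bits selecting the element and the next bit selecting the source vector. A sketch assuming the 256-bit form is `_mm256_permutex2var_epi16`:

```rust
use std::arch::x86_64::*;

fn main() {
    if !(is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512vl")) {
        return;
    }
    unsafe {
        // a holds 0..15, b holds 100..115; bit 4 of each index selects between them.
        let a = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = _mm256_setr_epi16(100, 101, 102, 103, 104, 105, 106, 107,
                                  108, 109, 110, 111, 112, 113, 114, 115);
        // Element 0 <- a[3], element 1 <- b[3] (index 16 + 3), the rest <- a[0].
        let idx = _mm256_setr_epi16(3, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm256_permutex2var_epi16(a, idx, b);
        let out: [i16; 16] = std::mem::transmute(r);
        assert_eq!(out[0], 3);
        assert_eq!(out[1], 103);
    }
}
```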
+ + + + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + id := idx[i+3:i]*16 + IF k[j] + dst[i+15:i] := a[id+15:id] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + id := idx[i+3:i]*16 + IF k[j] + dst[i+15:i] := a[id+15:id] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + id := idx[i+3:i]*16 + dst[i+15:i] := a[id+15:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + id := idx[i+2:i]*16 + IF k[j] + dst[i+15:i] := a[id+15:id] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + id := idx[i+2:i]*16 + IF k[j] + dst[i+15:i] := a[id+15:id] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + id := idx[i+2:i]*16 + dst[i+15:i] := a[id+15:id] +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a". + +FOR j := 0 to 31 + i := j*8 + IF a[i+7] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a". + +FOR j := 0 to 15 + i := j*8 + IF a[i+7] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := 0xFF + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := 0xFF + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := 0xFFFF + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := 0xFFFF + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a". + +FOR j := 0 to 15 + i := j*16 + IF a[i+15] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a". + +FOR j := 0 to 7 + i := j*16 + IF a[i+15] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
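Editor's note: the move-mask entries above convert between vectors and mask registers in both directions: the MSB of each element becomes one mask bit, and a mask bit expands back into an all-ones or all-zeros element. A sketch assuming the byte forms are `_mm256_movepi8_mask` and `_mm256_movm_epi8`:

```rust
use std::arch::x86_64::*;

fn main() {
    if !(is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512vl")) {
        return;
    }
    unsafe {
        // The MSB of each byte becomes one bit of the mask ...
        let v = _mm256_setr_epi8(
            -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1,
        );
        let k: __mmask32 = _mm256_movepi8_mask(v);
        assert_eq!(k, 0x8000_0055);
        // ... and the inverse expands each mask bit back to a 0x00/0xFF byte.
        let expanded = _mm256_movm_epi8(k);
        let bytes: [u8; 32] = std::mem::transmute(expanded);
        assert_eq!(bytes[0], 0xFF);
        assert_eq!(bytes[1], 0x00);
        assert_eq!(bytes[31], 0xFF);
    }
}
```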
+ + + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[4:0] := b[i+3:i] + (j & 0x10) + dst[i+7:i] := a[index*8+7:index*8] + FI + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[4:0] := b[i+3:i] + (j & 0x10) + dst[i+7:i] := a[index*8+7:index*8] + FI + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[3:0] := b[i+3:i] + dst[i+7:i] := a[index*8+7:index*8] + FI + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[3:0] := b[i+3:i] + dst[i+7:i] := a[index*8+7:index*8] + FI + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Swizzle +
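Editor's note: the masked byte-shuffle entries above behave like `pshufb` within each 128-bit lane, with the writemask or zeromask applied on top. A sketch assuming the 256-bit writemask form is `_mm256_mask_shuffle_epi8`:

```rust
use std::arch::x86_64::*;

fn main() {
    if !(is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512vl")) {
        return;
    }
    unsafe {
        let a = _mm256_setr_epi8(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
            30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
        );
        // Shuffle control: every byte picks index 1 within its own 128-bit lane.
        let ctrl = _mm256_set1_epi8(1);
        let src = _mm256_set1_epi8(-1);
        // Only even byte positions are shuffled; odd positions keep `src`.
        let k: __mmask32 = 0x5555_5555;
        let r = _mm256_mask_shuffle_epi8(src, k, a, ctrl);
        let out: [i8; 32] = std::mem::transmute(r);
        assert_eq!(out[0], 11);  // a[1] from the low 128-bit lane
        assert_eq!(out[1], -1);  // masked off, copied from src
        assert_eq!(out[16], 31); // a[17]: indices stay within the high lane
    }
}
```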
+ + + + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +tmp_dst[191:128] := a[191:128] +tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +tmp_dst[191:128] := a[191:128] +tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +tmp_dst[255:192] := a[255:192] +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +tmp_dst[255:192] := a[255:192] +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
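Editor's note: the unpack entries above interleave the low (or high) halves of each 128-bit lane of "a" and "b" and then apply the writemask or zeromask. A sketch assuming the 16-bit writemask form is `_mm256_mask_unpacklo_epi16`:

```rust
use std::arch::x86_64::*;

fn main() {
    if !(is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512vl")) {
        return;
    }
    unsafe {
        let a = _mm256_set1_epi16(1);
        let b = _mm256_set1_epi16(2);
        let src = _mm256_set1_epi16(-1);
        // Interleaving constant inputs yields 1,2,1,2,...; keep `src` where k is 0.
        let k: __mmask16 = 0x00FF;
        let r = _mm256_mask_unpacklo_epi16(src, k, a, b);
        let out: [i16; 16] = std::mem::transmute(r);
        assert_eq!(out, [1, 2, 1, 2, 1, 2, 1, 2, -1, -1, -1, -1, -1, -1, -1, -1]);
    }
}
```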
+ + + + + + Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 16 packed 16-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 32 packed 8-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 8 packed 16-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 16 packed 8-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
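The masked move above (presumably _mm256_mask_mov_epi16) is just a per-element select; a scalar equivalent, with illustrative names:

#include <stdint.h>

/* Select a[j] where mask bit j is set, src[j] otherwise. */
void mask_mov_epi16_256(uint16_t dst[16], const uint16_t src[16],
                        uint16_t k, const uint16_t a[16]) {
    for (int j = 0; j < 16; j++)
        dst[j] = ((k >> j) & 1) ? a[j] : src[j];
}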
+ + + + + Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Store packed 16-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*16 + IF k[j] + MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
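A scalar sketch of the masked 16-bit store above (likely _mm256_mask_storeu_epi16): only masked-on elements touch memory, which is what distinguishes it from a blend followed by a full store. Names are illustrative:

#include <stdint.h>
#include <string.h>

/* Write a[j] to memory only when mask bit j is set; other locations are
 * left untouched. */
void mask_storeu_epi16_256(void *mem_addr, uint16_t k, const uint16_t a[16]) {
    for (int j = 0; j < 16; j++)
        if ((k >> j) & 1)
            memcpy((uint8_t *)mem_addr + 2 * j, &a[j], 2);
}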
+ + + + + + Store packed 16-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*16 + IF k[j] + MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 8-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 31 + i := j*8 + IF k[j] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 8-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*8 + IF k[j] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 16 packed 16-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 32 packed 8-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 8 packed 16-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 16 packed 8-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := ABS(a[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
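A scalar reading of the masked byte absolute value above (presumably _mm256_mask_abs_epi8). Because the result is stored as an unsigned byte, ABS(-128) comes out as 0x80; the function name is illustrative:

#include <stdint.h>

void mask_abs_epi8_256(uint8_t dst[32], const uint8_t src[32],
                       uint32_t k, const int8_t a[32]) {
    for (int j = 0; j < 32; j++) {
        if ((k >> j) & 1)
            /* widen before negating so -128 maps to 0x80 without overflow */
            dst[j] = (uint8_t)(a[j] < 0 ? -(int)a[j] : (int)a[j]);
        else
            dst[j] = src[j];
    }
}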
+ + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := ABS(a[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := ABS(a[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := ABS(a[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ABS(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ABS(a[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ABS(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ABS(a[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + b[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
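The masked byte add above (presumably _mm256_mask_add_epi8) wraps modulo 256; a scalar sketch with illustrative names:

#include <stdint.h>

void mask_add_epi8_256(uint8_t dst[32], const uint8_t src[32], uint32_t k,
                       const uint8_t a[32], const uint8_t b[32]) {
    for (int j = 0; j < 32; j++)
        dst[j] = ((k >> j) & 1) ? (uint8_t)(a[j] + b[j]) : src[j];
}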
+ + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + b[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + b[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + b[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
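A scalar sketch of the saturating signed byte add above (presumably _mm256_mask_adds_epi8), spelling out what Saturate8 does; names are illustrative:

#include <stdint.h>

/* Clamp a widened sum to the signed 8-bit range. */
static int8_t saturate8(int v) {
    return (int8_t)(v > INT8_MAX ? INT8_MAX : v < INT8_MIN ? INT8_MIN : v);
}

void mask_adds_epi8_256(int8_t dst[32], const int8_t src[32], uint32_t k,
                        const int8_t a[32], const int8_t b[32]) {
    for (int j = 0; j < 32; j++)
        dst[j] = ((k >> j) & 1) ? saturate8((int)a[j] + (int)b[j]) : src[j];
}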
+ + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + b[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + b[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + b[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + b[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
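The unsigned average above (presumably _mm256_mask_avg_epu8) rounds up on ties via the +1 before the shift; a scalar sketch, names illustrative:

#include <stdint.h>

void mask_avg_epu8_256(uint8_t dst[32], const uint8_t src[32], uint32_t k,
                       const uint8_t a[32], const uint8_t b[32]) {
    for (int j = 0; j < 32; j++)
        dst[j] = ((k >> j) & 1)
                     ? (uint8_t)(((unsigned)a[j] + b[j] + 1) >> 1)  /* widen to avoid overflow */
                     : src[j];
}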
+ + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
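A scalar sketch of the masked u8-by-s8 multiply-add above (presumably _mm256_mask_maddubs_epi16): adjacent byte products are summed and saturated to a signed 16-bit word. Names are illustrative:

#include <stdint.h>

void mask_maddubs_epi16_256(int16_t dst[16], const int16_t src[16], uint16_t k,
                            const uint8_t a[32], const int8_t b[32]) {
    for (int j = 0; j < 16; j++) {
        if ((k >> j) & 1) {
            int sum = (int)a[2 * j] * b[2 * j] + (int)a[2 * j + 1] * b[2 * j + 1];
            if (sum > INT16_MAX) sum = INT16_MAX;     /* Saturate16 */
            if (sum < INT16_MIN) sum = INT16_MIN;
            dst[j] = (int16_t)sum;
        } else {
            dst[j] = src[j];
        }
    }
}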
+ + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
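The 16-bit multiply-add above (presumably _mm256_mask_madd_epi16) sums adjacent 32-bit products without saturation; only the all-(-32768) case exceeds the 32-bit range and wraps. A scalar sketch, names illustrative:

#include <stdint.h>

void mask_madd_epi16_256(int32_t dst[8], const int32_t src[8], uint8_t k,
                         const int16_t a[16], const int16_t b[16]) {
    for (int j = 0; j < 8; j++) {
        if ((k >> j) & 1) {
            int64_t sum = (int64_t)a[2 * j] * b[2 * j]
                        + (int64_t)a[2 * j + 1] * b[2 * j + 1];
            dst[j] = (int32_t)sum;   /* truncate; wraps in the lone overflow case */
        } else {
            dst[j] = src[j];
        }
    }
}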
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
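A scalar sketch of the rounding high multiply above (presumably _mm256_mask_mulhrs_epi16). Keeping bits [16:1] of ((a*b >> 14) + 1) is a round-to-nearest form of (a*b) >> 15; the snippet assumes arithmetic right shift of signed values, and the names are illustrative:

#include <stdint.h>

void mask_mulhrs_epi16_256(int16_t dst[16], const int16_t src[16], uint16_t k,
                           const int16_t a[16], const int16_t b[16]) {
    for (int j = 0; j < 16; j++) {
        if ((k >> j) & 1) {
            int32_t tmp = (((int32_t)a[j] * b[j]) >> 14) + 1;
            dst[j] = (int16_t)(tmp >> 1);   /* bits [16:1] of tmp */
        } else {
            dst[j] = src[j];
        }
    }
}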
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
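The unsigned high multiply above (presumably _mm256_mask_mulhi_epu16) keeps the top half of the 32-bit product; a scalar sketch with illustrative names:

#include <stdint.h>

void mask_mulhi_epu16_256(uint16_t dst[16], const uint16_t src[16], uint16_t k,
                          const uint16_t a[16], const uint16_t b[16]) {
    for (int j = 0; j < 16; j++)
        dst[j] = ((k >> j) & 1)
                     ? (uint16_t)(((uint32_t)a[j] * b[j]) >> 16)
                     : src[j];
}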
+ + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
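The subtract entries above all follow the same writemask/zeromask pattern. As a minimal sketch of that pattern from Rust, using the matching core::arch intrinsic for the 128-bit zeromasked saturating byte subtract (this assumes an AVX512BW+AVX512VL-capable CPU and a toolchain recent enough to expose the AVX-512 intrinsics in std::arch::x86_64):

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512bw,avx512vl")]
    unsafe fn masked_saturating_sub() {
        let a = _mm_set1_epi8(100);
        let b = _mm_set1_epi8(-100);
        // 100 - (-100) = 200 saturates to i8::MAX in the lanes whose mask bit
        // is set; lanes with a clear bit are zeroed, as in the pseudocode.
        let r = _mm_maskz_subs_epi8(0x00ff, a, b);
        let mut out = [0i8; 16];
        _mm_storeu_si128(out.as_mut_ptr().cast(), r);
        assert_eq!(out[0], 127); // mask bit set: saturated difference
        assert_eq!(out[15], 0);  // mask bit clear: zeroed
    }

    fn main() {
        // Only call the target_feature function after runtime detection.
        if is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512vl") {
            unsafe { masked_saturating_sub() };
        }
    }

The writemask ("mask_") variants differ only in taking an extra src vector whose lanes are kept wherever the corresponding mask bit is clear.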
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +tmp_dst[143:128] := Saturate16(a[159:128]) +tmp_dst[159:144] := Saturate16(a[191:160]) +tmp_dst[175:160] := Saturate16(a[223:192]) +tmp_dst[191:176] := Saturate16(a[255:224]) +tmp_dst[207:192] := Saturate16(b[159:128]) +tmp_dst[223:208] := Saturate16(b[191:160]) +tmp_dst[239:224] := Saturate16(b[223:192]) +tmp_dst[255:240] := Saturate16(b[255:224]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +tmp_dst[143:128] := Saturate16(a[159:128]) +tmp_dst[159:144] := Saturate16(a[191:160]) +tmp_dst[175:160] := Saturate16(a[223:192]) +tmp_dst[191:176] := Saturate16(a[255:224]) +tmp_dst[207:192] := Saturate16(b[159:128]) +tmp_dst[223:208] := Saturate16(b[191:160]) +tmp_dst[239:224] := Saturate16(b[223:192]) +tmp_dst[255:240] := Saturate16(b[255:224]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +tmp_dst[135:128] := Saturate8(a[143:128]) +tmp_dst[143:136] := Saturate8(a[159:144]) +tmp_dst[151:144] := Saturate8(a[175:160]) +tmp_dst[159:152] := Saturate8(a[191:176]) +tmp_dst[167:160] := Saturate8(a[207:192]) +tmp_dst[175:168] := Saturate8(a[223:208]) +tmp_dst[183:176] := Saturate8(a[239:224]) +tmp_dst[191:184] := Saturate8(a[255:240]) +tmp_dst[199:192] := Saturate8(b[143:128]) +tmp_dst[207:200] := Saturate8(b[159:144]) +tmp_dst[215:208] := Saturate8(b[175:160]) +tmp_dst[223:216] := Saturate8(b[191:176]) +tmp_dst[231:224] := Saturate8(b[207:192]) +tmp_dst[239:232] := Saturate8(b[223:208]) +tmp_dst[247:240] := Saturate8(b[239:224]) +tmp_dst[255:248] := Saturate8(b[255:240]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +tmp_dst[135:128] := Saturate8(a[143:128]) +tmp_dst[143:136] := Saturate8(a[159:144]) +tmp_dst[151:144] := Saturate8(a[175:160]) +tmp_dst[159:152] := Saturate8(a[191:176]) +tmp_dst[167:160] := Saturate8(a[207:192]) +tmp_dst[175:168] := Saturate8(a[223:208]) +tmp_dst[183:176] := Saturate8(a[239:224]) +tmp_dst[191:184] := Saturate8(a[255:240]) +tmp_dst[199:192] := Saturate8(b[143:128]) +tmp_dst[207:200] := Saturate8(b[159:144]) +tmp_dst[215:208] := Saturate8(b[175:160]) +tmp_dst[223:216] := Saturate8(b[191:176]) +tmp_dst[231:224] := Saturate8(b[207:192]) +tmp_dst[239:232] := Saturate8(b[223:208]) +tmp_dst[247:240] := Saturate8(b[239:224]) +tmp_dst[255:248] := Saturate8(b[255:240]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
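The pack entries above (32-to-16 and 16-to-8 with signed saturation) interleave the low and high halves of "a" and "b" per 128-bit lane, as spelled out by the tmp_dst assignments. A minimal Rust sketch of the 256-bit writemasked 32-to-16 pack, under the same CPU/toolchain assumptions as the earlier sketch:

    use std::arch::x86_64::*;

    // Call only after the runtime feature checks shown in the first sketch.
    #[target_feature(enable = "avx512bw,avx512vl")]
    unsafe fn masked_signed_pack() {
        let a = _mm256_set1_epi32(100_000);  // above i16::MAX, saturates to 32767
        let b = _mm256_set1_epi32(-100_000); // below i16::MIN, saturates to -32768
        let src = _mm256_set1_epi16(7);
        // Even mask bits take the packed, saturated value; odd bits keep `src`.
        let r = _mm256_mask_packs_epi32(src, 0b0101_0101_0101_0101, a, b);
        let mut out = [0i16; 16];
        _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
        assert_eq!(out[0], i16::MAX); // packed from a, saturated
        assert_eq!(out[1], 7);        // mask bit clear, copied from src
        assert_eq!(out[4], i16::MIN); // packed from b, saturated
    }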
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +tmp_dst[143:128] := SaturateU16(a[159:128]) +tmp_dst[159:144] := SaturateU16(a[191:160]) +tmp_dst[175:160] := SaturateU16(a[223:192]) +tmp_dst[191:176] := SaturateU16(a[255:224]) +tmp_dst[207:192] := SaturateU16(b[159:128]) +tmp_dst[223:208] := SaturateU16(b[191:160]) +tmp_dst[239:224] := SaturateU16(b[223:192]) +tmp_dst[255:240] := SaturateU16(b[255:224]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +tmp_dst[143:128] := SaturateU16(a[159:128]) +tmp_dst[159:144] := SaturateU16(a[191:160]) +tmp_dst[175:160] := SaturateU16(a[223:192]) +tmp_dst[191:176] := SaturateU16(a[255:224]) +tmp_dst[207:192] := SaturateU16(b[159:128]) +tmp_dst[223:208] := SaturateU16(b[191:160]) +tmp_dst[239:224] := SaturateU16(b[223:192]) +tmp_dst[255:240] := SaturateU16(b[255:224]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +tmp_dst[135:128] := SaturateU8(a[143:128]) +tmp_dst[143:136] := SaturateU8(a[159:144]) +tmp_dst[151:144] := SaturateU8(a[175:160]) +tmp_dst[159:152] := SaturateU8(a[191:176]) +tmp_dst[167:160] := SaturateU8(a[207:192]) +tmp_dst[175:168] := SaturateU8(a[223:208]) +tmp_dst[183:176] := SaturateU8(a[239:224]) +tmp_dst[191:184] := SaturateU8(a[255:240]) +tmp_dst[199:192] := SaturateU8(b[143:128]) +tmp_dst[207:200] := SaturateU8(b[159:144]) +tmp_dst[215:208] := SaturateU8(b[175:160]) +tmp_dst[223:216] := SaturateU8(b[191:176]) +tmp_dst[231:224] := SaturateU8(b[207:192]) +tmp_dst[239:232] := SaturateU8(b[223:208]) +tmp_dst[247:240] := SaturateU8(b[239:224]) +tmp_dst[255:248] := SaturateU8(b[255:240]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +tmp_dst[135:128] := SaturateU8(a[143:128]) +tmp_dst[143:136] := SaturateU8(a[159:144]) +tmp_dst[151:144] := SaturateU8(a[175:160]) +tmp_dst[159:152] := SaturateU8(a[191:176]) +tmp_dst[167:160] := SaturateU8(a[207:192]) +tmp_dst[175:168] := SaturateU8(a[223:208]) +tmp_dst[183:176] := SaturateU8(a[239:224]) +tmp_dst[191:184] := SaturateU8(a[255:240]) +tmp_dst[199:192] := SaturateU8(b[143:128]) +tmp_dst[207:200] := SaturateU8(b[159:144]) +tmp_dst[215:208] := SaturateU8(b[175:160]) +tmp_dst[223:216] := SaturateU8(b[191:176]) +tmp_dst[231:224] := SaturateU8(b[207:192]) +tmp_dst[239:232] := SaturateU8(b[223:208]) +tmp_dst[247:240] := SaturateU8(b[239:224]) +tmp_dst[255:248] := SaturateU8(b[255:240]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
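The unsigned-saturation packs differ from the signed ones only in the clamping range: negative inputs go to 0 and large inputs to the unsigned maximum. A minimal Rust sketch for the 128-bit zeromasked 16-to-8 case, under the same assumptions as the earlier sketches:

    use std::arch::x86_64::*;

    // Call only after the runtime feature checks shown in the first sketch.
    #[target_feature(enable = "avx512bw,avx512vl")]
    unsafe fn masked_unsigned_pack() {
        let a = _mm_set1_epi16(300); // above u8::MAX, clamps to 255
        let b = _mm_set1_epi16(-5);  // negative input clamps to 0
        // Zeromask: the upper eight result bytes are forced to zero.
        let r = _mm_maskz_packus_epi16(0x00ff, a, b);
        let mut out = [0u8; 16];
        _mm_storeu_si128(out.as_mut_ptr().cast(), r);
        assert_eq!(out[0], 255); // SaturateU8(300)
        assert_eq!(out[8], 0);   // SaturateU8(-5), and masked off anyway
    }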
+ + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := 16*j + l := 8*j + dst[l+7:l] := Saturate8(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 16*j + l := 8*j + dst[l+7:l] := Saturate8(a[i+15:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
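The entries above narrow 16-bit lanes to 8-bit lanes with signed saturation, either into a register or, for the store forms, directly to unaligned memory for the lanes selected by the mask. A minimal Rust sketch of the register form for the 256-to-128 writemasked case, under the same assumptions as the earlier sketches:

    use std::arch::x86_64::*;

    // Call only after the runtime feature checks shown in the first sketch.
    #[target_feature(enable = "avx512bw,avx512vl")]
    unsafe fn narrow_with_signed_saturation() {
        let a = _mm256_set1_epi16(1000); // outside i8 range, saturates to 127
        let src = _mm_set1_epi8(-1);
        // Writemask: the lower eight lanes take the saturated conversion,
        // the upper eight keep the corresponding bytes of `src`.
        let r = _mm256_mask_cvtsepi16_epi8(src, 0x00ff, a);
        let mut out = [0i8; 16];
        _mm_storeu_si128(out.as_mut_ptr().cast(), r);
        assert_eq!(out[0], 127);
        assert_eq!(out[15], -1);
    }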
+ + + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
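The masked sign extensions above widen each selected byte to a 16-bit lane while preserving its sign. A minimal Rust sketch of the 256-bit zeromasked form, under the same assumptions as the earlier sketches:

    use std::arch::x86_64::*;

    // Call only after the runtime feature checks shown in the first sketch.
    #[target_feature(enable = "avx512bw,avx512vl")]
    unsafe fn masked_sign_extend() {
        let a = _mm_set1_epi8(-3);
        // Each selected byte is sign-extended to 16 bits; unselected lanes are zeroed.
        let r = _mm256_maskz_cvtepi8_epi16(0x0f0f, a);
        let mut out = [0i16; 16];
        _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
        assert_eq!(out[0], -3); // sign preserved
        assert_eq!(out[4], 0);  // mask bit clear
    }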
+ + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := 16*j + l := 8*j + dst[l+7:l] := SaturateU8(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 16*j + l := 8*j + dst[l+7:l] := SaturateU8(a[i+15:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
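These entries narrow unsigned 16-bit lanes to unsigned 8-bit lanes, clamping anything above 255. A minimal Rust sketch of the 128-bit zeromasked case, under the same assumptions as the earlier sketches:

    use std::arch::x86_64::*;

    // Call only after the runtime feature checks shown in the first sketch.
    #[target_feature(enable = "avx512bw,avx512vl")]
    unsafe fn narrow_with_unsigned_saturation() {
        let a = _mm_set1_epi16(-1); // bit pattern 0xFFFF, i.e. 65535 as an unsigned word
        let r = _mm_maskz_cvtusepi16_epi8(0x0f, a);
        let mut out = [0u8; 16];
        _mm_storeu_si128(out.as_mut_ptr().cast(), r);
        assert_eq!(out[0], 255); // SaturateU8(0xFFFF)
        assert_eq!(out[7], 0);   // mask bit clear; bytes 8..16 of dst are zero as well
    }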
+ + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 15 + i := 16*j + l := 8*j + dst[l+7:l] := Truncate8(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 16*j + l := 8*j + dst[l+7:l] := Truncate8(a[i+15:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
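Unlike the saturating converters above, the truncating forms simply drop the upper byte of each 16-bit lane. A minimal Rust sketch of the unmasked 256-to-128 case, under the same assumptions as the earlier sketches:

    use std::arch::x86_64::*;

    // Call only after the runtime feature checks shown in the first sketch.
    #[target_feature(enable = "avx512bw,avx512vl")]
    unsafe fn narrow_with_truncation() {
        // 0x0180 = 384: truncation keeps only the low byte (0x80 = -128 as i8),
        // whereas the saturating converters above would have produced 127.
        let a = _mm256_set1_epi16(0x0180);
        let r = _mm256_cvtepi16_epi8(a);
        let mut out = [0i8; 16];
        _mm_storeu_si128(out.as_mut_ptr().cast(), r);
        assert_eq!(out[0], -128);
    }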
+ + + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
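The masked zero extensions above are the unsigned counterpart of the earlier sign extensions: each selected byte is widened with zero fill. A minimal Rust sketch of the 256-bit zeromasked form, under the same assumptions as the earlier sketches:

    use std::arch::x86_64::*;

    // Call only after the runtime feature checks shown in the first sketch.
    #[target_feature(enable = "avx512bw,avx512vl")]
    unsafe fn masked_zero_extend() {
        let a = _mm_set1_epi8(-3); // byte 0xFD, i.e. 253 when read as unsigned
        let r = _mm256_maskz_cvtepu8_epi16(0x0003, a);
        let mut out = [0i16; 16];
        _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
        assert_eq!(out[0], 253); // zero-extended rather than sign-extended
        assert_eq!(out[2], 0);   // mask bit clear
    }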
+ + + + + + Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
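The broadcast entries above splat a single 8-bit or 16-bit value into every selected lane, keeping src lanes (writemask) or zeroing lanes (zeromask) elsewhere. A minimal Rust sketch using the scalar-source form for 8-bit lanes; the vector-source forms that take the low element of a vector behave the same way on the destination side. Same assumptions as the earlier sketches:

    use std::arch::x86_64::*;

    // Call only after the runtime feature checks shown in the first sketch.
    #[target_feature(enable = "avx512bw,avx512vl")]
    unsafe fn masked_broadcast() {
        let src = _mm256_set1_epi8(0);
        // Broadcast 42 into the lanes selected by the writemask; the others keep src.
        let r = _mm256_mask_set1_epi8(src, 0x0000_ffff, 42);
        let mut out = [0i8; 32];
        _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
        assert_eq!(out[0], 42);  // mask bit set
        assert_eq!(out[31], 0);  // mask bit clear, copied from src
    }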
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
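The signed byte compares above come in an imm8-driven form, which selects one of the eight _MM_CMPINT_* predicates, and in named wrappers (eq, ge, gt, le, lt, neq) with the predicate fixed; both produce a mask register result, optionally pre-filtered by a zeromask k1. A minimal Rust sketch using the named wrappers for 256-bit vectors, under the same assumptions as the earlier sketches:

    use std::arch::x86_64::*;

    // Call only after the runtime feature checks shown in the first sketch.
    #[target_feature(enable = "avx512bw,avx512vl")]
    unsafe fn signed_byte_compares() {
        let a = _mm256_set1_epi8(-1);
        let b = _mm256_set1_epi8(1);
        // Signed compare: -1 < 1 in every lane, so every mask bit is set.
        let lt: __mmask32 = _mm256_cmplt_epi8_mask(a, b);
        assert_eq!(lt, u32::MAX);
        // Masked form: only lanes whose bit is already set in k1 can report 1.
        let k1: __mmask32 = 0x0000_00ff;
        assert_eq!(_mm256_mask_cmplt_epi8_mask(k1, a, b), 0x0000_00ff);
    }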
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
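The unsigned 8-bit compare entries above produce one mask bit per byte lane rather than a vector of 0/-1 lanes. A minimal C sketch of the plain and zeromasked 128-bit less-than forms, assuming a toolchain with AVX512BW and AVX512VL enabled (variable names are illustrative):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128i a = _mm_set1_epi8(3);
        __m128i b = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                  8, 9, 10, 11, 12, 13, 14, 15);
        /* One mask bit per byte lane: bit j is set when a[j] < b[j] (unsigned). */
        __mmask16 lt = _mm_cmplt_epu8_mask(a, b);
        /* Zeromasked form: lanes whose bit in k1 is clear come out as 0. */
        __mmask16 k1 = 0x00FF;
        __mmask16 lt_masked = _mm_mask_cmplt_epu8_mask(k1, a, b);
        printf("%04x %04x\n", (unsigned)lt, (unsigned)lt_masked);  /* fff0 00f0 */
        return 0;
    }

With a = 3 in every lane and b running 0..15, only lanes 4..15 satisfy the unsigned less-than, so the full mask is 0xFFF0 and the zeromasked result keeps just bits 4..7.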
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
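In the imm8 forms above, the predicate only selects one of the eight fixed comparisons, so _MM_CMPINT_LE and the dedicated less-than-or-equal entry yield identical masks. A short C sketch, assuming AVX512BW and AVX512VL:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128i a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i b = _mm_set1_epi16(3);
        /* Predicate form and dedicated form compute the same unsigned <= mask. */
        __mmask8 via_cmp   = _mm_cmp_epu16_mask(a, b, _MM_CMPINT_LE);
        __mmask8 via_cmple = _mm_cmple_epu16_mask(a, b);
        printf("%02x %02x\n", (unsigned)via_cmp, (unsigned)via_cmple);  /* 0f 0f */
        return 0;
    }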
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
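The signed 16-bit compares above differ from the unsigned ones only in how each lane's bits are interpreted, so the same bit pattern can order differently. A short C sketch, assuming AVX512BW and AVX512VL:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128i a = _mm_set1_epi16(-1);   /* 0xFFFF in every lane */
        __m128i b = _mm_set1_epi16(1);
        __mmask8 gt_signed   = _mm_cmpgt_epi16_mask(a, b);  /* -1 > 1    -> 00 */
        __mmask8 gt_unsigned = _mm_cmpgt_epu16_mask(a, b);  /* 65535 > 1 -> ff */
        printf("%02x %02x\n", (unsigned)gt_signed, (unsigned)gt_unsigned);
        return 0;
    }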
+ + + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 31 + i := j*8 + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 15 + i := j*8 + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 15 + i := j*16 + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 7 + i := j*16 + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 31 + i := j*8 + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 15 + i := j*8 + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 15 + i := j*16 + k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 7 + i := j*16 + k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
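The test and testn entries above set a mask bit from (a AND b) != 0 and (a AND b) == 0 respectively, so for the same inputs the two masks are exact complements. A brief C sketch, assuming AVX512BW and AVX512VL:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128i a = _mm_setr_epi16(1, 0, 2, 0, 4, 0, 8, 0);
        __m128i b = _mm_set1_epi16(0x000F);
        __mmask8 m  = _mm_test_epi16_mask(a, b);    /* lanes with a non-zero AND: 55 */
        __mmask8 mn = _mm_testn_epi16_mask(a, b);   /* the remaining lanes:       aa */
        printf("%02x %02x %02x\n", (unsigned)m, (unsigned)mn, (unsigned)(m ^ mn));
        return 0;
    }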
+ + + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
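The variable-count shifts above take a per-lane shift amount and zero any lane whose count is 16 or more, unlike the single-count forms further below. A minimal C sketch, assuming AVX512BW and AVX512VL:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128i a     = _mm_set1_epi16(1);
        __m128i count = _mm_setr_epi16(0, 1, 2, 3, 15, 16, 17, 100);
        __m128i r     = _mm_sllv_epi16(a, count);
        short out[8];
        _mm_storeu_si128((__m128i *)out, r);
        for (int j = 0; j < 8; j++)
            printf("%d ", (unsigned short)out[j]);  /* 1 2 4 8 32768 0 0 0 */
        printf("\n");
        return 0;
    }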
+ + + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
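The writemask and zeromask pairs above differ only in what happens to inactive lanes: the writemask form copies them from "src", the zeromask form clears them. A small C sketch using the 128-bit shift-by-immediate, assuming AVX512BW and AVX512VL:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128i src = _mm_set1_epi16(-1);
        __m128i a   = _mm_set1_epi16(3);
        __mmask8 k  = 0x0F;                                  /* low four lanes active */
        __m128i merged = _mm_mask_slli_epi16(src, k, a, 4);  /* 48 x4, then -1 x4 */
        __m128i zeroed = _mm_maskz_slli_epi16(k, a, 4);      /* 48 x4, then  0 x4 */
        short m[8], z[8];
        _mm_storeu_si128((__m128i *)m, merged);
        _mm_storeu_si128((__m128i *)z, zeroed);
        for (int j = 0; j < 8; j++) printf("%d/%d ", m[j], z[j]);
        printf("\n");
        return 0;
    }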
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
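In the arithmetic right shifts above, the vacated bits are copies of the sign bit, and any count above 15 collapses a lane to all zeros or all ones according to its sign. A short C sketch (the unmasked shift-by-immediate is plain SSE2; only the masked variants need AVX512BW and AVX512VL):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128i a = _mm_setr_epi16(-32768, -2, -1, 0, 1, 2, 16384, 32767);
        short by2[8], by20[8];
        _mm_storeu_si128((__m128i *)by2,  _mm_srai_epi16(a, 2));
        _mm_storeu_si128((__m128i *)by20, _mm_srai_epi16(a, 20));  /* count > 15 */
        for (int j = 0; j < 8; j++) printf("%d/%d ", by2[j], by20[j]);
        printf("\n");  /* -8192/-1 -1/-1 -1/-1 0/0 0/0 0/0 4096/0 8191/0 */
        return 0;
    }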
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
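The logical right shifts above shift in zeros, which is the only difference from the arithmetic forms earlier. A one-lane C contrast (unmasked SSE2 forms; the masked variants above need AVX512BW and AVX512VL):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128i a = _mm_set1_epi16(-4);                 /* 0xFFFC in every lane */
        short logical[8], arithmetic[8];
        _mm_storeu_si128((__m128i *)logical,    _mm_srli_epi16(a, 1));  /* 0x7FFE */
        _mm_storeu_si128((__m128i *)arithmetic, _mm_srai_epi16(a, 1));  /* 0xFFFE */
        printf("%d %d\n", logical[0], arithmetic[0]);   /* 32766 -2 */
        return 0;
    }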
+ + + + Reduce the packed 16-bit integers in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[15:0] + src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] + src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_ADD(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_ADD(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[15:0] + src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] + src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_ADD(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0 + FI +ENDFOR +dst[15:0] := REDUCE_ADD(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 16-bit integers in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[15:0] + src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] + src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_ADD(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_ADD(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[15:0] + src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] + src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_ADD(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0 + FI +ENDFOR +dst[15:0] := REDUCE_ADD(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[7:0] + src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] + src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_ADD(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_ADD(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[7:0] + src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] + src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_ADD(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0 + FI +ENDFOR +dst[7:0] := REDUCE_ADD(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[7:0] + src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] + src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_ADD(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_ADD(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[7:0] + src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] + src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_ADD(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0 + FI +ENDFOR +dst[7:0] := REDUCE_ADD(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
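The reduce-add entries above are composite sequences rather than single instructions: the vector is folded in half repeatedly until one element remains, using ordinary wrap-around addition at each step. A plain-C model of the 8-element 16-bit form (the helper name below is illustrative, not an immintrin.h intrinsic):

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors the recursive REDUCE_ADD pseudocode: fold the upper half onto the
       lower half, halving the length each time (8 -> 4 -> 2 -> 1). */
    static int16_t reduce_add_epi16_model(const int16_t v[8]) {
        int16_t t[8];
        for (int j = 0; j < 8; j++) t[j] = v[j];
        for (int len = 4; len >= 1; len /= 2)
            for (int j = 0; j < len; j++)
                t[j] = (int16_t)(t[j] + t[j + len]);
        return t[0];
    }

    int main(void) {
        int16_t v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        printf("%d\n", reduce_add_epi16_model(v));  /* 36 */
        return 0;
    }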
+ + + + Reduce the packed 16-bit integers in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[15:0] * src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] * src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_MUL(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MUL(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[15:0] * src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] * src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_MUL(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 1 + FI +ENDFOR +dst[15:0] := REDUCE_MUL(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 16-bit integers in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[15:0] * src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] * src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_MUL(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MUL(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[15:0] * src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] * src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_MUL(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 1 + FI +ENDFOR +dst[15:0] := REDUCE_MUL(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[7:0] * src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] * src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_MUL(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MUL(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[7:0] * src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] * src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_MUL(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 1 + FI +ENDFOR +dst[7:0] := REDUCE_MUL(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[7:0] * src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] * src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_MUL(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MUL(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[7:0] * src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] * src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_MUL(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 1 + FI +ENDFOR +dst[7:0] := REDUCE_MUL(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 16-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[15:0] OR src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] OR src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_OR(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_OR(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[15:0] OR src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] OR src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_OR(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0 + FI +ENDFOR +dst[15:0] := REDUCE_OR(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 16-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[15:0] OR src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] OR src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_OR(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_OR(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[15:0] OR src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] OR src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_OR(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0 + FI +ENDFOR +dst[15:0] := REDUCE_OR(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[7:0] OR src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] OR src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_OR(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_OR(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[7:0] OR src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] OR src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_OR(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0 + FI +ENDFOR +dst[7:0] := REDUCE_OR(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[7:0] OR src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] OR src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_OR(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_OR(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[7:0] OR src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] OR src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_OR(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0 + FI +ENDFOR +dst[7:0] := REDUCE_OR(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 16-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[15:0] AND src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] AND src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_AND(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_AND(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[15:0] AND src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] AND src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_AND(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0xFFFF + FI +ENDFOR +dst[15:0] := REDUCE_AND(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 16-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[15:0] AND src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] AND src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_AND(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_AND(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[15:0] AND src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] AND src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_AND(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0xFFFF + FI +ENDFOR +dst[15:0] := REDUCE_AND(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[7:0] AND src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] AND src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_AND(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_AND(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[7:0] AND src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] AND src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_AND(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0xFF + FI +ENDFOR +dst[7:0] := REDUCE_AND(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[7:0] AND src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] AND src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_AND(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_AND(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[7:0] AND src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] AND src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_AND(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0xFF + FI +ENDFOR +dst[7:0] := REDUCE_AND(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed signed 16-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MAX(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 16-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := Int16(-0x8000) + FI +ENDFOR +dst[15:0] := REDUCE_MAX(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 16-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MAX(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 16-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := Int16(-0x8000) + FI +ENDFOR +dst[15:0] := REDUCE_MAX(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 8-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MAX(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 8-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := Int8(-0x80) + FI +ENDFOR +dst[7:0] := REDUCE_MAX(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 8-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MAX(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 8-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := Int8(-0x80) + FI +ENDFOR +dst[7:0] := REDUCE_MAX(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 16-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MAX(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 16-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0 + FI +ENDFOR +dst[15:0] := REDUCE_MAX(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 16-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MAX(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 16-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0 + FI +ENDFOR +dst[15:0] := REDUCE_MAX(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 8-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MAX(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 8-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0 + FI +ENDFOR +dst[7:0] := REDUCE_MAX(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 8-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MAX(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 8-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0 + FI +ENDFOR +dst[7:0] := REDUCE_MAX(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 16-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MIN(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 16-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := Int16(0x7FFF) + FI +ENDFOR +dst[15:0] := REDUCE_MIN(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 16-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MIN(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 16-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := Int16(0x7FFF) + FI +ENDFOR +dst[15:0] := REDUCE_MIN(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 8-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MIN(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 8-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := Int8(0x7F) + FI +ENDFOR +dst[7:0] := REDUCE_MIN(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 8-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MIN(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 8-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := Int8(0x7F) + FI +ENDFOR +dst[7:0] := REDUCE_MIN(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 16-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MIN(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 16-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0xFFFF + FI +ENDFOR +dst[15:0] := REDUCE_MIN(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 16-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MIN(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 16-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0xFFFF + FI +ENDFOR +dst[15:0] := REDUCE_MIN(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 8-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MIN(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 8-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0xFF + FI +ENDFOR +dst[7:0] := REDUCE_MIN(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 8-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MIN(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 8-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0xFF + FI +ENDFOR +dst[7:0] := REDUCE_MIN(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Unpack and interleave 32 bits from masks "a" and "b", and store the 64-bit result in "dst". + +dst[31:0] := b[31:0] +dst[63:32] := a[31:0] +dst[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Unpack and interleave 16 bits from masks "a" and "b", and store the 32-bit result in "dst". + +dst[15:0] := b[15:0] +dst[31:16] := a[15:0] +dst[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +FOR i := 0 to 3 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] +ENDFOR +FOR j := 0 to 7 + i := j*64 + dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +FOR i := 0 to 3 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] +ENDFOR +FOR j := 0 to 7 + i := j*64 + tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +FOR i := 0 to 3 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] +ENDFOR +FOR j := 0 to 7 + i := j*64 + tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". + +FOR j := 0 to 3 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + dst[i+127:i] := tmp[127:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + tmp_dst[i+127:i] := tmp[127:0] +ENDFOR +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + tmp_dst[i+127:i] := tmp[127:0] +ENDFOR +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := b[i+7:i] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := b[i+15:i] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + off := 16*idx[i+4:i] + dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := idx[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + off := 16*idx[i+4:i] + dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + off := 16*idx[i+4:i] + dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + off := 16*idx[i+4:i] + dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] +ENDFOR +dst[MAX:512] := 0 + + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + id := idx[i+4:i]*16 + IF k[j] + dst[i+15:i] := a[id+15:id] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + id := idx[i+4:i]*16 + IF k[j] + dst[i+15:i] := a[id+15:id] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + id := idx[i+4:i]*16 + dst[i+15:i] := a[id+15:id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a". + +FOR j := 0 to 63 + i := j*8 + IF a[i+7] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := 0xFF + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := 0xFFFF + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a". + +FOR j := 0 to 31 + i := j*16 + IF a[i+15] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce eight unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". + +FOR j := 0 to 63 + i := j*8 + tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +ENDFOR +FOR j := 0 to 7 + i := j*64 + dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \ + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] + dst[i+63:i+16] := 0 +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 8-bit integers in "a" within 128-bit lanes using the control in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[5:0] := b[i+3:i] + (j & 0x30) + dst[i+7:i] := a[index*8+7:index*8] + FI + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[5:0] := b[i+3:i] + (j & 0x30) + dst[i+7:i] := a[index*8+7:index*8] + FI + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Swizzle +
+ + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[5:0] := b[i+3:i] + (j & 0x30) + dst[i+7:i] := a[index*8+7:index*8] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +tmp_dst[191:128] := a[191:128] +tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +tmp_dst[319:256] := a[319:256] +tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] +tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] +tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] +tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] +tmp_dst[447:384] := a[447:384] +tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] +tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] +tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] +tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +tmp_dst[191:128] := a[191:128] +tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +tmp_dst[319:256] := a[319:256] +tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] +tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] +tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] +tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] +tmp_dst[447:384] := a[447:384] +tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] +tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] +tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] +tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst". + +dst[63:0] := a[63:0] +dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +dst[191:128] := a[191:128] +dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +dst[319:256] := a[319:256] +dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] +dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] +dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] +dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] +dst[447:384] := a[447:384] +dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] +dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] +dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] +dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +tmp_dst[255:192] := a[255:192] +tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] +tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] +tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] +tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] +tmp_dst[383:320] := a[383:320] +tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] +tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] +tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] +tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] +tmp_dst[511:448] := a[511:448] +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +tmp_dst[255:192] := a[255:192] +tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] +tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] +tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] +tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] +tmp_dst[383:320] := a[383:320] +tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] +tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] +tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] +tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] +tmp_dst[511:448] := a[511:448] +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst". + +dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +dst[127:64] := a[127:64] +dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +dst[255:192] := a[255:192] +dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] +dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] +dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] +dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] +dst[383:320] := a[383:320] +dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] +dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] +dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] +dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] +dst[511:448] := a[511:448] +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
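The low-half unpacks follow the same per-lane pattern as the high-half ones. A short sketch (again an illustrative, invented demo against the assumed core::arch::x86_64 bindings) of the write-masked 16-bit form, showing masked-off words falling back to "src":

#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn mask_unpacklo_words_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let a: [i16; 32] = core::array::from_fn(|i| i as i16);        // 0, 1, ...
        let b: [i16; 32] = core::array::from_fn(|i| i as i16 + 100);  // 100, 101, ...
        let src = _mm512_set1_epi16(-1);
        let k: __mmask32 = 0x0000_00FF; // keep only the first eight result words
        let r = _mm512_mask_unpacklo_epi16(
            src,
            k,
            _mm512_loadu_epi16(a.as_ptr()),
            _mm512_loadu_epi16(b.as_ptr()),
        );
        let mut out = [0i16; 32];
        _mm512_storeu_epi16(out.as_mut_ptr(), r);
        assert_eq!(&out[..4], &[0, 100, 1, 101]); // interleaved low words of lane 0
        assert_eq!(out[8], -1);                   // mask bit clear -> copied from src
    }
}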
+ + + + + + Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Load +
+ + + + + Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Load +
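Both masked 16-bit loads above read per element and then either merge with "src" or zero. A small sketch (hypothetical demo name, assumed bindings) of the write-masked load over a fully in-bounds buffer, with the mask deciding which words replace "src":

#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn mask_loadu_words_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let buf: [i16; 32] = core::array::from_fn(|i| i as i16);
        let src = _mm512_set1_epi16(-1);
        let k: __mmask32 = 0x0000_FFFF; // take only the first 16 words from memory
        let v = _mm512_mask_loadu_epi16(src, k, buf.as_ptr());
        let mut out = [0i16; 32];
        _mm512_storeu_epi16(out.as_mut_ptr(), v);
        assert_eq!(out[15], 15); // loaded from buf
        assert_eq!(out[16], -1); // mask bit clear -> kept from src
    }
}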
+ + + + + + Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Load +
+ + + + + Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 32 packed 16-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 64 packed 8-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Load +
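The plain loadu forms differ from the aligned 512-bit loads only in dropping the 64-byte alignment requirement. A sketch (assumed bindings, invented name) loading from an address that is deliberately one byte past the start of an array:

#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn loadu_unaligned_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let buf: [i8; 65] = core::array::from_fn(|i| i as i8);
        // `mem_addr` may be arbitrarily aligned, so an odd offset is fine.
        let v = _mm512_loadu_epi8(buf.as_ptr().add(1));
        let mut out = [0i8; 64];
        _mm512_storeu_epi8(out.as_mut_ptr(), v);
        assert_eq!(out[0], 1);
        assert_eq!(out[63], 64);
    }
}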
+ + + + Load 32-bit mask from memory into "k". + +k[31:0] := MEM[mem_addr+31:mem_addr] + + + AVX512BW +
immintrin.h
+ Load +
+ + + + Load 64-bit mask from memory into "k". + +k[63:0] := MEM[mem_addr+63:mem_addr] + + + AVX512BW +
immintrin.h
+ Load +
+ + + + + + Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Move +
+ + + + + Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Move +
+ + + + + + Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Move +
+ + + + + Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Move +
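The mask_mov/maskz_mov entries are the register-to-register form of the same masking step: a per-element blend with "src" or with zero. A sketch of the 16-bit blend (assumed bindings, invented name):

#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn mask_mov_words_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let src = _mm512_set1_epi16(0);
        let a = _mm512_set1_epi16(7);
        let k: __mmask32 = 0xF0F0_F0F0; // pick `a` where the bit is set, else `src`
        let r = _mm512_mask_mov_epi16(src, k, a);
        let mut out = [0i16; 32];
        _mm512_storeu_epi16(out.as_mut_ptr(), r);
        assert_eq!(out[0], 0); // bit 0 of k is clear
        assert_eq!(out[4], 7); // bit 4 of k is set
    }
}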
+ + + + + + Store packed 16-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 31 + i := j*16 + IF k[j] + MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] + FI +ENDFOR + + + AVX512BW +
immintrin.h
+ Store +
+ + + + + + Store packed 8-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 63 + i := j*8 + IF k[j] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + FI +ENDFOR + + + AVX512BW +
immintrin.h
+ Store +
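The masked stores leave memory untouched wherever the mask bit is clear, which is what makes them useful for writing partial results in place. A sketch with the byte variant (assumed bindings, invented name):

#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn mask_storeu_bytes_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let mut out = [-1i8; 64];                // pre-filled sentinel values
        let a: [i8; 64] = core::array::from_fn(|i| i as i8);
        let k: __mmask64 = 0xFF;                 // store only the first 8 bytes
        _mm512_mask_storeu_epi8(out.as_mut_ptr(), k, _mm512_loadu_epi8(a.as_ptr()));
        assert_eq!(out[7], 7);  // written
        assert_eq!(out[8], -1); // untouched: its mask bit was clear
    }
}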
+ + + + + Store 512-bits (composed of 32 packed 16-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512BW +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 64 packed 8-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512BW +
immintrin.h
+ Store +
+ + + + + Store 32-bit mask from "a" into memory. + +MEM[mem_addr+31:mem_addr] := a[31:0] + + + AVX512BW +
immintrin.h
+ Store +
+ + + + + Store 64-bit mask from "a" into memory. + +MEM[mem_addr+63:mem_addr] := a[63:0] + + + AVX512BW +
immintrin.h
+ Store +
+ + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := ABS(a[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := ABS(a[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := ABS(a[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ABS(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ABS(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ABS(a[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
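The abs entries store the magnitude as an unsigned value, so the one corner case is the most negative input. A sketch of the unmasked byte form (assumed bindings, invented name):

#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn abs_bytes_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let a: [i8; 64] = core::array::from_fn(|i| -(i as i8)); // 0, -1, -2, ...
        let r = _mm512_abs_epi8(_mm512_loadu_epi8(a.as_ptr()));
        let mut out = [0i8; 64];
        _mm512_storeu_epi8(out.as_mut_ptr(), r);
        assert_eq!(out[10], 10); // |-10| == 10
        // Note: an input of i8::MIN comes back as the byte 0x80, i.e. 128 when
        // the result is read as unsigned, matching the "unsigned results" wording.
    }
}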
+ + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := a[i+7:i] + b[i+7:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + b[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + b[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
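The difference between add_epi8 and adds_epi8 above is wrap-around versus signed saturation, which the pseudocode expresses as Saturate8. A side-by-side sketch (assumed bindings, invented name):

#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn add_vs_adds_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let a = _mm512_set1_epi8(100);
        let b = _mm512_set1_epi8(100);
        let wrapped = _mm512_add_epi8(a, b);  // 200 wraps to -56 in i8
        let clamped = _mm512_adds_epi8(a, b); // saturates at i8::MAX
        let mut w = [0i8; 64];
        let mut c = [0i8; 64];
        _mm512_storeu_epi8(w.as_mut_ptr(), wrapped);
        _mm512_storeu_epi8(c.as_mut_ptr(), clamped);
        assert_eq!(w[0], -56);
        assert_eq!(c[0], 127);
    }
}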
+ + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
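The unsigned saturating adds clamp at the type's maximum instead of wrapping, which is handy for accumulators or pixel values that must not overflow. A 16-bit sketch (assumed bindings, invented name; the u16-as-i16 casts are just to fit the intrinsic's signed element type):

#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn adds_epu16_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let a = _mm512_set1_epi16(60000u16 as i16);
        let b = _mm512_set1_epi16(20000);
        let r = _mm512_adds_epu16(a, b); // 80000 saturates to 65535
        let mut out = [0i16; 32];
        _mm512_storeu_epi16(out.as_mut_ptr(), r);
        assert_eq!(out[0] as u16, u16::MAX);
    }
}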
+ + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := a[i+15:i] + b[i+15:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + b[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + b[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
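The avg entries compute a rounding average, i.e. (a + b + 1) >> 1 in a widened temporary, so ties are biased upward. Sketch (assumed bindings, invented name):

#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn avg_bytes_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let r = _mm512_avg_epu8(_mm512_set1_epi8(10), _mm512_set1_epi8(13));
        let mut out = [0i8; 64];
        _mm512_storeu_epi8(out.as_mut_ptr(), r);
        assert_eq!(out[0], 12); // (10 + 13 + 1) >> 1
    }
}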
+ + + + + Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
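maddubs_epi16 treats "a" as unsigned bytes and "b" as signed bytes, multiplies element-wise, and adds each adjacent pair into a saturated 16-bit result; the saturation only matters when both products are near the i16 limits. Sketch (assumed bindings, invented name):

#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn maddubs_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let a = _mm512_set1_epi8(10); // interpreted as unsigned 10
        let b = _mm512_set1_epi8(3);  // interpreted as signed 3
        let r = _mm512_maddubs_epi16(a, b);
        let mut out = [0i16; 32];
        _mm512_storeu_epi16(out.as_mut_ptr(), r);
        assert_eq!(out[0], 10 * 3 + 10 * 3); // two byte products per 16-bit word
    }
}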
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
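madd_epi16 is the 16-to-32-bit analogue: each 32-bit output is the sum of two adjacent signed 16x16 products, the usual building block for integer dot products. Sketch (assumed bindings, invented name):

#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn madd_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let a = _mm512_set1_epi16(3);
        let b = _mm512_set1_epi16(4);
        let r = _mm512_madd_epi16(a, b);
        let mut out = [0i32; 16];
        _mm512_storeu_epi32(out.as_mut_ptr(), r);
        assert_eq!(out[0], 3 * 4 + 3 * 4); // pair of products per 32-bit lane
    }
}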
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
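A common use of the packed min/max pairs above is branch-free clamping: min(max(x, lo), hi). Sketch with signed 16-bit elements (assumed bindings, invented name):

#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn clamp_words_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let x: [i16; 32] = core::array::from_fn(|i| i as i16 * 300 - 600); // -600, -300, 0, ...
        let v = _mm512_loadu_epi16(x.as_ptr());
        // Clamp every element into [0, 1000] without branches.
        let clamped = _mm512_min_epi16(
            _mm512_max_epi16(v, _mm512_set1_epi16(0)),
            _mm512_set1_epi16(1000),
        );
        let mut out = [0i16; 32];
        _mm512_storeu_epi16(out.as_mut_ptr(), clamped);
        assert_eq!(out[0], 0);    // -600 clamped up
        assert_eq!(out[4], 600);  // in range, unchanged
        assert_eq!(out[6], 1000); // 1200 clamped down
    }
}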
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
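mulhrs_epi16 is effectively a rounded Q15 fixed-point multiply: the ">> 14", "+ 1", "[16:1]" steps in the pseudocode amount to round(a * b / 2^15). Sketch (assumed bindings, invented name):

#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn mulhrs_q15_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let half = _mm512_set1_epi16(16384);   // 0.5 in Q15
        let quarter = _mm512_set1_epi16(8192); // 0.25 in Q15
        let r = _mm512_mulhrs_epi16(half, quarter);
        let mut out = [0i16; 32];
        _mm512_storeu_epi16(out.as_mut_ptr(), r);
        assert_eq!(out[0], 4096); // 0.125 in Q15
    }
}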
+ + + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
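mulhi_* and mullo_epi16 return the two halves of the full 32-bit product, so combining them reconstructs the exact result. Sketch for the signed case (assumed bindings, invented name):

#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn full_product_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let a = _mm512_set1_epi16(1234);
        let b = _mm512_set1_epi16(567);
        let lo = _mm512_mullo_epi16(a, b); // low 16 bits of each product
        let hi = _mm512_mulhi_epi16(a, b); // high 16 bits of each product
        let mut lo_w = [0i16; 32];
        let mut hi_w = [0i16; 32];
        _mm512_storeu_epi16(lo_w.as_mut_ptr(), lo);
        _mm512_storeu_epi16(hi_w.as_mut_ptr(), hi);
        let full = ((hi_w[0] as i32) << 16) | (lo_w[0] as u16 as i32);
        assert_eq!(full, 1234 * 567);
    }
}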
+ + + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := a[i+7:i] - b[i+7:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
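subs_epu8 clamps at zero instead of wrapping, the classic "difference or zero" primitive used for thresholding and the like. Sketch (assumed bindings, invented name; the u8-as-i8 cast only reinterprets the byte for the signed element type):

#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn subs_epu8_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let mut a = [0i8; 64];
        let mut b = [0i8; 64];
        a[0] = 30;          b[0] = 100; // 30 - 100 saturates to 0
        a[1] = 200u8 as i8; b[1] = 50;  // 200 - 50 = 150
        let r = _mm512_subs_epu8(
            _mm512_loadu_epi8(a.as_ptr()),
            _mm512_loadu_epi8(b.as_ptr()),
        );
        let mut out = [0i8; 64];
        _mm512_storeu_epi8(out.as_mut_ptr(), r);
        assert_eq!(out[0], 0);
        assert_eq!(out[1] as u8, 150);
    }
}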
+ + + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := a[i+15:i] - b[i+15:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +tmp_dst[143:128] := Saturate16(a[159:128]) +tmp_dst[159:144] := Saturate16(a[191:160]) +tmp_dst[175:160] := Saturate16(a[223:192]) +tmp_dst[191:176] := Saturate16(a[255:224]) +tmp_dst[207:192] := Saturate16(b[159:128]) +tmp_dst[223:208] := Saturate16(b[191:160]) +tmp_dst[239:224] := Saturate16(b[223:192]) +tmp_dst[255:240] := Saturate16(b[255:224]) +tmp_dst[271:256] := Saturate16(a[287:256]) +tmp_dst[287:272] := Saturate16(a[319:288]) +tmp_dst[303:288] := Saturate16(a[351:320]) +tmp_dst[319:304] := Saturate16(a[383:352]) +tmp_dst[335:320] := Saturate16(b[287:256]) +tmp_dst[351:336] := Saturate16(b[319:288]) +tmp_dst[367:352] := Saturate16(b[351:320]) +tmp_dst[383:368] := Saturate16(b[383:352]) +tmp_dst[399:384] := Saturate16(a[415:384]) +tmp_dst[415:400] := Saturate16(a[447:416]) +tmp_dst[431:416] := Saturate16(a[479:448]) +tmp_dst[447:432] := Saturate16(a[511:480]) +tmp_dst[463:448] := Saturate16(b[415:384]) +tmp_dst[479:464] := Saturate16(b[447:416]) +tmp_dst[495:480] := Saturate16(b[479:448]) +tmp_dst[511:496] := Saturate16(b[511:480]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +tmp_dst[143:128] := Saturate16(a[159:128]) +tmp_dst[159:144] := Saturate16(a[191:160]) +tmp_dst[175:160] := Saturate16(a[223:192]) +tmp_dst[191:176] := Saturate16(a[255:224]) +tmp_dst[207:192] := Saturate16(b[159:128]) +tmp_dst[223:208] := Saturate16(b[191:160]) +tmp_dst[239:224] := Saturate16(b[223:192]) +tmp_dst[255:240] := Saturate16(b[255:224]) +tmp_dst[271:256] := Saturate16(a[287:256]) +tmp_dst[287:272] := Saturate16(a[319:288]) +tmp_dst[303:288] := Saturate16(a[351:320]) +tmp_dst[319:304] := Saturate16(a[383:352]) +tmp_dst[335:320] := Saturate16(b[287:256]) +tmp_dst[351:336] := Saturate16(b[319:288]) +tmp_dst[367:352] := Saturate16(b[351:320]) +tmp_dst[383:368] := Saturate16(b[383:352]) +tmp_dst[399:384] := Saturate16(a[415:384]) +tmp_dst[415:400] := Saturate16(a[447:416]) +tmp_dst[431:416] := Saturate16(a[479:448]) +tmp_dst[447:432] := Saturate16(a[511:480]) +tmp_dst[463:448] := Saturate16(b[415:384]) +tmp_dst[479:464] := Saturate16(b[447:416]) +tmp_dst[495:480] := Saturate16(b[479:448]) +tmp_dst[511:496] := Saturate16(b[511:480]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". + +dst[15:0] := Saturate16(a[31:0]) +dst[31:16] := Saturate16(a[63:32]) +dst[47:32] := Saturate16(a[95:64]) +dst[63:48] := Saturate16(a[127:96]) +dst[79:64] := Saturate16(b[31:0]) +dst[95:80] := Saturate16(b[63:32]) +dst[111:96] := Saturate16(b[95:64]) +dst[127:112] := Saturate16(b[127:96]) +dst[143:128] := Saturate16(a[159:128]) +dst[159:144] := Saturate16(a[191:160]) +dst[175:160] := Saturate16(a[223:192]) +dst[191:176] := Saturate16(a[255:224]) +dst[207:192] := Saturate16(b[159:128]) +dst[223:208] := Saturate16(b[191:160]) +dst[239:224] := Saturate16(b[223:192]) +dst[255:240] := Saturate16(b[255:224]) +dst[271:256] := Saturate16(a[287:256]) +dst[287:272] := Saturate16(a[319:288]) +dst[303:288] := Saturate16(a[351:320]) +dst[319:304] := Saturate16(a[383:352]) +dst[335:320] := Saturate16(b[287:256]) +dst[351:336] := Saturate16(b[319:288]) +dst[367:352] := Saturate16(b[351:320]) +dst[383:368] := Saturate16(b[383:352]) +dst[399:384] := Saturate16(a[415:384]) +dst[415:400] := Saturate16(a[447:416]) +dst[431:416] := Saturate16(a[479:448]) +dst[447:432] := Saturate16(a[511:480]) +dst[463:448] := Saturate16(b[415:384]) +dst[479:464] := Saturate16(b[447:416]) +dst[495:480] := Saturate16(b[479:448]) +dst[511:496] := Saturate16(b[511:480]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +tmp_dst[135:128] := Saturate8(a[143:128]) +tmp_dst[143:136] := Saturate8(a[159:144]) +tmp_dst[151:144] := Saturate8(a[175:160]) +tmp_dst[159:152] := Saturate8(a[191:176]) +tmp_dst[167:160] := Saturate8(a[207:192]) +tmp_dst[175:168] := Saturate8(a[223:208]) +tmp_dst[183:176] := Saturate8(a[239:224]) +tmp_dst[191:184] := Saturate8(a[255:240]) +tmp_dst[199:192] := Saturate8(b[143:128]) +tmp_dst[207:200] := Saturate8(b[159:144]) +tmp_dst[215:208] := Saturate8(b[175:160]) +tmp_dst[223:216] := Saturate8(b[191:176]) +tmp_dst[231:224] := Saturate8(b[207:192]) +tmp_dst[239:232] := Saturate8(b[223:208]) +tmp_dst[247:240] := Saturate8(b[239:224]) +tmp_dst[255:248] := Saturate8(b[255:240]) +tmp_dst[263:256] := Saturate8(a[271:256]) +tmp_dst[271:264] := Saturate8(a[287:272]) +tmp_dst[279:272] := Saturate8(a[303:288]) +tmp_dst[287:280] := Saturate8(a[319:304]) +tmp_dst[295:288] := Saturate8(a[335:320]) +tmp_dst[303:296] := Saturate8(a[351:336]) +tmp_dst[311:304] := Saturate8(a[367:352]) +tmp_dst[319:312] := Saturate8(a[383:368]) +tmp_dst[327:320] := Saturate8(b[271:256]) +tmp_dst[335:328] := Saturate8(b[287:272]) +tmp_dst[343:336] := Saturate8(b[303:288]) +tmp_dst[351:344] := Saturate8(b[319:304]) +tmp_dst[359:352] := Saturate8(b[335:320]) +tmp_dst[367:360] := Saturate8(b[351:336]) +tmp_dst[375:368] := Saturate8(b[367:352]) +tmp_dst[383:376] := Saturate8(b[383:368]) +tmp_dst[391:384] := Saturate8(a[399:384]) +tmp_dst[399:392] := Saturate8(a[415:400]) +tmp_dst[407:400] := Saturate8(a[431:416]) +tmp_dst[415:408] := Saturate8(a[447:432]) +tmp_dst[423:416] := Saturate8(a[463:448]) +tmp_dst[431:424] := Saturate8(a[479:464]) +tmp_dst[439:432] := Saturate8(a[495:480]) +tmp_dst[447:440] := Saturate8(a[511:496]) +tmp_dst[455:448] := Saturate8(b[399:384]) +tmp_dst[463:456] := Saturate8(b[415:400]) +tmp_dst[471:464] := Saturate8(b[431:416]) +tmp_dst[479:472] := Saturate8(b[447:432]) +tmp_dst[487:480] := Saturate8(b[463:448]) +tmp_dst[495:488] := Saturate8(b[479:464]) +tmp_dst[503:496] := Saturate8(b[495:480]) +tmp_dst[511:504] := Saturate8(b[511:496]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +tmp_dst[135:128] := Saturate8(a[143:128]) +tmp_dst[143:136] := Saturate8(a[159:144]) +tmp_dst[151:144] := Saturate8(a[175:160]) +tmp_dst[159:152] := Saturate8(a[191:176]) +tmp_dst[167:160] := Saturate8(a[207:192]) +tmp_dst[175:168] := Saturate8(a[223:208]) +tmp_dst[183:176] := Saturate8(a[239:224]) +tmp_dst[191:184] := Saturate8(a[255:240]) +tmp_dst[199:192] := Saturate8(b[143:128]) +tmp_dst[207:200] := Saturate8(b[159:144]) +tmp_dst[215:208] := Saturate8(b[175:160]) +tmp_dst[223:216] := Saturate8(b[191:176]) +tmp_dst[231:224] := Saturate8(b[207:192]) +tmp_dst[239:232] := Saturate8(b[223:208]) +tmp_dst[247:240] := Saturate8(b[239:224]) +tmp_dst[255:248] := Saturate8(b[255:240]) +tmp_dst[263:256] := Saturate8(a[271:256]) +tmp_dst[271:264] := Saturate8(a[287:272]) +tmp_dst[279:272] := Saturate8(a[303:288]) +tmp_dst[287:280] := Saturate8(a[319:304]) +tmp_dst[295:288] := Saturate8(a[335:320]) +tmp_dst[303:296] := Saturate8(a[351:336]) +tmp_dst[311:304] := Saturate8(a[367:352]) +tmp_dst[319:312] := Saturate8(a[383:368]) +tmp_dst[327:320] := Saturate8(b[271:256]) +tmp_dst[335:328] := Saturate8(b[287:272]) +tmp_dst[343:336] := Saturate8(b[303:288]) +tmp_dst[351:344] := Saturate8(b[319:304]) +tmp_dst[359:352] := Saturate8(b[335:320]) +tmp_dst[367:360] := Saturate8(b[351:336]) +tmp_dst[375:368] := Saturate8(b[367:352]) +tmp_dst[383:376] := Saturate8(b[383:368]) +tmp_dst[391:384] := Saturate8(a[399:384]) +tmp_dst[399:392] := Saturate8(a[415:400]) +tmp_dst[407:400] := Saturate8(a[431:416]) +tmp_dst[415:408] := Saturate8(a[447:432]) +tmp_dst[423:416] := Saturate8(a[463:448]) +tmp_dst[431:424] := Saturate8(a[479:464]) +tmp_dst[439:432] := Saturate8(a[495:480]) +tmp_dst[447:440] := Saturate8(a[511:496]) +tmp_dst[455:448] := Saturate8(b[399:384]) +tmp_dst[463:456] := Saturate8(b[415:400]) +tmp_dst[471:464] := Saturate8(b[431:416]) +tmp_dst[479:472] := Saturate8(b[447:432]) +tmp_dst[487:480] := Saturate8(b[463:448]) +tmp_dst[495:488] := Saturate8(b[479:464]) +tmp_dst[503:496] := Saturate8(b[495:480]) +tmp_dst[511:504] := Saturate8(b[511:496]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". + +dst[7:0] := Saturate8(a[15:0]) +dst[15:8] := Saturate8(a[31:16]) +dst[23:16] := Saturate8(a[47:32]) +dst[31:24] := Saturate8(a[63:48]) +dst[39:32] := Saturate8(a[79:64]) +dst[47:40] := Saturate8(a[95:80]) +dst[55:48] := Saturate8(a[111:96]) +dst[63:56] := Saturate8(a[127:112]) +dst[71:64] := Saturate8(b[15:0]) +dst[79:72] := Saturate8(b[31:16]) +dst[87:80] := Saturate8(b[47:32]) +dst[95:88] := Saturate8(b[63:48]) +dst[103:96] := Saturate8(b[79:64]) +dst[111:104] := Saturate8(b[95:80]) +dst[119:112] := Saturate8(b[111:96]) +dst[127:120] := Saturate8(b[127:112]) +dst[135:128] := Saturate8(a[143:128]) +dst[143:136] := Saturate8(a[159:144]) +dst[151:144] := Saturate8(a[175:160]) +dst[159:152] := Saturate8(a[191:176]) +dst[167:160] := Saturate8(a[207:192]) +dst[175:168] := Saturate8(a[223:208]) +dst[183:176] := Saturate8(a[239:224]) +dst[191:184] := Saturate8(a[255:240]) +dst[199:192] := Saturate8(b[143:128]) +dst[207:200] := Saturate8(b[159:144]) +dst[215:208] := Saturate8(b[175:160]) +dst[223:216] := Saturate8(b[191:176]) +dst[231:224] := Saturate8(b[207:192]) +dst[239:232] := Saturate8(b[223:208]) +dst[247:240] := Saturate8(b[239:224]) +dst[255:248] := Saturate8(b[255:240]) +dst[263:256] := Saturate8(a[271:256]) +dst[271:264] := Saturate8(a[287:272]) +dst[279:272] := Saturate8(a[303:288]) +dst[287:280] := Saturate8(a[319:304]) +dst[295:288] := Saturate8(a[335:320]) +dst[303:296] := Saturate8(a[351:336]) +dst[311:304] := Saturate8(a[367:352]) +dst[319:312] := Saturate8(a[383:368]) +dst[327:320] := Saturate8(b[271:256]) +dst[335:328] := Saturate8(b[287:272]) +dst[343:336] := Saturate8(b[303:288]) +dst[351:344] := Saturate8(b[319:304]) +dst[359:352] := Saturate8(b[335:320]) +dst[367:360] := Saturate8(b[351:336]) +dst[375:368] := Saturate8(b[367:352]) +dst[383:376] := Saturate8(b[383:368]) +dst[391:384] := Saturate8(a[399:384]) +dst[399:392] := Saturate8(a[415:400]) +dst[407:400] := Saturate8(a[431:416]) +dst[415:408] := Saturate8(a[447:432]) +dst[423:416] := Saturate8(a[463:448]) +dst[431:424] := Saturate8(a[479:464]) +dst[439:432] := Saturate8(a[495:480]) +dst[447:440] := Saturate8(a[511:496]) +dst[455:448] := Saturate8(b[399:384]) +dst[463:456] := Saturate8(b[415:400]) +dst[471:464] := Saturate8(b[431:416]) +dst[479:472] := Saturate8(b[447:432]) +dst[487:480] := Saturate8(b[463:448]) +dst[495:488] := Saturate8(b[479:464]) +dst[503:496] := Saturate8(b[495:480]) +dst[511:504] := Saturate8(b[511:496]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
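+<!-- Editor note (illustrative sketch, not part of the Intel data): the three entries above
+     describe the unmasked, write-masked and zero-masked forms of the 512-bit signed
+     16-to-8-bit saturating pack. Assuming they map to _mm512_packs_epi16,
+     _mm512_mask_packs_epi16 and _mm512_maskz_packs_epi16 from immintrin.h (names inferred,
+     not stated in this data), a minimal C usage sketch (compile with -mavx512bw):
+
+     #include <immintrin.h>
+     // Pack 16-bit lanes of a and b into 8-bit lanes with signed saturation;
+     // elements whose bit in k is clear are taken from src (writemask form).
+     __m512i pack_s16_to_s8_masked(__m512i src, __mmask64 k, __m512i a, __m512i b) {
+         return _mm512_mask_packs_epi16(src, k, a, b);
+     }
+     // _mm512_packs_epi16(a, b) and _mm512_maskz_packs_epi16(k, a, b) are the unmasked
+     // and zero-masking forms; note the per-128-bit-lane interleaving of a and b above.
+-->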
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +tmp_dst[143:128] := SaturateU16(a[159:128]) +tmp_dst[159:144] := SaturateU16(a[191:160]) +tmp_dst[175:160] := SaturateU16(a[223:192]) +tmp_dst[191:176] := SaturateU16(a[255:224]) +tmp_dst[207:192] := SaturateU16(b[159:128]) +tmp_dst[223:208] := SaturateU16(b[191:160]) +tmp_dst[239:224] := SaturateU16(b[223:192]) +tmp_dst[255:240] := SaturateU16(b[255:224]) +tmp_dst[271:256] := SaturateU16(a[287:256]) +tmp_dst[287:272] := SaturateU16(a[319:288]) +tmp_dst[303:288] := SaturateU16(a[351:320]) +tmp_dst[319:304] := SaturateU16(a[383:352]) +tmp_dst[335:320] := SaturateU16(b[287:256]) +tmp_dst[351:336] := SaturateU16(b[319:288]) +tmp_dst[367:352] := SaturateU16(b[351:320]) +tmp_dst[383:368] := SaturateU16(b[383:352]) +tmp_dst[399:384] := SaturateU16(a[415:384]) +tmp_dst[415:400] := SaturateU16(a[447:416]) +tmp_dst[431:416] := SaturateU16(a[479:448]) +tmp_dst[447:432] := SaturateU16(a[511:480]) +tmp_dst[463:448] := SaturateU16(b[415:384]) +tmp_dst[479:464] := SaturateU16(b[447:416]) +tmp_dst[495:480] := SaturateU16(b[479:448]) +tmp_dst[511:496] := SaturateU16(b[511:480]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +tmp_dst[143:128] := SaturateU16(a[159:128]) +tmp_dst[159:144] := SaturateU16(a[191:160]) +tmp_dst[175:160] := SaturateU16(a[223:192]) +tmp_dst[191:176] := SaturateU16(a[255:224]) +tmp_dst[207:192] := SaturateU16(b[159:128]) +tmp_dst[223:208] := SaturateU16(b[191:160]) +tmp_dst[239:224] := SaturateU16(b[223:192]) +tmp_dst[255:240] := SaturateU16(b[255:224]) +tmp_dst[271:256] := SaturateU16(a[287:256]) +tmp_dst[287:272] := SaturateU16(a[319:288]) +tmp_dst[303:288] := SaturateU16(a[351:320]) +tmp_dst[319:304] := SaturateU16(a[383:352]) +tmp_dst[335:320] := SaturateU16(b[287:256]) +tmp_dst[351:336] := SaturateU16(b[319:288]) +tmp_dst[367:352] := SaturateU16(b[351:320]) +tmp_dst[383:368] := SaturateU16(b[383:352]) +tmp_dst[399:384] := SaturateU16(a[415:384]) +tmp_dst[415:400] := SaturateU16(a[447:416]) +tmp_dst[431:416] := SaturateU16(a[479:448]) +tmp_dst[447:432] := SaturateU16(a[511:480]) +tmp_dst[463:448] := SaturateU16(b[415:384]) +tmp_dst[479:464] := SaturateU16(b[447:416]) +tmp_dst[495:480] := SaturateU16(b[479:448]) +tmp_dst[511:496] := SaturateU16(b[511:480]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". + +dst[15:0] := SaturateU16(a[31:0]) +dst[31:16] := SaturateU16(a[63:32]) +dst[47:32] := SaturateU16(a[95:64]) +dst[63:48] := SaturateU16(a[127:96]) +dst[79:64] := SaturateU16(b[31:0]) +dst[95:80] := SaturateU16(b[63:32]) +dst[111:96] := SaturateU16(b[95:64]) +dst[127:112] := SaturateU16(b[127:96]) +dst[143:128] := SaturateU16(a[159:128]) +dst[159:144] := SaturateU16(a[191:160]) +dst[175:160] := SaturateU16(a[223:192]) +dst[191:176] := SaturateU16(a[255:224]) +dst[207:192] := SaturateU16(b[159:128]) +dst[223:208] := SaturateU16(b[191:160]) +dst[239:224] := SaturateU16(b[223:192]) +dst[255:240] := SaturateU16(b[255:224]) +dst[271:256] := SaturateU16(a[287:256]) +dst[287:272] := SaturateU16(a[319:288]) +dst[303:288] := SaturateU16(a[351:320]) +dst[319:304] := SaturateU16(a[383:352]) +dst[335:320] := SaturateU16(b[287:256]) +dst[351:336] := SaturateU16(b[319:288]) +dst[367:352] := SaturateU16(b[351:320]) +dst[383:368] := SaturateU16(b[383:352]) +dst[399:384] := SaturateU16(a[415:384]) +dst[415:400] := SaturateU16(a[447:416]) +dst[431:416] := SaturateU16(a[479:448]) +dst[447:432] := SaturateU16(a[511:480]) +dst[463:448] := SaturateU16(b[415:384]) +dst[479:464] := SaturateU16(b[447:416]) +dst[495:480] := SaturateU16(b[479:448]) +dst[511:496] := SaturateU16(b[511:480]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
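+<!-- Editor note (illustrative sketch, not part of the Intel data): assuming the 32-to-16-bit
+     unsigned-saturating pack entries above correspond to _mm512_packus_epi32 and its
+     mask/maskz forms (names inferred), a minimal C sketch:
+
+     #include <immintrin.h>
+     // Pack signed 32-bit lanes of a and b into unsigned 16-bit lanes, clamping to [0, 65535].
+     __m512i pack_s32_to_u16(__m512i a, __m512i b) {
+         return _mm512_packus_epi32(a, b);
+     }
+-->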
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +tmp_dst[135:128] := SaturateU8(a[143:128]) +tmp_dst[143:136] := SaturateU8(a[159:144]) +tmp_dst[151:144] := SaturateU8(a[175:160]) +tmp_dst[159:152] := SaturateU8(a[191:176]) +tmp_dst[167:160] := SaturateU8(a[207:192]) +tmp_dst[175:168] := SaturateU8(a[223:208]) +tmp_dst[183:176] := SaturateU8(a[239:224]) +tmp_dst[191:184] := SaturateU8(a[255:240]) +tmp_dst[199:192] := SaturateU8(b[143:128]) +tmp_dst[207:200] := SaturateU8(b[159:144]) +tmp_dst[215:208] := SaturateU8(b[175:160]) +tmp_dst[223:216] := SaturateU8(b[191:176]) +tmp_dst[231:224] := SaturateU8(b[207:192]) +tmp_dst[239:232] := SaturateU8(b[223:208]) +tmp_dst[247:240] := SaturateU8(b[239:224]) +tmp_dst[255:248] := SaturateU8(b[255:240]) +tmp_dst[263:256] := SaturateU8(a[271:256]) +tmp_dst[271:264] := SaturateU8(a[287:272]) +tmp_dst[279:272] := SaturateU8(a[303:288]) +tmp_dst[287:280] := SaturateU8(a[319:304]) +tmp_dst[295:288] := SaturateU8(a[335:320]) +tmp_dst[303:296] := SaturateU8(a[351:336]) +tmp_dst[311:304] := SaturateU8(a[367:352]) +tmp_dst[319:312] := SaturateU8(a[383:368]) +tmp_dst[327:320] := SaturateU8(b[271:256]) +tmp_dst[335:328] := SaturateU8(b[287:272]) +tmp_dst[343:336] := SaturateU8(b[303:288]) +tmp_dst[351:344] := SaturateU8(b[319:304]) +tmp_dst[359:352] := SaturateU8(b[335:320]) +tmp_dst[367:360] := SaturateU8(b[351:336]) +tmp_dst[375:368] := SaturateU8(b[367:352]) +tmp_dst[383:376] := SaturateU8(b[383:368]) +tmp_dst[391:384] := SaturateU8(a[399:384]) +tmp_dst[399:392] := SaturateU8(a[415:400]) +tmp_dst[407:400] := SaturateU8(a[431:416]) +tmp_dst[415:408] := SaturateU8(a[447:432]) +tmp_dst[423:416] := SaturateU8(a[463:448]) +tmp_dst[431:424] := SaturateU8(a[479:464]) +tmp_dst[439:432] := SaturateU8(a[495:480]) +tmp_dst[447:440] := SaturateU8(a[511:496]) +tmp_dst[455:448] := SaturateU8(b[399:384]) +tmp_dst[463:456] := SaturateU8(b[415:400]) +tmp_dst[471:464] := SaturateU8(b[431:416]) +tmp_dst[479:472] := SaturateU8(b[447:432]) +tmp_dst[487:480] := SaturateU8(b[463:448]) +tmp_dst[495:488] := SaturateU8(b[479:464]) +tmp_dst[503:496] := SaturateU8(b[495:480]) +tmp_dst[511:504] := SaturateU8(b[511:496]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +tmp_dst[135:128] := SaturateU8(a[143:128]) +tmp_dst[143:136] := SaturateU8(a[159:144]) +tmp_dst[151:144] := SaturateU8(a[175:160]) +tmp_dst[159:152] := SaturateU8(a[191:176]) +tmp_dst[167:160] := SaturateU8(a[207:192]) +tmp_dst[175:168] := SaturateU8(a[223:208]) +tmp_dst[183:176] := SaturateU8(a[239:224]) +tmp_dst[191:184] := SaturateU8(a[255:240]) +tmp_dst[199:192] := SaturateU8(b[143:128]) +tmp_dst[207:200] := SaturateU8(b[159:144]) +tmp_dst[215:208] := SaturateU8(b[175:160]) +tmp_dst[223:216] := SaturateU8(b[191:176]) +tmp_dst[231:224] := SaturateU8(b[207:192]) +tmp_dst[239:232] := SaturateU8(b[223:208]) +tmp_dst[247:240] := SaturateU8(b[239:224]) +tmp_dst[255:248] := SaturateU8(b[255:240]) +tmp_dst[263:256] := SaturateU8(a[271:256]) +tmp_dst[271:264] := SaturateU8(a[287:272]) +tmp_dst[279:272] := SaturateU8(a[303:288]) +tmp_dst[287:280] := SaturateU8(a[319:304]) +tmp_dst[295:288] := SaturateU8(a[335:320]) +tmp_dst[303:296] := SaturateU8(a[351:336]) +tmp_dst[311:304] := SaturateU8(a[367:352]) +tmp_dst[319:312] := SaturateU8(a[383:368]) +tmp_dst[327:320] := SaturateU8(b[271:256]) +tmp_dst[335:328] := SaturateU8(b[287:272]) +tmp_dst[343:336] := SaturateU8(b[303:288]) +tmp_dst[351:344] := SaturateU8(b[319:304]) +tmp_dst[359:352] := SaturateU8(b[335:320]) +tmp_dst[367:360] := SaturateU8(b[351:336]) +tmp_dst[375:368] := SaturateU8(b[367:352]) +tmp_dst[383:376] := SaturateU8(b[383:368]) +tmp_dst[391:384] := SaturateU8(a[399:384]) +tmp_dst[399:392] := SaturateU8(a[415:400]) +tmp_dst[407:400] := SaturateU8(a[431:416]) +tmp_dst[415:408] := SaturateU8(a[447:432]) +tmp_dst[423:416] := SaturateU8(a[463:448]) +tmp_dst[431:424] := SaturateU8(a[479:464]) +tmp_dst[439:432] := SaturateU8(a[495:480]) +tmp_dst[447:440] := SaturateU8(a[511:496]) +tmp_dst[455:448] := SaturateU8(b[399:384]) +tmp_dst[463:456] := SaturateU8(b[415:400]) +tmp_dst[471:464] := SaturateU8(b[431:416]) +tmp_dst[479:472] := SaturateU8(b[447:432]) +tmp_dst[487:480] := SaturateU8(b[463:448]) +tmp_dst[495:488] := SaturateU8(b[479:464]) +tmp_dst[503:496] := SaturateU8(b[495:480]) +tmp_dst[511:504] := SaturateU8(b[511:496]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". + +dst[7:0] := SaturateU8(a[15:0]) +dst[15:8] := SaturateU8(a[31:16]) +dst[23:16] := SaturateU8(a[47:32]) +dst[31:24] := SaturateU8(a[63:48]) +dst[39:32] := SaturateU8(a[79:64]) +dst[47:40] := SaturateU8(a[95:80]) +dst[55:48] := SaturateU8(a[111:96]) +dst[63:56] := SaturateU8(a[127:112]) +dst[71:64] := SaturateU8(b[15:0]) +dst[79:72] := SaturateU8(b[31:16]) +dst[87:80] := SaturateU8(b[47:32]) +dst[95:88] := SaturateU8(b[63:48]) +dst[103:96] := SaturateU8(b[79:64]) +dst[111:104] := SaturateU8(b[95:80]) +dst[119:112] := SaturateU8(b[111:96]) +dst[127:120] := SaturateU8(b[127:112]) +dst[135:128] := SaturateU8(a[143:128]) +dst[143:136] := SaturateU8(a[159:144]) +dst[151:144] := SaturateU8(a[175:160]) +dst[159:152] := SaturateU8(a[191:176]) +dst[167:160] := SaturateU8(a[207:192]) +dst[175:168] := SaturateU8(a[223:208]) +dst[183:176] := SaturateU8(a[239:224]) +dst[191:184] := SaturateU8(a[255:240]) +dst[199:192] := SaturateU8(b[143:128]) +dst[207:200] := SaturateU8(b[159:144]) +dst[215:208] := SaturateU8(b[175:160]) +dst[223:216] := SaturateU8(b[191:176]) +dst[231:224] := SaturateU8(b[207:192]) +dst[239:232] := SaturateU8(b[223:208]) +dst[247:240] := SaturateU8(b[239:224]) +dst[255:248] := SaturateU8(b[255:240]) +dst[263:256] := SaturateU8(a[271:256]) +dst[271:264] := SaturateU8(a[287:272]) +dst[279:272] := SaturateU8(a[303:288]) +dst[287:280] := SaturateU8(a[319:304]) +dst[295:288] := SaturateU8(a[335:320]) +dst[303:296] := SaturateU8(a[351:336]) +dst[311:304] := SaturateU8(a[367:352]) +dst[319:312] := SaturateU8(a[383:368]) +dst[327:320] := SaturateU8(b[271:256]) +dst[335:328] := SaturateU8(b[287:272]) +dst[343:336] := SaturateU8(b[303:288]) +dst[351:344] := SaturateU8(b[319:304]) +dst[359:352] := SaturateU8(b[335:320]) +dst[367:360] := SaturateU8(b[351:336]) +dst[375:368] := SaturateU8(b[367:352]) +dst[383:376] := SaturateU8(b[383:368]) +dst[391:384] := SaturateU8(a[399:384]) +dst[399:392] := SaturateU8(a[415:400]) +dst[407:400] := SaturateU8(a[431:416]) +dst[415:408] := SaturateU8(a[447:432]) +dst[423:416] := SaturateU8(a[463:448]) +dst[431:424] := SaturateU8(a[479:464]) +dst[439:432] := SaturateU8(a[495:480]) +dst[447:440] := SaturateU8(a[511:496]) +dst[455:448] := SaturateU8(b[399:384]) +dst[463:456] := SaturateU8(b[415:400]) +dst[471:464] := SaturateU8(b[431:416]) +dst[479:472] := SaturateU8(b[447:432]) +dst[487:480] := SaturateU8(b[463:448]) +dst[495:488] := SaturateU8(b[479:464]) +dst[503:496] := SaturateU8(b[495:480]) +dst[511:504] := SaturateU8(b[511:496]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
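+<!-- Editor note (illustrative sketch, not part of the Intel data): assuming these
+     16-to-8-bit unsigned-saturating pack entries correspond to _mm512_packus_epi16 and
+     its mask/maskz forms (names inferred), a minimal C sketch:
+
+     #include <immintrin.h>
+     // Pack signed 16-bit lanes into unsigned 8-bit lanes, clamping to [0, 255];
+     // zero-masking form: elements with a clear bit in k become 0.
+     __m512i pack_s16_to_u8_z(__mmask64 k, __m512i a, __m512i b) {
+         return _mm512_maskz_packus_epi16(k, a, b);
+     }
+-->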
+ + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := 16*j + l := 8*j + dst[l+7:l] := Saturate8(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
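+<!-- Editor note (illustrative sketch, not part of the Intel data): the four entries above
+     describe the signed-saturating 16-to-8-bit narrowing conversion in register form and
+     as a masked store. Assuming _mm512_cvtsepi16_epi8 and _mm512_mask_cvtsepi16_storeu_epi8
+     (names inferred), a minimal C sketch:
+
+     #include <immintrin.h>
+     void narrow_s16_to_s8(signed char *out, __mmask32 k, __m512i a) {
+         __m256i packed = _mm512_cvtsepi16_epi8(a);        // 32 x i16 -> 32 x i8, saturated
+         (void)packed;
+         _mm512_mask_cvtsepi16_storeu_epi8(out, k, a);     // store only elements whose k bit is set
+     }
+-->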
+ + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + l := j*16 + dst[l+15:l] := SignExtend16(a[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
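+<!-- Editor note (illustrative sketch, not part of the Intel data): assuming the sign-extension
+     entries above correspond to _mm512_cvtepi8_epi16 and its mask/maskz forms (names inferred),
+     a minimal C sketch:
+
+     #include <immintrin.h>
+     // Widen 32 signed bytes to 32 signed 16-bit lanes.
+     __m512i widen_s8_to_s16(__m256i a) {
+         return _mm512_cvtepi8_epi16(a);
+     }
+-->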
+ + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := 16*j + l := 8*j + dst[l+7:l] := SaturateU8(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
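+<!-- Editor note (illustrative sketch, not part of the Intel data): assuming the
+     unsigned-saturating 16-to-8-bit entries above correspond to _mm512_cvtusepi16_epi8 and
+     _mm512_mask_cvtusepi16_storeu_epi8 (names inferred), a minimal C sketch:
+
+     #include <immintrin.h>
+     // Narrow 32 unsigned 16-bit lanes to 32 unsigned bytes, clamping to 255.
+     __m256i narrow_u16_to_u8(__m512i a) {
+         return _mm512_cvtusepi16_epi8(a);
+     }
+-->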
+ + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 31 + i := 16*j + l := 8*j + dst[l+7:l] := Truncate8(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
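+<!-- Editor note (illustrative sketch, not part of the Intel data): assuming the truncating
+     16-to-8-bit entries above correspond to _mm512_cvtepi16_epi8 and
+     _mm512_mask_cvtepi16_storeu_epi8 (names inferred), a minimal C sketch:
+
+     #include <immintrin.h>
+     // Keep only the low byte of each 16-bit lane (no saturation) and store the
+     // selected bytes to unaligned memory.
+     void truncate_i16_to_i8(char *out, __mmask32 k, __m512i a) {
+         _mm512_mask_cvtepi16_storeu_epi8(out, k, a);
+     }
+-->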
+ + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + l := j*16 + dst[l+15:l] := ZeroExtend16(a[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
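+<!-- Editor note (illustrative sketch, not part of the Intel data): assuming the zero-extension
+     entries above correspond to _mm512_cvtepu8_epi16 and its mask/maskz forms (names inferred),
+     a minimal C sketch:
+
+     #include <immintrin.h>
+     // Widen 32 unsigned bytes to 32 unsigned 16-bit lanes.
+     __m512i widen_u8_to_u16(__m256i a) {
+         return _mm512_cvtepu8_epi16(a);
+     }
+-->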
+ + + + + + Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Set +
+ + + + + Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Set +
+ + + + + + Broadcast 16-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Set +
+ + + + + Broadcast 16-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[15:0]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+
+
+ AVX512BW
+
immintrin.h
+ Set +
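+<!-- Editor note (illustrative sketch, not part of the Intel data): assuming the masked
+     broadcast entries above correspond to _mm512_mask_set1_epi8 / _mm512_maskz_set1_epi8 and
+     _mm512_mask_set1_epi16 / _mm512_maskz_set1_epi16 (names inferred), a minimal C sketch:
+
+     #include <immintrin.h>
+     // Splat the byte value x into the lanes selected by k, keeping src elsewhere.
+     __m512i splat_byte_masked(__m512i src, __mmask64 k, char x) {
+         return _mm512_mask_set1_epi8(src, k, x);
+     }
+     // _mm512_maskz_set1_epi16(k, x) does the same for 16-bit lanes, zeroing unselected lanes.
+-->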
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
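+<!-- Editor note (illustrative sketch, not part of the Intel data): the entries above describe
+     the generic imm8-controlled compare and its fixed-predicate shorthands for signed bytes,
+     with and without a zeromask. Assuming _mm512_cmp_epi8_mask and _mm512_mask_cmplt_epi8_mask
+     (names inferred), a minimal C sketch:
+
+     #include <immintrin.h>
+     __mmask64 bytes_less_than(__mmask64 k1, __m512i a, __m512i b) {
+         __mmask64 all = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_LT);  // one bit per byte lane
+         __mmask64 sub = _mm512_mask_cmplt_epi8_mask(k1, a, b);      // only lanes enabled in k1
+         return all & sub;  // identical to sub; shown only to relate the two forms
+     }
+-->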
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
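+<!-- Editor note (illustrative sketch, not part of the Intel data): same compare family as
+     above, but for unsigned bytes. Assuming _mm512_cmple_epu8_mask (name inferred):
+
+     #include <immintrin.h>
+     // One mask bit per byte lane: set when a <= b as unsigned 8-bit values.
+     __mmask64 bytes_le_unsigned(__m512i a, __m512i b) {
+         return _mm512_cmple_epu8_mask(a, b);
+     }
+-->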
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
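+<!-- Editor note (illustrative sketch, not part of the Intel data): the unsigned 16-bit
+     compares above yield a 32-bit mask (one bit per word lane). Assuming
+     _mm512_cmp_epu16_mask (name inferred):
+
+     #include <immintrin.h>
+     __mmask32 words_not_equal(__m512i a, __m512i b) {
+         return _mm512_cmp_epu16_mask(a, b, _MM_CMPINT_NE);
+     }
+-->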
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
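+<!-- Editor note (illustrative sketch, not part of the Intel data): assuming the signed 16-bit
+     compare entries above correspond to _mm512_cmpgt_epi16_mask and friends (names inferred):
+
+     #include <immintrin.h>
+     // One mask bit per 16-bit lane: set when the signed lane of a exceeds that of b.
+     __mmask32 words_greater(__m512i a, __m512i b) {
+         return _mm512_cmpgt_epi16_mask(a, b);
+     }
+-->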
+ + + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. +
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+
+
+ AVX512BW
+
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 63 + i := j*8 + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. +
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+
+
+ AVX512BW
+
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 31 + i := j*16 + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. +
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+
+
+ AVX512BW
+
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 63 + i := j*8 + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. +
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+
+
+ AVX512BW
+
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 31 + i := j*16 + k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
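+<!-- Editor note (illustrative sketch, not part of the Intel data): test sets a mask bit when
+     (a AND b) is non-zero in that lane, testn when it is zero. Assuming
+     _mm512_test_epi8_mask and _mm512_mask_testn_epi16_mask (names inferred):
+
+     #include <immintrin.h>
+     __mmask64 any_common_bits_per_byte(__m512i a, __m512i b) {
+         return _mm512_test_epi8_mask(a, b);            // bit j set if byte j of (a & b) != 0
+     }
+     __mmask32 no_common_bits_per_word(__mmask32 k1, __m512i a, __m512i b) {
+         return _mm512_mask_testn_epi16_mask(k1, a, b); // only lanes enabled in k1 are tested
+     }
+-->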
+ + + + + Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] << (tmp*8) +dst[255:128] := a[255:128] << (tmp*8) +dst[383:256] := a[383:256] << (tmp*8) +dst[511:384] := a[511:384] << (tmp*8) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
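+<!-- Editor note (illustrative sketch, not part of the Intel data): this is a byte shift that
+     operates independently on each 128-bit lane; the count is an immediate and is clamped to 16.
+     Assuming _mm512_bslli_epi128 (name inferred):
+
+     #include <immintrin.h>
+     // Shift each 128-bit lane of a left by 4 bytes, filling with zeros.
+     __m512i shift_lanes_left_4_bytes(__m512i a) {
+         return _mm512_bslli_epi128(a, 4);
+     }
+-->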
+ + + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
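+<!-- Editor note (illustrative sketch, not part of the Intel data): the entries above cover
+     three count forms for 16-bit left shifts: per-element counts (a vector), a single count in
+     the low 64 bits of an XMM register, and an immediate. Counts of 16 or more produce zero.
+     Assuming _mm512_sllv_epi16, _mm512_sll_epi16 and _mm512_slli_epi16 (names inferred):
+
+     #include <immintrin.h>
+     __m512i left_shift_examples(__m512i a, __m512i per_lane_counts) {
+         __m512i v = _mm512_sllv_epi16(a, per_lane_counts);       // independent count per lane
+         __m512i s = _mm512_sll_epi16(a, _mm_cvtsi32_si128(3));   // same count (3) for all lanes
+         __m512i i = _mm512_slli_epi16(a, 3);                     // immediate count
+         return _mm512_xor_si512(v, _mm512_xor_si512(s, i));      // combine only to use all results
+     }
+-->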
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
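The arithmetic-shift entries above all share the same edge case: once the per-element count reaches 16, every bit of the result is a copy of the element's sign bit. A minimal scalar sketch in Rust, assuming a single 16-bit lane and a hypothetical helper name (this models the pseudocode, not the intrinsic itself):

// Scalar reference model of the pseudocode above (illustrative assumption,
// not the intrinsic): arithmetic right shift of one 16-bit lane where counts
// of 16 or more saturate to "all sign bits".
fn sra16_lane(a: i16, count: u64) -> i16 {
    if count < 16 {
        a >> count // sign bits shift in because `a` is signed
    } else if a < 0 {
        -1 // 0xFFFF: every bit becomes a copy of the sign bit
    } else {
        0
    }
}

fn main() {
    assert_eq!(sra16_lane(-32768, 3), -4096);
    assert_eq!(sra16_lane(-1, 40), -1); // count > 15 keeps only the sign
    assert_eq!(sra16_lane(0x7FFF, 40), 0);
}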
+ + + + + Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] >> (tmp*8) +dst[255:128] := a[255:128] >> (tmp*8) +dst[383:256] := a[383:256] >> (tmp*8) +dst[511:384] := a[511:384] >> (tmp*8) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
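The byte-wise lane shift above clamps the immediate at 16, so any imm8 above 15 clears the whole 128-bit lane. A small sketch, under the assumption that one lane can be modelled as a u128 (illustrative helper, not the real intrinsic):

// Each 128-bit lane modelled as a u128, shifted right by whole bytes; shift
// amounts above 15 are clamped so the lane becomes zero.
fn bsrli_lane(lane: u128, imm8: u8) -> u128 {
    let bytes = (imm8 as u32).min(16); // tmp > 15 => tmp := 16
    if bytes == 16 { 0 } else { lane >> (bytes * 8) }
}

fn main() {
    assert_eq!(bsrli_lane(0x1122_3344_5566_7788_99AA_BBCC_DDEE_FF00, 15), 0x11);
    assert_eq!(bsrli_lane(u128::MAX, 20), 0); // clamped to a full-lane shift
}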
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Add 32-bit masks in "a" and "b", and store the result in "k". + +k[31:0] := a[31:0] + b[31:0] +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Add 64-bit masks in "a" and "b", and store the result in "k". + +k[63:0] := a[63:0] + b[63:0] +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 32-bit masks "a" and "b", and store the result in "k". + +k[31:0] := a[31:0] AND b[31:0] +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 64-bit masks "a" and "b", and store the result in "k". + +k[63:0] := a[63:0] AND b[63:0] +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 32-bit masks "a" and then AND with "b", and store the result in "k". + +k[31:0] := (NOT a[31:0]) AND b[31:0] +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 64-bit masks "a" and then AND with "b", and store the result in "k". + +k[63:0] := (NOT a[63:0]) AND b[63:0] +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + Compute the bitwise NOT of 32-bit mask "a", and store the result in "k". + +k[31:0] := NOT a[31:0] +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + Compute the bitwise NOT of 64-bit mask "a", and store the result in "k". + +k[63:0] := NOT a[63:0] +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 32-bit masks "a" and "b", and store the result in "k". + +k[31:0] := a[31:0] OR b[31:0] +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 64-bit masks "a" and "b", and store the result in "k". + +k[63:0] := a[63:0] OR b[63:0] +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XNOR of 32-bit masks "a" and "b", and store the result in "k". + +k[31:0] := NOT (a[31:0] XOR b[31:0]) +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XNOR of 64-bit masks "a" and "b", and store the result in "k". + +k[63:0] := NOT (a[63:0] XOR b[63:0]) +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XOR of 32-bit masks "a" and "b", and store the result in "k". + +k[31:0] := a[31:0] XOR b[31:0] +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XOR of 64-bit masks "a" and "b", and store the result in "k". + +k[63:0] := a[63:0] XOR b[63:0] +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 32-bit mask "a" left by "count" while shifting in zeros, and store the least significant 32 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 31 + k[31:0] := a[31:0] << count[7:0] +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 64-bit mask "a" left by "count" while shifting in zeros, and store the least significant 64 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 63 + k[63:0] := a[63:0] << count[7:0] +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 32-bit mask "a" right by "count" while shifting in zeros, and store the least significant 32 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 31 + k[31:0] := a[31:0] >> count[7:0] +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 64-bit mask "a" right by "count" while shifting in zeros, and store the least significant 64 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 63 + k[63:0] := a[63:0] >> count[7:0] +FI + + + AVX512BW +
immintrin.h
+ Mask +
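The mask-shift entries above consume only the low 8 bits of the count and return an all-zero mask once the count exceeds the mask width. A hedged Rust sketch of that behaviour (function names are illustrative, not the immintrin.h signatures):

// Scalar sketch of the mask-shift pseudocode: only count[7:0] is used, and
// any count past the mask width clears the mask entirely.
fn kshiftl_mask32(a: u32, count: u32) -> u32 {
    let c = count & 0xFF; // count[7:0]
    if c <= 31 { a << c } else { 0 }
}

fn kshiftr_mask32(a: u32, count: u32) -> u32 {
    let c = count & 0xFF;
    if c <= 31 { a >> c } else { 0 }
}

fn main() {
    assert_eq!(kshiftl_mask32(0x8000_0001, 1), 0x0000_0002); // top bit shifted out
    assert_eq!(kshiftr_mask32(0x8000_0001, 32), 0);          // count > 31 clears k
}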
+ + + + + + Compute the bitwise OR of 32-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". + +tmp[31:0] := a[31:0] OR b[31:0] +IF tmp[31:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +IF tmp[31:0] == 0xFFFFFFFF + MEM[all_ones+7:all_ones] := 1 +ELSE + MEM[all_ones+7:all_ones] := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 32-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[31:0] := a[31:0] OR b[31:0] +IF tmp[31:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 32-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". + +tmp[31:0] := a[31:0] OR b[31:0] +IF tmp[31:0] == 0xFFFFFFFF + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + + Compute the bitwise OR of 64-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". + +tmp[63:0] := a[63:0] OR b[63:0] +IF tmp[63:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +IF tmp[63:0] == 0xFFFFFFFFFFFFFFFF + MEM[all_ones+7:all_ones] := 1 +ELSE + MEM[all_ones+7:all_ones] := 0 +FI + + + AVX512BW +

immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 64-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[63:0] := a[63:0] OR b[63:0] +IF tmp[63:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 64-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". + +tmp[63:0] := a[63:0] OR b[63:0] +IF tmp[63:0] == 0xFFFFFFFFFFFFFFFF + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
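The kortest-style entries above derive two flags from a single OR: whether the combined mask is all zeros and whether it is all ones. A scalar reference model in Rust, assuming the by-pointer output is represented as a second return value (illustrative, not the real intrinsic):

// Hedged sketch of the 64-bit kortest pseudocode: the OR of the two masks
// drives both flags.
fn kortest_mask64(a: u64, b: u64) -> (u8, u8) {
    let tmp = a | b;
    let is_zero = (tmp == 0) as u8;          // returned in dst
    let all_ones = (tmp == u64::MAX) as u8;  // written through the out-pointer by the intrinsic
    (is_zero, all_ones)
}

fn main() {
    assert_eq!(kortest_mask64(0, 0), (1, 0));
    assert_eq!(kortest_mask64(u64::MAX, 0), (0, 1));
    assert_eq!(kortest_mask64(1, 2), (0, 0));
}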
+ + + + + + Compute the bitwise AND of 32-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b"; if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". + +tmp1[31:0] := a[31:0] AND b[31:0] +IF tmp1[31:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +tmp2[31:0] := (NOT a[31:0]) AND b[31:0] +IF tmp2[31:0] == 0x0 + MEM[and_not+7:and_not] := 1 +ELSE + MEM[and_not+7:and_not] := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 32-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". + +tmp[31:0] := a[31:0] AND b[31:0] +IF tmp[31:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 32-bit mask "a" and then AND with "b"; if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[31:0] := (NOT a[31:0]) AND b[31:0] +IF tmp[31:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + + Compute the bitwise AND of 64-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b"; if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". + +tmp1[63:0] := a[63:0] AND b[63:0] +IF tmp1[63:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +tmp2[63:0] := (NOT a[63:0]) AND b[63:0] +IF tmp2[63:0] == 0x0 + MEM[and_not+7:and_not] := 1 +ELSE + MEM[and_not+7:and_not] := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 64-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". + +tmp[63:0] := a[63:0] AND b[63:0] +IF tmp[63:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 64-bit mask "a" and then AND with "b"; if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[63:0] := (NOT a[63:0]) AND b[63:0] +IF tmp[63:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + Convert 32-bit mask "a" into an integer value, and store the result in "dst". + +dst := ZeroExtend32(a[31:0]) + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + Convert 64-bit mask "a" into an integer value, and store the result in "dst". + +dst := ZeroExtend64(a[63:0]) + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + Convert integer value "a" into a 32-bit mask, and store the result in "k". + +k := ZeroExtend32(a[31:0]) + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + Convert integer value "a" into a 64-bit mask, and store the result in "k". + +k := ZeroExtend64(a[63:0]) + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + + Broadcast the low 8-bits from input mask "k" to all 64-bit elements of "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ZeroExtend64(k[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Miscellaneous +
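The broadcast entry above simply zero-extends the low 8 bits of the mask register into every 64-bit element. A tiny sketch under that reading (illustrative helper name, not the intrinsic):

// ZeroExtend64(k[7:0]) replicated into all four 64-bit elements of a
// 256-bit destination.
fn broadcastmb_epi64_256(k: u64) -> [u64; 4] {
    let low8 = k & 0xFF;
    [low8; 4]
}

fn main() {
    assert_eq!(broadcastmb_epi64_256(0xABCD), [0xCD, 0xCD, 0xCD, 0xCD]);
}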
+ + + + Broadcast the low 8-bits from input mask "k" to all 64-bit elements of "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ZeroExtend64(k[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the low 16-bits from input mask "k" to all 32-bit elements of "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ZeroExtend32(k[15:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the low 16-bits from input mask "k" to all 32-bit elements of "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ZeroExtend32(k[15:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 7 + i := j*32 + FOR k := 0 to j-1 + m := k*32 + dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
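The conflict-detection entries in this group all build, for each element, a bit vector of matches against the elements at lower indices. A scalar Rust model of that pseudocode (an illustration, not the intrinsic):

// For every element, set bit k of the result if the element equals element k
// for some lower index k; higher bits stay zero.
fn conflict_epi32(a: &[u32]) -> Vec<u32> {
    a.iter()
        .enumerate()
        .map(|(j, &x)| {
            let mut bits = 0u32;
            for (k, &lower) in a[..j].iter().enumerate() {
                if x == lower {
                    bits |= 1 << k; // zero-extended bit vector of matches
                }
            }
            bits
        })
        .collect()
}

fn main() {
    // duplicates of 7 at indices 0 and 2: element 2 reports bit 0, element 3 reports bits 0 and 2
    assert_eq!(conflict_epi32(&[7, 1, 7, 7]), vec![0b000, 0b000, 0b001, 0b101]);
}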
+ + + + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 7 + i := j*32 + IF k[j] + FOR l := 0 to j-1 + m := l*32 + dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 7 + i := j*32 + IF k[j] + FOR l := 0 to j-1 + m := l*32 + dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 3 + i := j*32 + FOR k := 0 to j-1 + m := k*32 + dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 3 + i := j*32 + IF k[j] + FOR l := 0 to j-1 + m := l*32 + dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 3 + i := j*32 + IF k[j] + FOR l := 0 to j-1 + m := l*32 + dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 3 + i := j*64 + FOR k := 0 to j-1 + m := k*64 + dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 3 + i := j*64 + IF k[j] + FOR l := 0 to j-1 + m := l*64 + dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 3 + i := j*64 + IF k[j] + FOR l := 0 to j-1 + m := l*64 + dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 1 + i := j*64 + FOR k := 0 to j-1 + m := k*64 + dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 1 + i := j*64 + IF k[j] + FOR l := 0 to j-1 + m := l*64 + dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 1 + i := j*64 + IF k[j] + FOR l := 0 to j-1 + m := l*64 + dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
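The leading-zero-count entries walk down from the top bit, so an all-zero element reports the full element width (32 here). A scalar sketch of that loop in Rust; as a cross-check, `u32::leading_zeros` gives the same answer:

// Direct translation of the DO WHILE loop in the pseudocode above.
fn lzcnt32(a: u32) -> u32 {
    let mut tmp: i32 = 31;
    let mut count = 0;
    while tmp >= 0 && ((a >> tmp) & 1) == 0 {
        tmp -= 1;
        count += 1;
    }
    count
}

fn main() {
    assert_eq!(lzcnt32(0), 32);
    assert_eq!(lzcnt32(1), 31);
    assert_eq!(lzcnt32(0x8000_0000), 0);
    assert_eq!(lzcnt32(0x00F0_0000), u32::leading_zeros(0x00F0_0000));
}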
+ + + + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Broadcast the low 8-bits from input mask "k" to all 64-bit elements of "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ZeroExtend64(k[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low 16-bits from input mask "k" to all 32-bit elements of "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ZeroExtend32(k[15:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Swizzle +
+ + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 15 + i := j*32 + FOR k := 0 to j-1 + m := k*32 + dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Compare +
+ + + + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + FOR l := 0 to j-1 + m := l*32 + dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Compare +
+ + + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + FOR l := 0 to j-1 + m := l*32 + dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Compare +
+ + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 7 + i := j*64 + FOR k := 0 to j-1 + m := k*64 + dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Compare +
+ + + + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 7 + i := j*64 + IF k[j] + FOR l := 0 to j-1 + m := l*64 + dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Compare +
+ + + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 7 + i := j*64 + IF k[j] + FOR l := 0 to j-1 + m := l*64 + dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Compare +
+ + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Bit Manipulation +
+ + + + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Bit Manipulation +
+ + + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Bit Manipulation +
+ + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Bit Manipulation +
+ + + + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Bit Manipulation +
+ + + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Bit Manipulation +
+ + + + + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
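The masked logical entries in this group operate on the raw IEEE-754 bit patterns of the packed elements and merge from "src" wherever the mask bit is clear. A scalar sketch of the merge-masking ANDNOT form, assuming a 4-element f64 array stands in for a 256-bit vector (illustrative only, not the intrinsic):

// (NOT a) AND b on the bit patterns; masked-off lanes are copied from src.
fn mask_andnot_pd(src: &[f64; 4], k: u8, a: &[f64; 4], b: &[f64; 4]) -> [f64; 4] {
    let mut dst = [0.0f64; 4];
    for j in 0..4 {
        dst[j] = if k & (1 << j) != 0 {
            f64::from_bits(!a[j].to_bits() & b[j].to_bits())
        } else {
            src[j] // element copied from src when the mask bit is not set
        };
    }
    dst
}

fn main() {
    let a = [-0.0, 0.0, -1.5, 2.0];
    let b = [-3.0, -3.0, -1.5, 2.0];
    let r = mask_andnot_pd(&[9.0; 4], 0b0011, &a, &b);
    assert_eq!(r[0], 3.0); // ANDNOT with -0.0 clears only the sign bit of -3.0
    assert_eq!(r[2], 9.0); // mask bit 2 clear: copied from src
}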
+ + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*32 + n := (j % 2)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*64 + n := (j % 2)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*32 + n := (j % 2)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*32 + n := (j % 2)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*64 + n := (j % 2)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
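The 128-bit extract above is a pure selection: bit 0 of imm8 chooses the lower or upper half of the 256-bit source. A minimal model, assuming the source is represented as a 4-element f64 array (illustrative helper name):

// imm8[0] selects a[127:0] or a[255:128].
fn extractf64x2(a: &[f64; 4], imm8: u8) -> [f64; 2] {
    if imm8 & 1 == 0 {
        [a[0], a[1]] // a[127:0]
    } else {
        [a[2], a[3]] // a[255:128]
    }
}

fn main() {
    let a = [1.0, 2.0, 3.0, 4.0];
    assert_eq!(extractf64x2(&a, 0), [1.0, 2.0]);
    assert_eq!(extractf64x2(&a, 1), [3.0, 4.0]);
}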
+ + + + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 3 + i := j*64 + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) +ENDFOR +k[MAX:4] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 1 + i := j*64 + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) +ENDFOR +k[MAX:2] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 7 + i := j*32 + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) +ENDFOR +k[MAX:8] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 3 + i := j*32 + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) +ENDFOR +k[MAX:4] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE imm8[0] OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE imm8[0] OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
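Editor's note: the four entries above describe inserting a 128-bit lane (two doubles or two 64-bit integers) into a 256-bit vector, with imm8 bit 0 selecting the destination lane. A minimal C sketch, assuming the plain floating-point form is the immintrin.h intrinsic _mm256_insertf64x2; compile with -mavx512dq -mavx512vl:

#include <immintrin.h>

/* Replace the upper 128-bit lane (imm8 = 1) of a 4-double vector with b. */
__m256d replace_upper_lane(__m256d a, __m128d b) {
    return _mm256_insertf64x2(a, b, 1);
}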
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a". + +FOR j := 0 to 7 + i := j*32 + IF a[i+31] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a". + +FOR j := 0 to 3 + i := j*32 + IF a[i+31] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := 0xFFFFFFFF + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := 0xFFFFFFFF + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := 0xFFFFFFFFFFFFFFFF + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := 0xFFFFFFFFFFFFFFFF + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a". + +FOR j := 0 to 3 + i := j*64 + IF a[i+63] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a". + +FOR j := 0 to 1 + i := j*64 + IF a[i+63] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
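Editor's note: the entries above move data between vector registers and mask registers in both directions: sign bits of 32/64-bit lanes into a mask, and mask bits expanded back to all-ones/all-zeros lanes. A minimal C sketch, assuming the corresponding immintrin.h intrinsics are _mm256_movepi32_mask and _mm256_movm_epi32; compile with -mavx512dq -mavx512vl:

#include <immintrin.h>

/* Round-trip: collect the sign bits of eight 32-bit lanes into a __mmask8,
   then expand the mask back into lanes of 0xFFFFFFFF or 0. */
__m256i sign_bits_as_lanes(__m256i v) {
    __mmask8 m = _mm256_movepi32_mask(v);  /* bit j = MSB of lane j */
    return _mm256_movm_epi32(m);           /* lane j = m[j] ? all-ones : 0 */
}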
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
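Editor's note: the range entries above combine a min/max-style selection (imm8[1:0]) with a sign override (imm8[3:2]). A minimal C sketch, assuming the plain FP64 form is the immintrin.h intrinsic _mm256_range_pd; per the encoding in the records themselves, imm8 = 0x0B (op 11 = absolute max, sign 10 = clear sign bit) returns the larger magnitude as a non-negative value; compile with -mavx512dq -mavx512vl:

#include <immintrin.h>

/* Per-element "largest magnitude, sign cleared": imm8 = 0b10_11 = 0x0B. */
__m256d max_magnitude(__m256d a, __m256d b) {
    return _mm256_range_pd(a, b, 0x0B);
}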
+ + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
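Editor's note: the reduce entries above compute a minus a copy of a rounded to imm8[7:4] fraction bits, using the rounding mode in imm8[3:0]. A minimal C sketch, assuming the plain FP32 form is the immintrin.h intrinsic _mm256_reduce_ps and that imm8[3:0] accepts the usual SSE4.1 rounding encoding (where _MM_FROUND_TO_ZERO = 3); with zero fraction bits kept and truncation, the result is the signed fractional part a - trunc(a); compile with -mavx512dq -mavx512vl:

#include <immintrin.h>

/* Signed fractional part of each float: keep 0 fraction bits (imm8[7:4] = 0)
   and round toward zero (imm8[3:0] = 3), i.e. a - trunc(a). */
__m256 fractional_part(__m256 a) {
    return _mm256_reduce_ps(a, _MM_FROUND_TO_ZERO);
}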
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
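Editor's note: the entries above convert packed doubles to signed 64-bit integers using the current rounding mode, in plain/masked/zero-masked and 256/128-bit forms. A minimal C sketch, assuming the plain 256-bit form is the immintrin.h intrinsic _mm256_cvtpd_epi64; compile with -mavx512dq -mavx512vl:

#include <immintrin.h>

/* Four doubles -> four signed 64-bit integers, rounded with the current
   rounding mode (round-to-nearest-even unless MXCSR says otherwise). */
__m256i doubles_to_i64(__m256d a) {
    return _mm256_cvtpd_epi64(a);
}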
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
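Editor's note: the entries above widen packed floats to 64-bit integers, so a 128-bit source of four floats fills a 256-bit result. A minimal C sketch, assuming the plain signed form is the immintrin.h intrinsic _mm256_cvtps_epi64; compile with -mavx512dq -mavx512vl:

#include <immintrin.h>

/* Widening convert: four floats (128-bit source) -> four signed 64-bit
   integers (256-bit result), using the current rounding mode. */
__m256i floats_to_i64(__m128 a) {
    return _mm256_cvtps_epi64(a);
}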
+ + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
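Editor's note: the entries above go the other way, converting packed signed 64-bit integers to doubles (same width) or floats (narrowing to half width). A minimal C sketch, assuming the plain 256-bit double form is the immintrin.h intrinsic _mm256_cvtepi64_pd; compile with -mavx512dq -mavx512vl:

#include <immintrin.h>

/* Four signed 64-bit integers -> four doubles (exact while |x| < 2^53,
   rounded otherwise). */
__m256d i64_to_doubles(__m256i a) {
    return _mm256_cvtepi64_pd(a);
}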
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
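Editor's note: the entries above are the truncating variants of the double-to-64-bit conversions ("with truncation" means round toward zero regardless of MXCSR). A minimal C sketch, assuming the plain signed form is the immintrin.h intrinsic _mm256_cvttpd_epi64; compile with -mavx512dq -mavx512vl:

#include <immintrin.h>

/* Truncating conversion: 2.9 -> 2, -2.9 -> -2, independent of the current
   rounding mode. */
__m256i doubles_to_i64_trunc(__m256d a) {
    return _mm256_cvttpd_epi64(a);
}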
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + n := (j % 2)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +

immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + n := (j % 2)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[255:0] := a[255:0] +1: dst[255:0] := a[511:256] +ESAC +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[1:0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +2: dst[127:0] := a[383:256] +3: dst[127:0] := a[511:384] +ESAC +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[255:0] := a[255:0] +1: dst[255:0] := a[511:256] +ESAC +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[1:0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +2: dst[127:0] := a[383:256] +3: dst[127:0] := a[511:384] +ESAC +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 7 + i := j*64 + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) +ENDFOR +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 15 + i := j*32 + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) +ENDFOR +k[MAX:16] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Test the lower double-precision (64-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k". + [fpclass_note] + k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0]) +k[MAX:1] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Test the lower double-precision (64-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + [fpclass_note] + IF k1[0] + k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0]) +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Test the lower single-precision (32-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k". + [fpclass_note] + k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0]) +k[MAX:1] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Test the lower single-precision (32-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + [fpclass_note] + IF k1[0] + k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0]) +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE (imm8[0]) OF +0: dst[255:0] := b[255:0] +1: dst[511:256] := b[255:0] +ESAC +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE imm8[1:0] OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +2: dst[383:256] := b[127:0] +3: dst[511:384] := b[127:0] +ESAC +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE imm8[0] OF +0: dst[255:0] := b[255:0] +1: dst[511:256] := b[255:0] +ESAC +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE imm8[1:0] OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +2: dst[383:256] := b[127:0] +3: dst[511:384] := b[127:0] +ESAC +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a". + +FOR j := 0 to 15 + i := j*32 + IF a[i+31] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := 0xFFFFFFFF + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := 0xFFFFFFFFFFFFFFFF + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a". + +FOR j := 0 to 7 + i := j*64 + IF a[i+63] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
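For reference, a minimal scalar Rust sketch of the RANGE helper defined in the entries above (illustrative only, not part of the vendored data file; the function name is hypothetical, and the real instruction's NaN handling is not modeled). The single-precision entries that follow use the same helper on 32-bit lanes.

fn range_f64(src1: f64, src2: f64, op_ctl: u8, sign_ctl: u8) -> f64 {
    // imm8[1:0]: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max
    let tmp = match op_ctl & 0b11 {
        0 => if src1 <= src2 { src1 } else { src2 },
        1 => if src1 <= src2 { src2 } else { src1 },
        2 => if src1.abs() <= src2.abs() { src1 } else { src2 },
        _ => if src1.abs() <= src2.abs() { src2 } else { src1 },
    };
    // imm8[3:2]: 00 = sign from src1, 01 = sign from compare result,
    //            10 = clear sign bit, 11 = set sign bit
    match sign_ctl & 0b11 {
        0 => tmp.copysign(src1),
        1 => tmp,
        2 => tmp.abs(),
        _ => -tmp.abs(),
    }
}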
+ + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
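A scalar Rust model of ReduceArgumentPD from the entries above may help make the pseudocode concrete (illustrative only, not part of the vendored data file; only round-to-nearest-even is modeled here, whereas imm8[3:0] selects the rounding mode on the instruction). The ReduceArgumentPS entries that follow are the same computation on 32-bit lanes.

fn reduce_argument_f64(src: f64, imm8: u8) -> f64 {
    // imm8[7:4]: number of fraction bits after the binary point to preserve.
    let m = (imm8 >> 4) as i32;
    let scale = 2f64.powi(m);
    // Round src to m fraction bits (round-to-nearest-even in this sketch),
    // then return the remainder.
    let rounded = (src * scale).round_ties_even() / scale;
    let tmp = src - rounded;
    // Huge inputs overflow the scaled intermediate to infinity; the
    // instruction returns 0.0 in that case.
    if tmp.is_infinite() { 0.0 } else { tmp }
}
// Example: reduce_argument_f64(2.75, 0x00) rounds 2.75 to 3.0 and returns -0.25.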
+ + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
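The masked conversion entries above all share the same writemask/zeromask pattern. A plain-Rust sketch over arrays (illustrative only; the function names are hypothetical, and the lane cast `as i64` truncates and saturates, while the non-truncating intrinsics round under the current or explicit rounding mode):

fn mask_cvtpd_epi64(src: [i64; 8], k: u8, a: [f64; 8]) -> [i64; 8] {
    let mut dst = src; // writemask: unselected lanes keep the src element
    for j in 0..8 {
        if (k >> j) & 1 == 1 {
            dst[j] = a[j] as i64;
        }
    }
    dst
}

fn maskz_cvtpd_epi64(k: u8, a: [f64; 8]) -> [i64; 8] {
    let mut dst = [0i64; 8]; // zeromask: unselected lanes are zeroed
    for j in 0..8 {
        if (k >> j) & 1 == 1 {
            dst[j] = a[j] as i64;
        }
    }
    dst
}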
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
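For the integer-to-floating-point direction above, per-lane behaviour can be pictured with Rust's own casts (a sketch, not the intrinsic itself): signed 64-bit values beyond 2^53 (for f64) or 2^24 (for f32) are not exactly representable, which is what [round_note] governs.

fn cvt_epi64_lane(a: i64) -> (f64, f32) {
    // Rust integer-to-float casts round to nearest, ties to even, matching
    // the default rounding mode of the packed conversions above.
    (a as f64, a as f32)
}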
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
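The "_Truncate" conversions introduced by the entries above always chop toward zero regardless of rounding mode. A per-lane Rust sketch (illustrative only; out-of-range and NaN inputs saturate in Rust, whereas the instruction returns the integer indefinite value):

fn cvtt_pd_epi64_lane(a: f64) -> i64 {
    a as i64 // float-to-int `as` truncates toward zero
}
// Example: cvtt_pd_epi64_lane(-2.9) == -2, while the rounding conversions
// above would give -3 under round-to-nearest.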
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". + +FOR j := 0 to 7 + i := j*64 + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Arithmetic +
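Since only the low 64 bits of each 128-bit intermediate product are kept, every lane behaves like ordinary wrapping 64-bit multiplication. A short sketch, assuming the documented AVX512DQ name _mm512_mullo_epi64 from immintrin.h:

    #include <immintrin.h>
    #include <stdint.h>

    /* Low 64 bits of a[i] * b[i] in every lane. */
    __m512i mul_lo(__m512i a, __m512i b) {
        return _mm512_mullo_epi64(a, b);
    }

    /* Per lane this matches plain wrapping 64-bit multiplication. */
    uint64_t mul_lo_scalar(uint64_t a, uint64_t b) {
        return a * b; /* unsigned arithmetic wraps, keeping the low 64 bits */
    }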
+ + + + + Add 8-bit masks in "a" and "b", and store the result in "k". + +k[7:0] := a[7:0] + b[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Add 16-bit masks in "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] + b[15:0] +k[MAX:16] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 8-bit masks "a" and "b", and store the result in "k". + +k[7:0] := a[7:0] AND b[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 8-bit masks "a" and then AND with "b", and store the result in "k". + +k[7:0] := (NOT a[7:0]) AND b[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + Compute the bitwise NOT of 8-bit mask "a", and store the result in "k". + +k[7:0] := NOT a[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 8-bit masks "a" and "b", and store the result in "k". + +k[7:0] := a[7:0] OR b[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XNOR of 8-bit masks "a" and "b", and store the result in "k". + +k[7:0] := NOT (a[7:0] XOR b[7:0]) +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XOR of 8-bit masks "a" and "b", and store the result in "k". + +k[7:0] := a[7:0] XOR b[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
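On a scalar __mmask8 value these mask operations behave exactly like 8-bit integer addition and bitwise logic. A sketch, assuming the documented AVX512DQ mask intrinsics _kxor_mask8, _kandn_mask8, _kadd_mask8 and _knot_mask8 from immintrin.h:

    #include <immintrin.h>

    /* Lanes set in exactly one of a and b, excluding any lane also set in c. */
    __mmask8 exclusive_lanes(__mmask8 a, __mmask8 b, __mmask8 c) {
        __mmask8 diff = _kxor_mask8(a, b);  /* a ^ b */
        return _kandn_mask8(c, diff);       /* (~c) & (a ^ b) */
    }

    /* _kadd_mask8(a, b) is a genuine 8-bit addition, i.e. (__mmask8)(a + b),
       and _knot_mask8(a) is (__mmask8)~a. */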
+ + + + + Shift the bits of 8-bit mask "a" left by "count" while shifting in zeros, and store the least significant 8 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 7 + k[7:0] := a[7:0] << count[7:0] +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 8-bit mask "a" right by "count" while shifting in zeros, and store the least significant 8 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 7 + k[7:0] := a[7:0] >> count[7:0] +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + + Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". + +tmp[7:0] := a[7:0] OR b[7:0] +IF tmp[7:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +IF tmp[7:0] == 0xFF + MEM[all_ones+7:all_ones] := 1 +ELSE + MEM[all_ones+7:all_ones] := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[7:0] := a[7:0] OR b[7:0] +IF tmp[7:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". + +tmp[7:0] := a[7:0] OR b[7:0] +IF tmp[7:0] == 0xFF + dst := 1 +ELSE + dst := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + + Compute the bitwise AND of 8-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". + +tmp1[7:0] := a[7:0] AND b[7:0] +IF tmp1[7:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +tmp2[7:0] := (NOT a[7:0]) AND b[7:0] +IF tmp2[7:0] == 0x0 + MEM[and_not+7:and_not] := 1 +ELSE + MEM[and_not+7:and_not] := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 8-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". + +tmp[7:0] := a[7:0] AND b[7:0] +IF tmp[7:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 8-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[7:0] := (NOT a[7:0]) AND b[7:0] +IF tmp[7:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
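The kortest/ktest forms above are typically used to branch on the combined state of two masks without spelling out the OR/AND and comparison by hand. A sketch, assuming the documented AVX512DQ intrinsics _kortestz_mask8_u8, _ktestz_mask8_u8 and _kortest_mask8_u8 from immintrin.h:

    #include <immintrin.h>

    /* 1 when no lane is set in either mask (a OR b is all zeros). */
    int no_lane_selected(__mmask8 a, __mmask8 b) {
        return _kortestz_mask8_u8(a, b);
    }

    /* 1 when the masks have no lane in common (a AND b is all zeros). */
    int masks_disjoint(__mmask8 a, __mmask8 b) {
        return _ktestz_mask8_u8(a, b);
    }

    /* The two-output form reports both conditions at once:
       unsigned char all_ones;
       unsigned char all_zeros = _kortest_mask8_u8(a, b, &all_ones); */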
+ + + + + + Compute the bitwise AND of 16-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". + +tmp1[15:0] := a[15:0] AND b[15:0] +IF tmp1[15:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +tmp2[15:0] := (NOT a[15:0]) AND b[15:0] +IF tmp2[15:0] == 0x0 + MEM[and_not+7:and_not] := 1 +ELSE + MEM[and_not+7:and_not] := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 16-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". + +tmp[15:0] := a[15:0] AND b[15:0] +IF tmp[15:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 16-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[15:0] := (NOT a[15:0]) AND b[15:0] +IF tmp[15:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + Convert 8-bit mask "a" into an integer value, and store the result in "dst". + +dst := ZeroExtend32(a[7:0]) + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + Convert integer value "a" into an 8-bit mask, and store the result in "k". + +k := a[7:0] + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + Load 8-bit mask from memory into "k". + +k[7:0] := MEM[mem_addr+7:mem_addr] + + + AVX512DQ +
immintrin.h
+ Load +
+ + + + + Store 8-bit mask from "a" into memory. + +MEM[mem_addr+7:mem_addr] := a[7:0] + + + AVX512DQ +
immintrin.h
+ Store +
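Together, the conversion, load and store entries let a __mmask8 move between mask registers, ordinary integers and memory. A sketch, assuming the documented AVX512DQ intrinsics _cvtmask8_u32, _cvtu32_mask8, _load_mask8 and _store_mask8 from immintrin.h:

    #include <immintrin.h>

    void roundtrip_mask(__mmask8 k, __mmask8 *slot) {
        unsigned int bits = _cvtmask8_u32(k);    /* mask -> zero-extended integer */
        __mmask8 back     = _cvtu32_mask8(bits); /* integer -> mask (low 8 bits) */
        _store_mask8(slot, back);                /* spill the mask to memory */
        __mmask8 reloaded = _load_mask8(slot);   /* and load it back */
        (void)reloaded;
    }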
+ + + + + + Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ACOS(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ACOS(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ACOS(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ACOS(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
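Vector math operations like these inverse-cosine entries (and most of the Trigonometry, Elementary Math Functions and Probability/Statistics entries that follow) are provided as Intel SVML intrinsics by the Intel compilers and MSVC rather than as single hardware instructions; GCC and Clang do not ship them. A sketch of the plain and writemask forms, assuming an SVML-capable compiler and the documented names _mm512_acos_pd and _mm512_mask_acos_pd:

    #include <immintrin.h>

    /* acos of every lane; inputs should lie in [-1, 1], results are in radians. */
    __m512d acos_all(__m512d a) {
        return _mm512_acos_pd(a);
    }

    /* Writemask form: lanes whose mask bit is clear keep the value from src. */
    __m512d acos_merge(__m512d src, __mmask8 k, __m512d a) {
        return _mm512_mask_acos_pd(src, k, a);
    }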
+ + + + Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ACOSH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ACOSH(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ACOSH(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ACOSH(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ASIN(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ASIN(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ASIN(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ASIN(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ASINH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ASINH(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ASINH(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ASINH(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" and store the results in "dst" expressed in radians. + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ATAN(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ATAN(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ATAN(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ATAN(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" and store the results in "dst" expressed in radians. + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ATANH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ATANH(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ATANH(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ATANH(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := COS(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := COS(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := COS(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := COS(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := COSD(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := COSD(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := COSD(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := COSD(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := COSH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := COSH(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := COSH(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := COSH(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SIN(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SIN(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SIN(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SIN(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SINH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SINH(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SINH(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SINH(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SIND(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SIND(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SIND(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SIND(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := TAN(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := TAN(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := TAN(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := TAN(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := TAND(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := TAND(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := TAND(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := TAND(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := TANH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := TANH(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := TANH(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := TANH(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SIN(a[i+63:i]) + MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + + + Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", store the cosine into memory at "mem_addr". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SIN(a[i+63:i]) + MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) + ELSE + dst[i+63:i] := sin_src[i+63:i] + MEM[mem_addr+i+63:mem_addr+i] := cos_src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SIN(a[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + + + Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", store the cosine into memory at "mem_addr". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SIN(a[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) + ELSE + dst[i+31:i] := sin_src[i+31:i] + MEM[mem_addr+i+31:mem_addr+i] := cos_src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
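When both the sine and the cosine of the same angles are needed, the combined form above computes them in one call: the sines come back as the return value and the cosines are written through the memory argument. A sketch, assuming the SVML intrinsic _mm512_sincos_pd with the cosine destination passed by pointer (the local variable names are illustrative):

    #include <immintrin.h>

    void sin_and_cos(__m512d angles, __m512d *sines, __m512d *cosines) {
        __m512d cos_out;                                /* receives the cosines */
        *sines   = _mm512_sincos_pd(&cos_out, angles);  /* return value holds the sines */
        *cosines = cos_out;
    }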
+ + + + Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := CubeRoot(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := CubeRoot(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := CubeRoot(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := CubeRoot(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POW(10.0, a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := POW(10.0, a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POW(2.0, a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := POW(2.0, a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := POW(e, a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := POW(FP32(e), a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
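Per lane this is the same computation as the scalar hypot function: the Euclidean length SQRT(a^2 + b^2) of the pair. A sketch, assuming the SVML intrinsic _mm512_hypot_pd:

    #include <immintrin.h>
    #include <math.h>

    __m512d vector_hypot(__m512d a, __m512d b) {
        return _mm512_hypot_pd(a, b); /* lane-wise SQRT(a^2 + b^2) */
    }

    /* Scalar counterpart of the same per-element operation. */
    double scalar_reference(double a, double b) {
        return hypot(a, b);
    }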
+ + + + Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := InvSQRT(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := InvSQRT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := InvSQRT(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := InvSQRT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := LOG(1.0 + a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LOG(1.0 + a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LOG(1.0 + a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LOG(1.0 + a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LOG(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LOG(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
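Unlike the surrounding software math entries, these exponent-extraction operations correspond to the _mm512_getexp_pd / _mm512_getexp_ps intrinsics, which are backed by dedicated AVX512F instructions; for a normal input x the result lane holds floor(log2(|x|)) as a floating-point value. A minimal sketch, assuming those standard names from immintrin.h:

    #include <immintrin.h>

    /* For a lane holding 1.5 * 2^10, the corresponding result lane holds 10.0. */
    __m512d exponents_pd(__m512d a) {
        return _mm512_getexp_pd(a);
    }

    __m512 exponents_ps(__m512 a) {
        return _mm512_getexp_ps(a);
    }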
+ + + + + Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Computes the reciprocal of packed double-precision (64-bit) floating-point elements in "a", storing the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (1.0 / a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Computes the reciprocal of packed double-precision (64-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Computes the reciprocal of packed single-precision (32-bit) floating-point elements in "a", storing the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Computes the reciprocal of packed single-precision (32-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := CDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := CDFNormal(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := CDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := CDFNormal(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := InverseCDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := InverseCDFNormal(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := InverseCDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := InverseCDFNormal(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
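The CDFNormal pseudo-operation is not expanded anywhere in this data; for a reference model it can be expressed through the error function, since the standard normal CDF is 0.5 * (1 + erf(x / sqrt(2))). A hedged scalar sketch of the masked double-precision entry (names illustrative):

#include <math.h>
#include <stdint.h>

/* Scalar stand-in for CDFNormal: Phi(x) = 0.5 * (1 + erf(x / sqrt(2))). */
static double cdf_normal(double x) { return 0.5 * (1.0 + erf(x / sqrt(2.0))); }

/* Reference model for the masked 8-lane double-precision entry above. */
static void mask_cdfnorm_pd512(double dst[8], const double src[8], uint8_t k,
                               const double a[8]) {
    for (int j = 0; j < 8; j++)
        dst[j] = ((k >> j) & 1) ? cdf_normal(a[j]) : src[j];
}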
+ + + + Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ERF(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ERF(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := 1.0 - ERF(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := 1.0 - ERF(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ERF(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ERF(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := 1.0 - ERF(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := 1.0 - ERF(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := 1.0 / ERF(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := 1.0 / ERF(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := 1.0 / ERF(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := 1.0 / ERF(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i])) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i])) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
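For the error-function entries, erff()/erfcf() from math.h match ERF and 1.0 - ERF directly; the "inverse" entries read most naturally as the inverse functions (erfinv/erfcinv) rather than literal reciprocals, and since standard C has no erfinv, only the forward case is sketched here. A scalar reference model of the masked single-precision erfc entry (names illustrative):

#include <math.h>
#include <stdint.h>

/* Reference model for the masked 16-lane single-precision erfc entry:
   dst[j] = 1 - erf(a[j]) where mask bit j is set, else src[j]. */
static void mask_erfc_ps512(float dst[16], const float src[16], uint16_t k,
                            const float a[16]) {
    for (int j = 0; j < 16; j++)
        dst[j] = ((k >> j) & 1) ? erfcf(a[j]) : src[j];
}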
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := CEIL(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := CEIL(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := CEIL(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := CEIL(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := FLOOR(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := FLOOR(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := FLOOR(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := FLOOR(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
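CEIL and FLOOR in the entries above round toward positive and negative infinity respectively, which is exactly what ceil() and floor() from math.h do. A scalar sketch of the unmasked double-precision forms (names illustrative):

#include <math.h>

/* Reference model for the unmasked 8-lane double-precision ceil/floor entries. */
static void ceil_pd512(double dst[8], const double a[8]) {
    for (int j = 0; j < 8; j++) dst[j] = ceil(a[j]);
}
static void floor_pd512(double dst[8], const double a[8]) {
    for (int j = 0; j < 8; j++) dst[j] = floor(a[j]);
}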
+ + + + Rounds each packed double-precision (64-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := NearbyInt(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Rounds each packed double-precision (64-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := NearbyInt(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Rounds each packed single-precision (32-bit) floating-point element in "a" to the nearest integer value and stores the results as packed single-precision floating-point elements in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := NearbyInt(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Rounds each packed single-precision (32-bit) floating-point element in "a" to the nearest integer value and stores the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := NearbyInt(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Rounds the packed double-precision (64-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RoundToNearestEven(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Rounds the packed double-precision (64-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RoundToNearestEven(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Rounds the packed single-precision (32-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RoundToNearestEven(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Rounds the packed single-precision (32-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RoundToNearestEven(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ROUND(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ROUND(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := TRUNCATE(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := TRUNCATE(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := TRUNCATE(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := TRUNCATE(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
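The remaining Special Math Functions entries differ only in how fractions and ties are resolved: NearbyInt rounds under the current rounding mode, RoundToNearestEven breaks ties to even, ROUND rounds halfway cases away from zero, and TRUNCATE drops the fraction. Assuming the default round-to-nearest-even mode, the C99 math.h functions line up with them as in this hedged per-lane sketch:

#include <math.h>

/* One lane of each rounding flavour described above. */
static double nearby_lane(double x) { return nearbyint(x); } /* NearbyInt            */
static double rne_lane(double x)    { return rint(x);      } /* RoundToNearestEven   */
static double round_lane(double x)  { return round(x);     } /* ROUND, ties away     */
static double trunc_lane(double x)  { return trunc(x);     } /* TRUNCATE toward zero */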
+ + + + + Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 15 + i := 32*j + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 63 + i := 8*j + IF b[i+7:i] == 0 + #DE + FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 31 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 7 + i := 64*j + IF b[i+63:i] == 0 + #DE + FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst". + FOR j := 0 to 63 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst". + FOR j := 0 to 31 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst". + FOR j := 0 to 7 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
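Truncate32(a / b) and REMAINDER(a / b) in the signed entries above behave like C's / and % on signed integers: the quotient is truncated toward zero and the remainder takes the sign of the dividend, while division by zero is not defined (the pseudocode raises #DE). A scalar sketch of the 32-bit pair (names illustrative):

#include <stdint.h>

/* Reference model for the 16-lane signed 32-bit divide / remainder entries.
   Division by zero is not handled; the pseudocode raises #DE in that case. */
static void div_rem_epi32_512(int32_t quot[16], int32_t rem[16],
                              const int32_t a[16], const int32_t b[16]) {
    for (int j = 0; j < 16; j++) {
        quot[j] = a[j] / b[j];  /* truncated toward zero */
        rem[j]  = a[j] % b[j];  /* same sign as a[j]     */
    }
}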
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 15 + i := 32*j + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 63 + i := 8*j + IF b[i+7:i] == 0 + #DE + FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 31 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 7 + i := 64*j + IF b[i+63:i] == 0 + #DE + FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst". + FOR j := 0 to 63 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst". + FOR j := 0 to 31 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst". + FOR j := 0 to 7 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
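The unsigned entries follow the same shape; in C this is plain unsigned / and %, where truncation and the remainder are unambiguous. A sketch of the 64-bit lane pair (names illustrative):

#include <stdint.h>

/* Reference model for the 8-lane unsigned 64-bit divide / remainder entries. */
static void div_rem_epu64_512(uint64_t quot[8], uint64_t rem[8],
                              const uint64_t a[8], const uint64_t b[8]) {
    for (int j = 0; j < 8; j++) {
        quot[j] = a[j] / b[j];
        rem[j]  = a[j] % b[j];
    }
}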
+ + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
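The base-2 logarithm entries spell the operation as LOG(a)/LOG(2.0); numerically, log2f() from math.h computes the same value more directly, which is what this hedged sketch of the masked form assumes (names illustrative):

#include <math.h>
#include <stdint.h>

/* Reference model for the masked 16-lane single-precision log2 entry above. */
static void mask_log2_ps512(float dst[16], const float src[16], uint16_t k,
                            const float a[16]) {
    for (int j = 0; j < 16; j++)
        dst[j] = ((k >> j) & 1) ? log2f(a[j]) : src[j];
}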
+ + + + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
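The AVX512F+AVX512VL entries above describe the 256-bit and 128-bit masked and zero-masked add/divide forms. Assuming they correspond to the usual _mm256_mask_add_pd / _mm256_maskz_add_pd family declared in immintrin.h (an assumption, since the intrinsic names are not present in this data), a small usage sketch looks like this:

#include <immintrin.h>

/* Assumes a target with AVX512F + AVX512VL (e.g. -mavx512f -mavx512vl).
   Writemask form: unselected lanes come from src; zeromask form: they become 0. */
static __m256d masked_add_example(__m256d src, __mmask8 k, __m256d a, __m256d b) {
    return _mm256_mask_add_pd(src, k, a, b);
}
static __m256d zeromask_add_example(__mmask8 k, __m256d a, __m256d b) {
    return _mm256_maskz_add_pd(k, a, b);
}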
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
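The fused multiply-add entries above differ only in which operand supplies an unselected lane: one writemask form keeps "c", the other keeps "a", and the zeromask form writes 0. A scalar reference sketch of the 4-lane double-precision case, with the single rounding of a real FMA approximated by fma() from math.h (names illustrative):

#include <math.h>
#include <stdint.h>

/* Reference model for the 4-lane double-precision masked FMA entries above.
   `fallback` is whatever an unselected lane is copied from (c, a, or zeros). */
static void mask_fmadd_pd256(double dst[4], uint8_t k, const double a[4],
                             const double b[4], const double c[4],
                             const double fallback[4]) {
    for (int j = 0; j < 4; j++)
        dst[j] = ((k >> j) & 1) ? fma(a[j], b[j], c[j]) : fallback[j];
}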
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
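Editor's note: the FMSUBADD entries above (128-bit and 256-bit, single and double precision) all follow one pattern — even lanes compute a*b + c, odd lanes compute a*b - c, and the mask bit decides whether a lane is computed, copied from a fallback operand, or zeroed. A minimal scalar sketch of that pseudocode in Rust; this is a reference model only, not the actual `core::arch` intrinsics.

```rust
/// Scalar model of the masked FMSUBADD pseudocode above:
/// even lanes -> a*b + c, odd lanes -> a*b - c.
/// `fallback` plays the role of the lane value used when the mask bit is
/// clear: `Some(lane)` for a writemask variant, `None` (zero) for zeromask.
fn fmsubadd_lane(j: usize, a: f32, b: f32, c: f32, k_bit: bool, fallback: Option<f32>) -> f32 {
    if k_bit {
        if j & 1 == 0 { a * b + c } else { a * b - c }
    } else {
        fallback.unwrap_or(0.0)
    }
}

fn main() {
    let (a, b, c) = ([1.0f32; 8], [2.0f32; 8], [10.0f32; 8]);
    let k: u8 = 0b1010_1010; // only the odd lanes are selected
    let dst: Vec<f32> = (0..8)
        .map(|j| fmsubadd_lane(j, a[j], b[j], c[j], (k >> j) & 1 == 1, None)) // zeromask form
        .collect();
    println!("{dst:?}"); // [0.0, -8.0, 0.0, -8.0, 0.0, -8.0, 0.0, -8.0]
}
```

The writemask forms above correspond to passing `Some(c[j])` or `Some(a[j])` as the fallback instead of `None`.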
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
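Editor's note: the FNMADD and FNMSUB blocks above differ only in the sign applied to c, -(a*b) + c versus -(a*b) - c, each offered with the same three mask conventions (fall back to c, fall back to a, or zero). A scalar sketch of just that sign convention, assuming nothing beyond the pseudocode; note that the real instructions fuse the multiply and add into a single rounding, which this two-step model does not reproduce.

```rust
/// Reference model of the per-lane FNMADD / FNMSUB arithmetic above.
/// Hardware performs a single rounding for the fused operation; this
/// scalar model rounds the product and the sum separately.
fn fnmadd(a: f64, b: f64, c: f64) -> f64 {
    -(a * b) + c
}

fn fnmsub(a: f64, b: f64, c: f64) -> f64 {
    -(a * b) - c
}

fn main() {
    // a = 3, b = 4, c = 5: fnmadd = -12 + 5 = -7, fnmsub = -12 - 5 = -17.
    assert_eq!(fnmadd(3.0, 4.0, 5.0), -7.0);
    assert_eq!(fnmsub(3.0, 4.0, 5.0), -17.0);
    println!("ok");
}
```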
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
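Editor's note: the float MAX/MIN entries above defer the interesting cases to [max_float_note]/[min_float_note]. A common way to model the MAX()/MIN() helpers for these instructions — assumed here, not stated in the pseudocode itself — is "return the second operand unless the comparison is true", which covers both the NaN and the signed-zero cases.

```rust
/// Assumed model of the MAX()/MIN() helpers in the pseudocode above:
/// the second operand wins whenever the comparison is false or unordered.
fn max_pd(a: f64, b: f64) -> f64 {
    if a > b { a } else { b }
}

fn min_pd(a: f64, b: f64) -> f64 {
    if a < b { a } else { b }
}

fn main() {
    assert_eq!(max_pd(1.0, 2.0), 2.0);
    assert_eq!(min_pd(-0.0, 0.0), 0.0);      // both zeros: second operand
    assert!(max_pd(f64::NAN, 3.0) == 3.0);   // NaN in first operand: second operand
    assert!(max_pd(3.0, f64::NAN).is_nan()); // NaN in second operand: the NaN
    println!("ok");
}
```

The mask handling is the same writemask/zeromask scheme as in the other entries and is omitted here.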
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
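Editor's note: a usage sketch for the masked multiplies above, assuming the stdarch intrinsics mirror the Intel naming (`_mm256_maskz_mul_pd`, `_mm256_mask_mul_pd`) and that the AVX512F/AVX512VL intrinsics are available in the toolchain; the zeromask form is shown.

```rust
// Usage sketch under the assumptions above; guarded by runtime detection.
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;

    if !is_x86_feature_detected!("avx512vl") {
        println!("AVX512VL not available, skipping");
        return;
    }

    unsafe {
        let a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0); // lanes [1.0, 2.0, 3.0, 4.0]
        let b = _mm256_set_pd(10.0, 10.0, 10.0, 10.0);
        // Zeromask: lanes 0 and 2 selected, lanes 1 and 3 zeroed.
        let z = _mm256_maskz_mul_pd(0b0101, a, b);
        let mut out = [0.0f64; 4];
        _mm256_storeu_pd(out.as_mut_ptr(), z);
        println!("{out:?}"); // [10.0, 0.0, 30.0, 0.0]
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```

The writemask form described above takes an additional src vector ahead of the mask and copies unselected lanes from it instead of zeroing them.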
+ + + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ABS(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ABS(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ABS(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ABS(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ABS(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ABS(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
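Editor's note: for the masked ABS entries above, the only subtle lane is the most negative input; the description stores the "unsigned results", so that value keeps its own bit pattern. A scalar model of the writemask form, with `wrapping_abs` standing in for ABS().

```rust
/// Scalar model of the masked ABS pseudocode above. wrapping_abs models the
/// "unsigned result" wording: the most negative input maps to its own bits.
fn mask_abs_epi32(src: &[i32; 8], k: u8, a: &[i32; 8]) -> [i32; 8] {
    let mut dst = [0i32; 8];
    for j in 0..8 {
        dst[j] = if (k >> j) & 1 == 1 {
            a[j].wrapping_abs()
        } else {
            src[j] // writemask: keep the src lane
        };
    }
    dst
}

fn main() {
    let a = [-3, 5, i32::MIN, -7, 0, 1, -1, 9];
    let src = [100; 8];
    let k = 0b0000_1111; // only the low four lanes are written
    println!("{:?}", mask_abs_epi32(&src, k, &a));
    // [3, 5, -2147483648, 7, 100, 100, 100, 100]
}
```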
+ + + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
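Editor's note: the masked integer adds above wrap on overflow, and the zeromask form is just the writemask form with an all-zero src. A scalar sketch follows; the SUB entries later in this block are identical apart from the operator.

```rust
/// Scalar model of the masked 32-bit add above; hardware integer adds wrap,
/// so wrapping_add is used.
fn mask_add_epi32(src: [i32; 8], k: u8, a: [i32; 8], b: [i32; 8]) -> [i32; 8] {
    std::array::from_fn(|j| {
        if (k >> j) & 1 == 1 { a[j].wrapping_add(b[j]) } else { src[j] }
    })
}

/// The zeromask form is the writemask form with an all-zero src.
fn maskz_add_epi32(k: u8, a: [i32; 8], b: [i32; 8]) -> [i32; 8] {
    mask_add_epi32([0; 8], k, a, b)
}

fn main() {
    let a = [i32::MAX, 1, 2, 3, 4, 5, 6, 7];
    let b = [1; 8];
    println!("{:?}", maskz_add_epi32(0b0000_0011, a, b));
    // [-2147483648, 2, 0, 0, 0, 0, 0, 0]
}
```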
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
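Editor's note: the signed and unsigned integer MAX/MIN entries above share identical pseudocode; what changes is whether the lane bits are compared as signed or unsigned values. A small sketch making that explicit for one 32-bit lane.

```rust
/// MAX() in the signed and unsigned pseudocode above is the same operation
/// applied to different interpretations of the same lane bits.
fn max_epi32(a: i32, b: i32) -> i32 {
    a.max(b)
}

fn max_epu32(a: i32, b: i32) -> i32 {
    // Reinterpret the lane bits as unsigned, compare, and keep the bits.
    (a as u32).max(b as u32) as i32
}

fn main() {
    let (a, b) = (-1i32, 1i32);
    assert_eq!(max_epi32(a, b), 1);  // signed: 1 > -1
    assert_eq!(max_epu32(a, b), -1); // unsigned: 0xFFFF_FFFF > 1
    println!("ok");
}
```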
+ + + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
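Editor's note: three distinct 32-bit multiplies appear above — the signed widening multiply of the low half of each 64-bit lane, the low 32 bits of a full 32x32 product per 32-bit lane, and the unsigned widening variant. A per-lane scalar model of each, following the pseudocode.

```rust
/// Signed widening form: SignExtend64(a[31:0]) * SignExtend64(b[31:0]).
fn mul_epi32_lane(a_lo: i32, b_lo: i32) -> i64 {
    a_lo as i64 * b_lo as i64
}

/// Unsigned widening form: zero-extended low halves, full 64-bit product.
fn mul_epu32_lane(a_lo: u32, b_lo: u32) -> u64 {
    a_lo as u64 * b_lo as u64
}

/// Low-half form: tmp[63:0] := a * b; dst := tmp[31:0].
fn mullo_epi32_lane(a: i32, b: i32) -> i32 {
    a.wrapping_mul(b)
}

fn main() {
    assert_eq!(mul_epi32_lane(-2, 3), -6);
    assert_eq!(mul_epu32_lane(0xFFFF_FFFF, 2), 0x1_FFFF_FFFE);
    assert_eq!(mullo_epi32_lane(0x10000, 0x10000), 0); // high bits discarded
    println!("ok");
}
```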
+ + + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := (1.0 / a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (1.0 / a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
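The rcp14 entries above (write-masked, zero-masked, and unmasked, for 128- and 256-bit vectors of both element widths) all share the same approximation bound of 2^-14 relative error. A short sketch of the 256-bit zero-masked double-precision form, assuming the conventional stdarch name `_mm256_maskz_rcp14_pd`:

#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
        return;
    }
    unsafe {
        let a = _mm256_setr_pd(1.0, 2.0, 4.0, 8.0);
        // k = 0b0101 keeps lanes 0 and 2; masked-off lanes are zeroed (zeromask form).
        let r = _mm256_maskz_rcp14_pd(0b0101, a);
        let mut out = [0.0f64; 4];
        _mm256_storeu_pd(out.as_mut_ptr(), r);
        // Kept lanes approximate 1/x to within 2^-14 relative error.
        assert!((out[0] - 1.0).abs() < 1e-3);
        assert!((out[2] - 0.25).abs() < 1e-3);
        assert_eq!(out[1], 0.0);
        assert_eq!(out[3], 0.0);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}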
+ + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
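The rsqrt14 entries mirror the rcp14 family but approximate 1/sqrt(x). A sketch of the 256-bit write-masked double-precision form, assuming the stdarch name `_mm256_mask_rsqrt14_pd`; note how masked-off lanes come from "src" rather than being zeroed:

#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
        return;
    }
    unsafe {
        let src = _mm256_set1_pd(-1.0); // pass-through value for masked-off lanes
        let a = _mm256_setr_pd(1.0, 4.0, 16.0, 64.0);
        // k = 0b0011: approximate 1/sqrt(x) in lanes 0 and 1 only.
        let r = _mm256_mask_rsqrt14_pd(src, 0b0011, a);
        let mut out = [0.0f64; 4];
        _mm256_storeu_pd(out.as_mut_ptr(), r);
        assert!((out[0] - 1.0).abs() < 1e-3);
        assert!((out[1] - 0.5).abs() < 1e-3);
        assert_eq!(&out[2..], &[-1.0, -1.0]); // copied from src, per the writemask pseudocode
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}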
+ + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
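The masked subtraction entries above apply the standard write-mask/zero-mask pattern to packed floating-point subtraction. A sketch of the 256-bit write-masked double-precision form, assuming the stdarch name `_mm256_mask_sub_pd`:

#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
        return;
    }
    unsafe {
        let src = _mm256_set1_pd(9.0);
        let a = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
        let b = _mm256_set1_pd(1.0);
        // k = 0b1010: lanes 1 and 3 take a - b, lanes 0 and 2 are copied from src.
        let r = _mm256_mask_sub_pd(src, 0b1010, a, b);
        let mut out = [0.0f64; 4];
        _mm256_storeu_pd(out.as_mut_ptr(), r);
        assert_eq!(out, [9.0, 19.0, 9.0, 39.0]);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}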
+ + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst". + +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (32*imm8[2:0]) +dst[255:0] := temp[255:0] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (32*imm8[2:0]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := temp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (32*imm8[2:0]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := temp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst". + +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (32*imm8[1:0]) +dst[127:0] := temp[127:0] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (32*imm8[1:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := temp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (32*imm8[1:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := temp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst". + +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (64*imm8[1:0]) +dst[255:0] := temp[255:0] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (64*imm8[1:0]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := temp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (64*imm8[1:0]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := temp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst". + +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (64*imm8[0]) +dst[127:0] := temp[127:0] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (64*imm8[0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := temp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (64*imm8[0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := temp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
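The concatenate-and-shift entries above describe a lane rotation across a double-width temporary (the VALIGND/VALIGNQ family). A sketch of the unmasked 256-bit 32-bit-element form, assuming the stdarch name `_mm256_alignr_epi32` with the element shift count passed as a const generic:

#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
        return;
    }
    unsafe {
        let a = _mm256_setr_epi32(100, 101, 102, 103, 104, 105, 106, 107);
        let b = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        // Concatenate a:b (a in the upper half), shift right by 3 dwords, keep the low 8.
        let r = _mm256_alignr_epi32::<3>(a, b);
        let mut out = [0i32; 8];
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
        assert_eq!(out, [3, 4, 5, 6, 7, 100, 101, 102]);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}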
+ + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
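The blend entries above select per lane between two full vectors using only the mask register, with no separate pass-through source. A sketch of the 256-bit double-precision form, assuming the stdarch name `_mm256_mask_blend_pd`:

#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
        return;
    }
    unsafe {
        let a = _mm256_set1_pd(0.0);
        let b = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
        // Set bits in k pick the lane from b, clear bits pick it from a.
        let r = _mm256_mask_blend_pd(0b0110, a, b);
        let mut out = [0.0f64; 4];
        _mm256_storeu_pd(out.as_mut_ptr(), r);
        assert_eq!(out, [0.0, 2.0, 3.0, 0.0]);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}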
+ + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*32 + n := (j % 4)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*32 + n := (j % 4)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
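The 4-element broadcast entries above replicate a whole 128-bit block rather than a single scalar lane. A sketch of the unmasked single-precision form, assuming the stdarch name `_mm256_broadcast_f32x4`:

#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
        return;
    }
    unsafe {
        let quad = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        // The 4-element block is repeated into both 128-bit halves of the result.
        let r = _mm256_broadcast_f32x4(quad);
        let mut out = [0.0f32; 8];
        _mm256_storeu_ps(out.as_mut_ptr(), r);
        assert_eq!(out, [1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0]);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}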
+ + + + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 64 +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[255:m] := src[255:m] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 64 +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[255:m] := 0 +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 64 +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[127:m] := src[127:m] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 64 +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[127:m] := 0 +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 32 +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[255:m] := src[255:m] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 32 +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[255:m] := 0 +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 32 +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[127:m] := src[127:m] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 32 +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[127:m] := 0 +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
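The compress entries above pack the active lanes contiguously toward the low end of the destination; the write-masked form passes the tail through from "src", while the zero-masked form zeroes it. A sketch of the 256-bit zero-masked double-precision form, assuming the stdarch name `_mm256_maskz_compress_pd`:

#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
        return;
    }
    unsafe {
        let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
        // Active lanes 1 and 3 are packed into lanes 0 and 1; the rest are zeroed.
        let r = _mm256_maskz_compress_pd(0b1010, a);
        let mut out = [0.0f64; 4];
        _mm256_storeu_pd(out.as_mut_ptr(), r);
        assert_eq!(out, [2.0, 4.0, 0.0, 0.0]);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}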
+ + + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
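The expand entries are the inverse of compress: consecutive low elements of the source are scattered into the active lanes of the destination. A sketch of the 256-bit zero-masked double-precision form, assuming the stdarch name `_mm256_maskz_expand_pd`:

#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
        return;
    }
    unsafe {
        let a = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
        // The first two elements of a land in the active lanes 1 and 2; the rest are zeroed.
        let r = _mm256_maskz_expand_pd(0b0110, a);
        let mut out = [0.0f64; 4];
        _mm256_storeu_pd(out.as_mut_ptr(), r);
        assert_eq!(out, [0.0, 10.0, 20.0, 0.0]);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}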
+ + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
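The extract entries above pull one 128-bit block out of a 256-bit vector, selected by the single immediate bit; the masked variants then apply the usual write/zero masking per 32-bit lane. A sketch of the unmasked single-precision form, assuming the stdarch name `_mm256_extractf32x4_ps`:

#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
        return;
    }
    unsafe {
        let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        // imm8 = 1 selects bits 255:128, i.e. the upper four lanes.
        let hi = _mm256_extractf32x4_ps::<1>(a);
        let mut out = [0.0f32; 4];
        _mm_storeu_ps(out.as_mut_ptr(), hi);
        assert_eq!(out, [4.0, 5.0, 6.0, 7.0]);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}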
+ + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN: j := 0 + SNAN_TOKEN: j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
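The zero-masked 128-bit single-precision form above appears to correspond to _mm_maskz_fixupimm_ps. As a rough, illustrative C sketch (hypothetical values; assumes a compiler and CPU with AVX512F and AVX512VL, e.g. building with -mavx512f -mavx512vl), the fixup table in "c" below routes QNaN/SNaN inputs to +0 and passes all other values through from "a":

#include <immintrin.h>
#include <math.h>
#include <stdio.h>

int main(void) {
    __m128  a = _mm_setr_ps(1.0f, NAN, 3.0f, 4.0f);
    /* Per-token fixup table: nibble 0 (QNaN) = 8 -> +0, nibble 1 (SNaN) = 8 -> +0,
       all other nibbles = 0 -> pass src1 ("a") through unchanged. */
    __m128i c = _mm_set1_epi32(0x00000088);
    __m128  r = _mm_maskz_fixupimm_ps(0xF, a, a, c, 0); /* imm8 = 0: no exception flags requested */
    float out[4];
    _mm_storeu_ps(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* expected: 1 0 3 4 */
    return 0;
}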
+ + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
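A minimal C sketch of the unmasked 256-bit form, which appears to correspond to _mm256_getexp_pd (illustrative values; same AVX512F/AVX512VL build assumptions as above):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256d a = _mm256_setr_pd(1.0, 2.0, 4.0, 8.0);
    __m256d e = _mm256_getexp_pd(a);   /* floor(log2(x)) per element, returned as a double */
    double out[4];
    _mm256_storeu_pd(out, e);
    printf("%.0f %.0f %.0f %.0f\n", out[0], out[1], out[2], out[3]); /* expected: 0 1 2 3 */
    return 0;
}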
+ + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
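For the unmasked 256-bit form (apparently _mm256_getmant_pd), a short illustrative C sketch using the [1, 2) normalization interval and the source sign (AVX512F/AVX512VL build assumed):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256d a = _mm256_setr_pd(6.0, 12.0, 0.375, 1.0);
    /* Normalize each mantissa into [1, 2), keeping the sign of the source element. */
    __m256d m = _mm256_getmant_pd(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
    double out[4];
    _mm256_storeu_pd(out, m);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* expected: 1.5 1.5 1.5 1 */
    return 0;
}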
+ + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE (imm8[0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
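The unmasked insert above appears to be _mm256_insertf32x4; a small illustrative C sketch (AVX512F/AVX512VL build assumed) that replaces the upper 128-bit half of "a":

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256 a = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
    __m128 b = _mm_setr_ps(40, 50, 60, 70);
    __m256 r = _mm256_insertf32x4(a, b, 1);   /* imm8 = 1 selects the upper 128-bit position */
    float out[8];
    _mm256_storeu_ps(out, r);
    for (int i = 0; i < 8; i++) printf("%g ", out[i]); /* expected: 0 1 2 3 40 50 60 70 */
    printf("\n");
    return 0;
}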
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE (imm8[0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
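The 256-bit blend above appears to correspond to _mm256_mask_blend_epi32; an illustrative C sketch (AVX512F/AVX512VL build assumed) taking the odd lanes from "b":

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i b = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
    __mmask8 k = 0xAA;                         /* bits 1,3,5,7 set -> those lanes come from b */
    __m256i r = _mm256_mask_blend_epi32(k, a, b);
    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 8; i++) printf("%d ", out[i]); /* expected: 0 11 2 13 4 15 6 17 */
    printf("\n");
    return 0;
}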
+ + + + + + Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
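The masked 256-bit broadcast above appears to be _mm256_mask_broadcastd_epi32; an illustrative C sketch (AVX512F/AVX512VL build assumed) that fills only the low four lanes and keeps the rest from "src":

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i src = _mm256_set1_epi32(-1);
    __m128i a   = _mm_setr_epi32(42, 0, 0, 0);               /* only the low element is broadcast */
    __m256i r   = _mm256_mask_broadcastd_epi32(src, 0x0F, a); /* mask 0x0F: lanes 0..3 get 42 */
    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 8; i++) printf("%d ", out[i]); /* expected: 42 42 42 42 -1 -1 -1 -1 */
    printf("\n");
    return 0;
}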
+ + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 32 +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[255:m] := src[255:m] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
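The masked 256-bit compress above appears to be _mm256_mask_compress_epi32; an illustrative C sketch (AVX512F/AVX512VL build assumed) that left-packs the selected lanes and passes the remaining lanes through from "src":

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i src = _mm256_set1_epi32(-1);
    __m256i a   = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    /* Mask 0x55 keeps the even-indexed lanes and packs them toward lane 0. */
    __m256i r = _mm256_mask_compress_epi32(src, 0x55, a);
    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 8; i++) printf("%d ", out[i]); /* expected: 0 2 4 6 -1 -1 -1 -1 */
    printf("\n");
    return 0;
}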
+ + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 32 +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[255:m] := 0 +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 32 +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[127:m] := src[127:m] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 32 +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[127:m] := 0 +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 64 +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[255:m] := src[255:m] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 64 +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[255:m] := 0 +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 64 +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[127:m] := src[127:m] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 64 +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[127:m] := 0 +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + dst[i+31:i] := a[id+31:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
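The unmasked cross-lane shuffle above appears to be _mm256_permutexvar_epi32; an illustrative C sketch (AVX512F/AVX512VL build assumed) reversing the eight lanes. Note that the index vector is the first argument:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i a   = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
    __m256i idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);  /* reverse the 8 lanes */
    __m256i r   = _mm256_permutexvar_epi32(idx, a);
    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 8; i++) printf("%d ", out[i]); /* expected: 17 16 15 14 13 12 11 10 */
    printf("\n");
    return 0;
}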
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
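The unmasked two-source shuffle above appears to be _mm256_permutex2var_epi32; an illustrative C sketch (AVX512F/AVX512VL build assumed). Index bit 3 selects between the two source tables, so indices 0..7 read from "a" and 8..15 read from "b":

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i a   = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i b   = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
    __m256i idx = _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11); /* interleave low halves of a and b */
    __m256i r   = _mm256_permutex2var_epi32(a, idx, b);
    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 8; i++) printf("%d ", out[i]); /* expected: 0 10 1 11 2 12 3 13 */
    printf("\n");
    return 0;
}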
+ + + + + + + Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
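The masked in-lane shuffle above appears to be _mm256_mask_permute_pd; an illustrative C sketch (AVX512F/AVX512VL build assumed) that swaps the two elements inside each 128-bit lane but keeps only the low lane's results:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256d src = _mm256_set1_pd(-1.0);
    __m256d a   = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    /* imm8 = 0b0101 swaps within each 128-bit lane; mask 0x3 keeps only elements 0 and 1,
       the upper two elements are copied from src. */
    __m256d r = _mm256_mask_permute_pd(src, 0x3, a, 0x5);
    double out[4];
    _mm256_storeu_pd(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* expected: 2 1 -1 -1 */
    return 0;
}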
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) +tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) +tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) +tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) +tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) +tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) +tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + id := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + id := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
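The unmasked cross-lane imm8 shuffle above appears to be _mm256_permutex_pd; an illustrative C sketch (AVX512F/AVX512VL build assumed) reversing the four 64-bit elements:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256d a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    /* _MM_SHUFFLE(0,1,2,3) == 0x1B: element i of dst takes element (3-i) of a,
       i.e. the four elements are reversed across the full 256-bit register. */
    __m256d r = _mm256_permutex_pd(a, _MM_SHUFFLE(0, 1, 2, 3));
    double out[4];
    _mm256_storeu_pd(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* expected: 4 3 2 1 */
    return 0;
}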
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + id := idx[i+1:i]*64 + dst[i+63:i] := a[id+63:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + dst[i+31:i] := a[id+31:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
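For the "idx"-driven forms, each destination element j simply copies a[idx[j] mod N]; only the low bits of each index are consulted (idx[i+2:i] for eight 32-bit elements). A scalar sketch of the unmasked single-precision case (illustrative names):

#include <stdint.h>

/* Scalar model of the unmasked idx-driven 32-bit permute above:
 * only the low 3 bits of each index are used (idx[i+2:i]). */
static void permutexvar8_ps(float dst[8], const int32_t idx[8], const float a[8])
{
    for (int j = 0; j < 8; j++)
        dst[j] = a[idx[j] & 0x7];
}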
+ + + + + + + Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + id := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + id := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + id := idx[i+1:i]*64 + dst[i+63:i] := a[id+63:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
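Across all of these entries, the writemask and zeromask variants differ only in the ELSE branch of the final loop: merge the corresponding element of "src" versus write zero. A scalar sketch of both behaviours for the idx-driven 64-bit integer permute (hypothetical helper names):

#include <stdint.h>

/* Merge-masking: inactive lanes keep the corresponding element of src. */
static void permutexvar4_epi64_mask(int64_t dst[4], const int64_t src[4],
                                    uint8_t k, const int64_t idx[4], const int64_t a[4])
{
    for (int j = 0; j < 4; j++)
        dst[j] = (k >> j) & 1 ? a[idx[j] & 0x3] : src[j];
}

/* Zero-masking: inactive lanes are cleared. */
static void permutexvar4_epi64_maskz(int64_t dst[4], uint8_t k,
                                     const int64_t idx[4], const int64_t a[4])
{
    for (int j = 0; j < 4; j++)
        dst[j] = (k >> j) & 1 ? a[idx[j] & 0x3] : 0;
}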
+ + + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
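The expand entries above read source elements contiguously (the "m" counter) but scatter them into whichever destination lanes have their mask bit set; inactive lanes merge from "src" or are zeroed. A scalar sketch of the eight-element 32-bit merge form (illustrative names):

#include <stdint.h>

/* Scalar model of the masked 32-bit "expand" above: consecutive elements of a
 * are placed into the destination lanes whose mask bit is set. */
static void expand8_epi32(int32_t dst[8], const int32_t src[8],
                          uint8_t k, const int32_t a[8])
{
    int m = 0;                      /* next contiguous element of a to consume */
    for (int j = 0; j < 8; j++) {
        if ((k >> j) & 1)
            dst[j] = a[m++];
        else
            dst[j] = src[j];        /* replace with 0 for the zeromask variant */
    }
}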
+ + + + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
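The 32-bit shuffle above applies the same four 2-bit selectors independently inside each 128-bit lane, then merges under the mask. A scalar sketch of the 256-bit writemask form, with lanes modeled as groups of four array elements (illustrative names):

#include <stdint.h>

/* Scalar model of the imm8-controlled 32-bit shuffle above: the same four
 * 2-bit selectors are applied within each 128-bit lane (elements 0..3 and 4..7). */
static void shuffle8_epi32_mask(int32_t dst[8], const int32_t src[8],
                                uint8_t k, const int32_t a[8], uint8_t imm8)
{
    for (int lane = 0; lane < 2; lane++) {
        for (int j = 0; j < 4; j++) {
            int e   = lane * 4 + j;                /* absolute element index */
            int sel = (imm8 >> (2 * j)) & 0x3;     /* imm8[2j+1:2j]          */
            dst[e] = (k >> e) & 1 ? a[lane * 4 + sel] : src[e];
        }
    }
}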
+ + + + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
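The "unpack high" entries interleave the upper halves of each 128-bit lane of "a" and "b"; the 64-bit variants do the same with the single high qword of each lane. A scalar sketch of the 32-bit, 256-bit-wide case with masking omitted (illustrative names):

#include <stdint.h>

/* Scalar model of the 32-bit "unpack high" interleave above, one 128-bit lane
 * (four dwords of a and four of b) at a time. */
static void unpackhi_epi32_256(int32_t dst[8], const int32_t a[8], const int32_t b[8])
{
    for (int lane = 0; lane < 2; lane++) {
        const int32_t *la = a + lane * 4, *lb = b + lane * 4;
        int32_t *ld = dst + lane * 4;
        ld[0] = la[2];  ld[1] = lb[2];   /* src1[95:64],  src2[95:64]  */
        ld[2] = la[3];  ld[3] = lb[3];   /* src1[127:96], src2[127:96] */
    }
}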
+ + + + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
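The "unpack low" entries are the mirror image, pairing the low halves of each 128-bit lane. A scalar sketch of the 64-bit, 256-bit-wide case (illustrative names, masking omitted):

#include <stdint.h>

/* Scalar model of the 64-bit "unpack low" interleave above: each 128-bit lane
 * of the result pairs the low qword of a with the low qword of b. */
static void unpacklo_epi64_256(int64_t dst[4], const int64_t a[4], const int64_t b[4])
{
    for (int lane = 0; lane < 2; lane++) {
        dst[lane * 2 + 0] = a[lane * 2];   /* src1[63:0] */
        dst[lane * 2 + 1] = b[lane * 2];   /* src2[63:0] */
    }
}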
+ + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
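All of the roundscale entries above use the same formula: keep M = imm8[7:4] fraction bits by computing 2^-M * ROUND(2^M * x), falling back to the input if the scaled intermediate overflows to infinity. A simplified scalar sketch of the double-precision case; it rounds with the current rounding mode instead of decoding the rounding control in imm8[3:0], so it is not a complete model of the immediate (names are illustrative):

#include <math.h>
#include <stdint.h>

/* Simplified scalar model of RoundScaleFP64 above: keep M = imm8[7:4] fraction
 * bits; the IsInf fall-back mirrors the pseudocode. */
static double roundscale_fp64(double x, uint8_t imm8)
{
    int M = imm8 >> 4;                             /* fraction bits to preserve */
    double t = ldexp(nearbyint(ldexp(x, M)), -M);  /* 2^-M * ROUND(2^M * x)     */
    return isinf(t) ? x : t;                       /* overflow in scaled round  */
}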
+ + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
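The scalef entries above compute, per element, a * 2^floor(b); the a[i+63:0] and a[i+31:0] slices in the per-element loops are most naturally read as the j-th element of "a", matching the element-wise "b" operand. A simplified scalar sketch of the double-precision formula that leaves the NaN propagation and MXCSR.DAZ handling in the DEFINE SCALE block to ordinary C floating-point semantics (illustrative name):

#include <math.h>

/* Simplified scalar model of SCALE above: dst = a * 2^floor(b). */
static double scalef_fp64(double a, double b)
{
    return a * pow(2.0, floor(b));    /* a * POW(2.0, FLOOR(b)) */
}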
+ + + + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". + +dst.m128[0] := a.m128[imm8[0]] +dst.m128[1] := b.m128[imm8[1]] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". + +dst.m128[0] := a.m128[imm8[0]] +dst.m128[1] := b.m128[imm8[1]] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". + +dst.m128[0] := a.m128[imm8[0]] +dst.m128[1] := b.m128[imm8[1]] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". + +dst.m128[0] := a.m128[imm8[0]] +dst.m128[1] := b.m128[imm8[1]] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
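The four x128 shuffle families above (f32x4, f64x2, i32x4, i64x2) do the same thing regardless of element type: the low 128-bit half of the result is half imm8[0] of "a" and the high half is half imm8[1] of "b". A scalar sketch of the unmasked form, shown with 32-bit elements (illustrative names):

#include <stdint.h>
#include <string.h>

/* Scalar model of the 128-bit block shuffle above (unmasked form): the low
 * half of dst is half imm8[0] of a, the high half is half imm8[1] of b. */
static void shuffle_x128(int32_t dst[8], const int32_t a[8],
                         const int32_t b[8], uint8_t imm8)
{
    memcpy(dst,     a + (imm8 & 1) * 4,        4 * sizeof *dst);
    memcpy(dst + 4, b + ((imm8 >> 1) & 1) * 4, 4 * sizeof *dst);
}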
+ + + + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
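In the 64-bit FP shuffle above, each 128-bit lane of the result takes its even element from "a" and its odd element from "b", each choosing the low or high qword of that lane according to one bit of imm8. A scalar sketch of the unmasked core of the 256-bit form (illustrative names):

#include <stdint.h>

/* Scalar model of the 64-bit FP shuffle above: per 128-bit lane, the even
 * result element comes from a and the odd one from b, each picking its low
 * or high qword according to one bit of imm8. */
static void shuffle4_pd(double dst[4], const double a[4],
                        const double b[4], uint8_t imm8)
{
    for (int lane = 0; lane < 2; lane++) {
        int bit_a = (imm8 >> (2 * lane)) & 1;      /* imm8[0], imm8[2] */
        int bit_b = (imm8 >> (2 * lane + 1)) & 1;  /* imm8[1], imm8[3] */
        dst[lane * 2 + 0] = a[lane * 2 + bit_a];
        dst[lane * 2 + 1] = b[lane * 2 + bit_b];
    }
}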
+ + + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
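The 32-bit FP shuffle above follows the familiar four-selector pattern per 128-bit lane: the low two results are SELECT4 picks from "a" (imm8[1:0], imm8[3:2]) and the high two are picks from "b" (imm8[5:4], imm8[7:6]). A scalar sketch of the unmasked core (illustrative names):

#include <stdint.h>

/* Scalar model of the 32-bit FP shuffle above: per 128-bit lane, two picks
 * from a followed by two picks from b, all controlled by the same imm8. */
static void shuffle8_ps(float dst[8], const float a[8],
                        const float b[8], uint8_t imm8)
{
    for (int lane = 0; lane < 2; lane++) {
        const float *la = a + lane * 4, *lb = b + lane * 4;
        float *ld = dst + lane * 4;
        ld[0] = la[(imm8 >> 0) & 3];
        ld[1] = la[(imm8 >> 2) & 3];
        ld[2] = lb[(imm8 >> 4) & 3];
        ld[3] = lb[(imm8 >> 6) & 3];
    }
}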
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
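The FP unpack entries reuse the same per-lane interleave as the integer forms, wrapped in the usual mask merge. A scalar sketch of the masked 64-bit "unpack high", including the writemask step (illustrative names):

#include <stdint.h>

/* Scalar model of the masked 64-bit FP "unpack high" above: build tmp per
 * 128-bit lane, then merge with src under mask k. */
static void unpackhi_pd_mask(double dst[4], const double src[4],
                             uint8_t k, const double a[4], const double b[4])
{
    double tmp[4];
    for (int lane = 0; lane < 2; lane++) {
        tmp[lane * 2 + 0] = a[lane * 2 + 1];   /* src1[127:64] */
        tmp[lane * 2 + 1] = b[lane * 2 + 1];   /* src2[127:64] */
    }
    for (int j = 0; j < 4; j++)
        dst[j] = (k >> j) & 1 ? tmp[j] : src[j];
}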
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
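The preceding "low half" entries mirror the high-half ones, taking the low qword/dwords of each 128-bit lane instead. A short sketch of the write-masked 256-bit double variant, assuming it is `_mm256_mask_unpacklo_pd` (name inferred):

```c
// Hedged sketch of a write-masked low unpack (name inferred from the entries above).
// Build with: gcc -O2 -mavx512f -mavx512vl unpacklo_pd256.c
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256d a   = _mm256_setr_pd(1, 2, 3, 4);
    __m256d b   = _mm256_setr_pd(10, 20, 30, 40);
    __m256d src = _mm256_set1_pd(-1);

    // Unmasked interleave of the low qword of each 128-bit lane: [1,10,3,30].
    // k = 0b0101 keeps elements 0 and 2; elements 1 and 3 fall back to src.
    __m256d r = _mm256_mask_unpacklo_pd(src, 0x5, a, b);

    double out[4];
    _mm256_storeu_pd(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 1 -1 3 -1
    return 0;
}
```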
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 3 + i := j*64 + k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
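The two entries above describe the imm8-driven double-precision compare into a mask register, plain and zero-masked. A small sketch, assuming the intrinsics are `_mm256_cmp_pd_mask` and `_mm256_mask_cmp_pd_mask` (names inferred); the imm8 predicate is one of the `_CMP_*` constants enumerated in the pseudocode, and the result is one bit per element in a `__mmask8`.

```c
// Hedged sketch (intrinsic names inferred from the descriptions above).
// Build with: gcc -O2 -mavx512f -mavx512vl cmp_pd_mask.c
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256d a = _mm256_setr_pd(1.0, 5.0, 3.0, 8.0);
    __m256d b = _mm256_setr_pd(2.0, 2.0, 2.0, 2.0);

    // Bit j is set where a[j] < b[j]; here only element 0 qualifies.
    __mmask8 lt = _mm256_cmp_pd_mask(a, b, _CMP_LT_OS);

    // Same comparison, but only where k1 allows it (upper two elements).
    __mmask8 k1 = 0xC;
    __mmask8 lt_masked = _mm256_mask_cmp_pd_mask(k1, a, b, _CMP_LT_OS);

    printf("lt        = 0x%x\n", (unsigned)lt);        // 0x1
    printf("lt_masked = 0x%x\n", (unsigned)lt_masked); // 0x0
    return 0;
}
```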
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*32 + k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
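The predicate table repeated in these entries distinguishes ordered (O) from unordered (U) and signaling (S) from quiet (Q) variants; the difference only matters when an operand is NaN. A sketch using the 128-bit single-precision form, assumed to be `_mm_cmp_ps_mask` (name inferred):

```c
// Hedged sketch of ordered vs. unordered predicates (name inferred).
// Build with: gcc -O2 -mavx512f -mavx512vl cmp_ps_nan.c
#include <immintrin.h>
#include <math.h>
#include <stdio.h>

int main(void) {
    __m128 a = _mm_setr_ps(1.0f, NAN, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(1.0f, 1.0f, NAN, 4.0f);

    // _CMP_EQ_OQ: ordered equal -> false whenever either operand is NaN.
    __mmask8 eq_oq = _mm_cmp_ps_mask(a, b, _CMP_EQ_OQ);
    // _CMP_UNORD_Q: true exactly where at least one operand is NaN.
    __mmask8 unord = _mm_cmp_ps_mask(a, b, _CMP_UNORD_Q);

    printf("eq_oq = 0x%x\n", (unsigned)eq_oq); // 0x9 (elements 0 and 3)
    printf("unord = 0x%x\n", (unsigned)unord); // 0x6 (elements 1 and 2)
    return 0;
}
```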
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
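The named signed 32-bit comparisons above (eq/ge/gt/le/lt/neq) are convenience forms of the imm8 version; for example, greater-than is the same test as `_MM_CMPINT_NLE` with the operands in the same order. The sketch below assumes the intrinsics are `_mm256_cmpgt_epi32_mask` and `_mm256_cmp_epi32_mask` (names inferred).

```c
// Hedged sketch: named form vs. imm8 form (names inferred from the entries above).
// Build with: gcc -O2 -mavx512f -mavx512vl cmp_epi32.c
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i a = _mm256_setr_epi32(-3, -1, 0, 1, 2, 5, 7, 9);
    __m256i b = _mm256_set1_epi32(1);

    __mmask8 gt_named = _mm256_cmpgt_epi32_mask(a, b);
    __mmask8 gt_imm8  = _mm256_cmp_epi32_mask(a, b, _MM_CMPINT_NLE);

    // Both print 0xf0: only elements 4..7 are greater than 1.
    printf("named: 0x%x  imm8: 0x%x\n", (unsigned)gt_named, (unsigned)gt_imm8);
    return 0;
}
```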
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
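One practical use of the zero-masked forms above: feeding the result of one comparison in as "k1" of the next ANDs the two predicates without a separate mask instruction. A sketch of a range check, assuming the intrinsics are `_mm256_cmpge_epi32_mask` and `_mm256_mask_cmplt_epi32_mask` (names inferred):

```c
// Hedged sketch of predicate chaining via the k1 operand (names inferred).
// Build with: gcc -O2 -mavx512f -mavx512vl range_mask.c
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i x  = _mm256_setr_epi32(-5, 0, 3, 9, 10, 11, 42, 7);
    __m256i lo = _mm256_set1_epi32(0);
    __m256i hi = _mm256_set1_epi32(10);

    // in_range[j] = (x[j] >= 0) && (x[j] < 10): the second compare only runs
    // where the first one set a bit; everywhere else k[j] is forced to 0.
    __mmask8 ge_lo    = _mm256_cmpge_epi32_mask(x, lo);
    __mmask8 in_range = _mm256_mask_cmplt_epi32_mask(ge_lo, x, hi);

    printf("in_range = 0x%x\n", (unsigned)in_range); // 0x8e (elements 1, 2, 3, 7)
    return 0;
}
```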
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
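For the four-element 64-bit forms above, only the low four bits of the result mask are meaningful (`k[MAX:4] := 0` in the pseudocode); the rest of the `__mmask8` reads back as zero. A brief sketch, assuming the named 256-bit equality form is `_mm256_cmpeq_epi64_mask` (name inferred):

```c
// Hedged sketch (name inferred from the entries above).
// Build with: gcc -O2 -mavx512f -mavx512vl cmp_epi64.c
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i a = _mm256_setr_epi64x(1, 2, 3, 4);
    __m256i b = _mm256_setr_epi64x(1, 0, 3, 0);

    __mmask8 eq = _mm256_cmpeq_epi64_mask(a, b);
    printf("eq = 0x%x\n", (unsigned)eq); // 0x5: elements 0 and 2 match
    return 0;
}
```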
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 7 + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
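A short C sketch of the two test forms above (unmasked and writemasked); the names _mm256_test_epi32_mask and _mm256_mask_test_epi32_mask are assumed from the Intel naming convention.

    #include <immintrin.h>

    /* Bit j is set when lane j of a and lane j of b share at least one set bit. */
    __mmask8 lanes_sharing_bits(__m256i a, __m256i b) {
        return _mm256_test_epi32_mask(a, b);
    }

    /* Same test, but lanes whose bit in k1 is clear always yield 0 in the result mask. */
    __mmask8 lanes_sharing_bits_masked(__mmask8 k1, __m256i a, __m256i b) {
        return _mm256_mask_test_epi32_mask(k1, a, b);
    }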
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 3 + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 3 + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 1 + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 7 + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
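For the NAND-style test above (the mask bit is set when the AND is zero), a sketch assuming the name _mm256_testn_epi32_mask.

    #include <immintrin.h>

    /* Bit j is set when lane j of a has none of the bits that are set in lane j of b. */
    __mmask8 lanes_disjoint(__m256i a, __m256i b) {
        return _mm256_testn_epi32_mask(a, b);
    }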
+ + + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 3 + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 3 + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 1 + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 64 +m := base_addr +FOR j := 0 to 3 + i := j*64 + IF k[j] + MEM[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 64 +m := base_addr +FOR j := 0 to 1 + i := j*64 + IF k[j] + MEM[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 32 +m := base_addr +FOR j := 0 to 7 + i := j*32 + IF k[j] + MEM[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 32 +m := base_addr +FOR j := 0 to 3 + i := j*32 + IF k[j] + MEM[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
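A worked C sketch of the compress-store behaviour above: active lanes are packed together at the destination rather than written to their original positions. The name _mm256_mask_compressstoreu_pd is assumed from the Intel naming convention.

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m256d a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);   /* lanes: a[0]=1.0 .. a[3]=4.0 */
        double out[4] = {0.0, 0.0, 0.0, 0.0};
        /* Mask 0b1010 selects lanes 1 and 3; they are stored contiguously to out[0], out[1]. */
        _mm256_mask_compressstoreu_pd(out, 0xA, a);
        printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);   /* 2 4 0 0 */
        return 0;
    }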
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
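The aligned masked stores above only touch the selected lanes, which makes them useful for writing a partial result in place; a sketch assuming the name _mm256_mask_store_pd and C11 alignas for the 32-byte requirement.

    #include <immintrin.h>
    #include <stdalign.h>

    double masked_aligned_store(void) {
        alignas(32) double buf[4] = {0.0, 0.0, 0.0, 0.0};   /* must be 32-byte aligned */
        __m256d a = _mm256_set1_pd(1.5);
        _mm256_mask_store_pd(buf, 0x5, a);   /* mask 0b0101: only buf[0] and buf[2] are written */
        return buf[0] + buf[1];              /* 1.5 + 0.0 */
    }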
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
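The unaligned counterparts drop the alignment requirement but keep the per-lane masking; a sketch assuming the name _mm256_mask_storeu_epi32.

    #include <immintrin.h>

    /* Write only the lanes of a selected by k into dst; the other ints in dst are untouched,
       and dst may have any alignment. */
    void store_selected_epi32(int *dst, __mmask8 k, __m256i a) {
        _mm256_mask_storeu_epi32(dst, k, a);
    }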
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 32 +m := base_addr +FOR j := 0 to 7 + i := j*32 + IF k[j] + MEM[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 32 +m := base_addr +FOR j := 0 to 3 + i := j*32 + IF k[j] + MEM[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 64 +m := base_addr +FOR j := 0 to 3 + i := j*64 + IF k[j] + MEM[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 64 +m := base_addr +FOR j := 0 to 1 + i := j*64 + IF k[j] + MEM[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
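A C sketch of the 32-bit-index scatters above; the names _mm256_i32scatter_epi32 and _mm256_mask_i32scatter_epi32 are assumed, and scale must be an immediate 1, 2, 4 or 8 (4 here, the size of an int).

    #include <immintrin.h>

    /* table must point to at least 15 ints, since the largest index used is 14. */
    void scatter_ints(int *table) {
        __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);   /* element offsets into table */
        __m256i val = _mm256_set1_epi32(-1);
        _mm256_i32scatter_epi32(table, idx, val, 4);            /* table[idx[j]] = val[j] for all j */
        _mm256_mask_i32scatter_epi32(table, 0x0F, idx, val, 4); /* only lanes 0..3 are stored */
    }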
+ + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
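The floating-point scatters follow the same pattern, with 32-bit indices driving 64-bit stores; a sketch assuming the name _mm256_mask_i32scatter_pd and scale 8 (the size of a double).

    #include <immintrin.h>

    void scatter_doubles(double *table, __mmask8 k, __m256d vals) {
        __m128i idx = _mm_setr_epi32(1, 3, 5, 7);   /* four 32-bit indices for four doubles */
        _mm256_mask_i32scatter_pd(table, k, idx, vals, 8);
    }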
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 4 packed 64-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 8 packed 32-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 2 packed 64-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 4 packed 32-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 4 packed 64-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 8 packed 32-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 2 packed 64-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 4 packed 32-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
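The whole-vector stores above differ only in their alignment contract; a sketch assuming the names _mm256_storeu_epi64 and _mm256_store_epi64 (older compiler headers may only provide the equivalent _mm256_storeu_si256/_mm256_store_si256).

    #include <immintrin.h>
    #include <stdalign.h>

    void store_whole_vector(long long *unaligned_dst) {
        __m256i v = _mm256_set1_epi64x(42);
        _mm256_storeu_epi64(unaligned_dst, v);   /* no alignment requirement */
        alignas(32) long long aligned_dst[4];
        _mm256_store_epi64(aligned_dst, v);      /* needs 32-byte alignment, else it may fault */
        (void)aligned_dst;
    }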
+ + + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF k[j] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + ELSE + dst[m+63:m] := src[m+63:m] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF k[j] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + ELSE + dst[m+63:m] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF k[j] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + ELSE + dst[m+63:m] := src[m+63:m] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF k[j] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + ELSE + dst[m+63:m] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
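The merge-masked and zero-masked conversion variants above differ only in what lands in the inactive lanes; a sketch assuming the names _mm256_mask_cvtepi32_pd and _mm256_maskz_cvtepi32_pd.

    #include <immintrin.h>

    /* Widen four 32-bit ints to doubles; inactive lanes are copied from src. */
    __m256d widen_merge(__m256d src, __mmask8 k, __m128i a) {
        return _mm256_mask_cvtepi32_pd(src, k, a);
    }

    /* Same conversion, but inactive lanes are zeroed instead. */
    __m256d widen_zero(__mmask8 k, __m128i a) {
        return _mm256_maskz_cvtepi32_pd(k, a);
    }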
+ + + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
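A sketch of the unsigned conversion above, assuming the name _mm256_cvtpd_epu32; rounding follows the current MXCSR rounding mode.

    #include <immintrin.h>

    /* Four doubles -> four unsigned 32-bit ints, packed into the low 128 bits of the result. */
    __m128i doubles_to_u32(__m256d a) {
        return _mm256_cvtpd_epu32(a);
    }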
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
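A sketch of the half-to-single conversion above, assuming the name _mm256_mask_cvtph_ps; the eight 16-bit inputs are packed in a __m128i.

    #include <immintrin.h>

    /* Expand eight half-precision values to floats; lanes with a clear mask bit come from src. */
    __m256 halves_to_floats(__m256 src, __mmask8 k, __m128i a) {
        return _mm256_mask_cvtph_ps(src, k, a);
    }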
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 7 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 7 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 7 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 7 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 3 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 3 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 3 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 3 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
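For the single-to-half conversions above the rounding is chosen by the immediate; a sketch assuming the name _mm256_mask_cvtps_ph and the _MM_FROUND_* constants.

    #include <immintrin.h>

    /* Narrow eight floats to eight half-precision values with round-to-nearest;
       lanes with a clear mask bit keep the corresponding 16-bit element of src. */
    __m128i floats_to_halves(__m128i src, __mmask8 k, __m256 a) {
        return _mm256_mask_cvtps_ph(src, k, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    }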
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
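The unmasked 256-bit form at the head of this group truncates four doubles into four unsigned 32-bit lanes of a 128-bit result; a minimal sketch, assuming the intrinsic is _mm256_cvttpd_epu32 and the same AVX512F+AVX512VL assumptions as above:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* lanes e3..e0; 4000000000.9 still fits an unsigned 32-bit lane */
    __m256d a = _mm256_set_pd(4000000000.9, 3.7, 2.2, 0.9);
    __m128i r = _mm256_cvttpd_epu32(a);        /* truncation: 0, 2, 3, 4000000000 */
    uint32_t out[4];
    _mm_storeu_si128((__m128i *)out, r);
    for (int i = 0; i < 4; i++)
        printf("lane %d: %u\n", i, (unsigned)out[i]);
    return 0;
}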
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
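A sketch of the 256-bit zeromask entry of this truncating float-to-int group, assuming the intrinsic is _mm256_maskz_cvttps_epi32 (built with -mavx512f -mavx512vl):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    __m256 a = _mm256_set_ps(8.9f, 7.8f, 6.7f, 5.6f, 4.5f, 3.4f, 2.3f, 1.2f);
    /* zeromask 0xF0: only lanes 4..7 are truncated; lanes 0..3 become 0 */
    __m256i r = _mm256_maskz_cvttps_epi32(0xF0, a);
    int32_t out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 8; i++)
        printf("lane %d: %d\n", i, (int)out[i]);
    return 0;
}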
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
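The 256-bit writemask entry of this group (single-precision to unsigned 32-bit with truncation) presumably corresponds to _mm256_mask_cvttps_epu32; a sketch under the same AVX512F+AVX512VL assumptions:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    __m256  a   = _mm256_set_ps(8.9f, 7.8f, 6.7f, 5.6f, 4.5f, 3.4f, 2.3f, 1.2f);
    __m256i src = _mm256_set1_epi32(-1);   /* 0xFFFFFFFF marker in every lane */
    /* writemask 0x0F: lanes 0..3 are truncated to 1,2,3,4; lanes 4..7 keep src */
    __m256i r = _mm256_mask_cvttps_epu32(src, 0x0F, a);
    uint32_t out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 8; i++)
        printf("lane %d: %u\n", i, (unsigned)out[i]);
    return 0;
}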
+ + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
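These unsigned-int-to-double entries differ from the signed cvtepi32_pd family only in how the 32-bit source lanes are interpreted; a sketch of the unmasked 256-bit form, assumed to be _mm256_cvtepu32_pd:

#include <immintrin.h>
#include <limits.h>
#include <stdio.h>

int main(void) {
    /* lane e3 has bit pattern 0x80000000: read as unsigned it is 2147483648 */
    __m128i a = _mm_set_epi32(INT_MIN, 3, 2, 1);
    __m256d r = _mm256_cvtepu32_pd(a);
    double out[4];
    _mm256_storeu_pd(out, r);
    for (int i = 0; i < 4; i++)
        printf("lane %d: %.1f\n", i, out[i]);   /* 1.0 2.0 3.0 2147483648.0 */
    return 0;
}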
+ + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+31:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
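Note that the "Store" variants in this group write only the active byte positions and leave the inactive ones untouched in memory: the byte offset l := 8*j is kept per lane, so the stored bytes are not compressed together. A sketch, assuming the 256-bit form is _mm256_mask_cvtepi32_storeu_epi8:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    uint8_t buf[8];
    memset(buf, 0xAA, sizeof buf);                 /* sentinel pattern */
    __m256i a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
    /* writemask 0x0F: bytes 0..3 receive 1..4, bytes 4..7 keep 0xAA */
    _mm256_mask_cvtepi32_storeu_epi8(buf, 0x0F, a);
    for (int i = 0; i < 8; i++)
        printf("byte %d: 0x%02X\n", i, (unsigned)buf[i]);
    return 0;
}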
+ + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
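These 32-to-16-bit entries truncate rather than saturate: each lane keeps only its low 16 bits. A sketch of the unmasked 256-bit form, assumed to be _mm256_cvtepi32_epi16:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    __m256i a = _mm256_set_epi32(0x12345678, 7, -6, 5, 4, -3, 2, 1);
    __m128i r = _mm256_cvtepi32_epi16(a);
    int16_t out[8];
    _mm_storeu_si128((__m128i *)out, r);
    /* lane 7 is 0x5678 = 22136: the high 16 bits 0x1234 are simply dropped */
    for (int i = 0; i < 8; i++)
        printf("lane %d: %d\n", i, (int)out[i]);
    return 0;
}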
+ + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 32*j + dst[k+31:k] := Truncate32(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Truncate32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Truncate32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 32*j + dst[k+31:k] := Truncate32(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Truncate32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Truncate32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
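Similarly, the 64-to-32-bit truncations above keep only the low 32 bits of each quadword. A sketch of the unmasked 256-bit form, assumed to be _mm256_cvtepi64_epi32:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    __m256i a = _mm256_set_epi64x(0x123456789ALL, -2, 3, -4);
    __m128i r = _mm256_cvtepi64_epi32(a);
    int32_t out[4];
    _mm_storeu_si128((__m128i *)out, r);
    /* lane 3 keeps only 0x3456789A; the small values pass through unchanged */
    for (int i = 0; i < 4; i++)
        printf("lane %d: %d\n", i, (int)out[i]);
    return 0;
}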
+ + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+63:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+31:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
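Unlike the Truncate8 entries earlier, this group clamps out-of-range values to the signed 8-bit limits. A sketch of the unmasked 256-bit form, assumed to be _mm256_cvtsepi32_epi8:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    __m256i a = _mm256_set_epi32(1000, -1000, 127, -128, 5, -5, 0, 42);
    __m128i r = _mm256_cvtsepi32_epi8(a);      /* result occupies the low 8 bytes */
    int8_t out[16];
    _mm_storeu_si128((__m128i *)out, r);
    /* 1000 saturates to 127, -1000 saturates to -128; in-range lanes are unchanged */
    for (int i = 0; i < 8; i++)
        printf("lane %d: %d\n", i, (int)out[i]);
    return 0;
}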
+ + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+63:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+63:i]) +ENDFOR +dst[MAX:16] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:16] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:16] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 32*j + dst[k+31:k] := Saturate32(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Saturate32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Saturate32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 32*j + dst[k+31:k] := Saturate32(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Saturate32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Saturate32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
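The same signed clamping applies here per 64-bit lane, to the 32-bit range. A sketch of the unmasked 256-bit form, assumed to be _mm256_cvtsepi64_epi32:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    __m256i a = _mm256_set_epi64x(9000000000LL, -9000000000LL, 7, -7);
    __m128i r = _mm256_cvtsepi64_epi32(a);
    int32_t out[4];
    _mm_storeu_si128((__m128i *)out, r);
    /* lanes 2 and 3 saturate to -2147483648 and 2147483647 respectively */
    for (int i = 0; i < 4; i++)
        printf("lane %d: %d\n", i, (int)out[i]);
    return 0;
}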
+ + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+63:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
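For the byte-to-quadword sign extensions just above, only the low 2 bytes of the source participate in the 128-bit form. A zeromask sketch, assuming the intrinsic is _mm_maskz_cvtepi8_epi64:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* only bytes e1 = -5 and e0 = 100 of "a" are used */
    __m128i a = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                             0, 0, 0, 0, 0, 0, -5, 100);
    __m128i r = _mm_maskz_cvtepi8_epi64(0x3, a);   /* both lanes active */
    int64_t out[2];
    _mm_storeu_si128((__m128i *)out, r);
    printf("lane 0: %lld\nlane 1: %lld\n", (long long)out[0], (long long)out[1]);
    return 0;
}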
+ + + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + l := j*16 + IF k[j] + dst[i+31:i] := SignExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + l := j*16 + IF k[j] + dst[i+31:i] := SignExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
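A writemask sketch for the 16-to-32-bit sign extension earlier in this group, assuming the intrinsic is _mm256_mask_cvtepi16_epi32:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    __m128i a   = _mm_set_epi16(-8, 7, -6, 5, -4, 3, -2, 1);   /* lanes e7..e0 */
    __m256i src = _mm256_set1_epi32(99);
    /* writemask 0x55: even lanes are sign-extended from "a", odd lanes keep 99 */
    __m256i r = _mm256_mask_cvtepi16_epi32(src, 0x55, a);
    int32_t out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 8; i++)
        printf("lane %d: %d\n", i, (int)out[i]);
    return 0;
}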
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+31:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
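Here the source lanes are treated as unsigned and clamped to 0..65535. A sketch of the unmasked 256-bit form, assumed to be _mm256_cvtusepi32_epi16:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    __m256i a = _mm256_set_epi32(70000, 65536, 65535, 1, 2, 3, 4, 5);
    __m128i r = _mm256_cvtusepi32_epi16(a);
    uint16_t out[8];
    _mm_storeu_si128((__m128i *)out, r);
    /* 70000 and 65536 both saturate to 65535; 65535 and smaller pass through */
    for (int i = 0; i < 8; i++)
        printf("lane %d: %u\n", i, (unsigned)out[i]);
    return 0;
}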
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+63:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+63:i]) +ENDFOR +dst[MAX:16] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:16] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:16] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 32*j + dst[k+31:k] := SaturateU32(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 32*j + dst[k+31:k] := SaturateU32(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+63:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
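The block of entries above covers the register-to-register forms of the same unsigned saturating narrowing, from 64-bit lanes down to 8-, 16- and 32-bit lanes, each in plain, writemask and zeromask flavours. A hedged example using what appears to be _mm256_maskz_cvtusepi64_epi16 (name inferred from the description; same toolchain assumptions as above):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* 4 x u64; the second value exceeds the u16 range and saturates to 0xFFFF. */
    __m256i a = _mm256_setr_epi64x(7, 100000, 42, 3);

    /* Zeromask 0b1011: lane 2 is zeroed instead of converted. */
    __m128i r = _mm256_maskz_cvtusepi64_epi16(0xB, a);

    unsigned short out[8];
    _mm_storeu_si128((__m128i *)out, r);
    /* Results occupy the low 4 x 16 bits; the upper 64 bits of r are zero. */
    printf("%hu %hu %hu %hu\n", out[0], out[1], out[2], out[3]); /* 7 65535 0 3 */
    return 0;
}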
+ + + + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
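These entries describe the masked forms of unsigned 8-bit to 32-bit zero extension. A small C illustration, assuming the 256-bit zeromask variant is _mm256_maskz_cvtepu8_epi32 (inferred name) with the same AVX512F/AVX512VL build flags:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* Bytes 0..15; only the low 8 bytes feed the 8 x 32-bit result. */
    __m128i a = _mm_setr_epi8((char)200, 1, 2, 3, 4, 5, 6, 7,
                              8, 9, 10, 11, 12, 13, 14, 15);

    /* Zeromask 0x0F: lanes 4..7 are zeroed rather than extended. */
    __m256i r = _mm256_maskz_cvtepu8_epi32(0x0F, a);

    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    printf("%d %d %d %d %d %d %d %d\n",
           out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
    /* 200 1 2 3 0 0 0 0 -- the byte 0xC8 stays 200 because the extension is unsigned. */
    return 0;
}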
+ + + + + + Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
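The remaining zero-extension entries (16-bit to 32/64-bit and 32-bit to 64-bit) follow the same pattern; the writemask forms merge with "src" instead of zeroing. A hedged sketch with what appears to be _mm256_mask_cvtepu32_epi64 (inferred name):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a   = _mm_setr_epi32(-1, 2, 3, 4);   /* -1 has the byte pattern 0xFFFFFFFF */
    __m256i src = _mm256_set1_epi64x(99);        /* fallback values for inactive lanes */

    /* Writemask 0b0011: lanes 0 and 1 get the zero-extended inputs,
       lanes 2 and 3 are copied from src. */
    __m256i r = _mm256_mask_cvtepu32_epi64(src, 0x3, a);

    long long out[4];
    _mm256_storeu_si256((__m256i *)out, r);
    printf("%lld %lld %lld %lld\n", out[0], out[1], out[2], out[3]);
    /* 4294967295 2 99 99 -- 0xFFFFFFFF zero-extends, it does not sign-extend. */
    return 0;
}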
+ + + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
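The expand-load entries above read contiguous elements from memory and scatter them into the lanes selected by the mask: only as many elements as there are set mask bits are read. A hedged C sketch, assuming the 256-bit zeromask double-precision form is _mm256_maskz_expandloadu_pd (inferred name, AVX512F/AVX512VL build assumed):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* Two set mask bits, so exactly two doubles are read from memory. */
    double buf[2] = {1.5, 2.5};

    /* Mask 0b1010: buf[0] expands into lane 1, buf[1] into lane 3,
       lanes 0 and 2 are zeroed. */
    __m256d r = _mm256_maskz_expandloadu_pd(0xA, buf);

    double out[4];
    _mm256_storeu_pd(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 0 1.5 0 2.5 */
    return 0;
}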
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
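The masked floating-point gathers above load each active lane from base_addr plus a scaled per-lane index, and keep "src" in inactive lanes. A hedged usage sketch, assuming the 256-bit 32-bit-index double-precision form is _mm256_mmask_i32gather_pd (inferred name; scale must be a compile-time constant of 1, 2, 4 or 8):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    double table[8] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0};
    __m128i idx = _mm_setr_epi32(7, 0, 3, 5);   /* one 32-bit index per result lane */
    __m256d src = _mm256_set1_pd(-1.0);         /* kept in lanes whose mask bit is 0 */

    /* Mask 0b1101: lane 1 keeps src, the others gather table[idx].
       Scale is 8 because the indices count 8-byte elements. */
    __m256d r = _mm256_mmask_i32gather_pd(src, 0xD, idx, table, 8);

    double out[4];
    _mm256_storeu_pd(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 7 -1 3 5 */
    return 0;
}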
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
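The aligned masked floating-point loads above keep element positions (unlike the expand loads) and require 32-byte alignment for the 256-bit forms and 16-byte alignment for the 128-bit forms. A minimal sketch, assuming the 256-bit zeromask form is _mm256_maskz_load_pd (inferred name):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* The aligned-load forms require 32-byte alignment for the 256-bit variants. */
    _Alignas(32) double buf[4] = {1.0, 2.0, 3.0, 4.0};

    /* Zeromask 0b0110: lanes 1 and 2 come from memory, lanes 0 and 3 are zeroed. */
    __m256d r = _mm256_maskz_load_pd(0x6, buf);

    double out[4];
    _mm256_storeu_pd(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 0 2 3 0 */
    return 0;
}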
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
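The aligned masked integer loads behave the same way; the writemask forms keep "src" in the inactive lanes. A hedged example with what appears to be _mm_mask_load_epi32 (inferred name):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    _Alignas(16) int buf[4] = {10, 20, 30, 40};
    __m128i src = _mm_set1_epi32(-7);

    /* Writemask 0b1001: lanes 0 and 3 load from memory, lanes 1 and 2 keep src. */
    __m128i r = _mm_mask_load_epi32(src, 0x9, buf);

    int out[4];
    _mm_storeu_si128((__m128i *)out, r);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 10 -7 -7 40 */
    return 0;
}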
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
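The loadu entries above are the unaligned counterparts: same lane-preserving masked behaviour, no alignment requirement on "mem_addr". A short hedged example, assuming the 128-bit zeromask double-precision form is _mm_maskz_loadu_pd (inferred name):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* No alignment requirement, so an offset into an ordinary array is fine. */
    double buf[3] = {1.25, 2.5, 5.0};

    /* Zeromask 0b01: lane 0 loads buf[1], lane 1 is zeroed. */
    __m128d r = _mm_maskz_loadu_pd(0x1, buf + 1);

    double out[2];
    _mm_storeu_pd(out, r);
    printf("%g %g\n", out[0], out[1]); /* 2.5 0 */
    return 0;
}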
+ + + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
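The integer expand loads mirror the floating-point ones shown earlier; here is the merge (writemask) flavour, assuming the 128-bit form is _mm_mask_expandloadu_epi32 (inferred name):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    int buf[2]  = {111, 222};                 /* only the active-lane count is read */
    __m128i src = _mm_setr_epi32(1, 2, 3, 4);

    /* Writemask 0b0101: buf[0] expands into lane 0, buf[1] into lane 2,
       lanes 1 and 3 are copied from src. */
    __m128i r = _mm_mask_expandloadu_epi32(src, 0x5, buf);

    int out[4];
    _mm_storeu_si128((__m128i *)out, r);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 111 2 222 4 */
    return 0;
}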
+ + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
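The masked integer gathers work exactly like the floating-point ones, with 32- or 64-bit indices selecting 32- or 64-bit elements. A hedged sketch, assuming the 256-bit form is _mm256_mmask_i32gather_epi32 (inferred name; scale must again be a compile-time constant):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    int table[8] = {0, 10, 20, 30, 40, 50, 60, 70};
    __m256i idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    __m256i src = _mm256_set1_epi32(-1);

    /* Mask 0x0F: the low four lanes gather table[idx] (scale 4 for int elements),
       the high four lanes keep src. */
    __m256i r = _mm256_mmask_i32gather_epi32(src, 0x0F, idx, table, 4);

    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    printf("%d %d %d %d %d %d %d %d\n",
           out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
    /* 70 60 50 40 -1 -1 -1 -1 */
    return 0;
}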
+ + + + Load 256-bits (composed of 4 packed 64-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 8 packed 32-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 2 packed 64-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 4 packed 32-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 4 packed 64-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 8 packed 32-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 2 packed 64-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 4 packed 32-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
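The unmasked load/loadu entries above differ only in their alignment requirement. A hedged comparison, assuming the compiler provides the _mm256_load_epi32 and _mm256_loadu_epi32 aliases (inferred names; older headers may only expose the masked forms):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    _Alignas(32) int aligned[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    int plain[9] = {9, 0, 1, 2, 3, 4, 5, 6, 7};

    __m256i a = _mm256_load_epi32(aligned);     /* requires 32-byte alignment */
    __m256i b = _mm256_loadu_epi32(plain + 1);  /* no alignment requirement */

    /* Both loads read the same eight values here. */
    __m256i eq = _mm256_cmpeq_epi32(a, b);
    printf("equal: %s\n", _mm256_movemask_epi8(eq) == -1 ? "yes" : "no");
    return 0;
}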
+ + + + + + Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
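The masked mov entries are effectively mask-driven blends: set bits select from "a", clear bits keep "src" (or zero, for the maskz forms). A small sketch, assuming the 256-bit single-precision merge form is _mm256_mask_mov_ps (inferred name):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256 src = _mm256_set1_ps(0.0f);
    __m256 a   = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8);

    /* 0xAA = 0b10101010: odd lanes come from "a", even lanes keep "src". */
    __m256 r = _mm256_mask_mov_ps(src, 0xAA, a);

    float out[8];
    _mm256_storeu_ps(out, r);
    printf("%g %g %g %g %g %g %g %g\n",
           out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
    /* 0 2 0 4 0 6 0 8 */
    return 0;
}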
+ + + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +tmp[191:128] := a[191:128] +tmp[255:192] := a[191:128] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +tmp[191:128] := a[191:128] +tmp[255:192] := a[191:128] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
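The movedup entries duplicate each even-indexed double into the odd slot above it before applying the mask. A hedged example, assuming the 256-bit zeromask form is _mm256_maskz_movedup_pd (inferred name):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256d a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);

    /* The even-indexed elements (1.0 and 3.0) are duplicated within each
       128-bit half; the zeromask 0b0111 then clears lane 3. */
    __m256d r = _mm256_maskz_movedup_pd(0x7, a);

    double out[4];
    _mm256_storeu_pd(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 1 1 3 0 */
    return 0;
}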
+ + + + + + Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
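The packed 32-bit and 64-bit integer moves follow the same merge/zero masking pattern. A usage sketch, assuming the 256-bit merge form is _mm256_mask_mov_epi32 as declared in immintrin.h:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i src = _mm256_set1_epi32(-1);
    __m256i a   = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
    /* Merge move: lanes with the mask bit set take the value from "a",
       the remaining lanes keep the value from "src". */
    __m256i r = _mm256_mask_mov_epi32(src, 0x0F, a);

    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 8; i++)
        printf("%d ", out[i]);   /* 10 11 12 13 -1 -1 -1 -1 */
    printf("\n");
    return 0;
}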
+ + + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +tmp[159:128] := a[191:160] +tmp[191:160] := a[191:160] +tmp[223:192] := a[255:224] +tmp[255:224] := a[255:224] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +tmp[159:128] := a[191:160] +tmp[191:160] := a[191:160] +tmp[223:192] := a[255:224] +tmp[255:224] := a[255:224] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +tmp[159:128] := a[159:128] +tmp[191:160] := a[159:128] +tmp[223:192] := a[223:192] +tmp[255:224] := a[223:192] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +tmp[159:128] := a[159:128] +tmp[191:160] := a[159:128] +tmp[223:192] := a[223:192] +tmp[255:224] := a[223:192] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
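The movehdup/moveldup entries duplicate the odd-indexed or even-indexed single-precision lanes before masking is applied. A small C sketch, assuming the zero-masked 256-bit forms are _mm256_maskz_movehdup_ps and _mm256_maskz_moveldup_ps:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256 a = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
    /* movehdup duplicates the odd-indexed lanes (1 1 3 3 5 5 7 7);
       moveldup duplicates the even-indexed lanes (0 0 2 2 4 4 6 6).
       With a zeromask, lanes whose mask bit is clear become 0. */
    __m256 hi = _mm256_maskz_movehdup_ps(0xFF, a);
    __m256 lo = _mm256_maskz_moveldup_ps(0x0F, a);

    float h[8], l[8];
    _mm256_storeu_ps(h, hi);
    _mm256_storeu_ps(l, lo);
    for (int i = 0; i < 8; i++) printf("%.0f ", h[i]);   /* 1 1 3 3 5 5 7 7 */
    printf("\n");
    for (int i = 0; i < 8; i++) printf("%.0f ", l[i]);   /* 0 0 2 2 0 0 0 0 */
    printf("\n");
    return 0;
}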
+ + + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] AND b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] AND b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
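A minimal C example of the merge-masked AND described above, assuming it corresponds to _mm256_mask_and_epi32 (the 128-bit and zero-masked variants differ only in width and in writing 0 to unselected lanes):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i src = _mm256_set1_epi32(99);
    __m256i a   = _mm256_set1_epi32(0x0FF0);
    __m256i b   = _mm256_set1_epi32(0x00FF);
    /* Lanes with the mask bit set receive a AND b (0x00F0);
       the other lanes are copied from src (99). */
    __m256i r = _mm256_mask_and_epi32(src, 0xAA, a, b);

    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 8; i++)
        printf("%#x ", out[i]);   /* 0x63 0xf0 0x63 0xf0 ... */
    printf("\n");
    return 0;
}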
+ + + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
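The ANDNOT forms compute (NOT a) AND b, which is convenient for clearing a fixed set of bits in every selected lane. A sketch assuming the zero-masked 256-bit form is _mm256_maskz_andnot_epi32:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i mask_bits = _mm256_set1_epi32(0x000000FF);   /* bits to clear */
    __m256i b         = _mm256_set1_epi32(0x12345678);
    /* (NOT a) AND b clears exactly the bits that are set in "a";
       lanes whose mask bit is clear are zeroed (zeromask form). */
    __m256i r = _mm256_maskz_andnot_epi32(0xFF, mask_bits, b);

    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    printf("%#x\n", out[0]);   /* 0x12345600 */
    return 0;
}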
+ + + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
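The masked OR entries (and the 64-bit AND entries just before them) follow the same shape at 64-bit granularity. A brief sketch assuming the 256-bit merge-masked form is _mm256_mask_or_epi64:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i src = _mm256_setzero_si256();
    __m256i a   = _mm256_set1_epi64x(0x00000000FFFFFFFFLL);
    __m256i b   = _mm256_set1_epi64x(0x0F0F0F0F00000000LL);
    /* Selected 64-bit lanes get a OR b; the rest come from src (here zero). */
    __m256i r = _mm256_mask_or_epi64(src, 0x5, a, b);

    long long out[4];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 4; i++)
        printf("%llx ", out[i]);   /* f0f0f0fffffffff 0 f0f0f0fffffffff 0 */
    printf("\n");
    return 0;
}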
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "a" when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 7 + i := j*32 + IF k[j] + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 7 + i := j*32 + IF k[j] + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 7 + i := j*32 + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "a" when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 3 + i := j*32 + IF k[j] + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 3 + i := j*32 + IF k[j] + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 3 + i := j*32 + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "a" when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 3 + i := j*64 + IF k[j] + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 3 + i := j*64 + IF k[j] + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 3 + i := j*64 + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "a" when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 1 + i := j*64 + IF k[j] + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 1 + i := j*64 + IF k[j] + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 1 + i := j*64 + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
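The ternary-logic entries are easiest to read as an 8-entry truth table: for each bit position, the three source bits form an index (a_bit*4 + b_bit*2 + c_bit), and the result bit is bit "index" of imm8. A C sketch, assuming the unmasked 256-bit form is _mm256_ternarylogic_epi32; the constants 0xCA (bitwise select, a ? b : c) and 0x96 (three-way XOR) are just two commonly used encodings:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i a = _mm256_set1_epi32(0x0F0F0F0F);
    __m256i b = _mm256_set1_epi32(0xFF00FF00);
    __m256i c = _mm256_set1_epi32(0x00FF00FF);

    /* 0xCA: for each bit, pick b where a is 1, c where a is 0.
       0x96: for each bit, a ^ b ^ c. */
    __m256i sel = _mm256_ternarylogic_epi32(a, b, c, 0xCA);
    __m256i x3  = _mm256_ternarylogic_epi32(a, b, c, 0x96);

    unsigned out[8];
    _mm256_storeu_si256((__m256i *)out, sel);
    printf("select  : %#010x\n", out[0]);   /* (a & b) | (~a & c) */
    _mm256_storeu_si256((__m256i *)out, x3);
    printf("3-way ^ : %#010x\n", out[0]);   /* a ^ b ^ c */
    return 0;
}

The merge- and zero-masked forms apply the same truth table first and then blend at 32- or 64-bit granularity, exactly as in the pseudocode above.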
+ + + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
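For completeness, the unmasked 256-bit and 128-bit forms behave like their AVX2 counterparts but are documented here under AVX512F+AVX512VL. A sketch assuming a toolchain recent enough to expose _mm256_xor_epi32 and _mm256_or_epi32 (older compilers may only provide the AVX2 _mm256_xor_si256 / _mm256_or_si256, which produce the same bits):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i a = _mm256_set1_epi32(0x00FF00FF);
    __m256i b = _mm256_set1_epi32(0x0F0F0F0F);

    __m256i x = _mm256_xor_epi32(a, b);   /* 0x0FF00FF0 in every lane */
    __m256i o = _mm256_or_epi32(a, b);    /* 0x0FFF0FFF in every lane */

    int out[8];
    _mm256_storeu_si256((__m256i *)out, x);
    printf("xor: %#x\n", out[0]);
    _mm256_storeu_si256((__m256i *)out, o);
    printf("or : %#x\n", out[0]);
    return 0;
}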
+ + + + + + Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
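The masked broadcast entries splat one scalar into the selected lanes only. A small C sketch, assuming the 256-bit merge form is _mm256_mask_set1_epi32:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i src = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    /* Broadcast the scalar 42 into the lanes selected by the mask,
       keeping the original src value everywhere else. */
    __m256i r = _mm256_mask_set1_epi32(src, 0xC3, 42);

    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 8; i++)
        printf("%d ", out[i]);   /* 42 42 2 3 4 5 42 42 */
    printf("\n");
    return 0;
}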
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
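Immediate rotates take the count modulo the element width, as the LEFT_ROTATE pseudocode above spells out. A sketch assuming the plain 256-bit form is _mm256_rol_epi32 (the rotate amount must be a compile-time constant):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i a = _mm256_set1_epi32(0x12345678);
    /* Rotate every 32-bit lane left by 8 bits; the count is taken mod 32. */
    __m256i r = _mm256_rol_epi32(a, 8);

    unsigned out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    printf("%#x\n", out[0]);   /* 0x34567812 */
    return 0;
}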
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
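The rolv forms take a per-lane rotate count from a second vector instead of an immediate. A sketch assuming the plain 256-bit form is _mm256_rolv_epi32:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i a      = _mm256_set1_epi32(0x12345678);
    __m256i counts = _mm256_setr_epi32(0, 4, 8, 12, 16, 20, 24, 28);
    /* Variable rotate: each lane is rotated left by its own count. */
    __m256i r = _mm256_rolv_epi32(a, counts);

    unsigned out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    printf("%#x %#x %#x\n", out[0], out[1], out[2]);   /* 0x12345678 0x23456781 0x34567812 */
    return 0;
}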
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
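Right rotates mirror the left rotates; the RIGHT_ROTATE helper in the pseudocode again reduces the count modulo the element width. A 128-bit sketch, assuming the plain form is _mm_ror_epi64:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_set1_epi64x(0x0123456789ABCDEFLL);
    /* Rotate each 64-bit lane right by 16 bits (count taken mod 64). */
    __m128i r = _mm_ror_epi64(a, 16);

    unsigned long long out[2];
    _mm_storeu_si128((__m128i *)out, r);
    printf("%llx\n", out[0]);   /* cdef0123456789ab */
    return 0;
}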
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
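The variable-count rotate entries take a per-element count from "b" and, in the masked forms, merge against "src" when a mask bit is clear. A small Rust sketch of one 32-bit lane, assuming the writemask form (helper name is illustrative):

// Illustrative model of one 32-bit lane of the variable rotate-right with a
// writemask: an unset mask bit leaves the lane equal to the "src" operand.
fn mask_rorv_lane(src: u32, a: u32, b: u32, mask_bit: bool) -> u32 {
    if mask_bit {
        a.rotate_right(b % 32)
    } else {
        src
    }
}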
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
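Unlike the rotates, the shift-left entries do not reduce the count modulo the element width: any count above 31 (for 32-bit lanes) clears the lane. A one-lane Rust sketch of that rule (name is illustrative):

// Illustrative model of one 32-bit lane of the immediate shift left: counts
// above 31 produce 0 instead of wrapping.
fn slli_lane(a: u32, imm8: u8) -> u32 {
    if imm8 > 31 { 0 } else { a << imm8 }
}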
+ + + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
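The variable-shift entries read a separate count from each lane of "count", and the zeromask forms clear a lane whenever its mask bit is unset. A one-lane Rust sketch combining both rules (name is illustrative):

// Illustrative model of one 64-bit lane of the variable shift left with a
// zeromask: either an unset mask bit or a count of 64 or more zeroes the lane.
fn maskz_sllv_lane(a: u64, count: u64, mask_bit: bool) -> u64 {
    if mask_bit && count < 64 { a << count } else { 0 }
}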
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
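For the arithmetic right shifts above, an oversized count does not clear the lane; it saturates the lane to copies of its sign bit (all ones for negative values, zero otherwise). A one-lane Rust sketch (name is illustrative):

// Illustrative model of one 64-bit lane of the arithmetic right shift:
// counts above 63 collapse the lane to its sign.
fn srai_lane(a: i64, imm8: u8) -> i64 {
    if imm8 > 63 {
        if a < 0 { -1 } else { 0 }
    } else {
        a >> imm8
    }
}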
+ + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
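The logical right shifts shift in zeros, so they clear the lane for oversized counts, in contrast to the sign-saturating arithmetic forms above. A one-lane Rust sketch (name is illustrative):

// Illustrative model of one 64-bit lane of the logical right shift: zeros are
// shifted in, and any count above 63 clears the lane.
fn srli_lane(a: u64, imm8: u8) -> u64 {
    if imm8 > 63 { 0 } else { a >> imm8 }
}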
+ + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
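The masked square-root entries follow the same merge rules as the shifts: the writemask forms fall back to "src" and the zeromask forms substitute zero. A per-lane Rust sketch of both (names are illustrative):

// Illustrative model of one double-precision lane of the masked square root.
fn mask_sqrt_lane(src: f64, a: f64, mask_bit: bool) -> f64 {
    if mask_bit { a.sqrt() } else { src }
}

fn maskz_sqrt_lane(a: f64, mask_bit: bool) -> f64 {
    if mask_bit { a.sqrt() } else { 0.0 }
}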
+ + + + + + + Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 3 + i := j*128 + a[i+127:i] := ShiftRows(a[i+127:i]) + a[i+127:i] := SubBytes(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F + VAES +
immintrin.h
+ Cryptography +
+ + + + + Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 3 + i := j*128 + a[i+127:i] := ShiftRows(a[i+127:i]) + a[i+127:i] := SubBytes(a[i+127:i]) + a[i+127:i] := MixColumns(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F + VAES +
immintrin.h
+ Cryptography +
+ + + + + Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 3 + i := j*128 + a[i+127:i] := InvShiftRows(a[i+127:i]) + a[i+127:i] := InvSubBytes(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F + VAES +
immintrin.h
+ Cryptography +
+ + + + + Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 3 + i := j*128 + a[i+127:i] := InvShiftRows(a[i+127:i]) + a[i+127:i] := InvSubBytes(a[i+127:i]) + a[i+127:i] := InvMixColumns(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F + VAES +
immintrin.h
+ Cryptography +
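Each VAES entry above applies one AES round step to four independent 128-bit states packed in a 512-bit register. A hedged usage sketch in Rust, assuming an x86_64 target with the avx512f and vaes features and a toolchain where these intrinsics are exposed (they were long nightly-only behind the stdarch_x86_avx512 feature); the wrapper name is illustrative and the intrinsic name is taken from the Intel guide, not from this data file:

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{__m512i, _mm512_aesenc_epi128};

// Apply one full AES encryption round (ShiftRows, SubBytes, MixColumns, then
// XOR with the round key) to four packed 128-bit states at once.
#[cfg(target_arch = "x86_64")]
unsafe fn aes_round_x4(state: __m512i, round_key: __m512i) -> __m512i {
    unsafe { _mm512_aesenc_epi128(state, round_key) }
}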
+ + + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
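The zero-masked 32-bit multiply above forms the full 64-bit product and keeps only its low half. A one-lane Rust sketch of that rule (name is illustrative):

// Illustrative model of one lane of the zero-masked 32-bit multiply: the
// widening 64-bit product is formed and only its low 32 bits are kept.
fn maskz_mullo_lane(a: u32, b: u32, mask_bit: bool) -> u32 {
    if mask_bit {
        ((u64::from(a) * u64::from(b)) & 0xFFFF_FFFF) as u32
    } else {
        0
    }
}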
+ + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := a[63:0] + b[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] + b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] + b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] + b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] + b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
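The scalar add entries above operate only on element 0; the remaining elements of "dst" always come from "a", and the mask applies solely to the low lane. A Rust sketch of the double-precision writemask form using a two-element array as the vector (names are illustrative; the single-precision forms are analogous with four lanes):

// Illustrative model of the masked lower-lane add: only element 0 is
// computed, and the upper element is always copied from "a".
fn mask_add_sd(src: [f64; 2], k0: bool, a: [f64; 2], b: [f64; 2]) -> [f64; 2] {
    let lo = if k0 { a[0] + b[0] } else { src[0] };
    [lo, a[1]]
}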
+ + + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := a[31:0] + b[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] + b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] + b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] + b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] + b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + dst[i+63:i] := a[i+63:i] / b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := 64*j + dst[i+63:i] := a[i+63:i] / b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
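The masked packed divides above follow the same two conventions, writemask (fall back to `src`) versus zeromask (fall back to zero). A small sketch under the same assumptions as the earlier examples (AVX512F hardware, `std::arch` intrinsics available; helper name and data invented):

```rust
// Editor's illustration only; hypothetical harness and data.
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn masked_div_pd_demo() {
    let a = _mm512_setr_pd(2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0);
    let b = _mm512_set1_pd(2.0);
    let src = _mm512_set1_pd(-1.0);
    let mut out = [0.0f64; 8];

    // Writemask: lanes whose mask bit is clear keep `src` instead of a / b.
    _mm512_storeu_pd(out.as_mut_ptr(), _mm512_mask_div_pd(src, 0b0000_1111, a, b));
    assert_eq!(out, [1.0, 2.0, 3.0, 4.0, -1.0, -1.0, -1.0, -1.0]);

    // Zeromask: lanes whose mask bit is clear are zeroed instead.
    _mm512_storeu_pd(out.as_mut_ptr(), _mm512_maskz_div_pd(0b0000_1111, a, b));
    assert_eq!(out, [1.0, 2.0, 3.0, 4.0, 0.0, 0.0, 0.0, 0.0]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { masked_div_pd_demo() };
    }
}
```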
+ + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := a[i+31:i] / b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := a[i+31:i] / b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := a[63:0] / b[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] / b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] / b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] / b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] / b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
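Several of the entries above carry [round_note], meaning the intrinsic takes an extra rounding-control operand. The sketch below shows one plausible way to pass it from Rust, assuming the current stdarch spelling with a const generic `ROUNDING` parameter (for example `_mm_div_round_sd::<{ ... }>`); if your toolchain spells this differently, adjust accordingly. The harness and values are illustrative.

```rust
// Editor's illustration only; const-generic spelling assumed per current stdarch.
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn div_round_sd_demo() {
    let a = _mm_set_pd(7.0, 1.0); // high = 7.0, low = 1.0
    let b = _mm_set_pd(99.0, 3.0);
    let mut out = [0.0f64; 2];

    // Per-instruction rounding override: round-to-nearest, exceptions suppressed.
    _mm_storeu_pd(
        out.as_mut_ptr(),
        _mm_div_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b),
    );

    // Low lane = 1.0 / 3.0 under the requested rounding; high lane copied from `a`.
    assert_eq!(out[1], 7.0);
    assert!((out[0] - 1.0 / 3.0).abs() < 1e-15);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { div_round_sd_demo() };
    }
}
```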
+ + + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := a[31:0] / b[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] / b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] / b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] / b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] / b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
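A quick zeromask FMA sketch matching the packed fused multiply-add entries above (name assumed to be `_mm512_maskz_fmadd_ps`; same hardware and toolchain assumptions as the earlier examples, invented values):

```rust
// Editor's illustration only; hypothetical harness and data.
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn maskz_fmadd_ps_demo() {
    let a = _mm512_set1_ps(2.0);
    let b = _mm512_set1_ps(3.0);
    let c = _mm512_set1_ps(1.0);
    let mut out = [0.0f32; 16];

    // Even-numbered lanes are selected and get a*b + c = 7.0; odd lanes are zeroed.
    let k: __mmask16 = 0b0101_0101_0101_0101;
    _mm512_storeu_ps(out.as_mut_ptr(), _mm512_maskz_fmadd_ps(k, a, b, c));
    for (i, v) in out.iter().enumerate() {
        assert_eq!(*v, if i % 2 == 0 { 7.0 } else { 0.0 });
    }
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { maskz_fmadd_ps_demo() };
    }
}
```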
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
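The scalar FMA entries above come in `mask` and `mask3` flavours, which differ only in where the fallback value and the upper element come from (`a` versus `c`). A hedged sketch contrasting the two, with assumed names `_mm_mask_fmadd_sd` / `_mm_mask3_fmadd_sd` and invented data:

```rust
// Editor's illustration only; hypothetical harness and data.
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn fmadd_sd_mask_variants_demo() {
    let a = _mm_set_pd(10.0, 2.0); // high = 10.0, low = 2.0
    let b = _mm_set_pd(20.0, 3.0);
    let c = _mm_set_pd(30.0, 4.0);
    let mut out = [0.0f64; 2];

    // `mask` variant: with bit 0 clear the low lane falls back to `a`,
    // and the upper lane always comes from `a`.
    _mm_storeu_pd(out.as_mut_ptr(), _mm_mask_fmadd_sd(a, 0b0, b, c));
    assert_eq!(out, [2.0, 10.0]);

    // `mask3` variant: the fallback and the upper lane come from `c` instead.
    _mm_storeu_pd(out.as_mut_ptr(), _mm_mask3_fmadd_sd(a, b, c, 0b0));
    assert_eq!(out, [4.0, 30.0]);

    // With bit 0 set, both compute the fused low lane 2.0 * 3.0 + 4.0 = 10.0.
    _mm_storeu_pd(out.as_mut_ptr(), _mm_mask_fmadd_sd(a, 0b1, b, c));
    assert_eq!(out, [10.0, 10.0]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { fmadd_sd_mask_variants_demo() };
    }
}
```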
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
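The fmaddsub entries above alternate subtract (even lanes) and add (odd lanes). A minimal sketch, assuming `_mm512_fmaddsub_pd` and the usual AVX512F prerequisites, with invented values:

```rust
// Editor's illustration only; hypothetical harness and data.
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn fmaddsub_pd_demo() {
    let a = _mm512_set1_pd(2.0);
    let b = _mm512_set1_pd(3.0);
    let c = _mm512_set1_pd(1.0);
    let mut out = [0.0f64; 8];

    // Even lanes: (a*b) - c = 5.0; odd lanes: (a*b) + c = 7.0.
    _mm512_storeu_pd(out.as_mut_ptr(), _mm512_fmaddsub_pd(a, b, c));
    assert_eq!(out, [5.0, 7.0, 5.0, 7.0, 5.0, 7.0, 5.0, 7.0]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { fmaddsub_pd_demo() };
    }
}
```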
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
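For the zeromask fused multiply-subtract entries above, a short sketch under the same assumptions (name assumed to be `_mm512_maskz_fmsub_ps`, values invented):

```rust
// Editor's illustration only; hypothetical harness and data.
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn maskz_fmsub_ps_demo() {
    let a = _mm512_set1_ps(4.0);
    let b = _mm512_set1_ps(2.0);
    let c = _mm512_set1_ps(3.0);
    let mut out = [0.0f32; 16];

    // Lower eight lanes selected: a*b - c = 5.0; upper eight lanes zeroed.
    _mm512_storeu_ps(out.as_mut_ptr(), _mm512_maskz_fmsub_ps(0x00ff, a, b, c));
    for (i, v) in out.iter().enumerate() {
        assert_eq!(*v, if i < 8 { 5.0 } else { 0.0 });
    }
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { maskz_fmsub_ps_demo() };
    }
}
```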
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
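The fnmadd entries above negate the product before adding `c`. A hedged sketch, assuming `_mm512_maskz_fnmadd_pd` and the same AVX512F setup as the earlier examples:

```rust
// Editor's illustration only; hypothetical harness and data.
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn maskz_fnmadd_pd_demo() {
    let a = _mm512_set1_pd(2.0);
    let b = _mm512_set1_pd(3.0);
    let c = _mm512_set1_pd(10.0);
    let mut out = [0.0f64; 8];

    // Selected lanes: -(a*b) + c = 4.0; all other lanes zeroed by the mask.
    _mm512_storeu_pd(out.as_mut_ptr(), _mm512_maskz_fnmadd_pd(0b0000_0011, a, b, c));
    assert_eq!(out, [4.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { maskz_fnmadd_pd_demo() };
    }
}
```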
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", subtract the lower element in "c" from the negated intermediate result, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
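The negated fused multiply-add/subtract entries above all follow the same masking pattern: compute -(a*b) +/- c per element, then either merge the inactive lanes from a source operand (writemask) or zero them (zeromask). As a rough illustration, here is a minimal C sketch against immintrin.h of the zero-masked packed FNMSUB form; it assumes an AVX512F-capable compiler and CPU (e.g. built with -mavx512f), and the values and mask are purely illustrative.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512d a = _mm512_set1_pd(2.0);
    __m512d b = _mm512_set1_pd(3.0);
    __m512d c = _mm512_set1_pd(1.0);
    __mmask8 k = 0x0F;                 /* only the low 4 lanes are active */

    /* Active lanes compute -(2.0 * 3.0) - 1.0 = -7.0; inactive lanes are zeroed. */
    __m512d r = _mm512_maskz_fnmsub_pd(k, a, b, c);

    double out[8];
    _mm512_storeu_pd(out, r);
    for (int i = 0; i < 8; i++)
        printf("lane %d = %f\n", i, out[i]);
    return 0;
}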
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] * b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] * b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] * b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] * b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := a[63:0] * b[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] * b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] * b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] * b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] * b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := a[31:0] * b[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
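The masked integer additions above differ only in how inactive lanes are handled: the writemask form copies them from "src", while the zeromask form zeroes them. A small C sketch of that difference, using the corresponding immintrin.h intrinsics (assuming an AVX512F target; the operands and mask are arbitrary):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512i src = _mm512_set1_epi64(-1);
    __m512i a   = _mm512_set1_epi64(10);
    __m512i b   = _mm512_set1_epi64(32);
    __mmask8 k  = 0xA5;                /* lanes 0, 2, 5 and 7 are active */

    __m512i merged = _mm512_mask_add_epi64(src, k, a, b);  /* inactive lanes keep -1 from src */
    __m512i zeroed = _mm512_maskz_add_epi64(k, a, b);      /* inactive lanes become 0 */

    long long m[8], z[8];
    _mm512_storeu_si512((void *)m, merged);
    _mm512_storeu_si512((void *)z, zeroed);
    for (int i = 0; i < 8; i++)
        printf("lane %d: mask=%lld maskz=%lld\n", i, m[i], z[i]);
    return 0;
}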
+ + + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+31:i] * b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] - b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] - b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] - b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] - b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := a[63:0] - b[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] - b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] - b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] - b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] - b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := a[31:0] - b[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Store 512-bits (composed of 8 packed 64-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 16 packed 32-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 16-bit mask from "a" into memory. + +MEM[mem_addr+15:mem_addr] := a[15:0] + + + AVX512F +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 64 +m := base_addr +FOR j := 0 to 7 + i := j*64 + IF k[j] + MEM[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 32 +m := base_addr +FOR j := 0 to 15 + i := j*32 + IF k[j] + MEM[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
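The compress-store entries above write only the active elements, packing them contiguously starting at "base_addr" rather than leaving them at their original lane offsets. A hedged C sketch of the single-precision form (assuming AVX512F; the buffers and mask are illustrative):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    float src[16], out[16] = {0};
    for (int i = 0; i < 16; i++) src[i] = (float)i;

    __m512 a = _mm512_loadu_ps(src);
    __mmask16 k = 0x00F0;              /* only elements 4..7 are active */

    /* The four active elements (4.0..7.0) are written contiguously to out[0..3]. */
    _mm512_mask_compressstoreu_ps(out, k, a);

    for (int i = 0; i < 16; i++)
        printf("%.0f ", out[i]);
    printf("\n");
    return 0;
}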
+ + + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits of integer data from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits of integer data from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store the lower double-precision (64-bit) floating-point element from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + MEM[mem_addr+63:mem_addr] := a[63:0] +FI + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store the lower single-precision (32-bit) floating-point element from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + MEM[mem_addr+31:mem_addr] := a[31:0] +FI + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 32 +m := base_addr +FOR j := 0 to 15 + i := j*32 + IF k[j] + MEM[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 64 +m := base_addr +FOR j := 0 to 7 + i := j*64 + IF k[j] + MEM[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
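In all of the scatter entries above, "vindex" holds per-lane indices and "scale" (1, 2, 4 or 8) converts each index into a byte offset from "base_addr". A minimal C sketch of the 32-bit-index, 64-bit-element form (assuming AVX512F; the table and index pattern are arbitrary):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    long long table[16] = {0};

    /* Element indices 0, 2, 4, ..., 14; scale 8 (= sizeof(long long)) turns them into byte offsets. */
    __m256i vindex = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
    __m512i vals   = _mm512_set1_epi64(7);

    _mm512_i32scatter_epi64(table, vindex, vals, 8);

    for (int i = 0; i < 16; i++)
        printf("table[%d] = %lld\n", i, table[i]);
    return 0;
}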
+ + + + + Multiplies elements in packed 64-bit integer vectors "a" and "b" together, storing the lower 64 bits of the result in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] * b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiplies elements in packed 64-bit integer vectors "a" and "b" together, storing the lower 64 bits of the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Load 512-bits (composed of 8 packed 64-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 16 packed 32-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 16-bit mask from memory into "k". + +k[15:0] := MEM[mem_addr+15:mem_addr] + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Load +
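The gather entries mirror the scatters: each lane loads from base_addr plus its index times "scale", and the masked forms merge inactive lanes from "src". A small C sketch of the 32-bit-index double-precision gather described in this group (assuming AVX512F; the table contents are arbitrary):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    double table[32];
    for (int i = 0; i < 32; i++) table[i] = 0.5 * i;

    /* Gather table[1], table[3], ..., table[15]; scale 8 = sizeof(double). */
    __m256i vindex = _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15);
    __m512d g = _mm512_i32gather_pd(vindex, table, 8);

    double out[8];
    _mm512_storeu_pd(out, g);
    for (int i = 0; i < 8; i++)
        printf("%f\n", out[i]);
    return 0;
}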
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits of integer data from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits of integer data from memory into "dst" using a non-temporal memory hint. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper element of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + dst[63:0] := MEM[mem_addr+63:mem_addr] +ELSE + dst[63:0] := src[63:0] +FI +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper element of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + dst[63:0] := MEM[mem_addr+63:mem_addr] +ELSE + dst[63:0] := 0 +FI +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper elements of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + dst[31:0] := MEM[mem_addr+31:mem_addr] +ELSE + dst[31:0] := src[31:0] +FI +dst[MAX:32] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper elements of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + dst[31:0] := MEM[mem_addr+31:mem_addr] +ELSE + dst[31:0] := 0 +FI +dst[MAX:32] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
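Expand-loads are the inverse of the compress-stores earlier in this section: consecutive elements are read from "mem_addr" and spread into the active lanes of "dst", with inactive lanes merged from "src" or zeroed. A hedged C sketch of the zero-masked 32-bit integer form (assuming AVX512F; the input buffer and mask are illustrative):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    int src[16], out[16];
    for (int i = 0; i < 16; i++) src[i] = 100 + i;

    __mmask16 k = 0x00F0;              /* lanes 4..7 are active */

    /* src[0..3] are spread into lanes 4..7; every other lane becomes 0. */
    __m512i v = _mm512_maskz_expandloadu_epi32(k, src);

    _mm512_storeu_si512((void *)out, v);
    for (int i = 0; i < 16; i++)
        printf("lane %d = %d\n", i, out[i]);
    return 0;
}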
+ + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
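The write-masked 64-bit/64-bit form corresponds to _mm512_mask_i64gather_epi64; masked-off lanes perform no memory access and keep the value from "src", which is the usual way to avoid touching invalid indices. A sketch with made-up data:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    long long table[8] = {0, 10, 20, 30, 40, 50, 60, 70};   /* hypothetical data */
    __m512i vindex = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
    __m512i src    = _mm512_set1_epi64(-1);
    __mmask8 k     = 0x0F;                   /* gather only into the low 4 lanes */

    /* Lanes 0..3 load table[7], table[6], table[5], table[4];
       lanes 4..7 keep -1 from src because their mask bits are clear. */
    __m512i r = _mm512_mask_i64gather_epi64(src, k, vindex, table, 8);

    long long out[8];
    _mm512_storeu_si512(out, r);
    for (int i = 0; i < 8; i++) printf("%lld ", out[i]);
    printf("\n");
    return 0;
}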
+ + + + + Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] AND b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 16-bit masks "a" and then AND with "b", and store the result in "k". + +k[15:0] := (NOT a[15:0]) AND b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + Compute the bitwise NOT of 16-bit mask "a", and store the result in "k". + +k[15:0] := NOT a[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] OR b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := NOT (a[15:0] XOR b[15:0]) +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] XOR b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
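The six 16-bit mask operations above (AND, ANDN, NOT, OR, XNOR, XOR) are the _kand_mask16 / _kandn_mask16 / _knot_mask16 / _kor_mask16 / _kxnor_mask16 / _kxor_mask16 family. They are typically used to combine the results of vector comparisons, as in this sketch (hypothetical data; the _k*_mask16 spellings need a reasonably recent compiler):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512i v = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
                                  8, 9, 10, 11, 12, 13, 14, 15);
    __mmask16 ge8  = _mm512_cmpge_epi32_mask(v, _mm512_set1_epi32(8));   /* 0xFF00 */
    __mmask16 lt12 = _mm512_cmplt_epi32_mask(v, _mm512_set1_epi32(12));  /* 0x0FFF */

    __mmask16 both    = _kand_mask16(ge8, lt12);   /* 8 <= x < 12  -> 0x0F00 */
    __mmask16 either  = _kor_mask16(ge8, lt12);    /* every lane   -> 0xFFFF */
    __mmask16 neither = _knot_mask16(either);      /* no lane      -> 0x0000 */

    printf("%04x %04x %04x\n", (unsigned)both, (unsigned)either, (unsigned)neither);
    return 0;
}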
+ + + + + Shift the bits of 16-bit mask "a" left by "count" while shifting in zeros, and store the least significant 16 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 15 + k[15:0] := a[15:0] << count[7:0] +FI + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 16-bit mask "a" right by "count" while shifting in zeros, and store the least significant 16 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 15 + k[15:0] := a[15:0] >> count[7:0] +FI + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + + Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". + +tmp[15:0] := a[15:0] OR b[15:0] +IF tmp[15:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +IF tmp[15:0] == 0xFFFF + MEM[all_ones+7:all_ones] := 1 +ELSE + MEM[all_ones+7:all_ones] := 0 +FI + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[15:0] := a[15:0] OR b[15:0] +IF tmp[15:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". + +tmp[15:0] := a[15:0] OR b[15:0] +IF tmp[15:0] == 0xFFFF + dst := 1 +ELSE + dst := 0 +FI + + + AVX512F +
immintrin.h
+ Mask +
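The three OR-and-test entries above correspond to _kortest_mask16_u8, _kortestz_mask16_u8 and _kortestc_mask16_u8: a single KORTEST sets ZF when the OR of the two masks is all zeros and CF when it is all ones, which is handy for loop-exit tests. A sketch with made-up mask values (these spellings also require a newer compiler):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __mmask16 a = 0x00FF, b = 0xFF00;        /* hypothetical masks */
    unsigned char all_ones = 0;

    /* Returns 1 only if (a | b) == 0, and writes 1 through the pointer
       only if (a | b) == 0xFFFF.  Here: is_zero == 0, all_ones == 1. */
    unsigned char is_zero = _kortest_mask16_u8(a, b, &all_ones);

    printf("is_zero=%u all_ones=%u\n", is_zero, all_ones);
    printf("kortestz=%u kortestc=%u\n",
           _kortestz_mask16_u8(a, b), _kortestc_mask16_u8(a, b));
    return 0;
}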
+ + + + Convert 16-bit mask "a" into an integer value, and store the result in "dst". + +dst := ZeroExtend32(a[15:0]) + + + AVX512F +
immintrin.h
+ Mask +
+ + + + Convert integer value "a" into a 16-bit mask, and store the result in "k". + +k := ZeroExtend16(a[15:0]) + + + AVX512F +
immintrin.h
+ Mask +
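The two conversions above are _cvtmask16_u32 and _cvtu32_mask16, used when a mask is produced or consumed by ordinary scalar bit manipulation. A tiny round-trip sketch:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __mmask16 k = _cvtu32_mask16(0xA5A5u);     /* integer -> 16-bit mask */
    unsigned int back = _cvtmask16_u32(k);     /* 16-bit mask -> integer */

    printf("0x%04x\n", back);                  /* prints 0xa5a5 */
    return 0;
}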
+ + + + + Compute the bitwise NOT of 16-bit masks "a" and then AND with "b", and store the result in "k". + +k[15:0] := (NOT a[15:0]) AND b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] AND b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + Copy 16-bit mask "a" to "k". + +k[15:0] := a[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + Compute the bitwise NOT of 16-bit mask "a", and store the result in "k". + +k[15:0] := NOT a[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] OR b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Unpack and interleave 8 bits from masks "a" and "b", and store the 16-bit result in "k". + +k[7:0] := b[7:0] +k[15:8] := a[7:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
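The unpack entry above is _mm512_kunpackb, which builds a 16-bit mask out of the low bytes of two masks; a common use is stitching together per-half results. A sketch with made-up inputs:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* Hypothetical per-half masks produced elsewhere in a kernel. */
    __mmask16 lo_half = 0x5A;    /* only bits 7:0 are meaningful */
    __mmask16 hi_half = 0xC3;

    /* k[7:0] := b[7:0], k[15:8] := a[7:0]  ->  0xC35A here. */
    __mmask16 k = _mm512_kunpackb(hi_half, lo_half);

    printf("0x%04x\n", (unsigned)k);
    return 0;
}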
+ + + + + Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := NOT (a[15:0] XOR b[15:0]) +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] XOR b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Performs bitwise OR between "k1" and "k2", storing the result in "dst". ZF flag is set if "dst" is 0. + dst[15:0] := k1[15:0] | k2[15:0] +IF dst == 0 + SetZF() +FI + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Performs bitwise OR between "k1" and "k2", storing the result in "dst". CF flag is set if "dst" consists of all 1's. + dst[15:0] := k1[15:0] | k2[15:0] +IF PopCount(dst[15:0]) == 16 + SetCF() +FI + + + AVX512F +
immintrin.h
+ Mask +
+ + + + Converts bit mask "k1" into an integer value, storing the result in "dst". + +dst := ZeroExtend32(k1) + + + AVX512F +
immintrin.h
+ Mask +
+ + + + Converts integer "mask" into a bitmask, storing the result in "dst". + +dst := mask[15:0] + + + AVX512F +
immintrin.h
+ Mask +
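These last two entries are the older _mm512_mask2int / _mm512_int2mask conversions, which do the same job as _cvtmask16_u32 / _cvtu32_mask16 shown earlier. A minimal sketch:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __mmask16 k = _mm512_int2mask(0x00F0);   /* int -> mask */
    int i = _mm512_mask2int(k);              /* mask -> int */

    printf("0x%04x\n", i);                   /* prints 0x00f0 */
    return 0;
}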
+ + + + + + + Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 32-bit elements, and stores the low 64 bytes (16 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := temp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 64 bytes (8 elements) in "dst". + +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (64*imm8[2:0]) +dst[511:0] := temp[511:0] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
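The unmasked 64-bit element alignment above is _mm512_alignr_epi64 (VALIGNQ): it treats the a:b concatenation as one 16-element sequence, with "b" supplying the low half, and extracts an 8-element window. A sketch with made-up vectors:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* Concatenated sequence, low to high: b0..b7 then a0..a7 = 0..15. */
    __m512i a = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15);
    __m512i b = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);

    /* Shift right by three 64-bit elements and keep the low 8:
       result is 3, 4, 5, 6, 7, 8, 9, 10. */
    __m512i r = _mm512_alignr_epi64(a, b, 3);

    long long out[8];
    _mm512_storeu_si512(out, r);
    for (int i = 0; i < 8; i++) printf("%lld ", out[i]);
    printf("\n");
    return 0;
}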
+ + + + + + + + Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 64 bytes (8 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (64*imm8[2:0]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := temp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 64-bit elements, and stores the low 64 bytes (8 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (64*imm8[2:0]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := temp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
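The packed fix-up above is _mm512_fixupimm_pd. Reading the pseudocode: each element of "b" is classified into one of eight cases (QNaN = 0, SNaN = 1, zero = 2, one = 3, -inf = 4, +inf = 5, negative = 6, positive = 7), the case number selects nibble j of the corresponding 64-bit table element in "c", and that nibble's value picks the replacement (0 keeps "a", 1 keeps "b", 0xA substitutes +1.0, and so on). The sketch below builds a table that passes "b" through unchanged except that zeros become +1.0; all values are hypothetical and imm8 = 0 requests none of the optional exception flags:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512d a = _mm512_set1_pd(99.0);   /* the "keep src1" alternative (response 0) */
    __m512d b = _mm512_setr_pd(-0.0, 2.5, 0.0, -3.0, 1.0, 0.0, 7.25, -8.0);

    /* One nibble per input class (nibble j for class j): every class keeps b
       (response 1) except ZERO_VALUE (j = 2), which becomes +1.0 (response 0xA). */
    __m512i c = _mm512_set1_epi64(0x11111A11LL);

    __m512d r = _mm512_fixupimm_pd(a, b, c, 0);

    double out[8];
    _mm512_storeu_pd(out, r);
    for (int i = 0; i < 8; i++) printf("%g ", out[i]);   /* 1 2.5 1 -3 1 1 7.25 -8 */
    printf("\n");
    return 0;
}

The classic production use of this table mechanism is patching the special cases (zeros, infinities, NaNs) after a reciprocal or reciprocal-square-root approximation.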
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst", and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) +dst[127:64] := b[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst", and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) +dst[127:64] := b[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +IF k[0] + dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := b[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +IF k[0] + dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := b[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +IF k[0] + dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := b[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +IF k[0] + dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := b[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst", and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +dst[127:32] := b[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst", and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +dst[127:32] := b[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +IF k[0] + dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := b[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +IF k[0] + dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := b[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +IF k[0] + dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := b[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +IF k[0] + dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := b[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
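The FIXUPIMM entries above are easier to follow as a scalar model of the token lookup: classify the source value into one of eight tokens, use the token to pick a 4-bit response out of the 32-bit table "c", and let the response select the result. The sketch below is illustrative only: the helper name `fixupimm_f32` is invented, masking, DAZ handling and the imm8-controlled #ZE/#IE flag reporting are omitted, and both QNaN responses are collapsed to a plain NaN.

```rust
// Scalar sketch of the FIXUPIMM token lookup for one f32 lane (simplified).
fn fixupimm_f32(a: f32, b: f32, c: u32) -> f32 {
    // Classify `b` into one of the eight input tokens (QNAN=0 .. POS_VALUE=7).
    let quiet_bit = b.to_bits() & (1 << 22) != 0;
    let j = if b.is_nan() {
        if quiet_bit { 0 } else { 1 }      // QNAN_TOKEN / SNAN_TOKEN
    } else if b == 0.0 {
        2                                  // ZERO_VALUE_TOKEN (either sign)
    } else if b == 1.0 {
        3                                  // ONE_VALUE_TOKEN
    } else if b == f32::NEG_INFINITY {
        4                                  // NEG_INF_TOKEN
    } else if b == f32::INFINITY {
        5                                  // POS_INF_TOKEN
    } else if b < 0.0 {
        6                                  // NEG_VALUE_TOKEN
    } else {
        7                                  // POS_VALUE_TOKEN
    };
    // The token indexes a 4-bit response inside the 32-bit table `c`.
    let response = (c >> (4 * j)) & 0xF;
    match response {
        0 => a,                            // pass through src1
        1 => b,                            // pass through (t)src2
        2 | 3 => f32::NAN,                 // QNaN(b) / QNaN indefinite, merged here
        4 => f32::NEG_INFINITY,
        5 => f32::INFINITY,
        6 => if b.is_sign_negative() { f32::NEG_INFINITY } else { f32::INFINITY },
        7 => -0.0,
        8 => 0.0,
        9 => -1.0,
        10 => 1.0,
        11 => 0.5,
        12 => 90.0,
        13 => std::f32::consts::FRAC_PI_2,
        14 => f32::MAX,
        15 => -f32::MAX,
        _ => unreachable!(),
    }
}

fn main() {
    // With c == 0 every token selects response 0, i.e. the value of `a`.
    assert_eq!(fixupimm_f32(7.0, 3.0, 0), 7.0);
}
```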
+ + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
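The ConvertExpFP64/ConvertExpFP32 operation used above amounts to reporting the unbiased exponent as a float, i.e. floor(log2(|x|)) for normal inputs. A minimal scalar sketch (the helper name `getexp_f64` is invented, and zero, denormal, NaN and infinity handling is deliberately simplified):

```rust
// Scalar sketch of ConvertExpFP64: extract the biased exponent field and
// remove the bias. Real GETEXP also normalizes denormals and propagates
// NaN/infinity; this sketch lumps zero and denormals together as -inf.
fn getexp_f64(x: f64) -> f64 {
    let biased = ((x.to_bits() >> 52) & 0x7ff) as i64;
    if biased == 0 {
        f64::NEG_INFINITY
    } else {
        (biased - 1023) as f64
    }
}

fn main() {
    assert_eq!(getexp_f64(10.0), 3.0);  // floor(log2(10)) == 3
    assert_eq!(getexp_f64(0.25), -2.0); // 0.25 == 2^-2
}
```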
+ + + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + dst[63:0] := ConvertExpFP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + dst[63:0] := ConvertExpFP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + IF k[0] + dst[63:0] := ConvertExpFP64(b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst[63:0] := ConvertExpFP64(b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + IF k[0] + dst[63:0] := ConvertExpFP64(b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst[63:0] := ConvertExpFP64(b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + dst[31:0] := ConvertExpFP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + dst[31:0] := ConvertExpFP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + IF k[0] + dst[31:0] := ConvertExpFP32(b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst[31:0] := ConvertExpFP32(b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + IF k[0] + dst[31:0] := ConvertExpFP32(b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst[31:0] := ConvertExpFP32(b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
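The writemask/zeromask wording repeated in the scalar variants above and below reduces to a per-lane select on mask bit 0: keep the computed value when the bit is set, otherwise fall back to "src" (writemask) or to zero (zeromask). A trivial scalar sketch with invented helper names:

```rust
// Writemask: lanes with a clear mask bit are copied from `src`.
fn mask_merge(k0: bool, computed: f64, src: f64) -> f64 {
    if k0 { computed } else { src }
}

// Zeromask: lanes with a clear mask bit are zeroed out.
fn mask_zero(k0: bool, computed: f64) -> f64 {
    if k0 { computed } else { 0.0 }
}

fn main() {
    assert_eq!(mask_merge(false, 42.0, 7.0), 7.0);
    assert_eq!(mask_zero(false, 42.0), 0.0);
}
```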
+ + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
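GetNormalizedMantissa rescales the significand into the interval selected by "interv", with the sign chosen by "sc". A scalar sketch for the common [1, 2) interval with the source sign preserved; the helper name is invented and only normal inputs are handled (zero, denormals and special values are ignored):

```rust
// Scalar sketch of GetNormalizedMantissa for f64, interval [1, 2), source sign:
// keep the sign and mantissa bits, force the biased exponent to 1023 so the
// magnitude lands in [1, 2).
fn getmant_1_2_f64(x: f64) -> f64 {
    let bits = x.to_bits();
    let sign = bits & (1u64 << 63);
    let mantissa = bits & ((1u64 << 52) - 1);
    f64::from_bits(sign | (1023u64 << 52) | mantissa)
}

fn main() {
    assert_eq!(getmant_1_2_f64(12.0), 1.5);    // 12 == 1.5 * 2^3
    assert_eq!(getmant_1_2_f64(-0.375), -1.5); // -0.375 == -1.5 * 2^-2
}
```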
+ + + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + IF k[0] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + IF k[0] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + IF k[0] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + IF k[0] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >> count) OR (src << (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
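The per-lane operation above is an ordinary variable rotate-right, which scalar Rust already provides; a sketch with an invented helper name:

```rust
// One lane of the variable rotate: rotate `a` right by `b` bits, count mod 32.
fn rorv_lane(a: u32, b: u32) -> u32 {
    a.rotate_right(b % 32)
}

fn main() {
    assert_eq!(rorv_lane(0x0000_00F0, 4), 0x0000_000F);
    assert_eq!(rorv_lane(0x0000_0001, 33), 0x8000_0000); // count is taken mod 32
}
```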
+ + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
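RoundScaleFP64/RoundScaleFP32 keep M = imm8[7:4] fraction bits by scaling with 2^M, rounding, and scaling back, falling back to the input when the intermediate overflows. A scalar sketch that, as a simplification, always rounds to nearest-even instead of decoding the rounding control in imm8[3:0] (the helper name is invented):

```rust
// Scalar sketch of RoundScaleFP64 with fixed nearest-even rounding.
fn roundscale_f64(x: f64, imm8: u8) -> f64 {
    let m = (imm8 >> 4) as i32;          // number of fraction bits to keep
    let scale = 2.0f64.powi(m);
    let r = (x * scale).round_ties_even() / scale;
    if r.is_infinite() { x } else { r }  // mirrors the IsInf fallback above
}

fn main() {
    // imm8 = 0x20: keep 2 fraction bits, so values round to multiples of 0.25.
    assert_eq!(roundscale_f64(1.30, 0x20), 1.25);
}
```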
+ + + + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
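For finite, non-denormal inputs the SCALE helper above reduces to a * 2^floor(b); the NaN, infinity and DAZ branches only fix up special values. A scalar sketch under that assumption (the helper name is invented, and very large exponents are not handled):

```rust
// Scalar sketch of SCALE for finite inputs: multiply `a` by 2 raised to floor(b).
fn scalef_f64(a: f64, b: f64) -> f64 {
    a * 2.0f64.powi(b.floor() as i32)
}

fn main() {
    assert_eq!(scalef_f64(3.0, 2.0), 12.0);  // 3 * 2^2
    assert_eq!(scalef_f64(3.0, -1.5), 0.75); // floor(-1.5) == -2, so 3 * 2^-2
}
```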
+ + + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +IF k[0] + dst[63:0] := SCALE(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +IF k[0] + dst[63:0] := SCALE(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +IF k[0] + dst[63:0] := SCALE(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +IF k[0] + dst[63:0] := SCALE(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +dst[63:0] := SCALE(a[63:0], b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +dst[63:0] := SCALE(a[63:0], b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[63:0] +} +IF k[0] + dst[31:0] := SCALE(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[63:0] +} +IF k[0] + dst[31:0] := SCALE(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[63:0] +} +IF k[0] + dst[31:0] := SCALE(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +IF k[0] + dst[31:0] := SCALE(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +dst[31:0] := SCALE(a[31:0], b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +dst[31:0] := SCALE(a[31:0], b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
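A minimal sketch of the scalar SCALE semantics described above (a * 2^floor(b) on the low element, upper lanes copied from "a"). The entries here do not carry intrinsic names, so the mapping to `_mm_scalef_ss` in Rust's `core::arch::x86_64` and the sample values are assumptions:

```rust
// Sketch only; assumes an x86_64 host and that the entry above corresponds to
// std::arch::x86_64::_mm_scalef_ss (AVX512F).
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx512f") {
        return; // AVX512F not available on this CPU
    }
    unsafe {
        // Low element: a[31:0] * 2^floor(b[31:0]); upper three lanes come from `a`.
        let a = _mm_set_ps(40.0, 30.0, 20.0, 3.0); // lanes (low..high): 3, 20, 30, 40
        let b = _mm_set_ps(0.0, 0.0, 0.0, 2.5);    // floor(2.5) = 2  =>  scale by 2^2
        let r = _mm_scalef_ss(a, b);
        let out: [f32; 4] = core::mem::transmute(r);
        assert_eq!(out, [12.0, 20.0, 30.0, 40.0]);
    }
}
```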
+ + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
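The three broadcast entries above repeat one 128-bit group of four floats across a 512-bit vector, optionally under a write/zero mask. A small sketch, assuming they correspond to `_mm512_broadcast_f32x4` and `_mm512_maskz_broadcast_f32x4` in `core::arch::x86_64`:

```rust
// Sketch only (x86_64, AVX512F); intrinsic names are assumed from the descriptions.
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx512f") {
        return;
    }
    unsafe {
        let block = _mm_set_ps(4.0, 3.0, 2.0, 1.0);            // lanes (low..high): 1, 2, 3, 4
        let all = _mm512_broadcast_f32x4(block);               // 1,2,3,4 repeated four times
        let half = _mm512_maskz_broadcast_f32x4(0x00ff, block); // zeromask: keep lanes 0..7 only
        let a: [f32; 16] = core::mem::transmute(all);
        let h: [f32; 16] = core::mem::transmute(half);
        assert_eq!(a[..4], a[12..]);      // same 4-element pattern in every 128-bit group
        assert_eq!(h[8..], [0.0f32; 8]);  // zeromask cleared lanes 8..15
    }
}
```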
+ + + + Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
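These entries splat the lowest scalar of a 128-bit source into every lane of the 512-bit result. A sketch under the assumption that they map to `_mm512_broadcastsd_pd` and `_mm512_broadcastss_ps`:

```rust
// Sketch only (x86_64, AVX512F); names assumed from the descriptions above.
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx512f") {
        return;
    }
    unsafe {
        let s = _mm_set_ss(2.5);              // low f32 lane = 2.5
        let v = _mm512_broadcastss_ps(s);     // every f32 lane becomes 2.5
        let out: [f32; 16] = core::mem::transmute(v);
        assert!(out.iter().all(|&x| x == 2.5));

        let d = _mm_set_sd(-1.0);             // low f64 lane = -1.0
        let w = _mm512_broadcastsd_pd(d);     // every f64 lane becomes -1.0
        let wd: [f64; 8] = core::mem::transmute(w);
        assert!(wd.iter().all(|&x| x == -1.0));
    }
}
```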
+ + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 64 +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[511:m] := src[511:m] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 64 +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[511:m] := 0 +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 32 +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[511:m] := src[511:m] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 32 +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[511:m] := 0 +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
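The compress entries above pack the mask-selected lanes contiguously at the low end of the result, and the expand entries perform the inverse scatter. A sketch assuming the single-precision forms are `_mm512_maskz_compress_ps` and `_mm512_maskz_expand_ps`:

```rust
// Sketch only (x86_64, AVX512F); intrinsic names are assumptions.
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx512f") {
        return;
    }
    unsafe {
        // Lanes 0..15 hold 0.0, 1.0, ..., 15.0.
        let v = _mm512_setr_ps(
            0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
            8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
        );
        let k: __mmask16 = 0b1010_1010_1010_1010; // select the odd lanes

        // Compress: the eight selected lanes become contiguous at the bottom, rest zeroed.
        let packed = _mm512_maskz_compress_ps(k, v);
        let p: [f32; 16] = core::mem::transmute(packed);
        assert_eq!(p[..8], [1.0f32, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0]);

        // Expand: consecutive low lanes are scattered back to the masked positions.
        let spread = _mm512_maskz_expand_ps(k, packed);
        let s: [f32; 16] = core::mem::transmute(spread);
        assert_eq!(s[1], 1.0);
        assert_eq!(s[15], 15.0);
        assert_eq!(s[0], 0.0); // unselected positions are zeroed
    }
}
```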
+ + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[1:0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +2: dst[127:0] := a[383:256] +3: dst[127:0] := a[511:384] +ESAC +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[255:0] := a[255:0] +1: dst[255:0] := a[511:256] +ESAC +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[1:0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +2: dst[127:0] := a[383:256] +3: dst[127:0] := a[511:384] +ESAC +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[255:0] := a[255:0] +1: dst[255:0] := a[511:256] +ESAC +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
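The extract entries above pull one 128-bit or 256-bit slice out of a 512-bit register, selected by an immediate; in Rust the immediate is passed as a const generic. A sketch assuming the name `_mm512_extractf32x4_ps`:

```rust
// Sketch only (x86_64, AVX512F); the name and const-generic form are assumptions.
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx512f") {
        return;
    }
    unsafe {
        let v = _mm512_setr_ps(
            0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
            8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
        );
        // imm8 = 3 selects bits 511:384, i.e. lanes 12..15.
        let hi = _mm512_extractf32x4_ps::<3>(v);
        let out: [f32; 4] = core::mem::transmute(hi);
        assert_eq!(out, [12.0, 13.0, 14.0, 15.0]);
    }
}
```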
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +2: dst[383:256] := b[127:0] +3: dst[511:384] := b[127:0] +ESAC +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE (imm8[0]) OF +0: dst[255:0] := b[255:0] +1: dst[511:256] := b[255:0] +ESAC +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +2: dst[383:256] := b[127:0] +3: dst[511:384] := b[127:0] +ESAC +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE (imm8[0]) OF +0: dst[255:0] := b[255:0] +1: dst[511:256] := b[255:0] +ESAC +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
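The insert entries are the inverse operation: copy "a", then overwrite one 128-bit or 256-bit slot with "b" at the position named by the immediate. A sketch assuming the name `_mm512_insertf32x4`:

```rust
// Sketch only (x86_64, AVX512F); the name and const-generic form are assumptions.
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx512f") {
        return;
    }
    unsafe {
        let a = _mm512_set1_ps(1.0);
        let b = _mm_set_ps(9.0, 8.0, 7.0, 6.0);  // lanes (low..high): 6, 7, 8, 9
        // imm8 = 1 replaces bits 255:128, i.e. lanes 4..7 of `a`.
        let r = _mm512_insertf32x4::<1>(a, b);
        let out: [f32; 16] = core::mem::transmute(r);
        assert_eq!(out[4..8], [6.0f32, 7.0, 8.0, 9.0]);
        assert_eq!(out[0], 1.0); // every other lane still comes from `a`
    }
}
```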
+ + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
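These entries broadcast the low packed 32-bit or 64-bit integer of a 128-bit source to every lane of the 512-bit result. A sketch assuming `_mm512_broadcastd_epi32` and its zero-masked variant:

```rust
// Sketch only (x86_64, AVX512F); intrinsic names are assumptions.
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx512f") {
        return;
    }
    unsafe {
        let s = _mm_set1_epi32(7);                         // low 32-bit lane = 7
        let v = _mm512_broadcastd_epi32(s);                // all 16 lanes = 7
        let m = _mm512_maskz_broadcastd_epi32(0x000f, s);  // zeromask: keep lanes 0..3 only
        let vv: [i32; 16] = core::mem::transmute(v);
        let mm: [i32; 16] = core::mem::transmute(m);
        assert!(vv.iter().all(|&x| x == 7));
        assert_eq!(mm[..4], [7, 7, 7, 7]);
        assert_eq!(mm[4..], [0; 12]);
    }
}
```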
+ + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 32 +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[511:m] := src[511:m] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 32 +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[511:m] := 0 +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 64 +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[511:m] := src[511:m] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 64 +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[511:m] := 0 +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + dst[i+31:i] := a[id+31:id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
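The entries above describe full-width variable permutes: permutexvar picks each destination lane from "a" via an index vector, and permutex2var additionally uses bit 4 of each index to choose between "a" and "b". A sketch assuming the names `_mm512_permutexvar_epi32` and `_mm512_permutex2var_epi32`:

```rust
// Sketch only (x86_64, AVX512F); intrinsic names are assumptions.
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx512f") {
        return;
    }
    unsafe {
        let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = _mm512_set1_epi32(100);

        // Reverse `a`: result lane j takes a[15 - j].
        let rev = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        let r: [i32; 16] = core::mem::transmute(_mm512_permutexvar_epi32(rev, a));
        assert_eq!(r[0], 15);
        assert_eq!(r[15], 0);

        // Two-source form: index bit 4 selects `b`, so index 16 means "b[0]".
        let idx = _mm512_setr_epi32(0, 16, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7, 16);
        let t: [i32; 16] = core::mem::transmute(_mm512_permutex2var_epi32(a, idx, b));
        assert_eq!(t[..4], [0, 100, 1, 100]); // interleave of `a` with b[0]
    }
}
```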
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set) + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI +IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]; FI +IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]; FI +IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]; FI +IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]; FI +IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]; FI +IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]; FI +IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]; FI +IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]; FI +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI +IF (b[257] == 0) tmp_dst[319:256] := a[319:256]; FI +IF (b[257] == 1) tmp_dst[319:256] := a[383:320]; FI +IF (b[321] == 0) tmp_dst[383:320] := a[319:256]; FI +IF (b[321] == 1) tmp_dst[383:320] := a[383:320]; FI +IF (b[385] == 0) tmp_dst[447:384] := a[447:384]; FI +IF (b[385] == 1) tmp_dst[447:384] := a[511:448]; FI +IF (b[449] == 0) tmp_dst[511:448] := a[447:384]; FI +IF (b[449] == 1) tmp_dst[511:448] := a[511:448]; FI +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI +IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]; FI +IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]; FI +IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]; FI +IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]; FI +IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]; FI +IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]; FI +IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]; FI +IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]; FI +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI +IF (b[257] == 0) tmp_dst[319:256] := a[319:256]; FI +IF (b[257] == 1) tmp_dst[319:256] := a[383:320]; FI +IF (b[321] == 0) tmp_dst[383:320] := a[319:256]; FI +IF (b[321] == 1) tmp_dst[383:320] := a[383:320]; FI +IF (b[385] == 0) tmp_dst[447:384] := a[447:384]; FI +IF (b[385] == 1) tmp_dst[447:384] := a[511:448]; FI +IF (b[449] == 0) tmp_dst[511:448] := a[447:384]; FI +IF (b[449] == 1) tmp_dst[511:448] := a[511:448]; FI +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) dst[255:192] := a[255:192]; FI +IF (imm8[4] == 0) dst[319:256] := a[319:256]; FI +IF (imm8[4] == 1) dst[319:256] := a[383:320]; FI +IF (imm8[5] == 0) dst[383:320] := a[319:256]; FI +IF (imm8[5] == 1) dst[383:320] := a[383:320]; FI +IF (imm8[6] == 0) dst[447:384] := a[447:384]; FI +IF (imm8[6] == 1) dst[447:384] := a[511:448]; FI +IF (imm8[7] == 0) dst[511:448] := a[447:384]; FI +IF (imm8[7] == 1) dst[511:448] := a[511:448]; FI +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". + +IF (b[1] == 0) dst[63:0] := a[63:0]; FI +IF (b[1] == 1) dst[63:0] := a[127:64]; FI +IF (b[65] == 0) dst[127:64] := a[63:0]; FI +IF (b[65] == 1) dst[127:64] := a[127:64]; FI +IF (b[129] == 0) dst[191:128] := a[191:128]; FI +IF (b[129] == 1) dst[191:128] := a[255:192]; FI +IF (b[193] == 0) dst[255:192] := a[191:128]; FI +IF (b[193] == 1) dst[255:192] := a[255:192]; FI +IF (b[257] == 0) dst[319:256] := a[319:256]; FI +IF (b[257] == 1) dst[319:256] := a[383:320]; FI +IF (b[321] == 0) dst[383:320] := a[319:256]; FI +IF (b[321] == 1) dst[383:320] := a[383:320]; FI +IF (b[385] == 0) dst[447:384] := a[447:384]; FI +IF (b[385] == 1) dst[447:384] := a[511:448]; FI +IF (b[449] == 0) dst[511:448] := a[447:384]; FI +IF (b[449] == 1) dst[511:448] := a[511:448]; FI +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) +tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) +tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) +tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) +tmp_dst[287:256] := SELECT4(a[383:256], b[257:256]) +tmp_dst[319:288] := SELECT4(a[383:256], b[289:288]) +tmp_dst[351:320] := SELECT4(a[383:256], b[321:320]) +tmp_dst[383:352] := SELECT4(a[383:256], b[353:352]) +tmp_dst[415:384] := SELECT4(a[511:384], b[385:384]) +tmp_dst[447:416] := SELECT4(a[511:384], b[417:416]) +tmp_dst[479:448] := SELECT4(a[511:384], b[449:448]) +tmp_dst[511:480] := SELECT4(a[511:384], b[481:480]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) +tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) +tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) +tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) +tmp_dst[287:256] := SELECT4(a[383:256], b[257:256]) +tmp_dst[319:288] := SELECT4(a[383:256], b[289:288]) +tmp_dst[351:320] := SELECT4(a[383:256], b[321:320]) +tmp_dst[383:352] := SELECT4(a[383:256], b[353:352]) +tmp_dst[415:384] := SELECT4(a[511:384], b[385:384]) +tmp_dst[447:416] := SELECT4(a[511:384], b[417:416]) +tmp_dst[479:448] := SELECT4(a[511:384], b[449:448]) +tmp_dst[511:480] := SELECT4(a[511:384], b[481:480]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], b[1:0]) +dst[63:32] := SELECT4(a[127:0], b[33:32]) +dst[95:64] := SELECT4(a[127:0], b[65:64]) +dst[127:96] := SELECT4(a[127:0], b[97:96]) +dst[159:128] := SELECT4(a[255:128], b[129:128]) +dst[191:160] := SELECT4(a[255:128], b[161:160]) +dst[223:192] := SELECT4(a[255:128], b[193:192]) +dst[255:224] := SELECT4(a[255:128], b[225:224]) +dst[287:256] := SELECT4(a[383:256], b[257:256]) +dst[319:288] := SELECT4(a[383:256], b[289:288]) +dst[351:320] := SELECT4(a[383:256], b[321:320]) +dst[383:352] := SELECT4(a[383:256], b[353:352]) +dst[415:384] := SELECT4(a[511:384], b[385:384]) +dst[447:416] := SELECT4(a[511:384], b[417:416]) +dst[479:448] := SELECT4(a[511:384], b[449:448]) +dst[511:480] := SELECT4(a[511:384], b[481:480]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
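The permute entries above shuffle within each 128-bit lane, using either an 8-bit immediate (the same SELECT4 pattern in every lane) or a per-lane control vector. A sketch assuming the immediate form is exposed as `_mm512_permute_ps` with a const generic:

```rust
// Sketch only (x86_64, AVX512F); the name and const-generic form are assumptions.
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx512f") {
        return;
    }
    unsafe {
        let v = _mm512_setr_ps(
            0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
            8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
        );
        // imm8 = 0b00_01_10_11 selects elements 3, 2, 1, 0 of each 128-bit lane,
        // i.e. it reverses every group of four (SELECT4 runs once per lane).
        let r = _mm512_permute_ps::<0b00_01_10_11>(v);
        let out: [f32; 16] = core::mem::transmute(r);
        assert_eq!(out[..4], [3.0f32, 2.0, 1.0, 0.0]);
        assert_eq!(out[12..], [15.0f32, 14.0, 13.0, 12.0]);
    }
}
```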
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + id := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + id := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + id := idx[i+2:i]*64 + dst[i+63:i] := a[id+63:id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
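These entries permute 64-bit elements across the whole register, either with an immediate applied per 256-bit half or with a full index vector. A sketch assuming the index-vector form is `_mm512_permutexvar_pd`:

```rust
// Sketch only (x86_64, AVX512F); intrinsic names are assumptions.
use std::arch::x86_64::*;

fn main() {
    if !is_x86_feature_detected!("avx512f") {
        return;
    }
    unsafe {
        let a = _mm512_setr_pd(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        // Each index (3 bits per lane) picks one of the eight source elements.
        let idx = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
        let r: [f64; 8] = core::mem::transmute(_mm512_permutexvar_pd(idx, a));
        assert_eq!(r, [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0]);
    }
}
```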
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx". + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + dst[i+31:i] := a[id+31:id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + id := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + id := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + id := idx[i+2:i]*64 + dst[i+63:i] := a[id+63:id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
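The expand operation above is the inverse of a left-packing compress: the lowest popcount(k) elements of "a" are written, in order, to the destination positions whose mask bit is set, while the remaining positions keep "src" (or are zeroed in the maskz form described in the next entry). A usage sketch, assuming the immintrin.h name `_mm512_mask_expand_epi32`:

```c
#include <immintrin.h>

/* Scatter the eight lowest elements of packed_vals into the odd positions of
 * the result; even positions keep the corresponding element of src. */
__m512i expand_into_odd_slots(__m512i src, __m512i packed_vals) {
    const __mmask16 k = 0xAAAA;   /* bits 1, 3, 5, ..., 15 set */
    return _mm512_mask_expand_epi32(src, k, packed_vals);
}
```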
+ + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
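The INTERLEAVE_HIGH_DWORDS helper above acts on each 128-bit lane independently: elements 2 and 3 of "a" and "b" are interleaved as {a2, b2, a3, b3}. A scalar reference model of the full 512-bit operation described in this entry:

```c
#include <stdint.h>

/* Reference for the 512-bit high interleave: four independent 128-bit lanes,
 * each producing {a[2], b[2], a[3], b[3]} from that lane's elements. */
static void unpackhi_epi32_ref(const uint32_t a[16], const uint32_t b[16],
                               uint32_t dst[16]) {
    for (int lane = 0; lane < 4; lane++) {
        const uint32_t *pa = a + 4 * lane, *pb = b + 4 * lane;
        uint32_t *d = dst + 4 * lane;
        d[0] = pa[2]; d[1] = pb[2]; d[2] = pa[3]; d[3] = pb[3];
    }
}
```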
+ + + + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
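Here imm8 works at 128-bit granularity: the two low lanes of "dst" are chosen from "a" and the two high lanes from "b", each by a 2-bit field. A sketch assuming the immintrin.h name `_mm512_shuffle_f32x4` (the immediate must be a compile-time constant):

```c
#include <immintrin.h>

/* imm8 = 0x4E (binary 01 00 11 10, read as four 2-bit fields from high to
 * low): dst = { a.lane2, a.lane3, b.lane0, b.lane1 }, i.e. the upper 256 bits
 * of a concatenated with the lower 256 bits of b. */
__m512 upper_a_lower_b(__m512 a, __m512 b) {
    return _mm512_shuffle_f32x4(a, b, 0x4E);
}
```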
+ + + + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] +tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] +tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] +tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] +tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] +tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] +tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst". + +dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] +dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] +dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] +dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
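The unmasked form above reduces to one imm8 bit per destination element: within each 128-bit lane, the even slot takes the low or high double of "a" and the odd slot the low or high double of "b". A scalar model of that pattern:

```c
/* Scalar model of the 512-bit shuffle_pd selection: for each pair of slots
 * (j, j+1), bit j of imm8 selects a[j] or a[j+1] and bit j+1 selects
 * b[j] or b[j+1]. */
static void shuffle_pd_ref(const double a[8], const double b[8], int imm8,
                           double dst[8]) {
    for (int j = 0; j < 8; j += 2) {
        dst[j]     = a[j + ((imm8 >> j) & 1)];
        dst[j + 1] = b[j + ((imm8 >> (j + 1)) & 1)];
    }
}
```

With imm8 = 0xFF this reproduces the double-precision high-interleave entries later in this section, and imm8 = 0x00 the low-interleave ones.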
+ + + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +dst[351:320] := SELECT4(b[383:256], imm8[5:4]) +dst[383:352] := SELECT4(b[383:256], imm8[7:6]) +dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +dst[479:448] := SELECT4(b[511:384], imm8[5:4]) +dst[511:480] := SELECT4(b[511:384], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
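The imm8 table above is the standard 5-bit _CMP_* predicate encoding shared with the AVX compare intrinsics, so the named constants from immintrin.h can be used directly. A sketch assuming the name `_mm_cmp_sd_mask` for this non-rounding form (the predicate must be a compile-time constant):

```c
#include <immintrin.h>

/* Returns 1 when the lower double of a is less than the lower double of b
 * (ordered, non-signalling); bit 0 of the returned mask holds the result. */
int lower_lt(__m128d a, __m128d b) {
    __mmask8 k = _mm_cmp_sd_mask(a, b, _CMP_LT_OQ);
    return k & 1;
}
```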
+ + + + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +RETURN ( a[63:0] OP b[63:0] ) ? 1 : 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +RETURN ( a[31:0] OP b[31:0] ) ? 1 : 0 + + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
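The result of a packed compare is a plain 16-bit mask, one bit per element, so it can be consumed with ordinary integer code. A sketch assuming the name `_mm512_cmplt_epi32_mask`; the popcount builtin is a GCC/Clang extension used here only for brevity:

```c
#include <immintrin.h>

/* Count how many of the 16 signed 32-bit lanes satisfy a < b. */
int count_lt(__m512i a, __m512i b) {
    __mmask16 k = _mm512_cmplt_epi32_mask(a, b);
    return __builtin_popcount((unsigned)k);   /* GCC/Clang builtin */
}
```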
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
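The 3-bit predicate table above maps to the _MM_CMPINT_* constants, so this single entry subsumes the dedicated eq/ge/gt/le/lt/neq entries that follow. A sketch assuming the name `_mm512_cmp_epi64_mask`:

```c
#include <immintrin.h>

/* Same mask as the dedicated less-than-or-equal compare, expressed through
 * the generic predicate form. */
__mmask8 le_mask(__m512i a, __m512i b) {
    return _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LE);
}
```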
+ + + + + Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
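For readers mapping these compare records onto code, a minimal Rust sketch follows (Rust because this data lives in the stdarch crate). It assumes the records correspond to the usual `_mm512_cmplt_epi64_mask` / `_mm512_mask_cmpeq_epi64_mask` intrinsics in `core::arch::x86_64` (the records above do not spell out intrinsic names) and that the toolchain exposes the AVX-512 intrinsics (historically nightly-only behind the `stdarch_x86_avx512` feature).

use std::arch::x86_64::*;

// Sketch only: assumes the masked 64-bit compares above map to these intrinsic names.
#[target_feature(enable = "avx512f")]
unsafe fn compare_demo() -> (__mmask8, __mmask8) {
    let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
    let b = _mm512_set1_epi64(3);
    // Full compare: one result bit per 64-bit lane.
    let lt = _mm512_cmplt_epi64_mask(a, b); // 0b0000_0111
    // Masked compare: lanes whose k1 bit is clear come out as 0 in the result mask.
    let k1: __mmask8 = 0b0000_1111;
    let eq = _mm512_mask_cmpeq_epi64_mask(k1, a, b); // 0b0000_1000
    (lt, eq)
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        println!("{:?}", unsafe { compare_demo() });
    }
}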
+ + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + m := j*64 + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + ELSE + dst[m+63:m] := src[m+63:m] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + ELSE + dst[m+63:m] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
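The three records above (plain, writemask, zeromask) follow the pattern used throughout this file. A short Rust sketch of that pattern, assuming the conventional `_mm512_cvtepi32_pd` / `_mm512_mask_cvtepi32_pd` / `_mm512_maskz_cvtepi32_pd` names (not stated in the records) and an AVX-512-capable toolchain:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn widen_i32_to_f64() -> [f64; 8] {
    let a = _mm256_setr_epi32(-4, -3, -2, -1, 0, 1, 2, 3);
    let src = _mm512_set1_pd(99.0);
    let k: __mmask8 = 0b1010_1010;
    let _full = _mm512_cvtepi32_pd(a);               // every lane converted
    let merged = _mm512_mask_cvtepi32_pd(src, k, a); // inactive lanes keep 99.0
    let _zeroed = _mm512_maskz_cvtepi32_pd(k, a);    // inactive lanes become 0.0
    let mut out = [0.0f64; 8];
    _mm512_storeu_pd(out.as_mut_ptr(), merged);
    out // [99.0, -3.0, 99.0, -1.0, 99.0, 1.0, 99.0, 3.0]
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        println!("{:?}", unsafe { widen_i32_to_f64() });
    }
}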
+ + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
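Several of the records above carry a [round_note], meaning the intrinsic takes an explicit rounding-mode immediate instead of relying on MXCSR. A hedged Rust sketch of the difference, assuming the usual `_mm512_cvt_roundepi32_ps` name with a const rounding parameter (the record itself does not name the intrinsic):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn explicit_rounding() -> (f32, f32) {
    // 2^24 + 3 is not exactly representable in f32, so the rounding mode matters.
    let a = _mm512_set1_epi32(16_777_219);
    let nearest = _mm512_cvtepi32_ps(a); // uses the current MXCSR mode (nearest-even by default)
    let toward_zero =
        _mm512_cvt_roundepi32_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
    let n: [f32; 16] = core::mem::transmute(nearest);
    let z: [f32; 16] = core::mem::transmute(toward_zero);
    (n[0], z[0]) // (16777220.0, 16777218.0)
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        println!("{:?}", unsafe { explicit_rounding() });
    }
}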
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
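These records describe the narrowing double → 32-bit integer conversion: eight 64-bit source lanes produce a 256-bit result. A small Rust sketch, assuming the conventional `_mm512_cvtpd_epi32` name:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn narrow_f64_to_i32() -> [i32; 8] {
    let a = _mm512_set1_pd(2.5);
    // Eight f64 lanes narrow into eight i32 lanes of a 256-bit vector;
    // 2.5 rounds to 2 under the default round-to-nearest-even mode.
    let ints = _mm512_cvtpd_epi32(a);
    let mut out = [0i32; 8];
    _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, ints);
    out // [2, 2, 2, 2, 2, 2, 2, 2]
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        println!("{:?}", unsafe { narrow_f64_to_i32() });
    }
}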
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". [sae_note] + +FOR j := 0 to 15 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
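The writemask/zeromask pairs above differ only in what happens to inactive lanes: copied from "src" versus zeroed. A Rust sketch of that difference, assuming the usual `_mm512_mask_cvtps_epi32` / `_mm512_maskz_cvtps_epi32` names:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn merge_vs_zero() -> (i32, i32) {
    let a = _mm512_set1_ps(7.6);
    let src = _mm512_set1_epi32(-1);
    let k: __mmask16 = 0b0000_0000_0000_0001;        // only lane 0 is active
    let merged = _mm512_mask_cvtps_epi32(src, k, a); // inactive lanes keep -1
    let zeroed = _mm512_maskz_cvtps_epi32(k, a);     // inactive lanes become 0
    let m: [i32; 16] = core::mem::transmute(merged);
    let z: [i32; 16] = core::mem::transmute(zeroed);
    (m[1], z[1]) // (-1, 0); lane 0 is 8 in both results
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        println!("{:?}", unsafe { merge_vs_zero() });
    }
}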
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
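Widening f32 → f64 is exact, so round-tripping through the narrowing conversion described earlier reproduces the original values. A Rust sketch, assuming the conventional `_mm512_cvtps_pd` / `_mm512_cvtpd_ps` names:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn widen_then_narrow() -> [f32; 8] {
    let a = _mm256_set1_ps(0.1);      // 0.1f32 already carries f32 rounding
    let wide = _mm512_cvtps_pd(a);    // 8 x f32 -> 8 x f64, lossless
    let back = _mm512_cvtpd_ps(wide); // 8 x f64 -> 8 x f32, rounds again
    let mut out = [0.0f32; 8];
    _mm256_storeu_ps(out.as_mut_ptr(), back);
    out // identical to the inputs
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        println!("{:?}", unsafe { widen_then_narrow() });
    }
}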
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". [round2_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". [round2_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round2_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round2_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round2_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round2_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
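The half-precision records above (FP32 → FP16 here, FP16 → FP32 earlier) can be exercised together as a round trip. A sketch under stated assumptions: that the records map to the usual `_mm512_cvtps_ph` (taking a const rounding/SAE immediate, per the [round2_note]) and `_mm512_cvtph_ps` names, and that the const-parameter form shown compiles on the toolchain in use.

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn f16_round_trip() -> f32 {
    let a = _mm512_set1_ps(3.14159);
    // 16 x f32 pack into 16 x f16 held in a __m256i, then expand back;
    // only ~11 bits of significand survive the trip.
    let half = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
    let back = _mm512_cvtph_ps(half);
    let out: [f32; 16] = core::mem::transmute(back);
    out[0] // ~3.1406, not 3.14159
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        println!("{}", unsafe { f16_round_trip() });
    }
}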
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + [round_note] + +dst[31:0] := Convert_FP64_To_Int32(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + [round_note] + +dst[63:0] := Convert_FP64_To_Int64(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + [round_note] + +dst[31:0] := Convert_FP64_To_Int32(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + [round_note] + +dst[63:0] := Convert_FP64_To_Int64(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + +dst[31:0] := Convert_FP64_To_Int32(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_Int64(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_FP64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := Convert_FP64_To_FP32(b[63:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := Convert_FP64_To_FP32(b[63:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := Convert_FP64_To_FP32(b[63:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := Convert_FP64_To_FP32(b[63:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". + [round_note] + +dst[31:0] := Convert_FP64_To_UInt32(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". + [round_note] + +dst[63:0] := Convert_FP64_To_UInt64(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". + +dst[31:0] := Convert_FP64_To_UInt32(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_UInt64(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
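The scalar records above come in signed and unsigned flavours; the unsigned forms cover double values that do not fit in a signed 32-bit integer. A Rust sketch, using `_mm_cvtsd_si32` (SSE2) for the signed case and assuming the AVX-512 `_mm_cvtsd_u32` name for the unsigned one:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn scalar_f64_to_int() -> (i32, u32) {
    let a = _mm_set_sd(4_000_000_000.25); // lower f64 lane of a __m128d
    // Out of range for i32: the hardware returns the "integer indefinite" value.
    let signed = _mm_cvtsd_si32(a);  // i32::MIN
    // In range for u32: rounds to nearest and fits.
    let unsigned = _mm_cvtsd_u32(a); // 4_000_000_000
    (signed, unsigned)
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        println!("{:?}", unsafe { scalar_f64_to_int() });
    }
}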
+ + + + + + Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the signed 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int32_To_FP64(b[31:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [sae_note] + +dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [sae_note] + +IF k[0] + dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [sae_note] + +IF k[0] + dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + [round_note] + +dst[31:0] := Convert_FP32_To_Int32(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + [round_note] + +dst[63:0] := Convert_FP32_To_Int64(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + [round_note] + +dst[31:0] := Convert_FP32_To_Int32(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + [round_note] + +dst[63:0] := Convert_FP32_To_Int64(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_Int32(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_Int64(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". + [round_note] + +dst[31:0] := Convert_FP32_To_UInt32(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". + [round_note] + +dst[63:0] := Convert_FP32_To_UInt64(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_UInt32(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_UInt64(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
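The cvtt* records above differ from the earlier cvt* records only in using truncation (round toward zero) instead of the current or selected rounding mode. A Rust sketch of the difference, assuming the conventional `_mm512_cvtps_epi32` / `_mm512_cvttps_epi32` names:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn round_vs_truncate() -> (i32, i32) {
    let a = _mm512_set1_ps(-2.7);
    let rounded = _mm512_cvtps_epi32(a);    // nearest-even by default: -3
    let truncated = _mm512_cvttps_epi32(a); // toward zero: -2
    let r: [i32; 16] = core::mem::transmute(rounded);
    let t: [i32; 16] = core::mem::transmute(truncated);
    (r[0], t[0]) // (-3, -2)
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        println!("{:?}", unsafe { round_vs_truncate() });
    }
}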
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP64_To_UInt32_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP64_To_UInt64_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP64_To_UInt32_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_UInt64_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP32_To_UInt32_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP32_To_UInt64_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_UInt32_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_UInt64_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
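These three entries describe the packed unsigned 32-bit to double conversion in its plain, writemask, and zeromask forms. Assuming they correspond to `_mm512_cvtepu32_pd` and `_mm512_mask_cvtepu32_pd` (an assumption, since names are stripped here), the masking behavior looks like this:

```c
/* Hedged sketch: epu32 -> pd with a writemask (assumed names). */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i a   = _mm256_set1_epi32(-1);   /* 0xFFFFFFFF = 4294967295 as unsigned */
    __m512d src = _mm512_set1_pd(-1.0);

    __m512d all = _mm512_cvtepu32_pd(a);                 /* every lane = 4294967295.0 */
    __m512d mix = _mm512_mask_cvtepu32_pd(src, 0x0F, a); /* low 4 converted, high 4 from src */

    double full[8], mixed[8];
    _mm512_storeu_pd(full, all);
    _mm512_storeu_pd(mixed, mix);
    printf("unmasked lane0 = %.1f, masked lane0 = %.1f, masked lane7 = %.1f\n",
           full[0], mixed[0], mixed[7]);
    return 0;
}
```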
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
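The same pattern applies to the unsigned 32-bit to single-precision entries above, including the `[round_note]` variants that take an explicit rounding mode. A short sketch assuming the names `_mm512_cvtepu32_ps` and `_mm512_maskz_cvtepu32_ps`:

```c
/* Hedged sketch: epu32 -> ps, plain and zero-masked (assumed names). */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512i a = _mm512_set1_epi32(-1);                /* 4294967295 when read as unsigned */
    __m512 f  = _mm512_cvtepu32_ps(a);                /* ~4.2949673e9f in every lane */
    __m512 z  = _mm512_maskz_cvtepu32_ps(0x00FF, a);  /* high 8 lanes zeroed */

    float bf[16], bz[16];
    _mm512_storeu_ps(bf, f);
    _mm512_storeu_ps(bz, z);
    printf("f[0] = %g, z[0] = %g, z[15] = %g\n", bf[0], bz[0], bz[15]);
    return 0;
}
```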
+ + + + + + Convert the unsigned 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int32_To_FP64(b[31:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the unsigned 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the unsigned 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
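The scalar entries above go the other way: an unsigned 32/64-bit integer is converted into the low element of an `__m128d` or `__m128`, while the upper element(s) are passed through from "a". Assuming the usual `_mm_cvtu32_sd`/`_mm_cvtu32_ss` names (not present in this dump):

```c
/* Hedged sketch: scalar unsigned integer -> FP with upper-lane passthrough. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m128d a = _mm_set_pd(2.5, 99.0);          /* upper = 2.5, lower = 99.0 (replaced) */
    __m128d d = _mm_cvtu32_sd(a, 4000000000u);  /* lower := 4e9, upper kept */
    __m128  s = _mm_cvtu32_ss(_mm_set1_ps(1.0f), 4000000000u);

    double db[2]; float fb[4];
    _mm_storeu_pd(db, d);
    _mm_storeu_ps(fb, s);
    printf("sd: lower=%.1f upper=%.1f   ss: lower=%g next=%g\n",
           db[0], db[1], fb[0], fb[1]);
    return 0;
}
```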
+ + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
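The entries from the 32-bit truncation group above come in four flavors per element width: plain, writemask, zeromask, and a masked store straight to unaligned memory. A sketch assuming the names `_mm512_cvtepi32_epi16` and `_mm512_mask_cvtepi32_storeu_epi8` (hypothetical mapping):

```c
/* Hedged sketch: 32-bit -> 16/8-bit truncation, register and masked-store forms. */
#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    __m512i a = _mm512_set1_epi32(0x12345678);

    /* Register form: keep the low 16 bits of each 32-bit lane. */
    short w[16];
    _mm256_storeu_si256((__m256i *)w, _mm512_cvtepi32_epi16(a));

    /* Masked truncating store: only lanes whose mask bit is set touch memory. */
    unsigned char bytes[16];
    memset(bytes, 0xAA, sizeof bytes);
    _mm512_mask_cvtepi32_storeu_epi8(bytes, 0x000F, a);   /* writes bytes[0..3] only */

    printf("w[0] = 0x%04hx, bytes[0] = 0x%02x, bytes[4] = 0x%02x\n",
           w[0], (unsigned)bytes[0], (unsigned)bytes[4]);
    return 0;
}
```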
+ + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[k+31:k] := Truncate32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Truncate32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Truncate32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
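The 64-bit truncation entries follow the same scheme, narrowing eight quadwords to 32-, 16-, or 8-bit lanes. A minimal sketch assuming the name `_mm512_cvtepi64_epi32`:

```c
/* Hedged sketch: 64-bit -> 32-bit truncation keeps the low dword of each lane. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512i a = _mm512_set1_epi64(0x1122334455667788LL);
    int d[8];
    _mm256_storeu_si256((__m256i *)d, _mm512_cvtepi64_epi32(a));
    printf("d[0] = 0x%08x\n", (unsigned)d[0]);   /* 0x55667788 */
    return 0;
}
```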
+ + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[k+31:k] := Saturate32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Saturate32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Saturate32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
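Unlike the plain truncation group, the entries above clamp out-of-range values to the signed range of the destination width. Assuming the names `_mm512_cvtsepi32_epi8` and `_mm512_cvtsepi64_epi32` (not recorded here):

```c
/* Hedged sketch: signed saturating down-conversion. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512i big = _mm512_set1_epi32(70000);            /* does not fit in int8  */
    __m512i neg = _mm512_set1_epi64(-5000000000LL);    /* does not fit in int32 */

    signed char b[16];
    int         d[8];
    _mm_storeu_si128((__m128i *)b, _mm512_cvtsepi32_epi8(big));
    _mm256_storeu_si256((__m256i *)d, _mm512_cvtsepi64_epi32(neg));

    printf("b[0] = %d (saturated to 127), d[0] = %d (saturated to INT32_MIN)\n",
           b[0], d[0]);
    return 0;
}
```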
+ + + + Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 8*j + dst[i+31:i] := SignExtend32(a[k+7:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 8*j + dst[i+63:i] := SignExtend64(a[k+7:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[i+63:i] := SignExtend64(a[k+31:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 16*j + dst[i+31:i] := SignExtend32(a[k+15:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + l := j*16 + IF k[j] + dst[i+31:i] := SignExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 16*j + dst[i+63:i] := SignExtend64(a[k+15:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
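The sign-extension entries widen 8/16/32-bit lanes to 32- or 64-bit lanes, replicating the sign bit. A short sketch assuming the names `_mm512_cvtepi8_epi32` and `_mm512_cvtepi16_epi64`:

```c
/* Hedged sketch: sign extension to wider lanes (assumed names). */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m128i bytes = _mm_set1_epi8(-3);
    __m128i words = _mm_set1_epi16(-300);

    int       d[16];
    long long q[8];
    _mm512_storeu_si512(d, _mm512_cvtepi8_epi32(bytes));   /* 16 x int32, each -3   */
    _mm512_storeu_si512(q, _mm512_cvtepi16_epi64(words));  /* 8 x int64, each -300  */

    printf("d[0] = %d, q[0] = %lld\n", d[0], q[0]);
    return 0;
}
```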
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[k+31:k] := SaturateU32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
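These entries are the unsigned counterpart of the saturating group: values larger than the destination's unsigned range clamp to its maximum. Assuming the names `_mm512_cvtusepi32_epi8` and `_mm512_cvtusepi64_epi32`:

```c
/* Hedged sketch: unsigned saturating down-conversion. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512i a = _mm512_set1_epi32(70000);           /* > 255, saturates to 0xFF       */
    __m512i b = _mm512_set1_epi64(0x1FFFFFFFFLL);   /* > UINT32_MAX, saturates        */

    unsigned char c[16];
    unsigned int  d[8];
    _mm_storeu_si128((__m128i *)c, _mm512_cvtusepi32_epi8(a));
    _mm256_storeu_si256((__m256i *)d, _mm512_cvtusepi64_epi32(b));

    printf("c[0] = %u, d[0] = %u\n", (unsigned)c[0], d[0]);   /* 255 and 4294967295 */
    return 0;
}
```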
+ + + + Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 8*j + dst[i+31:i] := ZeroExtend32(a[k+7:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 8*j + dst[i+63:i] := ZeroExtend64(a[k+7:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[i+63:i] := ZeroExtend64(a[k+31:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 16*j + dst[i+31:i] := ZeroExtend32(a[k+15:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 16*j + dst[i+63:i] := ZeroExtend64(a[k+15:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
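Zero extension mirrors the sign-extension group but fills the upper bits with zeros, so 0xFF becomes 255 rather than -1. A sketch assuming the names `_mm512_cvtepu8_epi32` and `_mm512_cvtepu32_epi64`:

```c
/* Hedged sketch: zero extension to wider lanes (assumed names). */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m128i bytes  = _mm_set1_epi8(-1);      /* 0xFF, read as 255            */
    __m256i dwords = _mm256_set1_epi32(-1);  /* 0xFFFFFFFF, read as 4294967295 */

    int       d[16];
    long long q[8];
    _mm512_storeu_si512(d, _mm512_cvtepu8_epi32(bytes));    /* 255, not -1   */
    _mm512_storeu_si512(q, _mm512_cvtepu32_epi64(dwords));  /* 4294967295    */

    printf("d[0] = %d, q[0] = %lld\n", d[0], q[0]);
    return 0;
}
```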
+ + + + Copy the lower single-precision (32-bit) floating-point element of "a" to "dst". + +dst[31:0] := a[31:0] + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Copy the lower double-precision (64-bit) floating-point element of "a" to "dst". + +dst[63:0] := a[63:0] + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Copy the lower 32-bit integer in "a" to "dst". + +dst[31:0] := a[31:0] + + + AVX512F +
immintrin.h
+ Convert +
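The three small entries above just read back the lowest element of a 512-bit register as a plain scalar. Assuming the conventional names `_mm512_cvtss_f32`, `_mm512_cvtsd_f64`, and `_mm512_cvtsi512_si32`:

```c
/* Hedged sketch: lower-element extraction from 512-bit registers. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    printf("%g %g %d\n",
           _mm512_cvtss_f32(_mm512_set1_ps(1.5f)),
           _mm512_cvtsd_f64(_mm512_set1_pd(2.5)),
           _mm512_cvtsi512_si32(_mm512_set1_epi32(42)));
    return 0;
}
```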
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][max_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][max_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note][max_float_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][max_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][max_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note][max_float_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
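The packed maximum entries above come in plain, writemask, zeromask, and `[sae_note]` forms for both double and single precision. A sketch of the masking behavior, assuming the names `_mm512_max_pd`, `_mm512_mask_max_pd`, and `_mm512_maskz_max_ps`:

```c
/* Hedged sketch: packed maximum with write/zero masking. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512d a = _mm512_set1_pd(1.0), b = _mm512_set1_pd(2.0);
    __m512d src = _mm512_set1_pd(-9.0);

    double m[8], wm[8];
    _mm512_storeu_pd(m,  _mm512_max_pd(a, b));                  /* 2.0 everywhere            */
    _mm512_storeu_pd(wm, _mm512_mask_max_pd(src, 0x0F, a, b));  /* low 4 = 2.0, high 4 = -9.0 */

    float z[16];
    _mm512_storeu_ps(z, _mm512_maskz_max_ps(0x0001,
                        _mm512_set1_ps(3.0f), _mm512_set1_ps(4.0f)));

    printf("m[0]=%.1f wm[7]=%.1f z[0]=%.1f z[1]=%.1f\n", m[0], wm[7], z[0], z[1]);
    return 0;
}
```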
+ + + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][max_float_note] + +IF k[0] + dst[63:0] := MAX(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := MAX(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][max_float_note] + +IF k[0] + dst[63:0] := MAX(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := MAX(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [sae_note][max_float_note] + +dst[63:0] := MAX(a[63:0], b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] + +IF k[0] + dst[31:0] := MAX(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := MAX(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] + +IF k[0] + dst[31:0] := MAX(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := MAX(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] + +dst[31:0] := MAX(a[31:0], b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
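The scalar maximum entries compute only the low element and pass the rest of "a" through; mask bit 0 decides whether the result or the fallback from "src" (or zero) lands in the low lane. Assuming the name `_mm_mask_max_sd`:

```c
/* Hedged sketch: masked scalar maximum with upper-lane passthrough. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m128d a = _mm_set_pd(10.0, 1.0);   /* upper = 10.0, lower = 1.0 */
    __m128d b = _mm_set_pd(20.0, 2.0);
    __m128d src = _mm_set_pd(0.0, -5.0);

    double taken[2], blocked[2];
    _mm_storeu_pd(taken,   _mm_mask_max_sd(src, 0x1, a, b)); /* lower = 2.0, upper = 10.0     */
    _mm_storeu_pd(blocked, _mm_mask_max_sd(src, 0x0, a, b)); /* lower = -5.0 (taken from src) */

    printf("taken = {%.1f, %.1f}, blocked lower = %.1f\n",
           taken[0], taken[1], blocked[0]);
    return 0;
}
```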
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][min_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][min_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note][min_float_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][min_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][min_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note][min_float_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][min_float_note] + +IF k[0] + dst[63:0] := MIN(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := MIN(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][min_float_note] + +IF k[0] + dst[63:0] := MIN(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := MIN(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [sae_note][min_float_note] + +dst[63:0] := MIN(a[63:0], b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] + +IF k[0] + dst[31:0] := MIN(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := MIN(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] + +IF k[0] + dst[31:0] := MIN(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := MIN(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] + +dst[31:0] := MIN(a[31:0], b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ABS(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ABS(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ABS(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ABS(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +tmp[191:128] := a[191:128] +tmp[255:192] := a[191:128] +tmp[319:256] := a[319:256] +tmp[383:320] := a[319:256] +tmp[447:384] := a[447:384] +tmp[511:448] := a[447:384] +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +tmp[191:128] := a[191:128] +tmp[255:192] := a[191:128] +tmp[319:256] := a[319:256] +tmp[383:320] := a[319:256] +tmp[447:384] := a[447:384] +tmp[511:448] := a[447:384] +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst". + +dst[63:0] := a[63:0] +dst[127:64] := a[63:0] +dst[191:128] := a[191:128] +dst[255:192] := a[191:128] +dst[319:256] := a[319:256] +dst[383:320] := a[319:256] +dst[447:384] := a[447:384] +dst[511:448] := a[447:384] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + + Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +tmp[159:128] := a[191:160] +tmp[191:160] := a[191:160] +tmp[223:192] := a[255:224] +tmp[255:224] := a[255:224] +tmp[287:256] := a[319:288] +tmp[319:288] := a[319:288] +tmp[351:320] := a[383:352] +tmp[383:352] := a[383:352] +tmp[415:384] := a[447:416] +tmp[447:416] := a[447:416] +tmp[479:448] := a[511:480] +tmp[511:480] := a[511:480] +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +tmp[159:128] := a[191:160] +tmp[191:160] := a[191:160] +tmp[223:192] := a[255:224] +tmp[255:224] := a[255:224] +tmp[287:256] := a[319:288] +tmp[319:288] := a[319:288] +tmp[351:320] := a[383:352] +tmp[383:352] := a[383:352] +tmp[415:384] := a[447:416] +tmp[447:416] := a[447:416] +tmp[479:448] := a[511:480] +tmp[511:480] := a[511:480] +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". + +dst[31:0] := a[63:32] +dst[63:32] := a[63:32] +dst[95:64] := a[127:96] +dst[127:96] := a[127:96] +dst[159:128] := a[191:160] +dst[191:160] := a[191:160] +dst[223:192] := a[255:224] +dst[255:224] := a[255:224] +dst[287:256] := a[319:288] +dst[319:288] := a[319:288] +dst[351:320] := a[383:352] +dst[383:352] := a[383:352] +dst[415:384] := a[447:416] +dst[447:416] := a[447:416] +dst[479:448] := a[511:480] +dst[511:480] := a[511:480] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +tmp[159:128] := a[159:128] +tmp[191:160] := a[159:128] +tmp[223:192] := a[223:192] +tmp[255:224] := a[223:192] +tmp[287:256] := a[287:256] +tmp[319:288] := a[287:256] +tmp[351:320] := a[351:320] +tmp[383:352] := a[351:320] +tmp[415:384] := a[415:384] +tmp[447:416] := a[415:384] +tmp[479:448] := a[479:448] +tmp[511:480] := a[479:448] +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +tmp[159:128] := a[159:128] +tmp[191:160] := a[159:128] +tmp[223:192] := a[223:192] +tmp[255:224] := a[223:192] +tmp[287:256] := a[287:256] +tmp[319:288] := a[287:256] +tmp[351:320] := a[351:320] +tmp[383:352] := a[351:320] +tmp[415:384] := a[415:384] +tmp[447:416] := a[415:384] +tmp[479:448] := a[479:448] +tmp[511:480] := a[479:448] +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". + +dst[31:0] := a[31:0] +dst[63:32] := a[31:0] +dst[95:64] := a[95:64] +dst[127:96] := a[95:64] +dst[159:128] := a[159:128] +dst[191:160] := a[159:128] +dst[223:192] := a[223:192] +dst[255:224] := a[223:192] +dst[287:256] := a[287:256] +dst[319:288] := a[287:256] +dst[351:320] := a[351:320] +dst[383:352] := a[351:320] +dst[415:384] := a[415:384] +dst[447:416] := a[415:384] +dst[479:448] := a[479:448] +dst[511:480] := a[479:448] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + + Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "a" when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 15 + i := j*32 + IF k[j] + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 15 + i := j*32 + IF k[j] + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 15 + i := j*32 + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
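The imm8 in the ternary-logic entries is simply the eight-entry truth table of the desired Boolean function: evaluating that function bitwise on the constants 0xF0, 0xCC and 0xAA (one bit per input combination, the same role the _MM_TERNLOG_* names play in the pseudocode above) yields the immediate. A sketch of the common bitwise-select case, (a AND b) OR (NOT a AND c), whose table works out to 0xCA; the constant names here are local to the example, and the build assumptions are the same as above:

    #include <immintrin.h>
    #include <stdio.h>

    /* Local truth-table constants: bit i of each constant is that input's value
       for input combination i, so evaluating F(0xF0, 0xCC, 0xAA) bitwise gives
       the imm8 encoding of F. */
    #define TBL_A 0xF0
    #define TBL_B 0xCC
    #define TBL_C 0xAA

    int main(void) {
        /* bitwise select: (a & b) | (~a & c)  ->  imm8 = 0xCA */
        enum { SELECT_IMM = ((TBL_A & TBL_B) | (~TBL_A & TBL_C)) & 0xFF };

        __m512i a = _mm512_set1_epi32(0x0F0F0F0F);        /* per-bit condition */
        __m512i b = _mm512_set1_epi32(0x11111111);        /* taken where the a bit is 1 */
        __m512i c = _mm512_set1_epi32((int)0xEEEEEEEEu);  /* taken where the a bit is 0 */

        int out[16];
        _mm512_storeu_si512((__m512i *)out, _mm512_ternarylogic_epi32(a, b, c, SELECT_IMM));
        printf("0x%08X\n", (unsigned)out[0]);             /* expected: 0xE1E1E1E1 */
        return 0;
    }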
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "a" when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 7 + i := j*64 + IF k[j] + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 7 + i := j*64 + IF k[j] + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 7 + i := j*64 + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 7 + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Logical +
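The test entries return a mask rather than a vector, which is convenient for per-lane flag checks. A small sketch, same assumptions as above; the lane values are arbitrary:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        /* one 64-bit "flags" word per lane; find the lanes whose bit 0 is set */
        __m512i flags = _mm512_setr_epi64(1, 0, 3, 4, 5, 0, 7, 8);
        __m512i bit0  = _mm512_set1_epi64(1);

        __mmask8 k = _mm512_test_epi64_mask(flags, bit0);  /* bit j set iff (flags[j] AND 1) != 0 */

        printf("mask = 0x%02X\n", (unsigned)k);            /* lanes 0, 2, 4, 6 -> expected 0x55 */
        return 0;
    }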
+ + + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 15 + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 7 + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + Broadcast 8-bit integer "a" to all elements of "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + + + Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + + Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + Broadcast 32-bit integer "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + + + Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + + Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + Broadcast 64-bit integer "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed 32-bit integers in "dst" with the repeated 4 element sequence. + +dst[31:0] := a +dst[63:32] := b +dst[95:64] := c +dst[127:96] := d +dst[159:128] := a +dst[191:160] := b +dst[223:192] := c +dst[255:224] := d +dst[287:256] := a +dst[319:288] := b +dst[351:320] := c +dst[383:352] := d +dst[415:384] := a +dst[447:416] := b +dst[479:448] := c +dst[511:480] := d +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed 64-bit integers in "dst" with the repeated 4 element sequence. + +dst[63:0] := a +dst[127:64] := b +dst[191:128] := c +dst[255:192] := d +dst[319:256] := a +dst[383:320] := b +dst[447:384] := c +dst[511:448] := d +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the repeated 4 element sequence. + +dst[63:0] := a +dst[127:64] := b +dst[191:128] := c +dst[255:192] := d +dst[319:256] := a +dst[383:320] := b +dst[447:384] := c +dst[511:448] := d +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the repeated 4 element sequence. + +dst[31:0] := a +dst[63:32] := b +dst[95:64] := c +dst[127:96] := d +dst[159:128] := a +dst[191:160] := b +dst[223:192] := c +dst[255:224] := d +dst[287:256] := a +dst[319:288] := b +dst[351:320] := c +dst[383:352] := d +dst[415:384] := a +dst[447:416] := b +dst[479:448] := c +dst[511:480] := d +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values. + +dst[7:0] := e0 +dst[15:8] := e1 +dst[23:16] := e2 +dst[31:24] := e3 +dst[39:32] := e4 +dst[47:40] := e5 +dst[55:48] := e6 +dst[63:56] := e7 +dst[71:64] := e8 +dst[79:72] := e9 +dst[87:80] := e10 +dst[95:88] := e11 +dst[103:96] := e12 +dst[111:104] := e13 +dst[119:112] := e14 +dst[127:120] := e15 +dst[135:128] := e16 +dst[143:136] := e17 +dst[151:144] := e18 +dst[159:152] := e19 +dst[167:160] := e20 +dst[175:168] := e21 +dst[183:176] := e22 +dst[191:184] := e23 +dst[199:192] := e24 +dst[207:200] := e25 +dst[215:208] := e26 +dst[223:216] := e27 +dst[231:224] := e28 +dst[239:232] := e29 +dst[247:240] := e30 +dst[255:248] := e31 +dst[263:256] := e32 +dst[271:264] := e33 +dst[279:272] := e34 +dst[287:280] := e35 +dst[295:288] := e36 +dst[303:296] := e37 +dst[311:304] := e38 +dst[319:312] := e39 +dst[327:320] := e40 +dst[335:328] := e41 +dst[343:336] := e42 +dst[351:344] := e43 +dst[359:352] := e44 +dst[367:360] := e45 +dst[375:368] := e46 +dst[383:376] := e47 +dst[391:384] := e48 +dst[399:392] := e49 +dst[407:400] := e50 +dst[415:408] := e51 +dst[423:416] := e52 +dst[431:424] := e53 +dst[439:432] := e54 +dst[447:440] := e55 +dst[455:448] := e56 +dst[463:456] := e57 +dst[471:464] := e58 +dst[479:472] := e59 +dst[487:480] := e60 +dst[495:488] := e61 +dst[503:496] := e62 +dst[511:504] := e63 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed 16-bit integers in "dst" with the supplied values. + +dst[15:0] := e0 +dst[31:16] := e1 +dst[47:32] := e2 +dst[63:48] := e3 +dst[79:64] := e4 +dst[95:80] := e5 +dst[111:96] := e6 +dst[127:112] := e7 +dst[143:128] := e8 +dst[159:144] := e9 +dst[175:160] := e10 +dst[191:176] := e11 +dst[207:192] := e12 +dst[223:208] := e13 +dst[239:224] := e14 +dst[255:240] := e15 +dst[271:256] := e16 +dst[287:272] := e17 +dst[303:288] := e18 +dst[319:304] := e19 +dst[335:320] := e20 +dst[351:336] := e21 +dst[367:352] := e22 +dst[383:368] := e23 +dst[399:384] := e24 +dst[415:400] := e25 +dst[431:416] := e26 +dst[447:432] := e27 +dst[463:448] := e28 +dst[479:464] := e29 +dst[495:480] := e30 +dst[511:496] := e31 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed 32-bit integers in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 +dst[159:128] := e4 +dst[191:160] := e5 +dst[223:192] := e6 +dst[255:224] := e7 +dst[287:256] := e8 +dst[319:288] := e9 +dst[351:320] := e10 +dst[383:352] := e11 +dst[415:384] := e12 +dst[447:416] := e13 +dst[479:448] := e14 +dst[511:480] := e15 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed 64-bit integers in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 +dst[191:128] := e2 +dst[255:192] := e3 +dst[319:256] := e4 +dst[383:320] := e5 +dst[447:384] := e6 +dst[511:448] := e7 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 +dst[191:128] := e2 +dst[255:192] := e3 +dst[319:256] := e4 +dst[383:320] := e5 +dst[447:384] := e6 +dst[511:448] := e7 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 +dst[159:128] := e4 +dst[191:160] := e5 +dst[223:192] := e6 +dst[255:224] := e7 +dst[287:256] := e8 +dst[319:288] := e9 +dst[351:320] := e10 +dst[383:352] := e11 +dst[415:384] := e12 +dst[447:416] := e13 +dst[479:448] := e14 +dst[511:480] := e15 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed 32-bit integers in "dst" with the repeated 4 element sequence in reverse order. + +dst[31:0] := d +dst[63:32] := c +dst[95:64] := b +dst[127:96] := a +dst[159:128] := d +dst[191:160] := c +dst[223:192] := b +dst[255:224] := a +dst[287:256] := d +dst[319:288] := c +dst[351:320] := b +dst[383:352] := a +dst[415:384] := d +dst[447:416] := c +dst[479:448] := b +dst[511:480] := a +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed 64-bit integers in "dst" with the repeated 4 element sequence in reverse order. + +dst[63:0] := d +dst[127:64] := c +dst[191:128] := b +dst[255:192] := a +dst[319:256] := d +dst[383:320] := c +dst[447:384] := b +dst[511:448] := a +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the repeated 4 element sequence in reverse order. + +dst[63:0] := d +dst[127:64] := c +dst[191:128] := b +dst[255:192] := a +dst[319:256] := d +dst[383:320] := c +dst[447:384] := b +dst[511:448] := a +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the repeated 4 element sequence in reverse order. + +dst[31:0] := d +dst[63:32] := c +dst[95:64] := b +dst[127:96] := a +dst[159:128] := d +dst[191:160] := c +dst[223:192] := b +dst[255:224] := a +dst[287:256] := d +dst[319:288] := c +dst[351:320] := b +dst[383:352] := a +dst[415:384] := d +dst[447:416] := c +dst[479:448] := b +dst[511:480] := a +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed 32-bit integers in "dst" with the supplied values in reverse order. + +dst[31:0] := e15 +dst[63:32] := e14 +dst[95:64] := e13 +dst[127:96] := e12 +dst[159:128] := e11 +dst[191:160] := e10 +dst[223:192] := e9 +dst[255:224] := e8 +dst[287:256] := e7 +dst[319:288] := e6 +dst[351:320] := e5 +dst[383:352] := e4 +dst[415:384] := e3 +dst[447:416] := e2 +dst[479:448] := e1 +dst[511:480] := e0 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
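The "reverse order" wording of the setr entries only means that the first argument lands in the lowest element; a set call with its argument list reversed builds the same vector. A sketch, same assumptions as above:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        /* setr: the first argument lands in element 0 (memory order) */
        __m512i r = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
                                      8, 9, 10, 11, 12, 13, 14, 15);
        /* set: the last argument lands in element 0, so this is the same vector */
        __m512i s = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8,
                                     7, 6, 5, 4, 3, 2, 1, 0);

        int ro[16], so[16];
        _mm512_storeu_si512((__m512i *)ro, r);
        _mm512_storeu_si512((__m512i *)so, s);
        printf("element 0: %d %d   element 15: %d %d\n", ro[0], so[0], ro[15], so[15]);
        /* expected: element 0: 0 0   element 15: 15 15 */
        return 0;
    }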
+ + + + + + + + + + + Set packed 64-bit integers in "dst" with the supplied values in reverse order. + +dst[63:0] := e7 +dst[127:64] := e6 +dst[191:128] := e5 +dst[255:192] := e4 +dst[319:256] := e3 +dst[383:320] := e2 +dst[447:384] := e1 +dst[511:448] := e0 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst[63:0] := e7 +dst[127:64] := e6 +dst[191:128] := e5 +dst[255:192] := e4 +dst[319:256] := e3 +dst[383:320] := e2 +dst[447:384] := e1 +dst[511:448] := e0 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst[31:0] := e15 +dst[63:32] := e14 +dst[95:64] := e13 +dst[127:96] := e12 +dst[159:128] := e11 +dst[191:160] := e10 +dst[223:192] := e9 +dst[255:224] := e8 +dst[287:256] := e7 +dst[319:288] := e6 +dst[351:320] := e5 +dst[383:352] := e4 +dst[415:384] := e3 +dst[447:416] := e2 +dst[479:448] := e1 +dst[511:480] := e0 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + Return vector of type __m512 with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + Return vector of type __m512i with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + Return vector of type __m512d with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + Return vector of type __m512 with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + Return vector of type __m512i with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
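The rotate entries reduce the count modulo the element width and feed the bits shifted out of the top back in at the bottom, so rotating a 32-bit lane by 33 is the same as rotating it by 1. A sketch, same assumptions as above:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m512i a = _mm512_set1_epi32((int)0x80000001u);

        int r1[16], r33[16];
        _mm512_storeu_si512((__m512i *)r1,  _mm512_rol_epi32(a, 1));   /* top bit re-enters at the bottom */
        _mm512_storeu_si512((__m512i *)r33, _mm512_rol_epi32(a, 33));  /* 33 % 32 = 1, same result */

        printf("0x%08X 0x%08X\n", (unsigned)r1[0], (unsigned)r33[0]);  /* expected: 0x00000003 0x00000003 */
        return 0;
    }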
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >> count) OR (src << (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >> count) OR (src << (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >> count) OR (src << (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >> count) OR (src << (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >> count) OR (src << (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
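Unlike C's << operator, whose behaviour is undefined for counts at or above the operand width, the shift entries above define the result as zero once the 64-bit count exceeds 31, and the single count in the low quadword of the second operand applies to every lane. A sketch, same assumptions as above:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m512i a = _mm512_set1_epi32(1);

        __m128i c4  = _mm_cvtsi32_si128(4);    /* shift count 4, taken from the low 64 bits */
        __m128i c40 = _mm_cvtsi32_si128(40);   /* shift count 40, i.e. greater than 31 */

        int s4[16], s40[16];
        _mm512_storeu_si512((__m512i *)s4,  _mm512_sll_epi32(a, c4));   /* 1 << 4 = 16 in every lane */
        _mm512_storeu_si512((__m512i *)s40, _mm512_sll_epi32(a, c40));  /* count > 31: defined as 0 */

        printf("%d %d\n", s4[0], s40[0]);   /* expected: 16 0 */
        return 0;
    }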
+ + + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
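The variable-count forms above differ from the uniform ones in that every lane carries its own count and the out-of-range test becomes `count < 64` per element. A per-lane sketch in plain Rust (helper name is ours):

```rust
/// Scalar model of one 64-bit lane of the variable shift-left above.
fn sllv_lane_u64(a: u64, count: u64) -> u64 {
    if count < 64 { a << count } else { 0 }
}

fn main() {
    let a = [1u64, 2, 3, 4];
    let counts = [0u64, 1, 63, 64];
    let dst: Vec<u64> = a.iter().zip(&counts).map(|(&x, &c)| sllv_lane_u64(x, c)).collect();
    // 3 << 63 keeps only its lowest bit; a count of 64 zeroes the lane entirely.
    assert_eq!(dst, vec![1u64, 4, 0x8000_0000_0000_0000, 0]);
}
```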
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
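For the arithmetic right shifts above, an out-of-range count does not zero the lane; it fills it with copies of the sign bit (0xFFFFFFFF for negative values, 0x0 otherwise). A scalar sketch of one 32-bit lane (helper name is ours):

```rust
/// Scalar model of one 32-bit lane of the arithmetic right shift above.
/// Rust's `>>` on a signed type is already an arithmetic shift.
fn sra_lane_i32(a: i32, count: u64) -> i32 {
    if count > 31 {
        if a < 0 { -1 } else { 0 } // all sign bits, as in the pseudocode
    } else {
        a >> count
    }
}

fn main() {
    assert_eq!(sra_lane_i32(-8, 2), -2);
    assert_eq!(sra_lane_i32(-8, 40), -1); // saturates to all ones for negative inputs
    assert_eq!(sra_lane_i32(8, 40), 0);
}
```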
+ + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
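The per-element arithmetic shifts above combine both behaviours seen so far: per-lane counts and sign-bit fill once a count reaches 64. A scalar sketch (helper name is ours):

```rust
/// Scalar model of one 64-bit lane of the variable arithmetic right shift above.
fn srav_lane_i64(a: i64, count: u64) -> i64 {
    if count < 64 { a >> count } else if a < 0 { -1 } else { 0 }
}

fn main() {
    let a = [-16i64, -16, 16, 16];
    let counts = [2u64, 100, 2, 100];
    let dst: Vec<i64> = a.iter().zip(&counts).map(|(&x, &c)| srav_lane_i64(x, c)).collect();
    assert_eq!(dst, vec![-4i64, -1, 4, 0]);
}
```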
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
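The logical right shifts above shift in zeros regardless of the sign bit, with the same saturating treatment of large counts. A scalar sketch of one 64-bit lane (helper name is ours):

```rust
/// Scalar model of one 64-bit lane of the logical right shifts above.
fn srlv_lane_u64(a: u64, count: u64) -> u64 {
    if count < 64 { a >> count } else { 0 }
}

fn main() {
    assert_eq!(srlv_lane_u64(0x8000_0000_0000_0000, 63), 1);
    assert_eq!(srlv_lane_u64(0x8000_0000_0000_0000, 64), 0);
    // Contrast with the arithmetic forms earlier: a negative value reinterpreted
    // as unsigned is shifted with zero fill, not sign fill.
    assert_eq!(srlv_lane_u64((-16i64) as u64, 60), 0xF);
}
```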
+ + + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (1.0 / a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
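The entries above only promise an approximation whose relative error stays below 2^-14, so the exact `1.0 / a` in the pseudocode is an idealisation of what the hardware returns. The sketch below measures that error at runtime; it assumes the Rust `core::arch` names mirror the C ones listed for immintrin.h (e.g. `_mm512_rcp14_ps`) and is only an illustration, not part of this data file.

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512f") {
        println!("AVX-512F not available; skipping");
        return;
    }
    unsafe {
        let a = _mm512_set1_ps(3.0);
        let approx = _mm512_rcp14_ps(a); // approximate 1.0 / 3.0 in every lane
        let mut lanes = [0.0f32; 16];
        _mm512_storeu_ps(lanes.as_mut_ptr(), approx);
        let max_err = lanes
            .iter()
            .map(|&x| ((x - 1.0 / 3.0) / (1.0 / 3.0)).abs())
            .fold(0.0f32, f32::max);
        // The data above documents a bound of 2^-14 on the relative error.
        println!("max relative error = {max_err:e} (documented bound {:e})", 2f32.powi(-14));
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```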
+ + + + + + + Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[63:0] := (1.0 / b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[63:0] := (1.0 / b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. + +dst[63:0] := (1.0 / b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[31:0] := (1.0 / b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[31:0] := (1.0 / b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. + +dst[31:0] := (1.0 / b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[63:0] := (1.0 / SQRT(b[63:0])) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[63:0] := (1.0 / SQRT(b[63:0])) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. + +dst[63:0] := (1.0 / SQRT(b[63:0])) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[31:0] := (1.0 / SQRT(b[31:0])) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[31:0] := (1.0 / SQRT(b[31:0])) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. + +dst[31:0] := (1.0 / SQRT(b[31:0])) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note]. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + [round_note]. + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + [round_note]. + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
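The masked square-root entries above all follow the same per-lane pattern, and the [round_note] variants only add an explicit rounding-mode argument on top of it. A scalar sketch of one lane (rounding mode ignored; helper name is ours):

```rust
/// Scalar model of one lane of the masked square roots above. The fallback is
/// `src[lane]` for the writemask forms and 0.0 for the zeromask forms.
fn masked_sqrt_lane(k_bit: bool, a: f64, fallback: f64) -> f64 {
    if k_bit { a.sqrt() } else { fallback }
}

fn main() {
    assert_eq!(masked_sqrt_lane(true, 16.0, -1.0), 4.0);
    assert_eq!(masked_sqrt_lane(false, 16.0, -1.0), -1.0); // writemask passthrough
    assert_eq!(masked_sqrt_lane(false, 16.0, 0.0), 0.0);   // zeromask
}
```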
+ + + + + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := SQRT(b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := SQRT(b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := SQRT(b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := SQRT(b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := SQRT(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := SQRT(b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := SQRT(b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := SQRT(b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := SQRT(b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := SQRT(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
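The _ss/_sd style entries above compute only the lowest element from "b" and copy every remaining element straight from "a". Modelling a 128-bit vector as a four-element array makes that pattern explicit (helper name is ours):

```rust
/// Scalar model of the lower-lane pattern used by the scalar square-root
/// entries above: element 0 comes from `b`, elements 1..=3 come from `a`.
fn sqrt_ss_model(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [b[0].sqrt(), a[1], a[2], a[3]]
}

fn main() {
    let a = [1.0, 2.0, 3.0, 4.0];
    let b = [9.0, 100.0, 100.0, 100.0];
    assert_eq!(sqrt_ss_model(a, b), [3.0f32, 2.0, 3.0, 4.0]);
}
```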
+ + + + Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512d to type __m128d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512 to type __m128. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512d to type __m256d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512 to type __m256. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512i to type __m128i. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512i to type __m256i. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m512; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256 to type __m512; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
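The cast entries above leave the widened upper bits undefined, while the zext variants guarantee zeroed upper bits; per the descriptions, both are type-level reinterpretations rather than real instructions. The sketch below assumes the Rust `core::arch` names mirror the C ones (e.g. `_mm512_zextpd128_pd512`) and only runs when AVX-512F is detected; it is an illustration, not part of this data file.

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512f") {
        return;
    }
    unsafe {
        let lo = _mm_set1_pd(1.5); // __m128d with both lanes = 1.5
        // zext: the upper 384 bits are documented to be zero.
        let widened = _mm512_zextpd128_pd512(lo);
        let mut lanes = [0.0f64; 8];
        _mm512_storeu_pd(lanes.as_mut_ptr(), widened);
        assert_eq!(lanes, [1.5, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);

        // cast: fine when you immediately narrow again, but the upper lanes of the
        // intermediate __m512d must not be relied upon.
        let narrowed = _mm512_castpd512_pd128(_mm512_castpd128_pd512(lo));
        let mut low = [0.0f64; 2];
        _mm_storeu_pd(low.as_mut_ptr(), narrowed);
        assert_eq!(low, [1.5, 1.5]);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```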
+ + + + Return vector of type __m512 with undefined elements. + AVX512F +
immintrin.h
+ General Support +
+ + + Return vector of type __m512i with undefined elements. + AVX512F +
immintrin.h
+ General Support +
+ + + Return vector of type __m512d with undefined elements. + AVX512F +
immintrin.h
+ General Support +
+ + + Return vector of type __m512 with undefined elements. + AVX512F +
immintrin.h
+ General Support +
+ + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
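As a usage sketch for the packed add entries above: the code below assumes the Rust `core::arch` names match the C ones in immintrin.h and shows how the writemask form copies lanes from "src" wherever the corresponding mask bit is clear. It only runs when AVX-512F is detected and is an illustration, not part of this data file.

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512f") {
        return;
    }
    unsafe {
        let a = _mm512_set1_pd(1.0);
        let b = _mm512_set1_pd(2.0);
        let src = _mm512_set1_pd(-1.0);
        let k: __mmask8 = 0b0000_1111; // compute only the low four lanes

        let sum = _mm512_add_pd(a, b);
        let masked = _mm512_mask_add_pd(src, k, a, b);

        let (mut s, mut m) = ([0.0f64; 8], [0.0f64; 8]);
        _mm512_storeu_pd(s.as_mut_ptr(), sum);
        _mm512_storeu_pd(m.as_mut_ptr(), masked);
        assert_eq!(s, [3.0; 8]);
        assert_eq!(m, [3.0, 3.0, 3.0, 3.0, -1.0, -1.0, -1.0, -1.0]);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```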
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
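The fused multiply-add entries above differ mainly in which operand passes through when a mask bit is clear ("c" in one group, "a" in the other). A scalar model of one lane, using Rust's fused `mul_add` (helper name is ours):

```rust
/// Scalar model of one lane of the masked fused multiply-add entries above.
/// `passthrough` stands for whichever operand the variant copies when the mask
/// bit is clear: `c` for the c-passthrough forms, `a` for the a-passthrough forms.
fn fmadd_lane(k_bit: bool, a: f64, b: f64, c: f64, passthrough: f64) -> f64 {
    if k_bit { a.mul_add(b, c) } else { passthrough } // mul_add is a single rounding
}

fn main() {
    // Mask bit set: the lane is a*b + c regardless of the variant.
    assert_eq!(fmadd_lane(true, 2.0, 3.0, 4.0, f64::NAN), 10.0);
    // Mask bit clear: the a-passthrough variant keeps a, the c-passthrough keeps c.
    assert_eq!(fmadd_lane(false, 2.0, 3.0, 4.0, 2.0), 2.0);
    assert_eq!(fmadd_lane(false, 2.0, 3.0, 4.0, 4.0), 4.0);
}
```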
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
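The fmsub entries above subtract "c" from the product, and the entries that follow add the negated product instead; both remain a single fused operation, since negating an operand is exact. A scalar sketch of one lane via `mul_add` with the appropriate sign flips (helper names are ours):

```rust
/// Scalar models of one lane: fmsub computes a*b - c, fnmadd computes -(a*b) + c.
fn fmsub_lane(a: f32, b: f32, c: f32) -> f32 {
    a.mul_add(b, -c)
}

fn fnmadd_lane(a: f32, b: f32, c: f32) -> f32 {
    (-a).mul_add(b, c)
}

fn main() {
    assert_eq!(fmsub_lane(2.0, 3.0, 1.0), 5.0);
    assert_eq!(fnmadd_lane(2.0, 3.0, 10.0), 4.0);
}
```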
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
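The double-precision entries above compute -(a*b) + c, again in unmasked, rounding, and merge-masked flavors. A hedged sketch, assuming the records map to `_mm512_fnmadd_pd` and `_mm512_mask3_fnmadd_pd` (names and constants are illustrative, not taken from the entries):

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo() {
    let a = _mm512_set1_pd(2.0);
    let b = _mm512_set1_pd(3.0);
    let c = _mm512_set1_pd(10.0);
    // -(a * b) + c = -6 + 10 = 4 in every lane.
    let r = _mm512_fnmadd_pd(a, b, c);
    // Keep only the low four lanes; the rest are copied from `c`.
    let k: __mmask8 = 0x0F;
    let m = _mm512_mask3_fnmadd_pd(a, b, c, k);
    let mut out = [0.0f64; 8];
    _mm512_storeu_pd(out.as_mut_ptr(), r);
    assert!(out.iter().all(|&x| x == 4.0));
    _mm512_storeu_pd(out.as_mut_ptr(), m);
    assert_eq!(out[0], 4.0);
    assert_eq!(out[7], 10.0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo() }
    }
}
```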
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
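The single-precision entries just above compute -(a*b) - c; the "copied from a" variant merges inactive lanes from the first operand rather than from "c". A small sketch under the assumption that these correspond to `_mm512_fnmsub_ps` / `_mm512_mask_fnmsub_ps`:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo() {
    let a = _mm512_set1_ps(2.0);
    let b = _mm512_set1_ps(3.0);
    let c = _mm512_set1_ps(1.0);
    // -(a * b) - c = -7 in every lane.
    let r = _mm512_fnmsub_ps(a, b, c);
    // Merge form: inactive lanes keep the value from `a`.
    let k: __mmask16 = 0x0001;
    let m = _mm512_mask_fnmsub_ps(a, k, b, c);
    let mut out = [0.0f32; 16];
    _mm512_storeu_ps(out.as_mut_ptr(), r);
    assert!(out.iter().all(|&x| x == -7.0));
    _mm512_storeu_ps(out.as_mut_ptr(), m);
    assert_eq!(out[0], -7.0); // active lane
    assert_eq!(out[1], 2.0);  // inactive lane: copied from a
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo() }
    }
}
```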
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). RM. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] * b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] * b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). RM. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] * b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] * b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
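The packed multiply records above come in plain, rounding, and writemasked forms; in the writemasked forms the inactive lanes are taken from the separate "src" operand. An illustrative sketch, assuming the entries correspond to `_mm512_mul_ps` and `_mm512_mask_mul_pd`:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo() {
    // Unmasked single-precision multiply.
    let x = _mm512_mul_ps(_mm512_set1_ps(1.5), _mm512_set1_ps(4.0));
    let mut xs = [0.0f32; 16];
    _mm512_storeu_ps(xs.as_mut_ptr(), x);
    assert!(xs.iter().all(|&v| v == 6.0));

    // Masked double-precision multiply: inactive lanes come from `src`.
    let src = _mm512_set1_pd(-1.0);
    let a = _mm512_set1_pd(2.0);
    let b = _mm512_set1_pd(8.0);
    let k: __mmask8 = 0b0000_1111;
    let y = _mm512_mask_mul_pd(src, k, a, b);
    let mut ys = [0.0f64; 8];
    _mm512_storeu_pd(ys.as_mut_ptr(), y);
    assert_eq!(ys[0], 16.0); // active lane: a * b
    assert_eq!(ys[7], -1.0); // inactive lane: copied from src
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo() }
    }
}
```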
+ + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". + +FOR j := 0 to 15 + i := j*32 + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
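The integer records above (packed 32-bit add, low-32-bit multiply, and subtract) behave lane-wise exactly as their pseudocode states. A small sketch, assuming they correspond to `_mm512_add_epi32`, `_mm512_mullo_epi32`, and `_mm512_sub_epi32`, and using a horizontal add purely to check the results:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo() {
    let a = _mm512_set1_epi32(7);
    let b = _mm512_set1_epi32(3);
    let sum = _mm512_add_epi32(a, b);    // 10 per lane
    let diff = _mm512_sub_epi32(a, b);   //  4 per lane
    let prod = _mm512_mullo_epi32(a, b); // low 32 bits of 7*3 = 21 per lane
    assert_eq!(_mm512_reduce_add_epi32(sum), 16 * 10);
    assert_eq!(_mm512_reduce_add_epi32(diff), 16 * 4);
    assert_eq!(_mm512_reduce_add_epi32(prod), 16 * 21);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo() }
    }
}
```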
+ + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 32-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0 + FI +ENDFOR +dst[31:0] := REDUCE_ADD(tmp, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 64-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0 + FI +ENDFOR +dst[63:0] := REDUCE_ADD(tmp, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0 + FI +ENDFOR +dst[63:0] := REDUCE_ADD(tmp, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0 + FI +ENDFOR +dst[31:0] := REDUCE_ADD(tmp, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 32-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 1 + FI +ENDFOR +dst[31:0] := REDUCE_MUL(tmp, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 64-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 1 + FI +ENDFOR +dst[63:0] := REDUCE_MUL(tmp, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 1.0 + FI +ENDFOR +dst[63:0] := REDUCE_MUL(tmp, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := FP32(1.0) + FI +ENDFOR +dst[31:0] := REDUCE_MUL(tmp, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 32-bit integers in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_ADD(a, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 64-bit integers in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_ADD(a, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_ADD(a, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_ADD(a, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 32-bit integers in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MUL(a, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 64-bit integers in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MUL(a, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MUL(a, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MUL(a, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
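The reduction records above fold a whole 512-bit register down to a single scalar, either over all lanes or, in the masked forms, over the active lanes only (inactive lanes are replaced by the identity element, 0 for addition and 1 for multiplication). A sketch assuming the usual `_mm512_reduce_*` / `_mm512_mask_reduce_*` names; the data and mask are illustrative only:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo() {
    // Lanes hold 0, 1, 2, ..., 15 in order.
    let v = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    // Full reduction: 0 + 1 + ... + 15 = 120.
    assert_eq!(_mm512_reduce_add_epi32(v), 120);
    // Masked reduction: only the even-numbered lanes participate.
    let k: __mmask16 = 0b0101_0101_0101_0101;
    assert_eq!(_mm512_mask_reduce_add_epi32(k, v), 56); // 0 + 2 + ... + 14

    // Floating-point product reduction: 2.0^8 = 256.0.
    let p = _mm512_set1_pd(2.0);
    assert_eq!(_mm512_reduce_mul_pd(p), 256.0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo() }
    }
}
```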
+ + + + Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ABS(v2[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(v2[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ABS(v2[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ABS(v2[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
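The absolute-value records above clear the sign bit of every active lane; the masked forms merge the inactive lanes from "src". A short sketch, assuming they correspond to `_mm512_abs_ps` and `_mm512_mask_abs_ps`:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo() {
    let v = _mm512_set1_ps(-3.5);
    let abs = _mm512_abs_ps(v);
    let mut out = [0.0f32; 16];
    _mm512_storeu_ps(out.as_mut_ptr(), abs);
    assert!(out.iter().all(|&x| x == 3.5));

    // Merge-masked form: inactive lanes keep the value from `src`.
    let src = _mm512_set1_ps(0.0);
    let k: __mmask16 = 0x00FF; // low eight lanes active
    let m = _mm512_mask_abs_ps(src, k, v);
    _mm512_storeu_ps(out.as_mut_ptr(), m);
    assert_eq!(out[0], 3.5);
    assert_eq!(out[15], 0.0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo() }
    }
}
```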
+ + + + + + Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 elements) in "dst". + +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) +dst[511:0] := temp[511:0] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := temp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
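The two records above form a 1024-bit value from "a" (high half) and "b" (low half), shift it right by a whole number of 32-bit elements, and keep the low 512 bits, so the result starts with the tail of "b" and ends with the head of "a". A hedged sketch, assuming the immediate-count form is `_mm512_alignr_epi32`; the shift count of 4 and the lane values are illustrative:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo() {
    let a = _mm512_setr_epi32(100, 101, 102, 103, 104, 105, 106, 107,
                              108, 109, 110, 111, 112, 113, 114, 115);
    let b = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    // Shift the (a:b) concatenation right by four 32-bit elements:
    // the result is b[4..16] followed by a[0..4].
    let r = _mm512_alignr_epi32::<4>(a, b);
    let out: [i32; 16] = core::mem::transmute(r);
    assert_eq!(&out[..4], &[4, 5, 6, 7]);
    assert_eq!(&out[12..], &[100, 101, 102, 103]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo() }
    }
}
```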
+ + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
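The getexp records above return floor(log2(|x|)) of each lane as a floating-point value, with optional suppress-all-exceptions and writemask forms. A minimal sketch, assuming the single-precision unmasked form is `_mm512_getexp_ps`:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo() {
    // Each result lane is floor(log2(|x|)) expressed as an f32.
    let v = _mm512_setr_ps(1.0, 2.0, 3.0, 4.0, 8.0, 10.0, 0.5, 100.0,
                           1.0, 2.0, 3.0, 4.0, 8.0, 10.0, 0.5, 100.0);
    let e = _mm512_getexp_ps(v);
    let out: [f32; 16] = core::mem::transmute(e);
    assert_eq!(&out[..8], &[0.0, 1.0, 1.0, 2.0, 3.0, 3.0, -1.0, 6.0]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo() }
    }
}
```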
+ + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
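The blend records above select, per lane, from "b" when the mask bit is set and from "a" otherwise. A short sketch, assuming the double-precision form is `_mm512_mask_blend_pd`:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo() {
    let a = _mm512_set1_pd(1.0);
    let b = _mm512_set1_pd(2.0);
    // Bit j of the mask selects b for lane j, otherwise a.
    let k: __mmask8 = 0b1010_1010;
    let r = _mm512_mask_blend_pd(k, a, b);
    let out: [f64; 8] = core::mem::transmute(r);
    assert_eq!(out, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo() }
    }
}
```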
+ + + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_mask_permutexvar_epi32", and it is recommended that you use that intrinsic name. + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_permutexvar_epi32", and it is recommended that you use that intrinsic name. + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + dst[i+31:i] := a[id+31:id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
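The two records above are the cross-lane 32-bit shuffles that the descriptions themselves say are identical to `_mm512_permutexvar_epi32` (each index lane names the source lane for the corresponding destination lane). A minimal sketch of the unmasked form, with illustrative data:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo() {
    let a = _mm512_setr_epi32(0, 10, 20, 30, 40, 50, 60, 70,
                              80, 90, 100, 110, 120, 130, 140, 150);
    // Reverse the 16 lanes: idx[j] names the source lane for dst lane j.
    let idx = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8,
                                7, 6, 5, 4, 3, 2, 1, 0);
    let r = _mm512_permutexvar_epi32(idx, a);
    let out: [i32; 16] = core::mem::transmute(r);
    assert_eq!(out[0], 150);
    assert_eq!(out[15], 0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo() }
    }
}
```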
+ + + + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := (!(a[i+63:i] <= b[i+63:i])) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := (!(a[i+63:i] < b[i+63:i])) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k". + FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k". + FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (!(a[i+63:i] <= b[i+63:i])) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (!(a[i+63:i] < b[i+63:i])) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
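The double-precision compare records above produce a mask register rather than a vector: the generic form takes one of the 32 `_CMP_*` predicates as an immediate, the named forms fix the predicate, and the zeromasked forms additionally clear result bits whose "k1" bit is not set. A hedged sketch, assuming the generic and the masked less-than forms are `_mm512_cmp_pd_mask` and `_mm512_mask_cmplt_pd_mask`:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo() {
    let a = _mm512_setr_pd(1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0);
    let b = _mm512_set1_pd(3.0);
    // Generic compare with an explicit predicate: a < b (ordered, quiet).
    let lt: __mmask8 = _mm512_cmp_pd_mask::<{ _CMP_LT_OQ }>(a, b);
    assert_eq!(lt, 0b0101_0101);
    // Fixed-predicate form, further gated by a zeromask.
    let k1: __mmask8 = 0b0000_1111;
    let lt_masked = _mm512_mask_cmplt_pd_mask(k1, a, b);
    assert_eq!(lt_masked, 0b0000_0101);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo() }
    }
}
```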
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := (!(a[i+31:i] <= b[i+31:i])) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := (!(a[i+31:i] < b[i+31:i])) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k". + FOR j := 0 to 15 + i := j*32 + k[j] := ((a[i+31:i] != NaN) AND (b[i+31:i] != NaN)) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k". + FOR j := 0 to 15 + i := j*32 + k[j] := ((a[i+31:i] == NaN) OR (b[i+31:i] == NaN)) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (!(a[i+31:i] <= b[i+31:i])) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (!(a[i+31:i] < b[i+31:i])) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] != NaN) AND (b[i+31:i] != NaN)) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] == NaN) OR (b[i+31:i] == NaN)) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
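The masked (zeromask "k1") compare forms above only report the predicate where k1 is set, which makes them convenient for chaining range checks. A hedged C sketch, assuming the immintrin.h names _mm512_cmp_ps_mask and _mm512_mask_cmp_ps_mask and the same build assumptions as the earlier sketch:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512 x  = _mm512_setr_ps(-3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
    __m512 lo = _mm512_set1_ps(0.0f);
    __m512 hi = _mm512_set1_ps(5.0f);
    __mmask16 k1 = _mm512_cmp_ps_mask(x, lo, _CMP_GE_OS);          /* x >= 0 */
    __mmask16 k  = _mm512_mask_cmp_ps_mask(k1, x, hi, _CMP_LT_OS); /* and x < 5 */
    printf("in-range mask = 0x%04x\n", (unsigned)k); /* lanes 3..7 -> 0x00F8 */
    return 0;
}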
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
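A short C sketch of the signed 32-bit compares above (assumed immintrin.h names: _mm512_cmp_epi32_mask with an _MM_CMPINT_* operand, plus the fixed-predicate _mm512_cmpeq_epi32_mask); same AVX-512F build assumptions as the earlier sketches:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512i a = _mm512_set1_epi32(-1);
    __m512i b = _mm512_set1_epi32(1);
    __mmask16 lt = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT); /* signed: -1 < 1 in every lane */
    __mmask16 eq = _mm512_cmpeq_epi32_mask(a, b);
    printf("lt=0x%04x eq=0x%04x\n", (unsigned)lt, (unsigned)eq); /* 0xffff 0x0000 */
    return 0;
}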
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
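The epu32 entries above differ from the epi32 ones only in treating each lane as unsigned, which matters for values with the top bit set. A hedged C sketch (assumed immintrin.h names: _mm512_cmplt_epi32_mask and _mm512_cmplt_epu32_mask):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512i a = _mm512_set1_epi32(-1); /* 0xFFFFFFFF, i.e. 4294967295 when read as unsigned */
    __m512i b = _mm512_set1_epi32(1);
    __mmask16 s = _mm512_cmplt_epi32_mask(a, b); /* signed:   -1 < 1         -> 0xffff */
    __mmask16 u = _mm512_cmplt_epu32_mask(a, b); /* unsigned: 0xFFFFFFFF < 1 -> 0x0000 */
    printf("signed=0x%04x unsigned=0x%04x\n", (unsigned)s, (unsigned)u);
    return 0;
}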
+ + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
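A minimal C sketch of the 32-bit-index gather above (assumed immintrin.h name _mm512_i32gather_ps; the 512-bit form takes vindex first, and scale must be a compile-time 1, 2, 4 or 8). Same AVX-512F build assumptions as the earlier sketches:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    float table[32];
    for (int i = 0; i < 32; i++) table[i] = (float)i;
    /* gather every other element: table[0], table[2], ..., table[30] */
    __m512i idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
                                    16, 18, 20, 22, 24, 26, 28, 30);
    __m512 v = _mm512_i32gather_ps(idx, table, 4); /* scale 4 = sizeof(float) */
    float out[16];
    _mm512_storeu_ps(out, v);
    printf("%g %g %g\n", out[0], out[1], out[15]); /* 0 2 30 */
    return 0;
}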
+ + + + Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 16 packed 32-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits of integer data from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 8 packed 64-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
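The aligned loads above may fault on addresses that are not 64-byte aligned, while the masked forms pull unselected lanes from "src". A hedged C sketch (assumed names: _mm512_load_ps and _mm512_mask_load_ps; _Alignas requires C11):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    _Alignas(64) float buf[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    __m512 v = _mm512_load_ps(buf);                   /* requires 64-byte alignment */
    __m512 src = _mm512_set1_ps(-1.0f);
    __m512 m = _mm512_mask_load_ps(src, 0x00FF, buf); /* low 8 lanes from buf, rest -1 */
    float out[16];
    _mm512_storeu_ps(out, m);
    printf("%g %g\n", out[0], out[15]); /* 0 -1 */
    (void)v;
    return 0;
}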
+ + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Loads 8 64-bit integer elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst". + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Loads 8 64-bit integer elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Loads 8 double-precision (64-bit) floating-point elements stored at memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst". + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Loads 8 double-precision (64-bit) floating-point elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
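The mask-move entries above are per-lane selects between "src" and "a". A short C sketch (assumed name _mm512_mask_mov_epi32):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512i a   = _mm512_set1_epi32(7);
    __m512i src = _mm512_set1_epi32(0);
    __m512i r = _mm512_mask_mov_epi32(src, 0x000F, a); /* lanes 0..3 from a, rest from src */
    int out[16];
    _mm512_storeu_si512(out, r);
    printf("%d %d\n", out[0], out[4]); /* 7 0 */
    return 0;
}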
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 16 packed 32-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits of integer data from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 8 packed 64-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
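A short C sketch of the masked store above, which leaves memory untouched in lanes whose mask bit is clear (assumed name _mm512_mask_store_epi32; the destination must be 64-byte aligned):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    _Alignas(64) int out[16] = {0};
    __m512i v = _mm512_set1_epi32(42);
    _mm512_mask_store_epi32(out, 0xAAAA, v); /* write odd-numbered lanes only */
    printf("%d %d\n", out[0], out[1]);       /* 0 42 */
    return 0;
}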
+ + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Stores 8 packed double-precision (64-bit) floating-point elements in "a" to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Stores 8 packed double-precision (64-bit) floating-point elements in "a" to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
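A hedged C sketch of the 32-bit-index scatter above (assumed name _mm512_i32scatter_ps; base_addr comes first, and duplicate indices write to the same location):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    float table[64] = {0};
    __m512i idx = _mm512_setr_epi32(0, 4, 8, 12, 16, 20, 24, 28,
                                    32, 36, 40, 44, 48, 52, 56, 60);
    __m512 v = _mm512_set1_ps(1.5f);
    _mm512_i32scatter_ps(table, idx, v, 4); /* writes table[0], table[4], ..., table[60] */
    printf("%g %g\n", table[0], table[1]);  /* 1.5 0 */
    return 0;
}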
+ + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] AND b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[511:0] := (a[511:0] AND b[511:0]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 512 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". + +dst[511:0] := ((NOT a[511:0]) AND b[511:0]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in "a" and then AND with "b", and store the results in "dst". + +dst[511:0] := ((NOT a[511:0]) AND b[511:0]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in "a" and "b", and store the results in "dst". + +dst[511:0] := (a[511:0] AND b[511:0]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[511:0] := (a[511:0] OR b[511:0]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 15 + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Logical +
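The test entries above fold a bitwise AND and a per-lane non-zero check into one mask-producing step. A short C sketch of the logical and test intrinsics (assumed names: _mm512_and_epi32, _mm512_andnot_epi32, _mm512_test_epi32_mask):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512i a = _mm512_set1_epi32(0x0F0F0F0F);
    __m512i b = _mm512_set1_epi32(0x00FF00FF);
    __m512i and_v    = _mm512_and_epi32(a, b);    /* 0x000F000F per lane */
    __m512i andnot_v = _mm512_andnot_epi32(a, b); /* (~a) & b = 0x00F000F0 per lane */
    __mmask16 nz = _mm512_test_epi32_mask(a, b);  /* bit set where (a & b) != 0 */
    printf("nz=0x%04x\n", (unsigned)nz);          /* 0xffff */
    (void)and_v; (void)andnot_v;
    return 0;
}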
+ + + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[511:0] := (a[511:0] XOR b[511:0]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Reduce the packed 32-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[31:0] AND src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] AND src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_AND(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0xFFFFFFFF + FI +ENDFOR +dst[31:0] := REDUCE_AND(tmp, 16) + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Reduce the packed 64-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[63:0] AND src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] AND src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_AND(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0xFFFFFFFFFFFFFFFF + FI +ENDFOR +dst[63:0] := REDUCE_AND(tmp, 8) + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Reduce the packed 32-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[31:0] OR src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] OR src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_OR(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0 + FI +ENDFOR +dst[31:0] := REDUCE_OR(tmp, 16) + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Reduce the packed 64-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[63:0] OR src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] OR src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_OR(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0 + FI +ENDFOR +dst[63:0] := REDUCE_OR(tmp, 8) + + AVX512F +
immintrin.h
+ Logical +
+ + + + Reduce the packed 32-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[31:0] AND src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] AND src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_AND(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_AND(a, 16) + + AVX512F +
immintrin.h
+ Logical +
+ + + + Reduce the packed 64-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[63:0] AND src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] AND src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_AND(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_AND(a, 8) + + AVX512F +
immintrin.h
+ Logical +
+ + + + Reduce the packed 32-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[31:0] OR src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] OR src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_OR(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_OR(a, 16) + + AVX512F +
immintrin.h
+ Logical +
+ + + + Reduce the packed 64-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[63:0] OR src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] OR src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_OR(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_OR(a, 8) + + AVX512F +
immintrin.h
+ Logical +
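As the pseudocode above shows, the masked reductions first replace inactive lanes with the identity of the operation (all-ones for AND, zero for OR) and then reduce. A hedged C sketch (assumed names: _mm512_reduce_or_epi32 and _mm512_mask_reduce_or_epi32):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512i v = _mm512_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128,
                                  256, 512, 1024, 2048, 4096, 8192, 16384, 32768);
    int all_or = _mm512_reduce_or_epi32(v);              /* 0xFFFF = 65535 */
    int low_or = _mm512_mask_reduce_or_epi32(0x000F, v); /* 1|2|4|8 = 15 */
    printf("%d %d\n", all_or, low_or);
    return 0;
}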
+ + + + + + + Performs element-by-element bitwise AND between packed 32-bit integer elements of "v2" and "v3", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := v2[i+31:i] & v3[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
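The packed max/min entries above again come in signed (epi32) and unsigned (epu32) flavours. A short C sketch (assumed names: _mm512_max_epi32, _mm512_max_epu32):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512i a = _mm512_set1_epi32(-5); /* 0xFFFFFFFB as unsigned */
    __m512i b = _mm512_set1_epi32(3);
    __m512i smax = _mm512_max_epi32(a, b); /* signed:   max(-5, 3) = 3 */
    __m512i umax = _mm512_max_epu32(a, b); /* unsigned: max(0xFFFFFFFB, 3) = 0xFFFFFFFB */
    int s[16], u[16];
    _mm512_storeu_si512(s, smax);
    _mm512_storeu_si512(u, umax);
    printf("%d %d\n", s[0], u[0]); /* 3 -5 (the unsigned max prints as -5 when read as signed) */
    return 0;
}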
+ + + + + Reduce the packed signed 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := Int32(-0x80000000) + FI +ENDFOR +dst[31:0] := REDUCE_MAX(tmp, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := Int64(-0x8000000000000000) + FI +ENDFOR +dst[63:0] := REDUCE_MAX(tmp, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0 + FI +ENDFOR +dst[31:0] := REDUCE_MAX(tmp, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0 + FI +ENDFOR +dst[63:0] := REDUCE_MAX(tmp, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := Cast_FP64(0xFFEFFFFFFFFFFFFF) + FI +ENDFOR +dst[63:0] := REDUCE_MAX(tmp, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := Cast_FP32(0xFF7FFFFF) + FI +ENDFOR +dst[31:0] := REDUCE_MAX(tmp, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 32-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := Int32(0x7FFFFFFF) + FI +ENDFOR +dst[31:0] := REDUCE_MIN(tmp, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 64-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := Int64(0x7FFFFFFFFFFFFFFF) + FI +ENDFOR +dst[63:0] := REDUCE_MIN(tmp, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 32-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0xFFFFFFFF + FI +ENDFOR +dst[31:0] := REDUCE_MIN(tmp, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 64-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0xFFFFFFFFFFFFFFFF + FI +ENDFOR +dst[63:0] := REDUCE_MIN(tmp, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := Cast_FP64(0x7FEFFFFFFFFFFFFF) + FI +ENDFOR +dst[63:0] := REDUCE_MIN(tmp, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := Cast_FP32(0x7F7FFFFF) + FI +ENDFOR +dst[31:0] := REDUCE_MIN(tmp, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MAX(a, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MAX(a, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MAX(a, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MAX(a, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MAX(a, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MAX(a, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MIN(a, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MIN(a, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MIN(a, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MIN(a, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MIN(a, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MIN(a, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
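[editor's note] The REDUCE_MAX/REDUCE_MIN pseudocode used by the reduction entries above repeatedly folds the upper half of the lane array onto the lower half until two lanes remain. A minimal scalar C sketch of the same recursion for 16 unsigned 32-bit lanes (the helper name is illustrative, not an immintrin.h intrinsic, and it operates in place on a scratch copy):

#include <stdint.h>

/* Scalar sketch of the REDUCE_MAX pseudocode: each round keeps the larger
   of src[j] and src[j + len/2], halving the live length until two lanes
   remain. Clobbers its scratch input. */
static uint32_t reduce_max_u32(uint32_t src[16])
{
    for (int len = 16; len > 2; len /= 2) {
        int half = len / 2;
        for (int j = 0; j < half; j++)
            src[j] = src[j] > src[j + half] ? src[j] : src[j + half];
    }
    return src[0] > src[1] ? src[0] : src[1];
}

The REDUCE_MIN entries are identical with the comparison reversed; the 64-bit and floating-point forms only change the lane type and count.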
+ + + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
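[editor's note] All of the 32-bit shift entries above share the same out-of-range rule: a count of 32 or more yields 0 for the logical shifts and an all-sign-bits value for the arithmetic right shifts. A per-element C sketch of that rule, with illustrative helper names:

#include <stdint.h>

/* Out-of-range behaviour of the 32-bit shift pseudocode above. */
static uint32_t slli_elem(uint32_t a, uint32_t count)
{
    return count > 31 ? 0u : a << count;              /* shift in zeros */
}

static uint32_t srli_elem(uint32_t a, uint32_t count)
{
    return count > 31 ? 0u : a >> count;              /* shift in zeros */
}

static int32_t srai_elem(int32_t a, uint32_t count)
{
    uint32_t u = (uint32_t)a;
    if (count > 31)
        return a < 0 ? -1 : 0;                        /* all copies of the sign bit */
    uint32_t r = u >> count;
    if (a < 0)                                        /* refill vacated bits with the sign */
        r |= ~(UINT32_C(0xFFFFFFFF) >> count);
    return (int32_t)r;
}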
+ + + + Cast vector of type __m512d to type __m512. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512d to type __m512i. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512 to type __m512d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512 to type __m512i. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512i to type __m512d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512i to type __m512. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
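[editor's note] The six cast entries above only reinterpret the 512 bits of a vector; no instruction is generated. Assuming these correspond to the usual _mm512_cast* family declared in immintrin.h, a small hedged example of the typical use, masking float sign bits through an integer view (requires an AVX-512F target such as -mavx512f):

#include <immintrin.h>

/* Clear the sign bit of 16 packed floats by reinterpreting the vector as
   integers, masking, and reinterpreting back; the casts themselves are free. */
static __m512 abs_ps512(__m512 v)
{
    __m512i bits = _mm512_castps_si512(v);                      /* reinterpret only */
    bits = _mm512_and_si512(bits, _mm512_set1_epi32(0x7FFFFFFF));
    return _mm512_castsi512_ps(bits);                           /* reinterpret back */
}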
+ + + + Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". + +FOR j := 0 to 7 + i := j*32 + n := j*64 + dst[n+63:n] := Convert_FP32_To_FP64(v2[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[l+63:l] := Convert_FP32_To_FP64(v2[i+31:i]) + ELSE + dst[l+63:l] := src[l+63:l] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". + +FOR j := 0 to 7 + i := j*32 + l := j*64 + dst[l+63:l] := Convert_Int32_To_FP64(v2[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := j*64 + IF k[j] + dst[n+63:n] := Convert_Int32_To_FP64(v2[i+31:i]) + ELSE + dst[n+63:n] := src[n+63:n] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". + +FOR j := 0 to 7 + i := j*32 + n := j*64 + dst[n+63:n] := Convert_Int32_To_FP64(v2[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Performs element-by-element conversion of the lower half of 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[l+63:l] := Convert_Int32_To_FP64(v2[i+31:i]) + ELSE + dst[l+63:l] := src[l+63:l] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst". The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. + +FOR j := 0 to 7 + i := j*64 + k := j*32 + dst[k+31:k] := Convert_FP64_To_FP32(v2[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_FP64_To_FP32(v2[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
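[editor's note] The "lower half" conversion entries above read only the low eight 32-bit lanes of the source when widening to double precision, and in the opposite direction pack eight doubles into the low half of the result and zero the rest. A scalar C sketch of the widening direction (illustrative helper, not an intrinsic):

/* Only source lanes 0..7 participate; each widens to one double. */
static void cvtpslo_pd_ref(const float src[16], double dst[8])
{
    for (int j = 0; j < 8; j++)
        dst[j] = (double)src[j];   /* lanes 8..15 of src are ignored */
}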
+ + + + + + + Takes 8 packed 64-bit integer elements from "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". +
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ENDFOR
+
+
+ AVX512F
+
immintrin.h
+ Store +
+ + + + + + + + Takes 8 packed 64-bit integer elements from "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements whose corresponding mask bit is not set are not written to memory). +
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ FI
+ENDFOR
+
+
+ AVX512F
+
immintrin.h
+ Store +
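[editor's note] The scatter pseudocode above expresses addresses in bits (hence the trailing "* 8" and the MEM[addr+63:addr] notation); read in ordinary byte terms, element j of "a" is written to base_addr + vindex[j] * scale, and the masked form simply skips lanes whose mask bit is clear. A scalar C sketch of that addressing, with illustrative names:

#include <stdint.h>
#include <string.h>

/* Byte-level model of the 8-element i32 scatter of 64-bit values above. */
static void i32scatter_epi64_ref(void *base_addr, const int32_t vindex[8],
                                 const int64_t a[8], int scale)
{
    for (int j = 0; j < 8; j++) {
        char *addr = (char *)base_addr + (int64_t)vindex[j] * scale;
        memcpy(addr, &a[j], sizeof a[j]);   /* possibly unaligned store */
    }
}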
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512IFMA52 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512IFMA52 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512IFMA52 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512IFMA52 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512IFMA52 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512IFMA52 +
immintrin.h
+ Arithmetic +
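[editor's note] Every madd52 entry above performs the same per-lane step: multiply the low 52 bits of "b" and "c" into a 104-bit product, then add either its low 52 bits (the "lo" forms) or bits 103:52 (the "hi" forms) to the 64-bit accumulator "a". A scalar C sketch using the GCC/Clang extension type unsigned __int128 (helper names are illustrative):

#include <stdint.h>

/* One 64-bit lane of the 52-bit multiply-add pseudocode above. */
static uint64_t madd52lo_elem(uint64_t a, uint64_t b, uint64_t c)
{
    const uint64_t mask52 = (UINT64_C(1) << 52) - 1;
    unsigned __int128 prod = (unsigned __int128)(b & mask52) * (c & mask52);
    return a + (uint64_t)(prod & mask52);          /* tmp[51:0] */
}

static uint64_t madd52hi_elem(uint64_t a, uint64_t b, uint64_t c)
{
    const uint64_t mask52 = (UINT64_C(1) << 52) - 1;
    unsigned __int128 prod = (unsigned __int128)(b & mask52) * (c & mask52);
    return a + (uint64_t)((prod >> 52) & mask52);  /* tmp[103:52] */
}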
+ + + + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := POPCNT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := POPCNT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POPCNT(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := POPCNT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := POPCNT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := POPCNT(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := POPCNT(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := POPCNT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := POPCNT(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POPCNT(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := POPCNT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := POPCNT(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POPCNT(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512VPOPCNTDQ +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := POPCNT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512VPOPCNTDQ +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := POPCNT(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512VPOPCNTDQ +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POPCNT(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512VPOPCNTDQ +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := POPCNT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512VPOPCNTDQ +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := POPCNT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512VPOPCNTDQ +
immintrin.h
+ Bit Manipulation +
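[editor's note] The POPCNT helper shared by all of the entries above is a plain per-lane population count. A scalar C reference that mirrors the bit-by-bit pseudocode loop (real code would typically use a compiler builtin such as __builtin_popcountll or the hardware instruction):

#include <stdint.h>

/* Mirror of DEFINE POPCNT: add the low bit, shift right, repeat. */
static uint64_t popcnt_ref(uint64_t a)
{
    uint64_t count = 0;
    while (a > 0) {
        count += a & 1;
        a >>= 1;
    }
    return count;
}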
+ + + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 15 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:512] := 0 + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 15 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 15 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the BF16 (16-bit) floating-point element in "a" to a floating-point element, and store the result in "dst". This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +dst[31:0] := Convert_BF16_To_FP32(a[15:0]) + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst". + +FOR j := 0 to 31 + IF j < 16 + t := b.fp32[j] + ELSE + t := a.fp32[j-16] + FI + dst.word[j] := Convert_FP32_To_BF16(t) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF j < 16 + t := b.fp32[j] + ELSE + t := a.fp32[j-16] + FI + dst.word[j] := Convert_FP32_To_BF16(t) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF j < 16 + t := b.fp32[j] + ELSE + t := a.fp32[j-16] + FI + dst.word[j] := Convert_FP32_To_BF16(t) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 15 + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst". + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 15 + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Arithmetic +
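[editor's note] Both the BF16-to-FP32 conversion entries and the dpbf16 dot-product entries above rely on BF16 being the upper 16 bits of an IEEE single-precision value: Convert_BF16_To_FP32 and make_fp32 simply place the 16-bit pattern in bits 31:16 of a float whose low bits are zero. A scalar C sketch of that widening and of one 32-bit lane of the dot product, mirroring the pseudocode (helper names are illustrative):

#include <stdint.h>
#include <string.h>

/* BF16 -> FP32: the 16-bit pattern becomes the high half of the float. */
static float bf16_to_fp32(uint16_t x)
{
    uint32_t bits = (uint32_t)x << 16;
    float f;
    memcpy(&f, &bits, sizeof f);       /* bit copy, no numeric conversion */
    return f;
}

/* One lane of the dpbf16 accumulation: widen the BF16 pair from "a" and
   "b", multiply, and add both products to the running FP32 sum. */
static float dpbf16_lane(float acc, const uint16_t a[2], const uint16_t b[2])
{
    acc += bf16_to_fp32(a[1]) * bf16_to_fp32(b[1]);
    acc += bf16_to_fp32(a[0]) * bf16_to_fp32(b[0]);
    return acc;
}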
+ + + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 3 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:128] := 0 + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 3 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 3 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 7 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:256] := 0 + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 7 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 7 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert the single-precision (32-bit) floating-point element in "a" to a BF16 (16-bit) floating-point element, and store the result in "dst". + +dst[15:0] := Convert_FP32_To_BF16(a[31:0]) + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst". + +FOR j := 0 to 7 + IF j < 4 + t := b.fp32[j] + ELSE + t := a.fp32[j-4] + FI + dst.word[j] := Convert_FP32_To_BF16(t) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF j < 4 + t := b.fp32[j] + ELSE + t := a.fp32[j-4] + FI + dst.word[j] := Convert_FP32_To_BF16(t) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF j < 4 + t := b.fp32[j] + ELSE + t := a.fp32[j-4] + FI + dst.word[j] := Convert_FP32_To_BF16(t) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst". + +FOR j := 0 to 15 + IF j < 8 + t := b.fp32[j] + ELSE + t := a.fp32[j-8] + FI + dst.word[j] := Convert_FP32_To_BF16(t) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF j < 8 + t := b.fp32[j] + ELSE + t := a.fp32[j-8] + FI + dst.word[j] := Convert_FP32_To_BF16(t) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF j < 8 + t := b.fp32[j] + ELSE + t := a.fp32[j-8] + FI + dst.word[j] := Convert_FP32_To_BF16(t) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst". + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 3 + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 3 + IF k[j] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 3 + IF k[j] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst". + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 7 + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 7 + IF k[j] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 7 + IF k[j] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Arithmetic +
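[editor's note] Convert_FP32_To_BF16 is not spelled out in the pseudocode above. On the corresponding instructions the discarded low 16 bits are rounded to nearest-even; the sketch below assumes that behaviour (and ignores MXCSR denormal handling), so treat it as an illustration rather than a transcription of the data:

#include <stdint.h>
#include <string.h>

/* FP32 -> BF16 with round-to-nearest-even on the 16 discarded bits.
   Assumed behaviour; NaN inputs keep a set quiet bit. */
static uint16_t fp32_to_bf16(float f)
{
    uint32_t bits;
    memcpy(&bits, &f, sizeof bits);
    if ((bits & 0x7F800000u) == 0x7F800000u && (bits & 0x007FFFFFu) != 0u)
        return (uint16_t)((bits >> 16) | 0x0040u);     /* quiet NaN */
    uint32_t round = 0x7FFFu + ((bits >> 16) & 1u);    /* nearest-even bias */
    return (uint16_t)((bits + round) >> 16);
}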
+ + + + + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 3 //Qword + FOR j := 0 to 7 // Byte + IF k[i*8+j] + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ELSE + dst[i*8+j] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:32] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst". + +FOR i := 0 to 3 //Qword + FOR j := 0 to 7 // Byte + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ENDFOR +ENDFOR +dst[MAX:32] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 1 //Qword + FOR j := 0 to 7 // Byte + IF k[i*8+j] + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ELSE + dst[i*8+j] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:16] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst". + +FOR i := 0 to 1 //Qword + FOR j := 0 to 7 // Byte + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ENDFOR +ENDFOR +dst[MAX:16] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := POPCNT(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := POPCNT(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := POPCNT(a[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := POPCNT(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := POPCNT(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := POPCNT(a[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := POPCNT(a[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := POPCNT(a[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := POPCNT(a[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := POPCNT(a[i+7:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := POPCNT(a[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := POPCNT(a[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at the 8 bit positions controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 7 //Qword + FOR j := 0 to 7 // Byte + IF k[i*8+j] + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ELSE + dst[i*8+j] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:64] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at the 8 bit positions controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst". + +FOR i := 0 to 7 //Qword + FOR j := 0 to 7 // Byte + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ENDFOR +ENDFOR +dst[MAX:64] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
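The two bit-gather entries above select, for each qword of "b", eight bit positions named by the low 6 bits of the corresponding bytes of "c"; the gathered bits form one byte of the 64-bit mask result per qword. A scalar Rust sketch of that pseudocode (function names are illustrative):

```rust
/// Reference model of the bit-gather pseudocode above: for each of the
/// eight qwords of `b`, byte j of the matching qword of `c` names a bit
/// position (low 6 bits); the selected bit lands in bit i*8+j of the
/// 64-bit mask result.
fn bitshuffle_epi64_mask_ref(b: [u64; 8], c: [u64; 8]) -> u64 {
    let mut dst: u64 = 0;
    for i in 0..8 {
        let ctrl = c[i].to_le_bytes();
        for j in 0..8 {
            let m = (ctrl[j] & 0x3F) as u32; // bit index within the qword
            let bit = (b[i] >> m) & 1;       // gathered bit
            dst |= bit << (i * 8 + j);       // store into the mask
        }
    }
    dst
}

/// Zeromask variant: because the mask is applied bit-for-bit on the
/// result, it reduces to an AND with `k`.
fn maskz_bitshuffle_epi64_mask_ref(k: u64, b: [u64; 8], c: [u64; 8]) -> u64 {
    bitshuffle_epi64_mask_ref(b, c) & k
}
```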
+ + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := POPCNT(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := POPCNT(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := POPCNT(a[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := POPCNT(a[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := POPCNT(a[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := POPCNT(a[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
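For completeness, a usage sketch of the 512-bit byte popcount through `core::arch::x86_64` as vendored here. The intrinsic name is assumed to be `_mm512_popcnt_epi8` (the entries above do not carry names in this form of the data), the AVX-512 BITALG intrinsics may still be feature-gated on older toolchains, and the feature is detected at runtime before dispatching:

```rust
#[cfg(target_arch = "x86_64")]
fn popcnt_bytes_512(input: &[u8; 64]) -> [u8; 64] {
    use core::arch::x86_64::*;

    // Assumed intrinsic name; requires AVX512_BITALG at run time.
    #[target_feature(enable = "avx512bitalg,avx512f")]
    unsafe fn inner(input: &[u8; 64]) -> [u8; 64] {
        // Reinterpret the 64 bytes as a 512-bit vector, count bits per
        // byte, and copy the per-byte counts back out.
        let v: __m512i = core::mem::transmute(*input);
        let counts = _mm512_popcnt_epi8(v);
        core::mem::transmute(counts)
    }

    if is_x86_feature_detected!("avx512bitalg") {
        unsafe { inner(input) }
    } else {
        // Scalar fallback mirroring the POPCNT pseudocode.
        input.map(|b| b.count_ones() as u8)
    }
}
```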
+ + + + + Compute the inverse cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ACOS(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ACOSH(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ASIN(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ASINH(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ATAN2(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ATAN(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ATANH(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math FunctionsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := CubeRoot(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := CDFNormal(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := InverseCDFNormal(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := COS(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := COSD(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := COSH(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ERF(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := 1.0 - ERF(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := 1.0 / (1.0 - ERF(a[i+15:i])) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := 1.0 / ERF(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
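The four error-function entries above are all expressed through a single ERF primitive in the pseudocode: erfc as 1.0 - ERF, and the two inverse forms as reciprocals of those expressions. A scalar Rust sketch of that structure, taking the ERF primitive as a parameter (the standard library does not provide one) and using f32 as a stand-in for the FP16 lanes; this mirrors the pseudocode as written, not a numerically careful implementation:

```rust
/// Per-lane model of the erf-family pseudocode above. `erf` stands in for
/// the ERF primitive the data assumes.
fn erf_family_lane(erf: impl Fn(f32) -> f32, a: f32) -> (f32, f32, f32, f32) {
    let erf_a = erf(a);
    let erfc = 1.0 - erf_a;            // complementary error function
    let erfcinv = 1.0 / (1.0 - erf_a); // as written in the pseudocode
    let erfinv = 1.0 / erf_a;          // as written in the pseudocode
    (erf_a, erfc, erfcinv, erfinv)
}
```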
+ + + Compute the exponential value of 10 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := POW(FP16(10.0), a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of 2 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := POW(FP16(2.0), a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := POW(FP16(e), a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := POW(FP16(e), a[i+15:i]) - 1.0 +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SQRT(POW(a[i+15:i], 2.0) + POW(b[i+15:i], 2.0)) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
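The hypotenuse entry above computes SQRT(a^2 + b^2) per lane. In half precision the intermediate squares overflow quickly (the FP16 maximum is 65504, so sides above roughly 255 already overflow when squared), which is why a dedicated operation is useful. A scalar sketch of the pseudocode, with f32 standing in for the FP16 lanes:

```rust
/// Per-lane model of the hypotenuse pseudocode above (f32 stands in for FP16).
fn hypot_lane(a: f32, b: f32) -> f32 {
    (a.powi(2) + b.powi(2)).sqrt()
}

// Rust's built-in `f32::hypot` computes the same quantity while avoiding the
// intermediate overflow/underflow of the naive square-and-add form.
```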
+ + + Compute the inverse cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math FunctionsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := InvCubeRoot(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math FunctionsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := InvSQRT(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-10 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) / LOG(10.0) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of one plus packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := LOG(1.0 + a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-2 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) / LOG(2.0) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + + Elementary Math FunctionsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ConvertExpFP16(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
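The logarithm entries above are all defined through the natural-log primitive (log10 and log2 as LOG(a) divided by LOG(10.0) and LOG(2.0), log1p as LOG(1.0 + a)), and the exponent-conversion entry reduces to floor(log2(x)). A scalar sketch of those identities, with f32 standing in for the FP16 lanes:

```rust
/// Per-lane model of the logarithm pseudocode above (f32 stands in for FP16).
fn log_family_lane(a: f32) -> (f32, f32, f32, f32, f32) {
    let ln = a.ln();
    let log10 = ln / 10.0f32.ln(); // LOG(a) / LOG(10.0)
    let log2 = ln / 2.0f32.ln();   // LOG(a) / LOG(2.0)
    let log1p = (1.0 + a).ln();    // LOG(1.0 + a)
    let getexp = a.log2().floor(); // "floor(log2(x))" from the exponent entry
    (ln, log10, log2, log1p, getexp)
}
```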
+ + + Compute the exponential value of packed half-precision (16-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := POW(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SIN(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine and cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SIN(a[i+15:i]) + MEM[mem_addr+i+15:mem_addr+i] := COS(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +cos_res[MAX:256] := 0 +
immintrin.h
AVX512_FP16
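The sine/cosine entry above is the one operation in this block with two outputs: the sines go to "dst" and the cosines are written through "mem_addr". A scalar Rust sketch of that contract for the sixteen-lane form; the output slice plays the role of the memory operand and f32 stands in for the FP16 lanes:

```rust
/// Model of the sincos pseudocode above: returns the sines and writes the
/// cosines into `cos_out` (the stand-in for the `mem_addr` operand).
fn sincos_ref(a: &[f32; 16], cos_out: &mut [f32; 16]) -> [f32; 16] {
    let mut sines = [0.0f32; 16];
    for j in 0..16 {
        let (s, c) = a[j].sin_cos();
        sines[j] = s;
        cos_out[j] = c;
    }
    sines
}
```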
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SIND(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SINH(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" up to an integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := CEIL(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" down to an integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := FLOOR(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ROUND(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SQRT(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := TAN(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := TAND(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := TANH(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Truncate the packed half-precision (16-bit) floating-point elements in "a", and store the results as packed half-precision floating-point elements in "dst" + + Special Math FunctionsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := TRUNCATE(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ACOS(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ACOSH(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ASIN(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ASINH(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ATAN2(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ATAN(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ATANH(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math FunctionsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := CubeRoot(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := CDFNormal(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := InverseCDFNormal(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" up to an integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := CEIL(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := COS(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := COSD(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := COSH(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ERF(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := 1.0 - ERF(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := 1.0 / (1.0 - ERF(a[i+15:i])) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := 1.0 / ERF(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of 10 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := POW(FP16(10.0), a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of 2 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := POW(FP16(2.0), a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := POW(FP16(e), a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := POW(FP16(e), a[i+15:i]) - 1.0 +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" down to an integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := FLOOR(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SQRT(POW(a[i+15:i], 2.0) + POW(b[i+15:i], 2.0)) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math FunctionsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := InvSQRT(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-10 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) / LOG(10.0) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of one plus packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := LOG(1.0 + a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-2 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) / LOG(2.0) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + + Elementary Math FunctionsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ConvertExpFP16(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ACOS(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ACOSH(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ASIN(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ASINH(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ATAN(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ATANH(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math FunctionsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := CubeRoot(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := CDFNormal(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := InverseCDFNormal(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" up to an integer value, and store the results as packed half-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Special Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := CEIL(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := COS(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + TrigonometryFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := COSD(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := COSH(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ERF(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := 1.0 - ERF(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := 1.0 / (1.0 - ERF(a[i+15:i])) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := 1.0 / ERF(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of 10 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := POW(FP16(10.0), a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of 2 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := POW(FP16(2.0), a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := POW(FP16(e), a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := POW(FP16(e), a[i+15:i]) - 1.0 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" down to an integer value, and store the results as packed half-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Special Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := FLOOR(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math FunctionsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := InvSQRT(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-10 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := LOG(a[i+15:i]) / LOG(10.0) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of one plus packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := LOG(1.0 + a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-2 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := LOG(a[i+15:i]) / LOG(2.0) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := LOG(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + + + + Elementary Math FunctionsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ConvertExpFP16(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Rounds each packed half-precision (16-bit) floating-point element in "a" to the nearest integer value and stores the results as packed half-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Special Math FunctionsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := NearbyInt(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Computes the reciprocal of packed half-precision (16-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := (1.0 / a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Rounds the packed half-precision (16-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Special Math FunctionsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := RoundToNearestEven(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SIN(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine and cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", store the cosine into memory at "mem_addr". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set). + + + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SIN(a[i+15:i]) + MEM[mem_addr+i+15:mem_addr+i] := COS(a[i+15:i]) + ELSE + dst[i+15:i] := sin_src[i+15:i] + MEM[mem_addr+i+15:mem_addr+i] := cos_src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + TrigonometryFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SIND(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SINH(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed half-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Special Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ROUND(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := TAN(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + TrigonometryFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := TAND(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := TANH(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Truncate the packed half-precision (16-bit) floating-point elements in "a", and store the results as packed half-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Special Math FunctionsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := TRUNCATE(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Rounds each packed half-precision (16-bit) floating-point element in "a" to the nearest integer value and stores the results as packed half-precision floating-point elements in "dst". + + Special Math FunctionsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := NearbyInt(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of packed half-precision (16-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := POW(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Computes the reciprocal of packed half-precision (16-bit) floating-point elements in "a", storing the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := (1.0 / a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Rounds the packed half-precision (16-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst". + + Special Math FunctionsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := RoundToNearestEven(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SIN(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine and cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SIN(a[i+15:i]) + MEM[mem_addr+i+15:mem_addr+i] := COS(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SIND(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SINH(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ROUND(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
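Several entries in this block (CEIL, FLOOR, TRUNCATE, ROUND, RoundToNearestEven, NearbyInt) differ only in rounding policy. A per-lane sketch mapping them onto Rust's float rounding methods, with f32 standing in for the FP16 lanes; the exact tie-breaking of ROUND and NearbyInt is not spelled out in the pseudocode, so `round` and `round_ties_even` are used here as plausible stand-ins:

```rust
/// Per-lane model of the rounding entries above (f32 stands in for FP16).
fn rounding_family_lane(a: f32) -> (f32, f32, f32, f32, f32) {
    let ceil = a.ceil();            // CEIL
    let floor = a.floor();          // FLOOR
    let trunc = a.trunc();          // TRUNCATE
    let round = a.round();          // ROUND (ties away from zero in Rust)
    let rint = a.round_ties_even(); // RoundToNearestEven
    (ceil, floor, trunc, round, rint)
}
```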
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := TAN(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := TAND(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := TANH(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Truncate the packed half-precision (16-bit) floating-point elements in "a", and store the results as packed half-precision floating-point elements in "dst". + + Special Math FunctionsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := TRUNCATE(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ACOS(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ACOSH(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ASIN(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ASINH(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ATAN2(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ATAN(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ATANH(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math FunctionsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := CubeRoot(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := CDFNormal(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := InverseCDFNormal(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := COS(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := COSD(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := COSH(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ERF(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := 1.0 - ERF(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := 1.0 / (1.0 - ERF(a[i+15:i])) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := 1.0 / ERF(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of 10 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := POW(FP16(10.0), a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of 2 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := POW(FP16(2.0), a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := POW(FP16(e), a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := POW(FP16(e), a[i+15:i]) - 1.0 +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SQRT(POW(a[i+15:i], 2.0) + POW(b[i+15:i], 2.0)) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := InvCubeRoot(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := InvSQRT(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-10 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) / LOG(10.0) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of one plus packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := LOG(1.0 + a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-2 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) / LOG(2.0) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ConvertExpFP16(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
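The exponent-extraction entry above reduces to floor(log2(x)) per element. A minimal scalar sketch in Rust, with f32 standing in for FP16, a hypothetical helper name, and no modeling of zero, negative, or non-finite inputs:

// Scalar model of ConvertExpFP16: the result is floor(log2(x)) returned as a
// floating-point value. The abs() mirrors working on the magnitude; special
// inputs (0, negatives, NaN, infinity) are not handled in this sketch.
fn convert_exp(x: f32) -> f32 {
    x.abs().log2().floor()
}

fn main() {
    let a = [1.0_f32, 2.0, 3.0, 8.0, 0.5, 100.0, 1.5, 0.25];
    let dst: Vec<f32> = a.iter().map(|&x| convert_exp(x)).collect();
    println!("{:?}", dst); // 0, 1, 1, 3, -1, 6, 0, -2
}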
+ + + Compute the exponential value of packed half-precision (16-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := POW(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SIN(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine and cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SIN(a[i+15:i]) + MEM[mem_addr+i+15:mem_addr+i] := COS(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +cos_res[MAX:128] := 0 +
immintrin.h
AVX512_FP16
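The sincos entry is unusual in producing two result vectors: the sine through "dst" and the cosine through memory at "mem_addr". A minimal scalar sketch of that shape in Rust (f32 stands in for FP16, the out-parameter models the memory store, and the function name is hypothetical):

// Scalar model of the sincos pseudocode: sine goes to the returned vector,
// cosine is written through an out parameter, mirroring the dst / mem_addr split.
fn sincos(a: &[f32; 8], cos_out: &mut [f32; 8]) -> [f32; 8] {
    let mut dst = [0.0_f32; 8];
    for j in 0..8 {
        dst[j] = a[j].sin();
        cos_out[j] = a[j].cos(); // models MEM[mem_addr + ...] := COS(a[...])
    }
    dst
}

fn main() {
    let a = [0.0_f32, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5];
    let mut cos_out = [0.0_f32; 8];
    let sin_out = sincos(&a, &mut cos_out);
    println!("sin: {:?}\ncos: {:?}", sin_out, cos_out);
}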
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SIND(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SINH(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" up to an integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := CEIL(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" down to an integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := FLOOR(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ROUND(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ph". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SQRT(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := TAN(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := TAND(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := TANH(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Truncate the packed half-precision (16-bit) floating-point elements in "a", and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := TRUNCATE(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
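The four rounding entries above (CEIL, FLOOR, ROUND, TRUNCATE) differ only in the rounding rule applied per lane. A minimal scalar sketch in Rust, with f32 standing in for FP16; the exact tie-breaking behaviour of ROUND is not modeled here:

// Scalar model of the four rounding pseudocodes above.
fn main() {
    let a = [-1.7_f32, -0.5, 0.5, 1.2, 1.5, 2.5, -2.3, 3.7];
    let ceil: Vec<f32> = a.iter().map(|x| x.ceil()).collect();   // CEIL
    let floor: Vec<f32> = a.iter().map(|x| x.floor()).collect(); // FLOOR
    let round: Vec<f32> = a.iter().map(|x| x.round()).collect(); // ROUND (ties not modeled exactly)
    let trunc: Vec<f32> = a.iter().map(|x| x.trunc()).collect(); // TRUNCATE
    println!("{:?}\n{:?}\n{:?}\n{:?}", ceil, floor, round, trunc);
}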
+ + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := a.fp16[j] + b.fp16[j] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
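The three 128-bit add forms above illustrate the general unmasked / writemask / zeromask pattern used throughout this section. A minimal scalar sketch of that pattern in Rust (f32 stands in for FP16, a u8 models the low 8 bits of the mask register; the function names are hypothetical, not the intrinsic names):

// Scalar model of the unmasked, writemask, and zeromask add variants.
fn add(a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
    core::array::from_fn(|j| a[j] + b[j])
}

// Writemask: lanes with a clear mask bit are copied from src.
fn mask_add(src: &[f32; 8], k: u8, a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
    core::array::from_fn(|j| if k & (1 << j) != 0 { a[j] + b[j] } else { src[j] })
}

// Zeromask: lanes with a clear mask bit are zeroed.
fn maskz_add(k: u8, a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
    core::array::from_fn(|j| if k & (1 << j) != 0 { a[j] + b[j] } else { 0.0 })
}

fn main() {
    let a = [1.0_f32; 8];
    let b = [2.0_f32; 8];
    let src = [9.0_f32; 8];
    println!("{:?}", add(&a, &b));
    println!("{:?}", mask_add(&src, 0b0000_1111, &a, &b)); // low 4 lanes added, rest from src
    println!("{:?}", maskz_add(0b0000_1111, &a, &b));      // low 4 lanes added, rest zeroed
}

The same src/k/zero pattern applies unchanged to the subtract, multiply, divide, and FMA entries that follow; only the per-lane arithmetic changes.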
+ + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 TO 15 + dst.fp16[j] := a.fp16[j] + b.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 7 + dst.fp16[j] := a.fp16[j] / b.fp16[j] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := a.fp16[j] / b.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 7 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 7 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
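The multiply-add/subtract entries above alternate per lane: even-indexed lanes compute a*b - c, odd-indexed lanes compute a*b + c. A minimal scalar sketch in Rust (f32 stands in for FP16; the function name is hypothetical):

// Scalar model of the fused multiply-add/subtract alternation.
fn fmaddsub(a: &[f32; 8], b: &[f32; 8], c: &[f32; 8]) -> [f32; 8] {
    core::array::from_fn(|j| {
        if j & 1 == 0 {
            a[j].mul_add(b[j], -c[j]) // (a*b) - c on even lanes
        } else {
            a[j].mul_add(b[j], c[j])  // (a*b) + c on odd lanes
        }
    })
}

fn main() {
    let a = [1.0_f32; 8];
    let b = [2.0_f32; 8];
    let c = [10.0_f32; 8];
    println!("{:?}", fmaddsub(&a, &b, &c)); // [-8, 12, -8, 12, ...]
}

The subtract-and-add entries that follow are the mirror image: even lanes add "c", odd lanes subtract it.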
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := a.fp16[j] - b.fp16[j] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 TO 15 + dst.fp16[j] := a.fp16[j] - b.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR i := 0 TO 7 + dst.fp16[i] := a.fp16[i] * b.fp16[i] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 TO 7 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 TO 7 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR i := 0 TO 15 + dst.fp16[i] := a.fp16[i] * b.fp16[i] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 TO 15 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 TO 15 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
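The complex-multiply entries treat each pair of adjacent lanes as one complex number, [real, imaginary], so the 128-bit form holds four complex values. A minimal scalar sketch of the unmasked form in Rust (f32 stands in for FP16; the function name is hypothetical):

// Scalar model of the packed complex multiply: for each complex slot i,
// real = ar*br - ai*bi and imag = ai*br + ar*bi.
fn cmul(a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
    let mut dst = [0.0_f32; 8];
    for i in 0..4 {
        let (ar, ai) = (a[2 * i], a[2 * i + 1]);
        let (br, bi) = (b[2 * i], b[2 * i + 1]);
        dst[2 * i] = ar * br - ai * bi;     // real part
        dst[2 * i + 1] = ai * br + ar * bi; // imaginary part
    }
    dst
}

fn main() {
    // (1 + 2i) * (3 + 4i) = -5 + 10i in the first complex slot.
    let a = [1.0_f32, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
    let b = [3.0_f32, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
    println!("{:?}", cmul(&a, &b));
}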
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
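The conjugate variant multiplies "a" by conj(b) = b.re - i*b.im, so the cross terms change sign relative to the plain complex multiply. A minimal scalar sketch in Rust (f32 stands in for FP16; the function name is hypothetical):

// Scalar model of the conjugate complex multiply: for each complex slot i,
// real = ar*br + ai*bi and imag = ai*br - ar*bi, i.e. a * conj(b).
fn cmul_conj(a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
    let mut dst = [0.0_f32; 8];
    for i in 0..4 {
        let (ar, ai) = (a[2 * i], a[2 * i + 1]);
        let (br, bi) = (b[2 * i], b[2 * i + 1]);
        dst[2 * i] = ar * br + ai * bi;     // real part of a * conj(b)
        dst[2 * i + 1] = ai * br - ar * bi; // imaginary part of a * conj(b)
    }
    dst
}

fn main() {
    // (1 + 2i) * conj(3 + 4i) = (1 + 2i) * (3 - 4i) = 11 + 2i.
    let a = [1.0_f32, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
    let b = [3.0_f32, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
    println!("{:?}", cmul_conj(&a, &b));
}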
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
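The complex multiply-accumulate entries add each complex product to the matching complex number in "c". A minimal scalar sketch of the unmasked 128-bit form in Rust (f32 stands in for FP16; the function name is hypothetical):

// Scalar model of the complex multiply-accumulate: dst[i] = a[i]*b[i] + c[i]
// per complex slot, with the same [real, imaginary] lane layout as above.
fn cfmadd(a: &[f32; 8], b: &[f32; 8], c: &[f32; 8]) -> [f32; 8] {
    let mut dst = [0.0_f32; 8];
    for i in 0..4 {
        let (ar, ai) = (a[2 * i], a[2 * i + 1]);
        let (br, bi) = (b[2 * i], b[2 * i + 1]);
        dst[2 * i] = ar * br - ai * bi + c[2 * i];         // real part
        dst[2 * i + 1] = ai * br + ar * bi + c[2 * i + 1]; // imaginary part
    }
    dst
}

fn main() {
    // (1 + 2i)*(3 + 4i) + (100 + 100i) = 95 + 110i in the first slot.
    let a = [1.0_f32, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
    let b = [3.0_f32, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
    let c = [100.0_f32, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
    println!("{:?}", cfmadd(&a, &b, &c));
}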
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
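The complex multiply-accumulate entries above all compute dst = a*b + c lane by lane; the masked variants only change which lanes are kept. A scalar sketch of the per-lane arithmetic in Rust, again with f32 standing in for fp16, purely to make the pseudocode's real/imaginary bookkeeping concrete.

// One lane of a complex multiply-accumulate: dst = a*b + c, with each complex value
// stored as (re, im). Matches the two dst.fp16[2*i+0]/[2*i+1] lines of the pseudocode.
fn complex_fmadd(a: (f32, f32), b: (f32, f32), c: (f32, f32)) -> (f32, f32) {
    let (ar, ai) = a;
    let (br, bi) = b;
    let (cr, ci) = c;
    (ar * br - ai * bi + cr, ai * br + ar * bi + ci)
}

fn main() {
    // (1 + 2i) * (3 + 4i) = -5 + 10i; adding (1 + 1i) gives -4 + 11i.
    assert_eq!(complex_fmadd((1.0, 2.0), (3.0, 4.0), (1.0, 1.0)), (-4.0, 11.0));
}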
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
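The conjugate multiply-accumulate entries are the same accumulation with "b" conjugated first, which only flips the sign of the two cross terms. A matching scalar sketch in Rust (f32 standing in for fp16):

// One lane of a conjugate complex multiply-accumulate: dst = a*conj(b) + c.
fn complex_fcmadd(a: (f32, f32), b: (f32, f32), c: (f32, f32)) -> (f32, f32) {
    let (ar, ai) = a;
    let (br, bi) = b;
    let (cr, ci) = c;
    (ar * br + ai * bi + cr, ai * br - ar * bi + ci)
}

fn main() {
    // (1 + 2i) * conj(3 + 4i) = (1 + 2i) * (3 - 4i) = 11 + 2i; adding (1 + 1i) gives 12 + 3i.
    assert_eq!(complex_fcmadd((1.0, 2.0), (3.0, 4.0), (1.0, 1.0)), (12.0, 3.0));
}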
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + +tmp := a +FOR i := 0 to 7 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+8] +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] + tmp.fp16[1] + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". + +tmp := a +FOR i := 0 to 7 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+8] +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] * tmp.fp16[1] + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". + +tmp := a +FOR i := 0 to 7 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8]) +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] > tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". + +tmp := a +FOR i := 0 to 7 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8]) +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] < tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + +tmp := a +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] + tmp.fp16[1] + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". + +tmp := a +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] * tmp.fp16[1] + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". + +tmp := a +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] > tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". + +tmp := a +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] < tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
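All of the reduce entries above use the same halving tree: the upper half of the vector is folded onto the lower half until a single element remains. A generic Rust sketch of that tree (f32 standing in for fp16; for floating-point addition and multiplication this tree order can round differently from a strict left-to-right reduction, which is why the pseudocode spells the order out).

// Pairwise halving reduction, mirroring the tmp.fp16[i] := op(tmp.fp16[i], tmp.fp16[i + n/2])
// loops in the pseudocode above. The same shape serves add, mul, max and min.
fn reduce_tree(mut v: Vec<f32>, op: impl Fn(f32, f32) -> f32) -> f32 {
    assert!(v.len().is_power_of_two());
    let mut n = v.len();
    while n > 1 {
        n /= 2;
        for i in 0..n {
            v[i] = op(v[i], v[i + n]);
        }
    }
    v[0]
}

fn main() {
    let a: Vec<f32> = (1..=8).map(|x| x as f32).collect();
    assert_eq!(reduce_tree(a.clone(), |x, y| x + y), 36.0);
    assert_eq!(reduce_tree(a, f32::max), 8.0);
}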
+ + + + Finds the absolute value of each packed half-precision (16-bit) floating-point element in "v2", storing the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := ABS(v2.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Finds the absolute value of each packed half-precision (16-bit) floating-point element in "v2", storing the results in "dst". + +FOR j := 0 to 7 + dst.fp16[j] := ABS(v2.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
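The ABS entries only need to clear the sign bit of each 16-bit element; the rest of the encoding is unchanged. A one-function Rust sketch at the bit level, which is one common way such an operation is carried out and is value-equivalent to the ABS in the pseudocode.

// Absolute value of one binary16 element by clearing its sign bit (bit 15).
fn abs_fp16_bits(x: u16) -> u16 {
    x & 0x7fff
}

fn main() {
    // binary16 bit patterns: -2.0 = 0xC000, 2.0 = 0x4000.
    assert_eq!(abs_fp16_bits(0xC000), 0x4000);
    assert_eq!(abs_fp16_bits(0x4000), 0x4000);
}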
+ + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
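The conjugation entries work on 32-bit lanes, each holding one fp16 complex pair (real in the low half, imaginary in the high half), and XOR the lane with the bit pattern of -0.0f32. That flips only bit 31, i.e. the sign of the imaginary element, which is exactly complex conjugation. A small Rust sketch of the same bit trick:

// Conjugate one packed fp16 complex pair: bits [15:0] = real, [31:16] = imaginary.
// XOR with the bits of -0.0f32 (0x8000_0000) flips only the imaginary sign bit.
fn conj_lane(lane: u32) -> u32 {
    lane ^ (-0.0f32).to_bits()
}

fn main() {
    // binary16 bit patterns: 1.0 = 0x3C00, 2.0 = 0x4000, -2.0 = 0xC000.
    let lane = (0x4000u32 << 16) | 0x3C00; // 1.0 + 2.0i
    assert_eq!(conj_lane(lane), (0xC000u32 << 16) | 0x3C00); // 1.0 - 2.0i
}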
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + IF k1[j] + k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + IF k1[j] + k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Compare +
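Each compare entry decodes the low five bits of imm8 into one of the 32 _CMP_* predicates and packs the per-element results into a mask register. A Rust sketch of that overall shape for a handful of predicates, with f32 standing in for fp16; only the listed cases are modelled, and Rust's native float comparisons already give the right ordered/unordered behaviour for them (NaN fails == and <, and satisfies !=).

// Model of the per-element compare: the low five bits of imm8 pick the predicate,
// and bit j of the result mask is set when a[j] OP b[j] holds.
fn cmp_mask(a: &[f32], b: &[f32], imm8: u8) -> u16 {
    let mut k = 0u16;
    for j in 0..a.len() {
        let hit = match imm8 & 0x1f {
            0 => a[j] == b[j],                   // _CMP_EQ_OQ
            1 => a[j] < b[j],                    // _CMP_LT_OS
            3 => a[j].is_nan() || b[j].is_nan(), // _CMP_UNORD_Q
            4 => a[j] != b[j],                   // _CMP_NEQ_UQ
            _ => unimplemented!("remaining predicates omitted from this sketch"),
        };
        k |= (hit as u16) << j;
    }
    k
}

fn main() {
    let a = [1.0, 2.0, f32::NAN, 4.0];
    let b = [1.0, 3.0, 3.0, 4.0];
    assert_eq!(cmp_mask(&a, &b, 0), 0b1001); // equal only in lanes 0 and 3
    assert_eq!(cmp_mask(&a, &b, 1), 0b0010); // a < b only in lane 1
}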
+ + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
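One caveat that applies to all of the 16-bit-integer-to-half-precision entries above, signed and unsigned alike: binary16 has an 11-bit significand, so integer magnitudes above 2048 are rounded to the nearest representable value, and the largest unsigned inputs fall outside the finite fp16 range altogether. The Rust sketch below models that quantization arithmetically on the integer values themselves (Rust's f16 type is still unstable at the time of writing, so no actual half-precision type is used); round-to-nearest, ties to even, is assumed.

// Map an unsigned 16-bit integer to the nearest binary16 value (ties to even),
// returning the integer it rounds to, or None when the result exceeds the largest
// finite binary16 value (65504) and would overflow.
fn u16_to_fp16_value(x: u16) -> Option<u32> {
    let x = x as u32;
    if x <= 2048 {
        return Some(x); // everything up to 2^11 is exactly representable
    }
    let bits = 32 - x.leading_zeros(); // position of the leading one
    let step = 1u32 << (bits - 11);    // spacing of representable values in this binade
    let down = x & !(step - 1);
    let rem = x - down;
    let rounded = if rem * 2 > step || (rem * 2 == step && (down / step) % 2 == 1) {
        down + step
    } else {
        down
    };
    if rounded > 65504 { None } else { Some(rounded) }
}

fn main() {
    assert_eq!(u16_to_fp16_value(2049), Some(2048));   // halfway case, ties to even
    assert_eq!(u16_to_fp16_value(65504), Some(65504)); // largest finite binary16
    assert_eq!(u16_to_fp16_value(65535), None);        // rounds past 65504
}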
+ + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 to 3 + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 to 3 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 to 3 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 TO 3 + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 TO 7 + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 3 + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
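The only difference between the plain fp16-to-integer conversion entries and the "with truncation" entries above is the final rounding step: the former use the current rounding mode (round-to-nearest-even by default), the latter always chop toward zero. A tiny Rust sketch of that difference, with f32 standing in for fp16 and the default rounding mode assumed.

// cvt-style conversion: round to nearest, ties to even (the usual default),
// versus cvtt-style conversion: always truncate toward zero.
fn convert(x: f32) -> i32 {
    x.round_ties_even() as i32
}

fn convert_truncate(x: f32) -> i32 {
    x.trunc() as i32
}

fn main() {
    assert_eq!(convert(1.5), 2);            // ties go to the even integer
    assert_eq!(convert(-1.7), -2);          // nearest integer is -2
    assert_eq!(convert_truncate(1.5), 1);   // truncation just drops the fraction
    assert_eq!(convert_truncate(-1.7), -1); // ...which moves negative values toward zero
}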
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 TO 3 + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 TO 7 + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 3 + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 TO 1 + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 TO 3 + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 1 + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 3 + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 TO 1 + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 TO 3 + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 1 + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 3 + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
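For the unsigned 64-bit truncating variant, a sketch assuming the name _mm256_cvttph_epu64.

    #include <immintrin.h>

    /* Assumed intrinsic: _mm256_cvttph_epu64. The 128-bit source holds eight
       fp16 lanes, but only the four lowest participate, one per 64-bit
       output lane; the fraction is discarded (truncation toward zero). */
    static inline __m256i fp16_to_u64_trunc(__m128h a)
    {
        return _mm256_cvttph_epu64(a);
    }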
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 7 + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 15 + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 15 + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
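The 16-bit conversions are the only ones in this group that are full width: every fp16 lane maps onto the integer lane with the same index. A sketch assuming the name _mm256_cvttph_epi16.

    #include <immintrin.h>

    /* Assumed intrinsic: _mm256_cvttph_epi16. Sixteen fp16 lanes of a 256-bit
       source become sixteen signed 16-bit lanes, truncated toward zero. */
    static inline __m256i fp16_to_i16_trunc(__m256h a)
    {
        return _mm256_cvttph_epi16(a);
    }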
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 7 + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 15 + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 15 + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := src.fp64[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := src.fp64[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
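A sketch of the fp16-to-double widening conversion, assuming the name _mm256_cvtph_pd.

    #include <immintrin.h>

    /* Assumed intrinsic: _mm256_cvtph_pd. Widens the four lowest fp16 lanes
       of `a` to four fp64 lanes. Every fp16 value is exactly representable
       in fp64, so no rounding occurs. */
    static inline __m256d fp16_to_f64(__m128h a)
    {
        return _mm256_cvtph_pd(a);
    }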
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := src.fp32[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := src.fp32[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
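A sketch of the fp16-to-float widening conversion. The AVX512-FP16 form is assumed to be _mm256_cvtxph_ps (with an x), which takes an fp16 vector directly, unlike the older F16C _mm256_cvtph_ps that takes an integer vector.

    #include <immintrin.h>

    /* Assumed intrinsic: _mm256_cvtxph_ps. Widens eight fp16 lanes to eight
       fp32 lanes; the conversion is exact. */
    static inline __m256 fp16_to_f32(__m128h a)
    {
        return _mm256_cvtxph_ps(a);
    }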
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 7 + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 15 + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [max_float_note] + +dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] + +dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
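A sketch of the packed maximum and of the scalar variant with exception suppression. The names _mm256_max_ph and _mm_max_round_sh, and the use of _MM_FROUND_NO_EXC for the [sae_note] behaviour, are assumptions.

    #include <immintrin.h>

    /* Assumed intrinsic: _mm256_max_ph, element-wise maximum over 16 lanes. */
    static inline __m256h max_ph_all(__m256h a, __m256h b)
    {
        return _mm256_max_ph(a, b);
    }

    /* Assumed intrinsic: _mm_max_round_sh. Lane 0 gets max(a[0], b[0])
       without raising FP exceptions; lanes 1..7 are copied from a, as in
       the scalar pseudocode above. */
    static inline __m128h max_sh_noexc(__m128h a, __m128h b)
    {
        return _mm_max_round_sh(a, b, _MM_FROUND_NO_EXC);
    }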
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 7 + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 15 + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [min_float_note] + +dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] + +dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
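The minimum family mirrors the maximum family; a sketch of the merge-masked scalar form, assuming the name _mm_mask_min_sh.

    #include <immintrin.h>

    /* Assumed intrinsic: _mm_mask_min_sh. Lane 0 receives min(a[0], b[0])
       only when bit 0 of k is set, otherwise it is taken from src; lanes
       1..7 always come from a. */
    static inline __m128h min_sh_mask(__m128h src, __mmask8 k, __m128h a, __m128h b)
    {
        return _mm_mask_min_sh(src, k, a, b);
    }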
+ + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 7 + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 15 + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
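A sketch of the round-to-fixed-fraction operation above, assuming the name _mm256_roundscale_ph. Per the pseudocode, imm8[7:4] is the number of fraction bits M to keep and imm8[3:0] selects the rounding; using the usual _MM_FROUND_* constants for the low nibble is an assumption here.

    #include <immintrin.h>

    /* Assumed intrinsic: _mm256_roundscale_ph. M = 1 keeps one fraction bit,
       so every lane is rounded to the nearest multiple of 0.5. */
    static inline __m256h round_to_halves(__m256h a)
    {
        return _mm256_roundscale_ph(a, (1 << 4) | _MM_FROUND_TO_NEAREST_INT);
    }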
+ + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 7 + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 15 + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
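A sketch of the exponent extraction, assuming the name _mm_getexp_ph.

    #include <immintrin.h>

    /* Assumed intrinsic: _mm_getexp_ph. Each output lane is
       floor(log2(|x|)) of the matching input lane, returned as an fp16
       value; e.g. an input lane of 20.0 produces 4.0. */
    static inline __m128h exponents(__m128h a)
    {
        return _mm_getexp_ph(a);
    }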
+ + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 7 + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 7 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 7 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 15 + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 15 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 15 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
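A sketch of mantissa normalization, assuming the name _mm256_getmant_ph and the standard _MM_MANT_* enum values for the norm and sign controls referenced by [getmant_note].

    #include <immintrin.h>

    /* Assumed intrinsic: _mm256_getmant_ph. This choice of controls maps
       every lane's magnitude into [1, 2) and clears the sign. */
    static inline __m256h mantissas(__m256h a)
    {
        return _mm256_getmant_ph(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero);
    }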
+ + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 7 + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 15 + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
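A sketch of the reduce (residual) operation, assuming the name _mm_reduce_ph for the VREDUCE-style operation above (distinct from the horizontal-reduction helpers) and the _MM_FROUND_* encoding for imm8[3:0].

    #include <immintrin.h>

    /* Assumed intrinsic: _mm_reduce_ph. With M = imm8[7:4] = 4, each lane
       becomes x minus x rounded to the nearest multiple of 1/16, i.e. the
       residual left after keeping four fraction bits. */
    static inline __m128h residual_16ths(__m128h a)
    {
        return _mm_reduce_ph(a, (4 << 4) | _MM_FROUND_TO_NEAREST_INT);
    }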
+ + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 7 + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 15 + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
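A sketch of the scale operation, assuming the names _mm256_scalef_ph and _mm256_set1_ph, and a compiler that accepts the _Float16 type.

    #include <immintrin.h>

    /* Assumed intrinsics: _mm256_scalef_ph, _mm256_set1_ph. Computes
       a * 2^FLOOR(b) per lane; with b = 3.0 everywhere this multiplies
       every element of a by 8. */
    static inline __m256h times_eight(__m256h a)
    {
        return _mm256_scalef_ph(a, _mm256_set1_ph((_Float16)3.0f));
    }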
+ + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR i := 0 to 7 + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) +ENDFOR +k[MAX:8] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR i := 0 to 7 + IF k1[i] + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) + ELSE + k[i] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR i := 0 to 15 + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) +ENDFOR +k[MAX:16] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR i := 0 to 15 + IF k1[i] + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) + ELSE + k[i] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
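A sketch of the class test, assuming the name _mm_fpclass_ph_mask and the conventional bit assignments of [fpclass_note] (bit 3 for positive infinity, bit 4 for negative infinity).

    #include <immintrin.h>

    /* Assumed intrinsic: _mm_fpclass_ph_mask. Returns one mask bit per lane;
       a set bit means the lane is +inf or -inf under the assumed imm8
       encoding. */
    static inline __mmask8 infinite_lanes(__m128h a)
    {
        return _mm_fpclass_ph_mask(a, 0x08 | 0x10);
    }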
+ + + + + + Shuffle half-precision (16-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + off := idx[i+2:i] + dst.fp16[j] := idx[i+3] ? b.fp16[off] : a.fp16[off] +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle half-precision (16-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + off := idx[i+3:i] + dst.fp16[j] := idx[i+4] ? b.fp16[off] : a.fp16[off] +ENDFOR +dst[MAX:256] := 0 + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed half-precision (16-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := b.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed half-precision (16-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := b.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle half-precision (16-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + id := idx[i+3:i] + dst.fp16[j] := a.fp16[id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle half-precision (16-bit) floating-point elements in "a" using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + id := idx[i+2:i] + dst.fp16[j] := a.fp16[id] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
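A sketch of the index-driven shuffle and the mask blend, assuming the names _mm_permutexvar_ph and _mm_mask_blend_ph.

    #include <immintrin.h>

    /* Assumed intrinsic: _mm_permutexvar_ph. Each destination lane j reads
       a.fp16[idx.word[j] & 0x7]; this index vector reverses the lanes. */
    static inline __m128h reverse_lanes(__m128h a)
    {
        const __m128i idx = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        return _mm_permutexvar_ph(idx, a);
    }

    /* Assumed intrinsic: _mm_mask_blend_ph. A set mask bit selects b,
       a clear bit selects a; 0x0F takes lanes 0..3 from b. */
    static inline __m128h blend_low_from_b(__m128h a, __m128h b)
    {
        return _mm_mask_blend_ph((__mmask8)0x0F, a, b);
    }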
+ + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
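A short sketch of the approximate reciprocal square root, assuming the immintrin.h names _mm256_rsqrt_ph and _mm256_mask_rsqrt_ph; the documented error bound (1.5*2^-12) is already finer than fp16 precision (about 2^-11), so the approximation is normally used as-is:

#include <immintrin.h>

/* Approximate 1/sqrt(x) over 16 fp16 lanes. */
static inline __m256h approx_rsqrt(__m256h x)
{
    return _mm256_rsqrt_ph(x);
}

/* Writemask form: lanes whose mask bit is clear keep the value from `fallback`. */
static inline __m256h approx_rsqrt_masked(__m256h fallback, __mmask16 k, __m256h x)
{
    return _mm256_mask_rsqrt_ph(fallback, k, x);
}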
+ + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR i := 0 to 7 + dst.fp16[i] := SQRT(a.fp16[i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR i := 0 to 15 + dst.fp16[i] := SQRT(a.fp16[i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
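The zeromask square-root form above is handy for suppressing NaNs from negative inputs; a sketch assuming the immintrin.h names _mm256_cmp_ph_mask and _mm256_maskz_sqrt_ph:

#include <immintrin.h>

/* sqrt of the non-negative lanes; negative (and NaN) lanes are zeroed
   instead of propagating NaN, using the zeromask form described above. */
static inline __m256h sqrt_nonneg(__m256h x)
{
    __mmask16 ok = _mm256_cmp_ph_mask(x, _mm256_setzero_ph(), _CMP_GE_OQ);
    return _mm256_maskz_sqrt_ph(ok, x);
}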
+ + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + dst.fp16[i] := (1.0 / a.fp16[i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + dst.fp16[i] := (1.0 / a.fp16[i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
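For the approximate reciprocal, a brief sketch assuming the name _mm256_rcp_ph; unlike the legacy float32 reciprocal approximation, the 1.5*2^-12 bound is below fp16 precision, so no Newton-Raphson refinement step is normally needed:

#include <immintrin.h>

/* Approximate 1/x per fp16 lane; accurate enough for fp16 without refinement. */
static inline __m256h approx_recip(__m256h x)
{
    return _mm256_rcp_ph(x);
}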
+ + + + Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Load +
+ + + + + Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Store +
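A round-trip sketch for the fp16 load/store entries above, assuming the immintrin.h names _mm256_loadu_ph and _mm256_storeu_ph and a compiler that provides the _Float16 type:

#include <immintrin.h>

/* Copy 16 fp16 values through a 256-bit register. The *_loadu_/*_storeu_
   forms used here have no alignment requirement; _mm256_load_ph and
   _mm256_store_ph require the 32-byte alignment stated above. */
static inline void copy16_fp16(const _Float16 *src, _Float16 *dst)
{
    __m256h v = _mm256_loadu_ph(src);
    _mm256_storeu_ph(dst, v);
}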
+ + + + Return vector of type __m256h with undefined elements. + AVX512_FP16 + AVX512VL +
immintrin.h
+ General Support +
+ + + + Return vector of type __m128h with undefined elements. + AVX512_FP16 + AVX512VL +
immintrin.h
+ General Support +
+ + + + Return vector of type __m256h with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Set +
+ + + + Return vector of type __m128h with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Set +
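One note on the difference between the two initializers above, assuming the names _mm256_setzero_ph and _mm256_undefined_ph:

#include <immintrin.h>

/* _mm256_setzero_ph yields a well-defined all-zero vector (a typical
   accumulator seed); _mm256_undefined_ph only reserves a register and its
   contents must be fully overwritten before being read. */
static inline __m256h zero_acc(void)
{
    return _mm256_setzero_ph();
}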
+ + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 TO 31 + dst.fp16[j] := a.fp16[j] + b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR j := 0 TO 31 + dst.fp16[j] := a.fp16[j] + b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
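A sketch covering the unmasked, write-masked and explicit-rounding forms of the 512-bit addition described above, assuming the immintrin.h names _mm512_add_ph, _mm512_mask_add_ph and _mm512_add_round_ph:

#include <immintrin.h>

/* Plain 32-lane fp16 add. */
static inline __m512h add_all(__m512h a, __m512h b)
{
    return _mm512_add_ph(a, b);
}

/* Writemask form: lanes with a clear bit in `k` keep the value from `src`. */
static inline __m512h add_masked(__m512h src, __mmask32 k, __m512h a, __m512h b)
{
    return _mm512_mask_add_ph(src, k, a, b);
}

/* Explicit rounding mode ([round_note]); the constants come from immintrin.h. */
static inline __m512h add_round_to_zero(__m512h a, __m512h b)
{
    return _mm512_add_round_ph(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}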
+ + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := a.fp16[0] + b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := a.fp16[0] + b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] + b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] + b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] + b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] + b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
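For the scalar forms above, a short sketch assuming the names _mm_add_sh and _mm_add_round_sh; only lane 0 is computed and lanes 1..7 are carried over from "a", as in the pseudocode:

#include <immintrin.h>

/* Scalar fp16 add in lane 0; the upper 7 lanes are copied from `a`. */
static inline __m128h add_lane0(__m128h a, __m128h b)
{
    return _mm_add_sh(a, b);
}

/* Same operation with an explicit rounding mode ([round_note]). */
static inline __m128h add_lane0_rn(__m128h a, __m128h b)
{
    return _mm_add_round_sh(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}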
+ + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := a.fp16[j] / b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + dst.fp16[j] := a.fp16[j] / b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
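The zeromask division form combines well with a compare to guard against zero divisors; a sketch assuming the names _mm512_cmp_ph_mask and _mm512_maskz_div_ph:

#include <immintrin.h>

/* a / b per lane. Lanes whose divisor compares ordered-unequal to zero are
   kept; the remaining lanes (b == 0, or b NaN) are forced to 0 instead of
   producing infinities, via the zeromask form described above. */
static inline __m512h div_or_zero(__m512h a, __m512h b)
{
    __mmask32 ok = _mm512_cmp_ph_mask(b, _mm512_setzero_ph(), _CMP_NEQ_OQ);
    return _mm512_maskz_div_ph(ok, a, b);
}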
+ + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := a.fp16[0] / b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] / b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] / b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := a.fp16[0] / b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] / b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] / b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
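A sketch of the packed fused multiply-add above; the three masked variants differ only in where untouched lanes come from (a, c, or zero). Assuming the immintrin.h names _mm512_fmadd_ph and _mm512_mask3_fmadd_ph:

#include <immintrin.h>

/* acc + x*y over 32 fp16 lanes with a single rounding; `acc` is the c operand. */
static inline __m512h fma_acc(__m512h x, __m512h y, __m512h acc)
{
    return _mm512_fmadd_ph(x, y, acc);
}

/* Masked form in which lanes with a clear mask bit keep the old accumulator
   value, i.e. the "copied from c" variant described above. */
static inline __m512h fma_acc_masked(__m512h x, __m512h y, __m512h acc, __mmask32 k)
{
    return _mm512_mask3_fmadd_ph(x, y, acc, k);
}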
+ + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 31 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 31 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 TO 31 + dst.fp16[j] := a.fp16[j] - b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + [round_note] + +FOR j := 0 TO 31 + dst.fp16[j] := a.fp16[j] - b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
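Illustrative only: the masked entries above all follow two conventions, a writemask (unselected lanes are merged from another operand, "src" here, "a" or "c" in the FMA-style forms) and a zeromask (unselected lanes become zero). A small Rust sketch of the difference, again with f32 lanes and a u32 lane mask standing in for the real types; the names are invented for the sketch.

// Writemask model: lane j is a[j] - b[j] when bit j of k is set, else src[j].
fn mask_sub(src: &[f32], k: u32, a: &[f32], b: &[f32]) -> Vec<f32> {
    (0..a.len())
        .map(|j| if k & (1 << j) != 0 { a[j] - b[j] } else { src[j] })
        .collect()
}

// Zeromask model: unselected lanes are zeroed instead of merged.
fn maskz_sub(k: u32, a: &[f32], b: &[f32]) -> Vec<f32> {
    (0..a.len())
        .map(|j| if k & (1 << j) != 0 { a[j] - b[j] } else { 0.0 })
        .collect()
}

fn main() {
    let a: [f32; 4] = [5.0; 4];
    let b: [f32; 4] = [1.0, 2.0, 3.0, 4.0];
    let src: [f32; 4] = [9.0; 4];
    assert_eq!(mask_sub(&src, 0b0101, &a, &b), vec![4.0f32, 9.0, 2.0, 9.0]);
    assert_eq!(maskz_sub(0b0101, &a, &b), vec![4.0f32, 0.0, 2.0, 0.0]);
}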
+ + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := a.fp16[0] - b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := a.fp16[0] - b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] - b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] - b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] - b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] - b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
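Illustrative only: the scalar (lower-element) entries above operate on lane 0 only and pass the remaining seven 16-bit lanes of "a" through to "dst" unchanged. A Rust sketch of that pattern with an 8-lane f32 array standing in for a 128-bit register; the function name is invented.

// Model of the masked scalar-subtract pseudocode above: lane 0 becomes
// a[0] - b[0] (or src[0] when mask bit 0 is clear); lanes 1..8 come from a.
fn mask_sub_sh(src: [f32; 8], k: u8, a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
    let mut dst = a;
    dst[0] = if (k & 1) != 0 { a[0] - b[0] } else { src[0] };
    dst
}

fn main() {
    let a: [f32; 8] = [8.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
    let b: [f32; 8] = [3.0; 8];
    let src: [f32; 8] = [42.0; 8];
    // Mask bit 0 set: lane 0 is computed, the upper lanes are copied from a.
    assert_eq!(mask_sub_sh(src, 1, a, b), [5.0f32, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]);
    // Mask bit 0 clear: lane 0 is taken from src instead.
    assert_eq!(mask_sub_sh(src, 0, a, b), [42.0f32, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]);
}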
+ + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR i := 0 TO 31 + dst.fp16[i] := a.fp16[i] * b.fp16[i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR i := 0 TO 31 + dst.fp16[i] := a.fp16[i] * b.fp16[i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := a.fp16[0] * b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := a.fp16[0] * b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] * b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] * b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] * b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] * b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
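Illustrative only: in the complex entries above, each pair of adjacent 16-bit lanes holds one complex value, real part in the even lane and imaginary part in the odd lane, and the pseudocode is the usual complex product written out in real arithmetic. A Rust sketch over interleaved f32 pairs, ignoring rounding control; the name is invented.

// Model of the packed complex-multiply pseudocode above over interleaved
// [re0, im0, re1, im1, ...] data:
// (re_a + i*im_a) * (re_b + i*im_b)
//   = (re_a*re_b - im_a*im_b) + i*(im_a*re_b + re_a*im_b)
fn fmul_pch(a: &[f32], b: &[f32]) -> Vec<f32> {
    let mut dst = vec![0.0; a.len()];
    for i in 0..a.len() / 2 {
        let (re_a, im_a) = (a[2 * i], a[2 * i + 1]);
        let (re_b, im_b) = (b[2 * i], b[2 * i + 1]);
        dst[2 * i] = re_a * re_b - im_a * im_b;
        dst[2 * i + 1] = im_a * re_b + re_a * im_b;
    }
    dst
}

fn main() {
    // (1 + 2i) * (3 + 4i) = -5 + 10i
    assert_eq!(fmul_pch(&[1.0, 2.0], &[3.0, 4.0]), vec![-5.0f32, 10.0]);
}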
+ + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
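Illustrative only: the conjugate entries above differ from the plain complex multiply only in the sign of the imaginary part of "b", i.e. they compute a * conj(b). A Rust sketch of that variant, same interleaved layout and f32 stand-in as before; the name is invented.

// Model of the packed conjugate complex-multiply pseudocode above:
// (re_a + i*im_a) * (re_b - i*im_b)
//   = (re_a*re_b + im_a*im_b) + i*(im_a*re_b - re_a*im_b)
fn fcmul_pch(a: &[f32], b: &[f32]) -> Vec<f32> {
    let mut dst = vec![0.0; a.len()];
    for i in 0..a.len() / 2 {
        let (re_a, im_a) = (a[2 * i], a[2 * i + 1]);
        let (re_b, im_b) = (b[2 * i], b[2 * i + 1]);
        dst[2 * i] = re_a * re_b + im_a * im_b;
        dst[2 * i + 1] = im_a * re_b - re_a * im_b;
    }
    dst
}

fn main() {
    // (1 + 2i) * conj(3 + 4i) = (1 + 2i) * (3 - 4i) = 11 + 2i
    assert_eq!(fcmul_pch(&[1.0, 2.0], &[3.0, 4.0]), vec![11.0f32, 2.0]);
}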
+ + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "src", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "src", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
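Illustrative only: the complex fused multiply-add entries above are the same complex product with the corresponding complex accumulator from "c" added to both components. A Rust sketch over interleaved f32 pairs, ignoring rounding control and masking; the name is invented.

// Model of the packed complex multiply-accumulate pseudocode above:
// dst = a * b + c over interleaved [re, im] pairs.
fn fmadd_pch(a: &[f32], b: &[f32], c: &[f32]) -> Vec<f32> {
    let mut dst = vec![0.0; a.len()];
    for i in 0..a.len() / 2 {
        let (re_a, im_a) = (a[2 * i], a[2 * i + 1]);
        let (re_b, im_b) = (b[2 * i], b[2 * i + 1]);
        dst[2 * i] = re_a * re_b - im_a * im_b + c[2 * i];
        dst[2 * i + 1] = im_a * re_b + re_a * im_b + c[2 * i + 1];
    }
    dst
}

fn main() {
    // (1 + 2i) * (3 + 4i) + (100 + 200i) = 95 + 210i
    assert_eq!(
        fmadd_pch(&[1.0, 2.0], &[3.0, 4.0], &[100.0, 200.0]),
        vec![95.0f32, 210.0]
    );
}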
+ + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := a.fp16[0] + dst.fp16[1] := a.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "c" when mask bit 0 is not set), and copy the upper 6 packed elements from "c" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := c.fp16[0] + dst.fp16[1] := c.fp16[1] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := a.fp16[0] + dst.fp16[1] := a.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "c" when mask bit 0 is not set), and copy the upper 6 packed elements from "c" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := c.fp16[0] + dst.fp16[1] := c.fp16[1] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
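The conjugate ("multiply by the complex conjugates of b") entries differ only in the sign pattern: per lane they compute a * conj(b) + c. The corresponding C sketch (float again standing in for FP16, names invented for illustration):

#include <stdio.h>

struct c16 { float re, im; };

/* dst = a * conj(b) + c, matching the sign pattern in the pseudocode above. */
static struct c16 cfcmadd(struct c16 a, struct c16 b, struct c16 c) {
    struct c16 d;
    d.re = a.re * b.re + a.im * b.im + c.re;
    d.im = a.im * b.re - a.re * b.im + c.im;
    return d;
}

int main(void) {
    struct c16 a = {1.0f, 2.0f}, b = {3.0f, 4.0f}, c = {0.0f, 0.0f};
    struct c16 d = cfcmadd(a, b, c);  /* (1+2i) * (3-4i) = 11 + 2i */
    printf("%g %+gi\n", d.re, d.im);
    return 0;
}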
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := a.fp16[0] + dst.fp16[1] := a.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "c" when mask bit 0 is not set), and copy the upper 6 packed elements from "c" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := c.fp16[0] + dst.fp16[1] := c.fp16[1] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := a.fp16[0] + dst.fp16[1] := a.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "c" when mask bit 0 is not set), and copy the upper 6 packed elements from "c" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := c.fp16[0] + dst.fp16[1] := c.fp16[1] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + +tmp := a +FOR i := 0 to 15 + tmp.fp16[i] := tmp.fp16[i] + a.fp16[i+16] +ENDFOR +FOR i := 0 to 7 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+8] +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] + tmp.fp16[1] + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
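Note that the reduction pseudocode folds the 32 lanes pairwise (16, then 8, 4, 2, 1) instead of summing left to right, and with floating-point addition that ordering affects rounding. A plain-C sketch of the same halving order, with float standing in for FP16:

#include <stdio.h>

/* Tree reduction over 32 lanes, in the same halving order as the pseudocode. */
static float reduce_add_tree(const float a[32]) {
    float tmp[32];
    for (int i = 0; i < 32; i++) tmp[i] = a[i];
    for (int width = 16; width >= 1; width /= 2)
        for (int i = 0; i < width; i++)
            tmp[i] = tmp[i] + tmp[i + width];
    return tmp[0];
}

int main(void) {
    float a[32];
    for (int i = 0; i < 32; i++) a[i] = (float)(i + 1);
    printf("%g\n", reduce_add_tree(a));   /* 1 + 2 + ... + 32 = 528 */
    return 0;
}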
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". + +tmp := a +FOR i := 0 to 15 + tmp.fp16[i] := tmp.fp16[i] * a.fp16[i+16] +ENDFOR +FOR i := 0 to 7 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+8] +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] * tmp.fp16[1] + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". [max_float_note] + +tmp := a +FOR i := 0 to 15 + tmp.fp16[i] := (a.fp16[i] > a.fp16[i+16] ? a.fp16[i] : a.fp16[i+16]) +ENDFOR +FOR i := 0 to 7 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8]) +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] > tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". [min_float_note] + 
+tmp := a
+FOR i := 0 to 15
+ tmp.fp16[i] := (a.fp16[i] < a.fp16[i+16] ? a.fp16[i] : a.fp16[i+16])
+ENDFOR
+FOR i := 0 to 7
+ tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8])
+ENDFOR
+FOR i := 0 to 3
+ tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4])
+ENDFOR
+FOR i := 0 to 1
+ tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2])
+ENDFOR
+dst.fp16[0] := (tmp.fp16[0] < tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1])
+ 
+ AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Finds the absolute value of each packed half-precision (16-bit) floating-point element in "v2", storing the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := ABS(v2.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
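For IEEE half precision, the ABS() above amounts to clearing the sign bit (bit 15) of each 16-bit element. A one-element C model operating on the raw bit pattern (helper name invented for illustration):

#include <stdint.h>
#include <stdio.h>

/* ABS() on one raw FP16 bit pattern: clear the sign bit. */
static uint16_t fp16_abs_bits(uint16_t h) {
    return (uint16_t)(h & 0x7FFFu);
}

int main(void) {
    printf("%04X\n", fp16_abs_bits(0xBC00));  /* -1.0 half (BC00) -> +1.0 half (3C00) */
    return 0;
}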
+ + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
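The conjugation entries perform no floating-point arithmetic at all: each 32-bit real/imaginary pair is XORed with the bit pattern of FP32(-0.0), which flips only the sign bit of the imaginary (upper) half-precision element. A C sketch of one packed pair, with the layout taken from the pseudocode and the helper name invented for illustration:

#include <stdint.h>
#include <stdio.h>

/* One complex FP16 pair packed into 32 bits: low 16 = real, high 16 = imaginary.
   Conjugation XORs with 0x80000000, the bit pattern of FP32(-0.0). */
static uint32_t conj_pair(uint32_t pair) {
    return pair ^ 0x80000000u;
}

int main(void) {
    uint32_t pair = 0x3C003C00u;                  /* imag = 1.0h, real = 1.0h */
    printf("%08X\n", (unsigned)conj_pair(pair));  /* BC003C00: imag = -1.0h   */
    return 0;
}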
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 31 + k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
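As a usage sketch only: assuming this entry corresponds to the _mm512_cmp_ph_mask intrinsic and the __m512h/__mmask32 types from immintrin.h (intrinsic names are not shown in the data above), a less-than comparison using the _CMP_LT_OS predicate from the table would look like the following; building it needs a compiler and CPU with AVX512-FP16 support.

#include <immintrin.h>

/* Returns a 32-bit mask with bit j set where a.fp16[j] < b.fp16[j].
   The intrinsic and type names are assumed, not taken from the data file itself. */
__mmask32 less_than_mask(__m512h a, __m512h b) {
    return _mm512_cmp_ph_mask(a, b, _CMP_LT_OS);
}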
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 31 + IF k1[j] + k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 31 + k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 31
+ IF k1[j]
+ k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ 
+ 
+ AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := (a.fp16[0] OP b.fp16[0]) ? 1 : 0 +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := (a.fp16[0] OP b.fp16[0]) ? 1 : 0 +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +RETURN ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +RETURN ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for equality, and return the boolean result (0 or 1). + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] == b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than, and return the boolean result (0 or 1). + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] < b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] <= b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than, and return the boolean result (0 or 1). + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] > b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] >= b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for not-equal, and return the boolean result (0 or 1). + RETURN ( a.fp16[0] ==NaN OR b.fp16[0] ==NaN OR a.fp16[0] != b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] == b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] < b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] <= b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] > b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] >= b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a.fp16[0] ==NaN OR b.fp16[0] ==NaN OR a.fp16[0] != b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
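The RETURN expressions above encode the usual ordered/unordered rules: the equality and ordering predicates return 0 whenever either operand is NaN, while not-equal returns 1 in that case (the two families differ only in whether a QNaN signals an exception, which a C model cannot show). A plain C model using double in place of FP16:

#include <math.h>
#include <stdio.h>

/* Models the RETURN expressions above: ordered predicates are false when either
   input is NaN; not-equal is true in that case. */
static int comi_eq(double a, double b)  { return !isnan(a) && !isnan(b) && a == b; }
static int comi_lt(double a, double b)  { return !isnan(a) && !isnan(b) && a <  b; }
static int comi_neq(double a, double b) { return isnan(a) || isnan(b) || a != b; }

int main(void) {
    double n = nan("");
    printf("%d %d %d\n", comi_eq(n, n), comi_lt(1.0, n), comi_neq(n, 2.0));  /* 0 0 1 */
    return 0;
}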
+ + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 31 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 31 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
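All of the masked conversion entries follow the same per-lane pattern: convert where the mask bit is set, otherwise either keep the src lane (write-mask) or clear it (zero-mask). A scalar C model of both flavours, with an int16_t-to-float cast standing in for the word-to-FP16 conversion:

#include <stdint.h>
#include <stdio.h>

/* Per-lane model of the masked conversions: write-mask keeps src lanes,
   zero-mask clears them. float stands in for FP16 purely for illustration. */
static void mask_cvt(float dst[32], const float src[32],
                     uint32_t k, const int16_t a[32], int zero_masking) {
    for (int j = 0; j < 32; j++) {
        if (k & (1u << j))
            dst[j] = (float)a[j];
        else
            dst[j] = zero_masking ? 0.0f : src[j];
    }
}

int main(void) {
    int16_t a[32]; float src[32], dst[32];
    for (int j = 0; j < 32; j++) { a[j] = (int16_t)(j * 100); src[j] = -1.0f; }
    mask_cvt(dst, src, 0x0000000Fu, a, 0);   /* lanes 0..3 converted, the rest keep -1 */
    printf("%g %g\n", dst[3], dst[4]);       /* 300 -1 */
    return 0;
}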
+ + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + 
+FOR j := 0 TO 31
+ dst.fp16[j] := Convert_UInt16_To_FP16(a.word[j])
+ENDFOR
+dst[MAX:512] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + 
+FOR j := 0 TO 31
+ dst.fp16[j] := Convert_UInt16_To_FP16(a.word[j])
+ENDFOR
+dst[MAX:512] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + 
+FOR j := 0 TO 31
+ IF k[j]
+ dst.fp16[j] := Convert_UInt16_To_FP16(a.word[j])
+ ELSE
+ dst.fp16[j] := src.fp16[j]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + 
+FOR j := 0 TO 31
+ IF k[j]
+ dst.fp16[j] := Convert_UInt16_To_FP16(a.word[j])
+ ELSE
+ dst.fp16[j] := src.fp16[j]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + 
+FOR j := 0 TO 31
+ IF k[j]
+ dst.fp16[j] := Convert_UInt16_To_FP16(a.word[j])
+ ELSE
+ dst.fp16[j] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + 
+FOR j := 0 TO 31
+ IF k[j]
+ dst.fp16[j] := Convert_UInt16_To_FP16(a.word[j])
+ ELSE
+ dst.fp16[j] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + 
+FOR j := 0 TO 15
+ dst.fp16[j] := Convert_UInt32_To_FP16(a.dword[j])
+ENDFOR
+dst[MAX:256] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + 
+FOR j := 0 TO 15
+ dst.fp16[j] := Convert_UInt32_To_FP16(a.dword[j])
+ENDFOR
+dst[MAX:256] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + 
+FOR j := 0 TO 15
+ IF k[j]
+ dst.fp16[j] := Convert_UInt32_To_FP16(a.dword[j])
+ ELSE
+ dst.fp16[j] := src.fp16[j]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + 
+FOR j := 0 TO 15
+ IF k[j]
+ dst.fp16[j] := Convert_UInt32_To_FP16(a.dword[j])
+ ELSE
+ dst.fp16[j] := src.fp16[j]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + 
+FOR j := 0 TO 15
+ IF k[j]
+ dst.fp16[j] := Convert_UInt32_To_FP16(a.dword[j])
+ ELSE
+ dst.fp16[j] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + 
+FOR j := 0 TO 15
+ IF k[j]
+ dst.fp16[j] := Convert_UInt32_To_FP16(a.dword[j])
+ ELSE
+ dst.fp16[j] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + 
+FOR j := 0 TO 7
+ dst.fp16[j] := Convert_UInt64_To_FP16(a.qword[j])
+ENDFOR
+dst[MAX:128] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + 
+FOR j := 0 TO 7
+ dst.fp16[j] := Convert_UInt64_To_FP16(a.qword[j])
+ENDFOR
+dst[MAX:128] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + 
+FOR j := 0 TO 7
+ IF k[j]
+ dst.fp16[j] := Convert_UInt64_To_FP16(a.qword[j])
+ ELSE
+ dst.fp16[j] := src.fp16[j]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + 
+FOR j := 0 TO 7
+ IF k[j]
+ dst.fp16[j] := Convert_UInt64_To_FP16(a.qword[j])
+ ELSE
+ dst.fp16[j] := src.fp16[j]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + 
+FOR j := 0 TO 7
+ IF k[j]
+ dst.fp16[j] := Convert_UInt64_To_FP16(a.qword[j])
+ ELSE
+ dst.fp16[j] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + 
+FOR j := 0 TO 7
+ IF k[j]
+ dst.fp16[j] := Convert_UInt64_To_FP16(a.qword[j])
+ ELSE
+ dst.fp16[j] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + 
+dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0])
+dst[127:16] := a[127:16]
+dst[MAX:128] := 0
+ 
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
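Convert_FP64_To_FP16 narrows into the much smaller half-precision range (largest finite value 65504), so out-of-range inputs round to infinity under the default rounding mode. A small sketch, assuming a compiler that provides the _Float16 extension type (recent GCC or Clang on x86-64/AArch64):

#include <stdio.h>

int main(void) {
    double in[] = { 1.5, 65504.0, 1.0e6 };
    for (int i = 0; i < 3; i++) {
        _Float16 h = (_Float16)in[i];            /* narrows like Convert_FP64_To_FP16 */
        printf("%g -> %g\n", in[i], (double)h);  /* 1.5, 65504, inf */
    }
    return 0;
}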
+ + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + 
+dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0])
+dst[127:16] := a[127:16]
+dst[MAX:128] := 0
+ 
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + 
+IF k[0]
+ dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0])
+ELSE
+ dst.fp16[0] := src.fp16[0]
+FI
+dst[127:16] := a[127:16]
+dst[MAX:128] := 0
+ 
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + 
+IF k[0]
+ dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0])
+ELSE
+ dst.fp16[0] := src.fp16[0]
+FI
+dst[127:16] := a[127:16]
+dst[MAX:128] := 0
+ 
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + 
+IF k[0]
+ dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0])
+ELSE
+ dst.fp16[0] := 0
+FI
+dst[127:16] := a[127:16]
+dst[MAX:128] := 0
+ 
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + 
+IF k[0]
+ dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0])
+ELSE
+ dst.fp16[0] := 0
+FI
+dst[127:16] := a[127:16]
+dst[MAX:128] := 0
+ 
+ 
+ AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
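Aside: these packed FP32-to-FP16 conversions correspond, as far as I can tell, to _mm512_cvtxps_ph and its masked forms (the "x" spelling distinguishes them from the older F16C-style _mm512_cvtps_ph, which returns __m256i). A hedged C sketch under that assumption:

    #include <immintrin.h>

    /* Sixteen floats -> sixteen FP16 lanes; the zero-masked form zeroes
       lanes whose mask bit is clear, as in the zeromask pseudocode. */
    __m256h ps_to_ph(__m512 v)                { return _mm512_cvtxps_ph(v); }
    __m256h ps_to_ph_z(__mmask16 k, __m512 v) { return _mm512_maskz_cvtxps_ph(k, v); }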
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
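Aside: the FP16-to-int32 entries above (rounding and truncating) appear to be _mm512_cvtph_epi32 / _mm512_cvttph_epi32 in immintrin.h; the names are assumptions, and the sketch only illustrates the shapes described by the pseudocode:

    #include <immintrin.h>

    __m512i ph_to_epi32(__m256h v)                { return _mm512_cvtph_epi32(v);  } /* rounds per MXCSR */
    __m512i ph_to_epi32_trunc(__m256h v)          { return _mm512_cvttph_epi32(v); } /* truncates toward zero */
    __m512i ph_to_epi32_z(__mmask16 k, __m256h v) { return _mm512_maskz_cvtph_epi32(k, v); }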
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
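Aside: for the FP16-to-int64 entries, only the low eight FP16 lanes of a 128-bit source feed the eight 64-bit results, which is why the source type below is __m128h (intrinsic names assumed from immintrin.h, not taken from this excerpt):

    #include <immintrin.h>

    __m512i ph_to_epi64(__m128h v)       { return _mm512_cvtph_epi64(v);  }
    __m512i ph_to_epi64_trunc(__m128h v) { return _mm512_cvttph_epi64(v); }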
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
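Aside: the FP16-to-int16 conversions are full width, 32 lanes in and 32 lanes out. A sketch with the assumed immintrin.h names:

    #include <immintrin.h>

    __m512i ph_to_epi16(__m512h v)       { return _mm512_cvtph_epi16(v);  }
    __m512i ph_to_epi16_trunc(__m512h v) { return _mm512_cvttph_epi16(v); }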
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". [sae_note] + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := src.fp64[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := src.fp64[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
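Aside: widening FP16 to FP64 reads only the low eight FP16 lanes; the masked form keeps src lanes where the mask bit is clear, as in the writemask pseudocode above. Names are assumed from immintrin.h:

    #include <immintrin.h>

    __m512d ph_to_pd(__m128h v)                            { return _mm512_cvtph_pd(v); }
    __m512d ph_to_pd_m(__m512d src, __mmask8 k, __m128h v) { return _mm512_mask_cvtph_pd(src, k, v); }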
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 15 + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". [sae_note] + +FOR j := 0 to 15 + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := src.fp32[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := src.fp32[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
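Aside: the FP16-to-FP32 widening above appears to be _mm512_cvtxph_ps (again the "x" spelling, presumably to avoid clashing with the F16C-era _mm512_cvtph_ps that takes __m256i). A sketch under that assumption:

    #include <immintrin.h>

    /* Widen sixteen FP16 lanes to sixteen floats. */
    __m512 ph_to_ps(__m256h v) { return _mm512_cvtxph_ps(v); }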
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [sae_note] + +dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +ELSE + dst.fp64[0] := src.fp64[0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note] + +IF k[0] + dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +ELSE + dst.fp64[0] := src.fp64[0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +ELSE + dst.fp64[0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note] + +IF k[0] + dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +ELSE + dst.fp64[0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] + +dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +ELSE + dst.fp32[0] := src.fp32[0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] + +IF k[0] + dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +ELSE + dst.fp32[0] := src.fp32[0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +ELSE + dst.fp32[0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] + +IF k[0] + dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +ELSE + dst.fp32[0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
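Aside: the scalar widening entries (low FP16 lane of "b" to FP64 or FP32, upper lanes taken from "a") would look like this in C, assuming the immintrin.h names _mm_cvtsh_sd and _mm_cvtsh_ss:

    #include <immintrin.h>

    __m128d low_sh_to_sd(__m128d a, __m128h b) { return _mm_cvtsh_sd(a, b); }
    __m128  low_sh_to_ss(__m128  a, __m128h b) { return _mm_cvtsh_ss(a, b); }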
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + +dst.dword := Convert_FP16_To_Int32(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + [round_note] + +dst.dword := Convert_FP16_To_Int32(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + +dst.qword := Convert_FP16_To_Int64(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + [round_note] + +dst.qword := Convert_FP16_To_Int64(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst.dword := Convert_FP16_To_Int32_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". [sae_note] + +dst.dword := Convert_FP16_To_Int32_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst.qword := Convert_FP16_To_Int64_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". [sae_note] + +dst.qword := Convert_FP16_To_Int64_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". + +dst.dword := Convert_FP16_To_UInt32(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". [sae_note] + +dst.dword := Convert_FP16_To_UInt32(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". + +dst.qword := Convert_FP16_To_UInt64(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". [round_note] + +dst.qword := Convert_FP16_To_UInt64(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + +dst.dword := Convert_FP16_To_UInt32_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". [sae_note] + +dst.dword := Convert_FP16_To_UInt32_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + +dst.qword := Convert_FP16_To_UInt64_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". [sae_note] + +dst.qword := Convert_FP16_To_UInt64_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
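Aside: the scalar FP16-to-integer entries above come in rounding and truncating flavours; a hedged C sketch with the assumed immintrin.h names:

    #include <immintrin.h>

    int          sh_to_i32(__m128h a)       { return _mm_cvtsh_i32(a);  } /* rounds per MXCSR */
    int          sh_to_i32_trunc(__m128h a) { return _mm_cvttsh_i32(a); } /* truncates toward zero */
    unsigned int sh_to_u32(__m128h a)       { return _mm_cvtsh_u32(a);  }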
+ + + + + Convert the signed 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_Int32_To_FP16(b) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_Int32_To_FP16(b) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_UInt32_To_FP16(b) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the unsigned 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_UInt32_To_FP16(b) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_Int64_To_FP16(b) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_Int64_To_FP16(b) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_UInt64_To_FP16(b) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the unsigned 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_UInt64_To_FP16(b) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
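Aside: the integer-to-FP16 scalar entries place the converted value in the low FP16 lane and copy lanes 1..7 from the first operand. Sketch with the assumed immintrin.h names:

    #include <immintrin.h>

    __m128h i32_to_sh(__m128h a, int b)          { return _mm_cvti32_sh(a, b); }
    __m128h u32_to_sh(__m128h a, unsigned int b) { return _mm_cvtu32_sh(a, b); }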
+ + + + Copy 16-bit integer "a" to the lower elements of "dst", and zero the upper elements of "dst". + +dst.fp16[0] := a.fp16[0] +dst[MAX:16] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Copy the lower 16-bit integer in "a" to "dst". + +dst.fp16[0] := a.fp16[0] +dst[MAX:16] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Copy the lower half-precision (16-bit) floating-point element of "a" to "dst". + +dst[15:0] := a.fp16[0] + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Copy the lower half-precision (16-bit) floating-point element of "a" to "dst". + +dst[15:0] := a.fp16[0] + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Copy the lower half-precision (16-bit) floating-point element of "a" to "dst". + +dst[15:0] := a.fp16[0] + + AVX512_FP16 +
immintrin.h
+ Convert +
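Aside: the three "copy the lower half-precision element" entries are plain extracts of the low FP16 lane; in C they surface as scalar _Float16 (a compiler with native _Float16 support is assumed, and the names below come from immintrin.h rather than this excerpt):

    #include <immintrin.h>

    _Float16 low_of_128(__m128h v) { return _mm_cvtsh_h(v);    }
    _Float16 low_of_512(__m512h v) { return _mm512_cvtsh_h(v); }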
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note][max_float_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][max_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][max_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note] [min_float_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][min_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][min_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
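The entries above cover the packed FP16 maximum/minimum family (unmasked, write-masked, zero-masked, with and without [sae_note]). As a usage sketch only: the intrinsic names below (_mm512_max_ph, _mm512_mask_min_ph, _mm512_set1_ph, _mm512_storeu_ph) and the -mavx512fp16 compiler flag are not spelled out in this data and are assumed from the usual immintrin.h naming.

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m512h a = _mm512_set1_ph((_Float16)1.5f);
        __m512h b = _mm512_set1_ph((_Float16)2.0f);

        /* Per-element maximum across all 32 fp16 lanes. */
        __m512h mx = _mm512_max_ph(a, b);

        /* Write-masked minimum: lanes with a clear mask bit are copied from
           the first (src) argument, here b. */
        __mmask32 k = 0x0000FFFF;                 /* low 16 lanes active */
        __m512h mn = _mm512_mask_min_ph(b, k, a, b);

        _Float16 out[32];
        _mm512_storeu_ph(out, mx);
        printf("max lane0  = %f\n", (float)out[0]);    /* 2.0 */
        _mm512_storeu_ph(out, mn);
        printf("min lane0  = %f, lane31 = %f\n",
               (float)out[0], (float)out[31]);         /* 1.5, 2.0 */
        return 0;
    }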
+ + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +IF k[0] + dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +IF k[0] + dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +IF k[0] + dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +IF k[0] + dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + Load a half-precision (16-bit) floating-point element from memory into the lower element of "dst", and zero the upper elements. + +dst.fp16[0] := MEM[mem_addr].fp16[0] +dst[MAX:16] := 0 + + + AVX512_FP16 +
immintrin.h
+ Load +
+ + + + + + Load a half-precision (16-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper elements of "dst" to zero. + +IF k[0] + dst.fp16[0] := MEM[mem_addr].fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[MAX:16] := 0 + + + AVX512_FP16 +
immintrin.h
+ Load +
+ + + + + Load a half-precision (16-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper elements of "dst" to zero. + +IF k[0] + dst.fp16[0] := MEM[mem_addr].fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[MAX:16] := 0 + + + AVX512_FP16 +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Load +
+ + + + + Store the lower half-precision (16-bit) floating-point element from "a" into memory. + +MEM[mem_addr].fp16[0] := a.fp16[0] + + + AVX512_FP16 +
immintrin.h
+ Store +
+ + + + + + Store the lower half-precision (16-bit) floating-point element from "a" into memory using writemask "k". + +IF k[0] + MEM[mem_addr].fp16[0] := a.fp16[0] +FI + + + AVX512_FP16 +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512_FP16 +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512_FP16 +
immintrin.h
+ Store +
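The load and store entries distinguish aligned 512-bit transfers (64-byte alignment required), unaligned transfers, and single fp16-element loads/stores. A minimal round-trip sketch, assuming the conventional immintrin.h spellings _mm512_load_ph, _mm512_loadu_ph, _mm512_storeu_ph, _mm_load_sh and _mm_store_sh, which are not given in this data:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        /* 64-byte aligned buffer: required by the aligned load/store forms. */
        _Alignas(64) _Float16 buf[32];
        for (int i = 0; i < 32; i++) buf[i] = (_Float16)i;

        __m512h v  = _mm512_load_ph(buf);     /* aligned 512-bit load        */
        __m512h v2 = _mm512_loadu_ph(buf);    /* unaligned form, any address */

        /* Scalar load: lowest fp16 element filled, upper bits zeroed. */
        __m128h s = _mm_load_sh(&buf[3]);

        _Float16 out[32];
        _mm512_storeu_ph(out, v);             /* unaligned 512-bit store     */
        _mm_store_sh(&out[0], s);             /* overwrite only element 0    */
        printf("out[0] = %f, out[31] = %f\n", (float)out[0], (float)out[31]);
        (void)v2;
        return 0;
    }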
+ + + + + Move the lower half-precision (16-bit) floating-point element from "b" to the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Move +
+ + + + + + + Move the lower half-precision (16-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Move +
+ + + + + + Move the lower half-precision (16-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Move +
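The three move entries describe merging the lowest fp16 element of one vector into another, optionally under a write or zero mask. A sketch under the assumption that the intrinsics are spelled _mm_move_sh / _mm_mask_move_sh and that 128-bit helpers such as _mm_set1_ph are available (e.g. with -mavx512fp16 -mavx512vl); none of these names appear in this data:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128h a = _mm_set1_ph((_Float16)1.0f);
        __m128h b = _mm_set1_ph((_Float16)9.0f);

        /* dst = { b[0], a[1..7] } */
        __m128h m0 = _mm_move_sh(a, b);

        /* Masked form: with mask bit 0 clear, element 0 comes from src. */
        __m128h src = _mm_set1_ph((_Float16)5.0f);
        __m128h m1 = _mm_mask_move_sh(src, 0x0, a, b);

        printf("m0[0] = %f, m1[0] = %f\n",
               (float)_mm_cvtsh_h(m0), (float)_mm_cvtsh_h(m1));  /* 9.0, 5.0 */
        return 0;
    }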
+ + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +IF k[0] + dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +IF k[0] + dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +IF k[0] + dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +IF k[0] + dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
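In the roundscale entries, imm8[7:4] selects how many fraction bits M to keep and imm8[3:0] selects the rounding mode ([round_imm_note]), so each element is rounded to a multiple of 2^-M. A sketch assuming the usual _mm512_roundscale_ph spelling (not named in this data) and the _MM_FROUND_* constants from immintrin.h:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m512h a = _mm512_set1_ph((_Float16)2.71828f);

        /* imm8 = (M << 4) | rounding: keep M = 2 fraction bits, round to
           nearest.  2.71828 -> nearest multiple of 0.25 -> 2.75 */
        __m512h r = _mm512_roundscale_ph(a, (2 << 4) | _MM_FROUND_TO_NEAREST_INT);

        _Float16 out[32];
        _mm512_storeu_ph(out, r);
        printf("roundscale = %f\n", (float)out[0]);   /* expected ~2.75 */
        return 0;
    }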
+ + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 31 + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. [sae_note] + FOR i := 0 to 31 + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. [sae_note] + FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. [sae_note] + FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. [sae_note] + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. [sae_note] + IF k[0] + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. [sae_note] + IF k[0] + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
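Since the getexp entries compute floor(log2(x)) per element and return it as an fp16 value, a quick sketch, assuming the conventional _mm512_getexp_ph name (not given in this data):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m512h a = _mm512_set1_ph((_Float16)20.0f);   /* 2^4 <= 20 < 2^5 */
        __m512h e = _mm512_getexp_ph(a);

        _Float16 out[32];
        _mm512_storeu_ph(out, e);
        printf("getexp(20.0) = %f\n", (float)out[0]);  /* expected 4.0 */
        return 0;
    }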
+ + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 31 + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + FOR i := 0 TO 31 + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + IF k[0] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + + Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + IF k[0] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
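The getmant entries return ±(2^k)*|x.significand| with the normalization interval and sign handling chosen by "norm" and "sign" ([getmant_note]). A sketch, assuming the conventional _mm512_getmant_ph name and the _MM_MANT_NORM_* / _MM_MANT_SIGN_* enumerators shared with the older getmant intrinsics:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m512h a = _mm512_set1_ph((_Float16)-12.0f);

        /* Normalize mantissas into [1, 2), keep the source sign:
           -12.0 = -1.5 * 2^3, so the result should be -1.5. */
        __m512h m = _mm512_getmant_ph(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);

        _Float16 out[32];
        _mm512_storeu_ph(out, m);
        printf("getmant(-12.0) = %f\n", (float)out[0]);
        return 0;
    }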
+ + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
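The reduce entries subtract from each element its value rounded to 2^-M granularity (M = imm8[7:4], rounding mode from imm8[3:0]), leaving only the low-order fraction bits. A sketch under the assumption that the packed form is spelled _mm512_reduce_ph in immintrin.h (distinct from the horizontal _mm512_reduce_add_ph family):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m512h a = _mm512_set1_ph((_Float16)2.71828f);

        /* Keep M = 2 fraction bits, round toward -inf, then return
           src - round(src): 2.71828 - 2.50 = ~0.21828 */
        __m512h r = _mm512_reduce_ph(a, (2 << 4) | _MM_FROUND_TO_NEG_INF);

        _Float16 out[32];
        _mm512_storeu_ph(out, r);
        printf("reduced fraction = %f\n", (float)out[0]);
        return 0;
    }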
+ + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 31 + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 31 + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +IF k[0] + dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +IF k[0] + dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +IF k[0] + dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +IF k[0] + dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
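The scalef entries compute a * 2^FLOOR(b) per element, optionally treating denormal inputs as zero when MXCSR.DAZ is set. A sketch assuming the conventional _mm512_scalef_ph spelling, which is not given in this data:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m512h a = _mm512_set1_ph((_Float16)1.5f);
        __m512h b = _mm512_set1_ph((_Float16)3.7f);   /* FLOOR(3.7) = 3 */

        __m512h r = _mm512_scalef_ph(a, b);           /* 1.5 * 2^3 = 12.0 */

        _Float16 out[32];
        _mm512_storeu_ph(out, r);
        printf("scalef(1.5, 3.7) = %f\n", (float)out[0]);
        return 0;
    }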
+ + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR i := 0 to 31 + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR i := 0 to 31 + IF k1[i] + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) + ELSE + k[i] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Test the lower half-precision (16-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k". + [fpclass_note] + k[0] := CheckFPClass_FP16(a.fp16[0], imm8[7:0]) +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Test the lower half-precision (16-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + [fpclass_note] + IF k1[0] + k[0] := CheckFPClass_FP16(a.fp16[0], imm8[7:0]) +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
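The fpclass entries classify elements against the categories selected by imm8 ([fpclass_note]) and return a mask register. The sketch below assumes the usual _mm512_fpclass_ph_mask name and the commonly documented category bits (0x01 QNaN, 0x80 SNaN, 0x08/0x10 positive/negative infinity); those bit values are recalled from the other fpclass intrinsics, not taken from this data.

    #include <immintrin.h>
    #include <math.h>
    #include <stdio.h>

    int main(void) {
        _Alignas(64) _Float16 buf[32] = {0};
        buf[0] = (_Float16)1.0f;
        buf[1] = (_Float16)NAN;
        buf[2] = (_Float16)INFINITY;

        __m512h v = _mm512_load_ph(buf);

        /* 0x01|0x80: quiet or signaling NaN; 0x08|0x10: +inf or -inf
           (assumed encoding, see lead-in). */
        __mmask32 nan_mask = _mm512_fpclass_ph_mask(v, 0x01 | 0x80);
        __mmask32 inf_mask = _mm512_fpclass_ph_mask(v, 0x08 | 0x10);

        printf("nan lanes: 0x%08x, inf lanes: 0x%08x\n",
               (unsigned)nan_mask, (unsigned)inf_mask);
        return 0;
    }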
+ + + + + + Shuffle half-precision (16-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + off := idx[i+4:i] + dst.fp16[j] := idx[i+5] ? b.fp16[off] : a.fp16[off] +ENDFOR +dst[MAX:512] := 0 + + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed half-precision (16-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := b.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle half-precision (16-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + id := idx[i+4:i] + dst.fp16[j] := a.fp16[id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
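The three shuffle/blend entries above select fp16 lanes either by a 5-bit index per destination lane (plus a table-select bit in the two-source form) or by a mask bit. A sketch assuming the conventional names _mm512_permutexvar_ph and _mm512_mask_blend_ph:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        _Alignas(64) _Float16 buf[32];
        for (int i = 0; i < 32; i++) buf[i] = (_Float16)i;
        __m512h a = _mm512_load_ph(buf);
        __m512h b = _mm512_set1_ph((_Float16)-1.0f);

        /* Reverse the 32 lanes: destination lane j takes source lane 31 - j. */
        short idx[32];
        for (int j = 0; j < 32; j++) idx[j] = (short)(31 - j);
        __m512i vidx = _mm512_loadu_si512(idx);
        __m512h rev = _mm512_permutexvar_ph(vidx, a);

        /* Blend: lanes whose mask bit is set come from b. */
        __m512h bl = _mm512_mask_blend_ph(0xAAAAAAAAu, a, b);

        _Float16 out[32];
        _mm512_storeu_ph(out, rev);
        printf("rev[0] = %f\n", (float)out[0]);   /* expected 31.0 */
        _mm512_storeu_ph(out, bl);
        printf("bl[1]  = %f\n", (float)out[1]);   /* expected -1.0 */
        return 0;
    }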
+ + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +dst.fp16[0] := (1.0 / SQRT(b.fp16[0])) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +IF k[0] + dst.fp16[0] := (1.0 / SQRT(b.fp16[0])) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +IF k[0] + dst.fp16[0] := (1.0 / SQRT(b.fp16[0])) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR i := 0 to 31 + dst.fp16[i] := SQRT(a.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + [round_note] + +FOR i := 0 to 31 + dst.fp16[i] := SQRT(a.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := SQRT(b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := SQRT(b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := SQRT(b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := SQRT(b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := SQRT(b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := SQRT(b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + dst.fp16[i] := (1.0 / a.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
 + + + + + Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +dst.fp16[0] := (1.0 / b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
 + + + + + + + Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +IF k[0] + dst.fp16[0] := (1.0 / b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
 + + + + + + Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +IF k[0] + dst.fp16[0] := (1.0 / b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
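The reciprocal entries describe an approximation, not an exact division; a sketch assuming the packed names _mm512_rcp_ph and _mm512_maskz_rcp_ph from immintrin.h (AVX512-FP16):

#include <immintrin.h>

/* Approximate 1.0/x for all 32 fp16 lanes; per-element relative error < 1.5*2^-12. */
static __m512h recip_all(__m512h x) {
    return _mm512_rcp_ph(x);
}

/* Masked form: lanes whose mask bit is clear are zeroed instead. */
static __m512h recip_selected(__mmask32 k, __m512h x) {
    return _mm512_maskz_rcp_ph(k, x);
}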
+ + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values. + +dst.fp16[0] := e0 +dst.fp16[1] := e1 +dst.fp16[2] := e2 +dst.fp16[3] := e3 +dst.fp16[4] := e4 +dst.fp16[5] := e5 +dst.fp16[6] := e6 +dst.fp16[7] := e7 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values. + +dst.fp16[0] := e0 +dst.fp16[1] := e1 +dst.fp16[2] := e2 +dst.fp16[3] := e3 +dst.fp16[4] := e4 +dst.fp16[5] := e5 +dst.fp16[6] := e6 +dst.fp16[7] := e7 +dst.fp16[8] := e8 +dst.fp16[9] := e9 +dst.fp16[10] := e10 +dst.fp16[11] := e11 +dst.fp16[12] := e12 +dst.fp16[13] := e13 +dst.fp16[14] := e14 +dst.fp16[15] := e15 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values. + +dst.fp16[0] := e0 +dst.fp16[1] := e1 +dst.fp16[2] := e2 +dst.fp16[3] := e3 +dst.fp16[4] := e4 +dst.fp16[5] := e5 +dst.fp16[6] := e6 +dst.fp16[7] := e7 +dst.fp16[8] := e8 +dst.fp16[9] := e9 +dst.fp16[10] := e10 +dst.fp16[11] := e11 +dst.fp16[12] := e12 +dst.fp16[13] := e13 +dst.fp16[14] := e14 +dst.fp16[15] := e15 +dst.fp16[16] := e16 +dst.fp16[17] := e17 +dst.fp16[18] := e18 +dst.fp16[19] := e19 +dst.fp16[20] := e20 +dst.fp16[21] := e21 +dst.fp16[22] := e22 +dst.fp16[23] := e23 +dst.fp16[24] := e24 +dst.fp16[25] := e25 +dst.fp16[26] := e26 +dst.fp16[27] := e27 +dst.fp16[28] := e28 +dst.fp16[29] := e29 +dst.fp16[30] := e30 +dst.fp16[31] := e31 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst.fp16[0] := e7 +dst.fp16[1] := e6 +dst.fp16[2] := e5 +dst.fp16[3] := e4 +dst.fp16[4] := e3 +dst.fp16[5] := e2 +dst.fp16[6] := e1 +dst.fp16[7] := e0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst.fp16[0] := e15 +dst.fp16[1] := e14 +dst.fp16[2] := e13 +dst.fp16[3] := e12 +dst.fp16[4] := e11 +dst.fp16[5] := e10 +dst.fp16[6] := e9 +dst.fp16[7] := e8 +dst.fp16[8] := e7 +dst.fp16[9] := e6 +dst.fp16[10] := e5 +dst.fp16[11] := e4 +dst.fp16[12] := e3 +dst.fp16[13] := e2 +dst.fp16[14] := e1 +dst.fp16[15] := e0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst.fp16[0] := e31 +dst.fp16[1] := e30 +dst.fp16[2] := e29 +dst.fp16[3] := e28 +dst.fp16[4] := e27 +dst.fp16[5] := e26 +dst.fp16[6] := e25 +dst.fp16[7] := e24 +dst.fp16[8] := e23 +dst.fp16[9] := e22 +dst.fp16[10] := e21 +dst.fp16[11] := e20 +dst.fp16[12] := e19 +dst.fp16[13] := e18 +dst.fp16[14] := e17 +dst.fp16[15] := e16 +dst.fp16[16] := e15 +dst.fp16[17] := e14 +dst.fp16[18] := e13 +dst.fp16[19] := e12 +dst.fp16[20] := e11 +dst.fp16[21] := e10 +dst.fp16[22] := e9 +dst.fp16[23] := e8 +dst.fp16[24] := e7 +dst.fp16[25] := e6 +dst.fp16[26] := e5 +dst.fp16[27] := e4 +dst.fp16[28] := e3 +dst.fp16[29] := e2 +dst.fp16[30] := e1 +dst.fp16[31] := e0 + + AVX512_FP16 +
immintrin.h
+ Set +
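The set/setr pairs differ only in which end of the argument list lands in lane 0; a sketch assuming the 128-bit names _mm_set_ph and _mm_setr_ph:

#include <immintrin.h>

/* Both results hold 1,2,...,8 in fp16 lanes 0..7; only the argument order differs:
   _mm_set_ph lists the highest lane first, _mm_setr_ph lists lane 0 first. */
static void one_to_eight(__m128h *via_set, __m128h *via_setr) {
    *via_set  = _mm_set_ph ((_Float16)8.0f, (_Float16)7.0f, (_Float16)6.0f, (_Float16)5.0f,
                            (_Float16)4.0f, (_Float16)3.0f, (_Float16)2.0f, (_Float16)1.0f);
    *via_setr = _mm_setr_ph((_Float16)1.0f, (_Float16)2.0f, (_Float16)3.0f, (_Float16)4.0f,
                            (_Float16)5.0f, (_Float16)6.0f, (_Float16)7.0f, (_Float16)8.0f);
}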
+ + + + Broadcast half-precision (16-bit) floating-point value "a" to all elements of "dst". + +FOR i := 0 to 7 + dst.fp16[i] := a[15:0] +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Broadcast half-precision (16-bit) floating-point value "a" to all elements of "dst". + +FOR i := 0 to 15 + dst.fp16[i] := a[15:0] +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Broadcast half-precision (16-bit) floating-point value "a" to all elements of "dst". + +FOR i := 0 to 31 + dst.fp16[i] := a[15:0] +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
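A sketch of the three broadcast widths, assuming the names _mm_set1_ph, _mm256_set1_ph and _mm512_set1_ph:

#include <immintrin.h>

/* Replicate one fp16 value into every lane of each vector width. */
static void broadcast_half(_Float16 h, __m128h *v8, __m256h *v16, __m512h *v32) {
    *v8  = _mm_set1_ph(h);      /*  8 copies */
    *v16 = _mm256_set1_ph(h);   /* 16 copies */
    *v32 = _mm512_set1_ph(h);   /* 32 copies */
}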
+ + + + Broadcast half-precision (16-bit) complex floating-point value "a" to all elements of "dst". + +FOR i := 0 to 3 + dst.fp16[2*i+0] := a[15:0] + dst.fp16[2*i+1] := a[31:16] +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Broadcast half-precision (16-bit) complex floating-point value "a" to all elements of "dst". + +FOR i := 0 to 7 + dst.fp16[2*i+0] := a[15:0] + dst.fp16[2*i+1] := a[31:16] +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Broadcast half-precision (16-bit) complex floating-point value "a" to all elements of "dst". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := a[15:0] + dst.fp16[2*i+1] := a[31:16] +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Copy half-precision (16-bit) floating-point element "a" to the lower element of "dst", and zero the upper 7 elements. + +dst.fp16[0] := a[15:0] +dst[127:16] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + Return vector of type __m512h with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512_FP16 +
immintrin.h
+ Set +
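The scalar-set and zero entries are the usual building blocks for scalar FP16 code; a sketch assuming the names _mm_set_sh and _mm512_setzero_ph:

#include <immintrin.h>

static __m128h lane0_only(_Float16 h) {
    return _mm_set_sh(h);         /* dst.fp16[0] = h, bits 127:16 cleared */
}

static __m512h zero_vector(void) {
    return _mm512_setzero_ph();   /* every bit of dst cleared */
}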
+ + + + Cast vector of type "__m128h" to type "__m128". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m256". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512h" to type "__m512". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m128d". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m256d". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512h" to type "__m512d". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m128i". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m256i". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512h" to type "__m512i". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128d" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256d" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512d" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128i" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256i" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512i" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512h" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512h" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m256h"; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m512h"; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m512h"; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + Return vector of type __m512h with undefined elements. + AVX512_FP16 +
immintrin.h
+ General Support +
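Because the casts are pure reinterpretations, they are the idiomatic way to apply integer bit tricks to FP16 data; a sketch assuming the names _mm512_castph_si512, _mm512_castsi512_ph and _mm512_zextph256_ph512:

#include <immintrin.h>

/* Negate all 32 fp16 lanes by XOR-ing the sign bit; the casts emit no instructions. */
static __m512h negate_ph(__m512h x) {
    __m512i bits    = _mm512_castph_si512(x);
    __m512i flipped = _mm512_xor_si512(bits, _mm512_set1_epi16((short)0x8000));
    return _mm512_castsi512_ph(flipped);
}

/* Unlike the plain 256->512 cast, the zext form guarantees the upper 256 bits are zero. */
static __m512h widen_zero_upper(__m256h lo) {
    return _mm512_zextph256_ph512(lo);
}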
+ + + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst". + +FOR i := 0 to 3 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 3 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 3 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst". + +FOR i := 0 to 1 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 1 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 1 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
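A usage sketch for the multishift entries, assuming the 256-bit form is _mm256_multishift_epi64_epi8 (control vector first, data second, matching the "a"/"b" roles above); each destination byte is an 8-bit field pulled from an arbitrary bit offset of its 64-bit lane:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i data = _mm256_set1_epi64x(0x0123456789ABCDEFULL);
    /* Control bytes 0x00,0x04,...,0x1C: eight overlapping windows 4 bits apart. */
    __m256i ctrl = _mm256_set1_epi64x(0x1C1814100C080400ULL);
    __m256i out  = _mm256_multishift_epi64_epi8(ctrl, data);

    unsigned char bytes[32];
    _mm256_storeu_si256((__m256i *)bytes, out);
    for (int i = 0; i < 8; i++)          /* first 64-bit lane only */
        printf("%02X ", bytes[i]);       /* EF DE CD BC AB 9A 89 78 */
    printf("\n");
    return 0;
}

Requires AVX512VBMI + AVX512VL (e.g. -mavx512vbmi -mavx512vl).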
+ + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + id := idx[i+4:i]*8 + dst[i+7:i] := a[id+7:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + id := idx[i+4:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + id := idx[i+4:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + id := idx[i+3:i]*8 + dst[i+7:i] := a[id+7:id] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + id := idx[i+3:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + id := idx[i+3:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
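A sketch of the single-source byte permute, assuming the 256-bit name _mm256_permutexvar_epi8; unlike PSHUFB, the 5-bit index reaches across the full register:

#include <immintrin.h>

/* Reverse all 32 bytes of a 256-bit vector in one cross-lane shuffle
   (requires AVX512VBMI + AVX512VL). */
static __m256i reverse_bytes_256(__m256i v) {
    const __m256i rev_idx = _mm256_setr_epi8(
        31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
        15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0);
    return _mm256_permutexvar_epi8(rev_idx, v);
}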
+ + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + off := 8*idx[i+4:i] + dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + off := 8*idx[i+4:i] + dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + off := 8*idx[i+4:i] + dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := idx[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + off := 8*idx[i+4:i] + dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + off := 8*idx[i+3:i] + dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + off := 8*idx[i+3:i] + dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + off := 8*idx[i+3:i] + dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := idx[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + off := 8*idx[i+3:i] + dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
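A sketch of the two-source byte permute, assuming the 256-bit name _mm256_permutex2var_epi8 with operand order (a, idx, b); the selector bit in each index byte chooses between the two data registers, so the pair acts as a 64-byte lookup table:

#include <immintrin.h>

/* 64-entry byte lookup table held in two registers: index values 0..31 read
   tbl_lo, 32..63 read tbl_hi; only the low 6 bits of each index byte are used.
   Requires AVX512VBMI + AVX512VL. */
static __m256i lut64_lookup(__m256i tbl_lo, __m256i tbl_hi, __m256i idx) {
    return _mm256_permutex2var_epi8(tbl_lo, idx, tbl_hi);
}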
+ + + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst". + +FOR i := 0 to 7 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Bit Manipulation +
+ + + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 7 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Bit Manipulation +
+ + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 7 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Bit Manipulation +
+ + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + id := idx[i+5:i]*8 + dst[i+7:i] := a[id+7:id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + id := idx[i+5:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + id := idx[i+5:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + off := 8*idx[i+5:i] + dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + off := 8*idx[i+5:i] + dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + off := 8*idx[i+5:i] + dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := idx[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + off := 8*idx[i+5:i] + dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
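At 512 bits the single-source permute already spans 64 bytes, which makes it a natural 6-bit-to-byte translation table; a sketch assuming the names _mm512_loadu_si512 and _mm512_permutexvar_epi8 (requires AVX512VBMI):

#include <immintrin.h>

/* Map 64 six-bit values to their Base64 characters with one VPERMB. */
static __m512i base64_map(__m512i sixbit_values) {
    static const char alphabet[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    const __m512i table = _mm512_loadu_si512(alphabet);
    /* Each byte of sixbit_values (0..63) selects one byte of the table. */
    return _mm512_permutexvar_epi8(sixbit_values, table);
}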
+ + + + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
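A sketch of the variable-count concatenate-and-shift-right form, assuming the name _mm256_shrdv_epi64; feeding the same vector as both data operands turns it into a per-lane rotate:

#include <immintrin.h>

/* Per-element 64-bit rotate right: concat(v, v) >> (count & 63), low half kept.
   Requires AVX512VBMI2 + AVX512VL. */
static __m256i rotr64_per_lane(__m256i v, __m256i counts) {
    return _mm256_shrdv_epi64(v, v, counts);
}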
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
 + + + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
 + + + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
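The immediate forms behave the same way but take the count as an 8-bit compile-time constant; a sketch assuming the name _mm256_shrdi_epi64:

#include <immintrin.h>

/* Per-element 64-bit rotate right by a constant; only imm8[5:0] is used.
   Requires AVX512VBMI2 + AVX512VL. */
static __m256i rotr64_by_13(__m256i v) {
    return _mm256_shrdi_epi64(v, v, 13);
}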
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst". + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst". + +FOR j := 0 to 7 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst". + +FOR j := 0 to 3 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst". + +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst". + +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
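The left-shift forms keep the upper half of the concatenation instead of the lower half, so the same trick gives rotates to the left; a sketch assuming the name _mm256_shldv_epi32:

#include <immintrin.h>

/* Per-element 32-bit rotate left: concat(v, v) << (count & 31), upper half kept.
   Requires AVX512VBMI2 + AVX512VL. */
static __m256i rotl32_per_lane(__m256i v, __m256i counts) {
    return _mm256_shldv_epi32(v, v, counts);
}

The immediate left-shift entries that follow work the same way with a constant count, mirroring _mm256_shrdi_epi64 above.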
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
 + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
 + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst". + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst". + +FOR j := 0 to 7 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
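For the immediate-count 32-bit form just above, passing the same register twice again degenerates to a rotate. A short sketch, assuming this entry maps to `_mm256_shldi_epi32` (the count must be a compile-time constant):

```c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Per lane: concatenate a:b into 64 bits, shift left by imm8[4:0], keep the
    // upper 32 bits. Using the same register twice rotates each lane left by 1.
    __m256i a = _mm256_set1_epi32((int)0x80000001u);
    __m256i r = _mm256_shldi_epi32(a, a, 1);

    uint32_t out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    printf("0x%08x\n", out[0]); // expected 0x00000003
    return 0;
}
```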
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst". + +FOR j := 0 to 3 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst". + +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst". + +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + Swizzle + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
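The expand-load entries are the memory-to-register direction of expand: the source holds only the active values, packed tightly, and the mask decides which destination lanes receive them in order. A sketch of the zero-masking 256-bit/16-bit form, assuming it corresponds to `_mm256_maskz_expandloadu_epi16`:

```c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint16_t packed[16] = { 11, 22, 33, 44 };  // only the first 4 values are read
    __mmask16 k = 0x0055;                      // destination lanes 0, 2, 4, 6
    __m256i r = _mm256_maskz_expandloadu_epi16(k, packed);

    uint16_t out[16];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int j = 0; j < 8; j++)
        printf("%u ", out[j]);                 // 11 0 22 0 33 0 44 0
    printf("\n");
    return 0;
}
```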
+ + Swizzle + + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
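The register-source expand works the same way, except the packed values come from the low lanes of "a" rather than from memory. A sketch under the assumption that this entry is `_mm256_maskz_expand_epi16`:

```c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // The low, contiguous elements of a are scattered, in order, into the
    // lanes selected by k; unselected lanes are zeroed.
    __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8,
                                  9, 10, 11, 12, 13, 14, 15, 16);
    __mmask16 k = 0xF0F0;                       // lanes 4..7 and 12..15
    __m256i r = _mm256_maskz_expand_epi16(k, a);

    uint16_t out[16];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int j = 0; j < 16; j++)
        printf("%u ", out[j]);  // 0 0 0 0 1 2 3 4 0 0 0 0 5 6 7 8
    printf("\n");
    return 0;
}
```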
+ + + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 16 +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[255:m] := 0 +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
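Compress is the inverse of expand: the selected lanes are packed down to the low end of the result in order, and in the zero-masking form the tail is cleared. A sketch, assuming the 256-bit/16-bit entry above is `_mm256_maskz_compress_epi16`:

```c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8,
                                  9, 10, 11, 12, 13, 14, 15, 16);
    __mmask16 k = 0xAAAA;                        // keep the odd-index lanes
    __m256i r = _mm256_maskz_compress_epi16(k, a);

    uint16_t out[16];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int j = 0; j < 16; j++)
        printf("%u ", out[j]);  // 2 4 6 8 10 12 14 16 0 0 0 0 0 0 0 0
    printf("\n");
    return 0;
}
```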
+ + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 16 +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[255:m] := src[255:m] +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 16 +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[127:m] := 0 +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 16 +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[127:m] := src[127:m] +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 8 +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[255:m] := 0 +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 8 +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[255:m] := src[255:m] +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 8 +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[127:m] := 0 +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 8 +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[127:m] := src[127:m] +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + Swizzle + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 16 +m := base_addr +FOR j := 0 to 15 + i := j*16 + IF k[j] + MEM[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Store +
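The compress-store entries write the selected lanes contiguously to memory, which makes them a single-instruction "filter and append": only popcount(k) elements are written and the rest of the buffer is untouched. A sketch, assuming the 256-bit/16-bit form is `_mm256_mask_compressstoreu_epi16`:

```c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint16_t buf[16] = { 0 };
    __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8,
                                  9, 10, 11, 12, 13, 14, 15, 16);
    __mmask16 keep = 0x8001;                     // lanes 0 and 15 survive
    _mm256_mask_compressstoreu_epi16(buf, keep, a);

    printf("%u %u %u\n", buf[0], buf[1], buf[2]); // 1 16 0
    return 0;
}
```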
+ + Swizzle + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 16 +m := base_addr +FOR j := 0 to 7 + i := j*16 + IF k[j] + MEM[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 8 +m := base_addr +FOR j := 0 to 31 + i := j*8 + IF k[j] + MEM[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 8 +m := base_addr +FOR j := 0 to 15 + i := j*8 + IF k[j] + MEM[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
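For the 512-bit variable-count right funnel shift just above (argument order b:a, low half kept), reusing one vector for both halves gives a per-lane rotate right. A sketch under the assumption that the entry is `_mm512_shrdv_epi64`:

```c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Per lane: (b:a) >> (c mod 64), low 64 bits kept. With a == b this is a
    // variable rotate right of each 64-bit lane.
    __m512i a = _mm512_set1_epi64(1);
    __m512i c = _mm512_setr_epi64(0, 1, 2, 3, 4, 8, 16, 63);
    __m512i r = _mm512_shrdv_epi64(a, a, c);

    uint64_t out[8];
    _mm512_storeu_si512(out, r);
    printf("lane 1: 0x%016llx\n", (unsigned long long)out[1]); // 0x8000000000000000
    return 0;
}
```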
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
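With an immediate count and two distinct vectors, the same operation extracts, per lane, a 64-bit field that starts imm8 bits into the 128-bit value b:a, which is handy for unaligned bit-stream reads. A sketch, assuming the entry above is `_mm512_shrdi_epi64`:

```c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Per lane: result = low 64 bits of ((hi:lo) >> 8), i.e. (lo >> 8) | (hi << 56).
    __m512i lo = _mm512_set1_epi64(0x1122334455667788LL);
    __m512i hi = _mm512_set1_epi64((long long)0x99AABBCCDDEEFF00ULL);
    __m512i r  = _mm512_shrdi_epi64(lo, hi, 8);

    uint64_t out[8];
    _mm512_storeu_si512(out, r);
    printf("0x%016llx\n", (unsigned long long)out[0]); // 0x0011223344556677
    return 0;
}
```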
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst". + +FOR j := 0 to 7 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst". + +FOR j := 0 to 15 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst". + +FOR j := 0 to 7 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst". + +FOR j := 0 to 15 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + Swizzle + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Load +
+ + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 16 +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[511:m] := 0 +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 16 +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[511:m] := src[511:m] +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 8 +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[511:m] := 0 +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 8 +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[511:m] := src[511:m] +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + Swizzle + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 16 +m := base_addr +FOR j := 0 to 31 + i := j*16 + IF k[j] + MEM[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 8 +m := base_addr +FOR j := 0 to 63 + i := j*8 + IF k[j] + MEM[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 +
immintrin.h
+ Store +
+ + + + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
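The VNNI word entries above fuse a pairwise 16-bit multiply-add with an accumulator update; the saturating forms clamp the final sum instead of letting it wrap. A sketch showing the saturation, assuming the unmasked 256-bit entry is `_mm256_dpwssds_epi32` (name and AVX512VNNI+AVX512VL build flags are assumptions):

```c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Each 32-bit lane becomes Saturate32(acc + a[2j]*b[2j] + a[2j+1]*b[2j+1]).
    __m256i acc = _mm256_set1_epi32(INT32_MAX - 1);  // already near the positive limit
    __m256i a   = _mm256_set1_epi16(100);
    __m256i b   = _mm256_set1_epi16(3);
    __m256i r   = _mm256_dpwssds_epi32(acc, a, b);   // +600 per lane -> saturates

    int32_t out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    printf("%d\n", out[0]); // 2147483647 (INT32_MAX), not a wrapped negative value
    return 0;
}
```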
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
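The byte entries are the unsigned-times-signed int8 dot product that quantized kernels lean on: four u8 x s8 products per 32-bit lane, summed into the accumulator (with signed saturation in this form). A sketch, assuming the unmasked 256-bit entry corresponds to `_mm256_dpbusds_epi32`:

```c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    __m256i acc = _mm256_setzero_si256();
    __m256i a   = _mm256_set1_epi8((char)200);   // treated as unsigned bytes (200)
    __m256i b   = _mm256_set1_epi8(-3);          // treated as signed bytes
    __m256i r   = _mm256_dpbusds_epi32(acc, a, b);

    int32_t out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    printf("%d\n", out[0]); // 4 * (200 * -3) = -2400 in every lane
    return 0;
}
```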
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 15 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 15 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
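As a reading aid for the 16-bit dot-product accumulate specified above, here is a minimal C sketch, assuming a toolchain with AVX512_VNNI support (e.g. -mavx512vnni); the helper name dpwssd_ref is illustrative only and is not part of immintrin.h.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar reference for one dword lane of the pseudocode above. */
static int32_t dpwssd_ref(int32_t src, const int16_t a[2], const int16_t b[2]) {
    return src + (int32_t)a[0] * (int32_t)b[0] + (int32_t)a[1] * (int32_t)b[1];
}

int main(void) {
    __m512i src = _mm512_set1_epi32(1000);
    __m512i a   = _mm512_set1_epi16(-3);
    __m512i b   = _mm512_set1_epi16(7);
    __m512i dst = _mm512_dpwssd_epi32(src, a, b);   /* vpdpwssd */

    int32_t lanes[16];
    _mm512_storeu_si512((void *)lanes, dst);
    const int16_t av[2] = {-3, -3}, bv[2] = {7, 7};
    printf("%d == %d\n", lanes[0], dpwssd_ref(1000, av, bv)); /* both 958 */
    return 0;
}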
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 15 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 15 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
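A concrete illustration of the unsigned-by-signed byte dot product and its zero-masked form described above, again assuming an AVX512_VNNI-capable build (-mavx512vnni); the chosen input values are only for demonstration.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* a holds unsigned bytes 200, b holds signed bytes -2:
       each dword lane accumulates 4 * (200 * -2) = -1600 on top of src. */
    __m512i src = _mm512_set1_epi32(100);
    __m512i a   = _mm512_set1_epi8((char)200);
    __m512i b   = _mm512_set1_epi8(-2);

    __m512i dst  = _mm512_dpbusd_epi32(src, a, b);               /* every lane: 100 - 1600 = -1500 */
    __m512i dstz = _mm512_maskz_dpbusd_epi32(0x0001, src, a, b); /* only lane 0 computed, rest zeroed */

    int32_t d[16], z[16];
    _mm512_storeu_si512((void *)d, dst);
    _mm512_storeu_si512((void *)z, dstz);
    printf("%d %d %d\n", d[0], z[0], z[1]);   /* -1500 -1500 0 */
    return 0;
}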
+ + + + + + + + + Compute intersection of packed 32-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+15:k1] := 0 +MEM[k2+15:k2] := 0 +FOR i := 0 TO 15 + FOR j := 0 TO 15 + match := (a.dword[i] == b.dword[j] ? 1 : 0) + MEM[k1+15:k1].bit[i] |= match + MEM[k2+15:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512F +
immintrin.h
+ Mask +
+ + + + + + + Compute intersection of packed 64-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+7:k1] := 0 +MEM[k2+7:k2] := 0 +FOR i := 0 TO 7 + FOR j := 0 TO 7 + match := (a.qword[i] == b.qword[j] ? 1 : 0) + MEM[k1+7:k1].bit[i] |= match + MEM[k2+7:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512F +
immintrin.h
+ Mask +
+ + + + + + + + + Compute intersection of packed 32-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+7:k1] := 0 +MEM[k2+7:k2] := 0 +FOR i := 0 TO 3 + FOR j := 0 TO 3 + match := (a.dword[i] == b.dword[j] ? 1 : 0) + MEM[k1+7:k1].bit[i] |= match + MEM[k2+7:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512VL +
immintrin.h
+ Mask +
+ + + + + + + Compute intersection of packed 32-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+7:k1] := 0 +MEM[k2+7:k2] := 0 +FOR i := 0 TO 7 + FOR j := 0 TO 7 + match := (a.dword[i] == b.dword[j] ? 1 : 0) + MEM[k1+7:k1].bit[i] |= match + MEM[k2+7:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512VL +
immintrin.h
+ Mask +
+ + + + + + + Compute intersection of packed 64-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+7:k1] := 0 +MEM[k2+7:k2] := 0 +FOR i := 0 TO 1 + FOR j := 0 TO 1 + match := (a.qword[i] == b.qword[j] ? 1 : 0) + MEM[k1+7:k1].bit[i] |= match + MEM[k2+7:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512VL +
immintrin.h
+ Mask +
+ + + + + + + Compute intersection of packed 64-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+7:k1] := 0 +MEM[k2+7:k2] := 0 +FOR i := 0 TO 3 + FOR j := 0 TO 3 + match := (a.qword[i] == b.qword[j] ? 1 : 0) + MEM[k1+7:k1].bit[i] |= match + MEM[k2+7:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512VL +
immintrin.h
+ Mask +
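A short C sketch of the 32-bit intersection operation described above; it assumes a compiler with AVX512_VP2INTERSECT support (e.g. -mavx512vp2intersect) and that the intrinsic writes both result masks through pointers, as specified.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* a = {0,1,2,...,15}, b = {2,4,6,...,32}: the shared values are the
       even numbers 2..14, so 7 bits get set in each mask. */
    int vals_a[16], vals_b[16];
    for (int i = 0; i < 16; i++) { vals_a[i] = i; vals_b[i] = 2 * (i + 1); }

    __m512i a = _mm512_loadu_si512((const void *)vals_a);
    __m512i b = _mm512_loadu_si512((const void *)vals_b);

    __mmask16 k1, k2;
    _mm512_2intersect_epi32(a, b, &k1, &k2);  /* vp2intersectd */

    /* k1 marks which lanes of a matched something in b,
       k2 marks which lanes of b matched something in a. */
    printf("k1=%04x k2=%04x\n", (unsigned)k1, (unsigned)k2);
    return 0;
}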
+ + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
+ + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
+ + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
+ + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
+ + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
+ + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
+ + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
+ + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
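The 52-bit multiply-add above is easy to mis-read, so here is a scalar C reference for one 64-bit lane that makes the low/high split of the 104-bit product explicit. It assumes a compiler with unsigned __int128 (GCC/Clang); the function names are illustrative and are not intrinsics.

#include <stdint.h>
#include <stdio.h>

#define MASK52 ((1ULL << 52) - 1)

/* One lane of the low-half pseudocode: add the low 52 bits of the 104-bit product. */
static uint64_t madd52lo_ref(uint64_t x, uint64_t y, uint64_t z) {
    unsigned __int128 p = (unsigned __int128)(y & MASK52) * (z & MASK52);
    return x + (uint64_t)(p & MASK52);
}

/* One lane of the high-half pseudocode: add bits 103:52 of the 104-bit product. */
static uint64_t madd52hi_ref(uint64_t x, uint64_t y, uint64_t z) {
    unsigned __int128 p = (unsigned __int128)(y & MASK52) * (z & MASK52);
    return x + (uint64_t)((p >> 52) & MASK52);
}

int main(void) {
    uint64_t y = 1ULL << 51, z = 2;   /* product = 2^52: low half 0, high half 1 */
    printf("%llu %llu\n", (unsigned long long)madd52lo_ref(10, y, z),
                          (unsigned long long)madd52hi_ref(10, y, z));   /* 10 11 */
    return 0;
}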
+ + + + Convert scalar BF16 (16-bit) floating-point element stored at memory locations starting at location "__A" to a single-precision (32-bit) floating-point, broadcast it to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +b := Convert_BF16_To_FP32(MEM[__A+15:__A]) +FOR j := 0 to 7 + m := j*32 + dst[m+31:m] := b +ENDFOR +dst[MAX:256] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting at location "__A" to a single-precision (32-bit) floating-point, broadcast it to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +b := Convert_FP16_To_FP32(MEM[__A+15:__A]) +FOR j := 0 to 7 + m := j*32 + dst[m+31:m] := b +ENDFOR +dst[MAX:256] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 7 + m := j*32 + dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+m+15:__A+m]) +ENDFOR +dst[MAX:256] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 7 + m := j*32 + dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+m+15:__A+m]) +ENDFOR +dst[MAX:256] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 7 + m := j*32 + dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+m+31:__A+m+16]) +ENDFOR +dst[MAX:256] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 7 + m := j*32 + dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+m+31:__A+m+16]) +ENDFOR +dst[MAX:256] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed single-precision (32-bit) floating-point elements in "__A" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 7 + dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert scalar BF16 (16-bit) floating-point element stored at memory locations starting at location "__A" to a single-precision (32-bit) floating-point, broadcast it to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +b := Convert_BF16_To_FP32(MEM[__A+15:__A]) +FOR j := 0 to 3 + m := j*32 + dst[m+31:m] := b +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting at location "__A" to a single-precision (32-bit) floating-point, broadcast it to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +b := Convert_FP16_To_FP32(MEM[__A+15:__A]) +FOR j := 0 to 3 + m := j*32 + dst[m+31:m] := b +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 3 + m := j*32 + dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+m+15:__A+m]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 3 + m := j*32 + dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+m+15:__A+m]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 3 + m := j*32 + dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+m+31:__A+m+16]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 3 + m := j*32 + dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+m+31:__A+m+16]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed single-precision (32-bit) floating-point elements in "__A" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 3 + dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed single-precision (32-bit) floating-point elements in "__A" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 7 + dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed single-precision (32-bit) floating-point elements in "__A" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 3 + dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
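Because BF16 is simply the upper 16 bits of an IEEE-754 binary32 value, the Convert_BF16_To_FP32 step used throughout the pseudocode above amounts to a 16-bit left shift. The C sketch below shows that, plus a truncating FP32-to-BF16 stand-in; note the real conversion instruction rounds to nearest even, so the truncation here is a simplification, and the names are illustrative.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* BF16 -> FP32 is exact: place the 16 BF16 bits in the top half of a binary32. */
static float bf16_to_fp32(uint16_t h) {
    uint32_t bits = (uint32_t)h << 16;
    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
}

/* FP32 -> BF16 by truncation; the hardware Convert_FP32_To_BF16 rounds to nearest even. */
static uint16_t fp32_to_bf16_trunc(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof bits);
    return (uint16_t)(bits >> 16);
}

int main(void) {
    uint16_t h = fp32_to_bf16_trunc(1.5f);   /* 1.5 is exactly representable in BF16 */
    printf("%f\n", bf16_to_fp32(h));         /* prints 1.500000 */
    return 0;
}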
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
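The only difference between the plain accumulates and the variants described above as "using signed saturation" is the Saturate32 step applied to the final sum. A scalar C reading of one lane, using 64-bit intermediate arithmetic so the clamp is explicit (helper names illustrative):

#include <stdint.h>
#include <stdio.h>

/* Clamp a 64-bit sum into the signed 32-bit range, as Saturate32 does. */
static int32_t saturate32(int64_t v) {
    if (v > INT32_MAX) return INT32_MAX;
    if (v < INT32_MIN) return INT32_MIN;
    return (int32_t)v;
}

/* One dword lane of the saturating unsigned-by-signed byte dot product. */
static int32_t dpbusds_lane(int32_t src, const uint8_t a[4], const int8_t b[4]) {
    int64_t sum = src;
    for (int i = 0; i < 4; i++)
        sum += (int32_t)a[i] * (int32_t)b[i];   /* exact sum of the four 16-bit products */
    return saturate32(sum);
}

int main(void) {
    const uint8_t a[4] = {255, 255, 255, 255};
    const int8_t  b[4] = {127, 127, 127, 127};
    printf("%d\n", dpbusds_lane(INT32_MAX, a, b));  /* saturates to 2147483647 */
    return 0;
}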
+ + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:256] := 0 + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding signed 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding signed 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:256] := 0 + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:256] := 0 + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:128] := 0 + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding signed 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding signed 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:128] := 0 + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:128] := 0 + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
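The AVX_VNNI_INT16 flavours above differ only in which operand is sign-extended and which is zero-extended before the two products are summed. A scalar sketch of one dword lane of the non-saturating forms (names illustrative, no intrinsics assumed):

#include <stdint.h>
#include <stdio.h>

/* signed a times unsigned b */
static int32_t dpwsud_lane(int32_t w, const int16_t a[2], const uint16_t b[2]) {
    return w + (int32_t)a[0] * (int32_t)b[0] + (int32_t)a[1] * (int32_t)b[1];
}

/* unsigned a times unsigned b; the non-saturating form wraps modulo 2^32. */
static uint32_t dpwuud_lane(uint32_t w, const uint16_t a[2], const uint16_t b[2]) {
    return w + (uint32_t)a[0] * b[0] + (uint32_t)a[1] * b[1];
}

int main(void) {
    const int16_t  s[2] = {-100, 50};
    const uint16_t u[2] = {60000, 2};
    printf("%d\n", dpwsud_lane(0, s, u));   /* -100*60000 + 50*2 = -5999900 */
    return 0;
}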
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding signed 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) + tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) + tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) + tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding signed 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) + tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) + tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) + tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:256] := 0 + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) + tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) + tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) + tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) + tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) + tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) + tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:256] := 0 + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) + tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) + tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) + tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with unsigned saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) + tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) + tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) + tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) + dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:256] := 0 + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding signed 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) + tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) + tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) + tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding signed 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) + tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) + tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) + tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:128] := 0 + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) + tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) + tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) + tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) + tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) + tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) + tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:128] := 0 + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) + tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) + tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) + tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with unsigned saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) + tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) + tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) + tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) + dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:128] := 0 + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
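For the unsigned-by-unsigned byte form with unsigned saturation (the last entry above), the accumulate clamps at UINT32_MAX rather than at the signed limits. A scalar sketch of one lane (helper name illustrative):

#include <stdint.h>
#include <stdio.h>

/* One dword lane of the u8 x u8 dot product with UNSIGNED_DWORD_SATURATE. */
static uint32_t dpbuuds_lane(uint32_t w, const uint8_t a[4], const uint8_t b[4]) {
    uint64_t sum = w;
    for (int i = 0; i < 4; i++)
        sum += (uint32_t)a[i] * b[i];
    return sum > UINT32_MAX ? UINT32_MAX : (uint32_t)sum;
}

int main(void) {
    const uint8_t a[4] = {255, 255, 255, 255};
    const uint8_t b[4] = {255, 255, 255, 255};
    printf("%u\n", dpbuuds_lane(UINT32_MAX, a, b));  /* clamps to 4294967295 */
    return 0;
}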
+ + + + + + + Extract contiguous bits from unsigned 32-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start". + +tmp[511:0] := a +dst[31:0] := ZeroExtend32(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + Extract contiguous bits from unsigned 32-bit integer "a", and store the result in "dst". Extract the number of bits specified by bits 15:8 of "control", starting at the bit specified by bits 0:7 of "control". + +start := control[7:0] +len := control[15:8] +tmp[511:0] := a +dst[31:0] := ZeroExtend32(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + + Extract contiguous bits from unsigned 64-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start". + +tmp[511:0] := a +dst[63:0] := ZeroExtend64(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + Extract contiguous bits from unsigned 64-bit integer "a", and store the result in "dst". Extract the number of bits specified by bits 15:8 of "control", starting at the bit specified by bits 0:7 of "control". + +start := control[7:0] +len := control[15:8] +tmp[511:0] := a +dst[63:0] := ZeroExtend64(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Extract the lowest set bit from unsigned 32-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a". + +dst := (-a) AND a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Extract the lowest set bit from unsigned 64-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a". + +dst := (-a) AND a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 32-bit integer "a". + +dst := (a - 1) XOR a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 64-bit integer "a". + +dst := (a - 1) XOR a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a". + +dst := (a - 1) AND a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a". + +dst := (a - 1) AND a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + Compute the bitwise NOT of 32-bit integer "a" and then AND with "b", and store the results in "dst". + +dst[31:0] := ((NOT a[31:0]) AND b[31:0]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + Compute the bitwise NOT of 64-bit integer "a" and then AND with "b", and store the results in "dst". + +dst[63:0] := ((NOT a[63:0]) AND b[63:0]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of trailing zero bits in unsigned 16-bit integer "a", and return that count in "dst". + +tmp := 0 +dst := 0 +DO WHILE ((tmp < 16) AND a[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 +OD + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst". + +tmp := 0 +dst := 0 +DO WHILE ((tmp < 32) AND a[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 +OD + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst". + +tmp := 0 +dst := 0 +DO WHILE ((tmp < 64) AND a[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 +OD + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst". + +tmp := 0 +dst := 0 +DO WHILE ((tmp < 32) AND a[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 +OD + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst". + +tmp := 0 +dst := 0 +DO WHILE ((tmp < 64) AND a[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 +OD + + + BMI1 +
immintrin.h
+ Bit Manipulation +
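The trailing-zero-count loops above reduce to a simple bit scan; a portable Rust model of the 32-bit form, with Rust's built-in trailing_zeros shown alongside for comparison:

// Reference model of TZCNT: count trailing zero bits, returning the operand
// width when the input is zero (matching the DO WHILE loops in the entries above).
fn tzcnt_u32(a: u32) -> u32 {
    let mut n = 0;
    while n < 32 && (a >> n) & 1 == 0 {
        n += 1;
    }
    n
}

fn main() {
    assert_eq!(tzcnt_u32(0b1010_0000), 5);
    assert_eq!(tzcnt_u32(0), 32);
    assert_eq!(0b1010_0000u32.trailing_zeros(), 5); // Rust's built-in agrees
}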
+ + + + + + + Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index". + +n := index[7:0] +dst := a +IF (n < 32) + dst[31:n] := 0 +FI + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index". + +n := index[7:0] +dst := a +IF (n < 64) + dst[63:n] := 0 +FI + + + BMI2 +
immintrin.h
+ Bit Manipulation +
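A portable Rust model of the BZHI pseudocode above, which zeroes the bits at position "index" and higher, with the index taken from the low 8 bits as in the description:

// Reference model of BZHI: zero the high bits of `a` starting at `index`.
fn bzhi_u32(a: u32, index: u32) -> u32 {
    let n = index & 0xff;
    if n >= 32 { a } else { a & ((1u32 << n) - 1) }
}

fn main() {
    assert_eq!(bzhi_u32(0xFFFF_FFFF, 8), 0xFF);
    assert_eq!(bzhi_u32(0x1234_5678, 40), 0x1234_5678); // index >= 32 leaves `a` unchanged
}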
+ + + + + Deposit contiguous low bits from unsigned 32-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero. + +tmp := a +dst := 0 +m := 0 +k := 0 +DO WHILE m < 32 + IF mask[m] == 1 + dst[m] := tmp[k] + k := k + 1 + FI + m := m + 1 +OD + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + Deposit contiguous low bits from unsigned 64-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero. + +tmp := a +dst := 0 +m := 0 +k := 0 +DO WHILE m < 64 + IF mask[m] == 1 + dst[m] := tmp[k] + k := k + 1 + FI + m := m + 1 +OD + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + Extract bits from unsigned 32-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero. + +tmp := a +dst := 0 +m := 0 +k := 0 +DO WHILE m < 32 + IF mask[m] == 1 + dst[k] := tmp[m] + k := k + 1 + FI + m := m + 1 +OD + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + Extract bits from unsigned 64-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero. + +tmp := a +dst := 0 +m := 0 +k := 0 +DO WHILE m < 64 + IF mask[m] == 1 + dst[k] := tmp[m] + k := k + 1 + FI + m := m + 1 +OD + + + BMI2 +
immintrin.h
+ Bit Manipulation +
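The deposit/extract pair above is easiest to follow as two mirrored loops; a reference Rust model of the 32-bit forms of the pseudocode:

// PDEP: scatter the low bits of `a` into the positions set in `mask`.
fn pdep_u32(a: u32, mask: u32) -> u32 {
    let (mut dst, mut k) = (0u32, 0u32);
    for m in 0..32 {
        if mask >> m & 1 == 1 {
            dst |= ((a >> k) & 1) << m;
            k += 1;
        }
    }
    dst
}

// PEXT: gather the bits of `a` selected by `mask` into the low bits.
fn pext_u32(a: u32, mask: u32) -> u32 {
    let (mut dst, mut k) = (0u32, 0u32);
    for m in 0..32 {
        if mask >> m & 1 == 1 {
            dst |= ((a >> m) & 1) << k;
            k += 1;
        }
    }
    dst
}

fn main() {
    assert_eq!(pdep_u32(0b1011, 0xF0F0), 0xB0); // low bits land at positions 4,5,6,7
    assert_eq!(pext_u32(0xB0, 0xF0F0), 0b1011); // and PEXT undoes it
}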
+ + + + + + Multiply unsigned 32-bit integers "a" and "b", store the low 32-bits of the result in "dst", and store the high 32-bits in "hi". This does not read or write arithmetic flags. + +dst[31:0] := (a * b)[31:0] +MEM[hi+31:hi] := (a * b)[63:32] + + + BMI2 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply unsigned 64-bit integers "a" and "b", store the low 64-bits of the result in "dst", and store the high 64-bits in "hi". This does not read or write arithmetic flags. + +dst[63:0] := (a * b)[63:0] +MEM[hi+63:hi] := (a * b)[127:64] + + + BMI2 +
immintrin.h
+ Arithmetic +
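The MULX entries describe a plain widening multiply with the high half returned through a pointer and no flag updates. A portable Rust model of the 64-bit form using a 128-bit intermediate:

// Reference model of MULX: full 64x64 -> 128-bit unsigned multiply, returning
// the low half and writing the high half through `hi`.
fn mulx_u64(a: u64, b: u64, hi: &mut u64) -> u64 {
    let wide = (a as u128) * (b as u128);
    *hi = (wide >> 64) as u64;
    wide as u64
}

fn main() {
    let mut hi = 0u64;
    let lo = mulx_u64(u64::MAX, 2, &mut hi);
    assert_eq!((hi, lo), (1, u64::MAX - 1));
}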
+ + + + + + Increment the shadow stack pointer by 4 times the value specified in bits [7:0] of "a". + +SSP := SSP + a[7:0] * 4 + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Increment the shadow stack pointer by 8 times the value specified in bits [7:0] of "a". + +SSP := SSP + a[7:0] * 8 + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Read the low 32-bits of the current shadow stack pointer, and store the result in "dst". + dst := SSP[31:0] + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Read the current shadow stack pointer, and store the result in "dst". + dst := SSP[63:0] + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Save the previous shadow stack pointer context. + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Restore the saved shadow stack pointer from the shadow stack restore token previously created on shadow stack by saveprevssp. + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + + Write 32-bit value in "val" to a shadow stack page in memory specified by "p". + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + + Write 64-bit value in "val" to a shadow stack page in memory specified by "p". + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + + Write 32-bit value in "val" to a user shadow stack page in memory specified by "p". + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + + Write 64-bit value in "val" to a user shadow stack page in memory specified by "p". + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Mark shadow stack pointed to by IA32_PL0_SSP as busy. + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Mark shadow stack pointed to by "p" as not busy. + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + If CET is enabled, read the low 32-bits of the current shadow stack pointer, and store the result in "dst". Otherwise return 0. + dst := SSP[31:0] + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + If CET is enabled, read the current shadow stack pointer, and store the result in "dst". Otherwise return 0. + dst := SSP[63:0] + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Increment the shadow stack pointer by 4 times the value specified in bits [7:0] of "a". + +SSP := SSP + a[7:0] * 4 + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + + Hint to hardware that the cache line that contains "p" should be demoted from the cache closest to the processor core to a level more distant from the processor core. + + CLDEMOTE +
immintrin.h
+ Miscellaneous +
+ + + + + + Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy. + + CLFLUSHOPT +
immintrin.h
+ General Support +
+ + + + + + Write back to memory the cache line that contains "p" from any level of the cache hierarchy in the cache coherence domain. + + CLWB +
immintrin.h
+ General Support +
+ + + + + + + + + Compares the value from the memory "__A" with the value of "__B". If the specified condition "__D" is met, then add the third operand "__C" to the "__A" and write it into "__A", else the value of "__A" is unchanged. The return value is the original value of "__A". + CASE (__D[3:0]) OF +0: OP := _CMPCCX_O +1: OP := _CMPCCX_NO +2: OP := _CMPCCX_B +3: OP := _CMPCCX_NB +4: OP := _CMPCCX_Z +5: OP := _CMPCCX_NZ +6: OP := _CMPCCX_BE +7: OP := _CMPCCX_NBE +8: OP := _CMPCCX_S +9: OP := _CMPCCX_NS +10: OP := _CMPCCX_P +11: OP := _CMPCCX_NP +12: OP := _CMPCCX_L +13: OP := _CMPCCX_NL +14: OP := _CMPCCX_LE +15: OP := _CMPCCX_NLE +ESAC +tmp1 := LOAD_LOCK(__A) +tmp2 := tmp1 + __C +IF (tmp1[31:0] OP __B[31:0]) + STORE_UNLOCK(__A, tmp2) +ELSE + STORE_UNLOCK(__A, tmp1) +FI +dst[31:0] := tmp1[31:0] + + + + + + + + + + + + + + + + + + CMPCCXADD +
immintrin.h
+ Arithmetic +
+ + + + + + + Compares the value from the memory "__A" with the value of "__B". If the specified condition "__D" is met, then add the third operand "__C" to the "__A" and write it into "__A", else the value of "__A" is unchanged. The return value is the original value of "__A". + CASE (__D[3:0]) OF +0: OP := _CMPCCX_O +1: OP := _CMPCCX_NO +2: OP := _CMPCCX_B +3: OP := _CMPCCX_NB +4: OP := _CMPCCX_Z +5: OP := _CMPCCX_NZ +6: OP := _CMPCCX_BE +7: OP := _CMPCCX_NBE +8: OP := _CMPCCX_S +9: OP := _CMPCCX_NS +10: OP := _CMPCCX_P +11: OP := _CMPCCX_NP +12: OP := _CMPCCX_L +13: OP := _CMPCCX_NL +14: OP := _CMPCCX_LE +15: OP := _CMPCCX_NLE +ESAC +tmp1 := LOAD_LOCK(__A) +tmp2 := tmp1 + __C +IF (tmp1[63:0] OP __B[63:0]) + STORE_UNLOCK(__A, tmp2) +ELSE + STORE_UNLOCK(__A, tmp1) +FI +dst[63:0] := tmp1[63:0] + + + + + + + + + + + + + + + + + + CMPCCXADD +
immintrin.h
+ Arithmetic +
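The CMPCCXADD entries describe an atomic compare-and-conditionally-add selected by one of sixteen condition codes. As a rough illustration only, the signed less-than case (_CMPCCX_L) can be emulated with a compare-exchange loop; the real instruction performs this in a single locked operation:

use std::sync::atomic::{AtomicI32, Ordering};

// Emulate the "less-than" flavour of the entries above: atomically load *a,
// and if old < b, store old + c; either way return the old value.
fn cmp_lt_xadd(a: &AtomicI32, b: i32, c: i32) -> i32 {
    let mut old = a.load(Ordering::SeqCst);
    loop {
        let new = if old < b { old.wrapping_add(c) } else { old };
        match a.compare_exchange_weak(old, new, Ordering::SeqCst, Ordering::SeqCst) {
            Ok(prev) => return prev,
            Err(cur) => old = cur,
        }
    }
}

fn main() {
    let x = AtomicI32::new(5);
    assert_eq!(cmp_lt_xadd(&x, 10, 3), 5); // 5 < 10, so x becomes 8
    assert_eq!(x.load(Ordering::SeqCst), 8);
}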
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 8-bit integer "v", and stores the result in "dst". + tmp1[7:0] := v[0:7] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[39:0] := tmp1[7:0] << 32 +tmp4[39:0] := tmp2[31:0] << 8 +tmp5[39:0] := tmp3[39:0] XOR tmp4[39:0] +tmp6[31:0] := MOD2(tmp5[39:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + CRC32 +
nmmintrin.h
+ Cryptography +
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 16-bit integer "v", and stores the result in "dst". + tmp1[15:0] := v[0:15] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[47:0] := tmp1[15:0] << 32 +tmp4[47:0] := tmp2[31:0] << 16 +tmp5[47:0] := tmp3[47:0] XOR tmp4[47:0] +tmp6[31:0] := MOD2(tmp5[47:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + CRC32 +
nmmintrin.h
+ Cryptography +
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 32-bit integer "v", and stores the result in "dst". + tmp1[31:0] := v[0:31] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[63:0] := tmp1[31:0] << 32 +tmp4[63:0] := tmp2[31:0] << 32 +tmp5[63:0] := tmp3[63:0] XOR tmp4[63:0] +tmp6[31:0] := MOD2(tmp5[63:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + CRC32 +
nmmintrin.h
+ Cryptography +
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 64-bit integer "v", and stores the result in "dst". + tmp1[63:0] := v[0:63] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[95:0] := tmp1[31:0] << 32 +tmp4[95:0] := tmp2[63:0] << 64 +tmp5[95:0] := tmp3[95:0] XOR tmp4[95:0] +tmp6[31:0] := MOD2(tmp5[95:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + CRC32 +
nmmintrin.h
+ Cryptography +
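The CRC32 entries accumulate a CRC-32C value using the polynomial 0x11EDC6F41 in bit-reflected form. A bitwise software model of the per-byte step, equivalent to the recurrence the instruction evaluates in hardware:

// Bitwise software model of the CRC32-C accumulation step described above,
// processed in reflected (LSB-first) form.
fn crc32c_u8(crc: u32, v: u8) -> u32 {
    let mut crc = crc ^ v as u32;
    for _ in 0..8 {
        let mask = (crc & 1).wrapping_neg();     // all-ones if the low bit is set
        crc = (crc >> 1) ^ (0x82F6_3B78 & mask); // 0x82F63B78 = bit-reversed 0x1EDC6F41
    }
    crc
}

fn main() {
    // Accumulate over a byte slice, with the usual initial value and final XOR.
    let crc = b"123456789".iter().fold(0xFFFF_FFFFu32, |c, &b| crc32c_u8(c, b)) ^ 0xFFFF_FFFF;
    assert_eq!(crc, 0xE306_9283); // well-known CRC-32C check value for "123456789"
}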
+ + + + + + + Reads the 64-byte command pointed to by "__src", formats 64-byte enqueue store data, and performs a 64-byte enqueue store to the memory pointed to by "__dst". This intrinsic may only be used in User mode. + + ENQCMD +
immintrin.h
+ Unknown +
+ + + + + Reads the 64-byte command pointed to by "__src", formats 64-byte enqueue store data, and performs a 64-byte enqueue store to the memory pointed to by "__dst". This intrinsic may only be used in Privileged mode. + + ENQCMD +
immintrin.h
+ Unknown +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:256] := 0 + + + F16C +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_imm_note] + +FOR j := 0 to 7 + i := 16*j + l := 32*j + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +ENDFOR +dst[MAX:128] := 0 + + + F16C +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:128] := 0 + + + F16C +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_imm_note] + +FOR j := 0 to 3 + i := 16*j + l := 32*j + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +ENDFOR +dst[MAX:64] := 0 + + + F16C +
immintrin.h
+ Convert +
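The F16C entries convert between half and single precision lane by lane. A scalar Rust model of the half-to-single direction, decoding the 1/5/10-bit layout directly (the rounding-controlled single-to-half direction is omitted here):

// Scalar model of the half -> single conversion performed lane-wise above
// (1 sign bit, 5 exponent bits with bias 15, 10 fraction bits).
fn f16_to_f32(h: u16) -> f32 {
    let sign = if h & 0x8000 != 0 { -1.0f32 } else { 1.0 };
    let exp = ((h >> 10) & 0x1F) as i32;
    let frac = (h & 0x3FF) as f32;
    match exp {
        0x00 => sign * frac * 2f32.powi(-24),                    // zero and subnormals
        0x1F => if frac == 0.0 { sign * f32::INFINITY } else { f32::NAN },
        _ => sign * (1.0 + frac / 1024.0) * 2f32.powi(exp - 15), // normal numbers
    }
}

fn main() {
    assert_eq!(f16_to_f32(0x3C00), 1.0);
    assert_eq!(f16_to_f32(0xC000), -2.0);
    assert_eq!(f16_to_f32(0x7BFF), 65504.0); // largest finite half
}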
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
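The scalar FMA entries compute a*b + c with a single rounding step. Rust's f64::mul_add documents the same single-rounding behaviour, which the sketch below uses to show where fusing changes the result:

// The fused form rounds once; the unfused form rounds the product first and
// loses the tiny term entirely.
fn main() {
    let (a, b, c) = (1.0 + f64::EPSILON, 1.0 - f64::EPSILON, -1.0);
    let fused = a.mul_add(b, c); // (1+e)(1-e) - 1 = -e^2, kept exactly
    let unfused = a * b + c;     // product rounds to 1.0, so the sum is 0.0
    assert_eq!(fused, -f64::EPSILON * f64::EPSILON);
    assert_eq!(unfused, 0.0);
}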
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
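The fmaddsub entries alternate subtract on even lanes and add on odd lanes, the lane pattern used for interleaved complex arithmetic. A portable Rust model of that pattern:

// Even lanes compute a*b - c, odd lanes compute a*b + c, each fused.
fn fmaddsub(a: &[f32], b: &[f32], c: &[f32]) -> Vec<f32> {
    a.iter()
        .zip(b)
        .zip(c)
        .enumerate()
        .map(|(j, ((&a, &b), &c))| if j % 2 == 0 { a.mul_add(b, -c) } else { a.mul_add(b, c) })
        .collect()
}

fn main() {
    let r = fmaddsub(&[1.0, 2.0, 3.0, 4.0], &[10.0, 10.0, 10.0, 10.0], &[1.0, 1.0, 1.0, 1.0]);
    assert_eq!(r, vec![9.0, 21.0, 29.0, 41.0]);
}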
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + Read the FS segment base register and store the 32-bit result in "dst". + dst[31:0] := FS_Segment_Base_Register +dst[63:32] := 0 + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + Read the FS segment base register and store the 64-bit result in "dst". + dst[63:0] := FS_Segment_Base_Register + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + Read the GS segment base register and store the 32-bit result in "dst". + dst[31:0] := GS_Segment_Base_Register +dst[63:32] := 0 + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + Read the GS segment base register and store the 64-bit result in "dst". + dst[63:0] := GS_Segment_Base_Register + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + + Write the unsigned 32-bit integer "a" to the FS segment base register. + +FS_Segment_Base_Register[31:0] := a[31:0] +FS_Segment_Base_Register[63:32] := 0 + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + + Write the unsigned 64-bit integer "a" to the FS segment base register. + +FS_Segment_Base_Register[63:0] := a[63:0] + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + + Write the unsigned 32-bit integer "a" to the GS segment base register. + +GS_Segment_Base_Register[31:0] := a[31:0] +GS_Segment_Base_Register[63:32] := 0 + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + + Write the unsigned 64-bit integer "a" to the GS segment base register. + +GS_Segment_Base_Register[63:0] := a[63:0] + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + + + + Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary. + state_x87_fpu_mmx_sse := fxrstor(MEM[mem_addr+512*8:mem_addr]) + + + FXSR +
immintrin.h
+ OS-Targeted +
+ + + + Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE64 instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary. + state_x87_fpu_mmx_sse := fxrstor64(MEM[mem_addr+512*8:mem_addr]) + + + FXSR +
immintrin.h
+ OS-Targeted +
+ + + + Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor. + MEM[mem_addr+512*8:mem_addr] := fxsave(state_x87_fpu_mmx_sse) + + + FXSR +
immintrin.h
+ OS-Targeted +
+ + + + Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor. + MEM[mem_addr+512*8:mem_addr] := fxsave64(state_x87_fpu_mmx_sse) + + + FXSR +
immintrin.h
+ OS-Targeted +
+ + + + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 63 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 63 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := src.byte[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 63 + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
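The GFNI multiply entries all share the gf2p8mul_byte helper: a carry-less multiply followed by reduction modulo x^8 + x^4 + x^3 + x + 1 (0x11B). A byte-level Rust model of that helper:

// Carry-less multiply in GF(2), then reduce modulo the AES polynomial 0x11B.
fn gf2p8_mul(a: u8, b: u8) -> u8 {
    let mut t: u16 = 0;
    for i in 0..8 {
        if (b >> i) & 1 == 1 {
            t ^= (a as u16) << i;
        }
    }
    for i in (8..=14).rev() {
        if (t >> i) & 1 == 1 {
            t ^= 0x11B << (i - 8);
        }
    }
    t as u8
}

fn main() {
    // 0x53 * 0xCA = 0x01 in this field (the standard AES inverse pair).
    assert_eq!(gf2p8_mul(0x53, 0xCA), 0x01);
}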
+ + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst". + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
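The affine entries all reduce to the affine_byte helper: result bit i is the parity of (matrix byte 7-i AND the input byte), XORed with bit i of the immediate. A byte-level Rust model, with identity and bit-reversal matrices as worked examples (the matrix constants here are illustrations, not values taken from this data):

// Byte-level model of affine_byte over one 64-bit matrix qword.
fn gf2p8_affine(a_matrix: u64, x: u8, b: u8) -> u8 {
    let mut out = 0u8;
    for i in 0..8u32 {
        let row = (a_matrix >> ((7 - i) * 8)) as u8;     // A.byte[7 - i]
        let parity = ((row & x).count_ones() & 1) as u8; // parity of the AND
        out |= (parity ^ ((b >> i) & 1)) << i;
    }
    out
}

fn main() {
    // With A.byte[k] = 0x80 >> k and b = 0 the transform is the identity;
    // with A.byte[k] = 1 << k it reverses the bit order instead.
    let identity: u64 = 0x0102_0408_1020_4080;
    assert_eq!(gf2p8_affine(identity, 0xA5, 0), 0xA5);
    let reverse: u64 = 0x8040_2010_0804_0201;
    assert_eq!(gf2p8_affine(reverse, 0b0000_0001, 0), 0b1000_0000);
}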
+ + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[b] + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst". + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 31 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src"" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 31 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := src.byte[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 31 + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 15 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src"" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 15 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := src.byte[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 15 + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst". + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst". + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst". + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst". + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Provides a hint to the processor to selectively reset the prediction history of the current logical processor specified by a signed 32-bit integer "__eax". + + HRESET +
immintrin.h
+ General Support +
+ + + + + + Invalidate mappings in the Translation Lookaside Buffers (TLBs) and paging-structure caches for the processor context identifier (PCID) specified by "descriptor" based on the invalidation type specified in "type". + The PCID "descriptor" is specified as a 16-byte memory operand (with no alignment restrictions) where bits [11:0] specify the PCID, and bits [127:64] specify the linear address; bits [63:12] are reserved. + The types supported are: + 0) Individual-address invalidation: If "type" is 0, the logical processor invalidates mappings for a single linear address and tagged with the PCID specified in "descriptor", except global translations. The instruction may also invalidate global translations, mappings for other linear addresses, or mappings tagged with other PCIDs. + 1) Single-context invalidation: If "type" is 1, the logical processor invalidates all mappings tagged with the PCID specified in "descriptor" except global translations. In some cases, it may invalidate mappings for other PCIDs as well. + 2) All-context invalidation: If "type" is 2, the logical processor invalidates all mappings tagged with any PCID. + 3) All-context invalidation, retaining global translations: If "type" is 3, the logical processor invalidates all mappings tagged with any PCID except global translations, ignoring "descriptor". The instruction may also invalidate global translations as well. + +CASE type[1:0] OF +0: // individual-address invalidation retaining global translations + OP_PCID := MEM[descriptor+11:descriptor] + ADDR := MEM[descriptor+127:descriptor+64] + BREAK +1: // single PCID invalidation retaining globals + OP_PCID := MEM[descriptor+11:descriptor] + // invalidate all mappings tagged with OP_PCID except global translations + BREAK +2: // all PCID invalidation + // invalidate all mappings tagged with any PCID + BREAK +3: // all PCID invalidation retaining global translations + // invalidate all mappings tagged with any PCID except global translations + BREAK +ESAC + + + INVPCID +
immintrin.h
+ OS-Targeted +
+ + + + Flag + + + + + Decrypt 10 rounds of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], __h[383:0]) +dst := ZF + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Decrypt 10 rounds of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], __h[511:0]) +dst := ZF + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Encrypt 10 rounds of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. + MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], __h[383:0]) +dst := ZF + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Encrypt 10 rounds of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], __h[511:0]) +dst := ZF + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Wrap a 128-bit AES key from "__key" into a 384-bit key __h stored in "__h" and set IWKey's NoBackup and KeySource bits in "dst". The explicit source operand "__htype" specifies __h restrictions. + __h[383:0] := WrapKey128(__key[127:0], __htype) +dst[0] := IWKey.NoBackup +dst[4:1] := IWKey.KeySource[3:0] + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + + Wrap a 256-bit AES key from "__key_hi" and "__key_lo" into a 512-bit key stored in "__h" and set IWKey's NoBackup and KeySource bits in "dst". The 32-bit "__htype" specifies __h restrictions. + __h[511:0] := WrapKey256(__key_lo[127:0], __key_hi[127:0], __htype) +dst[0] := IWKey.NoBackup +dst[4:1] := IWKey.KeySource[3:0] + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + + Load internal wrapping key (IWKey). The 32-bit unsigned integer "__ctl" specifies IWKey's KeySource and whether backing up the key is permitted. IWKey's 256-bit encryption key is loaded from "__enkey_lo" and "__enkey_hi". IWKey's 128-bit integrity key is loaded from "__intkey". + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Decrypt 10 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + FOR i := 0 to 7 + __odata[i] := AES128Decrypt (__idata[i], __h[383:0]) +ENDFOR +dst := ZF + + + KEYLOCKER_WIDE +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Decrypt 10 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + FOR i := 0 to 7 + __odata[i] := AES256Decrypt (__idata[i], __h[511:0]) +ENDFOR +dst := ZF + + + KEYLOCKER_WIDE +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Encrypt 10 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + FOR i := 0 to 7 + __odata[i] := AES128Encrypt (__idata[i], __h[383:0]) +ENDFOR +dst := ZF + + + KEYLOCKER_WIDE +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Encrypt 10 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + FOR i := 0 to 7 + __odata[i] := AES256Encrypt (__idata[i], __h[512:0]) +ENDFOR +dst := ZF + + + KEYLOCKER_WIDE +
immintrin.h
+ Cryptography +
+ + + + + Count the number of leading zero bits in unsigned 32-bit integer "a", and return that count in "dst". + +tmp := 31 +dst := 0 +DO WHILE (tmp >= 0 AND a[tmp] == 0) + tmp := tmp - 1 + dst := dst + 1 +OD + + + LZCNT +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of leading zero bits in unsigned 64-bit integer "a", and return that count in "dst". + +tmp := 63 +dst := 0 +DO WHILE (tmp >= 0 AND a[tmp] == 0) + tmp := tmp - 1 + dst := dst + 1 +OD + + + LZCNT +
immintrin.h
+ Bit Manipulation +
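A minimal C sketch of the leading-zero count defined by the loop above; the function name clz32_sketch is illustrative, and on LZCNT-capable toolchains the _lzcnt_u32 intrinsic from immintrin.h performs the same operation directly.

#include <stdint.h>

/* Count leading zero bits of a 32-bit value, mirroring the DO WHILE loop
   above. Returns 32 when a == 0, matching the LZCNT definition. */
static unsigned clz32_sketch(uint32_t a)
{
    unsigned dst = 0;
    int tmp = 31;
    while (tmp >= 0 && ((a >> tmp) & 1u) == 0) {
        tmp -= 1;
        dst += 1;
    }
    return dst;
}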
+ + + + + + Copy 64-bit integer "a" to "dst". + +dst[63:0] := a[63:0] + + + MMX +
mmintrin.h
+ Convert +
+ + + + Copy 64-bit integer "a" to "dst". + +dst[63:0] := a[63:0] + + + MMX +
mmintrin.h
+ Convert +
+ + + + Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper element of "dst". + +dst[31:0] := a[31:0] +dst[63:32] := 0 + + + MMX +
mmintrin.h
+ Convert +
+ + + + Copy the lower 32-bit integer in "a" to "dst". + +dst[31:0] := a[31:0] + + + MMX +
mmintrin.h
+ Convert +
+ + + + Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper element of "dst". + +dst[31:0] := a[31:0] +dst[63:32] := 0 + + + MMX +
mmintrin.h
+ Convert +
+ + + + Copy the lower 32-bit integer in "a" to "dst". + +dst[31:0] := a[31:0] + + + MMX +
mmintrin.h
+ Convert +
+ + + + Copy 64-bit integer "a" to "dst". + +dst[63:0] := a[63:0] + + + MMX +
mmintrin.h
+ Convert +
+ + + + Copy 64-bit integer "a" to "dst". + +dst[63:0] := a[63:0] + + + MMX +
mmintrin.h
+ Convert +
+ + + + Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures. + + MMX +
mmintrin.h
+ General Support +
+ + + + Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures. + + MMX +
mmintrin.h
+ General Support +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". + +dst[7:0] := Saturate8(a[15:0]) +dst[15:8] := Saturate8(a[31:16]) +dst[23:16] := Saturate8(a[47:32]) +dst[31:24] := Saturate8(a[63:48]) +dst[39:32] := Saturate8(b[15:0]) +dst[47:40] := Saturate8(b[31:16]) +dst[55:48] := Saturate8(b[47:32]) +dst[63:56] := Saturate8(b[63:48]) + + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". + +dst[15:0] := Saturate16(a[31:0]) +dst[31:16] := Saturate16(a[63:32]) +dst[47:32] := Saturate16(b[31:0]) +dst[63:48] := Saturate16(b[63:32]) + + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". + +dst[7:0] := SaturateU8(a[15:0]) +dst[15:8] := SaturateU8(a[31:16]) +dst[23:16] := SaturateU8(a[47:32]) +dst[31:24] := SaturateU8(a[63:48]) +dst[39:32] := SaturateU8(b[15:0]) +dst[47:40] := SaturateU8(b[31:16]) +dst[55:48] := SaturateU8(b[47:32]) +dst[63:56] := SaturateU8(b[63:48]) + + + MMX +
mmintrin.h
+ Miscellaneous +
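The pack operations rely on the Saturate8/SaturateU8 helpers to clamp each wider element into the destination range before truncating. A minimal C sketch of the two 8-bit clamps (the helper names are illustrative, not part of any header):

#include <stdint.h>

/* Signed saturation: clamp a signed 16-bit value to [-128, 127]. */
static int8_t saturate8(int16_t x)
{
    if (x > 127)  return 127;
    if (x < -128) return -128;
    return (int8_t)x;
}

/* Unsigned saturation: clamp a signed 16-bit value to [0, 255]. */
static uint8_t saturate_u8(int16_t x)
{
    if (x < 0)   return 0;
    if (x > 255) return 255;
    return (uint8_t)x;
}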
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". + +dst[7:0] := Saturate8(a[15:0]) +dst[15:8] := Saturate8(a[31:16]) +dst[23:16] := Saturate8(a[47:32]) +dst[31:24] := Saturate8(a[63:48]) +dst[39:32] := Saturate8(b[15:0]) +dst[47:40] := Saturate8(b[31:16]) +dst[55:48] := Saturate8(b[47:32]) +dst[63:56] := Saturate8(b[63:48]) + + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". + +dst[15:0] := Saturate16(a[31:0]) +dst[31:16] := Saturate16(a[63:32]) +dst[47:32] := Saturate16(b[31:0]) +dst[63:48] := Saturate16(b[63:32]) + + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". + +dst[7:0] := SaturateU8(a[15:0]) +dst[15:8] := SaturateU8(a[31:16]) +dst[23:16] := SaturateU8(a[47:32]) +dst[31:24] := SaturateU8(a[63:48]) +dst[39:32] := SaturateU8(b[15:0]) +dst[47:40] := SaturateU8(b[31:16]) +dst[55:48] := SaturateU8(b[47:32]) +dst[63:56] := SaturateU8(b[63:48]) + + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]) { + dst[7:0] := src1[39:32] + dst[15:8] := src2[39:32] + dst[23:16] := src1[47:40] + dst[31:24] := src2[47:40] + dst[39:32] := src1[55:48] + dst[47:40] := src2[55:48] + dst[55:48] := src1[63:56] + dst[63:56] := src2[63:56] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]) { + dst[15:0] := src1[47:32] + dst[31:16] := src2[47:32] + dst[47:32] := src1[63:48] + dst[63:48] := src2[63:48] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". + +dst[31:0] := a[63:32] +dst[63:32] := b[63:32] + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_BYTES(src1[63:0], src2[63:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_WORDS(src1[63:0], src2[63:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". + +dst[31:0] := a[31:0] +dst[63:32] := b[31:0] + + + MMX +
mmintrin.h
+ Swizzle +
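A plain-C sketch of the INTERLEAVE_BYTES step used by the low-half unpack above, treating the two 64-bit operands as byte arrays; the function name is illustrative, and the memcpy-based view matches the pseudocode on a little-endian target.

#include <stdint.h>
#include <string.h>

/* Interleave the low four bytes of a and b so the result holds
   a0, b0, a1, b1, a2, b2, a3, b3 (least-significant byte first). */
static uint64_t interleave_low_bytes(uint64_t a, uint64_t b)
{
    uint8_t src1[8], src2[8], out[8];
    memcpy(src1, &a, sizeof a);
    memcpy(src2, &b, sizeof b);
    for (int j = 0; j < 4; j++) {
        out[2 * j]     = src1[j];
        out[2 * j + 1] = src2[j];
    }
    uint64_t dst;
    memcpy(&dst, out, sizeof dst);
    return dst;
}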
+ + + + + Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]) { + dst[7:0] := src1[39:32] + dst[15:8] := src2[39:32] + dst[23:16] := src1[47:40] + dst[31:24] := src2[47:40] + dst[39:32] := src1[55:48] + dst[47:40] := src2[55:48] + dst[55:48] := src1[63:56] + dst[63:56] := src2[63:56] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]) { + dst[15:0] := src1[47:32] + dst[31:16] := src2[47:32] + dst[47:32] := src1[63:48] + dst[63:48] := src2[63:48] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". + +dst[31:0] := a[63:32] +dst[63:32] := b[63:32] + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_BYTES(src1[63:0], src2[63:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_WORDS(src1[63:0], src2[63:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". + +dst[31:0] := a[31:0] +dst[63:32] := b[31:0] + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := a[i+7:i] + b[i+7:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := a[i+15:i] + b[i+15:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
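Unlike the plain adds, the saturating forms never wrap: a sum outside the element range is clamped to the nearest representable value. A one-lane C sketch of the unsigned 8-bit case (the intrinsic applies this to all eight lanes at once; the function name is illustrative):

#include <stdint.h>

/* One lane of an unsigned saturating 8-bit add: 250 + 10 yields 255, not 4. */
static uint8_t addus8_lane(uint8_t a, uint8_t b)
{
    unsigned sum = (unsigned)a + (unsigned)b;
    return (uint8_t)(sum > 255u ? 255u : sum);
}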
+ + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := a[i+7:i] - b[i+7:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := a[i+15:i] - b[i+15:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
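A scalar C sketch of one 32-bit lane of the multiply-and-horizontally-add step above; a0/a1 and b0/b1 stand for the two adjacent signed 16-bit elements of each operand (names are illustrative).

#include <stdint.h>

/* One 32-bit lane of the madd operation: widen each 16 x 16 product,
   add the adjacent pair, then truncate to 32 bits. Only the corner case
   where all four inputs are 0x8000 exceeds the 32-bit signed range. */
static int32_t madd_lane(int16_t a0, int16_t a1, int16_t b0, int16_t b1)
{
    int64_t sum = (int64_t)a0 * b0 + (int64_t)a1 * b1;
    return (int32_t)sum;
}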
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[15:0] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := a[i+7:i] + b[i+7:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := a[i+15:i] + b[i+15:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := a[i+7:i] - b[i+7:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := a[i+15:i] - b[i+15:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[15:0] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst". + +IF count[63:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] << count[63:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst". + +IF imm8[7:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] << imm8[7:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst". + +IF count[63:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] >> count[63:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst". + +IF imm8[7:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] >> imm8[7:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
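Note that these shifts clamp rather than mask the count: any count larger than the element width produces all zeros (logical shifts) or a copy of the sign bit (arithmetic shifts). A one-lane C sketch with the clamping made explicit (function names are illustrative; the signed right shift assumes the usual arithmetic behavior of >> on negative values):

#include <stdint.h>

/* Logical left shift of one 16-bit lane; counts above 15 zero the lane. */
static uint16_t slli16_lane(uint16_t a, unsigned count)
{
    return (uint16_t)(count > 15 ? 0 : (a << count));
}

/* Arithmetic right shift of one 16-bit lane; counts above 15 fill the
   lane with copies of the sign bit (0xFFFF or 0x0000). */
static int16_t srai16_lane(int16_t a, unsigned count)
{
    if (count > 15)
        return (int16_t)(a < 0 ? -1 : 0);
    return (int16_t)(a >> count);
}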
+ + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst". + +IF count[63:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] << count[63:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst". + +IF imm8[7:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] << imm8[7:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst". + +IF count[63:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] >> count[63:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst". + +IF imm8[7:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] >> imm8[7:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] AND b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". + +dst[63:0] := ((NOT a[63:0]) AND b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] OR b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] XOR b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] AND b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". + +dst[63:0] := ((NOT a[63:0]) AND b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] OR b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] XOR b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
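The compares return per-element masks of all ones or all zeros rather than a single boolean, which is what makes them useful for branch-free selection. A one-lane C sketch (illustrative helper names):

#include <stdint.h>

/* One 8-bit lane of a signed greater-than compare: all-ones mask on true. */
static uint8_t cmpgt8_lane(int8_t a, int8_t b)
{
    return (uint8_t)(a > b ? 0xFF : 0x00);
}

/* Typical use of such a mask: pick a where the mask is set, else b. */
static uint8_t select_lane(uint8_t mask, uint8_t a, uint8_t b)
{
    return (uint8_t)((mask & a) | (~mask & b));
}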
+ + + + + Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + Return vector of type __m64 with all elements set to zero. + +dst[MAX:0] := 0 + + + MMX +
mmintrin.h
+ Set +
+ + + + + Set packed 32-bit integers in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 + + MMX +
mmintrin.h
+ Set +
+ + + + + + + Set packed 16-bit integers in "dst" with the supplied values. + +dst[15:0] := e0 +dst[31:16] := e1 +dst[47:32] := e2 +dst[63:48] := e3 + + MMX +
mmintrin.h
+ Set +
+ + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values. + +dst[7:0] := e0 +dst[15:8] := e1 +dst[23:16] := e2 +dst[31:24] := e3 +dst[39:32] := e4 +dst[47:40] := e5 +dst[55:48] := e6 +dst[63:56] := e7 + + MMX +
mmintrin.h
+ Set +
+ + + + Broadcast 32-bit integer "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR + + MMX +
mmintrin.h
+ Set +
+ + + + Broadcast 16-bit integer "a" to all elements of "dst". + FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR + + MMX +&#13;
mmintrin.h
+ Set +
+ + + + Broadcast 8-bit integer "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR + + MMX +
mmintrin.h
+ Set +
+ + + + + Set packed 32-bit integers in "dst" with the supplied values in reverse order. + +dst[31:0] := e1 +dst[63:32] := e0 + + MMX +
mmintrin.h
+ Set +
+ + + + + + + Set packed 16-bit integers in "dst" with the supplied values in reverse order. + +dst[15:0] := e3 +dst[31:16] := e2 +dst[47:32] := e1 +dst[63:48] := e0 + + MMX +
mmintrin.h
+ Set +
+ + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values in reverse order. + +dst[7:0] := e7 +dst[15:8] := e6 +dst[23:16] := e5 +dst[31:24] := e4 +dst[39:32] := e3 +dst[47:40] := e2 +dst[55:48] := e1 +dst[63:56] := e0 + + MMX +
mmintrin.h
+ Set +
+ + + + + + + + Arm address monitoring hardware using the address specified in "p". A store to an address within the specified address range triggers the monitoring hardware. Specify optional extensions in "extensions", and optional hints in "hints". + + MONITOR +
pmmintrin.h
+ General Support +
+ + + + + Hint to the processor that it can enter an implementation-dependent-optimized state while waiting for an event or store operation to the address range specified by MONITOR. + + MONITOR +
pmmintrin.h
+ General Support +
+ + + + + + Load 16 bits from memory, perform a byte swap operation, and store the result in "dst". + +FOR j := 0 to 1 + i := j*8 + dst[i+7:i] := MEM[ptr+15-i:ptr+8-i] +ENDFOR + + + MOVBE +
immintrin.h
+ Load +
+ + + + Load 32 bits from memory, perform a byte swap operation, and store the result in "dst". + +FOR j := 0 to 3 + i := j*8 + dst[i+7:i] := MEM[ptr+31-i:ptr+24-i] +ENDFOR + + + MOVBE +
immintrin.h
+ Load +
+ + + + Load 64 bits from memory, perform a byte swap operation, and store the result in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := MEM[ptr+63-i:ptr+56-i] +ENDFOR + + + MOVBE +
immintrin.h
+ Load +
+ + + + + Perform a byte swap operation of the 16 bits in "data", and store the results to memory. + FOR j := 0 to 1 + i := j*8 + MEM[ptr+i+7:ptr+i] := data[15-i:8-i] +ENDFOR + + + MOVBE +&#13;
immintrin.h
+ Store +
+ + + + + Perform a byte swap operation of the 32 bits in "data", and store the results to memory. + FOR j := 0 to 3 + i := j*8 + MEM[ptr+i+7:ptr+i] := data[31-i:24-i] +ENDFOR + + + MOVBE +&#13;
immintrin.h
+ Store +
+ + + + + Perform a byte swap operation of the 64 bits in "data", and store the results to memory. + FOR j := 0 to 7 + i := j*8 + MEM[ptr+i+7:ptr+i] := data[63-i:56-i] +ENDFOR + + + MOVBE +&#13;
immintrin.h
+ Store +
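A portable C sketch of the byte-swapping load described above; the helper name is illustrative and simply mirrors the pseudocode, whereas MOVBE-enabled compilers expose the same operation directly through the immintrin.h intrinsics listed here.

#include <stdint.h>
#include <string.h>

/* Load 32 bits from memory and reverse their byte order, matching the
   FOR loop in the pseudocode. */
static uint32_t loadbe32_sketch(const void *ptr)
{
    uint8_t b[4];
    memcpy(b, ptr, sizeof b);
    return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16) |
           ((uint32_t)b[2] << 8)  |  (uint32_t)b[3];
}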
+ + + + + + + Move 64-byte (512-bit) value using direct store from source memory address "src" to destination memory address "dst". + +MEM[dst+511:dst] := MEM[src+511:src] + + + MOVDIR64B +
immintrin.h
+ Store +
+ + + + + + + Store 64-bit integer from "val" into memory using direct store. + +MEM[dst+63:dst] := val[63:0] + + + MOVDIRI +
immintrin.h
+ Store +
+ + + + + Store 32-bit integer from "val" into memory using direct store. + +MEM[dst+31:dst] := val[31:0] + + + MOVDIRI +
immintrin.h
+ Store +
+ + + + + + + Make a pointer with the value of "srcmem" and bounds set to ["srcmem", "srcmem" + "size" - 1], and store the result in "dst". + dst := srcmem +dst.LB := srcmem.LB +dst.UB := srcmem + size - 1 + + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + + + Narrow the bounds for pointer "q" to the intersection of the bounds of "r" and the bounds ["q", "q" + "size" - 1], and store the result in "dst". + dst := q +IF r.LB > (q + size - 1) OR r.UB < q + dst.LB := 1 + dst.UB := 0 +ELSE + dst.LB := MAX(r.LB, q) + dst.UB := MIN(r.UB, (q + size - 1)) +FI + + MPX +
immintrin.h
+ Miscellaneous + +
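A plain-C sketch of the bound-narrowing rule above, with a hypothetical fat-pointer struct standing in for the MPX bound registers (the struct and function names are illustrative, not an MPX API):

#include <stddef.h>
#include <stdint.h>

/* Illustrative stand-in for a pointer value plus its lower/upper bounds. */
struct bnd_ptr {
    uintptr_t value, lb, ub;
};

/* Narrow r's bounds to their intersection with [q, q + size - 1]; a
   disjoint range yields lb > ub, i.e. bounds that always fail a check. */
static struct bnd_ptr bnd_narrow_sketch(struct bnd_ptr r, uintptr_t q, size_t size)
{
    struct bnd_ptr dst = { q, q, q + size - 1 };
    if (r.lb > q + size - 1 || r.ub < q) {
        dst.lb = 1;
        dst.ub = 0;
    } else {
        dst.lb = (r.lb > q) ? r.lb : q;
        dst.ub = (r.ub < q + size - 1) ? r.ub : q + size - 1;
    }
    return dst;
}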
+ + + + + Make a pointer with the value of "q" and bounds set to the bounds of "r" (e.g. copy the bounds of "r" to pointer "q"), and store the result in "dst". + dst := q +dst.LB := r.LB +dst.UB := r.UB + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + Make a pointer with the value of "q" and open bounds, which allow the pointer to access the entire virtual address space, and store the result in "dst". + dst := q +dst.LB := 0 +dst.UB := 0 + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + + Stores the bounds of "ptr_val" pointer in memory at address "ptr_addr". + MEM[ptr_addr].LB := ptr_val.LB +MEM[ptr_addr].UB := ptr_val.UB + + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + Checks if "q" is within its lower bound, and throws a #BR if not. + IF q < q.LB + #BR +FI + + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + Checks if "q" is within its upper bound, and throws a #BR if not. + IF q > q.UB + #BR +FI + + + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + + Checks if ["q", "q" + "size" - 1] is within the lower and upper bounds of "q" and throws a #BR if not. + IF (q + size - 1) < q.LB OR (q + size - 1) > q.UB + #BR +FI + + + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + Return the lower bound of "q". + dst := q.LB + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + Return the upper bound of "q". + dst := q.UB + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + + Set "dst" to the index of the lowest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined. + +tmp := 0 +IF a == 0 + // dst is undefined +ELSE + DO WHILE ((tmp < 32) AND a[tmp] == 0) + tmp := tmp + 1 + OD +FI +dst := tmp + + +
immintrin.h
+ Bit Manipulation +
+ + + + Set "dst" to the index of the highest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined. + +tmp := 31 +IF a == 0 + // dst is undefined +ELSE + DO WHILE ((tmp > 0) AND a[tmp] == 0) + tmp := tmp - 1 + OD +FI +dst := tmp + + +
immintrin.h
+ Bit Manipulation +
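A C sketch of the two bit scans above; as in the pseudocode, the result is meaningless when the input is zero, so callers are expected to test for zero first (function names are illustrative).

#include <stdint.h>

/* Index of the lowest set bit; falls through to 32 when a == 0. */
static unsigned bit_scan_forward_sketch(uint32_t a)
{
    unsigned tmp = 0;
    while (tmp < 32 && ((a >> tmp) & 1u) == 0)
        tmp++;
    return tmp;
}

/* Index of the highest set bit; falls through to 0 when a == 0. */
static unsigned bit_scan_reverse_sketch(uint32_t a)
{
    unsigned tmp = 31;
    while (tmp > 0 && ((a >> tmp) & 1u) == 0)
        tmp--;
    return tmp;
}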
+ + + + + Set "index" to the index of the lowest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. + +tmp := 0 +IF a == 0 + // MEM[index+31:index] is undefined + dst := 0 +ELSE + DO WHILE ((tmp < 32) AND a[tmp] == 0) + tmp := tmp + 1 + OD + MEM[index+31:index] := tmp + dst := (tmp == 31) ? 0 : 1 +FI + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Set "index" to the index of the highest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. + +tmp := 31 +IF a == 0 + // MEM[index+31:index] is undefined + dst := 0 +ELSE + DO WHILE ((tmp > 0) AND a[tmp] == 0) + tmp := tmp - 1 + OD + MEM[index+31:index] := tmp + dst := (tmp == 0) ? 0 : 1 +FI + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Set "index" to the index of the lowest set bit in 64-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. + tmp := 0 +IF a == 0 + // MEM[index+31:index] is undefined + dst := 0 +ELSE + DO WHILE ((tmp < 64) AND a[tmp] == 0) + tmp := tmp + 1 + OD + MEM[index+31:index] := tmp + dst := (tmp == 63) ? 0 : 1 +FI + +&#13;
immintrin.h
+ Bit Manipulation +
+ + + + + Set "index" to the index of the highest set bit in 64-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. + tmp := 63 +IF a == 0 + // MEM[index+31:index] is undefined + dst := 0 +ELSE + DO WHILE ((tmp > 0) AND a[tmp] == 0) + tmp := tmp - 1 + OD + MEM[index+31:index] := tmp + dst := (tmp == 0) ? 0 : 1 +FI + +&#13;
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 32-bit integer "a". + +addr := a + ZeroExtend64(b) +dst[0] := MEM[addr] + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 32-bit integer "a", and set that bit to its complement. + +addr := a + ZeroExtend64(b) +dst[0] := MEM[addr] +MEM[addr] := ~dst[0] + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 32-bit integer "a", and set that bit to zero. + +addr := a + ZeroExtend64(b) +dst[0] := MEM[addr] +MEM[addr] := 0 + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 32-bit integer "a", and set that bit to one. + +addr := a + ZeroExtend64(b) +dst[0] := MEM[addr] +MEM[addr] := 1 + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 64-bit integer "a". + +addr := a + b +dst[0] := MEM[addr] + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 64-bit integer "a", and set that bit to its complement. + +addr := a + b +dst[0] := MEM[addr] +MEM[addr] := ~dst[0] + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 64-bit integer "a", and set that bit to zero. + +addr := a + b +dst[0] := MEM[addr] +MEM[addr] := 0 + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 64-bit integer "a", and set that bit to one. + +addr := a + b +dst[0] := MEM[addr] +MEM[addr] := 1 + + +
immintrin.h
+ Bit Manipulation +
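The bit-test operations address a bit string through a base pointer plus a bit index. A C sketch of the test-and-set form for the common case where the index stays within one 32-bit word (the helper name is illustrative, and no atomicity is implied):

#include <stdint.h>

/* Return bit b of the 32-bit word at base, then set that bit to one.
   Non-atomic; the bit index is assumed to be in [0, 31]. */
static int bittestandset_sketch(uint32_t *base, unsigned b)
{
    uint32_t mask = 1u << (b & 31u);
    int old = (*base & mask) != 0;
    *base |= mask;
    return old;
}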
+ + + + Reverse the byte order of 32-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values. + +dst[7:0] := a[31:24] +dst[15:8] := a[23:16] +dst[23:16] := a[15:8] +dst[31:24] := a[7:0] + + +
immintrin.h
+ Bit Manipulation +
+ + + + Reverse the byte order of 64-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values. + +dst[7:0] := a[63:56] +dst[15:8] := a[55:48] +dst[23:16] := a[47:40] +dst[31:24] := a[39:32] +dst[39:32] := a[31:24] +dst[47:40] := a[23:16] +dst[55:48] := a[15:8] +dst[63:56] := a[7:0] + + +
immintrin.h
+ Bit Manipulation +
+ + + + Cast from type float to type unsigned __int32 without conversion. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +
immintrin.h
+ Cast +
+ + + + Cast from type double to type unsigned __int64 without conversion. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +
immintrin.h
+ Cast +
+ + + + Cast from type unsigned __int32 to type float without conversion. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +
immintrin.h
+ Cast +
+ + + + Cast from type unsigned __int64 to type double without conversion. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +
immintrin.h
+ Cast +
+ + + + + Shift the bits of unsigned long integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". + // size := 32 or 64 +dst := a +count := shift AND (size - 1) +DO WHILE (count > 0) + tmp[0] := dst[size - 1] + dst := (dst << 1) OR tmp[0] + count := count - 1 +OD + + + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned long integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". + // size := 32 or 64 +dst := a +count := shift AND (size - 1) +DO WHILE (count > 0) + tmp[size - 1] := dst[0] + dst := (dst >> 1) OR tmp[size - 1] + count := count - 1 +OD + + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 32-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". + +dst := a +count := shift AND 31 +DO WHILE (count > 0) + tmp[0] := dst[31] + dst := (dst << 1) OR tmp[0] + count := count - 1 +OD + + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 32-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". + +dst := a +count := shift AND 31 +DO WHILE (count > 0) + tmp[31] := dst[0] + dst := (dst >> 1) OR tmp + count := count - 1 +OD + + +
immintrin.h
+ Shift +
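A C sketch of a 32-bit rotate-left with the count reduced modulo the operand width, matching the AND-with-31 step in the pseudocode (the helper name is illustrative):

#include <stdint.h>

/* Rotate a left by shift bits; the count is taken modulo 32. */
static uint32_t rotl32_sketch(uint32_t a, unsigned shift)
{
    unsigned count = shift & 31u;
    if (count == 0)
        return a;                      /* avoids the undefined a >> 32 */
    return (a << count) | (a >> (32u - count));
}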
+ + + + + Shift the bits of unsigned 16-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". + +dst := a +count := shift AND 15 +DO WHILE (count > 0) + tmp[0] := dst[15] + dst := (dst << 1) OR tmp[0] + count := count - 1 +OD + + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 16-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". + +dst := a +count := shift AND 15 +DO WHILE (count > 0) + tmp[15] := dst[0] + dst := (dst >> 1) OR tmp + count := count - 1 +OD + + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 64-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". + +dst := a +count := shift AND 63 +DO WHILE (count > 0) + tmp[0] := dst[63] + dst := (dst << 1) OR tmp[0] + count := count - 1 +OD + + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 64-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". + +dst := a +count := shift AND 63 +DO WHILE (count > 0) + tmp[63] := dst[0] + dst := (dst >> 1) OR tmp[63] + count := count - 1 +OD + + +
immintrin.h
+ Shift +
+ + + + Treat the processor-specific feature(s) specified in "a" as available. Multiple features may be OR'd together. See the valid feature flags below: + +_FEATURE_GENERIC_IA32 +_FEATURE_FPU +_FEATURE_CMOV +_FEATURE_MMX +_FEATURE_FXSAVE +_FEATURE_SSE +_FEATURE_SSE2 +_FEATURE_SSE3 +_FEATURE_SSSE3 +_FEATURE_SSE4_1 +_FEATURE_SSE4_2 +_FEATURE_MOVBE +_FEATURE_POPCNT +_FEATURE_PCLMULQDQ +_FEATURE_AES +_FEATURE_F16C +_FEATURE_AVX +_FEATURE_RDRND +_FEATURE_FMA +_FEATURE_BMI +_FEATURE_LZCNT +_FEATURE_HLE +_FEATURE_RTM +_FEATURE_AVX2 +_FEATURE_KNCNI +_FEATURE_AVX512F +_FEATURE_ADX +_FEATURE_RDSEED +_FEATURE_AVX512ER +_FEATURE_AVX512PF +_FEATURE_AVX512CD +_FEATURE_SHA +_FEATURE_MPX +_FEATURE_AVX512BW +_FEATURE_AVX512VL +_FEATURE_AVX512VBMI +_FEATURE_AVX512_4FMAPS +_FEATURE_AVX512_4VNNIW +_FEATURE_AVX512_VPOPCNTDQ +_FEATURE_AVX512_BITALG +_FEATURE_AVX512_VBMI2 +_FEATURE_GFNI +_FEATURE_VAES +_FEATURE_VPCLMULQDQ +_FEATURE_AVX512_VNNI +_FEATURE_CLWB +_FEATURE_RDPID +_FEATURE_IBT +_FEATURE_SHSTK +_FEATURE_SGX +_FEATURE_WBNOINVD +_FEATURE_PCONFIG +_FEATURE_AXV512_4VNNIB +_FEATURE_AXV512_4FMAPH +_FEATURE_AXV512_BITALG2 +_FEATURE_AXV512_VP2INTERSECT + +
immintrin.h
+ General Support +
+ + + + Dynamically query the processor to determine if the processor-specific feature(s) specified in "a" are available, and return true or false (1 or 0) if the set of features is available. Multiple features may be OR'd together. This function is limited to bitmask values in the first 'page' of the libirc cpu-id information. This intrinsic does not check the processor vendor. See the valid feature flags below: + +_FEATURE_GENERIC_IA32 +_FEATURE_FPU +_FEATURE_CMOV +_FEATURE_MMX +_FEATURE_FXSAVE +_FEATURE_SSE +_FEATURE_SSE2 +_FEATURE_SSE3 +_FEATURE_SSSE3 +_FEATURE_SSE4_1 +_FEATURE_SSE4_2 +_FEATURE_MOVBE +_FEATURE_POPCNT +_FEATURE_PCLMULQDQ +_FEATURE_AES +_FEATURE_F16C +_FEATURE_AVX +_FEATURE_RDRND +_FEATURE_FMA +_FEATURE_BMI +_FEATURE_LZCNT +_FEATURE_HLE +_FEATURE_RTM +_FEATURE_AVX2 +_FEATURE_KNCNI +_FEATURE_AVX512F +_FEATURE_ADX +_FEATURE_RDSEED +_FEATURE_AVX512ER +_FEATURE_AVX512PF +_FEATURE_AVX512CD +_FEATURE_SHA +_FEATURE_MPX +_FEATURE_AVX512BW +_FEATURE_AVX512VL +_FEATURE_AVX512VBMI +_FEATURE_AVX512_4FMAPS +_FEATURE_AVX512_4VNNIW +_FEATURE_AVX512_VPOPCNTDQ +_FEATURE_AVX512_BITALG +_FEATURE_AVX512_VBMI2 +_FEATURE_GFNI +_FEATURE_VAES +_FEATURE_VPCLMULQDQ +_FEATURE_AVX512_VNNI +_FEATURE_CLWB +_FEATURE_RDPID +_FEATURE_IBT +_FEATURE_SHSTK +_FEATURE_SGX +_FEATURE_WBNOINVD +_FEATURE_PCONFIG +_FEATURE_AXV512_4VNNIB +_FEATURE_AXV512_4FMAPH +_FEATURE_AXV512_BITALG2 +_FEATURE_AXV512_VP2INTERSECT +_FEATURE_AXV512_FP16 + +
immintrin.h
+ General Support +
+ + + + + Dynamically query the processor to determine if the processor-specific feature(s) specified in "a" are available, and return true or false (1 or 0) if the set of features is available. Multiple features may be OR'd together. This works identically to the previous variant, except it also accepts a 'page' index that permits checking features on the 2nd page of the libirc information. When provided with a '0' in the 'page' parameter, this works identically to _may_i_use_cpu_feature. This intrinsic does not check the processor vendor. See the valid feature flags on the 2nd page below: (provided with a '1' in the 'page' parameter) + +_FEATURE_CLDEMOTE +_FEATURE_MOVDIRI +_FEATURE_MOVDIR64B +_FEATURE_WAITPKG +_FEATURE_AVX512_Bf16 +_FEATURE_ENQCMD +_FEATURE_AVX_VNNI +_FEATURE_AMX_TILE +_FEATURE_AMX_INT8 +_FEATURE_AMX_BF16 +_FEATURE_KL +_FEATURE_WIDE_KL +_FEATURE_HRESET +_FEATURE_UINTR +_FEATURE_PREFETCHI +_FEATURE_AVXVNNIINT8 +_FEATURE_CMPCCXADD +_FEATURE_AVXIFMA +_FEATURE_AVXNECONVERT +_FEATURE_RAOINT +_FEATURE_AMX_FP16 +_FEATURE_AMX_COMPLEX +_FEATURE_SHA512 +_FEATURE_SM3 +_FEATURE_SM4 +_FEATURE_AVXVNNIINT16 +_FEATURE_USERMSR +_FEATURE_AVX10_1_256 +_FEATURE_AVX10_1_512 +_FEATURE_APXF +_FEATURE_MSRLIST +_FEATURE_WRMSRNS +_FEATURE_PBNDKB + +
immintrin.h
+ General Support +
+ + + + Dynamically query the processor to determine if the processor-specific feature(s) specified by a series of compile-time string literals in "feature, ..." are available, and return true or false (1 or 0) if the set of features is available. These feature names are converted to a bitmask and use the same infrastructure as _may_i_use_cpu_feature_ext to validate it. The behavior is the same as in the previous variants. This intrinsic does not check the processor vendor. Supported string literals correspond one-to-one to the entries in the "Operation" sections of _may_i_use_cpu_feature and _may_i_use_cpu_feature_ext. Example string literals are "avx2", "bmi", "avx512fp16", "amx-int8"... + +&#13;
immintrin.h
+ General Support +
+ + + + Read the Performance Monitor Counter (PMC) specified by "a", and store up to 64-bits in "dst". The width of performance counters is implementation specific. + dst[63:0] := ReadPMC(a) + + +
immintrin.h
+ General Support +
+ + + + + + + Add unsigned 32-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry flag), and store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag). + +tmp[32:0] := a[31:0] + b[31:0] + (c_in > 0 ? 1 : 0) +MEM[out+31:out] := tmp[31:0] +dst[0] := tmp[32] +dst[7:1] := 0 + + +
immintrin.h
+ Arithmetic +
+ + + + + + + Add unsigned 64-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry flag), and store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag). + +tmp[64:0] := a[63:0] + b[63:0] + (c_in > 0 ? 1 : 0) +MEM[out+63:out] := tmp[63:0] +dst[0] := tmp[64] +dst[7:1] := 0 + + +
immintrin.h
+ Arithmetic +
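The carry-out is what makes these steps chain into arbitrarily wide additions. A hedged C sketch that adds two 128-bit values held as low/high 64-bit limbs, propagating the carry the same way chained add-with-carry steps would (the function name is illustrative):

#include <stdint.h>

/* Add two 128-bit numbers given as (lo, hi) limb pairs; returns the
   final carry-out and writes the 128-bit sum through out_lo/out_hi. */
static unsigned add128_sketch(uint64_t a_lo, uint64_t a_hi,
                              uint64_t b_lo, uint64_t b_hi,
                              uint64_t *out_lo, uint64_t *out_hi)
{
    uint64_t lo = a_lo + b_lo;
    unsigned carry = (lo < a_lo);              /* carry out of the low limb */
    uint64_t hi = a_hi + b_hi + carry;
    unsigned carry_out = (hi < a_hi) || (carry && hi == a_hi);
    *out_lo = lo;
    *out_hi = hi;
    return carry_out;
}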
+ + + + + + + Add unsigned 8-bit borrow "c_in" (carry flag) to unsigned 32-bit integer "b", and subtract the result from unsigned 32-bit integer "a". Store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag). + +tmp[32:0] := a[31:0] - (b[31:0] + (c_in > 0 ? 1 : 0)) +MEM[out+31:out] := tmp[31:0] +dst[0] := tmp[32] +dst[7:1] := 0 + + +
immintrin.h
+ Arithmetic +
+ + + + + + + Add unsigned 8-bit borrow "c_in" (carry flag) to unsigned 64-bit integer "b", and subtract the result from unsigned 64-bit integer "a". Store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag). + +tmp[64:0] := a[63:0] - (b[63:0] + (c_in > 0 ? 1 : 0)) +MEM[out+63:out] := tmp[63:0] +dst[0] := tmp[64] +dst[7:1] := 0 + + +
immintrin.h
+ Arithmetic +
+ + + + Insert the 32-bit data from "a" into a Processor Trace stream via a PTW packet. The PTW packet will be inserted if tracing is currently enabled and ptwrite is currently enabled. The current IP will also be inserted via a FUP packet if FUPonPTW is enabled. + +
immintrin.h
+ Miscellaneous +
+ + + + Insert the 64-bit data from "a" into a Processor Trace stream via a PTW packet. The PTW packet will be inserted if tracing is currently enabled and ptwrite is currently enabled. The current IP will also be inserted via a FUP packet if FUPonPTW is enabled. + +
immintrin.h
+ Miscellaneous +
+ + + + + Invoke the Intel SGX enclave user (non-privileged) leaf function specified by "a", and return the error code. The "__data" array contains 3 32- or 64-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. + +&#13;
immintrin.h
+ Miscellaneous +
+ + + + + Invoke the Intel SGX enclave system (privileged) leaf function specified by "a", and return the error code. The "__data" array contains 3 32- or 64-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. + +
immintrin.h
+ Miscellaneous +
+ + + + + Invoke the Intel SGX enclave virtualized (VMM) leaf function specified by "a", and return the error code. The "__data" array contains 3 32- or 64-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. + +
immintrin.h
+ Miscellaneous +
+ + + + Write back and flush internal caches. + Initiate writing-back and flushing of external + caches. + +
immintrin.h
+ Miscellaneous +
+ + + + Convert the half-precision (16-bit) floating-point value "a" to a single-precision (32-bit) floating-point value, and store the result in "dst". + +dst[31:0] := Convert_FP16_To_FP32(a[15:0]) + +
emmintrin.h
+ Convert +
+ + + + + Convert the single-precision (32-bit) floating-point value "a" to a half-precision (16-bit) floating-point value, and store the result in "dst". + [round_note] + +dst[15:0] := Convert_FP32_To_FP16(a[31:0]) + +
emmintrin.h
+ Convert +
+ + + + + + + Perform a carry-less multiplication of two 64-bit integers, selected from "a" and "b" according to "imm8", and store the results in "dst". + +IF (imm8[0] == 0) + TEMP1 := a[63:0] +ELSE + TEMP1 := a[127:64] +FI +IF (imm8[4] == 0) + TEMP2 := b[63:0] +ELSE + TEMP2 := b[127:64] +FI +FOR i := 0 to 63 + TEMP[i] := (TEMP1[0] and TEMP2[i]) + FOR j := 1 to i + TEMP[i] := TEMP[i] XOR (TEMP1[j] AND TEMP2[i-j]) + ENDFOR + dst[i] := TEMP[i] +ENDFOR +FOR i := 64 to 127 + TEMP[i] := 0 + FOR j := (i - 63) to 63 + TEMP[i] := TEMP[i] XOR (TEMP1[j] AND TEMP2[i-j]) + ENDFOR + dst[i] := TEMP[i] +ENDFOR +dst[127] := 0 + + + PCLMULQDQ +
wmmintrin.h
+ Application-Targeted +
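A scalar C reference sketch of the carry-less (GF(2) polynomial) multiplication performed by the nested loops above, returning the 128-bit product as two 64-bit halves; this only illustrates the dataflow and is not a substitute for the instruction (the function name is illustrative).

#include <stdint.h>

/* Carry-less multiply: XOR-accumulate shifted copies of a wherever b has
   a set bit. The product is (hi:lo); bit 127 is always zero. */
static void clmul64_sketch(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
    uint64_t r_lo = 0, r_hi = 0;
    for (unsigned i = 0; i < 64; i++) {
        if ((b >> i) & 1u) {
            r_lo ^= a << i;
            if (i != 0)
                r_hi ^= a >> (64u - i);
        }
    }
    *lo = r_lo;
    *hi = r_hi;
}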
+ + + + + + + Invoke the PCONFIG leaf function specified by "a". The "__data" array contains 3 32- or 64-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. May return the value in eax, depending on the semantics of the specified leaf function. + + PCONFIG +
immintrin.h
+ Miscellaneous +
+ + + + + + Count the number of bits set to 1 in unsigned 32-bit integer "a", and return that count in "dst". + +dst := 0 +FOR i := 0 to 31 + IF a[i] + dst := dst + 1 + FI +ENDFOR + + + POPCNT +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of bits set to 1 in unsigned 64-bit integer "a", and return that count in "dst". + +dst := 0 +FOR i := 0 to 63 + IF a[i] + dst := dst + 1 + FI +ENDFOR + + + POPCNT +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of bits set to 1 in 32-bit integer "a", and return that count in "dst". + +dst := 0 +FOR i := 0 to 31 + IF a[i] + dst := dst + 1 + FI +ENDFOR + + + POPCNT +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of bits set to 1 in 64-bit integer "a", and return that count in "dst". + +dst := 0 +FOR i := 0 to 63 + IF a[i] + dst := dst + 1 + FI +ENDFOR + + + POPCNT +
immintrin.h
+ Bit Manipulation +
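A C sketch of the population-count loop above; the helper name is illustrative, and POPCNT-capable toolchains expose the same operation directly through the immintrin.h intrinsics listed here.

#include <stdint.h>

/* Count the bits set in a, one bit per iteration as in the pseudocode. */
static unsigned popcnt32_sketch(uint32_t a)
{
    unsigned dst = 0;
    for (unsigned i = 0; i < 32; i++)
        dst += (a >> i) & 1u;
    return dst;
}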
+ + + + + + Loads an instruction sequence containing the specified memory address into all levels of the cache. + + PREFETCHI +&#13;
x86gprintrin.h
+ General Support +
+ + + + Loads an instruction sequence containing the specified memory address into all but the first-level cache. + + PREFETCHI +
x86gprintrin.h
+ General Support +
+ + + + + Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i", which can be one of:<ul> + <li>_MM_HINT_ET0 // 7, move data using the ET0 hint. The PREFETCHW instruction will be generated.</li> + <li>_MM_HINT_T0 // 3, move data using the T0 hint. The PREFETCHT0 instruction will be generated.</li> + <li>_MM_HINT_T1 // 2, move data using the T1 hint. The PREFETCHT1 instruction will be generated.</li> + <li>_MM_HINT_T2 // 1, move data using the T2 hint. The PREFETCHT2 instruction will be generated.</li> + <li>_MM_HINT_NTA // 0, move data using the non-temporal access (NTA) hint. The PREFETCHNTA instruction will be generated.</li> + + + + + + + PRFCHW +
immintrin.h
+ General Support +
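A minimal sketch of the data prefetch above. _MM_HINT_T0 and _MM_HINT_NTA are broadly available, while _MM_HINT_ET0 additionally requires PRFCHW support, so the sketch sticks to T0; the 16-float stride (one 64-byte line) is an illustrative choice.

#include <immintrin.h>

void warm_cache(const float *data, int n) {
    for (int i = 0; i < n; i += 16)
        _mm_prefetch((const char *)&data[i], _MM_HINT_T0);
}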
+ + + + + Atomically add a 32-bit value at memory operand "__A" and a 32-bit "__B", and store the result to the same memory location. + + +MEM[__A+31:__A] := MEM[__A+31:__A] + __B[31:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
+ + + Atomically add a 64-bit value at memory operand "__A" and a 64-bit "__B", and store the result to the same memory location. + + +MEM[__A+63:__A] := MEM[__A+63:__A] + __B[63:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
+ + + Atomically and a 32-bit value at memory operand "__A" and a 32-bit "__B", and store the result to the same memory location. + + +MEM[__A+31:__A] := MEM[__A+31:__A] AND __B[31:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
+ + + Atomically and a 64-bit value at memory operand "__A" and a 64-bit "__B", and store the result to the same memory location. + + +MEM[__A+63:__A] := MEM[__A+63:__A] AND __B[63:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
+ + + Atomically or a 32-bit value at memory operand "__A" and a 32-bit "__B", and store the result to the same memory location. + + +MEM[__A+31:__A] := MEM[__A+31:__A] OR __B[31:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
+ + + Atomically or a 64-bit value at memory operand "__A" and a 64-bit "__B", and store the result to the same memory location. + + +MEM[__A+63:__A] := MEM[__A+63:__A] OR __B[63:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
+ + + Atomically xor a 32-bit value at memory operand "__A" and a 32-bit "__B", and store the result to the same memory location. + + +MEM[__A+31:__A] := MEM[__A+31:__A] XOR __B[31:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
+ + + Atomically xor a 64-bit value at memory operand "__A" and a 64-bit "__B", and store the result to the same memory location. + + +MEM[__A+63:__A] := MEM[__A+63:__A] XOR __B[63:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
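A hedged sketch of the RAO-INT atomic add above. The _aadd_i32 name and its presence in x86gprintrin.h are assumptions that hold only for very recent compilers, and the instruction itself exists only on RAO-INT hardware.

#include <x86gprintrin.h>

void bump(int *counter, int delta) {
    /* MEM[counter] := MEM[counter] + delta, performed atomically by the hardware */
    _aadd_i32(counter, delta);
}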
+ + + + Copy the IA32_TSC_AUX MSR (signature value) into "dst". + dst[31:0] := IA32_TSC_AUX[31:0] + + + RDPID +
immintrin.h
+ General Support +
+ + + + + + Read a hardware generated 16-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_RND_GEN.ready == 1 + val[15:0] := HW_RND_GEN.data + dst := 1 +ELSE + val[15:0] := 0 + dst := 0 +FI + + + RDRAND +
immintrin.h
+ Random +
+ + + + Read a hardware generated 32-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_RND_GEN.ready == 1 + val[31:0] := HW_RND_GEN.data + dst := 1 +ELSE + val[31:0] := 0 + dst := 0 +FI + + + RDRAND +
immintrin.h
+ Random +
+ + + + Read a hardware generated 64-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_RND_GEN.ready == 1 + val[63:0] := HW_RND_GEN.data + dst := 1 +ELSE + val[63:0] := 0 + dst := 0 +FI + + + RDRAND +
immintrin.h
+ Random +
+ + + + + + Read a 16-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_NRND_GEN.ready == 1 + val[15:0] := HW_NRND_GEN.data + dst := 1 +ELSE + val[15:0] := 0 + dst := 0 +FI + + + RDSEED +
immintrin.h
+ Random +
+ + + + Read a 32-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_NRND_GEN.ready == 1 + val[31:0] := HW_NRND_GEN.data + dst := 1 +ELSE + val[31:0] := 0 + dst := 0 +FI + + + RDSEED +
immintrin.h
+ Random +
+ + + + Read a 64-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_NRND_GEN.ready == 1 + val[63:0] := HW_NRND_GEN.data + dst := 1 +ELSE + val[63:0] := 0 + dst := 0 +FI + + + RDSEED +
immintrin.h
+ Random +
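A minimal sketch of the retry pattern implied by the status returns above: both RDRAND and RDSEED report 0 when no value was ready, so callers loop. It assumes -mrdrnd/-mrdseed style support; the retry count is arbitrary.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    unsigned int r = 0, s = 0;
    for (int tries = 0; tries < 10; ++tries) {
        if (_rdrand32_step(&r)) {          /* returns 1 on success, 0 if not ready */
            printf("rdrand: %u\n", r);
            break;
        }
    }
    if (_rdseed32_step(&s))                /* same status convention for RDSEED */
        printf("rdseed: %u\n", s);
    return 0;
}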
+ + + + + + Copy the current 64-bit value of the processor's time-stamp counter into "dst", and store the IA32_TSC_AUX MSR (signature value) into memory at "mem_addr". + dst[63:0] := TimeStampCounter +MEM[mem_addr+31:mem_addr] := IA32_TSC_AUX[31:0] + + + RDTSCP +
immintrin.h
+ General Support +
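A minimal sketch of the RDTSCP read above; it assumes the toolchain exposes __rdtscp through immintrin.h (recent GCC/Clang) or intrin.h (MSVC).

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    unsigned int aux = 0;
    unsigned long long tsc = __rdtscp(&aux);  /* aux receives IA32_TSC_AUX */
    printf("tsc=%llu aux=%u\n", tsc, aux);
    return 0;
}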
+ + + + + + Force an RTM abort. The EAX register is updated to reflect that an XABORT instruction caused the abort, and the "imm8" parameter will be provided in bits [31:24] of EAX. + Following an RTM abort, the logical processor resumes execution at the fallback address computed through the outermost XBEGIN instruction. + IF RTM_ACTIVE == 0 + // nop +ELSE + // restore architectural register state + // discard memory updates performed in transaction + // update EAX with status and imm8 value + eax[31:24] := imm8[7:0] + RTM_NEST_COUNT := 0 + RTM_ACTIVE := 0 + IF _64_BIT_MODE + RIP := fallbackRIP + ELSE + EIP := fallbackEIP + FI +FI + + + RTM +
immintrin.h
+ General Support +
+ + + + Specify the start of an RTM code region. + If the logical processor was not already in transactional execution, then this call causes the logical processor to transition into transactional execution. + On an RTM abort, the logical processor discards all architectural register and memory updates performed during the RTM execution, restores architectural state, and starts execution beginning at the fallback address computed from the outermost XBEGIN instruction. Return status of ~0 (0xFFFFFFFF) if continuing inside the transaction; all other codes are aborts. + IF RTM_NEST_COUNT < MAX_RTM_NEST_COUNT + RTM_NEST_COUNT := RTM_NEST_COUNT + 1 + IF RTM_NEST_COUNT == 1 + IF _64_BIT_MODE + fallbackRIP := RIP + ELSE IF _32_BIT_MODE + fallbackEIP := EIP + FI + + RTM_ACTIVE := 1 + // enter RTM execution, record register state, start tracking memory state + FI +ELSE + // RTM abort (see _xabort) +FI + + + RTM +
immintrin.h
+ General Support +
+ + + + Specify the end of an RTM code region. + If this corresponds to the outermost scope, the logical processor will attempt to commit the logical processor state atomically. + If the commit fails, the logical processor will perform an RTM abort. + IF RTM_ACTIVE == 1 + RTM_NEST_COUNT := RTM_NEST_COUNT - 1 + IF RTM_NEST_COUNT == 0 + // try to commit transaction + IF FAIL_TO_COMMIT_TRANSACTION + // RTM abort (see _xabort) + ELSE + RTM_ACTIVE := 0 + FI + FI +FI + + + RTM +
immintrin.h
+ General Support +
+ + + + Query the transactional execution status, return 1 if inside a transactionally executing RTM or HLE region, and return 0 otherwise. + IF (RTM_ACTIVE == 1 OR HLE_ACTIVE == 1) + dst := 1 +ELSE + dst := 0 +FI + + + RTM +
immintrin.h
+ General Support +
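A minimal sketch of the transactional pattern implied by the _xbegin/_xend descriptions above: _xbegin returns _XBEGIN_STARTED (~0) when the transaction starts, and any other value is the abort status delivered at the fallback point. It assumes -mrtm support; the fallback shown (a GCC/Clang atomic builtin) stands in for whatever locking scheme the caller already uses.

#include <immintrin.h>

static int counter;

void increment_transactionally(void) {
    unsigned int status = _xbegin();
    if (status == _XBEGIN_STARTED) {
        counter++;            /* memory update tracked by the transaction */
        _xend();              /* commit at the outermost region */
    } else {
        /* aborted: fall back to a non-transactional atomic update */
        __atomic_fetch_add(&counter, 1, __ATOMIC_SEQ_CST);
    }
}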
+ + + + + Serialize instruction execution, ensuring all modifications to flags, registers, and memory by previous instructions are completed before the next instruction is fetched. + + SERIALIZE +
immintrin.h
+ General Support +
+ + + + + + + Perform an intermediate calculation for the next four SHA1 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst". + +W0 := a[127:96] +W1 := a[95:64] +W2 := a[63:32] +W3 := a[31:0] +W4 := b[127:96] +W5 := b[95:64] +dst[127:96] := W2 XOR W0 +dst[95:64] := W3 XOR W1 +dst[63:32] := W4 XOR W2 +dst[31:0] := W5 XOR W3 + + + SHA +
immintrin.h
+ Cryptography +
+ + + + + Perform the final calculation for the next four SHA1 message values (unsigned 32-bit integers) using the intermediate result in "a" and the previous message values in "b", and store the result in "dst". + +W13 := b[95:64] +W14 := b[63:32] +W15 := b[31:0] +W16 := (a[127:96] XOR W13) <<< 1 +W17 := (a[95:64] XOR W14) <<< 1 +W18 := (a[63:32] XOR W15) <<< 1 +W19 := (a[31:0] XOR W16) <<< 1 +dst[127:96] := W16 +dst[95:64] := W17 +dst[63:32] := W18 +dst[31:0] := W19 + + + SHA +
immintrin.h
+ Cryptography +
+ + + + + Calculate SHA1 state variable E after four rounds of operation from the current SHA1 state variable "a", add that value to the scheduled values (unsigned 32-bit integers) in "b", and store the result in "dst". + +tmp := (a[127:96] <<< 30) +dst[127:96] := b[127:96] + tmp +dst[95:64] := b[95:64] +dst[63:32] := b[63:32] +dst[31:0] := b[31:0] + + + SHA +
immintrin.h
+ Cryptography +
+ + + + + + Perform four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D) from "a" and some pre-computed sum of the next 4 round message values (unsigned 32-bit integers), and state variable E from "b", and store the updated SHA1 state (A,B,C,D) in "dst". "func" contains the logic functions and round constants. + IF (func[1:0] == 0) + f := f0() + K := K0 +ELSE IF (func[1:0] == 1) + f := f1() + K := K1 +ELSE IF (func[1:0] == 2) + f := f2() + K := K2 +ELSE IF (func[1:0] == 3) + f := f3() + K := K3 +FI +A := a[127:96] +B := a[95:64] +C := a[63:32] +D := a[31:0] +W[0] := b[127:96] +W[1] := b[95:64] +W[2] := b[63:32] +W[3] := b[31:0] +A[1] := f(B, C, D) + (A <<< 5) + W[0] + K +B[1] := A +C[1] := B <<< 30 +D[1] := C +E[1] := D +FOR i := 1 to 3 + A[i+1] := f(B[i], C[i], D[i]) + (A[i] <<< 5) + W[i] + E[i] + K + B[i+1] := A[i] + C[i+1] := B[i] <<< 30 + D[i+1] := C[i] + E[i+1] := D[i] +ENDFOR +dst[127:96] := A[4] +dst[95:64] := B[4] +dst[63:32] := C[4] +dst[31:0] := D[4] + + + SHA +
immintrin.h
+ Cryptography +
+ + + + + Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst". + W4 := b[31:0] +W3 := a[127:96] +W2 := a[95:64] +W1 := a[63:32] +W0 := a[31:0] +dst[127:96] := W3 + sigma0(W4) +dst[95:64] := W2 + sigma0(W3) +dst[63:32] := W1 + sigma0(W2) +dst[31:0] := W0 + sigma0(W1) + + + SHA +
immintrin.h
+ Cryptography +
+ + + + + Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst". + W14 := b[95:64] +W15 := b[127:96] +W16 := a[31:0] + sigma1(W14) +W17 := a[63:32] + sigma1(W15) +W18 := a[95:64] + sigma1(W16) +W19 := a[127:96] + sigma1(W17) +dst[127:96] := W19 +dst[95:64] := W18 +dst[63:32] := W17 +dst[31:0] := W16 + + + SHA +
immintrin.h
+ Cryptography +
+ + + + + + Perform 2 rounds of SHA256 operation using an initial SHA256 state (C,D,G,H) from "a", an initial SHA256 state (A,B,E,F) from "b", and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers) and the corresponding round constants from "k", and store the updated SHA256 state (A,B,E,F) in "dst". + A[0] := b[127:96] +B[0] := b[95:64] +C[0] := a[127:96] +D[0] := a[95:64] +E[0] := b[63:32] +F[0] := b[31:0] +G[0] := a[63:32] +H[0] := a[31:0] +W_K[0] := k[31:0] +W_K[1] := k[63:32] +FOR i := 0 to 1 + A[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i]) + B[i+1] := A[i] + C[i+1] := B[i] + D[i+1] := C[i] + E[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + D[i] + F[i+1] := E[i] + G[i+1] := F[i] + H[i+1] := G[i] +ENDFOR +dst[127:96] := A[2] +dst[95:64] := B[2] +dst[63:32] := E[2] +dst[31:0] := F[2] + + + SHA +
immintrin.h
+ Cryptography +
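A hedged sketch that chains the two SHA256 message-scheduling intrinsics above to extend the schedule by four words; this is only a fragment of a full SHA-256 compression step, not a complete implementation. It assumes SHA-NI plus SSSE3 support (-msha -mssse3), and the vector layout (four schedule words per __m128i) follows the common SHA-NI implementations.

#include <immintrin.h>

__m128i extend_schedule(__m128i w0_3, __m128i w4_7, __m128i w8_11, __m128i w12_15) {
    __m128i t = _mm_sha256msg1_epu32(w0_3, w4_7);             /* intermediate: W + sigma0 terms */
    t = _mm_add_epi32(t, _mm_alignr_epi8(w12_15, w8_11, 4));  /* fold in neighbouring schedule words */
    return _mm_sha256msg2_epu32(t, w12_15);                   /* final step: add the sigma1 terms */
}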
+ + + + + This intrinsic is one of the two SHA512 message scheduling instructions. The intrinsic performs an intermediate calculation for the next four SHA512 message qwords. The calculated results are stored in "dst". + + +DEFINE ROR64(qword, n) { + count := n % 64 + dest := (qword >> count) | (qword << (64 - count)) + RETURN dest +} +DEFINE SHR64(qword, n) { + RETURN qword >> n +} +DEFINE s0(qword) { + RETURN ROR64(qword,1) ^ ROR64(qword, 8) ^ SHR64(qword, 7) +} +W.qword[4] := __B.qword[0] +W.qword[3] := __A.qword[3] +W.qword[2] := __A.qword[2] +W.qword[1] := __A.qword[1] +W.qword[0] := __A.qword[0] +dst.qword[3] := W.qword[3] + s0(W.qword[4]) +dst.qword[2] := W.qword[2] + s0(W.qword[3]) +dst.qword[1] := W.qword[1] + s0(W.qword[2]) +dst.qword[0] := W.qword[0] + s0(W.qword[1]) + + + + SHA512 + AVX +
immintrin.h
+ Cryptography +
+ + + This intrinsic is one of the two SHA512 message scheduling instructions. The intrinsic performs the final calculation for the next four SHA512 message qwords. The calculated results are stored in "dst". + + +DEFINE ROR64(qword, n) { + count := n % 64 + dest := (qword >> count) | (qword << (64 - count)) + RETURN dest +} +DEFINE SHR64(qword, n) { + RETURN qword >> n +} +DEFINE s1(qword) { + RETURN ROR64(qword,19) ^ ROR64(qword, 61) ^ SHR64(qword, 6) +} +W.qword[14] := __B.qword[2] +W.qword[15] := __B.qword[3] +W.qword[16] := __A.qword[0] + s1(W.qword[14]) +W.qword[17] := __A.qword[1] + s1(W.qword[15]) +W.qword[18] := __A.qword[2] + s1(W.qword[16]) +W.qword[19] := __A.qword[3] + s1(W.qword[17]) +dst.qword[3] := W.qword[19] +dst.qword[2] := W.qword[18] +dst.qword[1] := W.qword[17] +dst.qword[0] := W.qword[16] + + + + SHA512 + AVX +
immintrin.h
+ Cryptography +
+ + + This intrinisc performs two rounds of SHA512 operation using initial SHA512 state (C,D,G,H) from "__A", an initial SHA512 state (A,B,E,F) from "__B", and a pre-computed sum of the next two round message qwords and the corresponding round constants from "__C" (only the two lower qwords of the third operand). The updated SHA512 state (A,B,E,F) is written to "dst", and "dst" can be used as the updated state (C,D,G,H) in later rounds. + + +DEFINE ROR64(qword, n) { + count := n % 64 + dest := (qword >> count) | (qword << (64 - count)) + RETURN dest +} +DEFINE SHR64(qword, n) { + RETURN qword >> n +} +DEFINE cap_sigma0(qword) { + RETURN ROR64(qword, 28) ^ ROR64(qword, 34) ^ ROR64(qword, 39) +} +DEFINE cap_sigma1(qword) { + RETURN ROR64(qword, 14) ^ ROR64(qword, 18) ^ ROR64(qword, 41) +} +DEFINE MAJ(a,b,c) { + RETURN (a & b) ^ (a & c) ^ (b & c) +} +DEFINE CH(a,b,c) { + RETURN (a & b) ^ (c & ~a) +} +A.qword[0] := __B.qword[3] +B.qword[0] := __B.qword[2] +C.qword[0] := __A.qword[3] +D.qword[0] := __A.qword[2] +E.qword[0] := __B.qword[1] +F.qword[0] := __B.qword[0] +G.qword[0] := __A.qword[1] +H.qword[0] := __A.qword[0] +WK.qword[0]:= __C.qword[0] +WK.qword[1]:= __C.qword[1] +FOR i := 0 to 1 + A.qword[i+1] := CH(E.qword[i], F.qword[i], G.qword[i]) + cap_sigma1(E.qword[i]) + WK.qword[i] + H.qword[i] + MAJ(A.qword[i], B.qword[i], C.qword[i]) + cap_sigma0(A.qword[i]) + B.qword[i+1] := A.qword[i] + C.qword[i+1] := B.qword[i] + D.qword[i+1] := C.qword[i] + E.qword[i+1] := CH(E.qword[i], F.qword[i], G.qword[i]) + cap_sigma1(E.qword[i]) + WK.qword[i] + H.qword[i] + D.qword[i] + F.qword[i+1] := E.qword[i] + G.qword[i+1] := F.qword[i] + H.qword[i+1] := G.qword[i] +ENDFOR +dst.qword[3] := A.qword[2] +dst.qword[2] := B.qword[2] +dst.qword[1] := E.qword[2] +dst.qword[0] := F.qword[2] + + + + + SHA512 + AVX +
immintrin.h
+ Cryptography +
+ + + The VSM3MSG1 intrinsic is one of the two SM3 message scheduling intrinsics. The intrinsic performs an initial calculation for the next four SM3 message words. The calculated results are stored in "dst". + + +DEFINE ROL32(dword, n) { + count := n % 32 + dest := (dword << count) | (dword >> (32 - count)) + RETURN dest +} +DEFINE P1(x) { + RETURN x ^ ROL32(x, 15) ^ ROL32(x, 23) +} +W.dword[0] := __C.dword[0] +W.dword[1] := __C.dword[1] +W.dword[2] := __C.dword[2] +W.dword[3] := __C.dword[3] +W.dword[7] := __A.dword[0] +W.dword[8] := __A.dword[1] +W.dword[9] := __A.dword[2] +W.dword[10] := __A.dword[3] +W.dword[13] := __B.dword[0] +W.dword[14] := __B.dword[1] +W.dword[15] := __B.dword[2] +TMP0 := W.dword[7] ^ W.dword[0] ^ ROL32(W.dword[13], 15) +TMP1 := W.dword[8] ^ W.dword[1] ^ ROL32(W.dword[14], 15) +TMP2 := W.dword[9] ^ W.dword[2] ^ ROL32(W.dword[15], 15) +TMP3 := W.dword[10] ^ W.dword[3] +dst.dword[0] := P1(TMP0) +dst.dword[1] := P1(TMP1) +dst.dword[2] := P1(TMP2) +dst.dword[3] := P1(TMP3) + + + + + SM3 + AVX +
immintrin.h
+ Cryptography +
+ + + The VSM3MSG2 intrinsic is one of the two SM3 message scheduling intrinsics. The intrinsic performs the final calculation for the next four SM3 message words. The calculated results are stored in "dst". + + +DEFINE ROL32(dword, n) { + count := n % 32 + dest := (dword << count) | (dword >> (32-count)) + RETURN dest +} +WTMP.dword[0] := __A.dword[0] +WTMP.dword[1] := __A.dword[1] +WTMP.dword[2] := __A.dword[2] +WTMP.dword[3] := __A.dword[3] +W.dword[3] := __B.dword[0] +W.dword[4] := __B.dword[1] +W.dword[5] := __B.dword[2] +W.dword[6] := __B.dword[3] +W.dword[10] := __C.dword[0] +W.dword[11] := __C.dword[1] +W.dword[12] := __C.dword[2] +W.dword[13] := __C.dword[3] +W.dword[16] := ROL32(W.dword[3], 7) ^ W.dword[10] ^ WTMP.dword[0] +W.dword[17] := ROL32(W.dword[4], 7) ^ W.dword[11] ^ WTMP.dword[1] +W.dword[18] := ROL32(W.dword[5], 7) ^ W.dword[12] ^ WTMP.dword[2] +W.dword[19] := ROL32(W.dword[6], 7) ^ W.dword[13] ^ WTMP.dword[3] +W.dword[19] := W.dword[19] ^ ROL32(W.dword[16], 6) ^ ROL32(W.dword[16], 15) ^ ROL32(W.dword[16], 30) +dst.dword[0] := W.dword[16] +dst.dword[1] := W.dword[17] +dst.dword[2] := W.dword[18] +dst.dword[3] := W.dword[19] + + + + + SM3 + AVX +
immintrin.h
+ Cryptography +
+ + + The intrinsic performs two rounds of SM3 operation using initial SM3 state (C, D, G, H) from "__A", an initial SM3 states (A, B, E, F) from "__B" and a pre-computed words from the "__C". "__A" with initial SM3 state of (C, D, G, H) assumes input of non-rotated left variables from previous state. The updated SM3 state (A, B, E, F) is written to "__A". The "imm8" should contain the even round number for the first of the two rounds computed by this instruction. The computation masks the "imm8" value by ANDing it with 0x3E so that only even round numbers from 0 through 62 are used for this operation. The calculated results are stored in "dst". + + +DEFINE ROL32(dword, n) { + count := n % 32 + dest := (dword << count) | (dword >> (32-count)) + RETURN dest +} +DEFINE P0(x) { + RETURN x ^ ROL32(x, 9) ^ ROL32(x, 17) +} +DEFINE FF(x, y, z, round) { + IF round < 16 + RETURN (x ^ y ^ z) + ELSE + RETURN (x & y) | (x & z) | (y & z) + FI +} +DEFINE GG(x, y, z, round){ + IF round < 16 + RETURN (x ^ y ^ z) + ELSE + RETURN (x & y) | (~x & z) + FI +} +A.dword[0] := __B.dword[3] +B.dword[0] := __B.dword[2] +C.dword[0] := __A.dword[3] +D.dword[0] := __A.dword[2] +E.dword[0] := __B.dword[1] +F.dword[0] := __B.dword[0] +G.dword[0] := __A.dword[1] +H.dword[0] := __A.dword[0] +W.dword[0] := __C.dword[0] +W.dword[1] := __C.dword[1] +W.dword[4] := __C.dword[2] +W.dword[5] := __C.dword[3] +C.dword[0] := ROL32(C.dword[0], 9) +D.dword[0] := ROL32(D.dword[0], 9) +G.dword[0] := ROL32(G.dword[0], 19) +H.dword[0] := ROL32(H.dword[0], 19) +ROUND := imm8 & 0x3E +IF ROUND < 16 + CONST.dword[0] := 0x79CC4519 +ELSE + CONST.dword[0] := 0x7A879D8A +FI +CONST.dword[0] := ROL32(CONST.dword[0], ROUND) +FOR i:= 0 to 1 + temp.dword[0] := ROL32(A.dword[i], 12) + E.dword[i] + CONST.dword[0] + S1.dword[0] := ROL32(temp.dword[0], 7) + S2.dword[0] := S1.dword[0] ^ ROL32(A.dword[i], 12) + T1.dword[0] := FF(A.dword[i], B.dword[i], C.dword[i], ROUND) + D.dword[i] + S2.dword[0] + (W.dword[i] ^ W.dword[i+4]) + T2.dword[0] := GG(E.dword[i], F.dword[i], G.dword[i], ROUND) + H.dword[i] + S1.dword[0] + W.dword[i] + D.dword[i+1] := C.dword[i] + C.dword[i+1] := ROL32(B.dword[i], 9) + B.dword[i+1] := A.dword[i] + A.dword[i+1] := T1.dword[0] + H.dword[i+1] := G.dword[i] + G.dword[i+1] := ROL32(F.dword[i], 19) + F.dword[i+1] := E.dword[i] + E.dword[i+1] := P0(T2.dword[0]) + CONST.dword[0] := ROL32(CONST.dword[0], 1) +ENDFOR +dst.dword[3] := A.dword[2] +dst.dword[2] := B.dword[2] +dst.dword[1] := E.dword[2] +dst.dword[0] := F.dword[2] + + + + + + SM3 + AVX +
immintrin.h
+ Cryptography +
+ + + This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent 128-bit lanes. The calculated results are stored in "dst". + + +BYTE sbox[256] = { +0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, +0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, +0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, +0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, +0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, +0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, +0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, +0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, +0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, +0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, +0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, +0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, +0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, +0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, +0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, +0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48 +} +DEFINE ROL32(dword, n) { + count := n % 32 + dest := (dword << count) | (dword >> (32-count)) + RETURN dest +} +DEFINE SBOX_BYTE(dword, i) { + RETURN sbox[dword.byte[i]] +} +DEFINE lower_t(dword) { + tmp.byte[0] := SBOX_BYTE(dword, 0) + tmp.byte[1] := SBOX_BYTE(dword, 1) + tmp.byte[2] := SBOX_BYTE(dword, 2) + tmp.byte[3] := SBOX_BYTE(dword, 3) + RETURN tmp +} +DEFINE L_KEY(dword) { + RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23) +} +DEFINE T_KEY(dword) { + RETURN L_KEY(lower_t(dword)) +} +DEFINE F_KEY(X0, X1, X2, X3, round_key) { + RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key) +} +FOR i:= 0 to 1 + P.dword[0] := __A.dword[4*i] + P.dword[1] := __A.dword[4*i+1] + P.dword[2] := __A.dword[4*i+2] + P.dword[3] := __A.dword[4*i+3] + C.dword[0] := F_KEY(P.dword[0], P.dword[1], P.dword[2], P.dword[3], __B.dword[4*i]) + C.dword[1] := F_KEY(P.dword[1], P.dword[2], P.dword[3], C.dword[0], __B.dword[4*i+1]) + C.dword[2] := F_KEY(P.dword[2], P.dword[3], C.dword[0], C.dword[1], __B.dword[4*i+2]) + C.dword[3] := F_KEY(P.dword[3], C.dword[0], C.dword[1], C.dword[2], __B.dword[4*i+3]) + dst.dword[4*i] := C.dword[0] + dst.dword[4*i+1] := C.dword[1] + dst.dword[4*i+2] := C.dword[2] + dst.dword[4*i+3] := C.dword[3] +ENDFOR +dst[MAX:256] := 0 + + + + SM4 + AVX +
immintrin.h
+ Cryptography +
+ + + This intrinisc performs four rounds of SM4 encryption. The intrinisc operates on independent 128-bit lanes. The calculated results are stored in "dst". + + BYTE sbox[256] = { +0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, +0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, +0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, +0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, +0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, +0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, +0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, +0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, +0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, +0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, +0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, +0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, +0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, +0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, +0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, +0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48 +} +DEFINE ROL32(dword, n) { + count := n % 32 + dest := (dword << count) | (dword >> (32-count)) + RETURN dest +} +DEFINE SBOX_BYTE(dword, i) { + RETURN sbox[dword.byte[i]] +} +DEFINE lower_t(dword) { + tmp.byte[0] := SBOX_BYTE(dword, 0) + tmp.byte[1] := SBOX_BYTE(dword, 1) + tmp.byte[2] := SBOX_BYTE(dword, 2) + tmp.byte[3] := SBOX_BYTE(dword, 3) + RETURN tmp +} +DEFINE L_RND(dword) { + tmp := dword + tmp := tmp ^ ROL32(dword, 2) + tmp := tmp ^ ROL32(dword, 10) + tmp := tmp ^ ROL32(dword, 18) + tmp := tmp ^ ROL32(dword, 24) + RETURN tmp +} +DEFINE T_RND(dword) { + RETURN L_RND(lower_t(dword)) +} +DEFINE F_RND(X0, X1, X2, X3, round_key) { + RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key) +} +FOR i:= 0 to 1 + P.dword[0] := __A.dword[4*i] + P.dword[1] := __A.dword[4*i+1] + P.dword[2] := __A.dword[4*i+2] + P.dword[3] := __A.dword[4*i+3] + C.dword[0] := F_RND(P.dword[0], P.dword[1], P.dword[2], P.dword[3], __B.dword[4*i]) + C.dword[1] := F_RND(P.dword[1], P.dword[2], P.dword[3], C.dword[0], __B.dword[4*i+1]) + C.dword[2] := F_RND(P.dword[2], P.dword[3], C.dword[0], C.dword[1], __B.dword[4*i+2]) + C.dword[3] := F_RND(P.dword[3], C.dword[0], C.dword[1], C.dword[2], __B.dword[4*i+3]) + dst.dword[4*i] := C.dword[0] + dst.dword[4*i+1] := C.dword[1] + dst.dword[4*i+2] := C.dword[2] + dst.dword[4*i+3] := C.dword[3] +ENDFOR +dst[MAX:256] := 0 + + + + SM4 + AVX +
immintrin.h
+ Cryptography +
+ + + This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent 128-bit lanes. The calculated results are stored in "dst". + + +BYTE sbox[256] = { +0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, +0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, +0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, +0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, +0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, +0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, +0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, +0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, +0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, +0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, +0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, +0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, +0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, +0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, +0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, +0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48 +} +DEFINE ROL32(dword, n) { + count := n % 32 + dest := (dword << count) | (dword >> (32-count)) + RETURN dest +} +DEFINE SBOX_BYTE(dword, i) { + RETURN sbox[dword.byte[i]] +} +DEFINE lower_t(dword) { + tmp.byte[0] := SBOX_BYTE(dword, 0) + tmp.byte[1] := SBOX_BYTE(dword, 1) + tmp.byte[2] := SBOX_BYTE(dword, 2) + tmp.byte[3] := SBOX_BYTE(dword, 3) + RETURN tmp +} +DEFINE L_KEY(dword) { + RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23) +} +DEFINE T_KEY(dword) { + RETURN L_KEY(lower_t(dword)) +} +DEFINE F_KEY(X0, X1, X2, X3, round_key) { + RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key) +} +P.dword[0] := __A.dword[0] +P.dword[1] := __A.dword[1] +P.dword[2] := __A.dword[2] +P.dword[3] := __A.dword[3] +C.dword[0] := F_KEY(P.dword[0], P.dword[1], P.dword[2], P.dword[3], __B.dword[0]) +C.dword[1] := F_KEY(P.dword[1], P.dword[2], P.dword[3], C.dword[0], __B.dword[1]) +C.dword[2] := F_KEY(P.dword[2], P.dword[3], C.dword[0], C.dword[1], __B.dword[2]) +C.dword[3] := F_KEY(P.dword[3], C.dword[0], C.dword[1], C.dword[2], __B.dword[3]) +dst.dword[0] := C.dword[0] +dst.dword[1] := C.dword[1] +dst.dword[2] := C.dword[2] +dst.dword[3] := C.dword[3] +dst[MAX:128] := 0 + + + + SM4 + AVX +
immintrin.h
+ Cryptography +
+ + + This intrinisc performs four rounds of SM4 encryption. The intrinisc operates on independent 128-bit lanes. The calculated results are stored in "dst". + + +BYTE sbox[256] = { +0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, +0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, +0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, +0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, +0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, +0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, +0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, +0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, +0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, +0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, +0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, +0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, +0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, +0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, +0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, +0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48 +} +DEFINE ROL32(dword, n) { + count := n % 32 + dest := (dword << count) | (dword >> (32-count)) + RETURN dest +} +DEFINE SBOX_BYTE(dword, i) { + RETURN sbox[dword.byte[i]] +} +DEFINE lower_t(dword) { + tmp.byte[0] := SBOX_BYTE(dword, 0) + tmp.byte[1] := SBOX_BYTE(dword, 1) + tmp.byte[2] := SBOX_BYTE(dword, 2) + tmp.byte[3] := SBOX_BYTE(dword, 3) + RETURN tmp +} +DEFINE L_RND(dword) { + tmp := dword + tmp := tmp ^ ROL32(dword, 2) + tmp := tmp ^ ROL32(dword, 10) + tmp := tmp ^ ROL32(dword, 18) + tmp := tmp ^ ROL32(dword, 24) + RETURN tmp +} +DEFINE T_RND(dword) { + RETURN L_RND(lower_t(dword)) +} +DEFINE F_RND(X0, X1, X2, X3, round_key) { + RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key) +} +P.dword[0] := __A.dword[0] +P.dword[1] := __A.dword[1] +P.dword[2] := __A.dword[2] +P.dword[3] := __A.dword[3] +C.dword[0] := F_RND(P.dword[0], P.dword[1], P.dword[2], P.dword[3], __B.dword[0]) +C.dword[1] := F_RND(P.dword[1], P.dword[2], P.dword[3], C.dword[0], __B.dword[1]) +C.dword[2] := F_RND(P.dword[2], P.dword[3], C.dword[0], C.dword[1], __B.dword[2]) +C.dword[3] := F_RND(P.dword[3], C.dword[0], C.dword[1], C.dword[2], __B.dword[3]) +dst.dword[0] := C.dword[0] +dst.dword[1] := C.dword[1] +dst.dword[2] := C.dword[2] +dst.dword[3] := C.dword[3] +dst[MAX:128] := 0 + + + + SM4 + AVX +
immintrin.h
+ Cryptography +
+ + + + Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ACOS(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ACOS(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ACOSH(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ACOSH(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ASIN(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ASIN(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ASINH(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ASINH(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ATAN(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ATAN(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ATANH(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ATANH(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := COS(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := COS(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := COSD(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := COSD(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := COSH(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := COSH(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SIN(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SIN(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SIN(a[i+63:i]) + MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SIN(a[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
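A hedged sketch of the packed sincos above, assuming this entry corresponds to the SVML-style _mm_sincos_ps. These trigonometric intrinsics are library calls provided by the Intel compilers and recent MSVC, not by stock GCC/Clang, so treat their availability as an assumption.

#include <immintrin.h>

void sincos4(const float in[4], float s[4], float c[4]) {
    __m128 a = _mm_loadu_ps(in);
    __m128 cos_out;
    __m128 sin_out = _mm_sincos_ps(&cos_out, a);  /* sine returned, cosine stored */
    _mm_storeu_ps(s, sin_out);
    _mm_storeu_ps(c, cos_out);
}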
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SIND(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SIND(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SINH(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SINH(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := TAN(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := TAN(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := TAND(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := TAND(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := TANH(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := TANH(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CubeRoot(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := CubeRoot(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CEXP(a[31:0], b[31:0]) { + result[31:0] := POW(FP32(e), a[31:0]) * COS(b[31:0]) + result[63:32] := POW(FP32(e), a[31:0]) * SIN(b[31:0]) + RETURN result +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CEXP(a[i+31:i], a[i+63:i+32]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CLOG(a[31:0], b[31:0]) { + result[31:0] := LOG(SQRT(POW(a, 2.0) + POW(b, 2.0))) + result[63:32] := ATAN2(b, a) + RETURN result +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CLOG(a[i+31:i], a[i+63:i+32]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CSQRT(a[31:0], b[31:0]) { + sign[31:0] := (b < 0.0) ? -FP32(1.0) : FP32(1.0) + result[31:0] := SQRT((a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) + result[63:32] := sign * SQRT((-a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) + RETURN result +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CSQRT(a[i+31:i], a[i+63:i+32]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := POW(10.0, a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := POW(2.0, a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := InvCubeRoot(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := InvCubeRoot(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := InvSQRT(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := InvSQRT(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LOG(1.0 + a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LOG(1.0 + a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_pd". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := CDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := InverseCDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := InverseCDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ERF(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := 1.0 - ERF(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := 1.0 - ERF(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i])) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := 1.0 / ERF(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := 1.0 / ERF(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + + Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 15 + i := 8*j + IF b[i+7:i] == 0 + #DE + FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 7 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 3 + i := 32*j + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 1 + i := 64*j + IF b[i+63:i] == 0 + #DE + FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 15 + i := 8*j + IF b[i+7:i] == 0 + #DE + FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 7 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 3 + i := 32*j + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 1 + i := 64*j + IF b[i+63:i] == 0 + #DE + FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ERF(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed 32-bit integers into memory at "mem_addr". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
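These SVML-style integer division intrinsics are not exposed in Rust's `core::arch`; the hypothetical helper below is only a plain-Rust sketch of the per-element semantics given in the pseudocode above (truncated quotients returned, remainders written to a separate buffer), not the intrinsic itself.

```rust
/// Per-element semantics of the 32-bit divide-with-remainder entry above.
/// Plain-Rust sketch only; a zero divisor panics here, where the pseudocode raises #DE.
fn divrem_epi32(a: &[i32; 4], b: &[i32; 4], rem: &mut [i32; 4]) -> [i32; 4] {
    let mut dst = [0i32; 4];
    for j in 0..4 {
        // Rust's `/` and `%` on integers truncate toward zero,
        // matching TRUNCATE and REMAINDER in the pseudocode.
        dst[j] = a[j] / b[j];
        rem[j] = a[j] % b[j];
    }
    dst
}

fn main() {
    let a = [7, -7, 9, 10];
    let b = [2, 2, -4, 5];
    let mut rem = [0; 4];
    let q = divrem_epi32(&a, &b, &mut rem);
    assert_eq!(q, [3, -3, -2, 2]);
    assert_eq!(rem, [1, -1, 1, 0]);
}
```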
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst". + FOR j := 0 to 15 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst". + FOR j := 0 to 7 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst". + FOR j := 0 to 1 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst". + FOR j := 0 to 15 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst". + FOR j := 0 to 7 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst". + FOR j := 0 to 1 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed unsigned 32-bit integers into memory at "mem_addr". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CEIL(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := CEIL(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := FLOOR(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := FLOOR(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ROUND(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ROUND(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Special Math Functions +
+ + + + Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := TRUNCATE(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Miscellaneous +
+ + + + Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := TRUNCATE(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Miscellaneous +
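The SVML-style rounding helpers above are not in Rust's `core::arch`, but the closely related SSE4.1 intrinsics `_mm_floor_ps`/`_mm_ceil_ps` (which generate the same `roundps` instruction mentioned in these entries) are. A minimal sketch, assuming an SSE4.1-capable x86_64 target:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "sse4.1")]
unsafe fn floor_and_ceil(v: __m128) -> (__m128, __m128) {
    // Both compile to roundps with the appropriate rounding immediate.
    (_mm_floor_ps(v), _mm_ceil_ps(v))
}

fn main() {
    if is_x86_feature_detected!("sse4.1") {
        unsafe {
            let v = _mm_setr_ps(1.25, -1.25, 2.5, -2.5);
            let (lo, hi) = floor_and_ceil(v);
            let mut out = [0.0f32; 4];
            _mm_storeu_ps(out.as_mut_ptr(), lo);
            assert_eq!(out, [1.0, -2.0, 2.0, -3.0]);
            _mm_storeu_ps(out.as_mut_ptr(), hi);
            assert_eq!(out, [2.0, -1.0, 3.0, -2.0]);
        }
    }
}
```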
+ + + + + + + + + Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in "row0", "row1", "row2", and "row3", and store the transposed matrix in these vectors ("row0" now contains column 0, etc.). + +__m128 tmp3, tmp2, tmp1, tmp0; +tmp0 := _mm_unpacklo_ps(row0, row1); +tmp2 := _mm_unpacklo_ps(row2, row3); +tmp1 := _mm_unpackhi_ps(row0, row1); +tmp3 := _mm_unpackhi_ps(row2, row3); +row0 := _mm_movelh_ps(tmp0, tmp2); +row1 := _mm_movehl_ps(tmp2, tmp0); +row2 := _mm_movelh_ps(tmp1, tmp3); +row3 := _mm_movehl_ps(tmp3, tmp1); + + SSE +
xmmintrin.h
+ Swizzle +
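The transpose macro above is defined purely in terms of the unpack/move intrinsics listed in this section; Rust's `core::arch::x86_64` exposes it as the function `_MM_TRANSPOSE4_PS`. A minimal sketch, assuming an x86_64 target:

```rust
use std::arch::x86_64::*;

fn main() {
    unsafe {
        // Rows of a 4x4 matrix, element 0 in the lowest lane (_mm_setr_ps order).
        let mut r0 = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let mut r1 = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
        let mut r2 = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
        let mut r3 = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);

        // In-place transpose; r0 now holds the first column, etc.
        _MM_TRANSPOSE4_PS(&mut r0, &mut r1, &mut r2, &mut r3);

        let mut col0 = [0.0f32; 4];
        _mm_storeu_ps(col0.as_mut_ptr(), r0);
        assert_eq!(col0, [1.0, 5.0, 9.0, 13.0]);
    }
}
```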
+ + + + + Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". + +dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0] +dst[31:16] := 0 + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". + +dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0] +dst[31:16] := 0 + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". + +dst[63:0] := a[63:0] +sel := imm8[1:0]*16 +dst[sel+15:sel] := i[15:0] + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". + +dst[63:0] := a[63:0] +sel := imm8[1:0]*16 +dst[sel+15:sel] := i[15:0] + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[15:0] := src[15:0] + 1: tmp[15:0] := src[31:16] + 2: tmp[15:0] := src[47:32] + 3: tmp[15:0] := src[63:48] + ESAC + RETURN tmp[15:0] +} +dst[15:0] := SELECT4(a[63:0], imm8[1:0]) +dst[31:16] := SELECT4(a[63:0], imm8[3:2]) +dst[47:32] := SELECT4(a[63:0], imm8[5:4]) +dst[63:48] := SELECT4(a[63:0], imm8[7:6]) + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[15:0] := src[15:0] + 1: tmp[15:0] := src[31:16] + 2: tmp[15:0] := src[47:32] + 3: tmp[15:0] := src[63:48] + ESAC + RETURN tmp[15:0] +} +dst[15:0] := SELECT4(a[63:0], imm8[1:0]) +dst[31:16] := SELECT4(a[63:0], imm8[3:2]) +dst[47:32] := SELECT4(a[63:0], imm8[5:4]) +dst[63:48] := SELECT4(a[63:0], imm8[7:6]) + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +dst[127:96] := SELECT4(b[127:0], imm8[7:6]) + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) + + + SSE +
xmmintrin.h
+ Swizzle +
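The SELECT4 pseudocode above is driven entirely by the 8-bit immediate; in Rust's `core::arch::x86_64` that immediate is a const generic on `_mm_shuffle_ps` (on reasonably recent toolchains), and `_MM_SHUFFLE(z, y, x, w)` builds it with the same bit layout SELECT4 reads. A minimal sketch, assuming an x86_64 target:

```rust
use std::arch::x86_64::*;

fn main() {
    unsafe {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);

        // Low two lanes are selected from `a` (selectors w, x), high two from `b` (y, z).
        let s = _mm_shuffle_ps::<{ _MM_SHUFFLE(3, 2, 1, 0) }>(a, b);
        let mut out = [0.0f32; 4];
        _mm_storeu_ps(out.as_mut_ptr(), s);
        assert_eq!(out, [1.0, 2.0, 7.0, 8.0]);

        // INTERLEAVE_DWORDS from the unpacklo description above: a0, b0, a1, b1.
        let lo = _mm_unpacklo_ps(a, b);
        _mm_storeu_ps(out.as_mut_ptr(), lo);
        assert_eq!(out, [1.0, 5.0, 2.0, 6.0]);
    }
}
```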
+ + + + Get the unsigned 32-bit value of the MXCSR control and status register. + dst[31:0] := MXCSR + + + SSE +
immintrin.h
+ General Support +
+ + + + Set the MXCSR control and status register with the value in unsigned 32-bit integer "a". + +MXCSR := a[31:0] + + + SSE +
immintrin.h
+ General Support +
+ + + Macro: Get the exception state bits from the MXCSR control and status register. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT + dst[31:0] := MXCSR & _MM_EXCEPT_MASK + + SSE +
immintrin.h
+ General Support +
+ + + + Macro: Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT + MXCSR := a[31:0] AND ~_MM_EXCEPT_MASK + + SSE +
immintrin.h
+ General Support +
+ + + Macro: Get the exception mask bits from the MXCSR control and status register. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT + dst[31:0] := MXCSR & _MM_MASK_MASK + + SSE +
immintrin.h
+ General Support +
+ + + + Macro: Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT + MXCSR := a[31:0] AND ~_MM_MASK_MASK + + SSE +
immintrin.h
+ General Support +
+ + + Macro: Get the rounding mode bits from the MXCSR control and status register. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + dst[31:0] := MXCSR & _MM_ROUND_MASK + + SSE +
immintrin.h
+ General Support +
+ + + + Macro: Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + MXCSR := a[31:0] AND ~_MM_ROUND_MASK + + SSE +
immintrin.h
+ General Support +
+ + + Macro: Get the flush zero bits from the MXCSR control and status register. The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF + dst[31:0] := MXCSR & _MM_FLUSH_MASK + + SSE +
immintrin.h
+ General Support +
+ + + + Macro: Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF + MXCSR := a[31:0] AND ~_MM_FLUSH_MASK + + SSE +
immintrin.h
+ General Support +
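A small sketch of reading MXCSR and its individual fields through the macros described above, assuming an x86_64 target; note that newer Rust toolchains deprecate these MXCSR helpers (in favour of inline assembly), so this is illustrative only.

```rust
use std::arch::x86_64::*;

fn main() {
    unsafe {
        // Read the whole register, then the individual fields via the macros above.
        // (Newer toolchains deprecate these MXCSR helpers; shown for illustration only.)
        let csr = _mm_getcsr();
        println!("MXCSR       = {csr:#010x}");
        println!("rounding    = {:#x}", _MM_GET_ROUNDING_MODE());
        println!("exceptions  = {:#x}", _MM_GET_EXCEPTION_STATE());
        println!("flush-zero  = {:#x}", _MM_GET_FLUSH_ZERO_MODE());

        // Mirrors the pseudocode: dst := MXCSR & _MM_ROUND_MASK.
        assert_eq!(_MM_GET_ROUNDING_MODE(), csr & _MM_ROUND_MASK);
    }
}
```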
+ + + + + Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i", which can be one of:<ul> + <li>_MM_HINT_T0 // 3, move data using the T0 hint. The PREFETCHT0 instruction will be generated.</li> + <li>_MM_HINT_T1 // 2, move data using the T1 hint. The PREFETCHT1 instruction will be generated.</li> + <li>_MM_HINT_T2 // 1, move data using the T2 hint. The PREFETCHT2 instruction will be generated.</li> + <li>_MM_HINT_NTA // 0, move data using the non-temporal access (NTA) hint. The PREFETCHNTA instruction will be generated.</li> + + + + + + SSE +
immintrin.h
+ General Support +
+ + + + Perform a serializing operation on all store-to-memory instructions that were issued prior to this instruction. Guarantees that every store instruction that precedes, in program order, is globally visible before any store instruction which follows the fence in program order. + + SSE +
immintrin.h
+ General Support +
+ + + + + Allocate "size" bytes of memory, aligned to the alignment specified in "align", and return a pointer to the allocated memory. "_mm_free" should be used to free memory that is allocated with "_mm_malloc". + SSE +
immintrin.h
+ General Support +
+ + + + Free aligned memory that was allocated with "_mm_malloc". + SSE +
immintrin.h
+ General Support +
+ + + + Return vector of type __m128 with undefined elements. + SSE +
immintrin.h
+ General Support +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper element of "dst". [min_float_note] + +dst[31:0] := MIN(a[31:0], b[31:0]) +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper element of "dst". [max_float_note] + +dst[31:0] := MAX(a[31:0], b[31:0]) +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] +ENDFOR + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] +ENDFOR + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + Miscellaneous + + + + Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum the 8 differences to produce a single unsigned 16-bit integer, and store this unsigned 16-bit integer in the low 16 bits of "dst". + +FOR j := 0 to 7 + i := j*8 + tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +ENDFOR +dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56] +dst[63:16] := 0 + + SSE +
xmmintrin.h
+ Arithmetic +
+ + Miscellaneous + + + + Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum the 8 differences to produce a single unsigned 16-bit integer, and store this unsigned 16-bit integer in the low 16 bits of "dst". + +FOR j := 0 to 7 + i := j*8 + tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +ENDFOR +dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56] +dst[63:16] := 0 + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := a[31:0] + b[31:0] +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := a[31:0] - b[31:0] +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := a[31:0] * b[31:0] +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] * b[i+31:i] +ENDFOR + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := a[31:0] / b[31:0] +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := a[i+31:i] / b[i+31:i] +ENDFOR + + + SSE +
xmmintrin.h
+ Arithmetic +
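The `_ss` forms above operate only on the lowest lane (the upper three lanes pass through from "a"), while the `_ps` forms operate on all four lanes. A minimal sketch, assuming an x86_64 target:

```rust
use std::arch::x86_64::*;

fn main() {
    unsafe {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);

        // Packed: every lane is added.
        let sum = _mm_add_ps(a, b);
        let mut out = [0.0f32; 4];
        _mm_storeu_ps(out.as_mut_ptr(), sum);
        assert_eq!(out, [11.0, 22.0, 33.0, 44.0]);

        // Scalar: only lane 0 is added; lanes 1..3 are copied from `a`.
        let sum_ss = _mm_add_ss(a, b);
        _mm_storeu_ps(out.as_mut_ptr(), sum_ss);
        assert_eq!(out, [11.0, 2.0, 3.0, 4.0]);
    }
}
```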
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +ENDFOR + + + SSE +
xmmintrin.h
+ Probability/Statistics +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +ENDFOR + + + SSE +
xmmintrin.h
+ Probability/Statistics +
+ + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +ENDFOR + + + SSE +
xmmintrin.h
+ Probability/Statistics +
+ + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +ENDFOR + + + SSE +
xmmintrin.h
+ Probability/Statistics +
+ + + + + Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Convert +
+ + + + + Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + SSE +
xmmintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +dst[95:64] := a[95:64] +dst[127:96] := a[127:96] + + + SSE +
xmmintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +dst[95:64] := a[95:64] +dst[127:96] := a[127:96] + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + m := j*32 + dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) +ENDFOR + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + m := j*32 + dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) +ENDFOR + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower packed 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*8 + m := j*32 + dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) +ENDFOR + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower packed unsigned 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*8 + m := j*32 + dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) +ENDFOR + + SSE +
xmmintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", then convert the packed signed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(a[31:0]) +dst[63:32] := Convert_Int32_To_FP32(a[63:32]) +dst[95:64] := Convert_Int32_To_FP32(b[31:0]) +dst[127:96] := Convert_Int32_To_FP32(b[63:32]) + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_Int32(a[31:0]) + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_Int32(a[31:0]) + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_Int64(a[31:0]) + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Copy the lower single-precision (32-bit) floating-point element of "a" to "dst". + +dst[31:0] := a[31:0] + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF. + +FOR j := 0 to 3 + i := 16*j + k := 32*j + IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF) + dst[i+15:i] := 0x7FFF + ELSE + dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k]) + FI +ENDFOR + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 8-bit integers, and store the results in lower 4 elements of "dst". Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF. + +FOR j := 0 to 3 + i := 8*j + k := 32*j + IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF) + dst[i+7:i] := 0x7F + ELSE + dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k]) + FI +ENDFOR + + SSE +
xmmintrin.h
+ Convert +
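The difference between the rounding conversions (cvt) and the truncating conversions (cvtt) above is easiest to see on a negative input. A minimal sketch, assuming an x86_64 target:

```rust
use std::arch::x86_64::*;

fn main() {
    unsafe {
        let x = _mm_set_ss(-1.7);

        // cvtss2si rounds according to MXCSR (round-to-nearest-even by default).
        assert_eq!(_mm_cvtss_si32(x), -2);

        // cvttss2si always truncates toward zero.
        assert_eq!(_mm_cvttss_si32(x), -1);

        // And back: convert an i32 into the low lane, keeping the upper lanes of `a`.
        let a = _mm_setr_ps(0.0, 10.0, 20.0, 30.0);
        let y = _mm_cvtsi32_ss(a, 7);
        assert_eq!(_mm_cvtss_f32(y), 7.0);
    }
}
```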
+ + + + + Store 64-bits of integer data from "a" into memory using a non-temporal memory hint. + +MEM[mem_addr+63:mem_addr] := a[63:0] + + + SSE +
immintrin.h
+ Store +
+ + + + + + Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. + +FOR j := 0 to 7 + i := j*8 + IF mask[i+7] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + FI +ENDFOR + + + SSE +
immintrin.h
+ Store +
+ + + + + + Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). + +FOR j := 0 to 7 + i := j*8 + IF mask[i+7] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + FI +ENDFOR + + + SSE +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE +
immintrin.h
+ Store +
+ + + + + Store the upper 2 single-precision (32-bit) floating-point elements from "a" into memory. + +MEM[mem_addr+31:mem_addr] := a[95:64] +MEM[mem_addr+63:mem_addr+32] := a[127:96] + + + SSE +
immintrin.h
+ Store +
+ + + + + Store the lower 2 single-precision (32-bit) floating-point elements from "a" into memory. + +MEM[mem_addr+31:mem_addr] := a[31:0] +MEM[mem_addr+63:mem_addr+32] := a[63:32] + + + SSE +
immintrin.h
+ Store +
+ + + + + Store the lower single-precision (32-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+31:mem_addr] := a[31:0] + + + SSE +
immintrin.h
+ Store +
+ + + + + Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+31:mem_addr] := a[31:0] +MEM[mem_addr+63:mem_addr+32] := a[31:0] +MEM[mem_addr+95:mem_addr+64] := a[31:0] +MEM[mem_addr+127:mem_addr+96] := a[31:0] + + SSE +
immintrin.h
+ Store +
+ + + + + Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+31:mem_addr] := a[31:0] +MEM[mem_addr+63:mem_addr+32] := a[31:0] +MEM[mem_addr+95:mem_addr+64] := a[31:0] +MEM[mem_addr+127:mem_addr+96] := a[31:0] + + SSE +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE +
immintrin.h
+ Store +
+ + + + + Store 4 single-precision (32-bit) floating-point elements from "a" into memory in reverse order. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+31:mem_addr] := a[127:96] +MEM[mem_addr+63:mem_addr+32] := a[95:64] +MEM[mem_addr+95:mem_addr+64] := a[63:32] +MEM[mem_addr+127:mem_addr+96] := a[31:0] + + + SSE +
immintrin.h
+ Store +
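A minimal sketch of the unaligned, aligned, and non-temporal store forms above, assuming an x86_64 target; the 16-byte alignment required by `_mm_store_ps`/`_mm_stream_ps` is obtained here with `#[repr(align(16))]`.

```rust
use std::arch::x86_64::*;

#[repr(align(16))]
struct Aligned([f32; 4]);

fn main() {
    unsafe {
        let v = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        // Unaligned store: any address is fine.
        let mut un = [0.0f32; 4];
        _mm_storeu_ps(un.as_mut_ptr(), v);
        assert_eq!(un, [1.0, 2.0, 3.0, 4.0]);

        // Aligned and non-temporal stores require a 16-byte-aligned address.
        let mut al = Aligned([0.0; 4]);
        _mm_store_ps(al.0.as_mut_ptr(), v);
        assert_eq!(al.0, [1.0, 2.0, 3.0, 4.0]);

        // movntps bypasses the cache; fence before other code relies on the data being visible.
        _mm_stream_ps(al.0.as_mut_ptr(), v);
        _mm_sfence();
    }
}
```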
+ + + + Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[j] := a[i+7] +ENDFOR +dst[MAX:8] := 0 + + + SSE +
xmmintrin.h
+ Miscellaneous +
+ + + + Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[j] := a[i+7] +ENDFOR +dst[MAX:8] := 0 + + + SSE +
xmmintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a". + +FOR j := 0 to 3 + i := j*32 + IF a[i+31] + dst[j] := 1 + ELSE + dst[j] := 0 + FI +ENDFOR +dst[MAX:4] := 0 + + + SSE +
xmmintrin.h
+ Miscellaneous +
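`_mm_movemask_ps` packs the four sign bits into the low bits of an integer, which pairs naturally with the all-ones/all-zeros lane masks produced by the comparison intrinsics described further below. A minimal sketch, assuming an x86_64 target:

```rust
use std::arch::x86_64::*;

fn main() {
    unsafe {
        let a = _mm_setr_ps(1.0, 5.0, 3.0, 7.0);
        let b = _mm_setr_ps(2.0, 4.0, 6.0, 8.0);

        // Lanes where a < b become 0xFFFFFFFF, others 0 (see the Compare entries below).
        let lt = _mm_cmplt_ps(a, b);

        // Bit j of the result is the sign bit of lane j: lanes 0, 2 and 3 here.
        let mask = _mm_movemask_ps(lt);
        assert_eq!(mask, 0b1101);
        assert_eq!(mask.count_ones(), 3);
    }
}
```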
+ + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := SQRT(a[31:0]) +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +dst[31:0] := (1.0 / a[31:0]) +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +dst[31:0] := (1.0 / SQRT(a[31:0])) +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) +ENDFOR + + + SSE +
xmmintrin.h
+ Elementary Math Functions +
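The approximation bound quoted above (relative error below 1.5*2^-12) is why `rsqrtps` is commonly followed by one Newton-Raphson step when more accuracy is needed; the refinement below is that common idiom, not something this data file prescribes. A sketch, assuming an x86_64 target:

```rust
use std::arch::x86_64::*;

/// One Newton-Raphson refinement of the rsqrtps estimate:
/// y' = y * (1.5 - 0.5 * x * y * y), roughly doubling the number of accurate bits.
unsafe fn rsqrt_refined(x: __m128) -> __m128 {
    let y = _mm_rsqrt_ps(x);
    let half = _mm_set1_ps(0.5);
    let three_halves = _mm_set1_ps(1.5);
    let y2 = _mm_mul_ps(y, y);
    _mm_mul_ps(y, _mm_sub_ps(three_halves, _mm_mul_ps(_mm_mul_ps(half, x), y2)))
}

fn main() {
    unsafe {
        let x = _mm_set1_ps(2.0);
        let mut out = [0.0f32; 4];
        _mm_storeu_ps(out.as_mut_ptr(), rsqrt_refined(x));
        // 1/sqrt(2) ~= 0.70710678; the refined estimate is accurate to well under 1e-5 here.
        assert!((out[0] - 0.70710678).abs() < 1e-5);
    }
}
```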
+ + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR + + + SSE +
xmmintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR + + + SSE +
xmmintrin.h
+ Logical +
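A common use of these bitwise operations is a branch-free per-lane select, `(mask AND b) OR (NOT mask AND a)`, with the mask coming from one of the comparison intrinsics below. A minimal sketch, assuming an x86_64 target; `select` is a hypothetical helper, not part of the data file:

```rust
use std::arch::x86_64::*;

/// Branch-free per-lane select: where `mask` is all-ones take `b`, otherwise take `a`.
unsafe fn select(mask: __m128, a: __m128, b: __m128) -> __m128 {
    _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a))
}

fn main() {
    unsafe {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(-1.0, -2.0, -3.0, -4.0);
        // Take `b` in the lanes where a > 2.0, i.e. lanes 2 and 3.
        let mask = _mm_cmpgt_ps(a, _mm_set1_ps(2.0));
        let mut out = [0.0f32; 4];
        _mm_storeu_ps(out.as_mut_ptr(), select(mask, a, b));
        assert_eq!(out, [1.0, 2.0, -3.0, -4.0]);
    }
}
```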
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := ( a[31:0] == b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := ( a[31:0] < b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := ( a[31:0] <= b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] <= b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := ( a[31:0] > b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := ( a[31:0] >= b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] >= b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := ( a[31:0] != b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] != b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (!( a[31:0] < b[31:0] )) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := !( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (!( a[31:0] <= b[31:0] )) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (!( a[i+31:i] <= b[i+31:i] )) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (!( a[31:0] > b[31:0] )) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (!( a[i+31:i] > b[i+31:i] )) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (!( a[31:0] >= b[31:0] )) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (!( a[i+31:i] >= b[i+31:i] )) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + dst[31:0] := ( a[31:0] != NaN AND b[31:0] != NaN ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] != NaN AND b[i+31:i] != NaN ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + dst[31:0] := ( a[31:0] == NaN OR b[31:0] == NaN ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == NaN OR b[i+31:i] == NaN ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] == b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] < b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] <= b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] > b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] >= b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). + RETURN ( a[31:0] == NaN OR b[31:0] == NaN OR a[31:0] != b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] == b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] < b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] <= b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] > b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] >= b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] == NaN OR b[31:0] == NaN OR a[31:0] != b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + Copy single-precision (32-bit) floating-point element "a" to the lower element of "dst", and zero the upper 3 elements. + +dst[31:0] := a[31:0] +dst[127:32] := 0 + + SSE +
xmmintrin.h
+ Set +
+ + + + Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR + + SSE +
xmmintrin.h
+ Set +
+ + + + Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR + + SSE +
xmmintrin.h
+ Set +
+ + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 + + SSE +
xmmintrin.h
+ Set +
+ + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst[31:0] := e3 +dst[63:32] := e2 +dst[95:64] := e1 +dst[127:96] := e0 + + SSE +
xmmintrin.h
+ Set +
+ + + + Return vector of type __m128 with all elements set to zero. + +dst[MAX:0] := 0 + + + SSE +
xmmintrin.h
+ Set +
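Note the element order in the two set variants above: the plain form assigns its first argument to the highest element (e3) and its last to the lowest (e0), while the reverse-order form takes the arguments in ascending lane order, i.e. the order the values appear in memory. A minimal sketch, assuming an x86_64 target:

```rust
use std::arch::x86_64::*;

fn main() {
    unsafe {
        // set_ps: first argument lands in the highest lane (e3), last in the lowest (e0).
        let hi_first = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
        // setr_ps: arguments in ascending lane order, matching memory order.
        let lo_first = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut a = [0.0f32; 4];
        let mut b = [0.0f32; 4];
        _mm_storeu_ps(a.as_mut_ptr(), hi_first);
        _mm_storeu_ps(b.as_mut_ptr(), lo_first);
        assert_eq!(a, b); // both are [1.0, 2.0, 3.0, 4.0] in memory
    }
}
```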
+ + + + + Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of "dst", and copy the lower 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. + +dst[31:0] := a[31:0] +dst[63:32] := a[63:32] +dst[95:64] := MEM[mem_addr+31:mem_addr] +dst[127:96] := MEM[mem_addr+63:mem_addr+32] + + + SSE +
immintrin.h
+ Load +
+ + + + + Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of "dst", and copy the upper 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. + +dst[31:0] := MEM[mem_addr+31:mem_addr] +dst[63:32] := MEM[mem_addr+63:mem_addr+32] +dst[95:64] := a[95:64] +dst[127:96] := a[127:96] + + + SSE +
immintrin.h
+ Load +
+ + + + Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst", and zero the upper 3 elements. "mem_addr" does not need to be aligned on any particular boundary. + +dst[31:0] := MEM[mem_addr+31:mem_addr] +dst[127:32] := 0 + + + SSE +
immintrin.h
+ Load +
+ + + + Load a single-precision (32-bit) floating-point element from memory into all elements of "dst". + +dst[31:0] := MEM[mem_addr+31:mem_addr] +dst[63:32] := MEM[mem_addr+31:mem_addr] +dst[95:64] := MEM[mem_addr+31:mem_addr] +dst[127:96] := MEM[mem_addr+31:mem_addr] + + SSE +
immintrin.h
+ Load +
+ + + + Load a single-precision (32-bit) floating-point element from memory into all elements of "dst". + +dst[31:0] := MEM[mem_addr+31:mem_addr] +dst[63:32] := MEM[mem_addr+31:mem_addr] +dst[95:64] := MEM[mem_addr+31:mem_addr] +dst[127:96] := MEM[mem_addr+31:mem_addr] + + SSE +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE +
immintrin.h
+ Load +
+ + + + Load 4 single-precision (32-bit) floating-point elements from memory into "dst" in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[31:0] := MEM[mem_addr+127:mem_addr+96] +dst[63:32] := MEM[mem_addr+95:mem_addr+64] +dst[95:64] := MEM[mem_addr+63:mem_addr+32] +dst[127:96] := MEM[mem_addr+31:mem_addr] + + SSE +
immintrin.h
+ Load +
+ + + + + Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := b[31:0] +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Move +
+ + + + + Move the upper 2 single-precision (32-bit) floating-point elements from "b" to the lower 2 elements of "dst", and copy the upper 2 elements from "a" to the upper 2 elements of "dst". + +dst[31:0] := b[95:64] +dst[63:32] := b[127:96] +dst[95:64] := a[95:64] +dst[127:96] := a[127:96] + + + SSE +
xmmintrin.h
+ Move +
+ + + + + Move the lower 2 single-precision (32-bit) floating-point elements from "b" to the upper 2 elements of "dst", and copy the lower 2 elements from "a" to the lower 2 elements of "dst". + +dst[31:0] := a[31:0] +dst[63:32] := a[63:32] +dst[95:64] := b[31:0] +dst[127:96] := b[63:32] + + + SSE +
xmmintrin.h
+ Move +
+ + + + + + Return vector of type __m128d with undefined elements. + SSE2 +
emmintrin.h
+ General Support +
+ + + + Return vector of type __m128i with undefined elements. + SSE2 +
emmintrin.h
+ General Support +
+ + + + Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance and power consumption of spin-wait loops. + + SSE2 +
emmintrin.h
+ General Support +
+ + + + Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy. + + SSE2 +
emmintrin.h
+ General Support +
+ + + + Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. Guarantees that every load instruction that precedes, in program order, is globally visible before any load instruction which follows the fence in program order. + + SSE2 +
emmintrin.h
+ General Support +
+ + + + Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction is globally visible before any memory instruction which follows the fence in program order. + + SSE2 +
emmintrin.h
+ General Support +
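A hedged sketch of how the spin-wait hint and the fences above are typically used from Rust, assuming these entries map to _mm_pause, _mm_lfence and _mm_mfence in core::arch::x86_64; the AtomicBool flag and the loop are purely illustrative:

use std::arch::x86_64::*; // x86_64 only
use std::sync::atomic::{AtomicBool, Ordering};

static READY: AtomicBool = AtomicBool::new(false);

fn main() {
    READY.store(true, Ordering::Release); // normally done by another thread

    // Spin until READY flips, hinting the core that this is a spin-wait loop.
    while !READY.load(Ordering::Acquire) {
        unsafe { _mm_pause() }; // reduces power and pipeline flushes while spinning
    }

    unsafe {
        _mm_lfence(); // earlier loads complete before any later load
        _mm_mfence(); // earlier loads and stores are globally visible before later ones
    }
    println!("ready");
}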
+ + + + Load unaligned 64-bit integer from memory into the first element of "dst". + +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[MAX:64] := 0 + + + SSE2 +
immintrin.h
+ Load +
+ + + + Load unaligned 16-bit integer from memory into the first element of "dst". + +dst[15:0] := MEM[mem_addr+15:mem_addr] +dst[MAX:16] := 0 + + SSE2 +
immintrin.h
+ Load +
+ + + + Load unaligned 32-bit integer from memory into the first element of "dst". + +dst[31:0] := MEM[mem_addr+31:mem_addr] +dst[MAX:32] := 0 + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load 64-bit integer from memory into the first element of "dst". + +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[MAX:64] := 0 + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load 128-bits of integer data from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load 128-bits of integer data from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load a double-precision (64-bit) floating-point element from memory into both elements of "dst". + +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[127:64] := MEM[mem_addr+63:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load a double-precision (64-bit) floating-point element from memory into both elements of "dst". + +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[127:64] := MEM[mem_addr+63:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load 2 double-precision (64-bit) floating-point elements from memory into "dst" in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[63:0] := MEM[mem_addr+127:mem_addr+64] +dst[127:64] := MEM[mem_addr+63:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst", and zero the upper element. "mem_addr" does not need to be aligned on any particular boundary. + +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[127:64] := 0 + + + SSE2 +
emmintrin.h
+ Load +
+ + + + + Load a double-precision (64-bit) floating-point element from memory into the upper element of "dst", and copy the lower element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. + +dst[63:0] := a[63:0] +dst[127:64] := MEM[mem_addr+63:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + + Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst", and copy the upper element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. + +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Load +
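A small Rust sketch of the two half-register double loads described above (my reading: _mm_loadh_pd and _mm_loadl_pd in core::arch::x86_64; the names are assumed from the descriptions):

use std::arch::x86_64::*; // x86_64 only; SSE2 is baseline there

fn main() {
    unsafe {
        let a = _mm_set_pd(2.0, 1.0); // a = [1.0 (low), 2.0 (high)]
        let hi = 30.0f64;
        let lo = 10.0f64;

        // Replace only the upper element from memory, keep the lower element from a.
        let with_hi = _mm_loadh_pd(a, &hi);
        // Replace only the lower element from memory, keep the upper element from a.
        let with_lo = _mm_loadl_pd(a, &lo);

        let (mut o1, mut o2) = ([0.0f64; 2], [0.0f64; 2]);
        _mm_storeu_pd(o1.as_mut_ptr(), with_hi);
        _mm_storeu_pd(o2.as_mut_ptr(), with_lo);
        assert_eq!(o1, [1.0, 30.0]);
        assert_eq!(o2, [10.0, 2.0]);
    }
}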
+ + + + + Store 16-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+15:mem_addr] := a[15:0] + + SSE2 +
immintrin.h
+ Store +
+ + + + + Store 64-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+63:mem_addr] := a[63:0] + + + SSE2 +
immintrin.h
+ Store +
+ + + + + Store 32-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+31:mem_addr] := a[31:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + + Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*8 + IF mask[i+7] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Store +
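The masked byte store above writes only the lanes whose mask byte has its top bit set. A Rust sketch under the assumption that this entry is _mm_maskmoveu_si128:

use std::arch::x86_64::*; // x86_64 only; SSE2 baseline

fn main() {
    unsafe {
        let data = _mm_set1_epi8(7);
        // Only even-indexed lanes have the high bit (0x80) set in the mask.
        let mask = _mm_setr_epi8(
            -128, 0, -128, 0, -128, 0, -128, 0,
            -128, 0, -128, 0, -128, 0, -128, 0,
        );
        let mut out = [0i8; 16];
        // Bytes whose mask bit is clear are left untouched in memory.
        _mm_maskmoveu_si128(data, mask, out.as_mut_ptr());
        assert_eq!(out, [7, 0, 7, 0, 7, 0, 7, 0, 7, 0, 7, 0, 7, 0, 7, 0]);
    }
}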
+ + + + + Store 128-bits of integer data from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 128-bits of integer data from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 64-bit integer from the first element of "a" into memory. + +MEM[mem_addr+63:mem_addr] := a[63:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 128-bits of integer data from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 32-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated. + +MEM[mem_addr+31:mem_addr] := a[31:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 64-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated. + +MEM[mem_addr+63:mem_addr] := a[63:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store the lower double-precision (64-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+63:mem_addr] := a[63:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+63:mem_addr] := a[63:0] +MEM[mem_addr+127:mem_addr+64] := a[63:0] + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+63:mem_addr] := a[63:0] +MEM[mem_addr+127:mem_addr+64] := a[63:0] + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 2 double-precision (64-bit) floating-point elements from "a" into memory in reverse order. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+63:mem_addr] := a[127:64] +MEM[mem_addr+127:mem_addr+64] := a[63:0] + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store the upper double-precision (64-bit) floating-point element from "a" into memory. + +MEM[mem_addr+63:mem_addr] := a[127:64] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store the lower double-precision (64-bit) floating-point element from "a" into memory. + +MEM[mem_addr+63:mem_addr] := a[63:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := a[i+7:i] + b[i+7:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := a[i+15:i] + b[i+15:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add 64-bit integers "a" and "b", and store the result in "dst". + +dst[63:0] := a[63:0] + b[63:0] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
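To make the difference between the plain and the saturating byte adds above concrete, a Rust sketch (assuming these entries are _mm_add_epi8, _mm_adds_epi8 and _mm_adds_epu8 in core::arch::x86_64):

use std::arch::x86_64::*; // x86_64 only; SSE2 baseline

fn main() {
    unsafe {
        let a = _mm_set1_epi8(100);
        let b = _mm_set1_epi8(100);

        let wrapped = _mm_add_epi8(a, b);  // 100 + 100 wraps to -56 as i8
        let sat_i8 = _mm_adds_epi8(a, b);  // signed saturation clamps to 127
        let sat_u8 = _mm_adds_epu8(a, b);  // unsigned saturation: 200 fits, no clamp

        let (mut w, mut s, mut u) = ([0i8; 16], [0i8; 16], [0u8; 16]);
        _mm_storeu_si128(w.as_mut_ptr() as *mut __m128i, wrapped);
        _mm_storeu_si128(s.as_mut_ptr() as *mut __m128i, sat_i8);
        _mm_storeu_si128(u.as_mut_ptr() as *mut __m128i, sat_u8);
        assert_eq!(w, [-56i8; 16]);
        assert_eq!(s, [127i8; 16]);
        assert_eq!(u, [200u8; 16]);
    }
}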
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
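The high/low 16-bit multiplies above are commonly paired to recover the full 32-bit products; a Rust sketch assuming they are _mm_mulhi_epi16 and _mm_mullo_epi16 (with _mm_unpacklo_epi16 used to interleave the halves):

use std::arch::x86_64::*; // x86_64 only; SSE2 baseline

fn main() {
    unsafe {
        let a = _mm_set1_epi16(1000);
        let b = _mm_set1_epi16(1000);

        let lo = _mm_mullo_epi16(a, b); // low 16 bits of each 32-bit product
        let hi = _mm_mulhi_epi16(a, b); // high 16 bits of each 32-bit product

        // Interleave low/high halves to rebuild the 32-bit products of the low 4 lanes.
        let full_lo = _mm_unpacklo_epi16(lo, hi);
        let mut out = [0i32; 4];
        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, full_lo);
        assert_eq!(out, [1_000_000i32; 4]); // 1000 * 1000
    }
}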
+ + + + + Multiply the low unsigned 32-bit integers from "a" and "b", and store the unsigned 64-bit result in "dst". + +dst[63:0] := a[31:0] * b[31:0] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+31:i] * b[i+31:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + Miscellaneous + + + + Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". + +FOR j := 0 to 15 + i := j*8 + tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +ENDFOR +FOR j := 0 to 1 + i := j*64 + dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \ + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] + dst[i+63:i+16] := 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
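A Rust sketch of the sum-of-absolute-differences operation above (my assumption: _mm_sad_epu8), the usual building block for SIMD byte-wise distance metrics:

use std::arch::x86_64::*; // x86_64 only; SSE2 baseline

fn main() {
    unsafe {
        let a = _mm_set1_epi8(9);
        let b = _mm_set1_epi8(4);

        // Each 64-bit half receives the sum of 8 absolute byte differences.
        let sad = _mm_sad_epu8(a, b);
        let mut out = [0u64; 2];
        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, sad);
        // |9 - 4| = 5, summed over 8 bytes per half = 40.
        assert_eq!(out, [40, 40]);
    }
}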
+ + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := a[i+7:i] - b[i+7:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := a[i+15:i] - b[i+15:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract 64-bit integer "b" from 64-bit integer "a", and store the result in "dst". + +dst[63:0] := a[63:0] - b[63:0] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := a[63:0] + b[63:0] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := a[63:0] / b[63:0] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + dst[i+63:i] := a[i+63:i] / b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := a[63:0] * b[63:0] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] * b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := a[63:0] - b[63:0] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +ENDFOR + + + SSE2 +
emmintrin.h
+ Probability/Statistics +
+ + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +ENDFOR + + + SSE2 +
emmintrin.h
+ Probability/Statistics +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [max_float_note] + +dst[63:0] := MAX(a[63:0], b[63:0]) +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [min_float_note] + +dst[63:0] := MIN(a[63:0], b[63:0]) +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] << (tmp*8) + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] << (tmp*8) + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] >> (tmp*8) + + + SSE2 +
emmintrin.h
+ Shift +
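The whole-register byte shifts above take an immediate count; in Rust that immediate is a const generic parameter. A sketch assuming these entries are _mm_slli_si128 / _mm_srli_si128 (and their bslli/bsrli aliases):

use std::arch::x86_64::*; // x86_64 only; SSE2 baseline

fn main() {
    unsafe {
        // Bytes 0..=15 in memory order.
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

        let left = _mm_slli_si128::<4>(a);  // shift in 4 zero bytes at the low end
        let right = _mm_srli_si128::<4>(a); // drop the 4 lowest bytes, zero-fill the top

        let (mut l, mut r) = ([0i8; 16], [0i8; 16]);
        _mm_storeu_si128(l.as_mut_ptr() as *mut __m128i, left);
        _mm_storeu_si128(r.as_mut_ptr() as *mut __m128i, right);
        assert_eq!(l, [0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
        assert_eq!(r, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0]);
    }
}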
+ + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] >> (tmp*8) + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[127:0] := (a[127:0] AND b[127:0]) + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 128 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". + +dst[127:0] := ((NOT a[127:0]) AND b[127:0]) + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[127:0] := (a[127:0] OR b[127:0]) + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[127:0] := (a[127:0] XOR b[127:0]) + + + SSE2 +
emmintrin.h
+ Logical +
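The four bitwise operations above combine into the classic branchless select idiom, dst = (mask AND a) OR ((NOT mask) AND b); a Rust sketch assuming _mm_and_si128, _mm_andnot_si128 and _mm_or_si128:

use std::arch::x86_64::*; // x86_64 only; SSE2 baseline

// Select a where the mask bits are set, b where they are clear.
unsafe fn select(mask: __m128i, a: __m128i, b: __m128i) -> __m128i {
    unsafe { _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b)) }
}

fn main() {
    unsafe {
        let a = _mm_set1_epi32(1);
        let b = _mm_set1_epi32(2);
        // All-ones in lanes 0 and 2, zeros elsewhere (memory order).
        let mask = _mm_setr_epi32(-1, 0, -1, 0);

        let mut out = [0i32; 4];
        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, select(mask, a, b));
        assert_eq!(out, [1, 2, 1, 2]);
    }
}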
+ + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtb instruction with the order of the operands switched. + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := ( a[i+7:i] < b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtw instruction with the order of the operands switched. + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ( a[i+15:i] < b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtd instruction with the order of the operands switched. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
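The equality compares above produce per-lane all-ones/all-zeros masks, which are usually reduced with the byte movemask listed further down in this file. A Rust sketch of the common "find a byte in a 16-byte block" pattern, assuming the entries correspond to _mm_cmpeq_epi8 and _mm_movemask_epi8:

use std::arch::x86_64::*; // x86_64 only; SSE2 baseline

// Return the index of the first occurrence of `needle` in a 16-byte block.
unsafe fn find_byte(block: &[u8; 16], needle: u8) -> Option<usize> {
    unsafe {
        let v = _mm_loadu_si128(block.as_ptr() as *const __m128i);
        let eq = _mm_cmpeq_epi8(v, _mm_set1_epi8(needle as i8)); // 0xFF where equal
        let mask = _mm_movemask_epi8(eq); // one bit per byte lane
        if mask != 0 {
            Some(mask.trailing_zeros() as usize)
        } else {
            None
        }
    }
}

fn main() {
    let block = *b"abcdefghijklmnop";
    unsafe {
        assert_eq!(find_byte(&block, b'f'), Some(5));
        assert_eq!(find_byte(&block, b'z'), None);
    }
}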
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] == b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] < b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] <= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] > b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] >= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + dst[63:0] := (a[63:0] != NaN AND b[63:0] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + dst[63:0] := (a[63:0] == NaN OR b[63:0] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] != b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (!(a[63:0] < b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (!(a[63:0] <= b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (!(a[63:0] > b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (!(a[63:0] >= b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] == b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] < b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] <= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] > b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] >= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] != b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (!(a[i+63:i] < b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (!(a[i+63:i] <= b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (!(a[i+63:i] > b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (!(a[i+63:i] >= b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] == b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] < b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] <= b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] > b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] >= b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). + RETURN ( a[63:0] == NaN OR b[63:0] == NaN OR a[63:0] != b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] == b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] < b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] <= b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] > b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] >= b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[63:0] == NaN OR b[63:0] == NaN OR a[63:0] != b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + m := j*64 + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + + Convert the signed 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int32_To_FP64(b[31:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + m := j*64 + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper elements of "dst". + +dst[31:0] := a[31:0] +dst[127:32] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element. + +dst[63:0] := a[63:0] +dst[127:64] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element. + +dst[63:0] := a[63:0] +dst[127:64] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy the lower 32-bit integer in "a" to "dst". + +dst[31:0] := a[31:0] + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy the lower 64-bit integer in "a" to "dst". + +dst[63:0] := a[63:0] + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy the lower 64-bit integer in "a" to "dst". + +dst[63:0] := a[63:0] + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) +ENDFOR +dst[127:64] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 32*j + dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + +dst[31:0] := Convert_FP64_To_Int32(a[63:0]) + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_Int64(a[63:0]) + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_Int64(a[63:0]) + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_FP64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy the lower double-precision (64-bit) floating-point element of "a" to "dst". + +dst[63:0] := a[63:0] + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
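The paired conversions above differ in rounding: the plain forms use the current rounding mode (round-to-nearest-even by default), while the "with truncation" forms always round toward zero. A Rust sketch assuming the packed pair is _mm_cvtpd_epi32 / _mm_cvttpd_epi32:

use std::arch::x86_64::*; // x86_64 only; SSE2 baseline

fn main() {
    unsafe {
        let a = _mm_set_pd(-1.5, 2.5); // low = 2.5, high = -1.5

        let rounded = _mm_cvtpd_epi32(a);    // round-to-nearest-even: 2, -2
        let truncated = _mm_cvttpd_epi32(a); // toward zero: 2, -1

        let (mut r, mut t) = ([0i32; 4], [0i32; 4]);
        _mm_storeu_si128(r.as_mut_ptr() as *mut __m128i, rounded);
        _mm_storeu_si128(t.as_mut_ptr() as *mut __m128i, truncated);
        // The upper two lanes of the result are zeroed by the instruction.
        assert_eq!(r, [2, -2, 0, 0]);
        assert_eq!(t, [2, -1, 0, 0]);
    }
}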
+ + + + + Set packed 64-bit integers in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + Set packed 64-bit integers in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + Set packed 32-bit integers in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + + + + + Set packed 16-bit integers in "dst" with the supplied values. + +dst[15:0] := e0 +dst[31:16] := e1 +dst[47:32] := e2 +dst[63:48] := e3 +dst[79:64] := e4 +dst[95:80] := e5 +dst[111:96] := e6 +dst[127:112] := e7 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values. + +dst[7:0] := e0 +dst[15:8] := e1 +dst[23:16] := e2 +dst[31:24] := e3 +dst[39:32] := e4 +dst[47:40] := e5 +dst[55:48] := e6 +dst[63:56] := e7 +dst[71:64] := e8 +dst[79:72] := e9 +dst[87:80] := e10 +dst[95:88] := e11 +dst[103:96] := e12 +dst[111:104] := e13 +dst[119:112] := e14 +dst[127:120] := e15 + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast 64-bit integer "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastq". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastd". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast 16-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastw". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastb". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + + Set packed 64-bit integers in "dst" with the supplied values in reverse order. + +dst[63:0] := e1 +dst[127:64] := e0 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + Set packed 32-bit integers in "dst" with the supplied values in reverse order. + +dst[31:0] := e3 +dst[63:32] := e2 +dst[95:64] := e1 +dst[127:96] := e0 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + + + + + Set packed 16-bit integers in "dst" with the supplied values in reverse order. + +dst[15:0] := e7 +dst[31:16] := e6 +dst[47:32] := e5 +dst[63:48] := e4 +dst[79:64] := e3 +dst[95:80] := e2 +dst[111:96] := e1 +dst[127:112] := e0 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values in reverse order. + +dst[7:0] := e15 +dst[15:8] := e14 +dst[23:16] := e13 +dst[31:24] := e12 +dst[39:32] := e11 +dst[47:40] := e10 +dst[55:48] := e9 +dst[63:56] := e8 +dst[71:64] := e7 +dst[79:72] := e6 +dst[87:80] := e5 +dst[95:88] := e4 +dst[103:96] := e3 +dst[111:104] := e2 +dst[119:112] := e1 +dst[127:120] := e0 + + SSE2 +
emmintrin.h
+ Set +
+ + + Return vector of type __m128i with all elements set to zero. + +dst[MAX:0] := 0 + + + SSE2 +
emmintrin.h
+ Set +
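The set/setr constructors above differ only in argument order: the set forms take the most significant element first, the setr forms take memory order. A quick Rust sketch assuming _mm_set_epi32 and _mm_setr_epi32:

use std::arch::x86_64::*; // x86_64 only; SSE2 baseline

fn main() {
    unsafe {
        let hi_first = _mm_set_epi32(3, 2, 1, 0);   // last argument is lane 0
        let mem_order = _mm_setr_epi32(0, 1, 2, 3); // first argument is lane 0

        let (mut a, mut b) = ([0i32; 4], [0i32; 4]);
        _mm_storeu_si128(a.as_mut_ptr() as *mut __m128i, hi_first);
        _mm_storeu_si128(b.as_mut_ptr() as *mut __m128i, mem_order);
        // Both spell the same vector: lanes 0..3 hold 0, 1, 2, 3.
        assert_eq!(a, b);
        assert_eq!(a, [0, 1, 2, 3]);
    }
}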
+ + + + Copy double-precision (64-bit) floating-point element "a" to the lower element of "dst", and zero the upper element. + +dst[63:0] := a[63:0] +dst[127:64] := 0 + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst[63:0] := e1 +dst[127:64] := e0 + + SSE2 +
emmintrin.h
+ Set +
+ + + + Return vector of type __m128d with all elements set to zero. + +dst[MAX:0] := 0 + + + SSE2 +
emmintrin.h
+ Set +
+ + + + Copy the lower 64-bit integer in "a" to "dst". + +dst[63:0] := a[63:0] + + + SSE2 +
emmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". + +dst[7:0] := Saturate8(a[15:0]) +dst[15:8] := Saturate8(a[31:16]) +dst[23:16] := Saturate8(a[47:32]) +dst[31:24] := Saturate8(a[63:48]) +dst[39:32] := Saturate8(a[79:64]) +dst[47:40] := Saturate8(a[95:80]) +dst[55:48] := Saturate8(a[111:96]) +dst[63:56] := Saturate8(a[127:112]) +dst[71:64] := Saturate8(b[15:0]) +dst[79:72] := Saturate8(b[31:16]) +dst[87:80] := Saturate8(b[47:32]) +dst[95:88] := Saturate8(b[63:48]) +dst[103:96] := Saturate8(b[79:64]) +dst[111:104] := Saturate8(b[95:80]) +dst[119:112] := Saturate8(b[111:96]) +dst[127:120] := Saturate8(b[127:112]) + + + SSE2 +
emmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". + +dst[15:0] := Saturate16(a[31:0]) +dst[31:16] := Saturate16(a[63:32]) +dst[47:32] := Saturate16(a[95:64]) +dst[63:48] := Saturate16(a[127:96]) +dst[79:64] := Saturate16(b[31:0]) +dst[95:80] := Saturate16(b[63:32]) +dst[111:96] := Saturate16(b[95:64]) +dst[127:112] := Saturate16(b[127:96]) + + + SSE2 +
emmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". + +dst[7:0] := SaturateU8(a[15:0]) +dst[15:8] := SaturateU8(a[31:16]) +dst[23:16] := SaturateU8(a[47:32]) +dst[31:24] := SaturateU8(a[63:48]) +dst[39:32] := SaturateU8(a[79:64]) +dst[47:40] := SaturateU8(a[95:80]) +dst[55:48] := SaturateU8(a[111:96]) +dst[63:56] := SaturateU8(a[127:112]) +dst[71:64] := SaturateU8(b[15:0]) +dst[79:72] := SaturateU8(b[31:16]) +dst[87:80] := SaturateU8(b[47:32]) +dst[95:88] := SaturateU8(b[63:48]) +dst[103:96] := SaturateU8(b[79:64]) +dst[111:104] := SaturateU8(b[95:80]) +dst[119:112] := SaturateU8(b[111:96]) +dst[127:120] := SaturateU8(b[127:112]) + + + SSE2 +
emmintrin.h
+ Miscellaneous +
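The three pack operations above narrow 16- or 32-bit lanes with signed or unsigned saturation; a Rust sketch of the signed-to-unsigned byte pack (assuming it is _mm_packus_epi16), which is the usual last step when converting widened pixel math back to u8:

use std::arch::x86_64::*; // x86_64 only; SSE2 baseline

fn main() {
    unsafe {
        // Values outside 0..=255 get clamped by the unsigned saturation.
        let a = _mm_setr_epi16(-5, 0, 100, 300, 255, 256, 1000, -1);
        let b = _mm_set1_epi16(128);

        let packed = _mm_packus_epi16(a, b); // low 8 bytes from a, high 8 from b
        let mut out = [0u8; 16];
        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, packed);
        assert_eq!(
            out,
            [0, 0, 100, 255, 255, 255, 255, 0, 128, 128, 128, 128, 128, 128, 128, 128]
        );
    }
}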
+ + + + Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[j] := a[i+7] +ENDFOR +dst[MAX:16] := 0 + + + SSE2 +
emmintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a". + +FOR j := 0 to 1 + i := j*64 + IF a[i+63] + dst[j] := 1 + ELSE + dst[j] := 0 + FI +ENDFOR +dst[MAX:2] := 0 + + + SSE2 +
emmintrin.h
+ Miscellaneous +
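A short sketch of the two movemask forms above, which collapse the per-element sign bits into an integer bit mask; the chosen lanes and asserts are illustrative:

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    unsafe {
        // Bytes with the sign bit set in lanes 0, 2 and 15.
        let v = _mm_setr_epi8(-1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128);
        assert_eq!(_mm_movemask_epi8(v), 0b1000_0000_0000_0101);

        // For doubles only two sign bits are reported (dst[1:0]).
        let d = _mm_setr_pd(1.0, -1.0); // lane 1 is negative
        assert_eq!(_mm_movemask_pd(d), 0b10);
    }
}
#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```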
+ + + + Copy the 64-bit integer "a" to the lower element of "dst", and zero the upper element. + +dst[63:0] := a[63:0] +dst[127:64] := 0 + + + SSE2 +
emmintrin.h
+ Move +
+ + + + Copy the lower 64-bit integer in "a" to the lower element of "dst", and zero the upper element. + +dst[63:0] := a[63:0] +dst[127:64] := 0 + + + SSE2 +
emmintrin.h
+ Move +
+ + + + + Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := b[63:0] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Move +
+ + + + + Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". + +dst[15:0] := (a[127:0] >> (imm8[2:0] * 16))[15:0] +dst[31:16] := 0 + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". + +dst[127:0] := a[127:0] +sel := imm8[2:0]*16 +dst[sel+15:sel] := i[15:0] + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) + + + SSE2 +
emmintrin.h
+ Swizzle +
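The shuffle control can be read straight off the pseudocode above: each 2-bit field selects a source lane. A hedged sketch, assuming a reasonably recent toolchain where `std::arch` passes the immediate as a const generic:

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    unsafe {
        let a = _mm_setr_epi32(10, 20, 30, 40);
        // imm8 = 0b00_01_10_11 selects lanes 3,2,1,0, i.e. reverses the vector.
        let r = _mm_shuffle_epi32::<0b00_01_10_11>(a);
        let mut out = [0i32; 4];
        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
        assert_eq!(out, [40, 30, 20, 10]);
    }
}
#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```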
+ + + + + Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from "a" to "dst". + +dst[63:0] := a[63:0] +dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from "a" to "dst". + +dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
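The interleave pattern of the unpack entries is easiest to see with consecutive byte values; a small illustrative sketch:

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    unsafe {
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
        let lo = _mm_unpacklo_epi8(a, b); // a0,b0,a1,b1,...,a7,b7
        let hi = _mm_unpackhi_epi8(a, b); // a8,b8,...,a15,b15
        let mut out = [0i8; 16];
        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, lo);
        assert_eq!(out, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]);
        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, hi);
        assert_eq!(out, [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]);
    }
}
#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```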
+ + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst". + +dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := SQRT(b[63:0]) +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Elementary Math Functions +
+ + + + Cast vector of type __m128d to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + SSE2 +
emmintrin.h
+ Cast +
+ + + + Cast vector of type __m128d to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + SSE2 +
emmintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + SSE2 +
emmintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + SSE2 +
emmintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + SSE2 +
emmintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + SSE2 +
emmintrin.h
+ Cast +
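Because the cast entries only reinterpret bits, the IEEE-754 encoding survives a round trip; a minimal sketch (0x3F80_0000 is the standard single-precision encoding of 1.0, the rest of the scaffolding is illustrative):

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    unsafe {
        let ones = _mm_set1_ps(1.0);
        let as_ints = _mm_castps_si128(ones); // pure reinterpretation, no instruction
        assert_eq!(_mm_cvtsi128_si32(as_ints), 0x3F80_0000);
        // Casting back yields the original value bit for bit.
        assert_eq!(_mm_cvtss_f32(_mm_castsi128_ps(as_ints)), 1.0);
    }
}
#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```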
+ + + + + + + Alternatively add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + b[i+31:i] + FI +ENDFOR + + + SSE3 +
pmmintrin.h
+ Arithmetic +
+ + + + + Alternatively add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + b[i+63:i] + FI +ENDFOR + + + SSE3 +
pmmintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[63:0] := a[127:64] + a[63:0] +dst[127:64] := b[127:64] + b[63:0] + + + SSE3 +
pmmintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[31:0] := a[63:32] + a[31:0] +dst[63:32] := a[127:96] + a[95:64] +dst[95:64] := b[63:32] + b[31:0] +dst[127:96] := b[127:96] + b[95:64] + + + SSE3 +
pmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[63:0] := a[63:0] - a[127:64] +dst[127:64] := b[63:0] - b[127:64] + + + SSE3 +
pmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[31:0] := a[31:0] - a[63:32] +dst[63:32] := a[95:64] - a[127:96] +dst[95:64] := b[31:0] - b[63:32] +dst[127:96] := b[95:64] - b[127:96] + + + SSE3 +
pmmintrin.h
+ Arithmetic +
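A small SSE3 sketch of the horizontal add/subtract entries; the runtime feature check and sample values are illustrative:

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("sse3") {
        return; // skip silently on CPUs without SSE3
    }
    unsafe {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(10.0, 20.0);
        let sum = _mm_hadd_pd(a, b); // (1+2, 10+20)
        assert_eq!(_mm_cvtsd_f64(sum), 3.0);
        assert_eq!(_mm_cvtsd_f64(_mm_unpackhi_pd(sum, sum)), 30.0);

        let x = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let y = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
        let diff = _mm_hsub_ps(x, y); // (1-2, 3-4, 10-20, 30-40)
        let mut out = [0.0f32; 4];
        _mm_storeu_ps(out.as_mut_ptr(), diff);
        assert_eq!(out, [-1.0, -1.0, -10.0, -10.0]);
    }
}
#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```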
+ + + + Load 128-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm_loadu_si128" when the data crosses a cache line boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE3 +
pmmintrin.h
+ Load +
+ + + + Load a double-precision (64-bit) floating-point element from memory into both elements of "dst". + +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[127:64] := MEM[mem_addr+63:mem_addr] + + + SSE3 +
pmmintrin.h
+ Load +
+ + + + Duplicate the low double-precision (64-bit) floating-point element from "a", and store the results in "dst". + +dst[63:0] := a[63:0] +dst[127:64] := a[63:0] + + + SSE3 +
pmmintrin.h
+ Move +
+ + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". + +dst[31:0] := a[63:32] +dst[63:32] := a[63:32] +dst[95:64] := a[127:96] +dst[127:96] := a[127:96] + + + SSE3 +
pmmintrin.h
+ Move +
+ + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". + +dst[31:0] := a[31:0] +dst[63:32] := a[31:0] +dst[95:64] := a[95:64] +dst[127:96] := a[95:64] + + + SSE3 +
pmmintrin.h
+ Move +
+ + + + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF imm8[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF imm8[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF mask[i+63] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF mask[i+31] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + IF mask[i+7] + dst[i+7:i] := b[i+7:i] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Blend packed 16-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF imm8[j] + dst[i+15:i] := b[i+15:i] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Swizzle +
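A sketch of the variable and immediate blends described above; the mask values and expected lanes are my choices (requires SSE4.1):

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("sse4.1") {
        return;
    }
    unsafe {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
        let mut out = [0.0f32; 4];

        // Variable blend: a negative mask lane (sign bit set) selects from "b".
        let mask = _mm_setr_ps(-1.0, 1.0, -1.0, 1.0);
        _mm_storeu_ps(out.as_mut_ptr(), _mm_blendv_ps(a, b, mask));
        assert_eq!(out, [10.0, 2.0, 30.0, 4.0]);

        // Immediate blend: bits 0 and 2 of the control select lanes 0 and 2 from "b".
        _mm_storeu_ps(out.as_mut_ptr(), _mm_blend_ps::<0b0101>(a, b));
        assert_eq!(out, [10.0, 2.0, 30.0, 4.0]);
    }
}
#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```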
+ + + + + Extract a single-precision (32-bit) floating-point element from "a", selected with "imm8", and store the result in "dst". + +dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + Extract an 8-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". + +dst[7:0] := (a[127:0] >> (imm8[3:0] * 8))[7:0] +dst[31:8] := 0 + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + Extract a 32-bit integer from "a", selected with "imm8", and store the result in "dst". + +dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + Extract a 64-bit integer from "a", selected with "imm8", and store the result in "dst". + +dst[63:0] := (a[127:0] >> (imm8[0] * 64))[63:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "tmp", then insert a single-precision (32-bit) floating-point element from "b" into "tmp" using the control in "imm8". Store "tmp" to "dst" using the mask in "imm8" (elements are zeroed out when the corresponding bit is set). + +tmp2[127:0] := a[127:0] +CASE (imm8[7:6]) OF +0: tmp1[31:0] := b[31:0] +1: tmp1[31:0] := b[63:32] +2: tmp1[31:0] := b[95:64] +3: tmp1[31:0] := b[127:96] +ESAC +CASE (imm8[5:4]) OF +0: tmp2[31:0] := tmp1[31:0] +1: tmp2[63:32] := tmp1[31:0] +2: tmp2[95:64] := tmp1[31:0] +3: tmp2[127:96] := tmp1[31:0] +ESAC +FOR j := 0 to 3 + i := j*32 + IF imm8[j%8] + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := tmp2[i+31:i] + FI +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the lower 8-bit integer from "i" into "dst" at the location specified by "imm8". + +dst[127:0] := a[127:0] +sel := imm8[3:0]*8 +dst[sel+7:sel] := i[7:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "imm8". + +dst[127:0] := a[127:0] +sel := imm8[1:0]*32 +dst[sel+31:sel] := i[31:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "imm8". + +dst[127:0] := a[127:0] +sel := imm8[0]*64 +dst[sel+63:sel] := i[63:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Conditionally multiply the packed double-precision (64-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". + +DEFINE DP(a[127:0], b[127:0], imm8[7:0]) { + FOR j := 0 to 1 + i := j*64 + IF imm8[(4+j)%8] + temp[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + temp[i+63:i] := 0.0 + FI + ENDFOR + + sum[63:0] := temp[127:64] + temp[63:0] + + FOR j := 0 to 1 + i := j*64 + IF imm8[j%8] + tmpdst[i+63:i] := sum[63:0] + ELSE + tmpdst[i+63:i] := 0.0 + FI + ENDFOR + RETURN tmpdst[127:0] +} +dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0]) + + + SSE4.1 +
smmintrin.h
+ Arithmetic +
+ + + + + + Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". + +DEFINE DP(a[127:0], b[127:0], imm8[7:0]) { + FOR j := 0 to 3 + i := j*32 + IF imm8[(4+j)%8] + temp[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + temp[i+31:i] := 0 + FI + ENDFOR + + sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0]) + + FOR j := 0 to 3 + i := j*32 + IF imm8[j%8] + tmpdst[i+31:i] := sum[31:0] + ELSE + tmpdst[i+31:i] := 0 + FI + ENDFOR + RETURN tmpdst[127:0] +} +dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0]) + + + SSE4.1 +
smmintrin.h
+ Arithmetic +
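A sketch of the dot-product control byte (high nibble selects the lanes to multiply, low nibble the lanes that receive the sum); inputs and the expected value are illustrative, SSE4.1 required:

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("sse4.1") {
        return;
    }
    unsafe {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
        // imm8 = 0xF1: multiply all four lanes, write the sum only to lane 0.
        let dot = _mm_dp_ps::<0xF1>(a, b); // 10 + 40 + 90 + 160 = 300
        assert_eq!(_mm_cvtss_f32(dot), 300.0);
    }
}
#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```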
+ + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*32 + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Arithmetic +
+ + Miscellaneous + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". + Eight SADs are performed using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8". + +DEFINE MPSADBW(a[127:0], b[127:0], imm8[2:0]) { + a_offset := imm8[2]*32 + b_offset := imm8[1:0]*32 + FOR j := 0 to 7 + i := j*8 + k := a_offset+i + l := b_offset + tmp[i*2+15:i*2] := ABS(Signed(a[k+7:k] - b[l+7:l])) + ABS(Signed(a[k+15:k+8] - b[l+15:l+8])) + \ + ABS(Signed(a[k+23:k+16] - b[l+23:l+16])) + ABS(Signed(a[k+31:k+24] - b[l+31:l+24])) + ENDFOR + RETURN tmp[127:0] +} +dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0]) + + + SSE4.1 +
smmintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst". + [round_note] + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ROUND(a[i+63:i], rounding) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := FLOOR(a[i+63:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CEIL(a[i+63:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst". + [round_note] + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ROUND(a[i+31:i], rounding) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := FLOOR(a[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := CEIL(a[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
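A sketch contrasting floor/ceil with the explicit rounding mode from the entries above; note that `_MM_FROUND_TO_NEAREST_INT` rounds ties to even. Sample values are mine, SSE4.1 required:

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("sse4.1") {
        return;
    }
    unsafe {
        let v = _mm_setr_pd(1.7, -1.2);
        let mut out = [0.0f64; 2];
        _mm_storeu_pd(out.as_mut_ptr(), _mm_floor_pd(v));
        assert_eq!(out, [1.0, -2.0]);
        _mm_storeu_pd(out.as_mut_ptr(), _mm_ceil_pd(v));
        assert_eq!(out, [2.0, -1.0]);

        let halves = _mm_setr_pd(2.5, 3.5);
        let nearest = _mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(halves);
        _mm_storeu_pd(out.as_mut_ptr(), nearest);
        assert_eq!(out, [2.0, 4.0]); // ties go to the even neighbour
    }
}
#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```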
+ + + + + + Round the lower double-precision (64-bit) floating-point element in "b" using the "rounding" parameter, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := ROUND(b[63:0], rounding) +dst[127:64] := a[127:64] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the lower double-precision (64-bit) floating-point element in "b" down to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := FLOOR(b[63:0]) +dst[127:64] := a[127:64] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the lower double-precision (64-bit) floating-point element in "b" up to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := CEIL(b[63:0]) +dst[127:64] := a[127:64] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + + Round the lower single-precision (32-bit) floating-point element in "b" using the "rounding" parameter, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := ROUND(b[31:0], rounding) +dst[127:32] := a[127:32] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the lower single-precision (32-bit) floating-point element in "b" down to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := FLOOR(b[31:0]) +dst[127:32] := a[127:32] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the lower single-precision (32-bit) floating-point element in "b" up to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := CEIL(b[31:0]) +dst[127:32] := a[127:32] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + Miscellaneous + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". + +dst[15:0] := SaturateU16(a[31:0]) +dst[31:16] := SaturateU16(a[63:32]) +dst[47:32] := SaturateU16(a[95:64]) +dst[63:48] := SaturateU16(a[127:96]) +dst[79:64] := SaturateU16(b[31:0]) +dst[95:80] := SaturateU16(b[63:32]) +dst[111:96] := SaturateU16(b[95:64]) +dst[127:112] := SaturateU16(b[127:96]) + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + l := j*16 + dst[l+15:l] := SignExtend16(a[i+7:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 8*j + dst[i+31:i] := SignExtend32(a[k+7:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 8*j + dst[i+63:i] := SignExtend64(a[k+7:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 16*j + dst[i+31:i] := SignExtend32(a[k+15:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 16*j + dst[i+63:i] := SignExtend64(a[k+15:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 32*j + dst[i+63:i] := SignExtend64(a[k+31:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + l := j*16 + dst[l+15:l] := ZeroExtend16(a[i+7:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 8*j + dst[i+31:i] := ZeroExtend32(a[k+7:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 8*j + dst[i+63:i] := ZeroExtend64(a[k+7:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 16*j + dst[i+31:i] := ZeroExtend32(a[k+15:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 16*j + dst[i+63:i] := ZeroExtend64(a[k+15:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 32*j + dst[i+63:i] := ZeroExtend64(a[k+31:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
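Sign versus zero extension of the same byte, as a small illustrative sketch of the conversion entries above (SSE4.1):

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("sse4.1") {
        return;
    }
    unsafe {
        // Low four bytes are 0xC8 (200 unsigned, -56 signed), 1, 2, 3.
        let a = _mm_setr_epi8(200u8 as i8, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let mut out = [0i32; 4];

        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, _mm_cvtepu8_epi32(a));
        assert_eq!(out, [200, 1, 2, 3]); // zero extended

        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, _mm_cvtepi8_epi32(a));
        assert_eq!(out, [-56, 1, 2, 3]); // sign extended
    }
}
#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```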
+ + + + + Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value. + +IF ((a[127:0] AND b[127:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +IF (((NOT a[127:0]) AND b[127:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +RETURN ZF + + + SSE4.1 +
smmintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value. + +IF ((a[127:0] AND b[127:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +IF (((NOT a[127:0]) AND b[127:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +RETURN CF + + + SSE4.1 +
smmintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. + +IF ((a[127:0] AND b[127:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +IF (((NOT a[127:0]) AND b[127:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +IF (ZF == 0 && CF == 0) + dst := 1 +ELSE + dst := 0 +FI + + + SSE4.1 +
smmintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "mask", and return 1 if the result is zero, otherwise return 0. + +IF ((a[127:0] AND mask[127:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +dst := ZF + + + SSE4.1 +
smmintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "mask", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "mask", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. + +IF ((a[127:0] AND mask[127:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +IF (((NOT a[127:0]) AND mask[127:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +IF (ZF == 0 && CF == 0) + dst := 1 +ELSE + dst := 0 +FI + + + SSE4.1 +
smmintrin.h
+ Logical +
+ + + + Compute the bitwise NOT of "a" and then AND with a 128-bit vector containing all 1's, and return 1 if the result is zero, otherwise return 0. + +FOR j := 0 to 127 + tmp[j] := 1 +ENDFOR +IF (((NOT a[127:0]) AND tmp[127:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := CF + + + + SSE4.1 +
smmintrin.h
+ Logical +
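A sketch of the PTEST-style predicates above; the bit patterns are chosen to hit each flag case (SSE4.1):

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("sse4.1") {
        return;
    }
    unsafe {
        let a = _mm_set1_epi32(0b0101);
        let b = _mm_set1_epi32(0b1010);
        // a AND b is all zero, so ZF = 1 and the zero test returns 1.
        assert_eq!(_mm_testz_si128(a, b), 1);
        // (NOT a) AND a is all zero, so CF = 1: the second argument's bits
        // are all covered by the first.
        assert_eq!(_mm_testc_si128(a, a), 1);
        // a and c overlap but c also has bits outside a, so ZF = 0 and CF = 0
        // and the mixed-ones-zeros test returns 1.
        let c = _mm_set1_epi32(0b0111);
        assert_eq!(_mm_testnzc_si128(a, c), 1);
    }
}
#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```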
+ + + + Horizontally compute the minimum amongst the packed unsigned 16-bit integers in "a", store the minimum and index in "dst", and zero the remaining bits in "dst". + +index[2:0] := 0 +min[15:0] := a[15:0] +FOR j := 0 to 7 + i := j*16 + IF a[i+15:i] < min[15:0] + index[2:0] := j + min[15:0] := a[i+15:i] + FI +ENDFOR +dst[15:0] := min[15:0] +dst[18:16] := index[2:0] +dst[127:19] := 0 + + + SSE4.1 +
smmintrin.h
+ Miscellaneous +
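A sketch of the packed result layout described above (minimum value in word 0, index of its first occurrence in word 1); the input vector is illustrative, SSE4.1 required:

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("sse4.1") {
        return;
    }
    unsafe {
        let a = _mm_setr_epi16(9, 3, 7, 3, 10, 22, 5, 8);
        let r = _mm_minpos_epu16(a);
        assert_eq!(_mm_extract_epi16::<0>(r), 3); // the minimum value
        assert_eq!(_mm_extract_epi16::<1>(r), 1); // index of its first occurrence
    }
}
#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```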
+ + + + Load 128-bits of integer data from memory into "dst" using a non-temporal memory hint. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE4.1 +
smmintrin.h
+ Load +
+ + + + + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and store the generated mask in "dst". + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF a[m+size-1:m] == 0 + aInvalid := 1 + FI + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +bInvalid := 0 +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + IF bInvalid // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +IF imm8[6] // byte / word mask + FOR i := 0 to UpperBound + j := i*size + IF IntRes2[i] + dst[j+size-1:j] := (imm8[0] ? 0xFF : 0xFFFF) + ELSE + dst[j+size-1:j] := 0 + FI + ENDFOR +ELSE // bit mask + dst[UpperBound:0] := IntRes2[UpperBound:0] + dst[127:UpperBound+1] := 0 +FI + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and store the generated index in "dst". + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF a[m+size-1:m] == 0 + aInvalid := 1 + FI + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +bInvalid := 0 +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + IF bInvalid // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +IF imm8[6] // most significant bit + tmp := UpperBound + dst := tmp + DO WHILE ((tmp >= 0) AND a[tmp] == 0) + tmp := tmp - 1 + dst := tmp + OD +ELSE // least significant bit + tmp := 0 + dst := tmp + DO WHILE ((tmp <= UpperBound) AND a[tmp] == 0) + tmp := tmp + 1 + dst := tmp + OD +FI + + + SSE4.2 +
nmmintrin.h
+ String Compare +
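A small SSE4.2 sketch using the equal-each aggregation, which compares the operands position by position (so the result does not depend on operand order); the input strings, flag combination, and expected indices are my choices, while the `_SIDD_*` constants and `_mm_cmpistri` come from `std::arch::x86_64`:

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("sse4.2") {
        return;
    }
    unsafe {
        // 16-byte blocks with no NUL, so every character is "valid".
        let a = _mm_loadu_si128(b"ABCDEFGHIJKLMNOP".as_ptr() as *const __m128i);
        let b = _mm_loadu_si128(b"ABCDEFGHxJKLMNOP".as_ptr() as *const __m128i);
        // Negative polarity turns "equal" bits into "differs" bits, so the
        // returned index is the offset of the first mismatching byte.
        const FLAGS: i32 = _SIDD_UBYTE_OPS
            | _SIDD_CMP_EQUAL_EACH
            | _SIDD_NEGATIVE_POLARITY
            | _SIDD_LEAST_SIGNIFICANT;
        assert_eq!(_mm_cmpistri::<FLAGS>(a, b), 8); // first difference at byte 8
        assert_eq!(_mm_cmpistri::<FLAGS>(a, a), 16); // no difference found
    }
}
#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```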
+ + + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if any character in "b" was null, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +bInvalid := 0 +FOR j := 0 to UpperBound + n := j*size + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI +ENDFOR +dst := bInvalid + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if the resulting mask was non-zero, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF a[m+size-1:m] == 0 + aInvalid := 1 + FI + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +bInvalid := 0 +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + IF bInvalid // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +dst := (IntRes2 != 0) + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if any character in "a" was null, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +aInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + IF a[m+size-1:m] == 0 + aInvalid := 1 + FI +ENDFOR +dst := aInvalid + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns bit 0 of the resulting bit mask. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF a[m+size-1:m] == 0 + aInvalid := 1 + FI + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +bInvalid := 0 +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + IF bInvalid // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +dst := IntRes2[0] + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if "b" did not contain a null character and the resulting mask was zero, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF a[m+size-1:m] == 0 + aInvalid := 1 + FI + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +bInvalid := 0 +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + IF bInvalid // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +dst := (IntRes2 == 0) AND bInvalid + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and store the generated mask in "dst". + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF i == la + aInvalid := 1 + FI + IF j == lb + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF i >= lb // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +IF imm8[6] // byte / word mask + FOR i := 0 to UpperBound + j := i*size + IF IntRes2[i] + dst[j+size-1:j] := (imm8[0] ? 0xFF : 0xFFFF) + ELSE + dst[j+size-1:j] := 0 + FI + ENDFOR +ELSE // bit mask + dst[UpperBound:0] := IntRes2[UpperBound:0] + dst[127:UpperBound+1] := 0 +FI + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and store the generated index in "dst". + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF i == la + aInvalid := 1 + FI + IF j == lb + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF i >= lb // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +IF imm8[6] // most significant bit + tmp := UpperBound + dst := tmp + DO WHILE ((tmp >= 0) AND a[tmp] == 0) + tmp := tmp - 1 + dst := tmp + OD +ELSE // least significant bit + tmp := 0 + dst := tmp + DO WHILE ((tmp <= UpperBound) AND a[tmp] == 0) + tmp := tmp + 1 + dst := tmp + OD +FI + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if any character in "b" was null, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +dst := (lb <= UpperBound) + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if the resulting mask was non-zero, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF i == la + aInvalid := 1 + FI + IF j == lb + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF i >= lb // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +dst := (IntRes2 != 0) + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if any character in "a" was null, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +dst := (la <= UpperBound) + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns bit 0 of the resulting bit mask. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF i == la + aInvalid := 1 + FI + IF j == lb + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF i >= lb // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +dst := IntRes2[0] + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if "b" did not contain a null character and the resulting mask was zero, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF i == la + aInvalid := 1 + FI + IF j == lb + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF i >= lb // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +dst := (IntRes2 == 0) AND (lb > UpperBound) + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ( a[i+63:i] > b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE4.2 +
nmmintrin.h
+ Compare +
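For the 64-bit greater-than comparison just described, a small hedged sketch with the `_mm_cmpgt_epi64` binding follows; the lane values and the `demo` helper are made up purely for illustration.

```rust
// Hedged sketch: per-lane signed 64-bit greater-than producing all-ones/zero masks.
fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("sse4.2") {
            unsafe { demo() };
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.2")]
unsafe fn demo() {
    use std::arch::x86_64::*;

    let a = _mm_set_epi64x(7, -1); // lanes (low, high) = (-1, 7)
    let b = _mm_set_epi64x(3, 2); // lanes (low, high) = (2, 3)
    let mask = _mm_cmpgt_epi64(a, b);

    let mut out = [0i64; 2];
    _mm_storeu_si128(out.as_mut_ptr().cast(), mask);
    assert_eq!(out, [0, -1]); // -1 > 2 is false (0); 7 > 3 is true (all ones, i.e. -1)
}
```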
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 8-bit integer "v", and stores the result in "dst". + tmp1[7:0] := v[0:7] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[39:0] := tmp1[7:0] << 32 +tmp4[39:0] := tmp2[31:0] << 8 +tmp5[39:0] := tmp3[39:0] XOR tmp4[39:0] +tmp6[31:0] := MOD2(tmp5[39:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + SSE4.2 +
nmmintrin.h
+ Cryptography +
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 16-bit integer "v", and stores the result in "dst". + tmp1[15:0] := v[0:15] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[47:0] := tmp1[15:0] << 32 +tmp4[47:0] := tmp2[31:0] << 16 +tmp5[47:0] := tmp3[47:0] XOR tmp4[47:0] +tmp6[31:0] := MOD2(tmp5[47:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + SSE4.2 +
nmmintrin.h
+ Cryptography +
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 32-bit integer "v", and stores the result in "dst". + tmp1[31:0] := v[0:31] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[63:0] := tmp1[31:0] << 32 +tmp4[63:0] := tmp2[31:0] << 32 +tmp5[63:0] := tmp3[63:0] XOR tmp4[63:0] +tmp6[31:0] := MOD2(tmp5[63:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + SSE4.2 +
nmmintrin.h
+ Cryptography +
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 64-bit integer "v", and stores the result in "dst". + tmp1[63:0] := v[0:63] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[95:0] := tmp1[63:0] << 32 +tmp4[95:0] := tmp2[31:0] << 64 +tmp5[95:0] := tmp3[95:0] XOR tmp4[95:0] +tmp6[31:0] := MOD2(tmp5[95:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + SSE4.2 +
nmmintrin.h
+ Cryptography +
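The four CRC32 entries above chain naturally into a byte-stream checksum. The hedged sketch below folds a buffer one byte at a time with `_mm_crc32_u8`; starting from `!0` and inverting at the end is the common CRC-32C convention and is an assumption of this example, not something the intrinsic does for you.

```rust
// Hedged sketch: CRC-32C (Castagnoli) over a byte slice using the SSE4.2 intrinsic.
fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("sse4.2") {
            let crc = unsafe { crc32c(b"123456789") };
            assert_eq!(crc, 0xE3069283); // well-known CRC-32C check value
            println!("crc32c = {crc:#010x}");
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.2")]
unsafe fn crc32c(bytes: &[u8]) -> u32 {
    use std::arch::x86_64::_mm_crc32_u8;
    let mut crc: u32 = !0; // conventional initial value
    for &byte in bytes {
        crc = _mm_crc32_u8(crc, byte); // accumulate one byte into the running CRC
    }
    !crc // conventional final inversion
}
```

In practice one would consume 8-byte chunks with `_mm_crc32_u64` and only fall back to the byte form for the tail; the byte loop is kept here for clarity.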
+ + + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := ABS(Int(a[i+7:i])) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := ABS(a[i+7:i]) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := ABS(Int(a[i+15:i])) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ABS(a[i+15:i]) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := ABS(a[i+31:i]) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ABS(a[i+31:i]) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Special Math Functions +
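A small hedged sketch of the packed absolute value described above, using the `_mm_abs_epi16` binding with arbitrary input lanes. Note that, as the pseudocode implies, there is no saturation, so `i16::MIN` maps to itself.

```rust
// Hedged sketch: per-lane absolute value of signed 16-bit integers (SSSE3).
fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("ssse3") {
            unsafe { demo() };
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn demo() {
    use std::arch::x86_64::*;

    let a = _mm_set_epi16(-7, 6, -5, 4, -3, 2, -1, 0); // highest lane listed first
    let r = _mm_abs_epi16(a);

    let mut out = [0i16; 8];
    _mm_storeu_si128(out.as_mut_ptr().cast(), r);
    assert_eq!(out, [0, 1, 2, 3, 4, 5, 6, 7]); // lanes read back low to high
}
```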
+ + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[3:0] := b[i+3:i] + dst[i+7:i] := a[index*8+7:index*8] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Swizzle +
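The shuffle control described above is easy to misread, so here is a hedged sketch that reverses the 16 bytes of a register with the `_mm_shuffle_epi8` binding; the control-mask values are the assumption being illustrated.

```rust
// Hedged sketch: byte reversal with _mm_shuffle_epi8 (SSSE3 pshufb).
fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("ssse3") {
            unsafe { demo() };
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn demo() {
    use std::arch::x86_64::*;

    let data = *b"ABCDEFGHIJKLMNOP";
    let a = _mm_loadu_si128(data.as_ptr().cast());

    // Each control byte selects the source lane for that destination lane;
    // a control byte with its high bit set would zero the destination lane instead.
    let reverse = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let r = _mm_shuffle_epi8(a, reverse);

    let mut out = [0u8; 16];
    _mm_storeu_si128(out.as_mut_ptr().cast(), r);
    assert_eq!(&out, b"PONMLKJIHGFEDCBA");
}
```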
+ + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[2:0] := b[i+2:i] + dst[i+7:i] := a[index*8+7:index*8] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Swizzle +
+ + + + + + Concatenate 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". + +tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) +dst[127:0] := tmp[127:0] + + + SSSE3 +
tmmintrin.h
+ Miscellaneous +
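The concatenate-and-shift behaviour above is what makes `palignr` useful for realigning streaming loads. Below is a hedged sketch with the `_mm_alignr_epi8` binding (the byte shift is a const generic in current toolchains); the byte strings are illustrative only.

```rust
// Hedged sketch: extract 16 bytes starting 5 bytes into the (a:b) concatenation.
fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("ssse3") {
            unsafe { demo() };
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn demo() {
    use std::arch::x86_64::*;

    let low = *b"0123456789abcdef"; // operand "b": low 16 bytes of the concatenation
    let high = *b"ghijklmnopqrstuv"; // operand "a": high 16 bytes

    let a = _mm_loadu_si128(high.as_ptr().cast());
    let b = _mm_loadu_si128(low.as_ptr().cast());

    let r = _mm_alignr_epi8::<5>(a, b); // ((a:b) >> 5*8), low 128 bits kept

    let mut out = [0u8; 16];
    _mm_storeu_si128(out.as_mut_ptr().cast(), r);
    assert_eq!(&out, b"56789abcdefghijk"); // bytes 5..=20 of the 32-byte concatenation
}
```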
+ + + + + + Concatenate 8-byte blocks in "a" and "b" into a 16-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". + +tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8) +dst[63:0] := tmp[63:0] + + + SSSE3 +
tmmintrin.h
+ Miscellaneous +
+ + + + + Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". + +dst[15:0] := a[31:16] + a[15:0] +dst[31:16] := a[63:48] + a[47:32] +dst[47:32] := a[95:80] + a[79:64] +dst[63:48] := a[127:112] + a[111:96] +dst[79:64] := b[31:16] + b[15:0] +dst[95:80] := b[63:48] + b[47:32] +dst[111:96] := b[95:80] + b[79:64] +dst[127:112] := b[127:112] + b[111:96] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
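The lane layout of the horizontal add above (the low half of "dst" comes from "a", the high half from "b") is the part people usually trip over, so here is a hedged sketch with the `_mm_hadd_epi16` binding and made-up lane values.

```rust
// Hedged sketch: horizontal pairwise add of 16-bit lanes (SSSE3 phaddw).
fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("ssse3") {
            unsafe { demo() };
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn demo() {
    use std::arch::x86_64::*;

    let a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); // lanes low..high: 1, 2, ..., 8
    let b = _mm_set_epi16(80, 70, 60, 50, 40, 30, 20, 10);
    let r = _mm_hadd_epi16(a, b);

    let mut out = [0i16; 8];
    _mm_storeu_si128(out.as_mut_ptr().cast(), r);
    // low four results are pair sums from "a", high four are pair sums from "b"
    assert_eq!(out, [3, 7, 11, 15, 30, 70, 110, 150]);
}
```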
+ + + + + Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". + +dst[15:0] := Saturate16(a[31:16] + a[15:0]) +dst[31:16] := Saturate16(a[63:48] + a[47:32]) +dst[47:32] := Saturate16(a[95:80] + a[79:64]) +dst[63:48] := Saturate16(a[127:112] + a[111:96]) +dst[79:64] := Saturate16(b[31:16] + b[15:0]) +dst[95:80] := Saturate16(b[63:48] + b[47:32]) +dst[111:96] := Saturate16(b[95:80] + b[79:64]) +dst[127:112] := Saturate16(b[127:112] + b[111:96]) + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". + +dst[31:0] := a[63:32] + a[31:0] +dst[63:32] := a[127:96] + a[95:64] +dst[95:64] := b[63:32] + b[31:0] +dst[127:96] := b[127:96] + b[95:64] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". + +dst[15:0] := a[31:16] + a[15:0] +dst[31:16] := a[63:48] + a[47:32] +dst[47:32] := b[31:16] + b[15:0] +dst[63:48] := b[63:48] + b[47:32] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". + +dst[31:0] := a[63:32] + a[31:0] +dst[63:32] := b[63:32] + b[31:0] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". + +dst[15:0] := Saturate16(a[31:16] + a[15:0]) +dst[31:16] := Saturate16(a[63:48] + a[47:32]) +dst[47:32] := Saturate16(b[31:16] + b[15:0]) +dst[63:48] := Saturate16(b[63:48] + b[47:32]) + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". + +dst[15:0] := a[15:0] - a[31:16] +dst[31:16] := a[47:32] - a[63:48] +dst[47:32] := a[79:64] - a[95:80] +dst[63:48] := a[111:96] - a[127:112] +dst[79:64] := b[15:0] - b[31:16] +dst[95:80] := b[47:32] - b[63:48] +dst[111:96] := b[79:64] - b[95:80] +dst[127:112] := b[111:96] - b[127:112] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". + +dst[15:0] := Saturate16(a[15:0] - a[31:16]) +dst[31:16] := Saturate16(a[47:32] - a[63:48]) +dst[47:32] := Saturate16(a[79:64] - a[95:80]) +dst[63:48] := Saturate16(a[111:96] - a[127:112]) +dst[79:64] := Saturate16(b[15:0] - b[31:16]) +dst[95:80] := Saturate16(b[47:32] - b[63:48]) +dst[111:96] := Saturate16(b[79:64] - b[95:80]) +dst[127:112] := Saturate16(b[111:96] - b[127:112]) + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". + +dst[31:0] := a[31:0] - a[63:32] +dst[63:32] := a[95:64] - a[127:96] +dst[95:64] := b[31:0] - b[63:32] +dst[127:96] := b[95:64] - b[127:96] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". + +dst[15:0] := a[15:0] - a[31:16] +dst[31:16] := a[47:32] - a[63:48] +dst[47:32] := b[15:0] - b[31:16] +dst[63:48] := b[47:32] - b[63:48] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". + +dst[31:0] := a[31:0] - a[63:32] +dst[63:32] := b[31:0] - b[63:32] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". + +dst[15:0] := Saturate16(a[15:0] - a[31:16]) +dst[31:16] := Saturate16(a[47:32] - a[63:48]) +dst[47:32] := Saturate16(b[15:0] - b[31:16]) +dst[63:48] := Saturate16(b[47:32] - b[63:48]) + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
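The unsigned-times-signed pairing above is the core of many SIMD dot-product kernels. The hedged sketch below, using the `_mm_maddubs_epi16` binding, multiplies every unsigned byte of "a" (all 10s) by the signed byte pattern 2, 3 in "b" and adds each pair; the operand values are illustrative assumptions.

```rust
// Hedged sketch: u8 x i8 multiply with pairwise add into saturated i16 lanes.
fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("ssse3") {
            unsafe { demo() };
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn demo() {
    use std::arch::x86_64::*;

    let a = _mm_set1_epi8(10); // treated as unsigned bytes
    let b = _mm_set1_epi16(0x0302); // each byte pair is (2, 3), treated as signed bytes
    let r = _mm_maddubs_epi16(a, b); // per 16-bit lane: 10*3 + 10*2 = 50

    let mut out = [0i16; 8];
    _mm_storeu_si128(out.as_mut_ptr().cast(), r);
    assert_eq!(out, [50; 8]);
}
```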
+ + + + + Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". + +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
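The truncate/round/extract steps above amount to a round-to-nearest Q15 fixed-point multiply. A hedged sketch with the `_mm_mulhrs_epi16` binding follows; the Q15 constants are the assumption being illustrated.

```rust
// Hedged sketch: Q15 fixed-point multiply with rounding (SSSE3 pmulhrsw).
fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("ssse3") {
            unsafe { demo() };
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn demo() {
    use std::arch::x86_64::*;

    let half = _mm_set1_epi16(16384); // 0.5 in Q15
    let quarter = _mm_set1_epi16(8192); // 0.25 in Q15
    let r = _mm_mulhrs_epi16(half, quarter); // per lane: (((a*b) >> 14) + 1) >> 1

    let mut out = [0i16; 8];
    _mm_storeu_si128(out.as_mut_ptr().cast(), r);
    assert_eq!(out, [4096; 8]); // 0.125 in Q15
}
```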
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 15 + i := j*8 + IF b[i+7:i] < 0 + dst[i+7:i] := -(a[i+7:i]) + ELSE IF b[i+7:i] == 0 + dst[i+7:i] := 0 + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 7 + i := j*16 + IF b[i+15:i] < 0 + dst[i+15:i] := -(a[i+15:i]) + ELSE IF b[i+15:i] == 0 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 3 + i := j*32 + IF b[i+31:i] < 0 + dst[i+31:i] := -(a[i+31:i]) + ELSE IF b[i+31:i] == 0 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
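The three-way behaviour above (negate, zero, or pass through, driven entirely by the sign of "b") is shown below in a hedged sketch with the `_mm_sign_epi16` binding; the lane values are made up for illustration.

```rust
// Hedged sketch: conditional negation/zeroing with _mm_sign_epi16 (SSSE3 psignw).
fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("ssse3") {
            unsafe { demo() };
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn demo() {
    use std::arch::x86_64::*;

    let a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); // lanes low..high: 1..8
    let b = _mm_set_epi16(-1, 1, 0, -2, 2, 0, -3, 3); // lanes low..high: 3, -3, 0, 2, -2, 0, 1, -1
    let r = _mm_sign_epi16(a, b);

    let mut out = [0i16; 8];
    _mm_storeu_si128(out.as_mut_ptr().cast(), r);
    assert_eq!(out, [1, -2, 0, 4, -5, 0, 7, -8]);
}
```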
+ + + + + Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 7 + i := j*8 + IF b[i+7:i] < 0 + dst[i+7:i] := -(a[i+7:i]) + ELSE IF b[i+7:i] == 0 + dst[i+7:i] := 0 + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 3 + i := j*16 + IF b[i+15:i] < 0 + dst[i+15:i] := -(a[i+15:i]) + ELSE IF b[i+15:i] == 0 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 1 + i := j*32 + IF b[i+31:i] < 0 + dst[i+31:i] := -(a[i+31:i]) + ELSE IF b[i+31:i] == 0 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + + Copy the current 64-bit value of the processor's time-stamp counter into "dst". + dst[63:0] := TimeStampCounter + + + TSC +
immintrin.h
+ General Support +
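The time-stamp counter is exposed in Rust as `_rdtsc`. The hedged sketch below reads it around a trivial loop; note that RDTSC is not serializing and the tick rate is model-specific, so this is a rough delta, not a benchmark, and the loop body is purely illustrative.

```rust
// Hedged sketch: coarse TSC delta around some work.
fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        unsafe {
            use std::arch::x86_64::_rdtsc;
            use std::hint::black_box;

            let start = _rdtsc();
            let mut acc: u64 = 0;
            for i in 0..1_000u64 {
                acc = acc.wrapping_add(black_box(i * i)); // keep the loop from being optimized away
            }
            let end = _rdtsc();
            println!("acc = {acc}, roughly {} TSC ticks elapsed", end - start);
        }
    }
}
```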
+ + + + + Mark the start of a TSX (HLE/RTM) suspend load address tracking region. If this is used inside a transactional region, subsequent loads are not added to the read set of the transaction. If this is used inside a suspend load address tracking region it will cause transaction abort. If this is used outside of a transactional region it behaves like a NOP. + + TSXLDTRK +
immintrin.h
+ Miscellaneous +
+ + + Mark the end of a TSX (HLE/RTM) suspend load address tracking region. If this is used inside a suspend load address tracking region it will end the suspend region and all following load addresses will be added to the transaction read set. If this is used inside an active transaction but not in a suspend region it will cause transaction abort. If this is used outside of a transactional region it behaves like a NOP. + + TSXLDTRK +
immintrin.h
+ Miscellaneous +
+ + + + + + Clear the user interrupt flag (UIF). + + UINTR +
immintrin.h
+ General Support +
+ + + + Send user interprocessor interrupts specified in unsigned 64-bit integer "__a". + + UINTR +
immintrin.h
+ General Support +
+ + + + Sets the user interrupt flag (UIF). + + UINTR +
immintrin.h
+ General Support +
+ + + + Store the current user interrupt flag (UIF) in unsigned 8-bit integer "dst". + + UINTR +
immintrin.h
+ General Support +
+ + + + + Reads the contents of a 64-bit MSR specified in "__A" into "dst". + DEST := MSR[__A] + + + USER_MSR +
x86gprintrin.h
+ General Support +
+ + + + + Writes the contents of "__B" into the 64-bit MSR specified in "__A". + MSR[__A] := __B + + + USER_MSR +
x86gprintrin.h
+ General Support +
+ + + + + Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 1 + i := j*128 + a[i+127:i] := ShiftRows(a[i+127:i]) + a[i+127:i] := SubBytes(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:256] := 0 + + + VAES + AVX512VL +
immintrin.h
+ Cryptography +
+ + + + + Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 1 + i := j*128 + a[i+127:i] := ShiftRows(a[i+127:i]) + a[i+127:i] := SubBytes(a[i+127:i]) + a[i+127:i] := MixColumns(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:256] := 0 + + + VAES + AVX512VL +
immintrin.h
+ Cryptography +
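The 256-bit VAES form above simply applies the classic AES round to each 128-bit lane. The per-lane operation can be exercised through the long-standing 128-bit `_mm_aesenc_si128` binding, as in the hedged sketch below; the state and round-key values are arbitrary placeholders, not a real key schedule.

```rust
// Hedged sketch: one AES encryption round on a single 128-bit state (AES-NI).
fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("aes") {
            unsafe { demo() };
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "aes")]
unsafe fn demo() {
    use std::arch::x86_64::*;

    let state = _mm_set_epi64x(0x0f0e0d0c0b0a0908, 0x0706050403020100); // placeholder state
    let round_key = _mm_set1_epi32(0x1b1b1b1b); // placeholder round key, not from a key schedule

    // ShiftRows, SubBytes, MixColumns, then XOR with the round key.
    let next = _mm_aesenc_si128(state, round_key);

    let mut out = [0u8; 16];
    _mm_storeu_si128(out.as_mut_ptr().cast(), next);
    println!("state after one AES round: {out:02x?}");
}
```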
+ + + + + Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 1 + i := j*128 + a[i+127:i] := InvShiftRows(a[i+127:i]) + a[i+127:i] := InvSubBytes(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:256] := 0 + + + VAES + AVX512VL +
immintrin.h
+ Cryptography +
+ + + + + Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 1 + i := j*128 + a[i+127:i] := InvShiftRows(a[i+127:i]) + a[i+127:i] := InvSubBytes(a[i+127:i]) + a[i+127:i] := InvMixColumns(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:256] := 0 + + + VAES + AVX512VL +
immintrin.h
+ Cryptography +
+ + + + + + + + Carry-less multiplication of one quadword of + 'b' by one quadword of 'c', stores + the 128-bit result in 'dst'. The immediate 'Imm8' is + used to determine which quadwords of 'b' + and 'c' should be used. + +DEFINE PCLMUL128(X,Y) { + FOR i := 0 to 63 + TMP[i] := X[ 0 ] and Y[ i ] + FOR j := 1 to i + TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ]) + ENDFOR + DEST[ i ] := TMP[ i ] + ENDFOR + FOR i := 64 to 126 + TMP[i] := 0 + FOR j := i - 63 to 63 + TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ]) + ENDFOR + DEST[ i ] := TMP[ i ] + ENDFOR + DEST[127] := 0 + RETURN DEST // 128b vector +} +FOR i := 0 to 1 + IF Imm8[0] == 0 + TEMP1 := b.m128[i].qword[0] + ELSE + TEMP1 := b.m128[i].qword[1] + FI + IF Imm8[4] == 0 + TEMP2 := c.m128[i].qword[0] + ELSE + TEMP2 := c.m128[i].qword[1] + FI + dst.m128[i] := PCLMUL128(TEMP1, TEMP2) +ENDFOR +dst[MAX:256] := 0 + + + VPCLMULQDQ + AVX512VL +
immintrin.h
+ Application-Targeted +
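The quadword-selection logic above is identical to the original 128-bit PCLMULQDQ, just repeated per 128-bit lane. The hedged sketch below uses the `_mm_clmulepi64_si128` binding to show the underlying carry-less multiply; the tiny polynomials are chosen only so the result is easy to verify by hand.

```rust
// Hedged sketch: carry-less (GF(2)[x]) multiplication of two low quadwords.
fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("pclmulqdq") {
            unsafe { demo() };
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "pclmulqdq")]
unsafe fn demo() {
    use std::arch::x86_64::*;

    let a = _mm_set_epi64x(0, 0b101); // x^2 + 1
    let b = _mm_set_epi64x(0, 0b011); // x + 1

    // imm8 bit 0 selects the quadword of "a", bit 4 that of "b"; 0x00 takes both low halves.
    let r = _mm_clmulepi64_si128::<0x00>(a, b);

    let mut out = [0u64; 2];
    _mm_storeu_si128(out.as_mut_ptr().cast(), r);
    assert_eq!(out, [0b1111, 0]); // (x^2 + 1)(x + 1) = x^3 + x^2 + x + 1, no carries between bits
}
```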
+ + + + + + + + Carry-less multiplication of one quadword of + 'b' by one quadword of 'c', stores + the 128-bit result in 'dst'. The immediate 'Imm8' is + used to determine which quadwords of 'b' + and 'c' should be used. + +DEFINE PCLMUL128(X,Y) { + FOR i := 0 to 63 + TMP[i] := X[ 0 ] and Y[ i ] + FOR j := 1 to i + TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ]) + ENDFOR + DEST[ i ] := TMP[ i ] + ENDFOR + FOR i := 64 to 126 + TMP[i] := 0 + FOR j := i - 63 to 63 + TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ]) + ENDFOR + DEST[ i ] := TMP[ i ] + ENDFOR + DEST[127] := 0 + RETURN DEST // 128b vector +} +FOR i := 0 to 3 + IF Imm8[0] == 0 + TEMP1 := b.m128[i].qword[0] + ELSE + TEMP1 := b.m128[i].qword[1] + FI + IF Imm8[4] == 0 + TEMP2 := c.m128[i].qword[0] + ELSE + TEMP2 := c.m128[i].qword[1] + FI + dst.m128[i] := PCLMUL128(TEMP1, TEMP2) +ENDFOR +dst[MAX:512] := 0 + + + VPCLMULQDQ +
immintrin.h
+ Application-Targeted +
+ + + + + + + Directs the processor to enter an implementation-dependent optimized state until the TSC reaches or exceeds the value specified in "counter". Bit 0 of "ctrl" selects between a lower power (cleared) or faster wakeup (set) optimized state. Returns the carry flag (CF). If the processor that executed a UMWAIT instruction wakes due to the expiration of the operating system time limit, the instruction sets RFLAGS.CF; otherwise, that flag is cleared. + + WAITPKG +
immintrin.h
+ Miscellaneous +
+ + + + + Directs the processor to enter an implementation-dependent optimized state while monitoring a range of addresses. The instruction wakes up when the TSC reaches or exceeds the value specified in "counter" (if the monitoring hardware did not trigger beforehand). Bit 0 of "ctrl" selects between a lower power (cleared) or faster wakeup (set) optimized state. Returns the carry flag (CF). If the processor that executed a UMWAIT instruction wakes due to the expiration of the operating system time limit, the instruction sets RFLAGS.CF; otherwise, that flag is cleared. + + WAITPKG +
immintrin.h
+ Miscellaneous +
+ + + + Sets up a linear address range to be + monitored by hardware and activates the + monitor. The address range should be a writeback + memory caching type. The address is + contained in "a". + + WAITPKG +
immintrin.h
+ Miscellaneous +
+ + + + + + Write back and do not flush internal caches. + Initiate writing-back without flushing of external + caches. + + WBNOINVD +
immintrin.h
+ Miscellaneous +
+ + + + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE + XSAVEC +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE + XSAVEC +
immintrin.h
+ OS-Targeted +
+ + + + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE instruction. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + 2: mem_addr.EXT_SAVE_Area2[YMM] := ProcessorState[YMM] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE + XSAVEOPT +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE64 instruction. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + 2: mem_addr.EXT_SAVE_Area2[YMM] := ProcessorState[YMM] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE + XSAVEOPT +
immintrin.h
+ OS-Targeted +
+ + + + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE + XSS +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE + XSS +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. + st_mask := mem_addr.HEADER.XSTATE_BV[62:0] +FOR i := 0 to 62 + IF (rs_mask[i] AND XCR0[i]) + IF st_mask[i] + CASE (i) OF + 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] + 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] + DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] + ESAC + ELSE + // ProcessorExtendedState := Processor Supplied Values + CASE (i) OF + 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] + ESAC + FI + FI + i := i + 1 +ENDFOR + + + XSAVE + XSS +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. + st_mask := mem_addr.HEADER.XSTATE_BV[62:0] +FOR i := 0 to 62 + IF (rs_mask[i] AND XCR0[i]) + IF st_mask[i] + CASE (i) OF + 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] + 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] + DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] + ESAC + ELSE + // ProcessorExtendedState := Processor Supplied Values + CASE (i) OF + 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] + ESAC + FI + FI + i := i + 1 +ENDFOR + + + XSAVE + XSS +
immintrin.h
+ OS-Targeted +
+ + + + + + Copy up to 64 bits from the value of the extended control register (XCR) specified by "a" into "dst". Currently only XFEATURE_ENABLED_MASK XCR is supported. + dst[63:0] := XCR[a] + + + XSAVE +
immintrin.h
+ OS-Targeted +
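Reading XCR0 is how user code checks which register state the OS actually saves and restores (for example, before trusting AVX). A hedged sketch with the `_xgetbv` binding follows; the bit positions checked are the architectural SSE and AVX state components.

```rust
// Hedged sketch: read XCR0 with XGETBV and decode a couple of state-component bits.
fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("xsave") {
            unsafe {
                use std::arch::x86_64::_xgetbv;
                let xcr0 = _xgetbv(0); // 0 selects XCR0 (XFEATURE_ENABLED_MASK)
                let sse_state = xcr0 & (1 << 1) != 0; // bit 1: SSE (XMM) state
                let avx_state = xcr0 & (1 << 2) != 0; // bit 2: AVX (YMM) state
                println!("XCR0 = {xcr0:#x}, SSE state: {sse_state}, AVX state: {avx_state}");
            }
        }
    }
}
```

Note that `is_x86_feature_detected!` already folds this OS-support check into its answer; reading XCR0 directly is mostly useful in low-level initialization code.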
+ + + + + Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. + st_mask := mem_addr.HEADER.XSTATE_BV[62:0] +FOR i := 0 to 62 + IF (rs_mask[i] AND XCR0[i]) + IF st_mask[i] + CASE (i) OF + 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] + 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] + DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] + ESAC + ELSE + // ProcessorExtendedState := Processor Supplied Values + CASE (i) OF + 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] + ESAC + FI + FI + i := i + 1 +ENDFOR + + + XSAVE +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. + st_mask := mem_addr.HEADER.XSTATE_BV[62:0] +FOR i := 0 to 62 + IF (rs_mask[i] AND XCR0[i]) + IF st_mask[i] + CASE (i) OF + 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] + 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] + DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] + ESAC + ELSE + // ProcessorExtendedState := Processor Supplied Values + CASE (i) OF + 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] + ESAC + FI + FI + i := i + 1 +ENDFOR + + + XSAVE +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE +
immintrin.h
+ OS-Targeted +
+ + + + + Copy 64 bits from "val" to the extended control register (XCR) specified by "a". Currently only XFEATURE_ENABLED_MASK XCR is supported. + +XCR[a] := val[63:0] + + + XSAVE +
immintrin.h
+ OS-Targeted +
+ + +
\ No newline at end of file diff --git a/library/stdarch/examples/Cargo.toml b/library/stdarch/examples/Cargo.toml new file mode 100644 index 0000000000000..61184494e1573 --- /dev/null +++ b/library/stdarch/examples/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "stdarch_examples" +version = "0.0.0" +authors = [ + "Alex Crichton ", + "Andrew Gallant ", + "Gonzalo Brito Gadeschi ", +] +description = "Examples of the stdarch crate." +edition = "2024" +default-run = "hex" + +[dependencies] +core_arch = { path = "../crates/core_arch" } +std_detect = { path = "../crates/std_detect" } +quickcheck = "1.0" +rand = "0.8" + +[[bin]] +name = "hex" +path = "hex.rs" + +[[bin]] +name = "connect5" +path = "connect5.rs" + +[[example]] +name = "wasm" +crate-type = ["cdylib"] +path = "wasm.rs" diff --git a/library/stdarch/examples/connect5.rs b/library/stdarch/examples/connect5.rs new file mode 100644 index 0000000000000..2b451f45d71c0 --- /dev/null +++ b/library/stdarch/examples/connect5.rs @@ -0,0 +1,1244 @@ +//! Outer-Open Gomoku is a board game which is a enhanced version of connect5 (Gomoku).\ +//! The game is a two-player game which played on a 15x15 Go board.\ +//! Two players take turns placing a move on an empty intersection in this board.\ +//! The winner is the first player to form an unbroken chain of five moves horizontally, vertically, or diagonally.\ +//! Unlike Gomoku, the first move is required to be placed at the two outer rows or columns of this board.\ +//! This program provides an AI playing with Minimax search with alpha-beta pruning which uses +//! patterns on evaluation.\ +//! The avx512 intrinsic can do 32 pattern matching at one time.\ +//! This avx512 is tested with non-avx512 code to verify its correctness.\ +//! +//! On Intel i7-7800x using single thread with fixed AVX-512 clock at 4.0GHz, the avx512 is speed up about 9x.\ +//! The average time for each move in the avx512 is around 14.00s ± 1.31s and in the non-avx512 +//! is 129.02s ± 4.96s.\ +//! On Intel Tiger Lake i7-1165G7, the avx512 is around 11.11s ± 1.31s. +//! +//! Pattern Matching\ +//! Use 512-bit to present the board state. The location 0 is top left.\ +//! 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15\ +//! 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31\ +//! ...\ +//! Pattern "OOOOO" is matching through "0 1 2 3 4", "1 2 3 4 5", ...\ +//! Using avx512, "0 1 2 3 4", "16 17 18 19 20", ... can be matched simultaneously.\ +//! +//! //! You can test out this program via: +//! +//! cargo +nightly run --release --bin connect5 +//! +//! You should see a game self-playing. In the end of the game, it shows the average time for +//! each move. 
+ +#![allow(internal_features)] +#![cfg_attr(target_arch = "x86", feature(stdarch_internal))] +#![cfg_attr(target_arch = "x86_64", feature(stdarch_internal))] +#![feature(stmt_expr_attributes)] + +use rand::seq::SliceRandom; +use rand::thread_rng; + +use std::cmp; +use std::time::Instant; + +#[cfg(target_arch = "x86")] +use {core_arch::arch::x86::*, std_detect::is_x86_feature_detected}; +#[cfg(target_arch = "x86_64")] +use {core_arch::arch::x86_64::*, std_detect::is_x86_feature_detected}; + +// types + +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum Color { + Black = 0, + White = 1, + Empty = 2, + Border = 3, +} + +type Square = i32; +type Move = i32; +type Side = Color; + +// constants + +const FILE_SIZE: i32 = 15; +const RANK_SIZE: i32 = 15; +const SQUARE_SIZE: i32 = (FILE_SIZE + 1) * (FILE_SIZE + 4) + 16 + 4; + +const EVAL_INF: i32 = FILE_SIZE * RANK_SIZE * 100; +const MOVE_NONE: Move = -1; +const SCORE_NONE: i32 = -EVAL_INF - 1; + +/// DIRECTION 0: left to right\ +/// DIRECTION 1: top to bottom\ +/// DIRECTION 2: top left to bottom right\ +/// DIRECTION 3: top right to bottom left +#[rustfmt::skip] +#[allow(clippy::identity_op)] +const DIRECTION: [[i32; 5]; 4] = [ [1, 2, 3, 4, 5], + [1 * (FILE_SIZE + 1), 2 * (FILE_SIZE + 1), 3 * (FILE_SIZE + 1), 4 * (FILE_SIZE + 1), 5 * (FILE_SIZE + 1)], + [1 * (FILE_SIZE + 2), 2 * (FILE_SIZE + 2), 3 * (FILE_SIZE + 2), 4 * (FILE_SIZE + 2), 5 * (FILE_SIZE + 2)], + [1 * (FILE_SIZE + 0), 2 * (FILE_SIZE + 0), 3 * (FILE_SIZE + 0), 4 * (FILE_SIZE + 0), 5 * (FILE_SIZE + 0)]]; + +/// A table to encode each location to a value in bit 31-0 in the bitboard for 4 direction +#[rustfmt::skip] +const MAPMOVEVALUE: [[i32; 239]; 4] = [ [// Direction 0 + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17], + [// Direction 1 + 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 0, + 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 
1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 0, + 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 0, + 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 0, + 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 0, + 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 0, + 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 0, + 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 0, + 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 0, + 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 0, + 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 0, + 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 0, + 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 0, + 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 0, + 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17], + [// Direction 2 + 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 0, 0, 0, 0, 0, + 1<<15, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 0, 0, 0, 0, + 1<<15, 1<<14, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 0, 0, 0, + 1<<15, 1<<14, 1<<13, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 0, 0, + 1<<15, 1<<14, 1<<13, 1<<12, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 0, + 1<<15, 1<<14, 1<<13, 1<<12, 1<<11, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 0, + 1<<9, 1<<14, 1<<13, 1<<12, 1<<11, 1<<10, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 0, + 1<<8, 1<<8, 1<<13, 1<<12, 1<<11, 1<<10, 1<<9, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 0, + 1<<7, 1<<7, 1<<7, 1<<12, 1<<11, 1<<10, 1<<9, 1<<8, 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 0, + 1<<6, 1<<6, 1<<6, 1<<6, 1<<11, 1<<10, 1<<9, 1<<8, 1<<7, 1<<6, 1<<6, 1<<6, 1<<6, 1<<6, 1<<6, 0, + 1<<5, 1<<5, 1<<5, 1<<5, 1<<5, 1<<10, 1<<9, 1<<8, 1<<7, 1<<6, 1<<5, 1<<5, 1<<5, 1<<5, 1<<5, 0, + 0, 1<<4, 1<<4, 1<<4, 1<<4, 1<<4, 1<<9, 1<<8, 1<<7, 1<<6, 1<<5, 1<<4, 1<<4, 1<<4, 1<<4, 0, + 0, 0, 1<<3, 1<<3, 1<<3, 1<<3, 1<<3, 1<<8, 1<<7, 1<<6, 1<<5, 1<<4, 1<<3, 1<<3, 1<<3, 0, + 0, 0, 0, 1<<2, 1<<2, 1<<2, 1<<2, 1<<2, 1<<7, 1<<6, 1<<5, 1<<4, 1<<3, 1<<2, 1<<2, 0, + 0, 0, 0, 0, 1<<1, 1<<1, 1<<1, 1<<1, 1<<1, 1<<6, 1<<5, 1<<4, 1<<3, 1<<2, 1<<1], + [// Direction 3 + 0, 0, 0, 0, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 0, + 0, 0, 0, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<15, 0, + 0, 0, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<14, 1<<15, 0, + 0, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<13, 1<<14, 1<<15, 0, + 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15, 0, + 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<11, 
1<<12, 1<<13, 1<<14, 1<<15, 0, + 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<9, 0, + 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<8, 1<<8, 0, + 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<7, 1<<7, 1<<7, 0, + 1<<6, 1<<6, 1<<6, 1<<6, 1<<6, 1<<6, 1<<7, 1<<8, 1<<9, 1<<10, 1<<11, 1<<6, 1<<6, 1<<6, 1<<6, 0, + 1<<5, 1<<5, 1<<5, 1<<5, 1<<5, 1<<6, 1<<7, 1<<8, 1<<9, 1<<10, 1<<5, 1<<5, 1<<5, 1<<5, 1<<5, 0, + 1<<4, 1<<4, 1<<4, 1<<4, 1<<5, 1<<6, 1<<7, 1<<8, 1<<9, 1<<4, 1<<4, 1<<4, 1<<4, 1<<4, 0, 0, + 1<<3, 1<<3, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1<<8, 1<<3, 1<<3, 1<<3, 1<<3, 1<<3, 0, 0, 0, + 1<<2, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1<<2, 1<<2, 1<<2, 1<<2, 1<<2, 0, 0, 0, 0, + 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<1, 1<<1, 1<<1, 1<<1, 1<<1, 0, 0, 0, 0] + ]; + +/// A table to encode each location to an index in the bitboard for 4 direction +#[rustfmt::skip] +const MAPMOVEIDX: [[i32; 239]; 4] = [ [// Direction 0 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 0, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 0, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 0, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 0, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14], + [// Direction 1 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + [// Direction 2 + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, + 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, + 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, + 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, + 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 0, + 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 0, + 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 0, + 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 0, + 5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 0, + 0, 5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 0, + 0, 0, 5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 0, + 0, 0, 0, 5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 0, + 0, 0, 0, 0, 5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10], + [// Direction 3 + 
0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, + 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, + 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, + 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 0, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 0, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 0, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 0, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 0, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 0, 0, + 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 0, 0, 0, + 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 0, 0, 0, 0, + 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 0, 0, 0, 0] + ]; + +// structures + +/// Use one-dimensional array to store the board state. The location 0 is top left.\ +/// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15\ +/// 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31\ +/// ... \ +/// position 15, 31, ... are Borders.\ +/// position 0 is file 0, rank 0.\ +/// position 17 is file 1, rank 1.\ +/// +/// Use a three-dimensional array to store the bitboard.\ +/// The first dimension is color: Black, White and Empty.\ +/// The second and third one are 2 x 512-bit. Direction 0 and 2 use the first 512-bit. Direction 1 and +/// 3 use the second 512-bit.\ +/// Each 512-bit is a 32-bit x 16 array. Direction 0 and 1 store at bit 31-16 and Direction 2 and 3 store at bit 15-0. +pub struct Pos { + // position + state: [Color; SQUARE_SIZE as usize], + p_turn: Side, + bitboard: [[[i32; 16]; 2]; 3], +} + +impl Pos { + pub fn init(&mut self) { + // starting position + // Set up the Border + for i in 0..SQUARE_SIZE as usize { + self.state[i] = Color::Border; + } + + // In the beginning, all is Empty + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + self.state[sq as usize] = Color::Empty; + } + } + + // first move is Black + self.p_turn = Color::Black; + + let black = Color::Black as usize; + let white = Color::White as usize; + let empty = Color::Empty as usize; + + // set up the corresponding bitboard + for i in 0..2 { + for j in 0..16 { + self.bitboard[black][i][j] = 0; + self.bitboard[white][i][j] = 0; + self.bitboard[empty][i][j] = 0; + } + } + + for i in 0..2 { + // use bit 31-16 to store direction 0 and 1 + #[rustfmt::skip] + for j in 0..FILE_SIZE as usize { + self.bitboard[empty][i][j] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17); + } + } + + // use bit 15-0 to store direction 2 and 3. There are 21 for each one. 
We combine row1 and row16, row2 and row17, row3 and row18, row4 and row19, and row 5 and row20 + #[rustfmt::skip] + for i in 0..2 { + self.bitboard[empty][i][0] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11); //row 0 + self.bitboard[empty][i][1] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)/*row1*/|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row16 + self.bitboard[empty][i][2] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)/*row2*/|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row17 + self.bitboard[empty][i][3] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)/*row3*/|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row18 + self.bitboard[empty][i][4] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)/*row4*/|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row19 + self.bitboard[empty][i][5] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)/*row5*/|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row20 + self.bitboard[empty][i][6] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5);//row6 + self.bitboard[empty][i][7] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4);//row7 + self.bitboard[empty][i][8] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3);//row8 + self.bitboard[empty][i][9] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2);//row9 + self.bitboard[empty][i][10] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row10 + self.bitboard[empty][i][11] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2);//row11 + self.bitboard[empty][i][12] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3);//row12 + self.bitboard[empty][i][13] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4);//row13 + self.bitboard[empty][i][14] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5);//row14 + self.bitboard[empty][i][15] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6);//row15 + } + } + + pub fn do_move(&mut self, mv: Move) { + let atk: Side = self.p_turn; + let def: Side = side_opp(atk); + + let mv = mv as usize; + let black = Color::Black as usize; + let white = Color::White as usize; + let empty = Color::Empty as usize; + + match self.p_turn { + Color::Black => { + self.state[mv] = Color::Black; + // update black move and remove empty move in bitboard + self.bitboard[black][0][MAPMOVEIDX[0][mv] as usize] |= MAPMOVEVALUE[0][mv]; + self.bitboard[empty][0][MAPMOVEIDX[0][mv] as usize] ^= MAPMOVEVALUE[0][mv]; + self.bitboard[black][1][MAPMOVEIDX[1][mv] as usize] |= MAPMOVEVALUE[1][mv]; + self.bitboard[empty][1][MAPMOVEIDX[1][mv] as usize] ^= MAPMOVEVALUE[1][mv]; + self.bitboard[black][0][MAPMOVEIDX[2][mv] as usize] |= MAPMOVEVALUE[2][mv]; + self.bitboard[empty][0][MAPMOVEIDX[2][mv] as usize] ^= MAPMOVEVALUE[2][mv]; + self.bitboard[black][1][MAPMOVEIDX[3][mv] as usize] |= MAPMOVEVALUE[3][mv]; + self.bitboard[empty][1][MAPMOVEIDX[3][mv] as usize] ^= MAPMOVEVALUE[3][mv]; + } + Color::White => { + self.state[mv] = Color::White; + // update white move and remove empty move in bitboard + self.bitboard[white][0][MAPMOVEIDX[0][mv] as usize] |= MAPMOVEVALUE[0][mv]; + 
self.bitboard[empty][0][MAPMOVEIDX[0][mv] as usize] ^= MAPMOVEVALUE[0][mv]; + self.bitboard[white][1][MAPMOVEIDX[1][mv] as usize] |= MAPMOVEVALUE[1][mv]; + self.bitboard[empty][1][MAPMOVEIDX[1][mv] as usize] ^= MAPMOVEVALUE[1][mv]; + self.bitboard[white][0][MAPMOVEIDX[2][mv] as usize] |= MAPMOVEVALUE[2][mv]; + self.bitboard[empty][0][MAPMOVEIDX[2][mv] as usize] ^= MAPMOVEVALUE[2][mv]; + self.bitboard[white][1][MAPMOVEIDX[3][mv] as usize] |= MAPMOVEVALUE[3][mv]; + self.bitboard[empty][1][MAPMOVEIDX[3][mv] as usize] ^= MAPMOVEVALUE[3][mv]; + } + _ => panic! {}, + } + + self.p_turn = def; + } + + fn turn(&self) -> Side { + self.p_turn + } + + pub fn can_play(&self, from: Square) -> bool { + self.state[from as usize] == Color::Empty + } +} + +pub struct List { + // legal move list + p_move: [Move; (FILE_SIZE * RANK_SIZE) as usize], + p_size: i32, +} + +/// Use List to store legal moves. +impl List { + pub fn clear(&mut self) { + self.p_size = 0; + } + + pub fn add(&mut self, mv: Move) { + self.p_move[self.p_size as usize] = mv; + self.p_size += 1; + } + + pub fn size(&self) -> i32 { + self.p_size + } + + pub fn shuffle(&mut self) { + let mut rng = thread_rng(); + let num = self.p_size as usize; + + self.p_move[..num].shuffle(&mut rng); + } +} + +// functions + +fn square_make(fl: i32, rk: i32) -> Square { + rk * (FILE_SIZE + 1) + fl +} + +fn side_opp(sd: Side) -> Side { + match sd { + Side::White => Side::Black, + Side::Black => Side::White, + _ => panic!(""), + } +} + +fn pos_is_winner(pos: &Pos) -> bool { + let current_side = side_opp(pos.p_turn); + check_pattern5(pos, current_side) +} + +fn pos_is_draw(pos: &Pos) -> bool { + let mut found: bool = true; + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + if pos.can_play(sq) { + found = false; + break; + } + + if !found { + break; + } + } + } + + found && !pos_is_winner(pos) +} + +#[target_feature(enable = "avx512f,avx512bw,popcnt")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn pos_is_draw_avx512(pos: &Pos) -> bool { + let empty = Color::Empty as usize; + + let board0org = unsafe { _mm512_loadu_epi32(&pos.bitboard[empty][0][0]) }; + + let answer = _mm512_set1_epi32(0); + + // if all empty is 0, all board is filled. + let temp_mask = _mm512_mask_cmpneq_epi32_mask(0b11111111_11111111, answer, board0org); + + _popcnt32(temp_mask as i32) == 0 && !pos_is_winner_avx512(pos) +} + +fn pos_is_end(pos: &Pos) -> bool { + pos_is_winner(pos) || pos_is_draw(pos) +} + +fn pos_disp(pos: &Pos) { + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + + match pos.state[sq as usize] { + Color::Black => print!("# "), + Color::White => print!("O "), + Color::Empty => print!("- "), + Color::Border => print!("| "), + } + } + + println!(); + } + + match pos.turn() { + Color::Black => println!("black to play"), + Color::White => println!("white to play"), + _ => panic!(), + } +} + +fn gen_moves(list: &mut List, pos: &Pos) { + list.clear(); + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + if pos.can_play(sq) { + list.add(sq); + } + } + } +} + +/// AI: use Minimax search with alpha-beta pruning +#[allow(clippy::manual_range_contains)] +fn search(pos: &Pos, alpha: i32, beta: i32, depth: i32, _ply: i32) -> i32 { + assert!(-EVAL_INF <= alpha && alpha < beta && beta <= EVAL_INF); + // leaf? 
+ + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if check_x86_avx512_features() { + unsafe { + if pos_is_winner_avx512(pos) { + return -EVAL_INF + _ply; + } + + if pos_is_draw_avx512(pos) { + return 0; + } + } + } else { + if pos_is_winner(pos) { + return -EVAL_INF + _ply; + } + + if pos_is_draw(pos) { + return 0; + } + } + } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + { + if pos_is_winner(pos) { + return -EVAL_INF + _ply; + } + + if pos_is_draw(pos) { + return 0; + } + } + + if depth == 0 { + return eval(pos, _ply); + } + + let p_move_new: [Move; (FILE_SIZE * RANK_SIZE) as usize] = + [0; (FILE_SIZE * RANK_SIZE) as usize]; + + let mut list = List { + p_move: p_move_new, + p_size: 0, + }; + + let mut bm: Move = MOVE_NONE; + let mut bs: i32 = SCORE_NONE; + + gen_moves(&mut list, pos); + + // move loop + + if _ply == 0 { + list.shuffle(); + } + + for i in 0..list.size() { + if bs < beta { + let mv: Move = list.p_move[i as usize]; + + let mut new_pos = Pos { + state: pos.state, + p_turn: pos.p_turn, + bitboard: pos.bitboard, + }; + + new_pos.do_move(mv); + + let sc: i32 = -search(&new_pos, -beta, -cmp::max(alpha, bs), depth - 1, _ply + 1); + + if sc > bs { + bm = mv; + bs = sc; + } + } + } + + assert_ne!(bm, MOVE_NONE); + assert!(bs >= -EVAL_INF && bs <= EVAL_INF); + + if _ply == 0 { bm } else { bs } //best move at the root node, best score elsewhere +} + +/// Evaluation function: give different scores to different patterns after a fixed depth. +fn eval(pos: &Pos, _ply: i32) -> i32 { + let atk: Side = pos.turn(); + let def: Side = side_opp(atk); + + // check if opp has live4 which will win playing next move + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if check_x86_avx512_features() { + unsafe { + if check_patternlive4_avx512(pos, def) { + return -4096; + } + } + } else { + if check_patternlive4(pos, def) { + return -4096; + } + } + } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + { + if check_patternlive4(pos, def) { + return -4096; + } + } + + // check if self has live4 which will win playing next move + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if check_x86_avx512_features() { + unsafe { + if check_patternlive4_avx512(pos, atk) { + return 2560; + } + } + } else { + if check_patternlive4(pos, atk) { + return 2560; + } + } + } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + { + if check_patternlive4(pos, atk) { + return 2560; + } + } + + // check if self has dead4 which will win playing next move + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if check_x86_avx512_features() { + unsafe { + if check_patterndead4_avx512(pos, atk) > 0 { + return 2560; + } + } + } else { + if check_patterndead4(pos, atk) > 0 { + return 2560; + } + } + } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + { + if check_patterndead4(pos, atk) > 0 { + return 2560; + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if check_x86_avx512_features() { + unsafe { + let n_c4: i32 = check_patterndead4_avx512(pos, def); + let n_c3: i32 = check_patternlive3_avx512(pos, def); + + // check if opp has 2 dead4 which will win playing next move + if n_c4 > 1 { + return -2048; + } + + // check if opp has a dead 4 and live 3 which will win playing the next two move + if n_c4 == 1 && n_c3 > 0 { + return -2048; + } + + if check_patternlive3_avx512(pos, atk) > 1 { + return 2560; + } + + // check if opp has 2 live3 which will win playing the next two move + if 
n_c3 > 1 { + return -2048; + } + } + } else { + let n_c4: i32 = check_patterndead4(pos, def); + let n_c3: i32 = check_patternlive3(pos, def); + + // check if opp has 2 dead4 which will win playing next move + if n_c4 > 1 { + return -2048; + } + + // check if opp has a dead 4 and live 3 which will win playing the next two move + if n_c4 == 1 && n_c3 > 0 { + return -2048; + } + + // check if self has 2 live3 which will win playing the next two move + if check_patternlive3(pos, atk) > 1 { + return 2560; + } + + // check if opp has 2 live3 which will win playing the next two move + if n_c3 > 1 { + return -2048; + } + } + } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + { + let n_c4: i32 = check_patterndead4(pos, def); + let n_c3: i32 = check_patternlive3(pos, def); + + // check if opp has 2 dead4 which will win playing next move + if n_c4 > 1 { + return -2048; + } + + // check if opp has a dead 4 and live 3 which will win playing the next two move + if n_c4 == 1 && n_c3 > 0 { + return -2048; + } + + // check if self has 2 live3 which will win playing the next two move + if check_patternlive3(pos, atk) > 1 { + return 2560; + } + + // check if opp has 2 live3 which will win playing the next two move + if n_c3 > 1 { + return -2048; + } + } + + 0 +} + +/// Check OOOOO +fn check_pattern5(pos: &Pos, sd: Side) -> bool { + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + + for direction in &DIRECTION { + let idx0 = sq; + let idx1 = sq + direction[0]; + let idx2 = sq + direction[1]; + let idx3 = sq + direction[2]; + let idx4 = sq + direction[3]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + #[rustfmt::skip] + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } + } + } + } + + n > 0 +} + +/// Check -OOOO- +fn check_patternlive4(pos: &Pos, sd: Side) -> bool { + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + + for direction in &DIRECTION { + let idx0 = sq; + let idx1 = sq + direction[0]; + let idx2 = sq + direction[1]; + let idx3 = sq + direction[2]; + let idx4 = sq + direction[3]; + let idx5 = sq + direction[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + #[rustfmt::skip] + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } + } + } + } + + n > 0 +} + +/// Check OOOO-, OOO-O, OO-OO, O-OOO, -OOOO +fn check_patterndead4(pos: &Pos, sd: Side) -> i32 { + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + + for direction in &DIRECTION { + let idx0 = sq; + let idx1 = sq + direction[0]; + let idx2 = sq + direction[1]; + let idx3 = sq + direction[2]; + let idx4 = sq + direction[3]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + #[rustfmt::skip] + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + #[rustfmt::skip] + if val0 == sd && val1 == sd && val2 == sd && 
val3 == Color::Empty && val4 == sd { n += 1; } + #[rustfmt::skip] + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } + #[rustfmt::skip] + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } + #[rustfmt::skip] + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } + } + } + } + + n +} + +/// Check -OOO-, -OO-O-, -O-OO- +fn check_patternlive3(pos: &Pos, sd: Side) -> i32 { + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + + for direction in &DIRECTION { + let idx0 = sq; + let idx1 = sq + direction[0]; + let idx2 = sq + direction[1]; + let idx3 = sq + direction[2]; + let idx4 = sq + direction[3]; + let idx5 = sq + direction[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + #[rustfmt::skip] + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n +=1; } + #[rustfmt::skip] + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } + #[rustfmt::skip] + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } + } + } + } + + n +} + +#[target_feature(enable = "avx512f,avx512bw,popcnt")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn pos_is_winner_avx512(pos: &Pos) -> bool { + let current_side = side_opp(pos.p_turn); + let coloridx = current_side as usize; + + let board0org: [__m512i; 2] = unsafe { + [ + _mm512_loadu_epi32(&pos.bitboard[coloridx][0][0]), + _mm512_loadu_epi32(&pos.bitboard[coloridx][1][0]), + ] + }; // load states from bitboard + + #[rustfmt::skip] + let answer = _mm512_set1_epi16((1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)); // an unbroken chain of five moves + + // use Mask to filter out which data is not processed. + // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 + // 1 x x x x _ _ _ _ _ _ _ _ _ _ _ 0 x o x o x 0 0 0 0 0 0 0 0 0 0 0 + // 2 x _ _ _ _ o _ x o _ _ _ _ _ _ 0 x o _ _ _ _ _| x x o o o x x _ _ + // . ... + // . ... + // . ... + // 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 x o x o o o o o o o 0 0 0 0 0 0 + // + // answer_mask[0]: 01_11..............: "0" is in row 16 and column 1-16. + // There is no data to match (x = black, o = white, _ = empty, 0 = no data). + // + // + // Then, shift one space left. + // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 + // 1 x x x _ _ _ _ _ _ _ _ _ _ _ 0 x o x o x 0 0 0 0 0 0 0 0 0 0 0 0 + // . ... + // . ... + // . ... + // 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 x o x o o o o o o o 0 0 0 0 0 0 0 + // answer_mask[1]: ................_10: "0" is in row 1 and column 17-32; + // There is no enough data to match (o x o x but we want to match o o o o o). + // + // answer_mask[2]: mix 2 data together (column 17-23 and column 24-32). Using Mask to make it match correctly. + // For example, column 23,24,25,26,27 is not a pattern and 24,25,26,27,28 is a pattern. + // That is why some mask bits are set to 0 from answer_mask[2] to answer_mask[10]. 
+ + #[rustfmt::skip] + let answer_mask: [__mmask32; 11] = [0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_10_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_10_10_10_10_10, + 0b00_11_11_11_11_11_11_11_11_11_10_10_10_10_11_10, + 0b00_10_11_11_11_11_11_11_11_10_10_10_10_11_11_10, + 0b00_10_10_11_11_11_11_11_10_10_10_10_11_11_11_10, + 0b00_10_10_10_11_11_11_10_10_10_10_11_11_11_11_10, + 0b00_10_10_10_10_11_10_10_10_10_11_11_11_11_11_10]; + let mut count_match: i32 = 0; + + for dir in 0..2 { + // direction 0 and 1 + let mut board0 = board0org[dir]; + let boardf = _mm512_and_si512(answer, board0); + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[0], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + + for i in 1..11 { + // OOOOOOOOOOO----, the last 4 "-" cannot make an unbroken chain of five. + board0 = _mm512_slli_epi32(board0, 1); // shift one space left + let boardf = _mm512_and_si512(answer, board0); // focus on the pattern + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[i], answer, boardf); // see if it matches the pattern + count_match += _popcnt32(temp_mask as i32); + } + } + + count_match > 0 +} + +#[target_feature(enable = "avx512f,avx512bw,popcnt")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn check_patternlive4_avx512(pos: &Pos, sd: Side) -> bool { + let coloridx = sd as usize; + let emptyidx = Color::Empty as usize; + + #[rustfmt::skip] + let answer_color = _mm512_set1_epi16( (1<<14)|(1<<13)|(1<<12)|(1<<11) ); + #[rustfmt::skip] + let answer_empty = _mm512_set1_epi16( (1<<15)| (1<<10) ); + #[rustfmt::skip] + let answer = _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10) ); + + #[rustfmt::skip] + let answer_mask: [__mmask32; 10] = [0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_10_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_10_10_10_10_10, + 0b00_11_11_11_11_11_11_11_11_11_10_10_10_10_10_10, + 0b00_10_11_11_11_11_11_11_11_10_10_10_10_10_11_10, + 0b00_10_10_11_11_11_11_11_10_10_10_10_10_11_11_10, + 0b00_10_10_10_11_11_11_10_10_10_10_10_11_11_11_10, + 0b00_10_10_10_10_11_10_10_10_10_10_11_11_11_11_10]; + let board0org: [__m512i; 2] = unsafe { + [ + _mm512_loadu_epi32(&pos.bitboard[coloridx][0][0]), + _mm512_loadu_epi32(&pos.bitboard[coloridx][1][0]), + ] + }; + let board1org: [__m512i; 2] = unsafe { + [ + _mm512_loadu_epi32(&pos.bitboard[emptyidx][0][0]), + _mm512_loadu_epi32(&pos.bitboard[emptyidx][1][0]), + ] + }; + + let mut count_match: i32 = 0; + + for dir in 0..2 { + let mut board0 = board0org[dir]; + let mut board1 = board1org[dir]; + + let boardf1 = _mm512_and_si512(answer_color, board0); + let boardf2 = _mm512_and_si512(answer_empty, board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[0], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + + for i in 1..10 { + board0 = _mm512_slli_epi32(board0, 1); + board1 = _mm512_slli_epi32(board1, 1); + + let boardf1 = _mm512_and_si512(answer_color, board0); + let boardf2 = _mm512_and_si512(answer_empty, board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[i], answer, boardf); + 
count_match += _popcnt32(temp_mask as i32); + } + } + + count_match > 0 +} + +#[target_feature(enable = "avx512f,avx512bw,popcnt")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn check_patterndead4_avx512(pos: &Pos, sd: Side) -> i32 { + let coloridx = sd as usize; + let emptyidx = Color::Empty as usize; + + #[rustfmt::skip] + let answer_color: [__m512i; 5] = [_mm512_set1_epi16( (1<<14)|(1<<13)|(1<<12)|(1<<11) ), + _mm512_set1_epi16( (1<<15)| (1<<13)|(1<<12)|(1<<11) ), + _mm512_set1_epi16( (1<<15)|(1<<14) |(1<<12)|(1<<11) ), + _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13) |(1<<11) ), + _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13)|(1<<12) )]; + #[rustfmt::skip] + let answer_empty: [__m512i; 5]= [_mm512_set1_epi16( 1<<15 ), + _mm512_set1_epi16( 1<<14 ), + _mm512_set1_epi16( 1<<13 ), + _mm512_set1_epi16( 1<<12 ), + _mm512_set1_epi16( 1<<11)]; + #[rustfmt::skip] + let answer = _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)); + + #[rustfmt::skip] + let answer_mask: [__mmask32; 11] = [0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_10_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_10_10_10_10_10, + 0b00_11_11_11_11_11_11_11_11_11_10_10_10_10_11_10, + 0b00_10_11_11_11_11_11_11_11_10_10_10_10_11_11_10, + 0b00_10_10_11_11_11_11_11_10_10_10_10_11_11_11_10, + 0b00_10_10_10_11_11_11_10_10_10_10_11_11_11_11_10, + 0b00_10_10_10_10_11_10_10_10_10_11_11_11_11_11_10]; + let board0org: [__m512i; 2] = unsafe { + [ + _mm512_loadu_epi32(&pos.bitboard[coloridx][0][0]), + _mm512_loadu_epi32(&pos.bitboard[coloridx][1][0]), + ] + }; + let board1org: [__m512i; 2] = unsafe { + [ + _mm512_loadu_epi32(&pos.bitboard[emptyidx][0][0]), + _mm512_loadu_epi32(&pos.bitboard[emptyidx][1][0]), + ] + }; + + let mut count_match: i32 = 0; + + for pattern in 0..5 { + for dir in 0..2 { + let mut board0 = board0org[dir]; + let mut board1 = board1org[dir]; + + let boardf1 = _mm512_and_si512(answer_color[pattern], board0); + let boardf2 = _mm512_and_si512(answer_empty[pattern], board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[0], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + + for i in 1..11 { + board0 = _mm512_slli_epi32(board0, 1); + board1 = _mm512_slli_epi32(board1, 1); + + let boardf1 = _mm512_and_si512(answer_color[pattern], board0); + let boardf2 = _mm512_and_si512(answer_empty[pattern], board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[i], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + } + } + } + + count_match +} + +#[target_feature(enable = "avx512f,avx512bw,popcnt")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn check_patternlive3_avx512(pos: &Pos, sd: Side) -> i32 { + let coloridx = sd as usize; + let emptyidx = Color::Empty as usize; + + #[rustfmt::skip] + let board0org: [__m512i; 2] = unsafe { [_mm512_loadu_epi32(&pos.bitboard[coloridx][0][0]), _mm512_loadu_epi32(&pos.bitboard[coloridx][1][0])] }; + #[rustfmt::skip] + let board1org: [__m512i; 2] = unsafe { [_mm512_loadu_epi32(&pos.bitboard[emptyidx][0][0]), _mm512_loadu_epi32(&pos.bitboard[emptyidx][1][0])] }; + + #[rustfmt::skip] + let answer_color: [__m512i; 1] = [_mm512_set1_epi16( (1<<14)|(1<<13)|(1<<12) )]; + #[rustfmt::skip] + let answer_empty: [__m512i; 
1] = [_mm512_set1_epi16( (1<<15)| (1<<11) )]; + #[rustfmt::skip] + let answer: __m512i = _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11) ); + + let mut count_match: i32 = 0; + + #[rustfmt::skip] + let answer_mask: [__mmask32; 11] = [0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_10_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_10_10_10_10_10, + 0b00_11_11_11_11_11_11_11_11_11_10_10_10_10_11_10, + 0b00_10_11_11_11_11_11_11_11_10_10_10_10_11_11_10, + 0b00_10_10_11_11_11_11_11_10_10_10_10_11_11_11_10, + 0b00_10_10_10_11_11_11_10_10_10_10_11_11_11_11_10, + 0b00_10_10_10_10_11_10_10_10_10_11_11_11_11_11_10]; + for pattern in 0..1 { + for dir in 0..2 { + let mut board0 = board0org[dir]; + let mut board1 = board1org[dir]; + + let boardf1 = _mm512_and_si512(answer_color[pattern], board0); + let boardf2 = _mm512_and_si512(answer_empty[pattern], board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[0], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + + for i in 1..11 { + board0 = _mm512_slli_epi32(board0, 1); + board1 = _mm512_slli_epi32(board1, 1); + + let boardf1 = _mm512_and_si512(answer_color[pattern], board0); + let boardf2 = _mm512_and_si512(answer_empty[pattern], board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[i], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + } + } + } + + #[rustfmt::skip] + let answer_color: [__m512i; 2] = [_mm512_set1_epi16( (1<<14)| (1<<12)|(1<<11) ), + _mm512_set1_epi16( (1<<14)|(1<<13) |(1<<11) )]; + #[rustfmt::skip] + let answer_empty: [__m512i; 2] = [_mm512_set1_epi16( (1<<15)| (1<<13)| (1<<10) ), + _mm512_set1_epi16( (1<<15)| (1<<12)| (1<<10) )]; + #[rustfmt::skip] + let answer: __m512i = _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10) ); + + #[rustfmt::skip] + let answer_mask: [__mmask32; 10] = [0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_10_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_10_10_10_10_10, + 0b00_11_11_11_11_11_11_11_11_11_10_10_10_10_10_10, + 0b00_10_11_11_11_11_11_11_11_10_10_10_10_10_11_10, + 0b00_10_10_11_11_11_11_11_10_10_10_10_10_11_11_10, + 0b00_10_10_10_11_11_11_10_10_10_10_10_11_11_11_10, + 0b00_10_10_10_10_11_10_10_10_10_10_11_11_11_11_10]; + for pattern in 0..2 { + for dir in 0..2 { + let mut board0 = board0org[dir]; + let mut board1 = board1org[dir]; + + let boardf1 = _mm512_and_si512(answer_color[pattern], board0); + let boardf2 = _mm512_and_si512(answer_empty[pattern], board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[0], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + + for i in 1..10 { + board0 = _mm512_slli_epi32(board0, 1); + board1 = _mm512_slli_epi32(board1, 1); + + let boardf1 = _mm512_and_si512(answer_color[pattern], board0); + let boardf2 = _mm512_and_si512(answer_empty[pattern], board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[i], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + } + } + } + + count_match +} + 
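// A minimal scalar sketch of the shift-and-compare idea the AVX-512 pattern
// checks above rely on, reduced to a single 16-bit lane. Assumptions for the
// sketch (not taken from the vendored example): one lane packs a 15-cell board
// line in bits 15..=1 with bit 0 unused, and we only look for the unbroken
// five-stone chain OOOOO. The real functions do the same work across 32 lanes
// at once with `_mm512_slli_epi32`, a masked 16-bit compare, and `_popcnt32`.
fn lane_has_five(lane: u16) -> bool {
    // Fixed pattern: five adjacent stones in bits 15..=11, like `answer` above.
    const FIVE: u16 = 0b1111_1000_0000_0000;
    // Slide the lane under the fixed pattern; a 5-cell window has 11 possible
    // positions over 15 cells, matching the 11 masked compares in the loop.
    (0..11).any(|shift| (lane << shift) & FIVE == FIVE)
}

fn main() {
    // Stones in bits 14..=10: a chain of five starting one cell in.
    assert!(lane_has_five(0b0111_1100_0000_0000));
    // An isolated stone plus only four in a row: no window matches.
    assert!(!lane_has_five(0b1011_1100_0000_0000));
    println!("scalar shift-and-compare check OK");
}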
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn check_x86_avx512_features() -> bool { + is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("popcnt") +} + +fn main() { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if check_x86_avx512_features() { + println!("\n\nThe program is running with avx512f and avx512bw intrinsics\n\n"); + } else { + println!("\n\nThe program is running with NO intrinsics.\n\n"); + } + } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + { + println!("\n\nThe program is running with NO intrinsics.\n\n"); + } + + loop { + let start = Instant::now(); + + println!("Hello, this is Connect5 (Outer-Open Gomoku)!"); + println!("Self-playing with search depth = 4"); + + let test_state: [Color; SQUARE_SIZE as usize] = [Color::Empty; SQUARE_SIZE as usize]; + let test_bitboard: [[[i32; 16]; 2]; 3] = [[[0; 16]; 2]; 3]; + + let mut test1 = Pos { + state: test_state, + p_turn: Color::Black, + bitboard: test_bitboard, + }; + + test1.init(); + + let mut count: i32 = 0; + + for i in 0..(FILE_SIZE * RANK_SIZE) { + let mut next_move: Move = square_make(1, 7); // set the first move is (1,7) + + if i > 0 { + next_move = search(&test1, -EVAL_INF, EVAL_INF, 4, 0); + } // search depth = 4 + + test1.do_move(next_move); + pos_disp(&test1); + + if pos_is_end(&test1) { + println!("Game over!!!!!! at Move {i}"); + count = i + 1; + break; + } + } + + let duration = start.elapsed(); + + println!( + "Average time for each move is: {:?}", + duration / count as u32 + ); + } +} diff --git a/library/stdarch/examples/hex.rs b/library/stdarch/examples/hex.rs new file mode 100644 index 0000000000000..e393ad7271689 --- /dev/null +++ b/library/stdarch/examples/hex.rs @@ -0,0 +1,420 @@ +//! An example showing runtime dispatch to an architecture-optimized +//! implementation. +//! +//! This program implements hex encoding a slice into a predetermined +//! destination using various different instruction sets. This selects at +//! runtime the most optimized implementation and uses that rather than being +//! required to be compiled differently. +//! +//! You can test out this program via: +//! +//! echo test | cargo +nightly run --release hex +//! +//! and you should see `746573740a` get printed out. 
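// Quick arithmetic check of the expected output above (a sketch, not part of
// the example): `echo test` writes the bytes 74 65 73 74 0a (t, e, s, t, \n),
// and hex encoding maps each byte to two nibbles looked up in
// "0123456789abcdef" — hence `746573740a`. The `hex_byte` helper here is
// hypothetical; the example's own scalar path is `hex_encode_fallback`.
fn hex_byte(b: u8) -> [u8; 2] {
    const TABLE: &[u8; 16] = b"0123456789abcdef";
    [TABLE[(b >> 4) as usize], TABLE[(b & 0xf) as usize]]
}

fn main() {
    let out: String = b"test\n"
        .iter()
        .flat_map(|&b| hex_byte(b))
        .map(char::from)
        .collect();
    assert_eq!(out, "746573740a");
    println!("{out}");
}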
+ +#![allow(internal_features)] +#![feature(wasm_target_feature)] +#![cfg_attr(test, feature(test))] +#![cfg_attr( + any(target_arch = "x86", target_arch = "x86_64"), + feature(stdarch_internal) +)] +#![allow( + clippy::unwrap_used, + clippy::print_stdout, + clippy::unwrap_used, + clippy::shadow_reuse, + clippy::cast_possible_wrap, + clippy::cast_ptr_alignment, + clippy::cast_sign_loss, + clippy::missing_docs_in_private_items +)] + +use std::{ + io::{self, Read}, + str, +}; + +#[cfg(target_arch = "x86")] +use {core_arch::arch::x86::*, std_detect::is_x86_feature_detected}; +#[cfg(target_arch = "x86_64")] +use {core_arch::arch::x86_64::*, std_detect::is_x86_feature_detected}; + +fn main() { + let mut input = Vec::new(); + io::stdin().read_to_end(&mut input).unwrap(); + let mut dst = vec![0; 2 * input.len()]; + let s = hex_encode(&input, &mut dst).unwrap(); + println!("{s}"); +} + +fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { + let len = src.len().checked_mul(2).unwrap(); + if dst.len() < len { + return Err(len); + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("avx2") { + return unsafe { hex_encode_avx2(src, dst) }; + } + if is_x86_feature_detected!("sse4.1") { + return unsafe { hex_encode_sse41(src, dst) }; + } + } + #[cfg(target_arch = "wasm32")] + { + if true { + return hex_encode_simd128(src, dst); + } + } + + hex_encode_fallback(src, dst) +} + +#[target_feature(enable = "avx2")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn hex_encode_avx2<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { + assert!(dst.len() >= src.len().checked_mul(2).unwrap()); + + let ascii_zero = _mm256_set1_epi8(b'0' as i8); + let nines = _mm256_set1_epi8(9); + let ascii_a = _mm256_set1_epi8((b'a' - 9 - 1) as i8); + let and4bits = _mm256_set1_epi8(0xf); + + let mut i = 0_usize; + while src.len() >= 32 { + // SAFETY: the loop condition ensures that we have at least 32 bytes + let invec = unsafe { _mm256_loadu_si256(src.as_ptr() as *const _) }; + + let masked1 = _mm256_and_si256(invec, and4bits); + let masked2 = _mm256_and_si256(_mm256_srli_epi64(invec, 4), and4bits); + + // return 0xff corresponding to the elements > 9, or 0x00 otherwise + let cmpmask1 = _mm256_cmpgt_epi8(masked1, nines); + let cmpmask2 = _mm256_cmpgt_epi8(masked2, nines); + + // add '0' or the offset depending on the masks + let masked1 = _mm256_add_epi8(masked1, _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask1)); + let masked2 = _mm256_add_epi8(masked2, _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask2)); + + // interleave masked1 and masked2 bytes + let res1 = _mm256_unpacklo_epi8(masked2, masked1); + let res2 = _mm256_unpackhi_epi8(masked2, masked1); + + // Store everything into the right destination now + unsafe { + // SAFETY: the assertion at the beginning of the function ensures + // that `dst` is large enough. 
+ let base = dst.as_mut_ptr().add(i * 2); + let base1 = base.add(0) as *mut _; + let base2 = base.add(16) as *mut _; + let base3 = base.add(32) as *mut _; + let base4 = base.add(48) as *mut _; + _mm256_storeu2_m128i(base3, base1, res1); + _mm256_storeu2_m128i(base4, base2, res2); + } + + src = &src[32..]; + i += 32; + } + + let _ = hex_encode_sse41(src, &mut dst[i * 2..]); + + // SAFETY: `dst` only contains ASCII characters + unsafe { Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) } +} + +// copied from https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp +#[target_feature(enable = "sse4.1")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn hex_encode_sse41<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { + assert!(dst.len() >= src.len().checked_mul(2).unwrap()); + + let ascii_zero = _mm_set1_epi8(b'0' as i8); + let nines = _mm_set1_epi8(9); + let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8); + let and4bits = _mm_set1_epi8(0xf); + + let mut i = 0_usize; + while src.len() >= 16 { + // SAFETY: the loop condition ensures that we have at least 16 bytes + let invec = unsafe { _mm_loadu_si128(src.as_ptr() as *const _) }; + + let masked1 = _mm_and_si128(invec, and4bits); + let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits); + + // return 0xff corresponding to the elements > 9, or 0x00 otherwise + let cmpmask1 = _mm_cmpgt_epi8(masked1, nines); + let cmpmask2 = _mm_cmpgt_epi8(masked2, nines); + + // add '0' or the offset depending on the masks + let masked1 = _mm_add_epi8(masked1, _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1)); + let masked2 = _mm_add_epi8(masked2, _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2)); + + // interleave masked1 and masked2 bytes + let res1 = _mm_unpacklo_epi8(masked2, masked1); + let res2 = _mm_unpackhi_epi8(masked2, masked1); + + unsafe { + // SAFETY: the assertion at the beginning of the function ensures + // that `dst` is large enough. + _mm_storeu_si128(dst.as_mut_ptr().add(i * 2) as *mut _, res1); + _mm_storeu_si128(dst.as_mut_ptr().add(i * 2 + 16) as *mut _, res2); + } + src = &src[16..]; + i += 16; + } + + let _ = hex_encode_fallback(src, &mut dst[i * 2..]); + + // SAFETY: `dst` only contains ASCII characters + unsafe { Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) } +} + +#[cfg(target_arch = "wasm32")] +#[target_feature(enable = "simd128")] +fn hex_encode_simd128<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { + assert!(dst.len() >= src.len().checked_mul(2).unwrap()); + + use core_arch::arch::wasm32::*; + + let ascii_zero = u8x16_splat(b'0'); + let nines = u8x16_splat(9); + let ascii_a = u8x16_splat(b'a' - 9 - 1); + let and4bits = u8x16_splat(0xf); + + let mut i = 0_usize; + while src.len() >= 16 { + // SAFETY: the loop condition ensures that we have at least 16 bytes + let invec = unsafe { v128_load(src.as_ptr() as *const _) }; + + let masked1 = v128_and(invec, and4bits); + let masked2 = v128_and(u8x16_shr(invec, 4), and4bits); + + // return 0xff corresponding to the elements > 9, or 0x00 otherwise + let cmpmask1 = u8x16_gt(masked1, nines); + let cmpmask2 = u8x16_gt(masked2, nines); + + // add '0' or the offset depending on the masks + let masked1 = u8x16_add(masked1, v128_bitselect(ascii_a, ascii_zero, cmpmask1)); + let masked2 = u8x16_add(masked2, v128_bitselect(ascii_a, ascii_zero, cmpmask2)); + + // Next we need to shuffle around masked{1,2} to get back to the + // original source text order. 
The first element (res1) we'll store uses + // all the low bytes from the 2 masks and the second element (res2) uses + // all the upper bytes. + let res1 = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + masked2, masked1, + ); + let res2 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( + masked2, masked1, + ); + + unsafe { + // SAFETY: the assertion at the beginning of the function ensures + // that `dst` is large enough. + v128_store(dst.as_mut_ptr().add(i * 2) as *mut _, res1); + v128_store(dst.as_mut_ptr().add(i * 2 + 16) as *mut _, res2); + } + src = &src[16..]; + i += 16; + } + + let _ = hex_encode_fallback(src, &mut dst[i * 2..]); + + // SAFETY: `dst` only contains ASCII characters + unsafe { Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) } +} + +fn hex_encode_fallback<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { + fn hex(byte: u8) -> u8 { + static TABLE: &[u8] = b"0123456789abcdef"; + TABLE[byte as usize] + } + + for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) { + slots[0] = hex((*byte >> 4) & 0xf); + slots[1] = hex(*byte & 0xf); + } + + unsafe { Ok(str::from_utf8_unchecked(&dst[..src.len() * 2])) } +} + +// Run these with `cargo +nightly test --example hex -p stdarch` +#[cfg(test)] +mod tests { + use super::*; + + fn test(input: &[u8], output: &str) { + let tmp = || vec![0; input.len() * 2]; + + assert_eq!(hex_encode_fallback(input, &mut tmp()).unwrap(), output); + assert_eq!(hex_encode(input, &mut tmp()).unwrap(), output); + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + unsafe { + if self::is_x86_feature_detected!("avx2") { + assert_eq!(hex_encode_avx2(input, &mut tmp()).unwrap(), output); + } + if self::is_x86_feature_detected!("sse4.1") { + assert_eq!(hex_encode_sse41(input, &mut tmp()).unwrap(), output); + } + } + } + + #[test] + fn empty() { + test(b"", ""); + } + + #[test] + fn big() { + test(&[0; 1024], &"0".repeat(2048)); + } + + #[test] + fn odd() { + test(&[0; 313], &"0".repeat(313 * 2)); + } + + #[test] + fn avx_works() { + let mut input = [0; 33]; + input[4] = 3; + input[16] = 3; + input[17] = 0x30; + input[21] = 1; + input[31] = 0x24; + test( + &input, + "\ + 0000000003000000\ + 0000000000000000\ + 0330000000010000\ + 0000000000000024\ + 00\ + ", + ); + } + + quickcheck::quickcheck! 
{ + fn encode_equals_fallback(input: Vec) -> bool { + let mut space1 = vec![0; input.len() * 2]; + let mut space2 = vec![0; input.len() * 2]; + let a = hex_encode(&input, &mut space1).unwrap(); + let b = hex_encode_fallback(&input, &mut space2).unwrap(); + a == b + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn avx_equals_fallback(input: Vec) -> bool { + if !self::is_x86_feature_detected!("avx2") { + return true + } + let mut space1 = vec![0; input.len() * 2]; + let mut space2 = vec![0; input.len() * 2]; + let a = unsafe { hex_encode_avx2(&input, &mut space1).unwrap() }; + let b = hex_encode_fallback(&input, &mut space2).unwrap(); + a == b + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn sse41_equals_fallback(input: Vec) -> bool { + if !self::is_x86_feature_detected!("avx2") { + return true + } + let mut space1 = vec![0; input.len() * 2]; + let mut space2 = vec![0; input.len() * 2]; + let a = unsafe { hex_encode_sse41(&input, &mut space1).unwrap() }; + let b = hex_encode_fallback(&input, &mut space2).unwrap(); + a == b + } + } +} + +// Run these with `cargo +nightly bench --example hex -p stdarch` +#[cfg(test)] +mod benches { + extern crate rand; + extern crate test; + + use self::rand::Rng; + + use super::*; + + const SMALL_LEN: usize = 117; + const LARGE_LEN: usize = 1 * 1024 * 1024; + + fn doit( + b: &mut test::Bencher, + len: usize, + f: for<'a> unsafe fn(&[u8], &'a mut [u8]) -> Result<&'a str, usize>, + ) { + let mut rng = rand::thread_rng(); + let input = std::iter::repeat(()) + .map(|()| rng.r#gen::()) + .take(len) + .collect::>(); + let mut dst = vec![0; input.len() * 2]; + b.bytes = len as u64; + b.iter(|| unsafe { + f(&input, &mut dst).unwrap(); + dst[0] + }); + } + + #[bench] + fn small_default(b: &mut test::Bencher) { + doit(b, SMALL_LEN, hex_encode); + } + + #[bench] + fn small_fallback(b: &mut test::Bencher) { + doit(b, SMALL_LEN, hex_encode_fallback); + } + + #[bench] + fn large_default(b: &mut test::Bencher) { + doit(b, LARGE_LEN, hex_encode); + } + + #[bench] + fn large_fallback(b: &mut test::Bencher) { + doit(b, LARGE_LEN, hex_encode_fallback); + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + mod x86 { + use super::*; + + #[bench] + fn small_avx2(b: &mut test::Bencher) { + if self::is_x86_feature_detected!("avx2") { + doit(b, SMALL_LEN, hex_encode_avx2); + } + } + + #[bench] + fn small_sse41(b: &mut test::Bencher) { + if self::is_x86_feature_detected!("sse4.1") { + doit(b, SMALL_LEN, hex_encode_sse41); + } + } + + #[bench] + fn large_avx2(b: &mut test::Bencher) { + if self::is_x86_feature_detected!("avx2") { + doit(b, LARGE_LEN, hex_encode_avx2); + } + } + + #[bench] + fn large_sse41(b: &mut test::Bencher) { + if self::is_x86_feature_detected!("sse4.1") { + doit(b, LARGE_LEN, hex_encode_sse41); + } + } + } +} diff --git a/library/stdarch/examples/wasm.rs b/library/stdarch/examples/wasm.rs new file mode 100644 index 0000000000000..ed313b15d1e34 --- /dev/null +++ b/library/stdarch/examples/wasm.rs @@ -0,0 +1,48 @@ +//! 
A simple slab allocator for pages in wasm + +#![cfg(target_arch = "wasm32")] + +use std::ptr; + +use core_arch::arch::wasm32::*; + +static mut HEAD: *mut *mut u8 = 0 as _; + +#[unsafe(no_mangle)] +pub unsafe extern "C" fn page_alloc() -> *mut u8 { + unsafe { + if !HEAD.is_null() { + let next = *HEAD; + let ret = HEAD; + HEAD = next as *mut _; + return ret as *mut u8; + } + } + + let ret = memory_grow(0, 1); + + // if we failed to allocate a page then return null + if ret == usize::MAX { + return ptr::null_mut(); + } + + ((ret as u32) * page_size()) as *mut u8 +} + +#[unsafe(no_mangle)] +pub unsafe extern "C" fn page_free(page: *mut u8) { + let page = page as *mut *mut u8; + unsafe { + *page = HEAD as *mut u8; + HEAD = page; + } +} + +#[unsafe(no_mangle)] +pub unsafe extern "C" fn memory_used() -> usize { + (page_size() * (memory_size(0) as u32)) as usize +} + +fn page_size() -> u32 { + 64 * 1024 +} diff --git a/library/stdarch/intrinsics_data/arm_intrinsics.json b/library/stdarch/intrinsics_data/arm_intrinsics.json new file mode 100644 index 0000000000000..9d58aad49cd44 --- /dev/null +++ b/library/stdarch/intrinsics_data/arm_intrinsics.json @@ -0,0 +1,119757 @@ +[ + { + "SIMD_ISA": "Neon", + "name": "__crc32b", + "arguments": [ + "uint32_t a", + "uint8_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Wn" + }, + "b": { + "register": "Wm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "CRC32B" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "__crc32cb", + "arguments": [ + "uint32_t a", + "uint8_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Wn" + }, + "b": { + "register": "Wm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "CRC32CB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "__crc32cd", + "arguments": [ + "uint32_t a", + "uint64_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Wn" + }, + "b": { + "register": "Xm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "CRC32CX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "__crc32ch", + "arguments": [ + "uint32_t a", + "uint16_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Wn" + }, + "b": { + "register": "Wm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "CRC32CH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "__crc32cw", + "arguments": [ + "uint32_t a", + "uint32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Wn" + }, + "b": { + "register": "Wm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "CRC32CW" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "__crc32d", + "arguments": [ + "uint32_t a", + "uint64_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Wn" + }, + "b": { + "register": "Xm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "CRC32X" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "__crc32h", + "arguments": [ + "uint32_t a", + "uint16_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Wn" + }, + "b": { + "register": "Wm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "CRC32H" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "__crc32w", + "arguments": [ + "uint32_t a", + "uint32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Wn" + }, + "b": { + "register": "Wm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "CRC32W" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaba_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "int16x4_t c" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaba_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "int32x2_t c" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaba_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b", + "int8x8_t c" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaba_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b", + "uint16x4_t c" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaba_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b", + "uint32x2_t c" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaba_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b", + "uint8x8_t c" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabal_high_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16x8_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SABAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabal_high_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32x4_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SABAL2" + ] + ] + }, 
+ { + "SIMD_ISA": "Neon", + "name": "vabal_high_s8", + "arguments": [ + "int16x8_t a", + "int8x16_t b", + "int8x16_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SABAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabal_high_u16", + "arguments": [ + "uint32x4_t a", + "uint16x8_t b", + "uint16x8_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UABAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabal_high_u32", + "arguments": [ + "uint64x2_t a", + "uint32x4_t b", + "uint32x4_t c" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UABAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabal_high_u8", + "arguments": [ + "uint16x8_t a", + "uint8x16_t b", + "uint8x16_t c" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UABAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabal_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16x4_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabal_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32x2_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabal_s8", + "arguments": [ + "int16x8_t a", + "int8x8_t b", + "int8x8_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabal_u16", + "arguments": [ + "uint32x4_t a", + "uint16x4_t b", + "uint16x4_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabal_u32", + "arguments": [ + "uint64x2_t a", + "uint32x2_t b", + "uint32x2_t c" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + 
"Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabal_u8", + "arguments": [ + "uint16x8_t a", + "uint8x8_t b", + "uint8x8_t c" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabaq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16x8_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabaq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32x4_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabaq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b", + "int8x16_t c" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabaq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "uint16x8_t c" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabaq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x4_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabaq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b", + "uint8x16_t c" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabd_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabd_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": 
"Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabd_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabd_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabd_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabd_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabd_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabd_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabd_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdd_f64", + "arguments": [ + "float64_t a", + "float64_t b" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdl_high_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SABDL2" + 
] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdl_high_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SABDL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdl_high_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SABDL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdl_high_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UABDL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdl_high_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UABDL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdl_high_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UABDL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdl_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABDL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdl_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABDL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdl_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABDL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdl_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABDL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdl_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABDL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vabdl_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABDL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabdq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabds_f32", + 
"arguments": [ + "float32_t a", + "float32_t b" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FABD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabs_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabs_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabs_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabs_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabs_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabs_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabs_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabsd_s64", + "arguments": [ + "int64_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabsh_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabsq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabsq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabsq_f64", + 
"arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabsq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabsq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabsq_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vabsq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vadd_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vadd_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vadd_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vadd_p16", + "arguments": [ + "poly16x4_t a", + "poly16x4_t b" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vadd_p64", + "arguments": [ + "poly64x1_t a", + "poly64x1_t b" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vadd_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vadd_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vadd_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vadd_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vadd_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vadd_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vadd_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vadd_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vadd_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddd_s64", + "arguments": [ + "int64_t a", + "int64_t b" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddd_u64", + "arguments": [ + "uint64_t a", + "uint64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": 
"float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddhn_high_s16", + "arguments": [ + "int8x8_t r", + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddhn_high_s32", + "arguments": [ + "int16x4_t r", + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddhn_high_s64", + "arguments": [ + "int32x2_t r", + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddhn_high_u16", + "arguments": [ + "uint8x8_t r", + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddhn_high_u32", + "arguments": [ + "uint16x4_t r", + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddhn_high_u64", + "arguments": [ + "uint32x2_t r", + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddhn_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADDHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddhn_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADDHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddhn_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" 
+ } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADDHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddhn_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADDHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddhn_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADDHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddhn_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADDHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddl_high_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SADDL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddl_high_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SADDL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddl_high_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SADDL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddl_high_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UADDL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddl_high_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UADDL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddl_high_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UADDL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddl_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + 
], + "instructions": [ + [ + "SADDL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddl_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADDL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddl_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADDL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddl_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADDL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddl_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADDL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddl_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADDL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddlv_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SADDLV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddlv_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SADDLP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddlv_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SADDLV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddlv_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UADDLV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddlv_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UADDLP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddlv_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UADDLV" + ] + ] + }, + { + "SIMD_ISA": "Neon", 
+ "name": "vaddlvq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SADDLV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddlvq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SADDLV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddlvq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SADDLV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddlvq_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UADDLV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddlvq_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UADDLV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddlvq_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UADDLV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddq_p128", + "arguments": [ + "poly128_t a", + "poly128_t b" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddq_p16", + "arguments": [ + "poly16x8_t a", + "poly16x8_t b" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddq_p64", + "arguments": [ + "poly64x2_t a", + "poly64x2_t b" 
+ ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddq_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddv_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + 
"value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddv_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddv_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddv_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddv_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddv_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddv_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddvq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FADDP", + "FADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddvq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddvq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddvq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddvq_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddvq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + 
"ADDV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddvq_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddvq_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddvq_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddvq_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddw_high_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SADDW2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddw_high_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SADDW2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddw_high_s8", + "arguments": [ + "int16x8_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SADDW2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddw_high_u16", + "arguments": [ + "uint32x4_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UADDW2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddw_high_u32", + "arguments": [ + "uint64x2_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UADDW2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddw_high_u8", + "arguments": [ + "uint16x8_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UADDW2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddw_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + 
"SADDW" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddw_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADDW" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddw_s8", + "arguments": [ + "int16x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADDW" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddw_u16", + "arguments": [ + "uint32x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADDW" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddw_u32", + "arguments": [ + "uint64x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADDW" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaddw_u8", + "arguments": [ + "uint16x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADDW" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaesdq_u8", + "arguments": [ + "uint8x16_t data", + "uint8x16_t key" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "data": { + "register": "Vd.16B" + }, + "key": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "AESD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaeseq_u8", + "arguments": [ + "uint8x16_t data", + "uint8x16_t key" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "data": { + "register": "Vd.16B" + }, + "key": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "AESE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaesimcq_u8", + "arguments": [ + "uint8x16_t data" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "data": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "AESIMC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaesmcq_u8", + "arguments": [ + "uint8x16_t data" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "data": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "AESMC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vand_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vand_s32", + "arguments": [ + "int32x2_t 
a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vand_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vand_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vand_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vand_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vand_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vand_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vandq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vandq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vandq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vandq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + 
"return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vandq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vandq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vandq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vandq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "AND" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbcaxq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16x8_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "BCAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbcaxq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32x4_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "BCAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbcaxq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b", + "int64x2_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "BCAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbcaxq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b", + "int8x16_t c" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "BCAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbcaxq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "uint16x8_t c" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "BCAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbcaxq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x4_t c" + ], + "return_type": { + "value": "uint32x4_t" + 
}, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "BCAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbcaxq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b", + "uint64x2_t c" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "BCAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbcaxq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b", + "uint8x16_t c" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "BCAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbic_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbic_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbic_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbic_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbic_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbic_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbic_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbic_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + 
"register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbicq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbicq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbicq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbicq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbicq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbicq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbicq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbicq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BIC" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbsl_f16", + "arguments": [ + "uint16x4_t a", + "float16x4_t b", + "float16x4_t c" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbsl_f32", + "arguments": [ + "uint32x2_t a", + "float32x2_t b", + "float32x2_t c" + ], + "return_type": { + "value": "float32x2_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbsl_f64", + "arguments": [ + "uint64x1_t a", + "float64x1_t b", + "float64x1_t c" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbsl_p16", + "arguments": [ + "uint16x4_t a", + "poly16x4_t b", + "poly16x4_t c" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbsl_p64", + "arguments": [ + "poly64x1_t a", + "poly64x1_t b", + "poly64x1_t c" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbsl_p8", + "arguments": [ + "uint8x8_t a", + "poly8x8_t b", + "poly8x8_t c" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbsl_s16", + "arguments": [ + "uint16x4_t a", + "int16x4_t b", + "int16x4_t c" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbsl_s32", + "arguments": [ + "uint32x2_t a", + "int32x2_t b", + "int32x2_t c" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbsl_s64", + "arguments": [ + "uint64x1_t a", + "int64x1_t b", + "int64x1_t c" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbsl_s8", + "arguments": [ + "uint8x8_t a", + "int8x8_t b", + "int8x8_t c" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbsl_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t 
b", + "uint16x4_t c" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbsl_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b", + "uint32x2_t c" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbsl_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b", + "uint64x1_t c" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbsl_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b", + "uint8x8_t c" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbslq_f16", + "arguments": [ + "uint16x8_t a", + "float16x8_t b", + "float16x8_t c" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbslq_f32", + "arguments": [ + "uint32x4_t a", + "float32x4_t b", + "float32x4_t c" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbslq_f64", + "arguments": [ + "uint64x2_t a", + "float64x2_t b", + "float64x2_t c" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbslq_p16", + "arguments": [ + "uint16x8_t a", + "poly16x8_t b", + "poly16x8_t c" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbslq_p64", + "arguments": [ + "poly64x2_t a", + "poly64x2_t b", + "poly64x2_t c" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + 
"BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbslq_p8", + "arguments": [ + "uint8x16_t a", + "poly8x16_t b", + "poly8x16_t c" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbslq_s16", + "arguments": [ + "uint16x8_t a", + "int16x8_t b", + "int16x8_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbslq_s32", + "arguments": [ + "uint32x4_t a", + "int32x4_t b", + "int32x4_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbslq_s64", + "arguments": [ + "uint64x2_t a", + "int64x2_t b", + "int64x2_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbslq_s8", + "arguments": [ + "uint8x16_t a", + "int8x16_t b", + "int8x16_t c" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbslq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "uint16x8_t c" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbslq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x4_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbslq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b", + "uint64x2_t c" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vbslq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b", + "uint8x16_t c" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + 
"register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "BSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcadd_rot270_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H " + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcadd_rot270_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S " + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcadd_rot90_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H " + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcadd_rot90_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S " + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaddq_rot270_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H " + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaddq_rot270_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S " + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaddq_rot270_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D " + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaddq_rot90_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H " + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaddq_rot90_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S " + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaddq_rot90_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" 
+ }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D " + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcage_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcage_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcage_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaged_f64", + "arguments": [ + "float64_t a", + "float64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcageh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcageq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcageq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcageq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcages_f32", + "arguments": [ + "float32_t a", + "float32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcagt_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + 
"Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcagt_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcagt_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcagtd_f64", + "arguments": [ + "float64_t a", + "float64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcagth_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcagtq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcagtq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcagtq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcagts_f32", + "arguments": [ + "float32_t a", + "float32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcale_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcale_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vcale_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaled_f64", + "arguments": [ + "float64_t a", + "float64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaleh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaleq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaleq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaleq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcales_f32", + "arguments": [ + "float32_t a", + "float32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcalt_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcalt_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcalt_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaltd_f64", + "arguments": [ + "float64_t a", + "float64_t b" + ], + "return_type": { + 
"value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcalth_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaltq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaltq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcaltq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcalts_f32", + "arguments": [ + "float32_t a", + "float32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FACGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceq_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceq_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceq_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceq_p64", + "arguments": [ + "poly64x1_t a", + "poly64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceq_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } 
+ }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceq_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceq_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceq_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceq_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceq_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceq_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceq_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceq_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqd_f64", + "arguments": [ + "float64_t a", + "float64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqd_s64", + "arguments": [ + "int64_t a", + "int64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": 
"Neon", + "name": "vceqd_u64", + "arguments": [ + "uint64_t a", + "uint64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqq_p64", + "arguments": [ + "poly64x2_t a", + "poly64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqq_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + 
"return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqs_f32", + "arguments": [ + "float32_t a", + "float32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqz_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqz_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqz_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqz_p64", + "arguments": [ + "poly64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqz_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqz_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": 
"uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqz_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqz_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqz_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqz_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqz_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqz_u64", + "arguments": [ + "uint64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqz_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzd_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzd_s64", + "arguments": [ + "int64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzd_u64", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzh_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] 
+ ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzq_p64", + "arguments": [ + "poly64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzq_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzq_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzq_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzq_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzq_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzq_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vceqzs_f32", + "arguments": [ + "float32_t a" + ], + "return_type": 
{ + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMEQ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcge_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcge_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcge_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcge_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcge_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcge_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcge_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcge_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcge_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcge_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": 
"Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcge_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcged_f64", + "arguments": [ + "float64_t a", + "float64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcged_s64", + "arguments": [ + "int64_t a", + "int64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcged_u64", + "arguments": [ + "uint64_t a", + "uint64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgeh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgeq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgeq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgeq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgeq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgeq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vcgeq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgeq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgeq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgeq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgeq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgeq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcges_f32", + "arguments": [ + "float32_t a", + "float32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgez_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgez_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgez_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgez_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vcgez_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgez_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgez_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgezd_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgezd_s64", + "arguments": [ + "int64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgezh_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgezq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgezq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgezq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgezq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgezq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgezq_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgezq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgezs_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgt_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgt_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgt_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgt_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgt_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgt_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgt_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgt_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgt_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHI" 
+ ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgt_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgt_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtd_f64", + "arguments": [ + "float64_t a", + "float64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtd_s64", + "arguments": [ + "int64_t a", + "int64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtd_u64", + "arguments": [ + "uint64_t a", + "uint64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgth_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": 
"uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgts_f32", + "arguments": [ + "float32_t a", + "float32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtz_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtz_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtz_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vcgtz_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtz_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtz_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtz_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtzd_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtzd_s64", + "arguments": [ + "int64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtzh_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtzq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtzq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtzq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtzq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtzq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtzq_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + 
"register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtzq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcgtzs_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcle_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcle_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcle_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcle_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcle_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcle_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcle_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcle_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcle_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t 
b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcle_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcle_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcled_f64", + "arguments": [ + "float64_t a", + "float64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcled_s64", + "arguments": [ + "int64_t a", + "int64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcled_u64", + "arguments": [ + "uint64_t a", + "uint64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcleh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcleq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcleq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcleq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcleq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + 
} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcleq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcleq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcleq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcleq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcleq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcleq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcleq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcles_f32", + "arguments": [ + "float32_t a", + "float32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclez_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclez_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclez_f64", + "arguments": [ + "float64x1_t a" 
+ ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclez_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclez_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclez_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclez_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclezd_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclezd_s64", + "arguments": [ + "int64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclezh_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclezq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclezq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclezq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclezq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclezq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ 
+ "A64" + ], + "instructions": [ + [ + "CMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclezq_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclezq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclezs_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcls_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcls_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcls_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcls_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcls_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcls_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclsq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclsq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclsq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + 
"instructions": [ + [ + "CLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclsq_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclsq_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclsq_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclt_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclt_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclt_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclt_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclt_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclt_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclt_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclt_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + 
"a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclt_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclt_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclt_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltd_f64", + "arguments": [ + "float64_t a", + "float64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltd_s64", + "arguments": [ + "int64_t a", + "int64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltd_u64", + "arguments": [ + "uint64_t a", + "uint64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclth_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + 
"FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMHI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclts_f32", + "arguments": [ + "float32_t a", + "float32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMGT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltz_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltz_f32", + "arguments": [ + "float32x2_t a" + ], + 
"return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltz_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltz_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltz_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltz_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltz_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltzd_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltzd_s64", + "arguments": [ + "int64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltzh_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltzq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltzq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltzq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltzq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + 
"A64" + ], + "instructions": [ + [ + "CMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltzq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltzq_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltzq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcltzs_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclz_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclz_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclz_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclz_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclz_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclz_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclzq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclzq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ 
+ [ + "CLZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclzq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclzq_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclzq_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vclzq_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CLZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_f16", + "arguments": [ + "float16x4_t r", + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_f32", + "arguments": [ + "float32x2_t r", + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_lane_f16", + "arguments": [ + "float16x4_t r", + "float16x4_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_lane_f32", + "arguments": [ + "float32x2_t r", + "float32x2_t a", + "float32x2_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_laneq_f16", + "arguments": [ + "float16x4_t r", + "float16x4_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_laneq_f32", + "arguments": [ + "float32x2_t r", + "float32x2_t a", + "float32x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + 
"register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot180_f16", + "arguments": [ + "float16x4_t r", + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot180_f32", + "arguments": [ + "float32x2_t r", + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot180_lane_f16", + "arguments": [ + "float16x4_t r", + "float16x4_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot180_lane_f32", + "arguments": [ + "float32x2_t r", + "float32x2_t a", + "float32x2_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot180_laneq_f16", + "arguments": [ + "float16x4_t r", + "float16x4_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot180_laneq_f32", + "arguments": [ + "float32x2_t r", + "float32x2_t a", + "float32x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot270_f16", + "arguments": [ + "float16x4_t r", + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot270_f32", + "arguments": [ + "float32x2_t r", + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot270_lane_f16", + "arguments": [ + "float16x4_t r", + "float16x4_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + 
"Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot270_lane_f32", + "arguments": [ + "float32x2_t r", + "float32x2_t a", + "float32x2_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot270_laneq_f16", + "arguments": [ + "float16x4_t r", + "float16x4_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot270_laneq_f32", + "arguments": [ + "float32x2_t r", + "float32x2_t a", + "float32x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot90_f16", + "arguments": [ + "float16x4_t r", + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot90_f32", + "arguments": [ + "float32x2_t r", + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot90_lane_f16", + "arguments": [ + "float16x4_t r", + "float16x4_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot90_lane_f32", + "arguments": [ + "float32x2_t r", + "float32x2_t a", + "float32x2_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot90_laneq_f16", + "arguments": [ + "float16x4_t r", + "float16x4_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": 
[ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmla_rot90_laneq_f32", + "arguments": [ + "float32x2_t r", + "float32x2_t a", + "float32x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_f16", + "arguments": [ + "float16x8_t r", + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_f32", + "arguments": [ + "float32x4_t r", + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_f64", + "arguments": [ + "float64x2_t r", + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_lane_f16", + "arguments": [ + "float16x8_t r", + "float16x8_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_lane_f32", + "arguments": [ + "float32x4_t r", + "float32x4_t a", + "float32x2_t b", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_laneq_f16", + "arguments": [ + "float16x8_t r", + "float16x8_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_laneq_f32", + "arguments": [ + "float32x4_t r", + "float32x4_t a", + "float32x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot180_f16", + "arguments": [ + "float16x8_t r", + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.8H" + } + }, + 
"Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot180_f32", + "arguments": [ + "float32x4_t r", + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot180_f64", + "arguments": [ + "float64x2_t r", + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot180_lane_f16", + "arguments": [ + "float16x8_t r", + "float16x8_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot180_lane_f32", + "arguments": [ + "float32x4_t r", + "float32x4_t a", + "float32x2_t b", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot180_laneq_f16", + "arguments": [ + "float16x8_t r", + "float16x8_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot180_laneq_f32", + "arguments": [ + "float32x4_t r", + "float32x4_t a", + "float32x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot270_f16", + "arguments": [ + "float16x8_t r", + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot270_f32", + "arguments": [ + "float32x4_t r", + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot270_f64", + "arguments": [ + "float64x2_t r", + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": 
"Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot270_lane_f16", + "arguments": [ + "float16x8_t r", + "float16x8_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot270_lane_f32", + "arguments": [ + "float32x4_t r", + "float32x4_t a", + "float32x2_t b", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot270_laneq_f16", + "arguments": [ + "float16x8_t r", + "float16x8_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot270_laneq_f32", + "arguments": [ + "float32x4_t r", + "float32x4_t a", + "float32x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot90_f16", + "arguments": [ + "float16x8_t r", + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot90_f32", + "arguments": [ + "float32x4_t r", + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot90_f64", + "arguments": [ + "float64x2_t r", + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot90_lane_f16", + "arguments": [ + "float16x8_t r", + "float16x8_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot90_lane_f32", + "arguments": [ + "float32x4_t r", + "float32x4_t a", + "float32x2_t b", + "const int lane" + ], + "return_type": { + 
"value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot90_laneq_f16", + "arguments": [ + "float16x8_t r", + "float16x8_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcmlaq_rot90_laneq_f32", + "arguments": [ + "float32x4_t r", + "float32x4_t a", + "float32x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcnt_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CNT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcnt_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CNT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcnt_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CNT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcntq_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CNT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcntq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CNT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcntq_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CNT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcombine_f16", + "arguments": [ + "float16x4_t low", + "float16x4_t high" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "high": { + "register": "Vm.4H" + }, + "low": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcombine_f32", + "arguments": [ + "float32x2_t low", + "float32x2_t high" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "high": { + "register": "Vm.2S" + }, + "low": { + 
"register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcombine_f64", + "arguments": [ + "float64x1_t low", + "float64x1_t high" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "high": { + "register": "Vm.1D" + }, + "low": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP", + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcombine_p16", + "arguments": [ + "poly16x4_t low", + "poly16x4_t high" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "high": { + "register": "Vm.4H" + }, + "low": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcombine_p64", + "arguments": [ + "poly64x1_t low", + "poly64x1_t high" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "high": { + "register": "Vm.1D" + }, + "low": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcombine_p8", + "arguments": [ + "poly8x8_t low", + "poly8x8_t high" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "high": { + "register": "Vm.8B" + }, + "low": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcombine_s16", + "arguments": [ + "int16x4_t low", + "int16x4_t high" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "high": { + "register": "Vm.4H" + }, + "low": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcombine_s32", + "arguments": [ + "int32x2_t low", + "int32x2_t high" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "high": { + "register": "Vm.2S" + }, + "low": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcombine_s64", + "arguments": [ + "int64x1_t low", + "int64x1_t high" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "high": { + "register": "Vm.1D" + }, + "low": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcombine_s8", + "arguments": [ + "int8x8_t low", + "int8x8_t high" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "high": { + "register": "Vm.8B" + }, + "low": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcombine_u16", + "arguments": [ + "uint16x4_t low", + "uint16x4_t high" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "high": { + "register": "Vm.4H" + }, + "low": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcombine_u32", + "arguments": 
[ + "uint32x2_t low", + "uint32x2_t high" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "high": { + "register": "Vm.2S" + }, + "low": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcombine_u64", + "arguments": [ + "uint64x1_t low", + "uint64x1_t high" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "high": { + "register": "Vm.1D" + }, + "low": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcombine_u8", + "arguments": [ + "uint8x8_t low", + "uint8x8_t high" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "high": { + "register": "Vm.8B" + }, + "low": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP", + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_lane_f32", + "arguments": [ + "float32x2_t a", + "const int lane1", + "float32x2_t b", + "const int lane2" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane1": { + "minimum": 0, + "maximum": 1 + }, + "lane2": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_lane_f64", + "arguments": [ + "float64x1_t a", + "const int lane1", + "float64x1_t b", + "const int lane2" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "UNUSED" + }, + "b": { + "register": "Vn.1D" + }, + "lane1": { + "minimum": 0, + "maximum": 0 + }, + "lane2": { + "minimum": 0, + "maximum": 0 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_lane_p16", + "arguments": [ + "poly16x4_t a", + "const int lane1", + "poly16x4_t b", + "const int lane2" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane1": { + "minimum": 0, + "maximum": 3 + }, + "lane2": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_lane_p64", + "arguments": [ + "poly64x1_t a", + "const int lane1", + "poly64x1_t b", + "const int lane2" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "UNUSED" + }, + "b": { + "register": "Vn.1D" + }, + "lane1": { + "minimum": 0, + "maximum": 0 + }, + "lane2": { + "minimum": 0, + "maximum": 0 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_lane_p8", + "arguments": [ + "poly8x8_t a", + "const int lane1", + "poly8x8_t b", + "const int lane2" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "lane1": { + "minimum": 0, + "maximum": 7 + }, + "lane2": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_lane_s16", + 
"arguments": [ + "int16x4_t a", + "const int lane1", + "int16x4_t b", + "const int lane2" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane1": { + "minimum": 0, + "maximum": 3 + }, + "lane2": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_lane_s32", + "arguments": [ + "int32x2_t a", + "const int lane1", + "int32x2_t b", + "const int lane2" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane1": { + "minimum": 0, + "maximum": 1 + }, + "lane2": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_lane_s64", + "arguments": [ + "int64x1_t a", + "const int lane1", + "int64x1_t b", + "const int lane2" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "UNUSED" + }, + "b": { + "register": "Vn.1D" + }, + "lane1": { + "minimum": 0, + "maximum": 0 + }, + "lane2": { + "minimum": 0, + "maximum": 0 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_lane_s8", + "arguments": [ + "int8x8_t a", + "const int lane1", + "int8x8_t b", + "const int lane2" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "lane1": { + "minimum": 0, + "maximum": 7 + }, + "lane2": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_lane_u16", + "arguments": [ + "uint16x4_t a", + "const int lane1", + "uint16x4_t b", + "const int lane2" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane1": { + "minimum": 0, + "maximum": 3 + }, + "lane2": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_lane_u32", + "arguments": [ + "uint32x2_t a", + "const int lane1", + "uint32x2_t b", + "const int lane2" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane1": { + "minimum": 0, + "maximum": 1 + }, + "lane2": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_lane_u64", + "arguments": [ + "uint64x1_t a", + "const int lane1", + "uint64x1_t b", + "const int lane2" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "UNUSED" + }, + "b": { + "register": "Vn.1D" + }, + "lane1": { + "minimum": 0, + "maximum": 0 + }, + "lane2": { + "minimum": 0, + "maximum": 0 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_lane_u8", + "arguments": [ + "uint8x8_t a", + "const int lane1", + "uint8x8_t b", + "const int lane2" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + 
"register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "lane1": { + "minimum": 0, + "maximum": 7 + }, + "lane2": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_laneq_f32", + "arguments": [ + "float32x2_t a", + "const int lane1", + "float32x4_t b", + "const int lane2" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.4S" + }, + "lane1": { + "minimum": 0, + "maximum": 1 + }, + "lane2": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_laneq_f64", + "arguments": [ + "float64x1_t a", + "const int lane1", + "float64x2_t b", + "const int lane2" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "UNUSED" + }, + "b": { + "register": "Vn.2D" + }, + "lane1": { + "minimum": 0, + "maximum": 0 + }, + "lane2": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_laneq_p16", + "arguments": [ + "poly16x4_t a", + "const int lane1", + "poly16x8_t b", + "const int lane2" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.8H" + }, + "lane1": { + "minimum": 0, + "maximum": 3 + }, + "lane2": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_laneq_p64", + "arguments": [ + "poly64x1_t a", + "const int lane1", + "poly64x2_t b", + "const int lane2" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "UNUSED" + }, + "b": { + "register": "Vn.2D" + }, + "lane1": { + "minimum": 0, + "maximum": 0 + }, + "lane2": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_laneq_p8", + "arguments": [ + "poly8x8_t a", + "const int lane1", + "poly8x16_t b", + "const int lane2" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.16B" + }, + "lane1": { + "minimum": 0, + "maximum": 7 + }, + "lane2": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_laneq_s16", + "arguments": [ + "int16x4_t a", + "const int lane1", + "int16x8_t b", + "const int lane2" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.8H" + }, + "lane1": { + "minimum": 0, + "maximum": 3 + }, + "lane2": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_laneq_s32", + "arguments": [ + "int32x2_t a", + "const int lane1", + "int32x4_t b", + "const int lane2" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.4S" + }, + "lane1": { + "minimum": 0, + "maximum": 1 + }, + "lane2": { + "minimum": 0, + "maximum": 3 + 
} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_laneq_s64", + "arguments": [ + "int64x1_t a", + "const int lane1", + "int64x2_t b", + "const int lane2" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "UNUSED" + }, + "b": { + "register": "Vn.2D" + }, + "lane1": { + "minimum": 0, + "maximum": 0 + }, + "lane2": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_laneq_s8", + "arguments": [ + "int8x8_t a", + "const int lane1", + "int8x16_t b", + "const int lane2" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.16B" + }, + "lane1": { + "minimum": 0, + "maximum": 7 + }, + "lane2": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_laneq_u16", + "arguments": [ + "uint16x4_t a", + "const int lane1", + "uint16x8_t b", + "const int lane2" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.8H" + }, + "lane1": { + "minimum": 0, + "maximum": 3 + }, + "lane2": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_laneq_u32", + "arguments": [ + "uint32x2_t a", + "const int lane1", + "uint32x4_t b", + "const int lane2" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.4S" + }, + "lane1": { + "minimum": 0, + "maximum": 1 + }, + "lane2": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_laneq_u64", + "arguments": [ + "uint64x1_t a", + "const int lane1", + "uint64x2_t b", + "const int lane2" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "UNUSED" + }, + "b": { + "register": "Vn.2D" + }, + "lane1": { + "minimum": 0, + "maximum": 0 + }, + "lane2": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopy_laneq_u8", + "arguments": [ + "uint8x8_t a", + "const int lane1", + "uint8x16_t b", + "const int lane2" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.16B" + }, + "lane1": { + "minimum": 0, + "maximum": 7 + }, + "lane2": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_lane_f32", + "arguments": [ + "float32x4_t a", + "const int lane1", + "float32x2_t b", + "const int lane2" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.2S" + }, + "lane1": { + "minimum": 0, + "maximum": 3 + }, + "lane2": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_lane_f64", + "arguments": [ + 
"float64x2_t a", + "const int lane1", + "float64x1_t b", + "const int lane2" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.1D" + }, + "lane1": { + "minimum": 0, + "maximum": 1 + }, + "lane2": { + "minimum": 0, + "maximum": 0 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_lane_p16", + "arguments": [ + "poly16x8_t a", + "const int lane1", + "poly16x4_t b", + "const int lane2" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.4H" + }, + "lane1": { + "minimum": 0, + "maximum": 7 + }, + "lane2": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_lane_p64", + "arguments": [ + "poly64x2_t a", + "const int lane1", + "poly64x1_t b", + "const int lane2" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.1D" + }, + "lane1": { + "minimum": 0, + "maximum": 1 + }, + "lane2": { + "minimum": 0, + "maximum": 0 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_lane_p8", + "arguments": [ + "poly8x16_t a", + "const int lane1", + "poly8x8_t b", + "const int lane2" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.8B" + }, + "lane1": { + "minimum": 0, + "maximum": 15 + }, + "lane2": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_lane_s16", + "arguments": [ + "int16x8_t a", + "const int lane1", + "int16x4_t b", + "const int lane2" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.4H" + }, + "lane1": { + "minimum": 0, + "maximum": 7 + }, + "lane2": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_lane_s32", + "arguments": [ + "int32x4_t a", + "const int lane1", + "int32x2_t b", + "const int lane2" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.2S" + }, + "lane1": { + "minimum": 0, + "maximum": 3 + }, + "lane2": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_lane_s64", + "arguments": [ + "int64x2_t a", + "const int lane1", + "int64x1_t b", + "const int lane2" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.1D" + }, + "lane1": { + "minimum": 0, + "maximum": 1 + }, + "lane2": { + "minimum": 0, + "maximum": 0 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_lane_s8", + "arguments": [ + "int8x16_t a", + "const int lane1", + "int8x8_t b", + "const int lane2" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + 
"register": "Vd.16B" + }, + "b": { + "register": "Vn.8B" + }, + "lane1": { + "minimum": 0, + "maximum": 15 + }, + "lane2": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_lane_u16", + "arguments": [ + "uint16x8_t a", + "const int lane1", + "uint16x4_t b", + "const int lane2" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.4H" + }, + "lane1": { + "minimum": 0, + "maximum": 7 + }, + "lane2": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_lane_u32", + "arguments": [ + "uint32x4_t a", + "const int lane1", + "uint32x2_t b", + "const int lane2" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.2S" + }, + "lane1": { + "minimum": 0, + "maximum": 3 + }, + "lane2": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_lane_u64", + "arguments": [ + "uint64x2_t a", + "const int lane1", + "uint64x1_t b", + "const int lane2" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.1D" + }, + "lane1": { + "minimum": 0, + "maximum": 1 + }, + "lane2": { + "minimum": 0, + "maximum": 0 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_lane_u8", + "arguments": [ + "uint8x16_t a", + "const int lane1", + "uint8x8_t b", + "const int lane2" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.8B" + }, + "lane1": { + "minimum": 0, + "maximum": 15 + }, + "lane2": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_laneq_f32", + "arguments": [ + "float32x4_t a", + "const int lane1", + "float32x4_t b", + "const int lane2" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane1": { + "minimum": 0, + "maximum": 3 + }, + "lane2": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_laneq_f64", + "arguments": [ + "float64x2_t a", + "const int lane1", + "float64x2_t b", + "const int lane2" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "lane1": { + "minimum": 0, + "maximum": 1 + }, + "lane2": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_laneq_p16", + "arguments": [ + "poly16x8_t a", + "const int lane1", + "poly16x8_t b", + "const int lane2" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane1": { + "minimum": 0, + "maximum": 7 + }, + "lane2": { + "minimum": 0, + "maximum": 7 + 
} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_laneq_p64", + "arguments": [ + "poly64x2_t a", + "const int lane1", + "poly64x2_t b", + "const int lane2" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "lane1": { + "minimum": 0, + "maximum": 1 + }, + "lane2": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_laneq_p8", + "arguments": [ + "poly8x16_t a", + "const int lane1", + "poly8x16_t b", + "const int lane2" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "lane1": { + "minimum": 0, + "maximum": 15 + }, + "lane2": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_laneq_s16", + "arguments": [ + "int16x8_t a", + "const int lane1", + "int16x8_t b", + "const int lane2" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane1": { + "minimum": 0, + "maximum": 7 + }, + "lane2": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_laneq_s32", + "arguments": [ + "int32x4_t a", + "const int lane1", + "int32x4_t b", + "const int lane2" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane1": { + "minimum": 0, + "maximum": 3 + }, + "lane2": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_laneq_s64", + "arguments": [ + "int64x2_t a", + "const int lane1", + "int64x2_t b", + "const int lane2" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "lane1": { + "minimum": 0, + "maximum": 1 + }, + "lane2": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_laneq_s8", + "arguments": [ + "int8x16_t a", + "const int lane1", + "int8x16_t b", + "const int lane2" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "lane1": { + "minimum": 0, + "maximum": 15 + }, + "lane2": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_laneq_u16", + "arguments": [ + "uint16x8_t a", + "const int lane1", + "uint16x8_t b", + "const int lane2" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane1": { + "minimum": 0, + "maximum": 7 + }, + "lane2": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_laneq_u32", + "arguments": 
[ + "uint32x4_t a", + "const int lane1", + "uint32x4_t b", + "const int lane2" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane1": { + "minimum": 0, + "maximum": 3 + }, + "lane2": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_laneq_u64", + "arguments": [ + "uint64x2_t a", + "const int lane1", + "uint64x2_t b", + "const int lane2" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "lane1": { + "minimum": 0, + "maximum": 1 + }, + "lane2": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcopyq_laneq_u8", + "arguments": [ + "uint8x16_t a", + "const int lane1", + "uint8x16_t b", + "const int lane2" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "lane1": { + "minimum": 0, + "maximum": 15 + }, + "lane2": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcreate_f16", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcreate_f32", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcreate_f64", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcreate_p16", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcreate_p64", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcreate_p8", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcreate_s16", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcreate_s32", + "arguments": [ + "uint64_t a" + ], + "return_type": { 
+ "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcreate_s64", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcreate_s8", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcreate_u16", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcreate_u32", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcreate_u64", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcreate_u8", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_f16_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_f16_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_f16_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_f32_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_f32_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_f32_s32", + "arguments": [ + 
"int32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_f32_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_f64_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_f64_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_f64_u64", + "arguments": [ + "uint64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_high_f16_f32", + "arguments": [ + "float16x4_t r", + "float32x4_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_high_f32_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_high_f32_f64", + "arguments": [ + "float32x2_t r", + "float64x2_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_high_f64_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_n_f16_s16", + "arguments": [ + "int16x4_t a", + "const int n" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_n_f16_u16", + "arguments": [ + "uint16x4_t a", + "const int n" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_n_f32_s32", + "arguments": [ + "int32x2_t a", + "const int n" + ], + 
"return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_n_f32_u32", + "arguments": [ + "uint32x2_t a", + "const int n" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_n_f64_s64", + "arguments": [ + "int64x1_t a", + "const int n" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_n_f64_u64", + "arguments": [ + "uint64x1_t a", + "const int n" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_n_s16_f16", + "arguments": [ + "float16x4_t a", + "const int n" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_n_s32_f32", + "arguments": [ + "float32x2_t a", + "const int n" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_n_s64_f64", + "arguments": [ + "float64x1_t a", + "const int n" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_n_u16_f16", + "arguments": [ + "float16x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_n_u32_f32", + "arguments": [ + "float32x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_n_u64_f64", + "arguments": [ + "float64x1_t a", + "const int n" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vcvt_s16_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_s32_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_s64_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_u16_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_u32_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvt_u64_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvta_s16_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTAS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvta_s32_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTAS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvta_s64_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTAS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvta_u16_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTAU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvta_u32_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTAU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvta_u64_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTAU" + ] + ] + }, + { + "SIMD_ISA": 
"Neon", + "name": "vcvtad_s64_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTAS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtad_u64_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTAU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtah_s16_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTAS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtah_s32_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTAS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtah_s64_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTAS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtah_u16_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTAU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtah_u32_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTAU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtah_u64_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTAU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtaq_s16_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTAS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtaq_s32_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTAS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtaq_s64_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTAS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtaq_u16_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTAU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtaq_u32_f32", + "arguments": [ + 
"float32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTAU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtaq_u64_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTAU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtas_s32_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTAS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtas_u32_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTAU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtd_f64_s64", + "arguments": [ + "int64_t a" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtd_f64_u64", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtd_n_f64_s64", + "arguments": [ + "int64_t a", + "const int n" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtd_n_f64_u64", + "arguments": [ + "uint64_t a", + "const int n" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtd_n_s64_f64", + "arguments": [ + "float64_t a", + "const int n" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtd_n_u64_f64", + "arguments": [ + "float64_t a", + "const int n" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtd_s64_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtd_u64_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + 
"Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_f16_s16", + "arguments": [ + "int16_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_f16_s32", + "arguments": [ + "int32_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_f16_s64", + "arguments": [ + "int64_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_f16_u16", + "arguments": [ + "uint16_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_f16_u32", + "arguments": [ + "uint32_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_f16_u64", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_n_f16_s16", + "arguments": [ + "int16_t a", + "const int n" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_n_f16_s32", + "arguments": [ + "int32_t a", + "const int n" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_n_f16_s64", + "arguments": [ + "int64_t a", + "const int n" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_n_f16_u16", + "arguments": [ + "uint16_t a", + "const int n" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_n_f16_u32", + "arguments": [ + "uint32_t a", + "const int n" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] 
+ }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_n_f16_u64", + "arguments": [ + "uint64_t a", + "const int n" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_n_s16_f16", + "arguments": [ + "float16_t a", + "const int n" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_n_s32_f16", + "arguments": [ + "float16_t a", + "const int n" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_n_s64_f16", + "arguments": [ + "float16_t a", + "const int n" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_n_u16_f16", + "arguments": [ + "float16_t a", + "const int n" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_n_u32_f16", + "arguments": [ + "float16_t a", + "const int n" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_n_u64_f16", + "arguments": [ + "float16_t a", + "const int n" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_s16_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_s32_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_s64_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_u16_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ 
+ "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_u32_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvth_u64_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtm_s16_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTMS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtm_s32_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTMS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtm_s64_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTMS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtm_u16_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTMU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtm_u32_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTMU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtm_u64_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTMU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtmd_s64_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTMS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtmd_u64_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTMU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtmh_s16_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTMS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtmh_s32_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + 
"FCVTMS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtmh_s64_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTMS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtmh_u16_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTMU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtmh_u32_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTMU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtmh_u64_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTMU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtmq_s16_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTMS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtmq_s32_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTMS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtmq_s64_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTMS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtmq_u16_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTMU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtmq_u32_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTMU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtmq_u64_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTMU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtms_s32_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTMS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtms_u32_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTMU" + ] + ] + }, + { + "SIMD_ISA": 
"Neon", + "name": "vcvtn_s16_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTNS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtn_s32_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTNS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtn_s64_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTNS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtn_u16_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTNU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtn_u32_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTNU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtn_u64_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTNU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtnd_s64_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTNS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtnd_u64_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTNU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtnh_s16_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTNS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtnh_s32_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTNS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtnh_s64_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTNS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtnh_u16_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTNU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtnh_u32_f16", + "arguments": [ + 
"float16_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTNU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtnh_u64_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTNU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtnq_s16_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTNS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtnq_s32_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTNS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtnq_s64_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTNS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtnq_u16_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTNU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtnq_u32_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTNU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtnq_u64_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTNU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtns_s32_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTNS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtns_u32_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTNU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtp_s16_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtp_s32_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtp_s64_f64", + "arguments": [ + "float64x1_t 
a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtp_u16_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTPU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtp_u32_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTPU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtp_u64_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTPU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtpd_s64_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtpd_u64_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTPU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtph_s16_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtph_s32_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtph_s64_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtph_u16_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTPU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtph_u32_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTPU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtph_u64_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTPU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtpq_s16_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtpq_s32_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtpq_s64_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtpq_u16_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTPU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtpq_u32_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTPU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtpq_u64_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTPU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtps_s32_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtps_u32_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTPU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_f16_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_f16_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_f32_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_f32_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_f64_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": 
"float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_f64_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_n_f16_s16", + "arguments": [ + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_n_f16_u16", + "arguments": [ + "uint16x8_t a", + "const int n" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_n_f32_s32", + "arguments": [ + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_n_f32_u32", + "arguments": [ + "uint32x4_t a", + "const int n" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_n_f64_s64", + "arguments": [ + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_n_f64_u64", + "arguments": [ + "uint64x2_t a", + "const int n" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_n_s16_f16", + "arguments": [ + "float16x8_t a", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_n_s32_f32", + "arguments": [ + "float32x4_t a", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_n_s64_f64", + "arguments": [ + "float64x2_t a", + "const int n" + ], + "return_type": { + "value": "int64x2_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_n_u16_f16", + "arguments": [ + "float16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_n_u32_f32", + "arguments": [ + "float32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_n_u64_f64", + "arguments": [ + "float64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_s16_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_s32_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_s64_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_u16_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_u32_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtq_u64_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvts_f32_s32", + "arguments": [ + "int32_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvts_f32_u32", + "arguments": [ + "uint32_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + 
"register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvts_n_f32_s32", + "arguments": [ + "int32_t a", + "const int n" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvts_n_f32_u32", + "arguments": [ + "uint32_t a", + "const int n" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UCVTF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvts_n_s32_f32", + "arguments": [ + "float32_t a", + "const int n" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvts_n_u32_f32", + "arguments": [ + "float32_t a", + "const int n" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvts_s32_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvts_u32_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTZU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtx_f32_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTXN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtx_high_f32_f64", + "arguments": [ + "float32x2_t r", + "float64x2_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTXN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vcvtxd_f32_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FCVTXN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdiv_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FDIV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdiv_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": 
"Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FDIV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdiv_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FDIV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdivh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FDIV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdivq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FDIV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdivq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FDIV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdivq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FDIV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdot_lane_s32", + "arguments": [ + "int32x2_t r", + "int8x8_t a", + "int8x8_t b", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdot_lane_u32", + "arguments": [ + "uint32x2_t r", + "uint8x8_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "UDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdot_laneq_s32", + "arguments": [ + "int32x2_t r", + "int8x8_t a", + "int8x16_t b", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdot_laneq_u32", + "arguments": [ + "uint32x2_t r", + "uint8x8_t a", + "uint8x16_t b", + "const int lane" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + 
"minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdot_s32", + "arguments": [ + "int32x2_t r", + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdot_u32", + "arguments": [ + "uint32x2_t r", + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "UDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdotq_lane_s32", + "arguments": [ + "int32x4_t r", + "int8x16_t a", + "int8x8_t b", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdotq_lane_u32", + "arguments": [ + "uint32x4_t r", + "uint8x16_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "UDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdotq_laneq_s32", + "arguments": [ + "int32x4_t r", + "int8x16_t a", + "int8x16_t b", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdotq_laneq_u32", + "arguments": [ + "uint32x4_t r", + "uint8x16_t a", + "uint8x16_t b", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdotq_s32", + "arguments": [ + "int32x4_t r", + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdotq_u32", + "arguments": [ + "uint32x4_t r", + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "r": { + "register": "Vd.4S" + } + }, + 
"Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "UDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_lane_f16", + "arguments": [ + "float16x4_t vec", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_lane_f32", + "arguments": [ + "float32x2_t vec", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_lane_f64", + "arguments": [ + "float64x1_t vec", + "const int lane" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "vec": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_lane_p16", + "arguments": [ + "poly16x4_t vec", + "const int lane" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_lane_p64", + "arguments": [ + "poly64x1_t vec", + "const int lane" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "vec": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_lane_p8", + "arguments": [ + "poly8x8_t vec", + "const int lane" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_lane_s16", + "arguments": [ + "int16x4_t vec", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_lane_s32", + "arguments": [ + "int32x2_t vec", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_lane_s64", + "arguments": [ + "int64x1_t vec", + "const int lane" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "vec": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_lane_s8", + "arguments": [ + "int8x8_t vec", + "const int 
lane" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_lane_u16", + "arguments": [ + "uint16x4_t vec", + "const int lane" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_lane_u32", + "arguments": [ + "uint32x2_t vec", + "const int lane" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_lane_u64", + "arguments": [ + "uint64x1_t vec", + "const int lane" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "vec": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_lane_u8", + "arguments": [ + "uint8x8_t vec", + "const int lane" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_laneq_f16", + "arguments": [ + "float16x8_t vec", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_laneq_f32", + "arguments": [ + "float32x4_t vec", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_laneq_f64", + "arguments": [ + "float64x2_t vec", + "const int lane" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_laneq_p16", + "arguments": [ + "poly16x8_t vec", + "const int lane" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_laneq_p64", + "arguments": [ + "poly64x2_t vec", + "const int lane" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + 
"instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_laneq_p8", + "arguments": [ + "poly8x16_t vec", + "const int lane" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_laneq_s16", + "arguments": [ + "int16x8_t vec", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_laneq_s32", + "arguments": [ + "int32x4_t vec", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_laneq_s64", + "arguments": [ + "int64x2_t vec", + "const int lane" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_laneq_s8", + "arguments": [ + "int8x16_t vec", + "const int lane" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_laneq_u16", + "arguments": [ + "uint16x8_t vec", + "const int lane" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_laneq_u32", + "arguments": [ + "uint32x4_t vec", + "const int lane" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_laneq_u64", + "arguments": [ + "uint64x2_t vec", + "const int lane" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_laneq_u8", + "arguments": [ + "uint8x16_t vec", + "const int lane" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_n_f16", + "arguments": [ + "float16_t value" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + 
"instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_n_f32", + "arguments": [ + "float32_t value" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_n_f64", + "arguments": [ + "float64_t value" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_n_p16", + "arguments": [ + "poly16_t value" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_n_p64", + "arguments": [ + "poly64_t value" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_n_p8", + "arguments": [ + "poly8_t value" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_n_s16", + "arguments": [ + "int16_t value" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_n_s32", + "arguments": [ + "int32_t value" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_n_s64", + "arguments": [ + "int64_t value" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_n_s8", + "arguments": [ + "int8_t value" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_n_u16", + "arguments": [ + "uint16_t value" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_n_u32", + "arguments": [ + "uint32_t value" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_n_u64", + "arguments": [ + "uint64_t value" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "value": { + 
"register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "INS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdup_n_u8", + "arguments": [ + "uint8_t value" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupb_lane_p8", + "arguments": [ + "poly8x8_t vec", + "const int lane" + ], + "return_type": { + "value": "poly8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupb_lane_s8", + "arguments": [ + "int8x8_t vec", + "const int lane" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupb_lane_u8", + "arguments": [ + "uint8x8_t vec", + "const int lane" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupb_laneq_p8", + "arguments": [ + "poly8x16_t vec", + "const int lane" + ], + "return_type": { + "value": "poly8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupb_laneq_s8", + "arguments": [ + "int8x16_t vec", + "const int lane" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupb_laneq_u8", + "arguments": [ + "uint8x16_t vec", + "const int lane" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupd_lane_f64", + "arguments": [ + "float64x1_t vec", + "const int lane" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "vec": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupd_lane_s64", + "arguments": [ + "int64x1_t vec", + "const int lane" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "vec": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupd_lane_u64", + "arguments": [ + "uint64x1_t vec", + "const int lane" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "vec": { + "register": 
"Vn.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupd_laneq_f64", + "arguments": [ + "float64x2_t vec", + "const int lane" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupd_laneq_s64", + "arguments": [ + "int64x2_t vec", + "const int lane" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupd_laneq_u64", + "arguments": [ + "uint64x2_t vec", + "const int lane" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vduph_lane_f16", + "arguments": [ + "float16x4_t vec", + "const int lane" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vduph_lane_p16", + "arguments": [ + "poly16x4_t vec", + "const int lane" + ], + "return_type": { + "value": "poly16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vduph_lane_s16", + "arguments": [ + "int16x4_t vec", + "const int lane" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vduph_lane_u16", + "arguments": [ + "uint16x4_t vec", + "const int lane" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vduph_laneq_f16", + "arguments": [ + "float16x8_t vec", + "const int lane" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vduph_laneq_p16", + "arguments": [ + "poly16x8_t vec", + "const int lane" + ], + "return_type": { + "value": "poly16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vduph_laneq_s16", + "arguments": [ + "int16x8_t vec", + "const int lane" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + 
"maximum": 7 + }, + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vduph_laneq_u16", + "arguments": [ + "uint16x8_t vec", + "const int lane" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_lane_f16", + "arguments": [ + "float16x4_t vec", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_lane_f32", + "arguments": [ + "float32x2_t vec", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_lane_f64", + "arguments": [ + "float64x1_t vec", + "const int lane" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "vec": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_lane_p16", + "arguments": [ + "poly16x4_t vec", + "const int lane" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_lane_p64", + "arguments": [ + "poly64x1_t vec", + "const int lane" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "vec": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_lane_p8", + "arguments": [ + "poly8x8_t vec", + "const int lane" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_lane_s16", + "arguments": [ + "int16x4_t vec", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_lane_s32", + "arguments": [ + "int32x2_t vec", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_lane_s64", + 
"arguments": [ + "int64x1_t vec", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "vec": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_lane_s8", + "arguments": [ + "int8x8_t vec", + "const int lane" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_lane_u16", + "arguments": [ + "uint16x4_t vec", + "const int lane" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_lane_u32", + "arguments": [ + "uint32x2_t vec", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_lane_u64", + "arguments": [ + "uint64x1_t vec", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "vec": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_lane_u8", + "arguments": [ + "uint8x8_t vec", + "const int lane" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_laneq_f16", + "arguments": [ + "float16x8_t vec", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_laneq_f32", + "arguments": [ + "float32x4_t vec", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_laneq_f64", + "arguments": [ + "float64x2_t vec", + "const int lane" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_laneq_p16", + "arguments": [ + "poly16x8_t vec", + "const int lane" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + 
"register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_laneq_p64", + "arguments": [ + "poly64x2_t vec", + "const int lane" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_laneq_p8", + "arguments": [ + "poly8x16_t vec", + "const int lane" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_laneq_s16", + "arguments": [ + "int16x8_t vec", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_laneq_s32", + "arguments": [ + "int32x4_t vec", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_laneq_s64", + "arguments": [ + "int64x2_t vec", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_laneq_s8", + "arguments": [ + "int8x16_t vec", + "const int lane" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_laneq_u16", + "arguments": [ + "uint16x8_t vec", + "const int lane" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_laneq_u32", + "arguments": [ + "uint32x4_t vec", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_laneq_u64", + "arguments": [ + "uint64x2_t vec", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_laneq_u8", + "arguments": [ + "uint8x16_t vec", + "const int lane" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + 
"lane": { + "minimum": 0, + "maximum": 15 + }, + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_n_f16", + "arguments": [ + "float16_t value" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_n_f32", + "arguments": [ + "float32_t value" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_n_f64", + "arguments": [ + "float64_t value" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_n_p16", + "arguments": [ + "poly16_t value" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_n_p64", + "arguments": [ + "poly64_t value" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_n_p8", + "arguments": [ + "poly8_t value" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_n_s16", + "arguments": [ + "int16_t value" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_n_s32", + "arguments": [ + "int32_t value" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_n_s64", + "arguments": [ + "int64_t value" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_n_s8", + "arguments": [ + "int8_t value" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_n_u16", + "arguments": [ + "uint16_t value" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_n_u32", 
+ "arguments": [ + "uint32_t value" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_n_u64", + "arguments": [ + "uint64_t value" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdupq_n_u8", + "arguments": [ + "uint8_t value" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdups_lane_f32", + "arguments": [ + "float32x2_t vec", + "const int lane" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdups_lane_s32", + "arguments": [ + "int32x2_t vec", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdups_lane_u32", + "arguments": [ + "uint32x2_t vec", + "const int lane" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "vec": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdups_laneq_f32", + "arguments": [ + "float32x4_t vec", + "const int lane" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdups_laneq_s32", + "arguments": [ + "int32x4_t vec", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vdups_laneq_u32", + "arguments": [ + "uint32x4_t vec", + "const int lane" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "vec": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor3q_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16x8_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "EOR3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor3q_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32x4_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "EOR3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor3q_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b", + "int64x2_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "EOR3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor3q_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b", + "int8x16_t c" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "EOR3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor3q_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "uint16x8_t c" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "EOR3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor3q_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x4_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "EOR3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor3q_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b", + "uint64x2_t c" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "EOR3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor3q_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b", + "uint8x16_t c" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "EOR3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ 
+ "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veor_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veorq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veorq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veorq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veorq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veorq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veorq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + 
"A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veorq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "veorq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EOR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vext_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b", + "const int n" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "n": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vext_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "const int n" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "n": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vext_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b", + "const int n" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "n": { + "minimum": 0, + "maximum": 0 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vext_p16", + "arguments": [ + "poly16x4_t a", + "poly16x4_t b", + "const int n" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "n": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vext_p64", + "arguments": [ + "poly64x1_t a", + "poly64x1_t b", + "const int n" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "n": { + "minimum": 0, + "maximum": 0 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vext_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b", + "const int n" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vext_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "const int n" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "n": { + "minimum": 0, + 
"maximum": 3 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vext_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "const int n" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "n": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vext_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b", + "const int n" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "n": { + "minimum": 0, + "maximum": 0 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vext_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b", + "const int n" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vext_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "n": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vext_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "n": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vext_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b", + "const int n" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "n": { + "minimum": 0, + "maximum": 0 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vext_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vextq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b", + "const int n" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vextq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "const int n" 
+ ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "n": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vextq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b", + "const int n" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "n": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vextq_p16", + "arguments": [ + "poly16x8_t a", + "poly16x8_t b", + "const int n" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vextq_p64", + "arguments": [ + "poly64x2_t a", + "poly64x2_t b", + "const int n" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "n": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vextq_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b", + "const int n" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vextq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vextq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "n": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vextq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b", + "const int n" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "n": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vextq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b", + "const int n" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + 
"A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vextq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vextq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "n": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vextq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b", + "const int n" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "n": { + "minimum": 0, + "maximum": 1 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vextq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b", + "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "EXT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfma_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b", + "float16x4_t c" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfma_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "float32x2_t c" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfma_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b", + "float64x1_t c" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Da" + }, + "b": { + "register": "Dn" + }, + "c": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfma_lane_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b", + "float16x4_t v", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfma_lane_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "float32x2_t v", + "const 
int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfma_lane_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b", + "float64x1_t v", + "const int lane" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vm.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfma_laneq_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b", + "float16x8_t v", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfma_laneq_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfma_laneq_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b", + "float64x2_t v", + "const int lane" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfma_n_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b", + "float16_t n" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H " + }, + "b": { + "register": "Vn.4H" + }, + "n": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfma_n_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "float32_t n" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "n": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfma_n_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b", + "float64_t n" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Da" + }, + "b": { + "register": "Dn" + }, + "n": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmad_lane_f64", + "arguments": [ + "float64_t a", + "float64_t b", + "float64x1_t v", + "const int lane" + ], + 
"return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vm.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmad_laneq_f64", + "arguments": [ + "float64_t a", + "float64_t b", + "float64x2_t v", + "const int lane" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmah_f16", + "arguments": [ + "float16_t a", + "float16_t b", + "float16_t c" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Ha" + }, + "b": { + "register": "Hn" + }, + "c": { + "register": "Hm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmah_lane_f16", + "arguments": [ + "float16_t a", + "float16_t b", + "float16x4_t v", + "const int lane" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hd" + }, + "b": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmah_laneq_f16", + "arguments": [ + "float16_t a", + "float16_t b", + "float16x8_t v", + "const int lane" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hd" + }, + "b": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmaq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b", + "float16x8_t c" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmaq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "float32x4_t c" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmaq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b", + "float64x2_t c" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "c": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmaq_lane_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b", + "float16x4_t v", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + 
}, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmaq_lane_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmaq_lane_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b", + "float64x1_t v", + "const int lane" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vm.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmaq_laneq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b", + "float16x8_t v", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmaq_laneq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmaq_laneq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b", + "float64x2_t v", + "const int lane" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmaq_n_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b", + "float16_t n" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H " + }, + "b": { + "register": "Vn.8H" + }, + "n": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmaq_n_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "float32_t n" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "n": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmaq_n_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b", + "float64_t n" + ], + "return_type": { + "value": "float64x2_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "n": { + "register": "Vm.D[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmas_lane_f32", + "arguments": [ + "float32_t a", + "float32_t b", + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmas_laneq_f32", + "arguments": [ + "float32_t a", + "float32_t b", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlal_high_f16", + "arguments": [ + "float32x2_t r", + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlal_lane_high_f16", + "arguments": [ + "float32x2_t r", + "float16x4_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlal_lane_low_f16", + "arguments": [ + "float32x2_t r", + "float16x4_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlal_laneq_high_f16", + "arguments": [ + "float32x2_t r", + "float16x4_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlal_laneq_low_f16", + "arguments": [ + "float32x2_t r", + "float16x4_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlal_low_f16", + "arguments": [ + "float32x2_t r", + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + 
"A64" + ], + "instructions": [ + [ + "FMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlalq_high_f16", + "arguments": [ + "float32x4_t r", + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlalq_lane_high_f16", + "arguments": [ + "float32x4_t r", + "float16x8_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlalq_lane_low_f16", + "arguments": [ + "float32x4_t r", + "float16x8_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlalq_laneq_high_f16", + "arguments": [ + "float32x4_t r", + "float16x8_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlalq_laneq_low_f16", + "arguments": [ + "float32x4_t r", + "float16x8_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlalq_low_f16", + "arguments": [ + "float32x4_t r", + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlsl_high_f16", + "arguments": [ + "float32x2_t r", + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlsl_lane_high_f16", + "arguments": [ + "float32x2_t r", + "float16x4_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlsl_lane_low_f16", + "arguments": [ + "float32x2_t r", + "float16x4_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + 
"Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlsl_laneq_high_f16", + "arguments": [ + "float32x2_t r", + "float16x4_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlsl_laneq_low_f16", + "arguments": [ + "float32x2_t r", + "float16x4_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlsl_low_f16", + "arguments": [ + "float32x2_t r", + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlslq_high_f16", + "arguments": [ + "float32x4_t r", + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlslq_lane_high_f16", + "arguments": [ + "float32x4_t r", + "float16x8_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlslq_lane_low_f16", + "arguments": [ + "float32x4_t r", + "float16x8_t a", + "float16x4_t b", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlslq_laneq_high_f16", + "arguments": [ + "float32x4_t r", + "float16x8_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlslq_laneq_low_f16", + "arguments": [ + "float32x4_t r", + "float16x8_t a", + "float16x8_t b", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ 
+ "FMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmlslq_low_f16", + "arguments": [ + "float32x4_t r", + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfms_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b", + "float16x4_t c" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfms_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "float32x2_t c" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfms_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b", + "float64x1_t c" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Da" + }, + "b": { + "register": "Dn" + }, + "c": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfms_lane_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b", + "float16x4_t v", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfms_lane_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfms_lane_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b", + "float64x1_t v", + "const int lane" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vm.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfms_laneq_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b", + "float16x8_t v", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfms_laneq_f32", + "arguments": [ + "float32x2_t a", + 
"float32x2_t b", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfms_laneq_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b", + "float64x2_t v", + "const int lane" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfms_n_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b", + "float16_t n" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H " + }, + "b": { + "register": "Vn.4H" + }, + "n": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfms_n_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "float32_t n" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "n": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfms_n_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b", + "float64_t n" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Da" + }, + "b": { + "register": "Dn" + }, + "n": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsd_lane_f64", + "arguments": [ + "float64_t a", + "float64_t b", + "float64x1_t v", + "const int lane" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vm.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsd_laneq_f64", + "arguments": [ + "float64_t a", + "float64_t b", + "float64x2_t v", + "const int lane" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsh_f16", + "arguments": [ + "float16_t a", + "float16_t b", + "float16_t c" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Ha" + }, + "b": { + "register": "Hn" + }, + "c": { + "register": "Hm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsh_lane_f16", + "arguments": [ + "float16_t a", + "float16_t b", + "float16x4_t v", + "const int lane" + ], + "return_type": { + "value": "float16_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Hd" + }, + "b": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsh_laneq_f16", + "arguments": [ + "float16_t a", + "float16_t b", + "float16x8_t v", + "const int lane" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hd" + }, + "b": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b", + "float16x8_t c" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "float32x4_t c" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b", + "float64x2_t c" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "c": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsq_lane_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b", + "float16x4_t v", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsq_lane_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsq_lane_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b", + "float64x1_t v", + "const int lane" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vm.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsq_laneq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b", + "float16x8_t v", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsq_laneq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsq_laneq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b", + "float64x2_t v", + "const int lane" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsq_n_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b", + "float16_t n" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H " + }, + "b": { + "register": "Vn.8H" + }, + "n": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsq_n_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "float32_t n" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "n": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmsq_n_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b", + "float64_t n" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "n": { + "register": "Vm.D[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmss_lane_f32", + "arguments": [ + "float32_t a", + "float32_t b", + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vfmss_laneq_f32", + "arguments": [ + "float32_t a", + "float32_t b", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_high_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": 
[ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_high_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_high_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_high_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_high_p64", + "arguments": [ + "poly64x2_t a" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_high_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_high_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_high_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_high_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_high_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_high_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_high_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_high_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_high_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_lane_f16", + "arguments": [ + "float16x4_t v", + "const int lane" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_lane_f32", + "arguments": [ + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_lane_f64", + "arguments": [ + "float64x1_t v", + "const int lane" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_lane_p16", + "arguments": [ + "poly16x4_t v", + "const int lane" + ], + "return_type": { + "value": "poly16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_lane_p64", + "arguments": [ + "poly64x1_t v", + "const int lane" + ], + "return_type": { + "value": "poly64_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_lane_p8", + "arguments": [ + "poly8x8_t v", + "const int lane" + ], + "return_type": { + "value": "poly8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_lane_s16", + "arguments": [ + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_lane_s32", + "arguments": [ + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_lane_s64", + "arguments": [ + "int64x1_t v", + "const int lane" + ], + "return_type": { + "value": 
"int64_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_lane_s8", + "arguments": [ + "int8x8_t v", + "const int lane" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_lane_u16", + "arguments": [ + "uint16x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_lane_u32", + "arguments": [ + "uint32x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_lane_u64", + "arguments": [ + "uint64x1_t v", + "const int lane" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vn.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_lane_u8", + "arguments": [ + "uint8x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_low_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_low_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_low_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_low_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_low_p64", + "arguments": [ + "poly64x2_t a" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": 
[ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_low_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_low_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_low_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_low_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_low_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_low_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_low_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_low_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vget_low_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vgetq_lane_f16", + "arguments": [ + "float16x8_t v", + "const int lane" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vgetq_lane_f32", + "arguments": [ + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vgetq_lane_f64", 
+ "arguments": [ + "float64x2_t v", + "const int lane" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vgetq_lane_p16", + "arguments": [ + "poly16x8_t v", + "const int lane" + ], + "return_type": { + "value": "poly16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vgetq_lane_p64", + "arguments": [ + "poly64x2_t v", + "const int lane" + ], + "return_type": { + "value": "poly64_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vgetq_lane_p8", + "arguments": [ + "poly8x16_t v", + "const int lane" + ], + "return_type": { + "value": "poly8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "v": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vgetq_lane_s16", + "arguments": [ + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vgetq_lane_s32", + "arguments": [ + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vgetq_lane_s64", + "arguments": [ + "int64x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vgetq_lane_s8", + "arguments": [ + "int8x16_t v", + "const int lane" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "v": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vgetq_lane_u16", + "arguments": [ + "uint16x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vgetq_lane_u32", + "arguments": [ + "uint32x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vn.4S" + } + }, + 
"Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vgetq_lane_u64", + "arguments": [ + "uint64x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vgetq_lane_u8", + "arguments": [ + "uint8x16_t v", + "const int lane" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "v": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhadd_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhadd_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhadd_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhadd_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhadd_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhadd_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhaddq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhaddq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + 
"register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhaddq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhaddq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhaddq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhaddq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhsub_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhsub_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhsub_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhsub_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UHSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhsub_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UHSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhsub_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { 
+ "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UHSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhsubq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhsubq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhsubq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhsubq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UHSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhsubq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UHSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vhsubq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UHSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_dup_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_dup_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_dup_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_dup_p16", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vld1_dup_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_dup_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_dup_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_dup_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_dup_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_dup_s8", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_dup_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_dup_u32", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_dup_u64", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_dup_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_f16_x2", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { 
+ "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_f16_x3", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_f16_x4", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_f32_x2", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_f32_x3", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_f32_x4", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_f64_x2", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x1x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_f64_x3", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x1x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_f64_x4", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x1x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_lane_f16", + "arguments": [ + "float16_t const * ptr", + "float16x4_t src", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.4H" + } + }, + "Architectures": [ + 
"v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_lane_f32", + "arguments": [ + "float32_t const * ptr", + "float32x2_t src", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_lane_f64", + "arguments": [ + "float64_t const * ptr", + "float64x1_t src", + "const int lane" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_lane_p16", + "arguments": [ + "poly16_t const * ptr", + "poly16x4_t src", + "const int lane" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_lane_p64", + "arguments": [ + "poly64_t const * ptr", + "poly64x1_t src", + "const int lane" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_lane_p8", + "arguments": [ + "poly8_t const * ptr", + "poly8x8_t src", + "const int lane" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_lane_s16", + "arguments": [ + "int16_t const * ptr", + "int16x4_t src", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_lane_s32", + "arguments": [ + "int32_t const * ptr", + "int32x2_t src", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_lane_s64", + "arguments": [ + "int64_t const * ptr", + "int64x1_t src", + "const int lane" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vld1_lane_s8", + "arguments": [ + "int8_t const * ptr", + "int8x8_t src", + "const int lane" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_lane_u16", + "arguments": [ + "uint16_t const * ptr", + "uint16x4_t src", + "const int lane" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_lane_u32", + "arguments": [ + "uint32_t const * ptr", + "uint32x2_t src", + "const int lane" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_lane_u64", + "arguments": [ + "uint64_t const * ptr", + "uint64x1_t src", + "const int lane" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_lane_u8", + "arguments": [ + "uint8_t const * ptr", + "uint8x8_t src", + "const int lane" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_p16", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_p16_x2", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_p16_x3", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_p16_x4", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x1_t" 
+ }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_p64_x2", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x1x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_p64_x3", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x1x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_p64_x4", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x1x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_p8_x2", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_p8_x3", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_p8_x4", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_s16_x2", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_s16_x3", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_s16_x4", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vld1_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_s32_x2", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_s32_x3", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_s32_x4", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_s64_x2", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x1x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_s64_x3", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x1x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_s64_x4", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x1x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_s8", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_s8_x2", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_s8_x3", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_s8_x4", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x8x4_t" + }, + "Arguments_Preparation": { + 
"ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_u16_x2", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_u16_x3", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_u16_x4", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_u32", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_u32_x2", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_u32_x3", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_u32_x4", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_u64", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_u64_x2", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x1x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_u64_x3", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x1x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { 
+ "SIMD_ISA": "Neon", + "name": "vld1_u64_x4", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x1x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_u8_x2", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_u8_x3", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1_u8_x4", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_dup_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_dup_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_dup_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_dup_p16", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_dup_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_dup_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_dup_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x8_t" + }, + 
"Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_dup_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_dup_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_dup_s8", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_dup_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_dup_u32", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_dup_u64", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_dup_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_f16_x2", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_f16_x3", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_f16_x4", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], 
+ "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_f32_x2", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_f32_x3", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_f32_x4", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_f64_x2", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_f64_x3", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_f64_x4", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_lane_f16", + "arguments": [ + "float16_t const * ptr", + "float16x8_t src", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_lane_f32", + "arguments": [ + "float32_t const * ptr", + "float32x4_t src", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_lane_f64", + "arguments": [ + "float64_t const * ptr", + "float64x2_t src", + "const int lane" + ], + "return_type": { + "value": 
"float64x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_lane_p16", + "arguments": [ + "poly16_t const * ptr", + "poly16x8_t src", + "const int lane" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_lane_p64", + "arguments": [ + "poly64_t const * ptr", + "poly64x2_t src", + "const int lane" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_lane_p8", + "arguments": [ + "poly8_t const * ptr", + "poly8x16_t src", + "const int lane" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_lane_s16", + "arguments": [ + "int16_t const * ptr", + "int16x8_t src", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_lane_s32", + "arguments": [ + "int32_t const * ptr", + "int32x4_t src", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_lane_s64", + "arguments": [ + "int64_t const * ptr", + "int64x2_t src", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_lane_s8", + "arguments": [ + "int8_t const * ptr", + "int8x16_t src", + "const int lane" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_lane_u16", + "arguments": [ + "uint16_t const * ptr", + "uint16x8_t src", + "const int lane" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + 
"maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_lane_u32", + "arguments": [ + "uint32_t const * ptr", + "uint32x4_t src", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_lane_u64", + "arguments": [ + "uint64_t const * ptr", + "uint64x2_t src", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_lane_u8", + "arguments": [ + "uint8_t const * ptr", + "uint8x16_t src", + "const int lane" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_p16", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_p16_x2", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_p16_x3", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_p16_x4", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_p64_x2", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_p64_x3", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + 
"register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_p64_x4", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_p8_x2", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x16x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_p8_x3", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x16x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_p8_x4", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_s16_x2", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_s16_x3", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_s16_x4", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_s32_x2", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vld1q_s32_x3", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_s32_x4", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_s64_x2", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_s64_x3", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_s64_x4", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_s8", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_s8_x2", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x16x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_s8_x3", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x16x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_s8_x4", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_u16_x2", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + 
"register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_u16_x3", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_u16_x4", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_u32", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_u32_x2", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_u32_x3", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_u32_x4", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_u64", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_u64_x2", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_u64_x3", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_u64_x4", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vld1q_u8_x2", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x16x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_u8_x3", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x16x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld1q_u8_x4", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_dup_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_dup_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_dup_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x1x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_dup_p16", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_dup_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x1x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_dup_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_dup_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_dup_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_dup_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x1x2_t" 
+ }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_dup_s8", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_dup_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_dup_u32", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_dup_u64", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x1x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_dup_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x1x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_lane_f16", + "arguments": [ + "float16_t const * ptr", + "float16x4x2_t src", + "const int lane" + ], + "return_type": { + "value": "float16x4x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_lane_f32", + "arguments": [ + "float32_t const * ptr", + "float32x2x2_t src", + "const int lane" + ], + "return_type": { + "value": "float32x2x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vld2_lane_f64", + "arguments": [ + "float64_t const * ptr", + "float64x1x2_t src", + "const int lane" + ], + "return_type": { + "value": "float64x1x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_lane_p16", + "arguments": [ + "poly16_t const * ptr", + "poly16x4x2_t src", + "const int lane" + ], + "return_type": { + "value": "poly16x4x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_lane_p64", + "arguments": [ + "poly64_t const * ptr", + "poly64x1x2_t src", + "const int lane" + ], + "return_type": { + "value": "poly64x1x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_lane_p8", + "arguments": [ + "poly8_t const * ptr", + "poly8x8x2_t src", + "const int lane" + ], + "return_type": { + "value": "poly8x8x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_lane_s16", + "arguments": [ + "int16_t const * ptr", + "int16x4x2_t src", + "const int lane" + ], + "return_type": { + "value": "int16x4x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_lane_s32", + "arguments": [ + "int32_t const * ptr", + "int32x2x2_t src", + "const int lane" + ], + "return_type": { + "value": "int32x2x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_lane_s64", + "arguments": [ + "int64_t const * ptr", + "int64x1x2_t src", + "const int lane" + ], + "return_type": { + "value": "int64x1x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_lane_s8", + "arguments": [ + "int8_t const * ptr", + "int8x8x2_t src", + "const int lane" + ], + "return_type": { + "value": "int8x8x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_lane_u16", + "arguments": [ + "uint16_t 
const * ptr", + "uint16x4x2_t src", + "const int lane" + ], + "return_type": { + "value": "uint16x4x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_lane_u32", + "arguments": [ + "uint32_t const * ptr", + "uint32x2x2_t src", + "const int lane" + ], + "return_type": { + "value": "uint32x2x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_lane_u64", + "arguments": [ + "uint64_t const * ptr", + "uint64x1x2_t src", + "const int lane" + ], + "return_type": { + "value": "uint64x1x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_lane_u8", + "arguments": [ + "uint8_t const * ptr", + "uint8x8x2_t src", + "const int lane" + ], + "return_type": { + "value": "uint8x8x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_p16", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x1x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x1x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_s8", + 
"arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_u32", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_u64", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x1x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_dup_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_dup_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_dup_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_dup_p16", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_dup_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_dup_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x16x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_dup_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" 
+ } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_dup_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_dup_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_dup_s8", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x16x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_dup_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_dup_u32", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_dup_u64", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_dup_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x16x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_lane_f16", + "arguments": [ + "float16_t const * ptr", + "float16x8x2_t src", + "const int lane" + ], + "return_type": { + "value": "float16x8x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.8H" + } + }, + 
"Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_lane_f32", + "arguments": [ + "float32_t const * ptr", + "float32x4x2_t src", + "const int lane" + ], + "return_type": { + "value": "float32x4x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_lane_f64", + "arguments": [ + "float64_t const * ptr", + "float64x2x2_t src", + "const int lane" + ], + "return_type": { + "value": "float64x2x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_lane_p16", + "arguments": [ + "poly16_t const * ptr", + "poly16x8x2_t src", + "const int lane" + ], + "return_type": { + "value": "poly16x8x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_lane_p64", + "arguments": [ + "poly64_t const * ptr", + "poly64x2x2_t src", + "const int lane" + ], + "return_type": { + "value": "poly64x2x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_lane_p8", + "arguments": [ + "poly8_t const * ptr", + "poly8x16x2_t src", + "const int lane" + ], + "return_type": { + "value": "poly8x16x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_lane_s16", + "arguments": [ + "int16_t const * ptr", + "int16x8x2_t src", + "const int lane" + ], + "return_type": { + "value": "int16x8x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_lane_s32", + "arguments": [ + "int32_t const * ptr", + "int32x4x2_t src", + "const int lane" + ], + "return_type": { + "value": "int32x4x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_lane_s64", + "arguments": [ + "int64_t const * ptr", + "int64x2x2_t src", + "const int lane" + ], + "return_type": { + "value": "int64x2x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ 
+ "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_lane_s8", + "arguments": [ + "int8_t const * ptr", + "int8x16x2_t src", + "const int lane" + ], + "return_type": { + "value": "int8x16x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_lane_u16", + "arguments": [ + "uint16_t const * ptr", + "uint16x8x2_t src", + "const int lane" + ], + "return_type": { + "value": "uint16x8x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_lane_u32", + "arguments": [ + "uint32_t const * ptr", + "uint32x4x2_t src", + "const int lane" + ], + "return_type": { + "value": "uint32x4x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_lane_u64", + "arguments": [ + "uint64_t const * ptr", + "uint64x2x2_t src", + "const int lane" + ], + "return_type": { + "value": "uint64x2x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_lane_u8", + "arguments": [ + "uint8_t const * ptr", + "uint8x16x2_t src", + "const int lane" + ], + "return_type": { + "value": "uint8x16x2_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt2.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_p16", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x16x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x4x2_t" + }, + 
"Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_s8", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x16x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x8x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_u32", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x4x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_u64", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x2x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld2q_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x16x2_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_dup_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_dup_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_dup_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x1x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_dup_p16", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_dup_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x1x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": 
"Neon", + "name": "vld3_dup_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_dup_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_dup_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_dup_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x1x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_dup_s8", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_dup_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_dup_u32", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_dup_u64", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x1x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_dup_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x1x3_t" 
+ }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_lane_f16", + "arguments": [ + "float16_t const * ptr", + "float16x4x3_t src", + "const int lane" + ], + "return_type": { + "value": "float16x4x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_lane_f32", + "arguments": [ + "float32_t const * ptr", + "float32x2x3_t src", + "const int lane" + ], + "return_type": { + "value": "float32x2x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_lane_f64", + "arguments": [ + "float64_t const * ptr", + "float64x1x3_t src", + "const int lane" + ], + "return_type": { + "value": "float64x1x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_lane_p16", + "arguments": [ + "poly16_t const * ptr", + "poly16x4x3_t src", + "const int lane" + ], + "return_type": { + "value": "poly16x4x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_lane_p64", + "arguments": [ + "poly64_t const * ptr", + "poly64x1x3_t src", + "const int lane" + ], + "return_type": { + "value": "poly64x1x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_lane_p8", + "arguments": [ + "poly8_t const * ptr", + "poly8x8x3_t src", + "const int lane" + ], + "return_type": { + "value": "poly8x8x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_lane_s16", + "arguments": [ + "int16_t const * ptr", + "int16x4x3_t src", + "const int lane" + ], + "return_type": { + "value": "int16x4x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_lane_s32", + "arguments": [ + "int32_t const * ptr", + "int32x2x3_t src", + "const int lane" + ], + "return_type": { + "value": "int32x2x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": 
"Vt3.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_lane_s64", + "arguments": [ + "int64_t const * ptr", + "int64x1x3_t src", + "const int lane" + ], + "return_type": { + "value": "int64x1x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_lane_s8", + "arguments": [ + "int8_t const * ptr", + "int8x8x3_t src", + "const int lane" + ], + "return_type": { + "value": "int8x8x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_lane_u16", + "arguments": [ + "uint16_t const * ptr", + "uint16x4x3_t src", + "const int lane" + ], + "return_type": { + "value": "uint16x4x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_lane_u32", + "arguments": [ + "uint32_t const * ptr", + "uint32x2x3_t src", + "const int lane" + ], + "return_type": { + "value": "uint32x2x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_lane_u64", + "arguments": [ + "uint64_t const * ptr", + "uint64x1x3_t src", + "const int lane" + ], + "return_type": { + "value": "uint64x1x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_lane_u8", + "arguments": [ + "uint8_t const * ptr", + "uint8x8x3_t src", + "const int lane" + ], + "return_type": { + "value": "uint8x8x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_p16", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x1x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + 
"Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x1x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_s8", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_u32", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_u64", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x1x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_dup_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_dup_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_dup_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_dup_p16", + "arguments": 
[ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_dup_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_dup_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x16x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_dup_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_dup_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_dup_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_dup_s8", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x16x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_dup_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_dup_u32", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_dup_u64", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_dup_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x16x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + 
"Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_lane_f16", + "arguments": [ + "float16_t const * ptr", + "float16x8x3_t src", + "const int lane" + ], + "return_type": { + "value": "float16x8x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_lane_f32", + "arguments": [ + "float32_t const * ptr", + "float32x4x3_t src", + "const int lane" + ], + "return_type": { + "value": "float32x4x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_lane_f64", + "arguments": [ + "float64_t const * ptr", + "float64x2x3_t src", + "const int lane" + ], + "return_type": { + "value": "float64x2x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_lane_p16", + "arguments": [ + "poly16_t const * ptr", + "poly16x8x3_t src", + "const int lane" + ], + "return_type": { + "value": "poly16x8x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_lane_p64", + "arguments": [ + "poly64_t const * ptr", + "poly64x2x3_t src", + "const int lane" + ], + "return_type": { + "value": "poly64x2x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_lane_p8", + "arguments": [ + "poly8_t const * ptr", + "poly8x16x3_t src", + "const int lane" + ], + "return_type": { + "value": "poly8x16x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_lane_s16", + "arguments": [ + "int16_t const * ptr", + "int16x8x3_t src", + "const int lane" + ], + "return_type": { + "value": "int16x8x3_t" + }, + "Arguments_Preparation": { + "lane": { 
+ "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_lane_s32", + "arguments": [ + "int32_t const * ptr", + "int32x4x3_t src", + "const int lane" + ], + "return_type": { + "value": "int32x4x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_lane_s64", + "arguments": [ + "int64_t const * ptr", + "int64x2x3_t src", + "const int lane" + ], + "return_type": { + "value": "int64x2x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_lane_s8", + "arguments": [ + "int8_t const * ptr", + "int8x16x3_t src", + "const int lane" + ], + "return_type": { + "value": "int8x16x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_lane_u16", + "arguments": [ + "uint16_t const * ptr", + "uint16x8x3_t src", + "const int lane" + ], + "return_type": { + "value": "uint16x8x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_lane_u32", + "arguments": [ + "uint32_t const * ptr", + "uint32x4x3_t src", + "const int lane" + ], + "return_type": { + "value": "uint32x4x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_lane_u64", + "arguments": [ + "uint64_t const * ptr", + "uint64x2x3_t src", + "const int lane" + ], + "return_type": { + "value": "uint64x2x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_lane_u8", + "arguments": [ + "uint8_t const * ptr", + "uint8x16x3_t src", + "const int lane" + ], + "return_type": { + "value": "uint8x16x3_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt3.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_p16", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { 
+ "SIMD_ISA": "Neon", + "name": "vld3q_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x16x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_s8", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x16x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x8x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_u32", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x4x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_u64", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x2x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld3q_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x16x3_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_dup_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_dup_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": 
"Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_dup_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x1x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_dup_p16", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_dup_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x1x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_dup_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_dup_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_dup_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_dup_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x1x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_dup_s8", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_dup_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_dup_u32", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_dup_u64", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x1x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", 
+ "name": "vld4_dup_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x1x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_lane_f16", + "arguments": [ + "float16_t const * ptr", + "float16x4x4_t src", + "const int lane" + ], + "return_type": { + "value": "float16x4x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_lane_f32", + "arguments": [ + "float32_t const * ptr", + "float32x2x4_t src", + "const int lane" + ], + "return_type": { + "value": "float32x2x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_lane_f64", + "arguments": [ + "float64_t const * ptr", + "float64x1x4_t src", + "const int lane" + ], + "return_type": { + "value": "float64x1x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_lane_p16", + "arguments": [ + "poly16_t const * ptr", + "poly16x4x4_t src", + "const int lane" + ], + "return_type": { + "value": "poly16x4x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_lane_p64", + "arguments": [ + "poly64_t const * ptr", + "poly64x1x4_t src", + "const int lane" + ], + "return_type": { + "value": "poly64x1x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_lane_p8", + "arguments": [ + "poly8_t const * ptr", + "poly8x8x4_t src", + "const int lane" + ], + "return_type": { + 
"value": "poly8x8x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_lane_s16", + "arguments": [ + "int16_t const * ptr", + "int16x4x4_t src", + "const int lane" + ], + "return_type": { + "value": "int16x4x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_lane_s32", + "arguments": [ + "int32_t const * ptr", + "int32x2x4_t src", + "const int lane" + ], + "return_type": { + "value": "int32x2x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_lane_s64", + "arguments": [ + "int64_t const * ptr", + "int64x1x4_t src", + "const int lane" + ], + "return_type": { + "value": "int64x1x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_lane_s8", + "arguments": [ + "int8_t const * ptr", + "int8x8x4_t src", + "const int lane" + ], + "return_type": { + "value": "int8x8x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_lane_u16", + "arguments": [ + "uint16_t const * ptr", + "uint16x4x4_t src", + "const int lane" + ], + "return_type": { + "value": "uint16x4x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_lane_u32", + "arguments": [ + "uint32_t const * ptr", + "uint32x2x4_t src", + "const int lane" + ], + "return_type": { + "value": "uint32x2x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_lane_u64", + "arguments": [ + "uint64_t const * ptr", + "uint64x1x4_t src", + "const int lane" + ], + "return_type": { + "value": "uint64x1x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_lane_u8", + "arguments": [ + "uint8_t const * ptr", + "uint8x8x4_t src", + "const int lane" + ], + "return_type": { + "value": "uint8x8x4_t" + }, + "Arguments_Preparation": { + 
"lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_p16", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x1x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x1x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_s8", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_u32", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_u64", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x1x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + 
"instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_dup_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_dup_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_dup_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_dup_p16", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_dup_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_dup_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_dup_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_dup_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_dup_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_dup_s8", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_dup_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_dup_u32", + "arguments": [ + "uint32_t const * ptr" + ], + 
"return_type": { + "value": "uint32x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_dup_u64", + "arguments": [ + "uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_dup_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4R" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_f16", + "arguments": [ + "float16_t const * ptr" + ], + "return_type": { + "value": "float16x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_f32", + "arguments": [ + "float32_t const * ptr" + ], + "return_type": { + "value": "float32x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_f64", + "arguments": [ + "float64_t const * ptr" + ], + "return_type": { + "value": "float64x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_lane_f16", + "arguments": [ + "float16_t const * ptr", + "float16x8x4_t src", + "const int lane" + ], + "return_type": { + "value": "float16x8x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_lane_f32", + "arguments": [ + "float32_t const * ptr", + "float32x4x4_t src", + "const int lane" + ], + "return_type": { + "value": "float32x4x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_lane_f64", + "arguments": [ + "float64_t const * ptr", + "float64x2x4_t src", + "const int lane" + ], + "return_type": { + "value": "float64x2x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_lane_p16", + "arguments": [ + "poly16_t const * ptr", + "poly16x8x4_t src", + "const int lane" + ], + "return_type": { + "value": "poly16x8x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vld4q_lane_p64", + "arguments": [ + "poly64_t const * ptr", + "poly64x2x4_t src", + "const int lane" + ], + "return_type": { + "value": "poly64x2x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_lane_p8", + "arguments": [ + "poly8_t const * ptr", + "poly8x16x4_t src", + "const int lane" + ], + "return_type": { + "value": "poly8x16x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_lane_s16", + "arguments": [ + "int16_t const * ptr", + "int16x8x4_t src", + "const int lane" + ], + "return_type": { + "value": "int16x8x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_lane_s32", + "arguments": [ + "int32_t const * ptr", + "int32x4x4_t src", + "const int lane" + ], + "return_type": { + "value": "int32x4x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_lane_s64", + "arguments": [ + "int64_t const * ptr", + "int64x2x4_t src", + "const int lane" + ], + "return_type": { + "value": "int64x2x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_lane_s8", + "arguments": [ + "int8_t const * ptr", + "int8x16x4_t src", + "const int lane" + ], + "return_type": { + "value": "int8x16x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_lane_u16", + "arguments": [ + "uint16_t const * ptr", + "uint16x8x4_t src", + "const int lane" + ], + "return_type": { + "value": "uint16x8x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_lane_u32", + "arguments": [ + "uint32_t const * ptr", + "uint32x4x4_t src", + "const int lane" + ], + "return_type": { + "value": "uint32x4x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_lane_u64", + "arguments": [ + "uint64_t const * ptr", + "uint64x2x4_t src", 
+ "const int lane" + ], + "return_type": { + "value": "uint64x2x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_lane_u8", + "arguments": [ + "uint8_t const * ptr", + "uint8x16x4_t src", + "const int lane" + ], + "return_type": { + "value": "uint8x16x4_t" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "src": { + "register": "Vt4.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_p16", + "arguments": [ + "poly16_t const * ptr" + ], + "return_type": { + "value": "poly16x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_p64", + "arguments": [ + "poly64_t const * ptr" + ], + "return_type": { + "value": "poly64x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_p8", + "arguments": [ + "poly8_t const * ptr" + ], + "return_type": { + "value": "poly8x16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_s16", + "arguments": [ + "int16_t const * ptr" + ], + "return_type": { + "value": "int16x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_s32", + "arguments": [ + "int32_t const * ptr" + ], + "return_type": { + "value": "int32x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_s64", + "arguments": [ + "int64_t const * ptr" + ], + "return_type": { + "value": "int64x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_s8", + "arguments": [ + "int8_t const * ptr" + ], + "return_type": { + "value": "int8x16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_u16", + "arguments": [ + "uint16_t const * ptr" + ], + "return_type": { + "value": "uint16x8x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_u32", + "arguments": [ + "uint32_t const * ptr" + ], + "return_type": { + "value": "uint32x4x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_u64", + "arguments": [ + 
"uint64_t const * ptr" + ], + "return_type": { + "value": "uint64x2x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vld4q_u8", + "arguments": [ + "uint8_t const * ptr" + ], + "return_type": { + "value": "uint8x16x4_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "LD4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vldrq_p128", + "arguments": [ + "poly128_t const * ptr" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "LDR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmax_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmax_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmax_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmax_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmax_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmax_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmax_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmax_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + 
"Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmax_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxnm_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMAXNM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxnm_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMAXNM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxnm_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXNM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxnmh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMAXNM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxnmq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMAXNM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxnmq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMAXNM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxnmq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXNM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxnmv_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vmaxnmv_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxnmvq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxnmvq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXNMV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxnmvq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + 
"instructions": [ + [ + "UMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxv_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxv_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxv_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMAXV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxv_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxv_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMAXV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxv_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMAXV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxv_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxv_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMAXV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxvq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxvq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + 
"instructions": [ + [ + "FMAXV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxvq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxvq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMAXV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxvq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMAXV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxvq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMAXV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxvq_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMAXV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxvq_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMAXV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmaxvq_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMAXV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmin_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmin_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmin_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmin_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmin_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + 
"value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmin_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmin_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmin_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmin_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminnm_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMINNM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminnm_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMINNM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminnm_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINNM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminnmh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMINNM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminnmq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { 
+ "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMINNM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminnmq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMINNM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminnmq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINNM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminnmv_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminnmv_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminnmvq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminnmvq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINNMV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminnmvq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + 
"register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminv_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminv_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminv_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMINV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminv_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminv_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMINV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminv_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + 
}, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMINV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminv_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminv_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMINV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminvq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminvq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminvq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminvq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMINV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminvq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMINV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminvq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMINV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminvq_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMINV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminvq_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMINV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vminvq_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMINV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "float32x2_t c" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "N/A" + }, + "b": { + "register": "N/A" + }, + "c": { + "register": "N/A" + } + }, + "Architectures": [ + "v7", + 
"A32", + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b", + "float64x1_t c" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "N/A" + }, + "b": { + "register": "N/A" + }, + "c": { + "register": "N/A" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_lane_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_lane_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_lane_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_lane_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b", + "uint16x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_lane_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b", + "uint32x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_laneq_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_laneq_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + 
"register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_laneq_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_laneq_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b", + "uint16x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_laneq_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b", + "uint32x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_n_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "float32_t c" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "N/A" + }, + "b": { + "register": "N/A" + }, + "c": { + "register": "N/A" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_n_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "int16_t c" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_n_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "int32_t c" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_n_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b", + "uint16_t c" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_n_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b", + "uint32_t c" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vmla_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "int16x4_t c" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "int32x2_t c" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b", + "int8x8_t c" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b", + "uint16x4_t c" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b", + "uint32x2_t c" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmla_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b", + "uint8x8_t c" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_lane_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_lane_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_lane_u16", + "arguments": [ + "uint32x4_t a", + "uint16x8_t b", + "uint16x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" 
+ }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_lane_u32", + "arguments": [ + "uint64x2_t a", + "uint32x4_t b", + "uint32x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_laneq_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_laneq_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_laneq_u16", + "arguments": [ + "uint32x4_t a", + "uint16x8_t b", + "uint16x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_laneq_u32", + "arguments": [ + "uint64x2_t a", + "uint32x4_t b", + "uint32x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_n_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_n_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_n_u16", + "arguments": [ + "uint32x4_t a", + "uint16x8_t b", + "uint16_t 
c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_n_u32", + "arguments": [ + "uint64x2_t a", + "uint32x4_t b", + "uint32_t c" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16x8_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32x4_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_s8", + "arguments": [ + "int16x8_t a", + "int8x16_t b", + "int8x16_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_u16", + "arguments": [ + "uint32x4_t a", + "uint16x8_t b", + "uint16x8_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_u32", + "arguments": [ + "uint64x2_t a", + "uint32x4_t b", + "uint32x4_t c" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_high_u8", + "arguments": [ + "uint16x8_t a", + "uint8x16_t b", + "uint8x16_t c" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_lane_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMLAL" + ] + ] + }, + 
{ + "SIMD_ISA": "Neon", + "name": "vmlal_lane_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_lane_u16", + "arguments": [ + "uint32x4_t a", + "uint16x4_t b", + "uint16x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_lane_u32", + "arguments": [ + "uint64x2_t a", + "uint32x2_t b", + "uint32x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_laneq_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_laneq_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_laneq_u16", + "arguments": [ + "uint32x4_t a", + "uint16x4_t b", + "uint16x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_laneq_u32", + "arguments": [ + "uint64x2_t a", + "uint32x2_t b", + "uint32x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_n_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + 
"register": "Vn.4H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_n_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_n_u16", + "arguments": [ + "uint32x4_t a", + "uint16x4_t b", + "uint16_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_n_u32", + "arguments": [ + "uint64x2_t a", + "uint32x2_t b", + "uint32_t c" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16x4_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32x2_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_s8", + "arguments": [ + "int16x8_t a", + "int8x8_t b", + "int8x8_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_u16", + "arguments": [ + "uint32x4_t a", + "uint16x4_t b", + "uint16x4_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_u32", + "arguments": [ + "uint64x2_t a", + "uint32x2_t b", + "uint32x2_t c" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlal_u8", + "arguments": [ + "uint16x8_t a", + "uint8x8_t b", + "uint8x8_t 
c" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "float32x4_t c" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "N/A" + }, + "b": { + "register": "N/A" + }, + "c": { + "register": "N/A" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b", + "float64x2_t c" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "N/A" + }, + "b": { + "register": "N/A" + }, + "c": { + "register": "N/A" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_lane_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_lane_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_lane_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_lane_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "uint16x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_lane_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_laneq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + 
"float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_laneq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_laneq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_laneq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "uint16x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_laneq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_n_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "float32_t c" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "N/A" + }, + "b": { + "register": "N/A" + }, + "c": { + "register": "N/A" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_n_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_n_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_n_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "uint16_t c" + ], + "return_type": { + "value": "uint16x8_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_n_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16x8_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32x4_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b", + "int8x16_t c" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "uint16x8_t c" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x4_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlaq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b", + "uint8x16_t c" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "float32x2_t c" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "N/A" + }, + "b": { + "register": "N/A" + }, + "c": { + "register": "N/A" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_f64", 
+ "arguments": [ + "float64x1_t a", + "float64x1_t b", + "float64x1_t c" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "N/A" + }, + "b": { + "register": "N/A" + }, + "c": { + "register": "N/A" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_lane_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_lane_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_lane_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_lane_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b", + "uint16x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_lane_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b", + "uint32x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_laneq_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_laneq_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vmls_laneq_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_laneq_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b", + "uint16x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_laneq_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b", + "uint32x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_n_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b", + "float32_t c" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "N/A" + }, + "b": { + "register": "N/A" + }, + "c": { + "register": "N/A" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_n_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "int16_t c" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_n_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "int32_t c" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_n_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b", + "uint16_t c" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_n_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b", + "uint32_t c" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "int16x4_t c" + ], + 
"return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "int32x2_t c" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b", + "int8x8_t c" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b", + "uint16x4_t c" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b", + "uint32x2_t c" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmls_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b", + "uint8x8_t c" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_lane_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_lane_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_lane_u16", + "arguments": [ + "uint32x4_t a", + "uint16x8_t b", + "uint16x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "lane": 
{ + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_lane_u32", + "arguments": [ + "uint64x2_t a", + "uint32x4_t b", + "uint32x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_laneq_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_laneq_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_laneq_u16", + "arguments": [ + "uint32x4_t a", + "uint16x8_t b", + "uint16x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_laneq_u32", + "arguments": [ + "uint64x2_t a", + "uint32x4_t b", + "uint32x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_n_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_n_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_n_u16", + "arguments": [ + "uint32x4_t a", + "uint16x8_t b", + "uint16_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" 
+ }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_n_u32", + "arguments": [ + "uint64x2_t a", + "uint32x4_t b", + "uint32_t c" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16x8_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32x4_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_s8", + "arguments": [ + "int16x8_t a", + "int8x16_t b", + "int8x16_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_u16", + "arguments": [ + "uint32x4_t a", + "uint16x8_t b", + "uint16x8_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_u32", + "arguments": [ + "uint64x2_t a", + "uint32x4_t b", + "uint32x4_t c" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_high_u8", + "arguments": [ + "uint16x8_t a", + "uint8x16_t b", + "uint8x16_t c" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_lane_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_lane_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32x2_t 
v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_lane_u16", + "arguments": [ + "uint32x4_t a", + "uint16x4_t b", + "uint16x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_lane_u32", + "arguments": [ + "uint64x2_t a", + "uint32x2_t b", + "uint32x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_laneq_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_laneq_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_laneq_u16", + "arguments": [ + "uint32x4_t a", + "uint16x4_t b", + "uint16x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_laneq_u32", + "arguments": [ + "uint64x2_t a", + "uint32x2_t b", + "uint32x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_n_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + 
], + "instructions": [ + [ + "SMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_n_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_n_u16", + "arguments": [ + "uint32x4_t a", + "uint16x4_t b", + "uint16_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_n_u32", + "arguments": [ + "uint64x2_t a", + "uint32x2_t b", + "uint32_t c" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16x4_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32x2_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_s8", + "arguments": [ + "int16x8_t a", + "int8x8_t b", + "int8x8_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_u16", + "arguments": [ + "uint32x4_t a", + "uint16x4_t b", + "uint16x4_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_u32", + "arguments": [ + "uint64x2_t a", + "uint32x2_t b", + "uint32x2_t c" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsl_u8", + "arguments": [ + "uint16x8_t a", + "uint8x8_t b", + "uint8x8_t c" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" 
+ }, + "b": { + "register": "Vn.8B" + }, + "c": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "float32x4_t c" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "N/A" + }, + "b": { + "register": "N/A" + }, + "c": { + "register": "N/A" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b", + "float64x2_t c" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "N/A" + }, + "b": { + "register": "N/A" + }, + "c": { + "register": "N/A" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_lane_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_lane_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_lane_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_lane_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "uint16x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_lane_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_laneq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": 
{ + "a": {}, + "b": {}, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_laneq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_laneq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_laneq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "uint16x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_laneq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_n_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b", + "float32_t c" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "N/A" + }, + "b": { + "register": "N/A" + }, + "c": { + "register": "N/A" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RESULT[I]" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_n_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_n_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_n_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "uint16_t c" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": 
"Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_n_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16x8_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32x4_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b", + "int8x16_t c" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "uint16x8_t c" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x4_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmlsq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b", + "uint8x16_t c" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "c": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MLS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmmlaq_s32", + "arguments": [ + "int32x4_t r", + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SMMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmmlaq_u32", + "arguments": [ + "uint32x4_t r", + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "UMMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmov_n_f16", + "arguments": [ + "float16_t value" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmov_n_f32", + "arguments": [ + "float32_t value" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmov_n_f64", + "arguments": [ + "float64_t value" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmov_n_p16", + "arguments": [ + "poly16_t value" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmov_n_p8", + "arguments": [ + "poly8_t value" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmov_n_s16", + "arguments": [ + "int16_t value" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmov_n_s32", + "arguments": [ + "int32_t value" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmov_n_s64", + "arguments": [ + "int64_t value" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmov_n_s8", + "arguments": [ + "int8_t value" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmov_n_u16", + "arguments": [ + "uint16_t value" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmov_n_u32", + "arguments": [ + "uint32_t value" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + 
] + }, + { + "SIMD_ISA": "Neon", + "name": "vmov_n_u64", + "arguments": [ + "uint64_t value" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmov_n_u8", + "arguments": [ + "uint8_t value" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovl_high_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SSHLL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovl_high_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SSHLL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovl_high_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SSHLL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovl_high_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USHLL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovl_high_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USHLL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovl_high_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USHLL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovl_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHLL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovl_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHLL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovl_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHLL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovl_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHLL" + ] 
+ ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovl_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHLL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovl_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHLL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovn_high_s16", + "arguments": [ + "int8x8_t r", + "int16x8_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "XTN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovn_high_s32", + "arguments": [ + "int16x4_t r", + "int32x4_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "XTN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovn_high_s64", + "arguments": [ + "int32x2_t r", + "int64x2_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "XTN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovn_high_u16", + "arguments": [ + "uint8x8_t r", + "uint16x8_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "XTN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovn_high_u32", + "arguments": [ + "uint16x4_t r", + "uint32x4_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "XTN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovn_high_u64", + "arguments": [ + "uint32x2_t r", + "uint64x2_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "XTN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovn_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "XTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovn_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "XTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovn_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": 
"Vn.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "XTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovn_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "XTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovn_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "XTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovn_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "XTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovq_n_f16", + "arguments": [ + "float16_t value" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovq_n_f32", + "arguments": [ + "float32_t value" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovq_n_f64", + "arguments": [ + "float64_t value" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovq_n_p16", + "arguments": [ + "poly16_t value" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovq_n_p8", + "arguments": [ + "poly8_t value" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovq_n_s16", + "arguments": [ + "int16_t value" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovq_n_s32", + "arguments": [ + "int32_t value" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovq_n_s64", + "arguments": [ + "int64_t value" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovq_n_s8", + "arguments": [ + "int8_t value" + ], + "return_type": { + 
"value": "int8x16_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovq_n_u16", + "arguments": [ + "uint16_t value" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovq_n_u32", + "arguments": [ + "uint32_t value" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovq_n_u64", + "arguments": [ + "uint64_t value" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmovq_n_u8", + "arguments": [ + "uint8_t value" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "value": { + "register": "rn" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "DUP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_lane_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t v", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_lane_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_lane_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t v", + "const int lane" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, 
+ "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vm.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_lane_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_lane_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_lane_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_lane_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_laneq_f16", + "arguments": [ + "float16x4_t a", + "float16x8_t v", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_laneq_f32", + "arguments": [ + "float32x2_t a", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_laneq_f64", + "arguments": [ + "float64x1_t a", + "float64x2_t v", + "const int lane" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_laneq_s16", + "arguments": [ + "int16x4_t a", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + 
"name": "vmul_laneq_s32", + "arguments": [ + "int32x2_t a", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_laneq_u16", + "arguments": [ + "uint16x4_t a", + "uint16x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_laneq_u32", + "arguments": [ + "uint32x2_t a", + "uint32x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_n_f16", + "arguments": [ + "float16x4_t a", + "float16_t n" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_n_f32", + "arguments": [ + "float32x2_t a", + "float32_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_n_f64", + "arguments": [ + "float64x1_t a", + "float64_t b" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Vm.D[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_n_s16", + "arguments": [ + "int16x4_t a", + "int16_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_n_s32", + "arguments": [ + "int32x2_t a", + "int32_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_n_u16", + "arguments": [ + "uint16x4_t a", + "uint16_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_n_u32", + "arguments": [ + "uint32x2_t a", + "uint32_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + 
"register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "PMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmul_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmuld_lane_f64", + "arguments": [ + "float64_t a", + "float64x1_t v", + "const int lane" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vm.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmuld_laneq_f64", + "arguments": [ + "float64_t a", + "float64x2_t v", + "const int lane" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "float16_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulh_lane_f16", + "arguments": [ + "float16_t a", + "float16x4_t v", + "const int lane" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulh_laneq_f16", + "arguments": [ + "float16_t a", + "float16x8_t v", + "const int lane" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_lane_s16", + "arguments": [ + "int16x8_t a", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_lane_s32", + "arguments": [ + "int32x4_t a", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_lane_u16", + "arguments": [ + "uint16x8_t a", + "uint16x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_lane_u32", + "arguments": [ + "uint32x4_t a", + "uint32x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_laneq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_laneq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vmull_high_laneq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_laneq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_n_s16", + "arguments": [ + "int16x8_t a", + "int16_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_n_s32", + "arguments": [ + "int32x4_t a", + "int32_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_n_u16", + "arguments": [ + "uint16x8_t a", + "uint16_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_n_u32", + "arguments": [ + "uint32x4_t a", + "uint32_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_p64", + "arguments": [ + "poly64x2_t a", + "poly64x2_t b" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "PMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "PMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + 
"SMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_high_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_lane_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_lane_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_lane_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_lane_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_laneq_s16", + "arguments": [ + "int16x4_t a", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMULL" + ] + ] + }, + { + "SIMD_ISA": 
"Neon", + "name": "vmull_laneq_s32", + "arguments": [ + "int32x2_t a", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_laneq_u16", + "arguments": [ + "uint16x4_t a", + "uint16x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_laneq_u32", + "arguments": [ + "uint32x2_t a", + "uint32x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_n_s16", + "arguments": [ + "int16x4_t a", + "int16_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_n_s32", + "arguments": [ + "int32x2_t a", + "int32_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_n_u16", + "arguments": [ + "uint16x4_t a", + "uint16_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_n_u32", + "arguments": [ + "uint32x2_t a", + "uint32_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_p64", + "arguments": [ + "poly64_t a", + "poly64_t b" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.1D" + }, + "b": { + "register": "Vm.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "PMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "PMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": 
"Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmull_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_lane_f16", + "arguments": [ + "float16x8_t a", + "float16x4_t v", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_lane_f32", + "arguments": [ + "float32x4_t a", + "float32x2_t v", + "const int lane" + ], + "return_type": { + 
"value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_lane_f64", + "arguments": [ + "float64x2_t a", + "float64x1_t v", + "const int lane" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vm.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_lane_s16", + "arguments": [ + "int16x8_t a", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_lane_s32", + "arguments": [ + "int32x4_t a", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_lane_u16", + "arguments": [ + "uint16x8_t a", + "uint16x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_lane_u32", + "arguments": [ + "uint32x4_t a", + "uint32x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_laneq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t v", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_laneq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_laneq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t v", + "const int lane" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": 
"Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_laneq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_laneq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_laneq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_laneq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_n_f16", + "arguments": [ + "float16x8_t a", + "float16_t n" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_n_f32", + "arguments": [ + "float32x4_t a", + "float32_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_n_f64", + "arguments": [ + "float64x2_t a", + "float64_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.D[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_n_s16", + "arguments": [ + "int16x8_t a", + "int16_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_n_s32", + "arguments": [ + "int32x4_t a", + "int32_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vmulq_n_u16", + "arguments": [ + "uint16x8_t a", + "uint16_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_n_u32", + "arguments": [ + "uint32x4_t a", + "uint32_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "PMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmuls_lane_f32", + "arguments": [ + "float32_t a", + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + 
] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmuls_laneq_f32", + "arguments": [ + "float32_t a", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMUL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulx_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulx_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulx_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulx_lane_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t v", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulx_lane_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulx_lane_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t v", + "const int lane" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vm.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulx_laneq_f16", + "arguments": [ + "float16x4_t a", + "float16x8_t v", + "const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulx_laneq_f32", + "arguments": [ + "float32x2_t a", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + 
"name": "vmulx_laneq_f64", + "arguments": [ + "float64x1_t a", + "float64x2_t v", + "const int lane" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulx_n_f16", + "arguments": [ + "float16x4_t a", + "float16_t n" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxd_f64", + "arguments": [ + "float64_t a", + "float64_t b" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxd_lane_f64", + "arguments": [ + "float64_t a", + "float64x1_t v", + "const int lane" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vm.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxd_laneq_f64", + "arguments": [ + "float64_t a", + "float64x2_t v", + "const int lane" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxh_lane_f16", + "arguments": [ + "float16_t a", + "float16x4_t v", + "const int lane" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxh_laneq_f16", + "arguments": [ + "float16_t a", + "float16x8_t v", + "const int lane" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxq_lane_f16", + "arguments": [ + "float16x8_t a", + "float16x4_t v", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxq_lane_f32", + "arguments": [ + "float32x4_t a", + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxq_lane_f64", + "arguments": [ + "float64x2_t a", + "float64x1_t v", + "const int lane" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vm.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxq_laneq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t v", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxq_laneq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxq_laneq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t v", + "const int lane" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxq_n_f16", + "arguments": [ + "float16x8_t a", + "float16_t n" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxs_f32", + "arguments": [ + "float32_t a", + "float32_t b" + ], + "return_type": { + "value": "float32_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxs_lane_f32", + "arguments": [ + "float32_t a", + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmulxs_laneq_f32", + "arguments": [ + "float32_t a", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMULX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmvn_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MVN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmvn_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MVN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmvn_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MVN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmvn_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MVN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmvn_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MVN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmvn_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MVN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmvn_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MVN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmvnq_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MVN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmvnq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": 
{ + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MVN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmvnq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MVN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmvnq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MVN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmvnq_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MVN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmvnq_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MVN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vmvnq_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MVN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vneg_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vneg_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vneg_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vneg_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vneg_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vneg_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vneg_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": 
{ + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vnegd_s64", + "arguments": [ + "int64_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vnegh_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vnegq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vnegq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vnegq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vnegq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vnegq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vnegq_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vnegq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorn_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorn_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorn_s64", + "arguments": [ + "int64x1_t a", + 
"int64x1_t b" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorn_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorn_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorn_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorn_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorn_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vornq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vornq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vornq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vornq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vornq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + 
"return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vornq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vornq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vornq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorr_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorr_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorr_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorr_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorr_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorr_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorr_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b" + ], + "return_type": { + 
"value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorr_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorrq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorrq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorrq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorrq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorrq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorrq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorrq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vorrq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ORR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadal_s16", + "arguments": [ + "int32x2_t a", + "int16x4_t b" + ], + "return_type": { + "value": 
"int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADALP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadal_s32", + "arguments": [ + "int64x1_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + }, + "b": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADALP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadal_s8", + "arguments": [ + "int16x4_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADALP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadal_u16", + "arguments": [ + "uint32x2_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADALP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadal_u32", + "arguments": [ + "uint64x1_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + }, + "b": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADALP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadal_u8", + "arguments": [ + "uint16x4_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADALP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadalq_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADALP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadalq_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADALP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadalq_s8", + "arguments": [ + "int16x8_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADALP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadalq_u16", + "arguments": [ + "uint32x4_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADALP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadalq_u32", + "arguments": [ + "uint64x2_t a", + "uint32x4_t b" + ], + 
"return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADALP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadalq_u8", + "arguments": [ + "uint16x8_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADALP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadd_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadd_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadd_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadd_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadd_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadd_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadd_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadd_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddd_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + 
"value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddd_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddd_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddl_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADDLP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddl_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADDLP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddl_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADDLP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddl_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADDLP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddl_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADDLP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddl_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADDLP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddlq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADDLP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddlq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADDLP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddlq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SADDLP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddlq_u16", + "arguments": [ + 
"uint16x8_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADDLP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddlq_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADDLP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddlq_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UADDLP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + 
"ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpaddq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpadds_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FADDP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmax_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmax_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmax_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmax_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmax_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmax_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmax_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t 
b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmax_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxnm_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxnm_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxnmq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxnmq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxnmq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxnmqd_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxnms_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + 
"Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxqd_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmaxs_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMAXP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmin_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmin_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmin_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmin_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmin_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmin_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmin_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmin_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminnm_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminnm_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminnmq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminnmq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": 
{ + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminnmq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminnmqd_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminnms_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINNMP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminq_u32", + "arguments": [ + 
"uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpminqd_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vpmins_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FMINP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqabs_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqabs_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqabs_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqabs_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqabsb_s8", + "arguments": [ + "int8_t a" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Bn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqabsd_s64", + "arguments": [ + "int64_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqabsh_s16", + "arguments": [ + "int16_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqabsq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqabsq_s32", + 
"arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqabsq_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqabsq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqabss_s32", + "arguments": [ + "int32_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQABS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqadd_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqadd_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqadd_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqadd_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqadd_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqadd_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqadd_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + 
"instructions": [ + [ + "UQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqadd_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqaddb_s8", + "arguments": [ + "int8_t a", + "int8_t b" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Bn" + }, + "b": { + "register": "Bm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqaddb_u8", + "arguments": [ + "uint8_t a", + "uint8_t b" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Bn" + }, + "b": { + "register": "Bm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqaddd_s64", + "arguments": [ + "int64_t a", + "int64_t b" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqaddd_u64", + "arguments": [ + "uint64_t a", + "uint64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqaddh_s16", + "arguments": [ + "int16_t a", + "int16_t b" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqaddh_u16", + "arguments": [ + "uint16_t a", + "uint16_t b" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqaddq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqaddq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqaddq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqaddq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": 
"int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqaddq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqaddq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqaddq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqaddq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqadds_s32", + "arguments": [ + "int32_t a", + "int32_t b" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqadds_u32", + "arguments": [ + "uint32_t a", + "uint32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_high_lane_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_high_lane_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_high_laneq_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + 
}, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_high_laneq_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_high_n_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_high_n_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_high_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16x8_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_high_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32x4_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_lane_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_lane_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_laneq_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + 
"maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_laneq_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_n_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_n_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16x4_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlal_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32x2_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlalh_lane_s16", + "arguments": [ + "int32_t a", + "int16_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlalh_laneq_s16", + "arguments": [ + "int32_t a", + "int16_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlalh_s16", + "arguments": [ + "int32_t a", + "int16_t b", + "int16_t c" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Hn" + }, + "c": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vqdmlals_lane_s32", + "arguments": [ + "int64_t a", + "int32_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlals_laneq_s32", + "arguments": [ + "int64_t a", + "int32_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlals_s32", + "arguments": [ + "int64_t a", + "int32_t b", + "int32_t c" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Sn" + }, + "c": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLAL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsl_high_lane_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsl_high_lane_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsl_high_laneq_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsl_high_laneq_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsl_high_n_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ 
+ "SQDMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsl_high_n_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsl_high_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b", + "int16x8_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsl_high_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b", + "int32x4_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLSL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsl_lane_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsl_lane_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsl_laneq_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsl_laneq_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsl_n_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMLSL" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vqdmlsl_n_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsl_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b", + "int16x4_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsl_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b", + "int32x2_t c" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlslh_lane_s16", + "arguments": [ + "int32_t a", + "int16_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlslh_laneq_s16", + "arguments": [ + "int32_t a", + "int16_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlslh_s16", + "arguments": [ + "int32_t a", + "int16_t b", + "int16_t c" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Hn" + }, + "c": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsls_lane_s32", + "arguments": [ + "int64_t a", + "int32_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsls_laneq_s32", + "arguments": [ + "int64_t a", + "int32_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmlsls_s32", + "arguments": [ + "int64_t a", + "int32_t b", + "int32_t c" + ], + 
"return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Sn" + }, + "c": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMLSL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulh_lane_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulh_lane_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulh_laneq_s16", + "arguments": [ + "int16x4_t a", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulh_laneq_s32", + "arguments": [ + "int32x2_t a", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulh_n_s16", + "arguments": [ + "int16x4_t a", + "int16_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulh_n_s32", + "arguments": [ + "int32x2_t a", + "int32_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulh_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulh_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulhh_lane_s16", + "arguments": [ + "int16_t a", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": 
"int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulhh_laneq_s16", + "arguments": [ + "int16_t a", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulhh_s16", + "arguments": [ + "int16_t a", + "int16_t b" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulhq_lane_s16", + "arguments": [ + "int16x8_t a", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulhq_lane_s32", + "arguments": [ + "int32x4_t a", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulhq_laneq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulhq_laneq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulhq_n_s16", + "arguments": [ + "int16x8_t a", + "int16_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulhq_n_s32", + "arguments": [ + "int32x4_t a", + "int32_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulhq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" 
+ }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulhq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulhs_lane_s32", + "arguments": [ + "int32_t a", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulhs_laneq_s32", + "arguments": [ + "int32_t a", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulhs_s32", + "arguments": [ + "int32_t a", + "int32_t b" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_high_lane_s16", + "arguments": [ + "int16x8_t a", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_high_lane_s32", + "arguments": [ + "int32x4_t a", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_high_laneq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_high_laneq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_high_n_s16", + "arguments": [ + "int16x8_t a", + "int16_t b" + ], + "return_type": { + "value": "int32x4_t" + 
}, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_high_n_s32", + "arguments": [ + "int32x4_t a", + "int32_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_high_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_high_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_lane_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_lane_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_laneq_s16", + "arguments": [ + "int16x4_t a", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_laneq_s32", + "arguments": [ + "int32x2_t a", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_n_s16", + "arguments": [ + "int16x4_t a", + "int16_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_n_s32", + "arguments": [ + "int32x2_t a", + "int32_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.S[0]" + } + }, + 
"Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmull_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQDMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmullh_lane_s16", + "arguments": [ + "int16_t a", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmullh_laneq_s16", + "arguments": [ + "int16_t a", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmullh_s16", + "arguments": [ + "int16_t a", + "int16_t b" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulls_lane_s32", + "arguments": [ + "int32_t a", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulls_laneq_s32", + "arguments": [ + "int32_t a", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqdmulls_s32", + "arguments": [ + "int32_t a", + "int32_t b" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQDMULL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovn_high_s16", + "arguments": [ + "int8x8_t r", + "int16x8_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQXTN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovn_high_s32", + "arguments": [ + "int16x4_t r", + "int32x4_t 
a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQXTN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovn_high_s64", + "arguments": [ + "int32x2_t r", + "int64x2_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQXTN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovn_high_u16", + "arguments": [ + "uint8x8_t r", + "uint16x8_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQXTN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovn_high_u32", + "arguments": [ + "uint16x4_t r", + "uint32x4_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQXTN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovn_high_u64", + "arguments": [ + "uint32x2_t r", + "uint64x2_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQXTN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovn_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQXTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovn_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQXTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovn_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQXTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovn_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQXTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovn_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQXTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovn_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQXTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovnd_s64", + "arguments": [ + "int64_t a" + ], + "return_type": { + 
"value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQXTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovnd_u64", + "arguments": [ + "uint64_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQXTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovnh_s16", + "arguments": [ + "int16_t a" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQXTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovnh_u16", + "arguments": [ + "uint16_t a" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQXTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovns_s32", + "arguments": [ + "int32_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQXTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovns_u32", + "arguments": [ + "uint32_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQXTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovun_high_s16", + "arguments": [ + "uint8x8_t r", + "int16x8_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQXTUN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovun_high_s32", + "arguments": [ + "uint16x4_t r", + "int32x4_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQXTUN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovun_high_s64", + "arguments": [ + "uint32x2_t r", + "int64x2_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQXTUN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovun_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQXTUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovun_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQXTUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovun_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQXTUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + 
"name": "vqmovund_s64", + "arguments": [ + "int64_t a" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQXTUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovunh_s16", + "arguments": [ + "int16_t a" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQXTUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqmovuns_s32", + "arguments": [ + "int32_t a" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQXTUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqneg_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqneg_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqneg_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqneg_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqnegb_s8", + "arguments": [ + "int8_t a" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Bn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqnegd_s64", + "arguments": [ + "int64_t a" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqnegh_s16", + "arguments": [ + "int16_t a" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqnegq_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqnegq_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqnegq_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": 
"int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqnegq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqnegs_s32", + "arguments": [ + "int32_t a" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQNEG" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlah_lane_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlah_lane_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlah_laneq_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlah_laneq_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlah_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "int16x4_t c" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlah_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "int32x2_t c" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlahh_lane_s16", + "arguments": [ + "int16_t a", + "int16_t b", + "int16x4_t v", + "const int 
lane" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hd" + }, + "b": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlahh_laneq_s16", + "arguments": [ + "int16_t a", + "int16_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hd" + }, + "b": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlahh_s16", + "arguments": [ + "int16_t a", + "int16_t b", + "int16_t c" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hd" + }, + "b": { + "register": "Hn" + }, + "c": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlahq_lane_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlahq_lane_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlahq_laneq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlahq_laneq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlahq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16x8_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlahq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + 
"int32x4_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlahs_lane_s32", + "arguments": [ + "int32_t a", + "int32_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlahs_laneq_s32", + "arguments": [ + "int32_t a", + "int32_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLAH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlahs_s32", + "arguments": [ + "int32_t a", + "int32_t b", + "int32_t c" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Sn" + }, + "c": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlsh_lane_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlsh_lane_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlsh_laneq_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlsh_laneq_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlsh_s16", + "arguments": [ + "int16x4_t a", + 
"int16x4_t b", + "int16x4_t c" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "c": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlsh_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "int32x2_t c" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "c": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlshh_lane_s16", + "arguments": [ + "int16_t a", + "int16_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hd" + }, + "b": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlshh_laneq_s16", + "arguments": [ + "int16_t a", + "int16_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hd" + }, + "b": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlshh_s16", + "arguments": [ + "int16_t a", + "int16_t b", + "int16_t c" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hd" + }, + "b": { + "register": "Hn" + }, + "c": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlshq_lane_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlshq_lane_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlshq_laneq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlshq_laneq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32x4_t v", + "const int 
lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlshq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "int16x8_t c" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "c": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlshq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "int32x4_t c" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "c": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlshs_lane_s32", + "arguments": [ + "int32_t a", + "int32_t b", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlshs_laneq_s32", + "arguments": [ + "int32_t a", + "int32_t b", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmlshs_s32", + "arguments": [ + "int32_t a", + "int32_t b", + "int32_t c" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Sn" + }, + "c": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMLSH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulh_lane_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulh_lane_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulh_laneq_s16", + "arguments": [ + "int16x4_t a", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "lane": { + 
"minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulh_laneq_s32", + "arguments": [ + "int32x2_t a", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulh_n_s16", + "arguments": [ + "int16x4_t a", + "int16_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulh_n_s32", + "arguments": [ + "int32x2_t a", + "int32_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulh_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulh_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulhh_lane_s16", + "arguments": [ + "int16_t a", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulhh_laneq_s16", + "arguments": [ + "int16_t a", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulhh_s16", + "arguments": [ + "int16_t a", + "int16_t b" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulhq_lane_s16", + "arguments": [ + "int16x8_t a", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + 
], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulhq_lane_s32", + "arguments": [ + "int32x4_t a", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulhq_laneq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulhq_laneq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulhq_n_s16", + "arguments": [ + "int16x8_t a", + "int16_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.H[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulhq_n_s32", + "arguments": [ + "int32x4_t a", + "int32_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.S[0]" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulhq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulhq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulhs_lane_s32", + "arguments": [ + "int32_t a", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulhs_laneq_s32", + "arguments": [ + "int32_t a", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + 
"instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrdmulhs_s32", + "arguments": [ + "int32_t a", + "int32_t b" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRDMULH" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshl_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshl_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshl_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshl_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshl_u16", + "arguments": [ + "uint16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshl_u32", + "arguments": [ + "uint32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshl_u64", + "arguments": [ + "uint64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshl_u8", + "arguments": [ + "uint8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshlb_s8", + "arguments": [ + "int8_t a", + "int8_t b" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Bn" + }, + "b": { + "register": "Bm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHL" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vqrshlb_u8", + "arguments": [ + "uint8_t a", + "int8_t b" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Bn" + }, + "b": { + "register": "Bm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshld_s64", + "arguments": [ + "int64_t a", + "int64_t b" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshld_u64", + "arguments": [ + "uint64_t a", + "int64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshlh_s16", + "arguments": [ + "int16_t a", + "int16_t b" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshlh_u16", + "arguments": [ + "uint16_t a", + "int16_t b" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshlq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshlq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshlq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshlq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshlq_u16", + "arguments": [ + "uint16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshlq_u32", + "arguments": [ + "uint32x4_t a", + "int32x4_t b" + ], + "return_type": { + 
"value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshlq_u64", + "arguments": [ + "uint64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshlq_u8", + "arguments": [ + "uint8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshls_s32", + "arguments": [ + "int32_t a", + "int32_t b" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshls_u32", + "arguments": [ + "uint32_t a", + "int32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrn_high_n_s16", + "arguments": [ + "int8x8_t r", + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrn_high_n_s32", + "arguments": [ + "int16x4_t r", + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrn_high_n_s64", + "arguments": [ + "int32x2_t r", + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrn_high_n_u16", + "arguments": [ + "uint8x8_t r", + "uint16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQRSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrn_high_n_u32", + "arguments": [ + "uint16x4_t r", + "uint32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + }, + "r": { 
+ "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQRSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrn_high_n_u64", + "arguments": [ + "uint32x2_t r", + "uint64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQRSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrn_n_s16", + "arguments": [ + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrn_n_s32", + "arguments": [ + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrn_n_s64", + "arguments": [ + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrn_n_u16", + "arguments": [ + "uint16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQRSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrn_n_u32", + "arguments": [ + "uint32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQRSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrn_n_u64", + "arguments": [ + "uint64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQRSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrnd_n_s64", + "arguments": [ + "int64_t a", + "const int n" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrnd_n_u64", + "arguments": [ + "uint64_t a", + "const int n" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQRSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrnh_n_s16", + "arguments": [ + "int16_t a", + "const int n" + ], 
+ "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrnh_n_u16", + "arguments": [ + "uint16_t a", + "const int n" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQRSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrns_n_s32", + "arguments": [ + "int32_t a", + "const int n" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrns_n_u32", + "arguments": [ + "uint32_t a", + "const int n" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQRSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrun_high_n_s16", + "arguments": [ + "uint8x8_t r", + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHRUN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrun_high_n_s32", + "arguments": [ + "uint16x4_t r", + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHRUN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrun_high_n_s64", + "arguments": [ + "uint32x2_t r", + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHRUN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrun_n_s16", + "arguments": [ + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRSHRUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrun_n_s32", + "arguments": [ + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRSHRUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrun_n_s64", + "arguments": [ + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + 
"Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQRSHRUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrund_n_s64", + "arguments": [ + "int64_t a", + "const int n" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHRUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshrunh_n_s16", + "arguments": [ + "int16_t a", + "const int n" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHRUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqrshruns_n_s32", + "arguments": [ + "int32_t a", + "const int n" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQRSHRUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_n_s16", + "arguments": [ + "int16x4_t a", + "const int n" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_n_s32", + "arguments": [ + "int32x2_t a", + "const int n" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_n_s64", + "arguments": [ + "int64x1_t a", + "const int n" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_n_s8", + "arguments": [ + "int8x8_t a", + "const int n" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_n_u16", + "arguments": [ + "uint16x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_n_u32", + "arguments": [ + "uint32x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_n_u64", + "arguments": [ + "uint64x1_t a", + "const int n" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + 
}, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_n_u8", + "arguments": [ + "uint8x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_u16", + "arguments": [ + "uint16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_u32", + "arguments": [ + "uint32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_u64", + "arguments": [ + "uint64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshl_u8", + "arguments": [ + "uint8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlb_n_s8", + "arguments": [ + "int8_t a", + "const int n" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Bn" + }, + "n": { + 
"minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlb_n_u8", + "arguments": [ + "uint8_t a", + "const int n" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Bn" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlb_s8", + "arguments": [ + "int8_t a", + "int8_t b" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Bn" + }, + "b": { + "register": "Bm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlb_u8", + "arguments": [ + "uint8_t a", + "int8_t b" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Bn" + }, + "b": { + "register": "Bm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshld_n_s64", + "arguments": [ + "int64_t a", + "const int n" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshld_n_u64", + "arguments": [ + "uint64_t a", + "const int n" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshld_s64", + "arguments": [ + "int64_t a", + "int64_t b" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshld_u64", + "arguments": [ + "uint64_t a", + "int64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlh_n_s16", + "arguments": [ + "int16_t a", + "const int n" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlh_n_u16", + "arguments": [ + "uint16_t a", + "const int n" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlh_s16", + "arguments": [ + "int16_t a", + "int16_t b" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlh_u16", + "arguments": [ + "uint16_t a", + 
"int16_t b" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlq_n_s16", + "arguments": [ + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlq_n_s32", + "arguments": [ + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlq_n_s64", + "arguments": [ + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlq_n_s8", + "arguments": [ + "int8x16_t a", + "const int n" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlq_n_u16", + "arguments": [ + "uint16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlq_n_u32", + "arguments": [ + "uint32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlq_n_u64", + "arguments": [ + "uint64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlq_n_u8", + "arguments": [ + "uint8x16_t a", + "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vqshlq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlq_u16", + "arguments": [ + "uint16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlq_u32", + "arguments": [ + "uint32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlq_u64", + "arguments": [ + "uint64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlq_u8", + "arguments": [ + "uint8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshls_n_s32", + "arguments": [ + "int32_t a", + "const int n" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshls_n_u32", + "arguments": [ + "uint32_t a", + "const int n" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshls_s32", + "arguments": [ + "int32_t a", + "int32_t b" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshls_u32", + 
"arguments": [ + "uint32_t a", + "int32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlu_n_s16", + "arguments": [ + "int16x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHLU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlu_n_s32", + "arguments": [ + "int32x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHLU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlu_n_s64", + "arguments": [ + "int64x1_t a", + "const int n" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHLU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlu_n_s8", + "arguments": [ + "int8x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHLU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlub_n_s8", + "arguments": [ + "int8_t a", + "const int n" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Bn" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHLU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlud_n_s64", + "arguments": [ + "int64_t a", + "const int n" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHLU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshluh_n_s16", + "arguments": [ + "int16_t a", + "const int n" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHLU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshluq_n_s16", + "arguments": [ + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHLU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshluq_n_s32", + "arguments": [ + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHLU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vqshluq_n_s64", + "arguments": [ + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHLU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshluq_n_s8", + "arguments": [ + "int8x16_t a", + "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHLU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshlus_n_s32", + "arguments": [ + "int32_t a", + "const int n" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHLU" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrn_high_n_s16", + "arguments": [ + "int8x8_t r", + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrn_high_n_s32", + "arguments": [ + "int16x4_t r", + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrn_high_n_s64", + "arguments": [ + "int32x2_t r", + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrn_high_n_u16", + "arguments": [ + "uint8x8_t r", + "uint16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrn_high_n_u32", + "arguments": [ + "uint16x4_t r", + "uint32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrn_high_n_u64", + "arguments": [ + "uint32x2_t r", + "uint64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vqshrn_n_s16", + "arguments": [ + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrn_n_s32", + "arguments": [ + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrn_n_s64", + "arguments": [ + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrn_n_u16", + "arguments": [ + "uint16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrn_n_u32", + "arguments": [ + "uint32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrn_n_u64", + "arguments": [ + "uint64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrnd_n_s64", + "arguments": [ + "int64_t a", + "const int n" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrnd_n_u64", + "arguments": [ + "uint64_t a", + "const int n" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrnh_n_s16", + "arguments": [ + "int16_t a", + "const int n" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrnh_n_u16", + "arguments": [ + "uint16_t a", + "const int n" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSHRN" + ] + ] + }, 
+ { + "SIMD_ISA": "Neon", + "name": "vqshrns_n_s32", + "arguments": [ + "int32_t a", + "const int n" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrns_n_u32", + "arguments": [ + "uint32_t a", + "const int n" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrun_high_n_s16", + "arguments": [ + "uint8x8_t r", + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHRUN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrun_high_n_s32", + "arguments": [ + "uint16x4_t r", + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHRUN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrun_high_n_s64", + "arguments": [ + "uint32x2_t r", + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHRUN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrun_n_s16", + "arguments": [ + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHRUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrun_n_s32", + "arguments": [ + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHRUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrun_n_s64", + "arguments": [ + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSHRUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrund_n_s64", + "arguments": [ + "int64_t a", + "const int n" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHRUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshrunh_n_s16", + "arguments": [ + "int16_t a", + "const int n" + ], + "return_type": { + "value": "uint8_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHRUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqshruns_n_s32", + "arguments": [ + "int32_t a", + "const int n" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSHRUN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsub_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsub_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsub_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsub_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsub_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsub_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsub_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsub_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsubb_s8", + "arguments": [ + "int8_t a", + "int8_t b" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + 
"register": "Bn" + }, + "b": { + "register": "Bm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsubb_u8", + "arguments": [ + "uint8_t a", + "uint8_t b" + ], + "return_type": { + "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Bn" + }, + "b": { + "register": "Bm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsubd_s64", + "arguments": [ + "int64_t a", + "int64_t b" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsubd_u64", + "arguments": [ + "uint64_t a", + "uint64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsubh_s16", + "arguments": [ + "int16_t a", + "int16_t b" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsubh_u16", + "arguments": [ + "uint16_t a", + "uint16_t b" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsubq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsubq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsubq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsubq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsubq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSUB" + ] + ] + }, 
+ { + "SIMD_ISA": "Neon", + "name": "vqsubq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsubq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsubq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsubs_s32", + "arguments": [ + "int32_t a", + "int32_t b" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqsubs_u32", + "arguments": [ + "uint32_t a", + "uint32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UQSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl1_p8", + "arguments": [ + "poly8x16_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl1_s8", + "arguments": [ + "int8x16_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl1_u8", + "arguments": [ + "uint8x16_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl1q_p8", + "arguments": [ + "poly8x16_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl1q_s8", + "arguments": [ + "int8x16_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl1q_u8", + "arguments": [ + "uint8x16_t t", + "uint8x16_t idx" + ], + 
"return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl2_p8", + "arguments": [ + "poly8x16x2_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl2_s8", + "arguments": [ + "int8x16x2_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl2_u8", + "arguments": [ + "uint8x16x2_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl2q_p8", + "arguments": [ + "poly8x16x2_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl2q_s8", + "arguments": [ + "int8x16x2_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl2q_u8", + "arguments": [ + "uint8x16x2_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl3_p8", + "arguments": [ + "poly8x16x3_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl3_s8", + "arguments": [ + "int8x16x3_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl3_u8", + "arguments": [ + "uint8x16x3_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl3q_p8", + "arguments": [ + "poly8x16x3_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.16B" + }, 
+ "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl3q_s8", + "arguments": [ + "int8x16x3_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl3q_u8", + "arguments": [ + "uint8x16x3_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl4_p8", + "arguments": [ + "poly8x16x4_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl4_s8", + "arguments": [ + "int8x16x4_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl4_u8", + "arguments": [ + "uint8x16x4_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl4q_p8", + "arguments": [ + "poly8x16x4_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl4q_s8", + "arguments": [ + "int8x16x4_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbl4q_u8", + "arguments": [ + "uint8x16x4_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx1_p8", + "arguments": [ + "poly8x8_t a", + "poly8x16_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx1_s8", + "arguments": [ + "int8x8_t a", + "int8x16_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "idx": { + "register": "Vm.8B" + }, + "t": { + 
"register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx1_u8", + "arguments": [ + "uint8x8_t a", + "uint8x16_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx1q_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx1q_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx1q_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx2_p8", + "arguments": [ + "poly8x8_t a", + "poly8x16x2_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx2_s8", + "arguments": [ + "int8x8_t a", + "int8x16x2_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx2_u8", + "arguments": [ + "uint8x8_t a", + "uint8x16x2_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx2q_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16x2_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx2q_s8", + "arguments": [ + "int8x16_t a", + "int8x16x2_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "idx": { + "register": 
"Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx2q_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16x2_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx3_p8", + "arguments": [ + "poly8x8_t a", + "poly8x16x3_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx3_s8", + "arguments": [ + "int8x8_t a", + "int8x16x3_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx3_u8", + "arguments": [ + "uint8x8_t a", + "uint8x16x3_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx3q_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16x3_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx3q_s8", + "arguments": [ + "int8x16_t a", + "int8x16x3_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx3q_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16x3_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx4_p8", + "arguments": [ + "poly8x8_t a", + "poly8x16x4_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx4_s8", + "arguments": [ + "int8x8_t a", + "int8x16x4_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, 
+ "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx4_u8", + "arguments": [ + "uint8x8_t a", + "uint8x16x4_t t", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "idx": { + "register": "Vm.8B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx4q_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16x4_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx4q_s8", + "arguments": [ + "int8x16_t a", + "int8x16x4_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vqtbx4q_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16x4_t t", + "uint8x16_t idx" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "idx": { + "register": "Vm.16B" + }, + "t": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vraddhn_high_s16", + "arguments": [ + "int8x8_t r", + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RADDHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vraddhn_high_s32", + "arguments": [ + "int16x4_t r", + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RADDHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vraddhn_high_s64", + "arguments": [ + "int32x2_t r", + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RADDHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vraddhn_high_u16", + "arguments": [ + "uint8x8_t r", + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RADDHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vraddhn_high_u32", + "arguments": [ + "uint16x4_t r", + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RADDHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vraddhn_high_u64", + "arguments": [ + "uint32x2_t r", + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RADDHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vraddhn_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RADDHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vraddhn_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RADDHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vraddhn_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RADDHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vraddhn_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RADDHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vraddhn_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RADDHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vraddhn_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RADDHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrax1q_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RAX1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrbit_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RBIT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrbit_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": 
"Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RBIT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrbit_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RBIT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrbitq_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RBIT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrbitq_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RBIT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrbitq_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RBIT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpe_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRECPE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpe_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FRECPE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpe_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRECPE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpe_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URECPE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecped_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRECPE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpeh_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRECPE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpeq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRECPE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpeq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + 
"A64" + ], + "instructions": [ + [ + "FRECPE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpeq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRECPE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpeq_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URECPE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpes_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRECPE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecps_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRECPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecps_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FRECPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecps_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRECPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpsd_f64", + "arguments": [ + "float64_t a", + "float64_t b" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRECPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpsh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRECPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpsq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRECPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpsq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FRECPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpsq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRECPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpss_f32", + "arguments": [ + "float32_t a", + "float32_t b" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRECPS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpxd_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRECPX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpxh_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRECPX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrecpxs_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRECPX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f16_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f16_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f16_p16", + "arguments": [ + "poly16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f16_p64", + "arguments": [ + "poly64x1_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f16_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f16_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f16_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": 
"Neon", + "name": "vreinterpret_f16_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f16_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f16_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f16_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f16_u64", + "arguments": [ + "uint64x1_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f16_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f32_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f32_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f32_p16", + "arguments": [ + "poly16x4_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f32_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f32_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f32_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f32_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f32_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f32_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f32_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f32_u64", + "arguments": [ + "uint64x1_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f32_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f64_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f64_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f64_p16", + "arguments": [ + "poly16x4_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f64_p64", + "arguments": [ + "poly64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f64_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vreinterpret_f64_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f64_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f64_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f64_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f64_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f64_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f64_u64", + "arguments": [ + "uint64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_f64_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p16_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p16_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p16_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p16_p64", + "arguments": [ + "poly64x1_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { 
+ "SIMD_ISA": "Neon", + "name": "vreinterpret_p16_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p16_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p16_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p16_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p16_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p16_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p16_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p16_u64", + "arguments": [ + "uint64x1_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p16_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p64_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p64_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p64_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "poly64x1_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p64_p16", + "arguments": [ + "poly16x4_t a" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p64_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p64_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p64_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p64_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p64_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p64_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p64_u64", + "arguments": [ + "uint64x1_t a" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p64_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p8_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p8_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p8_f64", 
+ "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p8_p16", + "arguments": [ + "poly16x4_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p8_p64", + "arguments": [ + "poly64x1_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p8_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p8_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p8_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p8_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p8_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p8_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p8_u64", + "arguments": [ + "uint64x1_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_p8_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s16_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": 
[ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s16_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s16_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s16_p16", + "arguments": [ + "poly16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s16_p64", + "arguments": [ + "poly64x1_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s16_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s16_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s16_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s16_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s16_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s16_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s16_u64", + "arguments": [ + "uint64x1_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s16_u8", + "arguments": [ + 
"uint8x8_t a" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s32_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s32_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s32_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s32_p16", + "arguments": [ + "poly16x4_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s32_p64", + "arguments": [ + "poly64x1_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s32_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s32_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s32_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s32_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s32_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s32_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + 
"v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s32_u64", + "arguments": [ + "uint64x1_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s32_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s64_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s64_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s64_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s64_p16", + "arguments": [ + "poly16x4_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s64_p64", + "arguments": [ + "poly64x1_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s64_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s64_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s64_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s64_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s64_u16", + "arguments": [ + "uint16x4_t 
a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s64_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s64_u64", + "arguments": [ + "uint64x1_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s64_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s8_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s8_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s8_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s8_p16", + "arguments": [ + "poly16x4_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s8_p64", + "arguments": [ + "poly64x1_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s8_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s8_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s8_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + 
"instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s8_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s8_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s8_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s8_u64", + "arguments": [ + "uint64x1_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_s8_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u16_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u16_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u16_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u16_p16", + "arguments": [ + "poly16x4_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u16_p64", + "arguments": [ + "poly64x1_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u16_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u16_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + 
"value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u16_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u16_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u16_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u16_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u16_u64", + "arguments": [ + "uint64x1_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u16_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u32_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u32_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u32_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u32_p16", + "arguments": [ + "poly16x4_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u32_p64", + "arguments": [ + "poly64x1_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + 
"instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u32_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u32_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u32_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u32_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u32_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u32_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u32_u64", + "arguments": [ + "uint64x1_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u32_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u64_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u64_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u64_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u64_p16", + "arguments": [ + "poly16x4_t a" + ], + 
"return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u64_p64", + "arguments": [ + "poly64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u64_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u64_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u64_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u64_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u64_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u64_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u64_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u64_u8", + "arguments": [ + "uint8x8_t a" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u8_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u8_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + 
"v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u8_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u8_p16", + "arguments": [ + "poly16x4_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u8_p64", + "arguments": [ + "poly64x1_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u8_p8", + "arguments": [ + "poly8x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u8_s16", + "arguments": [ + "int16x4_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u8_s32", + "arguments": [ + "int32x2_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u8_s64", + "arguments": [ + "int64x1_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u8_s8", + "arguments": [ + "int8x8_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u8_u16", + "arguments": [ + "uint16x4_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u8_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpret_u8_u64", + "arguments": [ + "uint64x1_t a" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f16_f32", + "arguments": [ + "float32x4_t a" + ], + 
"return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f16_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f16_p128", + "arguments": [ + "poly128_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1Q" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f16_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f16_p64", + "arguments": [ + "poly64x2_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f16_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f16_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f16_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f16_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f16_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f16_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f16_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ 
+ "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f16_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f16_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f32_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f32_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f32_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f32_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f32_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f32_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f32_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f32_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f32_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vreinterpretq_f32_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f32_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f32_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f64_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f64_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f64_p128", + "arguments": [ + "poly128_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1Q" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f64_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f64_p64", + "arguments": [ + "poly64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f64_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f64_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f64_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f64_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + 
[ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f64_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f64_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f64_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f64_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_f64_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p128_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p128_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p128_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1Q" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p128_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p128_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p128_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p128_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" 
+ } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p128_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1Q" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p128_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p128_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p128_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p128_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1Q" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p128_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "poly128_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p16_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p16_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p16_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p16_p128", + "arguments": [ + "poly128_t a" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1Q" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p16_p64", + "arguments": [ + "poly64x2_t a" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p16_p8", + "arguments": [ + "poly8x16_t a" 
+ ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p16_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p16_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p16_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p16_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p16_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p16_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p16_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p16_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p64_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p64_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p64_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + 
"Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p64_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p64_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p64_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p64_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p64_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p64_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p64_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p64_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p64_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p64_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p8_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p8_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { 
+ "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p8_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p8_p128", + "arguments": [ + "poly128_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1Q" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p8_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p8_p64", + "arguments": [ + "poly64x2_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p8_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p8_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p8_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p8_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p8_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p8_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p8_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + 
"instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_p8_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s16_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s16_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s16_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s16_p128", + "arguments": [ + "poly128_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1Q" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s16_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s16_p64", + "arguments": [ + "poly64x2_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s16_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s16_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s16_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s16_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s16_u16", + "arguments": [ + "uint16x8_t a" + ], + 
"return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s16_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s16_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s16_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s32_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s32_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s32_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s32_p128", + "arguments": [ + "poly128_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1Q" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s32_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s32_p64", + "arguments": [ + "poly64x2_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s32_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s32_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + 
"A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s32_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s32_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s32_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s32_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s32_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s32_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s64_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s64_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s64_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s64_p128", + "arguments": [ + "poly128_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1Q" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s64_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s64_p64", + "arguments": [ + 
"poly64x2_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s64_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s64_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s64_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s64_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s64_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s64_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s64_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s64_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s8_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s8_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s8_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + 
}, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s8_p128", + "arguments": [ + "poly128_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1Q" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s8_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s8_p64", + "arguments": [ + "poly64x2_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s8_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s8_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s8_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s8_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s8_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s8_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s8_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_s8_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u16_f16", + 
"arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u16_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u16_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u16_p128", + "arguments": [ + "poly128_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1Q" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u16_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u16_p64", + "arguments": [ + "poly64x2_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u16_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u16_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u16_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u16_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u16_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u16_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" 
+ } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u16_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u16_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u32_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u32_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u32_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u32_p128", + "arguments": [ + "poly128_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1Q" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u32_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u32_p64", + "arguments": [ + "poly64x2_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u32_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u32_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u32_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": 
"vreinterpretq_u32_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u32_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u32_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u32_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u32_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u64_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u64_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u64_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u64_p128", + "arguments": [ + "poly128_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1Q" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u64_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u64_p64", + "arguments": [ + "poly64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u64_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + 
"a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u64_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u64_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u64_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u64_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u64_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u64_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u64_u8", + "arguments": [ + "uint8x16_t a" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u8_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u8_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u8_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u8_p128", + "arguments": [ + "poly128_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.1Q" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vreinterpretq_u8_p16", + "arguments": [ + "poly16x8_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u8_p64", + "arguments": [ + "poly64x2_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u8_p8", + "arguments": [ + "poly8x16_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u8_s16", + "arguments": [ + "int16x8_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u8_s32", + "arguments": [ + "int32x4_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u8_s64", + "arguments": [ + "int64x2_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u8_s8", + "arguments": [ + "int8x16_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u8_u16", + "arguments": [ + "uint16x8_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u8_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vreinterpretq_u8_u64", + "arguments": [ + "uint64x2_t a" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "NOP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev16_p8", + "arguments": [ + "poly8x8_t vec" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV16" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev16_s8", + "arguments": [ + "int8x8_t vec" + ], + "return_type": { + "value": "int8x8_t" + }, + 
"Arguments_Preparation": { + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV16" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev16_u8", + "arguments": [ + "uint8x8_t vec" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV16" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev16q_p8", + "arguments": [ + "poly8x16_t vec" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV16" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev16q_s8", + "arguments": [ + "int8x16_t vec" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV16" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev16q_u8", + "arguments": [ + "uint8x16_t vec" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV16" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev32_p16", + "arguments": [ + "poly16x4_t vec" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV32" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev32_p8", + "arguments": [ + "poly8x8_t vec" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV32" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev32_s16", + "arguments": [ + "int16x4_t vec" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV32" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev32_s8", + "arguments": [ + "int8x8_t vec" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV32" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev32_u16", + "arguments": [ + "uint16x4_t vec" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV32" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev32_u8", + "arguments": [ + "uint8x8_t vec" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV32" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev32q_p16", + "arguments": [ + "poly16x8_t vec" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV32" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vrev32q_p8", + "arguments": [ + "poly8x16_t vec" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV32" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev32q_s16", + "arguments": [ + "int16x8_t vec" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV32" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev32q_s8", + "arguments": [ + "int8x16_t vec" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV32" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev32q_u16", + "arguments": [ + "uint16x8_t vec" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV32" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev32q_u8", + "arguments": [ + "uint8x16_t vec" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV32" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64_f16", + "arguments": [ + "float16x4_t vec" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64_f32", + "arguments": [ + "float32x2_t vec" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64_p16", + "arguments": [ + "poly16x4_t vec" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64_p8", + "arguments": [ + "poly8x8_t vec" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64_s16", + "arguments": [ + "int16x4_t vec" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64_s32", + "arguments": [ + "int32x2_t vec" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64_s8", + "arguments": [ + "int8x8_t vec" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "vec": { 
+ "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64_u16", + "arguments": [ + "uint16x4_t vec" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64_u32", + "arguments": [ + "uint32x2_t vec" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64_u8", + "arguments": [ + "uint8x8_t vec" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64q_f16", + "arguments": [ + "float16x8_t vec" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64q_f32", + "arguments": [ + "float32x4_t vec" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64q_p16", + "arguments": [ + "poly16x8_t vec" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64q_p8", + "arguments": [ + "poly8x16_t vec" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64q_s16", + "arguments": [ + "int16x8_t vec" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64q_s32", + "arguments": [ + "int32x4_t vec" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64q_s8", + "arguments": [ + "int8x16_t vec" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64q_u16", + "arguments": [ + "uint16x8_t vec" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + 
"name": "vrev64q_u32", + "arguments": [ + "uint32x4_t vec" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrev64q_u8", + "arguments": [ + "uint8x16_t vec" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "vec": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "REV64" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrhadd_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrhadd_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrhadd_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrhadd_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrhadd_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrhadd_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrhaddq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrhaddq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrhaddq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + 
"return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrhaddq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrhaddq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrhaddq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URHADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd32x_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT32X" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd32x_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT32X" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd32xq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT32X" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd32xq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT32X" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd32z_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT32Z" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd32z_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT32Z" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd32zq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT32Z" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd32zq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + 
"register": "Vn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT32Z" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd64x_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT64X" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd64x_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT64X" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd64xq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT64X" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd64xq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT64X" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd64z_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT64Z" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd64z_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT64Z" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd64zq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT64Z" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd64zq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINT64Z" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnd_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINTZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnda_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + 
"A64" + ], + "instructions": [ + [ + "FRINTA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnda_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrnda_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINTA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndah_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndaq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndaq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndaq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINTA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndh_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndi_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINTI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndi_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndi_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINTI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndih_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndiq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + 
"FRINTI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndiq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndiq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINTI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndm_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndm_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndm_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINTM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndmh_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndmq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndmq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndmq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINTM" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndn_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndn_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndn_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTN" + ] 
+ ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndnh_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndnq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndnq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndnq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndns_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndp_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndp_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndp_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINTP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndph_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndpq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndpq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTP" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndpq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINTP" + ] + ] + }, + 
{ + "SIMD_ISA": "Neon", + "name": "vrndq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINTZ" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndx_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndx_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndx_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINTX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndxh_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndxq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndxq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRINTX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrndxq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRINTX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshl_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshl_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, 
+ "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshl_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshl_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshl_u16", + "arguments": [ + "uint16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshl_u32", + "arguments": [ + "uint32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshl_u64", + "arguments": [ + "uint64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshl_u8", + "arguments": [ + "uint8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshld_s64", + "arguments": [ + "int64_t a", + "int64_t b" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshld_u64", + "arguments": [ + "uint64_t a", + "int64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "URSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshlq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshlq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + 
"SRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshlq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshlq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshlq_u16", + "arguments": [ + "uint16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshlq_u32", + "arguments": [ + "uint32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshlq_u64", + "arguments": [ + "uint64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshlq_u8", + "arguments": [ + "uint8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshr_n_s16", + "arguments": [ + "int16x4_t a", + "const int n" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshr_n_s32", + "arguments": [ + "int32x2_t a", + "const int n" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshr_n_s64", + "arguments": [ + "int64x1_t a", + "const int n" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshr_n_s8", + "arguments": [ + "int8x8_t a", + "const int n" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + 
"A32", + "A64" + ], + "instructions": [ + [ + "SRSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshr_n_u16", + "arguments": [ + "uint16x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshr_n_u32", + "arguments": [ + "uint32x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshr_n_u64", + "arguments": [ + "uint64x1_t a", + "const int n" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshr_n_u8", + "arguments": [ + "uint8x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrd_n_s64", + "arguments": [ + "int64_t a", + "const int n" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SRSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrd_n_u64", + "arguments": [ + "uint64_t a", + "const int n" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "URSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrn_high_n_s16", + "arguments": [ + "int8x8_t r", + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrn_high_n_s32", + "arguments": [ + "int16x4_t r", + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrn_high_n_s64", + "arguments": [ + "int32x2_t r", + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrn_high_n_u16", + "arguments": [ + "uint8x8_t r", + "uint16x8_t a", 
+ "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrn_high_n_u32", + "arguments": [ + "uint16x4_t r", + "uint32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrn_high_n_u64", + "arguments": [ + "uint32x2_t r", + "uint64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + }, + "r": { + "register": "32(Vd)" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RSHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrn_n_s16", + "arguments": [ + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrn_n_s32", + "arguments": [ + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrn_n_s64", + "arguments": [ + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrn_n_u16", + "arguments": [ + "uint16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrn_n_u32", + "arguments": [ + "uint32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrn_n_u64", + "arguments": [ + "uint64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RSHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrq_n_s16", + "arguments": [ + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + 
"n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrq_n_s32", + "arguments": [ + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrq_n_s64", + "arguments": [ + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrq_n_s8", + "arguments": [ + "int8x16_t a", + "const int n" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrq_n_u16", + "arguments": [ + "uint16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrq_n_u32", + "arguments": [ + "uint32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrq_n_u64", + "arguments": [ + "uint64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrshrq_n_u8", + "arguments": [ + "uint8x16_t a", + "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrte_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRSQRTE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrte_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FRSQRTE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrte_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + 
"Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRSQRTE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrte_u32", + "arguments": [ + "uint32x2_t a" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSQRTE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrted_f64", + "arguments": [ + "float64_t a" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRSQRTE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrteh_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRSQRTE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrteq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRSQRTE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrteq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FRSQRTE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrteq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRSQRTE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrteq_u32", + "arguments": [ + "uint32x4_t a" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSQRTE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrtes_f32", + "arguments": [ + "float32_t a" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRSQRTE" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrts_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRSQRTS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrts_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FRSQRTS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrts_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRSQRTS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + 
"name": "vrsqrtsd_f64", + "arguments": [ + "float64_t a", + "float64_t b" + ], + "return_type": { + "value": "float64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRSQRTS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrtsh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRSQRTS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrtsq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FRSQRTS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrtsq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FRSQRTS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrtsq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRSQRTS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsqrtss_f32", + "arguments": [ + "float32_t a", + "float32_t b" + ], + "return_type": { + "value": "float32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sn" + }, + "b": { + "register": "Sm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FRSQRTS" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsra_n_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "const int n" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsra_n_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "const int n" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsra_n_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b", + "const int n" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsra_n_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b", + "const int n" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": 
"Vn.8B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsra_n_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsra_n_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsra_n_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b", + "const int n" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsra_n_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsrad_n_s64", + "arguments": [ + "int64_t a", + "int64_t b", + "const int n" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SRSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsrad_n_u64", + "arguments": [ + "uint64_t a", + "uint64_t b", + "const int n" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "URSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsraq_n_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsraq_n_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsraq_n_s64", + "arguments": [ + "int64x2_t a", + 
"int64x2_t b", + "const int n" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsraq_n_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b", + "const int n" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsraq_n_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsraq_n_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsraq_n_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b", + "const int n" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsraq_n_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b", + "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "URSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsubhn_high_s16", + "arguments": [ + "int8x8_t r", + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RSUBHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsubhn_high_s32", + "arguments": [ + "int16x4_t r", + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RSUBHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsubhn_high_s64", + "arguments": [ + "int32x2_t r", + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + }, + "r": { + "register": "Vd.2S" + } + 
}, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RSUBHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsubhn_high_u16", + "arguments": [ + "uint8x8_t r", + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RSUBHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsubhn_high_u32", + "arguments": [ + "uint16x4_t r", + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RSUBHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsubhn_high_u64", + "arguments": [ + "uint32x2_t r", + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "RSUBHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsubhn_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RSUBHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsubhn_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RSUBHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsubhn_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RSUBHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsubhn_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RSUBHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsubhn_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RSUBHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vrsubhn_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "RSUBHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vset_lane_f16", + "arguments": [ + "float16_t a", + "float16x4_t v", + 
"const int lane" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "VnH" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vset_lane_f32", + "arguments": [ + "float32_t a", + "float32x2_t v", + "const int lane" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vset_lane_f64", + "arguments": [ + "float64_t a", + "float64x1_t v", + "const int lane" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vset_lane_p16", + "arguments": [ + "poly16_t a", + "poly16x4_t v", + "const int lane" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vset_lane_p64", + "arguments": [ + "poly64_t a", + "poly64x1_t v", + "const int lane" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vset_lane_p8", + "arguments": [ + "poly8_t a", + "poly8x8_t v", + "const int lane" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vset_lane_s16", + "arguments": [ + "int16_t a", + "int16x4_t v", + "const int lane" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vset_lane_s32", + "arguments": [ + "int32_t a", + "int32x2_t v", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vset_lane_s64", + "arguments": [ + "int64_t a", + "int64x1_t v", + "const int lane" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": 
"Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vset_lane_s8", + "arguments": [ + "int8_t a", + "int8x8_t v", + "const int lane" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vset_lane_u16", + "arguments": [ + "uint16_t a", + "uint16x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vset_lane_u32", + "arguments": [ + "uint32_t a", + "uint32x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vset_lane_u64", + "arguments": [ + "uint64_t a", + "uint64x1_t v", + "const int lane" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 0 + }, + "v": { + "register": "Vd.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vset_lane_u8", + "arguments": [ + "uint8_t a", + "uint8x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsetq_lane_f16", + "arguments": [ + "float16_t a", + "float16x8_t v", + "const int lane" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "VnH" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsetq_lane_f32", + "arguments": [ + "float32_t a", + "float32x4_t v", + "const int lane" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsetq_lane_f64", + "arguments": [ + "float64_t a", + "float64x2_t v", + "const int lane" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsetq_lane_p16", + "arguments": [ + 
"poly16_t a", + "poly16x8_t v", + "const int lane" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsetq_lane_p64", + "arguments": [ + "poly64_t a", + "poly64x2_t v", + "const int lane" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsetq_lane_p8", + "arguments": [ + "poly8_t a", + "poly8x16_t v", + "const int lane" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 15 + }, + "v": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsetq_lane_s16", + "arguments": [ + "int16_t a", + "int16x8_t v", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsetq_lane_s32", + "arguments": [ + "int32_t a", + "int32x4_t v", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "v": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsetq_lane_s64", + "arguments": [ + "int64_t a", + "int64x2_t v", + "const int lane" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsetq_lane_s8", + "arguments": [ + "int8_t a", + "int8x16_t v", + "const int lane" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 15 + }, + "v": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsetq_lane_u16", + "arguments": [ + "uint16_t a", + "uint16x8_t v", + "const int lane" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 7 + }, + "v": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsetq_lane_u32", + "arguments": [ + "uint32_t a", + "uint32x4_t v", + "const int lane" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 
0, + "maximum": 3 + }, + "v": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsetq_lane_u64", + "arguments": [ + "uint64_t a", + "uint64x2_t v", + "const int lane" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "v": { + "register": "Vd.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsetq_lane_u8", + "arguments": [ + "uint8_t a", + "uint8x16_t v", + "const int lane" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Rn" + }, + "lane": { + "minimum": 0, + "maximum": 15 + }, + "v": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOV" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsha1cq_u32", + "arguments": [ + "uint32x4_t hash_abcd", + "uint32_t hash_e", + "uint32x4_t wk" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "hash_abcd": { + "register": "Qd" + }, + "hash_e": { + "register": "Sn" + }, + "wk": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SHA1C" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsha1h_u32", + "arguments": [ + "uint32_t hash_e" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "hash_e": { + "register": "Sn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SHA1H" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsha1mq_u32", + "arguments": [ + "uint32x4_t hash_abcd", + "uint32_t hash_e", + "uint32x4_t wk" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "hash_abcd": { + "register": "Qd" + }, + "hash_e": { + "register": "Sn" + }, + "wk": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SHA1M" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsha1pq_u32", + "arguments": [ + "uint32x4_t hash_abcd", + "uint32_t hash_e", + "uint32x4_t wk" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "hash_abcd": { + "register": "Qd" + }, + "hash_e": { + "register": "Sn" + }, + "wk": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SHA1P" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsha1su0q_u32", + "arguments": [ + "uint32x4_t w0_3", + "uint32x4_t w4_7", + "uint32x4_t w8_11" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "w0_3": { + "register": "Vd.4S" + }, + "w4_7": { + "register": "Vn.4S" + }, + "w8_11": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SHA1SU0" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsha1su1q_u32", + "arguments": [ + "uint32x4_t tw0_3", + "uint32x4_t w12_15" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "tw0_3": { + "register": "Vd.4S" + }, + "w12_15": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SHA1SU1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsha256h2q_u32", + "arguments": [ + "uint32x4_t hash_efgh", + "uint32x4_t hash_abcd", + "uint32x4_t wk" + ], + "return_type": { + 
"value": "uint32x4_t" + }, + "Arguments_Preparation": { + "hash_abcd": { + "register": "Qn" + }, + "hash_efgh": { + "register": "Qd" + }, + "wk": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SHA256H2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsha256hq_u32", + "arguments": [ + "uint32x4_t hash_abcd", + "uint32x4_t hash_efgh", + "uint32x4_t wk" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "hash_abcd": { + "register": "Qd" + }, + "hash_efgh": { + "register": "Qn" + }, + "wk": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SHA256H" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsha256su0q_u32", + "arguments": [ + "uint32x4_t w0_3", + "uint32x4_t w4_7" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "w0_3": { + "register": "Vd.4S" + }, + "w4_7": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SHA256SU0" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsha256su1q_u32", + "arguments": [ + "uint32x4_t tw0_3", + "uint32x4_t w8_11", + "uint32x4_t w12_15" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "tw0_3": { + "register": "Vd.4S" + }, + "w12_15": { + "register": "Vm.4S" + }, + "w8_11": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SHA256SU1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsha512h2q_u64", + "arguments": [ + "uint64x2_t sum_ab", + "uint64x2_t hash_c_", + "uint64x2_t hash_ab" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "hash_ab": {}, + "hash_c_": { + "register": "Qn" + }, + "sum_ab": { + "register": "Qd" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SHA512H2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsha512hq_u64", + "arguments": [ + "uint64x2_t hash_ed", + "uint64x2_t hash_gf", + "uint64x2_t kwh_kwh2" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "hash_ed": { + "register": "Qd" + }, + "hash_gf": { + "register": "Qn" + }, + "kwh_kwh2": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SHA512H" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsha512su0q_u64", + "arguments": [ + "uint64x2_t w0_1", + "uint64x2_t w2_" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "w0_1": { + "register": "Vd.2D" + }, + "w2_": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SHA512SU0" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsha512su1q_u64", + "arguments": [ + "uint64x2_t s01_s02", + "uint64x2_t w14_15", + "uint64x2_t w9_10" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "s01_s02": { + "register": "Vd.2D" + }, + "w14_15": { + "register": "Vn.2D" + }, + "w9_10": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SHA512SU1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_n_s16", + "arguments": [ + "int16x4_t a", + "const int n" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_n_s32", + "arguments": [ + "int32x2_t a", + 
"const int n" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_n_s64", + "arguments": [ + "int64x1_t a", + "const int n" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_n_s8", + "arguments": [ + "int8x8_t a", + "const int n" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_n_u16", + "arguments": [ + "uint16x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_n_u32", + "arguments": [ + "uint32x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_n_u64", + "arguments": [ + "uint64x1_t a", + "const int n" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_n_u8", + "arguments": [ + "uint8x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_s8", + 
"arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_u16", + "arguments": [ + "uint16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_u32", + "arguments": [ + "uint32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_u64", + "arguments": [ + "uint64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshl_u8", + "arguments": [ + "uint8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshld_n_s64", + "arguments": [ + "int64_t a", + "const int n" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshld_n_u64", + "arguments": [ + "uint64_t a", + "const int n" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshld_s64", + "arguments": [ + "int64_t a", + "int64_t b" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshld_u64", + "arguments": [ + "uint64_t a", + "int64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshll_high_n_s16", + "arguments": [ + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 0, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SSHLL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshll_high_n_s32", + "arguments": [ + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "int64x2_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 0, + "maximum": 32 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SSHLL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshll_high_n_s8", + "arguments": [ + "int8x16_t a", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "n": { + "minimum": 0, + "maximum": 8 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SSHLL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshll_high_n_u16", + "arguments": [ + "uint16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 0, + "maximum": 16 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USHLL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshll_high_n_u32", + "arguments": [ + "uint32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 0, + "maximum": 32 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USHLL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshll_high_n_u8", + "arguments": [ + "uint8x16_t a", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "n": { + "minimum": 0, + "maximum": 8 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USHLL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshll_n_s16", + "arguments": [ + "int16x4_t a", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "minimum": 0, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHLL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshll_n_s32", + "arguments": [ + "int32x2_t a", + "const int n" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "n": { + "minimum": 0, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHLL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshll_n_s8", + "arguments": [ + "int8x8_t a", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "n": { + "minimum": 0, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHLL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshll_n_u16", + "arguments": [ + "uint16x4_t a", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "minimum": 0, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHLL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshll_n_u32", + "arguments": [ + "uint32x2_t a", + "const int n" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "n": { + "minimum": 0, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHLL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshll_n_u8", + "arguments": [ + "uint8x8_t a", + "const int n" + ], 
+ "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "n": { + "minimum": 0, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHLL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshlq_n_s16", + "arguments": [ + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshlq_n_s32", + "arguments": [ + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshlq_n_s64", + "arguments": [ + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshlq_n_s8", + "arguments": [ + "int8x16_t a", + "const int n" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshlq_n_u16", + "arguments": [ + "uint16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshlq_n_u32", + "arguments": [ + "uint32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshlq_n_u64", + "arguments": [ + "uint64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshlq_n_u8", + "arguments": [ + "uint8x16_t a", + "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshlq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHL" + ] + ] + }, + { + "SIMD_ISA": 
"Neon", + "name": "vshlq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshlq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshlq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshlq_u16", + "arguments": [ + "uint16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshlq_u32", + "arguments": [ + "uint32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshlq_u64", + "arguments": [ + "uint64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshlq_u8", + "arguments": [ + "uint8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshr_n_s16", + "arguments": [ + "int16x4_t a", + "const int n" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshr_n_s32", + "arguments": [ + "int32x2_t a", + "const int n" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshr_n_s64", + "arguments": [ + "int64x1_t a", + "const int n" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHR" + ] + ] + }, + 
{ + "SIMD_ISA": "Neon", + "name": "vshr_n_s8", + "arguments": [ + "int8x8_t a", + "const int n" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshr_n_u16", + "arguments": [ + "uint16x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshr_n_u32", + "arguments": [ + "uint32x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshr_n_u64", + "arguments": [ + "uint64x1_t a", + "const int n" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshr_n_u8", + "arguments": [ + "uint8x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrd_n_s64", + "arguments": [ + "int64_t a", + "const int n" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrd_n_u64", + "arguments": [ + "uint64_t a", + "const int n" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrn_high_n_s16", + "arguments": [ + "int8x8_t r", + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrn_high_n_s32", + "arguments": [ + "int16x4_t r", + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrn_high_n_s64", + "arguments": [ + "int32x2_t r", + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + 
"n": { + "minimum": 1, + "maximum": 32 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrn_high_n_u16", + "arguments": [ + "uint8x8_t r", + "uint16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrn_high_n_u32", + "arguments": [ + "uint16x4_t r", + "uint32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrn_high_n_u64", + "arguments": [ + "uint32x2_t r", + "uint64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SHRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrn_n_s16", + "arguments": [ + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrn_n_s32", + "arguments": [ + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrn_n_s64", + "arguments": [ + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrn_n_u16", + "arguments": [ + "uint16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrn_n_u32", + "arguments": [ + "uint32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrn_n_u64", + "arguments": [ + "uint64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + 
"instructions": [ + [ + "SHRN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrq_n_s16", + "arguments": [ + "int16x8_t a", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrq_n_s32", + "arguments": [ + "int32x4_t a", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrq_n_s64", + "arguments": [ + "int64x2_t a", + "const int n" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrq_n_s8", + "arguments": [ + "int8x16_t a", + "const int n" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrq_n_u16", + "arguments": [ + "uint16x8_t a", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrq_n_u32", + "arguments": [ + "uint32x4_t a", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrq_n_u64", + "arguments": [ + "uint64x2_t a", + "const int n" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vshrq_n_u8", + "arguments": [ + "uint8x16_t a", + "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USHR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsli_n_p16", + "arguments": [ + "poly16x4_t a", + "poly16x4_t b", + "const int n" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsli_n_p64", + "arguments": [ + "poly64x1_t a", + "poly64x1_t b", + "const int n" + ], + "return_type": { + "value": 
"poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsli_n_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b", + "const int n" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsli_n_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "const int n" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsli_n_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "const int n" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsli_n_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b", + "const int n" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsli_n_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b", + "const int n" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsli_n_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsli_n_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsli_n_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b", + "const int n" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + 
"SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsli_n_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vslid_n_s64", + "arguments": [ + "int64_t a", + "int64_t b", + "const int n" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vslid_n_u64", + "arguments": [ + "uint64_t a", + "uint64_t b", + "const int n" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsliq_n_p16", + "arguments": [ + "poly16x8_t a", + "poly16x8_t b", + "const int n" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsliq_n_p64", + "arguments": [ + "poly64x2_t a", + "poly64x2_t b", + "const int n" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsliq_n_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b", + "const int n" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsliq_n_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsliq_n_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsliq_n_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b", + "const int n" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + 
"register": "Vn.2D" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsliq_n_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b", + "const int n" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsliq_n_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "n": { + "minimum": 0, + "maximum": 15 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsliq_n_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "n": { + "minimum": 0, + "maximum": 31 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsliq_n_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b", + "const int n" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "n": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsliq_n_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b", + "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "n": { + "minimum": 0, + "maximum": 7 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SLI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsm3partw1q_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x4_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SM3PARTW1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsm3partw2q_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x4_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SM3PARTW2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsm3ss1q_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x4_t c" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": {}, + "c": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SM3SS1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsm3tt1aq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x4_t c", + "const int imm2" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + 
"a": { + "register": "Vd.4S" + }, + "b": {}, + "c": {}, + "imm2": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SM3TT1A" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsm3tt1bq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x4_t c", + "const int imm2" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": {}, + "c": {}, + "imm2": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SM3TT1B" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsm3tt2aq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x4_t c", + "const int imm2" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": {}, + "c": {}, + "imm2": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SM3TT2A" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsm3tt2bq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "uint32x4_t c", + "const int imm2" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": {}, + "c": {}, + "imm2": { + "minimum": 0, + "maximum": 3 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SM3TT2B" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsm4ekeyq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SM4EKEY" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsm4eq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": {} + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SM4E" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqadd_u16", + "arguments": [ + "uint16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqadd_u32", + "arguments": [ + "uint32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqadd_u64", + "arguments": [ + "uint64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqadd_u8", + "arguments": [ + "uint8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqaddb_u8", + "arguments": [ + "uint8_t a", + "int8_t b" + ], + "return_type": { 
+ "value": "uint8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Bd" + }, + "b": { + "register": "Bn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqaddd_u64", + "arguments": [ + "uint64_t a", + "int64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqaddh_u16", + "arguments": [ + "uint16_t a", + "int16_t b" + ], + "return_type": { + "value": "uint16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hd" + }, + "b": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqaddq_u16", + "arguments": [ + "uint16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqaddq_u32", + "arguments": [ + "uint32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqaddq_u64", + "arguments": [ + "uint64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqaddq_u8", + "arguments": [ + "uint8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqadds_u32", + "arguments": [ + "uint32_t a", + "int32_t b" + ], + "return_type": { + "value": "uint32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqrt_f16", + "arguments": [ + "float16x4_t a" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FSQRT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqrt_f32", + "arguments": [ + "float32x2_t a" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FSQRT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqrt_f64", + "arguments": [ + "float64x1_t a" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FSQRT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqrth_f16", + "arguments": [ + "float16_t a" + ], + "return_type": { + "value": 
"float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FSQRT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqrtq_f16", + "arguments": [ + "float16x8_t a" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FSQRT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqrtq_f32", + "arguments": [ + "float32x4_t a" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FSQRT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsqrtq_f64", + "arguments": [ + "float64x2_t a" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FSQRT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsra_n_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "const int n" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsra_n_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "const int n" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsra_n_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b", + "const int n" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsra_n_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b", + "const int n" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsra_n_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsra_n_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USRA" + ] + ] + }, + { + "SIMD_ISA": 
"Neon", + "name": "vsra_n_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b", + "const int n" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsra_n_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsrad_n_s64", + "arguments": [ + "int64_t a", + "int64_t b", + "const int n" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsrad_n_u64", + "arguments": [ + "uint64_t a", + "uint64_t b", + "const int n" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsraq_n_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsraq_n_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsraq_n_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b", + "const int n" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsraq_n_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b", + "const int n" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsraq_n_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "n": { + 
"minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsraq_n_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsraq_n_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b", + "const int n" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsraq_n_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b", + "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USRA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsri_n_p16", + "arguments": [ + "poly16x4_t a", + "poly16x4_t b", + "const int n" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsri_n_p64", + "arguments": [ + "poly64x1_t a", + "poly64x1_t b", + "const int n" + ], + "return_type": { + "value": "poly64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsri_n_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b", + "const int n" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsri_n_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b", + "const int n" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsri_n_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b", + "const int n" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsri_n_s64", + "arguments": [ + "int64x1_t a", + 
"int64x1_t b", + "const int n" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsri_n_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b", + "const int n" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsri_n_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b", + "const int n" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsri_n_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b", + "const int n" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsri_n_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b", + "const int n" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsri_n_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b", + "const int n" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsrid_n_s64", + "arguments": [ + "int64_t a", + "int64_t b", + "const int n" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsrid_n_u64", + "arguments": [ + "uint64_t a", + "uint64_t b", + "const int n" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsriq_n_p16", + "arguments": [ + "poly16x8_t a", + "poly16x8_t b", + "const int n" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], 
+ "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsriq_n_p64", + "arguments": [ + "poly64x2_t a", + "poly64x2_t b", + "const int n" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsriq_n_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b", + "const int n" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsriq_n_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b", + "const int n" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsriq_n_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b", + "const int n" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsriq_n_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b", + "const int n" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsriq_n_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b", + "const int n" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsriq_n_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b", + "const int n" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + }, + "n": { + "minimum": 1, + "maximum": 16 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsriq_n_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b", + "const int n" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + }, + "n": { + "minimum": 1, + "maximum": 32 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsriq_n_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b", + "const int n" + ], + "return_type": { + "value": "uint64x2_t" + 
}, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + }, + "n": { + "minimum": 1, + "maximum": 64 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsriq_n_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b", + "const int n" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + }, + "n": { + "minimum": 1, + "maximum": 8 + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SRI" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_f16", + "arguments": [ + "float16_t * ptr", + "float16x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_f16_x2", + "arguments": [ + "float16_t * ptr", + "float16x4x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_f16_x3", + "arguments": [ + "float16_t * ptr", + "float16x4x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_f16_x4", + "arguments": [ + "float16_t * ptr", + "float16x4x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_f32", + "arguments": [ + "float32_t * ptr", + "float32x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_f32_x2", + "arguments": [ + "float32_t * ptr", + "float32x2x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_f32_x3", + "arguments": [ + "float32_t * ptr", + "float32x2x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_f32_x4", + "arguments": [ + "float32_t * ptr", + "float32x2x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vst1_f64", + "arguments": [ + "float64_t * ptr", + "float64x1_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_f64_x2", + "arguments": [ + "float64_t * ptr", + "float64x1x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_f64_x3", + "arguments": [ + "float64_t * ptr", + "float64x1x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_f64_x4", + "arguments": [ + "float64_t * ptr", + "float64x1x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_lane_f16", + "arguments": [ + "float16_t * ptr", + "float16x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_lane_f32", + "arguments": [ + "float32_t * ptr", + "float32x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_lane_f64", + "arguments": [ + "float64_t * ptr", + "float64x1_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_lane_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_lane_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x1_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_lane_p8", + "arguments": [ + "poly8_t 
* ptr", + "poly8x8_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_lane_s16", + "arguments": [ + "int16_t * ptr", + "int16x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_lane_s32", + "arguments": [ + "int32_t * ptr", + "int32x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_lane_s64", + "arguments": [ + "int64_t * ptr", + "int64x1_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_lane_s8", + "arguments": [ + "int8_t * ptr", + "int8x8_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_lane_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_lane_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_lane_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x1_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_lane_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x8_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + 
"maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_p16_x2", + "arguments": [ + "poly16_t * ptr", + "poly16x4x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_p16_x3", + "arguments": [ + "poly16_t * ptr", + "poly16x4x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_p16_x4", + "arguments": [ + "poly16_t * ptr", + "poly16x4x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x1_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_p64_x2", + "arguments": [ + "poly64_t * ptr", + "poly64x1x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_p64_x3", + "arguments": [ + "poly64_t * ptr", + "poly64x1x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_p64_x4", + "arguments": [ + "poly64_t * ptr", + "poly64x1x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_p8", + "arguments": [ + "poly8_t * ptr", + "poly8x8_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_p8_x2", + "arguments": [ + "poly8_t * ptr", + "poly8x8x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + 
"ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_p8_x3", + "arguments": [ + "poly8_t * ptr", + "poly8x8x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_p8_x4", + "arguments": [ + "poly8_t * ptr", + "poly8x8x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s16", + "arguments": [ + "int16_t * ptr", + "int16x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s16_x2", + "arguments": [ + "int16_t * ptr", + "int16x4x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s16_x3", + "arguments": [ + "int16_t * ptr", + "int16x4x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s16_x4", + "arguments": [ + "int16_t * ptr", + "int16x4x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s32", + "arguments": [ + "int32_t * ptr", + "int32x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s32_x2", + "arguments": [ + "int32_t * ptr", + "int32x2x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s32_x3", + "arguments": [ + "int32_t * ptr", + "int32x2x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s32_x4", + "arguments": [ + "int32_t * ptr", + "int32x2x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { 
+ "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s64", + "arguments": [ + "int64_t * ptr", + "int64x1_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s64_x2", + "arguments": [ + "int64_t * ptr", + "int64x1x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s64_x3", + "arguments": [ + "int64_t * ptr", + "int64x1x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s64_x4", + "arguments": [ + "int64_t * ptr", + "int64x1x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s8", + "arguments": [ + "int8_t * ptr", + "int8x8_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s8_x2", + "arguments": [ + "int8_t * ptr", + "int8x8x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s8_x3", + "arguments": [ + "int8_t * ptr", + "int8x8x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_s8_x4", + "arguments": [ + "int8_t * ptr", + "int8x8x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u16_x2", + "arguments": [ + "uint16_t * ptr", + "uint16x4x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { 
+ "register": "Xn" + }, + "val": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u16_x3", + "arguments": [ + "uint16_t * ptr", + "uint16x4x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u16_x4", + "arguments": [ + "uint16_t * ptr", + "uint16x4x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u32_x2", + "arguments": [ + "uint32_t * ptr", + "uint32x2x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u32_x3", + "arguments": [ + "uint32_t * ptr", + "uint32x2x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u32_x4", + "arguments": [ + "uint32_t * ptr", + "uint32x2x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x1_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u64_x2", + "arguments": [ + "uint64_t * ptr", + "uint64x1x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u64_x3", + "arguments": [ + "uint64_t * ptr", + "uint64x1x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u64_x4", + "arguments": [ + "uint64_t * ptr", + "uint64x1x4_t val" + ], + "return_type": { + "value": "void" + }, + 
"Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x8_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u8_x2", + "arguments": [ + "uint8_t * ptr", + "uint8x8x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u8_x3", + "arguments": [ + "uint8_t * ptr", + "uint8x8x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1_u8_x4", + "arguments": [ + "uint8_t * ptr", + "uint8x8x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_f16", + "arguments": [ + "float16_t * ptr", + "float16x8_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_f16_x2", + "arguments": [ + "float16_t * ptr", + "float16x8x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_f16_x3", + "arguments": [ + "float16_t * ptr", + "float16x8x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_f16_x4", + "arguments": [ + "float16_t * ptr", + "float16x8x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_f32", + "arguments": [ + "float32_t * ptr", + "float32x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_f32_x2", + "arguments": [ + "float32_t * ptr", + "float32x4x2_t val" + ], + "return_type": { + 
"value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_f32_x3", + "arguments": [ + "float32_t * ptr", + "float32x4x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_f32_x4", + "arguments": [ + "float32_t * ptr", + "float32x4x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_f64", + "arguments": [ + "float64_t * ptr", + "float64x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_f64_x2", + "arguments": [ + "float64_t * ptr", + "float64x2x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_f64_x3", + "arguments": [ + "float64_t * ptr", + "float64x2x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_f64_x4", + "arguments": [ + "float64_t * ptr", + "float64x2x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_lane_f16", + "arguments": [ + "float16_t * ptr", + "float16x8_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_lane_f32", + "arguments": [ + "float32_t * ptr", + "float32x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_lane_f64", + "arguments": [ + "float64_t * ptr", + "float64x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.2D" + } + }, + "Architectures": [ + "A64" + ], + 
"instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_lane_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x8_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_lane_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_lane_p8", + "arguments": [ + "poly8_t * ptr", + "poly8x16_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_lane_s16", + "arguments": [ + "int16_t * ptr", + "int16x8_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_lane_s32", + "arguments": [ + "int32_t * ptr", + "int32x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_lane_s64", + "arguments": [ + "int64_t * ptr", + "int64x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_lane_s8", + "arguments": [ + "int8_t * ptr", + "int8x16_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_lane_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x8_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_lane_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x4_t val", 
+ "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_lane_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_lane_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x16_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x8_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_p16_x2", + "arguments": [ + "poly16_t * ptr", + "poly16x8x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_p16_x3", + "arguments": [ + "poly16_t * ptr", + "poly16x8x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_p16_x4", + "arguments": [ + "poly16_t * ptr", + "poly16x8x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_p64_x2", + "arguments": [ + "poly64_t * ptr", + "poly64x2x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_p64_x3", + "arguments": [ + "poly64_t * ptr", + "poly64x2x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": 
"Vt3.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_p64_x4", + "arguments": [ + "poly64_t * ptr", + "poly64x2x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_p8", + "arguments": [ + "poly8_t * ptr", + "poly8x16_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_p8_x2", + "arguments": [ + "poly8_t * ptr", + "poly8x16x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_p8_x3", + "arguments": [ + "poly8_t * ptr", + "poly8x16x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_p8_x4", + "arguments": [ + "poly8_t * ptr", + "poly8x16x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s16", + "arguments": [ + "int16_t * ptr", + "int16x8_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s16_x2", + "arguments": [ + "int16_t * ptr", + "int16x8x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s16_x3", + "arguments": [ + "int16_t * ptr", + "int16x8x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s16_x4", + "arguments": [ + "int16_t * ptr", + "int16x8x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s32", + "arguments": [ + "int32_t * ptr", + "int32x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + 
"register": "Vt.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s32_x2", + "arguments": [ + "int32_t * ptr", + "int32x4x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s32_x3", + "arguments": [ + "int32_t * ptr", + "int32x4x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s32_x4", + "arguments": [ + "int32_t * ptr", + "int32x4x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s64", + "arguments": [ + "int64_t * ptr", + "int64x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s64_x2", + "arguments": [ + "int64_t * ptr", + "int64x2x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s64_x3", + "arguments": [ + "int64_t * ptr", + "int64x2x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s64_x4", + "arguments": [ + "int64_t * ptr", + "int64x2x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s8", + "arguments": [ + "int8_t * ptr", + "int8x16_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s8_x2", + "arguments": [ + "int8_t * ptr", + "int8x16x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s8_x3", + "arguments": [ + "int8_t * ptr", + "int8x16x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + 
"val": { + "register": "Vt3.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_s8_x4", + "arguments": [ + "int8_t * ptr", + "int8x16x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x8_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u16_x2", + "arguments": [ + "uint16_t * ptr", + "uint16x8x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u16_x3", + "arguments": [ + "uint16_t * ptr", + "uint16x8x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u16_x4", + "arguments": [ + "uint16_t * ptr", + "uint16x8x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u32_x2", + "arguments": [ + "uint32_t * ptr", + "uint32x4x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u32_x3", + "arguments": [ + "uint32_t * ptr", + "uint32x4x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u32_x4", + "arguments": [ + "uint32_t * ptr", + "uint32x4x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { 
+ "register": "Xn" + }, + "val": { + "register": "Vt.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u64_x2", + "arguments": [ + "uint64_t * ptr", + "uint64x2x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u64_x3", + "arguments": [ + "uint64_t * ptr", + "uint64x2x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u64_x4", + "arguments": [ + "uint64_t * ptr", + "uint64x2x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x16_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u8_x2", + "arguments": [ + "uint8_t * ptr", + "uint8x16x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u8_x3", + "arguments": [ + "uint8_t * ptr", + "uint8x16x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst1q_u8_x4", + "arguments": [ + "uint8_t * ptr", + "uint8x16x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_f16", + "arguments": [ + "float16_t * ptr", + "float16x4x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_f32", + "arguments": [ + "float32_t * ptr", + "float32x2x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_f64", + "arguments": [ + "float64_t * ptr", + "float64x1x2_t val" + ], + "return_type": { + "value": "void" + }, + 
"Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_lane_f16", + "arguments": [ + "float16_t * ptr", + "float16x4x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_lane_f32", + "arguments": [ + "float32_t * ptr", + "float32x2x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_lane_f64", + "arguments": [ + "float64_t * ptr", + "float64x1x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_lane_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x4x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_lane_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x1x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_lane_p8", + "arguments": [ + "poly8_t * ptr", + "poly8x8x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_lane_s16", + "arguments": [ + "int16_t * ptr", + "int16x4x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_lane_s32", + "arguments": [ + "int32_t * ptr", + "int32x2x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": 
[ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_lane_s64", + "arguments": [ + "int64_t * ptr", + "int64x1x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_lane_s8", + "arguments": [ + "int8_t * ptr", + "int8x8x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_lane_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x4x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_lane_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x2x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_lane_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x1x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_lane_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x8x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x4x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x1x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_p8", + "arguments": [ + "poly8_t * ptr", + "poly8x8x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8B" + } + }, + "Architectures": [ + "v7", + 
"A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_s16", + "arguments": [ + "int16_t * ptr", + "int16x4x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_s32", + "arguments": [ + "int32_t * ptr", + "int32x2x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_s64", + "arguments": [ + "int64_t * ptr", + "int64x1x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_s8", + "arguments": [ + "int8_t * ptr", + "int8x8x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x4x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x2x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x1x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x8x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_f16", + "arguments": [ + "float16_t * ptr", + "float16x8x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_f32", + "arguments": [ + "float32_t * ptr", + "float32x4x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4S" + } + }, + "Architectures": [ + "v7", + 
"A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_f64", + "arguments": [ + "float64_t * ptr", + "float64x2x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_lane_f16", + "arguments": [ + "float16_t * ptr", + "float16x8x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_lane_f32", + "arguments": [ + "float32_t * ptr", + "float32x4x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_lane_f64", + "arguments": [ + "float64_t * ptr", + "float64x2x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 2 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_lane_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x8x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_lane_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x2x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_lane_p8", + "arguments": [ + "poly8_t * ptr", + "poly8x16x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_lane_s16", + "arguments": [ + "int16_t * ptr", + "int16x8x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_lane_s32", + "arguments": [ + "int32_t * ptr", + "int32x4x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + 
"Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_lane_s64", + "arguments": [ + "int64_t * ptr", + "int64x2x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_lane_s8", + "arguments": [ + "int8_t * ptr", + "int8x16x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_lane_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x8x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_lane_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x4x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_lane_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x2x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_lane_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x16x2_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x8x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x2x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_p8", + "arguments": [ + "poly8_t * ptr", + "poly8x16x2_t 
val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_s16", + "arguments": [ + "int16_t * ptr", + "int16x8x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_s32", + "arguments": [ + "int32_t * ptr", + "int32x4x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_s64", + "arguments": [ + "int64_t * ptr", + "int64x2x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_s8", + "arguments": [ + "int8_t * ptr", + "int8x16x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x8x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x4x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x2x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst2q_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x16x2_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt2.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_f16", + "arguments": [ + "float16_t * ptr", + "float16x4x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_f32", + "arguments": [ + "float32_t * ptr", + "float32x2x3_t val" + ], + 
"return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_f64", + "arguments": [ + "float64_t * ptr", + "float64x1x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_lane_f16", + "arguments": [ + "float16_t * ptr", + "float16x4x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_lane_f32", + "arguments": [ + "float32_t * ptr", + "float32x2x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_lane_f64", + "arguments": [ + "float64_t * ptr", + "float64x1x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_lane_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x4x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_lane_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x1x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_lane_p8", + "arguments": [ + "poly8_t * ptr", + "poly8x8x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_lane_s16", + "arguments": [ + "int16_t * ptr", + "int16x4x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, 
+ { + "SIMD_ISA": "Neon", + "name": "vst3_lane_s32", + "arguments": [ + "int32_t * ptr", + "int32x2x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_lane_s64", + "arguments": [ + "int64_t * ptr", + "int64x1x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_lane_s8", + "arguments": [ + "int8_t * ptr", + "int8x8x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_lane_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x4x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_lane_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x2x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_lane_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x1x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_lane_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x8x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x4x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x1x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": 
"Vt3.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_p8", + "arguments": [ + "poly8_t * ptr", + "poly8x8x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_s16", + "arguments": [ + "int16_t * ptr", + "int16x4x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_s32", + "arguments": [ + "int32_t * ptr", + "int32x2x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_s64", + "arguments": [ + "int64_t * ptr", + "int64x1x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_s8", + "arguments": [ + "int8_t * ptr", + "int8x8x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x4x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x2x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x1x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x8x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_f16", + "arguments": [ + "float16_t * ptr", + "float16x8x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8H" + } + 
}, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_f32", + "arguments": [ + "float32_t * ptr", + "float32x4x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_f64", + "arguments": [ + "float64_t * ptr", + "float64x2x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_lane_f16", + "arguments": [ + "float16_t * ptr", + "float16x8x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_lane_f32", + "arguments": [ + "float32_t * ptr", + "float32x4x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_lane_f64", + "arguments": [ + "float64_t * ptr", + "float64x2x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_lane_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x8x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_lane_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x2x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_lane_p8", + "arguments": [ + "poly8_t * ptr", + "poly8x16x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_lane_s16", + "arguments": [ + "int16_t * ptr", + "int16x8x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + 
"Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_lane_s32", + "arguments": [ + "int32_t * ptr", + "int32x4x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_lane_s64", + "arguments": [ + "int64_t * ptr", + "int64x2x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_lane_s8", + "arguments": [ + "int8_t * ptr", + "int8x16x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_lane_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x8x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_lane_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x4x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_lane_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x2x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_lane_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x16x3_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x8x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ 
+ "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x2x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_p8", + "arguments": [ + "poly8_t * ptr", + "poly8x16x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_s16", + "arguments": [ + "int16_t * ptr", + "int16x8x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_s32", + "arguments": [ + "int32_t * ptr", + "int32x4x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_s64", + "arguments": [ + "int64_t * ptr", + "int64x2x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_s8", + "arguments": [ + "int8_t * ptr", + "int8x16x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x8x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x4x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x2x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst3q_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x16x3_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt3.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST3" + ] + ] + }, + { + "SIMD_ISA": "Neon", 
+ "name": "vst4_f16", + "arguments": [ + "float16_t * ptr", + "float16x4x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_f32", + "arguments": [ + "float32_t * ptr", + "float32x2x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_f64", + "arguments": [ + "float64_t * ptr", + "float64x1x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_lane_f16", + "arguments": [ + "float16_t * ptr", + "float16x4x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_lane_f32", + "arguments": [ + "float32_t * ptr", + "float32x2x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_lane_f64", + "arguments": [ + "float64_t * ptr", + "float64x1x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_lane_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x4x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_lane_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x1x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_lane_p8", + "arguments": [ + "poly8_t * ptr", + "poly8x8x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + 
"ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_lane_s16", + "arguments": [ + "int16_t * ptr", + "int16x4x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_lane_s32", + "arguments": [ + "int32_t * ptr", + "int32x2x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_lane_s64", + "arguments": [ + "int64_t * ptr", + "int64x1x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_lane_s8", + "arguments": [ + "int8_t * ptr", + "int8x8x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_lane_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x4x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_lane_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x2x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_lane_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x1x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 0 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_lane_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x8x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x4x4_t val" + ], + "return_type": { + "value": "void" + 
}, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x1x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_p8", + "arguments": [ + "poly8_t * ptr", + "poly8x8x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_s16", + "arguments": [ + "int16_t * ptr", + "int16x4x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_s32", + "arguments": [ + "int32_t * ptr", + "int32x2x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_s64", + "arguments": [ + "int64_t * ptr", + "int64x1x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_s8", + "arguments": [ + "int8_t * ptr", + "int8x8x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x4x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x2x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x1x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.1D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x8x4_t val" + ], + "return_type": { + "value": "void" + }, + 
"Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_f16", + "arguments": [ + "float16_t * ptr", + "float16x8x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_f32", + "arguments": [ + "float32_t * ptr", + "float32x4x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_f64", + "arguments": [ + "float64_t * ptr", + "float64x2x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_lane_f16", + "arguments": [ + "float16_t * ptr", + "float16x8x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_lane_f32", + "arguments": [ + "float32_t * ptr", + "float32x4x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_lane_f64", + "arguments": [ + "float64_t * ptr", + "float64x2x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_lane_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x8x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_lane_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x2x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_lane_p8", + "arguments": [ + "poly8_t * ptr", + "poly8x16x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" 
+ }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_lane_s16", + "arguments": [ + "int16_t * ptr", + "int16x8x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_lane_s32", + "arguments": [ + "int32_t * ptr", + "int32x4x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_lane_s64", + "arguments": [ + "int64_t * ptr", + "int64x2x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_lane_s8", + "arguments": [ + "int8_t * ptr", + "int8x16x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_lane_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x8x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 7 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_lane_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x4x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 3 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_lane_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x2x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 1 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_lane_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x16x4_t val", + "const int lane" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "lane": { + "minimum": 0, + "maximum": 15 + }, + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.16B" + } + }, + "Architectures": [ + "A64" 
+ ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_p16", + "arguments": [ + "poly16_t * ptr", + "poly16x8x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_p64", + "arguments": [ + "poly64_t * ptr", + "poly64x2x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_p8", + "arguments": [ + "poly8_t * ptr", + "poly8x16x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_s16", + "arguments": [ + "int16_t * ptr", + "int16x8x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_s32", + "arguments": [ + "int32_t * ptr", + "int32x4x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_s64", + "arguments": [ + "int64_t * ptr", + "int64x2x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_s8", + "arguments": [ + "int8_t * ptr", + "int8x16x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_u16", + "arguments": [ + "uint16_t * ptr", + "uint16x8x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_u32", + "arguments": [ + "uint32_t * ptr", + "uint32x4x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_u64", + "arguments": [ + "uint64_t * ptr", + "uint64x2x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] 
+ }, + { + "SIMD_ISA": "Neon", + "name": "vst4q_u8", + "arguments": [ + "uint8_t * ptr", + "uint8x16x4_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Vt4.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ST4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vstrq_p128", + "arguments": [ + "poly128_t * ptr", + "poly128_t val" + ], + "return_type": { + "value": "void" + }, + "Arguments_Preparation": { + "ptr": { + "register": "Xn" + }, + "val": { + "register": "Qt" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "STR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsub_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsub_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsub_f64", + "arguments": [ + "float64x1_t a", + "float64x1_t b" + ], + "return_type": { + "value": "float64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsub_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsub_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsub_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsub_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsub_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsub_u32", + 
"arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsub_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsub_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubd_s64", + "arguments": [ + "int64_t a", + "int64_t b" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubd_u64", + "arguments": [ + "uint64_t a", + "uint64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubh_f16", + "arguments": [ + "float16_t a", + "float16_t b" + ], + "return_type": { + "value": "float16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hn" + }, + "b": { + "register": "Hm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubhn_high_s16", + "arguments": [ + "int8x8_t r", + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUBHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubhn_high_s32", + "arguments": [ + "int16x4_t r", + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUBHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubhn_high_s64", + "arguments": [ + "int32x2_t r", + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUBHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubhn_high_u16", + "arguments": [ + "uint8x8_t r", + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + }, + "r": { + "register": "Vd.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + 
[ + "SUBHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubhn_high_u32", + "arguments": [ + "uint16x4_t r", + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + }, + "r": { + "register": "Vd.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUBHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubhn_high_u64", + "arguments": [ + "uint32x2_t r", + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUBHN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubhn_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUBHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubhn_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUBHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubhn_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUBHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubhn_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUBHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubhn_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUBHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubhn_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUBHN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubl_high_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SSUBL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubl_high_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": 
"Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SSUBL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubl_high_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SSUBL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubl_high_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USUBL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubl_high_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USUBL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubl_high_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USUBL2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubl_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSUBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubl_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSUBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubl_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSUBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubl_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USUBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubl_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USUBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubl_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + 
"A64" + ], + "instructions": [ + [ + "USUBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "FSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "FSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FSUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + 
"instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SUB" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubw_high_s16", + "arguments": [ + "int32x4_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SSUBW2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubw_high_s32", + "arguments": [ + "int64x2_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SSUBW2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubw_high_s8", + "arguments": [ + "int16x8_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SSUBW2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubw_high_u16", + "arguments": [ + "uint32x4_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USUBW2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubw_high_u32", + "arguments": [ + "uint64x2_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USUBW2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubw_high_u8", + "arguments": [ + "uint16x8_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USUBW2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubw_s16", + "arguments": [ + "int32x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSUBW" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubw_s32", + "arguments": [ + "int64x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSUBW" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubw_s8", + "arguments": [ + "int16x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "SSUBW" + ] + ] + }, + { + "SIMD_ISA": "Neon", + 
"name": "vsubw_u16", + "arguments": [ + "uint32x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USUBW" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubw_u32", + "arguments": [ + "uint64x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USUBW" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsubw_u8", + "arguments": [ + "uint16x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "USUBW" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsudot_lane_s32", + "arguments": [ + "int32x2_t r", + "int8x8_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SUDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsudot_laneq_s32", + "arguments": [ + "int32x2_t r", + "int8x8_t a", + "uint8x16_t b", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsudotq_lane_s32", + "arguments": [ + "int32x4_t r", + "int8x16_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "SUDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vsudotq_laneq_s32", + "arguments": [ + "int32x4_t r", + "int8x16_t a", + "uint8x16_t b", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbl1_p8", + "arguments": [ + "poly8x8_t a", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbl1_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + 
[ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbl1_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbl2_p8", + "arguments": [ + "poly8x8x2_t a", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbl2_s8", + "arguments": [ + "int8x8x2_t a", + "int8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbl2_u8", + "arguments": [ + "uint8x8x2_t a", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbl3_p8", + "arguments": [ + "poly8x8x3_t a", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbl3_s8", + "arguments": [ + "int8x8x3_t a", + "int8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbl3_u8", + "arguments": [ + "uint8x8x3_t a", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbl4_p8", + "arguments": [ + "poly8x8x4_t a", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbl4_s8", + "arguments": [ + "int8x8x4_t a", + "int8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbl4_u8", + "arguments": [ + "uint8x8x4_t a", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBL" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbx1_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": {}, + 
"b": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOVI", + "CMHS", + "TBL", + "BIF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbx1_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b", + "int8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOVI", + "CMHS", + "TBL", + "BIF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbx1_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOVI", + "CMHS", + "TBL", + "BIF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbx2_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8x2_t b", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbx2_s8", + "arguments": [ + "int8x8_t a", + "int8x8x2_t b", + "int8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbx2_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8x2_t b", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbx3_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8x3_t b", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOVI", + "CMHS", + "TBL", + "BIF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbx3_s8", + "arguments": [ + "int8x8_t a", + "int8x8x3_t b", + "int8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOVI", + "CMHS", + "TBL", + "BIF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbx3_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8x3_t b", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "MOVI", + "CMHS", + "TBL", + "BIF" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbx4_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8x4_t b", + "uint8x8_t idx" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, 
+ { + "SIMD_ISA": "Neon", + "name": "vtbx4_s8", + "arguments": [ + "int8x8_t a", + "int8x8x4_t b", + "int8x8_t idx" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtbx4_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8x4_t b", + "uint8x8_t idx" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": {}, + "b": { + "register": "Vn.16B" + }, + "idx": {} + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TBX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1_p16", + "arguments": [ + "poly16x4_t a", + "poly16x4_t b" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": 
"uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1q_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1q_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1q_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1q_p16", + "arguments": [ + "poly16x8_t a", + "poly16x8_t b" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1q_p64", + "arguments": [ + "poly64x2_t a", + "poly64x2_t b" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1q_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1q_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1q_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1q_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + 
"instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1q_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1q_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1q_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1q_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn1q_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2_p16", + "arguments": [ + "poly16x4_t a", + "poly16x4_t b" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + 
"return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2q_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2q_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2q_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2q_p16", + "arguments": [ + "poly16x8_t a", + "poly16x8_t b" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2q_p64", + "arguments": [ + "poly64x2_t a", + "poly64x2_t b" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2q_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + 
"Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2q_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2q_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2q_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2q_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2q_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2q_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2q_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn2q_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", 
+ "name": "vtrn_p16", + "arguments": [ + "poly16x4_t a", + "poly16x4_t b" + ], + "return_type": { + "value": "poly16x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b" + ], + "return_type": { + "value": "poly8x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrn_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrnq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrnq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + 
"A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrnq_p16", + "arguments": [ + "poly16x8_t a", + "poly16x8_t b" + ], + "return_type": { + "value": "poly16x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrnq_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b" + ], + "return_type": { + "value": "poly8x16x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrnq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrnq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrnq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrnq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrnq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtrnq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "TRN1", + "TRN2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtst_p64", + "arguments": [ + "poly64x1_t a", + "poly64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtst_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + 
"register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtst_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtst_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtst_s64", + "arguments": [ + "int64x1_t a", + "int64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtst_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtst_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtst_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtst_u64", + "arguments": [ + "uint64x1_t a", + "uint64x1_t b" + ], + "return_type": { + "value": "uint64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtst_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtstd_s64", + "arguments": [ + "int64_t a", + "int64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtstd_u64", + "arguments": [ + "uint64_t a", + "uint64_t b" + ], + "return_type": { + "value": "uint64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dn" + }, + "b": { + "register": "Dm" + } + }, + "Architectures": [ + "A64" + 
], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtstq_p64", + "arguments": [ + "poly64x2_t a", + "poly64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtstq_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtstq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtstq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtstq_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtstq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtstq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtstq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtstq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vtstq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + 
"CMTST" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuqadd_s16", + "arguments": [ + "int16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4H" + }, + "b": { + "register": "Vn.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuqadd_s32", + "arguments": [ + "int32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2S" + }, + "b": { + "register": "Vn.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuqadd_s64", + "arguments": [ + "int64x1_t a", + "uint64x1_t b" + ], + "return_type": { + "value": "int64x1_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuqadd_s8", + "arguments": [ + "int8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8B" + }, + "b": { + "register": "Vn.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuqaddb_s8", + "arguments": [ + "int8_t a", + "uint8_t b" + ], + "return_type": { + "value": "int8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Bd" + }, + "b": { + "register": "Bn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuqaddd_s64", + "arguments": [ + "int64_t a", + "uint64_t b" + ], + "return_type": { + "value": "int64_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Dd" + }, + "b": { + "register": "Dn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuqaddh_s16", + "arguments": [ + "int16_t a", + "uint16_t b" + ], + "return_type": { + "value": "int16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Hd" + }, + "b": { + "register": "Hn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuqaddq_s16", + "arguments": [ + "int16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.8H" + }, + "b": { + "register": "Vn.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuqaddq_s32", + "arguments": [ + "int32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.4S" + }, + "b": { + "register": "Vn.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuqaddq_s64", + "arguments": [ + "int64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vd.2D" + }, + "b": { + "register": "Vn.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuqaddq_s8", + "arguments": [ + "int8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + 
"Arguments_Preparation": { + "a": { + "register": "Vd.16B" + }, + "b": { + "register": "Vn.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuqadds_s32", + "arguments": [ + "int32_t a", + "uint32_t b" + ], + "return_type": { + "value": "int32_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Sd" + }, + "b": { + "register": "Sn" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "SUQADD" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vusdot_lane_s32", + "arguments": [ + "int32x2_t r", + "uint8x8_t a", + "int8x8_t b", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "USDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vusdot_laneq_s32", + "arguments": [ + "int32x2_t r", + "uint8x8_t a", + "int8x16_t b", + "const int lane" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vusdot_s32", + "arguments": [ + "int32x2_t r", + "uint8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + }, + "r": { + "register": "Vd.2S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "USDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vusdotq_lane_s32", + "arguments": [ + "int32x4_t r", + "uint8x16_t a", + "int8x8_t b", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + "instructions": [ + [ + "USDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vusdotq_laneq_s32", + "arguments": [ + "int32x4_t r", + "uint8x16_t a", + "int8x16_t b", + "const int lane" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.4B" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vusdotq_s32", + "arguments": [ + "int32x4_t r", + "uint8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "USDOT" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vusmmlaq_s32", + "arguments": [ + "int32x4_t r", + "uint8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + }, + "r": { + "register": "Vd.4S" + } + }, + "Architectures": [ + "A32", + "A64" + ], + 
"instructions": [ + [ + "USMMLA" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1_p16", + "arguments": [ + "poly16x4_t a", + "poly16x4_t b" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1q_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + 
"value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1q_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1q_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1q_p16", + "arguments": [ + "poly16x8_t a", + "poly16x8_t b" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1q_p64", + "arguments": [ + "poly64x2_t a", + "poly64x2_t b" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1q_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1q_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1q_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1q_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1q_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1q_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + 
"A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1q_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1q_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp1q_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2_p16", + "arguments": [ + "poly16x4_t a", + "poly16x4_t b" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + 
"return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2q_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2q_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2q_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2q_p16", + "arguments": [ + "poly16x8_t a", + "poly16x8_t b" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2q_p64", + "arguments": [ + "poly64x2_t a", + "poly64x2_t b" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2q_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2q_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2q_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + 
"Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2q_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2q_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2q_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2q_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2q_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp2q_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp_p16", + "arguments": [ + "poly16x4_t a", + "poly16x4_t b" + ], + "return_type": { + "value": "poly16x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b" + ], + "return_type": { + "value": "poly8x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + 
"UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzp_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzpq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzpq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzpq_p16", + "arguments": [ + "poly16x8_t a", + "poly16x8_t b" + ], + "return_type": { + "value": "poly16x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzpq_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b" + ], + "return_type": { + "value": "poly8x16x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { 
+ "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzpq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzpq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzpq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzpq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzpq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vuzpq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "UZP1", + "UZP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vxarq_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b", + "const int imm6" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": {}, + "imm6": { + "minimum": 0, + "maximum": 63 + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "XAR" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1_p16", + "arguments": [ + "poly16x4_t a", + "poly16x4_t b" + ], + "return_type": { + "value": "poly16x4_t" 
+ }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1q_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1q_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1q_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + 
"ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1q_p16", + "arguments": [ + "poly16x8_t a", + "poly16x8_t b" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1q_p64", + "arguments": [ + "poly64x2_t a", + "poly64x2_t b" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1q_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1q_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1q_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1q_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1q_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1q_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1q_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1q_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip1q_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": 
"uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP1" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2_p16", + "arguments": [ + "poly16x4_t a", + "poly16x4_t b" + ], + "return_type": { + "value": "poly16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b" + ], + "return_type": { + "value": "poly8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + "value": "int8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ 
+ [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2q_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2q_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2q_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2q_p16", + "arguments": [ + "poly16x8_t a", + "poly16x8_t b" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2q_p64", + "arguments": [ + "poly64x2_t a", + "poly64x2_t b" + ], + "return_type": { + "value": "poly64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2q_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2q_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2q_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2q_s64", + "arguments": [ + "int64x2_t a", + "int64x2_t b" + ], + "return_type": { + "value": "int64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2q_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2q_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + 
"value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2q_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2q_u64", + "arguments": [ + "uint64x2_t a", + "uint64x2_t b" + ], + "return_type": { + "value": "uint64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip2q_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip_f16", + "arguments": [ + "float16x4_t a", + "float16x4_t b" + ], + "return_type": { + "value": "float16x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip_p16", + "arguments": [ + "poly16x4_t a", + "poly16x4_t b" + ], + "return_type": { + "value": "poly16x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip_p8", + "arguments": [ + "poly8x8_t a", + "poly8x8_t b" + ], + "return_type": { + "value": "poly8x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip_s16", + "arguments": [ + "int16x4_t a", + "int16x4_t b" + ], + "return_type": { + "value": "int16x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip_s32", + "arguments": [ + "int32x2_t a", + "int32x2_t b" + ], + "return_type": { + "value": "int32x2x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip_s8", + "arguments": [ + "int8x8_t a", + "int8x8_t b" + ], + "return_type": { + 
"value": "int8x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip_u16", + "arguments": [ + "uint16x4_t a", + "uint16x4_t b" + ], + "return_type": { + "value": "uint16x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4H" + }, + "b": { + "register": "Vm.4H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip_u32", + "arguments": [ + "uint32x2_t a", + "uint32x2_t b" + ], + "return_type": { + "value": "uint32x2x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzip_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b" + ], + "return_type": { + "value": "uint8x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8B" + }, + "b": { + "register": "Vm.8B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzipq_f16", + "arguments": [ + "float16x8_t a", + "float16x8_t b" + ], + "return_type": { + "value": "float16x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzipq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzipq_p16", + "arguments": [ + "poly16x8_t a", + "poly16x8_t b" + ], + "return_type": { + "value": "poly16x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzipq_p8", + "arguments": [ + "poly8x16_t a", + "poly8x16_t b" + ], + "return_type": { + "value": "poly8x16x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzipq_s16", + "arguments": [ + "int16x8_t a", + "int16x8_t b" + ], + "return_type": { + "value": "int16x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzipq_s32", + "arguments": [ + "int32x4_t a", + "int32x4_t b" + ], + "return_type": { + "value": "int32x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vzipq_s8", + "arguments": [ + "int8x16_t a", + "int8x16_t b" + ], + "return_type": { + "value": "int8x16x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzipq_u16", + "arguments": [ + "uint16x8_t a", + "uint16x8_t b" + ], + "return_type": { + "value": "uint16x8x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm.8H" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzipq_u32", + "arguments": [ + "uint32x4_t a", + "uint32x4_t b" + ], + "return_type": { + "value": "uint32x4x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vzipq_u8", + "arguments": [ + "uint8x16_t a", + "uint8x16_t b" + ], + "return_type": { + "value": "uint8x16x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm.16B" + } + }, + "Architectures": [ + "v7", + "A32", + "A64" + ], + "instructions": [ + [ + "ZIP1", + "ZIP2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vamin_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FAMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaminq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FAMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vaminq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FAMIN" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vamax_f32", + "arguments": [ + "float32x2_t a", + "float32x2_t b" + ], + "return_type": { + "value": "float32x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2S" + }, + "b": { + "register": "Vm.2S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FAMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vamaxq_f32", + "arguments": [ + "float32x4_t a", + "float32x4_t b" + ], + "return_type": { + "value": "float32x4_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.4S" + }, + "b": { + "register": "Vm.4S" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FAMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vamaxq_f64", + "arguments": [ + "float64x2_t a", + "float64x2_t b" + ], + "return_type": { + "value": "float64x2_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.2D" + }, + "b": { + "register": "Vm.2D" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "FAMAX" + ] + ] + }, + { + "SIMD_ISA": "Neon", + 
"name": "vluti2_lane_u8", + "arguments": [ + "uint8x8_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti2q_lane_u8", + "arguments": [ + "uint8x16_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti2_lane_s8", + "arguments": [ + "int8x8_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti2q_lane_s8", + "arguments": [ + "int8x16_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti2_lane_p8", + "arguments": [ + "poly8x8_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti2q_lane_p8", + "arguments": [ + "poly8x16_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.16B" + }, + "b": { + "register": "Vm" + }, + "lane": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti2_lane_u16", + "arguments": [ + "uint16x4_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti2q_lane_u16", + "arguments": [ + "uint16x8_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI2" + ] + ] + }, + { + 
"SIMD_ISA": "Neon", + "name": "vluti2_lane_s16", + "arguments": [ + "int16x4_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti2q_lane_s16", + "arguments": [ + "int16x8_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti2_lane_p16", + "arguments": [ + "poly16x4_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti2q_lane_p16", + "arguments": [ + "poly16x8_t a", + "uint8x8_t b", + "const int lane" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "a": { + "register": "Vn.8H" + }, + "b": { + "register": "Vm" + }, + "lane": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI2" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti4q_lane_u8", + "arguments": [ + "uint8x16_t vn", + "uint8x8_t vm", + "const int index" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "vn": { + "register": "Vn.16B" + }, + "vm": { + "register": "Vm" + }, + "index": { + "minimum": 0, + "maximum": 0 + }, + "r": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti4q_laneq_u8", + "arguments": [ + "uint8x16_t vn", + "uint8x16_t vm", + "const int index" + ], + "return_type": { + "value": "uint8x16_t" + }, + "Arguments_Preparation": { + "vn": { + "register": "Vn.16B" + }, + "vm": { + "register": "Vm" + }, + "index": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti4q_lane_s8", + "arguments": [ + "int8x16_t vn", + "uint8x8_t vm", + "const int index" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "vn": { + "register": "Vn.16B" + }, + "vm": { + "register": "Vm" + }, + "index": { + "minimum": 0, + "maximum": 0 + }, + "r": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti4q_laneq_s8", + "arguments": [ + "int8x16_t vn", + "uint8x16_t vm", + "const int index" + ], + "return_type": { + "value": "int8x16_t" + }, + "Arguments_Preparation": { + "vn": { + "register": "Vn.16B" + }, + "vm": { + "register": "Vm" + }, + "index": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A64" + ], + 
"instructions": [ + [ + "LUTI4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti4q_lane_p8", + "arguments": [ + "poly8x16_t vn", + "uint8x8_t vm", + "const int index" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "vn": { + "register": "Vn.16B" + }, + "vm": { + "register": "Vm" + }, + "index": { + "minimum": 0, + "maximum": 0 + }, + "r": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti4q_laneq_p8", + "arguments": [ + "poly8x16_t vn", + "uint8x16_t vm", + "const int index" + ], + "return_type": { + "value": "poly8x16_t" + }, + "Arguments_Preparation": { + "vn": { + "register": "Vn.16B" + }, + "vm": { + "register": "Vm" + }, + "index": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.16B" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti4q_lane_u16_x2", + "arguments": [ + "uint16x8x2_t vn", + "uint8x8_t vm", + "const int index" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "vn": { + "register": "Vn1.8H" + }, + "vm": { + "register": "Vm" + }, + "index": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti4q_laneq_u16_x2", + "arguments": [ + "uint16x8x2_t vn", + "uint8x16_t vm", + "const int index" + ], + "return_type": { + "value": "uint16x8_t" + }, + "Arguments_Preparation": { + "vn": { + "register": "Vn1.8H" + }, + "vm": { + "register": "Vm" + }, + "index": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti4q_lane_s16_x2", + "arguments": [ + "int16x8x2_t vn", + "uint8x8_t vm", + "const int index" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "vn": { + "register": "Vn1.8H" + }, + "vm": { + "register": "Vm" + }, + "index": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti4q_laneq_s16_x2", + "arguments": [ + "int16x8x2_t vn", + "uint8x16_t vm", + "const int index" + ], + "return_type": { + "value": "int16x8_t" + }, + "Arguments_Preparation": { + "vn": { + "register": "Vn1.8H" + }, + "vm": { + "register": "Vm" + }, + "index": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti4q_lane_f16_x2", + "arguments": [ + "float16x8x2_t vn", + "uint8x8_t vm", + "const int index" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "vn": { + "register": "Vn1.8H" + }, + "vm": { + "register": "Vm" + }, + "index": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti4q_laneq_f16_x2", + "arguments": [ + "float16x8x2_t vn", + "uint8x16_t vm", + "const int index" + ], + "return_type": { + "value": "float16x8_t" + }, + "Arguments_Preparation": { + "vn": { + "register": "Vn1.8H" + }, + "vm": { + "register": "Vm" + }, + 
"index": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti4q_lane_p16_x2", + "arguments": [ + "poly16x8x2_t vn", + "uint8x8_t vm", + "const int index" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "vn": { + "register": "Vn1.8H" + }, + "vm": { + "register": "Vm" + }, + "index": { + "minimum": 0, + "maximum": 1 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI4" + ] + ] + }, + { + "SIMD_ISA": "Neon", + "name": "vluti4q_laneq_p16_x2", + "arguments": [ + "poly16x8x2_t vn", + "uint8x16_t vm", + "const int index" + ], + "return_type": { + "value": "poly16x8_t" + }, + "Arguments_Preparation": { + "vn": { + "register": "Vn1.8H" + }, + "vm": { + "register": "Vm" + }, + "index": { + "minimum": 0, + "maximum": 3 + }, + "r": { + "register": "Vd.8H" + } + }, + "Architectures": [ + "A64" + ], + "instructions": [ + [ + "LUTI4" + ] + ] + } +] diff --git a/library/stdarch/rustfmt.toml b/library/stdarch/rustfmt.toml new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/library/stdarch/triagebot.toml b/library/stdarch/triagebot.toml new file mode 100644 index 0000000000000..082c8cf2c63be --- /dev/null +++ b/library/stdarch/triagebot.toml @@ -0,0 +1,52 @@ +[assign] + +[assign.owners] +"*" = ["@Amanieu"] + +[ping.windows] +message = """\ +Hey Windows Group! This issue could use some guidance on how it can be resolved +on Windows platforms. +Could one of you weigh in please? In case it's useful, here are some +[instructions] for tackling these sorts of bugs. +Thanks! + +[instructions]: https://rustc-dev-guide.rust-lang.org/notification-groups/windows.html +""" + +[ping.arm] +message = """\ +Hey Arm-interested people! This issue could use some guidance on how it can be +resolved on Arm platforms. +Could one of you weigh in please? In case it's useful, here are some +[instructions] for tackling these sorts of bugs. +Thanks! + +[instructions]: https://rustc-dev-guide.rust-lang.org/notification-groups/arm.html +""" + +[ping.risc-v] +message = """\ +Hey RISC-V Group! This issue could use some guidance on how it can be resolved +on RISC-V platforms. +Could one of you weigh in please? In case it's useful, here are some +[instructions] for tackling these sorts of bugs. +Thanks! + +[instructions]: https://rustc-dev-guide.rust-lang.org/notification-groups/risc-v.html +""" + +[ping.fuchsia] +message = """\ +Hey friends of Fuchsia! This issue could use some guidance on how this should be +resolved/implemented on Fuchsia. Could one of you weigh in please? +Thanks! +""" + +[ping.apple] +alias = ["macos", "ios", "tvos", "watchos", "visionos"] +message = """\ +Hey Apple Group! This issue or PR could use some Darwin-specific guidance. Could +one of you weigh in please? +Thanks! 
+""" diff --git a/library/stdarch/vendor.yml b/library/stdarch/vendor.yml new file mode 100644 index 0000000000000..fd2bfecba733a --- /dev/null +++ b/library/stdarch/vendor.yml @@ -0,0 +1,3 @@ +- crates/stdarch-verify/x86-intel.xml +- crates/stdarch-verify/mips-msa.h +- intrinsics_data/arm_intrinsics.json diff --git a/src/bootstrap/src/core/build_steps/check.rs b/src/bootstrap/src/core/build_steps/check.rs index 911a51b0e161b..b50ede0d1f294 100644 --- a/src/bootstrap/src/core/build_steps/check.rs +++ b/src/bootstrap/src/core/build_steps/check.rs @@ -84,8 +84,6 @@ impl Step for Std { return; } - builder.require_submodule("library/stdarch", None); - let stage = self.custom_stage.unwrap_or(builder.top_stage); let target = self.target; diff --git a/src/bootstrap/src/core/build_steps/clippy.rs b/src/bootstrap/src/core/build_steps/clippy.rs index 0652c08ff496e..e505153302710 100644 --- a/src/bootstrap/src/core/build_steps/clippy.rs +++ b/src/bootstrap/src/core/build_steps/clippy.rs @@ -141,8 +141,6 @@ impl Step for Std { } fn run(self, builder: &Builder<'_>) { - builder.require_submodule("library/stdarch", None); - let target = self.target; let compiler = builder.compiler(builder.top_stage, builder.config.build); diff --git a/src/bootstrap/src/core/build_steps/compile.rs b/src/bootstrap/src/core/build_steps/compile.rs index 5ecce31fe1562..ace76193b29b9 100644 --- a/src/bootstrap/src/core/build_steps/compile.rs +++ b/src/bootstrap/src/core/build_steps/compile.rs @@ -195,8 +195,6 @@ impl Step for Std { return; } - builder.require_submodule("library/stdarch", None); - let mut target_deps = builder.ensure(StartupObjects { compiler, target }); let compiler_to_use = builder.compiler_for(compiler.stage, compiler.host, target); diff --git a/src/bootstrap/src/core/build_steps/test.rs b/src/bootstrap/src/core/build_steps/test.rs index dddce8fe05d1d..239ee3d744826 100644 --- a/src/bootstrap/src/core/build_steps/test.rs +++ b/src/bootstrap/src/core/build_steps/test.rs @@ -3533,6 +3533,68 @@ impl Step for CodegenGCC { } } +/// Smoke test for stdarch which simply checks if we can build it with the in-tree +/// compiler. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Stdarch { + compiler: Compiler, + target: TargetSelection, +} + +impl Step for Stdarch { + type Output = (); + const DEFAULT: bool = true; + const ONLY_HOSTS: bool = true; + + fn should_run(run: ShouldRun<'_>) -> ShouldRun<'_> { + run.paths(&["library/stdarch"]) + } + + fn make_run(run: RunConfig<'_>) { + let builder = run.builder; + let host = run.build_triple(); + let compiler = run.builder.compiler_for(run.builder.top_stage, host, host); + + builder.ensure(Stdarch { compiler, target: run.target }); + } + + fn run(self, builder: &Builder<'_>) { + let compiler = self.compiler; + let target = self.target; + + builder.ensure(compile::Std::new(compiler, target)); + + let compiler = builder.compiler_for(compiler.stage, compiler.host, target); + + let mut cargo = builder::Cargo::new( + builder, + compiler, + Mode::ToolRustc, + SourceType::InTree, + target, + Kind::Check, + ); + + cargo.current_dir(&builder.src.join("library/stdarch")); + cargo.arg("--manifest-path").arg(builder.src.join("library/stdarch/Cargo.toml")); + + // Just check that we can compile core_arch for the given target + cargo.arg("-p").arg("core_arch").arg("--all-targets"); + cargo.env("TARGET", target.triple); + + builder.info(&format!( + "{} stdarch stage{} ({} -> {})", + Kind::Test.description(), + compiler.stage, + &compiler.host, + target + )); + let _time = helpers::timeit(builder); + + cargo.into_cmd().run(builder); + } +} + /// Test step that does two things: /// - Runs `cargo test` for the `src/tools/test-float-parse` tool. /// - Invokes the `test-float-parse` tool to test the standard library's diff --git a/src/bootstrap/src/core/builder/mod.rs b/src/bootstrap/src/core/builder/mod.rs index 19b79bfe818c2..e6083d612c6a2 100644 --- a/src/bootstrap/src/core/builder/mod.rs +++ b/src/bootstrap/src/core/builder/mod.rs @@ -1023,6 +1023,7 @@ impl<'a> Builder<'a> { test::RustdocJson, test::HtmlCheck, test::RustInstaller, + test::Stdarch, test::TestFloatParse, test::CollectLicenseMetadata, // Run bootstrap close to the end as it's unlikely to fail diff --git a/src/bootstrap/src/lib.rs b/src/bootstrap/src/lib.rs index 07772b8932d9d..68921454d210e 100644 --- a/src/bootstrap/src/lib.rs +++ b/src/bootstrap/src/lib.rs @@ -491,7 +491,7 @@ impl Build { // Make sure we update these before gathering metadata so we don't get an error about missing // Cargo.toml files. - let rust_submodules = ["library/backtrace", "library/stdarch"]; + let rust_submodules = ["library/backtrace"]; for s in rust_submodules { build.require_submodule( s,
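A note on the new test::Stdarch step above: because should_run registers the library/stdarch path and the step is wired into the test list in builder/mod.rs, it should be selectable through bootstrap's usual path-based invocation (presumably something like ./x test library/stdarch, though the exact command line is an assumption here, not stated in the patch). The step deliberately does not run stdarch's own test suite; it only ensures std is built for the target and then type-checks the core_arch crate (Kind::Check with -p core_arch --all-targets), which is why it is described as a smoke test.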
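For readers unfamiliar with the intrinsics_data/arm_intrinsics.json entries earlier in the patch: each record (name, argument and return vector types, register classes, and the emitted instruction such as ZIP1/ZIP2) describes a NEON intrinsic that stdarch exposes through core::arch::aarch64. A minimal sketch of how the ZIP1/ZIP2 pair surfaces to users follows; it is not part of the diff, the function name zip_halves and the constant inputs are illustrative only, and on newer toolchains the unsafe block may be unnecessary.

// Illustrative sketch: the vzip1q_u8/vzip2q_u8 intrinsics corresponding to the
// ZIP1/ZIP2 JSON entries above. `zip_halves` is a made-up name.
#[cfg(target_arch = "aarch64")]
fn zip_halves() -> (std::arch::aarch64::uint8x16_t, std::arch::aarch64::uint8x16_t) {
    use std::arch::aarch64::{vdupq_n_u8, vzip1q_u8, vzip2q_u8};
    // NEON is mandatory on AArch64; the unsafe block is only needed on
    // toolchains where these intrinsics are still declared `unsafe fn`.
    unsafe {
        let a = vdupq_n_u8(1); // Vn.16B, all lanes set to 1
        let b = vdupq_n_u8(2); // Vm.16B, all lanes set to 2
        // ZIP1 interleaves the low halves of a and b; ZIP2 the high halves.
        (vzip1q_u8(a, b), vzip2q_u8(a, b))
    }
}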